How do I raise the memory limit in python? - python

I have written a python (2.7) script but it use a lot of memory so I get a out of memory error. Is it possible to use memory?
My code (or the github):
from itertools import combinations
import numpy
# Find the unused members and put this in a other group
def findMembers(listIn,listMembers):
lengthlist2 = (len(listMembers)-len(listIn[0]))
group2 = [0] * lengthlist2 #making the other groups based on the length of the first group
for i in listIn:
wichRow = 0
for x in listMembers:
if not (x in i) :
group2[wichRow] = x
wichRow += 1
listIn.append(group2)
return listIn
#you give a list of members and the numbers of groups
#you get back all the possibilities of combinations
def findCombinations(listMembers,numbersOfGroups):
groupTemp = [] #list needed to save correctly all the combinations
group = [] #list needed for keep it simple
newGroup = [] #list that will be returned
for listPossibilities in combinations(listMembers,(len(listMembers)/numbersOfGroups)):
groupTemp.append(list(listPossibilities))
group.append(groupTemp) #saving all the possibilities
groupTemp = []
for k in group:
# place the unused members in group2
k = (findMembers(k,listMembers))
if numbersOfGroups > 2:
groupTemp = []
groupTemp = findCombinations(k[1],numbersOfGroups-1)
for i in groupTemp:
listTemp = []
listTemp.append(k[0])
listTemp.extend(i)
newGroup.append(listTemp)
else:
newGroup = group
return newGroup
# Calculate the happiness of the group
def findHappiness(tabel,listIn):
happiness = 0
for i in listIn:
for j in i:
for k in i:
happiness += tabel[j][k]
return happiness
def buildTabel(members): #build a random survey
tabel = numpy.random.random((members,members))
return tabel
def calculateHappiness(group):
print "Finding all the happiness: "
maxhappiness = 0
i = 0
for x in group:
happiness = findHappiness(tabel,x)
if happiness > maxhappiness:
maxhappiness = happiness
y = x
progress = int(round((((i)*1.0/(len(group)))*100.0)))
update_progress(progress)
i += 1
print "\n Best solution: ", y, " with: ", maxhappiness, " happiness"
def update_progress(progress):
print '\r[{0}] {1}%'.format('#'*(progress/5), progress),
if __name__ == "__main__":
members = 24 # members of the group
numbersOfGroups = 3
tabel = buildTabel(members) #preferences will be stored here
listMembers = (range(members)) #members of the group that need to be divided
print "Searching all the combinations..."
group = findCombinations(listMembers,numbersOfGroups) #find all the combinations (recursive)
print len(group)," combinations"
calculateHappiness(group) #calculate the most happiest group and print
the error:
Searching all the combinations...
Traceback (most recent call last):
File "main.py", line 75, in <module>
calculateHappiness(group) #calculate the most happiest group and print
File "main.py", line 38, in findCombinations
newGroup = group
MemoryError
I'm using windows 10 64bit with 6gb ram. Is it possible to use virtual ram or disk space of mine hard drive disk?

Related

Python process being killed due to out of 256Gb memory

The training dataset is a 42GB JSON file. mesh is medical subject headings, consider it as an id or a label. neighbors_mesh is a 28,000 dimension list that has information about mesh that are close to each other. We got this data from training mesh terms for 1.07 M data through KNN. The MLB fit transform returns a 28,000-dimension vector of 0 and 1. But each element is int64 by default. I have tried to reduce it by mask.astype(int__). It's still 32bit.
The iteration blocks 256GB of memory after running for about 1M iterations and still gets killed.
My python version is 3.9
The machine has 256GB memory, 20GB Swap memory, and 48 Core CPU, and GPU.
def build_dataset(train_path, neighbors, journal_mesh, MeSH_id_pair_file, index_dic):
mapping_id = {}
with open(MeSH_id_pair_file, 'r') as f:
for line in f:
(key, value) = line.split('=')
mapping_id[key] = value.strip()
meshIDs = list(mapping_id.values())
meshIDs = label2index(meshIDs, index_dic)
meshIDs_str = [str(x) for x in meshIDs]
print('Total number of labels %d' % len(meshIDs_str))
mlb = MultiLabelBinarizer(classes=meshIDs_str)
mlb.fit(meshIDs_str)
pmid_neighbors, neighbors_mesh = read_neighbors(neighbors, index_dic)
f = open(train_path, encoding="utf8")
objects = ijson.items(f, 'articles.item')
dataset = []
print("Objects: ", type(objects))
print("pmid neighboors: ", type(pmid_neighbors))
for i, obj in enumerate(tqdm(objects)):
data_point = {}
try:
ids = obj["pmid"]
heading = obj['title'].strip()
heading = heading.translate(str.maketrans('', '', '[]'))
abstract = obj["abstractText"].strip()
clean_abstract = abstract.translate(str.maketrans('', '', '[]'))
if len(heading) == 0 or heading == 'In process':
print('paper ', ids, ' does not have title!')
continue
elif len(clean_abstract) == 0:
print('paper ', ids, ' does not have abstract!')
continue
else:
mesh_id = obj['mesh']
journal = obj['journal']
year = obj['year']
mesh_from_journal = journal_mesh[journal]
mesh_from_neighbors = []
if i < len(pmid_neighbors) and ids == pmid_neighbors[i]:
mesh_from_neighbors = neighbors_mesh[i]
mesh_from_journal_str = [str(x) for x in mesh_from_journal]
mesh_from_neighbors_str = [str(x) for x in mesh_from_neighbors]
mesh = list(set(mesh_from_journal_str + mesh_from_neighbors_str))
mask = mlb.fit_transform([mesh])
mask = mask.astype(np.int_)
mask = mask.tolist()
print("MEsh Size: ", sys.getsizeof(mask))
print("Mesh content size: ", sys.getsizeof(mask[0][0]))
print("Mesh content type: ", type(mask[0][0]))
data_point['pmid'] = ids
data_point['title'] = heading
data_point['abstractText'] = clean_abstract
data_point['meshID'] = mesh_id
data_point['meshMask'] = mask
data_point['year'] = year
dataset.append(data_point)
print("dataset Size: ", sys.getsizeof(dataset))
except AttributeError:
print(f'An excaption occured for pmid: {obj["pmid"].strip()}', AttributeError.args())
pubmed = {'articles': dataset}
return pubmed
I successfully finished running the code by adding f.close() after the iteration is completed. The result is a 88GB dataset. But I am still curious why it's taking up so much space.

what is the diffrence beetween the bleu score and the vaerage sentence of bleu score

i'm having a hard time finding the bleus core for my seq to seq model for the task of question generation , my questions are the following :
if i use the sentence bleu to find the score beetween each refrence and the output and then devide the total of these sentence-bleu scores by the len of the test data , will it be the same as the corpus bleu ?
and for the corpus bleu implemented in the code as the nltk corpus bleu ?
import ntpath
import sys
import codecs
import os
import math
import operator
import functools
def fetch_data(cand, ref):
references = []
if '.eng' in ref:
reference_file = codecs.open(ref, 'r', 'utf-8')
references.append(reference_file.readlines())
else:
for root, dirs, files in os.walk(ref):
for f in files:
reference_file = codecs.open(os.path.join(root, f), 'r', 'utf-8')
references.append(reference_file.readlines())
candidate_file = codecs.open(cand, 'r', 'utf-8')
candidate = candidate_file.readlines()
return candidate, references
def count_ngram(candidate, references, n):
clipped_count = 0
count = 0
r = 0
c = 0
for si in range(len(candidate)):
# Calculate precision for each sentence
ref_counts = []
ref_lengths = []
# Build dictionary of ngram counts
for reference in references:
ref_sentence = reference[si]
ngram_d = {}
words = ref_sentence.strip().split()
ref_lengths.append(len(words))
limits = len(words) - n + 1
# loop through the sentance consider the ngram length
for i in range(limits):
ngram = ' '.join(words[i:i+n]).lower()
if ngram in ngram_d.keys():
ngram_d[ngram] += 1
else:
ngram_d[ngram] = 1
ref_counts.append(ngram_d)
# candidate
cand_sentence = candidate[si]
cand_dict = {}
words = cand_sentence.strip().split()
limits = len(words) - n + 1
for i in range(0, limits):
ngram = ' '.join(words[i:i + n]).lower()
if ngram in cand_dict:
cand_dict[ngram] += 1
else:
cand_dict[ngram] = 1
clipped_count += clip_count(cand_dict, ref_counts)
count += limits
r += best_length_match(ref_lengths, len(words))
c += len(words)
if clipped_count == 0:
pr = 0
else:
pr = float(clipped_count) / count
bp = brevity_penalty(c, r)
return pr, bp
def clip_count(cand_d, ref_ds):
"""Count the clip count for each ngram considering all references"""
count = 0
for m in cand_d.keys():
m_w = cand_d[m]
m_max = 0
for ref in ref_ds:
if m in ref:
m_max = max(m_max, ref[m])
m_w = min(m_w, m_max)
count += m_w
return count
def best_length_match(ref_l, cand_l):
"""Find the closest length of reference to that of candidate"""
least_diff = abs(cand_l-ref_l[0])
best = ref_l[0]
for ref in ref_l:
if abs(cand_l-ref) < least_diff:
least_diff = abs(cand_l-ref)
best = ref
return best
def brevity_penalty(c, r):
if c > r:
bp = 1
else:
bp = math.exp(1-(float(r)/c))
return bp
def geometric_mean(precisions):
return (functools.reduce(operator.mul, precisions)) ** (1.0 / len(precisions))
def BLEU(candidate, references):
precisions = []
for i in range(4):
pr, bp = count_ngram(candidate, references, i+1)
precisions.append(pr)
bleu = geometric_mean(precisions) * bp
return bleu
if __name__ == "__main__":
candidate, references = fetch_data(sys.argv[1], sys.argv[2])
bleu = BLEU(candidate, references)
print (bleu)
I'm not sure about the implementation you show but for implementations strictly following the original paper such as NLTKs it would not be the same: https://github.com/nltk/nltk/blob/develop/nltk/translate/bleu_score.py#L123.
Using sentence-BLEU means basically calling corpus-BLEU with just a one-sentence-corpus, but the other way around doesn't work. The scores should not be drastically different but they do differ because of macro-average vs micro-average.
I used BLEU for Seq2Seq evaluation before and just used sentence-BLEU and it worked just fine.

display ";" separated values in table format with python

I'm having the following function which picks results from a results.txt file and displays it to the user, the first row comes out nicely but the rest of the results are not well-aligned. I'm not sure what's causing this. here's the function
def show_result():
'''prints the score list'''
file_exist()
print_whole_list = []
print("Results")
print("*"*41)
result = open("results.txt","r")
res = result.readlines()
print("Name: Lap1: Lap2: Lap3: In total: Average:")
for i in res:
temp_list = i.split(";")
total = int(temp_list[1]) + int(temp_list[2]) + int(temp_list[3])
average = int(total) / 3.0
average = round(average, 2)
temp_list.insert(4, total)
temp_list.insert(5, average)
print_whole_list.extend(temp_list)
for row in print_whole_list:
print("{0:{width}}".format(row, width=10), end=' ')
result.close()
The records in the text file:
Kembos;23;43;23;
Moses;42;51;43;
Ben;43;23;21;
You can use tabulate instead to populate your data in a tabular format in python
from tabulate import tabulate
result = open("results.txt","r")
res = result.readlines()
final = []
for line in res:
temp_list = line.split(';')[:-1]
total = int(temp_list[1]) + int(temp_list[2]) + int(temp_list[3])
average = round(int(total) / 3.0, 2)
temp_list.append(total)
temp_list.append(average)
final.append(temp_list)
print(tabulate(final, headers=["Name", "Lap1", "Lap2", "Lap3", "In Total", "Average"]))
Above code will give the following output:

How to import random seed in cycle to obtain the same result in anytime?

I need to get the same results in any iteration. I tried to use random.seed() in my script, but it doesn't work. How can I fix it?
My script:
import statistics
Oddr=[]
Oddr_fem=[]
Oddr_mal=[]
chi = []
chi_fem=[]
chi_mal=[]
for k in range(100):
random.seed(10)
result = []
exclude_hlthy = []
for i in set(sick['predicted_age']):
sick_ppl = sick.index[sick['predicted_age'] == i].tolist()
L_sick = len(sick_ppl)
if L_sick == 0:
continue
hlth_peers = healthy[healthy.predicted_age == i]
L_healthy = hlth_peers.shape[0]
if L_healthy < len(sick_ppl):
pass
else:
hlthy_subsample = list(np.random.choice([x for x in hlth_peers.index if not x in exclude_hlthy],
L_sick, replace = False))
exclude_hlthy += hlthy_subsample
result += hlthy_subsample
table_ready = healthy.loc[result]
whole_table = table_ready.append(sick, ignore_index=False)
cross_tab = pd.crosstab(index=whole_table['dc013'], columns=whole_table['rate_aging'])
oddsratio=(cross_tab[1][1]*cross_tab[0][0])/(cross_tab[1][0]*cross_tab[0][1])
#Oddr += oddsratio
Oddr.append(oddsratio)
In this script I got several random tables whole_table from one subsample.
For every iteration you're creating a new random.seed(10) so try to put this line of code out of the for cycle and then should work

object of type '_Task' has no len() error

I am using the parallel programming module for python I have a function that returns me an array but when I print the variable that contain the value of the function parallelized returns me "pp._Task object at 0x04696510" and not the value of the matrix.
Here is the code:
from __future__ import print_function
import scipy, pylab
from scipy.io.wavfile import read
import sys
import peakpicker as pea
import pp
import fingerprint as fhash
import matplotlib
import numpy as np
import tdft
import subprocess
import time
if __name__ == '__main__':
start=time.time()
#Peak picking dimensions
f_dim1 = 30
t_dim1 = 80
f_dim2 = 10
t_dim2 = 20
percentile = 80
base = 100 # lowest frequency bin used (peaks below are too common/not as useful for identification)
high_peak_threshold = 75
low_peak_threshold = 60
#TDFT parameters
windowsize = 0.008 #set the window size (0.008s = 64 samples)
windowshift = 0.004 #set the window shift (0.004s = 32 samples)
fftsize = 1024 #set the fft size (if srate = 8000, 1024 --> 513 freq. bins separated by 7.797 Hz from 0 to 4000Hz)
#Hash parameters
delay_time = 250 # 250*0.004 = 1 second#200
delta_time = 250*3 # 750*0.004 = 3 seconds#300
delta_freq = 128 # 128*7.797Hz = approx 1000Hz#80
#Time pair parameters
TPdelta_freq = 4
TPdelta_time = 2
#Cargando datos almacenados
database=np.loadtxt('database.dat')
songnames=np.loadtxt('songnames.dat', dtype=str, delimiter='\t')
separator = '.'
print('Please enter an audio sample file to identify: ')
userinput = raw_input('---> ')
subprocess.call(['ffmpeg','-y','-i',userinput, '-ac', '1','-ar', '8k', 'filesample.wav'])
sample = read('filesample.wav')
userinput = userinput.split(separator,1)[0]
print('Analyzing the audio sample: '+str(userinput))
srate = sample[0] #sample rate in samples/second
audio = sample[1] #audio data
spectrogram = tdft.tdft(audio, srate, windowsize, windowshift, fftsize)
mytime = spectrogram.shape[0]
freq = spectrogram.shape[1]
print('The size of the spectrogram is time: '+str(mytime)+' and freq: '+str(freq))
threshold = pea.find_thres(spectrogram, percentile, base)
peaks = pea.peak_pick(spectrogram,f_dim1,t_dim1,f_dim2,t_dim2,threshold,base)
print('The initial number of peaks is:'+str(len(peaks)))
peaks = pea.reduce_peaks(peaks, fftsize, high_peak_threshold, low_peak_threshold)
print('The reduced number of peaks is:'+str(len(peaks)))
#Store information for the spectrogram graph
samplePeaks = peaks
sampleSpectro = spectrogram
hashSample = fhash.hashSamplePeaks(peaks,delay_time,delta_time,delta_freq)
print('The dimensions of the hash matrix of the sample: '+str(hashSample.shape))
# tuple of all parallel python servers to connect with
ppservers = ()
#ppservers = ("10.0.0.1",)
if len(sys.argv) > 1:
ncpus = int(sys.argv[1])
# Creates jobserver with ncpus workers
job_server = pp.Server(ncpus, ppservers=ppservers)
else:
# Creates jobserver with automatically detected number of workers
job_server = pp.Server(ppservers=ppservers)
print ("Starting pp with", job_server.get_ncpus(), "workers")
print('Attempting to identify the sample audio clip.')
Here I call the function in fingerprint, the commented line worked, but when I try parallelize don't work:
timepairs = job_server.submit(fhash.findTimePairs, (database, hashSample, TPdelta_freq, TPdelta_time, ))
# timepairs = fhash.findTimePairs(database, hashSample, TPdelta_freq, TPdelta_time)
print (timepairs)
#Compute number of matches by song id to determine a match
numSongs = len(songnames)
songbins= np.zeros(numSongs)
numOffsets = len(timepairs)
offsets = np.zeros(numOffsets)
index = 0
for i in timepairs:
offsets[index]=i[0]-i[1]
index = index+1
songbins[i[2]] += 1
# Identify the song
#orderarray=np.column_stack((songbins,songnames))
#orderarray=orderarray[np.lexsort((songnames,songbins))]
q3=np.percentile(songbins, 75)
q1=np.percentile(songbins, 25)
j=0
for i in songbins:
if i>(q3+(3*(q3-q1))):
print("Result-> "+str(i)+":"+songnames[j])
j+=1
end=time.time()
print('Tiempo: '+str(end-start)+' s')
print("Time elapsed: ", +time.time() - start, "s")
fig3 = pylab.figure(1003)
ax = fig3.add_subplot(111)
ind = np.arange(numSongs)
width = 0.35
rects1 = ax.bar(ind,songbins,width,color='blue',align='center')
ax.set_ylabel('Number of Matches')
ax.set_xticks(ind)
xtickNames = ax.set_xticklabels(songnames)
matplotlib.pyplot.setp(xtickNames)
pylab.title('Song Identification')
fig3.show()
pylab.show()
print('The sample song is: '+str(songnames[np.argmax(songbins)]))
The function in fingerprint that I try to parallelize is:
def findTimePairs(hash_database,sample_hash,deltaTime,deltaFreq):
"Find the matching pairs between sample audio file and the songs in the database"
timePairs = []
for i in sample_hash:
for j in hash_database:
if(i[0] > (j[0]-deltaFreq) and i[0] < (j[0] + deltaFreq)):
if(i[1] > (j[1]-deltaFreq) and i[1] < (j[1] + deltaFreq)):
if(i[2] > (j[2]-deltaTime) and i[2] < (j[2] + deltaTime)):
timePairs.append((j[3],i[3],j[4]))
else:
continue
else:
continue
else:
continue
return timePairs
The complete error is:
Traceback (most recent call last):
File "analisisPrueba.py", line 93, in <module>
numOffsets = len(timepairs)
TypeError: object of type '_Task' has no len()
The submit() method submits a task to the server. What you get back is a reference to the task, not its result. (How could it return its result? submit() returns before any of that work has been done!) You should instead provide a callback function to receive the results. For example, timepairs.append is a function that will take the result and append it to the list timepairs.
timepairs = []
job_server.submit(fhash.findTimePairs, (database, hashSample, TPdelta_freq, TPdelta_time, ), callback=timepairs.append)
(Each findTimePairs call should calculate one result, in case that isn't obvious, and you should submit multiple tasks. Otherwise you're invoking all the machinery of Parallel Python for no reason. And make sure you call job_server.wait() to wait for all the tasks to finish before trying to do anything with your results. In short, read the documentation and some example scripts and make sure you understand how it works.)

Categories