My code should compare two vectors saved as dictionaries (two pickle files) and save the result into a pickle file as well. This works, but very slowly: one comparison result takes about 7:20 min. Because I have a lot of videos (exactly 2033), the program would run for about 10 days, which is too long. How can I speed up my code for Python 2.7?
import math
import csv
import pickle
from itertools import izip
global_ddc_file = 'E:/global_ddc.p'
io = 'E:/AV-Datensatz'
v_source = ''
def dot_product(v1, v2):
    return sum(map(lambda x: x[0] * x[1], izip(v1, v2)))  # izip('ABCD', 'xy') --> Ax By

def cosine_measure(v1, v2):
    prod = dot_product(v1, v2)
    len1 = math.sqrt(dot_product(v1, v1))
    len2 = math.sqrt(dot_product(v2, v2))
    if (len1 * len2) <> 0:
        out = prod / (len1 * len2)
    else:
        out = 0
    return out

def findSource(v):
    v_id = "/" + v[0].lstrip("<http://av.tib.eu/resource/video").rstrip(">")
    v_source = io + v_id
    v_file = v_source + '/vector.p'
    source = [v_id, v_source, v_file]
    return source

def getVector(v, vectorCol):
    with open(v, 'rb') as f:
        try:
            vector_v = pickle.load(f)
        except:
            print 'file couldnt be loaded'
    tf_idf = []
    tf_idf = [vec[1][vectorCol] for vec in vector_v]
    return tf_idf

def compareVectors(v1, v2, vectorCol):
    v1_source = findSource(v1)
    v2_source = findSource(v2)
    V1 = getVector(v1_source[2], vectorCol)
    V2 = getVector(v2_source[2], vectorCol)
    sim = [v1_source[0], v2_source[0], cosine_measure(V1, V2)]
    return sim
#with open('videos_av_portal_cc_3.0_nur2bspStanford.csv', 'rb') as dataIn:
with open('videos_av_portal_cc_3.0_vollstaendig.csv', 'rb') as dataIn:
#with open('videos_av_portal_cc_3.0.csv', 'rb') as dataIn:
    try:
        reader = csv.reader(dataIn)
        v_source = []
        for row in reader:
            v_source.append(findSource(row))
        #print v_source
        for one in v_source:
            print one[1]
            compVec = []
            for another in v_source:
                if one <> another:
                    compVec.append(compareVectors(one, another, 3))
            compVec_sort = sorted(compVec, key=lambda cosim: cosim[2], reverse=True)
            # save vector file for each video
            with open(one[1] + '/compare.p', 'wb') as f:
                pickle.dump(compVec_sort, f)
    finally:
        dataIn.close()
Split the work into steps:
1. Load each dictionary into vectors.
2. Compare two dictionaries using multiprocessing (a rough sketch follows below).
3. Launch processes simultaneously according to memory availability and end each process after 8 minutes, then update the third dictionary.
4. Relaunch the processes on the next set of data, follow step 3, and continue until the whole dictionary has been covered.
This should reduce the total turnaround time.
Let me know if you need code.
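For illustration only, here is a minimal sketch of point 2 (my own code, not the asker's). It assumes the ~2033 vectors have already been loaded from their vector.p files into one dict keyed by video id, and it uses a plain multiprocessing.Pool rather than the time-limited processes described above; names such as compare_one and compare_all are made up.
import math
from multiprocessing import Pool

def cosine(v1, v2):
    # plain cosine similarity, guarding against zero-length vectors
    dot = sum(a * b for a, b in zip(v1, v2))
    len1 = math.sqrt(sum(a * a for a in v1))
    len2 = math.sqrt(sum(b * b for b in v2))
    return dot / (len1 * len2) if len1 * len2 else 0

def compare_one(pair):
    # pair is ((id1, vec1), (id2, vec2)); returns (id1, id2, similarity)
    (id1, vec1), (id2, vec2) = pair
    return id1, id2, cosine(vec1, vec2)

def compare_all(vectors, processes=4):
    # vectors: {video_id: tf_idf_list}, preloaded once from the pickle files
    items = list(vectors.items())
    pairs = [(a, b) for a in items for b in items if a[0] != b[0]]
    pool = Pool(processes)
    try:
        return pool.map(compare_one, pairs)
    finally:
        pool.close()
        pool.join()
For the full data set the pairs list should be built per source video (as in the original loop) rather than for all 2033 x 2032 combinations at once.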
I'm using the csv module to save arrays to CSV; however, I want to make it a bit more dynamic. Apologies if this is already up here; I tried searching, but to no avail.
This is my current code, and it works well:
filename = 'zeroTime_.csv'  # save to csv
f = open(filename, "w")
f.write("{},{},{},{}\n".format("R[0]", "F[0]", "R[1]", "F[1]"))
for x in zip(R[0], F[0], R[1], F[1]):
    f.write("{},{},{},{}\n".format(x[0], x[1], x[2], x[3]))
f.close()
However, I want to make it so that if array R has more than 2 columns, it will still work.
I tried creating the "R[0]", "F[0]", "R[1]", "F[1]" part as a string and just using that, but it doesn't like it. My code creates an R and F entry for each channel, denoted by NumCh:
for x in range(NumCh):
    title = title + '"R[' + str(x) + ']",' + '"F[' + str(x) + ']",'
    bracket = bracket + '{},{},'
title = title[:-1]
bracket = bracket[:-1]
bracket = bracket + '\n"'

filename = 'zeroTime_.csv'  # save to csv
f = open(filename, "w")
f.write(bracket.format(title))
for x in zip(R[0], F[0], R[1], F[1]):
    f.write(bracket.format(x[0], x[1], x[2], x[3]))
f.close()
Gives me the error:
Traceback (most recent call last):
File "base.py", line 8, in <module>
b = pd.proc(NumCh, a)
File "C:\Users\jtpkalaverick\Documents\Python\module\time_proc.py", line 24, in proc
f.write(bracket.format(title))
IndexError: Replacement index 1 out of range for positional args tuple
(I'm running this within a module addressed as pd.)
Edit (30.06.22)
I have two modules: tick_proc, which produces some arbitrary arrays, and time_proc, which does some basic maths on the arrays. NumCh and samples are passed into the modules and are just ints.
main code:
import tick_proc as tp
import time_proc as pd
NumCh = 2
samples = 10
a = tp.collect_data(NumCh, samples)
b = pd.proc(NumCh, a)
print('b', b)
tick_proc:
print('Importing module "TickProc"')

R = []
F = []

def collect_data(NumCh, samples):
    for x in range(NumCh):
        R.append([])
        F.append([])
    for x in range(NumCh):
        for y in range(samples):
            R[x].append(y*x)
            F[x].append(y*x - 1)
    return F, R
time_proc:
import csv

def proc(NumCh, a):
    R = a[0]
    F = a[1]
    T = []
    bracket = '"'
    title = ''
    vars = ''
    for x in range(NumCh):
        T.append([])
        title = title + '"R[' + str(x) + ']",' + '"F[' + str(x) + ']",'
        bracket = bracket + '{},{},'
    title = title[:-1]
    bracket = bracket[:-1]
    bracket = bracket + '\n"'
    print(bracket)
    print(title)
    for i in range(NumCh):
        for j in range(len(R[i])):
            T[i].append(R[i][j] - F[i][j])
    filename = 'zeroTime_.csv'  # save to csv
    f = open(filename, "w")
    f.write(bracket.format(title))
    for x in zip(R[0], F[0], R[1], F[1]):
        f.write(bracket.format(x[0], x[1], x[2], x[3]))
    f.close()
    return T
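For reference, here is a minimal sketch (my own code, not the asker's) of one way to build the header and rows dynamically for any number of channels with csv.writer; write_csv is a hypothetical helper. The IndexError in the traceback comes from passing the whole title string as a single argument to a format string that has several placeholders; writing the pre-built title directly (or unpacking the row tuple with *x) avoids it.
import csv

def write_csv(R, F, filename='zeroTime_.csv'):
    # build the header and the column order from the number of channels
    # instead of hard-coding R[0], F[0], R[1], F[1]
    header = []
    columns = []
    for x in range(len(R)):
        header += ['R[{}]'.format(x), 'F[{}]'.format(x)]
        columns += [R[x], F[x]]
    with open(filename, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(header)
        writer.writerows(zip(*columns))  # one row per sample, channels interleaved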
I'm working on word embeddings for an Arabic dialect (like the slang of a region). While preprocessing the data I:
1. load the JSON
2. extract the lines
3. clean them of URLs, emojis and other noise
4. remove any line that has fewer than 2 words
5. create context/target pairs for a window of 2
6. fit tf.keras.preprocessing.text.Tokenizer on the texts with fit_on_texts
7. Problem: convert from text to matrix
What I want from step 7 is a one-hot encoding so I can feed it to the network.
def loadJson(file):
    import json
    lines=[]
    with open(file) as f:
        for line in f:
            lines.append(json.loads(line))
    return lines

def extractSentences(lines,language):
    posts=[]
    comments=[]
    for line in lines:
        if line['language']==language:
            posts.append(line['message'])
        for j in line['comments']:
            if j['language']==language:
                comments.append(j['message'])
    return posts, comments

def removeSpecialChar(posts):
    import re
    def remov(p):
        l=re.sub(' {2,}',' ',re.sub('[^ـابتةثجحخدذرزىسشصضطظعغفقكلمنهويءآأؤإئّّّّّ:ّّّّّ]',' ',re.sub('َ|ً|ُ|ٌ|ِ|ٍ|ْ','',r""+p.strip())))
        myre = re.compile(u'['
                          u'\U0001F300-\U0001F64F'
                          u'\U0001F680-\U0001F6FF'
                          u'\u2600-\u26FF\u2700-\u27BF]+',
                          re.UNICODE)
        return myre.sub('',l)
    return list(map(remov,posts))

def delEmpty(posts,size=2):
    while True:
        p=len(posts)
        for j,i in enumerate(posts):
            if len(i.split(' '))<2:
                #print(i.split(' '))
                del posts[j]
        if p-len(posts)==0:
            return

def contextAndTarget(posts,k=2):
    import numpy as np
    context = []
    target = []
    for j,i in enumerate(posts):
        ul = [ k for k in i.split(' ') if len(k)>2]
        for handel in range(len(ul)-1):
            for e in range(k):
                if e+handel<len(ul):
                    context.append(ul[handel])
                    target.append(ul[e+handel])
    X = []
    X.extend(target)
    X.extend(context)
    Y = []
    Y.extend(context)
    Y.extend(target)
    return X,Y
After that I apply the processing to the JSON file and run all the steps:
from tensorflow.keras import preprocessing

lines = loadJson('data.json')
posts, comments = extractSentences(lines, 'ARABIC')
posts = removeSpecialChar(posts)
delEmpty(posts)
X, Y = contextAndTarget(posts)

tokenPosts = preprocessing.text.Tokenizer()
tokenPosts.fit_on_texts(X)
vocab_size = len(tokenPosts.word_counts) + 1

# just right here it crashes and the RAM usage increases suddenly
xLines, yLines = tokenPosts.texts_to_matrix(X), tokenPosts.texts_to_matrix(Y)
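One likely reason for the crash (my assumption, not stated in the question): texts_to_matrix materialises the full len(X) x vocab_size dense matrix in one go, which for a large corpus easily exhausts RAM. A generator along the lines of the hypothetical one_hot_batches below yields the same binary rows in small batches via texts_to_sequences, so they can be fed to the network batch by batch instead.
import numpy as np

def one_hot_batches(tokenizer, texts, batch_size=256):
    # yield the rows texts_to_matrix would produce, but batch_size at a time,
    # so the full N x vocab_size matrix is never held in memory at once
    vocab_size = len(tokenizer.word_counts) + 1
    sequences = tokenizer.texts_to_sequences(texts)
    for start in range(0, len(sequences), batch_size):
        chunk = sequences[start:start + batch_size]
        batch = np.zeros((len(chunk), vocab_size), dtype=np.float32)
        for row, seq in enumerate(chunk):
            batch[row, seq] = 1.0
        yield batch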
I have a JSON file with some values that must be extracted and processed, and the results added back to the file as new values. To do this I use multiprocessing, and although I thought the accesses were synchronized, I still get race conditions.
The function that is called just transforms a rating from the 0-5 range to the 0-100 range.
Thanks in advance!
import json
import multiprocessing

n = 1000
maxRating = 5
percentage = 100
inputfile = 'rawData.JSON'
outfile = 'processedData.JSON'

# load data into dictionary "data"
with open(inputfile) as f:
    data = json.load(f)

# create an empty dictionary that will contain the new information
results = {}

def saver(init, end, q, l):
    for num in range(init, end):
        l.acquire()
        rating = data["bars"][num]["rating"]
        ratioRating = (percentage * rating) / maxRating
        results["ratingP"] = ratioRating
        print(ratioRating)
        # put data in queue
        q.put(results)
        l.release()
# main function
if __name__ == '__main__':
    i = 0
    cores = 4
    q = multiprocessing.Queue()
    lock = multiprocessing.Lock()
    if (cores > 1):  # parallel
        for i in range(cores):
            init = (i * n) / cores
            fin = ((i + 1) * n) / cores
            p = multiprocessing.Process(target=saver, args=(init, fin, q, lock)).start()
        for i in range(n):
            data["bars"][i].update(q.get())  # update "data" dictionary adding new processed data
    else:  # sequential
        saver(0, n, q)
        for l in range(n):
            data["bars"][l].update(q.get())  # update "data" dictionary adding new processed data
    # write the updated JSON file with the added processed data
    with open(outfile, 'w') as outfile:
        json.dump(data, outfile)
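For comparison, here is a minimal lock-free sketch (my own code, not the asker's): each worker converts one rating independently and returns the bar index together with the result, so the parent process merges everything and no dictionary, queue or lock has to be shared. It assumes the same rawData.JSON layout with a top-level "bars" list.
import json
import multiprocessing

MAX_RATING = 5
PERCENTAGE = 100

def to_percentage(job):
    # job is (index, rating); the index tells the parent where to store the result
    index, rating = job
    return index, (PERCENTAGE * rating) / MAX_RATING

if __name__ == '__main__':
    with open('rawData.JSON') as f:
        data = json.load(f)
    jobs = [(i, bar["rating"]) for i, bar in enumerate(data["bars"])]
    with multiprocessing.Pool(processes=4) as pool:
        for index, ratio in pool.map(to_percentage, jobs):
            data["bars"][index]["ratingP"] = ratio
    with open('processedData.JSON', 'w') as out:
        json.dump(data, out)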
I am reading from a huge file (232 MB) line by line.
First, I match each line against a regular expression.
Then, for each line, I write to a different city.txt file under the 'report' directory, according to the city name in that line. However, this process takes a while. I am wondering if there is any way of speeding it up?
Example of input file: (each column split by a \t)
2015-02-03 19:20 Sane Diebgo Music 692.08 Cash
I have actually tested the code both writing to the different files and not writing to them (simply processing the large file and building the two dicts), and the time difference is huge: about 80% of the time is spent writing to the different files.
import os
import re

def processFile(file):
    pattern = re.compile(r"(\d{4}-\d{2}-\d{2})\t(\d{2}:\d{2})\t(.+)\t(.+)\t(\d+\.\d+|\d+)\t(\w+)\n")
    f = open(file)
    total_sale = 0
    city_dict = dict()
    categories_dict = dict()
    os.makedirs("report", exist_ok=True)
    for line in f:
        valid_entry = pattern.search(line)
        if valid_entry == None:
            print("Invalid entry: '{}'".format(line.strip()))
            continue
        else:
            entry_sale = float(valid_entry.group(5))
            total_sale += entry_sale
            city_dict.update({valid_entry.group(3): city_dict.get(valid_entry.group(3), 0) + entry_sale})
            categories_dict.update({valid_entry.group(4): categories_dict.get(valid_entry.group(4), 0) + entry_sale})
            filename = "report/" + valid_entry.group(3) + ".txt"
            if os.path.exists(filename):
                city_file = open(filename, "a")
                city_file.write(valid_entry.group(0))
                city_file.close()
            else:
                city_file = open(filename, "w")
                city_file.write(valid_entry.group(0))
                city_file.close()
    f.close()
    return (city_dict, categories_dict, total_sale)
The dictionary lookups and updates could be improved by using defaultdict:
from collections import defaultdict
city_dict = defaultdict(float)
categories_dict = defaultdict(float)
...
city = valid_entry.group(3)
category = valid_entry.group(4)
...
city_dict[city] += entry_sale
categories_dict[category] += entry_sale
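Since roughly 80% of the time is reported to go into the per-line open/write/close of the city files, a further option (my suggestion, not part of the answer above) is to buffer the lines per city and write each report file exactly once after the loop. write_city_reports below is a hypothetical helper that assumes (city, raw_line) pairs were collected while parsing.
from collections import defaultdict

def write_city_reports(entries):
    # entries: iterable of (city, raw_line) pairs collected during parsing;
    # group the lines per city in memory, then open each report file once
    city_lines = defaultdict(list)
    for city, raw_line in entries:
        city_lines[city].append(raw_line)
    for city, lines in city_lines.items():
        with open("report/" + city + ".txt", "w") as city_file:
            city_file.writelines(lines)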
I am creating my own bootloader for an ATXmega128A4U. To use the bootloader I want to transform the ELF file of the firmware into the memory map used in the ATXmega.
For that I use Python and the module pyelftools. Its documentation is poor, so I ran into a problem: I do not know which information I can use to get the address, offset, etc. of the data in the sections.
My goal is to create a bytearray, copy the data/code into it and transfer it to the bootloader. Below is my code:
import sys
# If pyelftools is not installed, the example can also run from the root or
# examples/ dir of the source distribution.
sys.path[0:0] = ['.', '..']
from elftools.common.py3compat import bytes2str
from elftools.elf.elffile import ELFFile
# 128k flash for the ATXmega128a4u
flashsize = 128 * 1024
def process_file(filename):
    with open(filename, 'rb') as f:
        # get the data
        elffile = ELFFile(f)
        dataSec = elffile.get_section_by_name(b'.data')
        textSec = elffile.get_section_by_name(b'.text')

        # prepare the memory
        flashMemory = bytearray(flashsize)

        # the data section
        startAddr = dataSec.header.sh_offset
        am = dataSec.header.sh_size
        i = 0
        while i < am:
            val = dataSec.stream.read(1)
            flashMemory[startAddr] = val[0]
            startAddr += 1
            i += 1

        # the text section
        startAddr = textSec.header.sh_offset
        am = textSec.header.sh_size
        i = 0
        while i < am:
            print(str(startAddr) + ' : ' + str(i))
            val = textSec.stream.read(1)
            flashMemory[startAddr] = val[0]
            startAddr += 1
            i += 1

        print('finished')

if __name__ == '__main__':
    process_file('firmware.elf')
Hope someone can tell me how to solve this problem.
I managed to solve the problem.
Don't read the data manually from the stream via "textSec.stream.read"; use "textSec.data()" instead. Internally (see "sections.py") a seek operation is performed on the file before the data is read, so the result is the valid data chunk.
The following code reads the code (.text) section of an ATXmega firmware and copies it into a bytearray with the layout of the flash of an ATXmega128A4U device.
@vlas_tepesch: the hex conversion is not needed and the 64k pitfall is avoided.
import sys
sys.path[0:0] = ['.', '..']

from elftools.common.py3compat import bytes2str
from elftools.elf.elffile import ELFFile

# 128k flash for the ATXmega128a4u
flashsize = 128 * 1024

def __printSectionInfo(s):
    print('[{nr}] {name} {type} {addr} {offs} {size}'.format(
        nr=s.header['sh_name'],
        name=s.name,
        type=s.header['sh_type'],
        addr=s.header['sh_addr'],
        offs=s.header['sh_offset'],
        size=s.header['sh_size']
    ))

def process_file(filename):
    print('In file: ' + filename)
    with open(filename, 'rb') as f:
        # get the data
        elffile = ELFFile(f)

        print('sections:')
        for s in elffile.iter_sections():
            __printSectionInfo(s)

        print('get the code from the .text section')
        textSec = elffile.get_section_by_name(b'.text')

        # prepare the memory
        flashMemory = bytearray(flashsize)

        # the text section
        startAddr = textSec.header['sh_addr']
        val = textSec.data()
        flashMemory[startAddr:startAddr + len(val)] = val

        # print memory
        print('finished')

if __name__ == '__main__':
    process_file('firmware.elf')
Thanks for the comments!