How do I speed up this file creation process? - python

I am trying to create a large flat file with fixed-width columns that contains multiple layers, but processing seems to be very slow, most likely because I am iterating over each row.
For context, this is for transmitting insurance policy information.
The hierarchy goes like this:
-Policy row
--Property on policy
---Coverage on property
--Property on policy
---Coverage on property
--Owner on policy
--Owner on policy
--Owner on policy
Currently I'm loading the four record types into separate DataFrames, looping over the policies, pulling each child record type by the parent record's ID, and writing the rows to the file. I'm hoping for some sort of hierarchical DataFrame merge that doesn't force me to scan the whole child frame each time I want a record.
import re
import pandas as pd
import math

def MakeNumeric(instring):
    output = re.sub('[^0-9]', '', str(instring))
    return str(output)

def Pad(instring, padchar, length, align):
    if instring is None:  # Takes care of NULL values
        instring = ''
    instring = str(instring).upper()
    instring = instring.replace(',', '').replace('\n', '').replace('\r', '')
    instring = instring[:length]
    if align == 'L':
        output = instring + (padchar * (length - len(instring)))
    elif align == 'R':
        output = (padchar * (length - len(instring))) + instring
    else:
        output = instring
    return output
def FileCreation():
    POLR = pd.read_parquet(r'POLR.parquet')
    PRP1 = pd.read_parquet(r'PRP1.parquet')
    PROP = pd.read_parquet(r'PROP.parquet')
    SUBJ = pd.read_parquet(r'SUBJ.parquet')
    rownum = 1
    totalrownum = 1
    POLRCt = 0
    size = 900000
    POLR = [POLR.loc[i:i + size - 1, :] for i in range(0, len(POLR), size)]
    FileCt = 0
    print('Predicted File Count: ' + str(math.ceil(len(POLR[0]) / size)))
    for df in POLR:
        FileCt += 1
        filename = r'OutputFile.' + Pad(FileCt, '0', 2, 'R')
        with open(filename, 'a+') as outfile:
            for i, row in df.iterrows():
                row[0] = Pad(rownum, '0', 9, 'R')
                row[1] = Pad(row[1], ' ', 4, 'L')
                row[2] = Pad(row[2], '0', 5, 'R')
                # I do this for all 50 columns
                outfile.write((','.join(row[:51])).replace(',', '') + '\n')
                rownum += 1
                totalrownum += 1
                for i2, row2 in PROP[PROP.ID == row[51]].iterrows():
                    row2[0] = Pad(rownum, '0', 9, 'R')
                    row2[1] = Pad(row2[1], ' ', 4, 'L')
                    row2[2] = Pad(row2[2], '0', 5, 'R')
                    # I do this for all 105 columns
                    outfile.write((','.join(row2[:106])).replace(',', '') + '\n')
                    rownum += 1
                    totalrownum += 1
                    for i3, row3 in PRP1[(PRP1['id'] == row2['ID']) & (PRP1['VNum'] == row2['vnum'])].iterrows():
                        row3[0] = Pad(rownum, '0', 9, 'R')
                        row3[1] = Pad(row3[1], ' ', 4, 'L')
                        row3[2] = Pad(row3[2], '0', 5, 'R')
                        # I do this for all 72 columns
                        outfile.write((','.join(row3[:73])).replace(',', '') + '\n')
                        rownum += 1
                        totalrownum += 1
                for i2, row2 in SUBJ[SUBJ['id'] == row['id']].iterrows():
                    row2[0] = Pad(rownum, '0', 9, 'R')
                    row2[1] = Pad(row2[1], ' ', 4, 'L')
                    row2[2] = Pad(row2[2], '0', 5, 'R')
                    # I do this for all 24 columns
                    outfile.write((','.join(row2[:25])).replace(',', '') + '\n')
                    rownum += 1
                    totalrownum += 1
                POLRCt += 1
                print('File {} of {} '.format(str(FileCt), str(len(POLR))) + str((POLRCt - 1) / len(df.index) * 100) + '% Finished\r')
            rownum += 1
        rownum = 1
        POLRCt = 1
I'm essentially looking for a script that doesn't take multiple days to create a 27M record file.
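One way to avoid rescanning the child DataFrames for every policy row is to group each child frame by its parent key once, up front, and then look the matching rows up in a dict. A minimal sketch of that idea, assuming the frames and key columns (ID, id, VNum, vnum, row[51]) from the code above; the format_* helpers are hypothetical stand-ins for the per-column Pad(...) calls:
# Build each lookup once: parent key -> child rows (instead of filtering per policy).
prop_by_policy = dict(tuple(PROP.groupby('ID')))
subj_by_policy = dict(tuple(SUBJ.groupby('id')))
prp1_by_prop = dict(tuple(PRP1.groupby(['id', 'VNum'])))

no_props = PROP.iloc[:0]   # reused empty frames for parents with no children
no_covs = PRP1.iloc[:0]
no_owners = SUBJ.iloc[:0]

with open('OutputFile.01', 'w') as outfile:
    for i, row in POLR.iterrows():
        outfile.write(format_policy(row) + '\n')             # hypothetical formatter
        for i2, prop in prop_by_policy.get(row[51], no_props).iterrows():
            outfile.write(format_property(prop) + '\n')       # hypothetical formatter
            for i3, cov in prp1_by_prop.get((prop['ID'], prop['vnum']), no_covs).iterrows():
                outfile.write(format_coverage(cov) + '\n')    # hypothetical formatter
        for i4, owner in subj_by_policy.get(row['id'], no_owners).iterrows():
            outfile.write(format_owner(owner) + '\n')         # hypothetical formatter
Each child lookup becomes a dict access instead of a scan of the whole child DataFrame, which is where most of the time in the loops above goes; the groupby work is done once per frame.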

I ended up populating temp tables for each record level, creating keys, then inserting them into a permanent staging table and assigning a clustered index to the keys.
I then queried the results using OFFSET and FETCH NEXT %d ROWS ONLY to keep the memory footprint down, and used the multiprocessing library to break the workload out across the CPU's cores.
Ultimately, the combination of these has reduced the runtime to about 20% of what it was when this question was originally posted.
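For reference, a rough sketch of that pattern with a paged query plus a process pool; the table name, key column, connection string and the format_record helper are placeholders, and pyodbc is just one possible driver:
import math
import multiprocessing as mp
import pandas as pd
import pyodbc  # assumed driver; any DB-API connection works the same way

PAGE_SIZE = 100_000
CONN_STR = 'DRIVER={ODBC Driver 17 for SQL Server};SERVER=...;DATABASE=...'  # placeholder

def write_page(page_number):
    # Fetch one page of the staging table and write its lines to a part file.
    conn = pyodbc.connect(CONN_STR)
    sql = ('SELECT * FROM dbo.StagingRecords ORDER BY RecordKey '
           'OFFSET ? ROWS FETCH NEXT ? ROWS ONLY')
    page = pd.read_sql(sql, conn, params=[page_number * PAGE_SIZE, PAGE_SIZE])
    with open('OutputFile.part{:04d}'.format(page_number), 'w') as out:
        for _, row in page.iterrows():
            out.write(format_record(row) + '\n')  # hypothetical fixed-width formatter
    conn.close()

if __name__ == '__main__':
    total = pd.read_sql('SELECT COUNT(*) AS n FROM dbo.StagingRecords',
                        pyodbc.connect(CONN_STR))['n'][0]
    with mp.Pool() as pool:
        pool.map(write_page, range(math.ceil(total / PAGE_SIZE)))
The part files can be concatenated afterwards, and each worker only ever holds one page in memory.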

Related

Pandas: fetch lines from data into columns

I have a problem with my script.
I've made a script that fetches data from the lines of a raw .txt file into columns in Excel.
It worked at the beginning, but now that I've added more data to the file it doesn't; I'd appreciate help or another approach.
This is my script:
import xlrd, xlwt, re
from svnscripts.timestampdirectory import createdir, path_dir
import os
import pandas as pd
import time

def clearcasevobs():
    pathdest = path_dir()
    dest = createdir()
    timestr = time.strftime("%Y-%m-%d")
    txtName = rf"{pathdest}\{timestr}-clearcaseRawData-vobsDetails.txt"
    workBook = xlwt.Workbook(encoding='ascii')
    workSheet = workBook.add_sheet('sheet1')
    fp = open(txtName, 'r+b')
    # header
    workSheet.write(0, 0, "Tag")
    workSheet.write(0, 1, "CreateDate")
    workSheet.write(0, 2, "Created By")
    workSheet.write(0, 3, "Storage Host Pathname")
    workSheet.write(0, 4, "Storage Global Pathname")
    workSheet.write(0, 5, "DB Schema Version")
    workSheet.write(0, 6, "Mod_by_rem_user")
    workSheet.write(0, 7, "Atomic Checkin")
    workSheet.write(0, 8, "Owner user")
    workSheet.write(0, 9, "Owner Group")
    workSheet.write(0, 10, "ACLs enabled")
    workSheet.write(0, 11, "FeatureLevel")
    row = 0
    entries = 0
    fullentry = []
    for linea in fp.readlines():
        str_linea = linea.decode('gb2312', 'ignore')
        str_linea = str_linea[:-2]  # str string
        txt = str_linea
        arr = str_linea
        if arr[:9] == "versioned":
            txt = arr
            entries += 1
            s = txt.index("/")
            e = txt.index('"', s)
            txt = txt[s:e]
            fullentry.append(txt)
        elif arr.find("created") >= 0:
            entries += 1
            txt = arr
            s = txt.index("created")
            e = txt.index("by")
            txt1 = txt[s + 7:20]
            fullentry.append(txt1)
            txt2 = txt[e + 3:]
            fullentry.append(txt2)
        elif arr.find("VOB storage host:pathname") >= 0:
            entries += 1
            txt = arr
            s = txt.index('"')
            e = txt.index('"', s + 1)
            txt = txt[s + 1:e]
            fullentry.append(txt)
        elif arr.find("VOB storage global pathname") >= 0:
            entries += 1
            txt = arr
            s = txt.index('"')
            e = txt.index('"', s + 1)
            txt = txt[s + 1:e]
            fullentry.append(txt)
        elif arr.find("database schema version:") >= 0:
            entries += 1
            txt = arr
            txt = txt[-2:]
            fullentry.append(txt)
        elif arr.find("modification by remote privileged user:") >= 0:
            entries += 1
            txt = arr
            s = txt.index(':')
            txt = txt[s + 2:]
            fullentry.append(txt)
        elif arr.find("tomic checkin:") >= 0:
            entries += 1
            txt = arr
            s = txt.index(':')
            txt = txt[s + 2:]
            fullentry.append(txt)
        elif arr.find("owner ") >= 0:
            entries += 1
            txt = arr
            s = txt.index('owner')
            txt = txt[s + 5:]
            fullentry.append(txt)
        elif arr.find("group tmn") >= 0:
            if arr.find("tmn/root") == -1:
                entries += 1
                txt = arr
                s = txt.index('group')
                entries += 1
                txt = txt[s + 5:]
                fullentry.append(txt)
        elif arr.find("ACLs enabled:") >= 0:
            entries += 1
            txt = arr
            txt = txt[-2:]
            fullentry.append(txt)
        elif arr.find("FeatureLevel =") >= 0:
            entries += 1
            txt = arr
            txt = txt[-1:]
            fullentry.append(txt)
        if row == 65536:
            break
    finalarr = []
    finalarr1 = []
    temp = 0
    row = 1
    for r in fullentry:
        finalarr.append(r)
        temp += 1
        if temp == 12:
            finalarr1.append(finalarr)
            temp = 0
            col = 0
            for arr in finalarr:
                workSheet.write(row, col, arr)
                col += 1
            row += 1
            finalarr.clear()
        if row == 65536:
            break
    workBook.save(os.path.join(dest, "ClearcaseReport.xls"))
    fp.close()
This is the .txt data file that the script needs to work on (and currently doesn't):
https://paste.pythondiscord.com/sedowagigo
This is how the output should look as an Excel file:
Details:
-The script should read the data inside the .txt file, create the columns based on the keywords I chose, put the wanted data into the right columns, and ignore the raw/junk data that doesn't need to be processed.
-It worked the first time, which is why I also have the output .xls screenshot, but it stopped working after I added more data, which contains more junk. If someone can help me, or knows another method, I'm open to anything.
This is the old .txt file that I tested the script with, and it works: https://paste.pythondiscord.com/ohewatahuv
This is the error I received when I used the script on the new file attached in the pastebin at the beginning (https://paste.pythondiscord.com/sedowagigo):
Thanks for the help!
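One pattern that tends to be easier to extend than a long if/elif chain is to start a new record whenever the "versioned" line appears, fill a dict keyed by column name as the keywords show up, and let pandas write the sheet at the end. A rough sketch along those lines, assuming the same twelve columns as the script above; the extraction rules are simplified placeholders, not the exact ones from the original:
import pandas as pd

COLUMNS = ['Tag', 'CreateDate', 'Created By', 'Storage Host Pathname',
           'Storage Global Pathname', 'DB Schema Version', 'Mod_by_rem_user',
           'Atomic Checkin', 'Owner user', 'Owner Group', 'ACLs enabled', 'FeatureLevel']

records = []
current = {}

with open('vobsDetails.txt', encoding='gb2312', errors='ignore') as f:
    for line in f:
        line = line.rstrip()
        if line.startswith('versioned'):             # a new VOB entry begins here
            if current:
                records.append(current)
            current = {'Tag': line.split('"')[1] if '"' in line else line}
        elif current and 'VOB storage host:pathname' in line and '"' in line:
            current['Storage Host Pathname'] = line.split('"')[1]
        elif current and 'VOB storage global pathname' in line and '"' in line:
            current['Storage Global Pathname'] = line.split('"')[1]
        elif current and 'FeatureLevel =' in line:
            current['FeatureLevel'] = line.split('=')[-1].strip()
        # ...the remaining keywords follow the same pattern...

if current:
    records.append(current)

# Missing keys simply become empty cells, so junk lines never shift the columns.
pd.DataFrame(records, columns=COLUMNS).to_excel('ClearcaseReport.xlsx', index=False)  # needs openpyxl
Because every record is a dict, extra junk in the input only means a key is left unset; it cannot push a value into the wrong column the way a flat list of twelve fields can.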

How to iterate over the rows from 2 files, compare the values and update the value in a file when the condition is met?

To change the values from 10 to 18, 19 or 20, I am splitting the string, accessing the substrings and then trying to change them. It runs, but it just doesn't change the values. Here is the solution I am trying to implement:
oldFileName = 'tryout.hmo'
newFileName = 'tryout_NEW.hmo'
topoFileName = 'Density.topo'

readme = open(oldFileName, "r")
oldLines = readme.readlines()
readme = open(topoFileName, "r")
Lines = readme.readlines()
readme.close()

newFile = open(newFileName, "w")
for row in oldLines:
    for line in Lines:
        tmp = line.split()
        list = row.rstrip()
        tmp1 = list.split()
        newFile.write(row)
        if row.find("BEG_ELEM_DATA") > -1:
            if tmp[0] == tmp1[0]:
                if tmp[2] == 1 and tmp[3] == 0:
                    # it is magnet, value 18
                    newFile.write(tmp1.replace(tmp1[1], "18"))
                elif tmp[2] == 1 and tmp[3] == 1:
                    # it is iron, value 19
                    newFile.write(tmp1.replace(tmp1[1], "19"))
                else:
                    # it is air, value 20
                    newFile.write(tmp1.replace(tmp1[1], "20"))
newFile.close()
I would really appreciate it if you could solve this problem in the above script; then I guess it should work.
I'm also still a beginner in Python, but I tried to solve your problem and here is my solution:
I guess there are better ways to do it, because here you have to load all the data into a DataFrame before comparing it.
Also, I don't know whether you can read your data into a DataFrame with pd.read_csv, because I'm not familiar with the *.hmo and *.topo formats.
import pandas as pd

df = pd.read_csv('tryout.csv', delimiter=';')
df2 = pd.read_csv('density.csv', delimiter=';')

for idx, row in df.iterrows():
    for idx2, row2 in df2.iterrows():
        if row[0] == row2[0]:
            if row2[2] == 1 and row2[3] == 0:
                # it is magnet, value 18
                row[1] = 18
            elif row2[2] == 1 and row2[3] == 1:
                # it is iron, value 19
                row[1] = 19
            else:
                # it is air, value 20
                row[1] = 20

df.to_csv('new_tryout.csv')
What my code does here is load both files into DataFrames, then iterate over every row to find where the ID in both files is the same (e.g. 3749).
If so, the three if statements decide whether it is magnet/iron/air and change the value in df to the right number.
At the end, the new df is saved to a new file, 'new_tryout.csv'.
I created 2 test files for it and it worked the way it should.
Finally, here is the solution you were searching for.
import pandas as pd

df2 = pd.read_csv('Density.topo', header=0, names=list('ABCD'), delimiter=r'\s+', skiprows=1)
df2[['C', 'D']] = df2[['C', 'D']].round()

new_file_content = ''
with open('tryout.hmo', 'r') as f:
    for line in f:
        if line[11:13] == '10':
            if line[3].isspace():
                ID_to_search_for = line[4:8]  # number with 4 digits
            else:
                ID_to_search_for = line[3:8]  # number with 5 digits
            search_idx = df2[df2['A'] == ID_to_search_for].index[0]
            if df2['C'][search_idx] == 1 and df2['D'][search_idx] == 0:
                change = '18'  # magnet
                new_line = line[:11] + change + line[13:]
            elif df2['C'][search_idx] == 1 and df2['D'][search_idx] == 1:
                change = '19'  # iron
                new_line = line[:11] + change + line[13:]
            else:
                change = '20'  # air
                new_line = line[:11] + change + line[13:]
            new_file_content += new_line
        else:
            new_file_content += line

with open('tryout_changed.hmo', 'w') as f:
    f.write(new_file_content)
If you don't want to use DataFrames, you can do it like this:
with open('density.topo') as f:
    lists_of_list = [line.rstrip().split() for line in f]

new_file_content = ''
with open('tryout_test.hmo', 'r') as f:
    for line in f:
        if line[11:13] == '10':
            if line[3].isspace():
                ID_to_search_for = line[4:8]  # number with 4 digits
            else:
                ID_to_search_for = line[3:8]  # number with 5 digits
            for idx, sublist in enumerate(lists_of_list):
                if sublist[0] == ID_to_search_for:
                    if lists_of_list[idx][2] == 1 and lists_of_list[idx][3] == 0:
                        change = '18'  # magnet
                        new_line = line[:11] + change + line[13:]
                    elif lists_of_list[idx][2] == 1 and lists_of_list[idx][3] == 1:
                        change = '19'  # iron
                        new_line = line[:11] + change + line[13:]
                    else:
                        change = '20'  # air
                        new_line = line[:11] + change + line[13:]
                    new_file_content += new_line
        else:
            new_file_content += line

with open('tryout_changed.hmo', 'w') as f:
    f.write(new_file_content)
OK, here is my final answer. It does (again) all the things you were searching for. Please debug your code in your IDE if there is a problem. You should start using context managers instead of opening and closing files step by step.
I wrote the new code around your code in the question and added some comments to it.
oldFileName = 'tryout.hmo'
newFileName = 'tryout_NEW.hmo'
topoFileName = 'Density.topo'

readme = open(oldFileName, "r")
oldLines = readme.readlines()
m = int(oldLines[3])
print(m)
new_m = m + 3
m1 = str(m)
new_m1 = str(new_m)

Phrase = "END_COMP_DATA"
#n = "Phrase not found" #not used --> not needed
with open(oldFileName, "r") as oldFile:
    for number, lin in enumerate(oldFile):
        if Phrase in lin:
            n = number

#insert 3 lines to tryout_new at the right position (--> row n)
magnet = f" {m+1} "'" topo_magnet"'"\n"
iron = f" {m+2} "'" topo_iron"'"\n"
air = f" {m+3} "'" topo_air"'"\n"
oldLines[n:n] = [magnet, iron, air]

newFile = open(newFileName, "w")

flag = 0
with open('density.topo') as f:
    data_density = [line.rstrip().split() for line in f]

for idx, row in enumerate(oldLines):
    lst = row.rstrip()  # I think you shouldn't name a variable like a class in python (list). use 'lst' or something like that
    tmp_tryout = lst.split()
    if row.find("BEG_ELEM_DATA") > -1:
        flag = 1
    if flag == 1 and len(tmp_tryout) > 1:
        # if the column has more than 2 columns (after split), check for the "10"
        if tmp_tryout[1] == '10':
            # density_idx_line searchs in density.topo for a match with tmp_tryout[0] (e.g. 3749) and stores the whole line
            density_idx_line = list(filter(lambda x: x[0] == tmp_tryout[0], data_density))
            if len(density_idx_line) > 0:
                if density_idx_line[0][2] == '1.0' and density_idx_line[0][3] == '1e-05':
                    # the ' 10 ' is the 10 with a whitespace before and after it. Only like this only the 10 gets replaced (and not e.g. 3104 to 3184)
                    newFile.write(row.replace(' 10 ', ' 18 '))
                elif density_idx_line[0][2] == '1.0' and density_idx_line[0][3] == '1.0':
                    newFile.write(row.replace(' 10 ', ' 19 '))
                else:
                    newFile.write(row.replace(' 10 ', ' 20 '))
        else:
            newFile.write(row)
    else:
        if idx == 3:
            newFile.write(row.replace(m1, new_m1))
        else:
            newFile.write(row)

newFile.close()
print("script terminated successfully!")
OK, here is another solution. For anybody else who reads this: it is still only a temporary solution, but #Sagar and I both don't know how to do it better.
import pandas as pd
import numpy as np

df = pd.read_csv('tryout.hmo', header=0, names=list('ABCDEFGHIJKLM'), delimiter=r'\s+', skiprows=[i for i in range(52362)])
df2 = pd.read_csv('Density.topo', header=0, names=list('ANOP'), delimiter=r'\s+', skiprows=1)
df2 = df2.iloc[:-3, :]
df3 = df.merge(df2, how='outer', on='A')
df3[['O', 'P']] = df3[['O', 'P']].fillna(-1).astype(int).replace(-1, np.nan)
df3['B'] = df3.apply(lambda x: 18 if x['B'] == 10 and x['O'] == 1 and x['P'] == 0 else (
    19 if x['B'] == 10 and x['O'] == 1 and x['P'] == 1 else (
        20 if x['B'] == 10 and x['O'] == 0 and x['P'] == 0 else x['B'])), axis=1)
df3.to_csv('new_tryout.csv')
It finishes in less than a second, so it is far better than iterrows or itertuples.
The new CSV file includes both the tryout file and the density file; they are merged on the first column of the tryout file (the ID, I guess).
I didn't check all of this very big file, but from the few random spots I checked, it seems to work.

Count Not Counting Properly

I've scraped a bunch of words from the dictionary, and created a massive CSV file with all of them, one word per row.
I have another function, which reads from that massive CSV file, and then creates smaller CSV files.
The function is supposed to create CSV files with only 500 words/rows, but something is amiss. The first file has 501 words/rows. The rest of the files have 502 words/rows.
Man, maybe I'm tired, but I can't seem to spot what exactly is causing this in my code. Or is there nothing wrong with my code at all?
Below is the part of the function that I'm assuming is causing the problem. The full function can be seen below that.
Suspect Part of Function
def create_csv_files():
    limit = 500
    count = 0
    filecount = 1
    zfill = 3
    filename = 'C:\\Users\\Anthony\\Desktop\\Scrape\\Dictionary\\terms{}.csv'.format('1'.zfill(zfill))
    with open('C:\\Users\\Anthony\\Desktop\\Scrape\\Results\\dictionary.csv') as readfile:
        csvReader = csv.reader(readfile)
        for row in csvReader:
            term = row[0]
            if ' ' in term:
                term = term.replace(' ', '')
            if count <= limit:
                count += 1
            else:
                count = 0
                filecount += 1
                filename = 'C:\\Users\\Anthony\\Desktop\\Scrape\\Dictionary\\terms{}.csv'.format(str(filecount).zfill(zfill))
            aw = 'a' if os.path.exists(filename) else 'w'
            with open(filename, aw, newline='') as writefile:
                fieldnames = ['term']
                writer = csv.DictWriter(writefile, fieldnames=fieldnames)
                writer.writerow({
                    'term': term
                })
The Whole Function
def create_csv_files():
    limit = 500
    count = 0
    filecount = 1
    zfill = 3
    idiomsfilename = 'C:\\Users\\Anthony\\Desktop\\Scrape\\Dictionary\\idioms.csv'
    filename = 'C:\\Users\\Anthony\\Desktop\\Scrape\\Dictionary\\terms{}.csv'.format('1'.zfill(zfill))
    with open('C:\\Users\\Anthony\\Desktop\\Scrape\\Results\\dictionary.csv') as readfile:
        csvReader = csv.reader(readfile)
        for row in csvReader:
            term = row[0]
            if 'idiom' in row[0] and row[0] != ' idiom':
                term = row[0][:-5]
                aw = 'a' if os.path.exists(idiomsfilename) else 'w'
                with open(idiomsfilename, aw, newline='') as idiomsfile:
                    idiomsfieldnames = ['idiom']
                    idiomswriter = csv.DictWriter(idiomsfile, fieldnames=idiomsfieldnames)
                    idiomswriter.writerow({
                        'idiom': term
                    })
                continue
            else:
                if ' ' in term:
                    term = term.replace(' ', '')
                if count <= limit:
                    count += 1
                else:
                    count = 0
                    filecount += 1
                    filename = 'C:\\Users\\Anthony\\Desktop\\Scrape\\Dictionary\\terms{}.csv'.format(str(filecount).zfill(zfill))
                aw = 'a' if os.path.exists(filename) else 'w'
                with open(filename, aw, newline='') as writefile:
                    fieldnames = ['term']
                    writer = csv.DictWriter(writefile, fieldnames=fieldnames)
                    writer.writerow({
                        'term': term
                    })
                print(term)
So the reason the files have a weird number of rows is your if-else conditions.
You increment count when count is less than or equal to limit. On the very first iteration you increment to 1 and then write your first term, and so on. Because you use <= instead of the strict inequality, you still increment at count = 500 and write the 501st word before the counter is reset.
From the second file onwards, the first word is written on the iteration that resets count to 0, and the next reset only happens after count has again passed 500, so each of those files gets 502 words.
To fix this, check for count >= limit, and create a new file if so. Increment count after you write to the CSV file and not before. That should help.
def create_csv_files():
    limit = 500
    count = 0
    filecount = 1
    zfill = 3
    filename = 'C:\\Users\\Anthony\\Desktop\\Scrape\\Dictionary\\terms{}.csv'.format('1'.zfill(zfill))
    with open('C:\\Users\\Anthony\\Desktop\\Scrape\\Results\\dictionary.csv') as readfile:
        csvReader = csv.reader(readfile)
        for row in csvReader:
            term = row[0]
            if ' ' in term:
                term = term.replace(' ', '')
            # Remove if and keep else
            if count >= limit:
                count = 0
                filecount += 1
                filename = 'C:\\Users\\Anthony\\Desktop\\Scrape\\Dictionary\\terms{}.csv'.format(str(filecount).zfill(zfill))
            aw = 'a' if os.path.exists(filename) else 'w'
            with open(filename, aw, newline='') as writefile:
                fieldnames = ['term']
                writer = csv.DictWriter(writefile, fieldnames=fieldnames)
                writer.writerow({
                    'term': term
                })
            count += 1  # Increment here

When I try to call a Python function, the main module where I call it restarts

I have a problem with my Python code: when I call the function from another script, the script where I am calling the function from restarts, and the interpreter gives this error message:
RuntimeError:
An attempt has been made to start a new process before the
current process has finished its bootstrapping phase.
This probably means that you are not using fork to start your
child processes and you have forgotten to use the proper idiom
in the main module:
if __name__ == '__main__':
freeze_support()
...
The "freeze_support()" line can be omitted if the program
is not going to be frozen to produce an executable.
The script where I launch the code is this:
finestre = creatore_finestre()
print(finestre[0])
The code of the function is:
# imports used below (from the rest of the script); SlidingWindows is defined elsewhere in the project
import csv
import datetime
import math
from collections import defaultdict
import wfdb

DIR_DATA = '../../../data/'

SIGNALS_INDEX = {
    'HR': 0,
    'ABPSys': 1,
    'ABPDias': 2,
    'ABPMean': 3,
    'CVP': 4,
    'PULSE': 5,
    'RESP': 6,
    'SpO2': 7,
    'NBPSys': 8,
    'NBPDias': 9,
    'NBPMean': 10,
}

def download_information_database(id_patient):
    wfdb.dldatabase('mimic2db/numerics', DIR_DATA + id_patient, records=[id_patient])

def create_csv(id_patient, signal):
    # Download the patient information
    download_information_database(id_patient)
    # Firstly, we read the patient information
    patient_dir = DIR_DATA + id_patient + "/"
    record = wfdb.rdsamp(patient_dir + id_patient, channels=[SIGNALS_INDEX[signal]])
    # We calculate the datetime base
    date_str = record.basedate + ' ' + record.basetime
    date = datetime.datetime.strptime(date_str, '%d/%m/%Y %H:%M:%S')
    # We read the signal values
    signal = record.p_signals
    # We write the csv file
    with open(patient_dir + id_patient + '.csv', 'w+') as csvfile:
        writer = csv.writer(csvfile, delimiter=',', lineterminator="\n")
        for s in signal:
            date = date + datetime.timedelta(minutes=1)
            if not math.isnan(float(s)):
                writer.writerow([date.strftime("'[%H:%M:%S %d/%m/%Y]'"), str(int(s[0]) * 1000)])

def creatore_finestre():
    # Open the file of information for each patient
    in_file = open("../../../data/CodePatientsTraining.csv", "r+")
    lettore_file = csv.reader(in_file)
    # Create dictionary of lists
    finestre = defaultdict(list)
    for nomeFile in lettore_file:
        print(nomeFile[0])
        create_csv(nomeFile[0], "ABPMean")
        f = open("../../../data/" + nomeFile[0] + "/" + nomeFile[0] + ".csv", "r+")
        reader = csv.reader(f)
        line = in_file.readline()
        lista = list(reader)
        i = 0
        for timestamp in lista:
            if timestamp[0] != nomeFile[1]:
                i += 1
            else:
                print(timestamp[0], nomeFile[1], i)
                break
        decade = 0
        somma = 0
        arrivo = 1
        minute = 10
        while i != 0:
            i -= 1
            if lista[i][1] != '0' and lista[i][1] != '-' and int(lista[i][1]) > 0:
                somma += float(lista[i][1])
                decade += 1
            if decade == minute:
                f = SlidingWindows((somma / minute), nomeFile[4])
                finestre[arrivo].append(f)
                print("T[" + str(arrivo) + "]:")
                for value in finestre[arrivo]:
                    print(value)
                decade = 0
                arrivo += 1
                somma = 0
    return finestre
My idea is to create a SlidingWindows object for each CSV file in the function, and collect all the sliding windows from the other script.
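With the 'spawn' start method (the default on Windows), multiprocessing re-imports the main module in every child process, so any top-level code that directly or indirectly starts new processes runs again before the parent has finished its bootstrapping, which is exactly the RuntimeError shown above. The usual fix is to guard the entry point; assuming the two calling lines live in the script you run directly, something like:
if __name__ == '__main__':
    finestre = creatore_finestre()
    print(finestre[0])
The same guard in general form: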
import your_module

if __name__ == '__main__':
    extractor = your_module.your_function()
    extractor.runInParallel(numProcesses=2, numThreads=4)

Parsing a big text file, extract data & store it in a CSV file.. Too Slow

I have a big log file (say 1-3 GB) which I need to parse, extract data from, and save to a CSV file.
Text File Data
* D:40035FC8 wr-long 00000008 \\core0\Global\u4TimeHiCnt 1.000us
* D:40027C5C rd-byte 00 *core0\Global\Ypf_OILL_OilLvlOn 20.342us
* D:40010044 rd-word 0FE2 *l\u2SAD_OILLVS_RecoveryCounter 0.160us
* D:40010044 wr-word 0FE1 *l\u2SAD_OILLVS_RecoveryCounter 0.040us
* D:40035FC8 wr-long 00000008 \\core0\Global\u4TimeHiCnt 1.000us
I have to extract the variable name, which comes after the last \, along with the number of reads and writes and the datatype, and store it all in a CSV file.
CSV File Result
Variable           Datatype   CORE 0          CORE 1           CORE X
                              Read    Write   Read     Write   Read   Write
OS_inKernel        byte       0       0       111768   111878  0      0
OS_globalIntLevel  long       0       0       281604   237901  0      0
The problem is that it takes too much time. Can you please look into the attached code and suggest ways to make it faster?
import string
import sys
import time

MyFile = open("C:\\Users\\AEC_FULL\\Saravanan\\Workspace\\Trace32Log_Parser\\core1_sram_ReadWrite.txt")  # core0_sram_ReadWrite_rawdata
GeneratedFile = open(str(("C:\\Users\\AEC_FULL\\Saravanan\\Workspace\\Trace32Log_Parser\\") + 'ParsedOutput.csv'), 'w')

try:
    MyVariableList = []
    TimeStartTest = time.time()  # Starting Time
    GeneratedFile.write('\nVariable')
    GeneratedFile.write(', Datatype')
    GeneratedFile.write(', CORE 0')
    GeneratedFile.write(',, CORE 1')
    GeneratedFile.write(',, CORE X')
    GeneratedFile.write('\n,, Read ')
    GeneratedFile.write(', Write ')
    GeneratedFile.write(', Read ')
    GeneratedFile.write(', Write ')
    GeneratedFile.write(', Read ')
    GeneratedFile.write(', Write ')
    GeneratedFile.write('\n')
    for CurrentLine in MyFile:
        NoofSpaces = 0
        if CurrentLine.find('\\') != -1:
            MyVariable = CurrentLine[CurrentLine.rfind('\\') + 1:].split(' ')[0]
        elif CurrentLine.find('*\\') != -1:
            MyVariable = CurrentLine[CurrentLine.rfind('*\\') + 1:].split(' ')[0]
        elif CurrentLine.find('*') != -1:
            MyVariable = CurrentLine[CurrentLine.rfind('*') + 1:].split(' ')[0]
        VariableFound = 0
        MyVariableList.sort()
        Lowerbound = 0
        Upperbound = len(MyVariableList) - 1
        while Lowerbound <= Upperbound and VariableFound == 0:
            middle_pos = (Lowerbound + Upperbound) // 2
            if MyVariableList[middle_pos] < MyVariable:
                Lowerbound = middle_pos + 1
            elif MyVariableList[middle_pos] > MyVariable:
                Upperbound = middle_pos - 1
            else:
                VariableFound = 1
        if VariableFound == 0:
            MyVariableList.append(MyVariable)
            try:
                MyFile1 = open("C:\\Users\\AEC_FULL\\Saravanan\\Workspace\\Trace32Log_Parser\\core1_sram_ReadWrite.txt")  # core0_sram_ReadWrite_rawdata
                Core0_ReadCount = 0
                Core0_WriteCount = 0
                Core1_ReadCount = 0
                Core1_WriteCount = 0
                CoreX_ReadCount = 0
                CoreX_WriteCount = 0
                for CurrentLine1 in MyFile1:
                    if CurrentLine1.find(MyVariable) != -1:
                        ## CORE 0 ##
                        if CurrentLine1.find("0\\Global") != -1:
                            DataType = CurrentLine1.split(' ')[0].split('-')[1]
                            DataOperation = CurrentLine1.split(' ')[0].split('-')[0].split(' ')[-1]
                            if DataOperation == 'rd':
                                Core0_ReadCount = Core0_ReadCount + 1
                            elif DataOperation == 'wr':
                                Core0_WriteCount = Core0_WriteCount + 1
                        ## CORE 1 ##
                        elif CurrentLine1.find("1\\Global") != -1:
                            DataType = CurrentLine1.split(' ')[0].split('-')[1]
                            DataOperation = CurrentLine1.split(' ')[0].split('-')[0].split(' ')[-1]
                            if DataOperation == 'rd':
                                Core1_ReadCount = Core1_ReadCount + 1
                            elif DataOperation == 'wr':
                                Core1_WriteCount = Core1_WriteCount + 1
                        ## CORE X ##
                        else:
                            DataType = CurrentLine1.split(' ')[0].split('-')[1]
                            DataOperation = CurrentLine1.split(' ')[0].split('-')[0].split(' ')[-1]
                            if DataOperation == 'rd':
                                CoreX_ReadCount = CoreX_ReadCount + 1
                            elif DataOperation == 'wr':
                                CoreX_WriteCount = CoreX_WriteCount + 1
                GeneratedFile.write('\n %s' % MyVariable)
                GeneratedFile.write(', %s' % DataType)
                GeneratedFile.write(', %d' % Core0_ReadCount)
                GeneratedFile.write(', %d' % Core0_WriteCount)
                GeneratedFile.write(', %d' % Core1_ReadCount)
                GeneratedFile.write(', %d' % Core1_WriteCount)
                GeneratedFile.write(', %d' % CoreX_ReadCount)
                GeneratedFile.write(', %d' % CoreX_WriteCount)
                GeneratedFile.write('\n')
            finally:
                MyFile1.close()
except:
    print(sys.exc_info())
finally:
    GeneratedFile.close()
    MyFile.close()
    TimeStopTest = time.time()
    print(str(int((TimeStopTest - TimeStartTest) / 60)))
You'd better use the with statement, like this:
# if this file is line based
with open('test.txt') as f:
    for line in f:
        pass  # process line, do something with line
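The with statement helps with cleanup, but the bigger cost in the posted code is re-reading the whole log once for every new variable, so the work grows with (number of lines x number of variables). A single pass that accumulates counts in a dictionary avoids that; a rough sketch, with the line pattern and the output layout simplified from the samples above:
import csv
import re
from collections import defaultdict

LOG_PATH = 'core1_sram_ReadWrite.txt'
OUT_PATH = 'ParsedOutput.csv'

# counts[variable] = {'type': datatype, 'c0': [reads, writes], 'c1': [...], 'cx': [...]}
counts = defaultdict(lambda: {'type': '', 'c0': [0, 0], 'c1': [0, 0], 'cx': [0, 0]})

# e.g. "* D:40035FC8 wr-long 00000008 \\core0\Global\u4TimeHiCnt 1.000us"
line_re = re.compile(r'(rd|wr)-(\w+)\s+\S+\s+\S*\\(\S+)')

with open(LOG_PATH) as log:
    for line in log:
        m = line_re.search(line)
        if not m:
            continue
        op, datatype, variable = m.groups()   # variable is the name after the last backslash
        if '0\\Global' in line:
            core = 'c0'
        elif '1\\Global' in line:
            core = 'c1'
        else:
            core = 'cx'
        entry = counts[variable]
        entry['type'] = datatype
        entry[core][0 if op == 'rd' else 1] += 1

with open(OUT_PATH, 'w', newline='') as out:
    writer = csv.writer(out)
    writer.writerow(['Variable', 'Datatype', 'Core0 Read', 'Core0 Write',
                     'Core1 Read', 'Core1 Write', 'CoreX Read', 'CoreX Write'])
    for var, e in sorted(counts.items()):
        writer.writerow([var, e['type'], *e['c0'], *e['c1'], *e['cx']])
This reads the log exactly once regardless of how many variables it contains, and the counting itself is just dictionary updates.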
