How to Make an Automatically Panning Matplotlib Function? - python

So I have been trying to make a function in Python that makes my graph automatically pan rather than using the buttons on the Matplotlib graph.
(the MultiScatter function)
The way the code works is that it looks at a file and takes the numbers inside and turns them into arrays with x being each "beat" and y being the different types of numbers. And I want to take those arrays and turn those into a graph that automatically pans from the first beat to the last one.
from tkinter import *
from tkinter import ttk
from tkinter import filedialog
from PIL import ImageTk, Image
import numpy as np
from matplotlib.axis import Axis
import matplotlib.pyplot as plt
import os
import glob
import time
# nameDirections lists the arrow types being measured, in column order.
nameDirections = ["left", "down", "up", "right"]
# directions holds how many times each arrow type occurs in the step file
# of a Dance Dance Revolution style game.
directions = []
beats = 0

# Main window: fixed 600x400 with a single text entry packed at the top.
root = Tk()
root.title("Insert GUI Title Here")
root.geometry('600x400')
root.resizable(False, False)
e = Entry(root, width=50)
e.pack()
#Size of GUI
def openFolder():
    """Ask the user to pick a directory and return the chosen path."""
    return filedialog.askdirectory()
def openSmFile():
    """Ask the user to pick a file (normally a ``.sm`` chart) and return its path."""
    return filedialog.askopenfilename()
#Opens an SM file (Stepmania code file)
def checkDirections():
    """Print per-arrow totals for every ``.sm`` file in a user-chosen folder.

    For each chart the number of left/down/up/right steps and the number of
    note rows ("beats") is printed to stdout. Nothing is returned.
    """
    folPath = openFolder()
    if not folPath:
        # Dialog was cancelled; without this guard the glob below would
        # expand to '/*' and walk the filesystem root.
        return
    for fpath in glob.iglob(f'{folPath}/*'):
        if not fpath.endswith('.sm'):
            continue
        totals = [0, 0, 0, 0]  # left, down, up, right
        beats = 0
        # "with" closes each chart; previously one handle leaked per file.
        with open(fpath, "r") as sm_file:
            for line in sm_file:
                # A note row is four step characters plus the newline;
                # header, separator and timing lines are skipped.
                if len(line) != 5 or any(ch in line for ch in ",.#;-"):
                    continue
                for column, step in enumerate(line[:4]):
                    if step != "0":
                        totals[column] += 1
                beats += 1
        left, down, up, right = totals
        print("The file name is " + str(fpath))
        print("There are " + str(left) + " lefts in this song.")
        print("There are " + str(down) + " downs in this song.")
        print("There are " + str(up) + " ups in this song.")
        print("There are " + str(right) + " rights in this song.")
        print("There are " + str(beats) + " beats.")
#Prints number of each type of arrow. This was taken from code I wrote earlier and I just changed it up for the near-identical function below that returns instead
#Because i was too lazy to make a good solution
def graph(thing):
    """Plot the arrow totals of a user-chosen ``.sm`` file.

    thing selects the chart type: "bar", "pie" or "scatter"; any other value
    only sets the title. The caller is expected to call ``plt.show()``.
    Prints a message and plots nothing when the chosen file is not ``.sm``.
    """
    fpath = openSmFile()
    if not fpath.endswith('.sm'):
        print("This file is not valid.")
        return
    totals = [0, 0, 0, 0]  # left, down, up, right
    with open(fpath, "r") as sm_file:
        for line in sm_file:
            # Only four-character note rows count. Without the length check
            # (as used by the sibling functions) blank lines were tallied as
            # "left" arrows, because "\n" != "0".
            if len(line) != 5 or any(ch in line for ch in ",.#;-"):
                continue
            for column, step in enumerate(line[:4]):
                if step != "0":
                    totals[column] += 1
    directions = totals
    plt.title(str(os.path.basename(fpath)))
    if thing == "bar":
        plt.bar(nameDirections, directions)
    elif thing == "pie":
        plt.pie(directions, labels=nameDirections, autopct='%1.1f%%',shadow=True, startangle=90)
    elif thing == "scatter":
        plt.scatter(directions, nameDirections)
def ScatterTime(text):
    """Scatter-plot the arrows of a ``.sm`` chart: x = note row, y = column.

    text is the chart path; an empty string opens a file dialog instead.
    Only the first 251 lines of the file are read. y is 0..3 for
    left/down/up/right. The caller is expected to call ``plt.show()``.
    """
    fpath = openSmFile() if text == "" else text
    if not fpath.endswith('.sm'):
        print("This file is not valid.")
        return
    xs = []
    ys = []
    beat = 0  # index of the current note row
    # "with" guarantees the chart file is closed (it used to leak).
    with open(fpath, "r") as sm_file:
        for line_number, line in enumerate(sm_file):
            if line_number > 250:
                break
            # Skip everything that is not a four-character note row.
            if len(line) != 5 or any(ch in line for ch in ",.#;-:"):
                continue
            for column, step in enumerate(line[:4]):
                if step != "0":
                    xs.append(beat)
                    ys.append(column)
            beat += 1
    plt.title(str(os.path.basename(fpath)))
    plt.scatter(xs, ys)
#Creates a scatterplot
def returnCounter(thing):
    """Return the number of lines in the ``.sm`` file *thing*, capped at 251.

    Prints a message and returns None when the path does not end in ``.sm``.
    """
    if not thing.endswith('.sm'):
        print("This file is not valid.")
        return None
    counter = 0
    # "with" closes the file; the previous version never closed it.
    with open(thing, "r") as sm_file:
        for _ in sm_file:
            if counter > 250:
                break
            counter += 1
    return counter
def returnSmFile():
    """Prompt for a chart file and hand back whatever path was chosen."""
    chosen = openSmFile()
    return chosen
def _collect_arrow_points(fpath):
    """Parse the chart at *fpath* and return ``(xs, ys)`` for every arrow.

    xs holds the note-row index of each arrow and ys its column
    (0 left, 1 down, 2 up, 3 right). The whole file is read.
    """
    xs = []
    ys = []
    beat = 0
    with open(fpath, "r") as sm_file:
        for line in sm_file:
            # A note row is four step characters plus the newline.
            if len(line) != 5 or any(ch in line for ch in ",.#;-:"):
                continue
            for column, step in enumerate(line[:4]):
                if step != "0":
                    xs.append(beat)
                    ys.append(column)
            beat += 1
    return xs, ys


def returnXs(text):
    """Return the note-row index of every arrow in a ``.sm`` chart.

    text is the chart path; an empty string opens a file dialog instead.
    Prints a message and returns None when the path does not end in ``.sm``.
    This accessor no longer touches the current matplotlib figure: setting a
    title here put it on a stray implicit figure, not on the caller's plot.
    """
    fpath = openSmFile() if text == "" else text
    if not fpath.endswith('.sm'):
        print("This file is not valid.")
        return None
    return _collect_arrow_points(fpath)[0]


def returnYs(text):
    """Return the column (0 left, 1 down, 2 up, 3 right) of every arrow.

    Same arguments and invalid-file handling as ``returnXs``; the two lists
    line up element by element.
    """
    fpath = openSmFile() if text == "" else text
    if not fpath.endswith('.sm'):
        print("This file is not valid.")
        return None
    return _collect_arrow_points(fpath)[1]
#Creates a scatterplot
def testFunction():
    """Debug helper: print each note row of a chosen ``.sm`` file and its length."""
    fpath = openSmFile()
    if not fpath.endswith('.sm'):
        print("This file is not valid.")
        return
    # "with" closes the file; the previous version never closed it.
    with open(fpath, "r") as sm_file:
        for line in sm_file:
            if len(line) == 5 and not any(ch in line for ch in ",.#;-:"):
                print(line)
                print(len(line))
#Not relevant. Tests arrays in function
def bar():
    """Button callback: draw the arrow totals as a bar chart and display it."""
    graph("bar")
    plt.show()
#Creates a bar graph
def ILovePie():
    """Button callback: draw the arrow totals as a pie chart and display it."""
    graph("pie")
    plt.show()
def Scatter():
    """Button callback: prompt for a chart, scatter-plot it and display it."""
    ScatterTime("")
    plt.show()
def multiScatter():
    """Show the chart's scatterplot and pan it from the first beat to the last.

    One figure is drawn once; the visible x-range is then slid along the data
    in small steps. ``plt.pause`` redraws the figure and runs the GUI event
    loop between steps, so the window stays responsive. The previous version
    called the blocking ``plt.show()`` inside the loop, compared floats with
    ``!=`` (so the loop never ended) and opened a new figure per iteration.
    """
    sm = returnSmFile()
    if not sm:
        # Dialog cancelled; passing "" on would reopen the dialog twice more.
        return
    Ys = returnYs(sm)
    Xs = returnXs(sm)
    if not Xs or not Ys:
        # Not a .sm file, or a chart without any arrows: nothing to pan over.
        return
    window = 16  # number of beats visible at any one time
    fig, ax = plt.subplots(figsize=[10, 3])
    ax.set_title(str(os.path.basename(sm)))
    ax.scatter(Xs, Ys)
    left = min(Xs)
    # Stop once the right edge of the view reaches the last beat.
    final_left = max(max(Xs) - window, left)
    while left <= final_left and plt.fignum_exists(fig.number):
        ax.set_xlim(left, left + window)
        plt.pause(0.2)
        left += 1
    # Keep the final view open until the user closes the window.
    plt.show()
# One button per action; the widget names stay available as module globals.
barGraph = Button(root, text="Click to show a bar graph", command=bar)
pieGraph = Button(root, text = "Click to show a pie graph", command = ILovePie)
runThrough = Button(root, text="Click to print number of each arrow", command=checkDirections)
scatterGraph = Button(root, text = "Click to show a scatterplot", command = Scatter)
testButton = Button(root, text = "Test", command = testFunction)
multiButton = Button(root, text = "Show each section of the scatterplot", command = multiScatter)
# Pack in display order (top to bottom), then hand control to Tk.
for widget in (barGraph, runThrough, pieGraph, scatterGraph, testButton, multiButton):
    widget.pack()
root.mainloop()
#Creates the buttons to show the graphs and the run text command

Related

Get values from a txt, increase them and save them in the same txt

I'm trying to get some values from a txt, increase them (until here everything good) and then write the new variables on the file but I can't write the variables on the file although I've changed the variable in a string..
the code is
# Read the "min, max" rows of setup.txt, bump the values, then write the new
# minL back to the same file. (Indentation was lost when this was pasted.)
with open("setup.txt", "r") as f:
for i, line in enumerate(f):
# NOTE(review): binding the name `str` to a list here shadows the builtin
# str(), so the later `str(minL)` no longer refers to the builtin.
str = line.split(",")
# Row 0 holds minL, maxL (each increased by 2).
if i == 0:
minL = int(str[0])
maxL = int(str[1])
minL += 2
maxL += 2
# Row 1 holds minF, maxF (each increased by 1).
elif i == 1:
minF = int(str[0])
maxF = int(str[1])
minF += 1
maxF += 1
# minL is only assigned inside the loop, so it is undefined here if the
# file yielded no rows.
minL = str(minL)
# Opening with "w" truncates setup.txt; only minL is written back.
with open("setup.txt", "w") as f:
f.write(minL)
f.close()
the txt is just:
15, 25
2, 9
EDIT*********
Sorry I just made a mistake when i copied the code, I've already put "w" for the writing mode but this doesn't work
the error is
line 15, in <module>
minL = str(minL)
NameError: name 'minL' is not defined
but I defined minL
Is 17 your expected output?
I copied and modified your code:
# Read the "min, max" rows of setup.txt and bump them: row 0 by 2, row 1 by 1.
with open("setup.txt", "r") as f:
    for i, line in enumerate(f):
        stri = line.split(",")
        if i == 0:
            minL = int(stri[0]) + 2
            maxL = int(stri[1]) + 2
        elif i == 1:
            minF = int(stri[0]) + 1
            maxF = int(stri[1]) + 1
# Overwrite the file with the new minL; the with-block closes it.
minL = str(minL)
with open("setup.txt", "w") as f:
    f.write(minL)
I got error with indentation (which I fixed) and changed str = line.split(",") to stri = line.split(",")

Getting an error because of the " \ " characters

I'm getting:
"unexpected character after line continuation character"
How should I write the `line = line.strip("\xef\xbb\n\xbf")` line without getting that error?
# Merge the rows of the "update" file into the card table from data.txt and
# write the sorted result to "newdata". Python 2 script; indentation was
# lost when it was pasted.
# NOTE(review): none of the three files is explicitly closed.
dataFile = open("data.txt","r")
updateFile = open("update","r")
newFile = open("newdata","w")
dataMatrix = []
updateMatrix = []
cardList = []
# Each data row is "<number> <name ...> <date> <sum>"; the name may contain
# spaces, so it is rebuilt from the middle fields.
for line in dataFile:
# strip() treats its argument as a set of characters: this removes the
# UTF-8 BOM bytes (\xef \xbb \xbf) and newlines from both ends of the line.
line = line.strip("\xef\xbb\n\xbf")
tmp = line.split(" ")
cardNum = tmp[0]
cardName = " ".join(tmp[1:-2])
cardDate = tmp[-2]
cardSum = tmp[-1]
dataMatrix.append([cardNum,cardName,cardDate,cardSum])
# cardList mirrors dataMatrix row order so a card number maps to a row index.
cardList.append(cardNum)
i = 0
updateDate = ""
for line in updateFile:
line = line.strip("\xef\xbb\n\xbf")
# The first line of the update file is taken as the date, not as a card row.
if i==0 : updateDate = line; i=1; continue;
# Update rows are "<number> <name ...> <signed amount>".
tmp = line.split(" ")
upNum = tmp[0]
upName = " ".join(tmp[1:-1])
upSum = tmp[-1]
updateMatrix.append([upNum,upName,upSum])
for row in updateMatrix:
# Known card: apply the signed amount and stamp the update date.
if row[0] in cardList:
index = cardList.index(row[0])
plus = row[2]
# The first character is the sign; anything other than "+" is negative.
if plus[0] == "+":
plus = int(plus[1:])
else:
plus = -int(plus[1:])
curSum = int(dataMatrix[index][3])
newSum = curSum+plus
dataMatrix[index][3] = newSum
dataMatrix[index][2] = updateDate
# dataMatrix[index][]
# Unknown card: add it as a new row, dropping the sign character.
else:
dataMatrix.append([row[0],row[1],updateDate,row[2][1:]])
# Card numbers are strings here, so this sort is lexicographic.
dataMatrix.sort(key=lambda row: row[0])
for row in dataMatrix:
print row
newFile.write(" ".join(str(a) for a in row) + "\n")

python scripts showing different result( with one error ) in two similar input files

The script, originally taken and modified from (http://globplot.embl.de/):
#!/usr/bin/env python
# Copyright (C) 2003 Rune Linding - EMBL
# GlobPlot TM
# GlobPlot is licensed under the Academic Free license
from string import *
from sys import argv
from Bio import File
from Bio import SeqIO
import fpformat
import sys
import tempfile
import os
from os import system,popen3
import math
# Russell/Linding
RL = {'N':0.229885057471264,'P':0.552316012226663,'Q':-0.187676577424997,'A':-0.261538461538462,'R':-0.176592654077609, \
'S':0.142883029808825,'C':-0.0151515151515152,'T':0.00887797506611258,'D':0.227629796839729,'E':-0.204684629516228, \
'V':-0.386174834235195,'F':-0.225572305974316,'W':-0.243375458622095,'G':0.433225711769886,'H':-0.00121743364986608, \
'Y':-0.20750516775322,'I':-0.422234699606962,'K':-0.100092289621613,'L':-0.337933495925287,'M':-0.225903614457831}
def Sum(seq,par_dict):
    """Return the running (cumulative) sum of per-residue parameters.

    seq is iterated residue by residue; par_dict maps a residue to its
    numeric parameter. Residues missing from par_dict contribute 0.
    Returns a list of floats, one per residue, rounded to 10 decimals.
    """
    running = 0
    sums = []
    for residue in seq:
        # dict.get replaces the bare "except:", which hid every error and
        # not just the missing-key case.
        running = running + par_dict.get(residue, 0)
        # round() stands in for float(fpformat.fix(x, 10)); the fpformat
        # module only exists in Python 2.
        sums.append(round(float(running), 10))
    return sums
def _join_and_prune(slices, join_frame, peak_frame):
    """Merge neighbouring slices closer than join_frame, drop ones shorter than peak_frame.

    Works in place on a list of [begin, end] pairs and returns the same list.
    """
    pos = 0
    while pos < len(slices):
        current = slices[pos]
        has_next = pos + 1 < len(slices)
        if has_next and slices[pos + 1][0] - current[1] < join_frame:
            # Absorb the next slice and re-examine the widened one.
            slices[pos] = [current[0], slices[pos + 1][1]]
            del slices[pos + 1]
        elif current[1] - current[0] + 1 < peak_frame:
            del slices[pos]
        else:
            pos += 1
    return slices


def getSlices(dydx_data, DOM_join_frame, DOM_peak_frame, DIS_join_frame, DIS_peak_frame):
    """Split a derivative curve into DOM (negative) and DIS (positive) runs.

    Returns (DOMslices, DISslices), each a list of inclusive [begin, end]
    index pairs. A run is closed by a value of the opposite sign; zeros
    extend whichever run is open. Runs are then joined and pruned using the
    matching join/peak frame sizes.
    """
    dom_slices = []
    dis_slices = []
    dom_open = False
    dis_open = False
    dom_begin = dom_end = 0
    dis_begin = dis_end = 0
    for pos, slope in enumerate(dydx_data):
        rising = slope > 0
        falling = slope < 0
        # First close or extend whatever run is currently open ...
        if dom_open and rising:
            dom_slices.append([dom_begin, dom_end])
            dom_open = False
        elif dis_open and falling:
            dis_slices.append([dis_begin, dis_end])
            dis_open = False
        elif dom_open:
            dom_end += 1
        elif dis_open:
            dis_end += 1
        # ... then start a new run if this value begins one.
        if rising and not dis_open:
            dis_begin = dis_end = pos
            dis_open = True
        elif falling and not dom_open:
            dom_begin = dom_end = pos
            dom_open = True
    # Flush a run that is still open at the end of the data.
    if dom_open:
        dom_slices.append([dom_begin, dom_end])
    if dis_open:
        dis_slices.append([dis_begin, dis_end])
    _join_and_prune(dom_slices, DOM_join_frame, DOM_peak_frame)
    _join_and_prune(dis_slices, DIS_join_frame, DIS_peak_frame)
    return dom_slices, dis_slices
# Run the external `sav_gol` program over datalist and return its output as
# a list of floats. The values are written one per line to the child's
# stdin; every line the child prints is parsed as one result.
#   window     -- used for both halves of the "-n<window>,<window>" flag
#   derivative -- value of the -D flag
#   datalist   -- iterable of numbers to filter
# Python 2 only: popen3, backtick repr and fpformat do not exist in Python 3.
def SavitzkyGolay(window,derivative,datalist):
SG_bin = 'sav_gol'
# NOTE(review): there is no space between SG_bin and '-D', so the command
# as written starts with "sav_gol-D<derivative>" -- confirm against the
# original script.
stdin, stdout, stderr = popen3(SG_bin + '-D' + str(derivative) + ' -n' + str(window)+','+str(window))
# NOTE(review): if the child exits early (e.g. it rejects its arguments),
# these writes go to a closed pipe and raise IOError.
for data in datalist:
stdin.write(`data`+'\n')
try:
stdin.close()
except:
print stderr.readlines()
results = stdout.readlines()
stdout.close()
SG_results = []
# Each output line is fixed to 6 decimals before conversion to float.
for result in results:
SG_results.append(float(fpformat.fix(result,6)))
return SG_results
def reportSlicesTXT(slices, sequence, maskFlag):
    """Re-case *sequence* according to *slices* and build a coordinate label.

    maskFlag 'DOM' lower-cases each slice, upper-cases the gaps between
    slices and lower-cases the tail after the last slice. 'DIS' upper-cases
    each slice and leaves the gaps and tail untouched. Any other flag raises
    SystemExit. The text before the first slice is always copied unchanged.
    Returns (masked_sequence, label) where label is the flag header followed
    by comma-separated, 1-based "begin-end" pairs.
    """
    headers = {'DOM': '|GlobDoms:', 'DIS': '|Disorder:'}
    if maskFlag not in headers:
        raise SystemExit
    header = headers[maskFlag]
    if slices == []:
        # By default the sequence is in uppercase, which is the search space.
        return sequence, header

    def unchanged(text):
        return text

    def lowered(text):
        return text.lower()

    def uppered(text):
        return text.upper()

    if maskFlag == 'DOM':
        in_slice, in_gap, in_tail = lowered, uppered, lowered
    else:
        in_slice, in_gap, in_tail = uppered, unchanged, unchanged

    first_begin = slices[0][0]
    pieces = [sequence[0:first_begin]] if first_begin > 0 else []
    labels = []
    last_index = len(slices) - 1
    for index, bounds in enumerate(slices):
        begin, end = bounds[0], bounds[1]
        labels.append(str(begin + 1) + '-' + str(end + 1))
        pieces.append(in_slice(sequence[begin:end + 1]))
        if index < last_index:
            # Stretch between this slice and the next one.
            pieces.append(in_gap(sequence[end + 1:slices[index + 1][0]]))
        elif end < len(sequence) - 1:
            # Remainder after the last slice.
            pieces.append(in_tail(sequence[end + 1:len(sequence)]))
    return ''.join(pieces), header + ', '.join(labels)
def runGlobPlot():
    """Command-line entry point: mask every FASTA record and print the result.

    Reads five integer frame sizes and a FASTA path from sys.argv. For each
    record it prints a '>' header line with the DOM and DIS coordinates,
    then the re-cased sequence. Prints usage and raises SystemExit when the
    arguments are missing, not integers, or the file cannot be opened.
    """
    try:
        smoothFrame = int(sys.argv[1])
        DOM_joinFrame = int(sys.argv[2])
        DOM_peakFrame = int(sys.argv[3])
        DIS_joinFrame = int(sys.argv[4])
        DIS_peakFrame = int(sys.argv[5])
        fasta_path = str(sys.argv[6])
        db = open(fasta_path,'r')
    except (IndexError, ValueError, IOError):
        print('Usage:')
        print(' ./GlobPipe.py SmoothFrame DOMjoinFrame DOMpeakFrame DISjoinFrame DISpeakFrame FASTAfile')
        print(' Optimised for ELM: ./GlobPlot.py 10 8 75 8 8 sequence_file')
        print(' Webserver settings: ./GlobPlot.py 10 15 74 4 5 sequence_file')
        raise SystemExit
    try:
        for cur_record in SeqIO.parse(db, "fasta"):
            # uppercase is the search space
            seq = str(cur_record.seq).upper()
            sum_vector = Sum(seq,RL)
            # Pass the integer window size. The previous code passed the
            # literal string 'smoothFrame', so the filter was started with
            # "-n smoothFrame,smoothFrame", rejected it and exited; writing
            # to its closed pipe then failed on longer inputs.
            smooth = SavitzkyGolay(smoothFrame,0, sum_vector)  # result currently unused
            dydx_vector = SavitzkyGolay(smoothFrame,1, sum_vector)
            # The first and last smoothFrame points are replaced by half the
            # difference between neighbouring values of the raw sum.
            sumHEAD = sum_vector[:smoothFrame]
            sumTAIL = sum_vector[len(sum_vector)-smoothFrame:]
            newHEAD = []
            newTAIL = []
            for i in range(len(sumHEAD)):
                try:
                    dHEAD = (sumHEAD[i+1]-sumHEAD[i])/2
                except IndexError:
                    # Last point: use the backward difference instead.
                    dHEAD = (sumHEAD[i]-sumHEAD[i-1])/2
                try:
                    dTAIL = (sumTAIL[i+1]-sumTAIL[i])/2
                except IndexError:
                    dTAIL = (sumTAIL[i]-sumTAIL[i-1])/2
                newHEAD.append(dHEAD)
                newTAIL.append(dTAIL)
            dydx_vector[:smoothFrame] = newHEAD
            dydx_vector[len(dydx_vector)-smoothFrame:] = newTAIL
            globdoms, globdis = getSlices(dydx_vector, DOM_joinFrame, DOM_peakFrame, DIS_joinFrame, DIS_peakFrame)
            s_domMask, coordstrDOM = reportSlicesTXT(globdoms, seq, 'DOM')
            s_final, coordstrDIS = reportSlicesTXT(globdis, s_domMask, 'DIS')
            sys.stdout.write('>'+cur_record.id+coordstrDOM+coordstrDIS+'\n')
            print(s_final)
            print('\n')
    finally:
        # The FASTA handle was previously never closed.
        db.close()
    return
runGlobPlot()
My input and output files are here: link
This script takes a input (input1.fa) and gives following output output1.txt
But when I try to run this script with similar type but larger input file (input2.fa) .. It shows following error:
Traceback (most recent call last):
File "final_script_globpipe.py", line 207, in <module>
runGlobPlot()
File "final_script_globpipe.py", line 179, in runGlobPlot
smooth = SavitzkyGolay('smoothFrame',0, sum_vector)
File "final_script_globpipe.py", line 105, in SavitzkyGolay
stdin.write(`data`+'\n')
IOError: [Errno 22] Invalid argument
I have no idea where the problem is. Any type of suggestion is appreciated.
I am using python 2.7 in windows 7 machine. I have also attached the Savitzky Golay module which is needed to run the script.
Thanks
UPDATE:
After trying to reproduce the error on linux it's showing a similar behavior, working fine with the first file but with the second is returning Errno32.
Traceback:
Traceback (most recent call last):
File "Glob.py", line 207, in <module>
runGlobPlot()
File "Glob.py", line 179, in runGlobPlot
smooth = SavitzkyGolay('smoothFrame',0, sum_vector)
File "Glob.py", line 105, in SavitzkyGolay
stdin.write(`data`+'\n')
IOError: [Errno 32] Broken pipe
Update:
Some calls of the SG_bin return that the -n parameter is the wrong type.
Wrong type of parameter for flag -n. Has to be unsigned,unsigned
This parameter comes from the window variable that is passed to the SavitzkyGolay function.
Surrounding the stdin.write call with a try/except block reveals that it breaks a handful of times.
try:
for data in datalist:
stdin.write(repr(data)+'\n')
except:
print "It broke"

Python 'Tuple' object has no attribute 'has_key'

I'm running a piece of freely available python code used to detect CNVs in single cell sequencing data:
#!/usr/bin/env python
import sys
# Count reads per bin from a tab-separated input file and write two reports:
# per-bin counts with a ratio column, and total reads plus the median bin
# count. Arguments: sys.argv[1] input, sys.argv[2] output, sys.argv[3] stats.
# Python 2 code (print statement, long, has_key); indentation was lost.
def main():
infilename = sys.argv[1]
outfilename = sys.argv[2]
statfilename = sys.argv[3]
# NOTE(review): chrominfo and bins are plain (path, int) tuples here, but
# the code below calls chrominfo.has_key(...), indexes chrominfo by
# chromosome name and reads bins[i][2]. They presumably should be
# fileToDictionary(path, 0) and fileToArray(path, 0) -- confirm.
chrominfo = ("/path/hg19.chrom.sizes.txt", 0)
bins = ("/path/hg19.bin.boundaries.50k.bowtie.k50.sorted.txt", 0)
INFILE = open(infilename, "r")
OUTFILE = open(outfilename, "w")
STATFILE = open(statfilename, "w")
# One counter per bin, all starting at zero.
binCounts = []
for i in range(len(bins)):
binCounts.append(0)
print len(binCounts)
print len(bins)
counter = 0
totalReads = 0
prevChrompos = ""
for x in INFILE:
arow = x.rstrip().split("\t")
# Columns 2 and 3 are used as chromosome name and position.
thisChrom = arow[2]
thisChrompos = arow[3]
# Skip names containing "_", "chrM", empty names, and any name that is
# not a key of chrominfo.
if thisChrom.find("_") > -1:
#print thisChrom
continue
if thisChrom == "chrM":
#print thisChrom
continue
if thisChrom == "":
continue
if chrominfo.has_key(thisChrom):
pass
else:
continue
totalReads += 1
thisChrominfo = chrominfo[thisChrom]
# Absolute position = position within the chromosome + column 2 of the
# chromosome's chrominfo row.
thisAbspos = long(thisChrompos) + long(thisChrominfo[2])
counter += 1
# Binary search over bins, comparing against column 2 of each bin row;
# it stops when the bounds are adjacent and credits bin indexDown.
indexUp = len(bins) - 1
indexDown = 0
indexMid = int((indexUp - indexDown) / 2.0)
while True:
if thisAbspos >= long(bins[indexMid][2]):
indexDown = indexMid + 0
indexMid = int((indexUp - indexDown) / 2.0) + indexMid
else:
indexUp = indexMid + 0
indexMid = int((indexUp - indexDown) / 2.0) + indexDown
if indexUp - indexDown < 2:
break
binCounts[indexDown] += 1
prevChrompos = thisChrompos
# Per-bin report: first three bin columns, raw count, and the count
# divided by the mean number of counted reads per bin.
for i in range(len(binCounts)):
thisRatio = float(binCounts[i]) / (float(counter) / float(len(bins)))
OUTFILE.write("\t".join(bins[i][0:3]))
OUTFILE.write("\t")
OUTFILE.write(str(binCounts[i]))
OUTFILE.write("\t")
OUTFILE.write(str(thisRatio))
OUTFILE.write("\n")
# Sorting in place makes the middle element the median bin count.
binCounts.sort()
STATFILE.write("TotalReads\tMedianBinCount\n")
STATFILE.write(str(totalReads))
STATFILE.write("\t")
STATFILE.write(str(binCounts[len(bins)/2]))
STATFILE.write("\n")
INFILE.close()
OUTFILE.close()
STATFILE.close()
def fileToDictionary(inputFile, indexColumn):
    """Load a tab-separated file into a dict keyed by one of its columns.

    Each line is right-stripped and split on tabs; the value stored is the
    whole row (a list of strings) and the key is row[indexColumn]. When a
    key repeats, the first row is kept and the duplicate is reported on
    stdout.
    """
    rd = dict()
    # "with" closes the file even if a row is too short and indexing raises.
    with open(inputFile, "r") as handle:
        for x in handle:
            arow = x.rstrip().split("\t")
            key = arow[indexColumn]
            # "in" replaces dict.has_key(), which no longer exists in
            # Python 3; the local names no longer shadow input() and id().
            if key in rd:
                print("duplicate knowngene id = " + key)
                print("arow = " + str(arow))
                print("rd[id] = " + str(rd[key]))
            else:
                rd[key] = arow
    return rd
def fileToArray(inputFile, skipFirst):
    """Load a tab-separated file into a list of rows.

    The first skipFirst lines are discarded. Every remaining line is
    right-stripped and split on tabs into a list of strings.
    """
    # "with" closes the file on every path; the local name no longer
    # shadows the builtin input().
    with open(inputFile, "r") as handle:
        for _ in range(skipFirst):
            handle.readline()
        return [x.rstrip().split("\t") for x in handle]
if __name__ == "__main__":
main()
I'm getting an error on line 40:
Traceback (most recent call last):
File "/path/varbin.50k.sam.py", line 129, in <module>
main()
File "/path/varbin.50k.sam.py", line 40, in main
**if chrominfo.has_key(thisChrom):
AttributeError: 'tuple' object has no attribute 'has_key'**
I don't work regularly in Python, can someone offer a suggestion?
Where do I begin?
Your code is expecting a dictionary and getting a tuple. I think you've missed a step: You need to change
chrominfo = ("/path/hg19.chrom.sizes.txt", 0)
To
chrominfo = fileToDictionary("/path/hg19.chrom.sizes.txt", 0)
Note also that `if dict.has_key(key)` has been deprecated (and removed entirely in Python 3) in favour of `if key in dict`.

Parsing a big text file, extract data & store it in a CSV file.. Too Slow

I have a big log file (say 1-3 Gb) which I need to parse, extract data & save it in a CSV file.
Text File Data
* D:40035FC8 wr-long 00000008 \\core0\Global\u4TimeHiCnt 1.000us
* D:40027C5C rd-byte 00 *core0\Global\Ypf_OILL_OilLvlOn 20.342us
* D:40010044 rd-word 0FE2 *l\u2SAD_OILLVS_RecoveryCounter 0.160us
* D:40010044 wr-word 0FE1 *l\u2SAD_OILLVS_RecoveryCounter 0.040us
* D:40035FC8 wr-long 00000008 \\core0\Global\u4TimeHiCnt 1.000us
I have to extract the variable name which is after the last \ and then the number of Read & Write along with the datatype & store it in a CSV file.
CSV File Result
Variable Datatype CORE 0 CORE 1 CORE X
Read Write Read Write Read Write
OS_inKernel byte 0 0 111768 111878 0 0
OS_globalIntLevel long 0 0 281604 237901 0 0
The problem is it takes too much time. Can you pls look in to the attached code & suggest ways to make it faster.
import string
import sys
import time
# Parse a Trace32 read/write log and write one CSV row per variable with its
# datatype and read/write counts for CORE 0, CORE 1 and "CORE X".
# Python 2 script (print statement); indentation was lost when it was pasted.
MyFile = open("C:\\Users\\AEC_FULL\\Saravanan\\Workspace\\Trace32Log_Parser\\core1_sram_ReadWrite.txt")#core0_sram_ReadWrite_rawdata
GeneratedFile = open(str(("C:\\Users\\AEC_FULL\\Saravanan\\Workspace\\Trace32Log_Parser\\")+'ParsedOutput.csv'),'w')
try:
MyVariableList = []
TimeStartTest = time.time() #Starting Time
# Two header rows: column groups first, then Read/Write under each core.
GeneratedFile.write('\nVariable')
GeneratedFile.write(', Datatype')
GeneratedFile.write(', CORE 0')
GeneratedFile.write(',, CORE 1')
GeneratedFile.write(',, CORE X')
GeneratedFile.write('\n,, Read ')
GeneratedFile.write(', Write ')
GeneratedFile.write(', Read ')
GeneratedFile.write(', Write ')
GeneratedFile.write(', Read ')
GeneratedFile.write(', Write ')
GeneratedFile.write('\n')
for CurrentLine in MyFile:
NoofSpaces = 0
# The variable name is the text after the last backslash (or after the
# last '*'), up to the next space.
# NOTE(review): the '*\\' branch can never run, because any line that
# contains '*\\' also contains '\\' and is taken by the first branch.
if CurrentLine.find('\\') != -1:
MyVariable = CurrentLine[CurrentLine.rfind('\\')+1:].split(' ')[0]
elif CurrentLine.find('*\\') != -1:
MyVariable = CurrentLine[CurrentLine.rfind('*\\')+1:].split(' ')[0]
elif CurrentLine.find('*') != -1:
MyVariable = CurrentLine[CurrentLine.rfind('*')+1:].split(' ')[0]
# Has this variable been reported already? The list is re-sorted for
# every input line and then binary-searched.
VariableFound = 0
MyVariableList.sort()
Lowerbound = 0
Upperbound = len(MyVariableList)-1
while Lowerbound <= Upperbound and VariableFound == 0:
middle_pos = (Lowerbound+Upperbound) // 2
if MyVariableList[middle_pos] < MyVariable:
Lowerbound = middle_pos + 1
elif MyVariableList[middle_pos] > MyVariable:
Upperbound = middle_pos - 1
else:
VariableFound = 1
if VariableFound == 0:
MyVariableList.append(MyVariable)
# NOTE(review): presumably everything down to the inner `finally` runs only
# for a newly seen variable (original nesting is lost). The whole log is
# reopened and rescanned per new variable, so the run time grows with
# lines x distinct variables -- the likely cause of the slowness.
try:
MyFile1 = open("C:\\Users\\AEC_FULL\\Saravanan\\Workspace\\Trace32Log_Parser\\core1_sram_ReadWrite.txt")#core0_sram_ReadWrite_rawdata
Core0_ReadCount = 0
Core0_WriteCount = 0
Core1_ReadCount = 0
Core1_WriteCount = 0
CoreX_ReadCount = 0
CoreX_WriteCount = 0
for CurrentLine1 in MyFile1:
# Substring match: a name contained in a longer name is counted on the
# longer name's lines as well.
if CurrentLine1.find(MyVariable) != -1:
## CORE 0 ##
if CurrentLine1.find("0\\Global") != -1:
# NOTE(review): on the sample rows shown with the question the first
# space-separated field is '*', which has no '-', so [1] would raise
# IndexError. The separator was probably a wider whitespace run that got
# collapsed in the paste -- confirm.
DataType = CurrentLine1.split(' ')[0].split('-')[1]
DataOperation = CurrentLine1.split(' ')[0].split('-')[0].split(' ')[-1]
if DataOperation == 'rd':
Core0_ReadCount = Core0_ReadCount + 1
elif DataOperation == 'wr':
Core0_WriteCount = Core0_WriteCount + 1
## CORE 1 ##
elif CurrentLine1.find("1\\Global") != -1:
DataType = CurrentLine1.split(' ')[0].split('-')[1]
DataOperation = CurrentLine1.split(' ')[0].split('-')[0].split(' ')[-1]
if DataOperation == 'rd':
Core1_ReadCount = Core1_ReadCount + 1
elif DataOperation == 'wr':
Core1_WriteCount = Core1_WriteCount + 1
## CORE X ##
else:
DataType = CurrentLine1.split(' ')[0].split('-')[1]
DataOperation = CurrentLine1.split(' ')[0].split('-')[0].split(' ')[-1]
if DataOperation == 'rd':
CoreX_ReadCount = CoreX_ReadCount + 1
elif DataOperation == 'wr':
CoreX_WriteCount = CoreX_WriteCount + 1
# One CSV row per variable; DataType is whatever the last matching line set.
GeneratedFile.write('\n %s' %MyVariable)
GeneratedFile.write(', %s' %DataType)
GeneratedFile.write(', %d' %Core0_ReadCount)
GeneratedFile.write(', %d' %Core0_WriteCount)
GeneratedFile.write(', %d' %Core1_ReadCount)
GeneratedFile.write(', %d' %Core1_WriteCount)
GeneratedFile.write(', %d' %CoreX_ReadCount)
GeneratedFile.write(', %d' %CoreX_WriteCount)
GeneratedFile.write('\n')
finally:
MyFile1.close()
# NOTE(review): the bare except only prints exc_info, so a failure part-way
# through still leaves a partial CSV behind.
except:
print sys.exc_info()
finally:
GeneratedFile.close()
MyFile.close()
TimeStopTest = time.time()
# Elapsed time in whole minutes.
print str(int((TimeStopTest - TimeStartTest)/60))
You'd better use with statement, like this:
# if this file is line based
with open('test.txt') as f:
for line in f:
# process line, do something with line

Categories