Python: 'tuple' object has no attribute 'has_key'

I'm running a piece of freely available python code used to detect CNVs in single cell sequencing data:
#!/usr/bin/env python
import sys


def main():
    infilename = sys.argv[1]
    outfilename = sys.argv[2]
    statfilename = sys.argv[3]

    chrominfo = ("/path/hg19.chrom.sizes.txt", 0)
    bins = ("/path/hg19.bin.boundaries.50k.bowtie.k50.sorted.txt", 0)

    INFILE = open(infilename, "r")
    OUTFILE = open(outfilename, "w")
    STATFILE = open(statfilename, "w")

    binCounts = []
    for i in range(len(bins)):
        binCounts.append(0)

    print len(binCounts)
    print len(bins)

    counter = 0
    totalReads = 0
    prevChrompos = ""
    for x in INFILE:
        arow = x.rstrip().split("\t")
        thisChrom = arow[2]
        thisChrompos = arow[3]
        if thisChrom.find("_") > -1:
            #print thisChrom
            continue
        if thisChrom == "chrM":
            #print thisChrom
            continue
        if thisChrom == "":
            continue
        if chrominfo.has_key(thisChrom):
            pass
        else:
            continue
        totalReads += 1
        thisChrominfo = chrominfo[thisChrom]
        thisAbspos = long(thisChrompos) + long(thisChrominfo[2])
        counter += 1
        indexUp = len(bins) - 1
        indexDown = 0
        indexMid = int((indexUp - indexDown) / 2.0)
        while True:
            if thisAbspos >= long(bins[indexMid][2]):
                indexDown = indexMid + 0
                indexMid = int((indexUp - indexDown) / 2.0) + indexMid
            else:
                indexUp = indexMid + 0
                indexMid = int((indexUp - indexDown) / 2.0) + indexDown
            if indexUp - indexDown < 2:
                break
        binCounts[indexDown] += 1
        prevChrompos = thisChrompos

    for i in range(len(binCounts)):
        thisRatio = float(binCounts[i]) / (float(counter) / float(len(bins)))
        OUTFILE.write("\t".join(bins[i][0:3]))
        OUTFILE.write("\t")
        OUTFILE.write(str(binCounts[i]))
        OUTFILE.write("\t")
        OUTFILE.write(str(thisRatio))
        OUTFILE.write("\n")

    binCounts.sort()
    STATFILE.write("TotalReads\tMedianBinCount\n")
    STATFILE.write(str(totalReads))
    STATFILE.write("\t")
    STATFILE.write(str(binCounts[len(bins)/2]))
    STATFILE.write("\n")

    INFILE.close()
    OUTFILE.close()
    STATFILE.close()


def fileToDictionary(inputFile, indexColumn):
    input = open(inputFile, "r")
    rd = dict()
    # input.readline()
    for x in input:
        arow = x.rstrip().split("\t")
        id = arow[indexColumn]
        if rd.has_key(id):
            #rd[id].append(arow)
            print "duplicate knowngene id = " + id
            print "arow = " + str(arow)
            print "rd[id] = " + str(rd[id])
        else:
            rd[id] = arow
    input.close()
    return(rd)


def fileToArray(inputFile, skipFirst):
    input = open(inputFile, "r")
    ra = []
    for i in range(skipFirst):
        input.readline()
    for x in input:
        arow = x.rstrip().split("\t")
        ra.append(arow)
    input.close()
    return(ra)


if __name__ == "__main__":
    main()
I'm getting an error on line 40:
Traceback (most recent call last):
  File "/path/varbin.50k.sam.py", line 129, in <module>
    main()
  File "/path/varbin.50k.sam.py", line 40, in main
    if chrominfo.has_key(thisChrom):
AttributeError: 'tuple' object has no attribute 'has_key'
I don't work regularly in Python; can someone offer a suggestion?
Where do I begin?

Your code is expecting a dictionary and getting a tuple. I think you've missed a step: you need to change
chrominfo = ("/path/hg19.chrom.sizes.txt", 0)
to
chrominfo = fileToDictionary("/path/hg19.chrom.sizes.txt", 0)
Note also that dict.has_key(key) is deprecated (and removed in Python 3) in favour of key in dict.
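For example, a minimal sketch of the fixed lookup (assuming chrominfo is built with the fileToDictionary helper already defined in your script; the bins tuple on the next line presumably needs the same treatment with fileToArray):

chrominfo = fileToDictionary("/path/hg19.chrom.sizes.txt", 0)
bins = fileToArray("/path/hg19.bin.boundaries.50k.bowtie.k50.sorted.txt", 0)

# inside the read loop:
if thisChrom in chrominfo:   # idiomatic replacement for chrominfo.has_key(thisChrom)
    thisChrominfo = chrominfo[thisChrom]
else:
    continue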


FileNotFoundError: [Errno 2] No such file or directory: 'o'

I'm getting this error message when using yield.
When I remove the yield results and yield timeout statements, the code runs fine without the error message.
I don't know what the directory or file 'o' is, since I'm not using it anywhere in the code.
Here is my full code:
import gradio as gr
import ipaddress
import requests
from requests.auth import HTTPBasicAuth
import os
import string
from datetime import date, datetime

#### SETTING UP DATE AND TIME WITH ISRAELI FORMAT ####
current_date = date.today()
current_month = current_date.strftime('%B')
current_year = current_date.strftime('%Y')
date_reformat = current_date.strftime('%d/%m/%y')
current_day = current_date.strftime('%d')

### SWITCH ###
def switch_ver(ip):
    with open('switches_successful_results.txt', 'w') as switches_successful, open('switches_failed_results.txt', 'w') as switches_failed:
        ip_addr = ip.split()
        for i in ip_addr:
            ip_addr = list(ipaddress.ip_network(i))
            try:
                basic = HTTPBasicAuth('some', 'password')
                login = requests.post('http://'+i+':80/rest/v7/login-sessions', auth=basic)
                cookie = login.cookies
                get_ver = requests.get('http://'+i+':80/rest/v7/system/status', cookies=cookie)
                get_ver = get_ver.json()
                get_ver = get_ver['firmware_version']
                with open('switches_successful_results.txt', 'a+') as sw:
                    results = 'Switch version for {} is: {} \n'.format(i, get_ver)
                    sw.write(results)
                yield results
            except requests.exceptions.ConnectTimeout:
                timeout = 'Could not connect to switch: '+i+' REQUEST TIMED OUT\n'
                with open('switches_failed_results.txt', 'a+') as sw:
                    sw.write(timeout)
                yield timeout
    with open('switches_successful_results.txt', 'r') as switches_successful, open('switches_failed_results.txt', 'r') as switches_failed:
        summary = switches_failed.read() + switches_successful.read()
    return (summary), ['switches_successful_results.txt', 'switches_failed_results.txt']

### IPBlocker ###
def block_ip(ip):
    duplicate_ips = []
    blocked_ips = []
    invalid_ips = []
    with open('fortigate_ips.txt', 'r+') as f, open('fortigate_urls.txt', 'r+') as u:
        fortigate_ips = f.read()
        fortigate_urls = u.read()
        ip_addr = ip.split()
        for i in ip_addr:
            try:
                list(ipaddress.ip_network(i))
                if i in fortigate_ips:
                    duplicate_ips.append(i)
                elif ipaddress.ip_address(i).is_private:
                    invalid_ips.append(i)
                else:
                    blocked_ips.append(i)
                    f.write(i + '\n')
            except ValueError:
                if i in fortigate_ips or i in fortigate_urls:
                    duplicate_ips.append(i)
                elif i[0] in string.ascii_letters or i[0] == '*':
                    blocked_ips.append(i)
                    u.write(i + '\n')
                else:
                    invalid_ips.append(i)
    current_time = datetime.now()
    current_time = current_time.strftime('%H:%M:%S')
    if os.path.exists(current_year) == False:
        os.makedirs(current_year + '\\' + current_month + '\\' + current_day)
        os.chdir(current_year + '\\' + current_month + '\\' + current_day)
        with open('Blocked_IPs.txt', 'a+') as Blocked_IPs:
            to_file = ('###############{}###############\n'.format(current_time) + '\n'.join(blocked_ips)) + '\n'
            Blocked_IPs.write(to_file)
        os.chdir('D:\\programs\\Python310\\Projects\\net_sec')
    elif os.path.exists(current_year) == True and os.path.exists(current_year + '\\' + current_month) == False:
        os.chdir(current_year)
        os.makedirs(current_month + '\\' + current_day)
        os.chdir(current_month + '\\' + current_day)
        with open('Blocked_IPs.txt', 'a+') as Blocked_IPs:
            to_file = ('###############{}###############\n'.format(current_time) + '\n'.join(blocked_ips)) + '\n'
            Blocked_IPs.write(to_file)
        os.chdir('D:\\programs\\Python310\\Projects\\net_sec')
    elif os.path.exists(current_year) == True and os.path.exists(current_year + '\\' + current_month) == True and os.path.exists(current_year + '\\' + current_month + '\\' + current_day) == False:
        os.chdir(current_year + '\\' + current_month)
        os.mkdir(current_day)
        os.chdir(current_day)
        with open('Blocked_IPs.txt', 'a+') as Blocked_IPs:
            to_file = ('###############{}###############\n'.format(current_time) + '\n'.join(blocked_ips)) + '\n'
            Blocked_IPs.write(to_file)
        os.chdir('D:\\programs\\Python310\\Projects\\net_sec')
    else:
        os.chdir(current_year + '\\' + current_month + '\\' + current_day)
        with open('Blocked_IPs.txt', 'a+') as Blocked_IPs:
            to_file = ('###############{}###############\n'.format(current_time) + '\n'.join(blocked_ips)) + '\n'
            Blocked_IPs.write(to_file)
        os.chdir('D:\\programs\\Python310\\Projects\\net_sec')
    blocked_ips_result = 'Following IP\s or URLs were Blocked!: \n' + '\n'.join(blocked_ips) + '\n'
    duplicate_ips_result = 'Skipped!...Found duplicates IP\s for: \n' + '\n'.join(duplicate_ips) + '\n'
    invalid_ips_result = 'Skipped!..Invalid IP\s for \n' + '\n'.join(invalid_ips) + '\n'
    with open('fortigate_ips.txt', 'r') as f, open('fortigate_urls.txt', 'r') as u:
        current_commit_stats = len(blocked_ips)
        ips_stats = len(f.readlines())
        urls_stats = len(u.readlines())
        total_stats = ips_stats + urls_stats
    if bool(duplicate_ips) == True and bool(blocked_ips) == False:
        print(1)
        return duplicate_ips_result, current_commit_stats, ips_stats, urls_stats, total_stats
    elif bool(duplicate_ips) == True and bool(blocked_ips) == True and bool(invalid_ips) == True:
        print(2)
        return invalid_ips_result + duplicate_ips_result + blocked_ips_result, current_commit_stats, ips_stats, urls_stats, total_stats
    elif bool(invalid_ips) == True and bool(blocked_ips) == True:
        print(3)
        return invalid_ips_result + blocked_ips_result, current_commit_stats, ips_stats, urls_stats, total_stats
    elif bool(invalid_ips) == True and bool(blocked_ips) == True:
        print(4)
        return invalid_ips_result + blocked_ips_result, current_commit_stats, ips_stats, urls_stats, total_stats
    else:
        print(5)
        return (blocked_ips_result), current_commit_stats, ips_stats, urls_stats, total_stats

### GRADIO GUI ###
#f = open('fortigate_ips.txt', 'r')
#fortigate = (f.read().split())
#f.close()
with gr.Blocks(title='Switcher') as switches_ver:
    gr.Markdown('Welcome to IPBlocker')
    with gr.Tab(label='IPBlocker'):
        with gr.Row():
            with gr.Column():
                ips_to_block = gr.Textbox(label="IPs", lines=10, placeholder=('Please fill Ips to block'))
                block_btn = gr.Button('Block')
                #ip_lookup = gr.Dropdown(fortigate)
            with gr.Column():
                output_textbox = gr.Textbox(label="Results", lines=10)
                with gr.Row():
                    current_commit_stats = gr.Textbox(label='Current IP\s or URLs added to block:')
                    forti_ips_stats = gr.Textbox(label='Total blocked IP\s on Fortigate: ')
                    forti_urls_stats = gr.Textbox(label='Total URLs blocked on Fortigate')
                    forti_total_stats = gr.Textbox(label='Total blocked IP\s and URLs on Fortigate')
        block_btn.click(fn=block_ip, inputs=ips_to_block, outputs=[output_textbox, current_commit_stats, forti_ips_stats, forti_urls_stats, forti_total_stats])
    with gr.Tab(label='Switcher'):
        with gr.Row():
            with gr.Column():
                switch_box = gr.Textbox(label='Switches', lines=10, placeholder='Please fill switches IPs...')
                show_ver = gr.Button('Show current switches version')
                upgrade_ver = gr.Button('Upgrade selected switches')
            with gr.Column():
                output_textbox = gr.Textbox(label='Results', lines=10)
                output_file = gr.File(['switches_successful_results.txt', 'switches_failed_results.txt'])
        show_ver.click(fn=switch_ver, inputs=switch_box, outputs=[output_textbox, output_file])
        upgrade_ver.click(fn=block_ip, inputs=ips_to_block, outputs=[output_textbox, output_file])

switches_ver.queue(concurrency_count=20, max_size=20).launch()
Full error traceback:
Traceback (most recent call last):
  File "D:\programs\Python310\lib\site-packages\gradio\routes.py", line 273, in run_predict
    output = await app.blocks.process_api(
  File "D:\programs\Python310\lib\site-packages\gradio\blocks.py", line 757, in process_api
    predictions = self.postprocess_data(fn_index, result["prediction"], state)
  File "D:\programs\Python310\lib\site-packages\gradio\blocks.py", line 721, in postprocess_data
    block.postprocess(prediction_value)
  File "D:\programs\Python310\lib\site-packages\gradio\components.py", line 2147, in postprocess
    "name": processing_utils.create_tmp_copy_of_file(
  File "D:\programs\Python310\lib\site-packages\gradio\processing_utils.py", line 323, in create_tmp_copy_of_file
    shutil.copy2(file_path, file_obj.name)
  File "D:\programs\Python310\lib\shutil.py", line 434, in copy2
    copyfile(src, dst, follow_symlinks=follow_symlinks)
  File "D:\programs\Python310\lib\shutil.py", line 254, in copyfile
    with open(src, 'rb') as fsrc:
FileNotFoundError: [Errno 2] No such file or directory: 'o'
The 'o' came from the timeout text "Could not connect...".
From what I understand about gradio, the result, for both yield and return, is mapped element by element onto the outputs list, which here is [output_textbox, output_file].
As the yield result is the plain string timeout (and similarly for the results yield case), it gets indexed like any sequence:
output_textbox = timeout[0] = 'C'
output_file = timeout[1] = 'o'
If you want to remove the errors, you should change the yielded value to be compatible with the outputs. For example:
yield timeout, ['switches_successful_results.txt', 'switches_failed_results.txt']
Also note that when you use yield you can iterate over the generator only once; it doesn't keep the data in memory the whole time. Check this out: https://stackoverflow.com/a/231855/17318894
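You can see the unpacking in isolation: a two-component outputs list just takes the first two items of whatever is yielded, so a plain string gets sliced into characters (interpreter demo with a made-up address):

>>> timeout = 'Could not connect to switch: 10.0.0.1 REQUEST TIMED OUT\n'
>>> timeout[0], timeout[1]
('C', 'o')

gradio then treats the second item, 'o', as a file path for output_file, which is exactly the FileNotFoundError in the traceback above.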

Python error: could not convert string to float

We are getting the error below while migrating data from a Slack channel to a file. When we execute the script to fetch one day's data, it executes perfectly.
But when we execute the script for two months of data, it produces ten days of data in separate files and then throws an error on a particular date. It might be that the source data on Slack is a bit different from what the script expects.
Traceback (most recent call last):
  File "C:\Users\Slack SCript\script.py", line 218, in <module>
    main()
  File "C:\Users\Slack SCript\script.py", line 201, in main
    parse(message['text'])
  File "C:\Users\Slack SCript\script.py", line 114, in parse
    size = float(elements[1])
ValueError: could not convert string to float:
Looking at the source data, we found that some value is 0; maybe the error we got is because of this value. Is there any way to skip it and continue?
from slackclient import SlackClient
import time
import os
import sys
import datetime
from dateutil.relativedelta import relativedelta

servers = ("fd2a", "ff1a", "hh3b", "kw1a", "kw1b", "lo8a", "os5a", "os5b", "sg2a", "sg2b", 'sy1a', 'va1a', 'va1b')
types = ("", "nfs", "cluster")
currser = "d"
currtype = ""
used = {}
total = {}
available = {}
ts = 0
dir_name = "data"


def savedata(dir_path, filename, data):
    f = open(dir_path + filename, "w")  # opens file with name of "test.txt"
    print(dir_path + filename)
    f.write(data)
    f.close()


def reset_data():
    print("datareset")
    for i in range(0, len(servers)):
        for j in range(0, len(types)):
            used[servers[i] + types[j]] = 0
            total[servers[i] + types[j]] = 0
            available[servers[i] + types[j]] = 0


def write_data(ts):
    datastr = ''
    global used
    global total
    ttotaltotalsum = 0
    for j in range(0, len(types)):
        datastr += types[j] + '\n'
        datastr += "Name\t" + "Region\t" + "total(TB)\t" + "used(TB)\t" + "available(TB)\t" + "Used(%)\n"
        for i in range(0, len(servers)):
            tused = used[servers[i] + types[j]]
            ttotal = total[servers[i] + types[j]]
            ttotaltotalsum += ttotal
            if (ttotal != 0):
                datastr += (
                    servers[i][0:len(servers[i]) - 1] + "\t\t" +
                    servers[i][len(servers[i]) - 1] + "\t\t" +
                    "{:.1f}".format(ttotal / 1024) + " \t\t" +
                    "{:.1f}".format(tused / 1024) + " \t\t" +
                    "{:.1f}".format((ttotal - tused) / 1024) + "\t\t" +
                    "{:.1f}".format(tused / ttotal * 100) + " \t\t" +
                    " \n")
    print("..")
    if (ttotaltotalsum > 0):
        hour = datetime.datetime.fromtimestamp(int(ts)).hour
        day = datetime.datetime.fromtimestamp(int(ts)).day
        month = datetime.datetime.fromtimestamp(int(ts)).month
        year = datetime.datetime.fromtimestamp(int(ts)).year
        if hour < 12:
            savedata("data/", "Storage-Update-M-" +
                     str(day) + "-" +
                     str(month) + "-" +
                     str(year) + ".txt", datastr)
        else:
            savedata("data/", "Storage-Update-E-" +
                     str(day) + "-" +
                     str(month) + "-" +
                     str(year) + ".txt", datastr)


def parse(text):
    global currser
    global currtype
    global used
    global total
    global available
    global ts
    content = text.split("\n")
    for line in content:
        line = line[:len(line)]
        if line.__contains__("Netapp Cluster"):
            for server in servers:
                if line.__contains__(server):
                    currser = server
            for type in types:
                if line.__contains__(type):
                    currtype = type
            # print(line)
        if line.__contains__("Total available capacity"):
            # print(line)
            # print ("contains","Total available capacity------")
            elements = line.split(":")
            # print (elements)
            size = float(elements[1])
            # print(size)
            total[currser + currtype] += size
            # print(size,"TOTAL capacity",total)
        elif line.__contains__("size provisioned"):
            # print(line)
            # print("contains", "Total LUN size provisioned------- ")
            elements = line.split(":")
            # print(elements)
            size = float(elements[1])
            # print(size)
            used[currser + currtype] += size
            # print(size, "Used", used)
    # print( currser)
    # print( currtype)
    # print( used)
    # print(total)
    # print(available)
    return (used, total)


def make_dir(dir_name):
    if not os.path.exists(dir_name):
        os.makedirs(dir_name)


def main():
    slack_token = ""
    channel_name = ''
    time_on_last_message = time.time()
    channel_id = ""
    ts = 0.000
    threshmins = 20
    channels_call = SlackClient(slack_token).api_call("channels.list")
    print(channels_call)
    print(channels_call.keys())
    for channel in channels_call["channels"]:
        if channel["name"] == channel_name:
            channel_id = channel["id"]
            print(channel)
    make_dir(dir_name)
    print(channel_id)
    reset_data()
    time_since_last_update = time.time() - time_on_last_message
    print("Waiting for new data....", time.time() - time_on_last_message)
    if time_since_last_update > threshmins * 60:
        write_data(ts)
        reset_data()
    sc = SlackClient(slack_token)
    date_after_month = datetime.datetime.now() + relativedelta(months=-6)
    date_after_month = date_after_month.timestamp()
    while True:
        breakflag = 0
        data = sc.api_call(
            "channels.history",
            channel=channel_id,
            oldest=date_after_month,
            count=1000,
        )
        if (data['ok'] == True):
            messages = data['messages']
            for message in reversed(messages):
                # print(message['ts'])
                if float(message['ts']) > ts:
                    print("difference=", float(message['ts']) - ts)
                    if float(message['ts']) - ts > (threshmins * 60):
                        print("greater diffrrece>reset................")
                        write_data(ts)
                        print(ts)
                        reset_data()
                        time_on_last_message = time.time()
                    ts = float(message['ts'])
                    parse(message['text'])
            if (data["has_more"] == True):
                print("has more")
                date_after_month = message['ts']
            else:
                breakflag = 1
        else:
            print("No data returned or error")
        time.sleep(1)  # in Seconds
        if (breakflag == 1):
            break


main()
Based on the error message, elements[1] is empty. And Python cannot convert an empty string to float:
>>> float("")
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
ValueError: could not convert string to float:
The elements[1] element is a string that can't be parsed to a float. The easiest way would be to attach a debugger and investigate what is being parsed. Then change your code to parse it better.
The second easiest way would be to binary search for the record that makes it fail and fix your code to parse it better.
The preferred way would be, once you have found the case your code didn't support, to write a test that proves the case is now handled:
def test_parse_xyz():
    assert [("blablabla", None)] == parse(["blablabla: -certainly_not_a_float"])
These tests can automatically be detected by e.g. pytest:
$ pytest parser.py
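If the goal is simply to skip the bad records and continue, as the question asks, a minimal sketch is to wrap the conversion in try/except inside the parse branches above (this silently drops lines whose value after the ':' is empty or malformed):

if line.__contains__("Total available capacity"):
    elements = line.split(":")
    try:
        size = float(elements[1])
    except ValueError:
        continue  # skip records with an empty or non-numeric size field
    total[currser + currtype] += size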

Getting an error because of the "\" characters

I'm getting:
"unexpected character after line continuation character"
How should I write the line = line.strip("\xef\xbb\n\xbf") line without getting that error?
dataFile = open("data.txt", "r")
updateFile = open("update", "r")
newFile = open("newdata", "w")
dataMatrix = []
updateMatrix = []
cardList = []
for line in dataFile:
    line = line.strip("\xef\xbb\n\xbf")
    tmp = line.split(" ")
    cardNum = tmp[0]
    cardName = " ".join(tmp[1:-2])
    cardDate = tmp[-2]
    cardSum = tmp[-1]
    dataMatrix.append([cardNum, cardName, cardDate, cardSum])
    cardList.append(cardNum)
i = 0
updateDate = ""
for line in updateFile:
    line = line.strip("\xef\xbb\n\xbf")
    if i == 0: updateDate = line; i = 1; continue
    tmp = line.split(" ")
    upNum = tmp[0]
    upName = " ".join(tmp[1:-1])
    upSum = tmp[-1]
    updateMatrix.append([upNum, upName, upSum])
for row in updateMatrix:
    if row[0] in cardList:
        index = cardList.index(row[0])
        plus = row[2]
        if plus[0] == "+":
            plus = int(plus[1:])
        else:
            plus = -int(plus[1:])
        curSum = int(dataMatrix[index][3])
        newSum = curSum + plus
        dataMatrix[index][3] = newSum
        dataMatrix[index][2] = updateDate
        # dataMatrix[index][]
    else:
        dataMatrix.append([row[0], row[1], updateDate, row[2][1:]])
dataMatrix.sort(key=lambda row: row[0])
for row in dataMatrix:
    print row
    newFile.write(" ".join(str(a) for a in row) + "\n")
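For what it's worth, "\xef\xbb\xbf" is the UTF-8 byte order mark, and the strip("\xef\xbb\n\xbf") line is syntactically valid as written; "unexpected character after line continuation character" normally means a bare backslash ended up outside the string quotes in the actual source file (outside a string, \ is only legal immediately before a newline). An alternative sketch (assuming Python 2.7, not from the original post) that avoids hand-stripping the BOM is to let the codec consume it:

import codecs

# "utf-8-sig" swallows a leading BOM, so only the newline needs stripping
dataFile = codecs.open("data.txt", "r", encoding="utf-8-sig")
for line in dataFile:
    line = line.rstrip("\n")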

Python script showing different results (with one error) on two similar input files

The script, originally taken from http://globplot.embl.de/ and modified:
#!/usr/bin/env python
# Copyright (C) 2003 Rune Linding - EMBL
# GlobPlot TM
# GlobPlot is licensed under the Academic Free license
from string import *
from sys import argv
from Bio import File
from Bio import SeqIO
import fpformat
import sys
import tempfile
import os
from os import system, popen3
import math

# Russell/Linding
RL = {'N':0.229885057471264,'P':0.552316012226663,'Q':-0.187676577424997,'A':-0.261538461538462,'R':-0.176592654077609, \
      'S':0.142883029808825,'C':-0.0151515151515152,'T':0.00887797506611258,'D':0.227629796839729,'E':-0.204684629516228, \
      'V':-0.386174834235195,'F':-0.225572305974316,'W':-0.243375458622095,'G':0.433225711769886,'H':-0.00121743364986608, \
      'Y':-0.20750516775322,'I':-0.422234699606962,'K':-0.100092289621613,'L':-0.337933495925287,'M':-0.225903614457831}


def Sum(seq, par_dict):
    sum = 0
    results = []
    raws = []
    sums = []
    p = 1
    for residue in seq:
        try:
            parameter = par_dict[residue]
        except:
            parameter = 0
        if p == 1:
            sum = parameter
        else:
            sum = sum + parameter  # *math.log10(p)
        ssum = float(fpformat.fix(sum, 10))
        sums.append(ssum)
        p += 1
    return sums


def getSlices(dydx_data, DOM_join_frame, DOM_peak_frame, DIS_join_frame, DIS_peak_frame):
    DOMslices = []
    DISslices = []
    in_DOMslice = 0
    in_DISslice = 0
    beginDOMslice = 0
    endDOMslice = 0
    beginDISslice = 0
    endDISslice = 0
    for i in range(len(dydx_data)):
        # close dom slice
        if in_DOMslice and dydx_data[i] > 0:
            DOMslices.append([beginDOMslice, endDOMslice])
            in_DOMslice = 0
        # close dis slice
        elif in_DISslice and dydx_data[i] < 0:
            DISslices.append([beginDISslice, endDISslice])
            in_DISslice = 0
        # elseif inSlice expandslice
        elif in_DOMslice:
            endDOMslice += 1
        elif in_DISslice:
            endDISslice += 1
        # if not in slice and dydx != 0 start slice
        if dydx_data[i] > 0 and not in_DISslice:
            beginDISslice = i
            endDISslice = i
            in_DISslice = 1
        elif dydx_data[i] < 0 and not in_DOMslice:
            beginDOMslice = i
            endDOMslice = i
            in_DOMslice = 1
    # last slice
    if in_DOMslice:
        DOMslices.append([beginDOMslice, endDOMslice])
    if in_DISslice:
        DISslices.append([beginDISslice, endDISslice])
    k = 0
    l = 0
    while k < len(DOMslices):
        if k+1 < len(DOMslices) and DOMslices[k+1][0]-DOMslices[k][1] < DOM_join_frame:
            DOMslices[k] = [DOMslices[k][0], DOMslices[k+1][1]]
            del DOMslices[k+1]
        elif DOMslices[k][1]-DOMslices[k][0]+1 < DOM_peak_frame:
            del DOMslices[k]
        else:
            k += 1
    while l < len(DISslices):
        if l+1 < len(DISslices) and DISslices[l+1][0]-DISslices[l][1] < DIS_join_frame:
            DISslices[l] = [DISslices[l][0], DISslices[l+1][1]]
            del DISslices[l+1]
        elif DISslices[l][1]-DISslices[l][0]+1 < DIS_peak_frame:
            del DISslices[l]
        else:
            l += 1
    return DOMslices, DISslices


def SavitzkyGolay(window, derivative, datalist):
    SG_bin = 'sav_gol'
    stdin, stdout, stderr = popen3(SG_bin + '-D' + str(derivative) + ' -n' + str(window)+','+str(window))
    for data in datalist:
        stdin.write(`data`+'\n')
    try:
        stdin.close()
    except:
        print stderr.readlines()
    results = stdout.readlines()
    stdout.close()
    SG_results = []
    for result in results:
        SG_results.append(float(fpformat.fix(result, 6)))
    return SG_results


def reportSlicesTXT(slices, sequence, maskFlag):
    if maskFlag == 'DOM':
        coordstr = '|GlobDoms:'
    elif maskFlag == 'DIS':
        coordstr = '|Disorder:'
    else:
        raise SystemExit
    if slices == []:
        # by default the sequence is in uppercase which is our search space
        s = sequence
    else:
        # insert seq before first slide
        if slices[0][0] > 0:
            s = sequence[0:slices[0][0]]
        else:
            s = ''
        for i in range(len(slices)):
            # skip first slice
            if i > 0:
                coordstr = coordstr + ', '
            coordstr = coordstr + str(slices[i][0]+1) + '-' + str(slices[i][1]+1)
            # insert the actual slice
            if maskFlag == 'DOM':
                s = s + lower(sequence[slices[i][0]:(slices[i][1]+1)])
                if i < len(slices)-1:
                    s = s + upper(sequence[(slices[i][1]+1):(slices[i+1][0])])
                # last slice
                elif slices[i][1] < len(sequence)-1:
                    s = s + lower(sequence[(slices[i][1]+1):(len(sequence))])
            elif maskFlag == 'DIS':
                s = s + upper(sequence[slices[i][0]:(slices[i][1]+1)])
                # insert untouched seq between disorder segments, 2-run labelling
                if i < len(slices)-1:
                    s = s + sequence[(slices[i][1]+1):(slices[i+1][0])]
                # last slice
                elif slices[i][1] < len(sequence)-1:
                    s = s + sequence[(slices[i][1]+1):(len(sequence))]
    return s, coordstr


def runGlobPlot():
    try:
        smoothFrame = int(sys.argv[1])
        DOM_joinFrame = int(sys.argv[2])
        DOM_peakFrame = int(sys.argv[3])
        DIS_joinFrame = int(sys.argv[4])
        DIS_peakFrame = int(sys.argv[5])
        file = str(sys.argv[6])
        db = open(file, 'r')
    except:
        print 'Usage:'
        print '         ./GlobPipe.py SmoothFrame DOMjoinFrame DOMpeakFrame DISjoinFrame DISpeakFrame FASTAfile'
        print '         Optimised for ELM: ./GlobPlot.py 10 8 75 8 8 sequence_file'
        print '         Webserver settings: ./GlobPlot.py 10 15 74 4 5 sequence_file'
        raise SystemExit
    for cur_record in SeqIO.parse(db, "fasta"):
        # uppercase is searchspace
        seq = upper(str(cur_record.seq))
        # sum function
        sum_vector = Sum(seq, RL)
        # Run Savitzky-Golay
        smooth = SavitzkyGolay('smoothFrame', 0, sum_vector)
        dydx_vector = SavitzkyGolay('smoothFrame', 1, sum_vector)
        # test
        sumHEAD = sum_vector[:smoothFrame]
        sumTAIL = sum_vector[len(sum_vector)-smoothFrame:]
        newHEAD = []
        newTAIL = []
        for i in range(len(sumHEAD)):
            try:
                dHEAD = (sumHEAD[i+1]-sumHEAD[i])/2
            except:
                dHEAD = (sumHEAD[i]-sumHEAD[i-1])/2
            try:
                dTAIL = (sumTAIL[i+1]-sumTAIL[i])/2
            except:
                dTAIL = (sumTAIL[i]-sumTAIL[i-1])/2
            newHEAD.append(dHEAD)
            newTAIL.append(dTAIL)
        dydx_vector[:smoothFrame] = newHEAD
        dydx_vector[len(dydx_vector)-smoothFrame:] = newTAIL
        globdoms, globdis = getSlices(dydx_vector, DOM_joinFrame, DOM_peakFrame, DIS_joinFrame, DIS_peakFrame)
        s_domMask, coordstrDOM = reportSlicesTXT(globdoms, seq, 'DOM')
        s_final, coordstrDIS = reportSlicesTXT(globdis, s_domMask, 'DIS')
        sys.stdout.write('>'+cur_record.id+coordstrDOM+coordstrDIS+'\n')
        print s_final
        print '\n'
    return


runGlobPlot()
My input and output files are here: link
This script takes an input file (input1.fa) and gives the following output: output1.txt
But when I try to run this script with a similar but larger input file (input2.fa), it shows the following error:
Traceback (most recent call last):
  File "final_script_globpipe.py", line 207, in <module>
    runGlobPlot()
  File "final_script_globpipe.py", line 179, in runGlobPlot
    smooth = SavitzkyGolay('smoothFrame',0, sum_vector)
  File "final_script_globpipe.py", line 105, in SavitzkyGolay
    stdin.write(`data`+'\n')
IOError: [Errno 22] Invalid argument
I have no idea where the problem is. Any type of suggestion is appreciated.
I am using Python 2.7 on a Windows 7 machine. I have also attached the Savitzky-Golay module, which is needed to run the script.
Thanks
UPDATE:
After trying to reproduce the error on Linux, it shows similar behavior: it works fine with the first file, but with the second it returns Errno 32.
Traceback:
Traceback (most recent call last):
  File "Glob.py", line 207, in <module>
    runGlobPlot()
  File "Glob.py", line 179, in runGlobPlot
    smooth = SavitzkyGolay('smoothFrame',0, sum_vector)
  File "Glob.py", line 105, in SavitzkyGolay
    stdin.write(`data`+'\n')
IOError: [Errno 32] Broken pipe
Update:
Some calls of the SG_bin return that the -n parameter is the wrong type:
Wrong type of parameter for flag -n. Has to be unsigned,unsigned
This parameter comes from the window variable that is passed to the SavitzkyGolay function.
Surrounding the stdin.write call with a try/except block reveals that it breaks a handful of times.
try:
    for data in datalist:
        stdin.write(repr(data)+'\n')
except:
    print "It broke"

Parsing a big text file, extracting data & storing it in a CSV file: too slow

I have a big log file (say 1-3 Gb) which I need to parse, extract data & save it in a CSV file.
Text File Data
* D:40035FC8 wr-long 00000008 \\core0\Global\u4TimeHiCnt 1.000us
* D:40027C5C rd-byte 00 *core0\Global\Ypf_OILL_OilLvlOn 20.342us
* D:40010044 rd-word 0FE2 *l\u2SAD_OILLVS_RecoveryCounter 0.160us
* D:40010044 wr-word 0FE1 *l\u2SAD_OILLVS_RecoveryCounter 0.040us
* D:40035FC8 wr-long 00000008 \\core0\Global\u4TimeHiCnt 1.000us
I have to extract the variable name (which comes after the last \), then count the reads & writes per core along with the datatype, and store it all in a CSV file.
CSV File Result
Variable           Datatype   CORE 0           CORE 1           CORE X
                              Read    Write    Read    Write    Read    Write
OS_inKernel        byte       0       0        111768  111878   0       0
OS_globalIntLevel  long       0       0        281604  237901   0       0
The problem is that it takes too much time. Can you please look into the attached code & suggest ways to make it faster?
import string
import sys
import time

MyFile = open("C:\\Users\\AEC_FULL\\Saravanan\\Workspace\\Trace32Log_Parser\\core1_sram_ReadWrite.txt")  # core0_sram_ReadWrite_rawdata
GeneratedFile = open(str(("C:\\Users\\AEC_FULL\\Saravanan\\Workspace\\Trace32Log_Parser\\") + 'ParsedOutput.csv'), 'w')
try:
    MyVariableList = []
    TimeStartTest = time.time()  # Starting Time
    GeneratedFile.write('\nVariable')
    GeneratedFile.write(', Datatype')
    GeneratedFile.write(', CORE 0')
    GeneratedFile.write(',, CORE 1')
    GeneratedFile.write(',, CORE X')
    GeneratedFile.write('\n,, Read ')
    GeneratedFile.write(', Write ')
    GeneratedFile.write(', Read ')
    GeneratedFile.write(', Write ')
    GeneratedFile.write(', Read ')
    GeneratedFile.write(', Write ')
    GeneratedFile.write('\n')
    for CurrentLine in MyFile:
        NoofSpaces = 0
        if CurrentLine.find('\\') != -1:
            MyVariable = CurrentLine[CurrentLine.rfind('\\')+1:].split(' ')[0]
        elif CurrentLine.find('*\\') != -1:
            MyVariable = CurrentLine[CurrentLine.rfind('*\\')+1:].split(' ')[0]
        elif CurrentLine.find('*') != -1:
            MyVariable = CurrentLine[CurrentLine.rfind('*')+1:].split(' ')[0]
        VariableFound = 0
        MyVariableList.sort()
        Lowerbound = 0
        Upperbound = len(MyVariableList)-1
        while Lowerbound <= Upperbound and VariableFound == 0:
            middle_pos = (Lowerbound+Upperbound) // 2
            if MyVariableList[middle_pos] < MyVariable:
                Lowerbound = middle_pos + 1
            elif MyVariableList[middle_pos] > MyVariable:
                Upperbound = middle_pos - 1
            else:
                VariableFound = 1
        if VariableFound == 0:
            MyVariableList.append(MyVariable)
            try:
                MyFile1 = open("C:\\Users\\AEC_FULL\\Saravanan\\Workspace\\Trace32Log_Parser\\core1_sram_ReadWrite.txt")  # core0_sram_ReadWrite_rawdata
                Core0_ReadCount = 0
                Core0_WriteCount = 0
                Core1_ReadCount = 0
                Core1_WriteCount = 0
                CoreX_ReadCount = 0
                CoreX_WriteCount = 0
                for CurrentLine1 in MyFile1:
                    if CurrentLine1.find(MyVariable) != -1:
                        ## CORE 0 ##
                        if CurrentLine1.find("0\\Global") != -1:
                            DataType = CurrentLine1.split(' ')[0].split('-')[1]
                            DataOperation = CurrentLine1.split(' ')[0].split('-')[0].split(' ')[-1]
                            if DataOperation == 'rd':
                                Core0_ReadCount = Core0_ReadCount + 1
                            elif DataOperation == 'wr':
                                Core0_WriteCount = Core0_WriteCount + 1
                        ## CORE 1 ##
                        elif CurrentLine1.find("1\\Global") != -1:
                            DataType = CurrentLine1.split(' ')[0].split('-')[1]
                            DataOperation = CurrentLine1.split(' ')[0].split('-')[0].split(' ')[-1]
                            if DataOperation == 'rd':
                                Core1_ReadCount = Core1_ReadCount + 1
                            elif DataOperation == 'wr':
                                Core1_WriteCount = Core1_WriteCount + 1
                        ## CORE X ##
                        else:
                            DataType = CurrentLine1.split(' ')[0].split('-')[1]
                            DataOperation = CurrentLine1.split(' ')[0].split('-')[0].split(' ')[-1]
                            if DataOperation == 'rd':
                                CoreX_ReadCount = CoreX_ReadCount + 1
                            elif DataOperation == 'wr':
                                CoreX_WriteCount = CoreX_WriteCount + 1
                GeneratedFile.write('\n %s' %MyVariable)
                GeneratedFile.write(', %s' %DataType)
                GeneratedFile.write(', %d' %Core0_ReadCount)
                GeneratedFile.write(', %d' %Core0_WriteCount)
                GeneratedFile.write(', %d' %Core1_ReadCount)
                GeneratedFile.write(', %d' %Core1_WriteCount)
                GeneratedFile.write(', %d' %CoreX_ReadCount)
                GeneratedFile.write(', %d' %CoreX_WriteCount)
                GeneratedFile.write('\n')
            finally:
                MyFile1.close()
except:
    print sys.exc_info()
finally:
    GeneratedFile.close()
    MyFile.close()
    TimeStopTest = time.time()
    print str(int((TimeStopTest - TimeStartTest)/60))
You'd better use the with statement, like this:

# if this file is line based
with open('test.txt') as f:
    for line in f:
        # process line, do something with line
        pass
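The with statement alone won't change the running time, though. The dominant cost in the posted code is that the whole log is re-read once per newly discovered variable (MyFile1 is reopened inside the loop), so the work grows roughly quadratically with the file size. A single pass that tallies counts in a dictionary avoids that entirely; here is a minimal sketch (not the original code; it assumes lines look like the samples above, with the variable name after the last backslash and an access like 'wr-long' in the third field):

from collections import defaultdict

counts = defaultdict(lambda: [0, 0, 0, 0, 0, 0])  # per variable: core0 rd/wr, core1 rd/wr, coreX rd/wr
datatypes = {}
with open('core1_sram_ReadWrite.txt') as f:
    for line in f:
        parts = line.split()
        if len(parts) < 5 or '\\' not in line:
            continue
        op, _, dtype = parts[2].partition('-')          # 'wr-long' -> ('wr', 'long')
        variable = line[line.rfind('\\') + 1:].split()[0]
        if '0\\Global' in line:
            core = 0
        elif '1\\Global' in line:
            core = 1
        else:
            core = 2
        counts[variable][core * 2 + (1 if op == 'wr' else 0)] += 1
        datatypes[variable] = dtype

with open('ParsedOutput.csv', 'w') as out:
    out.write('Variable, Datatype, Core0 Read, Core0 Write, Core1 Read, Core1 Write, CoreX Read, CoreX Write\n')
    for variable in sorted(counts):
        out.write('%s, %s, %s\n' % (variable, datatypes[variable], ', '.join(map(str, counts[variable]))))

This reads the log exactly once regardless of how many variables it contains, and the dictionary lookup replaces the sort-plus-binary-search bookkeeping.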
