Parsing Data from live website in Python Enumerate problem! - python

The following script is supposed to fetch a specific line number and parse it from a live website. It works for like 30 loops but then it seems like enumerate(f) stops working correctly... the "i" in the for loop seems to stop at line 130 instead of like 200 something. Could this be due to the website I'm trying to fetch data from or something else? Thanks!!
import sgmllib
class MyParser(sgmllib.SGMLParser):
"A simple parser class."
def parse(self, s):
"Parse the given string 's'."
self.feed(s)
self.close()
def __init__(self, verbose=0):
"Initialise an object, passing 'verbose' to the superclass."
sgmllib.SGMLParser.__init__(self, verbose)
self.divs = []
self.descriptions = []
self.inside_div_element = 0
def start_div(self, attributes):
"Process a hyperlink and its 'attributes'."
for name, value in attributes:
if name == "id":
self.divs.append(value)
self.inside_div_element = 1
def end_div(self):
"Record the end of a hyperlink."
self.inside_div_element = 0
def handle_data(self, data):
"Handle the textual 'data'."
if self.inside_div_element:
self.descriptions.append(data)
def get_div(self):
"Return the list of hyperlinks."
return self.divs
def get_descriptions(self, check):
"Return a list of descriptions."
if check == 1:
self.descriptions.pop(0)
return self.descriptions
def rm_descriptions(self):
"Remove all descriptions."
self.descriptions.pop()
import urllib
import linecache
import sgmllib
tempLine = ""
tempStr = " "
tempStr2 = ""
myparser = MyParser()
count = 0
user = ['']
oldUser = ['none']
oldoldUser = [' ']
array = [" ", 0]
index = 0
found = 0
k = 0
j = 0
posIndex = 0
a = 0
firstCheck = 0
fCheck = 0
while a < 1000:
print a
f = urllib.urlopen("SITE")
a = a+1
for i, line in enumerate(f):
if i == 187:
print i
tempLine = line
print line
myparser.parse(line)
if fCheck == 1:
result = oldUser[0] is oldUser[1]
u1 = oldUser[0]
u2 = oldUser[1]
tempStr = oldUser[1]
if u1 == u2:
result = 1
else:
result = user is oldUser
fCheck = 1
user = myparser.get_descriptions(firstCheck)
tempStr = user[0]
firstCheck = 1
if result:
array[index+1] = array[index+1] +0
else:
j = 0
for z in array:
k = j+2
tempStr2 = user[0]
if k < len(array) and tempStr2 == array[k]:
array[j+3] = array[j+3] + 1
index = j+2
found = 1
break
j = j+1
if found == 0:
array.append(tempStr)
array.append(0)
oldUser = user
found = 0
print array
elif i > 200:
print "HERE"
break
print array
f.close()

Perhaps the number of lines on that web page are fewer than you think? What does this give you?:
print max(i for i, _ in enumerate(urllib.urlopen("SITE")))

Aside: Your indentation is stuffed after the while a < 1000: line. Excessive empty lines and one-letter names don't assist the understanding of your code.
enumerate is not broken. Instead of such speculation, inspect your data. Suggestion: replace
for i, line in enumerate(f):
by
lines = list(f)
print "=== a=%d linecount=%d === % (a, len(lines))
for i, line in enumerate(lines):
print " a=%d i=%d line=%r" % (a, i, line)
Examine the output carefully.

Related

Looking for resistance genes in water sample using kmers [Python]

I need some help with my code. I need to look for the presence of resistance genes in a water sample. That translates in having a huge file of reads coming from the water sample and a file of resistances genes. My problem is making the code run under 5 minutes, a thing that is not happening right now. Probably the issue relays on discarting reads as fast as possible, on having a smart method to only analyze meaningful reads. Do you have any suggestion? I cannot use any non standard python library
This is my code
import time
def build_lyb(TargetFile):
TargetFile = open(TargetFile)
res_gen = {}
for line in TargetFile:
if line.startswith(">"):
header = line[:-1]
res_gen[header] = ""
else:
res_gen[header] += line[:-1]
return res_gen
def build_kmers(sequence, k_size):
kmers = []
n_kmers = len(sequence) - k_size + 1
for i in range(n_kmers):
kmer = sequence[i:i + k_size]
kmers.append(kmer)
return kmers
def calculation(kmers, g):
matches = []
for i in range(0, len(genes[g])):
matches.append(0)
k = 0
while k < len(kmers):
if kmers[k] in genes[g]:
pos = genes[g].find(kmers[k])
for i in range(pos, pos+19):
matches[i] = 1
k += 19
else:
k += 1
return matches
def coverage(matches, g):
counter = 0
for i in matches[g]:
if i >= 1:
counter += 1
cov = counter/len(res_genes[g])*100
return cov
st = time.time()
genes = build_lyb("resistance_genes.fsa")
infile = open('test2.txt', 'r')
res_genes = {}
Flag = False
n_line = 0
for line in infile:
n_line += 1
if line.startswith("+"):
Flag = False
if Flag:
kmers = build_kmers(line[:-1], 19)
for g in genes:
counter = 18
k = 20
while k <= 41:
if kmers[k] in genes[g]:
counter += 19
k += 19
else:
k += 1
if counter >= 56:
print(n_line)
l1 = calculation(kmers, g)
if g in res_genes:
l2 = res_genes[g]
lr = [sum(i) for i in zip(l1, l2)]
res_genes[g] = lr
else:
res_genes[g] = l1
if line.startswith('#'):
Flag = True
for g in res_genes:
print(g)
for i in genes[g]:
print(i, " ", end='')
print('')
for i in res_genes[g]:
print(i, " ", end='')
print('')
print(coverage(res_genes, g))
et = time.time()
elapsed_time = et-st
print("Execution time:", elapsed_time, "s")

Search Strings in a List with Loop Return Order

I'm very new to Python and I have a question.
I have a List that looks like this:
List = ["B-Guild","I-Guild","I-Guild","L-Guild","B-Gene","L-Gene","U-Car"]
All of the words with B-(I)-L belong to each other and I want to use a function to show that.
def combine(x):
foo = []
regexp_B = ("B-" + r'.*')
regexp_I = ("I-" + r'.*')
regexp_L = ("L-" + r'.*')
regexp_U = ("U-" + r'.*')
for i in range(0,len(x),1):
if re.match(regexp_B, x[i]):
print("Found B")
foo.append[i+x[i]]
if re.match(regexp_I, x[i+1]):
print("Found I")
foo.append[i+1+x[i+1]]
if re.match(regexp_I, x[i+1]):
print("Found I")
foo.append[i+1+x[i+1]]
else:
print("Found L")
foo.append[i+1+x[i+1]]
else:
print("Found L")
foo.append[i1+x[i1]]
elif re.match(regexp_L, x[i]):
print("L")
foo.append[i1+x[i1]]
elif re.match(regexp_U, x[i]):
print("Found U")
foo.append[i1+x[i1]]
return foo
List_New = combine(List)
Desired Output:
foo = ["0B-Guild","0I-Guild","0I-Guild","OL-Guild","1B-Gene","1L-Gene","2U-Car"]
Edit:
The output follows this logic: Every time a "B-" prefix appears, the words to follow are part of one "theme" until a "L-" prefix appears. These words got to have the same number before them so they can be grouped for further functions. "U-" prefixes don't follow that logic and just need a number before them to distinguish them from the other words. Think of it as a Counter that groups these word into a cluster.
def combine(some_list):
current_group = 0 # starts with 0
g_size = 0 # current group size
for elem in some_list:
g_size += 1
if elem.startswith('U-') and g_size > 1:
g_size = 1
current_group += 1
yield '{}{}'.format(current_group, elem)
if elem.startswith(('L-', 'U-')): # each L- or U- also finishes a group
g_size = 0
current_group += 1
>>> List = ["B-Guild","I-Guild","I-Guild","L-Guild","B-Gene","L-Gene","U-Car"]
>>> print(list(combine(List)))
>>> List = ["B-Guild","I-Guild","I-Guild","L-Guild","B-Guild","L-Guild","U-Guild"]
>>> print(list(combine(List)))

How to get position of opening/ending HTML tag in Python

How to solve this on Python3, using what lib, and using what sample code?
I have html file, at position Line:Col I have middle of html tag
<table ......>; how to get position of <table> tag edges (brackets < >) and position of its </table> tag edges?
(note: several table tags may be one inside another).
Like said in this SO answer, you should not use regex to parse an HTML file as the standard is highly irregular. You should instead use an HTML parsing library like html.parser : This library offers you HTMLParser.getpos() which returns you the line number and offset of the tag.
This gets you the coordinates of each tag with html.parser, where I monkeypatch the goahead function with a simple modification, calling the custom method get_endpos:
from html.parser import HTMLParser, starttagopen
from html import unescape
class MyHTMLParser(HTMLParser):
def __init__(self):
HTMLParser.__init__(self)
self.start_tags = []
self.end_tags = []
self.last_append = []
def handle_starttag(self, tag, attrs):
self.start_tags.append((tag, (self.getpos()[0], self.getpos()[1]),))
self.last_append = self.start_tags
def handle_endtag(self, tag):
self.end_tags.append((tag, (self.getpos()[0], self.getpos()[1]),))
self.last_append = self.end_tags
def get_endpos(self):
self.last_append[-1] = self.last_append[-1] + ((self.getpos()[0], self.getpos()[1]),)
def get_tags(self):
return self.start_tags, self.end_tags
def _reset(self):
HTMLParser.reset(self)
self.start_tags = []
self.end_tags = []
parser = MyHTMLParser()
# Internal -- handle data as far as reasonable. May leave state
# and data to be processed by a subsequent call. If 'end' is
# true, force handling all data as if followed by EOF marker.
def goahead(self, end):
rawdata = self.rawdata
i = 0
n = len(rawdata)
while i < n:
if self.convert_charrefs and not self.cdata_elem:
j = rawdata.find('<', i)
if j < 0:
# if we can't find the next <, either we are at the end
# or there's more text incoming. If the latter is True,
# we can't pass the text to handle_data in case we have
# a charref cut in half at end. Try to determine if
# this is the case before proceeding by looking for an
# & near the end and see if it's followed by a space or ;.
amppos = rawdata.rfind('&', max(i, n-34))
if (amppos >= 0 and
not re.compile(r'[\s;]').search(rawdata, amppos)):
break # wait till we get all the text
j = n
else:
match = self.interesting.search(rawdata, i) # < or &
if match:
j = match.start()
else:
if self.cdata_elem:
break
j = n
if i < j:
if self.convert_charrefs and not self.cdata_elem:
self.handle_data(unescape(rawdata[i:j]))
else:
self.handle_data(rawdata[i:j])
i = self.updatepos(i, j)
if i == n: break
startswith = rawdata.startswith
if startswith('<', i):
if starttagopen.match(rawdata, i): # < + letter
k = self.parse_starttag(i)
elif startswith("</", i):
k = self.parse_endtag(i)
elif startswith("<!--", i):
k = self.parse_comment(i)
elif startswith("<?", i):
k = self.parse_pi(i)
elif startswith("<!", i):
k = self.parse_html_declaration(i)
elif (i + 1) < n:
self.handle_data("<")
k = i + 1
else:
break
if k < 0:
if not end:
break
k = rawdata.find('>', i + 1)
if k < 0:
k = rawdata.find('<', i + 1)
if k < 0:
k = i + 1
else:
k += 1
if self.convert_charrefs and not self.cdata_elem:
self.handle_data(unescape(rawdata[i:k]))
else:
self.handle_data(rawdata[i:k])
i = self.updatepos(i, k)
self.get_endpos() # only modification: gets end position of tags
elif startswith("&#", i):
match = charref.match(rawdata, i)
if match:
name = match.group()[2:-1]
self.handle_charref(name)
k = match.end()
if not startswith(';', k-1):
k = k - 1
i = self.updatepos(i, k)
continue
else:
if ";" in rawdata[i:]: # bail by consuming &#
self.handle_data(rawdata[i:i+2])
i = self.updatepos(i, i+2)
break
elif startswith('&', i):
match = entityref.match(rawdata, i)
if match:
name = match.group(1)
self.handle_entityref(name)
k = match.end()
if not startswith(';', k-1):
k = k - 1
i = self.updatepos(i, k)
continue
match = incomplete.match(rawdata, i)
if match:
# match.group() will contain at least 2 chars
if end and match.group() == rawdata[i:]:
k = match.end()
if k <= i:
k = n
i = self.updatepos(i, i + 1)
# incomplete
break
elif (i + 1) < n:
# not the end of the buffer, and can't be confused
# with some other construct
self.handle_data("&")
i = self.updatepos(i, i + 1)
else:
break
else:
assert 0, "interesting.search() lied"
# end while
if end and i < n and not self.cdata_elem:
if self.convert_charrefs and not self.cdata_elem:
self.handle_data(unescape(rawdata[i:n]))
else:
self.handle_data(rawdata[i:n])
i = self.updatepos(i, n)
self.rawdata = rawdata[i:]
MyHTMLParser.goahead = goahead
parser.feed(your_html_file_as_a_string)
print(parser.get_tags())

local variable 'moodsc' referenced before assignment

In the code below I get the following error:
"local variable 'moodsc' referenced before assignment"
I'm new to programming and python. I'm struggling with interpreting other questions on the similar topic. Any context around this specific code would be helpful.
import re
import json
import sys
def moodScore(sent, myTweets):
scores = {} # initialize an empty dictionary
new_mdsc = {} # intitalize an empty dictionary
txt = {}
for line in sent:
term, score = line.split("\t") # The file is tab-delimited. "\t" means "tab character"
scores[term] = int(score) # Convert the score to an integer.
data = [] # initialize an empty list
for line in myTweets:
tweet = json.loads(line)
if "text" in tweet and "lang" in tweet and tweet["lang"] == "en":
clean = re.compile("\W+")
clean_txt = clean.sub(" ", tweet["text"]).strip()
line = clean_txt.lower().split()
moodsc = 0
pos = 0
neg = 0
count = 1
for word in range(0, len(line)):
if line[word] in scores:
txt[word] = int(scores[line[word]])
else:
txt[word] = int(0)
moodsc += txt[word]
print txt
if any(v > 0 for v in txt.values()):
pos = 1
if any(v < 0 for v in txt.values()):
neg = 1
for word in range(0, len(line)): # score each word in line
if line[word] not in scores:
if str(line[word]) in new_mdsc.keys():
moodsc2 = new_mdsc[str(line[word])][0] + moodsc
pos2 = new_mdsc[str(line[word])][1] + pos
neg2 = new_mdsc[str(line[word])][2] + neg
count2 = new_mdsc[str(line[word])][3] + count
new_mdsc[str(line[word])] = [moodsc2, pos2, neg2, count2]
else:
new_mdsc[str(line[word])] = [moodsc, pos, neg, count]
def new_dict():
for val in new_mdsc.values():
comp = val[0] / val[3]
val.append(comp)
for key, val in new_mdsc.items():
print (key, val[4])
def main():
sent_file = open(sys.argv[1])
tweet_file = open(sys.argv[2])
moodScore(sent_file, tweet_file)
# new_dict()
if __name__ == '__main__':
main()
Ok #joshp, I think you need to globalise some variables, because the error is 'moodsc referenced before assignment', I think the code only gets as far as moodsc += txt[word] but you may also have trouble with pos and neg.
Try global moodsc and pos etc. before you define moodsc and pos etc. If this doesn't work try global moodsc before moodsc += txt[word] and so forth, you may need to use global in both places for it to work, I often find that this is needed in my code, to globalise it at definition and wherever else you use it (at the start of each function and statement where it is used).

Linebreak not working - Python

Somehow the linebreaks are not working as they should.
This is what I get:
Expected:
O meu u2 2 post
http://www.yahoo.com
1 Gosto, 0 Nao gosto
<BLANKLINE>
O meu u2 post
http://www.altavista.com
1 Gosto, 0 Nao gosto
Got:
'O meu u2 2 post\nhttp://www.yahoo.com\n1 Gosto, 0 Nao Gosto\n\nO meu u2\nhttp://www.yahoo.com\n1 Gosto, 0 Nao Gosto'
This is the code used in the function.
The important parts should be the str and showRecentComments functions
class Comments():
def __init__(self, u=None, text='', link=None):
self.u = u
self.text = text
self.link = link
self.topo = None
self.fim = None
def __str__(self):
actual = self.topo
s = ''
if actual == None:
return None
while actual != None:
if actual.seg == None:
s += str(actual)
actual = actual.seg
else:
s += str(actual) + '\n' + '\n'
actual = actual.seg
return s
def add(self,comment):
if self.topo == None:
self.topo = comment
self.fim = comment
else:
comment.seg = self.topo
self.topo.ant = comment
self.topo = comment
def remove(self,comment):
actual = self.topo
if (self.topo == self.fim) and (self.topo == comment):
self.topo = None
self.fim = None
while actual!=None:
if actual == comment:
if self.topo==comment:
actual.seg.ant = None
self.topo = actual.seg
elif self.fim==comment:
actual.ant.seg = None
self.fim = actual.ant
else:
actual.seg.ant = actual.ant
actual.ant.seg = actual.seg
break
else:
actual = actual.seg
def countLike(self):
count = 0
actual = self.topo
while actual != None:
if len(actual.likeList) >= 1:
count += 1
actual = actual.seg
else:
actual = actual.seg
return count
def showRecentComments(self,n):
count = 1
actual = self.topo
sC = ''
if actual == None:
return None
while actual != None:
if count < n:
sC += str(actual) + '\n' + '\n'
count += 1
actual = actual.seg
elif count == n:
sC += str(actual)
count += 1
actual = actual.seg
elif count > n:
break
return sC
Regards, Nelson Gregório
It looks like you're looking at the representation of the string, which will show you the newline characters as \n. If you print or write to e.g. stdout (sys.stdout.write(s)) the string instead, the newlines will be expanded.

Categories