Problem extracting content from text files using Python

I am trying to capture the data in the second table (Field crops), titled "Prices Received, United States, July 2010, with Comparisons". I am using pandas DataFrames to capture the table from the text file, and then I will output it to a CSV file.
My code is as follows:
def find_no_line_start_table(table_title, splited_data):
    found_no_lines = []
    for index, line in enumerate(splited_data):
        if table_title in line:
            found_no_lines.append(index)
    return found_no_lines

def get_start_data_table(table_start, splited_data):
    for index, row in enumerate(splited_data[table_start:]):
        if 'Dollars' in row:
            return table_start + index

def get_end_table(start_table_data, splited_data):
    for index, row in enumerate(splited_data[start_table_data:]):
        if END_TABLE_LINE in row:
            return start_table_data + index

def row(l):
    l = l.split()
    number_columns = 6
    if len(l) >= number_columns:
        data_row = [''] * number_columns
        first_column_done = False
        index = 0
        for w in l:
            if not first_column_done:
                data_row[0] = ' '.join([data_row[0], w])
                if ':' in w:
                    first_column_done = True
            else:
                index += 1
                data_row[index] = w
        return data_row

def take_table(txt_data):
    comodity = []
    q = []
    w = []
    e = []
    t = []
    p = []
    for r in table:
        data_row = row(r)
        if data_row:
            col_1, col_2, col_3, col_4, col_5, col_6 = data_row
            comodity.append(col_1)
            q.append(col_2)
            w.append(col_3)
            e.append(col_4)
            t.append(col_5)
            p.append(col_6)
    table_data = {'comodity': comodity, 'q': q,
                  'w': w, 'e': e, 't': t}
    return table_data
And then I am doing this:
import requests
import pandas as pd

txt_data = requests.get("https://downloads.usda.library.cornell.edu/usda-esmis/files/c821gj76b/6w924d00c/9z903130m/AgriPric-07-30-2010.txt").text
splited_data = txt_data.split('\n')
table_title = 'Prices Received, United States'
END_TABLE_LINE = '-------------------------------------------'
_, table_start, _ = find_no_line_start_table(table_title, splited_data)
start_line = get_start_data_table(table_start, splited_data)
end_line = get_end_table(start_line, splited_data)
table = splited_data[start_line:end_line]
dict_table = take_table(txt_data)
pd.DataFrame(dict_table)
c = pd.DataFrame(dict_table)
However, I am getting this error:
IndexError: list assignment index out of range
Can anyone help me figure out what I am doing wrong?

Cause of error:
data_row is a list of 6 elements:
number_columns = 6
# ...
data_row = [''] * number_columns  # [''] * 6
and index increments on each iteration once first_column_done is True. first_column_done becomes True when ':' is encountered in a word, i.e.
if ':' in w:
    first_column_done = True
Hence, for each iteration after first_column_done turns True, index increments until it reaches 6, which is out of bounds for data_row (whose valid indices are 0 through 5).
def row(l):
    l = l.split()
    number_columns = 6
    if len(l) >= number_columns:
        data_row = [''] * number_columns
        first_column_done = False
        index = 0
        for w in l:
            if not first_column_done:
                data_row[0] = ' '.join([data_row[0], w])
                if ':' in w:
                    first_column_done = True
            else:
                index += 1
                data_row[index] = w  # error pos.
In other words, you get this error for every line where the number of words after the first occurrence of ':' in a word exceeds the five remaining slots of data_row.
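To see the failure in isolation, the failing assignment is just an out-of-bounds index into a fixed-size list:
data_row = [''] * 6   # valid indices are 0 through 5
data_row[6] = 'x'     # IndexError: list assignment index out of range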
Fix:
Use split(':'), a list comprehension, and Python's conditional expression:
def row(l):
    row = [col.strip() for col in l.split(':')]
    row[2:] = row[2].split()
    return [row[i] if i < len(row) else '' for i in range(6)]
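As a quick sanity check, here is the fixed row() applied to a hypothetical line shaped like the report's name : unit : values rows (the real file's spacing may differ):
line = 'Wheat, bu ..........: Dollars :  5.32  4.56  5.12  4.40'
print(row(line))
# ['Wheat, bu ..........', 'Dollars', '5.32', '4.56', '5.12', '4.40']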

Related

How can I use a loop to get values for dictionary without overwriting the previous value?

I have to save a dictionary to a separate JSON file. The values for the dictionary are being scraped from a website. I want the values to accumulate, but with every new one the old one is replaced.
import json
import requests

# varoni (the per-film lists of character URLs) is assumed to be populated earlier, not shown here
gar = -1
Pirmasfilmasvaroni = varoni[gar]
while (gar < 7):
    gar = gar + 1
    # split the characters by film
    # how many characters
    garums = len(Pirmasfilmasvaroni)
    z = (garums - 1)
    u = (z - 1)
    count = -1
    while (count < z):
        count = count + 1
        pirmais = Pirmasfilmasvaroni[count]
        Psaturs = requests.get(pirmais)
        if Psaturs.status_code == 200:
            Pdati = Psaturs.text
            Pinfo = json.loads(Pdati)
            var = Pinfo['result']['properties']['name']
            dic = {gar: [var]}
            with open("Filmas_un_varoni_kas_tajas_piedalas.json", "w") as js:
                json.dump(dic, js, indent=4)
import json
import requests

varoni = {}  # assumed to be filled with the per-film character-URL lists before this point
gar = -1
Pirmasfilmasvaroni = varoni[gar]
# create an empty dictionary
dic = {}
while (gar < 7):
    gar = gar + 1
    # split the characters by film
    # how many characters
    garums = len(Pirmasfilmasvaroni)
    z = (garums - 1)
    u = (z - 1)
    count = -1
    while (count < z):
        count = count + 1
        pirmais = Pirmasfilmasvaroni[count]
        Psaturs = requests.get(pirmais)
        if Psaturs.status_code == 200:
            Pdati = Psaturs.text
            Pinfo = json.loads(Pdati)
            var = Pinfo['result']['properties']['name']
            # add the new key-value pair to the dictionary
            dic[gar] = var
            # write the dictionary to the json file
            with open("Filmas_un_varoni_kas_tajas_piedalas.json", "w") as js:
                json.dump(dic, js, indent=4)
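Note that dic[gar] = var still keeps only the last name scraped for each film index. If every character of a film should be kept, accumulate a list per key instead; a minimal sketch using the same names:
# instead of dic[gar] = var:
dic.setdefault(gar, []).append(var)   # e.g. {0: ['name1', 'name2'], 1: [...]}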

Problems porting code from Python 2.7 to 3.6

I have a fragment of code which loads data from a .csv file.
It was written for Python 2.7 but does not work in Python 3.6.
def load_new_data(self):
    full = list()
    with open(self.filename, 'rb') as csv_in:
        myreader2 = csv.reader(csv_in, delimiter=';')
        count = 0
        for row in myreader2:
            if count == 0:
                headers = row[1:]
                count += 1
            elif count == 1:
                count += 1
            else:
                current_row = row[1:-1]
                full.append(current_row)
                count += 1
        new_df = pd.DataFrame.from_records(full, columns=headers)
        new_df = new_df.iloc[1:, :80]
        self.fill_in_blanks(new_df)
        new_df = dp.remove_inc_variables(new_df, .1)
        print '\t Removing incomplete variables.'
        for i in new_df.columns:
            try:
                new_df.loc[:, i] = new_df.loc[:, i].astype(float)
            except:
                pass
        return new_df
the error I get is:
    212
    213         count = 0
--> 214         for row in myreader2:
    215             if count == 0:
    216                 headers = row[1:]
Error: iterator should return strings, not bytes (did you open the file in text mode?)
I did try changing the 'rb' to 'r' and 'rt' and even deleting it, as other posts here suggest, but with no success...
Try this; in Python 3 the csv module expects the file to be opened in text mode:
def load_new_data(self):
    full = list()
    with open(self.filename, 'r') as csv_in:
        myreader2 = csv.reader(csv_in, delimiter=';')
        count = 0
        for row in myreader2:
            if count == 0:
                headers = row[1:]
                count += 1
            elif count == 1:
                count += 1
            else:
                current_row = row[1:-1]
                full.append(current_row)
                count += 1
        new_df = pd.DataFrame.from_records(full, columns=headers)
        new_df = new_df.iloc[1:, :80]
        self.fill_in_blanks(new_df)
        new_df = dp.remove_inc_variables(new_df, .1)
        print('\t Removing incomplete variables.')
        for i in new_df.columns:
            try:
                new_df.loc[:, i] = new_df.loc[:, i].astype(float)
            except:
                pass
        return new_df
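One Python 3 detail worth adding: the csv documentation recommends opening the file with newline='' so the reader handles line endings and quoted newlines correctly:
with open(self.filename, 'r', newline='') as csv_in:
    myreader2 = csv.reader(csv_in, delimiter=';')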
You could also try codecs for opening the file. Be careful about the file's encoding.
Sample:
def load_new_data(self):
    with codecs.open(self.filename, 'rb', encoding="cp1251") as csv_in:  # cp1251: replace with your file's encoding!
        myreader2 = csv.reader(csv_in, delimiter=';')
        headers = next(myreader2)[1:]
        next(myreader2)
        full = [row[1:] for row in myreader2]
    new_df = pd.DataFrame.from_records(full, columns=headers)
    new_df = new_df.iloc[1:, :80]
    self.fill_in_blanks(new_df)
    new_df = dp.remove_inc_variables(new_df, .1)
    print('\t Removing incomplete variables.')
    for i in new_df.columns:
        try:
            new_df.loc[:, i] = new_df.loc[:, i].astype(float)
        except:
            pass
    return new_df
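In Python 3 you can also drop codecs entirely, since the built-in open() accepts an encoding argument (cp1251 is the same placeholder as above; substitute your file's actual encoding):
with open(self.filename, 'r', encoding='cp1251', newline='') as csv_in:
    myreader2 = csv.reader(csv_in, delimiter=';')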

time complexity issues of my program

My Viterbi program's running time becomes exponential. Can you help me find the place I can change to make it a dynamic program? I need to remember and use only the 2 previous tags of words.
Thanks a lot.
from collections import defaultdict
import sys
import re
import feature_maker as fm

bla = ''
all_states = set()
# distribution over all of the corpus
POS_probability = fm.load_obj('probas')
POS_probability['START'] = 1.0

def cpd_tagwords(words, tag):
    pattern = re.compile("\W")  # to check for .,: etc.
    if pattern.match(words) and tag == words:
        return 1
    elif pattern.match(tag):
        return 0
    for word in emle.split("\n"):
        if word.__contains__(words) and word.__contains__(tag):
            return word[word.index(":") + 2:]
    # if we don't have data about the word with the tag, just return the
    # probability of the tag over all of the words in the corpus
    return POS_probability[tag]

def cpd_tags(early, prev, current):
    lambda1 = 0
    lambda3 = 0
    lambda6 = 0
    for word in qmle.split("\n"):
        word1 = word.split()
        if len(word1) > 0:
            if word1[0].__contains__(current):  # for tuple of 1
                if len(word1) == 2:
                    lambda1 = word[word.index("]:") + 3:]
                if len(word1) > 2 and word1[1].__contains__(prev):  # for tuple of 2
                    if len(word1) == 3:
                        lambda3 = word[word.index("]:") + 3:]
                    if len(word1) > 3 and word1[2].__contains__(early):  # for tuple of 3
                        if len(word1) == 4:
                            lambda6 = word[word.index("]:") + 3:]
    return (0.6 * float(lambda6)) + (0.3 * float(lambda3)) + (0.1 * float(lambda1))

# map: popular_copules['POS'] = list of all POS tags that can come before it
popular_copules = fm.load_obj('popular_copules')

# Viterbi Algo
def viterbi(sentence, tags1):
    def findSet(index, tag):
        if tag == 'ALL':
            return tags1
        if index in range(1, len(sentence) + 1):
            possible_tags = set(popular_copules[tag])
            if possible_tags == set([]):
                return tags1
            return set(popular_copules[tag])
        elif index == 0 or index == -1:
            return {'START'}

    # stores (word:tag) in this whole sentence
    sentence_with_tag = defaultdict(str)

    # inner function to compute pi values -- start
    def pi_viterbi(k, u, v, sentence):  # here is the start of the bad sequence
        prob = defaultdict(float)
        # initialization
        if k == 0 and u == 'START' and v == 'START':
            return (1., 'START')
        else:
            for w in findSet(k - 2, u):
                prev = pi_viterbi(k - 1, w, u, sentence)[0]
                # tuple((w,u,v))
                q = cpd_tags(w, u, v)
                e = cpd_tagwords(sentence[k - 1].lower(), v)
                probability = float(prev) * q * float(e)
                prob[tuple((w, u))] = probability
            # here is the end of the bad sequence
            max_tuple = max(prob.items(), key=lambda x: x[1])
            # print (max_tuple[1], max_tuple[0][0])
            return max_tuple[1], max_tuple[0][0]
    # inner function to compute pi values -- end

    sentence_with_tag = list()
    backpointer = defaultdict(str)
    tags = defaultdict(str)
    k = len(sentence)
    u_glob = ''
    v_glob = ''
    glob = 0.
    for i in range(1, k + 1):
        prob = defaultdict(float)
        # for the current word we check all the tags
        # (changed from: for u in findSet(i - 1))
        for u in findSet(i, 'ALL'):
            # going backwards, we call findSet with u so it gives us only
            # tags v that go together a lot with u (this is the pruning)
            # (changed from: for v in findSet(i))
            for v in findSet(i - 1, u_glob):
                # switched u and v
                value, w = pi_viterbi(i, v, u, sentence)  # the v recursion in the algorithm
                prob[tuple((i, u, v))] = value
                backpointer[tuple((i, u, v))] = w  # bp from the algorithm
        max_tuple = max(prob.items(), key=lambda x: x[1])
        backpointer[tuple((i, max_tuple[0][1], max_tuple[0][-1]))] = max_tuple[0][1]  # bp (k,u,v) = tag w
        # sentence_with_tag.append(max_tuple[0][-1])
        u_glob = max_tuple[0][-2]
        v_glob = max_tuple[0][-1]
        glob = max_tuple[1]
        print('Max', max_tuple)
    tags[k - 1] = u_glob
    tags[k] = v_glob
    for i in range((k - 2), 0, -1):
        tag = backpointer[tuple(((i + 2), tags[i + 1], tags[i + 2]))]
        tags[i] = tag
    tag_list = list()
    for i in range(1, len(tags) + 1):
        tag_list.append(tags[i])
    file = open(sys.argv[4], 'w')
    file.truncate()
    for word in tag_list:
        file.write(word)
    # tag list as results
    return tag_list

file = open(sys.argv[1], "r+")
fQ = open(sys.argv[2], 'r')
qmle = fQ.read()
fQ.close()
f = open("tags.txt", 'r+')
tags = f.read()
f.close()
fe = open(sys.argv[3], 'r')
emle = fe.read()
distinct_tags = set()
# what is the list of all tags?
for word in tags.split():
    distinct_tags.add(word)
sentence = []
sentence1 = []
sentence1 = file.read()
sentence = sentence1.split()
file.close()
file = open(sys.argv[4], 'w')
file.truncate()
viterbi(sentence, distinct_tags)
How can I reduce the time complexity?
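The exponential blow-up comes from pi_viterbi recursing into every (k - 1, w, u) subproblem without remembering results, so identical subproblems are recomputed many times over. A minimal sketch of the standard memoization fix, reusing the helper names from the code above (the cache itself is an addition):
pi_cache = {}  # (k, u, v) -> (best probability, best preceding tag w)

def pi_viterbi_cached(k, u, v, sentence):
    key = (k, u, v)
    if key in pi_cache:
        return pi_cache[key]  # each subproblem is solved only once
    if k == 0 and u == 'START' and v == 'START':
        result = (1.0, 'START')
    else:
        candidates = {}
        for w in findSet(k - 2, u):
            prev = pi_viterbi_cached(k - 1, w, u, sentence)[0]
            q = cpd_tags(w, u, v)
            e = cpd_tagwords(sentence[k - 1].lower(), v)
            candidates[w] = float(prev) * float(q) * float(e)
        best_w = max(candidates, key=candidates.get)
        result = (candidates[best_w], best_w)
    pi_cache[key] = result
    return result
With the cache in place each (k, u, v) triple is computed once, so the run time drops from exponential to roughly O(n * |tags|^3) for an n-word sentence (clear the cache between sentences).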

Sequence match using Python

I am working on RNA sequence matching
seq = 'UCAGCUGUCAGUCAUGAUC'
sub_seq = ['UGUCAG', 'CAGUCA', 'UCAGCU', 'GAUC']
I am matching each sub_seq to seq; a matched sub_seq is printed under seq, and unmatched positions are filled with dashes. The output looks like this:
UCAGCUGUCAGUCAUGAUC
UCAGCU--CAGUCA-GAUC
-----UGUCAG--------
I tried to use a dictionary to do this:
index_dict = {}
for i in xrange(len(sub_seq)):
    index_dict[seq.find(sub_seq[i])] = {}
    index_dict[seq.find(sub_seq[i])]['sequence'] = sub_seq[i]
    index_dict[seq.find(sub_seq[i])]['end_index'] = seq.find(sub_seq[i]) + len(sub_seq[i]) - 1
I cannot figure out the algorithm to do alignment, any help will be appreciated!
seq_l = len(seq)
for ele in sub_seq:
    start = seq.find(ele)
    ln = len(ele)
    if start != -1:
        end = start + ln
        print("-" * start + ele + "-" * (seq_l - end))
    else:
        print("-" * seq_l)
-----UGUCAG--------
--------CAGUCA-----
UCAGCU-------------
---------------GAUC
Not sure where UCAGCU--CAGUCA-GAUC comes from, as you are only using a single subsequence at a time in your code.
Assuming you'll let me change your index_dict slightly, consider:
seq = 'UCAGCUGUCAGUCAUGAUC'
sub_seq = ['UGUCAG', 'CAGUCA', 'UCAGCU', 'GAUC']

index_dict = {}
for i in xrange(len(sub_seq)):
    index_dict[seq.find(sub_seq[i])] = {
        'sequence': sub_seq[i],
        'end_index': seq.find(sub_seq[i]) + len(sub_seq[i])  # Note this changed
    }

sorted_keys = sorted(index_dict)
lines = []
while True:
    if not sorted_keys: break
    line = []
    next_index = 0
    for k in sorted_keys:
        if k >= next_index:
            line.append(k)
            next_index = index_dict[k]['end_index']
    # Remove keys we used, append line to lines
    for k in line: sorted_keys.remove(k)
    lines.append(line)

# Build output lines
olines = []
for line in lines:
    oline = ''
    for k in line:
        oline += '-' * (k - len(oline))     # Add dashes before subseq
        oline += index_dict[k]['sequence']  # Add subsequence
    oline += '-' * (len(seq) - len(oline))  # Add trailing dashes
    olines.append(oline)

print seq
print '\n'.join(olines)
Output:
UCAGCUGUCAGUCAUGAUC
UCAGCU--CAGUCA-GAUC
-----UGUCAG--------
Note this is pretty verbose, and could be condensed a bit. The while True and for line in lines loops could probably be merged into one, but it should help explain one possible approach.
Edit: This is one way you might join the last two loops:
seq = 'UCAGCUGUCAGUCAUGAUC'
sub_seq = ['UGUCAG', 'CAGUCA', 'UCAGCU', 'GAUC']

index_dict = {}
for i in xrange(len(sub_seq)):
    index_dict[seq.find(sub_seq[i])] = {
        'sequence': sub_seq[i],
        'end_index': seq.find(sub_seq[i]) + len(sub_seq[i])  # Note this changed
    }

sorted_keys = sorted(index_dict)
lines = []
while True:
    if not sorted_keys: break
    line = ''
    next_index = 0
    keys_used = []
    for k in sorted_keys:
        if k >= next_index:
            line += '-' * (k - len(line))            # Add dashes before subseq
            line += index_dict[k]['sequence']        # Add subsequence
            next_index = index_dict[k]['end_index']  # Update next_index
            keys_used.append(k)                      # Mark key as used
    for k in keys_used: sorted_keys.remove(k)        # Remove used keys
    line += '-' * (len(seq) - len(line))             # Add trailing dashes
    lines.append(line)                               # Add line to lines

print seq
print '\n'.join(lines)
Output:
UCAGCUGUCAGUCAUGAUC
UCAGCU--CAGUCA-GAUC
-----UGUCAG--------

Multidimensional arrays - PYTHON

I have problems with the array indexes in Python.
At function readfile it crashes and prints: "list index out of range".
inputarr = []

def readfile(filename):
    lines = readlines(filename)
    with open(filename, 'r') as f:
        i = 0
        j = 0
        k = 0
        for line in f:
            line = line.rstrip("\n")
            if not line == '':
                inputarr[j][k] = line
                k += 1
                #print("\tnew entry\tj=%d\tk=%d" % (j, k))
            elif line == '':
                k = 0
                j += 1
                #print("new block!\tj=%d\tk=%d" % (j, k))
            i += 1
        processing(i, lines)
This error is due to you trying to assign to an index of inputarr that is outside the bounds of the list. This causes an error in Python (unlike some other languages, such as JavaScript, which automatically extend an array if you try to access an index outside its initial bounds).
You need to either pre-fill inputarr so it has the right shape and size, or dynamically create it as you go. I prefer the latter:
inputarr = [[]]
# ^^ Set up the first row

def readfile(filename):
    lines = readlines(filename)
    with open(filename, 'r') as f:
        i = 0
        j = 0
        k = 0
        for line in f:
            line = line.rstrip("\n")
            if not line == '':
                inputarr[j].append(line)
                # ^^^^^^^^ Add a new value to the end of the current row of inputarr
                k += 1
                #print("\tnew entry\tj=%d\tk=%d" % (j, k))
            elif line == '':
                k = 0
                inputarr.append([])
                # ^^^^^^^^^^^^^^^^^^^ Add a new blank row to inputarr
                j += 1
                #print("new block!\tj=%d\tk=%d" % (j, k))
            i += 1
        processing(i, lines)
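For completeness, the pre-fill alternative mentioned above would look something like this (a sketch only; the dimensions are hypothetical and would need to be known up front):
n_blocks, n_entries = 10, 20   # hypothetical known dimensions
inputarr = [['' for _ in range(n_entries)] for _ in range(n_blocks)]
inputarr[0][0] = 'first line'  # indexed assignment now works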
It happens because inputarr is empty. For example:
lst = []
lst[0] = 1  # IndexError
In your case:
inputarr = []
j = 0
...
inputarr[j][k] = line  # inputarr = []; j = 0; so inputarr[0] raises an IndexError
