Multidimensional arrays - PYTHON - python

I have a problem with list indexes in Python.
The function readfile crashes and prints: "list index out of range"
inputarr = []


def readfile(filename):
    """Try to store the lines of ``filename`` in ``inputarr`` as blocks of lines.

    A blank line starts a new block (``j``) and resets the position inside the
    block (``k``). This is the failing version from the question:
    ``inputarr`` is an empty list, so the first ``inputarr[j][k] = line``
    raises ``IndexError``.

    ``readlines`` and ``processing`` are not defined in this snippet.
    """
    lines = readlines(filename)
    with open(filename, 'r') as f:
        i = 0
        j = 0
        k = 0
        for line in f:
            line = line.rstrip("\n")
            if line != '':
                inputarr[j][k] = line
                k += 1
                #print("\tnew entry\tj=%d\tk=%d" % (j, k))
            else:
                k = 0
                j += 1
                #print("new block!\tj=%d\tk=%d" % (j, k))
            i += 1
    processing(i, lines)

This error is due to you trying to assign to an index of inputarr that is outside the bounds of the list. This causes an error in python (unlike some other languages like javascript which automatically extend an array if you try to access an index that is outside the initial bounds of the array).
You need to either pre-fill inputarr so it has the right shape and size, or you need to dynamically create it as you go. I prefer the latter:
# Set up the first (still empty) row.
inputarr = [[]]


def readfile(filename):
    """Store the lines of ``filename`` in ``inputarr`` as blocks of lines.

    Non-empty lines are appended to the current row; a blank line appends a
    fresh empty row and makes it the current one.

    ``readlines`` and ``processing`` are not defined in this snippet.
    """
    lines = readlines(filename)
    with open(filename, 'r') as f:
        i = 0
        j = 0
        k = 0
        for line in f:
            line = line.rstrip("\n")
            if line:
                # Add a new value to the end of the current row of inputarr.
                inputarr[j].append(line)
                k += 1
                #print("\tnew entry\tj=%d\tk=%d" % (j, k))
            else:
                k = 0
                # Add a new blank row to inputarr.
                inputarr.append([])
                j += 1
                #print("new block!\tj=%d\tk=%d" % (j, k))
            i += 1
    processing(i, lines)

It happens because inputarr is empty. For example:
lst = []
lst[0] = 1  # IndexError: list assignment index out of range
In your case:
inputarr = []
j = 0
...
inputarr[j][k] = line  # inputarr == [] and j == 0, so inputarr[0] does not exist -> IndexError

Related

Python code for optimal alignment score and sequence giving wrong result

This is my first time coding, so please do understand my code is very messy. I have done two different ways to get the optimal score and the optimal sequence, unfortunately both of my answers are wrong. In my code I have included a way to open a fasta file, but since this seemed to not work I also just included the sequences in the code myself.
My optimal score is computed but, for some reason, not printed. It is also wrong: I get 208 when I should get 275. I also don't get a correct alignment score back.
The two sequences are
The alignment scoring needs to be as follows: 11 for internal gaps, 8 for terminal gaps on the 5' end, 7 for terminal gaps on the 3' end, 4 for mismatches, and 0 for matches.
My file is at [removed link]
def _write_and_read_fasta(path, text):
    """Write ``text`` to ``path`` and read it back as ``(header, sequence)``.

    The header is the last line containing ``>``; the sequence is every other
    line concatenated without newlines.
    """
    # Close the file before reading it back: while the write handle is still
    # open the text may sit in its buffer, and the reader then sees no data.
    with open(path, "w") as handle:
        handle.write(text)

    header = ""
    chunks = []
    with open(path, "r") as handle:
        for line in handle:
            line = line.rstrip("\n")
            if ">" in line:
                header = line
            else:
                chunks.append(line)
    return header, "".join(chunks)


header, seqA = _write_and_read_fasta("one.fasta", """>Testseq1
TCTGGTGTCCTAGGCGTAGAGGAACCACACCAATCCATCCCGAACTCTGGTGGTTAAACTCTACTGCGGTGACGATACT""")
header, seqB = _write_and_read_fasta("two.fasta", """>Testseq2
TGGTGCGGTCATACCAGCGCTAATGCACCGGATCCCATCAGAACTCCGCAGTTAAGCGCGCTTGGGCCAGAACAGTACTGGGATGGGTGTCC""")
alphabet = ["A", "C", "G", "T"]
# Cost table (lower is better). Columns 0..3 are the bases A, C, G, T.
#   row 0     : base opposite a terminal gap on the 5' end (8)
#   rows 1..4 : base A/C/G/T opposite a base (0 match, 4 mismatch);
#               the last column is the cost of an internal gap (11)
#   row 5     : base opposite a terminal gap on the 3' end (7)
score = [[8, 8, 8, 8, 8],
         [0, 4, 4, 4, 11],
         [4, 0, 4, 4, 11],
         [4, 4, 0, 4, 11],
         [4, 4, 4, 0, 11],
         [7, 7, 7, 7, 7]]


def Global(a, b):
    """Return the minimum global alignment cost of sequences ``a`` and ``b``.

    Costs come from ``score``: 0 for a match, 4 for a mismatch, 8 per gap
    before the first character of the other sequence (5' end), 7 per gap
    after its last character (3' end) and 11 per gap in between. When the
    other sequence is empty its gaps are charged as 5' gaps.

    Raises ``ValueError`` if a sequence contains a character that is not in
    ``alphabet``.
    """
    n, m = len(a), len(b)

    def gap_cost(base, consumed, total):
        # ``consumed`` characters of the *other* sequence (length ``total``)
        # are already aligned when ``base`` is placed opposite a gap.
        col = alphabet.index(base)
        if consumed == 0:
            return score[0][col]
        if consumed == total:
            return score[-1][col]
        return score[col + 1][-1]

    D = [[0] * (m + 1) for _ in range(n + 1)]
    # Start the border loops at 1: starting at 0 reads D[-1] and a[-1],
    # which silently wraps around to the other end of the lists.
    for i in range(1, n + 1):
        D[i][0] = D[i - 1][0] + gap_cost(a[i - 1], 0, m)
    for j in range(1, m + 1):
        D[0][j] = D[0][j - 1] + gap_cost(b[j - 1], 0, n)

    for i in range(1, n + 1):
        # Base rows of ``score`` are shifted down by one (row 0 is the 5' gap).
        base_row = score[alphabet.index(a[i - 1]) + 1]
        for j in range(1, m + 1):
            distHor = D[i][j - 1] + gap_cost(b[j - 1], i, n)  # gap in a
            distVer = D[i - 1][j] + gap_cost(a[i - 1], j, m)  # gap in b
            distDiag = D[i - 1][j - 1] + base_row[alphabet.index(b[j - 1])]
            D[i][j] = min(distHor, distVer, distDiag)
    return D[-1][-1]
seqA = "TCTGGTGTCCTAGGCGTAGAGGAACCACACCAATCCATCCCGAACTCTGGTGGTTAAACTCTACTGCGGTGACGATACT"
seqB = "TGGTGCGGTCATACCAGCGCTAATGCACCGGATCCCATCAGAACTCCGCAGTTAAGCGCGCTTGGGCCAGAACAGTACTGGGATGGGTGTCC"
row = len(seqA) + 1
column = len(seqB) + 1
match = 0
mismatch = 4
gap = 11

# matrix[i][j] is [cost, move]: the cheapest cost of aligning seqA[:i] with
# seqB[:j] and the move ("hor", "ver" or "diag") that reaches the cell.
matrix = [[[None, None] for _ in range(column)] for _ in range(row)]
matrix[0][0][0] = 0
for j in range(1, column):
    matrix[0][j] = [gap * j, "hor"]
for i in range(1, row):
    matrix[i][0] = [gap * i, "ver"]

for i in range(1, row):
    for j in range(1, column):
        step = match if seqA[i - 1] == seqB[j - 1] else mismatch
        candidates = (
            (matrix[i - 1][j - 1][0] + step, "diag"),
            (matrix[i - 1][j][0] + gap, "ver"),
            (matrix[i][j - 1][0] + gap, "hor"),
        )
        # The values are costs, so the best move is the minimum, not the
        # maximum. Comparing on the cost only makes ties resolve in the
        # listed order: diag, then ver, then hor.
        matrix[i][j] = list(min(candidates, key=lambda c: c[0]))

# Trace back from the bottom-right cell to the origin, collecting the
# aligned characters in reverse order.
i, j = len(seqA), len(seqB)
top = []
bottom = []
while i > 0 or j > 0:
    move = matrix[i][j][1]
    if move == "diag":
        i -= 1
        j -= 1
        top.append(seqA[i])
        bottom.append(seqB[j])
    elif move == "ver":
        i -= 1
        top.append(seqA[i])
        bottom.append("-")
    else:
        j -= 1
        top.append("-")
        bottom.append(seqB[j])
align1 = "".join(reversed(top))
align2 = "".join(reversed(bottom))
print(align1)
print(align2)
# Global() only returns the score; print it so it appears in the output.
print(Global(seqA, seqB))
Please can anyone guide me on what I am doing wrong?

Problem extracting content from text files using Python

I am trying to capture the data in the second table (Field crops) titled "Prices Received, United States, July 2010, with Comparisons". I am using pandas DataFrames to capture the table from the text file, and then I will output it to a CSV file.
My code is as follows
def find_no_line_start_table(table_title, splited_data):
    """Return the indices of all lines in ``splited_data`` that contain ``table_title``."""
    return [
        position
        for position, text in enumerate(splited_data)
        if table_title in text
    ]
def get_start_data_table(table_start, splited_data):
    """Return the index of the first line at or after ``table_start`` containing 'Dollars'.

    Returns ``None`` when no such line exists.
    """
    remaining = splited_data[table_start:]
    for position, text in enumerate(remaining, start=table_start):
        if 'Dollars' in text:
            return position
    return None
def get_end_table(start_table_data, splited_data):
    """Return the index of the first line at or after ``start_table_data`` containing ``END_TABLE_LINE``.

    ``END_TABLE_LINE`` is read from module scope. Returns ``None`` when no
    such line exists.
    """
    remaining = splited_data[start_table_data:]
    for position, text in enumerate(remaining, start=start_table_data):
        if END_TABLE_LINE in text:
            return position
    return None
def row(l):
    """Split one table line into six columns.

    Words up to and including the first word containing ':' are joined (each
    preceded by a space) into column 0; every later word goes into the next
    column. Returns ``None`` for lines with fewer than six words.

    Raises ``IndexError`` when more than five words follow the first-column
    words; this is the failure the question is about.
    """
    l = l.split()
    number_columns = 6
    if len(l) < number_columns:
        return None
    data_row = [''] * number_columns
    first_column_done = False
    index = 0
    for w in l:
        if first_column_done:
            index += 1
            data_row[index] = w
            continue
        data_row[0] = ' '.join([data_row[0], w])
        if ':' in w:
            first_column_done = True
    return data_row
def take_table(txt_data):
    """Collect the parsed rows of the module-level ``table`` into a dict of columns.

    Each line is parsed with ``row``; lines for which it returns a falsy
    value are skipped. The ``txt_data`` argument is not used, and the sixth
    column is collected but not included in the returned dict.
    """
    comodity, q, w, e, t, p = [], [], [], [], [], []
    for r in table:
        data_row = row(r)
        if not data_row:
            continue
        col_1, col_2, col_3, col_4, col_5, col_6 = data_row
        comodity.append(col_1)
        q.append(col_2)
        w.append(col_3)
        e.append(col_4)
        t.append(col_5)
        p.append(col_6)
    return {'comodity': comodity, 'q': q,
            'w': w, 'e': e, 't': t}
And, then I am doing this:
import requests
import pandas as pd
# Download the report as text and split it into lines.
txt_data = requests.get("https://downloads.usda.library.cornell.edu/usda-esmis/files/c821gj76b/6w924d00c/9z903130m/AgriPric-07-30-2010.txt").text
splited_data = txt_data.split('\n')
# Substring that identifies the title line of the wanted table.
table_title = 'Prices Received, United States'
# Substring that get_end_table looks for to find the end of the table.
END_TABLE_LINE = '-------------------------------------------'
# The three-way unpacking requires exactly three title matches; only the
# second one is kept.
_, table_start,_ = find_no_line_start_table(table_title,splited_data)
start_line = get_start_data_table(table_start, splited_data)
end_line = get_end_table(start_line, splited_data)
# take_table reads this module-level name, not the argument it is given.
table = splited_data[start_line : end_line]
dict_table = take_table(txt_data)
# The result of this expression is discarded; the frame is kept in `c` below.
pd.DataFrame(dict_table)
c = pd.DataFrame(dict_table)
IndexError: list assignment index out of range
However, I am getting an error here. Can anyone help me figure out what I am doing wrong?
Cause of error:
data_row is a list of 6 elements.
number_columns = 6
# ...
data_row = [''] * number_columns # [''] * 6
index is incremented on every iteration for which first_column_done is True, and first_column_done becomes True as soon as a word containing : is encountered, i.e.
if ':' in w:
first_column_done = True
Hence, on every iteration after first_column_done turns True, index is incremented, and once it exceeds 5 (the last valid index of the 6-element list data_row) the assignment fails.
def row(l):
    """Copy of the question's ``row`` with the failing line marked.

    This copy stops after the loop and has no ``return`` statement.
    """
    l = l.split()
    number_columns = 6
    if len(l) >= number_columns:
        data_row = [''] * number_columns
        first_column_done = False
        index = 0
        for w in l:
            if first_column_done:
                index += 1
                data_row[index] = w  # error pos.
            else:
                data_row[0] = ' '.join([data_row[0], w])
                if ':' in w:
                    first_column_done = True
In other words, you get this error for every line that has more than 5 words after the first word containing :.
Fix:
Use split(':') and a list comprehension together with Python's conditional (ternary) expression.
def row(l):
    """Split a ``name : unit : v1 v2 ...`` line into exactly six columns.

    The line is split on ':'; the first two fields are kept as they are
    (stripped) and the third field is split on whitespace. The result is
    padded with '' or truncated so that it always has six entries.

    Returns ``None`` for lines with fewer than two ':' separators, which do
    not have a third field to split (indexing it would raise ``IndexError``).
    """
    parts = [col.strip() for col in l.split(':')]
    if len(parts) < 3:
        return None
    fields = parts[:2] + parts[2].split()
    fields += [''] * (6 - len(fields))
    return fields[:6]

Get values form a txt, increase them save them in the same txt

I'm trying to read some values from a text file, increase them (everything works up to here) and then write the new values back to the file, but I can't write them to the file even though I've converted the variable to a string.
the code is
# Read comma-separated integers from the first two lines of setup.txt,
# increase them, then write minL back to the same file.
with open("setup.txt", "r") as f:
for i, line in enumerate(f):
# NOTE(review): this rebinds the name `str` to a list, shadowing the built-in
# str() that is called further down.
str = line.split(",")
if i == 0:
minL = int(str[0])
maxL = int(str[1])
minL += 2
maxL += 2
elif i == 1:
minF = int(str[0])
maxF = int(str[1])
minF += 1
maxF += 1
# Convert to text so that it can be passed to f.write().
minL = str(minL)
with open("setup.txt", "w") as f:
f.write(minL)
# NOTE(review): redundant if this line sits inside the `with` block, which
# closes the file on exit; the original indentation was lost, TODO confirm.
f.close()
the txt is just:
15, 25
2, 9
EDIT*********
Sorry I just made a mistake when i copied the code, I've already put "w" for the writing mode but this doesn't work
the error is
line 15, in <module>
minL = str(minL)
NameError: name 'minL' is not defined
but I defined minL
Is 17 your expected output?
I copied and modified your code:
# Read the two settings lines and increase the values they hold.
with open("setup.txt", "r") as f:
    for i, line in enumerate(f):
        stri = line.split(",")
        if i == 0:
            minL = int(stri[0]) + 2
            maxL = int(stri[1]) + 2
        elif i == 1:
            minF = int(stri[0]) + 1
            maxF = int(stri[1]) + 1

# Only minL is written back; the file is closed when the with block ends.
minL = str(minL)
with open("setup.txt", "w") as f:
    f.write(minL)
I got an indentation error (which I fixed) and changed str = line.split(",") to stri = line.split(","), so that the built-in str is no longer shadowed.

Python: Count kmers from fasta files

I want to count the kmers from a fasta file. I have the following script:
import operator
# Read the sequence text: len() and slicing need a string, not the file
# object. Header lines (starting with '>') are skipped and the remaining
# lines are joined without newlines.
with open('file', 'r') as handle:
    seq = "".join(line.strip() for line in handle if not line.startswith(">"))

kmers = {}
k = 5
# Count every window of length k.
for i in range(len(seq) - k + 1):
    kmer = seq[i:i + k]
    kmers[kmer] = kmers.get(kmer, 0) + 1

for kmer, count in kmers.items():
    print(kmer + "\t" + str(count))

# Only the module `operator` is imported, so itemgetter must be qualified.
sortedKmer = sorted(kmers.items(), key=operator.itemgetter(1), reverse=True)
for item in sortedKmer:
    print(item[0] + "\t" + str(item[1]))
This works fine for a file with only one sequence, but now I have a fasta file with several contigs.
My fasta file looks like this:
>1
GTCTTCCGGCGAGCGGGCTTTTCACCCGCTTTATCGTTACTTATGTCAGCATTCGCACTT
CTGATACCTCCAGCAACCCTCACAGGCCACCTTCGCAGGCTTACAGAACGCTCCCCTACC
CAACAACGCATAAACGTCGCTGCCGCAGCTTCGGTGCATGGTTTAGCCCCGTTACATCTT
CCGCGCAGGCCGACTCGACCAGTGAGCTATTACGCTTTCTTTAAATGATGGCTGCTTCTA
AGCCAACATCCTGGCTGTCTGG
>2
AAAGAAAGCGTAATAGCTCACTGGTCGAGTCGGCCTGCGCGGAAGATGTAACGGGGCTAA
ACCATGCACCGAAGCTGCGGCAGCGACACTCAGGTGTTGTTGGGTAGGGGAGCGTTCTGT
AAGCCTGTGAAGGTGGCCTGTGAGGGTTGCTGGAGGTATCAGAAGTGCGAATGCTGACAT
AAGTAACGATAAAGCGGGTGAAAAGCCCGCTCGCCGGAAGACCAAGGGTTCCTGTCCAAC
GTTAATCGGGGCAGG
How can I change the script that it take first the sequence after ">1", print that output, go to ">2", print that output etc?
I have never heard about kmer or fasta, but I think I understand what you are trying to do.
You can try to split on a regex involving '>', but I would recommend processing the file line by line and accumulate kmers before printing them appropriately when reaching the '>1'-lines. See below code with comments
import operator
def printSeq(name, seq, k=5):
    """Print the k-mer counts of ``seq`` under a header line for ``name``.

    The counts are printed twice: first in order of first occurrence, then
    sorted by count, highest first.

    ``k`` is the k-mer length; a sequence shorter than ``k`` prints only the
    header.
    """
    print("%s\n################################" %name)
    kmers = {}
    for i in range(len(seq) - k + 1):
        kmer = seq[i:i + k]
        kmers[kmer] = kmers.get(kmer, 0) + 1
    for kmer, count in kmers.items():
        print(kmer + "\t" + str(count))
    # Sort on the count (item 1); without a key the pairs are ordered by the
    # k-mer text instead.
    sortedKmer = sorted(kmers.items(), key=operator.itemgetter(1), reverse=True)
    for item in sortedKmer:
        print(item[0] + "\t" + str(item[1]))
with open('file', 'r') as f:
    seq = ""
    key = ""
    # Loop over the lines in the file.
    for line in f:
        if not line.startswith(">"):
            # Accumulate the sequence until we hit another '>'.
            seq += line.strip()
            continue
        # A '>' line starts a new sequence; print the previous one first,
        # unless this is the first header.
        if key and seq:
            printSeq(key, seq)
        # Store the name after '>' and reset the sequence.
        key = line[1:].strip()
        seq = ""
    # When all lines are read, print the last sequence.
    printSeq(key, seq)
I tried the following with your example FASTA file and it should work:
def count_kmers(seq, k, kmers):
    """Add the count of every length-``k`` window of ``seq`` to the dict ``kmers``.

    ``kmers`` is updated in place; nothing is returned.
    """
    window_count = len(seq) - k + 1
    for start in range(window_count):
        window = seq[start:start + k]
        kmers[window] = kmers.get(window, 0) + 1
# Python 3: input() returns text, so convert k explicitly instead of relying
# on Python 2's evaluating input().
filename = input('File name/path: ')
k = int(input('Value for k: '))
kmers = {}

# Put each line of the file into a list (avoid empty lines)
with open(filename) as f:
    lines = [l.strip() for l in f if l.strip() != '']

# Find the line indices where a new sequence starts
idx = [i for (i, l) in enumerate(lines) if l[0] == '>']
idx += [len(lines)]

# Each sequence runs from the line after its header up to the next header.
# All sequences are counted into the same dict.
for i in range(len(idx) - 1):
    start = idx[i] + 1
    stop = idx[i + 1]
    sequence = ''.join(lines[start:stop])
    count_kmers(sequence, k, kmers)
print(kmers)
Hope it helps :)

Sequence match using Python

I am working on RNA sequence matching
seq = 'UCAGCUGUCAGUCAUGAUC'
sub_seq =['UGUCAG', 'CAGUCA', 'UCAGCU','GAUC']
I am matching the sub_seq to the seq, matched sub_seq is under the seq, if there is no matched, use dash line. Output looks like this:
UCAGCUGUCAGUCAUGAUC
UCAGCU--CAGUCA-GAUC
-----UGUCAG--------
I try to use the dictionary to do this
# Map the start index of each subsequence in seq to the subsequence and the
# index of its last character.
index_dict = {}
for fragment in sub_seq:
    start = seq.find(fragment)
    index_dict[start] = {}
    index_dict[start]['sequence'] = fragment
    index_dict[start]['end_index'] = start + len(fragment) - 1
I cannot figure out the algorithm to do alignment, any help will be appreciated!
# Print one line per subsequence: the subsequence at the position where it
# is found in seq, dashes everywhere else.
seq_l = len(seq)
for ele in sub_seq:
    start = seq.find(ele)
    ln = len(ele)
    if start == -1:
        # Not found: print a line of dashes only.
        print("-" * seq_l)
        continue
    end = start + ln
    print("-" * start + ele + "-" * (seq_l - end))
-----UGUCAG--------
--------CAGUCA-----
UCAGCU-------------
---------------GAUC
Not sure where UCAGCU--CAGUCA-GAUC comes from as you are only using a single sub sequence at a time in your code
Assuming you'll let me change your index_dict slightly, consider:
seq = 'UCAGCUGUCAGUCAUGAUC'
sub_seq =['UGUCAG', 'CAGUCA', 'UCAGCU','GAUC']

# Map each subsequence's start index in seq to the subsequence and the index
# just past its end.
index_dict = {}
for fragment in sub_seq:
    start = seq.find(fragment)
    if start == -1:
        # Not found: a key of -1 could never be placed on a line, so the
        # loop below would never run out of keys.
        continue
    index_dict[start] = {
        'sequence': fragment,
        'end_index': start + len(fragment)  # Note this changed
    }

sorted_keys = sorted(index_dict)
lines = []
while sorted_keys:
    # Greedily pick, left to right, every subsequence that starts at or
    # after the end of the previous pick.
    line = []
    next_index = 0
    for k in sorted_keys:
        if k >= next_index:
            line.append(k)
            next_index = index_dict[k]['end_index']
    # Remove keys we used, append line to lines
    for k in line:
        sorted_keys.remove(k)
    lines.append(line)

# Build output lines
olines = []
for line in lines:
    oline = ''
    for k in line:
        oline += '-' * (k - len(oline))  # Add dashes before subseq
        oline += index_dict[k]['sequence']  # Add subsequence
    oline += '-' * (len(seq) - len(oline))  # Add trailing dashes
    olines.append(oline)

print(seq)
print('\n'.join(olines))
Output:
UCAGCUGUCAGUCAUGAUC
UCAGCU--CAGUCA-GAUC
-----UGUCAG--------
Note this is pretty verbose, and could be condensed a bit. The while True and for line in lines loops could probably be merged into one, but it should help explain one possible approach.
Edit: This is one way you might join the last two loops:
seq = 'UCAGCUGUCAGUCAUGAUC'
sub_seq =['UGUCAG', 'CAGUCA', 'UCAGCU','GAUC']

# Map each subsequence's start index in seq to the subsequence and the index
# just past its end.
index_dict = {}
for fragment in sub_seq:
    start = seq.find(fragment)
    if start == -1:
        # Not found: a key of -1 could never be placed on a line, so the
        # loop below would never run out of keys.
        continue
    index_dict[start] = {
        'sequence': fragment,
        'end_index': start + len(fragment)  # Note this changed
    }

sorted_keys = sorted(index_dict)
lines = []
while sorted_keys:
    line = ''
    next_index = 0
    keys_used = []
    for k in sorted_keys:
        if k >= next_index:
            line += '-' * (k - len(line))  # Add dashes before subseq
            line += index_dict[k]['sequence']  # Add subsequence
            next_index = index_dict[k]['end_index']  # Update next_index
            keys_used.append(k)  # Mark key as used
    for k in keys_used:
        sorted_keys.remove(k)  # Remove used keys
    line += '-' * (len(seq) - len(line))  # Add trailing dashes
    lines.append(line)  # Add line to lines

print(seq)
print('\n'.join(lines))
Output:
UCAGCUGUCAGUCAUGAUC
UCAGCU--CAGUCA-GAUC
-----UGUCAG--------

Categories