Looking for resistance genes in water sample using kmers [Python] - python

I need some help with my code. I need to look for the presence of resistance genes in a water sample. In practice, that means I have a huge file of reads coming from the water sample and a file of resistance genes. My problem is getting the code to run in under 5 minutes, which is not happening right now. The issue probably lies in discarding reads as fast as possible, i.e. in having a smart method that analyzes only the meaningful reads. Do you have any suggestions? I cannot use any non-standard Python library.
This is my code
import time
def build_lyb(TargetFile):
    """Load a FASTA file into a dictionary of sequences.

    Parameters
    ----------
    TargetFile : str
        Path of the FASTA file to read.

    Returns
    -------
    dict
        Maps each header line (including its leading ``">"``, without the
        line ending) to the concatenation of the sequence lines below it.
        A header that occurs twice keeps only its last record.

    Raises
    ------
    ValueError
        If sequence data appears before the first header line.
    """
    chunks = {}
    header = None
    # ``with`` guarantees the handle is closed even if parsing fails.
    with open(TargetFile) as handle:
        for line in handle:
            # Strip only the line ending: slicing off the last character
            # would eat a real base when the final line has no newline.
            line = line.rstrip("\r\n")
            if line.startswith(">"):
                header = line
                chunks[header] = []
            elif line:
                if header is None:
                    raise ValueError(
                        "sequence data found before the first FASTA header"
                    )
                chunks[header].append(line)
    # Join once per record instead of growing a string line by line.
    return {name: "".join(parts) for name, parts in chunks.items()}
def build_kmers(sequence, k_size):
    """Return all overlapping substrings of length ``k_size`` of ``sequence``.

    Parameters
    ----------
    sequence : str
        Text to cut into k-mers.
    k_size : int
        Length of every k-mer; must be at least 1.

    Returns
    -------
    list of str
        The k-mers in order of their start position. Empty when
        ``sequence`` is shorter than ``k_size``.

    Raises
    ------
    ValueError
        If ``k_size`` is smaller than 1 (slicing would otherwise return
        empty or wrongly sized strings without any error).
    """
    if k_size < 1:
        raise ValueError("k_size must be a positive integer")
    return [sequence[i:i + k_size] for i in range(len(sequence) - k_size + 1)]
def calculation(kmers, g, gene_seqs=None):
    """Mark which positions of gene ``g`` are covered by the k-mers of a read.

    The k-mers are scanned in order. When one is found in the gene, the
    positions of its first occurrence are set to 1 and the scan jumps ahead
    by the k-mer length, so the next k-mer tested does not overlap it in the
    read. When it is not found, the scan moves on by one.

    Parameters
    ----------
    kmers : list of str
        K-mers of one read, in read order.
    g : hashable
        Key of the gene to compare against.
    gene_seqs : dict, optional
        Mapping from gene key to sequence. Defaults to the module-level
        ``genes`` dictionary, which is what existing two-argument calls use.

    Returns
    -------
    list of int
        One entry per gene position: 1 where a k-mer matched, else 0.
    """
    if gene_seqs is None:
        gene_seqs = genes
    gene = gene_seqs[g]
    matches = [0] * len(gene)
    k = 0
    while k < len(kmers):
        kmer = kmers[k]
        # A single find() answers both "is it there?" and "where?".
        pos = gene.find(kmer) if kmer else -1
        if pos == -1:
            k += 1
            continue
        size = len(kmer)
        # find() succeeded, so pos + size never runs past the gene end.
        matches[pos:pos + size] = [1] * size
        k += size
    return matches
def coverage(matches, g):
    """Return the percentage of positions of gene ``g`` that were hit.

    Parameters
    ----------
    matches : dict
        Maps gene keys to per-position hit counts.
    g : hashable
        Key of the gene to evaluate.

    Returns
    -------
    float
        ``100 * (positions with a count >= 1) / (number of positions)``,
        or ``0.0`` when the gene has no positions.
    """
    depths = matches[g]
    # The length comes from the argument itself, not from a global table, so
    # the function gives the right answer for any mapping it is handed.
    if not depths:
        return 0.0
    covered = sum(1 for depth in depths if depth >= 1)
    return covered / len(depths) * 100
# ---------------------------------------------------------------------------
# Script body: scan the reads file and report, for every resistance gene that
# a read matched, the gene sequence, its per-position hit depth and the
# percentage of the gene that was covered.
# ---------------------------------------------------------------------------
K_SIZE = 19        # k-mer length used for reads and genes
FIRST_PROBE = 20   # first read k-mer index examined by the read filter
LAST_PROBE = 41    # last read k-mer index examined by the read filter
MIN_SCORE = 56     # 18 + 2 * 19: two k-mer hits inside the probe window

st = time.time()
genes = build_lyb("resistance_genes.fsa")

# One set holding every k-mer of every gene. A read whose probe k-mers are
# all absent from this set cannot match any gene, so it is rejected with a
# handful of O(1) lookups instead of one substring search per gene.
# Trade-off: the set keeps all distinct gene k-mers in memory.
all_gene_kmers = set()
for seq in genes.values():
    all_gene_kmers.update(build_kmers(seq, K_SIZE))

res_genes = {}
Flag = False   # True while the current line is read sequence (after '#')
n_line = 0
with open('test2.txt', 'r') as infile:
    for line in infile:
        n_line += 1
        if line.startswith("+"):
            Flag = False
        if Flag:
            kmers = build_kmers(line.rstrip("\r\n"), K_SIZE)
            # Short reads have fewer k-mers than the probe window; clamp the
            # window instead of indexing past the end of the list.
            last_probe = min(LAST_PROBE, len(kmers) - 1)
            probes = kmers[FIRST_PROBE:last_probe + 1]
            if any(kmer in all_gene_kmers for kmer in probes):
                for g in genes:
                    counter = K_SIZE - 1
                    k = FIRST_PROBE
                    while k <= last_probe:
                        if kmers[k] in genes[g]:
                            counter += K_SIZE
                            k += K_SIZE
                        else:
                            k += 1
                    if counter >= MIN_SCORE:
                        print(n_line)
                        l1 = calculation(kmers, g)
                        if g in res_genes:
                            # Accumulate hit depth position by position.
                            res_genes[g] = [a + b for a, b in zip(l1, res_genes[g])]
                        else:
                            res_genes[g] = l1
        if line.startswith('#'):
            Flag = True

for g in res_genes:
    print(g)
    # Same text as printing each item followed by two spaces, but emitted
    # with one print call per row instead of one per character.
    print("".join(base + "  " for base in genes[g]))
    print("".join(str(depth) + "  " for depth in res_genes[g]))
    print(coverage(res_genes, g))
et = time.time()
elapsed_time = et - st
print("Execution time:", elapsed_time, "s")

Related

Program doesn't work after making exe file by pyinstaller

I made a Python program using numpy, sympy, sys which worked well in IDE. Here is the code...
import numpy as np
from sympy import *
import sys
# ---------------------------------------------------------------------------
# Reads an adjacency matrix from stdin (first line: size, then one row of
# space-separated 0/1 values per line), builds the formula AND(~x | ~y) over
# every pair (x, y) whose matrix entry is 1, simplifies it to DNF and prints,
# for each DNF term, the vertices that do not occur in that term.
# ---------------------------------------------------------------------------
import re

# stdin is copied to result.txt and read back, as in the original script.
with open("result.txt", 'w+') as f:
    for line in sys.stdin:
        f.write(line)
with open("result.txt", 'r') as f:
    data = [row.split(' ') for row in f.read().split("\n")]
print(data)
amount = int(data[0][0])
print(amount)

matrix = np.zeros((amount, amount))
for i in range(amount):
    for j in range(amount):
        matrix[i, j] = int(data[i + 1][j])

counter = 0
formula = 1
for i in range(amount):
    for j in range(amount):
        if matrix[i, j] == 1:
            x = symbols(str(i + 1))
            y = symbols(str(j + 1))
            if counter == 0:
                formula = (~x | ~y)
                counter += 1
            else:
                formula = formula & (~x | ~y)

formula_to_string = pycode(simplify_logic(formula, form='dnf', force=True))
massive_to_parse = formula_to_string.split("or")
for term in massive_to_parse:
    # Compare whole numbers, not substrings: a plain substring test would
    # treat vertex "1" as present whenever "10", "11", ... occurs in the term.
    mentioned = set(re.findall(r"\d+", term))
    print("{", end='')
    for k in range(1, amount + 1):
        if str(k) not in mentioned:
            print("V", k, sep='', end='')
    print("}-максимальное внутренне устойчивое множество")
Then I made an exe file using pyinstaller with this command...
pyinstaller main.py
but when I launch it, these errors appear.
How can I fix this problem? I need the exe file for university, so that it can be launched from another program written in C.

list assignment index out of range by code python?

I keep getting the following error:
IndexError: list assignment index out of range.
The error is raised on line 78.
This code is written to find DNA motifs (a bioinformatics task).
How can I fix this error?
Here is my code:
from math import log
class MotifMedianFinding(object):
    """Branch-and-bound search for a median string and a motif in DNA read from a FASTA file.

    NOTE(review): this is Python 2 code (print statements, ``xrange``,
    ``dict.iteritems`` and ``file.next``). As written it cannot run to
    completion: ``bypass`` and ``score`` are declared without ``self`` but
    are called through ``self``, ``bypass`` uses an undefined name ``a``,
    ``motif`` uses the undefined names ``s`` and ``Score``, and
    ``hamming_score`` is an empty stub.
    """
    def __init__(self, input_file):
        # NOTE(review): ``input_file`` is ignored; a hard-coded path is opened
        # instead and the handle is never closed.
        super(MotifMedianFinding, self).__init__()
        self.input_lines = open("C:\\Users\\A.Khassawneh\\Desktop\\fasta.txt")
    def output(self):
        """Read the sequences, then run ``median_string`` and ``motif`` on each."""
        # Entry point that drives both searches.
        sequences = {}
        for line in self.input_lines:
            if '>' in line:
                # Only the single line following a header is stored as the sequence.
                sequences[line] = self.input_lines.next()
        for label, seq in sequences.iteritems():
            print "DNA:" + seq + "\n\n\n\n\n"
            median = self.median_string(seq, 5,5, len(seq))
            # NOTE(review): the median string is passed as ``t``, which
            # ``motif`` compares against an integer level.
            self.motif(seq, median,5,len(seq))
    def median_string(self, dna, t, n, l):
        """Branch-and-bound search for the median string; prints and returns the best word."""
        # NOTE(review): ``l`` is len(seq) at the call site, while the vertex
        # list below has only five entries.
        start_pos = start_pos = [1,1,1,1,1]
        best_dist = 1000000000
        i = 1
        while i > 0:
            if i < l:
                prefix = str(start_pos)
                opt_dist = self.hamming_score(prefix, dna)
                if opt_dist > best_dist:
                    s,i = self.bypass(start_pos,i,l,4)
                else:
                    s,i = self.next_vertex(start_pos,i,l,4)
            else:
                word = str(s)
                if self.hamming_score(word, dna) < best_dist:
                    best_dist = self.hamming_score(word, dna)
                    bestword = word
                s,i = self.next_vertex(start_pos,i,l,4)
        print "Best Word: %s (tot_dis = %s)" % (bestword,best_dist)
        return bestword
    def motif(self, dna, t, n, l):
        """Branch-and-bound search for the motif; intended to print and return the best one."""
        start_pos = [1,1,1,1,1]
        best_score = 0
        i = 1
        # ``1 > 0`` is always true and the loop has no break, so the print and
        # return after it are only reached if the condition is changed.
        while 1 > 0:
            if i < t:
                # NOTE(review): ``Score`` and ``s`` are not defined anywhere in this class.
                opt_score = Score(s, i, dna) + (t-1) * l
                if opt_score < best_score:
                    start_pos, i = self.bypass(start_pos, i, t, n-l+1)
                else:
                    start_pos, i = self.next_vertex(start_pos, i, t, n-l+1)
            else:
                if self.score(start_pos, dna) > best_score:
                    best_score = self.score(start_pos)
                    best_motif = str(s)
                start_pos, i = self.next_vertex(start_pos, i, t, n-l+1)
        print "motif consensus string: %s (consensus_score = %s) " % (best_motif, best_score)
        print "motif positions/string s=(s1..st): %s" % ', '.join(start_pos)
        return best_motif
    def bypass(vertex, level, l, k):
        """Skip unnecessary calculations in the tree.

        NOTE(review): declared without ``self`` and reads/writes ``a``, which
        is not defined; presumably ``vertex`` was meant -- confirm.
        """
        j = level
        for ind in xrange(j,1,-1):
            if a[j] < k:
                a[j] = a[j] + 1
                return vertex, j
        return vertex, 0
    def next_vertex(self, vertex, level, L, k):
        """Traverse the search tree: return the next vertex and its level."""
        if level <L:
            # Raises IndexError when level + 1 is not a valid index of
            # ``vertex``; presumably the error reported in the question.
            vertex[level+1] = 1
            return vertex,level+1
        else:
            j = L
            # The loop tests vertex[ind] but updates vertex[j]; j never changes.
            for ind in xrange(j,1,-1):
                if vertex[ind] < k:
                    vertex[j] = vertex[j] + 1
                    return vertex, j
        return vertex, 0
    def score(start_pos):
        """Return the sum of the entries of ``start_pos``.

        NOTE(review): declared without ``self`` although it is called as
        ``self.score(...)``.
        """
        total = 0
        for i in start_pos:
            total += i
        return total
    def hamming_score(self, s, dna):
        # Stub: no implementation yet, so every call returns None.
        pass
# Build the finder and run both searches. The constructor receives
# 'HMP-part.fa' but opens its own hard-coded path instead.
motif_median = MotifMedianFinding('HMP-part.fa')
motif_median.output()
xrange(x,y) goes from x to y-1 (x, x+1.... y-1). In your code, it would have been fine to do xrange(1,j), because that wouldn't have included j. But if you swap it to xrange(j,1,-1), you go (j, j-1.... 2).
Basically, you probably need to change it to xrange(j-1,0,-1) depending on your intended range.

python - transposing row and column

This is my code. I am new to programming, so can you help me work out how to transpose the rows and columns properly?
def scytale_encrypt(plain_text, key):
    """Encrypt ``plain_text`` with a scytale (columnar transposition) cipher.

    Spaces and the characters ``, . ? ! : ; '`` are dropped, the remaining
    characters are written into rows of ``key`` characters, and the rows are
    read back column by column. Gaps in a short last row are padded with
    ``'z'`` when ``plain_text.islower()`` is true and with ``'Z'`` otherwise.
    The intermediate rows are printed, as in the original.

    Parameters
    ----------
    plain_text : str
        Text to encrypt.
    key : int
        Number of characters per row; must be at least 1.

    Returns
    -------
    str
        The cipher text.

    Raises
    ------
    ValueError
        If ``key`` is smaller than 1.
    """
    if key < 1:
        raise ValueError("key must be a positive integer")
    chars = [c for c in plain_text if c not in " ,.?!:;'"]
    # Integer ceiling division: no dependency on the ``math`` module.
    chunks = (len(chars) + key - 1) // key
    inters = [
        tuple(chars[start:start + key])
        for start in range(0, chunks * key, key)
    ]
    pad = 'z' if plain_text.islower() else 'Z'
    # Transpose: walk the columns, and inside each column walk the rows.
    cipher = [
        row[col] if col < len(row) else pad
        for col in range(key)
        for row in inters
    ]
    print(inters)
    return ''.join(cipher)
My input:
The plain text and key that I entered
Here is code & output:
# Transpose a ragged 2D list, padding missing cells with "".
# Every row is printed with a leading comma before each element.
aMyArray = [ ['a1', 'a2','a3','a4','a5'], ['b1','b2','b3'], ['c1','c2','c3']]

print("initial array:")
for row in aMyArray:
    sRec = "".join("," + cell for cell in row)
    print(sRec)

print("Finding max")
# The transposed table has one row per source column and vice versa.
iMaxCols = len(aMyArray)
iMaxRows = max((len(row) for row in aMyArray), default=0)
print("Max rows="+str(iMaxRows))
print("Max cols="+str(iMaxCols))

print("Creating a blank 2D list..")
print("transposing..")
# Build the padded transpose directly instead of filling a blank grid and
# catching IndexError for the cells that short rows do not have.
aMyArrayTrsp = [
    [row[i] if i < len(row) else "" for row in aMyArray]
    for i in range(iMaxRows)
]

print("Final output:")
for row in aMyArrayTrsp:
    sRec = "".join("," + cell for cell in row)
    print(sRec)
Here is the Output.
initial array:
,a1,a2,a3,a4,a5
,b1,b2,b3
,c1,c2,c3
Finding max
Max rows=5
Max cols=3
Creating a blank 2D list..
transposing..
Final output:
,a1,b1,c1
,a2,b2,c2
,a3,b3,c3
,a4,,
,a5,,

knapsack branch and bound wrong result

I have converted the code given at this link into a python version. The code is supposed to calculate the correct value of maximum value to be filled in knapsack of weight W. I have attached the code below:
#http://www.geeksforgeeks.org/branch-and-bound-set-2-implementation-of-01-knapsack/
from queue import Queue
class Node:
    """One node of the knapsack branch-and-bound search tree.

    All fields default to ``None`` so ``Node()`` keeps working; they may also
    be passed to the constructor so a node can be built in one expression.
    """

    def __init__(self, level=None, profit=None, bound=None, weight=None):
        self.level = level      # index of the last item decided on
        self.profit = profit    # total value of the items taken so far
        self.bound = bound      # upper bound on profit below this node
        self.weight = weight    # total weight of the items taken so far

    def __str__(self):
        return "Level: %s Profit: %s Bound: %s Weight: %s" % (
            self.level, self.profit, self.bound, self.weight)
def bound(node, n, W, items):
    """Return an upper bound on the profit reachable from ``node``.

    Starting from the node's profit and weight, whole items after
    ``node.level`` are added greedily while they fit; the first item that
    does not fit contributes the fraction that fills the remaining capacity.

    Parameters
    ----------
    node : object with ``level``, ``profit`` and ``weight`` attributes
    n : int
        Number of items.
    W : number
        Knapsack capacity.
    items : sequence
        Items with ``value`` and ``weight`` attributes. The greedy fill is
        only a valid bound if they are sorted by value/weight, descending.

    Returns
    -------
    number
        ``0`` when the node already weighs ``W`` or more, else the bound.
    """
    if node.weight >= W:
        return 0
    # Keep the exact values: truncating with int() gives wrong bounds as soon
    # as weights or profits are not whole numbers.
    profit_bound = node.profit
    totweight = node.weight
    j = node.level + 1
    while j < n and totweight + items[j].weight <= W:
        totweight += items[j].weight
        profit_bound += items[j].value
        j += 1
    if j < n:
        profit_bound += (W - totweight) * items[j].value / float(items[j].weight)
    return profit_bound
Q = Queue()


def KnapSackBranchNBound(weight, items, total_items):
    """Solve the 0/1 knapsack problem by breadth-first branch and bound.

    Parameters
    ----------
    weight : number
        Knapsack capacity.
    items : iterable
        Items with ``value`` and ``weight`` attributes.
    total_items : int
        Number of items.

    Returns
    -------
    number
        The best total value that fits into the knapsack.
    """
    # bound() fills greedily, so items must be ordered by value density.
    items = sorted(items, key=lambda x: x.value / float(x.weight), reverse=True)
    root = Node()
    root.level = -1
    root.profit = 0
    root.weight = 0
    Q.put(root)
    maxProfit = 0
    while not Q.empty():
        u = Q.get()
        if u.level == total_items - 1:
            continue
        level = u.level + 1

        # Child that takes the item. Each child is a new Node: the queue
        # stores references, so reusing one object would overwrite nodes
        # that are already queued.
        v = Node()
        v.level = level
        v.weight = u.weight + items[level].weight
        v.profit = u.profit + items[level].value
        if v.weight <= weight and v.profit > maxProfit:
            maxProfit = v.profit
        v.bound = bound(v, total_items, weight, items)
        if v.bound > maxProfit:
            Q.put(v)

        # Child that leaves the item out.
        v = Node()
        v.level = level
        v.weight = u.weight
        v.profit = u.profit
        v.bound = bound(v, total_items, weight, items)
        if v.bound > maxProfit:
            Q.put(v)
    return maxProfit
if __name__ == "__main__":
    # Input format of test.data: the first line is "<item count> <capacity>",
    # each following line is "<value> <weight>" for one item.
    from collections import namedtuple

    Item = namedtuple("Item", ['index', 'value', 'weight'])
    with open("test.data") as data_file:
        input_data = data_file.read()
    lines = input_data.split('\n')
    firstLine = lines[0].split()
    item_count = int(firstLine[0])
    capacity = int(firstLine[1])
    # print(...) with a single argument behaves the same on Python 2 and 3;
    # the ``queue`` import above this script already requires Python 3.
    print("running from main")
    items = []
    for i in range(1, item_count + 1):
        parts = lines[i].split()
        items.append(Item(i - 1, int(parts[0]), float(parts[1])))
    kbb = KnapSackBranchNBound(capacity, items, item_count)
    print(kbb)
The program is supposed to calculate value of 235 for following items inside file test.data:
5 10
40 2
50 3.14
100 1.98
95 5
30 3
The first line gives the number of items and the knapsack capacity. Each line below it gives the value and weight of one item. Items are made using a namedtuple and sorted according to value/weight. For this problem I am getting 135 instead of 235. What am I doing wrong here?
EDIT:
I have solved the problem of finding correct items based on branch and bound. If needed, one can check it here
The problem is that you're inserting multiple references to the same Node() object into your queue. The fix is to initialize two new v objects in each iteration of the while-loop as follows:
# Excerpt of the search loop. Each dequeued node u produces two children for
# the next item, one that takes it and one that leaves it out; each child is
# a separate Node object.
while not Q.empty():
    u = Q.get()
    v = Node() # Added line
    if u.level == -1:
        v.level = 0
    # u already sits on the last item: there is nothing left to branch on.
    if u.level == total_items - 1:
        continue
    # Child that takes item v.level.
    v.level = u.level + 1
    v.weight = u.weight + items[v.level].weight
    v.profit = u.profit + items[v.level].value
    if (v.weight <= weight and v.profit > maxProfit):
        maxProfit = v.profit;
    v.bound = bound(v, total_items, weight, items)
    if (v.bound > maxProfit):
        Q.put(v)
    # Child that leaves the item out: same level, parent's weight and profit.
    v = Node() # Added line
    v.level = u.level + 1 # Added line
    v.weight = u.weight
    v.profit = u.profit
    v.bound = bound(v, total_items, weight, items)
    if (v.bound > maxProfit):
        # print(items[v.level])
        Q.put(v)
Without these reinitializations, you're modifying the v object that you already inserted into the queue.
This is different from C++ where the Node objects are values that are implicitly copied into the queue to avoid aliasing problems such as these.

Parsing Data from live website in Python Enumerate problem!

The following script is supposed to fetch a specific line number and parse it from a live website. It works for like 30 loops but then it seems like enumerate(f) stops working correctly... the "i" in the for loop seems to stop at line 130 instead of like 200 something. Could this be due to the website I'm trying to fetch data from or something else? Thanks!!
import sgmllib
class MyParser(sgmllib.SGMLParser):
    "SGML parser that records div 'id' values and the text seen inside divs."
    # NOTE(review): the indentation was lost in this copy; the nesting inside
    # start_div and get_descriptions is reconstructed -- confirm.
    def parse(self, s):
        "Feed the string 's' to the parser, then close it."
        self.feed(s)
        self.close()
    def __init__(self, verbose=0):
        "Initialise an object, passing 'verbose' to the superclass."
        sgmllib.SGMLParser.__init__(self, verbose)
        self.divs = []                # values of the 'id' attribute of divs
        self.descriptions = []        # text collected while inside a div
        self.inside_div_element = 0   # 1 between start_div and end_div
    def start_div(self, attributes):
        "Record the 'id' attribute of an opening div and mark the parser as inside it."
        for name, value in attributes:
            if name == "id":
                self.divs.append(value)
                self.inside_div_element = 1
    def end_div(self):
        "Record the end of a div."
        self.inside_div_element = 0
    def handle_data(self, data):
        "Store the textual 'data' when it occurs inside a div."
        if self.inside_div_element:
            self.descriptions.append(data)
    def get_div(self):
        "Return the list of recorded div ids."
        return self.divs
    def get_descriptions(self, check):
        "Return the list of descriptions; when 'check' is 1, drop the oldest one first."
        if check == 1:
            self.descriptions.pop(0)
        return self.descriptions
    def rm_descriptions(self):
        "Remove the most recently added description."
        self.descriptions.pop()
import urllib
import linecache
import sgmllib
# Polling script (Python 2): requests "SITE" up to 1000 times, feeds line
# index 187 of every response to MyParser, and maintains the flat list
# `array`, to which a string and a 0 are appended whenever no entry matched.
# NOTE(review): the indentation of this script was lost, so the statements
# are left at column 0 and the nesting must be restored before it can run.
tempLine = ""
tempStr = " "
tempStr2 = ""
myparser = MyParser()
count = 0
user = ['']
oldUser = ['none']
oldoldUser = [' ']
array = [" ", 0]
index = 0
found = 0
k = 0
j = 0
posIndex = 0
a = 0
firstCheck = 0
fCheck = 0
while a < 1000:
print a
f = urllib.urlopen("SITE")
a = a+1
for i, line in enumerate(f):
# Only the line with index 187 is parsed.
if i == 187:
print i
tempLine = line
print line
myparser.parse(line)
if fCheck == 1:
# Identity test; `result` is set to 1 below when the two compare equal.
result = oldUser[0] is oldUser[1]
u1 = oldUser[0]
u2 = oldUser[1]
tempStr = oldUser[1]
if u1 == u2:
result = 1
else:
result = user is oldUser
fCheck = 1
user = myparser.get_descriptions(firstCheck)
tempStr = user[0]
firstCheck = 1
if result:
# Adding 0 leaves a numeric entry unchanged.
array[index+1] = array[index+1] +0
else:
j = 0
# Look for user[0] two slots ahead of j; on a match, increment the entry after it.
for z in array:
k = j+2
tempStr2 = user[0]
if k < len(array) and tempStr2 == array[k]:
array[j+3] = array[j+3] + 1
index = j+2
found = 1
break
j = j+1
if found == 0:
array.append(tempStr)
array.append(0)
oldUser = user
found = 0
print array
# Stop reading the response once the line index exceeds 200.
elif i > 200:
print "HERE"
break
print array
f.close()
Perhaps the number of lines on that web page are fewer than you think? What does this give you?:
# Prints the highest 0-based line index of the response, i.e. the number of
# lines minus one.
print max(i for i, _ in enumerate(urllib.urlopen("SITE")))
Aside: Your indentation is stuffed after the while a < 1000: line. Excessive empty lines and one-letter names don't assist the understanding of your code.
enumerate is not broken. Instead of such speculation, inspect your data. Suggestion: replace
for i, line in enumerate(f):
by
# Read the whole response first so the line count can be reported, then show
# every line with its index to see exactly what the server returned.
lines = list(f)
# The closing quote of the format string was missing. Single-argument
# print(...) behaves the same on Python 2 and Python 3.
print("=== a=%d linecount=%d ===" % (a, len(lines)))
for i, line in enumerate(lines):
    print(" a=%d i=%d line=%r" % (a, i, line))
Examine the output carefully.

Categories