Consensus sequence help in python

Consensus sequence help in python - python

I am having difficulty getting this scoring function to work. The objective of my program is to make a t x n matrix and find a consensus sequence.
I keep getting an error:
TypeError: 'int' object is not subscriptable.
Any help would be appreciated.
def Score(s, i, l, dna):
    """Return the consensus score of the first ``i`` aligned l-mers.

    Args:
        s: sequence of start offsets; ``s[j]`` is the start of the l-mer
            taken from ``dna[j]``.
        i: how many of the leading sequences take part in the alignment.
        l: length of each l-mer (number of profile columns).
        dna: sequence of DNA strings.  It must be indexable; passing an
            ``int`` for ``dna`` or ``s`` raises
            ``TypeError: 'int' object is not subscriptable``.

    Returns:
        The sum, over the ``l`` columns, of the largest per-column count.

    Raises:
        IndexError: if a selected window is shorter than ``l`` characters.
    """
    # Step 1: extract the alignment corresponding to the start positions.
    alignment = [dna[j][s[j]:s[j] + l] for j in range(i)]

    # Step 2: build the 4 x l profile matrix.  Rows 0..2 count 'a', 't'
    # and 'g'; every other character is counted in row 3.
    row_of = {'a': 0, 't': 1, 'g': 2}
    profile = [[0] * l for _ in range(4)]
    for c in range(l):
        for r in range(i):
            profile[row_of.get(alignment[r][c], 3)][c] += 1

    # Step 3: the score is the sum of the column maxima.
    return sum(max(row[c] for row in profile) for c in range(l))

Is your variable dna a dictionary?
If so, use def Score(s, i, l, **dna).
If it is an int variable, you can't index it as dna[j][s[j]:s[j]+l]; indexing an int is what raises "'int' object is not subscriptable".

Related

CS50 DNA: STR counter only works most of the time

Here is a mock up of the function. A lot of the samples have one or two STRs coming back as 1. Can someone help me understand what I am doing wrong?
# Count, for every STR pattern in strSeq, the longest run of back-to-back
# repeats found in the DNA string typed by the user, and print one line per
# pattern.
dnaSamp = input("DNA: ")
strSeq = ["TATC"]###["AGATC", "TTTTTTCT", "AATG", "TCTAG", "GATA", "TATC", "GAAA", "TCTG"]
# hiScore[i] holds the highest repeat count recorded so far for strSeq[i].
hiScore = [0] * len(strSeq)
for i in range(len(strSeq)): # cycle through the various STRs
# NOTE(review): len(strSeq) is the number of patterns, not the length of the
# current pattern; presumably len(strSeq[i]) was intended here -- confirm.
for j in range(len(dnaSamp) - (len(strSeq)-1)): # loop over dna sample
k = j + len(strSeq[i]) # end index of the candidate window starting at j
if dnaSamp[j : k] == strSeq[i]:
counter = 0
# Step through windows shifted by l whole pattern lengths from j.
for l in range(len(dnaSamp)): # if match, look at the next set
if dnaSamp[j + (l * len(strSeq[i])) : k + (l * len(strSeq[i]))] == strSeq[i]:
counter += 1
continue
break
# NOTE(review): the original indentation was lost; this comparison has to
# run after the run starting at j has been counted -- confirm the nesting.
if counter > hiScore[i]:
hiScore[i] = counter # save highest counter
print(f"{strSeq[i]} = {hiScore[i]}" )

Row not adding to matrix in while loop in sympy

The aim of this program is to build a 3x3 matrix which then reduces additional rows, but, for some reason, after the second row is added to M in the while loop, it replaces it with the new row, rather than adding a third row, and, then, reducing additional (most likely 3) vectors after that. Here's the code:
from sympy import *
init_printing(use_unicode= True)
A = []
# Eliminate entries of M using the diagonal entry M[i,i] as pivot, record a
# quotient per processed row in the module-level list A, and return the
# resulting matrix.
# NOTE(review): the original indentation was lost, so the nesting of the
# statements below (in particular `k = k+1` and the `else`) must be confirmed.
def reduceOneRow(M):
k = 0
for i in range(k,min(M.shape)-1):
if M[i,i]!=0 or i ==2:
for j in range(k,min(M.shape)-1):
# T is row j+1 minus (M[j+1,i] / M[i,i]) times row i.
T = Matrix([M.row(j+1)-(M[j+1,i]/M[i,i])*M.row(i)])
# NOTE(review): M[j+1] uses a single index while the line above uses
# M[j+1,i] -- confirm which entry is meant to be recorded.
A.append(M[j+1]/M[i,i])
M.row_del(j+1)
# The name M is rebound to the result of row_insert, so callers only see the
# updated matrix through the return value (presumably row_insert returns a
# new matrix -- confirm against the sympy docs).
M = M.row_insert(j+1,T)
k = k+1
else:
# Assigning to the loop variable does not skip an iteration: the for
# statement rebinds i on the next pass.
i = i+1
return M
# M = Matrix([[1,1,1],[1,4,7],[3,2,5]])
# reduceOneRow(M)
# A
#The following block of code generates a list of monomials, but not in reverse
#lexicagraphical order. This can be fixed later. Ultimately, I'd like to
#make it it's own function
sigma = symbols('x1:4')
D = [1]
for d in D:
for s in sigma:
if s*d not in D:
D.append(s*d)
if len(D) > 20:
break
# print(D)
# print(D[9].subs([('x1',4),('x2',2),('x3',3)]))
#We begin with the set up described in C1
P = [(1,2,3),(4,5,6),(7,8,9)]
G = []
Q = []
S = []
L = [1]
M = Matrix([])
#Here we being step C2.
# Main loop: take the next term t from L, append its row of evaluations at
# the points in P to M, reduce, then file t under G or under S/Q.
# NOTE(review): the original indentation was lost; confirm the nesting.
while L != []:#what follows this while statement is the loop C2-C5 and back
t = L[0]
L.remove(L[0])
K = Matrix([]) #K is a kind of bucket matrix
if t==1: #this block generates the firs line in M. It had to be separate
for j in range(len(P)):#because of the way sympy works. 1 is int, rather
K = K.col_insert(j,Matrix([1])) #than a symbol
else: #here we generate all other rows of M, using K for the name of the rows
for p in P:
# Each evaluation is inserted at column 0, i.e. in front of the previous
# ones (presumably reversing the order of P -- confirm this is intended).
K = K.col_insert(0,Matrix([t.subs([(sigma[0],p[0]),(sigma[1],p[1]),(sigma[2],p[2])])]))
# K = K.col_insert(i,Matrix([t.subs([(sigma[0],p[0]),(sigma[1],p[1]),(sigma[2],p[2])]))
M = M.row_insert(min(M.shape)+1,K) #K gets added to M
M
# A is reset so that it only holds the values appended by the call below.
A = []
# NOTE(review): reduceOneRow returns the reduced matrix, but the return value
# is discarded here, so M keeps its unreduced rows; presumably
# `M = reduceOneRow(M)` was intended.
reduceOneRow(M)#row reduces M and produces the ai in C3
# NOTE(review): `sum` shadows the builtin of the same name.
sum = 0
for n in range(len(A)):
sum = sum + A[n]*S[n]
V = M.row(-1)
# A last row of zeros sends t - sum to G and removes that row again;
# otherwise t - sum goes to S, t goes to Q, and the products t*D[1..3]
# are queued in L.
if V == zeros(1,len(V)):
G.append(t - sum)
M.row_del(-1)
else:
S.append(t-sum)
Q.append(t)
for i in range(1,4):
#if not t*D[i] == Q[0]:
L.append(t*D[i])
L
print('G =',' ',G,' ','Q =',Q)

I figured it out. I changed 'reduceOneRow(M)' to 'M = reduceOneRow(M)'. Ugh.
Thank you all who took a look at this!

Finding first pair of numbers in array that sum to value

Im trying to solve the following Codewars problem: https://www.codewars.com/kata/sum-of-pairs/train/python
Here is my current implementation in Python:
def sum_pairs(ints, s):
    """Return the pair summing to ``s`` whose second element comes earliest.

    Args:
        ints: sequence of numbers.
        s: target sum.

    Returns:
        ``[first, second]`` in input order, or ``None`` when no two distinct
        positions add up to ``s``.
    """
    first_seen = {}   # value -> index of its first occurrence
    second_seen = {}  # value -> index of its second occurrence
    for i, x in enumerate(ints):
        if x not in first_seen:
            first_seen[x] = i
        elif x not in second_seen:
            second_seen[x] = i

    right = None  # smallest "index of the later element" found so far
    for x, j in first_seen.items():
        other = s - x
        if other not in first_seen:
            continue
        if other == x:
            # A value can only pair with itself if it occurs twice.
            if x not in second_seen:
                continue
            k = second_seen[x]
        else:
            k = first_seen[other]
        later = max(j, k)
        if right is None or later < right:
            right = later

    if right is None:
        return None
    return [s - ints[right], ints[right]]
The code seems to produce correct results, however the input can consist of array with up to 10 000 000 elements, so the execution times out for large inputs. I need help with optimizing/modifying the code so that it can handle sufficiently large arrays.

Your code is inefficient for the large-list test cases, so it gives a timeout error. Instead, you can do:
def sum_pairs(lst, s):
    """Return the first pair in ``lst`` that sums to ``s``.

    "First" means the pair whose second element appears earliest: the list
    is scanned once and the search stops at the first item whose complement
    has already been seen.

    Returns:
        ``[complement, item]`` or ``None`` when no pair exists.
    """
    seen = set()
    for item in lst:
        if s - item in seen:
            return [s - item, item]
        seen.add(item)
    return None
We put the values in seen until we find a value that produces the specified sum with one of the seen values.
For more information, see the reference link.

Maybe this code:
def sum_pairs(lst, s):
    """Brute-force search for the first pair in ``lst`` that sums to ``s``.

    The outer loop walks the candidate *second* element from left to right,
    so the pair returned is the one whose second element appears earliest,
    in agreement with the other implementations on this page.

    Returns:
        ``[first, second]`` in input order, or ``None`` when no pair exists.
    """
    for right in range(1, len(lst)):
        second = lst[right]
        for left in range(right):
            if lst[left] + second == s:
                return [lst[left], second]
    return None


lst = [5, 6, 5, 8]
s = 14
print(sum_pairs(lst, s))
Output:
[6, 8]

This answer unfortunately still times out, even though it's supposed to run in O(n log n) (since it is dominated by the sort, the rest of the algorithm running in O(n)). I'm not sure how you can obtain better than this complexity, but I thought I might put this idea out there.
def sum_pairs(ints, s):
    """Look for a pair summing to ``s`` with a sort plus two-pointer sweep.

    The values are sorted together with their original indexes; two
    pointers then close in from both ends while tracking the pair whose sum
    is closest to ``s``, preferring, for equal distance, the pair whose
    rightmost original index is smaller.

    NOTE(review): the sweep does not visit every pair with the target sum,
    so the pair returned is not guaranteed to be the one whose second
    element appears earliest in ``ints``.

    Returns:
        ``[a, b]`` in original input order, or ``None`` when the best pair
        found does not sum to ``s`` or fewer than two values are given.
    """
    # With fewer than two values there is no pair to report; without this
    # guard a lone element could be paired with itself.
    if len(ints) < 2:
        return None

    # Sort (index, value) pairs by value; the sort is stable.
    ints_with_idx = sorted(enumerate(ints), key=lambda pair: pair[1])

    # Start from "infinitely far" so the first pair examined is always
    # recorded, whatever the magnitude of the inputs.
    diff = float("inf")
    l = 0
    r = len(ints) - 1
    # Positions (in the sorted array) of the best pair so far.
    lSum = 0
    rSum = 0
    while l < r:
        # Absolute difference between the current sum and the desired sum.
        total = ints_with_idx[l][1] + ints_with_idx[r][1]
        absDiff = abs(total - s)
        if absDiff < diff:
            # Update the best difference; the pointers move on the next pass.
            lSum = l
            rSum = r
            diff = absDiff
        elif total > s:
            # Decrease the large value.
            r -= 1
        else:
            # Same distance: prefer the pair lying further to the left.
            if absDiff == diff:
                rightmostIdx = max(ints_with_idx[l][0], ints_with_idx[r][0])
                if rightmostIdx < max(ints_with_idx[lSum][0],
                                      ints_with_idx[rSum][0]):
                    lSum = l
                    rSum = r
            # Increase the small value.
            l += 1

    # Map the sorted positions back to original indexes and restore order.
    aSumIdx = ints_with_idx[lSum][0]
    bSumIdx = ints_with_idx[rSum][0]
    aSum = ints[min(aSumIdx, bSumIdx)]
    bSum = ints[max(aSumIdx, bSumIdx)]
    if aSum + bSum == s:
        return [aSum, bSum]
    return None

Trying to figure out longest path algorithm python

I'm trying to make a python script, that gets me the longest repeated character in a given matrix (horizontally and vertically).
Example:
I have this matrix:
afaaf
rbaca
rlaff
Giving this matrix for input, it should result: a 3
You can see that that the 3rd column of the matrix, is full of a's and also, it's the most repeated character in the matrix.
What I have:
#!/bin/python2.7
#Longest string in matrix
#Given a matrix filled with letters. Find the longest string, containing only the same letter, which can be obtained by starting
#with any position and then moving horizontally and vertically (each cell can be visited no more than 1 time).
# Settings here
# -------------
string_matrix = """
afaaf
rbaca
rlaff
"""
pos = (0,0)
# -------------
import pdb
import time
import collections
from collections import defaultdict
import re
rows = 0
columns = 0
matrix = []
matrix2 = []
counter = 0
res_l = []
i = 0
c = ''
# if matrix2 is full of 1's, stop
def stop():
    """Return True when no cell of ``matrix2`` is still 0 (unvisited).

    Reads the module-level ``matrix2``, ``rows`` and ``columns``.
    """
    return all(
        matrix2[i][j] != 0
        for i in range(rows)
        for j in range(columns)
    )
# checks the points, and returns the most repeated char and length
def check_points(points1, points2):
    """Return ``(length, char)`` of the longest run of one repeated character.

    Each list of ``[row, col]`` points is turned into a string by reading the
    module-level ``matrix``; runs are measured inside each string separately.

    Args:
        points1: points forming the first string.
        points2: points forming the second string.

    Returns:
        ``(length, char)`` of the longest run, or ``(-1, '')`` when both
        lists are empty.
    """
    s1 = ''.join(matrix[p[0]][p[1]] for p in points1)
    s2 = ''.join(matrix[p[0]][p[1]] for p in points2)

    # Longest run seen per character, registered in first-seen order.
    longest = {}
    for ch in s1 + s2:
        longest[ch] = 0

    for text in (s1, s2):
        run = 0
        previous = None
        for ch in text:
            run = run + 1 if ch == previous else 1
            previous = ch
            if run > longest[ch]:
                longest[ch] = run

    # Strict '>' keeps the first character reached when lengths tie.
    m = -1
    c = ''
    for key, value in longest.items():
        if value > m:
            m = value
            c = key
    return m, c
# Depth-first search, recursive
def search(pos):
    """Depth-first search for the longest single-character run from ``pos``.

    Uses the module-level ``matrix`` (letters), ``matrix2`` (visited flags,
    1 = visited), ``rows``, ``columns`` and the helpers ``stop`` and
    ``check_points``.

    Args:
        pos: ``(row, col)`` of the cell to process.

    Returns:
        ``(length, char)`` for the best run found from this cell onwards.
    """
    counter = 0
    x = pos[0]
    y = pos[1]
    # The best character is kept local: with a shared global, every
    # recursive call overwrote the caller's current best character.
    c = matrix[x][y]

    # Base case: every cell has already been visited.
    if stop():
        return counter, c

    points1 = []    # unvisited cells in row x
    points2 = []    # unvisited cells in column y
    allpoints = []
    for i in range(0, columns):
        if matrix2[x][i] != 1:
            points1.append([x, i])
            allpoints.append([x, i])
    for i in range(0, rows):
        # Scan column y of the current cell (the row index x was used as the
        # column here before, which inspected the wrong column).
        if matrix2[i][y] != 1:
            points2.append([i, y])
            allpoints.append([i, y])

    r = check_points(points1, points2)
    if r[0] > counter:
        counter = r[0]
        c = r[1]

    matrix2[x][y] = 1  # mark the current cell as visited
    for point in allpoints:
        rr = search(point)
        if rr[0] > counter:
            counter = int(rr[0])
            c = rr[1]
    return counter, c
def main():
    """Build the matrices from ``string_matrix``, run the search, print it.

    Fills the module-level ``rows``, ``columns``, ``matrix`` (letters) and
    ``matrix2`` (visited flags, all 0), then calls ``search(pos)`` and prints
    the resulting character and length.
    """
    global rows
    global columns
    global matrix
    global matrix2

    # Keep only non-blank lines so that ``rows`` matches the rows filled in.
    lines = [line.strip() for line in string_matrix.strip().split('\n')]
    lines = [line for line in lines if line != '']
    rows = len(lines)
    # Width comes from the first row, so a one-row matrix works as well.
    columns = len(lines[0]) if lines else 0

    # initialize matrixes with 0
    matrix = [[0 for _ in range(columns)] for _ in range(rows)]
    matrix2 = [[0 for _ in range(columns)] for _ in range(rows)]

    # string to matrix
    for i, line in enumerate(lines):
        for j, ch in enumerate(line):
            try:
                matrix[i][j] = ch
            except IndexError:
                # A row longer than the first one does not fit the matrix.
                print('fail: index out of range matrix[' + str(i) + '][' + str(j) + '] ' + ch)

    # print some info (single-argument print() works on Python 2 and 3)
    print('Given matrix: ' + str(matrix) + '\n')
    print('Start position: ' + str(pos))
    print('Start character: ' + str(matrix[pos[0]][pos[1]]))

    # get the result
    res = search(pos)
    print('-------------------------------------')
    print('\nChar: ' + str(res[1]) + '\nLength: ' + str(res[0]))
if __name__ == "__main__":
main()
This is my source code.
The example given above, is also used in the source code. The result given is: r 2 which is wrong ... again, should be a 3
It has 4 functions: main, search, stop and check_points.
main is initializing things up,
search is my recursive function that takes one parameter (the start point), and should recursively check for the longest string. I have another matrix, same length as original, which is just 1 and 0. 1 means the position was visited, 0, not. The search function is setting 1 on the right position after a certain position was processed by the search function.
stop is checking if matrix2 is full of 1's, in this case, the matrix was all parsed
check_points takes 2 parameters, 2 list of points, and returns the most repeated character and it's length for those points
What doesn't work:
Most of the time it gives me the wrong character as the result, even though the count might be right sometimes. Sometimes it works horizontally, sometimes it doesn't. I am sure that I'm doing something wrong, but it has been over a week now that I've been trying to figure out how to do this. I asked another question here on Stack Overflow and got a bit further, but I'm still stuck.
Any suggestion is appreciated.

You can use itertools.groupby to quickly find the count of repetitions of some character, and izip_longest(*matrix) to transpose the matrix (swap its rows and columns).
from itertools import groupby
try:
    from itertools import izip_longest
except ImportError:  # Python 3 renamed izip_longest to zip_longest
    from itertools import zip_longest as izip_longest
matrix_string = """
afaaf
rbaca
rlaff
"""
def longest_repetition(row):
    """Return ``(count, letter)`` for the longest run of equal items in ``row``.

    ``None`` items (the padding produced when transposing ragged rows) are
    ignored.  When several runs share the maximum length, tuple comparison
    picks the greatest letter.  A row without any letters gives
    ``(0, None)``.
    """
    runs = [
        (sum(1 for _ in group), letter)
        for letter, group in groupby(row)
        if letter is not None
    ]
    if not runs:
        return (0, None)
    return max(runs)
def main():
    """Print the letter and length of the longest run in ``matrix_string``.

    Rows are scanned directly; columns are scanned by transposing the matrix
    with ``izip_longest(*matrix)``.
    """
    matrix = [list(row.strip())
              for row in matrix_string.strip().split('\n')]
    count, letter = max(
        max(longest_repetition(row) for row in matrix),
        max(longest_repetition(col) for col in izip_longest(*matrix))
    )
    # %-formatting prints "letter count" identically on Python 2 and 3.
    print("%s %s" % (letter, count))
if __name__ == '__main__':
main()
Since you've updated the requirement here is a recursive version of the code with some explanations. If it were not an assignment and this task came up in some real life problem, you should really have used the first version.
matrix_string = """
afaaf
rbaca
rlaff
"""
def find_longest_repetition(matrix):
    """Recursively find the longest horizontal or vertical run of one letter.

    Args:
        matrix: list of equally long rows, each a sequence of letters.

    Returns:
        ``(count, letter)`` of the longest run; ``(0, None)`` for an empty
        matrix.  Ties are resolved by tuple comparison (greatest letter).

    Note:
        One recursion frame is used per visited cell, so large matrices can
        exceed Python's recursion limit.
    """
    rows = len(matrix)
    cols = len(matrix[0]) if matrix else 0  # an empty matrix has no columns

    # row, col  - row and column of the current character.
    # direction - 'h' if we are searching for repetitions in horizontal
    #             direction (i.e., in a row), 'v' if we are searching in
    #             vertical direction.
    # result    - (count, letter) of the longest repetition seen so far.
    #             This order allows results to be compared directly, so
    #             `max` picks the better one.
    # current   - (count, letter) of the repetition seen just before the
    #             current character.
    def recurse(row, col, direction, result, current=(0, None)):
        # Check if we need to start a new row, new column,
        # new direction, or finish the recursion.
        if direction == 'h':  # If we are moving horizontally
            if row >= rows:   # ... and finished all rows
                # restart from the (0, 0) position in vertical direction.
                return recurse(0, 0, 'v', result)
            if col >= cols:   # ... and finished all columns in this row
                # start the next row.
                return recurse(row + 1, 0, direction, result)
        else:                 # If we are moving vertically
            if col >= cols:   # ... and finished all columns
                return result  # all possible repetitions were analysed.
            if row >= rows:   # ... and finished all rows in this column
                # start the next column.
                return recurse(0, col + 1, direction, result)

        # Figure out where to go next in the current direction
        d_row, d_col = (0, 1) if direction == 'h' else (1, 0)

        # Try to add the current character to the current repetition
        count, letter = current
        if matrix[row][col] == letter:
            updated_current = count + 1, letter
        else:
            updated_current = 1, matrix[row][col]

        # Go on with the next character in the current direction
        return recurse(
            row + d_row,
            col + d_col,
            direction,
            max(updated_current, result),  # Update the result, if necessary
            updated_current
        )

    return recurse(0, 0, 'h', (0, None))
def main():
    """Parse ``matrix_string`` and print the longest run as ``letter count``."""
    matrix = [list(row.strip())
              for row in matrix_string.strip().split('\n')]
    count, letter = find_longest_repetition(matrix)
    # %-formatting prints "letter count" identically on Python 2 and 3.
    print("%s %s" % (letter, count))
if __name__ == '__main__':
main()

You can also try the collections.Counter(string).most_common() to get the most repetitions of a character.
from collections import Counter
string_matrix = """
afaaf
rbaca
rlaff
"""
# Print the most frequent character (and its count) among the rows from
# pos[0] downwards and the columns from pos[1] rightwards of the module-level
# working_mat.
# NOTE(review): Counter.most_common counts occurrences, not consecutive
# repetitions; the original indentation was lost, so confirm the nesting.
def GetMostRepetitions(pos):
mc = []
# One (character, count) entry per row starting at row pos[0].
for ii in range(pos[0],len(working_mat)):
mc.extend(Counter(working_mat[ii]).most_common(1))
for jj in range(pos[1],len(working_mat[0])):
column = []
# NOTE(review): ii is whatever the row loop above left behind; presumably
# pos[0] was intended as the first row of the column -- confirm.
for kk in range(ii,len(working_mat)):
column.append(working_mat[kk][jj])
mc.extend(Counter(column).most_common(1))
# Keep the entry with the highest count.
m = 0
for item in mc:
if item[1] > m:
m = item[1]
k = item[0]
# NOTE(review): k is only assigned when some count exceeds 0.
print(k, m)
working_mat = string_matrix.strip().split('\n')
for ii in range(len(working_mat)):
for jj in range(len(working_mat[0])):
pos = (ii,jj)
GetMostRepetitions(pos)
As Kolmar said, you can also use a better way to transpose the matrix.

Converting phone number range list to prefix list

I have a phone number range, for example:
3331234-3332345
I need to write a function that converts it to list of prefixes:
3331234
...
3331239
333124
...
333129
33313
...
33319
33320
...
33322
333231
333232
333233
3332341
...
3332345
Question is not so easy. I don't need to get a list of numbers between range start and end.

My working code. It is not very quick, either. Optimizations welcome.
def diap_to_prefix(a, b):
    """Convert the inclusive number range ``a``..``b`` to a list of prefixes.

    Every number in the range is written out zero-padded to the width of the
    longer bound.  The list is then compacted repeatedly: ten consecutive
    entries that differ only in their last digit are replaced by their
    common prefix, until a pass no longer shortens the list.

    Args:
        a: first number of the range (int or numeric string).
        b: last number of the range (int or numeric string).

    Returns:
        List of prefix strings in ascending order; ``[]`` when ``a > b``.
    """
    width = max(len(str(a)), len(str(b)))
    lst = ['%0*d' % (width, x) for x in range(int(a), int(b) + 1)]
    if not lst:
        return lst

    def flush(group, out):
        # A complete decade collapses to its prefix; anything else is kept.
        if len(group) == 10:
            out.append(group[0][:-1])
        else:
            out.extend(group)

    while True:
        new_lst = []
        group = [lst[0]]
        for item in lst[1:]:
            if item[:-1] == group[-1][:-1]:
                group.append(item)
            else:
                flush(group, new_lst)
                group = [item]
        flush(group, new_lst)
        # A pass that merged nothing leaves the length unchanged: done.
        if len(new_lst) == len(lst):
            return new_lst
        lst = new_lst

My new more optimal solution (py3.4)
def diap_to_prefix(a, b):
    """Yield the prefixes covering the inclusive number range ``a``..``b``.

    The range is split recursively into aligned decimal blocks of size
    ``p`` (a power of ten).  A block lying entirely inside the range is
    reported as the single prefix ``d // p``; a block that only overlaps the
    range is split into ten sub-blocks.

    Args:
        a: first number of the range (int or numeric string).
        b: last number of the range (int or numeric string).

    Yields:
        Prefixes as ints, in ascending order of the numbers they cover.
    """
    def inner(aa, bb, p):
        if p == 1:
            # Individual numbers.  Iterating the clipped span (rather than
            # testing only ``aa``) also covers a top-level call with p == 1,
            # i.e. a range made of single-digit numbers.
            for d in range(max(aa, a), min(bb, b) + 1):
                yield d
            return
        for d in range(aa, bb + 1, p):
            if a <= d and d + p - 1 <= b:
                yield d // p
            elif not (d + p - 1 < a or d > b):
                # Only split blocks that themselves overlap the range.
                for i in range(10):
                    yield from inner(d + i * p // 10,
                                     d + (i + 1) * p // 10 - 1,
                                     p // 10)

    a, b = int(a), int(b)
    p = 10 ** (max(len(str(x)) for x in (a, b)) - 1)
    yield from inner(a // p * p, b // p * p + p - 1, p)

You need to get the common prefix of the values separated by "-", so:
Use .split to get these and iterate through them until you find a difference
Complete the first value with zeros (to get the least number) until you get phone_len digits and do the same for the maximum (with nines)
Then, you have a simple range of numbers
Iterate through them and convert them to strings
Here it is:
# Expand a "low-high" phone range into the list of every number it contains.
phone_len = 7
# Both bounds are phone_len digits long, matching the range in the question.
R = "3331234-3332345".split("-")

# Common leading digits of the two bounds; zip stops at the shorter bound.
prefix = ""
for low_digit, high_digit in zip(R[0], R[1]):
    if low_digit != high_digit:
        break
    prefix += low_digit

# Pad the lower bound with zeros and the upper bound with nines, each
# according to its own length, to reach phone_len digits.
m = int(R[0] + "0" * (phone_len - len(R[0])))
M = int(R[1] + "9" * (phone_len - len(R[1])))
phones = [str(n) for n in range(m, M + 1)]

Here's a sketch of one way to handle this problem. I've used ellipses to mark the spots where you'll need to fill in the details explained in the comments. I'd write a function to derive the initial value of 'maxpower', everything else is simple enough to be written inline.
# Sketch (not runnable as written): walk from firstnumber to lastnumber in
# blocks whose size is a power of ten, emitting one prefix per block.  The
# `...` placeholders mark the parts left for the reader to fill in.
firstnumber = 3331234
lastnumber = 3332345
current = firstnumber
while current <= lastnumber:
# Find the largest power of 10 that exactly divides 'current'.
# Call this value 'maxpower'. 'maxpower' is a candidate for the
# size of the block of numbers that will be represented by the
# next output value.
maxpower = ... # 1, 10, 100, 1000, 10000, and so on
# If a block of size 'maxpower' would take us past the
# 'lastnumber', we can't use that block size. We must try a
# smaller block. Divide 'maxpower' by 10 until the block size
# becomes acceptable.
while (current + maxpower) > ... :
# NOTE(review): on Python 3, `/=` produces a float; presumably integer
# division is wanted here -- confirm for the target Python version.
maxpower /= 10
# Now 'maxpower' is the largest acceptable size for the next
# block, so the desired prefix is 'current' divided by 'maxpower'.
# Emit that value, then add 'maxpower' to 'current' to get the new
# 'current' value for the next iteration.
print ...
current += maxpower

My working code. It is not very quick, but it works. Optimizations welcome.
# Insert `prefix` (a string of digits) into the nested-dict trie `root`, one
# character per level, storing `value` at the leaf.  After an insertion, if
# the node stored under parent[pkey] has 10 entries, that node is replaced by
# `value` itself.  Returns `root`.
# NOTE(review): the original indentation was lost; confirm which `if` each
# `elif` below belongs to.
def fill(root, prefix, value, parent, pkey):
if len(prefix) > 1:
# More than one character left: descend, creating the child dict if needed.
if prefix[0] in root:
fill(root[prefix[0]], prefix[1:], value, root, prefix[0])
if pkey:
if len(parent[pkey]) == 10:
parent[pkey] = value
elif type(root) == type({}):
root[prefix[0]] = {}
fill(root[prefix[0]], prefix[1:], value, root, prefix[0])
if pkey:
if len(parent[pkey]) == 10:
parent[pkey] = value
# Last character: store the value itself, provided root is still a dict.
elif type(root) == type({}):
root[prefix[0]] = value
if pkey:
if len(parent[pkey]) == 10:
parent[pkey] = value
return root
def compact(prefixes, current):
    """Flatten a nested prefix dict into the list of prefixes it encodes.

    Args:
        prefixes: either a dict mapping one character to a sub-tree, or any
            non-dict leaf value.
        current: the prefix string accumulated on the way to ``prefixes``.

    Returns:
        ``[current]`` for a leaf; otherwise the concatenated results of the
        sub-trees, each extended by its key.
    """
    if not isinstance(prefixes, dict):
        return [current]
    rlist = []
    # .items() is available on both Python 2 and Python 3.
    for k, v in prefixes.items():
        rlist.extend(compact(v, current + k))
    return rlist
# Demo: insert every number from 4440000 up to (not including) 4490000 into
# the trie, then print the compacted prefix list.
# NOTE(review): the print statement below is Python 2 syntax.
if __name__ == '__main__':
plist = {}
for x in range(4440000, 4490000):
fill(plist, str(x), 'value', plist, None)
#print plist
print compact(plist, '')

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

Consensus sequence help in python - python

Is your variable dna a dictionary, if so use def Score(s, i, l, **dna) If it is int variable, you can't access it as dna[j][s[j]:s[j]+l]

Related

CS50 DNA: STR counter only works most of the time

Row not adding to matrix in while loop in sympy

Finding first pair of numbers in array that sum to value

Trying to figure out longest path algorithm python

Converting phone number range list to prefix list

Categories

Resources