I'm working on this python problem:
Given a sequence of the DNA bases {A, C, G, T}, stored as a string, returns a conditional probability table in a data structure such that one base (b1) can be looked up, and then a second (b2), to get the probability p(b2 | b1) of the second base occurring immediately after the first. (Assumes the length of seq is >= 3, and that the probability of any b1 and b2 which have never been seen together is 0. Ignores the probability that b1 will be followed by the end of the string.)
You may use the collections module, but no other libraries.
However I'm running into a roadblock:
# Sample sequence used to exercise dna_prob2.
word = 'ATCGATTGAGCTCTAGCG'
def dna_prob2(seq):
    """Attempt at a base-to-next-base count table for seq.

    Prints tbl[i][i] / freq[i] for every base i and returns tbl, a dict of
    dicts keyed first by one base and then by another.
    """
    tbl = dict()
    # NOTE(review): the alphabet comes from the global `word`, not from `seq`.
    levels = set(word)
    freq = dict.fromkeys(levels, 0)
    # Count how often each base occurs in seq.
    for i in seq:
        freq[i] += 1
    # Start every (base, base) cell at zero.
    for i in levels:
        tbl[i] = {x:0 for x in levels}
    lastlevel = ''
    # NOTE(review): this loop walks the keys of tbl (each distinct base once),
    # not the characters of seq. Consecutive keys are always different, so the
    # diagonal cells tbl[i][i] printed below are never incremented.
    for i in tbl:
        if lastlevel != '':
            tbl[lastlevel][i] += 1
        lastlevel = i
    for i in tbl:
        print(i,tbl[i][i] / freq[i])
    return tbl
# NOTE(review): tbl and freq are only assigned inside dna_prob2, so this
# expression cannot be evaluated at module level as written.
tbl['T']['T'] / freq[i]
Basically, the end result is supposed to be the final expression, tbl['T']['T'] / freq[i], that you see above. However, when I try to do that with print(i, tbl[i][i] / freq[i]) and run dna_prob2(word), I get 0.0 for everything.
Wondering if anyone here can help out.
Thanks!
I am not sure what it is your code is doing, but this works:
def makeprobs(word):
    """Return {(b1, b2): p(b2 | b1)} for every adjacent pair seen in word.

    Only characters that have a successor are counted as b1, so the final
    character of word never contributes to a denominator.
    """
    lead_counts = {}
    pair_counts = {}
    # Walk the adjacent pairs (word[i], word[i + 1]).
    for pair in zip(word, word[1:]):
        lead = pair[0]
        lead_counts[lead] = lead_counts.get(lead, 0) + 1.0
        pair_counts[pair] = pair_counts.get(pair, 0) + 1.0
    return {
        pair: seen / lead_counts[pair[0]]
        for pair, seen in pair_counts.items()
    }
I finally heard back from my professor. This is what the code was trying to accomplish:
word = 'ATCGATTGAGCTCTAGCG'


def dna_prob2(seq):
    """Count base-to-next-base transitions in a DNA string.

    Args:
        seq: string of bases, e.g. 'ATCG...'.

    Returns:
        A pair (tbl, freq):
        tbl[b1][b2] is the number of times b2 occurs immediately after b1
        (0 for pairs never seen together);
        freq[b1] is the number of times b1 is followed by another base.
        The final character of seq has no successor and is therefore not
        counted in freq, so tbl[b1][b2] / freq[b1] is p(b2 | b1) and each
        row of probabilities sums to 1.

    Note:
        A base that occurs only as the last character has freq[b1] == 0;
        callers must not divide by it.
    """
    levels = set(seq)
    tbl = {b1: {b2: 0 for b2 in levels} for b1 in levels}
    freq = dict.fromkeys(levels, 0)
    # Each adjacent pair contributes one transition out of its first base.
    for b1, b2 in zip(seq, seq[1:]):
        tbl[b1][b2] += 1
        freq[b1] += 1
    return tbl, freq


condfreq, freq = dna_prob2(word)
# p(b2 | b1) = condfreq[b1][b2] / freq[b1]: the denominator is always the
# count of the conditioning (first) base.
print(condfreq['T']['T'] / freq['T'])
print(condfreq['G']['A'] / freq['G'])
print(condfreq['C']['G'] / freq['C'])
Hope this helps.
Related
I am making a function which takes a tuple containing strings, selects one string, then selects a start and end index within that string. Within this index, the string is chopped up into pieces and permuted randomly and then the tuple is returned with the modified string. However, there appears to be some sort of error when defining the start and end index in the function and then splicing the string based on it. In some runs it works as expected, and in others it does not and hence throws an error which I avoid in the while loop. Can anyone tell me what could be going on here:
def chromothripsist(seqs, keep_frequency = 0.9):
    """Shuffle the pieces of a random window inside one randomly chosen string.

    Picks one string from seqs, chooses a start/end index, cuts that slice at
    random breakpoints, reassembles the pieces in a random order and puts the
    result back between the untouched prefix and suffix.

    Returns a tuple (new_seqs, info) where new_seqs is a tuple of the strings
    and info is [stidx, edidx, breakpoints, rearrange, chosen_index].

    keep_frequency is only referenced by a commented-out line below.
    Relies on `random` and `np` being available in the enclosing module.
    """
    seqs = list(seqs)
    c = list(range(len(seqs)))
    # Choose which string to modify.
    chrom = random.sample(c, 1)
    orig_seq = seqs[chrom[0]]
    n = len(orig_seq)
    NotLongEnough = True
    # Retry until the chosen window is wider than the number of pieces.
    while(NotLongEnough):
        splits = random.randint(5,10)
        ##IF SPLITS > len(seq) then this bugs
        print(orig_seq)
        lowbd = int(0.1*n)
        upbd = int(0.9*n)
        distance = int(np.random.uniform(lowbd, upbd))
        stidx = random.randint(0,n-1)
        # NOTE(review): max() makes edidx at least n-1 and lets it run past
        # the end of the string, so `dist` can be larger than the slice
        # actually taken below. min() may have been intended; confirm.
        edidx = max(stidx + distance, n-1)
        dist = edidx - stidx
        print(splits, stidx, edidx)
        if(dist > splits):
            NotLongEnough = False
            break
    first_part = orig_seq[:stidx]
    last_part = orig_seq[edidx:]
    seq = orig_seq[stidx:edidx]
    # THE ABOVE LINES ARE NOT WORKING AS EXPECTED
    print(seq)
    print(stidx, edidx)
    print(len(seq), splits)
    # Draw splits-1 distinct cut positions inside the window.
    breakpoints = np.random.choice(len(seq), splits-1, replace = False)
    breakpoints.sort()
    subseq = []
    curridx = 0
    for i in breakpoints:
        subseq.append(seq[curridx:i])
        curridx = i
    subseq.append(seq[breakpoints[-1]:])
    rearrange = np.random.permutation(splits)
    #n_to_select= int(splits *keep_frequency)
    n_to_select = len(rearrange)
    rearrange = random.sample(list(rearrange), n_to_select)
    # Stitch the pieces back together in the shuffled order.
    build_seq = ''
    for i in rearrange:
        build_seq += subseq[i]
    seqs[chrom[0]] = first_part + build_seq + last_part
    breakpoints = list(breakpoints)
    return tuple(seqs), [stidx, edidx, breakpoints, rearrange, chrom[0]]
I try to generate the following sequences.
text = ACCCEBCE
target = 000000D0
A random text of different characters is generated. If one of the following subsequences is found in the text sequence, the target is D or E; otherwise, the target is 0.
ABC --> D
BCD --> E
I wrote the following code. It works well when I generate a small number of characters, but it does not produce any output when I set timesteps = 1000 or more.
import string
import random as rn
import numpy as np
def is_subseq(x, y):
    """Return True if the items of x appear in y in order (not necessarily adjacent)."""
    remaining = iter(y)
    for wanted in x:
        # Consume y until the next wanted item is found; the iterator keeps
        # its position, so later items must occur after this match.
        for candidate in remaining:
            if candidate == wanted:
                break
        else:
            return False
    return True
def count(a, b, m, n):
    """Count how many times b[:n] occurs as a subsequence of a[:m].

    Recursive on the last characters of the two prefixes.
    """
    # An empty pattern matches exactly once, whatever the text is.
    if n == 0:
        return 1
    # A non-empty pattern cannot be found in an empty text.
    if m == 0:
        return 0
    # Occurrences that do not use the last character of the text.
    without_last = count(a, b, m - 1, n)
    if a[m - 1] != b[n - 1]:
        return without_last
    # Last characters agree: also count occurrences that pair them up.
    return count(a, b, m - 1, n - 1) + without_last
# create a sequence classification instance
def get_sequence(n_timesteps):
    """Generate a random text over "ABCDE" and print a per-position target list.

    Builds the pattern table (each 3-letter window of the alphabet mapped to
    the letter that follows it), then for every position i inspects the prefix
    text[:i] and appends either a pattern's target letter or 0 to y2.
    Prints its results; returns None.
    """
    alphabet="ABCDE"#string.ascii_uppercase
    text = ''.join(rn.choices(alphabet, k=n_timesteps))
    print(text)
    seq_length=3
    subseqX = []
    subseqY = []
    # Pattern table: alphabet[i:i+3] --> alphabet[i+3].
    for i in range(0, len(alphabet) - seq_length, 1):
        seq_in = alphabet[i:i + seq_length]
        seq_out = alphabet[i + seq_length]
        subseqX.append([char for char in seq_in])
        subseqY.append(seq_out)
        print(seq_in, "\t-->\t",seq_out)
    y2 = []
    match = 0
    # Highest occurrence count seen so far for each pattern.
    countlist=np.zeros(len(subseqX))
    for i, val in enumerate(text):
        found = False
        counter = 0
        for g, val2 in enumerate(subseqX):
            listToStr = ''.join(map(str, subseqX[g]))
            # NOTE(review): count() is re-run from scratch on every prefix and
            # recurses once per prefix character, which is presumably why long
            # inputs stall or hit the recursion limit; confirm.
            howmany = count(text[:i], listToStr, len(text[:i]),len(listToStr))
            if is_subseq(listToStr, text[:i]):
                # A new occurrence appeared if the count grew since last time.
                if countlist[g] < howmany:
                    match = match + howmany
                    countlist[g] = howmany
                    temp = g
                    found = True
        if found:
            y2.append(subseqY[temp])
        else:
            y2.append(0)
    # counter is reset to 0 on every position and never incremented.
    print("counter:\t", counter)
    print(text)
    print(y2)
# define problem properties
n_timesteps = 100
get_sequence(n_timesteps)
It might be because of the depth of the recursive function. But I need to generate 1000 or 10000 characters.
How can I fix this problem? Any ideas?
I'm not sure I understand all you're trying to do (lots of code there), but I believe this simplified form of the function should work. It maintains a set of subsequences seen so far. It only extends them by adding the next letter when it is encountered. This allows the flagging to know if the prefix to the sequence up to the current character has been seen before.
def flagSequence(S,letters="ABCDE",seqLen=3):
    """Flag positions that follow a completed run of consecutive letters.

    A run is seqLen consecutive letters of `letters` appearing in S as a
    subsequence (in order, not necessarily adjacent). The flag for a run is
    the letter that follows it in `letters`, written at the position right
    after the character that completed the run; every other position is "0".

    Args:
        S: text to scan; every character must occur in `letters`.
        letters: ordered alphabet.
        seqLen: length of the runs to detect.

    Returns:
        A string with one flag character per character of S ("" for empty S).

    Raises:
        ValueError: if S contains a character that is not in `letters`.
    """
    # Without this guard an empty text would produce "0", one flag too many.
    if not S:
        return ""
    # Alphabet positions whose run of seqLen letters has a successor letter.
    flaggable = range(seqLen - 1, len(letters) - 1)
    subSeqs = set()
    # The first position can never follow a completed run.
    flags = ["0"]
    for c in S[:-1]:
        p = letters.index(c)
        subSeqs.add(c)
        if p > 0:
            # Extend every known chain that ends on the preceding letter.
            previous = letters[p - 1]
            subSeqs.update([s + c for s in subSeqs if s[-1] == previous])
        if p in flaggable and letters[p - seqLen + 1:p + 1] in subSeqs:
            flags.append(letters[p + 1])
        else:
            flags.append("0")
    return "".join(flags)
output:
text = "BDBACCBECEECAEAEDCAACBCCDDDBBDEEDABDBDE"
print(text)
print(flagSequence(text))
BDBACCBECEECAEAEDCAACBCCDDDBBDEEDABDBDE
000000000D00D0000ED00D0DDEEE00E00E00E0E
with more letters:
alphabet=string.ascii_uppercase
text = ''.join(rn.choices(alphabet, k=10000))
flags = flagSequence(text,alphabet)
print(text[:60])
print(flags[:60])
CHUJKAMWCAAIBXGIZFHALAWWFDDELXREMOQQVXFPNYJRQESRVEJKIAQILYSJ...
000000000000000000000M000000FM00FN00000G0OZK0RFTS0FKLJ0RJMZT...
with longer sequences:
alphabet=string.ascii_uppercase
text = ''.join(rn.choices(alphabet, k=10000))
flags = flagSequence(text,alphabet,seqLen=10)
print(text[200:260])
print(flags[200:260])
...PMZCDQXAOHVMTRLYCNCJABGGNZYAWIHJJCQKMMAENQFHNQTOQOPPGHVQZXZU...
...00N0000Y000WN000Z0O0K0000O0Z0X00KK00LNN00O000O00P0PQQ00WR0Y0...
Basically, whenever two strings in a list are separated by one or more zeroes, I want to join them together. ['a',0,'b'] => ["ab"].
I've tried yield and I really can't find a good way to say if you find a zero in the list, concatenate the next non-zero to the previous string.
I've used yield before, but I am just not approaching this correctly. Mind you, I don't insist on using yield, it just seemed the most likely approach to work, since a simple list comprehension won't do it.
Sample data and expected outputs:
dataexp = [
#input #expected
(["a"], ["a"]),
([0,0,"a","b",], ["a","b"]),
([0,"a","0",], ["a","0"]),
(["a",0,"b",], ["ab"]),
(["a",0,0,"b",], ["ab"]),
(["a","b",0], ["a","b"]),
(["a","b","c"], ["a","b","c"]),
(["a",0,"b",0, "c"], ["abc"]),
]
Some sample code
I just don't handle the concatenate logic correctly and only filter5 is a serious attempt.
dataexp = [
#input #expected
([0,0,"a","b",], ["a","b"]),
([0,"a","0",], ["a","0"]),
(["a",0,"b",], ["ab"]),
(["a",0,0,"b",], ["ab"]),
(["a","b",0], ["a","b"]),
(["a","b","c"], ["a","b","c"]),
(["a",0,"b",0, "c"], ["abc"]),
]
def filter0(li):
    """Return the string items of li in order, dropping everything else."""
    kept = []
    for item in li:
        if isinstance(item, str):
            kept.append(item)
    return kept
def filter3(li):
    """Yield every item of li that is not equal to 0 (no joining is done)."""
    for item in li:
        if item == 0:
            continue
        yield item
def filter5(li):
    """Generator attempt at joining strings that are separated by zeroes.

    p0 holds the string being accumulated and p1 the most recent other item;
    a 0 stored in p1 marks that the next string should be appended to p0.
    """
    len_li = len(li)
    # NOTE(review): scanning starts at index 2 and stops before the last
    # element, so li[0], li[1] and li[-1] are never examined.
    pos = 2
    p0 = p1 = None
    while pos < len_li-1:
        cur = li[pos]
        # Nothing accumulated yet (or only a zero): start from this item.
        if p0 in (0, None):
            p0 = cur
            pos +=1
            continue
        if cur == 0:
            # Remember the separator so the next string gets joined.
            p1 = cur
            pos += 1
            continue
        elif p1 == 0:
            # Previous item was a separator: concatenate.
            p0 = p0 + cur
            pos += 1
            continue
        else:
            p1 = cur
            pos += 1
            # NOTE(review): p0 is yielded here but not replaced by cur, so it
            # is yielded again by the trailing `if p0` below.
            yield p0
    if p0:
        yield p0
    if p1:
        yield p1
# Run every candidate filter against the sample table and report pass/fail.
for fn in [filter0, filter3, filter5]:
    name = fn.__name__
    print(f"\n\n{name}:")
    for inp, exp in dataexp:
        try:
            got = list(fn(inp))
        except (Exception,) as e:
            # Show the error text in place of the result.
            got = str(e)
        msg = "%-20.20s for %-80.80s \nexp :%s:\ngot :%-80.80s:" % (name, inp, exp, got)
        mark = "✅" if exp == got else "❌"
        print(f"\n{mark}{msg}")
I am generating html dynamically by pushing strings into a big List[str] then "\n".join() it. Most of the time, that's fine, browsers ignore whitespace, but Cypress does care about the \n in <td>xyz\n</td>. So, rather than changing everything, I thought I'd find a way to suppress the newline by using mylist.extend(0, "</td>"). But now I am just curious at the look-behind/ahead nature of this list problem. And, if you think Django or Jinja Templates are better suited, you'd be correct, except that this is generating Django Templates, rather than the final html.
I see no benefit of using a generator here. You can just keep track of the state determining your concat condition and either append or concatenate:
from typing import List, Literal, Union


def process_list(l: List[Union[str, Literal[0]]]) -> List[str]:
    """Join strings that are separated by one or more zeroes.

    Args:
        l: strings interleaved with the integer 0 used as a "glue" marker.

    Returns:
        The strings of l in order, where any string preceded by at least one
        0 is appended to the previous output string instead of starting a new
        one. Leading and trailing zeroes are dropped.
    """
    result: List[str] = []
    # True once a 0 has been seen since the last string.
    concat = False
    for e in l:
        if e == 0:
            concat = True
            continue
        # Only glue when there is a previous string to glue onto.
        if concat and result:
            result[-1] += e
        else:
            result.append(e)
        concat = False
    return result
I am trying to write a code to compare each string in a list to each other and then generate its regex for similarity
list = ["LONDON-UK-L16-N1",
"LONDON-UK-L17-N1",
"LONDON-UK-L16-N2",
"LONDON-UK-L17-N2",
"PARIS-France-L16-N2"]
I am trying to get an output as below
LONDON-UK-L(16|17)-N(1|2)
is that possible? thanks
Update: just to make it clear i am trying to
input: list, or strings
Action: compare the list items to each other and check for similarity (keeping the first group of each string fixed), and use a regex for any part of an item that differs, so that instead of having four items we can have a single output (using regex)
output: regex to match not similar
input:
tez15-3-s1-y2
tez15-3-s2-y2
bro40-55-s1-y2
output:
tez15-3-s(1|2)-y2
,bro40-55-s1-y2
It's not entirely clear from your question what the exact problem is. Since the data you gave as an example is consistent and well ordered, this problem can be solved easily by simply splitting up the items in the list and categorising them.
loc_list = ["LONDON-UK-L16-N1", "LONDON-UK-L17-N1", "LONDON-UK-L16-N2",
            "LONDON-UK-L16-N2", "PARIS-France-L16-N2"]
split_loc_list = [location.split("-") for location in loc_list]
# Group by "<city>-<country>", collecting the distinct L and N numbers.
locs = {}
for loc in split_loc_list:
    tags = locs.setdefault("-".join(loc[0:2]), {})
    tags.setdefault("L", set()).add(loc[2].strip("L"))
    tags.setdefault("N", set()).add(loc[3].strip("N"))
# Emit one pattern per group with the numbers in ascending numeric order.
for loc, vals in locs.items():
    L_vals_sorted = sorted(int(v) for v in vals["L"])
    L_vals_joined = "|".join(str(v) for v in L_vals_sorted)
    N_vals_sorted = sorted(int(v) for v in vals["N"])
    N_vals_joined = "|".join(str(v) for v in N_vals_sorted)
    print(f"{loc}-L({L_vals_joined})-N({N_vals_joined})")
will output:
LONDON-UK-L(16|17)-N(1|2)
PARIS-France-L(16)-N(2)
Since there were only two tags here ("L" and "N"), I just wrote them into the code. If there are many tags possible, then you can strip by any letter using:
import re
split = re.findall('\d+|\D+', loc[2])
key, val = split[0], split[1]
locs.setdefault("-".join(loc[0:2]), {}).\
setdefault(key, set()).add(val)
Then iterate through all the tags instead of just fetching "L" and "N" in the second loop.
I am posting this new (second) implementation for this problem; I think it is more accurate, and I hope it is helpful:
import re
data = [
'LONDON-UK-L16-N1',
'LONDON-UK-L17-N1',
'LONDON-UK-L16-N2',
'LONDON-UK-L17-N2',
'LONDON-UK-L18-N2',
'PARIS-France-L16-N2',
]
def merge(data):
    """Collapse dash-separated strings that differ in a single numeric suffix.

    Columns are processed from right to left. Within a column, consecutive
    rows (in sorted order) whose other fields and whose non-digit prefix are
    identical are merged into one row, and their numbers are written as an
    alternation, e.g. 'L16' and 'L17' become 'L(16|17)'.

    Args:
        data: list of strings with the same number of '-'-separated fields.
            The list is not modified.

    Returns:
        List of merged strings in sorted order; [] for empty input.

    Raises:
        IndexError: if a row has fewer fields than the first (sorted) row.
    """
    if not data:
        return []

    def stem_of(field):
        # First run of non-digit characters, '' when there is none.
        found = re.search(r'\D+', field)
        return found.group(0) if found else ''

    def number_of(field):
        # First run of digits, '' when there is none.
        found = re.search(r'\d+', field)
        return found.group(0) if found else ''

    def group_key(fields, col):
        # Rows are mergeable at `col` when everything but the number agrees.
        return (tuple(fields[:col]), stem_of(fields[col]), tuple(fields[col + 1:]))

    def collapse(key, numbers):
        # Rebuild one output row from a group key and its collected numbers.
        head, stem, tail = key
        if len(numbers) > 1:
            alternatives = "|".join(sorted(numbers))
            merged_field = f'{stem}({alternatives})'
        else:
            merged_field = f'{stem}{numbers[0]}'
        return [*head, merged_field, *tail]

    # sorted() rather than list.sort() so the caller's list is left untouched.
    rows = [item.split('-') for item in sorted(data)]
    for col in range(len(rows[0]) - 1, -1, -1):
        merged = []
        current_key = group_key(rows[0], col)
        numbers = []
        for fields in rows:
            key = group_key(fields, col)
            if key != current_key:
                # Group boundary: flush what has been collected so far.
                merged.append(collapse(current_key, numbers))
                current_key, numbers = key, []
            number = number_of(fields[col])
            if number not in numbers:
                numbers.append(number)
        merged.append(collapse(current_key, numbers))
        rows = merged
    return ['-'.join(fields) for fields in rows]
print(merge(data)) # ['LONDON-UK-L(16|17)-N(1|2)', 'LONDON-UK-L18-N2', 'PARIS-France-L16-N2']
I've implemented the following solution:
import re
data = [
'LONDON-UK-L16-N1',
'LONDON-UK-L17-N1',
'LONDON-UK-L16-N2',
'LONDON-UK-L16-N2',
'PARIS-France-L16-N2'
]
def deconstruct(data):
    """Build a nested dict (a trie) from dash-separated strings.

    Each field's first non-digit run becomes a key one level deeper; under
    every node, key 0 holds the set of digit strings seen for that node.
    """
    tree = dict()
    for fields in [item.split('-') for item in data]:
        node = tree
        for field in fields:
            letters = re.findall(r'(\D+)', field)
            if letters:
                # Descend, creating the child node on first sight.
                node = node.setdefault(letters[0], {0: set()})
            digits = re.findall(r'(\d+)', field)
            if digits:
                node[0].add(digits[0])
    return tree
def construct(data, level=0):
    """Turn the nested dict produced by deconstruct back into pattern strings.

    Nested calls (level > 0) return lists of field lists; the top-level call
    returns the fields of each path joined with '-'.
    """
    paths = []
    for key, child in data.items():
        # Key 0 is the node's own number set, not a child.
        if key == 0:
            continue
        numbers = child[0]
        if len(numbers) == 1:
            nums = list(numbers)[0]
        elif len(numbers) > 1:
            nums = '(' + '|'.join(sorted(numbers)) + ')'
        else:
            nums = ''
        label = key + nums
        tails = construct(child, level + 1)
        if tails:
            for tail in tails:
                paths.append([label] + tail)
        else:
            # Leaf: the path ends at this field.
            paths.append([label])
    if level > 0:
        return paths
    return ['-'.join(path) for path in paths]
print(construct(deconstruct(data)))
# ['LONDON-UK-L(16|17)-N(1|2)', 'PARIS-France-L16-N2']
Don't use 'list' as a variable name: it is not a reserved word, but it shadows the built-in list type.
import re
lst = ['LONDON-UK-L16-N1', 'LONDON-UK-L17-N1', 'LONDON-UK-L16-N2', 'LONDON-UK-L16-N2', 'PARIS-France-L16-N2']
def check_it(string):
    """Return the first match of the '<letters/dashes>L<digits>-N<digits>' pattern, or None."""
    pattern = r'[a-zA-Z\-]*L(\d)*-N(\d)*'
    match = re.search(pattern, string)
    return match
[check_it(x).group(0) for x in lst]
will output:
['LONDON-UK-L16-N1',
'LONDON-UK-L17-N1',
'LONDON-UK-L16-N2',
'LONDON-UK-L16-N2',
'PARIS-France-L16-N2']
From there, look into groups and define a group to cover the pieces that you want to use for similarity.
In my attempt to solve the above question, I've written the following code:
Logic: Create a frequency dict for each character in the string (key = character, value = frequency of the character). If any character's frequency is greater than ceil(n/2), there is no solution. Otherwise, print the most frequent character and then reduce its frequency in the dict.
import math, operator
# NOTE(review): the nesting of the last lines of maxArrange was reconstructed
# from a flattened listing; confirm it against the author's original.
def rearrangeString(s):
    """Attempt to reorder s so that no two equal characters are adjacent.

    Returns None when some character occurs more than ceil(len(s) / 2) times;
    otherwise returns whatever maxArrange builds, minus its last 4 characters.
    """
    # Fill this in.
    n = len(s)
    # Character -> number of occurrences.
    freqDict = {}
    for i in s:
        if i not in freqDict.keys():
            freqDict[i] = 1
        else:
            freqDict[i] += 1
    for j in list(freqDict.values()):
        if j > math.ceil(n / 2):
            return None
    # The recursion ends by returning None, which str() turns into the text
    # "None" at the end of the result; [:-4] cuts those four characters off.
    return maxArrange(freqDict)[:-4]
# Last character emitted by maxArrange. NOTE(review): module-level state that
# is not reset between calls to rearrangeString.
temp = ""
def maxArrange(inp):
    """Recursively emit the currently most frequent character of inp.

    Mutates inp (decrements the chosen count) and the global temp. Returns
    None implicitly when all counts are 0 or when the most frequent character
    equals the previously emitted one.
    """
    global temp
    n = len(inp)
    if list(inp.values()) != [0] * n:
        # max() returns the first key with the highest count.
        resCh = max(inp.items(), key=operator.itemgetter(1))[0]
        if resCh is not None and resCh != temp:
            inp[resCh] -= 1
            # Terminates with None
            temp = resCh
            return resCh + str(maxArrange(inp))
# Driver
print(rearrangeString("abbccc"))
# cbcabc
print(rearrangeString("abbcccc"))
In the first try, with input abbccc, it gives the right answer, i.e. cbcabc, but fails for the input abbcccc, returning ccbcabc, without handling it using the temp variable, else returning cbcabc and skipping c altogether when handled using temp
How should I modify the logic, or is there a better approach?