python string splitting with multiple splitting points - python

OK, so I'll get straight to the point. Here is my code:
# Digest attempt from the question: for each sequence and each enzyme, find the
# cut positions with a regex and slice the sequence into fragments.
# NOTE(review): indentation was lost in this listing, so the intended nesting of
# the statements below cannot be recovered with certainty.
# NOTE(review): `re` is used here but no import is shown in the snippet.
def digestfragmentwithenzyme(seqs, enzymes):
fragment = []
for seq in seqs:
for enzyme in enzymes:
results = []
# enzyme[0] is compiled as a regular expression
prog = re.compile(enzyme[0])
for dingen in prog.finditer(seq):
# cut position = start of the match + enzyme[1]
results.append(dingen.start() + enzyme[1])
# visit the cut positions from last to first
results.reverse()
#result = 0
for result in results:
fragment.append(seq[result:])
# keep only the part to the left of the cut
seq = seq[:result]
fragment.append(seq[:result])
fragment.reverse()
return fragment
Input for this function is a list of multiple strings (seq) e.g. :
List = ["AATTCCGGTCGGGGCTCGGGGG","AAAGCAAAATCAAAAAAGCAAAAAATC"]
And enzymes as input:
[["TC", 1],["GC",1]]
(note: there can be multiple given but most of them are in this matter of letters with ATCG)
The function should return a list that, in this example, contain 2 lists:
Outputlist = [["AATT","CCGGT","CGGGG","CT","CGGGGG"],["AAAG","CAAAAT","CAAAAAAG","CAAAAAAT","C"]]
Right now i am having troubles with splitting it twice and getting the right output.
A little more information about the function: it looks through the string (seq) for the recognition site, in this case TC or GC, and splits the string at the offset given as the second element of each enzyme entry. It should do that for both strings in the list, with both enzymes.

Assuming the idea is to split at each enzyme, at the index point where enzymes are multiple letters, and the split, in essence comes between the two letters. Don't need regex.
You can do this by looking for the occurrences and inserting a split indicator at the correct index and then post-process the result to actually split.
For example:
def digestfragmentwithenzyme(seqs, enzymes):
    """Cut every sequence at each enzyme's recognition site.

    A '|' marker is inserted into each occurrence of a recognition site and
    the marked sequence is then split on that marker, so the sequences must
    not contain '|' themselves.

    Args:
        seqs: iterable of sequence strings.
        enzymes: iterable of ``[site, offset]`` entries; the cut falls
            ``offset`` characters into each occurrence of ``site``.

    Returns:
        A list holding one list of fragments per input sequence.
    """
    # preprocess enzymes once, then apply to each sequence
    replacements = []
    for enzyme in enzymes:
        site, offset = enzyme[0], enzyme[1]
        replacements.append((site, site[:offset] + '|' + site[offset:]))
    result = []
    for seq in seqs:
        for site, marked in replacements:
            seq = seq.replace(site, marked)  # So AATTC becomes AATT|C
        result.append(seq.split('|'))  # So AATT|C becomes AATT, C
    return result
def test():
    """Print the digest of the two sample sequences cut with TC and GC."""
    seqs = ["AATTCCGGTCGGGGCTCGGGGG","AAAGCAAAATCAAAAAAGCAAAAAATC"]
    enzymes = [["TC", 1],["GC",1]]
    # print() with a single argument behaves the same on Python 2 and 3
    print(digestfragmentwithenzyme(seqs, enzymes))

Here is my solution:
Replace TC with T C, GC with G C (this is done based on index given) and then split based on space character....
def digest(seqs, enzymes):
    """Split each sequence at every enzyme recognition site.

    A space is inserted ``offset`` characters into each occurrence of a
    recognition site and the result is split on whitespace.

    Args:
        seqs: iterable of sequence strings.
        enzymes: iterable of ``[site, offset]`` entries.

    Returns:
        A list holding one list of fragments per input sequence.
    """
    res = []
    for li in seqs:
        for en in enzymes:
            site, offset = en[0], en[1]
            li = li.replace(site, site[:offset] + " " + site[offset:])
        res.append(li.split())
    return res


seqs = ["AATTCCGGTCGGGGCTCGGGGG","AAAGCAAAATCAAAAAAGCAAAAAATC"]
enzymes = [["TC", 1],["GC",1]]
# alternative enzyme set used for the second result shown below
#enzymes = [["AAT", 2],["GC",1]]
print(seqs)
print(digest(seqs, enzymes))
the results are:
for ([["TC", 1],["GC",1]])
['AATTCCGGTCGGGGCTCGGGGG', 'AAAGCAAAATCAAAAAAGCAAAAAATC']
[['AATT', 'CCGGT', 'CGGGG', 'CT', 'CGGGGG'], ['AAAG', 'CAAAAT', 'CAAAAAAG', 'CAA
AAAAT', 'C']]
for ([["AAT", 2],["GC",1]])
['AATTCCGGTCGGGGCTCGGGGG', 'AAAGCAAAATCAAAAAAGCAAAAAATC']
[['AA', 'TTCCGGTCGGGG', 'CTCGGGGG'], ['AAAG', 'CAAAA', 'TCAAAAAAG', 'CAAAAAA', '
TC']]

Here is something that should work using regex. In this solution, I find all occurrences of your enzyme strings and split using their corresponding index.
def digestfragmentwithenzyme(seqs, enzymes):
    """Cut every sequence at each enzyme's recognition site using one regex.

    Args:
        seqs: iterable of sequence strings.
        enzymes: list of ``[site, offset]`` pairs; the cut falls ``offset``
            characters into each (non-overlapping) occurrence of ``site``.

    Returns:
        A list holding one list of fragments per input sequence.
    """
    if not enzymes:
        # An empty alternation would match the empty string at every position.
        return [[seq] for seq in seqs]
    dic = dict(enzymes)  # dictionary of enzyme indices
    # "TC|GC" in this case; built once and escaped so that sites are matched
    # literally and the matched text is always a key of ``dic``
    prog = re.compile('|'.join(re.escape(enz[0]) for enz in enzymes))
    out = []
    for seq in seqs:
        sub = []
        pos1 = 0
        for match in prog.finditer(seq):
            pos2 = match.start() + dic[match.group(0)]
            sub.append(seq[pos1:pos2])
            pos1 = pos2
        sub.append(seq[pos1:])  # whatever follows the last cut
        out.append(sub)
    # [['AATT', 'CCGGT', 'CGGGG', 'CT', 'CGGGGG'], ['AAAG', 'CAAAAT', 'CAAAAAAG', 'CAAAAAAT', 'C']]
    return out

Use positive lookbehind and lookahead regex search:
import re
def digest_fragment_with_enzyme(sequences, enzymes):
    """Yield, for each sequence, its fragments after cutting at enzyme sites.

    Each ``(site, offset)`` enzyme becomes a zero-width regex that matches the
    position preceded by ``site[:offset]`` and followed by ``site[offset:]``.

    Args:
        sequences: iterable of sequence strings.
        enzymes: iterable of ``(site, offset)`` pairs.

    Yields:
        One list of fragments per input sequence.
    """
    alternatives = [
        '((?<={})(?={}))'.format(re.escape(strs[:ind]), re.escape(strs[ind:]))
        for strs, ind in enzymes
    ]
    if not alternatives:
        # An empty pattern would match between every pair of characters.
        for seq in sequences:
            yield [seq]
        return
    # ((?<=T)(?=C))|((?<=G)(?=C)) for the sample enzymes below
    pattern = re.compile('|'.join(alternatives))
    for seq in sequences:
        indices = [0] + [m.start() for m in pattern.finditer(seq)] + [len(seq)]
        yield [seq[start:end] for start, end in zip(indices, indices[1:])]


seq = ["AATTCCGGTCGGGGCTCGGGGG", "AAAGCAAAATCAAAAAAGCAAAAAATC"]
enzymes = [["TC", 1], ["GC", 1]]
print(list(digest_fragment_with_enzyme(seq, enzymes)))
Output:
[['AATT', 'CCGGT', 'CGGGG', 'CT', 'CGGGGG'],
['AAAG', 'CAAAAT', 'CAAAAAAG', 'CAAAAAAT', 'C']]

The simplest answer I can think of:
input_list = ["AATTCCGGTCGGGGCTCGGGGG","AAAGCAAAATCAAAAAAGCAAAAAATC"]
enzymes = ['TC', 'GC']
output = []
for string in input_list:
    parts = []
    left = 0  # start of the fragment currently being collected
    # Inspect every pair of adjacent characters; a pair that is an enzyme
    # site is cut between its two letters.
    for right in range(1, len(string)):
        if string[right-1:right+1] in enzymes:
            parts.append(string[left:right])
            left = right
    parts.append(string[left:])  # remainder after the last cut
    output.append(parts)
print(output)

Throwing my hat in the ring here.
Using dict for patterns rather than list of lists.
Joining pattern as others have done to avoid fancy regexes.
.
import re
sequences = ["AATTCCGGTCGGGGCTCGGGGG","AAAGCAAAATCAAAAAAGCAAAAAATC"]
patterns = { 'TC': 1, 'GC': 1 }


def intervals(patterns, text):
    """Yield the pieces of ``text`` obtained by cutting at each pattern.

    Args:
        patterns: dict mapping a literal recognition site to the offset
            inside the site at which the cut falls.
        text: the string to cut.

    Yields:
        The fragments in order; the last one is whatever follows the final
        cut (the whole text when nothing matches).
    """
    if not patterns:
        # An empty alternation would match the empty string everywhere.
        yield text
        return
    pattern = '|'.join(map(re.escape, patterns))
    start = 0
    for match in re.finditer(pattern, text):
        index = match.start() + patterns[match.group()]
        yield text[start:index]
        start = index
    # ``start`` (not ``index``) is always bound, even when there was no match
    yield text[start:]


print([list(intervals(patterns, s)) for s in sequences])
# [['AATT', 'CCGGT', 'CGGGG', 'CT', 'CGGGGG'], ['AAAG', 'CAAAAT', 'CAAAAAAG', 'CAAAAAAT', 'C']]

Related

Regex match items in list + trailing N numbers (Python)

I have a list of expected animals:
expectedAnimals = ['cat-', 'snake-', 'hedgehog-']
Then I have a user input (in string format) that contains some or all of the expected animals from the above list, each followed by N digits. These animals are separated by random delimiting symbols (non-digit characters):
Examples:
inputString1 = 'cat-235##randomtext-123...snake-1,dog-2:snake-22~!cat-8844'
inputString2 = 'hedgehog-2>cat-1|snake-22#cat-2<$dog-55 snake-93242522. cat-3 .rat-2 snake-22 cat-8844'
My goal (with which I am struggling) is to write the function filterAnimals that should return the following correct results:
approvedAnimals1 = filterAnimals(inputString1)
['cat-235', 'snake-1', 'snake-22', 'cat-8844']
approvedAnimals2 = filterAnimals(inputString2):
['hedgehog-2', 'cat-1', 'snake-22', 'cat-2', 'snake-93242522', 'cat-3', 'snake-22', 'cat-8844']
My current implementation works partially but honestly I would like to re-write it from scratch:
# Question's implementation: for each expected animal prefix, record where it
# occurs in inputString and where the run of digits after it stops, then slice
# out each (start, end) pair.
# NOTE(review): indentation was lost in this listing; the nesting suggested by
# the comments below is a best guess.
def filterAnimals(inputString):
expectedAnimals = ['cat-', 'snake-', 'hedgehog-']
start_indexes = []
end_indexes = []
for animal in expectedAnimals:
# every index at which this animal prefix begins
temp_start_indexes = [i for i in range(len(inputString)) if inputString.startswith(animal, i)]
if len(temp_start_indexes) > 0:
start_indexes.append(temp_start_indexes)
for start_ind in temp_start_indexes:
# scan the characters that follow the prefix
for i in range(start_ind + len(animal), len(inputString)):
if inputString[i].isdigit() and i == len(inputString) - 1:
# the digits run up to the end of the string
end_indexes.append(i + 1)
break
if not inputString[i].isdigit():
# the first non-digit character ends the number
end_indexes.append(i)
break
# starts are grouped per animal, so they are flattened in animal order
start_indexes_flat = [item for sublist in start_indexes for item in sublist]
list_size = min(len(start_indexes_flat), len(end_indexes))
approvedAnimals = []
if list_size > 0:
for x in range(list_size):
approvedAnimals.append(inputString[start_indexes_flat[x]:end_indexes[x]])
return approvedAnimals
You can build an alternation pattern from expectedAnimals and use re.findall to find all matches as a list:
import re
def filterAnimals(inputString):
    """Return every expected animal prefix that is followed by digits.

    Reads the module-level ``expectedAnimals`` list; each entry is matched
    literally and must be followed by at least one digit.
    """
    prefixes = '|'.join(map(re.escape, expectedAnimals))
    return re.findall(rf"(?:{prefixes})\d+", inputString)
Demo: https://replit.com/#blhsing/OffensiveEveryWebportal
import re
# matches expected animals followed by N numbers
# (raw string: "\d" in a plain string literal is an invalid escape sequence)
pattern=re.compile(r"(cat|snake|hedgehog)-\d+")
inputString1 = 'cat-235##randomtext-123...snake-1,dog-2:snake-22~!cat-8844'
inputString2 = 'hedgehog-2>cat-1|snake-22#cat-2<$dog-55 snake-93242522. cat-3 .rat-2 snake-22 cat-8844'
animals_1 = [i.group() for i in pattern.finditer(inputString1)]
# will return ['cat-235', 'snake-1', 'snake-22', 'cat-8844']
animals_2 = [i.group() for i in pattern.finditer(inputString2)]
# will return ['hedgehog-2', 'cat-1', 'snake-22', 'cat-2', 'snake-93242522', 'cat-3', 'snake-22', 'cat-8844']

Count of sub-strings that contain character X at least once. E.g Input: str = “abcd”, X = ‘b’ Output: 6

This question was asked in an exam but my code (given below) passed just 2 cases out of 7 cases.
Input Format: a single line of input, with the string and the character separated by a comma
Input: str = “abcd,b”
Output: 6
“ab”, “abc”, “abcd”, “b”, “bc” and “bcd” are the required sub-strings.
def slicing(s, k, n):
    """Return the length-``k`` slices of ``s`` starting at 0, 1, ..., n - k.

    Args:
        s: the string to slice.
        k: length of each slice.
        n: length of ``s`` as supplied by the caller.

    Returns:
        A list of ``n - k + 1`` slices (empty when ``k > n``).
    """
    return [s[i:i + k] for i in range(n - k + 1)]
# Read one line of the form "text,target" from standard input.
# NOTE(review): indentation was lost in this listing.
x, y = input().split(',')
n = len(x)
res1 = []
# Collect the slices of every length from 1 to n.
for i in range(1, n + 1):
res1 += slicing(x, i, n)
count = 0
# Count the collected slices that contain the target.
for ele in res1:
if y in ele:
count += 1
print(count)
When the target string (ts) is found in the string S, you can compute the number of substrings containing that instance by multiplying the number of characters before the target by the number of characters after the target (plus one on each side).
This will cover all substrings that contain this instance of the target string leaving only the "after" part to analyse further, which you can do recursively.
def countsubs(S, ts):
    """Count the substrings of ``S`` that contain ``ts`` at least once.

    Args:
        S: the string whose substrings are counted.
        ts: the non-empty target string.

    Returns:
        The number of (start, end) substrings of ``S`` containing ``ts``.

    Raises:
        ValueError: if ``ts`` is empty.
    """
    if not ts:
        raise ValueError("target string must not be empty")
    if ts not in S:
        return 0  # shorter or no match
    before, after = S.split(ts, 1)  # split on target
    # Substrings covering this first occurrence: any start at or before it,
    # any end at or after it.
    result = (len(before) + 1) * (len(after) + 1)  # count for this instance
    # What is left are substrings starting after the first character of this
    # occurrence, i.e. substrings of ts[1:] + after.
    return result + countsubs(ts[1:] + after, ts)  # recurse with right side


print(countsubs("abcd", "b"))  # 6
This will work for single character and multi-character targets and will run much faster than checking all combinations of substrings one by one.
Here is a simple solution without recursion:
def my_function(s):
    """List the substrings of the text that contain the target.

    Args:
        s: a string of the form ``"text,target"``.

    Returns:
        A summary string with the number of matching substrings and the
        substrings themselves, ordered by start index and then by length.
    """
    text, target = s.split(',')
    result = [
        text[i:j]
        for i in range(len(text))
        for j in range(i + 1, len(text) + 1)
        if target in text[i:j]
    ]
    return f'count = {len(result)}, substrings = {result}'


print(my_function("abcd,b"))
#count = 6, substrings = ['ab', 'abc', 'abcd', 'b', 'bc', 'bcd']
Here you go, this should help
from itertools import combinations
output = []
initial = input('Enter string and needed letter seperated by commas: ') #Asking for input
list1 = initial.split(',') #splitting the input into two parts i.e the actual text and the letter we want common in output
text = list1[0]
target = list1[1] #the letter/phrase every counted substring must contain
#this is the core part of our code: every contiguous substring of the text.
#(itertools.combinations would also produce non-contiguous picks such as 'abd', which are not substrings)
final = [text[i:j] for i in range(len(text)) for j in range(i + 1, len(text) + 1)]
for i in final:
    if target in i:
        output.append(i) #only outputting the results which have the required letter/phrase in it
print(len(output))

Remove a pattern from list element and return another list in Python

Let's assume I have a list like this
List=["Face123","Body234","Face565"]
I would like to obtain as output a list without character/substring described in another list.
NonDesideredPattern["Face","Body"]
Output=[123,234,565].
Create a function which returns a string without the undesired patterns.
Then use this function in a comprehension list:
import re
def remove_pattern(string, patterns):
    """Return ``string`` with every match of each pattern removed.

    Args:
        string: the text to clean.
        patterns: iterable of regular expressions, applied in order.

    Returns:
        The text left after all matches have been replaced by ''.
    """
    result = string
    for p in patterns:
        result = re.sub(p, '', result)
    return result


inputs = ["Face123", "Body234", "Face565"]
undesired_patterns = ["Face", "Body"]
outputs = [remove_pattern(e, undesired_patterns) for e in inputs]
I am not sure, this is 100% efficient, but you could do something like this:
def eval_list(og_list):
    """Split each element into its alphabetic and non-alphabetic characters.

    Args:
        og_list: iterable of strings.

    Returns:
        A ``(list_parts, list_nums)`` tuple: for every element, the
        concatenation of its alphabetic characters and the concatenation of
        all its other characters, in the original order.
    """
    list_parts = []
    list_nums = []
    for element in og_list:
        list_parts.append(''.join(char for char in element if char.isalpha()))
        list_nums.append(''.join(char for char in element if not char.isalpha()))
    return list_parts, list_nums
(if you are always working with alphabetical syntax and then a number)
Use re.compile and re.sub
import re
lst = ["Face123", "Body234", "Face565"]
lst_no_desired_pattern = ["Face","Body"]
# Escape each entry so it is removed as a literal substring, not as a regex
pattern = re.compile("|".join(map(re.escape, lst_no_desired_pattern)))
lst_output = [pattern.sub("", word) for word in lst]
Result:
['123', '234', '565']

Managing duplicates when sorting by character order in string

I am trying to solve through a challenge where I have to reorder the letters in string s in the order it appears on string t. For example:
For s = "weather" and t = "therapyw", the output should be
sortByString(s, t) = "theeraw";
For s = "good" and t = "odg", the output should be
sortByString(s, t) = "oodg".
This is my code:
# Question's attempt: emit the characters of t that also occur in s, in the
# order in which they appear in t.
# NOTE(review): indentation was lost in this listing.
def sortByString(s, t):
s_list = list(s)
t_list = list(t)
output = []
for i in range(len(t_list)):
if t_list[i] in s_list:
# inserted once per character of t, however often it occurs in s
output.insert(i, t_list[i])
return ''.join(output)
It works for all cases except if the same letter exists more than once.
s: "weather"
t: "therapyw"
Output:
"theraw"
Expected Output:
"theeraw"
How can I handle this situation in my code above? What am I missing? I appreciate all help but instead of just blurting out the answer, I would like to know what I'm doing wrong.
The issue with your current code is that it only adds one copy of each character in t to output, regardless of how many times it occurs in s. You can work around that by looping over the count of that character in s and appending to output for each count:
def sortByString(s, t):
    """Reorder the letters of ``s`` to follow the order of letters in ``t``.

    Each character of ``t`` is emitted as many times as it occurs in ``s``;
    characters of ``s`` that do not appear in ``t`` are left out.
    """
    s_list = list(s)
    t_list = list(t)
    output = []
    for char in t_list:
        # one copy per occurrence in s
        for _ in range(s_list.count(char)):
            output.append(char)
    return ''.join(output)


print(sortByString('weather',"therapyw"))
print(sortByString('good',"odg"))
Output:
theeraw
oodg
You can simplify the loop by just adding copies of a list with the current character according to the count of the character in s:
for c in t_list:
    # extend in place rather than rebuilding the list on every iteration
    output += [c] * s_list.count(c)
Easy way
Use enumerate and turn your string into a dict
def sortByString(s, t):
    """Reorder the letters of ``s`` to follow the order of letters in ``t``.

    Args:
        s: the string whose characters are reordered.
        t: the string that defines the order.

    Returns:
        The characters of ``s`` sorted by their position in ``t``. Repeated
        characters are kept; characters that do not occur in ``t`` have no
        position and are left out.
    """
    orderdict = {char: index for index, char in enumerate(t)}
    # Sort the characters of ``s`` itself (not a fixed sample word) and skip
    # those without a rank, whose key would otherwise be None.
    output = sorted((char for char in s if char in orderdict), key=orderdict.get)
    return ''.join(output)
This will allow repeated values
Example
>>> sortByString('weather',"therapyw")
'theeraw'
Modification to OP's code
Just add the element number of times it appear in s to the output
def sortByString(s,t):
    """Reorder the letters of ``s`` to follow the order of letters in ``t``.

    For each character of ``t`` that occurs in ``s``, that character is
    appended repeated as many times as it occurs in ``s``.
    """
    s_list = list(s)
    t_list = list(t)
    output = []
    for char in t_list:
        if char in s_list:
            output.append(char * s_list.count(char))
    return ''.join(output)
output
>>> sortByString('weather',"therapyw")
'theeraw'
2 steps:
a. create a sorted list of characters in s and their order in t using index()
b. use zip(* to extract the sorted list of characters
s = "weather"
# t must contain every character of s; t.index() raises ValueError otherwise
t = "therapyw"
# (position in t, character) pairs, sorted by position
a = sorted([(t.index(c),c) for c in s])
# zip(*a) transposes the pairs; element 1 is the tuple of characters
b = ''.join(list(zip(*a))[1])
print(b)
Output:
theeraw

Regex to find multiple combinations from a string with specific characters as start and end

I am trying to find matches of possible strings with start characters: 'ATG' and end characters 'TAA' or 'TGA' or 'TAG'.
For example, if my string is:
seq = 'GATGATCGATGCTGACGTATAGGTTAAC'
I want to use regular expressions to match these 3:
match1 = 'ATGATCGATGCTGA'
match2 = 'ATGATCGATGCTGACGTATAG'
match3 = 'ATGATCGATGCTGACGTATAGGTTAA'
And the same if you use the second 'ATG':
MATCH4 = 'ATGCTGA'
MATCH5 = 'ATGCTGACGTATAG'
MATCH6 = 'ATGCTGACGTATAGGTTAA'
So far i have written this:
Frame1_ORF = re.match(\r '(^(ATG)?:(TGA|TAA|TAG)$)',Frame1)
But there is something wrong with my syntax or my symbols.
Could you help me find these 6 matches?
Thanks.
If you are trying to extract strings as defined within a bigger string, this could do the job
matches = (
    '**ATG**ATCG**ATG**C**TGA**',
    '**ATG**ATCG**ATG**C**TGA**CGTA**TAG**',
    '**ATG**ATCG**ATG**C**TGA**CGTA**TAG**GT**TAA**'
)
for string in matches:
    # [0][0]: first match, outermost group (from the first ATG to the last stop codon)
    print(re.findall(r'((ATG)(.*)(TGA|TAA|TAG))', string)[0][0])
Output
ATG**ATCG**ATG**C**TGA
ATG**ATCG**ATG**C**TGA**CGTA**TAG
ATG**ATCG**ATG**C**TGA**CGTA**TAG**GT**TAA
To build on the answer provided by @Arount:
matches = (
    '**ATG**ATCG**ATG**C**TGA**',
    '**ATG**ATCG**ATG**C**TGA**CGTA**TAG**',
    '**ATG**ATCG**ATG**C**TGA**CGTA**TAG**GT**TAA**'
)
for string in matches:
    # raw string: "\*" in a plain string literal is an invalid escape sequence
    print(re.findall(r'(\*\*(?:ATG)(?:.*)(?:TGA|TAA|TAG)\*\*)', string)[0])
It outputs:
**ATG**ATCG**ATG**C**TGA**
**ATG**ATCG**ATG**C**TGA**CGTA**TAG**
**ATG**ATCG**ATG**C**TGA**CGTA**TAG**GT**TAA**
The 2 changes I made:
Appended and prepended \*\*
The inner groups are non-capturing ones now (?:). It's no longer needed to do [0][0]. Just [0] now.
This should do it:
import re
resp =[]
seq = 'G**ATG**ATCG**ATG**C**TGA**CGTA**TAG**GT**TAA**C'
# group 1: everything after the first ATG up to the last stop codon; group 2: that stop codon
orf_pattern = re.compile(r'(?<=ATG)(.*)(?=(TGA|TAA|TAG))')
while True:
    found = orf_pattern.findall(seq)
    if not found:
        break
    body, stop = found[0]
    resp.append("ATG" + body + stop)
    # drop the stop codon just used so the next pass ends at the previous one
    seq = "ATG" + body
print(resp)
Another possibility is to generate a list of combinations of the sequence that follow the rules of the desired sequence, and filter the final result so that all the sequences left exist in the input sequence:
seq = 'G**ATG**ATCG**ATG**C**TGA**CGTA**TAG**GT**TAA**C'
def new_data(f):
    """Decorator that feeds the tokens of the module-level ``seq`` to ``f``.

    The wrapped function takes no arguments: it calls ``f`` with
    ``seq.split('**')``, joins each returned token list with '**', and keeps
    only the joined strings that occur in ``seq``.
    """
    def wrapper():
        full = map('**'.join, f(seq.split('**')))
        return list(filter(lambda x: x in seq, full))
    return wrapper
@new_data
def outer(data):
    """Enumerate token lists that start with 'ATG' and end with a stop codon.

    A token is only added while it has been used fewer times than it occurs
    in the module-level ``seq``.
    NOTE(review): indentation was lost in the original listing; the nesting
    below is a reconstruction and should be confirmed.
    """
    stops = ['TAA', 'TGA', 'TAG']

    def combinations(d, current=None):
        if current is None:
            current = []
        if current and current[0] == 'ATG' and current[-1] in stops:
            yield current
        else:
            for i in d:
                if i in stops and current:
                    # a stop codon closes the current candidate
                    yield current + [i]
                else:
                    if not current and i == 'ATG':
                        yield from combinations(d, [i] + current)
                    elif current and current.count(i) < seq.split('**').count(i):
                        yield from combinations(d, current + [i])
    return list(combinations(data))
results = list(map(lambda x:'**'+x+'**', outer()))
Output:
['**ATG**ATCG**ATG**C**TGA**', '**ATG**ATCG**ATG**C**TGA**', '**ATG**ATCG**ATG**C**TGA**', '**ATG**ATCG**ATG**C**TGA**', '**ATG**C**TGA**', '**ATG**C**TGA**', '**ATG**ATCG**ATG**C**TGA**', '**ATG**ATCG**ATG**C**TGA**', '**ATG**ATCG**ATG**C**TGA**', '**ATG**ATCG**ATG**C**TGA**', '**ATG**C**TGA**', '**ATG**C**TGA**']
Edit: regarding your recent data format change, you can create a function to find the overall groupings:
def combinations():
    """Yield the tokens of the hard-coded sequence, longest prefix first.

    At each step the longest grouping that the remaining sequence starts
    with is yielded and removed from the front of the sequence.

    Raises:
        ValueError: if no grouping matches the start of the remaining text.
    """
    groupings = {'G', 'ATG', 'ATCG', 'ATG', 'C', 'TGA', 'CGTA', 'TAG', 'GT', 'TAA', 'C'}
    seq = 'GATGATCGATGCTGACGTATAGGTTAAC'
    while seq:
        possibilities = [i for i in groupings if seq.startswith(i)]
        longest = max(possibilities, key=len)
        seq = seq[len(longest):]
        yield longest
seq = '**'.join(combinations())
results = list(map(lambda x:'**'+x+'**', outer()))

Categories