Getting all possible combinations of a regular expression string - python

I have a regular expression: ATG(C|G|A)(C|T)GA
The regular expression above can take any form, but the only special character it uses is OR (|), which may appear at any position in the string, and there can be any number of letters within the parentheses.
I want to match all combinations of this string in a list:
ATGCCGA
ATGCTGA
ATGGCGA
ATGGTGA
ATGACGA
ATGATGA
I am unable to find any python library that could do this.

You could take the cartesian product of the dynamic parts of the string using itertools.product then join with the other static parts of the string.
>>> from itertools import product
>>> [f'ATG{i}{j}GA' for i,j in product('CGA', 'CT')]
['ATGCCGA', 'ATGCTGA', 'ATGGCGA', 'ATGGTGA', 'ATGACGA', 'ATGATGA']

You can use recursion:
import collections
s = 'ATG(C|G|A)(C|T)GA'
def combos(d):
    """Yield every string produced by expanding the ``(a|b|...)`` groups in ``d``.

    ``d`` is a ``collections.deque`` of single characters and is consumed
    (mutated) as it is parsed. A recursive call handles the text after a
    ``(`` and returns once the matching ``)`` has been consumed.
    Alternatives are expected to appear inside parentheses.

    An empty pattern or an empty group ``()`` yields ``''`` rather than
    ``None``, so the results can always be concatenated by the caller.
    """
    # r: expansions built so far; k: literal text read since the last delimiter
    r, k = [], None
    while d:
        if (c := d.popleft()) not in '|()':
            k = (k if k else '') + c
        elif c == '|':
            # end of one alternative
            if k:
                r.append(k)
            k = None
        elif c == '(':
            # cross every expansion so far (plus pending literal) with the group's options
            r = [v + (k or '') + i for i in combos(d) for v in (r if r else [''])]
            k = None
        else:
            # ')' closes the group this call is responsible for
            if k:
                r.append(k)
            k = None
            break
    # append trailing literal text to every expansion; never yield None
    yield from ([i + (k or '') for i in r] if r else [k or ''])
print(list(combos(collections.deque(list(s)))))
Output:
['ATGCCGA', 'ATGGCGA', 'ATGACGA', 'ATGCTGA', 'ATGGTGA', 'ATGATGA']

Related

Encode the DNA string in such a way that similar subsequent characters are grouped into number of occurrences along with the char

I need help in writing the Python code which would return the following output_string as mentioned below in the examples.
Example 1:
input_string = "AAABCCCCDDA"
output_string = "3AB4C2DA"
Example 2:
input_string = "ABBBBCCDDDDAAAAA"
output_string = "A4B2C4D5A"
You can use itertools.groupby.
In Python 3.8+, you can use the walrus operator (:=) to write a shorter version.
>>> from itertools import groupby
>>> input_string = "ABBBBCCDDDDAAAAA"
>>> ''.join(f"{len_g}{k}" if (len_g := len(list(g))) > 1 else k for k, g in groupby(input_string))
'A4B2C4D5A'
In Python < 3.8:
from itertools import groupby
input_string = "AAABCCCCDDA"
# One token per run of equal characters: "<count><char>" for runs longer
# than one, the bare character otherwise. Tokens are joined once at the end
# instead of growing the string with `+=` inside the loop.
tokens = []
for k, g in groupby(input_string):
    len_g = len(list(g))
    tokens.append(f"{len_g}{k}" if len_g > 1 else k)
st = ''.join(tokens)
print(st)
Output:'3AB4C2DA'
it seems like regex also can do the trick:
from re import sub
dna = "AAABCCCCDDA"
# Every run of two or more identical word characters is rewritten as
# "<run length><character>"; single characters are left untouched.
sub(r'(\w)\1+', lambda run: str(len(run.group(0))) + run.group(1), dna)  # '3AB4C2DA'

Remove N consecutive repeated characters in a string

I am trying to solve a problem where the user inputs a string say str = "aaabbcc" and an integer n = 2.
So the function is supposed to remove the characters that appear exactly 'n' times in a row from str and output only "aaa".
I tried couple of approaches and I'm not able to obtain the right output.
Are there any Regular expression functions that I could use or any recursive functions or just plain old iterations.
Thanks in advance.
Using itertools.groupby
Ex:
from itertools import groupby
s = "aaabbcc"
n = 2
# Split the string into runs of equal characters, then keep every run whose
# length is not exactly n. Joined once rather than concatenated in a loop.
runs = ("".join(group) for _, group in groupby(s))
result = "".join(run for run in runs if len(run) != n)
print(result)
Output:
aaa
You can use itertools.groupby:
>>> s = "aaabbccddddddddddeeeee"
>>> from itertools import groupby
>>> n = 3
>>> groups = (list(values) for _, values in groupby(s))
>>> "".join("".join(v) for v in groups if len(v) < n)
'bbcc'
from collections import Counter
# Whole-string occurrence count per character (not per consecutive run).
counts = Counter(string)
# Keep only the characters that do not occur exactly twice overall.
string = "".join(filter(lambda c: counts[c] != 2, string))
Edit: Wait, sorry, I missed "consecutive". This will remove characters that occur exactly two times in the whole string (fitting your example, but not the general case).
Consecutive filter is a bit more complex, but doable - just find the consecutive runs first, then filter out the ones which have length two.
# Build [character, run_length] pairs, one per maximal run of equal characters.
# Starting from an empty list (instead of seeding with string[0]) keeps this
# working for an empty input string.
runs = []
for c in string:
    if runs and c == runs[-1][0]:
        runs[-1][1] += 1
    else:
        runs.append([c, 1])
# Rebuild the string, leaving out the runs of exactly two characters.
string = "".join(c * length for c, length in runs if length != 2)
Edit2: As the other answers correctly point out, the first part of this is done natively by groupby
from itertools import groupby
# groupby yields (key, iterator) pairs, not (key, length): each group must be
# materialised before its length can be checked or the run rebuilt.
string = "".join(
    "".join(run)
    for run in (list(group) for _, group in groupby(string))
    if len(run) != 2
)
In [15]: some_string = 'aaabbcc'
In [16]: n = 2
In [17]: final_string = ''
In [18]: for k, v in Counter(some_string).items():
...: if v != n:
...: final_string += k * v
...:
In [19]: final_string
Out[19]: 'aaa'
You'll need: from collections import Counter
from collections import defaultdict
def fun(string, n):
    """Count the characters of ``string`` and drop those counted exactly ``n`` times.

    Counts are over the whole string, not per consecutive run. Returns the
    ``defaultdict`` mapping each remaining character to its count.
    """
    tally = defaultdict(int)
    for ch in string:
        tally[ch] += 1
    # Collect the keys first so the dict is not resized while being iterated.
    for ch in [key for key, seen in tally.items() if seen == n]:
        del tally[ch]
    return tally
string = "aaabbcc"
n = 2
result = fun(string, n)
# Expand every surviving character back to its count; joined once instead
# of concatenating inside a loop.
sol = ''.join(ch * result[ch] for ch in result)
print(sol)
output
aaa

How to find and get rid of consecutive repeated punctuation signs without using regular expressions in python?

I want to get rid of repeated consecutive punctuation signs and only leave one of them.
If I have
string = 'Is it raining????',
I want to get
string = 'Is it raining?'
But I don't want to get rid of '...'
I also need to do this without using regular expressions. I am a beginner in python and would appreciate any advice or hint. Thanks :)
Yet another groupby approach:
from itertools import groupby
from string import punctuation
# Punctuation marks whose repeats get collapsed; the period is exempt so "..." survives.
punc = set(punctuation) - set('.')
s = 'Thisss is ... a test!!! string,,,,, with 1234445556667 rrrrepeats????'
print(s)
newtext = []
for char, run in groupby(s):
    # A run of collapsible punctuation contributes one character;
    # any other run is copied through unchanged.
    newtext += [char] if char in punc else run
print(''.join(newtext))
output
Thisss is ... a test!!! string,,,,, with 1234445556667 rrrrepeats????
Thisss is ... a test! string, with 1234445556667 rrrrepeats?
import string
from itertools import groupby
# get all punctuation minus period.
puncs = set(string.punctuation) - set('.')
s = 'Is it raining???? No but...,,,, it is snowing!!!!!!!###!######'
# get count of consecutive characters
t = [[k, len(list(g))] for k, g in groupby(s)]
# A run of collapsible punctuation shrinks to a single character; every
# other run keeps its full length. Joined once instead of `+=` in a loop.
s = ''.join(char if char in puncs else char * count for char, count in t)
print(s)
# Is it raining? No but..., it is snowing!#!#
How about the following kind of approach:
import string
text = 'Is it raining???? No,,,, but...,,,, it is snoooowing!!!!!!!'
for mark in string.punctuation:
    if mark == '.':
        # periods are left alone so that "..." survives
        continue
    doubled = mark * 2
    # Halve doubled marks until no adjacent pair of this mark remains.
    while doubled in text:
        text = text.replace(doubled, mark)
print(text)
This would give the following output:
Is it raining? No, but..., it is snoooowing!
Or for a more efficient version giving the same results:
import string
text = 'Is it raining???? No,,,, but...,,,, it is snoooowing!!!!!!!'


def dedupe_punctuation(text):
    """Return ``text`` with consecutive repeats of a punctuation mark collapsed.

    Periods are always kept. Only repeats that are directly adjacent are
    dropped, so a mark that reappears after a period or any other
    character is preserved.
    """
    last = None  # punctuation mark that ended the output so far, if any
    output = []
    for c in text:
        if c == '.':
            # A period breaks the current run, so the same mark appearing
            # after it is not a consecutive repeat.
            last = None
            output.append(c)
        elif c != last:
            last = c if c in string.punctuation else None
            output.append(c)
    return ''.join(output)


print(dedupe_punctuation(text))
from itertools import groupby
s = 'Is it raining???? okkkk!!! ll... yeh""" ok?'
# Walk the string run by run: a run of a non-letter other than '.' shrinks
# to a single character, every other run is copied unchanged. Working on
# the runs directly avoids the find/replace bookkeeping, which merged
# separate runs of the same character.
s = ''.join(
    ch if not ch.isalpha() and ch != '.' else ''.join(run)
    for ch, run in groupby(s)
)
print(s)

How to compress by removing duplicates in python?

I have strings with blocks of the same character in, eg '1254,,,,,,,,,,,,,,,,982'. What I'm aiming to do is replace that with something along the lines of '1254(,16)982' so that the original string can be reconstructed. If anyone could point me in the right direction that would be greatly appreciated
You're looking for run-length encoding: here is a Python implementation based loosely on this one.
import itertools
def runlength_enc(s):
    """Return a run-length encoded version of the string.

    The result is a list in which every run of two or more equal characters
    appears as a ``(char, count)`` tuple and each stretch of non-repeated
    characters appears as one plain string.
    """
    encoded = []
    literal = []  # non-repeated characters waiting to be emitted as one string
    for char, run in itertools.groupby(s):
        count = sum(1 for _ in run)
        if count == 1:
            literal.append(char)
            continue
        if literal:
            encoded.append("".join(literal))
            literal = []
        encoded.append((char, count))
    if literal:
        encoded.append("".join(literal))
    return encoded
def runlength_decode(enc):
    """Rebuild the original string from a run-length encoded list.

    ``enc`` holds plain strings (copied as-is) and ``(char, count)`` pairs
    (expanded to ``char * count``). Items are told apart by type rather than
    by length, so a two-character literal such as ``'ab'`` is not mistaken
    for a pair.
    """
    return "".join(c if isinstance(c, str) else c[0] * c[1] for c in enc)
For your example:
print(runlength_enc("1254,,,,,,,,,,,,,,,,982"))
# ['1254', (',', 16), '982']
print(runlength_decode(runlength_enc("1254,,,,,,,,,,,,,,,,982")))
# 1254,,,,,,,,,,,,,,,,982
(Note that this will be efficient only if there are very long runs in your string).
If you don't care about the exact compressed form, you may want to look at zlib.compress and zlib.decompress. zlib is a standard Python library that can compress a single string and will probably achieve better compression than a self-implemented compression algorithm.
using regular expressions:
s = '1254,,,,,,,,,,,,,,,,982'
import re
# Each run of two or more identical characters becomes "(<char><run length>)".
c = re.sub(r'(.)\1+', lambda m: '(%s%d)' % (m.group(1), len(m.group(0))), s)
print(c)  # 1254(,16)982
using itertools
import itertools
# NOTE(review): `s` is not defined in this snippet; presumably it is the
# string from the regular-expression example above -- confirm.
pieces = []
for char, g in itertools.groupby(s):
    k = len(list(g))
    # single characters are copied; longer runs become "(<char><run length>)"
    pieces.append(char if k == 1 else '(%s%d)' % (char, k))
c = ''.join(pieces)
print(c)  # 1254(,16)982

removing non-numeric characters from a string

strings = ["1 asdf 2", "25etrth", "2234342 awefiasd"] #and so on
Which is the easiest way to get [1, 25, 2234342]?
How can this be done without a regex module or expression like (^[0-9]+)?
One could write a helper function to extract the prefix:
def numeric_prefix(s):
    """Return the integer formed by the leading decimal digits of ``s``.

    Returns 0 when ``s`` is empty or does not start with a decimal digit.
    ``str.isdecimal`` is used instead of ``str.isdigit`` because the latter
    also accepts characters such as superscript digits, which ``int()``
    cannot convert.
    """
    n = 0
    for c in s:
        if not c.isdecimal():
            break
        n = n * 10 + int(c)
    return n
Example usage:
>>> strings = ["1asdf", "25etrth", "2234342 awefiasd"]
>>> [numeric_prefix(s) for s in strings]
[1, 25, 2234342]
Note that this will produce correct output (zero) when the input string does not have a numeric prefix (as in the case of empty string).
Working from Mikel's solution, one could write a more concise definition of numeric_prefix:
import itertools
def numeric_prefix(s):
    """Return the integer formed by the leading decimal digits of ``s``, or 0 if there are none.

    ``str.isdecimal`` is used instead of ``str.isdigit`` because the latter
    also accepts characters such as superscript digits, which ``int()``
    cannot convert.
    """
    n = ''.join(itertools.takewhile(str.isdecimal, s))
    return int(n) if n else 0
# Keeps every digit found anywhere in the item, not just the leading ones,
# so "1 asdf 2" becomes 12. An item without any digit raises ValueError
# because int('') is invalid.
new = [int(''.join(i for i in item if i.isdigit())) for item in strings]
print(new)
[1, 25, 2234342]
Basic usage of regular expressions:
import re
strings = ["1asdf", "25etrth", "2234342 awefiasd"]
# Raw string so that \d reaches the regex engine instead of being treated
# as a (deprecated) string escape. \d* also matches the empty prefix, so
# match() never returns None here.
regex = re.compile(r'^(\d*)')
for s in strings:
    mo = regex.match(s)
    print(s, '->', mo.group(0))
1asdf -> 1
25etrth -> 25
2234342 awefiasd -> 2234342
Building on sahhhm's answer, you can fix the "1 asdf 1" problem by using takewhile.
from itertools import takewhile
def isdigit(char):
    """Return ``char.isdigit()``; a named predicate to pass to ``takewhile``."""
    return char.isdigit()


numbers = []
for string in strings:
    # leading run of digit characters, possibly empty
    leading = ''.join(takewhile(isdigit, string))
    if not leading:
        continue
    number = int(leading)
    # a prefix whose value is zero is not recorded
    if number:
        numbers.append(number)
So you only want the leading digits? And you want to avoid regexes? Probably there's something shorter but this is the obvious solution.
nlist = []
for s in strings:
    # Skip empty strings and anything that does not start with a decimal
    # digit. Checking only for letters let strings such as " 12" or "-5"
    # through, which then failed on int('').
    if not s or not s[0].isdecimal():
        continue
    for i, c in enumerate(s):
        if not c.isdecimal():
            # first non-digit found: everything before it is the number
            nlist.append(int(s[:i]))
            break
    else:
        # no non-digit found: the whole string is the number
        nlist.append(int(s))

Categories