find a Pattern Match in string in Python - python

I am trying to find an amino acid pattern (B-C or M-D, where '-' can be any letter other than 'P') in a protein sequence, let's say 'VATLDSCBACSKVNDNVKNKVKVKNVKMLDHHHV'. The protein sequence is in a FASTA file.
I have tried a lot but couldn't find a solution.
The following code is one of my attempts:
import Bio
from Bio import SeqIO
seqs= SeqIO.parse(X, 'fasta') ### to read the sequences from fasta file
for aa in seqs:
x=aa.seq ## gives the sequences as a string (.seq is a build in function of Biopython)
for val, i in enumerate(x):
if i=='B':
if (x[val+2])=='C':
if x[val+1]!='P':
pattern=((x[val]:x[val+2])) ## trying to print full sequence B-C
But unfortunately none of them work.
It would be great if someone can help me out with this problem.

>>> x = 'VATLDSCBACSKVNDNVKNKVKVKNVKMLDHHHV'
>>> import re
>>> m = re.search('B(.+?)C', x)
>>> m
<_sre.SRE_Match object at 0x10262aeb0>
>>> m = re.search('B(.+?)C', x).group(0)
>>> m
'BAC'
>>> m = re.search('M(.+?)D', x).group(0)
>>> m
'MLD'
>>> re.search(r"(?<=M).*?(?=D)", x).group(0)
'L'
>>> re.search(r"(?<=B).*?(?=C)", x).group(0)
'A'

A common solution for pattern matching is the usage of regex.
A possible regex for your problem is B[^P]C|M[^P]D.
The following code has been generated by regex101 with the regex I propose and the test string you gave us. It finds all matching patterns along with their positions in the original string.
# coding=utf8
# the above tag defines encoding for this document and is for Python 2.x compatibility
import re
regex = r"B[^P]C|M[^P]D"
test_str = "VATLDSCBACSKVNDNVKNKVKVKNVKMLDHHHV"
matches = re.finditer(regex, test_str, re.MULTILINE)
for matchNum, match in enumerate(matches, start=1):
print ("Match {matchNum} was found at {start}-{end}: {match}".format(matchNum = matchNum, start = match.start(), end = match.end(), match = match.group()))
for groupNum in range(0, len(match.groups())):
groupNum = groupNum + 1
print ("Group {groupNum} found at {start}-{end}: {group}".format(groupNum = groupNum, start = match.start(groupNum), end = match.end(groupNum), group = match.group(groupNum)))
# Note: for Python 2.7 compatibility, use ur"" to prefix the regex and u"" to prefix the test string and substitution.

In python you can use the Regex module (re):
import re  # regular-expression module from the standard library
import Bio
from Bio import SeqIO

# [A-OQ-Z] : match any uppercase letter from A to O or from Q to Z (excl. P)
# |        : alternation, either the left or the right part should match
# The r prefix makes this a raw string literal (backslashes are passed to the
# regex engine unchanged); it does not by itself turn the string into a regex.
RE = r'B[A-OQ-Z]C|M[A-OQ-Z]D'

seqs = SeqIO.parse(X, 'fasta')  # X: the FASTA input, as in the question
for sequence in seqs:
    # The record attribute is `.seq` (not `.se`); convert it to a plain str
    # before handing it to the re module.
    line = str(sequence.seq)
    # findall returns a list of all non-overlapping matches.
    results = re.findall(RE, line)
    # Iterate over each result:
    for res in results:
        print(res)
Then you can also modify the Regular expression to match any other rule you would like to match.
More information about the findall function here : re.findall(...)
The following website can help you build a regex :
https://regex101.com/

Use a regular expression with a negated character class: inside square brackets a leading "^" means "not", so [^P] matches any single character except P.
import re
string = 'VATLDSCBACSKVNDNVKNKVKVKNVKMLDHHHV'
re.findall(r"B[^P]C|M[^P]D", string)
Output:
['BAC', 'MLD']

Related

Why isn't my Python regex matching part of the query string? [duplicate]

This question already has answers here:
Retrieving parameters from a URL
(20 answers)
Closed 3 years ago.
I'm using Python 3.7. I want to extract the portion of a url between the "q=...&" part of a query string. I have this code
href = span.a['href']
print("href:" + href)
matchObj = re.match( r'q=(.*?)\&', href, re.M|re.I)
if matchObj:
criteria = matchObj.group(1)
but despite the fact that my href is this
href:/search?hl=en-US&q=bet+i+won+t+get+one+share&tbm=isch&tbs=simg:CAQSkwEJyapBtj9kKiIahwELEKjU2AQaAAwLELCMpwgaYgpgCAMSKMILxAufFcsLnBWeFZsVnRWABMcPsCKgLaMtoi2hLZ0tqziiI6w4uSQaMG01mL5LQ62s4q5ZMf-Wetz68lCkHfrFOOKs2CELzQJlPjHIMzmlp2Ny-a5t7hZbiCAEDAsQjq7-CBoKCggIARIEXLNODAw&sa=X&ved=0ahUKEwjThcCx59ziAhWKHLkGHfWjDs4Q2A4ILCgB
the "matchObj" is always NoneType and the subsequent lines aren't evaluated. What else do I need to do to fix my regex?
You can use the urllib module
Ex:
import urllib.parse as urlparse
url = "href:/search?hl=en-US&q=bet+i+won+t+get+one+share&tbm=isch&tbs=simg:CAQSkwEJyapBtj9kKiIahwELEKjU2AQaAAwLELCMpwgaYgpgCAMSKMILxAufFcsLnBWeFZsVnRWABMcPsCKgLaMtoi2hLZ0tqziiI6w4uSQaMG01mL5LQ62s4q5ZMf-Wetz68lCkHfrFOOKs2CELzQJlPjHIMzmlp2Ny-a5t7hZbiCAEDAsQjq7-CBoKCggIARIEXLNODAw&sa=X&ved=0ahUKEwjThcCx59ziAhWKHLkGHfWjDs4Q2A4ILCgB"
data = urlparse.urlparse(url)
print(urlparse.parse_qs(data.query)['q'][0])
Output:
bet i won t get one share
You're using the wrong function if you wish to match in the middle of the string.
re.match only matches from start of the string
If zero or more characters at the beginning of string match the
regular expression pattern, return a corresponding match object.
Here use re.search instead.
import re
href = 'href:/search?hl=en-US&q=bet+i+won+t+get+one+share&tbm=isch&tbs=simg:CAQSkwEJyapBtj9kKiIahwELEKjU2AQaAAwLELCMpwgaYgpgCAMSKMILxAufFcsLnBWeFZsVnRWABMcPsCKgLaMtoi2hLZ0tqziiI6w4uSQaMG01mL5LQ62s4q5ZMf-Wetz68lCkHfrFOOKs2CELzQJlPjHIMzmlp2Ny-a5t7hZbiCAEDAsQjq7-CBoKCggIARIEXLNODAw&sa=X&ved=0ahUKEwjThcCx59ziAhWKHLkGHfWjDs4Q2A4ILCgB'
print("href:" + href)
matchObj = re.search( r'q=(.*?)\&', href, re.M|re.I)
if matchObj:
criteria = matchObj.group(1)
print(criteria)
'bet+i+won+t+get+one+share'
Here, we would apply a simple expression with left and right boundaries such as:
&q=(.+?)&
Demo
# coding=utf8
# the above tag defines encoding for this document and is for Python 2.x compatibility
import re

# Capture whatever lies between "&q=" and the next "&" (lazy, at least one char).
regex = r"&q=(.+?)&"

# The literal has to stay on one physical line: a raw line break inside a
# "..." string is a syntax error, so the trailing newline is written as \n.
test_str = "href:/search?hl=en-US&q=bet+i+won+t+get+one+share&tbm=isch&tbs=simg:CAQSkwEJyapBtj9kKiIahwELEKjU2AQaAAwLELCMpwgaYgpgCAMSKMILxAufFcsLnBWeFZsVnRWABMcPsCKgLaMtoi2hLZ0tqziiI6w4uSQaMG01mL5LQ62s4q5ZMf-Wetz68lCkHfrFOOKs2CELzQJlPjHIMzmlp2Ny-a5t7hZbiCAEDAsQjq7-CBoKCggIARIEXLNODAw&sa=X&ved=0ahUKEwjThcCx59ziAhWKHLkGHfWjDs4Q2A4ILCgB\n"

matches = re.finditer(regex, test_str, re.MULTILINE)

for matchNum, match in enumerate(matches, start=1):
    print ("Match {matchNum} was found at {start}-{end}: {match}".format(matchNum = matchNum, start = match.start(), end = match.end(), match = match.group()))
    # Capture groups are numbered from 1; group 0 is the whole match printed above.
    for groupNum in range(1, len(match.groups()) + 1):
        print ("Group {groupNum} found at {start}-{end}: {group}".format(groupNum = groupNum, start = match.start(groupNum), end = match.end(groupNum), group = match.group(groupNum)))

# Note: for Python 2.7 compatibility, use ur"" to prefix the regex and u"" to prefix the test string and substitution.
RegEx Circuit
jex.im visualizes regular expressions:

Python re.findall finds strangely wrong patterns [duplicate]

This question already has answers here:
re.findall behaves weird
(3 answers)
Closed 3 years ago.
I'm genuinely curious why re.findall does such weird things as returning empty strings and tuples (what is that supposed to mean?). It seems it does not treat parentheses () the way I expect, and it also interprets | differently: ab|cd is read as (ab)|(cd), not a(b|c)d as you would normally think. Because of that I can't define the regex I need.
But in this example ie see clear wrong behaviour on the simple pattern:
([a-zA-Z0-9]+\.+)+[a-zA-Z0-9]{1,3}
what describes simple urls like gskinner.com, www.capitolconnection.org what you can see on regex help in https://regexr.com/ , i recognize with re.findall :
hotmail.
living.
item.
2.
4S.
that is, letters followed by just a dot. How can that be?
The full code, where I try to filter out junk from the text, is:
import re
singles = r'[()\.\/$%=0-9,?!=; \t\n\r\f\v\":\[\]><]'
digits_str = singles + r'[()\-\.\/$%=0-9 \t\n\r\f\v\'\":\[\]]*'
#small_word = '[a-zA-Z0-9]{1,3}'
#junk_then_small_word = singles + small_word + '(' + singles + small_word + ')*'
email = singles + '\S+#\S*'
http_str = r'[^\.]+\.+[^\.]+\.+([^\.]+\.+)+?'
http = '(http|https|www)' + http_str
web_address = '([a-zA-Z0-9]+\.+)+[a-zA-Z0-9]{1,3}'
pat = email + '|' + digits_str
d_pat = re.compile(web_address)
text = '''"Lucy Gonzalez" test-defis-wtf <stagecoachmama#hotmail.com> on 11/28/2000 01:02:22 PM
http://www.living.com/shopping/item/item.jhtml?.productId=LC-JJHY-2.00-10.4S.I will send checks
directly to the vendor for any bills pre 4/20. I will fax you copies. I will also try and get the payphone transferred.
www.capitolconnection.org <http://www.capitolconnection.org>.
and/or =3D=3D=3D=3D=3D=3D=3D= O\'rourke'''
print('findall:')
for x in re.findall(d_pat,text):
print(x)
print('split:')
for x in re.split(d_pat,text):
print(x)
From the documentation of re.findall:
If one or more groups are present in the pattern, return a list of groups; this will be a list of tuples if the pattern has more than one group.
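Your regex has groups, namely the part in parentheses. If you want to display the entire match, put your regex in one big group (put parentheses around the whole thing) and then do print(x[0]) instead of print(x). Alternatively, make the inner group non-capturing by writing (?:...) instead of (...); findall then returns the full matches directly.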
I'm guessing that our expression has to be modified here, and that might be the problem, for instance, if we wish to match the desired patterns we would start with an expression similar to:
([a-zA-Z0-9]+)\.
if we wish to have 1 to 3 chars after the ., we would expand it to:
([a-zA-Z0-9]+)\.([a-zA-Z0-9]{1,3})?
Demo 1
Demo 2
Test
# coding=utf8
# the above tag defines encoding for this document and is for Python 2.x compatibility
import re
regex = r"([a-zA-Z0-9]+)\.([a-zA-Z0-9]{1,3})?"
test_str = ("hotmail.\n"
"living.\n"
"item.\n"
"2.\n"
"4S.\n"
"hotmail.com\n"
"living.org\n"
"item.co\n"
"2.321\n"
"4S.123")
matches = re.finditer(regex, test_str, re.MULTILINE)
for matchNum, match in enumerate(matches, start=1):
print ("Match {matchNum} was found at {start}-{end}: {match}".format(matchNum = matchNum, start = match.start(), end = match.end(), match = match.group()))
for groupNum in range(0, len(match.groups())):
groupNum = groupNum + 1
print ("Group {groupNum} found at {start}-{end}: {group}".format(groupNum = groupNum, start = match.start(groupNum), end = match.end(groupNum), group = match.group(groupNum)))
# Note: for Python 2.7 compatibility, use ur"" to prefix the regex and u"" to prefix the test string and substitution.

How to Find Words Not Containing Specific Letters?

I'm trying to write a code using regex and my text file. My file contains these words line by line:
nana
abab
nanac
eded
My goal is to display the words which do not contain any of the letters of a given substring.
For example, if my substring is "bn", my output should be only eded. Because nana and nanac contains "n" and abab contains "b".
I have written a code but it only checks first letter of my substring:
import re
substring = "bn"
def xstring():
with open("deneme.txt") as f:
for line in f:
for word in re.findall(r'\w+', line):
for letter in substring:
if len(re.findall(letter, word)) == 0:
print(word)
#yield word
xstring()
How do I solve this problem?
Here, we would just want to have a simple expression such as:
^[^bn]+$
We put b and n in a negated character class [^bn], which matches any character other than b or n; by adding the ^ and $ anchors, the whole string must consist of such characters, so any string containing b or n fails to match.
Demo
Test
# coding=utf8
# the above tag defines encoding for this document and is for Python 2.x compatibility
import re
regex = r"^[^bn]+$"
test_str = ("nana\n"
"abab\n"
"nanac\n"
"eded")
matches = re.finditer(regex, test_str, re.MULTILINE)
for matchNum, match in enumerate(matches, start=1):
print ("Match {matchNum} was found at {start}-{end}: {match}".format(matchNum = matchNum, start = match.start(), end = match.end(), match = match.group()))
for groupNum in range(0, len(match.groups())):
groupNum = groupNum + 1
print ("Group {groupNum} found at {start}-{end}: {group}".format(groupNum = groupNum, start = match.start(groupNum), end = match.end(groupNum), group = match.group(groupNum)))
# Note: for Python 2.7 compatibility, use ur"" to prefix the regex and u"" to prefix the test string and substitution.
RegEx
If this expression wasn't desired, it can be modified/changed in regex101.com.
RegEx Circuit
jex.im visualizes regular expressions:
@Xosrov has the right approach, with a few minor issues and typos. The version below, using the same logic, works:
import re


def xstring(substring, words):
    """Print every word in *words* that contains none of the characters of *substring*.

    Args:
        substring: string whose individual characters are the forbidden ones.
        words: iterable of strings to filter.

    Each forbidden character is escaped before being placed in the character
    class, so characters such as ``^``, ``-``, ``]`` or a backslash are
    treated literally instead of changing the meaning of the class.
    """
    forbidden = sorted(set(substring))
    if not forbidden:
        # Nothing is forbidden, so every word qualifies; "[]" would also be
        # an invalid regular expression.
        for word in words:
            print(word)
        return

    regex = re.compile('[%s]' % ''.join(re.escape(ch) for ch in forbidden))
    # Exclude the words in which the character class matches anywhere.
    for word in words:
        if not regex.search(word):
            print(word)


words = [
    'nana',
    'abab',
    'nanac',
    'eded',
]
xstring("bn", words)
If you want to check if a string has a set of letters, use brackets.
For example using [bn] will match words that contain one of those letters.
import re

substring = "bn"
# Character class matching any single letter of `substring`.
regex = re.compile('[' + substring + ']')


def xstring():
    """Print each line of deneme.txt that contains none of the letters in `substring`."""
    # The question's input file is "deneme.txt".
    with open("deneme.txt") as f:
        for line in f:
            if regex.search(line) is None:
                # Drop the line terminator; print() adds its own newline.
                print(line.rstrip("\n"))


xstring()
It might not be the most efficient, but you could try set intersections. The following code segment will print the value of the string word only if it does not contain any of the letters 'b' or 'n':
if (not any(set(word) & set('bn'))):
print(word)

How to get matched word from regex match object after using finditer

I have made this pattern to get the url link of the blog post (which can be separated by hyphens or underscores etc in my websites url to match it with the database and display the corresponding post). Whenever I append the matches to a list, all of them are re match objects. How do I obtain the matched word?
I have tried using search and match but those do not return separate word.
import re
pattern = r"[a-zA-Z0-9]+[^-]+"
matches = re.finditer(pattern, "this-is-a-sample-post")
matches_lst = [i for i in matches]
So suppose I have the string "this-is-a-sample-post", I want to get "this is a sample post".
I want a list of the matched words so that I can use the " ".join() method and match the string with my database.
import re
pattern = r"[a-zA-Z0-9]+[^-]+"
string = "this-is-a-sample-post"
matches = re.finditer(pattern, string)
matches_lst = [i.group(0) for i in matches]
print("Made with finditer:")
print(matches_lst)
print("Made with findall")
matches_lst = re.findall(pattern, string)
print(matches_lst)
print("Made with split")
print(string.split("-"))
print("Made with replace and split")
print(string.replace("-"," ").split())
Output: >>>
Made with finditer:
['this', 'is', 'sample', 'post']
Made with findall
['this', 'is', 'sample', 'post']
Made with split
['this', 'is', 'a', 'sample', 'post']
Made with replace and split
['this', 'is', 'a', 'sample', 'post']
>>>
Replace:
matches_lst = [i for i in matches]
With:
matches_lst = [i.group(0) for i in matches]
Or you could just use findall which will give you a list:
matches = re.findall(pattern, "this-is-a-sample-post")
My guess is that we might also want to slightly modify our expression in the question, if we wish to capture the words and not the dashes:
Demo
Test
# coding=utf8
# the above tag defines encoding for this document and is for Python 2.x compatibility
import re
regex = r"([a-zA-Z0-9]+)"
test_str = "this-is-a-sample-post"
matches = re.finditer(regex, test_str, re.MULTILINE)
for matchNum, match in enumerate(matches, start=1):
print ("Match {matchNum} was found at {start}-{end}: {match}".format(matchNum = matchNum, start = match.start(), end = match.end(), match = match.group()))
for groupNum in range(0, len(match.groups())):
groupNum = groupNum + 1
print ("Group {groupNum} found at {start}-{end}: {group}".format(groupNum = groupNum, start = match.start(groupNum), end = match.end(groupNum), group = match.group(groupNum)))
# Note: for Python 2.7 compatibility, use ur"" to prefix the regex and u"" to prefix the test string and substitution.
As suggested in comment, also re.sub is a solution:
import re

s = 'this-is-example'
# sub lives in the re module; the bare name `sub` is never imported here.
s = re.sub('-', ' ', s)
Naive str.replace works too:
s = 'this-is-example'
s = s.replace('-', ' ')
With the current regular expression pattern (r"[a-zA-Z0-9]+[^-]+") you will get only "this is sample post", missing the "a", because the pattern needs at least two characters per word: one for [a-zA-Z0-9]+ and one more for [^-]+.
To get the complete sentence change the pattern to
r'[a-zA-Z0-9]*[^-]'
You can do it 3 ways:
Using the re.sub to replace the "-" with " "(space)
>>> re.sub("-", " ", "this-is-a-sample-post")
O/P: 'this is a sample post'
Fetch the output of finditer() into a list and do the join.
>>> text = "this-is-a-sample-post"
>>> a = [m.group(0) for m in re.finditer(r'[a-zA-Z0-9]*[^-]', text)]
>>> " ".join(a)
o/p: 'this is a sample post'
Fetch the output into a string and replace the '-' with space
str = "this-is-a-sample-post"
str.replace('-', ' ')
o/p:'this is a sample post'

regex python Fasta

Thank you for your previous advices,
I have another regex problem:
now I have a list with this pattern:
*7 3 279 0
*33 2 254 0.0233918128654971
*39 2 276 0.027431421446384
and a file with DNA sequencing in Fasta format:
EDIT reformated lines
>OCTU1
GCTTGTCTCAAAGATTAAGCCATGCATGTATAAGCACAAGCCTAAAATGGTGAAGCCGCGAATAGCTCATTACAACAGTCGTAGTTTATTGGAAAGTTCACTATGGATAACTGTGGTAATTCTAGAGCTAATACATGTTCCAATCCTCGACTCACGGAGAGGTGCATTTATTAGAACAAAGCTGATCAGACTATGTCTGTCTCAGGTTGACTCTGAATAACTTTGCTAATCGCACAGTCTTTGTACTGGCGATGTATCTTTCATGCTATGTA
>OCTU2
GCTGCTTCCTTGGATGTGGTAGCCGTTTCTCAGGCTCCCTCTCCGGAATCGAACCCTATTCCCCGTTACCCGTTCAACCATGGTAGGCCCTACTACCATCAAAGTTGATAGGGCAGATATTTGAAAGACATCGCCGCACAAAGGCTATGCGATTAGCAAAGTTATTAGATCAACGACGCAGCGATCGGCTTTGACTAATAAATCACCCCTCCAGTTGGGGACTTTTACATGTATTAGCTCTAGAATTACCACAGTTATCCATTAGTGAAGTACCTTCCAATAAACTATACTGTTTAATGAGCCATTCGCGGTTTCACCGTAAAATTAGGTTGTCTTAGACATGCATGGCTTAATCTTTGTAGACAAGC
I'd need to find the numbers in the list with * (e.g., 7 or 33) in the Fasta file (e.g., >OCTU7 and >OCTU33) and copy in another file only the Fasta sequences that are present in the list, this is my script:
regex=re.compile(r'.+\d+\s+')
OCTU=b.readlines()
while OCTU:
for line in a:
if regex.match(OCTU)==line:
c.write(OCTU)
The scripts seems to work but I think the pattern is not correct because the file created is empty.
Thank you in advance for your precious advices.
You could first collect the id numbers from file a to a set for fast lookup later:
seta = set()
regexa = re.compile(r'\*(\d+)') #matches asterisk followed by digits, captures digits
for line in a:
m = regexa.match(line) #looks for match at start of line
if m:
seta.add(m.group(1))
Then loop over b. Use next(b) inside the loop (b.next() in Python 2 only) to get the second line, where the sequence is.
regexb = re.compile(r'>OCTU(\d+)')  # matches ">OCTU" followed by digits, captures digits
for line in b:
    m = regexb.match(line)
    if m:
        # The sequence sits on the line right after the header. The built-in
        # next() works on Python 2.6+ and 3 (b.next() is Python 2 only); the
        # '' default avoids StopIteration when a header is the last line.
        sequence = next(b, '')
        # seta holds the ids as strings, so compare the captured digits as-is.
        if m.group(1) in seta:
            c.write(line)
            c.write(sequence)
You may want to use Biopython to parse the fasta file.
Then you can slice out the number and look it up in your list and access the sequence and sequence name more reliably...If a fasta file has line wrapping the above method may run into problems...
import collections
from Bio import SeqIO

infile = "yourfastafile.fasta"
outfile = "desired_outfilename.fasta"

# Map each record's description line to its upper-cased sequence, keeping
# the order in which the records appear in the input file.
dct = collections.OrderedDict()
with open(infile) as in_handle:
    for record in SeqIO.parse(in_handle, "fasta"):
        # `description` is a plain string attribute, not a method.
        dct[record.description] = str(record.seq).upper()

# Open the output once instead of re-opening it for every record.
with open(outfile, "a") as handle:
    for k, v in dct.items():
        # k[4:] drops the "OCTU" prefix. seta (from the answer above) stores
        # the ids as strings, so compare strings rather than ints.
        if k[4:] in seta:
            handle.write(">" + k + "\n" + v + "\n")
# coding=utf8
# the above tag defines encoding for this document and is for Python 2.x compatibility
import re

# One FASTA record: a ">" header line followed by lines of nucleotide letters.
regex = r">.+\n[acgtnACGTN\n]+"

test_str = (">AB000263 |acc=AB000263|descr=Homo sapiens mRNA for prepro cortistatin like peptide, complete cds.|len=368\n"
    "ACAAGATGCCATTGTCCCCCGGCCTCCTGCTGCTGCTGCTCTCCGGGGCCACGGCCACCGCTGCCCTGCC\n"
    "CCTGGAGGGTGGCCCCACCGGCCGAGACAGCGAGCATATGCAGGAAGCGGCAGGAATAAGGAAAAGCAGC\n"
    "CTCCTGACTTTCCTCGCTTGGTGGTTTGAGTGGACCTCCCAGGCCAGTGCCGGGCCCCTCATAGGAGAGG\n"
    "AAGCTCGGGAGGTGGCCAGGCGGCAGGAAGGCGCACCCCCCCAGCAATCCGCGCGCCGGGACAGAATGCC\n"
    "CTGCAGGAACTTCTTCTGGAAGACCTTCTCCTCCTGCAAATAAAACCTCACCCATGAATGCTCACGCAAG\n"
    "TTTAATTACAGACCTGAA")

matches = re.finditer(regex, test_str)

# Number the matches from 1 for display.
for matchNum, match in enumerate(matches, start=1):
    print ("Match {matchNum} was found at {start}-{end}: {match}".format(matchNum = matchNum, start = match.start(), end = match.end(), match = match.group()))
    # Capture groups are numbered from 1; this pattern has none, so the
    # inner loop does not run.
    for groupNum in range(1, len(match.groups()) + 1):
        print ("Group {groupNum} found at {start}-{end}: {group}".format(groupNum = groupNum, start = match.start(groupNum), end = match.end(groupNum), group = match.group(groupNum)))

# Note: for Python 2.7 compatibility, use ur"" to prefix the regex and u"" to prefix the test string and substitution.

Categories