How to Find Words Not Containing Specific Letters? - python

I'm trying to write a code using regex and my text file. My file contains these words line by line:
nana
abab
nanac
eded
My goal is to display the words that do not contain any of the letters in a given substring.
For example, if my substring is "bn", my output should be only eded, because nana and nanac contain "n" and abab contains "b".
I have written a code but it only checks first letter of my substring:
import re
substring = "bn"
def xstring():
with open("deneme.txt") as f:
for line in f:
for word in re.findall(r'\w+', line):
for letter in substring:
if len(re.findall(letter, word)) == 0:
print(word)
#yield word
xstring()
How do I solve this problem?

Here, we would just want to have a simple expression such as:
^[^bn]+$
We put b and n in a negated character class, [^bn], which matches any other character; the ^ and $ anchors then force the match to span the whole line, so any string that contains b or n fails.
Demo
Test
# coding=utf8
# the above tag defines encoding for this document and is for Python 2.x compatibility
import re

# Whole lines built only from characters other than "b" and "n".
# NOTE: [^bn] also matches "\n", so two consecutive qualifying lines are
# reported as one match spanning both lines.
regex = r"^[^bn]+$"

test_str = ("nana\n"
            "abab\n"
            "nanac\n"
            "eded")

# re.MULTILINE makes ^ and $ anchor at every line of test_str, not only at its ends.
matches = re.finditer(regex, test_str, re.MULTILINE)

for matchNum, match in enumerate(matches, start=1):
    print("Match {matchNum} was found at {start}-{end}: {match}".format(
        matchNum=matchNum,
        start=match.start(),
        end=match.end(),
        match=match.group(),
    ))

    # Capture groups are numbered from 1; group 0 is the whole match.
    for groupNum in range(1, len(match.groups()) + 1):
        print("Group {groupNum} found at {start}-{end}: {group}".format(
            groupNum=groupNum,
            start=match.start(groupNum),
            end=match.end(groupNum),
            group=match.group(groupNum),
        ))

# Note: for Python 2.7 compatibility, use ur"" to prefix the regex and u"" to prefix the test string and substitution.
RegEx
If this expression wasn't desired, it can be modified/changed in regex101.com.
RegEx Circuit
jex.im visualizes regular expressions:

@Xosrov has the right approach, with a few minor issues and typos. The version below uses the same logic and works:
import re
def xstring(substring, words):
    """Print every word in *words* that contains none of the letters in *substring*.

    Args:
        substring: Letters to exclude; every character is taken literally.
        words: Iterable of words to filter.
    """
    letters = ''.join(sorted(set(substring)))
    # re.escape keeps characters such as "]", "^" or "-" from being read as
    # character-class syntax. An empty class "[]" is not a valid pattern, so
    # with no letters to exclude there is nothing to search for.
    regex = re.compile('[%s]' % re.escape(letters)) if letters else None
    # Excluding words matching regex.pattern
    for word in words:
        if regex is None or not regex.search(word):
            print(word)
words = [
'nana',
'abab',
'nanac',
'eded',
]
xstring("bn", words)

If you want to check if a string has a set of letters, use brackets.
For example using [bn] will match words that contain one of those letters.
import re
substring = "bn"
# Escape the letters so that characters special inside [...] are taken literally.
regex = re.compile('[' + re.escape(substring) + ']')


def xstring(path="deneme.txt"):
    """Print each line of *path* that contains none of the letters in ``substring``.

    Args:
        path: Text file to scan line by line; defaults to the file name used
            in the question.
    """
    with open(path) as f:
        for line in f:
            if regex.search(line) is None:
                # Drop the line's own newline; print() appends one itself.
                print(line.rstrip("\n"))
xstring()

It might not be the most efficient approach, but you could use a set intersection. The following code segment prints the value of the string word only if it contains neither of the letters 'b' or 'n':
# An empty intersection is falsy, so this branch runs only when word shares
# no character with 'bn'.
if not set(word) & set('bn'):
    print(word)

Related

find a Pattern Match in string in Python

I am trying to find an amino acid pattern (B-C or M-D, where '-' can be any letter other than 'P') in a protein sequence, for example 'VATLDSCBACSKVNDNVKNKVKVKNVKMLDHHHV'. The protein sequence is in a FASTA file.
I have tried a lot but couldn't find any solution.
I tried a lot. the following code is one of them
import Bio
from Bio import SeqIO
seqs= SeqIO.parse(X, 'fasta') ### to read the sequences from fasta file
for aa in seqs:
x=aa.seq ## gives the sequences as a string (.seq is a build in function of Biopython)
for val, i in enumerate(x):
if i=='B':
if (x[val+2])=='C':
if x[val+1]!='P':
pattern=((x[val]:x[val+2])) ## trying to print full sequence B-C
But unfortunately none of them work.
It would be great if someone can help me out with this problem.
>>> x = 'VATLDSCBACSKVNDNVKNKVKVKNVKMLDHHHV'
>>> import re
>>> m = re.search('B(.+?)C', x)
>>> m
<_sre.SRE_Match object at 0x10262aeb0>
>>> m = re.search('B(.+?)C', x).group(0)
>>> m
'BAC'
>>> m = re.search('M(.+?)D', x).group(0)
>>> m
'MLD'
>>> re.search(r"(?<=M).*?(?=D)", x).group(0)
'L'
>>> re.search(r"(?<=B).*?(?=C)", x).group(0)
'A'
A common solution for pattern matching is the usage of regex.
A possible regex for your problem is B[^P]C|M[^P]D.
The following code has been generated by regex101 with the regex I propose and the test string you gave us. It find all matching pattern with their positions in the original string.
# coding=utf8
# the above tag defines encoding for this document and is for Python 2.x compatibility
import re

# "B" or "M", then exactly one character that is not "P", then "C" or "D" respectively.
regex = r"B[^P]C|M[^P]D"

test_str = "VATLDSCBACSKVNDNVKNKVKVKNVKMLDHHHV"

matches = re.finditer(regex, test_str, re.MULTILINE)

for matchNum, match in enumerate(matches, start=1):
    print("Match {matchNum} was found at {start}-{end}: {match}".format(
        matchNum=matchNum,
        start=match.start(),
        end=match.end(),
        match=match.group(),
    ))

    # Capture groups are numbered from 1; this pattern defines none, so the
    # loop body does not run for it.
    for groupNum in range(1, len(match.groups()) + 1):
        print("Group {groupNum} found at {start}-{end}: {group}".format(
            groupNum=groupNum,
            start=match.start(groupNum),
            end=match.end(groupNum),
            group=match.group(groupNum),
        ))

# Note: for Python 2.7 compatibility, use ur"" to prefix the regex and u"" to prefix the test string and substitution.
In python you can use the Regex module (re):
import re # import the RE module
import Bio
from Bio import SeqIO
seqs = SeqIO.parse(X, 'fasta')

# [A-OQ-Z] : Match from A to O and from Q to Z (excl. P)
# |        : OR operator = either the left or the right part should match
# The r prefix makes this a raw string literal, so backslashes would reach
# the regex engine unchanged.
RE = r'B[A-OQ-Z]C|M[A-OQ-Z]D'

for sequence in seqs:
    # The record's sequence, converted to a plain string for the regex search.
    line = str(sequence.seq)
    # The function findall will return a list of all non-overlapping matches.
    results = re.findall(RE, line)
    # To iterate over each result :
    for res in results:
        print(res)
Then you can also modify the Regular expression to match any other rule you would like to match.
More information about the findall function here : re.findall(...)
The following website can help you build a regex :
https://regex101.com/
Use a regular expression with a negated character class, "[^P]", which matches any single character except P.
import re
string = 'VATLDSCBACSKVNDNVKNKVKVKNVKMLDHHHV'
re.findall(r"B[^P]C|M[^P]D", string)
Output:
['BAC', 'MLD']

Why isn't my Python regex matching part of the query string? [duplicate]

This question already has answers here:
Retrieving parameters from a URL
(20 answers)
Closed 3 years ago.
I'm using Python 3.7. I want to extract the portion of a url between the "q=...&" part of a query string. I have this code
href = span.a['href']
print("href:" + href)
matchObj = re.match( r'q=(.*?)\&', href, re.M|re.I)
if matchObj:
criteria = matchObj.group(1)
but despite the fact that my href is this
href:/search?hl=en-US&q=bet+i+won+t+get+one+share&tbm=isch&tbs=simg:CAQSkwEJyapBtj9kKiIahwELEKjU2AQaAAwLELCMpwgaYgpgCAMSKMILxAufFcsLnBWeFZsVnRWABMcPsCKgLaMtoi2hLZ0tqziiI6w4uSQaMG01mL5LQ62s4q5ZMf-Wetz68lCkHfrFOOKs2CELzQJlPjHIMzmlp2Ny-a5t7hZbiCAEDAsQjq7-CBoKCggIARIEXLNODAw&sa=X&ved=0ahUKEwjThcCx59ziAhWKHLkGHfWjDs4Q2A4ILCgB
the "matchObj" is always NoneType and the subsequent lines aren't evaluated. What else do I need to do to fix my regex?
You can use the urllib module
Ex:
import urllib.parse as urlparse

url = "href:/search?hl=en-US&q=bet+i+won+t+get+one+share&tbm=isch&tbs=simg:CAQSkwEJyapBtj9kKiIahwELEKjU2AQaAAwLELCMpwgaYgpgCAMSKMILxAufFcsLnBWeFZsVnRWABMcPsCKgLaMtoi2hLZ0tqziiI6w4uSQaMG01mL5LQ62s4q5ZMf-Wetz68lCkHfrFOOKs2CELzQJlPjHIMzmlp2Ny-a5t7hZbiCAEDAsQjq7-CBoKCggIARIEXLNODAw&sa=X&ved=0ahUKEwjThcCx59ziAhWKHLkGHfWjDs4Q2A4ILCgB"

# Take the query component of the URL and decode it into a dict that maps
# each parameter name to the list of its values ("+" is decoded to a space).
params = urlparse.parse_qs(urlparse.urlparse(url).query)
print(params['q'][0])
Output:
bet i won t get one share
You're using the wrong function if you wish to match in the middle of the string.
re.match only matches from start of the string
If zero or more characters at the beginning of string match the
regular expression pattern, return a corresponding match object.
Here use re.search instead.
import re
href = 'href:/search?hl=en-US&q=bet+i+won+t+get+one+share&tbm=isch&tbs=simg:CAQSkwEJyapBtj9kKiIahwELEKjU2AQaAAwLELCMpwgaYgpgCAMSKMILxAufFcsLnBWeFZsVnRWABMcPsCKgLaMtoi2hLZ0tqziiI6w4uSQaMG01mL5LQ62s4q5ZMf-Wetz68lCkHfrFOOKs2CELzQJlPjHIMzmlp2Ny-a5t7hZbiCAEDAsQjq7-CBoKCggIARIEXLNODAw&sa=X&ved=0ahUKEwjThcCx59ziAhWKHLkGHfWjDs4Q2A4ILCgB'
print("href:" + href)
# re.search scans the whole string, whereas re.match only tries position 0.
# [?&] requires a real parameter boundary before "q=", and [^&]* stops at the
# next "&" or at the end of the string, so "q" is found even when it is the
# last parameter.
matchObj = re.search(r'[?&]q=([^&]*)', href, re.I)
if matchObj:
    criteria = matchObj.group(1)
    print(criteria)
'bet+i+won+t+get+one+share'
Here, we would apply a simple expression with left and right boundaries such as:
&q=(.+?)&
Demo
# coding=utf8
# the above tag defines encoding for this document and is for Python 2.x compatibility
import re

# Lazily capture everything between "&q=" and the next "&".
regex = r"&q=(.+?)&"

# The trailing line break is written as an escape: a plain "..." literal
# cannot contain a raw newline.
test_str = "href:/search?hl=en-US&q=bet+i+won+t+get+one+share&tbm=isch&tbs=simg:CAQSkwEJyapBtj9kKiIahwELEKjU2AQaAAwLELCMpwgaYgpgCAMSKMILxAufFcsLnBWeFZsVnRWABMcPsCKgLaMtoi2hLZ0tqziiI6w4uSQaMG01mL5LQ62s4q5ZMf-Wetz68lCkHfrFOOKs2CELzQJlPjHIMzmlp2Ny-a5t7hZbiCAEDAsQjq7-CBoKCggIARIEXLNODAw&sa=X&ved=0ahUKEwjThcCx59ziAhWKHLkGHfWjDs4Q2A4ILCgB\n"

matches = re.finditer(regex, test_str, re.MULTILINE)

for matchNum, match in enumerate(matches, start=1):
    print("Match {matchNum} was found at {start}-{end}: {match}".format(
        matchNum=matchNum,
        start=match.start(),
        end=match.end(),
        match=match.group(),
    ))

    # Capture groups are numbered from 1; group 0 is the whole match.
    for groupNum in range(1, len(match.groups()) + 1):
        print("Group {groupNum} found at {start}-{end}: {group}".format(
            groupNum=groupNum,
            start=match.start(groupNum),
            end=match.end(groupNum),
            group=match.group(groupNum),
        ))

# Note: for Python 2.7 compatibility, use ur"" to prefix the regex and u"" to prefix the test string and substitution.
RegEx Circuit
jex.im visualizes regular expressions:

Regex for Text Between Brackets and Text Between Semicolons

I have the following shape of string: PW[Yasui Chitetsu]; and would like to get only the name inside the brackets: Yasui Chitetsu. I'm trying something like
[^(PW\[)](.*)[^\]]
as a regular expression, but the last bracket is still in it. How do I unselect it? I don't think I need anything fancy like look behinds, etc, for this case.
The Problems with What You've Tried
There are a few problems with what you've tried:
It will omit the first and last characters of your match from the group, giving you something like asui Chitets.
It will have even more errors on strings that start with P or W. For example, in PW[Paul McCartney], you would match only ul McCartne with the group and ul McCartney with the full match.
The Regex
You want something like this:
(?<=\[)([^]]+)(?=\])
Here's a regex101 demo.
Explanation
(?<=\[) means that the match must be preceded by [
([^]]+) matches 1 or more characters that are not ]
(?=\])means that the match must be followed by ]
Sample Code
Here's some sample code (from the above regex101 link):
# coding=utf8
# the above tag defines encoding for this document and is for Python 2.x compatibility
import re

# One or more non-"]" characters that are preceded by "[" and followed by "]";
# the lookarounds keep the brackets themselves out of the match.
regex = r"(?<=\[)([^]]+)(?=\])"

test_str = "PW[Yasui Chitetsu]"

matches = re.finditer(regex, test_str, re.MULTILINE)

for matchNum, match in enumerate(matches, start=1):
    print("Match {matchNum} was found at {start}-{end}: {match}".format(
        matchNum=matchNum,
        start=match.start(),
        end=match.end(),
        match=match.group(),
    ))

    # Capture groups are numbered from 1; group 0 is the whole match.
    for groupNum in range(1, len(match.groups()) + 1):
        print("Group {groupNum} found at {start}-{end}: {group}".format(
            groupNum=groupNum,
            start=match.start(groupNum),
            end=match.end(groupNum),
            group=match.group(groupNum),
        ))

# Note: for Python 2.7 compatibility, use ur"" to prefix the regex and u"" to prefix the test string and substitution.
Semicolons
In your title, you mentioned finding text between semicolons. The same logic would work for that, giving you this regex:
(?<=;)([^;]+)(?=;)

Python Regex: Symbol + in every letter in the same word

I am using Python.
I want to make a regex that allows the following examples:
Day
Dday
Daay
Dayy
Ddaay
Ddayy
...
So, each letter of a word, one or more times.
How can I write it easily? Is there an expression that makes this easy?
I have a lot of words.
Thanks
We can try using the following regex pattern:
^([A-Za-z])\1*([A-Za-z])\2*([A-Za-z])\3*$
This matches and captures a single letter, followed by any number of occurrences of this letter. The \1 you see in the above pattern is a backreference which represents the previous matched letter (and so on for \2 and \3).
Code:
import re

word = "DdddddAaaaYyyyy"
# Each group captures one letter and the following backreference consumes its
# repeats; re.I lets the repeats differ in case from the captured letter.
matchObj = re.match(r'^([A-Za-z])\1*([A-Za-z])\2*([A-Za-z])\3*$', word, re.M | re.I)
if matchObj:
    # print() with two arguments produces the same text as the Python 2
    # "print a, b" statement and also runs on Python 3.
    print("matchObj.group() : ", matchObj.group())
    print("matchObj.group(1) : ", matchObj.group(1))
    print("matchObj.group(2) : ", matchObj.group(2))
    print("matchObj.group(3) : ", matchObj.group(3))
else:
    print("No match!!")
Demo
To match a character one or more times you can use the + quantifier. To build the full pattern dynamically you would need to split the word to characters and add a + after each of them:
pattern = "".join(char + "+" for char in word)
Then just match the pattern case insensitively.
Demo:
>>> import re
>>> word = "Day"
>>> pattern = "".join(char + "+" for char in word)
>>> pattern
'D+a+y+'
>>> words = ["Dday", "Daay", "Dayy", "Ddaay", "Ddayy"]
>>> all(re.match(pattern, word, re.I) for word in words)
True
Try /d+a+y+/gi:
d+ Matches d one or more times.
a+ Matches a one or more times.
y+ Matches y one or more times.
As per my original comment, the below does exactly what I explain.
Since you want to be able to use this on many words, I think this is what you're looking for.
import re
word = "day"
# Join the characters of word with "+" and append a final "+", giving
# "^d+a+y+$" for "day". re.escape keeps characters such as "." or "*"
# literal when word contains them.
regex = r"^" + "+".join(re.escape(char) for char in word) + "+$"

test_str = ("Day\n"
            "Dday\n"
            "Daay\n"
            "Dayy\n"
            "Ddaay\n"
            "Ddayy")

# MULTILINE: one candidate per line of test_str; IGNORECASE: "D" matches "d".
matches = re.finditer(regex, test_str, re.IGNORECASE | re.MULTILINE)

for matchNum, match in enumerate(matches, start=1):
    print("Match {matchNum} was found at {start}-{end}: {match}".format(
        matchNum=matchNum,
        start=match.start(),
        end=match.end(),
        match=match.group(),
    ))
This works by converting the string into a list, then converting it back to string, joining it on +, and appending the same. The resulting regex will be ^d+a+y+$. Since the input you presented is separated by newline characters, I've added re.MULTILINE.

regex python Fasta

Thank you for your previous advices,
I have another regex problem:
now I have a list with this pattern:
*7 3 279 0
*33 2 254 0.0233918128654971
*39 2 276 0.027431421446384
and a file with DNA sequencing in Fasta format:
EDIT reformated lines
>OCTU1
GCTTGTCTCAAAGATTAAGCCATGCATGTATAAGCACAAGCCTAAAATGGTGAAGCCGCGAATAGCTCATTACAACAGTCGTAGTTTATTGGAAAGTTCACTATGGATAACTGTGGTAATTCTAGAGCTAATACATGTTCCAATCCTCGACTCACGGAGAGGTGCATTTATTAGAACAAAGCTGATCAGACTATGTCTGTCTCAGGTTGACTCTGAATAACTTTGCTAATCGCACAGTCTTTGTACTGGCGATGTATCTTTCATGCTATGTA
>OCTU2
GCTGCTTCCTTGGATGTGGTAGCCGTTTCTCAGGCTCCCTCTCCGGAATCGAACCCTATTCCCCGTTACCCGTTCAACCATGGTAGGCCCTACTACCATCAAAGTTGATAGGGCAGATATTTGAAAGACATCGCCGCACAAAGGCTATGCGATTAGCAAAGTTATTAGATCAACGACGCAGCGATCGGCTTTGACTAATAAATCACCCCTCCAGTTGGGGACTTTTACATGTATTAGCTCTAGAATTACCACAGTTATCCATTAGTGAAGTACCTTCCAATAAACTATACTGTTTAATGAGCCATTCGCGGTTTCACCGTAAAATTAGGTTGTCTTAGACATGCATGGCTTAATCTTTGTAGACAAGC
I'd need to find the numbers in the list with * (e.g., 7 or 33) in the Fasta file (e.g., >OCTU7 and >OCTU33) and copy in another file only the Fasta sequences that are present in the list, this is my script:
regex=re.compile(r'.+\d+\s+')
OCTU=b.readlines()
while OCTU:
for line in a:
if regex.match(OCTU)==line:
c.write(OCTU)
The scripts seems to work but I think the pattern is not correct because the file created is empty.
Thank you in advance for your precious advices.
You could first collect the id numbers from file a to a set for fast lookup later:
# Collect the id numbers (kept as strings) from the list file for fast lookup.
seta = set()
regexa = re.compile(r'\*(\d+)')  # matches asterisk followed by digits, captures digits
for line in a:
    m = regexa.match(line)  # looks for match at start of line
    if m:
        seta.add(m.group(1))
Then loop over b. Use next(b) (b.next() in Python 2) inside the loop to get the second line, where the sequence is.
regexb = re.compile(r'>OCTU(\d+)')  # matches ">OCTU" followed by digits, captures digits
for line in b:
    m = regexb.match(line)
    if m:
        # The sequence is on the line after the header. The built-in next()
        # works on Python 2.6+ and 3, whereas the .next() method exists only
        # on Python 2. The '' default avoids StopIteration when a header is
        # the last line of the file.
        sequence = next(b, '')
        if m.group(1) in seta:
            c.write(line)
            c.write(sequence)
You may want to use Biopython to parse the fasta file.
Then you can slice out the number and look it up in your list and access the sequence and sequence name more reliably...If a fasta file has line wrapping the above method may run into problems...
import collections
from Bio import SeqIO
infile = "yourfastafile.fasta"
outfile = "desired_outfilename.fasta"

# Map each record's description line to its upper-cased sequence, in file order.
dct = collections.OrderedDict()
with open(infile) as source:  # closes the input file once parsing is done
    for record in SeqIO.parse(source, "fasta"):
        # description is a string attribute of the record, not a method.
        dct[record.description] = str(record.seq).upper()

for k, v in dct.items():
    # k[4:] drops the four-character "OCTU" prefix. seta (from answer above)
    # stores the ids as strings, so compare text with text.
    if k[4:] in seta:
        with open(outfile, "a") as handle:
            handle.write(">" + k + "\n" + str(v) + "\n")
# coding=utf8
# the above tag defines encoding for this document and is for Python 2.x compatibility
import re

# A ">" header line followed by a run of nucleotide letters and line breaks.
regex = r">.+\n[acgtnACGTN\n]+"

test_str = (">AB000263 |acc=AB000263|descr=Homo sapiens mRNA for prepro cortistatin like peptide, complete cds.|len=368\n"
            "ACAAGATGCCATTGTCCCCCGGCCTCCTGCTGCTGCTGCTCTCCGGGGCCACGGCCACCGCTGCCCTGCC\n"
            "CCTGGAGGGTGGCCCCACCGGCCGAGACAGCGAGCATATGCAGGAAGCGGCAGGAATAAGGAAAAGCAGC\n"
            "CTCCTGACTTTCCTCGCTTGGTGGTTTGAGTGGACCTCCCAGGCCAGTGCCGGGCCCCTCATAGGAGAGG\n"
            "AAGCTCGGGAGGTGGCCAGGCGGCAGGAAGGCGCACCCCCCCAGCAATCCGCGCGCCGGGACAGAATGCC\n"
            "CTGCAGGAACTTCTTCTGGAAGACCTTCTCCTCCTGCAAATAAAACCTCACCCATGAATGCTCACGCAAG\n"
            "TTTAATTACAGACCTGAA")

matches = re.finditer(regex, test_str)

for matchNum, match in enumerate(matches, start=1):
    print("Match {matchNum} was found at {start}-{end}: {match}".format(
        matchNum=matchNum,
        start=match.start(),
        end=match.end(),
        match=match.group(),
    ))

    # Capture groups are numbered from 1; this pattern defines none, so the
    # loop body does not run for it.
    for groupNum in range(1, len(match.groups()) + 1):
        print("Group {groupNum} found at {start}-{end}: {group}".format(
            groupNum=groupNum,
            start=match.start(groupNum),
            end=match.end(groupNum),
            group=match.group(groupNum),
        ))

# Note: for Python 2.7 compatibility, use ur"" to prefix the regex and u"" to prefix the test string and substitution.

Categories