Regex Help Python? - python

I have this code:
import re
#TEST CASES
match_dict = ['hello(here)',
'Hello (Hi)',
"'dfsfds Hello (Hi) fdfd' Hello (Yes)",
"Hello ('hi)xx')",
"Hello ('Hi')"]
for s in match_dict:
print "INPUT: %s" % s
m = re.sub(r"(?<!\()'[^']+'", '', s, flags=re.M)
paren_quotes = re.findall(r"Hello\s*\('([^']+)'\)", m, flags=re.M)
output = paren_quotes if paren_quotes else []
m = re.sub(r"Hello\s*\('[^']+'\)", '', m, flags=re.M)
paren_matches = re.findall(r"Hello\s*\(([^)]+)\)", m, flags=re.M)
if paren_matches:
output.extend(paren_matches)
print 'OUTPUT: %s\n' % output
This code is made to output everything in the parentheses after the word 'Hello',
Hello (Hi) would give 'Hi'
My problem is that when I put in:
Hello('Hi')
...It still returns 'Hi' when I want it to return "'Hi'"
Does anyone know how could I fix this code?

Just use non-greedy matching:
matches = re.search(r'^Hello\s*\((.*?)\)', text)

>>> import re
>>> p = re.compile(r'Hello\s*\((.*?)\)', re.M)
>>> m = p.findall("Hello ('Hi')")
>>> print m
["'Hi'"]
>>> m = p.findall("'dfsfds Hello (Hi) fdfd' Hello (Yes)")
>>> print m
['Hi', 'Yes']

Related

String Operation on captured group in re Python

I have a string:
str1 = "abc = def"
I want to convert it to:
str2 = "abc = #Abc#"
I am trying this:
re.sub("(\w+) = (\w+)",r"\1 = %s" % ("#"+str(r"\1").title()+"#"),str1)
but it returns: (without the string operation done)
"abc = #abc#"
What is the possible reason .title() is not working.?
How to use string operation on the captured group in python?
You can see what's going on with the help of a little function:
import re
str1 = "abc = def"
def fun(m):
print("In fun(): " + m)
return m
str2 = re.sub(r"(\w+) = (\w+)",
r"\1 = %s" % ("#" + fun(r"\1") + "#"),
# ^^^^^^^^^^
str1)
Which yields
In fun(): \1
So what you are basically trying to do is to change \1 (not the substitute!) to an uppercase version which obviously remains \1 literally. The \1 is replaced only later with the captured content than your call to str.title().
Go with a lambda function as proposed by #Rakesh.
Try using lambda.
Ex:
import re
str1 = "abc = def"
print( re.sub("(?P<one>(\w+)) = (\w+)",lambda match: r'{0} = #{1}#'.format(match.group('one'), match.group('one').title()), str1) )
Output:
abc = #Abc#

Removing words with special characters "\" and "/"

During the analysis of tweets, I run in the "words" that have either \ or / (could have more than one appearance in one "word"). I would like to have such words removed completely but can not quite nail this
This is what I tried:
sen = 'this is \re\store and b\\fre'
sen1 = 'this i\s /re/store and b//fre/'
slash_back = r'(?:[\w_]+\\[\w_]+)'
slash_fwd = r'(?:[\w_]+/+[\w_]+)'
slash_all = r'(?<!\S)[a-z-]+(?=[,.!?:;]?(?!\S))'
strt = re.sub(slash_back,"",sen)
strt1 = re.sub(slash_fwd,"",sen1)
strt2 = re.sub(slash_all,"",sen1)
print strt
print strt1
print strt2
I would like to get:
this is and
this i\s and
this and
However, I receive:
and
this i\s / and /
i\s /re/store b//fre/
To add: in this scenario the "word" is a string separated either by spaces or punctuation signs (like a regular text)
How's this? I added some punctuation examples:
import re
sen = r'this is \re\store and b\\fre'
sen1 = r'this i\s /re/store and b//fre/'
sen2 = r'this is \re\store, and b\\fre!'
sen3 = r'this i\s /re/store, and b//fre/!'
slash_back = r'\s*(?:[\w_]*\\(?:[\w_]*\\)*[\w_]*)'
slash_fwd = r'\s*(?:[\w_]*/(?:[\w_]*/)*[\w_]*)'
slash_all = r'\s*(?:[\w_]*[/\\](?:[\w_]*[/\\])*[\w_]*)'
strt = re.sub(slash_back,"",sen)
strt1 = re.sub(slash_fwd,"",sen1)
strt2 = re.sub(slash_all,"",sen1)
strt3 = re.sub(slash_back,"",sen2)
strt4 = re.sub(slash_fwd,"",sen3)
strt5 = re.sub(slash_all,"",sen3)
print(strt)
print(strt1)
print(strt2)
print(strt3)
print(strt4)
print(strt5)
Output:
this is and
this i\s and
this and
this is, and!
this i\s, and!
this, and!
One way you could do it without re is with join and a comprehension.
sen = 'this is \re\store and b\\fre'
sen1 = 'this i\s /re/store and b//fre/'
remove_back = lambda s: ' '.join(i for i in s.split() if '\\' not in i)
remove_forward = lambda s: ' '.join(i for i in s.split() if '/' not in i)
>>> print(remove_back(sen))
this is and
>>> print(remove_forward(sen1))
this i\s and
>>> print(remove_back(remove_forward(sen1)))
this and

search for a variable in a text file

im trying to search for a variable in a text file.
sid = '185'
a = ((sid)"\n")
with open(filename) as f:
data = f.readlines()
if a in data:
print 'its here'
else:
print 'its NOT here'
here is my foo.txt
['306\n', '303\n', '313\n', '323\n', '417\n', '281\n', '304\n', '322\n', '320\n', '319\n', '308\n', '310\n', '414\n', '415\n', '314\n', '312\n', '307\n', '305\n', '285\n', '286\n', '318\n', '283\n', '282\n', '294\n', '309\n', '416\n', '418\n', '321\n', '185\n']
i think my problem lies with defining the /n properly? Not sure, as you can tell im a beginner. Any help or recommendations on how to do this would be appreciated.
You can use re.split and following list comprehension to grub the numbers,then use in operand for check the membership:
>>> s = """['306\n', '303\n', '313\n', '323\n', '417\n', '281\n', '304\n', '322\n', '320\n', '319\n', '308\n', '310\n', '414\n', '415\n', '314\n', '312\n', '307\n', '305\n', '285\n', '286\n', '318\n', '283\n', '282\n', '294\n', '309\n', '416\n', '418\n', '321\n', '185\n']"""
>>> import re
>>> def check(var):
... return var in [int(i.strip()) for i in re.split(r'[\[\]\',]*',s) if i.strip()]
...
>>> check(303)
True
>>> check(444)
False
And in your case you need to use open(filename).read() instead of s.Also for larger files you can use set that is more efficient for checking membership:
>>> def check(var):
... return var in set(int(i.strip()) for i in re.split(r'[\[\]\',]*',s) if i.strip())
...
>>> check(444)
False
>>> check(305)
True
You can use re.search function to search for a string
foo = """['306\n', '303\n', '313\n', '323\n', '417\n', '281\n', '304\n', '322\n', '320\n', '319\n', '308\n', '310\n', '414\n', '415\n', '314\n', '312\n', '307\n', '305\n', '285\n', '286\n', '318\n', '283\n', '282\n', '294\n', '309\n', '416\n', '418\n', '321\n', '185\n']"""
sid = '185'
a = sid + '\n'
if re.search(a,foo):
print "its here"
else:
print "its not here"
In general, if there could be more than one line in foo.txt you can use:
sid = '185'
a = sid + '\\n'
filename = 'foo.txt'
with open(filename) as f:
for line in f:
if a in line:
print 'its here'
break
else:
print 'its NOT here'
Don't forget to escape the backslash with '\\n' if foo.txt really is literally as you give it.

Replace a pattern in python

How to replace the pattern in the string with
decoded_str=" Name(++info++)Age(++info++)Adress of the emp(++info++)"
The first pattern "(++info++)" needs to replaced with (++info a++)
The second pattern "(++info++)" needs to replaced with (++info b++)
The third pattern "(++info++)" needs to replaced with (++info c++)
If there many more then it should be replaced accordingly
This should be simple enough:
for character in range(ord('a'), ord('z')):
if "(++info++)" not in decoded_str:
break
decoded_str = decoded_str.replace("(++info++)", "(++info {0}++)".format(chr(character)), 1)
print decoded_str
It has the added benefit of stopping at 'z'. If you want to wrap around:
import itertools
for character in itertools.cycle(range(ord('a'), ord('z'))):
if "(++info++)" not in decoded_str:
break
decoded_str = decoded_str.replace("(++info++)", "(++info {0}++)".format(chr(character)), 1)
print decoded_str
And just for fun, a one-liner, and O(n):
dstr = "".join(x + "(++info {0}++)".format(chr(y)) for x, y in zip(dstr.split("(++info++)"), range(ord('a'), ord('z'))))[:-len("(++info a++)")]
import string
decoded_str = " Name(++info++)Age(++info++)Adress of the emp(++info++)"
s = decoded_str.replace('++info++', '++info %s++')
s % tuple(i for i in string.ascii_lowercase[:s.count('%s')])
Here is a rather ugly yet pragmatic solution:
import string
decoded_str = " Name(++info++)Age(++info++)Adress of the emp(++info++)"
letters = list(string.lowercase)
token = "(++info++)"
rep_token = "(++info %s++)"
i = 0
while (token in decoded_str):
decoded_str = decoded_str.replace(token, rep_token % letters[i], 1)
i += 1
print decoded_str
>>> import re
>>> rx = re.compile(r'\(\+\+info\+\+\)')
>>> s = "Name(++info++)Age(++info++)Adress of the emp(++info++)"
>>> atoz = iter("abcdefghijklmnopqrstuvwxyz")
>>> rx.sub(lambda m: '(++info ' + next(atoz) + '++)', s)
'Name(++info a++)Age(++info b++)Adress of the emp(++info c++)'
Here's a quick hack to do it:
string=" Name(++info++)Age(++info++)Adress of the emp(++info++)"
def doit(s):
import string
allTheLetters = list(string.lowercase)
i=0
s2 = s.replace("++info++","++info "+allTheLetters[i]+"++",1)
while (s2!=s):
s=s2
i=i+1
s2 = s.replace("++info++","++info "+allTheLetters[i]+"++",1)
return s
Note that performance is probably not very great.
import re, string
decoded_str=" Name(++info++)Age(++info++)Adress of the emp(++info++)"
sub_func=('(++info %s++)'%c for c in '.'+string.ascii_lowercase).send
sub_func(None)
print re.sub('\(\+\+info\+\+\)', sub_func, decoded_str)
from itertools import izip
import string
decoded_str=" Name(++info++)Age(++info++)Adress of the emp(++info++)"
parts = iter(decoded_str.split("(++info++)"))
first_part = next(parts)
tags = iter(string.ascii_lowercase)
encoded_str=first_part+"".join("(++info %s++)%s"%x for x in izip(tags, parts))
print encoded_str
decoded_str=" Name(++info++)Age(++info++)Adress of the emp(++info++)"
import re
for i, f in enumerate(re.findall(r"\(\+\+info\+\+\)",decoded_str)):
decoded_str = re.sub(r"\(\+\+info\+\+\)","(++info %s++)"%chr(97+i),decoded_str,1)
print decoded_str

String Conversion

s='This is sample'
i need to convert like this
s='"This is sample"'
output="This is sample"
how to do this in dynamic
Thanks in advance
orig = 'This is sample'
converted = '"%s"' % orig
>>> s= 'This is a sample'
>>> s = '"' + s + '"' # or s = '"%s"' % s
>>> s
'"This is a sample"'
>>> print(s)
"This is a sample"
>>>

Categories