python re.search not working on multiline string

python re.search not working on multiline string - python

I have this file loaded in string:
// some preceding stuff
static char header_data[] = {
1,1,1,1,1,1,0,0,0,0,1,1,1,1,1,1,
1,1,1,0,1,1,0,1,1,0,1,1,0,1,1,1,
1,1,0,1,0,1,0,1,1,0,1,0,1,0,1,1,
1,0,1,1,1,0,0,1,1,0,0,1,1,1,0,1,
0,0,0,1,1,1,1,1,1,1,1,1,1,0,1,1,
1,0,0,0,1,1,0,1,1,1,1,1,0,1,1,1,
0,1,0,0,0,1,0,0,1,1,1,1,0,0,0,0,
0,1,1,0,0,0,0,0,0,1,1,1,1,1,1,0,
0,1,1,1,0,0,0,0,0,0,1,1,0,1,1,0,
0,0,0,0,1,0,0,0,1,0,0,1,0,1,0,0,
1,1,1,0,1,1,0,0,1,1,0,0,0,1,1,1,
1,1,0,1,1,1,1,1,1,1,1,0,0,0,1,1,
1,0,1,1,1,0,0,1,1,0,0,0,0,0,1,1,
1,1,0,1,0,1,0,1,1,1,1,0,0,0,0,1,
1,1,1,0,1,1,0,1,1,0,1,1,1,1,0,1,
1,1,1,1,1,1,0,0,0,0,1,1,1,1,1,1
};
I want to get only the block with ones and zeros, and then somehow process it.
I imported re, and tried:
In [11]: re.search('static char header_data(.*);', src, flags=re.M)
In [12]: re.findall('static char header_data(.*);', src, flags=re.M)
Out[12]: []
Why doesn't it match anything? How to fix this? (It's python3)

You need to use the re.S flag, not re.M.
re.M (re.MULTILINE) controls the behavior of ^ and $ (whether they match at the start/end of the entire string or of each line).
re.S (re.DOTALL) controls the behavior of the . and is the option you need when you want to allow the dot to match newlines.
See also the documentation.

and then somehow process it.
Here we go to get a useable list out of the file:
import re
match = re.search(r"static char header_data\[\] = {(.*?)};", src, re.DOTALL)
if match:
header_data = "".join(match.group(1).split()).split(',')
print header_data
.*? is a non-greedy match so you really will get just the value between this set of braces.
A more expicit way without DOTALL or MULTILINE would be
match = re.search(r"static char header_data\[\] = {([01,\s\r\n]*?)};", src)

If the format of the file does not change, you might as well not resort to re but use slices. Something on these lines could be useful
>>> file_in_string
'\n// some preceding stuff\nstatic char header_data[] = {\n 1,1,1,1,1,1,0,0,0
,0,1,1,1,1,1,1,\n 1,1,1,0,1,1,0,1,1,0,1,1,0,1,1,1,\n 1,1,0,1,0,1,0,1,1,0,1
,0,1,0,1,1,\n 1,0,1,1,1,0,0,1,1,0,0,1,1,1,0,1,\n 0,0,0,1,1,1,1,1,1,1,1,1,1
,0,1,1,\n 1,0,0,0,1,1,0,1,1,1,1,1,0,1,1,1,\n 0,1,0,0,0,1,0,0,1,1,1,1,0,0,0
,0,\n 0,1,1,0,0,0,0,0,0,1,1,1,1,1,1,0,\n 0,1,1,1,0,0,0,0,0,0,1,1,0,1,1,0,\
n 0,0,0,0,1,0,0,0,1,0,0,1,0,1,0,0,\n 1,1,1,0,1,1,0,0,1,1,0,0,0,1,1,1,\n
1,1,0,1,1,1,1,1,1,1,1,0,0,0,1,1,\n 1,0,1,1,1,0,0,1,1,0,0,0,0,0,1,1,\n 1,1
,0,1,0,1,0,1,1,1,1,0,0,0,0,1,\n 1,1,1,0,1,1,0,1,1,0,1,1,1,1,0,1,\n 1,1,1,1
,1,1,0,0,0,0,1,1,1,1,1,1\n };\n'
>>> lines = file_in_string.split()
>>> lines[9:-1]
['1,1,1,1,1,1,0,0,0,0,1,1,1,1,1,1,', '1,1,1,0,1,1,0,1,1,0,1,1,0,1,1,1,', '1,1,0,
1,0,1,0,1,1,0,1,0,1,0,1,1,', '1,0,1,1,1,0,0,1,1,0,0,1,1,1,0,1,', '0,0,0,1,1,1,1,
1,1,1,1,1,1,0,1,1,', '1,0,0,0,1,1,0,1,1,1,1,1,0,1,1,1,', '0,1,0,0,0,1,0,0,1,1,1,
1,0,0,0,0,', '0,1,1,0,0,0,0,0,0,1,1,1,1,1,1,0,', '0,1,1,1,0,0,0,0,0,0,1,1,0,1,1,
0,', '0,0,0,0,1,0,0,0,1,0,0,1,0,1,0,0,', '1,1,1,0,1,1,0,0,1,1,0,0,0,1,1,1,', '1,
1,0,1,1,1,1,1,1,1,1,0,0,0,1,1,', '1,0,1,1,1,0,0,1,1,0,0,0,0,0,1,1,', '1,1,0,1,0,
1,0,1,1,1,1,0,0,0,0,1,', '1,1,1,0,1,1,0,1,1,0,1,1,1,1,0,1,', '1,1,1,1,1,1,0,0,0,
0,1,1,1,1,1,1']

Related

re.search seaches only in one line [duplicate]

I have this file loaded in string:
// some preceding stuff
static char header_data[] = {
1,1,1,1,1,1,0,0,0,0,1,1,1,1,1,1,
1,1,1,0,1,1,0,1,1,0,1,1,0,1,1,1,
1,1,0,1,0,1,0,1,1,0,1,0,1,0,1,1,
1,0,1,1,1,0,0,1,1,0,0,1,1,1,0,1,
0,0,0,1,1,1,1,1,1,1,1,1,1,0,1,1,
1,0,0,0,1,1,0,1,1,1,1,1,0,1,1,1,
0,1,0,0,0,1,0,0,1,1,1,1,0,0,0,0,
0,1,1,0,0,0,0,0,0,1,1,1,1,1,1,0,
0,1,1,1,0,0,0,0,0,0,1,1,0,1,1,0,
0,0,0,0,1,0,0,0,1,0,0,1,0,1,0,0,
1,1,1,0,1,1,0,0,1,1,0,0,0,1,1,1,
1,1,0,1,1,1,1,1,1,1,1,0,0,0,1,1,
1,0,1,1,1,0,0,1,1,0,0,0,0,0,1,1,
1,1,0,1,0,1,0,1,1,1,1,0,0,0,0,1,
1,1,1,0,1,1,0,1,1,0,1,1,1,1,0,1,
1,1,1,1,1,1,0,0,0,0,1,1,1,1,1,1
};
I want to get only the block with ones and zeros, and then somehow process it.
I imported re, and tried:
In [11]: re.search('static char header_data(.*);', src, flags=re.M)
In [12]: re.findall('static char header_data(.*);', src, flags=re.M)
Out[12]: []
Why doesn't it match anything? How to fix this? (It's python3)

You need to use the re.S flag, not re.M.
re.M (re.MULTILINE) controls the behavior of ^ and $ (whether they match at the start/end of the entire string or of each line).
re.S (re.DOTALL) controls the behavior of the . and is the option you need when you want to allow the dot to match newlines.
See also the documentation.

and then somehow process it.
Here we go to get a useable list out of the file:
import re
match = re.search(r"static char header_data\[\] = {(.*?)};", src, re.DOTALL)
if match:
header_data = "".join(match.group(1).split()).split(',')
print header_data
.*? is a non-greedy match so you really will get just the value between this set of braces.
A more expicit way without DOTALL or MULTILINE would be
match = re.search(r"static char header_data\[\] = {([01,\s\r\n]*?)};", src)

If the format of the file does not change, you might as well not resort to re but use slices. Something on these lines could be useful
>>> file_in_string
'\n// some preceding stuff\nstatic char header_data[] = {\n 1,1,1,1,1,1,0,0,0
,0,1,1,1,1,1,1,\n 1,1,1,0,1,1,0,1,1,0,1,1,0,1,1,1,\n 1,1,0,1,0,1,0,1,1,0,1
,0,1,0,1,1,\n 1,0,1,1,1,0,0,1,1,0,0,1,1,1,0,1,\n 0,0,0,1,1,1,1,1,1,1,1,1,1
,0,1,1,\n 1,0,0,0,1,1,0,1,1,1,1,1,0,1,1,1,\n 0,1,0,0,0,1,0,0,1,1,1,1,0,0,0
,0,\n 0,1,1,0,0,0,0,0,0,1,1,1,1,1,1,0,\n 0,1,1,1,0,0,0,0,0,0,1,1,0,1,1,0,\
n 0,0,0,0,1,0,0,0,1,0,0,1,0,1,0,0,\n 1,1,1,0,1,1,0,0,1,1,0,0,0,1,1,1,\n
1,1,0,1,1,1,1,1,1,1,1,0,0,0,1,1,\n 1,0,1,1,1,0,0,1,1,0,0,0,0,0,1,1,\n 1,1
,0,1,0,1,0,1,1,1,1,0,0,0,0,1,\n 1,1,1,0,1,1,0,1,1,0,1,1,1,1,0,1,\n 1,1,1,1
,1,1,0,0,0,0,1,1,1,1,1,1\n };\n'
>>> lines = file_in_string.split()
>>> lines[9:-1]
['1,1,1,1,1,1,0,0,0,0,1,1,1,1,1,1,', '1,1,1,0,1,1,0,1,1,0,1,1,0,1,1,1,', '1,1,0,
1,0,1,0,1,1,0,1,0,1,0,1,1,', '1,0,1,1,1,0,0,1,1,0,0,1,1,1,0,1,', '0,0,0,1,1,1,1,
1,1,1,1,1,1,0,1,1,', '1,0,0,0,1,1,0,1,1,1,1,1,0,1,1,1,', '0,1,0,0,0,1,0,0,1,1,1,
1,0,0,0,0,', '0,1,1,0,0,0,0,0,0,1,1,1,1,1,1,0,', '0,1,1,1,0,0,0,0,0,0,1,1,0,1,1,
0,', '0,0,0,0,1,0,0,0,1,0,0,1,0,1,0,0,', '1,1,1,0,1,1,0,0,1,1,0,0,0,1,1,1,', '1,
1,0,1,1,1,1,1,1,1,1,0,0,0,1,1,', '1,0,1,1,1,0,0,1,1,0,0,0,0,0,1,1,', '1,1,0,1,0,
1,0,1,1,1,1,0,0,0,0,1,', '1,1,1,0,1,1,0,1,1,0,1,1,1,1,0,1,', '1,1,1,1,1,1,0,0,0,
0,1,1,1,1,1,1']

Python regex match anything enclosed in either quotations brackets braces or parenthesis

UPDATE
This is still not entirely the solution so far. It is only for preceding repeated closing characters (e.g )), ]], }}). I'm still looking for a way to capture enclosed contents and will update this.
Code:
>>> import re
>>> re.search(r'(\(.+?[?<!)]\))', '((x(y)z))', re.DOTALL).groups()
('((x(y)z))',)
Details:
r'(\(.+?[?<!)]\))'
() - Capturing group special characters.
\( and \) - The open and closing characters (e.g ', ", (), {}, [])
.+? - Match any character content (use with re.DOTALL flag)
[?<!)] - The negative lookbehind for character ) (replace this with the matching closing character). This will basically find any ) character where \) character does not precede (more info here).
I was trying to parse something like a variable assignment statement for this lexer thing I'm working with, just trying to get the basic logic behind interpreters/compilers.
Here's the basic assignment statements and literals I'm dealing with:
az = none
az_ = true
az09 = false
az09_ = +0.9
az_09 = 'az09_'
_az09 = "az09_"
_az = [
"az",
0.9
]
_09 = {
0: az
1: 0.9
}
_ = (
true
)
Somehow, I managed to parse those simple assignments like none, true, false, and numeric literals. Here's where I'm currently stuck at:
import sys
import re
# validate command-line arguments
if (len(sys.argv) != 2): raise ValueError('usage: parse <script>')
# parse the variable name and its value
def handle_assignment(index, source):
# TODO: handle quotations, brackets, braces, and parenthesis values
variable = re.search(r'[\S\D]([\w]+)\s+?=\s+?(none|true|false|[-+]?\d+\.?\d+|[\'\"].*[\'\"])', source[index:])
if variable is not None:
print('{}={}'.format(variable.group(1), variable.group(2)))
index += source[index:].index(variable.group(2))
return index
# parse through the source element by element
with open(sys.argv[1]) as file:
source = file.read()
index = 0
while index < len(source):
# checks if the line matches a variable assignment statement
if re.match(r'[\S\D][\w]+\s+?=', source[index:]):
index = handle_assignment(index, source)
index += 1
I was looking for a way to capture those values with enclosed quotations, brackets, braces, and parenthesis.
Probably, will update this post if I found an answer.

Use a regexp with multiple alternatives for each matching pair.
re.match(r'\'.*?\'|".*?"|\(.*?\)|\[.*?\]|\{.*?\}', s)
Note, however, that if there are nested brackets, this will match the first ending bracket, e.g. if the input is
(words (and some more words))
the result will be
(words (and some more words)
Regular expressions are not appropriate for matching nested structures, you should use a more powerful parsing technique.

Solution for #Barmar's recursive characters using the regex third-party module:
pip install regex
python3
>>> import regex
>>> recurParentheses = regex.compile(r'[(](?:[^()]|(?R))*[)]')
>>> recurParentheses.findall('(z(x(y)z)x) ((x)(y)(z))')
['(z(x(y)z)x)', '((x)(y)(z))']
>>> recurCurlyBraces = regex.compile(r'[{](?:[^{}]|(?R))*[}]')
>>> recurCurlyBraces.findall('{z{x{y}z}x} {{x}{y}{z}}')
['{z{x{y}z}x}', '{{x}{y}{z}}']
>>> recurSquareBrackets = regex.compile(r'[[](?:[^][]|(?R))*[]]')
>>> recurSquareBrackets.findall('[z[x[y]z]x] [[x][y][z]]')
['[z[x[y]z]x]', '[[x][y][z]]']
For string literal recursion, I suggest take a look at this.

How to remove text before a particular character or string in multi-line text?

I want to remove all the text before and including */ in a string.
For example, consider:
string = ''' something
other things
etc. */ extra text.
'''
Here I want extra text. as the output.
I tried:
string = re.sub("^(.*)(?=*/)", "", string)
I also tried:
string = re.sub(re.compile(r"^.\*/", re.DOTALL), "", string)
But when I print string, it did not perform the operation I wanted and the whole string is printing.

I suppose you're fine without regular expressions:
string[string.index("*/ ")+3:]
And if you want to strip that newline:
string[string.index("*/ ")+3:].rstrip()

The problem with your first regex is that . does not match newlines as you noticed. With your second one, you were closer but forgot the * that time. This would work:
string = re.sub(re.compile(r"^.*\*/", re.DOTALL), "", string)
You can also just get the part of the string that comes after your "*/":
string = re.search(r"(\*/)(.*)", string, re.DOTALL).group(2)

Update: After doing some research, I found that the pattern (\n|.) to match everything including newlines is inefficient. I've updated the answer to use [\s\S] instead as shown on the answer I linked.
The problem is that . in python regex matches everything except newlines. For a regex solution, you can do the following:
import re
strng = ''' something
other things
etc. */ extra text.
'''
print(re.sub("[\s\S]+\*/", "", strng))
# extra text.
Add in a .strip() if you want to remove that remaining leading whitespace.

to keep text until that symbol you can do:
split_str = string.split(' ')
boundary = split_str.index('*/')
new = ' '.join(split_str[0:boundary])
print(new)
which gives you:
something
other things
etc.

string_list = string.split('*/')[1:]
string = '*/'.join(string_list)
print(string)
gives output as
' extra text. \n'

Replace commas enclosed in curly braces

I try to replace commas with semicolons enclosed in curly braces.
Sample string:
text = "a,b,{'c','d','e','f'},g,h"
I am aware that it comes down to lookbehinds and lookaheads, but somehow it won't work like I want it to:
substr = re.sub(r"(?<=\{)(.+?)(,)(?=.+\})",r"\1;", text)
It returns:
a,b,{'c';'d','e','f'},g,h
However, I am aiming for the following:
a,b,{'c';'d';'e';'f'},g,h
Any idea how I can achieve this?
Any help much appreciated :)

You can match the whole block {...} (with {[^{}]+}) and replace commas inside it only with a lambda:
import re
text = "a,b,{'c','d','e','f'},g,h"
print(re.sub(r"{[^{}]+}", lambda x: x.group(0).replace(",", ";"), text))
See IDEONE demo
Output: a,b,{'c';'d';'e';'f'},g,h
By declaring lambda x we can get access to each match object, and get the whole match value using x.group(0). Then, all we need is replace a comma with a semi-colon.
This regex does not support recursive patterns. To use a recursive pattern, you need PyPi regex module. Something like m = regex.sub(r"\{(?:[^{}]|(?R))*}", lambda x: x.group(0).replace(",", ";"), text) should work.

Below I have posted a solution that does not rely on an regular expression. It uses a stack (list) to determine if a character is inside a curly bracket {. Regular expression are more elegant, however, they can be harder to modify when requirements change. Please note that the example below also works for nested brackets.
text = "a,b,{'c','d','e','f'},g,h"
output=''
stack = []
for char in text:
if char == '{':
stack.append(char)
elif char == '}':
stack.pop()
#Check if we are inside a curly bracket
if len(stack)>0 and char==',':
output += ';'
else:
output += char
print output
This gives:
'a,b,{'c';'d';'e';'f'},g,h
You can also rewrite this as a map function if you use a the global variable for stack:
stack = []
def replace_comma_in_curly_brackets(char):
if char == '{':
stack.append(char)
elif char == '}':
stack.pop()
#Check if we are inside a curly bracket
if len(stack)>0 and char==',':
return ';'
return char
text = "a,b,{'c','d','e','f'},g,h"
print ''.join(map(str, map(replace_comma_in_curly_brackets,text)))
Regarding performance, when running the above two methods and the regular expression solution proposed by #stribizhev on the test string at the end of this post, I get the following timings:
Regular expression (#stribizshev): 0.38 seconds
Map function: 26.3 seconds
For loop: 251 seconds
This is the test string that is 55,300,00 characters long:
text = "a,able,about,across,after,all,almost,{also,am,among,an,and,any,are,as,at,be,because},been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your" * 100000

If you don't have nested braces, it might be enough to just look ahead at each , if there is a closing } ahead without any opening { in between. Search for
,(?=[^{]*})
and replace with ;
, match a comma literally
(?=...) the lookahead to check
if there's ahead [^{]* any amount of characters, that are not {
followed by a closing curly brace }
See demo at regex101

Using regex to extract information from string

I am trying to write a regex in Python to extract some information from a string.
Given:
"Only in Api_git/Api/folder A: new.txt"
I would like to print:
Folder Path: Api_git/Api/folder A
Filename: new.txt
After having a look at some examples on the re manual page, I'm still a bit stuck.
This is what I've tried so far
m = re.match(r"(Only in ?P<folder_path>\w+:?P<filename>\w+)","Only in Api_git/Api/folder A: new.txt")
print m.group('folder_path')
print m.group('filename')
Can anybody point me in the right direction??

Get the matched group from index 1 and 2 using capturing groups.
^Only in ([^:]*): (.*)$
Here is demo
sample code:
import re
p = re.compile(ur'^Only in ([^:]*): (.*)$')
test_str = u"Only in Api_git/Api/folder A: new.txt"
re.findall(p, test_str)
If you want to print in the below format then try with substitution.
Folder Path: Api_git/Api/folder A
Filename: new.txt
DEMO
sample code:
import re
p = re.compile(ur'^Only in ([^:]*): (.*)$')
test_str = u"Only in Api_git/Api/folder A: new.txt"
subst = u"Folder Path: $1\nFilename: $2"
result = re.sub(p, subst, test_str)

Your pattern: (Only in ?P<folder_path>\w+:?P<filename>\w+) has a few flaws in it.
The ?P construct is only valid as the first bit inside a parenthesized expression,
so we need this.
(Only in (?P<folder_path>\w+):(?P<filename>\w+))
The \w character class is only for letters and underscores. It won't match / or ., for example. We need to use a different character class that more closely aligns with requirements. In fact, we can just use ., the class of nearly all characters:
(Only in (?P<folder_path>.+):(?P<filename>.+))
The colon has a space after it in your example text. We need to match it:
(Only in (?P<folder_path>.+): (?P<filename>.+))
The outermost parentheses are not needed. They aren't wrong, just not needed:
Only in (?P<folder_path>.+): (?P<filename>.+)
It is often convenient to provide the regular expression separate from the call to the regular expression engine. This is easily accomplished by creating a new variable, for example:
regex = r'Only in (?P<folder_path>.+): (?P<filename>.+)'
... # several lines later
m = re.match(regex, "Only in Api_git/Api/folder A: new.txt")
The above is purely for the convenience of the programmer: it neither saves nor squanders time or memory space. There is, however, a technique that can save some of the time involved in regular expressions: compiling.
Consider this code segment:
regex = r'Only in (?P<folder_path>.+): (?P<filename>.+)'
for line in input_file:
m = re.match(regex, line)
...
For each iteration of the loop, the regular expression engine must interpret the regular expression and apply it to the line variable. The re module allows us to separate the interpretation from the application; we can interpret once but apply several times:
regex = re.compile(r'Only in (?P<folder_path>.+): (?P<filename>.+)')
for line in input_file:
m = re.match(regex, line)
...
Now, your original program should look like this:
regex = re.compile(r'Only in (?P<folder_path>.+): (?P<filename>.+)')
m = re.match(regex, "Only in Api_git/Api/folder A: new.txt")
print m.group('folder_path')
print m.group('filename')
However, I'm a fan of using comments to explain regular expressions. My version, including some general cleanup, looks like this:
import re
regex = re.compile(r'''(?x) # Verbose
Only\ in\ # Literal match
(?P<folder_path>.+) # match longest sequence of anything, and put in 'folder_path'
:\ # Literal match
(?P<filename>.+) # match longest sequence of anything and put in 'filename'
''')
with open('diff.out') as input_file:
for line in input_file:
m = re.match(regex, line)
if m:
print m.group('folder_path')
print m.group('filename')

It really depends on the limitation of the input, if this is the only input this will do the trick.
^Only in (?P<folder_path>[a-zA-Z_/ ]*): (?P<filename>[a-z]*.txt)$

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

python re.search not working on multiline string - python

Related

re.search seaches only in one line [duplicate]

Python regex match anything enclosed in either quotations brackets braces or parenthesis

How to remove text before a particular character or string in multi-line text?

Replace commas enclosed in curly braces

Using regex to extract information from string

Categories

Resources