Capture a Repeating Group in Python using RegEx (see example) - python

I am writing a regular expression in python to capture the contents inside an SSI tag.
I want to parse the tag:
<!--#include file="/var/www/localhost/index.html" set="one" -->
into the following components:
Tag Function (ex: include, echo or set)
Name of attribute, found before the = sign
Value of attribute, found in between the "'s
The problem is that I am at a loss on how to grab these repeating groups, as name/value pairs may occur one or more times in a tag. I have spent hours on this.
Here is my current regex string:
^\<\!\-\-\#([a-z]+?)\s([a-z]*\=\".*\")+? \-\-\>$
It captures the include in the first group and file="/var/www/localhost/index.html" set="one" in the second group, but what I am after is this:
group 1: "include"
group 2: "file"
group 3: "/var/www/localhost/index.html"
group 4 (optional): "set"
group 5 (optional): "one"
(continue for every other name="value" pair)
I am using this site to develop my regex

Grab everything that can be repeated, then parse them individually. This is probably a good use case for named groups, as well!
import re
data = """<!--#include file="/var/www/localhost/index.html" set="one" reset="two" -->"""
pat = r'''^<!--#([a-z]+) ([a-z]+)="(.*?)" ((?:[a-z]+?=".+")+?) -->'''
result = re.match(pat, data)
result.groups()
('include', 'file', '/var/www/localhost/index.html', 'set="one" reset="two"')
Then iterate through it:
# Unpack the four captures: tag function, first key, first value, and the raw
# text that holds every remaining key="value" pair.
g1, g2, g3, g4 = result.groups()
for keyvalue in g4.split():  # split on whitespace (values must not contain spaces)
    # Split on the first '=' only, so a value containing '=' stays intact.
    key, value = keyvalue.split('=', 1)
    # The captured text still carries its surrounding double quotes; drop them.
    value = value.strip('"')
    # do something with them

A way with the new python regex module:
#!/usr/bin/python
import regex
s = r'<!--#include file="/var/www/localhost/index.html" set="one" -->'
p = r'''(?x)
(?>
\G(?<!^)
|
<!-- \# (?<function> [a-z]+ )
)
\s+
(?<key> [a-z]+ ) \s* = \s* " (?<val> [^"]* ) "
'''
matches = regex.finditer(p, s)
for m in matches:
if m.group("function"):
print ("function: " + m.group("function"))
print (" key: " + m.group("key") + "\n value: " + m.group("val") + "\n")
The way with re module:
#!/usr/bin/python
import re

s = r'<!--#include file="/var/www/localhost/index.html" set="one" -->'
# Verbose-mode pattern: capture the tag function, then the whole run of
# key="value" pairs as a single "params" group (re cannot keep every
# repetition of a repeated group, so the pairs are re-scanned below).
p = r'''(?x)
<!-- \# (?P<function> [a-z]+ )
\s+
(?P<params> (?: [a-z]+ \s* = \s* " [^"]* " \s*? )+ )
-->
'''
matches = re.finditer(p, s)
for m in matches:
    print("function: " + m.group("function"))
    # Second pass: each hit is either a bare key or a double-quoted value.
    for param in re.finditer(r'[a-z]+|"([^"]*)"', m.group("params")):
        # group(1) is None for a key and a (possibly empty) string for a
        # value, so compare against None rather than relying on truthiness.
        if param.group(1) is not None:
            print(" value: " + param.group(1) + "\n")
        else:
            print(" key: " + param.group())

I recommend against using a single regular expression to capture every item in a repeating group. Instead--and unfortunately, I don't know Python, so I'm answering it in the language I understand, which is Java--I recommend first extracting all attributes, and then looping through each item, like this:
import java.util.regex.Pattern;
import java.util.regex.Matcher;
/**
 * Parses an SSI tag in two passes: the first regex extracts the tag function
 * and the raw attribute text, the second loops over every name="value" pair
 * found in that text.
 */
public class AllAttributesInTagWithRegexLoop {
    /**
     * Runs the demonstration on a hard-coded sample tag and prints the tag
     * function, the raw attribute text and each name/value pair.
     *
     * @param ignored command-line arguments (unused)
     */
    public static final void main(String[] ignored) {
        String input = "<!--#include file=\"/var/www/localhost/index.html\" set=\"one\" -->";
        Matcher m = Pattern.compile(
                "<!--#(include|echo|set) +(.*)-->").matcher(input);
        // group() throws IllegalStateException unless a match has succeeded,
        // so stop here when the input is not a recognized tag.
        if (!m.matches()) {
            System.err.println("Not a recognized SSI tag: " + input);
            return;
        }
        String tagFunc = m.group(1);
        String allAttrs = m.group(2);
        System.out.println("Tag function: " + tagFunc);
        System.out.println("All attributes: " + allAttrs);
        // '*' rather than '+' so that an empty value (name="") is reported
        // instead of being silently skipped.
        m = Pattern.compile("(\\w+)=\"([^\"]*)\"").matcher(allAttrs);
        while (m.find()) {
            System.out.println("name=\"" + m.group(1) +
                    "\", value=\"" + m.group(2) + "\"");
        }
    }
}
Output:
Tag function: include
All attributes: file="/var/www/localhost/index.html" set="one"
name="file", value="/var/www/localhost/index.html"
name="set", value="one"
Here's an answer that may be of interest: https://stackoverflow.com/a/23062553/2736496
Please consider bookmarking the Stack Overflow Regular Expressions FAQ for future reference.

Unfortunately python does not allow for recursive regular expressions.
You can instead do this:
import re

string = '''<!--#include file="/var/www/localhost/index.html" set="one" set2="two" -->'''
# Raw string so the regex escapes reach the re module untouched.  The closing
# quote sits outside the "value" group so the captured value excludes it.
# "keyVal" receives the raw text of every remaining key="value" pair.
regexString = r'''<!--\#(?P<tag>\w+)\s(?P<name>\w+)="(?P<value>.*?)"\s(?P<keyVal>.*)\s-->'''
regex = re.compile(regexString)
match = regex.match(string)
tag = match.group('tag')
name = match.group('name')
value = match.group('value')
# Whitespace split: assumes attribute values contain no spaces.
keyVal = match.group('keyVal').split()
for item in keyVal:
    # Assignment, not a membership test; split on the first '=' only.
    key, val = item.split('=', 1)
    val = val.strip('"')  # drop the surrounding double quotes
    # You can now do whatever you want with the key=val pair

The regex library allows capturing repeated groups (while builtin re does not). This allows for a simple solution without needing external for-loops to parse the groups afterwards.
import regex
string = r'<!--#include file="/var/www/localhost/index.html" set="one" -->'
rgx = regex.compile(
r'<!--#(?<fun>[a-z]+)(\s+(?<key>[a-z]+)\s*=\s*"(?<val>[^"]*)")+')
match = rgx.match(string)
keys, values = match.captures('key', 'val')
print(match['fun'], *map(' = '.join, zip(keys, values)), sep='\n ')
gives you what you're after
include
file = /var/www/localhost/index.html
set = one

Related

Regex match everything between special tag

I have the following string that I need to parse and get the values of anything inside the defined \$ tags
for example, the string
The following math equation: \$f(x) = x^2\$ is the same as \$g(x) = x^(4/2) \$
I want to parse whatever is in between the \$ tags, so that the result will contain both equations
'f(x) = x^2'
'g(x) = x^(4/2) '
I tried something like re.compile(r'\\\$(.)*\\$') but it didn't work.
You almost got it, just missing a backslash and a question mark (so it stops as soon as it finds the second \$ and doesn't match the longest string possible): r'\\\$(.*?)\\\$'
>>> pattern = r'\\\$(.*?)\\\$'
>>> data = "The following math equation: \$f(x) = x^2\$ is the same as \$g(x) = x^(4/2) \$"
>>> re.findall(pattern, data)
['f(x) = x^2', 'g(x) = x^(4/2) ']
That regex can fit:
/\\\$.{0,}\\\$/g
/ - begin
\\\$ - escaped: \$
. - any character between
{0,} - at least 0 chars (any number of chars, actually)
\\\$ - escaped: \$
/ - end
g - global search
This works:
import re

# Matches a literal "\$", greedily captures everything up to the last "\$".
regex = r'\\\$(.*)\\\$'
r = re.compile(regex)
# Raw string literals: "\$" is not a valid string escape, so spell the
# backslash explicitly instead of relying on it being passed through.
print(r.match(r"\$f(x) = x^2\$").group(1))
print(r.match(r"\$g(x) = x^(4/2) \$").group(1))

Picking up field value using Python regex

This is an example of two lines in a file that I am trying to pick up information from.
...
{ "SubtitleSettings_REPOSITORY", FieldType_STRING, (int32_t)REPOSITORY},
{ "PREFERRED_SUBTITLE_LANGUAGE", FieldType_STRING,SUBTITLE_LANGUAGE},
...
What I want to do is to find out the 3rd field of this weird data structure for the given string to match to 1st field, i.e.
SubtitleSettings_REPOSITORY => REPOSITORY
PREFERRED_SUBTITLE_LANGUAGE => SUBTITLE_LANGUAGE
The regex in my Python code only handles the second line; it cannot cope with the first line. How can I improve it?
import re
...
#field is given a value in previous code, can be "SubtitleSettings_REPOSITORY", or "PREFERRED_SUBTITLE_LANGUAGE"
match = re.search(field+'"[, \t]+(\w+)[, \t]+(\w+)', src_file.read(), re.M|re.I)
return_value = match.group(2)
You can insert (?:\(\w+\))?, which allows (and ignores) an optional word in parentheses there:
match = re.search(field+'"[, \t]+(\w+)[, \t]+(?:\(\w+\))?(\w+)', line, re.M|re.I)
With this, the line matches and you get 'REPOSITORY' as desired.
import re

# Group 1: the text inside the first pair of double quotes.
# Group 2: everything between the last comma and the closing brace.
pattern = r'\{ "(.+)",.+,(.+)\}'
with open("input.txt") as f:
    for line in f:
        found = re.search(pattern, line.strip())
        if found is None:
            # Skip lines that are not table entries instead of raising.
            continue
        first, third = found.groups()
        print(first.strip(), "=>", third.strip())
prints
SubtitleSettings_REPOSITORY => (int32_t)REPOSITORY
PREFERRED_SUBTITLE_LANGUAGE => SUBTITLE_LANGUAGE
where input.txt contains
{ "SubtitleSettings_REPOSITORY", FieldType_STRING, (int32_t)REPOSITORY},
{ "PREFERRED_SUBTITLE_LANGUAGE", FieldType_STRING,SUBTITLE_LANGUAGE}
Breakdown:
\{ \"(.+)\" matches strings with the structure { + space + " + text + " and extracts text
,.+,(.+)\} matches strings with the structure , + text1 + , + text2 + } and extracts text2

Extracting part of string in parenthesis using python

I have a csv file with a column with strings. Part of the string is in parentheses. I wish to move the part of string in parentheses to a different column and retain the rest of the string as it is.
For instance: I wish to convert:
LC(Carbamidomethyl)RLK
to
LCRLK Carbamidomethyl
Regex solution
If you only have one parentheses group in your string, you can use this regex:
>>> a = "LC(Carbamidomethyl)RLK"
>>> re.sub('(.*)\((.+)\)(.*)', '\g<1>\g<3> \g<2>', a)
'LCRLK Carbamidomethyl'
>>> a = "LCRLK"
>>> re.sub('(.*)\((.+)\)(.*)', '\g<1>\g<3> \g<2>', a)
'LCRLK' # works with no parentheses too
Regex decomposed:
(.*) #! Capture begin of the string
\( # match first parenthesis
(.+) #! Capture content into parentheses
\) # match the second
(.*) #! Capture everything after
---------------
\g<1>\g<3> \g<2> # Write each capture in the correct order
String manipulation solution
A faster solution, for huge data set is:
begin, end = a.find('('), a.find(')')
if begin != -1 and end != -1:
a = a[:begin] + a[end+1:] + " " + a[begin+1:end]
The process is to get the positions of parentheses (if there's any) and cut the string where we want. Then, we concatenate the result.
Performance of each method
It's clear that the string manipulation is the fastest method:
>>> timeit.timeit("re.sub('(.*)\((.+)\)(.*)', '\g<1>\g<3> \g<2>', a)", setup="a = 'LC(Carbadidomethyl)RLK'; import re")
15.214869976043701
>>> timeit.timeit("begin, end = a.find('('), a.find(')') ; b = a[:begin] + a[end+1:] + ' ' + a[begin+1:end]", setup="a = 'LC(Carbamidomethyl)RL'")
1.44008207321167
Multi parentheses set
See comments
>>> a = "DRC(Carbamidomethyl)KPVNTFVHESLADVQAVC(Carbamidomethyl)SQKNVACK"
>>> while True:
... begin, end = a.find('('), a.find(')')
... if begin != -1 and end != -1:
... a = a[:begin] + a[end+1:] + " " + a[begin+1:end]
... else:
... break
...
>>> a
'DRCKPVNTFVHESLADVQAVCSQKNVACK Carbamidomethyl Carbamidomethyl'

Find string from slash to a space or a character

I'd like to know how to find a string that is between a slash and a bracket or ']', for example:
data = "(AVP:SMTP/xx#xx.xx) R:AVP:SMS.0/+44648474 id:24"
data2 = "(AVP:SMTP/<xxx#xx.xx>) R:AVP:FAX.0/<thisword> id:25"
So the idea is to get only xx#xx.xx and +44648474 for the first data, and xxx#xx.xx and thisword for data2.
I've tried this regex:
k = re.findall(r"/(\S+)",data2)
but it returns <xxx#xx.xx>) and <thisword>
and what I'd like to get is xxx#xx.xx and thisword
This one works.
import re

data = "(AVP:SMTP/xx#xx.xx) R:AVP:SMS.0/+44648474 id:24"
data2 = "(AVP:SMTP/<xxx#xx.xx>) R:AVP:FAX.0/<thisword> id:25"
# After a '/', skip an optional '<', then capture up to the first '>',
# whitespace character or ')'.
regex = re.compile(r"/<?([^>\s\)]+)")
print(regex.findall(data))
print(regex.findall(data2))
>>>
['xx#xx.xx', '+44648474']
['xxx#xx.xx', 'thisword']
This regex breakdown:
/ : the / character.
<? : optionally a < character.
( : start capture group.
[^>\s\)]+ : capture anything that is not >, \s (whitespace), or ).
) : close capture group.
You can exclude such delimiters by using lookaround assertions:
k = re.findall(r"(?<=/<)[^>]+(?=>)",data2)
This would ensure "/<" before the match, match then everything that is not ">" at least once and succeed when there is a ">" after the match.

Regex Python / group quantifiers

I want to match a list of variables which look like directories, e.g.:
Same/Same2/Foot/Ankle/Joint/Actuator/Sensor/Temperature/Value=4.123
Same/Same2/Battery/Name=SomeString
Same/Same2/Home/Land/Some/More/Stuff=0.34
The length of the "subdirectories" is variable having an upper bound (above it's 9).
I want to group every subdirectory except the 1st one which I named "Same" above.
The best I could come up with is:
^(?:([^/]+)/){4,8}([^/]+)=(.*)
It already looks for 4-8 subdirectories but only groups the last one. Why's that?
Is there a better solution using group quantifiers?
Edit: Solved. Will use split() instead.
import re

# Each match starts at the beginning of the string or right after a '/', and
# runs lazily up to the next '/' or the end of the string.  Raw string so
# that \A and \Z reach the re module as regex anchors.
regx = re.compile(r'(?:(?<=\A)|(?<=/)).+?(?=/|\Z)')
for ss in ('Same/Same2/Foot/Ankle/Joint/Actuator/Sensor/Temperature/Value=4.123',
           'Same/Same2/Battery/Name=SomeString',
           'Same/Same2/Home/Land/Some/More/Stuff=0.34'):
    print(ss)
    print(regx.findall(ss))
    print()
Edit 1
Now you have given more info on what you want to obtain ( _"Same/Same2/Battery/Name=SomeString becoming SAME2_BATTERY_NAME=SomeString"_ ) better solutions can be proposed: either with a regex or with split() , + replace()
# Drops the first path component and joins the remaining ones with '_',
# once with a regex and once with split(), printing both results per sample.
# NOTE(review): written with Python 2 print statements, and the loop body's
# indentation appears to have been lost in this copy.
import re
from os import sep
# Regex-ready form of the platform separator: a backslash must be escaped
# inside the pattern, a forward slash can be used as-is.
sep2 = r'\\' if sep=='\\' else '/'
# Skip (lazily) up to and including the first separator, capture the rest.
pat = '^(?:.+?%s)(.+$)' % sep2
print 'pat==%s\n' % pat
ragx = re.compile(pat)
# NOTE(review): the samples are backslash-separated, so where os.sep is '/'
# the pattern contains no backslash and match() presumably returns None.
for ss in ('Same\Same2\Foot\Ankle\Joint\Actuator\Sensor\Temperature\Value=4.123',
'Same\Same2\Battery\Name=SomeString',
'Same\Same2\Home\Land\Some\More\Stuff=0.34'):
print ss
# Regex method: group(1) is everything after the first separator.
print ragx.match(ss).group(1).replace(sep,'_')
# split() method: split once on the separator and keep the second part.
print ss.split(sep,1)[1].replace(sep,'_')
print
result
pat==^(?:.+?\\)(.+$)
Same\Same2\Foot\Ankle\Joint\Actuator\Sensor\Temperature\Value=4.123
Same2_Foot_Ankle_Joint_Actuator_Sensor_Temperature_Value=4.123
Same2_Foot_Ankle_Joint_Actuator_Sensor_Temperature_Value=4.123
Same\Same2\Battery\Name=SomeString
Same2_Battery_Name=SomeString
Same2_Battery_Name=SomeString
Same\Same2\Home\Land\Some\More\Stuff=0.34
Same2_Home_Land_Some_More_Stuff=0.34
Same2_Home_Land_Some_More_Stuff=0.34
Edit 2
Re-reading your comment, I realized that I didn't take into account that you want to uppercase the part of the string that lies before the '=' sign, but not the part after it.
Hence, this new code that exposes 3 methods that answer this requirement. You will choose which one you prefer:
# Compares three ways of dropping the first path component, upper-casing the
# part before the last '=' and leaving the part after it untouched.
# NOTE(review): written with Python 2 print statements, and the loop body's
# indentation appears to have been lost in this copy.
import re
from os import sep
# Regex-ready form of the platform separator (a backslash must be escaped).
sep2 = r'\\' if sep=='\\' else '/'
# pot: skip the first component, then capture the text before the last '='
# and the text after it ([^=]*$ forbids any further '=' after the split).
pot = '^(?:.+?%s)(.+?)=([^=]*$)' % sep2
print 'pot==%s\n' % pot
rogx = re.compile(pot)
# pet: same prefix skip, but the last '=' and what follows are only checked
# by a lookahead, so the match ends just before that '='.
pet = '^(?:.+?%s)(.+?(?==[^=]*$))' % sep2
print 'pet==%s\n' % pet
regx = re.compile(pet)
# NOTE(review): the samples are backslash-separated, so where os.sep is '/'
# the patterns contain no backslash and match() presumably returns None.
for ss in ('Same\Same2\Foot\Ankle\Joint\Sensor\Value=4.123',
'Same\Same2\Battery\Name=SomeString',
'Same\Same2\Ocean\Atlantic\North=',
'Same\Same2\Maths\Addition\\2+2=4\Simple=ohoh'):
print ss + '\n' + len(ss)*'-'
# Method 1: a single regex yields the (key path, value) pair.
print 'rogx groups '.rjust(32),rogx.match(ss).groups()
# Method 2: drop the first component with split(), then cut at the last '='.
a,b = ss.split(sep,1)[1].rsplit('=',1)
print 'split split '.rjust(32),(a,b)
print 'split split join upper replace %s=%s' % (a.replace(sep,'_').upper(),b)
# Method 3: split() drops the first component, then the whole regx match
# (group() with no argument) is printed.
print 'regx split group '.rjust(32),regx.match(ss.split(sep,1)[1]).group()
# Method 3, substitution form: the matched prefix is replaced by group(1)
# with separators turned into '_' and upper-cased; the '=value' tail is not
# part of the match and is left as it is.
print 'regx split sub '.rjust(32),\
regx.sub(lambda x: x.group(1).replace(sep,'_').upper(), ss)
print
result, on a Windows platform
pot==^(?:.+?\\)(.+?)=([^=]*$)
pet==^(?:.+?\\)(.+?(?==[^=]*$))
Same\Same2\Foot\Ankle\Joint\Sensor\Value=4.123
----------------------------------------------
rogx groups ('Same2\\Foot\\Ankle\\Joint\\Sensor\\Value', '4.123')
split split ('Same2\\Foot\\Ankle\\Joint\\Sensor\\Value', '4.123')
split split join upper replace SAME2_FOOT_ANKLE_JOINT_SENSOR_VALUE=4.123
regx split group Same2\Foot\Ankle\Joint\Sensor\Value
regx split sub SAME2_FOOT_ANKLE_JOINT_SENSOR_VALUE=4.123
Same\Same2\Battery\Name=SomeString
----------------------------------
rogx groups ('Same2\\Battery\\Name', 'SomeString')
split split ('Same2\\Battery\\Name', 'SomeString')
split split join upper replace SAME2_BATTERY_NAME=SomeString
regx split group Same2\Battery\Name
regx split sub SAME2_BATTERY_NAME=SomeString
Same\Same2\Ocean\Atlantic\North=
--------------------------------
rogx groups ('Same2\\Ocean\\Atlantic\\North', '')
split split ('Same2\\Ocean\\Atlantic\\North', '')
split split join upper replace SAME2_OCEAN_ATLANTIC_NORTH=
regx split group Same2\Ocean\Atlantic\North
regx split sub SAME2_OCEAN_ATLANTIC_NORTH=
Same\Same2\Maths\Addition\2+2=4\Simple=ohoh
-------------------------------------------
rogx groups ('Same2\\Maths\\Addition\\2+2=4\\Simple', 'ohoh')
split split ('Same2\\Maths\\Addition\\2+2=4\\Simple', 'ohoh')
split split join upper replace SAME2_MATHS_ADDITION_2+2=4_SIMPLE=ohoh
regx split group Same2\Maths\Addition\2+2=4\Simple
regx split sub SAME2_MATHS_ADDITION_2+2=4_SIMPLE=ohoh
I probably misunderstood what exactly you want to do, but here is how you would do it without regex:
for entry in list_of_vars:
    # Split on the first '=' only, so a value that itself contains '=' does
    # not break the two-name unpacking.
    key, value = entry.split('=', 1)
    key_components = key.split('/')
    if 4 <= len(key_components) <= 8:
        # here the actual work is done: drop the first component, join the
        # rest with '_' and upper-case the resulting name.
        print("%s=%s" % ('_'.join(key_components[1:]).upper(), value))
Just use split?
>>> p='Same/Same2/Foot/Ankle/Joint/Actuator/Sensor/Temperature/Value=4.123'
>>> p.split('/')
['Same', 'Same2', 'Foot', 'Ankle', 'Joint', 'Actuator', 'Sensor', 'Temperature', 'Value=4.123']
Also, if you want that key/val pair you can do something like this...
>>> s = p.split('/')
>>> s[-1].split('=')
['Value', '4.123']
A couple of variations on your theme. For one, I've always found regexen to be cryptic to the point of unmaintainable, so I wrote the pyparsing module. In my mind, I look at your code and think, "oh, it's a list of '/'-delimited strings, an '=' sign, and then some kind of rvalue." And that translates pretty directly into the pyparsing parser definition code. By adding a name here and there in the parser ("key" and "value", similar to named groups in regex), the output is pretty easily processed.
data="""\
Same/Same2/Foot/Ankle/Joint/Actuator/Sensor/Temperature/Value=4.123
Same/Same2/Battery/Name=SomeString
Same/Same2/Home/Land/Some/More/Stuff=0.34""".splitlines()
from pyparsing import Word, alphas, alphanums, Word, nums, QuotedString, delimitedList
wd = Word(alphas, alphanums)
number = Word(nums+'+-', nums+'.').setParseAction(lambda t:float(t[0]))
rvalue = wd | number | QuotedString('"')
defn = delimitedList(wd, '/')('key') + '=' + rvalue('value')
for d in data:
result = defn.parseString(d)
Second, I question your approach at defining all of those variable names - creating variable names on the fly based on your data is a pretty well-recognized Code Smell (not necessarily bad, but you might really want to rethink this approach). I used a recursive defaultdict to create a navigable structure so that you can easily do operations like "find all the entries that are sub-elements of "Same2" (in this case, "Foot", "Battery", and "Home") - this kind of work is more difficult when trying to sift through some collection of variable names as found in locals(), it seems to me you will end up re-parsing these names to reconstruct the key hierarchy.
from collections import defaultdict
class recursivedefaultdict(defaultdict):
    """A defaultdict whose missing keys become nested instances of itself.

    Looking up a missing key creates another ``recursivedefaultdict`` built
    with the same ``attrFactory``.  Looking up a missing attribute creates it
    by calling ``attrFactory()`` and stores the result on the instance.
    """

    def __init__(self, attrFactory=int):
        # Stored first because __getattr__ reads it.
        self._attrFactory = attrFactory
        # Initialize the base class properly instead of only assigning
        # default_factory on an uninitialized instance.
        defaultdict.__init__(self, lambda: type(self)(attrFactory))

    def __getattr__(self, attr):
        # Only called when normal attribute lookup fails.  Special (dunder)
        # names are probed by the interpreter and library code via
        # hasattr/getattr and must keep raising AttributeError.  The guard on
        # '_attrFactory' prevents infinite recursion when an instance exists
        # without __init__ having run.
        if attr == '_attrFactory' or (attr.startswith('__') and attr.endswith('__')):
            raise AttributeError(attr)
        newval = self._attrFactory()
        setattr(self, attr, newval)
        return newval
table = recursivedefaultdict()
# parse each entry, and accumulate into hierarchical dict
for d in data:
# use pyparsing parser, gives us key (list of names) and value
result = defn.parseString(d)
t = table
for k in result.key[:-1]:
t = t[k]
t[result.key[-1]] = result.value
# recursive method to iterate over hierarchical dict
def showTable(t, indent=''):
    """Print a nested dict as an outline.

    A key whose value is a dict is printed on its own line and its children
    follow with a deeper indent; any other value is printed on the same line
    as its key, separated by one space.

    t      -- the (possibly nested) dict to print
    indent -- prefix written before every key at this level
    """
    for k, v in t.items():
        if isinstance(v, dict):
            print(indent + k)
            showTable(v, indent+' ')
        else:
            # Same output as the Python 2 form "print indent+k," + "print v".
            print(indent + k, v)
showTable(table)
Prints:
Same
Same2
Foot
Ankle
Joint
Actuator
Sensor
Temperature
Value 4.123
Battery
Name SomeString
Home
Land
Some
More
Stuff 0.34
If you are really set on defining those variable names, then adding some helpful parse actions to pyparsing will reformat the parsed data at parse time, so that it's directly processable afterwards:
wd = Word(alphas, alphanums)
number = Word(nums+'+-', nums+'.').setParseAction(lambda t:float(t[0]))
rvaluewd = wd.copy().setParseAction(lambda t: '"%s"' % t[0])
rvalue = rvaluewd | number | QuotedString('"')
defn = delimitedList(wd, '/')('key') + '=' + rvalue('value')
def joinNamesWithAllCaps(tokens):
    """Replace the "key" result with its names upper-cased and joined by '_'.

    Reads the sequence of names from ``tokens.key`` and stores the joined
    string back under ``tokens["key"]``; nothing is returned.
    """
    tokens["key"] = '_'.join(map(str.upper, tokens.key))
defn.setParseAction(joinNamesWithAllCaps)
for d in data:
result = defn.parseString(d)
print result.key,'=', result.value
Prints:
SAME_SAME2_FOOT_ANKLE_JOINT_ACTUATOR_SENSOR_TEMPERATURE_VALUE = 4.123
SAME_SAME2_BATTERY_NAME = "SomeString"
SAME_SAME2_HOME_LAND_SOME_MORE_STUFF = 0.34
(Note that this also encloses your SomeString value in quotes, so that the resulting assignment statement is valid Python.)

Categories