How to parse a string in Python - python

How to parse string composed of n parameter and randomly sorted such as:
{ UserID : 36875; tabName : QuickAndEasy}
{ RecipeID : 1150; UserID : 36716}
{ isFromLabel : 0; UserID : 36716; type : recipe; searchWord : soup}
{ UserID : 36716; tabName : QuickAndEasy}
Ultimately I'm looking to ouput parameters in separate columns for a table.

The regex ([^{}\s:]+)\s*:\s*([^{}\s;]+) works on your examples. You need to be aware, though, that all the matches will be strings, so if you want to store 36875 as a number, you'll need to do some additional processing.
import re
regex = re.compile(
r"""( # Match and capture in group 1:
[^{}\s:]+ # One or more characters except braces, whitespace or :
) # End of group 1
\s*:\s* # Match a colon, optionally surrounded by whitespace
( # Match and capture in group 2:
[^{}\s;]+ # One or more characters except braces, whitespace or ;
) # End of group 2""",
re.VERBOSE)
You can then do
>>> dict(regex.findall("{ isFromLabel : 0; UserID : 36716; type : recipe; searchWord : soup}"))
{'UserID': '36716', 'isFromLabel': '0', 'searchWord': 'soup', 'type': 'recipe'}
Test it live on regex101.com.

lines = "{ UserID : 36875; tabName : QuickAndEasy } ", \
"{ RecipeID : 1150; UserID : 36716}", \
"{ isFromLabel : 0; UserID : 36716; type : recipe; searchWord : soup}" , \
"{ UserID : 36716; tabName : QuickAndEasy}"
counter = 0
mappedLines = {}
for line in lines:
counter = counter + 1
lineDict = {}
line = line.replace("{","")
line = line.replace("}","")
line = line.strip()
fieldPairs = line.split(";")
for pair in fieldPairs:
fields = pair.split(":")
key = fields[0].strip()
value = fields[1].strip()
lineDict[key] = value
mappedLines[counter] = lineDict
def printField(key, lineSets, comma_desired = True):
if key in lineSets:
print(lineSets[key],end="")
if comma_desired:
print(",",end="")
else:
print()
for key in range(1,len(mappedLines) + 1):
lineSets = mappedLines[key]
printField("UserID",lineSets)
printField("tabName",lineSets)
printField("RecipeID",lineSets)
printField("type",lineSets)
printField("searchWord",lineSets)
printField("isFromLabel",lineSets,False)
CSV output:
36875,QuickAndEasy,,,,
36716,,1150,,,
36716,,,recipe,soup,0
36716,QuickAndEasy,,,,
The code above was Python 3.4. You can get similar output with 2.7 by replacing the function and the last for loop with this:
def printFields(keys, lineSets):
output_line = ""
for key in keys:
if key in lineSets:
output_line = output_line + lineSets[key] + ","
else:
output_line += ","
print output_line[0:len(output_line) - 1]
fields = ["UserID", "tabName", "RecipeID", "type", "searchWord", "isFromLabel"]
for key in range(1,len(mappedLines) + 1):
lineSets = mappedLines[key]
printFields(fields,lineSets)

Related

Parsing Erlang data to Python dictionary

I have an erlang script from which I would like to get some data and store it in python dictionary.
It is easy to parse the script to get string like this:
{userdata,
[{tags,
[#dt{number=111},
#mp{id='X23.W'}]},
{log,
'LG22'},
{instruction,
"String that can contain characters like -, _ or numbers"}
]
}.
desired result:
userdata = {"tags": {"dt": {"number": 111}, "mp": {"id": "X23.W"}},
"log": "LG22",
"instruction": "String that can contain characters like -, _ or numbers"}
# "#" mark for data in "tags" is not required in this structure.
# Also value for "tags" can be any iterable structure: tuple, list or dictionary.
But I am not sure how to transfer this data into a python dictionary. My first idea was to use json.loads but it requires many modifications (putting words into quotes marks, replacing "," with ":" and many more).
Moreover, keys in userdata are not limited to some pool. In this case, there are 'tags', 'log' and 'instruction', but there can be many more eg. 'slogan', 'ids', etc.
Also, I am not sure about the order. I assume that the keys can appear in random order.
My code (it is not working for id='X23.W' so I removed '.' from input):
import re
import json
in_ = """{userdata, [{tags, [#dt{number=111}, #mp{id='X23W'}]}, {log, 'LG22'}, {instruction, "String that can contain characters like -, _ or numbers"}]}"""
buff = in_.replace("{userdata, [", "")[:-2]
re_helper = re.compile(r"(#\w+)")
buff = re_helper.sub(r'\1:', buff)
partition = buff.partition("instruction")
section_to_replace = partition[0]
replacer = re.compile(r"(\w+)")
match = replacer.sub(r'"\1"', section_to_replace)
buff = ''.join([match, '"instruction"', partition[2]])
buff = buff.replace("#", "")
buff = buff.replace('",', '":')
buff = buff.replace("}, {", "}, \n{")
buff = buff.replace("=", ":")
buff = buff.replace("'", "")
temp = buff.split("\n")
userdata = {}
buff = temp[0][:-2]
buff = buff.replace("[", "{")
buff = buff.replace("]", "}")
userdata .update(json.loads(buff))
for i, v in enumerate(temp[1:]):
v = v.strip()
if v.endswith(","):
v = v[:-1]
userdata .update(json.loads(v))
print(userdata)
Output:
{'tags': {'dt': {'number': '111'}, 'mp': {'id': 'X23W'}}, 'instruction': 'String that can contain characters like -, _ or numbers', 'log': 'LG22'}
import json
import re
in_ = """{userdata, [{tags, [#dt{number=111}, #mp{id='X23.W'}]}, {log, 'LG22'}, {instruction, "String that can contain characters like -, _ or numbers"}]}"""
qouted_headers = re.sub(r"\{(\w+),", r'{"\1":', in_)
changed_hashed_list_to_dict = re.sub(r"\[(#.*?)\]", r'{\1}', qouted_headers)
hashed_variables = re.sub(r'#(\w+)', r'"\1":', changed_hashed_list_to_dict)
equality_signes_replaced_and_quoted = re.sub(r'{(\w+)=', r'{"\1":', hashed_variables)
replace_single_qoutes = equality_signes_replaced_and_quoted.replace('\'', '"')
result = json.loads(replace_single_qoutes)
print(result)
Produces:
{'userdata': [{'tags': {'dt': {'number': 111}, 'mp': {'id': 'X23.W'}}}, {'log': 'LG22'}, {'instruction': 'String that can contain characters like -, _ or numbers'}]}

How do I Use re() in Python and Return Capture Groups within an "If" Statement?

Although I've been using Perl for many years, I've always had trouble with anything more than fairly basic use of Regular Expresions in the language. This is
only a worse situation now, as I'm trying to learn Python... and the use of re() is even more unclear to me.
I'm trying to check for a match if a substring is in a string, using re()
and also am using capture groups to extract some info from the matching process. However, I can't get things to work in a couple of
contexts; when using a re() call and assigning the returned values all
within an "if" statement.. and how to handle the situation when .groups items are not defined
in the match objects (when a match is not made).
So, what follows are examples of what I'm trying to do coded in Perl and Python, with their respective outputs.
I'd appreciate any pointers on how I might better approach the problem using Python.
Perl Code:
use strict;
use warnings;
my ($idx, $dvalue);
while (my $rec = <DATA>) {
chomp($rec);
if ( ($idx, $dvalue) = ($rec =~ /^XA([0-9]+)=(.*?)!/) ) {
printf(" Matched:\n");
printf(" rec: >%s<\n", $rec);
printf(" index = >%s< value = >%s<\n", $idx, $dvalue);
} elsif ( ($idx, $dvalue) = ($rec =~ /^PZ([0-9]+)=(.*?[^#])!/) ) {
printf(" Matched:\n");
printf(" rec: >%s<\n", $rec);
printf(" index = >%s< value = >%s<\n", $idx, $dvalue);
} else {
printf("\n Unknown Record format, \\%s\\\n\n", $rec);
}
}
close(DATA);
exit(0)
__DATA__
DUD=ABC!QUEUE=D23!
XA32=7!P^=32!
PZ112=123^!PQ=ABC!
Perl Output:
Unknown Record format, \DUD=ABC!QUEUE=D23!\
Matched:
rec: >XA32=7!P^=32!<
index = >32< value = >7<
Matched:
rec: >PZ112=123^!PQ=ABC!<
index = >112< value = >123^<
Python Code:
import re
string = 'XA32=7!P^=32!'
with open('data.dat', 'r') as fh:
for rec in fh:
orec = ' rec: >' + rec.rstrip('\n') + '<'
print(orec)
# always using 'string' at least lets this program run
(index, dvalue) = re.search(r'^XA([0-9]+)=(.*?[^#])!', string).groups()
# The following works when there is a match... but fails with an error when
# a match is NOT found, viz:-
# ...
# (index, dvalue) = re.search(r'^XA([0-9]+)=(.*?[^#])!', rec).groups()
#
# Traceback (most recent call last):
# File "T:\tmp\a.py", line 13, in <module>
# (index, dvalue) = re.search(r'^XA([0-9]+)=(.*?[^#])!', rec).groups()
# AttributeError: 'NoneType' object has no attribute 'groups'
#
buf = ' index = >' + index + '<' + ' value = >' + dvalue + '<'
print(buf)
exit(0)
data.dat contents:
DUD=ABC!QUEUE=D23!
XA32=7!P^=32!
PZ112=123^!PQ=ABC!
Python Output:
rec: >DUD=ABC!QUEUE=D23!<
index = >32< value = >7<
rec: >XA32=7!P^=32!<
index = >32< value = >7<
rec: >PZ112=123^!PQ=ABC!<
index = >32< value = >7<
Another development: Some more code to help me understand this better... but I'm unsure about when/how to use the match.group() or match.groups() ...
Python Code:
import re
rec = 'XA22=11^!S^=64!ABC=0,0!PX=0!SP=12B!'
print("rec = >{}<".format(rec))
# ----
index = 0 ; dvalue = 0 ; x = 0
match = re.match(r'XA([0-9]+)=(.*?[^#])!(.*?)!', rec)
if match:
(index, dvalue, x) = match.groups()
print("3 (): index = >{}< value = >{}< x = >{}<".format(index, dvalue, x))
# ----
index = 0 ; dvalue = 0 ; x = 0
match = re.match(r'XA([0-9]+)=(.*?[^#])!', rec)
if match:
(index, dvalue) = match.groups()
print("2 (): index = >{}< value = >{}< x = >{}<".format(index, dvalue, x))
# ----
index = 0 ; dvalue = 0 ; x = 0
match = re.match(r'XA([0-9]+)=', rec)
if match:
#(index) = match.groups() # Why doesn't this work like above examples!?
(index, ) = match.groups() # ...and yet this works!?
# Does match.groups ALWAYS returns a tuple!?
#(index) = match.group(1) # This also works; 0 = entire matched string?
print("1 (): index = >{}< value = >{}< x = >{}<".format(index, dvalue, x))
# ----
index = 0 ; dvalue = 0 ; x = 0
match = re.search(r'S\^=([0-9]+)!', rec)
if match:
(index, ) = match.groups() # Returns tuple(?!)
print("1 (): index = >{}< value = >{}< x = >{}<".format(index, dvalue, x))
Again, I'd appreciate any thoughts on which is the 'preferred' way.. or if there's another way to deal with the groups.
You need to check for a match first, then use the groups. I.e.
compile the regexes (optional for most cases nowadays, according to the documentation)
apply each regex to the string to generate a match object
match() only matches at the beginning of a string, i.e. with an implicit ^ anchor
search() matches anywhere in the string
check if the match object is valid
extract the groups
skip to next loop iteration
# works with Python 2 and Python 3
import re
with open('dummy.txt', 'r') as fh:
for rec in fh:
orec = ' rec: >' + rec.rstrip('\n') + '<'
print(orec)
match = re.match(r'XA([0-9]+)=(.*?[^#])!', rec)
if match:
(index, dvalue) = match.groups()
print(" index = >{}< value = >{}<".format(index, dvalue))
continue
match = re.match(r'PZ([0-9]+)=(.*?[^#])!', rec)
if match:
(index, dvalue) = match.groups()
print(" index = >{}< value = >{}<".format(index, dvalue))
continue
print(" Unknown Record format")
Output:
$ python dummy.py
rec: >DUD=ABC!QUEUE=D23!<
Unknown Record format
rec: >XA32=7!P^=32!<
index = >32< value = >7<
rec: >PZ112=123^!PQ=ABC!<
index = >112< value = >123^<
But I'm wondering why you don't simplify your Perl & Python code to just use a single regex instead? E.g.:
match = re.match(r'(?:XA|PZ)([0-9]+)=(.*?[^#])!', rec)
if match:
(index, dvalue) = match.groups()
print(" index = >{}< value = >{}<".format(index, dvalue))
else:
print(" Unknown Record format")

Extracting the values within the square braces

I have a file testfile with the set of server names as below.
app-server-l11[2-5].test.com
server-l34[5-8].test.com
dd-server-l[2-4].test.com
Can you please help in getting output to be as follow.
app-server-l112.test.com
app-server-l113.test.com
app-server-l114.test.com
app-server-l115.test.com
server-l345.test.com
server-l346.test.com
server-l347.test.com
server-l348.test.com
dd-server-l2.test.com
dd-server-l3.test.com
dd-server-l4.test.com
With GNU awk for the 3rd arg to match():
$ awk 'match($0,/(.*)\[([0-9]+)-([0-9]+)\](.*)/,a){for (i=a[2]; i<=a[3]; i++) print a[1] i a[4]}' file
app-server-l112.test.com
app-server-l113.test.com
app-server-l114.test.com
app-server-l115.test.com
server-l345.test.com
server-l346.test.com
server-l347.test.com
server-l348.test.com
dd-server-l2.test.com
dd-server-l3.test.com
dd-server-l4.test.com
In GNU awk:
$ awk -F"[][]" '{split($2,a,"-"); for(i=a[1];i<=a[2];i++) print $1 i $3}' file
app-server-l112.test.com
app-server-l113.test.com
app-server-l114.test.com
app-server-l115.test.com
server-l345.test.com
server-l346.test.com
server-l347.test.com
server-l348.test.com
dd-server-l2.test.com
dd-server-l3.test.com
dd-server-l4.test.com
split to fields by [ and ] using FS
use split the get the range start (a[1]) and end (a[2])
iterate the range with for and output
There is no checking whether there was a range or not. It could be implemented with something like: print (NF==3 ? $1 i $3 : $1 ).
Worst and ugliest example:
var='app-server-l11[2-5].test.com'
for i in range(int(var[(var.find('[') +1)]), int(var[(var.find("]") - 1)])+1):
print 'app-server-l11' + str(i) + '.test.com'
Use your imagination!
ser_nm = ['app-server-l11[2-5].test.com','server-134[5-8].test.com','dd-server-[2-4].test.com']
for nm in ser_nm:
for i in range(int(nm[nm.find('[')+1 : nm.find('-',(nm.find('[')+1))]), int(nm[nm.find('-',(nm.find('[')+1))+1:nm.find(']') ] )+1):
print(nm[:nm.find('[')] + str(i) + nm[nm.find(']')+1:])
This will also take care of cases where server names are like this:
'server-134[52-823].test.com'
not the best solution, but it works...
inp = open('input.txt', 'r+').read()
print(inp)
result= ''
for i in inp.split('\n'):
if len(i) > 1:
print(repr(i))
f1 = i.find('[')
f2 = i.find(']')+1
b1 = i[:f1]
b2 = i[f2:]
ins = i[f1:f2]
ins = ins[1:-1]
for j in range(int(ins.split("-")[0]),int(ins.split("-")[1])+1):
result+=b1+str(j)+b2+'\n'
outp = open('output.txt', 'w')
outp.write(result)
outp.close()
You can use the below command for the required output without any complex statement.
awk -f test.awk file.txt
test.awk must contains the below lines:
{
if(a=match($0,"\\["))
{
start=strtonum(substr($0,a+1,1));
end=strtonum(substr($0,a+3,1));
copy=$0;
for(i=start;i<=end;i++)
{
sub("\\[[0-9]{1,}-[0-9]{1,}\\]",i,copy);
print copy;
copy = $0;
}
}
else
{
print $0;
}
}
file.txt contains your input file like below lines:
app-server-l11[2-5].test.com
server-l34[5-8].test.com
dd-server-l[2-4].test.com
output:
app-server-l112.test.com
app-server-l113.test.com
app-server-l114.test.com
app-server-l115.test.com
server-l345.test.com
server-l346.test.com
server-l347.test.com
server-l348.test.com
dd-server-l2.test.com
dd-server-l3.test.com
dd-server-l4.test.com
As this sounds like a school assignment I'm going to be fairly vague.
I would use a regular expression to extract the numeric range and the rest of the address components, then use a loop to iterate over the extracted numeric range to build each address (using the other captured address components).
Since it's been over a week:
import re
inputs = [ "app-server-l11[2-5].test.com", "server-l34[5-8].test.com", "dd-server-l[2-4].test.com" ]
pattern = r"\s*(?P<subdomain>[a-zA-Z0-9-_.]+)\[(?P<range_start>\d+)-(?P<range_end>\d+)\](?P<domain>\S+)"
expr = re.compile( pattern )
def expand_domain( domain ):
mo = expr.match( domain )
if mo is not None:
groups = mo.groupdict()
subdomain = groups[ "subdomain" ]
domain = groups[ "domain" ]
range_start = int( groups[ "range_start" ] )
range_end = int( groups[ "range_end" ] )
result = [ "{}{:d}{}".format( subdomain, index, domain ) for index in range( range_start, range_end + 1 ) ]
return result
else:
raise ValueError( "'{}' does not match the expected input.".format( domain ) )
for domain in inputs:
print( "'{}':".format( domain ) )
for exp_dom in expand_domain( domain ):
print( "--> {}".format( exp_dom ) )

insert a variable into an url

I got a CSV file with numbers and I want to insert these numbers into a specific location in an url : jus after " "value": "
Here is my code :
with open('update_cases_id.csv') as p:
for lines in p:
uuid = lines.rstrip()
url_POST = "www.example.com/"
values = {}
values['return_type'] = 'retrieval'
values['format'] = 'TSV'
values['size'] = '70'
values['filters'] = '{"op":"and","content":[{"op":"in","content":{"field":"cases.case_id","value": .format(uuid)}}]}'
data = urllib.urlencode(values)
url_final = url_POST + '?' + data
req2 = urllib2.Request(url_final)
req2.add_header('cookie', cookie)
handle = urllib2.urlopen(req2)
( edited :
example input : 123456-123456-987654
example output : it s data text )
You can do this with string formatting, this should work for you:
# ...snip
values['filters'] = '{"op":"and","content":[{"op":"in","content":{"field":"cases.case_id","value":%s}]}' % uuid
# snip...
The %s will be replaced by the uuid by the % replacement operator:
>>> values = {}
>>> uuid = 1234
>>> values['filters'] = '{"op":"and","content":[{"op":"in","content":{"field":"cases.case_id","value":%s}]}' % uuid
>>> values
{'filters': '{"op":"and","content":[{"op":"in","content":{"field":"cases.case_id","value":1234}]}'}
Try to use Template.
from string import Template
params = Template('{"op":"and","content":[{"op":"in","content":{"field":"cases.case_id","value": ${your_value}}}]}')
params = params.safe_substitute(your_value=123)
# params is '{"op":"and","content":[{"op":"in","content":{"field":"cases.case_id","value":123}]}'

Get correct brace grouping from string

I have files with incorrect JSON that I want to start fixing by getting it into properly grouped chunks.
The brace grouping {{ {} {} } } {{}} {{{}}} should already be correct
How can I grab all the top-level braces, correctly grouped, as separate strings?
If you don't want to install any extra modules simple function will do:
def top_level(s):
depth = 0
start = -1
for i, c in enumerate(s):
if c == '{':
if depth == 0:
start = i
depth += 1
elif c == '}' and depth:
depth -= 1
if depth == 0:
yield s[start:i+1]
print(list(top_level('{{ {} {} } } {{}} {{{}}}')))
Output:
['{{ {} {} } }', '{{}}', '{{{}}}']
It will skip invalid braces but could be easily modified to report an error when they are spotted.
Using the regex module:
In [1]: import regex
In [2]: braces = regex.compile(r"\{(?:[^{}]++|(?R))*\}")
In [3]: braces.findall("{{ {} {} } } {{}} {{{}}}")
Out[3]: ['{{ {} {} } }', '{{}}', '{{{}}}']
pyparsing can be really helpful here. It will handle pathological cases where you have braces inside strings, etc. It might be a little tricky to do all of this work yourself, but fortunately, somebody (the author of the library) has already done the hard stuff for us.... I'll reproduce the code here to prevent link-rot:
# jsonParser.py
#
# Implementation of a simple JSON parser, returning a hierarchical
# ParseResults object support both list- and dict-style data access.
#
# Copyright 2006, by Paul McGuire
#
# Updated 8 Jan 2007 - fixed dict grouping bug, and made elements and
# members optional in array and object collections
#
json_bnf = """
object
{ members }
{}
members
string : value
members , string : value
array
[ elements ]
[]
elements
value
elements , value
value
string
number
object
array
true
false
null
"""
from pyparsing import *
TRUE = Keyword("true").setParseAction( replaceWith(True) )
FALSE = Keyword("false").setParseAction( replaceWith(False) )
NULL = Keyword("null").setParseAction( replaceWith(None) )
jsonString = dblQuotedString.setParseAction( removeQuotes )
jsonNumber = Combine( Optional('-') + ( '0' | Word('123456789',nums) ) +
Optional( '.' + Word(nums) ) +
Optional( Word('eE',exact=1) + Word(nums+'+-',nums) ) )
jsonObject = Forward()
jsonValue = Forward()
jsonElements = delimitedList( jsonValue )
jsonArray = Group(Suppress('[') + Optional(jsonElements) + Suppress(']') )
jsonValue << ( jsonString | jsonNumber | Group(jsonObject) | jsonArray | TRUE | FALSE | NULL )
memberDef = Group( jsonString + Suppress(':') + jsonValue )
jsonMembers = delimitedList( memberDef )
jsonObject << Dict( Suppress('{') + Optional(jsonMembers) + Suppress('}') )
jsonComment = cppStyleComment
jsonObject.ignore( jsonComment )
def convertNumbers(s,l,toks):
n = toks[0]
try:
return int(n)
except ValueError, ve:
return float(n)
jsonNumber.setParseAction( convertNumbers )
Phew! That's a lot ... Now how do we use it? The general strategy here will be to scan the string for matches and then slice those matches out of the original string. Each scan result is a tuple of the form (lex-tokens, start_index, stop_index). For our use, we don't care about the lex-tokens, just the start and stop. We could do: string[result[1], result[2]] and it would work. We can also do string[slice(*result[1:])] -- Take your pick.
results = jsonObject.scanString(testdata)
for result in results:
print '*' * 80
print testdata[slice(*result[1:])]

Categories