convert string which contains sub string to dictionary - python

I am trying to convert strings that are in a particular format to a Python dictionary.
String format is like below,
st1 = 'key1 key2=value2 key3="key3.1, key3.2=value3.2 , key3.3 = value3.3, key3.4" key4'
I want to parse it and convert to dictionary as below,
dict1 {
key1: None,
key2: value2,
key3: {
key3.1: None,
key3.2: value3.2,
key3.3: value3.3,
key3.4: None
}
key4: None,
I tried to use Python's re package and the string split function, but I was not able to achieve the result. I have thousands of strings in the same format and I am trying to automate the conversion. Could someone help?

If all your strings are consistent, and only have 1 layer of sub dict, this code below should do the trick, you may need to make tweaks/changes to it.
import json
import re

# One top-level entry: a key, optionally followed by `=` and either a
# double-quoted group of sub-items or a single bare value.
_PAIR_RE = re.compile(
    r'(?P<key>[^\s=",]+)'
    r'(?:\s*=\s*(?:"(?P<group>[^"]*)"|(?P<value>[^\s",]+)))?'
)


def _parse_group(group):
    """Turn a comma-separated ``k=v`` list into a flat dictionary."""
    parsed = {}
    for chunk in group.split(','):
        # partition() keeps any further '=' inside the value intact.
        sub_key, has_value, sub_value = chunk.partition('=')
        sub_key = sub_key.strip()
        if not sub_key:
            # Empty chunk (e.g. `""` or a doubled comma): nothing to record.
            continue
        parsed[sub_key] = sub_value.strip() if has_value else None
    return parsed


def parse_pairs(text):
    """Parse ``key``, ``key=value`` and ``key="a, b=c"`` entries into a dict.

    Args:
        text: Whitespace-separated entries. A bare key maps to ``None``, a
            ``key=value`` entry maps to the value string, and a double-quoted
            value is parsed as one level of comma-separated sub-entries.

    Returns:
        A dictionary in the order the keys appear in ``text``. Empty or
        whitespace-only input yields an empty dictionary.
    """
    result = {}
    # Scanning with finditer consumes each entry exactly once, so a key that
    # also occurs as a substring elsewhere is never removed by accident.
    for match in _PAIR_RE.finditer(text):
        group = match.group('group')
        result[match.group('key')] = (
            match.group('value') if group is None else _parse_group(group)
        )
    return result


st1 = 'key1 key2=item2 key3="key3.1, key3.2=item3.2 , key3.3 = item3.3, key3.4" key4'
new_dict = parse_pairs(st1)
print(json.dumps(new_dict, indent=4))

Consider using a parsing tool like lark. A simple example for your case:
# Requires the third-party `lark` package (`from lark import Lark`).
# The whole input is wrapped in double quotes before parsing so that the
# top level is handled by the same `object` rule as a quoted sub-dictionary.
_grammar = r'''
    ?start: value
    ?value: object
          | NON_SEPARATOR_STRING?
    object : "\"" [pair (_SEPARATOR pair)*] "\""
    pair : NON_SEPARATOR_STRING [_PAIRTOR] value
    NON_SEPARATOR_STRING: /[a-zA-Z0-9\.]+/
    _SEPARATOR: /[, ]+/
              | ","
    _PAIRTOR: " = "
            | "="
'''
# NOTE: the character class is `A-Z`, not `A-z`; the latter also matches
# the ASCII punctuation between 'Z' and 'a' ([ \ ] ^ _ `).
parser = Lark(_grammar)
st1 = 'key1 key2=value2 key3="key3.1, key3.2=value3.2 , key3.3 = value3.3, key3.4" key4'
tree = parser.parse(f'"{st1}"')
print(tree.pretty())
"""
object
pair
key1
value
pair
key2
value2
pair
key3
object
pair
key3.1
value
pair
key3.2
value3.2
pair
key3.3
value3.3
pair
key3.4
value
pair
key4
value
"""
Then you can write your own Transformer to transform this tree into your desired data type.

Related

Parsing Erlang data to Python dictionary

I have an erlang script from which I would like to get some data and store it in python dictionary.
It is easy to parse the script to get string like this:
{userdata,
[{tags,
[#dt{number=111},
#mp{id='X23.W'}]},
{log,
'LG22'},
{instruction,
"String that can contain characters like -, _ or numbers"}
]
}.
desired result:
userdata = {"tags": {"dt": {"number": 111}, "mp": {"id": "X23.W"}},
"log": "LG22",
"instruction": "String that can contain characters like -, _ or numbers"}
# "#" mark for data in "tags" is not required in this structure.
# Also value for "tags" can be any iterable structure: tuple, list or dictionary.
But I am not sure how to transfer this data into a python dictionary. My first idea was to use json.loads but it requires many modifications (putting words into quotes marks, replacing "," with ":" and many more).
Moreover, keys in userdata are not limited to some pool. In this case, there are 'tags', 'log' and 'instruction', but there can be many more eg. 'slogan', 'ids', etc.
Also, I am not sure about the order. I assume that the keys can appear in random order.
My code (it is not working for id='X23.W' so I removed '.' from input):
import re
import json

in_ = """{userdata, [{tags, [#dt{number=111}, #mp{id='X23W'}]}, {log, 'LG22'}, {instruction, "String that can contain characters like -, _ or numbers"}]}"""

# Drop the outer `{userdata, [` prefix and the trailing `]}`.
buff = in_.replace("{userdata, [", "")[:-2]
# Append a colon to every `#name` so it can later serve as a JSON key.
re_helper = re.compile(r"(#\w+)")
buff = re_helper.sub(r'\1:', buff)
# Quote bare words only in the text before "instruction"; the free-text
# string that follows it must not have its words quoted.
partition = buff.partition("instruction")
section_to_replace = partition[0]
replacer = re.compile(r"(\w+)")
match = replacer.sub(r'"\1"', section_to_replace)
buff = ''.join([match, '"instruction"', partition[2]])
# Textual rewrites towards JSON syntax; their order matters.
buff = buff.replace("#", "")
buff = buff.replace('",', '":')
# Put each top-level `{...}` entry on its own line so it can be split.
buff = buff.replace("}, {", "}, \n{")
buff = buff.replace("=", ":")
buff = buff.replace("'", "")
temp = buff.split("\n")
userdata = {}
# First entry (tags): strip the trailing ", " and turn the list brackets
# into braces so the `"name": {...}` pairs form a JSON object.
buff = temp[0][:-2]
buff = buff.replace("[", "{")
buff = buff.replace("]", "}")
userdata.update(json.loads(buff))
# Remaining entries are single-pair JSON objects, one per line.
for v in temp[1:]:
    v = v.strip()
    if v.endswith(","):
        v = v[:-1]
    userdata.update(json.loads(v))
print(userdata)
Output:
{'tags': {'dt': {'number': '111'}, 'mp': {'id': 'X23W'}}, 'instruction': 'String that can contain characters like -, _ or numbers', 'log': 'LG22'}
import json
import re

in_ = """{userdata, [{tags, [#dt{number=111}, #mp{id='X23.W'}]}, {log, 'LG22'}, {instruction, "String that can contain characters like -, _ or numbers"}]}"""

# Regex rewrites applied in order, each one moving the Erlang term a step
# closer to JSON:
#   1. `{name,`   -> `{"name":`   (quote tuple headers)
#   2. `[#...]`   -> `{#...}`     (list of records becomes an object)
#   3. `#name`    -> `"name":`    (record names become keys)
#   4. `{field=`  -> `{"field":`  (record fields become keys)
_rewrites = (
    (r"\{(\w+),", r'{"\1":'),
    (r"\[(#.*?)\]", r'{\1}'),
    (r'#(\w+)', r'"\1":'),
    (r'{(\w+)=', r'{"\1":'),
)

json_text = in_
for pattern, replacement in _rewrites:
    json_text = re.sub(pattern, replacement, json_text)

# JSON only accepts double-quoted strings.
result = json.loads(json_text.replace('\'', '"'))
print(result)
Produces:
{'userdata': [{'tags': {'dt': {'number': 111}, 'mp': {'id': 'X23.W'}}}, {'log': 'LG22'}, {'instruction': 'String that can contain characters like -, _ or numbers'}]}

Convert a nested dictionary into a string

I am trying to achieve the following:
Input = {
'key1':{'key11':'val11', 'key12':'val12'},
'key2':{'key21':{'key211':'val211', 'key212':'val212'}},
'key3':'val3',
'key4':{'key41':'val41', 'key42':'val42'}
}
Output =
1) When value is a dictionary, then create an output string variable =
key1 { key11 val11 key12 val12 } key2 { key21 { key211 val211 key212 val212 }} key4 { key41 val41 key42 val42}
2) When value is not a dictionary, then print "just a string element"
Below is my code:
from collections import defaultdict
def stringBuilder(dictionary):
stringOption = ""
innerString = ""
# print dictionary
for key, value in dictionary.iteritems():
if isinstance(value, dict):
stringBuilder(value)
else:
innerString = innerString + " " + str(key) + " " + str(value)
print innerString
stringOption = "{" + innerString + " }"
print stringOption
return stringOption
d = {'key1':{'key11':'val11', 'key12':'val12'}, 'key2':{'key21':{'key211':'val211', 'key212':'val212'}}, 'key3':'val3', 'key4':{'key41':'val41', 'key42':'val42'}}
print d
stringOption = ""
for key, value in d.iteritems():
if isinstance(value, dict):
stringOption = stringOption + " " + str(key) + " " + stringBuilder(value)
print stringOption
else:
print "just a string element"
print stringOption
Here is the output that I get:
{'key2': {'key21': {'key211': 'val211', 'key212': 'val212'}}}
key211 val211
{ key211 val211 }
key2
key2
Your main problem is that you discard the returned string of your dict case:
if isinstance(value, dict):
stringBuilder(value)
Instead, try saving the value to pass back up the line:
if isinstance(value, dict):
stringOption = stringBuilder(value)
Resulting output:
key2 { key211 val211 key212 val212 } key1 { key12 val12 key11 val11 } key4 { key41 val41 key42 val42 }
Rather than writing your own implementation (unless that's what you want) why not use the json library? You can extend JSONEncoder and JSONDecoder to suit your use case?
A great recursion exercise!
def rec(x):
    """Render a nested dictionary as a brace-delimited string.

    Args:
        x: Either a dictionary (whose values are rendered recursively) or a
            leaf value.

    Returns:
        For a dictionary, ``'{k1 v1 k2 v2}'`` with each value rendered by a
        recursive call; for any other value, ``str(x)``.
    """
    # Anything that is not a dict is a leaf; str() also covers non-string
    # leaves such as numbers, which would otherwise fail on `.items()`.
    if not isinstance(x, dict):
        return str(x)
    return '{' + ' '.join(f'{k} {rec(v)}' for k, v in x.items()) + '}'
Input = {
'key1':{'key11':'val11', 'key12':'val12'},
'key2':{'key21':{'key211':'val211', 'key212':'val212'}},
'key3':'val3',
'key4':{'key41':'val41', 'key42':'val42'}
}
Output:
In [6]: rec(Input)
Out[6]: '{key1 {key11 val11 key12 val12} key2 {key21 {key211 val211 key212 val212}} key3 val3 key4 {key41 val41 key42 val42}}'

Python parsing an ugly configuration file

I have an application generating a weird config file
app_id1 {
key1 = val
key2 = val
...
}
app_id2 {
key1 = val
key2 = val
...
}
...
And I am struggling on how to parse this in python. The keys of each app may vary too.
I can't change the application to generate the configuration file in some easily parsable format :)
Any suggestions on how to do this pythonically ?
I am thinking along the lines of dict of dict
conf = {'app_id1': {'key1' : 'val', 'key2' : 'val'},
'app_id2' : {'key1' : 'val', 'key2' : 'val'}
}
Try something like this:
I assumed you read the content of the file to a string
def parse_config(text):
    """Parse ``name { key = value ... }`` sections into a dict of dicts.

    Args:
        text: Full contents of the configuration file.

    Returns:
        ``{section_name: {key: value}}`` with keys and values stripped of
        surrounding whitespace. Lines inside a section that contain no ``=``
        (blank lines, ``...`` placeholders) are ignored.
    """
    parsed = {}
    current = None
    for raw_line in text.splitlines():
        line = raw_line.strip()
        if line.endswith('{'):
            # Section header: the first word before the brace is the name.
            words = line[:-1].split()
            current = parsed.setdefault(words[0] if words else '', {})
        elif line.startswith('}'):
            current = None
        elif current is not None and '=' in line:
            # partition() keeps any further '=' as part of the value.
            key, _, value = line.partition('=')
            current[key.strip()] = value.strip()
    return parsed


config_file_string = '''app_id1 {
key1 = val
key2 = val
key3 = val
}
app_id2 {
key1 = val
key2 = val
}'''
config = parse_config(config_file_string)
print(config)
This returns:
{'app_id1': {'key1': 'val', 'key2': 'val', 'key3': 'val'}, 'app_id2': {'key1': 'val', 'key2': 'val'}}
You could use regex: (\w+)\s*\{([^}]*) will find a name { values } construct, and ([^\s=]+)\s*=\s*([^\n]*) will find key = value pairs.
As a one-liner, assuming the contents of the file are in the variable s:
# The outer pattern yields (section name, section body) pairs; the inner
# pattern yields the (key, value) pairs found in each body.
config = {}
for section_name, section_body in re.findall(r'(\w+)\s*\{([^}]*)', s):
    config[section_name] = dict(re.findall(r'([^\s=]+)\s*=\s*([^\n]*)', section_body))
You can use pyparsing for less strict grammar:
from pyparsing import alphanums, restOfLine, OneOrMore, Word, Suppress
from copy import copy
# Build one Suppress element per punctuation character: "{", "}" and "=".
# (Suppress is expected to match the token but leave it out of the results.)
lbrace,rbrace,eq = map(Suppress,"{}=")
# configitem collects the key/value pairs of the group being parsed;
# configall maps each group name to a copy of those pairs.
configitem = {}
configall = {}
# A word made of alphanumerics and underscores: group names and keys.
wd = Word(alphanums+'_')
# `key = <rest of line>`; the parse action stores the stripped value.
kw = wd + eq + restOfLine
kw.setParseAction(lambda x: configitem.__setitem__(x[0],x[1].strip()))
# `name { key = value ... }`
group = wd + lbrace + OneOrMore(kw) + rbrace
# First action: snapshot the collected pairs under the group name. The copy
# is needed because the second action empties configitem for the next group.
group.addParseAction(lambda x: configall.__setitem__(x[0],copy(configitem)))
group.addParseAction(lambda x: configitem.clear())
config = OneOrMore(group)
# Sample input; note that the first group has its "{" on a separate line.
config_file_string = '''app_id1
{
key1 = val
key2 = val
key3 = val
}
app_id2 {
key1 = val
key2 = val
}'''
# The return value is discarded; the result is built up in configall by
# the parse actions above.
config.parseString(config_file_string)
print(configall)

python generating nested dictionary key error

I am trying to create a nested dictionary from a mysql query but I am getting a key error
result = {}
for i, q in enumerate(query):
result['data'][i]['firstName'] = q.first_name
result['data'][i]['lastName'] = q.last_name
result['data'][i]['email'] = q.email
error
KeyError: 'data'
desired result
result = {
'data': {
0: {'firstName': ''...}
1: {'firstName': ''...}
2: {'firstName': ''...}
}
}
You wanted to create a nested dictionary
result = {} will create an assignment for a flat dictionary, whose items can have any values like "string", "int", "list" or "dict"
For this flat assignment
python knows what to do for result["first"]
If you want "first" to be another dictionary as well, you need to tell Python so with an assignment:
result['first'] = {}.
otherwise, Python raises "KeyError"
I think you are looking for this :)
>>> from collections import defaultdict
>>> mydict = lambda: defaultdict(mydict)
>>> result = mydict()
>>> result['Python']['rules']['the world'] = "Yes I Agree"
>>> result['Python']['rules']['the world']
'Yes I Agree'
result = {}
result['data'] = {}
for i, q in enumerate(query):
    # Create the per-row dictionary under the loop index itself. Using the
    # string literal 'i' here would leave result['data'][i] missing and the
    # assignments below would raise KeyError.
    result['data'][i] = {}
    result['data'][i]['firstName'] = q.first_name
    result['data'][i]['lastName'] = q.last_name
    result['data'][i]['email'] = q.email
Alternatively, you can use your own class, which adds the extra dicts automatically:
class AutoDict(dict):
    """Dictionary that creates nested dictionaries on first access.

    Looking up a missing key stores and returns a new, empty instance of the
    same class, so ``d['a']['b']['c'] = 1`` works without creating the
    intermediate levels by hand. Membership tests (``in``) and ``get()`` do
    not create keys.
    """

    def __missing__(self, k):
        """Create, store and return an empty child for the missing key ``k``."""
        # type(self) rather than AutoDict, so subclasses nest their own type.
        child = self[k] = type(self)()
        return child
result = AutoDict()
for i, q in enumerate(query):
result['data'][i]['firstName'] = q.first_name
result['data'][i]['lastName'] = q.last_name
result['data'][i]['email'] = q.email
result['data'] does not exist yet, so you cannot add data to it.
Try this out at the start:
result = {'data': []};
You have to create the key data first:
# Build the 'data' level and every per-row dictionary explicitly, so that
# no lookup ever hits a key that has not been created yet.
result = {
    'data': {
        i: {
            'firstName': q.first_name,
            'lastName': q.last_name,
            'email': q.email,
        }
        for i, q in enumerate(query)
    }
}

Parsing and sorting keys in Python dictionary

I created the following dictionary:
code_dictionary = {u'News; comment; negative': u'contradictory about news', u'News; comment': u'something about news'}
I now want to write some Python code that goes through the dictionary's keys and separates out the codes and their corresponding values. So for the first element in the dictionary, I want to end up with:
News: 'contradictory about news', 'something about news'
comment: 'contradictory about news', 'something about news'
negative: 'contradictory about news'
The end result can be a dictionary, list, or tab or comma-separated text.
You can see my attempt to do this here:
from bs4 import BeautifulSoup as Soup
f = open('transcript.xml','r')
soup = Soup(f)
#print soup.prettify()
#searches text for all w:commentrangestart tags and makes a dictionary that matches ids with text
textdict = {}
for i in soup.find_all('w:commentrangestart'):
# variable 'key' is assigned to the tag id
key = i.parent.contents[1].attrs['w:id']
key = str(key)
#variable 'value' is assigned to the tag's text
value= ''.join(i.nextSibling.findAll(text=True))
# key / value pairs are added to the dictionary 'textdict'
textdict[key]=value
print "Transcript Text = " , textdict
# makes a dictionary that matches ids with codes
codedict = {}
for i in soup.find_all('w:comment'):
key = i.attrs['w:id']
key = str(key)
value= ''.join(i.findAll(text=True))
codedict[key]=value
print "Codes = ", codedict
# makes a dictionary that matches all codes with text
output = {}
for key in set(textdict.keys()).union(codedict.keys()):
print "key= ", key
txt = textdict[key]
print "txt = ", txt
ct = codedict[key]
print "ct= ", ct
output[ct] = txt
#print "output = ", output
print "All code dictionary = ", output
#codelist={}
#for key in output:
# codelist =key.split(";")
#print "codelist= " , codelist
code_negative = {}
code_news = {}
print output.keys()
for i in output:
if 'negative' in output.keys():
print 'yay'
code_negative[i]=textdict[i]
print 'text coded negative: ' , code_negative
if 'News' in i:
code_news[i]=textdict[i]
print 'text coded News: ' ,code_news
For some reason though, I keep getting a key error when I run the last function:
code_negative = {}
code_news = {}
for i in output:
if 'negative' in output.keys():
code_negative[i]=textdict[i]
print 'text coded negative: ' , code_negative
if 'News' in i:
code_news[i]=textdict[i]
print 'text coded News: ' ,code_news
Any ideas? Thanks!
The following code should work, if I understood the problem correctly:
from collections import defaultdict

# Map each individual code to the list of texts tagged with it.
out = defaultdict(list)
# items() works on both Python 2 and 3; viewitems() exists only on Python 2.
for k, v in code_dictionary.items():
    # A key such as 'News; comment; negative' carries several codes.
    for item in k.split('; '):
        out[item].append(v)
output = {u'News; comment; negative': u'contradictory about news', u'News; comment': u'something about news'}
negatives = []
comments = []
news = []
for k, v in output.items():
    # Split the compound key into its codes. Stripping and lower-casing each
    # one makes the match independent of spacing after ';' and of letter case.
    key_parts = {part.strip().lower() for part in k.split(';')}
    if 'negative' in key_parts:
        negatives.append(v)
    if 'news' in key_parts:
        news.append(v)
    if 'comment' in key_parts:
        comments.append(v)

Categories