I have data in the following format:
id1 id2 value
Something like
1 234 0.2
1 235 0.1
and so on.
I want to convert it to JSON format:
{
    "nodes": [
        {"name": "1"},   # first element
        {"name": "234"}, # second element
        {"name": "235"}  # third element
    ],
    "links": [
        {"source": 1, "target": 2, "value": 0.2},
        {"source": 1, "target": 3, "value": 0.1}
    ]
}
So, going from the original data to the above format: "nodes" holds the set of distinct names present in the original data, and each link's source and target are the positions of those names in the list held by "nodes".
For example:
1 234 0.2
1 is the first element in the list of values held by the key "nodes"
234 is the second element in the list of values held by the key "nodes"
Hence the link dictionary is {"source":1,"target":2,"value":0.2}
How do I do this efficiently in Python? I am sure there must be a better way than what I am doing, which is so messy :(
Here is what I am doing:
from collections import defaultdict

def open_file(filename, output=None):
    f = open(filename, "r")
    offset = 3429
    data_dict = {}
    node_list = []
    node_dict = {}
    link_list = []
    num_lines = 0
    line_ids = []
    for line in f:
        line = line.strip()
        tokens = line.split()
        mod_wid = int(tokens[1]) + offset
        if not node_dict.has_key(tokens[0]):
            d = {"name": tokens[0], "group": 1}
            node_list.append(d)
            node_dict[tokens[0]] = True
            line_ids.append(tokens[0])
        if not node_dict.has_key(mod_wid):
            d = {"name": str(mod_wid), "group": 1}
            node_list.append(d)
            node_dict[mod_wid] = True
            line_ids.append(mod_wid)
        link_d = {"source": line_ids.index(tokens[0]), "target": line_ids.index(mod_wid), "value": tokens[2]}
        link_list.append(link_d)
        if num_lines > 10000:
            break
        num_lines += 1
    data_dict = {"nodes": node_list, "links": link_list}
    print "{\n"
    for k, v in data_dict.items():
        print '"' + k + '"' + ":\n [ \n "
        for each_v in v:
            print each_v, ","
        print "\n],"
    print "}"

open_file("lda_input.tsv")
I'm assuming by "efficiently" you're talking about programmer efficiency—how easy it is to read, maintain, and code the logic—rather than runtime speed efficiency. If you're worried about the latter, you're probably worried for no reason. (But the code below will probably be faster anyway.)
The key to coming up with a better solution is to think more abstractly. Think about rows in a CSV file, not lines in a text file; create a dict that can be rendered in JSON rather than trying to generate JSON via string processing; wrap things up in functions if you want to do them repeatedly; etc. Something like this:
import csv
import json
import sys
def parse(inpath, namedict):
    lastname = [0]
    def lookup_name(name):
        try:
            print('Looking up {} in {}'.format(name, namedict))
            return namedict[name]
        except KeyError:
            lastname[0] += 1
            print('Adding {} as {}'.format(name, lastname[0]))
            namedict[name] = lastname[0]
            return lastname[0]
    with open(inpath) as f:
        reader = csv.reader(f, delimiter=' ', skipinitialspace=True)
        for id1, id2, value in reader:
            yield {'source': lookup_name(id1),
                   'target': lookup_name(id2),
                   'value': value}

for inpath in sys.argv[1:]:
    names = {}
    links = list(parse(inpath, names))
    nodes = [{'name': name} for name in names]
    outpath = inpath + '.json'
    with open(outpath, 'w') as f:
        json.dump({'nodes': nodes, 'links': links}, f, indent=4)
Don't construct the JSON manually. Make it out of an existing Python object with the json module:
def parse(data):
    nodes = set()
    links = set()
    for line in data.split('\n'):
        fields = line.split()
        id1, id2 = map(int, fields[:2])
        value = float(fields[2])
        nodes.update((id1, id2))
        links.add((id1, id2, value))
    return {
        'nodes': [{'name': node} for node in nodes],
        'links': [{'source': link[0],
                   'target': link[1],
                   'value': link[2]} for link in links]
    }
Now, you can use json.dumps to get a string:
>>> import json
>>> data = '1 234 0.2\n1 235 0.1'
>>> parsed = parse(data)
>>> parsed
{'links': [{'source': 1, 'target': 235, 'value': 0.1},
{'source': 1, 'target': 234, 'value': 0.2}],
'nodes': [{'name': 1}, {'name': 234}, {'name': 235}]}
>>> json.dumps(parsed)
'{"nodes": [{"name": 1}, {"name": 234}, {"name": 235}], "links": [{"source": 1, "target": 235, "value": 0.1}, {"source": 1, "target": 234, "value": 0.2}]}'
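One caveat worth noting (an observation, not part of the original answer): sets are unordered, so the nodes and links can come out in a different order on each run. If deterministic output matters, one option is to sort after parsing:

def parse_sorted(data):
    # wrap parse() from above and sort both lists for reproducible output
    parsed = parse(data)
    parsed['nodes'].sort(key=lambda n: n['name'])
    parsed['links'].sort(key=lambda l: (l['source'], l['target']))
    return parsed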
I'm trying to add JSON data to lists.
The JSON looks like this:
[{'genus': 'Musa', 'name': 'Banana', 'id': 1, 'family': 'Musaceae', 'order': 'Zingiberales', 'nutritions': {'carbohydrates': 22, 'protein': 1, 'fat': 0.2, 'calories': 96, 'sugar': 17.2}}]
But with my function, I can only append these fields to the lists:
'genus': 'Musa', 'name': 'Banana', 'id': 1, 'family': 'Musaceae', 'order': 'Zingiberales'
I can't get anything from 'nutritions'.
Here is the code:
import requests
import json

name = []
id = []
family = []
genus = []
order = []
carbohydrates = []
protein = []
fat = []
calories = []
sugar = []

def scrape_all_fruits():
    data_list = []
    try:
        for ID in range(1, 10):
            url = f'https://www.fruityvice.com/api/fruit/{ID}'
            response = requests.get(url)
            data = response.json()
            data_list.append(data)
    except:
        pass
    return data_list

def listify(fruit_stats):
    alist = json.dumps(scrape_all_fruits())
    jsonSTr = json.loads(alist)
    for i in jsonSTr:
        try:
            name.append(i['name'])
            id.append(i['id'])
            family.append(i['family'])
            genus.append(i['genus'])
            order.append(i['order'])
            carbohydrates.append(i['carbohydrates'])
            protein.append(i['protein'])
            # fat.append(i['fat'])
            calories.append(i['calories'])
            sugar.append(i['sugar'])
            for nutrs in i:
                fat.append(nutrs.a['fat'])
        except:
            pass
    return fruit_stats

print(listify(fat))
Can anyone explain to me what I am doing wrong? Thank you in advance.
jsonSTr in your code is a dictionary. By default looping over a dictionary returns its keys.
You can fix this by either looking up via the key you receive:
name.append(jsonSTr[i]["name"])
or by looping over the values:
for i in jsonSTr.values():
If you need both the key and the value, you can use the items() method.
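For the nested 'nutritions' block specifically, a minimal sketch (assuming the response shape shown in the question) is to index into the nested dictionary directly instead of looping over its keys:

# each element of the response list is a dict whose 'nutritions' value
# is itself a dict, so index into it directly
for fruit in jsonSTr:
    nutritions = fruit['nutritions']
    fat.append(nutritions['fat'])
    carbohydrates.append(nutritions['carbohydrates'])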
IMO, your code is too complicated. If you have a nested dictionary, you can flatten it with a special function (example from here).
from collections.abc import MutableMapping
import pandas as pd
import json
def flatten_dict(d: MutableMapping, sep: str = ".") -> MutableMapping:
    [flat_dict] = pd.json_normalize(d, sep=sep).to_dict(orient="records")
    return flat_dict
Data = []
test_response = '[{"genus": "Musa", "name": "Banana", "id": 1, "family": "Musaceae", "order": "Zingiberales", "nutritions": {"carbohydrates": 22, "protein": 1, "fat": 0.2, "calories": 96, "sugar": 17.2}}]'
ResponseJSON = json.loads(test_response)
Data.append(flatten_dict(ResponseJSON[0]))
print(json.dumps(Data, indent=4))
Output:
[
    {
        "genus": "Musa",
        "name": "Banana",
        "id": 1,
        "family": "Musaceae",
        "order": "Zingiberales",
        "nutritions.carbohydrates": 22,
        "nutritions.protein": 1,
        "nutritions.fat": 0.2,
        "nutritions.calories": 96,
        "nutritions.sugar": 17.2
    }
]
What to do next is up to you, but this output structure is well suited to pandas tables, which can be easily manipulated:
Table = pd.DataFrame(Data)
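Individual flattened columns can then be pulled out directly, for example (a small usage sketch):

print(Table["nutritions.fat"])
# 0    0.2
# Name: nutritions.fat, dtype: float64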
Hope that helps!
I have a file with the following structure:
SE|text|Baz
SE|entity|Bla
SE|relation|Bla
SE|relation|Foo

SE|text|Bla
SE|entity|Foo

SE|text|Zoo
SE|relation|Bla
SE|relation|Baz
Records (i.e., blocks) are separated by an empty line. Each line in a block starts with an SE tag. A text tag always occurs in the first line of each block.
I wonder how to properly extract only the blocks that contain a relation tag, which is not necessarily present in every block. My attempt is pasted below:
from itertools import groupby

with open('test.txt') as f:
    for nonempty, group in groupby(f, bool):
        if nonempty:
            process_block() ## ?
The desired output is a JSON dump:
{
    "result": [
        {
            "text": "Baz",
            "relation": ["Bla", "Foo"]
        },
        {
            "text": "Zoo",
            "relation": ["Bla", "Baz"]
        }
    ]
}
I have a proposed solution in pure Python that keeps a block if it contains the value in any position. This could most likely be done more elegantly in a proper framework like pandas.
from pprint import pprint

fname = 'ex.txt'

# extract blocks
with open(fname, 'r') as f:
    blocks = [[]]
    for line in f:
        if len(line) == 1:
            blocks.append([])
        else:
            blocks[-1] += [line.strip().split('|')]

# remove blocks that don't contain 'relation'
blocks = [block for block in blocks
          if any('relation' == x[1] for x in block)]

pprint(blocks)
# [[['SE', 'text', 'Baz'],
#   ['SE', 'entity', 'Bla'],
#   ['SE', 'relation', 'Bla'],
#   ['SE', 'relation', 'Foo']],
#  [['SE', 'text', 'Zoo'], ['SE', 'relation', 'Bla'], ['SE', 'relation', 'Baz']]]

# To export to proper json format the following can be done
import pandas as pd
import json

results = []
for block in blocks:
    df = pd.DataFrame(block)
    json_dict = {}
    json_dict['text'] = list(df[2][df[1] == 'text'])
    json_dict['relation'] = list(df[2][df[1] == 'relation'])
    results.append(json_dict)

print(json.dumps(results))
# '[{"text": ["Baz"], "relation": ["Bla", "Foo"]}, {"text": ["Zoo"], "relation": ["Bla", "Baz"]}]'
Let's go through it:
Read the file into a list of blocks, dividing blocks at blank lines and splitting columns at the | character.
Go through each block in the list and drop any that does not contain relation.
Print the output.
You cannot store the same key twice in a dictionary, as mentioned in the comments.
You can read your file, split it at '\n\n' into blocks, split blocks into lines at '\n', and split lines into data at '|'.
You can then put it into a suitable data structure and parse it into a string using the json module:
Create data file:
with open("f.txt", "w") as f:
    f.write('''SE|text|Baz
SE|entity|Bla
SE|relation|Bla
SE|relation|Foo

SE|text|Bla
SE|entity|Foo

SE|text|Zoo
SE|relation|Bla
SE|relation|Baz''')
Read data and process it:
with open("f.txt") as f:
    all_text = f.read()

as_blocks = all_text.split("\n\n")

# skip SE when splitting and keep only blocks with |relation|
with_relation = [[k.split("|")[1:]
                  for k in b.split("\n")]
                 for b in as_blocks if "|relation|" in b]

print(with_relation)
Create a suitable data structure, grouping repeated keys into a list:
result = []
for inner in with_relation:
    result.append({})
    for k, v in inner:
        # add as simple key
        if k not in result[-1]:
            result[-1][k] = v
        # got the key a 2nd time, turn the value into a list
        elif k in result[-1] and not isinstance(result[-1][k], list):
            result[-1][k] = [result[-1][k], v]
        # got it a 3rd+ time, add to the list
        else:
            result[-1][k].append(v)

print(result)
Create JSON from the data structure:
import json

print(json.dumps({"result": result}, indent=4))
Output:
# with_relation
[[['text', 'Baz'], ['entity', 'Bla'], ['relation', 'Bla'], ['relation', 'Foo']],
[['text', 'Zoo'], ['relation', 'Bla'], ['relation', 'Baz']]]
# result
[{'text': 'Baz', 'entity': 'Bla', 'relation': ['Bla', 'Foo']},
{'text': 'Zoo', 'relation': ['Bla', 'Baz']}]
# json string
{
    "result": [
        {
            "text": "Baz",
            "entity": "Bla",
            "relation": [
                "Bla",
                "Foo"
            ]
        },
        {
            "text": "Zoo",
            "relation": [
                "Bla",
                "Baz"
            ]
        }
    ]
}
In my opinion this is a very good case for a small parser.
This solution uses a PEG parser called parsimonious, but you could totally use another one:
from parsimonious.grammar import Grammar
from parsimonious.nodes import NodeVisitor
import json

data = """
SE|text|Baz
SE|entity|Bla
SE|relation|Bla
SE|relation|Foo

SE|text|Bla
SE|entity|Foo

SE|text|Zoo
SE|relation|Bla
SE|relation|Baz
"""

class TagVisitor(NodeVisitor):
    grammar = Grammar(r"""
        content = (ws / block)+
        block   = line+
        line    = ~".+" nl?
        nl      = ~"[\n\r]"
        ws      = ~"\s+"
    """)

    def generic_visit(self, node, visited_children):
        return visited_children or node

    def visit_content(self, node, visited_children):
        filtered = [child[0] for child in visited_children if isinstance(child[0], dict)]
        return {"result": filtered}

    def visit_block(self, node, visited_children):
        text, relations = None, []
        for child in visited_children:
            if child[1] == "text" and not text:
                text = child[2].strip()
            elif child[1] == "relation":
                relations.append(child[2])
        if relations:
            return {"text": text, "relation": relations}

    def visit_line(self, node, visited_children):
        tag1, tag2, text = node.text.split("|")
        return tag1, tag2, text.strip()

tv = TagVisitor()
result = tv.parse(data)
print(json.dumps(result))
This yields
{"result":
[{"text": "Baz", "relation": ["Bla", "Foo"]},
{"text": "Zoo", "relation": ["Bla", "Baz"]}]
}
The idea is to define a grammar, build an abstract syntax tree out of the input, and return each block's content in a suitable data format.
I'm using Python and a JSON file containing a list of dictionaries like so:
[
    {'name': 'person1', 'id': '123', 'status': 'absent'},
    {'name': 'person2', 'id': '0980', 'status': 'away'},
    {'name': 'person3', 'id': '5235', 'status': 'present'}
]
And I have an incoming dictionary with the same format:
{'name':'person1','id':'324','status':'present'}
The incoming dictionary will have one thing in common with the existing entries: the name key. If the value of the 'name' key hasn't been seen before, I add the dictionary to the JSON file; if it has, I update the values of the id and status keys. I'm having trouble updating the list of dictionaries in the JSON file.
Taking the examples I gave above, the resulting json file should look like this:
[
    {'name': 'person1', 'id': '324', 'status': 'present'},
    {'name': 'person2', 'id': '0980', 'status': 'away'},
    {'name': 'person3', 'id': '5235', 'status': 'present'}
]
I can manage to find the dictionary I want to change with the following:
dict_to_update = next(item for item in <jsonfilename> if item['name'] == 'desired name')
After this, I'm stuck trying to figure out how to then update the specific dictionary in the json file.
Any ideas? Thank you.
Here is how:
import json

with open('file.json', 'r') as r:
    lst = json.load(r)

# dct is the incoming dictionary
for i, d in enumerate(lst):
    if d['name'] == dct['name']:
        lst[i] = dct

with open('file.json', 'w') as f:
    json.dump(lst, f)
You can also use a function:
def update(lst):
    for i, d in enumerate(lst):
        if d['name'] == dct['name']:
            lst[i] = dct
    return lst

with open('file.json', 'r') as r:
    lst = update(json.load(r))

with open('file.json', 'w') as f:
    json.dump(lst, f)
list_of_dict = [
    {"name": "person1", "id": "123", "status": "absent"},
    {"name": "person2", "id": "0980", "status": "away"},
    {"name": "person3", "id": "5235", "status": "present"},
]

incoming_dictionary = {"name": "person1", "id": "324", "status": "present"}

for index, dictionary in enumerate(list_of_dict):
    if incoming_dictionary["name"] == dictionary["name"]:
        # replace the dictionary with the new one
        list_of_dict[index] = incoming_dictionary
        break
else:
    # if no match was found then append the incoming dictionary
    list_of_dict.append(incoming_dictionary)
Here's a function that does so, maybe not as elegantly as other answers:
def process(new_dict):
    global data  # jsonfilename
    if new_dict['name'] in [d['name'] for d in data]:
        data = [d for d in data if d['name'] != new_dict['name']]
    data.append(new_dict)
    data = sorted(data, key=lambda i: i['name'])
Full example:
data = [{'name': 'person1', 'id': '123', 'status': 'absent'},
        {'name': 'person2', 'id': '0980', 'status': 'away'},
        {'name': 'person3', 'id': '5235', 'status': 'present'}]

process({'name': 'person1', 'id': '324', 'status': 'present'})  # an overwritten person
process({'name': 'person4', 'id': '324', 'status': 'present'})  # a new person
Result:
[{'name': 'person1', 'id': '324', 'status': 'present'},
{'name': 'person2', 'id': '0980', 'status': 'away'},
{'name': 'person3', 'id': '5235', 'status': 'present'},
{'name': 'person4', 'id': '324', 'status': 'present'}]
You can do this while avoiding the global keyword as well, but I thought this seemed alright for modifying an existing structure in place.
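A sketch of that global-free variant (my phrasing of it, same logic) passes the list in and returns the updated copy:

def process(data, new_dict):
    # drop any existing entry with the same name, then append the new one
    data = [d for d in data if d['name'] != new_dict['name']]
    data.append(new_dict)
    return sorted(data, key=lambda i: i['name'])

data = process(data, {'name': 'person1', 'id': '324', 'status': 'present'})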
Maybe this can help you; if you find another short solution, please let me know.
I iterate over the list of dictionaries and use a conditional to change the values of the id and status keys.
Another way would be to use filter and map for shorter code (see the sketch below the loop).
datas = [{'name': 'person1', 'id': '123', 'status': 'absent'},
         {'name': 'person2', 'id': '0980', 'status': 'away'},
         {'name': 'person3', 'id': '5235', 'status': 'present'}]

newData = {'name': 'person1', 'id': '324', 'status': 'present'}

for data in datas:
    if data["name"] == newData["name"]:
        data["id"] = newData["id"]
        data["status"] = newData["status"]
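For completeness, a minimal sketch of that filter-based variant (my assumption of what it would look like, not part of the original answer):

# find the first entry with a matching name, if any, and update it in place
match = next(filter(lambda d: d["name"] == newData["name"], datas), None)
if match is not None:
    match.update(newData)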
Thanks for your time and effort, but I think I probably miscommunicated what I wanted; my fault.
Long story short: is there any way to encrypt a certain string, or the whole array?
{
    "gender": "male",
    "phone-number": "1234567890",
    "job": "student",
    "location": {
        "county": "LA-county",
        "town": "sunvalley",
        "country": "USA",
        "apartment-number": "13579abcdefg"
    },
    "item": {
        "item-type": "cloth",
        "item-size": "large",
        "item-number": "xyz24680abc",
        "item-material": "cotton"
    },
    "hairstyle": "long",
    "alive": "true"
}
Let's say that "apartment-number": "13579abcdefg" needs to be encrypted. Can I use Fernet as below?
from cryptography.fernet import Fernet
key = Fernet.generate_key()
f = Fernet(key)
encrypt_value = f.encrypt(b"YourString")
f.decrypt(encrypt_value)
I heard some people mention base64... which method would you recommend when it comes to encrypting certain values?
Here is a solution that will account for nested JSON:
def mask_sensitive(payload, fields, n_front=3, n_back=3):
    out = {}
    for k, v in payload.items():
        # if it's a dict, recurse
        if isinstance(v, dict):
            out[k] = mask_sensitive(v, fields, n_front, n_back)
        # this assumes the field is a string, and not an iterable
        # but you can always add logic to allow ints, lists, etc.
        elif k in fields:
            out[k] = v[:n_front] + "..." + v[-n_back:]
        else:
            out[k] = v
    return out
There are some things you may want to write logic for, like: if the field is fewer than 3 characters long, how do you want to pad the sensitive information? But this gives you a good jumping-off point. Example:
>>> import pprint
>>> pprint.pprint(mask_sensitive(x, ["phone-number", "apartment-number"]))
{'alive': 'true',
'gender': 'male',
'hairstyle': 'long',
'item': {'item-material': 'cotton',
'item-number': 'xyz24680abc',
'item-size': 'large',
'item-type': 'cloth'},
'job': 'student',
'location': {'apartment-number': '135...efg',
'country': 'USA',
'county': 'LA-county',
'town': 'sunvalley'},
'phone-number': '123...890'}
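As one hedged example of that short-value logic (the length threshold and padding policy here are assumptions, not part of the original answer), the masking expression could be pulled into a helper:

def mask_value(v, n_front=3, n_back=3):
    # hide the whole value if it is too short to mask meaningfully
    if len(v) <= n_front + n_back:
        return "*" * len(v)
    return v[:n_front] + "..." + v[-n_back:]

This would replace the v[:n_front] + "..." + v[-n_back:] line inside mask_sensitive.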
This code runs over the JSON and masks the values of the keys listed in values_to_change, using the format specified in the question.
import json

with open('filename.json') as f:
    data = json.load(f)

values_to_change = ["phone-number", "apartment-number", "item-number"]

for k, v in data.items():
    if isinstance(v, str):
        if k in values_to_change:
            data[k] = "{}...{}".format(v[:3], v[-3:])
    elif isinstance(v, dict):
        for kv, vv in v.items():
            if kv in values_to_change:
                data[k][kv] = "{}...{}".format(vv[:3], vv[-3:])

with open('newfilename.json', 'w') as f:
    json.dump(data, f, indent=2)
Output
{'gender': 'male',
'phone-number': '123...890',
'job': 'student',
'location': {'county': 'LA-county',
'town': 'sunvalley',
'country': 'USA',
'apartment-number': '135...efg'},
'item': {'item-type': 'cloth',
'item-size': 'large',
'item-number': 'xyz...abc',
'item-material': 'cotton'},
'hairstyle': 'long',
'alive': 'true'}
Just get the first three characters, three dots, then the last three characters.
def censor(string):
    return string[:3] + "..." + string[-3:]

data["phone-number"] = censor(data["phone-number"])
data["item"]["item-number"] = censor(data["item"]["item-number"])
data["location"]["apartment-number"] = censor(data["location"]["apartment-number"])
Parsing the dict is the important part of this code. The parse_dict method walks all elements of the dict; if any element's value is itself a dict, we recurse into parse_dict to handle the inner dict.
After that we check whether the key is one of ["phone-number", "apartment-number", "item-number"]; if yes, we change its value to the required format.
This way, if we later want to mask any key other than ["phone-number", "apartment-number", "item-number"], we simply append the new key to the list and the code still works.
import json

def parse_dict(data):
    for key, value in data.items():
        if type(value) == dict:
            parse_dict(value)
        if key in ["phone-number", "apartment-number", "item-number"]:
            data[key] = value[:3] + "..." + value[-3:]

def main():
    with open('sample.json') as f:
        data = json.load(f)
    parse_dict(data)
    print("*****: ", data)
    with open('newfilename.json', 'w') as f:
        json.dump(data, f, indent=2)

main()
Using @Clinton Graham's answer, plus some edits (you will get TypeError: unhashable type: 'slice' otherwise), something like this should work:
import json

with open('filename.json') as f:
    data = json.load(f)

def reformat(value):
    return value[:3] + "..." + value[-3:]

data["phone-number"] = reformat(data["phone-number"])
data["item"]["item-number"] = reformat(data["item"]["item-number"])
data["location"]["apartment-number"] = reformat(data["location"]["apartment-number"])

print(data)
I have a text file which I read in. This is a log file so it follows a particular pattern. I need to create a JSON ultimately, but from researching this problem, once it is in a dict it will be a matter of using json.loads() or json.dumps().
A sample of the text file is below.
INFO:20180606_141527:submit:is_test=False
INFO:20180606_141527:submit:username=Mary
INFO:20180606_141527:env:sys.platform=linux2
INFO:20180606_141527:env:os.name=ubuntu
The dict structure which I am ultimately looking for is:
{
    "INFO": {
        "submit": {
            "is_test": false,
            "username": "Mary"
        },
        "env": {
            "sys.platform": "linux2",
            "os.name": "ubuntu"
        }
    }
}
I am ignoring the timestamp information in each list for now.
This is a snippet of the code I am using,
import csv

tree_dict = {}
with open('file.log') as file:
    for row in file:
        for key in reversed(row.split(":")):
            tree_dict = {key: tree_dict}
which results in the undesired output:
{'INFO': {'20180606_141527': {'submit': {'os.name=posix\n': {'INFO': {'20180606_141527': {'submit': {'sys.platform=linux2\n': {'INFO': {'20180606_141527': {'submit': {'username=a227874\n': {'INFO': {'20180606_141527': {'submit': {'is_test=False\n': {}}}}}}}}}}}}}}}}}
I need to dynamically populate the dict because I don't know the actual field/key names.
with open('demo.txt') as f:
    lines = f.readlines()

dct = {}
for line in lines:
    # param1 == INFO
    # param2 == submit or env
    # params3 == is_test=False etc.
    param1, _, param2, params3 = line.strip().split(':')
    # create dct[param1] = {} if it is not created yet
    dct.setdefault(param1, {})
    # create dct[param1][param2] = {} if it is not created yet
    dct[param1].setdefault(param2, {})
    # for example params3 == is_test=False
    # split it by '=' and unpack it:
    # k == is_test
    # v == False
    k, v = params3.split('=')
    # and update our dict with the new values
    dct[param1][param2].update({k: v})

print(dct)
Output
{
    'INFO': {
        'submit': {
            'is_test': 'False', 'username': 'Mary'
        },
        'env': {
            'sys.platform': 'linux2', 'os.name': 'ubuntu'
        }
    }
}
This is one of the rare cases where recursion in Python seems to be appropriate and helpful. The following function adds a value to the hierarchical dictionary d specified by the list of keys:
def add_to_dict(d, keys, value):
    if len(keys) == 1:  # The last key
        d[keys[0]] = value
        return
    if keys[0] not in d:
        d[keys[0]] = {}  # Create a new subdict
    add_to_dict(d[keys[0]], keys[1:], value)
The function works with dictionaries of arbitrary depth. The rest is just a matter of calling the function:
d = {}
for line in file:
    keys, value = line.split("=")
    keys = keys.split(":")
    add_to_dict(d, keys, value.strip())
Result:
{'INFO': {'20180606_141527': {
'submit': {'is_test': 'False',
'username': 'Mary'},
'env': {'sys.platform': 'linux2',
'os.name': 'ubuntu'}}}}
You can modify the code to exclude certain levels (like the timestamp).
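For instance, assuming the timestamp is always the second ':'-separated field, it could be dropped before inserting:

d = {}
for line in file:
    keys, value = line.split("=")
    keys = keys.split(":")
    del keys[1]  # drop the timestamp level
    add_to_dict(d, keys, value.strip())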
You could use a nested collections.defaultdict() here:
from collections import defaultdict
from pprint import pprint

d = defaultdict(lambda: defaultdict(dict))

with open('sample.txt') as in_file:
    for line in in_file:
        info, _, category, pair = line.strip().split(':')
        props, value = pair.split('=')
        d[info][category][props] = value

pprint(d)
Which gives the following:
defaultdict(<function <lambda> at 0x7ff8a341aea0>,
{'INFO': defaultdict(<class 'dict'>,
{'env': {'os.name': 'ubuntu',
'sys.platform': 'linux2'},
'submit': {'is_test': 'False',
'username': 'Mary'}})})
Note: defaultdict() is a subclass of the builtin dict, so there is no reason to convert it to dict in the end result. Additionally, defaultdict() can be serialized to JSON with json.dumps().
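For example, dumping it directly works just like a plain dict (a small usage sketch):

import json
print(json.dumps(d, indent=4))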
You can use itertools.groupby:
import itertools, re

content = [re.split(r'=|:', i.strip('\n')) for i in open('filename.txt')]
new_content = [[a, *c] for a, _, *c in content]

def group_vals(d):
    new_d = [[a, [c for _, *c in b]]
             for a, b in itertools.groupby(sorted(d, key=lambda x: x[0]), key=lambda x: x[0])]
    return {a: b[0][0] if len(b) == 1 else group_vals(b) for a, b in new_d}

import json
print(json.dumps(group_vals(new_content), indent=4))
Output:
{
    "INFO": {
        "env": {
            "os.name": "ubuntu",
            "sys.platform": "linux2"
        },
        "submit": {
            "is_test": "False",
            "username": "Mary"
        }
    }
}
Check for the presence of keys:
import json

tree_dict = {}
with open('file.log') as file:
    for row in file:
        # strip the trailing newline, otherwise the True/False
        # comparison below never matches
        keys = row.strip().split(":")
        if keys[0] not in tree_dict:
            tree_dict[keys[0]] = {}
        if keys[-2] not in tree_dict[keys[0]]:
            tree_dict[keys[0]][keys[-2]] = {}
        key, value = keys[-1].split("=")
        if value == "False":
            value = False
        if value == "True":
            value = True
        tree_dict[keys[0]][keys[-2]][key] = value

dumped = json.dumps(tree_dict)
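For the sample log, dumped would then contain real JSON booleans, roughly:

{"INFO": {"submit": {"is_test": false, "username": "Mary"}, "env": {"sys.platform": "linux2", "os.name": "ubuntu"}}}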
import re
from functools import reduce
from pprint import pprint

# re.findall needs a string, so read the whole file rather than readlines()
with open('file.txt') as f:
    content = f.read()

def rec_merge(d1, d2):
    for k, v in d1.items():
        if k in d2:
            d2[k] = rec_merge(v, d2[k])
    d3 = d1.copy()
    d3.update(d2)
    return d3

lst_of_tup = re.findall(r'^([^:]*):[\d_]+:([^:]*):([^=]*)=(.*)$', content, re.MULTILINE)
lst_of_dct = [reduce(lambda x, y: {y: x}, reversed(t)) for t in lst_of_tup]
dct = reduce(rec_merge, lst_of_dct)

pprint(dct)
# {'INFO': {'env': {'os.name': 'ubuntu', 'sys.platform': 'linux2'},
#           'submit': {'is_test': 'False', 'username': 'Mary'}}}
Source:
with open('file.log') as file:
    is_test = False
    username = ""
    sysplatform = ""
    osname = ""
    for row in file:
        row = row.rstrip('\n')
        for key in reversed(row.split(":")):
            if not key.find('is_test'):
                is_test = key.split('=')[1]
            elif not key.find('username'):
                username = key.split('=')[1]
            elif not key.find('sys.platform'):
                sysplatform = key.split('=')[1]
            elif not key.find('os.name'):
                osname = key.split('=')[1]

tree_dict = {
    "INFO": {
        "submit": {
            "is_test": is_test,
            "username": username
        },
        "env": {
            "sys.platform": sysplatform,
            "os.name": osname
        }
    }
}
print(tree_dict)
Result:
{'INFO': {'submit': {'is_test': 'False', 'username': 'Mary'}, 'env': {'sys.platform': 'linux2', 'os.name': 'ubuntu'}}}