Reading un-even JSON with Python - python

I have a program that's taking a LARGE JSON file and reading through the structure, grabbing everything where the key matches something, then storing a number of items from that structure into the database. The problem is that sometimes the structure is off when there is only one item... so as follows:
"stats": {
"first": [
{
"name": "Name1",
"context": "open",
"number": "139"
},
{
"name": "Name2",
"context": "opener",
"number": "135"
}
],
"second": {
"name": "Name1",
"context": "opener",
"amount": "1.5",
"number": "-125"
},
"third": [
{
"name": "Name1",
"context": "open",
"amount": "8.5",
"number": "-110"
},
{
"name": "Name2",
"context": "open",
"amount": "9.0",
"number": "-120"
}
]
}
},
So, you'll notice that second only has one entry, so it's structured differently... I've tried more conditionals than I can think of... how do I check if it's a single entry and move forward? This is probably REALLY simple, I'm just at a loss and not the best at Python data structures (admittedly).
What I'm doing after is grabbing, e.g., third[0]['name'] and putting it into a database... so I get an index error when I try on that second node. Also - in some nodes, second WILL have more than one... in others it won't... totally depends on the record.

I would first parse it to a JSON, and then update the dictionary you describe that has keys like "first", "second", etc. as follows:
def repair_dict(d):
    """Normalize *d* in place so every value is a list.

    Values that are already lists are left untouched; anything else
    (typically a lone dict for a single-entry key) is wrapped in a
    one-element list, so callers can always index like ``d[k][0]``.
    """
    for key, value in list(d.items()):
        if not isinstance(value, list):
            d[key] = [value]
It thus repairs the data like:
>>> d = json.loads(data)
>>> d
{'stats': {'third': [{'context': 'open', 'name': 'Name1', 'number': '-110', 'amount': '8.5'}, {'context': 'open', 'name': 'Name2', 'number': '-120', 'amount': '9.0'}], 'second': {'context': 'opener', 'name': 'Name1', 'number': '-125', 'amount': '1.5'}, 'first': [{'context': 'open', 'name': 'Name1', 'number': '139'}, {'context': 'opener', 'name': 'Name2', 'number': '135'}]}}
>>> repair_dict(d['stats'])
>>> d
{'stats': {'third': [{'context': 'open', 'name': 'Name1', 'number': '-110', 'amount': '8.5'}, {'context': 'open', 'name': 'Name2', 'number': '-120', 'amount': '9.0'}], 'second': [{'context': 'opener', 'name': 'Name1', 'number': '-125', 'amount': '1.5'}], 'first': [{'context': 'open', 'name': 'Name1', 'number': '139'}, {'context': 'opener', 'name': 'Name2', 'number': '135'}]}}
Or when pretty printing:
>>> pprint.pprint(d)
{'stats': {'first': [{'context': 'open', 'name': 'Name1', 'number': '139'},
{'context': 'opener', 'name': 'Name2', 'number': '135'}],
'second': [{'amount': '1.5',
'context': 'opener',
'name': 'Name1',
'number': '-125'}],
'third': [{'amount': '8.5',
'context': 'open',
'name': 'Name1',
'number': '-110'},
{'amount': '9.0',
'context': 'open',
'name': 'Name2',
'number': '-120'}]}}

Related

How to groupby columns by value and make json from them? Python3 Pandas

I have a dataset containing all the professors in Turkey. I need to change the shape of this data structure, but I couldn't find a solution. In this data, there is information about the university, faculty, department and title of approximately 44 thousand academicians.
[ { "name": "XX", "title": "PROFESÖR", "university": "GEBZE TEKNİK ÜNİVERSİTESİ", "faculty": "MÜHENDİSLİK FAKÜLTESİ", "department": "BİLGİSAYAR MÜHENDİSLİĞİ BÖLÜMÜ" }, { "name": "YY", "title": "PROFESÖR", "university": "GEBZE TEKNİK ÜNİVERSİTESİ", "faculty": "MÜHENDİSLİK FAKÜLTESİ", "department": "ELEKTRONİK MÜHENDİSLİĞİ BÖLÜMÜ" } ]
I have 44000 yields as above and I want to process them. For example, there are nearly 200 universities, I want to separate them.
{ "universities": [ { "id": 1, "name": "GEBZE TEKNİK ÜNİVERSİTESİ", "faculties": [ { "id" : 1, "name": "MÜHENDİSLİK FAKÜLTESİ", "departments" : [ { "id" : 1, "name" : "ELEKTRONİK MÜHENDİSLİĞİ BÖLÜMÜ", "academicians" : [ { "id" : 1, "name":"AA", "title" : "PROFESÖR" }, { "id" : 2, "name":"BB", "title" : "PROFESÖR" }, { "id" : 3, "name":"CC", "title" : "PROFESÖR" } ] }, { "id" : 2, "name" : "HARİTA MÜHENDİSLİĞİ BÖLÜMÜ", "academicians" : [ { "id" : 1, "name":"DD", "title" : "PROFESÖR" }, { "id" : 2, "name":"EE", "title" : "PROFESÖR" } ] } ] } ] } ] }
I want it as in the above format but I couldn't get it done. Can anyone help?
1.) get json datas
# Desired target structure, kept as a single-quoted JSON-ish string template
# (parsed later after swapping quotes).
js_output = """{'universities': [{'id': 1,
'name': 'GEBZE TEKNİK ÜNİVERSİTESİ',
'faculties': [{'id': 1,
'name': 'MÜHENDİSLİK FAKÜLTESİ',
'departments': [{'id': 1,
'name': 'ELEKTRONİK MÜHENDİSLİĞİ BÖLÜMÜ',
'academicians': [{'id': 1, 'name': 'AA', 'title': 'PROFESÖR'},
{'id': 2, 'name': 'BB', 'title': 'PROFESÖR'},
{'id': 3, 'name': 'CC', 'title': 'PROFESÖR'}]},
{'id': 2,
'name': 'HARİTA MÜHENDİSLİĞİ BÖLÜMÜ',
'academicians': [{'id': 1, 'name': 'DD', 'title': 'PROFESÖR'},
{'id': 2, 'name': 'EE', 'title': 'PROFESÖR'}]}]}]}]}"""
# Flat input records in their original shape (one dict per academician).
js_input = """[{'name': 'XX',
'title': 'PROFESÖR',
'university': 'GEBZE TEKNİK ÜNİVERSİTESİ',
'faculty': 'MÜHENDİSLİK FAKÜLTESİ',
'department': 'BİLGİSAYAR MÜHENDİSLİĞİ BÖLÜMÜ'},
{'name': 'YY',
'title': 'PROFESÖR',
'university': 'GEBZE TEKNİK ÜNİVERSİTESİ',
'faculty': 'MÜHENDİSLİK FAKÜLTESİ',
'department': 'ELEKTRONİK MÜHENDİSLİĞİ BÖLÜMÜ'}]"""
2.) set json normalize
# get record_path with json keys and get frame list
df_list = []


def get_frames(l, j):
    """Normalize *j* once per prefix of the key path *l*.

    One frame is appended to the module-level ``df_list`` for each path
    prefix: l[:1], l[:2], ..., up to the full path, so every nesting
    level ends up as its own flat DataFrame.

    l: ordered record-path keys, e.g. ["universities", "faculties", ...]
    j: parsed JSON document
    """
    # range(1, len+1) over prefix lengths instead of range(len) + i+1
    for depth in range(1, len(l) + 1):
        df_list.append(pd.json_normalize(j, l[:depth]))
# Record path from the outermost to the innermost JSON key.
records = ["universities", "faculties", "departments", "academicians"]
# The sample uses single quotes; swap to double quotes so json.loads can parse it.
jdo = json.loads(js_output.replace("'",'"'))
get_frames(records, jdo)
3.) concatenate all frames
# Concatenate all frames side by side. Collect the pieces first and call
# pd.concat once, instead of growing `con` inside a loop (repeated concat
# copies the whole frame on every iteration). For every frame except the
# last, the final column still holds the nested records (opened by the next
# deeper frame), so it is dropped; the last frame is already fully flat.
pieces = [df.iloc[:, :-1] for df in df_list[:-1]]
pieces.append(df_list[-1])
con = pd.concat(pieces, axis=1)
4.) drop na because of example frame is output template
# The frame was built from the example *output* template, so some rows are
# partial; keep only the complete rows.
df = con.dropna().copy()
5.) design columns and match input keys for next concatenates
# Rename the normalized columns to flat names; the last five match the input
# record keys so later concatenations align on column names.
df.columns = [
"uni_id",
"university",
"faculty_id",
"faculty",
"department_id",
"department",
"aca_id",
"name",
"title"
]
6.) refix id sections and join input frame with template
def input_join_to_get_desired_template(jdi, template=None):
    """Merge the input records into the template frame and renumber all ids.

    jdi: list of flat dicts with name/title/university/faculty/department keys
    template: optional base DataFrame to append to; defaults to the
        module-level ``df`` template (backward compatible with the old
        one-argument call).
    Returns the combined DataFrame with refreshed id columns.
    """
    base = df if template is None else template
    jdf = pd.DataFrame(jdi)
    con_df = pd.concat([base, jdf], ignore_index=True, sort=False)
    # enumerate ids with unique counts: each unique value gets its 1-based
    # first-appearance position. A dict + .map replaces the original
    # list.index call, which was O(n) per row (O(n^2) overall).
    # NOTE(review): assumes the name columns contain no NaN — true for the
    # dropna'd template and complete input records; confirm for new sources.
    for id_col, name_col in (("uni_id", "university"),
                             ("faculty_id", "faculty"),
                             ("department_id", "department")):
        positions = {v: i + 1 for i, v in enumerate(con_df[name_col].unique())}
        con_df[id_col] = con_df[name_col].map(positions)
    # set academician indexes: running 1..k count within each department
    # (aca_id is reset to 1 first, so cumsum numbers the rows per group)
    group_cols = ["uni_id", "faculty_id", "department_id", "aca_id"]
    con_df["aca_id"] = 1
    con_df["aca_id"] = con_df.groupby(group_cols)["aca_id"].cumsum()
    return con_df
# Parse the sample input (single -> double quotes for valid JSON) and merge it
# into the template; the bare `result_df` displays the frame in a notebook.
jd_input = json.loads(js_input.replace("'",'"'))
result_df = input_join_to_get_desired_template(jd_input)
result_df
7.) get other inputs and test
# Additional test input covering new universities/faculties and repeated
# names, to exercise the id renumbering.
js_input_test = """[{'name': 'hl',
'title': 'doc',
'university': 'GEBZE ÜNİVERSİTESİ',
'faculty': 'FAKÜLTESİ',
'department': 'BİLGİSAYAR BÖLÜMÜ'},
{'name': 'hl',
'title': 'PROFESÖR',
'university': 'GEBZE ÜNİVERSİTESİ',
'faculty': 'FAKÜLTESİ',
'department': 'BİLGİSAYAR BÖLÜMÜ'},
{'name': 'yz',
'title': 'yrddoc',
'university': 'TEKNİK ÜNİVERSİTESİ',
'faculty': 'other',
'department': 'BİLGİSAYAR MÜHENDİSLİĞİ BÖLÜMÜ'},
{'name': 'zz',
'title': 'doc',
'university': 'TEKNİK ÜNİVERSİTESİ',
'faculty': 'other',
'department': 'ELEKTRONİK MÜHENDİSLİĞİ BÖLÜMÜ'},
{'name': 'hl',
'title': 'PROFESÖR',
'university': 'TEKNİK ÜNİVERSİTESİ',
'faculty': 'other',
'department': 'BİLGİSAYAR MÜHENDİSLİĞİ BÖLÜMÜ'},
{'name': 'abc',
'title': 'prof',
'university': 'TEKNİK ÜNİVERSİTESİ',
'faculty': 'other',
'department': 'HARİTA MÜHENDİSLİĞİ BÖLÜMÜ'},
{'name': 'aaa',
'title': 'PROFESÖR',
'university': 'ÜNİVERSİTESİ',
'faculty': 'MÜHENDİSLİK',
'department': 'BİLGİSAYAR BÖLÜMÜ'},
{'name': 'bbb',
'title': 'PROFESÖR',
'university': 'ÜNİVERSİTESİ',
'faculty': 'MÜHENDİSLİK',
'department': 'BİLGİSAYAR BÖLÜMÜ'},
{'name': 'ccc',
'title': 'PROFESÖR',
'university': 'ÜNİVERSİTESİ',
'faculty': 'MÜHENDİSLİK',
'department': 'BİLGİSAYAR BÖLÜMÜ'},
{'name': 'ddd',
'title': 'PROFESÖR',
'university': 'ÜNİVERSİTESİ',
'faculty': 'MÜHENDİSLİK FAKÜLTESİ',
'department': 'ELEKTRONİK MÜHENDİSLİĞİ BÖLÜMÜ'}]"""
8.) and get results
# Same pipeline as before, on the second test input.
jd_input = json.loads(js_input_test.replace("'",'"'))
result_df = input_join_to_get_desired_template(jd_input)
result_df

Python: Change a JSON value

Let's say I have the following JSON file named output.
{'fields': [{'name': 2, 'type': 'Int32'},
{'name': 12, 'type': 'string'},
{'name': 9, 'type': 'datetimeoffset'},
}],
'type': 'struct'}
If the type key has a value datetimeoffset, I would like to change it to dateTime, and if the type key has a value Int32, I would like to change it to integer — and like this, I have multiple values to replace.
The expected output is
{'fields': [{ 'name': 2, 'type': 'integer'},
{ 'name': 12, 'type': 'string'},
{ 'name': 9, 'type': 'dateTime'},
,
}],
'type': 'struct'}
Can anyone help with this in Python?
You can try this out:
# Map of old type names to their replacements.
substitute = {"Int32": "integer", "datetimeoffset": "dateTime"}
x = {'fields': [
        {'name': 2, 'type': 'Int32'},
        {'name': 12, 'type': 'string'},
        {'name': 9, 'type': 'datetimeoffset'}
    ], 'type': 'struct'}
# Iterate the field dicts directly (no index bookkeeping needed) and rewrite
# each mapped type in place; types without a mapping are left untouched.
for field in x['fields']:
    if field["type"] in substitute:
        field['type'] = substitute[field['type']]
print(x)
You can use the following code. Include in equivalences dict the values you want to replace:
# NOTE(review): this top-level name shadows the stdlib `json` module; kept
# as-is for compatibility with the original snippet, but prefer e.g. `schema`.
json = {
    'fields': [
        {'name': 2, 'type': 'Int32'},
        {'name': 12, 'type': 'string'},
        {'name': 9, 'type': 'datetimeoffset'},
    ],
    'type': 'struct'
}
# Include in the equivalences dict the values you want to replace.
equivalences = {"datetimeoffset": "dateTime", "Int32": "integer"}
# Replace values based on equivalences dict, mutating each field dict in
# place (no enumerate/index needed; `in equivalences` already checks keys).
for field in json["fields"]:
    if field["type"] in equivalences:
        field["type"] = equivalences[field["type"]]
print(json)
The output is:
{
"fields": [
{
"name": 2,
"type": "integer"
},
{
"name": 12,
"type": "string"
},
{
"name": 9,
"type": "dateTime"
}
],
"type": "struct"
}
simple but ugly way:
# Quick-and-dirty approach: serialize, do plain text substitution, parse back.
# Caveat: the substitution hits the substrings anywhere in the document
# (keys and values alike), so only safe when the names cannot collide.
json_ = {'fields': [{'name': 2, 'type': 'Int32'},
                    {'name': 12, 'type': 'string'},
                    {'name': 9, 'type': 'datetimeoffset'}], 'type': 'struct'}
serialized = json.dumps(json_)
for old, new in (("datetimeoffset", "dateTime"), ("Int32", "integer")):
    serialized = serialized.replace(old, new)
result = json.loads(serialized)

Merge two list of dictionary based on key values

I have two list of dictionary:
a =[{ 'id': "1", 'date': "2017-01-24" },{ 'id': "2", 'date': "2018-01-24" },{ 'id': "3", 'date': "2019-01-24" }]
b =[{ 'id': "1", 'name': "abc" },{ 'id': "2",'name': "xyz"},{ 'id': "4",'name': "ijk"}]
I want to merge these dictionaries based on id and the result should be:
[{ 'id': "1", 'date': "2017-01-24",'name': "abc" },{ 'id': "2", 'date': "2018-01-24",'name': "xyz" },{ 'id': "3", 'date': "2019-01-24" },{ 'id': "4",'name': "ijk"}]
How can I do this without iterating in python?
since the dicts are stored in list, you'll either have to iterate or use a vectorized approach such as pandas.... for example:
import pandas as pd

a = [{ 'id': "1", 'date': "2017-01-24" },{ 'id': "2", 'date': "2018-01-24" }]
b = [{ 'id': "1", 'name': "abc" },{ 'id': "2",'name': "xyz"}]
df1 = pd.DataFrame(a)
df2 = pd.DataFrame(b)
# Inner-join on 'id', then back to one dict per row. Use the full
# orient name 'records': the 'r' abbreviation was deprecated in pandas 1.0
# and removed in 2.0, so the original `.to_dict('r')` raises on modern pandas.
out = df1.merge(df2, on='id').to_dict('records')
result:
[{'id': '1', 'date': '2017-01-24', 'name': 'abc'}, {'id': '2', 'date': '2018-01-24', 'name': 'xyz'}]
without testing I'm not sure how this compares speed-wise to just simply iterating. It may take long to iterate, but pandas also has to construct the dataframe and convert output to dict so there's a tradeoff

How do i append a dictionary to a JSON file in python?

I have a JSON looks like this:
{'data': [], 'directed': False, 'multigraph': False, 'elements': {'nodes': [{'data': {'id': 'B2', 'value': 'B2', 'name': 'B2'}}, {'data': {'id': 'SCHROEDER PLZ', 'value': 'SCHROEDER PLZ', 'name': 'SCHROEDER PLZ'}}, {'data': {'id': 'D4', 'value': 'D4', 'name': 'D4'}}, {'data': {'id': 'BLAB PLZ', 'value': 'BLAB PLZ', 'name': 'BLAB PLZ'}}], 'edges': [{'data': {'source': 'B2', 'target': 'SCHROEDER PLZ'}}, {'data': {'source': 'D4', 'target': 'BLAB PLZ'}}]}}
The JSON is a result of the "loads" in my code:
import pandas as pd
import networkx as nx
import json
df= pd.read_csv('.../graph.csv')
g = nx.from_pandas_edgelist(df, source='DISTRICT', target='STREET')
x = nx.cytoscape_data(g)
dump = json.dumps(x)
loads = json.loads(dump)
And this is my csv file structure: The first record is the field name.
OFFENSE_DESCRIPTION,DISTRICT,DAY_OF_WEEK,STREET,INCIDENT_NUMBER,size
INVESTIGATE PERSON,B2,Thursday,SCHROEDER PLZ,854652314,10
INVESTIGATE PERSON,D4,Friday,BLAB PLZ,457856954,3
I want to append "size" values located in my csv file.
In fact, the result must be like the below JSON. in the 'nodes' tags, in the 'data' i want to add 'size' field value.
{'data': [], 'directed': False, 'multigraph': False, 'elements': {'nodes': [{'data': {'id': 'B2', 'value': 'B2', 'name': 'B2','size':10}}, {'data': {'id': 'SCHROEDER PLZ', 'value': 'SCHROEDER PLZ', 'name': 'SCHROEDER PLZ','size':10}}, {'data': {'id': 'D4', 'value': 'D4', 'name': 'D4','size':3}}, {'data': {'id': 'BLAB PLZ', 'value': 'BLAB PLZ', 'name': 'BLAB PLZ','size':3}}], 'edges': [{'data': {'source': 'B2', 'target': 'SCHROEDER PLZ'}}, {'data': {'source': 'D4', 'target': 'BLAB PLZ'}}]}}
An elegant solution is to update node attributes in networkx rather than the output dict. Use nx.set_node_attributes:
# Build the graph from the CSV and attach each node's 'size' before exporting.
df = pd.read_csv('.../graph.csv')
# name -> size lookup covering both edge endpoints: districts and streets
# alike become graph nodes, so both columns are paired with 'size'.
# (duplicate names keep the last row's size, per dict construction)
size = dict(df[['DISTRICT', 'size']].values.tolist()
+ df[['STREET', 'size']].values.tolist())
g = nx.from_pandas_edgelist(df, source='DISTRICT', target='STREET')
# Store the lookup as a per-node 'size' attribute so cytoscape_data emits
# it inside each node's 'data' dict.
nx.set_node_attributes(g, size, 'size')
x = nx.cytoscape_data(g)
>>> print(json.dumps(x['elements']['nodes'], indent=4))
[
{
"data": {
"size": 10,
"id": "B2",
"value": "B2",
"name": "B2"
}
},
{
"data": {
"size": 10,
"id": "SCHROEDER PLZ",
"value": "SCHROEDER PLZ",
"name": "SCHROEDER PLZ"
}
},
{
"data": {
"size": 3,
"id": "D4",
"value": "D4",
"name": "D4"
}
},
{
"data": {
"size": 3,
"id": "BLAB PLZ",
"value": "BLAB PLZ",
"name": "BLAB PLZ"
}
}
]

How to get the count for a particular key in the dictionary

My content inside a dictionary is below
I need to now for BusinessArea how many different name key is there, like this need to know Designation also
test=
[ { 'masterid': '1', 'name': 'Group1', 'BusinessArea': [ { 'id': '14', 'name': 'Accounting', 'parentname': 'Finance'}, { 'id': '3', 'name': 'Research', 'parentname': 'R & D' } ], 'Designation': [ { 'id': '16', 'name': 'L1' }, { 'id': '20', 'name': 'L2' }, { 'id': '25', 'name': 'L2' }] },
{ 'masterid': '2', 'name': 'Group1', 'BusinessArea': [ { 'id': '14', 'name': 'Research', 'parentname': '' }, { 'id': '3', 'name': 'Accounting', 'parentname': '' } ], 'Role': [ { 'id': '5032', 'name': 'Tester' }, { 'id': '5033', 'name': 'Developer' } ], 'Designation': [ { 'id': '16', 'name': 'L1' }, { 'id': '20', 'name': 'L2' }, { 'id': '25', 'name': 'L2' }]},
{ 'masterid': '3', 'name': 'Group1', 'BusinessArea': [ { 'id': '14', 'name': 'Engineering' }, { 'id': '3', 'name': 'Engineering', 'parentname': '' } ], 'Role': [ { 'id': '5032', 'name': 'Developer' }, { 'id': '5033', 'name': 'Developer', 'parentname': '' } ], 'Designation': [ { 'id': '16', 'name': 'L1' }, { 'id': '20', 'name': 'L2' }, { 'id': '25', 'name': 'L2' }]}]
I want to get the count of masterid of BusinessArea and Designation which is all the names
Expected out is below
[
{
"name": "BusinessArea",
"values": [
{
"name": "Accounting",
"count": "2"
},
{
"name": "Research",
"count": "2"
},
{
"name": "Engineering",
"count": "1"
}
]
},
{
"name": "Designation",
"values": [
{
"name": "L1",
"count": "3"
},
{
"name": "l2",
"count": "3"
}
]
}
]
Try this:
# Build the two result buckets, then tally names from the flattened lists.
# NOTE(review): this counts every occurrence, not one per masterid, which is
# why the printed counts differ from the asker's expected output.
res = [{'name': 'BusinessArea', 'values': []}, {'name': 'Designation', 'values': []}]
listbus = [entry for rec in test for entry in rec['BusinessArea']]
listdes = [entry for rec in test for entry in rec['Designation']]
res[0]['values'] = [{'name': nm, 'count': 0} for nm in {e['name'] for e in listbus}]
res[1]['values'] = [{'name': nm, 'count': 0} for nm in {e['name'] for e in listdes}]
for entries, bucket in ((listbus, res[0]), (listdes, res[1])):
    for entry in entries:
        for candidate in bucket['values']:
            if candidate['name'] == entry['name']:
                candidate['count'] += 1
>>> print(res)
[{'name': 'BusinessArea', 'values': [{'name': 'Accounting', 'count': 2}, {'name': 'Research', 'count': 2}, {'name': 'Engineering', 'count': 2}]}, {'name': 'Designation', 'values': [{'name': 'L1', 'count': 3}, {'name': 'L2', 'count': 6}]}]
You could count unique names using a nested collections.defaultdict:
from collections import defaultdict
from json import dumps

keys = ["BusinessArea", "Designation"]
# key -> name -> number of groups the name appears in
group_counts = defaultdict(lambda: defaultdict(int))
for group in test:
    for key in keys:
        # dict.fromkeys de-duplicates the names within one group while
        # preserving first-appearance order, so each counts at most once.
        for name in dict.fromkeys(item["name"] for item in group[key]):
            group_counts[key][name] += 1
print(dumps(group_counts, indent=2))
Which will give you these counts:
{
"BusinessArea": {
"Accounting": 2,
"Research": 2,
"Engineering": 1
},
"Designation": {
"L1": 3,
"L2": 3
}
}
Then you could modify the result to get the list of dicts you expect:
# Reshape the nested counts into the requested list-of-dicts layout.
result = []
for key, counts in group_counts.items():
    values = [{"name": val, "count": cnt} for val, cnt in counts.items()]
    result.append({"name": key, "values": values})
print(dumps(result, indent=2))
Which gives you this:
[
{
"name": "BusinessArea",
"values": [
{
"name": "Accounting",
"count": 2
},
{
"name": "Research",
"count": 2
},
{
"name": "Engineering",
"count": 1
}
]
},
{
"name": "Designation",
"values": [
{
"name": "L1",
"count": 3
},
{
"name": "L2",
"count": 3
}
]
}
]

Categories