Delete from JSON nested dictionary in list value by iteration - python

I have a JSON file 'json_HW.json' with the following structure:
{
    "news": [
        {
            "content": "Prices on gasoline have soared on 40%",
            "city": "Minsk",
            "news_date_and_time": "21/03/2022"
        },
        {
            "content": "European shares fall on weak earnings",
            "city": "Minsk",
            "news_date_and_time": "19/03/2022"
        }
    ],
    "ad": [
        {
            "content": "Rent a flat in the center of Brest for a month",
            "city": "Brest",
            "days": 15,
            "ad_start_date": "15/03/2022"
        },
        {
            "content": "Sell a bookshelf",
            "city": "Mogilev",
            "days": 7,
            "ad_start_date": "20/03/2022"
        }
    ],
    "coupon": [
        {
            "content": "BIG sales up to 50%!",
            "city": "Grodno",
            "days": 5,
            "shop": "Marko",
            "coupon_start_date": "17/03/2022"
        }
    ]
}
I need to delete each field name and field value as I reach them, until all the information in the file is deleted. When there is no information left in the file, I need to delete the file itself.
The code I have:
data = json.load(open('json_HW.json'))
for category, posts in data.items():
    for post in posts:
        for field_name, field_value in post.items():
            del field_name, field_value
print(data)
But the variable data doesn't change when I delete, so del doesn't work. If it did, I could rewrite my JSON.

You are deleting the key and the value after extracting them from the dictionary; that doesn't affect the dictionary itself. What you should do is delete the dictionary entry:
import json
import os

file_name = 'json_HW.json'
data = json.load(open(file_name))
for category in list(data.keys()):
    posts = data[category]
    elem_indices = []
    for idx, post in enumerate(posts):
        for field_name in list(post.keys()):
            del post[field_name]
        if not post:
            elem_indices.insert(0, idx)  # so you get reverse order
    for idx in elem_indices:
        del posts[idx]
    if not posts:
        del data[category]
print(data)
if not data:
    print('deleting', file_name)
    os.unlink(file_name)
which gives:
{}
deleting json_HW.json
Note that the list() calls are necessary: post.keys() returns a dict view, not a plain list, and
you cannot change the size of a dict while you are iterating over its keys (or items or values).
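As a minimal illustration of why the copy matters (a toy dict, not data from the question):
d = {'a': 1, 'b': 2}
# for k in d.keys():      # RuntimeError: dictionary changed size during iteration
#     del d[k]
for k in list(d.keys()):  # iterate over a snapshot of the keys instead
    del d[k]
print(d)  # {}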

If you want to delete a key-value pair from a dictionary, you can use del post[key].
But it doesn't work during iteration, because the dictionary size keeps changing.
https://www.geeksforgeeks.org/python-ways-to-remove-a-key-from-dictionary/
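If the goal is simply to empty each dictionary rather than delete keys one by one, a sketch using dict.clear() sidesteps the iteration problem entirely (data as loaded in the question):
for category, posts in data.items():
    for post in posts:
        post.clear()  # removes every entry without iterating over the dict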

Related

How to merge non-fixed key json multilines into one json abstractly

I have a heavy JSON file that has 30M entries, like this:
{"id":3,"price":"231","type":"Y","location":"NY"}
{"id":4,"price":"321","type":"N","city":"BR"}
{"id":5,"price":"354","type":"Y","city":"XE","location":"CP"}
--snip--
{"id":30373779,"price":"121","type":"N","city":"SR","location":"IU"}
{"id":30373780,"price":"432","type":"Y","location":"TB"}
{"id":30373780,"price":"562","type":"N","city":"CQ"}
How can I extract only the location and the city and parse them into one JSON like this in Python:
{
    "orders":{
        3:{
            "location":"NY"
        },
        4:{
            "city":"BR"
        },
        5:{
            "city":"XE",
            "location":"CP"
        },
        30373779:{
            "city":"SR",
            "location":"IU"
        },
        30373780:{
            "location":"TB"
        },
        30373780:{
            "city":"CQ"
        }
    }
}
P.S.: beautifying the syntax is not necessary.
Assuming your input file is actually in JSON Lines format, you can read each line, extract the city and location keys from the dict, and then add those to a new dict:
import json
from collections import defaultdict

orders = {'orders': defaultdict(dict)}

with open('orders.txt', 'r') as f:
    for line in f:
        o = json.loads(line)
        id = o['id']
        if 'location' in o:
            orders['orders'][id]['location'] = o['location']
        if 'city' in o:
            orders['orders'][id]['city'] = o['city']

print(orders)
Output for your sample data (note it has two 30373780 id values, so the values get merged into one dict):
{
    "orders": {
        "3": {
            "location": "NY"
        },
        "4": {
            "city": "BR"
        },
        "5": {
            "location": "CP",
            "city": "XE"
        },
        "30373779": {
            "location": "IU",
            "city": "SR"
        },
        "30373780": {
            "location": "TB",
            "city": "CQ"
        }
    }
}
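If you also want to persist the merged result to disk (out.json is a hypothetical name), a minimal follow-up:
with open('out.json', 'w') as f:
    json.dump(orders, f, indent=4)  # json.dump converts the integer ids to string keys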
As you've said your file is pretty big and you probably don't want to keep all entries in memory, here is a way to consume the source file line by line and write output immediately:
import json

with open(r"in.jsonp") as i_f, open(r"out.json", "w") as o_f:
    o_f.write('{"orders":{')
    for i in i_f:
        i_obj = json.loads(i)
        o_f.write(f'{i_obj["id"]}:')
        o_obj = {}
        if location := i_obj.get("location"):
            o_obj["location"] = location
        if city := i_obj.get("city"):
            o_obj["city"] = city
        json.dump(o_obj, o_f)
        o_f.write(",")
    o_f.write('}}')
It will generate a semi-valid JSON object in the same format you've provided in your question (the unquoted numeric keys and trailing comma mean strict parsers will reject it).
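If you need strictly valid JSON while keeping the streaming behaviour, a sketch (same hypothetical file names) that quotes the keys and avoids the trailing comma:
import json

with open(r"in.jsonp") as i_f, open(r"out.json", "w") as o_f:
    o_f.write('{"orders":{')
    first = True
    for line in i_f:
        i_obj = json.loads(line)
        o_obj = {}
        if location := i_obj.get("location"):
            o_obj["location"] = location
        if city := i_obj.get("city"):
            o_obj["city"] = city
        if not first:
            o_f.write(',')  # separator goes before each entry, so none trails
        first = False
        o_f.write(f'"{i_obj["id"]}":')  # quoted key keeps the output parseable
        json.dump(o_obj, o_f)
    o_f.write('}}')
Note that duplicate ids (such as 30373780) will still produce duplicate keys; most parsers keep the last one.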

Convert Nested JSON list API data into CSV using PYTHON

I want to convert sample JSON data into a CSV file using Python. I am retrieving the JSON data from an API.
As my JSON has nested objects, it cannot normally be converted directly to CSV. I don't want to do any hard coding, and I want to make the Python code fully dynamic.
So, I have written a function that flattens my JSON data, but I am not able to work out how to iterate over all records, find the relevant column names, and then output the data into a CSV.
In the sample JSON file I have mentioned only 2 records, but in reality there are 100 records.
The sample JSON looks like this:
[
    {
        "id":"Random_Company_57",
        "unid":"75",
        "fieldsToValues":{
            "Email":"None",
            "occupation":"SO1 Change",
            "manager":"None",
            "First Name":"Bells",
            "employeeID":"21011.0",
            "loginRequired":"true",
            "superUser":"false",
            "ldapSuperUser":"false",
            "archived":"true",
            "password":"None",
            "externalUser":"false",
            "Username":"Random_Company_57",
            "affiliation":"",
            "Phone":"+16 22 22 222",
            "unidDominoKey":"",
            "externalUserActive":"false",
            "secondaryOccupation":"SO1 Change",
            "retypePassword":"None",
            "Last Name":"Christmas"
        },
        "hierarchyFieldAccess":[],
        "userHierarchies":[
            {
                "hierarchyField":"Company",
                "value":"ABC Company"
            },
            {
                "hierarchyField":"Department",
                "value":"gfds"
            },
            {
                "hierarchyField":"Project",
                "value":"JKL-SDFGHJW"
            },
            {
                "hierarchyField":"Division",
                "value":"Silver RC"
            },
            {
                "hierarchyField":"Site",
                "value":"SQ06"
            }
        ],
        "locale":{
            "id":1,
            "dateFormat":"dd/MM/yyyy",
            "languageTag":"en-UA"
        },
        "roles":[
            "User"
        ],
        "readAccessRoles":[],
        "preferredLanguage":"en-AU",
        "prefName":"Christmas Bells",
        "startDate":"None",
        "firstName":"Bells",
        "lastName":"Christmas",
        "fullName":"Christmas Bells",
        "lastModified":"2022-02-22T03:47:41.632Z",
        "email":"None",
        "docNo":"None",
        "virtualSuperUser":false
    },
    {
        "id":"xyz.abc#safe.net",
        "unid":"98",
        "fieldsToValues":{
            "Email":"xyz.abc#safe.net",
            "occupation":"SO1 Change",
            "manager":"None",
            "First Name":"Bells",
            "employeeID":"21011.0",
            "loginRequired":"false",
            "superUser":"false",
            "ldapSuperUser":"false",
            "archived":"false",
            "password":"None",
            "externalUser":"false",
            "Username":"xyz.abc#safe.net",
            "affiliation":"",
            "Phone":"+16 2222 222 222",
            "unidDominoKey":"",
            "externalUserActive":"false",
            "secondaryOccupation":"SO1 Change",
            "retypePassword":"None",
            "Last Name":"Christmas"
        },
        "hierarchyFieldAccess":[],
        "userHierarchies":[
            {
                "hierarchyField":"Company",
                "value":"ABC Company"
            },
            {
                "hierarchyField":"Department",
                "value":"PUHJ"
            },
            {
                "hierarchyField":"Project",
                "value":"RPOJ-SDFGHJW"
            },
            {
                "hierarchyField":"Division",
                "value":"Silver RC"
            },
            {
                "hierarchyField":"Site",
                "value":"SQ06"
            }
        ],
        "locale":{
            "id":1,
            "dateFormat":"dd/MM/yyyy",
            "languageTag":"en-UA"
        },
        "roles":[
            "User"
        ],
        "readAccessRoles":[],
        "preferredLanguage":"en-AU",
        "prefName":"Christmas Bells",
        "startDate":"None",
        "firstName":"Bells",
        "lastName":"Christmas",
        "fullName":"Christmas Bells",
        "lastModified":"2022-03-16T05:04:13.085Z",
        "email":"xyz.abc#safe.net",
        "docNo":"None",
        "virtualSuperUser":false
    }
]
What I have tried:
def flattenjson(b, delim):
    val = {}
    for i in b.keys():
        if isinstance(b[i], dict):
            get = flattenjson(b[i], delim)
            for j in get.keys():
                val[i + delim + j] = get[j]
        else:
            val[i] = b[i]
    print(val)
    return val

json = [{Sample JSON String that mentioned above}]
flattenjson(json, "__")
I don't know whether this is the right way to deal with this problem or not.
My final aim is for all of the above JSON data to be output to a CSV file.
Based on this answer, you could loop through your list of JSON data and flatten each JSON with the given function (do they always have the same structure?), then build a DataFrame and write the data to CSV. That's the easiest way I can think of;
try this:
import pandas as pd
import json
import collections.abc

def flatten(dictionary, parent_key=False, separator='__'):
    items = []
    for key, value in dictionary.items():
        new_key = str(parent_key) + separator + key if parent_key else key
        # collections.MutableMapping was removed in Python 3.10; use collections.abc
        if isinstance(value, collections.abc.MutableMapping):
            items.extend(flatten(value, new_key, separator).items())
        elif isinstance(value, list):
            for k, v in enumerate(value):
                items.extend(flatten({str(k): v}, new_key).items())
        else:
            items.append((new_key, value))
    return dict(items)

with open('your_json.json') as f:
    data = json.load(f)  # data is the example you provided (a list of dicts)

all_records = []
for jsn in data:
    tmp = flatten(jsn)
    all_records.append(tmp)

df = pd.DataFrame(all_records)
out = df.to_csv('json_to_csv.csv')
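As an aside, pandas also ships a built-in flattener that may cover this case; a sketch assuming data is the same list of dicts (note json_normalize flattens nested dicts, but list-valued fields such as userHierarchies stay as Python lists in a single column unless you pass record_path):
df = pd.json_normalize(data, sep='__')
df.to_csv('json_to_csv.csv', index=False)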

Append to an array inside a JSON object based on key in python

I have some JSON I'm looping through in the following format. I need to create an object for each unique primary key found in the source data and append it to an array. I'm not sure how to create the object on the first encounter of the key and append to it on the next encounter. My initial attempt just creates a new object for each object in the source. I wasn't able to find an example in Python, only JS.
Source data format:
[
    ...
    {
        "Id": "NOT NEEDED DATA",
        "Client": {
            "Id": "KEY",
            "Name": "NOT NEEDED DATA"
        },
        "Name": "DESIRED DATAPOINT"
    },
    ...
]
Desired format:
[
    ...
    {
        "client_id": "KEY",
        "locations": ["DATA", "DATA"]
    }
    ...
]
Pseudocode:
for i in sourcedata:
    client_id = i['Client']['Id']
    location_name = i['Name']
    obj = {
        "client_id": client_id,
        "locations": [location_name]
    }
    new_array.append(obj)
You can first iterate and build a dictionary, and then create a list of dictionaries as specified in your output format.
from collections import defaultdict

# create and populate the dictionary
d = defaultdict(list)
for i in sourcedata:
    client_id = i['Client']['Id']
    location_name = i['Name']
    d[client_id].append(location_name)

# build the result
res = [{"client_id": k, "locations": v} for k, v in d.items()]
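A quick usage sketch with two hypothetical source records sharing a key:
sourcedata = [
    {"Id": "x1", "Client": {"Id": "KEY", "Name": "n/a"}, "Name": "Location A"},
    {"Id": "x2", "Client": {"Id": "KEY", "Name": "n/a"}, "Name": "Location B"},
]
# after running the loop above:
# res == [{'client_id': 'KEY', 'locations': ['Location A', 'Location B']}]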

How to convert nested JSON data to CSV using python?

I have a file consisting of an array containing over 5000 objects. However, I am having trouble converting one particular part of my JSON file into the appropriate columns in CSV format.
Below is an example version of my data file:
{
    "Result": {
        "Example 1": {
            "Type1": [
                {
                    "Owner": "Name1 Example",
                    "Description": "Description1 Example",
                    "Email": "example1_email#email.com",
                    "Phone": "(123) 456-7890"
                }
            ]
        },
        "Example 2": {
            "Type1": [
                {
                    "Owner": "Name2 Example",
                    "Description": "Description2 Example",
                    "Email": "example2_email#email.com",
                    "Phone": "(111) 222-3333"
                }
            ]
        }
    }
}
Here is my current code:
import csv
import json

json_file = 'example.json'

with open(json_file, 'r') as json_data:
    x = json.load(json_data)

f = csv.writer(open("example.csv", "w"))
f.writerow(["Address", "Type", "Owner", "Description", "Email", "Phone"])

for key in x["Result"]:
    type = "Type1"
    f.writerow([key,
                type,
                x["Result"][key]["Type1"]["Owner"],
                x["Result"][key]["Type1"]["Description"],
                x["Result"][key]["Type1"]["Email"],
                x["Result"][key]["Type1"]["Phone"]])
My problem is that I'm encountering this issue:
Traceback (most recent call last):
  File "./convert.py", line 18, in <module>
    x["Result"][key]["Type1"]["Owner"],
TypeError: list indices must be integers or slices, not str
When I try to substitute the last key, such as "Owner", with an integer index, I receive this error: IndexError: list index out of range.
When I strictly change the f.writerow call to
f.writerow([key,
            type,
            x["Result"][key]["Type1"]])
I receive the results in a column, but it merges everything into one column, which makes sense. Picture of the output: https://imgur.com/a/JpDkaAT
I would like the results to be separated based on the label into individual columns instead of being merged into one. Could anyone assist?
Thank you!
Type1 in your data structure is a list, not a dict, so you need to iterate over it instead of referencing it by key.
for key in x["Result"]:
    # key is now "Example 1" etc.
    type1 = x["Result"][key]["Type1"]
    # type1 is a list, not a dict, so take each item in turn
    for i in type1:
        f.writerow([key,
                    "Type1",
                    i["Owner"],
                    i["Description"],
                    i["Email"],
                    i["Phone"]])
The inner for loop ensures that you're protected from the assumption that "Type1" only ever has one item in the list.
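A sketch of the same idea using csv.DictWriter, which maps columns by name instead of position (x is the parsed JSON from the question; the inner dict keys happen to match the CSV header here):
import csv

with open("example.csv", "w", newline="") as out:
    writer = csv.DictWriter(out, fieldnames=["Address", "Type", "Owner", "Description", "Email", "Phone"])
    writer.writeheader()
    for key, entry in x["Result"].items():
        for item in entry["Type1"]:
            # merge the address/type columns with the inner dict's fields
            writer.writerow({"Address": key, "Type": "Type1", **item})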
It's definitely not the best example, but I'm too sleepy to optimize it.
import csv

def json_to_csv(obj, res):
    for k, v in obj.items():
        if isinstance(v, dict):
            res.append(k)
            json_to_csv(v, res)
        elif isinstance(v, list):
            res.append(k)
            for el in v:
                json_to_csv(el, res)
        else:
            res.append(v)

obj = {
    "Result": {
        "Example 1": {
            "Type1": [
                {
                    "Owner": "Name1 Example",
                    "Description": "Description1 Example",
                    "Email": "example1_email#email.com",
                    "Phone": "(123) 456-7890"
                }
            ]
        },
        "Example 2": {
            "Type1": [
                {
                    "Owner": "Name2 Example",
                    "Description": "Description2 Example",
                    "Email": "example2_email#email.com",
                    "Phone": "(111) 222-3333"
                }
            ]
        }
    }
}

with open("out.csv", "w+") as f:
    writer = csv.writer(f)
    writer.writerow(["Address", "Type", "Owner", "Description", "Email", "Phone"])
    for k, v in obj["Result"].items():
        row = [k]
        json_to_csv(v, row)
        writer.writerow(row)
Figured it out!
I changed the f.writerow function to the following:
for key in x["Result"]:
    type = "Type1"
    f.writerow([key,
                type,
                x["Result"][key]["Type1"][0]["Owner"],
                x["Result"][key]["Type1"][0]["Email"]])
...
This allowed me to reference the keys within the object. Hopefully this helps someone down the line!

Python Recursively Maintain Keyed Depth

Input/Goal
My input data is an OrderedDict, which can contain a variable depth of nested OrderedDicts, so I have opted to parse this output recursively. The desired output is a CSV with a header.
Elaboration of Problem
My code below will work once I am able to correctly define field_name upon traversing back up a branch after completing all of a branch's leaves (i.e. Type_1.Field_3.Data will incorrectly be named Type_1.Field_2.Field_3.Data).
Once the leaves on a branch have been exhausted, I want to remove the last .Field_x from the field_name so that a new (correct) one can be added for the following object.
Request for Help
Does anyone see where I can include this feature? Thanks,
...
Dependencies:
Code Snippet:
import re  # needed for the regex helpers below

def get_soql_fields(soql):
    soql_fields = re.search('(?<=select)(?s)(.*)(?=from)', soql)  # get fields
    soql_fields = re.sub(' ', '', soql_fields.group())  # remove extra spaces
    fields = re.split(',|\n|\r', soql_fields)  # split on commas and newlines
    fields = [field for field in fields if field != '']  # remove empty strings
    return fields

def parse_output(data, soql):
    fields = get_soql_fields(soql)
    header = fields
    master = [header]
    for record in data['records']:  # for each 'record' in response
        row = []
        for obj, value in record.iteritems():  # for each obj in record
            if isinstance(value, basestring):  # if query base object has desired fields
                if obj in fields:
                    row.append(value)
            elif isinstance(value, dict):  # traverse down into object
                path = obj
                row.append(_traverse_output(obj, value, fields, row, path))
        master.append(row)
    return master

def _traverse_output(obj, value, fields, row, path):
    for f, v in value.iteritems():  # for each item in obj
        if not isinstance(v, (dict, list, tuple)):
            field_name = '{path}.{name}'.format(path=path, name=f)  # TODO fix this to full field name
            print('FName: {0}'.format(field_name))
            if field_name in fields:
                print('match')
                row.append(v)
        elif isinstance(v, dict):  # it is a dict
            path += '.{obj}'.format(obj=f)
            _traverse_output(f, v, fields, row, path)
Example Salesforce SOQL:
select
    Type_1.Field_1,
    Type_1.Field_2.Data,
    Type_1.Field_3,
    Type_1.Field_4,
    Type_1.Field_5.Data_1.Data,
    Type_1.Field_6,
    Type_2.Field_1,
    Type_2.Field_2
from
    Obj_1
limit
    1
;
Example Salesforce Output:
{
    "records": [
        {
            "attributes": {
                "type": "Obj_1",
                "url": "<url>"
            },
            "Type_1": {
                "attributes": {
                    "type": "Type_1",
                    "url": "<url>"
                },
                "Field_1": "<stuff>",
                "Field_2": {
                    "attributes": {
                        "type": "Field_2",
                        "url": "<url>"
                    },
                    "Data": "<data>"
                },
                "Field_3": "<data>",
                "Field_4": "<data>",
                "Field_5": {
                    "attributes": {
                        "type": "Field_2",
                        "url": "<url>"
                    },
                    "Data_1": {
                        "attributes": {
                            "type": "Data_1",
                            "url": "<url>"
                        },
                        "Data": "<data>"
                    }
                },
                "Field_6": 1.0
            },
            "Type_2": {
                "attributes": {
                    "type": "Type_2",
                    "url": "<url>"
                },
                "Field_1": "<data>",
                "Field_2": "<data>"
            }
        }
    ]
}
I worked out a quick solution for this. I'll just note what I figured out, and append the code I wrote to the end.
Essentially your problem is that you keep trying to modify path in place, which isn't going to work. Instead, do something like:
new_path = path + '.{obj}'.format(obj=f)
_traverse_output(f, v, fields, row, new_path)
A note about this: it will NOT necessarily result in a row where the values are in the same order as the header (i.e., if Type_1.Field_1 is in position 0 of the header list, then the value corresponding to it might not be).
The easy way to solve this (and handle csvs in general) is to use DictWriter from the csv module, then pass an empty dictionary to your first call where the keys will be the field names and the values will be their values.
Another way to solve the problem is to pre-populate your row list with None or empty strings, then use the list.index method to assign the value to the appropriate position.
I wrote an implementation of _traverse_output for each approach as an example, though they differ slightly from your code. Each takes an element of the 'records' list.
Dictionary Example
def _traverse_output_with_dict(record, fields, row_values, field_name=''):
    for obj, value in record.iteritems():
        new_field_name = '{}.{}'.format(field_name, obj) if field_name else obj
        print new_field_name
        if not isinstance(value, dict):
            if new_field_name in fields:
                row_values[new_field_name] = value
        else:
            _traverse_output_with_dict(value, fields, row_values, new_field_name)
List Example
def _traverse_output_with_list(record, fields, row, field_name=''):
    while len(row) < len(fields):
        row.append('')
    for obj, value in record.iteritems():
        new_field_name = '{}.{}'.format(field_name, obj) if field_name else obj
        print new_field_name
        if not isinstance(value, dict):
            if new_field_name in fields:
                row[fields.index(new_field_name)] = value
        else:
            _traverse_output_with_list(value, fields, row, new_field_name)
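For completeness, a hedged sketch of feeding the dictionary version into csv.DictWriter as suggested above (Python 2 style to match the answer; data and soql are the inputs from the question, and write_csv/out_path are hypothetical names):
import csv

def write_csv(data, soql, out_path='output.csv'):
    fields = get_soql_fields(soql)
    with open(out_path, 'wb') as f:  # Python 2 csv files are opened in binary mode
        writer = csv.DictWriter(f, fieldnames=fields)
        writer.writeheader()
        for record in data['records']:
            row_values = {}
            _traverse_output_with_dict(record, fields, row_values)
            writer.writerow(row_values)  # missing fields default to empty strings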
