I have JSON data with nested arrays which contain the same key names.
My Json format is like(may vary key value pair names):
{
"name": "bharat",
"age": 27,
"vehicles": [
{
"car": "tata",
"bike": "duke",
"plane": "n"
},
{
"car": "odi",
"bike": "duke",
"plane": "n"
}]
}
I have tried
Convert nested JSON to CSV file in Python
but got multiple columns with same keys in vehicles.
My code is
import json
import csv
from elasticsearch import Elasticsearch
import elasticsearch.helpers

# Load the Elasticsearch query body from a JSON file on disk.
with open("query.json") as f:
    query=json.load(f)

# NOTE(review): verify_certs=False disables TLS certificate checking —
# acceptable only on a trusted internal network.
es = Elasticsearch(['http://xx.xx.xx.xx:xxxx'],verify_certs=False)

# scan() is a helper that pages through all hits of the query lazily;
# results_gen yields one raw hit dict at a time.
results_gen = elasticsearch.helpers.scan(
    es,
    query=query,
    index="demo",
)
def get_leaves(item, key=None):
    """Recursively collect (key, scalar) pairs from a nested structure.

    Dict values are visited under their own key; list elements inherit
    the key the list was found under — which is why a list of similar
    dicts produces repeated column names downstream.
    """
    if isinstance(item, dict):
        return [pair for child_key, child in item.items()
                for pair in get_leaves(child, child_key)]
    if isinstance(item, list):
        return [pair for element in item
                for pair in get_leaves(element, key)]
    # Scalar leaf: report it under the key it was reached through.
    return [(key, item)]
# Write one CSV row per Elasticsearch hit; the header row is taken from
# the first hit's flattened keys.
with open('Data.csv', 'w', newline='') as f_output:
    csv_output = csv.writer(f_output)
    write_header = True
    for entry in results_gen:
        # '_source' holds the original indexed document.
        e=entry['_source']
        # NOTE(review): get_leaves keeps the parent key for list elements,
        # so a list of similar dicts yields duplicate columns in ONE row
        # instead of one row per list element — this is the cause of the
        # "car car bike bike" output described above.
        leaf_entries = sorted(get_leaves(e))
        print(leaf_entries)
        if write_header:
            csv_output.writerow([k for k, v in leaf_entries])
            write_header = False
        csv_output.writerow([v for k, v in leaf_entries])
I am getting output like
name age car car bike bike plane plane
bharat 27 tata odi duke duke n n
I expect output to be like
name age car bike plane
bharat 27 tata duke n
bharat 27 odi duke n
Something like this (Assuming you are interested in vehicles data only)
# Sample record: one person holding a list of per-vehicle dicts.
data = {
    "name": "bharat",
    "age": 27,
    "vehicles": [
        {"car": "tata", "bike": "duke", "plane": "n"},
        {"car": "odi", "bike": "duke", "plane": "n"},
    ],
}

# Emit one comma-joined line per vehicle; dict values keep insertion
# order, so columns come out as car,bike,plane.
with open('out.csv', 'w') as out:
    for vehicle in data['vehicles']:
        out.write(','.join(vehicle.values()) + '\n')
You Can use Pandas Normalize class to normalize the json into pandas dataframe and then write the dataframe to csv
Lets say you have Json dictionaries in variable 'data' like: [{json1},{json2},{json3}...]
import json
import pandas
from pandas.io.json import json_normalize

# Normalize each JSON dict into a one-row DataFrame and append it to the
# CSV file.
# NOTE(review): header=False on EVERY write means the output file never
# gets a header row; also pandas.io.json.json_normalize is deprecated in
# newer pandas in favour of pandas.json_normalize. Passing the whole
# 'data' list to json_normalize once would avoid the per-item loop.
for d in data:
    normalized_data = json_normalize(d)
    normalized_data .to_csv('csv_file_name', sep='|', mode='a', index=False, na_rep='', header=False)
refer: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.io.json.json_normalize.html
Related
I am doing analysis on semi-structured data, and for that I had to flatten both XML and JSON files into a pandas dataframe. Now that the analysis is done and I have made improvements like dropping null values and fixing some data errors, I need to generate XML or JSON files (depending on which format the user entered).
this is what i'm using to flatten xml :
import xml.etree.ElementTree as et
from collections import defaultdict
import pandas as pd
def flatten_xml(node, key_prefix=()):
    """
    Walk an XML node depth-first, yielding (key-part tuple, value) pairs.
    """
    # Non-blank element text is reported under the current prefix.
    stripped = (node.text or '').strip()
    if stripped:
        yield key_prefix, stripped
    # Each attribute extends the prefix by its own name.
    for attr_name, attr_value in node.items():
        yield key_prefix + (attr_name,), attr_value
    # Children extend the prefix by their tag and are walked recursively.
    for child in node:
        yield from flatten_xml(child, key_prefix + (child.tag,))
def dictify_key_pairs(pairs, key_sep='.'):
    """
    Turn (key-parts, value) pairs into a flat dict, numbering duplicates.

    Unique keys map straight to their single value; a key seen several
    times gets a 1-based numeric suffix per occurrence (e.g. 'a.1', 'a.2').
    """
    # Bucket all values under their joined candidate key.
    grouped = defaultdict(list)
    for key_parts, value in pairs:
        grouped[key_sep.join(key_parts)].append(value)
    result = {}
    for key, bucket in grouped.items():
        if len(bucket) == 1:
            result[key] = bucket[0]
        else:
            # Collision: suffix each occurrence with its position.
            result.update({f'{key}{key_sep}{index}': item
                           for index, item in enumerate(bucket, 1)})
    return result
# Parse XML with etree
# NOTE(review): .iter() walks EVERY element in the document, so each XML
# node below becomes one DataFrame row — confirm this row granularity is
# intended rather than only the root's direct children.
tree = et.parse('NCT00571389.xml').iter()
# Generate flat rows out of the root nodes in the tree
rows = [dictify_key_pairs(flatten_xml(row)) for row in tree]
df = pd.DataFrame(rows)
and this is what i'm using to flatten json :
from collections import defaultdict
import pandas as pd
import json
def flatten_json(nested_json, exclude=('',)):
    """Flatten nested dicts/lists into a single-level dict.

    Dict keys are joined with '.', list indices are appended with '_'
    before the trailing separator is stripped (so a list item shows up
    as e.g. 'candidate.skills.0').

    Args:
        nested_json: Arbitrarily nested structure of dicts, lists and
            scalars.
        exclude: Iterable of dict keys to skip at every nesting level.
            The default is a tuple (not a list) to avoid the shared
            mutable-default-argument pitfall the original had.

    Returns:
        dict mapping flattened path strings to scalar leaf values.
    """
    out = {}

    def flatten(x, name=''):
        if isinstance(x, dict):
            for key, value in x.items():
                if key not in exclude:
                    flatten(value, name + key + '.')
        elif isinstance(x, list):
            for index, value in enumerate(x):
                flatten(value, name + str(index) + '_')
        else:
            # 'name' always ends with a separator here; drop it.
            out[name[:-1]] = x

    flatten(nested_json)
    return out
# Load the source JSON and flatten the records under its first top-level
# key into a DataFrame.
# NOTE(review): the file handle is never closed (prefer a with-block),
# and list(this_dict.keys())[0] assumes the records of interest live
# under the FIRST top-level key (e.g. "features") — confirm.
f = open('employee_data.json')
this_dict = json.load(f)
df = pd.DataFrame([flatten_json(x) for x in this_dict[list(this_dict.keys())[0]]])
i need to know how to go from a dataframe to the original structure of the files, help please?
edit:
this is the example of the json file i'm using:
{"features": [{"candidate": {"first_name": "Margaret", "last_name": "Mcdonald", "skills": ["skLearn", "Java", "R", "SQL", "Spark", "C++"], "state": "AL", "specialty": "Database", "experience": "Mid", "relocation": "no"}}, {"candidate": {"first_name": "Michael", "last_name": "Carter", "skills": ["TensorFlow", "R", "Spark", "MongoDB", "C++", "SQL"], "state": "AR", "specialty": "Statistics", "experience": "Senior", "relocation": "yes"}}]}
and this is the columns after i flatten them:
candidate.first_name
candidate.last_name
candidate.skills.0
candidate.skills.1
candidate.skills.2
candidate.skills.3
candidate.skills.4
candidate.skills.5
candidate.state
candidate.specialty
candidate.experience
candidate.relocation
candidate.skills.6
candidate.skills.7
candidate.skills.8
Ok, this was not easy and I should have guided you instead of coding it for you, but here is what I've done:
# Rebuild the sample data and flatten it into a DataFrame.
# NOTE(review): the name 'json' shadows the json MODULE from here on —
# any later json.dumps/json.load call would break.
json = {"features": [{"candidate": {"first_name": "Margaret", "last_name": "Mcdonald", "skills": ["skLearn", "Java", "R", "SQL", "Spark", "C++"], "state": "AL", "specialty": "Database", "experience": "Mid", "relocation": "no"}}, {"candidate": {"first_name": "Michael", "last_name": "Carter", "skills": ["TensorFlow", "R", "Spark", "MongoDB", "C++", "SQL"], "state": "AR", "specialty": "Statistics", "experience": "Senior", "relocation": "yes"}}]}
df = pd.DataFrame([flatten_json(x) for x in json[list(json.keys())[0]]])
import re
header = df.columns
print(header)
# Split each flattened column name into (parent, field, optional index),
# e.g. 'candidate.skills.3' -> ('candidate', 'skills', '3').
regex = r'(\w+)\.(\w+)\.?(\d+)?'
m=re.findall(regex,'\n'.join(header))
def make_json(json,feature,pos,value):
    """Insert *value* into the nested dict *json* at the path *feature*.

    *feature* is a tuple of key parts produced by the column-name regex
    (e.g. ('candidate', 'skills', '3')); *pos* is the current depth.
    A digit part means the part before it names a list: '0' creates the
    list, later digits append to it. Returns the mutated dict.

    NOTE(review): the parameter name 'json' shadows the json module, and
    the append branch assumes list indices arrive in increasing order
    (columns iterated left to right) — confirm for reordered columns.
    """
    if pos+1 == len(feature):
        # Last path part: plain scalar assignment.
        json[feature[pos]] = value
        return json
    elif feature[pos+1] == '':
        # The regex yielded an empty trailing group: current part is a leaf.
        json[feature[pos]] = value
        return json
    elif feature[pos+1].isdigit():
        if feature[pos+1] == '0':
            # Index 0: start a new list with this value.
            json[feature[pos]] = [value]
            return json
        else:
            # Subsequent index: append to the already-created list.
            json[feature[pos]].append(value)
            return json
    else:
        if feature[pos] not in json:
            # New intermediate dict along the path.
            json[feature[pos]] = make_json({},feature,pos+1,value)
            return json
        else:
            # Merge into the existing intermediate dict.
            json[feature[pos]] = make_json(json[feature[pos]],feature,pos+1,value)
            return json
# Rebuild one candidate dict per DataFrame row, then collect them all
# under the original 'features' key.
json = {'features': []}
for row in range(len(df)):
    cadidate = {}
    for col, feature in enumerate(m):
        # 'feature' is the parsed path for this column; the row's cell
        # value is inserted at that path.
        cadidate = make_json(cadidate,feature,0,df.iloc[row][header[col]])
    json['features'].append(cadidate)
print(json)
You see I wanted to make it in a recursive way so it can work for more complex json, as long you define the regex right. For your specific example it could be simpler.
I am trying to convert my CSV email list to a JSON format to mass email via API. This is my code thus far, but I am having trouble with the output. Nothing is outputting in my VS Code editor.
import csv
import json
def make_json(csvFilePath, jsonFilePath):
    """Read a CSV into a dict keyed by its 'No' column and dump it as JSON.

    Each CSV row becomes one entry whose key is the row's 'No' value and
    whose value is the full row dict.
    """
    records = {}
    with open(csvFilePath, encoding='utf-8') as csvf:
        for row in csv.DictReader(csvf):
            records[row['No']] = row
    with open(jsonFilePath, 'w', encoding='utf-8') as jsonf:
        jsonf.write(json.dumps(records, indent=4))
# NOTE(review): nothing appears in the editor because make_json only
# writes Names.json — print the resulting file (or the dict) to see
# output. Also confirm the absolute input path exists on this machine.
csvFilePath = r'/data/csv-leads.csv'
jsonFilePath = r'Names.json'
make_json(csvFilePath, jsonFilePath)
Here is my desired JSON format
{
"EmailAddress": "hello#youngstowncoffeeseattle.com",
"Name": "Youngstown Coffee",
"ConsentToTrack": "Yes"
},
Heres my CSV list
No,EmailAddress,ConsentToTrack
Zylberschtein's Delicatessen & Bakery,catering#zylberschtein.com,Yes
Youngstown Coffee,hello#youngstowncoffeeseattle.com,Yes
It looks like you could use a csv.DictReader to make this easier.
If I have data.csv that looks like this:
Name,EmailAddress,ConsentToTrack
Zylberschtein's Delicatessen,catering#zylberschtein.com,yes
Youngstown Coffee,hello#youngstowncoffeeseattle.com,yes
I can convert it into JSON like this:
>>> import csv
>>> import json
>>> fd = open('data.csv')
>>> reader = csv.DictReader(fd)
>>> print(json.dumps(list(reader), indent=2))
[
{
"Name": "Zylberschtein's Delicatessen",
"EmailAddress": "catering#zylberschtein.com",
"ConsentToTrack": "yes"
},
{
"Name": "Youngstown Coffee",
"EmailAddress": "hello#youngstowncoffeeseattle.com",
"ConsentToTrack": "yes"
}
]
Here I've assumed the headers in the CSV can be used verbatim. I'll update this with an example if you need to modify key names (e.g. convert "No" to "Name").
If you need to rename a column, it might look more like this:
import csv
import json

# Convert CSV rows to dicts, renaming the 'No' column to 'Name' on the way.
with open('data.csv') as fd:
    reader = csv.DictReader(fd)
    data = []
    for row in reader:
        # pop() removes the old key and its value is re-inserted under
        # the new key name.
        row['Name'] = row.pop('No')
        data.append(row)
print(json.dumps(data, indent=2))
Given this input:
No,EmailAddress,ConsentToTrack
Zylberschtein's Delicatessen,catering#zylberschtein.com,yes
Youngstown Coffee,hello#youngstowncoffeeseattle.com,yes
This will output:
[
{
"EmailAddress": "catering#zylberschtein.com",
"ConsentToTrack": "yes",
"Name": "Zylberschtein's Delicatessen"
},
{
"EmailAddress": "hello#youngstowncoffeeseattle.com",
"ConsentToTrack": "yes",
"Name": "Youngstown Coffee"
}
]
and to print on my editor is it simply print(json.dumps(list(reader), indent=2))?
I'm not really familiar with your editor; print is how you generate console output in Python.
I want to convert an excel spreadsheet data to a JSON file. Here is the code I currently have:
Data
excel spreadsheet
Code
import xlrd
from collections import OrderedDict
import json

# Open the first worksheet of the workbook.
wb = xlrd.open_workbook('./file1.xlsx')
sh = wb.sheet_by_index(0)
data_list = []
# NOTE(review): iterating ROWS makes one record per data row; the
# expected output shown below actually needs one record per COLUMN
# (header row as 'name', second row as 'description').
for rownum in range(1, sh.nrows):
    data = OrderedDict()
    row_values = sh.row_values(rownum)
    data['name'] = row_values[0]
    data['description'] = row_values[1]
    data_list.append(data)
# Wrap the records under a single 'columns' key and write them as JSON.
data_list = {'columns': data_list}
j = json.dumps(data_list)
with open('seq1.json', 'w') as f:
    f.write(j)
Output
{"columns": [{"name": "FILEID", "description": "FILETYPE"}]}
Expected output
{
"columns": [
{
"name": "fileid",
"description": "FILEID"
},
{
"name": "filetype",
"description": "FILETYPE"
},
{
"name": "stusab",
"description": "STUSAB"
},
{
"name": "chariter",
"description": "CHARITER"
},
{
"name": "sequence",
"description": "SEQUENCE"
},
{
"name": "logrecno",
"description": "LOGRECNO"
}
],
The "name" column should be displaying the first row while the "description" column should be displaying the second row.
What modification can I do in my function to get the output I am looking for?
You need to iterate over columns, not rows
import xlrd
from collections import OrderedDict
import json

# NOTE(review): opens file1.xls while the question used file1.xlsx —
# confirm the path/extension.
wb = xlrd.open_workbook('./file1.xls')
sh = wb.sheet_by_index(0)
data_list = []
data = OrderedDict()
# Walk COLUMNS: row 0 supplies each record's 'name', row 1 its
# 'description' — this matches the expected output shape.
for colnum in range(0, sh.ncols):
    data['name'] = sh.row_values(0)[colnum]
    data['description'] = sh.row_values(1)[colnum]
    # copy() is essential: 'data' is reused each iteration, so each
    # appended record must be a snapshot, not a shared reference.
    data_list.append(data.copy())
data_list = {'columns': data_list}
j = json.dumps(data_list)
with open('seq1.json', 'w') as f:
    f.write(j)
You should give a try to:
# excel2json is a third-party package that converts each sheet of the
# workbook into a JSON file in one call.
import excel2json
excel2json.convert_from_file('file.xlsx')
You can use pandas
import pandas as pd

# Let pandas infer the table from the spreadsheet, then serialize the
# whole DataFrame to JSON (column-oriented by default).
df = pd.read_excel('./file1.xlsx')
with open('seq1.json', 'w') as f:
    f.write(df.to_json())
Given a json file data.json and I wants to reduce the json file which I store in a variable data_list to three different dictionary crime, crime1 and crime2 by using three different functions
[{"Region": "South", "State": "ALABAMA", "City": "Athens", "Population": "25603", "Murder": "1", "Rape": "1", "Robbery": "16", "Assault": "0", "Burglary": "90", "Theft": "538", "Vehicle_Theft": "8"}, {"Region": "South", "State": "ALABAMA", "City": "Atmore", "Population": "10021", "Murder": "0", "Rape": "3", "Robbery": "14", "Assault": "76", "Burglary": "86", "Theft": "315", "Vehicle_Theft": "12"}]
I load it into a variable
# Load the list of per-city crime dicts (one dict per city) from disk.
with open('/content/data_crime.json', 'r') as f:
    data_list = json.load(f)
I want to reduce data_list into three dictionaries: murder_by_region, violent_by_region, and
nonviolent_by_region.
Create dictionary iterate over data_list create dictionary using accumulating pattern
Violent crime is Murder plus Assault, and non-violent crime is Theft plus Vehicle_Theft.
I do it by using function for making all three dictionary
function takes three parameters:
Key: region or state
crime : 'Murder'
data_list:the list containing dictionaries for each city
Here you go:
from collections import defaultdict
import json

# One accumulator per requested dictionary; unseen regions start at 0.
murder_by_region = defaultdict(int)
violent_per_region = defaultdict(int)
nonviolent_per_region = defaultdict(int)

with open('/content/data_crime.json') as f:
    data_list = json.load(f)

# Accumulating pattern: the crime counts are stored as STRINGS in the
# JSON, hence the int() conversions; .get(..., 0) tolerates missing keys.
for row in data_list:
    region = row['Region']
    murder_by_region[region] += int(row.get('Murder', 0))
    violent_per_region[region] += int(row.get('Murder', 0)) + int(row.get('Assault', 0))
    nonviolent_per_region[region] += int(row.get('Theft', 0)) + int(row.get('Vehicle_Theft', 0))
Why not make it a dictionary of dictionaries where the keys are the city names,
And then do this, it can easily be adjusted to get input like yours.
# Aggregate violent / non-violent totals per city.
# Assumes FileData is a dict keyed by city name (this answer's premise);
# the question's JSON is actually a LIST of dicts with STRING counts —
# hence the int() casts below. TODO confirm the input shape.
with open('data_crime.json', 'r') as File:
    FileData = json.load(File)

ExitData = {}  # city name -> aggregated totals
# 'etc..' is a placeholder for further non-violent crime keys.
nonViolent = ['Robbery', 'Burglary', 'etc..']
# Fixed the 'Assult' typo so the key matches the data's 'Assault' field.
Violent = ['Assault', 'Rape']

for i in FileData:
    # i is the key or in this case the city name
    numOfNonViolent = 0
    for j in nonViolent:
        # int() because the sample data stores counts as strings.
        numOfNonViolent += int(FileData[i][j])
    numOfViolent = 0
    for j in Violent:
        numOfViolent += int(FileData[i][j])
    # Register the city's aggregated record. The original dict literal
    # was missing the commas between entries (a SyntaxError).
    ExitData[i] = {
        'Violent Crime': numOfViolent,
        'NonViolent Crime': numOfNonViolent,
        'Murder': FileData[i]['Murder'],
    }
I have csv like this:
id,company_name,country,country_id
1,batstop,usa, xx
2,biorice,italy, yy
1,batstop,italy, yy
3,legstart,canada, zz
I want an array of dictionaries to import to firebase. I need to group the different country informations for the same company in a nested list of dictionaries. This is the desired output:
[ {'id':'1', 'agency_name':'batstop', 'countries': [{'country':'usa','country_id':'xx'}, {'country':'italy','country_id':'yy'}]} ,
{'id':'2', 'agency_name':'biorice', 'countries': [{'country':'italy','country_id':'yy'}]},
{'id':'3', 'agency_name':'legstart', 'countries': [{'country':'canada','country_id':'zz'}]} ]
Recently I had a similar task, the groupby function from itertools and the itemgetter function from operator - both standard python libraries - helped me a lot. Here's the code considering your csv, note how defining the primary keys of your csv dataset is important.
import csv
import json
from operator import itemgetter
from itertools import groupby

# Rows sharing both of these columns are merged into one output object.
primary_keys = ['id', 'company_name']

# Start extraction
with open('input.csv', 'r') as file:
    # Read data from csv
    reader = csv.DictReader(file)
    # Sort data accordingly to primary keys — groupby only groups
    # CONSECUTIVE rows, so sorting by the same key first is mandatory.
    reader = sorted(reader, key=itemgetter(*primary_keys))
    # Create a list of tuples
    # Each tuple containing a dict of the group primary keys and its values, and a list of the group ordered dicts
    groups = [(dict(zip(primary_keys, _[0])), list(_[1])) for _ in groupby(reader, key=itemgetter(*primary_keys))]

# Create formatted dict to be converted into firebase objects
group_dicts = []
for group in groups:
    group_dict = {
        "id": group[0]['id'],
        "agency_name": group[0]['company_name'],
        # One country entry per original CSV row of this group.
        # NOTE(review): the raw CSV has a space after each comma, so
        # country_id values keep a leading space (' xx') — strip if
        # that matters downstream.
        "countries": [
            dict(country=_['country'], country_id=_['country_id']) for _ in group[1]
        ],
    }
    group_dicts.append(group_dict)

print("\n".join([json.dumps(_, indent=2) for _ in group_dicts]))
Here's the output:
{
"id": "1",
"agency_name": "batstop",
"countries": [
{
"country": "usa",
"country_id": " xx"
},
{
"country": "italy",
"country_id": " yy"
}
]
}
{
"id": "2",
"agency_name": "biorice",
"countries": [
{
"country": "italy",
"country_id": " yy"
}
]
}
{
"id": "3",
"agency_name": "legstart",
"countries": [
{
"country": "canada",
"country_id": " zz"
}
]
}
There's no external library,
Hope it suits you well!
You can try this, you may have to change a few parts to get it working with your csv, but hope it's enough to get you started:
# Inline sample of the CSV rows.
# NOTE(review): the name 'csv' shadows the csv module if it is imported.
csv = [
    "1,batstop,usa, xx",
    "2,biorice,italy, yy",
    "1,batstop,italy, yy",
    "3,legstart,canada, zz"
]

output = {}  # dictionary useful to avoid searching in list for existing ids

# Parse each row
for line in csv:
    cols = line.split(',')
    # strip() removes the stray space after each comma (' xx' -> 'xx'),
    # matching the desired output; 'rec_id' avoids shadowing builtin id().
    rec_id = int(cols[0])
    agency_name = cols[1].strip()
    country = cols[2].strip()
    country_id = cols[3].strip()
    if rec_id in output:
        # BUG FIX: the original appended a one-element LIST here
        # ([{...}]), nesting the country dict one level deeper than the
        # else-branch creates it.
        output[rec_id]['countries'].append({'country': country,
                                            'country_id': country_id})
    else:
        output[rec_id] = {'id': rec_id,
                          'agency_name': agency_name,
                          'countries': [{'country': country,
                                         'country_id': country_id}]
                          }

# Put into list (dicts preserve insertion order, so ids stay in first-seen order)
json_output = [output[key] for key in output]

# Check output
for row in json_output:
    print(row)