I am doing analysis on semi-structured data, and for that I had to flatten both XML and JSON files into a pandas DataFrame. Now that the analysis is done and I have applied improvements such as dropping null values and fixing some data errors, I need to generate XML or JSON files again (depending on which format the user entered).
This is what I'm using to flatten XML:
import xml.etree.ElementTree as et
from collections import defaultdict
import pandas as pd

def flatten_xml(node, key_prefix=()):
    """
    Walk an XML node, generating tuples of key parts and values.
    """
    # Copy tag content if any
    text = (node.text or '').strip()
    if text:
        yield key_prefix, text
    # Copy attributes
    for attr, value in node.items():
        yield key_prefix + (attr,), value
    # Recurse into children
    for child in node:
        yield from flatten_xml(child, key_prefix + (child.tag,))

def dictify_key_pairs(pairs, key_sep='.'):
    """
    Dictify key pairs from flatten_xml, taking care of duplicate keys.
    """
    out = {}
    # Group values by candidate key
    key_map = defaultdict(list)
    for key_parts, value in pairs:
        key_map[key_sep.join(key_parts)].append(value)
    # Build the final dict, suffixing keys where required
    for key, values in key_map.items():
        if len(values) == 1:  # no need to suffix the key
            out[key] = values[0]
        else:  # more than one value for this key
            for suffix, value in enumerate(values, 1):
                out[f'{key}{key_sep}{suffix}'] = value
    return out

# Parse the XML and iterate over every element in the tree
nodes = et.parse('NCT00571389.xml').iter()
# Generate one flat row per element
rows = [dictify_key_pairs(flatten_xml(row)) for row in nodes]
df = pd.DataFrame(rows)
And this is what I'm using to flatten JSON:
import json
import pandas as pd

def flatten_json(nested_json, exclude=('',)):
    out = {}
    def flatten(x, name='', exclude=exclude):
        if type(x) is dict:
            for a in x:
                if a not in exclude:
                    flatten(x[a], name + a + '.')
        elif type(x) is list:
            # use the position in the list as part of the key
            for i, a in enumerate(x):
                flatten(a, name + str(i) + '_')
        else:
            out[name[:-1]] = x
    flatten(nested_json)
    return out

with open('employee_data.json') as f:
    this_dict = json.load(f)
df = pd.DataFrame([flatten_json(x) for x in this_dict[list(this_dict.keys())[0]]])
I need to know how to go from a DataFrame back to the original structure of the files. Help, please?
Edit:
This is an example of the JSON file I'm using:
{"features": [{"candidate": {"first_name": "Margaret", "last_name": "Mcdonald", "skills": ["skLearn", "Java", "R", "SQL", "Spark", "C++"], "state": "AL", "specialty": "Database", "experience": "Mid", "relocation": "no"}}, {"candidate": {"first_name": "Michael", "last_name": "Carter", "skills": ["TensorFlow", "R", "Spark", "MongoDB", "C++", "SQL"], "state": "AR", "specialty": "Statistics", "experience": "Senior", "relocation": "yes"}}]}
And these are the columns after I flatten them:
candidate.first_name
candidate.last_name
candidate.skills.0
candidate.skills.1
candidate.skills.2
candidate.skills.3
candidate.skills.4
candidate.skills.5
candidate.state
candidate.specialty
candidate.experience
candidate.relocation
candidate.skills.6
candidate.skills.7
candidate.skills.8
Ok, this was not easy and I should have guided you instead of coding it for you, but here is what I've done:
json = {"features": [{"candidate": {"first_name": "Margaret", "last_name": "Mcdonald", "skills": ["skLearn", "Java", "R", "SQL", "Spark", "C++"], "state": "AL", "specialty": "Database", "experience": "Mid", "relocation": "no"}}, {"candidate": {"first_name": "Michael", "last_name": "Carter", "skills": ["TensorFlow", "R", "Spark", "MongoDB", "C++", "SQL"], "state": "AR", "specialty": "Statistics", "experience": "Senior", "relocation": "yes"}}]}
df = pd.DataFrame([flatten_json(x) for x in json[list(json.keys())[0]]])
import re
header = df.columns
print(header)
regex = r'(\w+)\.(\w+)\.?(\d+)?'
m=re.findall(regex,'\n'.join(header))
def make_json(json,feature,pos,value):
if pos+1 == len(feature):
json[feature[pos]] = value
return json
elif feature[pos+1] == '':
json[feature[pos]] = value
return json
elif feature[pos+1].isdigit():
if feature[pos+1] == '0':
json[feature[pos]] = [value]
return json
else:
json[feature[pos]].append(value)
return json
else:
if feature[pos] not in json:
json[feature[pos]] = make_json({},feature,pos+1,value)
return json
else:
json[feature[pos]] = make_json(json[feature[pos]],feature,pos+1,value)
return json
json = {'features': []}
for row in range(len(df)):
cadidate = {}
for col, feature in enumerate(m):
cadidate = make_json(cadidate,feature,0,df.iloc[row][header[col]])
json['features'].append(cadidate)
print(json)
You see, I wanted to make it recursive so it can work for more complex JSON, as long as you define the regex correctly. For your specific example it could be simpler.
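For this flat column layout specifically, a simpler route is to split each column name on the separator and rebuild the nesting without a regex. Here is a minimal sketch (unflatten_row is a hypothetical helper; it assumes dotted keys, numeric parts only as indices into lists of scalars such as skills, and NaN in the columns a record lacks):

import pandas as pd

def unflatten_row(row, sep='.'):
    out = {}
    for col, value in row.items():
        if pd.isna(value):
            continue  # this record has fewer skills than the widest row
        parts = col.split(sep)
        node = out
        for part, nxt in zip(parts, parts[1:]):
            # descend, creating a list when the next part is an index
            node = node.setdefault(part, [] if nxt.isdigit() else {})
        if isinstance(node, list):
            node.append(value)  # indices arrive in ascending column order
        else:
            node[parts[-1]] = value
    return out

rebuilt = {'features': [unflatten_row(row) for _, row in df.iterrows()]}

Lists of dicts (keys like a.0.b) would need extra handling beyond this sketch.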
I want to convert sample JSON data into a CSV file using Python. I am retrieving the JSON data from an API.
As my JSON has nested objects, it normally cannot be directly converted to CSV. I don't want to do any hard-coding, and I want to make the Python code fully dynamic.
So I have written a function that flattens my JSON data, but I am not able to work out how to iterate over all records, find the relevant column names, and then output the data to CSV.
In the sample JSON file I have included only 2 records, but in reality there are 100.
The sample JSON looks like this:
[
{
"id":"Random_Company_57",
"unid":"75",
"fieldsToValues":{
"Email":"None",
"occupation":"SO1 Change",
"manager":"None",
"First Name":"Bells",
"employeeID":"21011.0",
"loginRequired":"true",
"superUser":"false",
"ldapSuperUser":"false",
"archived":"true",
"password":"None",
"externalUser":"false",
"Username":"Random_Company_57",
"affiliation":"",
"Phone":"+16 22 22 222",
"unidDominoKey":"",
"externalUserActive":"false",
"secondaryOccupation":"SO1 Change",
"retypePassword":"None",
"Last Name":"Christmas"
},
"hierarchyFieldAccess":[
],
"userHierarchies":[
{
"hierarchyField":"Company",
"value":"ABC Company"
},
{
"hierarchyField":"Department",
"value":"gfds"
},
{
"hierarchyField":"Project",
"value":"JKL-SDFGHJW"
},
{
"hierarchyField":"Division",
"value":"Silver RC"
},
{
"hierarchyField":"Site",
"value":"SQ06"
}
],
"locale":{
"id":1,
"dateFormat":"dd/MM/yyyy",
"languageTag":"en-UA"
},
"roles":[
"User"
],
"readAccessRoles":[
],
"preferredLanguage":"en-AU",
"prefName":"Christmas Bells",
"startDate":"None",
"firstName":"Bells",
"lastName":"Christmas",
"fullName":"Christmas Bells",
"lastModified":"2022-02-22T03:47:41.632Z",
"email":"None",
"docNo":"None",
"virtualSuperUser":false
},
{
"id":"xyz.abc#safe.net",
"unid":"98",
"fieldsToValues":{
"Email":"xyz.abc#safe.net",
"occupation":"SO1 Change",
"manager":"None",
"First Name":"Bells",
"employeeID":"21011.0",
"loginRequired":"false",
"superUser":"false",
"ldapSuperUser":"false",
"archived":"false",
"password":"None",
"externalUser":"false",
"Username":"xyz.abc#safe.net",
"affiliation":"",
"Phone":"+16 2222 222 222",
"unidDominoKey":"",
"externalUserActive":"false",
"secondaryOccupation":"SO1 Change",
"retypePassword":"None",
"Last Name":"Christmas"
},
"hierarchyFieldAccess":[
],
"userHierarchies":[
{
"hierarchyField":"Company",
"value":"ABC Company"
},
{
"hierarchyField":"Department",
"value":"PUHJ"
},
{
"hierarchyField":"Project",
"value":"RPOJ-SDFGHJW"
},
{
"hierarchyField":"Division",
"value":"Silver RC"
},
{
"hierarchyField":"Site",
"value":"SQ06"
}
],
"locale":{
"id":1,
"dateFormat":"dd/MM/yyyy",
"languageTag":"en-UA"
},
"roles":[
"User"
],
"readAccessRoles":[
],
"preferredLanguage":"en-AU",
"prefName":"Christmas Bells",
"startDate":"None",
"firstName":"Bells",
"lastName":"Christmas",
"fullName":"Christmas Bells",
"lastModified":"2022-03-16T05:04:13.085Z",
"email":"xyz.abc#safe.net",
"docNo":"None",
"virtualSuperUser":false
}
]
What I have tried:
def flattenjson(b, delim):
    val = {}
    for i in b.keys():
        if isinstance(b[i], dict):
            get = flattenjson(b[i], delim)
            for j in get.keys():
                val[i + delim + j] = get[j]
        else:
            val[i] = b[i]
    print(val)
    return val

json = [{Sample JSON string mentioned above}]
flattenjson(json, "__")
I don't know whether this is the right way to deal with this problem or not.
My final aim is to output all of the above JSON data to a CSV file.
Based on this answer, you could loop through your list of JSON data and flatten each record with the given function (do they always have the same structure?), then build a DataFrame and write the data to CSV. That's the easiest way I can think of.
Try this:
import pandas as pd
import json
import collections.abc

def flatten(dictionary, parent_key=False, separator='__'):
    items = []
    for key, value in dictionary.items():
        new_key = str(parent_key) + separator + key if parent_key else key
        if isinstance(value, collections.abc.MutableMapping):
            items.extend(flatten(value, new_key, separator).items())
        elif isinstance(value, list):
            for k, v in enumerate(value):
                items.extend(flatten({str(k): v}, new_key, separator).items())
        else:
            items.append((new_key, value))
    return dict(items)

with open('your_json.json') as f:
    data = json.load(f)  # data is the example you provided (a list of dicts)

all_records = []
for jsn in data:
    all_records.append(flatten(jsn))

df = pd.DataFrame(all_records)
df.to_csv('json_to_csv.csv')
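To see what flatten produces, here is a quick check on a trimmed-down record (a hypothetical snippet of the sample above):

sample = {
    "id": "Random_Company_57",
    "locale": {"id": 1, "languageTag": "en-UA"},
    "roles": ["User"],
}
print(flatten(sample))
# {'id': 'Random_Company_57', 'locale__id': 1,
#  'locale__languageTag': 'en-UA', 'roles__0': 'User'}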
I have a file with the following structure:
SE|text|Baz
SE|entity|Bla
SE|relation|Bla
SE|relation|Foo

SE|text|Bla
SE|entity|Foo

SE|text|Zoo
SE|relation|Bla
SE|relation|Baz
Records (i.e., blocks) are separated by an empty line. Each line in a block starts with an SE tag. A text tag always occurs in the first line of each block.
I wonder how to properly extract only the blocks with a relation tag, which is not necessarily present in every block. My attempt is pasted below:
from itertools import groupby

with open('test.txt') as f:
    for nonempty, group in groupby(f, bool):
        if nonempty:
            process_block()  # ?
The desired output is a JSON dump:
{
"result": [
{
"text": "Baz",
"relation": ["Bla","Foo"]
},
{
"text": "Zoo",
"relation": ["Bla","Baz"]
}
]
}
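For what it's worth, the groupby attempt from the question can be completed along these lines; a sketch, assuming the sample file above (note that bool('\n') is True, so the grouping key has to strip the line first):

import json
from itertools import groupby

result = []
with open('test.txt') as f:
    # key strips first: a line holding only '\n' is truthy, so bool alone fails
    for nonempty, group in groupby(f, key=lambda line: bool(line.strip())):
        if not nonempty:
            continue  # skip the runs of blank lines between blocks
        block = [line.strip().split('|') for line in group]
        relations = [value for _, tag, value in block if tag == 'relation']
        if relations:  # keep only blocks that contain a relation tag
            text = next(value for _, tag, value in block if tag == 'text')
            result.append({'text': text, 'relation': relations})

print(json.dumps({'result': result}, indent=2))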
I have a proposed solution in pure Python that keeps a block if it contains a relation tag in any position. This could most likely be done more elegantly in a proper framework like pandas.
from pprint import pprint

fname = 'ex.txt'

# extract blocks
with open(fname, 'r') as f:
    blocks = [[]]
    for line in f:
        if len(line) == 1:  # a lone newline separates blocks
            blocks.append([])
        else:
            blocks[-1] += [line.strip().split('|')]

# remove blocks that don't contain 'relation'
blocks = [block for block in blocks
          if any('relation' == x[1] for x in block)]
pprint(blocks)
# [[['SE', 'text', 'Baz'],
# ['SE', 'entity', 'Bla'],
# ['SE', 'relation', 'Bla'],
# ['SE', 'relation', 'Foo']],
# [['SE', 'text', 'Zoo'], ['SE', 'relation', 'Bla'], ['SE', 'relation', 'Baz']]]
# To export to proper json format the following can be done
import pandas as pd
import json

results = []
for block in blocks:
    df = pd.DataFrame(block)
    json_dict = {}
    json_dict['text'] = list(df[2][df[1] == 'text'])
    json_dict['relation'] = list(df[2][df[1] == 'relation'])
    results.append(json_dict)
print(json.dumps(results))
# '[{"text": ["Baz"], "relation": ["Bla", "Foo"]}, {"text": ["Zoo"], "relation": ["Bla", "Baz"]}]'
Let's go through it:
Read the file into a list, starting a new block at each blank line and splitting columns on the | character.
Go through each block in the list and filter out any that does not contain a relation tag.
Print the output.
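Note that the dump above has "text" as a list (["Baz"]) while the desired output has a scalar. Since the question guarantees exactly one text line per block, a small post-processing tweak gives the scalar form:

# convert the one-element text lists to scalars after the loop
for json_dict in results:
    json_dict['text'] = json_dict['text'][0]
print(json.dumps(results))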
You cannot store the same key twice in a dictionary, as mentioned in the comments.
You can read your file, split it at '\n\n' into blocks, split blocks into lines at '\n', and split lines into fields at '|'.
You can then put it into a suitable data structure and serialize it to a string using the json module:
Create data file:
with open("f.txt","w")as f:
f.write('''SE|text|Baz
SE|entity|Bla
SE|relation|Bla
SE|relation|Foo
SE|text|Bla
SE|entity|Foo
SE|text|Zoo
SE|relation|Bla
SE|relation|Baz''')
Read data and process it:
with open("f.txt") as f:
all_text = f.read()
as_blocks = all_text.split("\n\n")
# skip SE when splitting and filter only with |relation|
with_relation = [[k.split("|")[1:]
for k in b.split("\n")]
for b in as_blocks if "|relation|" in b]
print(with_relation)
Create a suitable data structure, grouping repeated keys into a list:
result = []
for inner in with_relation:
    result.append({})
    for k, v in inner:
        # add as simple key
        if k not in result[-1]:
            result[-1][k] = v
        # got the key a 2nd time: convert the value to a list
        elif k in result[-1] and not isinstance(result[-1][k], list):
            result[-1][k] = [result[-1][k], v]
        # got it a 3rd+ time: append to the list
        else:
            result[-1][k].append(v)
print(result)
Create json from the data structure:
import json
print(json.dumps({"result": result}, indent=4))
Output:
# with_relation
[[['text', 'Baz'], ['entity', 'Bla'], ['relation', 'Bla'], ['relation', 'Foo']],
[['text', 'Zoo'], ['relation', 'Bla'], ['relation', 'Baz']]]
# result
[{'text': 'Baz', 'entity': 'Bla', 'relation': ['Bla', 'Foo']},
{'text': 'Zoo', 'relation': ['Bla', 'Baz']}]
# json string
{
"result": [
{
"text": "Baz",
"entity": "Bla",
"relation": [
"Bla",
"Foo"
]
},
{
"text": "Zoo",
"relation": [
"Bla",
"Baz"
]
}
]
}
In my opinion this is a very good case for a small parser.
This solution uses a PEG parser called parsimonious but you could totally use another one:
from parsimonious.grammar import Grammar
from parsimonious.nodes import NodeVisitor
import json

data = """
SE|text|Baz
SE|entity|Bla
SE|relation|Bla
SE|relation|Foo

SE|text|Bla
SE|entity|Foo

SE|text|Zoo
SE|relation|Bla
SE|relation|Baz
"""

class TagVisitor(NodeVisitor):
    grammar = Grammar(r"""
        content = (ws / block)+
        block   = line+
        line    = ~".+" nl?
        nl      = ~"[\n\r]"
        ws      = ~"\s+"
    """)

    def generic_visit(self, node, visited_children):
        return visited_children or node

    def visit_content(self, node, visited_children):
        filtered = [child[0] for child in visited_children if isinstance(child[0], dict)]
        return {"result": filtered}

    def visit_block(self, node, visited_children):
        text, relations = None, []
        for child in visited_children:
            if child[1] == "text" and not text:
                text = child[2].strip()
            elif child[1] == "relation":
                relations.append(child[2])
        if relations:
            return {"text": text, "relation": relations}

    def visit_line(self, node, visited_children):
        tag1, tag2, text = node.text.split("|")
        return tag1, tag2, text.strip()

tv = TagVisitor()
result = tv.parse(data)
print(json.dumps(result))
This yields
{"result":
[{"text": "Baz", "relation": ["Bla", "Foo"]},
{"text": "Zoo", "relation": ["Bla", "Baz"]}]
}
The idea is to define a grammar, build an abstract syntax tree out of it, and return each block's content in a suitable data format.
Given a JSON file data.json, I want to reduce the JSON data, which I store in a variable data_list, to three different dictionaries.
[{"Region": "South", "State": "ALABAMA", "City": "Athens", "Population": "25603", "Murder": "1", "Rape": "1", "Robbery": "16", "Assault": "0", "Burglary": "90", "Theft": "538", "Vehicle_Theft": "8"}, {"Region": "South", "State": "ALABAMA", "City": "Atmore", "Population": "10021", "Murder": "0", "Rape": "3", "Robbery": "14", "Assault": "76", "Burglary": "86", "Theft": "315", "Vehicle_Theft": "12"}]
I load it into a variable:
with open('/content/data_crime.json', 'r') as f:
    data_list = json.load(f)
I want to reduce data_list into three dictionaries: murder_by_region, violent_by_region, and nonviolent_by_region.
Each dictionary is built by iterating over data_list with the accumulator pattern.
Violent crime covers Murder and Assault; non-violent crime covers Theft and Vehicle_Theft.
I want to build all three dictionaries with one function. The function takes three parameters:
key: region or state
crime: e.g. 'Murder'
data_list: the list containing a dictionary for each city
Here you go:
from collections import defaultdict
import json

murder_by_region = defaultdict(int)
violent_per_region = defaultdict(int)
nonviolent_per_region = defaultdict(int)

with open('/content/data_crime.json') as f:
    data_list = json.load(f)

for row in data_list:
    region = row['Region']
    murder_by_region[region] += int(row.get('Murder', 0))
    violent_per_region[region] += int(row.get('Murder', 0)) + int(row.get('Assault', 0))
    nonviolent_per_region[region] += int(row.get('Theft', 0)) + int(row.get('Vehicle_Theft', 0))
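If you specifically want the three-parameter function described in the question, a minimal sketch could look like this (crime_totals is a hypothetical name; the int() cast is needed because the counts are stored as strings):

from collections import defaultdict

def crime_totals(key, crime, data_list):
    """Accumulate one crime count per group, e.g. per region or state."""
    totals = defaultdict(int)
    for row in data_list:
        totals[row[key]] += int(row.get(crime, 0))
    return dict(totals)

murder_by_region = crime_totals('Region', 'Murder', data_list)

Violent and non-violent totals then combine two such calls each (Murder + Assault, and Theft + Vehicle_Theft).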
Why not make it a dictionary of dictionaries where the keys are the city names?
Then do something like this; it can easily be adjusted to accept input like yours.
with open('data_crime.json', 'r') as File:
    FileData = json.load(File)

ExitData = {}  # empty dict
nonViolent = ['Robbery', 'Burglary', 'etc..']
Violent = ['Assault', 'Rape']

for i in FileData:
    # i is the key, or in this case the city name
    numOfNonViolent = 0
    for j in nonViolent:
        numOfNonViolent += int(FileData[i][j])

    numOfViolent = 0
    for j in Violent:
        numOfViolent += int(FileData[i][j])

    # make a new key in ExitData; the key is the city name
    ExitData[i] = {
        'Violent Crime': numOfViolent,
        'NonViolent Crime': numOfNonViolent,
        'Murder': FileData[i]['Murder']
    }
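Note that the question's data_list is a list of city dictionaries rather than a dictionary keyed by city name, so this answer's FileData shape would need a small adapter first; a sketch, assuming city names are unique:

FileData = {row['City']: row for row in data_list}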
I have JSON data with nested arrays that contain the same key names.
My JSON format is like this (the key/value pair names may vary):
{
    "name": "bharat",
    "age": 27,
    "vehicles": [
        {
            "car": "tata",
            "bike": "duke",
            "plane": "n"
        },
        {
            "car": "odi",
            "bike": "duke",
            "plane": "n"
        }
    ]
}
I have tried Convert nested JSON to CSV file in Python, but got multiple columns with the same keys for vehicles.
My code is:
import json
import csv
from elasticsearch import Elasticsearch
import elasticsearch.helpers

with open("query.json") as f:
    query = json.load(f)

es = Elasticsearch(['http://xx.xx.xx.xx:xxxx'], verify_certs=False)

results_gen = elasticsearch.helpers.scan(
    es,
    query=query,
    index="demo",
)

def get_leaves(item, key=None):
    if isinstance(item, dict):
        leaves = []
        for i in item.keys():
            leaves.extend(get_leaves(item[i], i))
        return leaves
    elif isinstance(item, list):
        leaves = []
        for i in item:
            leaves.extend(get_leaves(i, key))
        return leaves
    else:
        return [(key, item)]

with open('Data.csv', 'w', newline='') as f_output:
    csv_output = csv.writer(f_output)
    write_header = True
    for entry in results_gen:
        e = entry['_source']
        leaf_entries = sorted(get_leaves(e))
        print(leaf_entries)
        if write_header:
            csv_output.writerow([k for k, v in leaf_entries])
            write_header = False
        csv_output.writerow([v for k, v in leaf_entries])
I am getting output like
name age car car bike bike plane plane
bharat 27 tata odi duke duke n n
I expect output to be like
name age car bike plane
bharat 27 tata duke n
bharat 27 odi duke n
Something like this (assuming you are interested in the vehicles data only):
data = {"name": "bharat", "age": 27, "vehicles": [{"car": "tata", "bike": "duke", "plane": "n",
}, {"car": "odi", "bike": "duke", "plane": "n",
}]}
with open('out.csv', 'w') as out:
for v in data['vehicles']:
line = ','.join(v.values())
out.write(line + '\n')
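To also repeat the top-level fields on every vehicle row, matching the expected output in the question, a small extension of the snippet above (a sketch, verified only against the sample data):

import csv

rows = [{'name': data['name'], 'age': data['age'], **v} for v in data['vehicles']]
with open('out.csv', 'w', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=['name', 'age', 'car', 'bike', 'plane'])
    writer.writeheader()
    writer.writerows(rows)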
You can use pandas' json_normalize to normalize the JSON into a pandas DataFrame and then write the DataFrame to CSV.
Let's say you have JSON dictionaries in a variable data like [{json1}, {json2}, {json3}, ...]:
from pandas import json_normalize  # pandas.io.json.json_normalize is deprecated

for d in data:
    normalized_data = json_normalize(d)
    normalized_data.to_csv('csv_file_name', sep='|', mode='a', index=False, na_rep='', header=False)
refer: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.io.json.json_normalize.html
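For the nested-array shape in this question specifically, json_normalize can also expand the vehicles list directly and repeat the top-level fields per row; a sketch, assuming the single document shown in the question is in data:

from pandas import json_normalize

df = json_normalize(data, record_path='vehicles', meta=['name', 'age'])
df.to_csv('out.csv', index=False)
# columns: car, bike, plane, name, age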
I am writing a Python program to write data to CSV. This is my code below:
from pandas.io.json import json_normalize
import json
import csv
data =[
{
"Name": "jonathan",
"Age": 23,
"Occupation": "Lawyer",
"Address":[
{"postal_code":2323,
"place": "TP",
"Location":"Central Singapore"
}
]
},
{
"Name": "jacky",
"Age": 21,
"Occupation": "IT analyst",
"Address":[
{"postal_code":31234,
"place": "CCK",
"Location":"NW Singapore"
}
]
}
]
nested_json = data
new_dict = dict()

# to flatten the json
def flatten_json(nested_json):
    out = {}
    def flatten(x, name=''):
        if type(x) is dict:
            for a in x:
                flatten(x[a], name + a + '_')
        elif type(x) is list:
            i = 0
            for a in x:
                flatten(a, name + str(i) + '_')
                i += 1
        else:
            out[name[:-1]] = x
    flatten(nested_json)
    return out

# function to get the data out, flatten it, and write to csv
def write_to_csv(nested_json):
    for i in nested_json:
        a = flatten_json(i)
        print(a)
        with open('dict.csv', 'w', newline='') as csv_file:
            writer = csv.writer(csv_file)
            writer.writerow(a.values())

if __name__ == '__main__':
    write_to_csv(nested_json)
The issue is that my code is writing a blank line after each row of data.
I followed this Stack Overflow question to resolve it (CSV file written with Python has blank lines between each row), but it seems that it is not working. There is an alternative, which is to use pandas to remove the blank lines after processing, but that seems rather silly. As such, may I ask what I am doing wrong here? I ensured that newline=''.
Thank you very much
I managed to figure out what was wrong.
Please refer to the revised code here. I will close the issue.
from pandas.io.json import json_normalize
import json
import csv
data =[
{
"Name": "jonathan",
"Age": 23,
"Occupation": "Lawyer",
"Address":[
{"postal_code":2323,
"place": "TP",
"Location":"Central Singapore"
}
]
},
{
"Name": "Adrian",
"Age": 21,
"Occupation": "IT analyst",
"Address":[
{"postal_code":31234,
"place": "CCK",
"Location":"NW Singapore"
}
]
}
]
nested_json = data
new_dict = dict()

# to flatten the json
def flatten_json(nested_json):
    out = {}
    def flatten(x, name=''):
        if type(x) is dict:
            for a in x:
                flatten(x[a], name + a + '_')
        elif type(x) is list:
            i = 0
            for a in x:
                flatten(a, name + str(i) + '_')
                i += 1
        else:
            out[name[:-1]] = x
    flatten(nested_json)
    return out

# get the data out, flatten it, and write to csv
def write_to_csv(nested_json):
    # open the file once, outside the loop, so each record appends a row
    with open('dict.csv', 'a', newline='') as csv_file:
        writer = csv.writer(csv_file)
        for i in nested_json:
            a = flatten_json(i)
            print(a)
            writer.writerow(a.values())

if __name__ == '__main__':
    write_to_csv(nested_json)
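As a side note, the revised version never writes a header row, and append mode keeps adding rows across runs. A sketch of an alternative using csv.DictWriter (write_records is a hypothetical name):

import csv

def write_records(records, path='dict.csv'):
    flat = [flatten_json(r) for r in records]
    # collect every key that appears in any record, in a stable order
    fieldnames = sorted({k for row in flat for k in row})
    with open(path, 'w', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(flat)

write_records(data)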