Removing extra line with Python

I am writing a Python program to write data to a CSV file. This is my code below:
from pandas.io.json import json_normalize
import json
import csv

data = [
    {
        "Name": "jonathan",
        "Age": 23,
        "Occupation": "Lawyer",
        "Address": [
            {
                "postal_code": 2323,
                "place": "TP",
                "Location": "Central Singapore"
            }
        ]
    },
    {
        "Name": "jacky",
        "Age": 21,
        "Occupation": "IT analyst",
        "Address": [
            {
                "postal_code": 31234,
                "place": "CCK",
                "Location": "NW Singapore"
            }
        ]
    }
]
nested_json = data
new_dict = dict()

# to flatten the json
def flatten_json(nested_json):
    out = {}
    def flatten(x, name=''):
        if type(x) is dict:
            for a in x:
                flatten(x[a], name + a + '_')
        elif type(x) is list:
            i = 0
            for a in x:
                flatten(a, name + str(i) + '_')
                i += 1
        else:
            out[name[:-1]] = x
    flatten(nested_json)
    return out

# function to get the data out and flatten and write to csv
def write_to_csv(nested_json):
    for i in nested_json:
        a = flatten_json(i)
        print(a)
        with open('dict.csv', 'w', newline='') as csv_file:
            writer = csv.writer(csv_file)
            writer.writerow(a.values())

if __name__ == '__main__':
    write_to_csv(nested_json)
The issue is that my code is writing a blank line after each row of data. I followed this Stack Overflow question to resolve it (CSV file written with Python has blank lines between each row), but it does not seem to be working. An alternative would be to use pandas to remove the blank lines after processing, but that seems kind of silly. May I ask what I am doing wrong here? I made sure that newline='' is set.
Thank you very much

I managed to figure out what was wrong.
Please refer to the revised code here. I will close the issue.
from pandas.io.json import json_normalize
import json
import csv

data = [
    {
        "Name": "jonathan",
        "Age": 23,
        "Occupation": "Lawyer",
        "Address": [
            {
                "postal_code": 2323,
                "place": "TP",
                "Location": "Central Singapore"
            }
        ]
    },
    {
        "Name": "Adrian",
        "Age": 21,
        "Occupation": "IT analyst",
        "Address": [
            {
                "postal_code": 31234,
                "place": "CCK",
                "Location": "NW Singapore"
            }
        ]
    }
]
nested_json = data
new_dict = dict()

# to flatten the json
def flatten_json(nested_json):
    out = {}
    def flatten(x, name=''):
        if type(x) is dict:
            for a in x:
                flatten(x[a], name + a + '_')
        elif type(x) is list:
            i = 0
            for a in x:
                flatten(a, name + str(i) + '_')
                i += 1
        else:
            out[name[:-1]] = x
    flatten(nested_json)
    return out

# open the file once, then flatten each record and write it inside the same handle
def write_to_csv(nested_json):
    with open('dict.csv', 'a', newline='') as csv_file:
        writer = csv.writer(csv_file)
        for i in nested_json:
            a = flatten_json(i)
            print(a)
            writer.writerow(a.values())

if __name__ == '__main__':
    write_to_csv(nested_json)
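As a side note, a csv.DictWriter variant can also emit a header row built from the flattened keys. This is only a sketch (the helper name is mine), assuming every record flattens to the same set of keys:

def write_to_csv_with_header(nested_json):
    # Flatten everything up front so the first record's keys can serve as the header
    rows = [flatten_json(item) for item in nested_json]
    with open('dict.csv', 'w', newline='') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=rows[0].keys())
        writer.writeheader()
        writer.writerows(rows)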

Related

How do I unflatten a DataFrame back to JSON/XML format?

I am doing analysis on semi-structured data, and for that I had to flatten both XML and JSON files to a pandas DataFrame. Now that the analysis is done and I have made improvements such as dropping null values and fixing some data errors, I need to generate XML or JSON files (depending on which format the user entered).
This is what I'm using to flatten XML:
import xml.etree.ElementTree as et
from collections import defaultdict
import pandas as pd

def flatten_xml(node, key_prefix=()):
    """
    Walk an XML node, generating tuples of key parts and values.
    """
    # Copy tag content if any
    text = (node.text or '').strip()
    if text:
        yield key_prefix, text
    # Copy attributes
    for attr, value in node.items():
        yield key_prefix + (attr,), value
    # Recurse into children
    for child in node:
        yield from flatten_xml(child, key_prefix + (child.tag,))

def dictify_key_pairs(pairs, key_sep='.'):
    """
    Dictify key pairs from flatten_xml, taking care of duplicate keys.
    """
    out = {}
    # Group by candidate key.
    key_map = defaultdict(list)
    for key_parts, value in pairs:
        key_map[key_sep.join(key_parts)].append(value)
    # Figure out the final dict with suffixes if required.
    for key, values in key_map.items():
        if len(values) == 1:  # No need to suffix keys.
            out[key] = values[0]
        else:  # More than one value for this key.
            for suffix, value in enumerate(values, 1):
                out[f'{key}{key_sep}{suffix}'] = value
    return out

# Parse XML with etree
tree = et.parse('NCT00571389.xml').iter()
# Generate flat rows out of the root nodes in the tree
rows = [dictify_key_pairs(flatten_xml(row)) for row in tree]
df = pd.DataFrame(rows)
and this is what I'm using to flatten JSON:
from collections import defaultdict
import pandas as pd
import json

def flatten_json(nested_json, exclude=['']):
    out = {}
    def flatten(x, name='', exclude=exclude):
        if type(x) is dict:
            for a in x:
                if a not in exclude:
                    flatten(x[a], name + a + '.')
        elif type(x) is list:
            i = 0
            for a in x:
                flatten(a, name + str(i) + '_')
                i += 1
        else:
            out[name[:-1]] = x
    flatten(nested_json)
    return out

f = open('employee_data.json')
this_dict = json.load(f)
df = pd.DataFrame([flatten_json(x) for x in this_dict[list(this_dict.keys())[0]]])
I need to know how to go from a DataFrame back to the original structure of the files. Help please?
Edit:
This is an example of the JSON file I'm using:
{"features": [{"candidate": {"first_name": "Margaret", "last_name": "Mcdonald", "skills": ["skLearn", "Java", "R", "SQL", "Spark", "C++"], "state": "AL", "specialty": "Database", "experience": "Mid", "relocation": "no"}}, {"candidate": {"first_name": "Michael", "last_name": "Carter", "skills": ["TensorFlow", "R", "Spark", "MongoDB", "C++", "SQL"], "state": "AR", "specialty": "Statistics", "experience": "Senior", "relocation": "yes"}}]}
and these are the columns after I flatten them:
candidate.first_name
candidate.last_name
candidate.skills.0
candidate.skills.1
candidate.skills.2
candidate.skills.3
candidate.skills.4
candidate.skills.5
candidate.state
candidate.specialty
candidate.experience
candidate.relocation
candidate.skills.6
candidate.skills.7
candidate.skills.8
Ok, this was not easy and I should have guided you instead of coding it for you, but here is what I've done:
json = {"features": [{"candidate": {"first_name": "Margaret", "last_name": "Mcdonald", "skills": ["skLearn", "Java", "R", "SQL", "Spark", "C++"], "state": "AL", "specialty": "Database", "experience": "Mid", "relocation": "no"}}, {"candidate": {"first_name": "Michael", "last_name": "Carter", "skills": ["TensorFlow", "R", "Spark", "MongoDB", "C++", "SQL"], "state": "AR", "specialty": "Statistics", "experience": "Senior", "relocation": "yes"}}]}
df = pd.DataFrame([flatten_json(x) for x in json[list(json.keys())[0]]])

import re
header = df.columns
print(header)
regex = r'(\w+)\.(\w+)\.?(\d+)?'
m = re.findall(regex, '\n'.join(header))

def make_json(json, feature, pos, value):
    if pos + 1 == len(feature):
        json[feature[pos]] = value
        return json
    elif feature[pos + 1] == '':
        json[feature[pos]] = value
        return json
    elif feature[pos + 1].isdigit():
        if feature[pos + 1] == '0':
            json[feature[pos]] = [value]
            return json
        else:
            json[feature[pos]].append(value)
            return json
    else:
        if feature[pos] not in json:
            json[feature[pos]] = make_json({}, feature, pos + 1, value)
            return json
        else:
            json[feature[pos]] = make_json(json[feature[pos]], feature, pos + 1, value)
            return json

json = {'features': []}
for row in range(len(df)):
    candidate = {}
    for col, feature in enumerate(m):
        candidate = make_json(candidate, feature, 0, df.iloc[row][header[col]])
    json['features'].append(candidate)
print(json)
You see I wanted to make it in a recursive way so it can work for more complex JSON, as long as you define the regex right. For your specific example it could be simpler.
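In that simpler spirit, here is a minimal sketch that splits each flattened column name on '.' instead of using a regex, treats a trailing integer as a list index, and skips NaN cells (columns that belong to other records). It assumes the dotted column names shown above and that list indices appear in increasing column order:

import math

def unflatten_row(row):
    """Rebuild a nested dict from 'a.b.0'-style flattened keys (sketch)."""
    out = {}
    for col, value in row.items():
        # NaN cells mean this column belongs to a different record
        if isinstance(value, float) and math.isnan(value):
            continue
        parts = col.split('.')
        node = out
        for part, nxt in zip(parts[:-1], parts[1:]):
            node = node.setdefault(part, [] if nxt.isdigit() else {})
        if parts[-1].isdigit():
            node.append(value)  # indices arrive in increasing column order
        else:
            node[parts[-1]] = value
    return out

rebuilt = {'features': [unflatten_row(r) for r in df.to_dict(orient='records')]}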

Converting excel spreadsheet to json

I want to convert Excel spreadsheet data to a JSON file. Here is the code I currently have:
Data
excel spreadsheet
Code
import xlrd
from collections import OrderedDict
import json

wb = xlrd.open_workbook('./file1.xlsx')
sh = wb.sheet_by_index(0)
data_list = []
for rownum in range(1, sh.nrows):
    data = OrderedDict()
    row_values = sh.row_values(rownum)
    data['name'] = row_values[0]
    data['description'] = row_values[1]
    data_list.append(data)

data_list = {'columns': data_list}

j = json.dumps(data_list)
with open('seq1.json', 'w') as f:
    f.write(j)
Output
{"columns": [{"name": "FILEID", "description": "FILETYPE"}]}
Expected output
{
    "columns": [
        {
            "name": "fileid",
            "description": "FILEID"
        },
        {
            "name": "filetype",
            "description": "FILETYPE"
        },
        {
            "name": "stusab",
            "description": "STUSAB"
        },
        {
            "name": "chariter",
            "description": "CHARITER"
        },
        {
            "name": "sequence",
            "description": "SEQUENCE"
        },
        {
            "name": "logrecno",
            "description": "LOGRECNO"
        }
    ],
The "name" column should display the first row, while the "description" column should display the second row.
What modification can I make to my function to get the output I am looking for?
You need to iterate over columns, not rows
import xlrd
from collections import OrderedDict
import json

wb = xlrd.open_workbook('./file1.xls')
sh = wb.sheet_by_index(0)
data_list = []
data = OrderedDict()
for colnum in range(0, sh.ncols):
    data['name'] = sh.row_values(0)[colnum]
    data['description'] = sh.row_values(1)[colnum]
    data_list.append(data.copy())

data_list = {'columns': data_list}

j = json.dumps(data_list)
with open('seq1.json', 'w') as f:
    f.write(j)
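The data.copy() matters here: without it, every list entry would reference the same OrderedDict and end up holding the last column's values. An equivalent sketch that builds a fresh dict per column (same sheet layout assumed, using xlrd's cell_value):

data_list = [
    {'name': sh.cell_value(0, colnum), 'description': sh.cell_value(1, colnum)}
    for colnum in range(sh.ncols)
]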
You could give excel2json a try:
import excel2json
excel2json.convert_from_file('file.xlsx')
You can use pandas:
import pandas as pd

df = pd.read_excel('./file1.xlsx')
with open('seq1.json', 'w') as f:
    f.write(df.to_json())
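Note that df.to_json() defaults to orient='columns', which will not produce the expected {"columns": [...]} shape. A rough sketch of getting closer with pandas, assuming the first sheet row holds the names and the second the descriptions:

import pandas as pd
import json

# Read without a header row so both spreadsheet rows stay as data
df = pd.read_excel('./file1.xlsx', header=None)

# Transpose so each spreadsheet column becomes a record, then label the two rows
records = df.T.rename(columns={0: 'name', 1: 'description'}).to_dict(orient='records')

with open('seq1.json', 'w') as f:
    json.dump({'columns': records}, f, indent=4)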

How to append data to a JSON list that doesn't contain any key for that list?

[
    {
        "name": "name one",
        "id": 1
    },
    {
        "name": "name two",
        "id": 2
    }
]
I want to append an object to the list in a .json file. How do I do that?
You could read the existing JSON content, update it, and rewrite the updated list.
import json

with open("myfile.json", "r+") as f:
    my_file = f.read()  # read the current content
    my_list = json.loads(my_file)  # parse the JSON text into a Python list
    dict_obj = {
        "name": "name three",
        "id": 3
    }
    my_list.append(dict_obj)
    f.seek(0)  # set the file position back to the beginning
    f.truncate()  # clear the previous content
    print(f"going to rewrite {my_list}")
    f.write(json.dumps(my_list))  # write the updated version to the file
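If rewriting in place with seek and truncate feels fragile, an alternative sketch is to read the list first and then reopen the file in 'w' mode:

import json

with open("myfile.json") as f:
    my_list = json.load(f)

my_list.append({"name": "name three", "id": 3})

with open("myfile.json", "w") as f:
    json.dump(my_list, f, indent=4)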
I'm not entirely sure of what you are asking but perhaps the code below will help:
const myList = [
  {
    "name": "name one",
    "id": 1
  },
  {
    "name": "name two",
    "id": 2
  }
]

const myNewItem = {
  "name": "name three",
  "id": 3
}

// Only append the new item when no existing item already has its id
const addItemIfDifferentId = (list, newItem) =>
  list.some(({id}) => id === newItem.id) ? list : [...list, {...newItem}]

const newList = addItemIfDifferentId(myList, myNewItem)
newList
Maybe this will help you:
import json

# json.load parses the file straight into a Python object:
with open('data.json') as json_file:
    z = json.load(json_file)  # e.g. {"key1": "123", "key2": "456", "key3": "789"}

# Python dict to be appended
y = {"key4": "101112"}

# appending the data
z.update(y)

# the result serialized back to a JSON string:
print(json.dumps(z))

with open('data.json', 'w') as outfile:
    json.dump(z, outfile)

Need to cut off some unnecessary information from a JSON file and preserve the JSON structure

I have a JSON file
[
    {
        "api_key": "123123112313121321",
        "collaborators_count": 1,
        "created_at": "",
        "custom_event_fields_used": 0,
        "discarded_app_versions": [],
        "discarded_errors": [],
        "errors_url": "https://api.bugsnag.com/projects/1231231231312/errors",
        "events_url": "https://api.bugsnag.com/projects/1231231231213/events",
        "global_grouping": [],
        "html_url": "https://app.bugsnag.com/lol/kek/",
        "id": "34234243224224",
        "ignore_old_browsers": true,
        "ignored_browser_versions": {},
        "is_full_view": true,
        "language": "javascript",
        "location_grouping": [],
        "name": "asdasdaasd",
        "open_error_count": 3,
        "release_stages": [
            "production"
        ],
        "resolve_on_deploy": false,
        "slug": "wqeqweqwwqweq",
        "type": "js",
        "updated_at": "2020-04-06T15:22:10.480Z",
        "url": "https://api.bugsnag.com/projects/12312312213123",
        "url_whitelist": null
    }
]
What I need is to remove all lines apart from "id" and "name" and preserve the JSON structure. Can anybody advise a Python or bash script to handle this?
With jq:
$ jq 'map({id: .id, name: .name})' input.json
[
  {
    "id": "34234243224224",
    "name": "asdasdaasd"
  }
]
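As an aside, jq's object construction has a shorthand when the output key matches the field name, so the same filter can be written as:
$ jq 'map({id, name})' input.json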
Using Python, you could first deserialize the JSON file (a JSON array of objects) with json.load, then pick out the keys you want with a list comprehension:
from json import load

keys = ["name", "id"]
with open("test.json") as json_file:
    data = load(json_file)
filtered_json = [{k: obj.get(k) for k in keys} for obj in data]
print(filtered_json)
Output:
[{'name': 'asdasdaasd', 'id': '34234243224224'}]
If we want to serialize this python list to another output file, we can use json.dump:
from json import load
from json import dump

keys = ["name", "id"]
with open("test.json") as json_file, open("output.json", mode="w") as json_output:
    data = load(json_file)
    filtered_json = [{k: obj.get(k) for k in keys} for obj in data]
    dump(filtered_json, json_output, indent=4, sort_keys=True)
output.json
[
    {
        "id": "34234243224224",
        "name": "asdasdaasd"
    }
]
You can try this:
import json

with open('<input filename>', 'r') as f:
    data = json.load(f)

new_data = []
for item in data:
    new_item = {key: value for key, value in item.items() if key == "id" or key == "name"}
    new_data.append(new_item)

with open('<output filename>', 'w') as f:
    json.dump(new_data, f)
Convert your JSON into a pandas DataFrame:
import pandas as pd

df = pd.read_json('your json variable')
res = df.drop(['url_whitelist', 'api_key'], axis=1)
res.to_json()
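Note that drop only removes the two named columns; to keep nothing but id and name, as the question asks, a sketch like this selects them directly (the indent argument assumes pandas 1.0+, and the file names are placeholders):

import pandas as pd

df = pd.read_json('input.json')
df[['id', 'name']].to_json('output.json', orient='records', indent=4)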

Convert a complex layered JSON to CSV

I am trying to parse through JSON and write the results into a CSV file. The "name" values are supposed to be the column headers and the "value" values are what need to be stored. The CSV writer does not separate the strings with commas: eventIdlistingsvenueperformer. When I try something like header = col['name'] + ',' I get: "eventId","listings","venue","performer", and it isn't read as a CSV file. So my questions are: am I going about this right? And how could I separate the strings with commas?
"results": [
    {
        "columns": [
            {
                "name": "eventId",
                "value": "XXXX",
                "defaultHidden": false
            },
            {
                "name": "listings",
                "value": "8",
                "defaultHidden": false
            },
            {
                "name": "venue",
                "value": "Nationwide Arena",
                "defaultHidden": false
            }]
This is my code:
json_decode = json.loads(data)
report_result = json_decode['results']
with open('testReport2.csv', 'w') as result_data:
    csvwriter = csv.writer(result_data, delimiter=',')
    count = 0
    for res in report_result:
        deeper = res['columns']
        for col in deeper:
            if count == 0:
                header = col['name']
                csvwriter.writerow([header,])
        count += 1
    for written in report_result:
        deeper = res['columns']
        for col in deeper:
            csvwriter.writerow([trouble,])
result_data.close()
Try the code below:
json_decode = json.loads(data)
report_result = json_decode['results']

new_dict = {}
for result in report_result:
    columns = result["columns"]
    for value in columns:
        new_dict[value['name']] = value['value']

with open('testReport2.csv', 'w') as result_data:
    csvwriter = csv.DictWriter(result_data, delimiter=',', fieldnames=new_dict.keys())
    csvwriter.writeheader()
    csvwriter.writerow(new_dict)
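One caveat: new_dict is reused across results, so if results holds several entries only the last one survives. A sketch that writes one CSV row per result (assuming every result carries the same column names):

rows = [
    {col['name']: col['value'] for col in res['columns']}
    for res in json_decode['results']
]

with open('testReport2.csv', 'w', newline='') as result_data:
    csvwriter = csv.DictWriter(result_data, fieldnames=rows[0].keys())
    csvwriter.writeheader()
    csvwriter.writerows(rows)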
Try this:
json_decode = json.loads(data)
report_result = json_decode['results']
with open('testReport2.csv', 'w') as result_data:
    csvwriter = csv.writer(result_data, delimiter=',')
    header = list(report_result[0]['columns'][0].keys())
    csvwriter.writerow(header)
    for written in report_result:
        for row in written['columns']:
            deeper = row.values()
            csvwriter.writerow(deeper)
result_data.close()
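If pandas is already available, the same name-to-value mapping can be done in a few lines; this is just a sketch:

import pandas as pd

rows = [{col['name']: col['value'] for col in res['columns']}
        for res in json_decode['results']]
pd.DataFrame(rows).to_csv('testReport2.csv', index=False)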
