Convert unformatted JSON file to CSV - python

I am trying to convert a JSON file into CSV. The issue is that the JSON file is not formatted uniformly: not every record has the same set of keys.
{
"attributes": {
"type": "Lead",
"url": "xyz"
},
"FirstName": "Bradford",
"LastName": "Cosenza",
"School_District__c": "Ross County",
"Status": "Open",
"CreatedDate": "2022-12-21T16:34:35.000+0000",
"Email": "something#something.com",
"Lead_ID__c": "00Q3b0000212gxh",
"Id": "00Q3b0000212gxhEAA"
},
{
"attributes": {
"type": "Lead",
"url": "xyz"
},
"FirstName": "Bradford",
"LastName": "Cosenza",
"School_District__c": "Ross County",
"Status": "Open",
"CreatedDate": "2020-03-31T23:25:03.000+0000",
"Verification_Status__c": "Invalid",
"Verification_Date__c": "2022-08-05",
"Email": "something#something.com",
"Lead_ID__c": "00Q3b00001t0uNf",
"Id": "00Q3b00001t0uNfEAI"
},
Here is the snippet from the JSON file; note that Verification_Status__c and Verification_Date__c are missing from the 1st entry.
I used this code
import json
import csv
# Load the whole JSON document, then pick out the list stored under "records".
with open('duplicate.json') as source:
    stud_data = json.load(source)['records']

# Write every record positionally with a plain csv.writer.  The header row is
# taken from the keys of the first record only; each row is simply that
# record's own values, in that record's own key order.
data_file = open('data_file.csv', 'w')
csv_writer = csv.writer(data_file)
for position, record in enumerate(stud_data):
    if position == 0:
        csv_writer.writerow(record.keys())
    csv_writer.writerow(record.values())
data_file.close()
but I am getting scrambled data in CSV file

You can use a csv.DictWriter if the keys appear in a different order in different records, or if some keys are missing from some records.
If there are nested objects in the JSON then they need to be flattened to export in the CSV output.
For the DictWriter, you will need the full set of keys when creating it, so you can either create a fixed list at the start or make two passes over the data, where the first pass finds the full set of keys and the second pass creates the CSV file.
import json
import csv
# Sample input, inlined as a string so the example runs without a file: a
# top-level object whose "records" array holds the entries to export.  Only
# the second record carries the Verification_* fields.
data = """{
"records":[
{
"attributes": {
"type": "Lead",
"url": "xyz"
},
"FirstName": "Bradford",
"LastName": "Cosenza",
"School_District__c": "Ross County",
"Status": "Open",
"CreatedDate": "2022-12-21T16:34:35.000+0000",
"Email": "something#something.com",
"Lead_ID__c": "00Q3b0000212gxh",
"Id": "00Q3b0000212gxhEAA"
},
{
"attributes": {
"type": "Lead",
"url": "xyz"
},
"FirstName": "Bradford",
"LastName": "Cosenza",
"School_District__c": "Ross County",
"Status": "Open",
"CreatedDate": "2020-03-31T23:25:03.000+0000",
"Verification_Status__c": "Invalid",
"Verification_Date__c": "2022-08-05",
"Email": "something#something.com",
"Lead_ID__c": "00Q3b00001t0uNf",
"Id": "00Q3b00001t0uNfEAI"
}
]}"""
# Full set of keys in the JSON for the CSV columns.  The list is used as the
# DictWriter fieldnames, so its order is the column order of the output.
# "type" and "url" are the keys found inside the nested "attributes" object.
keys = ["Id",
"FirstName",
"LastName",
"School_District__c",
"Status",
"CreatedDate",
"Verification_Status__c",
"Verification_Date__c",
"Email",
"Lead_ID__c",
"type",
"url"
]
Next convert data to list of dictionary objects
and write output to CSV file.
# Parse the JSON text into Python objects.  json.loads() reads from a string;
# use json.load() instead when reading from an open file.
data = json.loads(data)
stud_data = data['records']

# newline='' leaves line-ending handling to the csv module.
with open('data_file.csv', 'w', newline='') as out_file:
    writer = csv.DictWriter(out_file, fieldnames=keys)
    writer.writeheader()
    for record in stud_data:
        # Lift the entries of the nested "attributes" object up into the
        # record itself so every value sits in a top-level column.
        nested = record.pop("attributes", None)
        if nested:
            record.update(nested)
        # Columns this record has no value for are written as empty cells.
        writer.writerow(record)
Output:
Id,FirstName,LastName,School_District__c,Status,CreatedDate,Verification_Status__c,Verification_Date__c,Email,Lead_ID__c,type,url
00Q3b0000212gxhEAA,Bradford,Cosenza,Ross County,Open,2022-12-21T16:34:35.000+0000,,,something#something.com,00Q3b0000212gxh,Lead,xyz
00Q3b00001t0uNfEAI,Bradford,Cosenza,Ross County,Open,2020-03-31T23:25:03.000+0000,Invalid,2022-08-05,something#something.com,00Q3b00001t0uNf,Lead,xyz

Related

Appending a data type to a json file

I want to append a column name together with the data type of its value to a JSON file. I can't seem to figure out how to get the data type of the value based on the name, and I am not sure how to set data['type'] correctly inside the for-loop.
Excel Spreadsheet
Code
import xlrd
from collections import OrderedDict
import json
wb = xlrd.open_workbook('./file1.xlsx')
sh = wb.sheet_by_index(0)

# One entry per spreadsheet column: row 0 supplies "name" and row 1 supplies
# "description".
columns = []
for col in range(sh.ncols):
    entry = OrderedDict()
    entry['name'] = sh.row_values(0)[col]
    entry['description'] = sh.row_values(1)[col]
    columns.append(entry)

# Wrap the list under a single "columns" key and save it as JSON text.
payload = json.dumps({'columns': columns})
with open('seq1.json', 'w') as f:
    f.write(payload)
Output
{
"columns": [
{
"name": "FILEID",
"description": "FILEID"
},
{
"name": "FILETYPE",
"description": "FILETYPE"
},
}
Expected output
{
"columns": [
{
"name": "fileid",
"description": "FILEID",
"type": "keyword"
},
{
"name": "filetype",
"description": "FILETYPE",
"type": "keyword"
},
}

CSV to Avro with Python: Avro Schema Issue

I am trying to serialise my CSV file into Avro and then iterate through each row and send it to a Kafka consumer. Currently I get an issue where the data being sent through doesn't match my schema, but I am unsure as to why.
below is code to read csv and serialise the rows in it and output to a file with Avro format.
import os, csv, avro.schema
from avro.datafile import DataFileReader, DataFileWriter
from avro.io import DatumReader, DatumWriter
from kafka import KafkaProducer
from collections import namedtuple
# Resolve sibling files relative to this script.  abspath() guards against
# __file__ having no directory part, in which case building the path as
# '<dirname>/<name>' would point at the filesystem root.
_HERE = os.path.dirname(os.path.abspath(__file__))
output_loc = os.path.join(_HERE, 'avro.avro')
CSV = os.path.join(_HERE, 'oscar_age_male.csv')

# Column names of the CSV, in file order; also the field names of each row.
fields = ("Index", "Year", "Age", "Name", "Movie")
csv_record = namedtuple('csv_record', fields)


def read_csv(path):
    """Yield each data row of the CSV file at *path* as a ``csv_record``.

    The first row is treated as a header and skipped.  Values are yielded
    exactly as the csv module returns them, i.e. as strings; no numeric
    conversion happens here.

    Raises:
        TypeError: from ``csv_record._make`` when a row does not have
            exactly ``len(fields)`` columns.
    """
    # newline='' is what the csv module asks for on input files; the legacy
    # 'rU' mode is rejected by open() on Python 3.11 and later.
    with open(path, 'r', newline='') as data:
        reader = csv.reader(data, delimiter=",")
        next(reader, None)  # skip the header row (no-op on an empty file)
        for row in map(csv_record._make, reader):
            print(row)  # trace each parsed row
            yield row
def parse_schema(path='{}/schema.avsc'.format(os.path.dirname(__file__))):
    """Read the Avro schema file at *path* and return the parsed schema."""
    with open(path, 'r') as schema_file:
        schema_text = schema_file.read()
        return avro.schema.parse(schema_text)
def serilialise_records(records, outpath=output_loc):
    """Write *records* to an Avro container file at *outpath*.

    Args:
        records: iterable of namedtuple rows (objects exposing ``_fields``).
        outpath: destination file; defaults to ``output_loc``.

    Each record is converted to a plain dict keyed by field name and appended
    under the schema returned by ``parse_schema()``.  Values are passed
    through unchanged, so they must already have the types the schema
    declares.

    Raises:
        avro.io.AvroTypeException: when a record does not match the schema.
    """
    schema = parse_schema()
    # Avro container files are binary, so the file is opened in 'wb'.
    with open(outpath, 'wb') as out:
        writer = DataFileWriter(out, DatumWriter(), schema)
        try:
            for record in records:
                writer.append({f: getattr(record, f) for f in record._fields})
        finally:
            # close() flushes the writer's buffered records to the file;
            # without it appended records may never be written out.
            writer.close()


serilialise_records(read_csv(CSV))
and here is the error I receive:
raise AvroTypeException(self.writers_schema, datum)
avro.io.AvroTypeException: The datum {'Index': '1', 'Year': '1928', 'Age': '44', 'Name': ' "Emil Jannings"', 'Movie': ' "The Last Command The Way of All Flesh"'} is not an example of the schema {
"type": "record",
"name": "Test",
"namespace": "avro_schema_test",
"fields": [
{
"type": "int",
"name": "Index"
},
{
"type": "int",
"name": "Year"
},
{
"type": "int",
"name": "Age"
},
{
"type": "string",
"name": "Name"
},
{
"type": "string",
"name": "Movie"
}
]
}
my Avro schema is:
{
"type": "record",
"namespace": "avro_schema_test",
"name": "Test",
"fields": [
{"name": "Index", "type": "int"},
{"name": "Year", "type": "int"},
{"name": "Age", "type": "int"},
{"name": "Name", "type": "string"},
{"name": "Movie", "type": "string"}
]
}
Once issue is resolved I will iterate through my avro file and send records to Kafka.

CSV file to JSON for nested array generic template using python (for csv to mongodb insert)

I want to create the JSON file from CSV file using the generic python script.
I found the hone package on GitHub, but some of the functionality I need is missing from that code.
csv to json
I want to code like generic template CSV to JSON.
[
{
"birth": {
"day": "7",
"month": "May",
"year": "1985"
},
"name": "Bob",
"reference": "TRUE",
"reference name": "Smith"
}
]
Only handled above type of JSON only.
[
{
"Type": "AwsEc2Instance",
"Id": "i-cafebabe",
"Partition": "aws",
"Region": "us-west-2",
"Tags": {
"billingCode": "Lotus-1-2-3",
"needsPatching": "true"
},
"Details": {
"AwsEc2Instance": {
"Type": "i3.xlarge",
"ImageId": "ami-abcd1234",
"IpV4Addresses": [ "54.194.252.215", "192.168.1.88" ],
"IpV6Addresses": [ "2001:db812341a2b::123" ],
"KeyName": "my_keypair",
"VpcId": "vpc-11112222",
"SubnetId": "subnet-56f5f633",
"LaunchedAt": "2018-05-08T16:46:19.000Z"
}
}
}
]
I want to handle nested array[] ,{}
I have done something like this before and below code can be modified as I have not seen your dataset.
# NOTE(review): the indentation of this snippet was lost; the nesting of the
# loops and conditionals below must be restored before it can run.
# NOTE(review): `pd`, `json` and `df_to_Save` are not defined in this snippet
# -- presumably pandas, the json module and a pre-built DataFrame that has
# 'Selected_Prediction' and 'Value_to_Save' columns; confirm.
# Load the spreadsheet, taking column labels from the first row (header=0).
# NOTE(review): recent pandas releases may reject the `encoding` argument to
# read_excel -- verify against the installed version.
dataframe = pd.read_excel('dataframefilepath', encoding='utf-8', header=0)
'''Adding to list to finally save it as JSON'''
# Accumulates one exported structure per processed column.
df = []
# NOTE(review): DataFrame.iteritems() was removed in pandas 2.0; items() is
# the replacement -- verify against the installed version.
for (columnName, columnData) in dataframe.iteritems():
# Skip the first column (position 0).
if dataframe.columns.get_loc(columnName) > 0:
# For every input row, find the rows of df_to_Save with the same
# Selected_Prediction and copy this column's value into 'Value_to_Save'.
for indata, rwdata in dataframe.iterrows():
for insav, rwsave in df_to_Save.iterrows():
if rwdata.Selected_Prediction == rwsave.Selected_Prediction:
#print()
df_to_Save.loc[insav, 'Value_to_Save'] = rwdata[dataframe.columns.get_loc(columnName)]
#print(rwdata[dataframe.columns.get_loc(columnName)])
# NOTE(review): the orient is spelled 'record' here; pandas documents it as
# 'records' and newer releases may reject the short form -- verify.
df.append(df_to_Save.set_index('Selected_Prediction').T.to_dict('record'))
# NOTE(review): df is a list at this point, and eval() only accepts a string,
# bytes or code object, so this line looks like it raises TypeError.  eval()
# must also never be used on untrusted input.
df = eval(df)
'''Saving in JSON format'''
# Placeholder output path; replace with a real file path.
path_to_save = '\\your path'
with open(path_to_save, 'w') as json_file:
json.dump(df, json_file)

Parsing JSON data if a key value is matched and print a key value in Python

I am very much new to JSON parsing. Below is my JSON:
[
{
"description": "Newton",
"exam_code": {
"date_added": "2015-05-13T04:49:54+00:00",
"description": "Production",
"exam_tags": [
{
"date_added": "2012-01-13T03:39:17+00:00",
"descriptive_name": "Production v0.1",
"id": 1,
"max_count": "147",
"name": "Production"
}
],
"id": 1,
"name": "Production",
"prefix": "SA"
},
"name": "CM"
},
{
"description": "Opera",
"exam_code": {
"date_added": "2015-05-13T04:49:54+00:00",
"description": "Production",
"test_tags": [
{
"date_added": "2012-02-22T12:44:55+00:00",
"descriptive_name": "Production v0.1",
"id": 1,
"max_count": "147",
"name": "Production"
}
],
"id": 1,
"name": "Production",
"prefix": "SA"
},
"name": "OS"
}
]
Here I am trying to find if name value is CM print description value.
If name value is OS then print description value.
Please help me to understand how JSON parsing can be done.
Considering you have already read the JSON string from somewhere, be it a file, stdin, or any other source.
You can actually deserialize it into a Python object by doing:
import json
# ...
json_data = json.loads(json_str)
Where json_str is the JSON string that you want to parse.
In your case, json_str will get deserialized into a Python list, so you can do any operation on it as you'd normally do with a list.
Of course, this includes iterating over the elements:
# Print the description of every entry whose name is either CM or OS.
targets = ('CM', 'OS')
matches = (entry for entry in json_data if entry.get('name') in targets)
for entry in matches:
    print(entry['description'])
As you can see, the items in json_data have been deserialized into dict, so you can access the actual fields using dict operations.
Note
You can also deserialize a JSON from the source directly, provided you have access to the file handler/descriptor or stream:
# Loading from a file
import json
# json.load (no trailing "s") parses straight from the open file object,
# whereas json.loads expects the JSON text as a string.
with open('my_json.json') as source:
    json_data = json.load(source)
# Loading from stdin
import json, sys
json_data = json.load(sys.stdin)

Parsing JSON to CSV using Python: AttributeError: 'unicode' object has no attribute 'keys'

I have a nested JSON dataset containing multiple entries which look like this:
{
"coordinates": null,
"acoustic_features": {
"instrumentalness": "0.00479",
"liveness": "0.18",
"speechiness": "0.0294",
"danceability": "0.634",
"valence": "0.342",
"loudness": "-8.345",
"tempo": "125.044",
"acousticness": "0.00035",
"energy": "0.697",
"mode": "1",
"key": "6"
},
"artist_id": "b2980c722a1ace7a30303718ce5491d8",
"place": null,
"geo": null,
"tweet_lang": "en",
"source": "Share.Radionomy.com",
"track_title": "8eeZ",
"track_id": "cd52b3e5b51da29e5893dba82a418a4b",
"artist_name": "Dominion",
"entities": {
"hashtags": [{
"text": "nowplaying",
"indices": [0, 11]
}, {
"text": "goth",
"indices": [51, 56]
}, {
"text": "deathrock",
"indices": [57, 67]
}, {
"text": "postpunk",
"indices": [68, 77]
}],
"symbols": [],
"user_mentions": [],
"urls": [{
"indices": [28, 50],
"expanded_url": "cathedral13.com/blog13",
"display_url": "cathedral13.com/blog13",
"url": "t.co/Tatf4hEVkv"
}]
},
"created_at": "2014-01-01 05:54:21",
"text": "#nowplaying Dominion - 8eeZ Tatf4hEVkv #goth #deathrock #postpunk",
"user": {
"location": "middle of nowhere",
"lang": "en",
"time_zone": "Central Time (US & Canada)",
"name": "Cathedral 13",
"entities": null,
"id": 81496937,
"description": "I\u2019m a music junkie who is currently responsible for Cathedral 13 internet radio (goth, deathrock, post-punk)which has been online since 06/20/02."
},
"id": 418243774842929150
}
I want to convert it into a csv file in which there are multiple columns containing the respective entries for each JSON object. The following is the Python code I have written to do it:
import json
import csv
from pprint import pprint
# The input is read line by line, each line parsed as its own JSON document.
with open('data_subset.json') as data_file:
    data = [json.loads(line) for line in data_file]

# Plain positional CSV writer on a freshly opened output file.
data_csv = open('Data_csv.csv', 'w')
csvwriter = csv.writer(data_csv)

# Entries 1 through 9 of the parsed list.
for i in range(1, 10):
    # Iterating data[i] walks the members of that entry; the first member is
    # asked for its keys to form a header row.
    for position, dat in enumerate(data[i]):
        if position == 0:
            csvwriter.writerow(dat.keys())
        # NOTE(review): `emp` is not defined anywhere in this snippet.
        csvwriter.writerow(emp.values())
data_csv.close()
On running the above code,I get the error: AttributeError: 'unicode' object has no attribute 'keys'.
What could be the problem?
You can read the JSON file all at once like this:
# Parse the entire file as one JSON document.
with open('a.txt') as source:
    data = json.load(source)
Now you have the JSON as the data dictionary.
Since you want specific entries from the JSON to csv (e.g. entities is not saved to csv) you can keep a custom column header and then loop over the data to write the particular keys to the csv:
# Example to save the artist_id and user id; can be extended for the actual data
header = ['artist_id', 'id']

# The with-block closes the output file even when a record is missing one of
# the keys and the loop raises part-way through.
# 'wb' is the mode the Python 2 csv module expects; on Python 3 open the file
# with mode 'w' and newline='' instead.
with open('Data_csv.csv', 'wb') as data_csv:
    csvwriter = csv.writer(data_csv)
    # write the csv header
    csvwriter.writerow(header)
    for entry in data:
        # The "id" column comes from the nested "user" object of each entry.
        csvwriter.writerow([entry['artist_id'], entry['user']['id']])

Categories