JSON to CSV using Python - python

I'm trying to convert JSON to CSV, but the data has a **"header"**. With my current knowledge I can't convert it into CSV, because I don't know how to handle the "headers":
`
{
"__metadata": {
"uri": "http://ip:port/vvv/v1/folders?page=1&pagesize=50"
},
"first": {
"__deferred": {
"uri": "http://ip:port/vvv/v1/folders?page=1&pagesize=50"
}
},
"last": {
"__deferred": {
"uri": "http://ip:port/vvv/v1/folders?page=1&pagesize=50"
}
},
"entries": [
`
And the rest of code looks like this:
`
{
"__metadata": {
"uri": "http://ip:port/vvv/v1/folders/13483"
},
"cuid": "AfbTJW3iTE1MkiLULzA6P58",
"name": "Foldername1",
"description": "",
"id": "13483",
"type": "Folder",
"ownerid": "12",
"updated": "Wed Mar 01 09:14:23 CET 2017"
},
{
"__metadata": {
"uri": "http://ip:port/vvv/v1/folders/523"
},
"cuid": "AS1oZEJAynpNjZIaZK2rc7g",
"name": "foldername2",
"description": "",
"id": "523",
"type": "Folder",
"ownerid": "10",
"updated": "Wed Jan 18 00:11:06 CET 2017"
},
{
"__metadata": {
"uri": "http://ip:port/vvv/v1/folders/5356"
},
"cuid": "AeN4lEu0h_tAtnPEjFYxwi8",
"name": "foldername",
"description": "",
"id": "5356",
"type": "Folder",
"ownerid": "12",
"updated": "Fri Feb 10 17:28:53 CET 2017"
}
]
}
`
How can I convert the above data into CSV? How can I deal with the "header"?

Python's json and csv libraries should handle this for you. Just load the json data in and access the entries tag directly. From there you can enumerate all the data and write it to a csv file.
This example shows how to also write all of the data in dataprovider before writing the expression list:
import json
import csv

# Sample JSON payload, kept inline so the example is self-contained.
data = """{
"dataprovider": {
"id": "DP0",
"name": "Query 1",
"dataSourceId": "5430",
"dataSourcePrefix": "DS0",
"dataSourceType": "unv",
"updated": "2010-12-03T13:07:43.000Z",
"duration": 1,
"isPartial": "false",
"rowCount": 1016,
"flowCount": 1,
"dictionary": {
"expression": [{
"#dataType": "String",
"#qualification": "Dimension",
"id": "DP0.DOa5",
"name": "Lines",
"description": "Product line. Each line contains a set of categories.",
"dataSourceObjectId": "DS0.DOa5",
"formulaLanguageId": "[Lines]"
},
{
"#dataType": "Numeric",
"#qualification": "Measure",
"#highPrecision": "false",
"id": "DP0.DO93",
"name": "Sales revenue",
"description": "Sales revenue $ - $ revenue of SKU sold",
"dataSourceObjectId": "DS0.DO93",
"formulaLanguageId": "[Sales revenue]",
"aggregationFunction": "Sum"
}]
},
"query": "SELECT ... FROM ... WHERE"
}
}
"""
my_json = json.loads(data)

# The tabular records live under dataprovider -> dictionary -> expression.
entries = my_json['dataprovider']['dictionary']['expression']

# Header/metadata rows: every top-level key of "dataprovider" except the
# nested "dictionary" block.  NOTE: in Python 3, dict.keys() returns a
# view with no .remove() method, so build a filtered list instead of
# mutating the view (the original code was Python 2 only).
header_1 = [k for k in my_json['dataprovider'] if k != "dictionary"]
data_1 = [(k, str(my_json['dataprovider'][k])) for k in header_1]

# Column names for the per-entry table, sorted for a stable order; entries
# missing a column are padded with '' below.
header_2 = sorted(entries[0].keys())

# Python 3: csv.writer needs a text-mode file opened with newline=''
# (the csv module supplies its own line endings); mode 'wb' was the
# Python 2 idiom and raises TypeError here.
with open('output.csv', 'w', newline='') as f_output:
    csv_output = csv.writer(f_output)
    # Write initial header information
    csv_output.writerows(data_1)
    # Write an empty row
    csv_output.writerow([])
    # Write list information
    csv_output.writerow(header_2)
    for entry in entries:
        # Collapse embedded newlines so each record stays on one CSV row.
        csv_output.writerow([' '.join(str(entry.get(col, '')).splitlines())
                             for col in header_2])
The CSV file would then look something like:
updated,2010-12-03T13:07:43.000Z
name,Query 1
dataSourceType,unv
rowCount,1016
isPartial,false
dataSourceId,5430
query,SELECT ... FROM ... WHERE
duration,1
flowCount,1
dataSourcePrefix,DS0
id,DP0
#dataType,#qualification,dataSourceObjectId,description,formulaLanguageId,id,name
String,Dimension,DS0.DOa5,Product line. Each line contains a set of categories.,[Lines],DP0.DOa5,Lines
Numeric,Measure,DS0.DO93,Sales revenue $ - $ revenue of SKU sold,[Sales revenue],DP0.DO93,Sales revenue
If you are getting different JSON, you need to manually decide which part to extract, for example:
entries = my_json['documents']['document']

Related

Python: Convert json with extra data error into CSV

I have a JSON in below format which I receive from a different team and not allowed to make any changes to it:
{
"content": [
{
"id": "5603bbaae412390b73f0c7f",
"name": "ABC",
"description": "Test",
"rsid": "pwcs",
"type": "project",
"owner": {
"id": 529932
},
"created": "2015-09-24T09:00:26Z"
},
{
"id": "56094673e4b0a7e17e310b83",
"name": "secores",
"description": "Panel",
"rsid": "pwce",
"type": "project",
"owner": {
"id": 520902
},
"created": "2015-09-28T13:53:55Z"
}
],
"totalPages": 9,
"totalElements": 8592,
"number": 0,
"numberOfElements": 1000,
"firstPage": true,
"lastPage": false,
"sort": null,
"size": 1000
}
{
"content": [
{
"id": "5bf2cc64d977553780706050",
"name": "Services Report",
"description": "",
"rsid": "pcie",
"type": "project",
"owner": {
"id": 518013
},
"created": "2018-11-19T14:44:52Z"
},
{
"id": "5bf2d56e40b39312e3e167d0",
"name": "Standard form",
"description": "",
"rsid": "wcu",
"type": "project",
"owner": {
"id": 521114
},
"created": "2018-11-19T15:23:26Z"
}
],
"totalPages": 9,
"totalElements": 8592,
"number": 1,
"numberOfElements": 1000,
"firstPage": false,
"lastPage": false,
"sort": null,
"size": 1000
}
{
"content": [
{
"id": "5d95e7d6187c6d6376fd1bad",
"name": "New Project",
"description": "",
"rsid": "pcinforrod",
"type": "project",
"owner": {
"id": 200904228
},
"created": "2019-10-03T12:21:42Z"
},
{
"id": "5d95fc6e56d2e82519629b96",
"name": "Demo - 10/03",
"description": "",
"rsid": "sitedev",
"type": "project",
"owner": {
"id": 20001494
},
"created": "2019-10-03T13:49:34Z"
}
],
"totalPages": 9,
"totalElements": 8592,
"number": 2,
"numberOfElements": 1000,
"firstPage": false,
"lastPage": false,
"sort": null,
"size": 1000
}
I am trying to convert it into CSV using below code:
import csv
import json
# NOTE(review): the source file concatenates multiple top-level JSON
# documents (see the three objects above).  json.load() parses the first
# one and then raises json.decoder.JSONDecodeError("Extra data") when it
# reaches the second -- exactly the traceback reported below.
# Also: "C:\python\SampleJSON.json" only works because \p and \S are not
# escape sequences; a raw string r"C:\python\SampleJSON.json" is safer.
with open("C:\python\SampleJSON.json",'rb') as file:
data = json.load(file)
fname = "workspaceExcelDemo.csv"
with open(fname,"w", encoding="utf-8", newline='') as file:
csv_file = csv.writer(file)
# Fixed CSV header row.
csv_file.writerow(["id","name","rsid"])
# Even if parsing succeeded, only the first document's "content" list
# would be exported here.
for item in data["content"]:
csv_file.writerow([item['id'],item['name'],item['rsid']])
However I am getting below error message while executing the above piece of code:
raise JSONDecodeError("Extra data", s, end)
json.decoder.JSONDecodeError: Extra data: line 35 column 1 (char 937)
How do I convert the above JSON into CSV without making any changes to the JSON file?
If I understand your question and the comments well you could use the json.dumps method:
import csv
import json

with open("C:\python\SampleJSON.json", encoding="utf-8") as file:
    raw = file.read()

# The file holds several JSON documents concatenated back to back, which is
# why a single json.load() fails with "Extra data".  Parsing line by line
# only works for JSON Lines input; the documents here are pretty-printed
# across many lines, so use JSONDecoder.raw_decode, which parses one
# document at a time and reports the index where it stopped.
decoder = json.JSONDecoder()
data = []
pos = 0
while pos < len(raw):
    # Skip whitespace between documents.
    while pos < len(raw) and raw[pos].isspace():
        pos += 1
    if pos >= len(raw):
        break
    doc, pos = decoder.raw_decode(raw, pos)
    data.append(doc)

# NOTE(review): the original answer also did
#     data = json.loads(json.dumps(data).replace("=", ""))
# which strips '=' from EVERY value (ids, names, ...) and can corrupt the
# data; it has been dropped.  Re-add a targeted cleanup if some specific
# field really needs it.

fname = "workspaceExcelDemo.csv"
with open(fname, "w", encoding="utf-8", newline='') as file:
    csv_file = csv.writer(file)
    csv_file.writerow(["id", "name", "rsid"])
    # Walk every parsed document, not just data[0], so every page of
    # results lands in the CSV.
    for doc in data:
        for item in doc["content"]:
            csv_file.writerow([item['id'], item['name'], item['rsid']])

Ignore specific JSON keys when extracting data in Python

I'm extracting certain keys in several JSON files and then converting it to a CSV in Python. I'm able to define a key list when I run my code and get the information I need.
However, there are certain sub-keys that I want to ignore from the JSON file. For example, if we look at the following snippet:
JSON Sample
[
{
"callId": "abc123",
"errorCode": 0,
"apiVersion": 2,
"statusCode": 200,
"statusReason": "OK",
"time": "2020-12-14T12:00:32.744Z",
"registeredTimestamp": 1417731582000,
"UID": "_guid_abc123==",
"created": "2014-12-04T22:19:42.894Z",
"createdTimestamp": 1417731582000,
"data": {},
"preferences": {},
"emails": {
"verified": [],
"unverified": []
},
"identities": [
{
"provider": "facebook",
"providerUID": "123",
"allowsLogin": true,
"isLoginIdentity": true,
"isExpiredSession": true,
"lastUpdated": "2014-12-04T22:26:37.002Z",
"lastUpdatedTimestamp": 1417731997002,
"oldestDataUpdated": "2014-12-04T22:26:37.002Z",
"oldestDataUpdatedTimestamp": 1417731997002,
"firstName": "John",
"lastName": "Doe",
"nickname": "John Doe",
"profileURL": "https://www.facebook.com/John.Doe",
"age": 50,
"birthDay": 31,
"birthMonth": 12,
"birthYear": 1969,
"city": "City, State",
"education": [
{
"school": "High School Name",
"schoolType": "High School",
"degree": null,
"startYear": 0,
"fieldOfStudy": null,
"endYear": 0
}
],
"educationLevel": "High School",
"favorites": {
"music": [
{
"name": "Music 1",
"id": "123",
"category": "Musician/band"
},
{
"name": "Music 2",
"id": "123",
"category": "Musician/band"
}
],
"movies": [
{
"name": "Movie 1",
"id": "123",
"category": "Movie"
},
{
"name": "Movie 2",
"id": "123",
"category": "Movie"
}
],
"television": [
{
"name": "TV 1",
"id": "123",
"category": "Tv show"
}
]
},
"followersCount": 0,
"gender": "m",
"hometown": "City, State",
"languages": "English",
"likes": [
{
"name": "Like 1",
"id": "123",
"time": "2014-10-31T23:52:53.0000000Z",
"category": "TV",
"timestamp": "1414799573"
},
{
"name": "Like 2",
"id": "123",
"time": "2014-09-16T08:11:35.0000000Z",
"category": "Music",
"timestamp": "1410855095"
}
],
"locale": "en_US",
"name": "John Doe",
"photoURL": "https://graph.facebook.com/123/picture?type=large",
"timezone": "-8",
"thumbnailURL": "https://graph.facebook.com/123/picture?type=square",
"username": "john.doe",
"verified": "true",
"work": [
{
"companyID": null,
"isCurrent": null,
"endDate": null,
"company": "Company Name",
"industry": null,
"title": "Company Title",
"companySize": null,
"startDate": "2010-12-31T00:00:00"
}
]
}
],
"isActive": true,
"isLockedOut": false,
"isRegistered": true,
"isVerified": false,
"lastLogin": "2014-12-04T22:26:33.002Z",
"lastLoginTimestamp": 1417731993000,
"lastUpdated": "2014-12-04T22:19:42.769Z",
"lastUpdatedTimestamp": 1417731582769,
"loginProvider": "facebook",
"loginIDs": {
"emails": [],
"unverifiedEmails": []
},
"rbaPolicy": {
"riskPolicyLocked": false
},
"oldestDataUpdated": "2014-12-04T22:19:42.894Z",
"oldestDataUpdatedTimestamp": 1417731582894,
"registered": "2014-12-04T22:19:42.956Z",
"regSource": "",
"socialProviders": "facebook"
}
]
I want to extract data from created and identities but ignore identities.favorites and identities.likes as well as their data underneath it.
This is what I have so far, below. I defined the JSON keys that I want to extract in the key_list variable:
Current Code
# Requires the third-party packages pandas and flatten_json.
import json, pandas
from flatten_json import flatten
# Enter the path to the JSON and the filename without appending '.json'
file_path = r'C:\Path\To\file_name'
# Open and load the JSON file
# NOTE(review): the handle returned by open() is never closed; a `with`
# block would be safer.
json_list = json.load(open(file_path + '.json', 'r', encoding='utf-8', errors='ignore'))
# Extract data from the defined key names
key_list = ['created', 'identities']
# Keep only the wanted top-level keys from every record (raises KeyError
# if a record lacks one of them).
json_list = [{k:d[k] for k in key_list} for d in json_list]
# Flatten and convert to a data frame
# flatten() expands nested dicts/lists into dot-separated column names,
# e.g. identities.0.provider.
json_list_flattened = (flatten(d, '.') for d in json_list)
df = pandas.DataFrame(json_list_flattened)
# Export to CSV in the same directory with the original file name
export_csv = df.to_csv (file_path + r'.csv', sep=',', encoding='utf-8', index=None, header=True)
Similar to the key_list, I suspect that I would make an ignore list and factor that in the json_list for loop that I have? Something like:
key_ignore = ['identities.favorites', 'identities.likes']`
Then utilize the dict.pop() which looks like it will remove the unwanted sub-keys if it matches? Just not sure how to implement that correctly.
Expected Output
As a result, the code should extract data from the defined keys in key_list and ignore the sub keys defined in key_ignore, which is identities.favorites and identities.likes. Then the rest of the code will continue to convert it into a CSV:
created
identities.0.provider
identities.0.providerUID
identities...
2014-12-04T19:23:05.191Z
site
cb8168b0cf734b70ad541f0132763761
...
If the keys are always there, you can use
del d[0]['identities'][0]['likes']
del d[0]['identities'][0]['favorites']
or if you want to remove the columns from the dataframe after reading all the json data in you can use
df.drop(df.filter(regex='identities.0.favorites|identities.0.likes').columns, axis=1, inplace=True)

How to read fields without numeric index in JSON

I have a JSON file that I need to read in a structured way to insert each value into its respective database column, but in the "customFields" tag the fields change index. For example, "Tribe / Customer" can be index 0 (row['customFields'][0]) in one JSON block and index 3 (row['customFields'][3]) in another, so I tried to read the data by field name with row['customFields']['Tribe / Customer'], but I got the error below:
TypeError: list indices must be integers or slices, not str
Script:
def getCustomField(ModelData):
# NOTE(review): row['customFields'] is a LIST of {"name": ..., "value": ...}
# dicts, so indexing it with a string key raises the reported
# TypeError: list indices must be integers or slices, not str.
for row in ModelData["data"]["squads"][0]["cards"]:
print(row['identifier'],
row['customFields']['Tribe / Customer'],
row['customFields']['Stopped with'],
row['customFields']['Sub-Activity'],
row['customFields']['Activity'],
row['customFields']['Complexity'],
row['customFields']['Effort'])
if __name__ == "__main__":
f = open('test.json')
json_file = json.load(f)
getCustomField(json_file)
JSON:
{
"data": {
"squads": [
{
"name": "TESTE",
"cards": [
{
"identifier": "0102",
"title": "TESTE",
"description": " TESTE ",
"status": "on_track",
"priority": null,
"assignees": [
{
"fullname": "TESTE",
"email": "TESTE"
}
],
"createdAt": "2020-04-16T15:00:31-03:00",
"secondaryLabel": null,
"primaryLabels": [
"TESTE",
"TESTE"
],
"swimlane": "TESTE",
"workstate": "Active",
"customFields": [
{
"name": "Tribe / Customer",
"value": "TESTE 1"
},
{
"name": "Checkpoint",
"value": "GNN"
},
{
"name": "Stopped with",
"value": null
},
{
"name": "Sub-Activity",
"value": "DEPLOY"
},
{
"name": "Activity",
"value": "TOOL"
},
{
"name": "Complexity",
"value": "HIGH"
},
{
"name": "Effort",
"value": "20"
}
]
},
{
"identifier": "0103",
"title": "TESTE",
"description": " TESTE ",
"status": "on_track",
"priority": null,
"assignees": [
{
"fullname": "TESTE",
"email": "TESTE"
}
],
"createdAt": "2020-04-16T15:00:31-03:00",
"secondaryLabel": null,
"primaryLabels": [
"TESTE",
"TESTE"
],
"swimlane": "TESTE",
"workstate": "Active",
"customFields": [
{
"name": "Tribe / Customer",
"value": "TESTE 1"
},
{
"name": "Stopped with",
"value": null
},
{
"name": "Checkpoint",
"value": "GNN"
},
{
"name": "Sub-Activity",
"value": "DEPLOY"
},
{
"name": "Activity",
"value": "TOOL"
},
{
"name": "Complexity",
"value": "HIGH"
},
{
"name": "Effort",
"value": "20"
}
]
}
]
}
]
}
}
You'll have to parse the list of custom fields into something you can access by name. Since you're accessing multiple entries from the same list, a dictionary is the most appropriate choice.
# Build a name -> value lookup once per card; the customFields list may
# order its entries differently from card to card, so access them by name
# instead of by position.
for row in ModelData["data"]["squads"][0]["cards"]:
custom_fields_dict = {field['name']: field['value'] for field in row['customFields']}
print(row['identifier'],
custom_fields_dict['Tribe / Customer'],
...
)
If you only wanted a single field you could traverse the list looking for a match, but it would be less efficient to do that repeatedly.
I'm skipping over dealing with missing fields - you'd probably want to use get('Tribe / Customer', some_reasonable_default) if there's any possibility of the field not being present in the json list.

Accessing nested value in loop in json using python

I want to fetch the value of each api3 in this json object where each array has api3 value.
{
"count": 10,
"result": [
{
"type": "year",
"year": {
"month": {
"api1": {
"href": "https://Ap1.com"
},
"api2": {
"href": "FETCH-CONTENT"
},
"api3": {
"href": "https://Ap3.com"
},
"api4": {
"href": "https://Ap4.com"
}
},
"id": "sdvnkjsnvj",
"summary": "summeryc",
"type": "REST",
"apiId": "mlksmfmksdfs",
"idProvider": {
"id": "sfsmkfmskf",
"name": "Apikey"
},
"tags": []
}
},
{
"type": "year1",
"year": {
"month": {
"api1": {
"href": "https://Ap11.com"
},
"api2": {
"href": "FETCH-CONTENT-1"
},
"api3": {
"href": "https://Ap13.com"
},
"api4": {
"href": "https://Ap14.com"
}
},
"id": "sdvnkjsnvj",
"summary": "summeryc",
"type": "REST",
"apiId": "mlksmfmksdfs",
"idProvider": {
"id": "sfsmkfmskf",
"name": "Apikey"
},
"tags": []
}
},
I am able to get the whole json object and first value inside it.
with open('C:\python\examplee.json','r+') as fr:
data = json.load(fr)
print(data["result"])
Thank you in advance for helping me figure this out.
For each element in list of result key, get the value for the nested dictionary within item
print([item['year']['month']['api3'] for item in data['result']])
The output will be [{'href': 'https://Ap3.com'}, {'href': 'https://Ap13.com'}]
Or if you want to get the href value as well
print([item['year']['month']['api3']['href'] for item in data['result']])
The output will be
['https://Ap3.com', 'https://Ap13.com']
So your whole code will look like
# Load the JSON file and pull the api3 href out of every result entry.
data = {}
with open('C:\python\examplee.json','r+') as fr:
    data = json.load(fr)
# Fix: the parsed object is bound to `data`; the original printed from an
# undefined name `dct`, which raises NameError.
print([item['year']['month']['api3']['href'] for item in data['result']])
Looks like your JSON schema is static so you can just use this:
print([x['year']['month']['api3']['href'] for x in data['result']])
will return you:
['https://Ap3.com', 'https://Ap13.com']

Flatten nested json to csv with nested column names

I have rather very weird requirement now. I have below json and somehow I have to convert it into flat csv.
[
{
"authorizationQualifier": "SDA",
"authorizationInformation": " ",
"securityQualifier": "ASD",
"securityInformation": " ",
"senderQualifier": "ASDAD",
"senderId": "FADA ",
"receiverQualifier": "ADSAS",
"receiverId": "ADAD ",
"date": "140101",
"time": "0730",
"standardsId": null,
"version": "00501",
"interchangeControlNumber": "123456789",
"acknowledgmentRequested": "0",
"testIndicator": "T",
"functionalGroups": [
{
"functionalIdentifierCode": "ADSAD",
"applicationSenderCode": "ASDAD",
"applicationReceiverCode": "ADSADS",
"date": "20140101",
"time": "07294900",
"groupControlNumber": "123456789",
"responsibleAgencyCode": "X",
"version": "005010X221A1",
"transactions": [
{
"name": "ASDADAD",
"transactionSetIdentifierCode": "adADS",
"transactionSetControlNumber": "123456789",
"implementationConventionReference": null,
"segments": [
{
"BPR03": "ad",
"BPR14": "QWQWDQ",
"BPR02": "1.57",
"BPR13": "23223",
"BPR01": "sad",
"BPR12": "56",
"BPR10": "32424",
"BPR09": "12313",
"BPR08": "DA",
"BPR07": "123456789",
"BPR06": "12313",
"BPR05": "ASDADSAD",
"BPR16": "21313",
"BPR04": "SDADSAS",
"BPR15": "11212",
"id": "aDSASD"
},
{
"TRN02": "2424",
"TRN03": "35435345",
"TRN01": "3435345",
"id": "FSDF"
},
{
"REF02": "fdsffs",
"REF01": "sfsfs",
"id": "fsfdsfd"
},
{
"DTM02": "2432424",
"id": "sfsfd",
"DTM01": "234243"
}
],
"loops": [
{
"id": "24324234234",
"segments": [
{
"N101": "sfsfsdf",
"N102": "sfsf",
"id": "dgfdgf"
},
{
"N301": "sfdssfdsfsf",
"N302": "effdssf",
"id": "fdssf"
},
{
"N401": "sdffssf",
"id": "sfds",
"N402": "sfdsf",
"N403": "23424"
},
{
"PER06": "Wsfsfdsfsf",
"PER05": "sfsf",
"PER04": "23424",
"PER03": "fdfbvcb",
"PER02": "Pedsdsf",
"PER01": "sfsfsf",
"id": "fdsdf"
}
]
},
{
"id": "2342",
"segments": [
{
"N101": "sdfsfds",
"N102": "vcbvcb",
"N103": "dsfsdfs",
"N104": "343443",
"id": "fdgfdg"
},
{
"N401": "dfsgdfg",
"id": "dfgdgdf",
"N402": "dgdgdg",
"N403": "234244"
},
{
"REF02": "23423342",
"REF01": "fsdfs",
"id": "sfdsfds"
}
]
}
]
}
]
}
]
}
]
The column header name corresponding to a deeper key-value pair may take a nested form, like functionalGroups[0].transactions[0].segments[0].BPR15.
I am able to do this in java using this github project (here you can find the output format I desire in the explanation) in one line:
flatJson = JSONFlattener.parseJson(new File("files/simple.json"), "UTF-8");
The output was:
date,securityQualifier,testIndicator,functionalGroups[1].functionalIdentifierCode,functionalGroups[1].date,functionalGroups[1].applicationReceiverCode, ...
140101,00,T,HP,20140101,ETIN,...
But I want to do this in python. I tried as suggested in this answer:
with open('data.json') as data_file:
data = json.load(data_file)
# NOTE(review): json_normalize flattens nested dicts only; list-valued
# fields such as functionalGroups are left as raw JSON in a single cell,
# which is the behavior complained about below.  record_prefix is also
# documented to take a string prefix, not True.
df = json_normalize(data, record_prefix=True)
with open('temp2.csv', "w", newline='\n') as csv_file:
csv_file.write(df.to_csv())
However, for column functionalGroups, it dumps json as a cell value.
I also tried as suggested in this answer:
with open('data.json') as f: # this ensures opening and closing file
a = json.loads(f.read())
# NOTE(review): DataFrame(list_of_dicts) keeps nested dicts/lists as
# single cell values; transposing does not flatten them, hence the same
# problem as above.
df = pandas.DataFrame(a)
print(df.transpose())
But this also seem to do the same:
0
acknowledgmentRequested 0
authorizationInformation
authorizationQualifier SDA
date 140101
functionalGroups [{'functionalIdentifierCode': 'ADSAD', 'applic...
interchangeControlNumber 123456789
receiverId ADAD
receiverQualifier ADSAS
securityInformation
securityQualifier ASD
senderId FADA
senderQualifier ASDAD
standardsId None
testIndicator T
time 0730
version 00501
Is it possible to do what I desire in python?

Categories