Ignore specific JSON keys when extracting data in Python - python

I'm extracting certain keys from several JSON files and then converting them to CSV in Python. I'm able to define a key list when I run my code and get the information I need.
However, there are certain sub-keys that I want to ignore from the JSON file. For example, if we look at the following snippet:
JSON Sample
[
{
"callId": "abc123",
"errorCode": 0,
"apiVersion": 2,
"statusCode": 200,
"statusReason": "OK",
"time": "2020-12-14T12:00:32.744Z",
"registeredTimestamp": 1417731582000,
"UID": "_guid_abc123==",
"created": "2014-12-04T22:19:42.894Z",
"createdTimestamp": 1417731582000,
"data": {},
"preferences": {},
"emails": {
"verified": [],
"unverified": []
},
"identities": [
{
"provider": "facebook",
"providerUID": "123",
"allowsLogin": true,
"isLoginIdentity": true,
"isExpiredSession": true,
"lastUpdated": "2014-12-04T22:26:37.002Z",
"lastUpdatedTimestamp": 1417731997002,
"oldestDataUpdated": "2014-12-04T22:26:37.002Z",
"oldestDataUpdatedTimestamp": 1417731997002,
"firstName": "John",
"lastName": "Doe",
"nickname": "John Doe",
"profileURL": "https://www.facebook.com/John.Doe",
"age": 50,
"birthDay": 31,
"birthMonth": 12,
"birthYear": 1969,
"city": "City, State",
"education": [
{
"school": "High School Name",
"schoolType": "High School",
"degree": null,
"startYear": 0,
"fieldOfStudy": null,
"endYear": 0
}
],
"educationLevel": "High School",
"favorites": {
"music": [
{
"name": "Music 1",
"id": "123",
"category": "Musician/band"
},
{
"name": "Music 2",
"id": "123",
"category": "Musician/band"
}
],
"movies": [
{
"name": "Movie 1",
"id": "123",
"category": "Movie"
},
{
"name": "Movie 2",
"id": "123",
"category": "Movie"
}
],
"television": [
{
"name": "TV 1",
"id": "123",
"category": "Tv show"
}
]
},
"followersCount": 0,
"gender": "m",
"hometown": "City, State",
"languages": "English",
"likes": [
{
"name": "Like 1",
"id": "123",
"time": "2014-10-31T23:52:53.0000000Z",
"category": "TV",
"timestamp": "1414799573"
},
{
"name": "Like 2",
"id": "123",
"time": "2014-09-16T08:11:35.0000000Z",
"category": "Music",
"timestamp": "1410855095"
}
],
"locale": "en_US",
"name": "John Doe",
"photoURL": "https://graph.facebook.com/123/picture?type=large",
"timezone": "-8",
"thumbnailURL": "https://graph.facebook.com/123/picture?type=square",
"username": "john.doe",
"verified": "true",
"work": [
{
"companyID": null,
"isCurrent": null,
"endDate": null,
"company": "Company Name",
"industry": null,
"title": "Company Title",
"companySize": null,
"startDate": "2010-12-31T00:00:00"
}
]
}
],
"isActive": true,
"isLockedOut": false,
"isRegistered": true,
"isVerified": false,
"lastLogin": "2014-12-04T22:26:33.002Z",
"lastLoginTimestamp": 1417731993000,
"lastUpdated": "2014-12-04T22:19:42.769Z",
"lastUpdatedTimestamp": 1417731582769,
"loginProvider": "facebook",
"loginIDs": {
"emails": [],
"unverifiedEmails": []
},
"rbaPolicy": {
"riskPolicyLocked": false
},
"oldestDataUpdated": "2014-12-04T22:19:42.894Z",
"oldestDataUpdatedTimestamp": 1417731582894,
"registered": "2014-12-04T22:19:42.956Z",
"regSource": "",
"socialProviders": "facebook"
}
]
I want to extract data from created and identities but ignore identities.favorites and identities.likes, as well as the data underneath them.
This is what I have so far, below. I defined the JSON keys that I want to extract in the key_list variable:
Current Code
import json, pandas
from flatten_json import flatten
# Enter the path to the JSON and the filename without appending '.json'
file_path = r'C:\Path\To\file_name'
# Open and load the JSON file
json_list = json.load(open(file_path + '.json', 'r', encoding='utf-8', errors='ignore'))
# Extract data from the defined key names
key_list = ['created', 'identities']
json_list = [{k:d[k] for k in key_list} for d in json_list]
# Flatten and convert to a data frame
json_list_flattened = (flatten(d, '.') for d in json_list)
df = pandas.DataFrame(json_list_flattened)
# Export to CSV in the same directory with the original file name
export_csv = df.to_csv(file_path + r'.csv', sep=',', encoding='utf-8', index=None, header=True)
Similar to key_list, I suspect that I would make an ignore list and factor that into the json_list comprehension that I have? Something like:
key_ignore = ['identities.favorites', 'identities.likes']
Then use dict.pop(), which looks like it will remove the unwanted sub-keys if they match? I'm just not sure how to implement that correctly.
Expected Output
As a result, the code should extract data from the keys defined in key_list and ignore the sub-keys defined in key_ignore (identities.favorites and identities.likes). The rest of the code will then continue to convert it into a CSV:
created                    identities.0.provider   identities.0.providerUID           identities...
2014-12-04T19:23:05.191Z   site                    cb8168b0cf734b70ad541f0132763761   ...

If the keys are always there, you can use
del json_list[0]['identities'][0]['likes']
del json_list[0]['identities'][0]['favorites']
or, if you want to remove the columns from the DataFrame after reading in all the JSON data, you can use
df.drop(df.filter(regex='identities.0.favorites|identities.0.likes').columns, axis=1, inplace=True)
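If there can be several records, or several identities per record, a more general variant along the lines the question suggests is to pop the unwanted sub-keys from every identity before flattening. This is only a sketch: it assumes json_list is the list built by the code above, and it uses a plain list of sub-key names (called identity_ignore here for illustration) instead of the dotted key_ignore paths:
# Sub-keys to drop from each entry under 'identities'
identity_ignore = ['favorites', 'likes']

# Keep only the wanted top-level keys, as in the question
json_list = [{k: d[k] for k in key_list} for d in json_list]

# Remove the ignored sub-keys from every identity in every record
for record in json_list:
    for identity in record.get('identities', []):
        for sub_key in identity_ignore:
            identity.pop(sub_key, None)  # no error if the key is absent
The flattening and CSV export steps then stay exactly as they are.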

Related

Python: Convert json with extra data error into CSV

I have JSON in the format below, which I receive from a different team and am not allowed to make any changes to:
{
"content": [
{
"id": "5603bbaae412390b73f0c7f",
"name": "ABC",
"description": "Test",
"rsid": "pwcs",
"type": "project",
"owner": {
"id": 529932
},
"created": "2015-09-24T09:00:26Z"
},
{
"id": "56094673e4b0a7e17e310b83",
"name": "secores",
"description": "Panel",
"rsid": "pwce",
"type": "project",
"owner": {
"id": 520902
},
"created": "2015-09-28T13:53:55Z"
}
],
"totalPages": 9,
"totalElements": 8592,
"number": 0,
"numberOfElements": 1000,
"firstPage": true,
"lastPage": false,
"sort": null,
"size": 1000
}
{
"content": [
{
"id": "5bf2cc64d977553780706050",
"name": "Services Report",
"description": "",
"rsid": "pcie",
"type": "project",
"owner": {
"id": 518013
},
"created": "2018-11-19T14:44:52Z"
},
{
"id": "5bf2d56e40b39312e3e167d0",
"name": "Standard form",
"description": "",
"rsid": "wcu",
"type": "project",
"owner": {
"id": 521114
},
"created": "2018-11-19T15:23:26Z"
}
],
"totalPages": 9,
"totalElements": 8592,
"number": 1,
"numberOfElements": 1000,
"firstPage": false,
"lastPage": false,
"sort": null,
"size": 1000
}
{
"content": [
{
"id": "5d95e7d6187c6d6376fd1bad",
"name": "New Project",
"description": "",
"rsid": "pcinforrod",
"type": "project",
"owner": {
"id": 200904228
},
"created": "2019-10-03T12:21:42Z"
},
{
"id": "5d95fc6e56d2e82519629b96",
"name": "Demo - 10/03",
"description": "",
"rsid": "sitedev",
"type": "project",
"owner": {
"id": 20001494
},
"created": "2019-10-03T13:49:34Z"
}
],
"totalPages": 9,
"totalElements": 8592,
"number": 2,
"numberOfElements": 1000,
"firstPage": false,
"lastPage": false,
"sort": null,
"size": 1000
}
I am trying to convert it into CSV using the code below:
import csv
import json

with open("C:\python\SampleJSON.json", 'rb') as file:
    data = json.load(file)

fname = "workspaceExcelDemo.csv"
with open(fname, "w", encoding="utf-8", newline='') as file:
    csv_file = csv.writer(file)
    csv_file.writerow(["id", "name", "rsid"])
    for item in data["content"]:
        csv_file.writerow([item['id'], item['name'], item['rsid']])
However, I am getting the error message below when executing this code:
raise JSONDecodeError("Extra data", s, end)
json.decoder.JSONDecodeError: Extra data: line 35 column 1 (char 937)
How do I convert the above JSON into CSV without making any changes to the JSON file?
If I understand your question and the comments correctly, the file contains several JSON documents, so you can parse each one with json.loads (and use json.dumps to clean up the data afterwards):
import csv
import json

with open("C:\python\SampleJSON.json", 'rb') as file:
    # One json.loads call per line; json.load on the whole file raises "Extra data"
    data = [json.loads(line) for line in file]

"""
The json.dumps method converts a Python object to a JSON formatted string.
The json.loads method parses a JSON string into a native Python object.
Replacing the "=" character with an empty string.
"""
data = json.loads(json.dumps(data).replace("=", ""))

fname = "workspaceExcelDemo.csv"
with open(fname, "w", encoding="utf-8", newline='') as file:
    csv_file = csv.writer(file)
    csv_file.writerow(["id", "name", "rsid"])
    for item in data[0]["content"]:
        csv_file.writerow([item['id'], item['name'], item['rsid']])
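Reading line by line only works when each JSON document sits on a single line. If the documents are pretty-printed across several lines, as in the sample above, one option is json.JSONDecoder.raw_decode, which parses one document at a time and reports where it ended. This is a sketch; the load_concatenated_json helper is made up for illustration:
import json

def load_concatenated_json(path):
    """Parse a file that contains several JSON documents back to back."""
    decoder = json.JSONDecoder()
    with open(path, encoding="utf-8") as f:
        text = f.read()
    objects = []
    idx = 0
    while idx < len(text):
        # Skip whitespace between (and before) documents
        while idx < len(text) and text[idx].isspace():
            idx += 1
        if idx >= len(text):
            break
        obj, idx = decoder.raw_decode(text, idx)  # returns (object, end position)
        objects.append(obj)
    return objects

blocks = load_concatenated_json(r"C:\python\SampleJSON.json")
rows = [(item["id"], item["name"], item["rsid"])
        for block in blocks
        for item in block["content"]]
The rows list can then be written out with csv.writer exactly as in the code above.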

navigating json table in python

I am trying to access the team name key value and the american key value:
print(bv_json['outcomes'][0]['description'])
The parts of the JSON that I need are marked with ########### near the end of the data posted below. I get an error about needing an integer rather than a string as the list index, and I am also struggling with navigating through the keys.
Thanks
[
{
"path": [
{
"id": "2958468",
"link": "/basketball/nba",
"description": "NBA",
"type": "LEAGUE",
"sportCode": "BASK",
"order": 1,
"leaf": true,
"current": true
},
{
"id": "227",
"link": "/basketball",
"description": "Basketball",
"type": "SPORT",
"sportCode": "BASK",
"order": 1,
"leaf": false,
"current": false
}
],
"events": [
{
"id": "8801181",
"description": "L.A. Clippers # Utah Jazz",
"type": "GAMEEVENT",
"link": "/basketball/nba/l-a-clippers-utah-jazz-202106082215",
"status": "O",
"sport": "BASK",
"startTime": 1623204900000,
"live": true,
"awayTeamFirst": true,
"denySameGame": "NO",
"teaserAllowed": true,
"competitionId": "2958468",
"notes": "Best of 7 - Game 1",
"numMarkets": 34,
"lastModified": 1623212024024,
"competitors": [
{
"id": "8801181-285",
"name": "Utah Jazz",
"home": true
},
{
"id": "8801181-310",
"name": "L.A. Clippers",
"home": false
}
],
"displayGroups": [
{
"id": "100-97",
"description": "Game Lines",
"defaultType": true,
"alternateType": false,
"markets": [
{
"id": "157658380",
"descriptionKey": "Head To Head",
"description": "Moneyline",
"key": "2W-12",
"marketTypeId": "3059",
"status": "O",
"singleOnly": false,
"notes": "",
"period": {
"id": "341",
"description": "Live Game",
"abbreviation": "G",
"live": true,
"main": true
},
"outcomes": [
{
"id": "849253180",
"description": "L.A. Clippers",##############
"status": "O",
"type": "A",
"competitorId": "8801181-310",
"price": {
"id": "7927852247",
"american": "+125",#########################
"decimal": "2.250",
"fractional": "5/4",
"malay": "-0.80",
"indonesian": "1.25",
"hongkong": "1.25"
It looks like your data structure is
[{[{[{[{{}}]}]}]}]
which is a list containing a dictionary of a list of dictionaries of lists of dictionaries; in other words, it's nested and confusing.
To make it easy on yourself, I think defining some variables will help.
Let's access the first-level list item, the dictionary that contains 'path': this dict contains all the other lists of dictionaries.
full_dict = bvjson[0] # step into a list
Looking at the data, we know that outcomes is inside the 'events' list of dicts, so let's define that variable to make it easier to step into when we get to our ultimate answer.
events = full_dict['events'] # access dictionary value by key
Now we have access to events, which is a list of dictionaries of lists of dictionaries.
In events, we see that 'outcomes' actually lives two steps inside the 'displayGroups' value, so let's get 'displayGroups' into something usable.
display = events[0]['displayGroups'][0]
# events is a list of dicts, so the first [0] steps into its first dictionary;
# 'displayGroups' holds a list of dictionaries, so the second [0] steps
# into that list to access the dict.
# Note - if there are multiple entries this will only access the first one.
Stepping in further:
markets = display['markets'][0]
outcomes = markets['outcomes'][0]
You finally have easy access to the first dict in the outcomes list!
description = outcomes['description']
price = outcomes['price']['american']
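Putting those steps together, the two marked values can be reached in one chain; this sketch uses the bvjson variable from above and only the first entry at each level:
outcome = bvjson[0]['events'][0]['displayGroups'][0]['markets'][0]['outcomes'][0]

team_name = outcome['description']       # "L.A. Clippers"
american = outcome['price']['american']  # "+125"
print(team_name, american)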
So remember, anytime you get a confusing nested json like this, stepping in to each value can help you figure out how to get what you want and if you need to access via index (if it's a list) or via key (if it's a dictionary).
Think of all of this as just a way to diagnose and figure out why you aren't getting the values you are requesting - it will be different for each case, and different logic will be required for handling getting multiple values out of each list or dict - but this is a good start and way to get your mind around it.
Here is your data properly enclosed:
bvjson =
[
{
"path": [
{
"id": "2958468",
"link": "/basketball/nba",
"description": "NBA",
"type": "LEAGUE",
"sportCode": "BASK",
"order": 1,
"leaf": True,
"current": True
},
{
"id": "227",
"link": "/basketball",
"description": "Basketball",
"type": "SPORT",
"sportCode": "BASK",
"order": 1,
"leaf": False,
"current": False
}
],
"events": [
{
"id": "8801181",
"description": "L.A. Clippers # Utah Jazz",
"type": "GAMEEVENT",
"link": "/basketball/nba/l-a-clippers-utah-jazz-202106082215",
"status": "O",
"sport": "BASK",
"startTime": 1623204900000,
"live": True,
"awayTeamFirst": True,
"denySameGame": "NO",
"teaserAllowed": True,
"competitionId": "2958468",
"notes": "Best of 7 - Game 1",
"numMarkets": 34,
"lastModified": 1623212024024,
"competitors": [
{
"id": "8801181-285",
"name": "Utah Jazz",
"home": True
},
{
"id": "8801181-310",
"name": "L.A. Clippers",
"home": False
}
],
"displayGroups": [
{
"id": "100-97",
"description": "Game Lines",
"defaultType": True,
"alternateType": False,
"markets": [
{
"id": "157658380",
"descriptionKey": "Head To Head",
"description": "Moneyline",
"key": "2W-12",
"marketTypeId": "3059",
"status": "O",
"singleOnly": False,
"notes": "",
"period": {
"id": "341",
"description": "Live Game",
"abbreviation": "G",
"live": True,
"main": True
},
"outcomes": [
{
"id": "849253180",
"description": "L.A. Clippers",##############
"status": "O",
"type": "A",
"competitorId": "8801181-310",
"price": {
"id": "7927852247",
"american": "+125",#########################
"decimal": "2.250",
"fractional": "5/4",
"malay": "-0.80",
"indonesian": "1.25",
"hongkong": "1.25"}
}
]
}
]
}
]
}
]
}
]

How to read fields without numeric index in JSON

I have a JSON file that I need to read in a structured way so I can insert each value into its respective database column. The problem is that inside the "customFields" tag the fields change index: for example, "Tribe / Customer" can be at index 0 (row['customFields'][0]) in one JSON block and at index 3 (row['customFields'][3]) in another. So I tried to read the data by field name with row['customFields']['Tribe / Customer'], but I got the error below:
TypeError: list indices must be integers or slices, not str
Script:
import json

def getCustomField(ModelData):
    for row in ModelData["data"]["squads"][0]["cards"]:
        print(row['identifier'],
              row['customFields']['Tribe / Customer'],
              row['customFields']['Stopped with'],
              row['customFields']['Sub-Activity'],
              row['customFields']['Activity'],
              row['customFields']['Complexity'],
              row['customFields']['Effort'])

if __name__ == "__main__":
    f = open('test.json')
    json_file = json.load(f)
    getCustomField(json_file)
JSON:
{
"data": {
"squads": [
{
"name": "TESTE",
"cards": [
{
"identifier": "0102",
"title": "TESTE",
"description": " TESTE ",
"status": "on_track",
"priority": null,
"assignees": [
{
"fullname": "TESTE",
"email": "TESTE"
}
],
"createdAt": "2020-04-16T15:00:31-03:00",
"secondaryLabel": null,
"primaryLabels": [
"TESTE",
"TESTE"
],
"swimlane": "TESTE",
"workstate": "Active",
"customFields": [
{
"name": "Tribe / Customer",
"value": "TESTE 1"
},
{
"name": "Checkpoint",
"value": "GNN"
},
{
"name": "Stopped with",
"value": null
},
{
"name": "Sub-Activity",
"value": "DEPLOY"
},
{
"name": "Activity",
"value": "TOOL"
},
{
"name": "Complexity",
"value": "HIGH"
},
{
"name": "Effort",
"value": "20"
}
]
},
{
"identifier": "0103",
"title": "TESTE",
"description": " TESTE ",
"status": "on_track",
"priority": null,
"assignees": [
{
"fullname": "TESTE",
"email": "TESTE"
}
],
"createdAt": "2020-04-16T15:00:31-03:00",
"secondaryLabel": null,
"primaryLabels": [
"TESTE",
"TESTE"
],
"swimlane": "TESTE",
"workstate": "Active",
"customFields": [
{
"name": "Tribe / Customer",
"value": "TESTE 1"
},
{
"name": "Stopped with",
"value": null
},
{
"name": "Checkpoint",
"value": "GNN"
},
{
"name": "Sub-Activity",
"value": "DEPLOY"
},
{
"name": "Activity",
"value": "TOOL"
},
{
"name": "Complexity",
"value": "HIGH"
},
{
"name": "Effort",
"value": "20"
}
]
}
]
}
]
}
}
You'll have to parse the list of custom fields into something you can access by name. Since you're accessing multiple entries from the same list, a dictionary is the most appropriate choice.
for row in ModelData["data"]["squads"][0]["cards"]:
    custom_fields_dict = {field['name']: field['value'] for field in row['customFields']}
    print(row['identifier'],
          custom_fields_dict['Tribe / Customer'],
          ...
          )
If you only wanted a single field you could traverse the list looking for a match, but it would be less efficient to do that repeatedly.
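A one-off lookup of that kind could look like the following sketch (the None default is an assumption, not part of the original code):
# Single lookup without building a dict; repeating this per field is less efficient
tribe = next((f['value'] for f in row['customFields'] if f['name'] == 'Tribe / Customer'), None)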
I'm skipping over dealing with missing fields - you'd probably want to use get('Tribe / Customer', some_reasonable_default) if there's any possibility of the field not being present in the json list.
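For example, a minimal sketch of that, with a made-up 'N/A' default:
custom_fields_dict = {field['name']: field['value'] for field in row['customFields']}
tribe = custom_fields_dict.get('Tribe / Customer', 'N/A')  # 'N/A' when the field is missing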

check if json element or object exists or not and proceed

Hi, I am trying to parse JSON data and I get this error every time on this element:
if ['fields']['assignee'] in each:
TypeError: list indices must be integers or slices, not str
My json is this
{
"expand": "schema,names",
"startAt": 1,
"maxResults": 50,
"total": 7363,
"issues": [
{
"expand": "operations,versionedRepresentations,editmeta,changelog,renderedFields",
"id": "591838",
"self": "https://jira.mynet.com/rest/api/2/issue/591838",
"key": "TEST-8564",
"fields": {
"summary": "delete tables 31-03-2020 ",
"customfield_10006": 2.0,
"created": "2020-02-27T10:29:12.000+0100",
"description": "A LOT OF TEXT",
"assignee": null,
"labels": [
"DATA",
"Refined"
],
"status": {
"self": "https://jira.mynet.com/rest/api/2/status/10000",
"description": "",
"iconUrl": "https://jira.mynet.com/",
"name": "To Do",
"id": "10000",
"statusCategory": {
"self": "https://jira.mynet.com/rest/api/2/statuscategory/2",
"id": 2,
"key": "new",
"colorName": "blue-gray",
"name": "To Do"
}
}
}
}
]
}
The ['fields']['assignee'] element is null in this example; sometimes it looks like this:
"assignee": : {
"self": "https://mynet.com/rest/api/2/user?username=xxxxxx",
"name": "sij",
"key": "x",
"emailAddress": xx#mynet.com",
"avatarUrls": {
"48x48": "https://mynet.com/secure/useravatar?ownerId=bdysdh&avatarId=16743",
"24x24": "https://mynet.com/secure/useravatar?size=small&ownerId=bdysdh&avatarId=16743",
"16x16": "https://mynet.com/secure/useravatar?size=xsmall&ownerId=bdysdh&avatarId=16743",
"32x32": "https://mynet.com/secure/useravatar?size=medium&ownerId=bdysdh&avatarId=16743"
},
"displayName": "Bruce Springsteen",
"active": true,
"timeZone": "Arctic/Longyearbyen"
},
I am trying to check if assignee is null and, if so, print null.
My code looks like this:
with open('C:\\TEMP\\testdata.json') as json_file:
    data = json.load(json_file)
    for each in data['issues']:
        if ['fields']['assignee'] in each:
            print(['fields']['assignee']['name'])
        else:
            print('null')
I have tried to put in [0] between ['fields']['assignee']['name'] but nothing seems to help.
Try with
if 'fields' in each and 'assignee' in each['fields']:
Note that you need to check the key names against the dictionaries; ['fields'] on its own is just a list literal, which is what causes the TypeError.
Perhaps better:
for each in data['issues']:
    print((each.get('fields', {}).get('assignee') or {}).get('name', 'null'))
and if you can't guarantee that 'issues' exists in data either:
for each in data.get('issues', []):
    <as before>
data.get('issues', []) returns an empty list if data['issues'] doesn't exist.
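Putting it together, here is a sketch that also makes the null check explicit, since "assignee" can be present but null:
import json

with open('C:\\TEMP\\testdata.json') as json_file:
    data = json.load(json_file)

for each in data.get('issues', []):
    assignee = each.get('fields', {}).get('assignee')  # None when "assignee": null
    print(assignee['name'] if assignee else 'null')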

Parse specific data from JSON

I have a JSON file with lots of data, and I want to keep only specific data.
I thought of reading the file, getting all the data I want, and saving it as a new JSON.
The JSON is like this:
{
"event": [
{
"date": "2019-01-01",
"location": "world",
"url": "www.com",
"comments": "null",
"country": "china",
"genre": "blues"
},
{
"date": "2000-01-01",
"location": "street x",
"url": "www.cn",
"comments": "null",
"country":"turkey",
"genre": "reds"
},
{...
and I want it to be like this (with just date and url from each event):
{
"event": [
{
"date": "2019-01-01",
"url": "www.com"
},
{
"date": "2000-01-01",
"url": "www.cn"
},
{...
I can open the JSON and read from it using
with open('xx.json') as f:
    data = json.load(f)
    data2 = data["events"]["date"]
But I still need to understand how to save the data I want in a new JSON, keeping its structure.
You can use a list comprehension to loop over the events and return a dictionary containing only the keys that you want.
data = { "event": [
{
"date": "2019-01-01",
"location": "world",
"url": "www.com",
"comments": None,
"country": "china",
"genre": "blues",
},
{
"date": "2000-01-01",
"location": "street x",
"url": "www.cn",
"comments": None,
"country" :"turkey",
"genre":"reds",
}
]}
# List comprehension
data["event"] = [{"date": x["date"], "url": x["url"]} for x in data["event"]]
Alternatively, you can map a function over the events list:
keys_to_keep = ["date", "url"]

def subset_dict(d):
    return {x: d[x] for x in keys_to_keep}

data["event"] = list(map(subset_dict, data["event"]))
