CSV to structured nested JSON using Python

I'm trying to convert a flat-structure CSV into a nested JSON structure.
I have some data like this:
State SubRegion Postcode Suburb
ACT South Canberra 2620 Oaks Estate
ACT North Canberra 2601 Acton
ACT North Canberra 2602 Ainslie
ACT Gungahlin-Hall 2914 Amaroo
I want the desired output to look like this:
[
    {
        "name": "ACT",
        "regions": [
            {
                "name": "South Canberra",
                "suburbs": [
                    {
                        "postcode": "2620",
                        "name": "Oaks Estate"
                    }
                ]
            },
            {
                "name": "North Canberra",
                "suburbs": [
                    {
                        "postcode": "2601",
                        "name": "Acton"
                    },
                    {
                        "postcode": "2602",
                        "name": "Ainslie"
                    }
                ]
            },
            {
                "name": "Gungahlin-Hall",
                "suburbs": [
                    {
                        "postcode": "2914",
                        "name": "Amaroo"
                    }
                ]
            }
        ]
    }
]
I have tried to build this structure with pandas and with a plain script, but I haven't gotten the correct structure yet.

I have solved this problem. Here is the solution:
import csv

def getindex(convertedList, value):
    # Return the index of the dict whose 'name' matches value, or -1 if not present
    for index, item in enumerate(convertedList):
        if item['name'] == value:
            return index
    return -1

with open('Regions.csv', 'r') as file:
    reader = csv.reader(file)
    mainData = []
    loopIndex = 0
    for row in reader:
        if loopIndex > 0:  # skip the header row
            # row layout: State, SubRegion, Postcode, Suburb
            index = getindex(mainData, row[0])
            if index > -1:
                subindex = getindex(mainData[index]['regions'], row[1])
                if subindex > -1:
                    suburbObj = {
                        'postcode': row[2],
                        'name': row[3]
                    }
                    mainData[index]['regions'][subindex]['suburbs'].append(suburbObj)
                else:
                    regionObj = {
                        'name': row[1],
                        'suburbs': [{
                            'name': row[3],
                            'postcode': row[2]
                        }]
                    }
                    mainData[index]['regions'].append(regionObj)
            else:
                stateObj = {
                    'name': row[0],
                    'regions': [{
                        'name': row[1],
                        'suburbs': [{
                            'name': row[3],
                            'postcode': row[2]
                        }]
                    }]
                }
                mainData.append(stateObj)
        loopIndex = loopIndex + 1
If anyone has better-optimized code, please post your solution.
Thanks!
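Since the question mentions pandas, here is a rough sketch of how the same nesting could be done with groupby. It is only a sketch, not the accepted approach above: it assumes the four-column header shown in the question and that pandas is available.

import json
import pandas as pd

# Assumes a CSV with the header shown above: State, SubRegion, Postcode, Suburb
df = pd.read_csv('Regions.csv', dtype=str)

mainData = [
    {
        'name': state,
        'regions': [
            {
                'name': region,
                'suburbs': sub[['Postcode', 'Suburb']]
                    .rename(columns={'Postcode': 'postcode', 'Suburb': 'name'})
                    .to_dict('records')
            }
            for region, sub in state_df.groupby('SubRegion', sort=False)
        ]
    }
    for state, state_df in df.groupby('State', sort=False)
]

print(json.dumps(mainData, indent=4))

The two nested groupby calls mirror the State/region/suburb hierarchy directly, so there is no manual index bookkeeping.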

I think this should work:
import csv
import json

def add_new_region(name, postcode, name2):
    # A new region starts out with a single suburb (name2 is the suburb name)
    return {"name": name,
            "suburbs": [add_suburb(postcode, name2)]}

def add_suburb(postcode, name):
    return {"postcode": postcode,
            "name": name}

datalist = []
region_dict = {}          # maps a state name to its index in datalist
region_dict_counter = 0

with open("data.csv", "r") as f:
    data = csv.reader(f)
    next(data)  # skip headers
    for row in data:
        if row[0] in region_dict:
            # State already seen: find its region, or add a new one (for/else)
            for x in datalist[region_dict[row[0]]]["regions"]:
                if x["name"] == row[1]:
                    x["suburbs"].append(add_suburb(row[2], row[3]))
                    break
            else:
                datalist[region_dict[row[0]]]["regions"].append(add_new_region(row[1], row[2], row[3]))
        else:
            d = {"name": row[0],
                 "regions": [add_new_region(row[1], row[2], row[3])]}
            datalist.append(d)
            region_dict[row[0]] = region_dict_counter
            region_dict_counter += 1

json_data = json.dumps(datalist, indent=4)
print(json_data)
with open("data.json", "w") as j:
    j.write(json_data)

Related

How can I use ijson to extract a set of corresponding data from a JSON file?

I have a JSON file like this:
{
    "CVE_data_type" : "CVE",
    "CVE_Items" : [ {
        "cve" : {
            "CVE_data_meta" : {
                "ID" : "CVE-2020-0001",
                "ASSIGNER" : "security#android.com"
            },
            ...
        },
        "configurations" : {
            "CVE_data_version" : "4.0",
            "nodes" : [ {
                "operator" : "OR",
                "children" : [ ],
                "cpe_match" : [ {
                    "vulnerable" : true,
                    "cpe23Uri" : "cpe:2.3:o:google:android:8.0:*:*:*:*:*:*:*",
                    "cpe_name" : [ ]
                }, {
                    "vulnerable" : true,
                    "cpe23Uri" : "cpe:2.3:o:google:android:8.1:*:*:*:*:*:*:*",
                    "cpe_name" : [ ]
                } ]
            } ]
        },
        ...
        "publishedDate" : "2020-01-08T19:15Z",
        "lastModifiedDate" : "2020-01-14T21:52Z"
    } ]
}
I want to extract each CVE ID and its corresponding CPEs, so that I can locate a CVE ID through a CPE. Here is my code:
import ijson
import datetime

def parse_json(filename):
    with open(filename, 'rb') as input_file:
        CVEID = ijson.items(input_file, 'CVE_Items.item.cve.CVE_data_meta.ID')
        for id in CVEID:
            print("CVE id: %s" % id)
        # for prefix, event, value in parser:
        #     print('prefix={}, event={}, value={}'.format(prefix, event, value))
    with open(filename, 'rb') as input_file:
        cpes = ijson.items(input_file, 'CVE_Items.item.configurations.nodes.item.cpe_match.item')
        for cpe in cpes:
            print("cpe: %s" % cpe['cpe23Uri'])

def main():
    parse_json("cve.json")
    end = datetime.datetime.now()

if __name__ == '__main__':
    main()
Results:
CVE id: CVE-2020-0633
CVE id: CVE-2020-0631
cpe: cpe:2.3:o:google:android:8.0:*:*:*:*:*:*:*
cpe: cpe:2.3:o:google:android:10.0:*:*:*:*:*:*:*
cpe: cpe:2.3:o:microsoft:windows_10:1607:*:*:*:*:*:*:*
cpe: cpe:2.3:o:microsoft:windows_server_2016:-:*:*:*:*:*:*:*
But this just extracts the data with no correspondence between the two.
Could anyone help? A little help would be appreciated.
I think that if you need to keep track of CVE IDs and their corresponding CPEs, you'll need to iterate over the whole CVE items and extract the bits of data you need, so you only do one pass through the file. It's not as memory-efficient as your original iteration, but if each item in CVE_Items is not too big, it's not a problem:
with open(filename, 'rb') as input_file:
    for cve in ijson.items(input_file, 'CVE_Items.item'):
        cve_id = cve['cve']['CVE_data_meta']['ID']
        cpes = [match
                for node in cve['configurations']['nodes']
                for match in node['cpe_match']]
If you know there's always a single cpe_match element in nodes, then you can replace the last list comprehension with cve['configurations']['nodes'][0]['cpe_match'].
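For completeness, here is a minimal sketch of that approach that keeps each CVE ID paired with its CPE URIs in one dict. It assumes the file layout shown in the question, and the helper name is just illustrative.

import ijson

def cve_to_cpes(filename):
    # Map each CVE ID to the list of cpe23Uri strings under its configuration nodes
    mapping = {}
    with open(filename, 'rb') as input_file:
        for cve in ijson.items(input_file, 'CVE_Items.item'):
            cve_id = cve['cve']['CVE_data_meta']['ID']
            mapping[cve_id] = [match['cpe23Uri']
                               for node in cve['configurations']['nodes']
                               for match in node['cpe_match']]
    return mapping

for cve_id, cpes in cve_to_cpes('cve.json').items():
    print(cve_id, cpes)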

KeyError encountered even though keys exist

dir = {
    "sample": [
        { "key1": "data1" },
        { "key1": "data2" },
        { "key2": "data3" },
        { "key2": "data4" }
    ]
}
with my code:
listKey1 = []
listKey2 = []
with open(dir) as json_file:
    data = json.load(json_file)
    for p in data['sample']:
        key1data = p['key1']
        print("key1: " + key1data)
        listKey1.append(key1data)

        key2data = p['key2']
        print("key2: " + key2data)
        listKey2.append(key2data)
I'm trying to store the data under the key1 and key2 keys into listKey1 and listKey2, but I am getting these errors:
KeyError: 'key1'
KeyError: 'key2'
As you can see in my file, both key1 and key2 are present.
Here is the modified code. Just check if the key exists before using it.
listKey1 = []
listKey2 = []
with open(dir) as json_file:
    data = json.load(json_file)
    for p in data['sample']:
        if "key1" in p.keys():  # check if the key exists in the current element
            key1data = p['key1']
            print("key1: " + key1data)
            listKey1.append(key1data)
        if "key2" in p.keys():  # check if the key exists in the current element
            key2data = p['key2']
            print("key2: " + key2data)
            listKey2.append(key2data)
Here is the output I get
key1: data1
key1: data2
key2: data3
key2: data4
Here is the reason why the KeyError occurs. Notice that the key differs from element to element:
index [0]: { "key1": "data1" } - the key is "key1"
index [1]: { "key1": "data2" } - the key is "key1"
index [2]: { "key2": "data3" } - the key is "key2"
index [3]: { "key2": "data4" } - the key is "key2"
The issue is that, while you are looping, p does not always have both key1 and key2; it has one or the other.
So when it finds key1 it prints the data but raises an error for key2, and when it finds key2 it prints the data but raises an error for key1.
A good option is to use the get() method: if the key is present it returns the value, otherwise it returns the default value.
Try the code below:
listKey1 = []
listKey2 = []
with open(dir) as json_file:
    data = json.load(json_file)
    for p in data['sample']:
        key1data = p.get('key1', "")
        print("key1: " + key1data)
        listKey1.append(key1data)

        key2data = p.get('key2', "")
        print("key2: " + key2data)
        listKey2.append(key2data)
You can iterate over the elements, compare by the key and store the data accordingly:
dd = {
    "sample": [
        { "key1": "data1" },
        { "key1": "data2" },
        { "key2": "data3" },
        { "key2": "data4" }
    ]
}

key1data = []
key2data = []
for elem in dd['sample']:
    for key, val in elem.items():
        if key == "key1":
            key1data.append(val)
        else:
            key2data.append(val)

print(key1data)
print(key2data)
OUTPUT:
['data1', 'data2']
['data3', 'data4']
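If you don't know the key names in advance, a more general sketch (reusing the dd dict from the snippet above) is to group every value under whatever key it appears with, using collections.defaultdict:

from collections import defaultdict

# Collect values per key, whatever the keys turn out to be
grouped = defaultdict(list)
for elem in dd['sample']:
    for key, val in elem.items():
        grouped[key].append(val)

print(dict(grouped))  # {'key1': ['data1', 'data2'], 'key2': ['data3', 'data4']}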

Python: Fetching a PostgreSQL result and loading it into JSON

I'm trying to get details about some cameras from PostgreSQL, and I need to return them all in one JSON response, but I can't figure out how it should be done, because for row in self.Data: processes one row at a time. How can I add them all to one JSON dump?
I imagine the JSON dump looking like this:
{
    "status": "ok",
    "total_cameras": 3,
    "cameras": [
        {
            "camera_id": 1,
            "camera_name": "hikvision 1",
            "camera_ip": "42.51.56.0"
        },
        {
            "camera_id": 2,
            "camera_name": "hikvision 2",
            "camera_ip": "42.51.56.5"
        },
        {
            "camera_id": 3,
            "camera_name": "hikvision 3",
            "camera_ip": "2.1.58.5"
        }
    ]
}
The code I use to get the information from PostgreSQL:
if not self.Data:
    self.RES = {'status': 'nocameras'}
    return web.Response(text=json.dumps(self.RES), status=403)
else:
    self.rows = self.cursor.rowcount
    for row in self.Data:
        if self.rows > 1:
            # Authorizing objects
            print(row)
            self.Camera_ID = row[0]
            self.Camera_Name = row[1]
            self.Camera_LAT = row[3]
            self.Camera_LOG = row[4]
            self.Camera_IP = row[2]
            self.Camera_Last_Updated = row[6]
            self.Camera_Street = row[5]
            self.Camera_API_key = row[7]
            print(self.Camera_ID, self.Camera_Name)
        else:
            self.RES = {'status': 'row_error'}
            return web.Response(text=json.dumps(self.RES), status=500)
I would first use the returned rows to build a list of dictionaries:
cameras = [
    dict(
        camera_id=c_id,
        camera_name=c_name,
        camera_ip=c_ip
    )
    # row layout per the question: id, name, ip, lat, log, street, last_updated, api_key
    for c_id, c_name, c_ip, *_ in self.Data
]
And then create the final JSON object:
json.dumps({
    'status': 'ok',
    'total_cameras': len(cameras),
    'cameras': cameras
})
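If you would rather not rely on column positions at all, one possible alternative is to build the dictionaries from the cursor's column names. This is only a sketch: it assumes a DB-API cursor such as psycopg2's, and the 'camera_id', 'camera_name' and 'camera_ip' column names are assumptions about your table.

# Column names as reported by the cursor, in the same order as the row tuples
columns = [col[0] for col in self.cursor.description]
rows = [dict(zip(columns, row)) for row in self.Data]

# Keep only the fields the response needs ('camera_id' etc. are assumed column names)
cameras = [{'camera_id': r['camera_id'],
            'camera_name': r['camera_name'],
            'camera_ip': r['camera_ip']} for r in rows]

self.RES = {'status': 'ok', 'total_cameras': len(cameras), 'cameras': cameras}
return web.Response(text=json.dumps(self.RES), status=200)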

How can I use Python to generate nested JSON data from my CSV file

I have tried the online Jsonify It tool, which can create nested JSON data from my data, but I can't seem to get it to work. I have also tried Python code from other posts here, but it does not seem to work either. If you know an easier method than using Python, that would also be good.
Here is my .CSV data:
ID,Name,Date,Subject,Start,Finish
0,Ladybridge High School,01/11/2019,Maths,05:28,0
0,Ladybridge High School,02/11/2019,Maths,05:30,06:45
0,Ladybridge High School,01/11/2019,Economics,11:58,12:40
0,Ladybridge High School,02/11/2019,Economics,11:58,12:40
1,Loreto Sixth Form,01/11/2019,Maths,05:28,06:45
1,Loreto Sixth Form,02/11/2019,Maths,05:30,06:45
1,Loreto Sixth Form,01/11/2019,Economics,11:58,12:40
1,Loreto Sixth Form,02/11/2019,Economics,11:58,12:40
This is the nested JSON structure I would like:
{
    "Timetable": [
        {
            "Date": {
                "01-11-2019": {
                    "Maths": {
                        "Start": "05:28",
                        "Finish": "06:45"
                    },
                    "Economics": {
                        "Start": "11:58",
                        "Finish": "12:40"
                    }
                },
                "02-11-2019": {
                    "Maths": {
                        "Start": "05:30",
                        "Finish": "06:45"
                    },
                    "Economics": {
                        "Start": "11:58",
                        "Finish": "12:40"
                    }
                }
            },
            "Name": "Ladybridge High School"
        },
        {
            "Date": {
                "01-11-2019": {
                    "Maths": {
                        "Start": "05:28",
                        "Finish": "06:45"
                    },
                    "Economics": {
                        "Start": "11:58",
                        "Finish": "12:40"
                    }
                },
                "02-11-2019": {
                    "Maths": {
                        "Start": "05:30",
                        "Finish": "06:45"
                    },
                    "Economics": {
                        "Start": "11:58",
                        "Finish": "12:40"
                    }
                }
            },
            "Name": "Loreto Sixth Form"
        }
    ]
}
Something like this?
[EDIT]
I refactored it to handle arbitrary top-level keys for each entry in the timetable. I also made it first create a dict and then convert the dict to a list so that it can run in O(N) time, in case the input is very large.
import csv

timetable = {}
with open('data.csv') as f:
    csv_data = [{k: v for k, v in row.items()} for row in csv.DictReader(f, skipinitialspace=True)]

for row in csv_data:
    if not timetable.get(row["ID"]):
        timetable[row["ID"]] = {"ID": row["ID"], "Date": {}}
    for k in row.keys():
        # Date has to be handled as a special case
        if k == "Date":
            # setdefault so a date that already has subjects is not overwritten
            timetable[row["ID"]]["Date"].setdefault(row["Date"], {})
            timetable[row["ID"]]["Date"][row["Date"]][row["Subject"]] = {
                "Start": row["Start"],
                "Finish": row["Finish"]
            }
        # Ignore these keys because they only belong under 'Date'
        elif k == "Start" or k == "Finish" or k == "Subject":
            continue
        # Use everything else as-is
        else:
            timetable[row["ID"]][k] = row[k]

timetable = {"Timetable": [v for k, v in timetable.items()]}
An improvement to the above answer to nest the ID before the name and date:
import csv

timetable = {"Timetable": []}
print(timetable)

with open("C:/Users/kspv914/Downloads/data.csv") as f:
    csv_data = [{k: v for k, v in row.items()} for row in csv.DictReader(f, skipinitialspace=True)]

# Collect the unique school names and create one entry per name
name_array = []
for name in [row["Name"] for row in csv_data]:
    name_array.append(name)
name_set = set(name_array)
for name in name_set:
    timetable["Timetable"].append({"Name": name, "Date": {}})

for row in csv_data:
    for entry in timetable["Timetable"]:
        if entry["Name"] == row["Name"]:
            # setdefault so a date that already has subjects is not overwritten
            entry["Date"].setdefault(row["Date"], {})
            entry["Date"][row["Date"]][row["Subject"]] = {
                "Start": row["Start"],
                "Finish": row["Finish"]
            }

print(timetable)
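Either way, to get the JSON text the question asks for, you can dump the final timetable dict at the end. A small usage example; the output file name here is just illustrative.

import json

print(json.dumps(timetable, indent=4))
with open("timetable.json", "w") as out:
    json.dump(timetable, out, indent=4)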

Scraping a different style of JSON

I am familiar with scraping data in this format:
{"data":[{"assists":0,"assistsPerGame":0.0000,"evAssists":0,"evPoints":0,"gamesPlayed":1,"goals":0,"penaltyMinutes":0,"playerBirthCity":"Windsor","playerBirthCountry":"CAN","playerBirthDate":"1996-02-07",
import csv
import requests

outfile = open("NHL_Recent.csv", "a", newline='')
writer = csv.writer(outfile)
writer.writerow(["Player","Pos","GP","G","A","P","+/-","PIM","PPG","PPP","SHG","SHP","GWG","OTG","S","S%","TOI","Shifts/PG","FOW%"])

req = requests.get('http://www.nhl.com/stats/rest/skaters?isAggregate=true&reportType=basic&isGame=true&reportName=skatersummary&sort=[{%22property%22:%22shots%22,%22direction%22:%22DESC%22}]&cayenneExp=gameDate%3E=%222017-11-4%22%20and%20gameDate%3C=%222017-11-10%22%20and%20gameTypeId=2')
data = req.json()['data']

for item in data:
    Player = item['playerName']
    Pos = item['playerPositionCode']
    GP = item['gamesPlayed']
But not in this manner.
"totalItems" : 600,
"totalEvents" : 0,
"totalGames" : 600,
"totalMatches" : 0,
"wait" : 10,
"dates" : [ {
"date" : "2017-10-04",
"totalItems" : 4,
"totalEvents" : 0,
"totalGames" : 4,
"totalMatches" : 0,
"games" : [ {
"gamePk" : 2017020001,
"link" : "/api/v1/game/2017020001/feed/live",
"gameType" : "R",
"season" : "20172018",
"gameDate" : "2017-10-04T23:00:00Z",
"status" : {
"abstractGameState" : "Final",
"codedGameState" : "7",
"detailedState" : "Final",
"statusCode" : "7",
"startTimeTBD" : false
},
"teams" : {
"away" : {
"leagueRecord" : {
"wins" : 1,
"losses" : 0,
"ot" : 0,
"type" : "league"
},
"score" : 7,
"team" : {
"id" : 10,
"name" : "Toronto Maple Leafs",
"link" : "/api/v1/teams/10",
"venue" : {
"name" : "Air Canada Centre",
"link" : "/api/v1/venues/null",
"city" : "Toronto",
"timeZone" : {
"id" : "America/Toronto",
"offset" : -5,
"tz" : "EST"
}
},
"abbreviation" : "TOR",
"teamName" : "Maple Leafs",
"locationName" : "Toronto",
"firstYearOfPlay" : "1926",
"division" : {
"id" : 17,
"name" : "Atlantic",
"link" : "/api/v1/divisions/17"
},
"conference" : {
"id" : 6,
"name" : "Eastern",
"link" : "/api/v1/conferences/6"
},
"franchise" : {
"franchiseId" : 5,
"teamName" : "Maple Leafs",
"link" : "/api/v1/franchises/5
This is what I have so far with no success.
import csv
import requests
import os

outfile = open("NHL DIF JSON.csv", "a", newline='')
writer = csv.writer(outfile)
writer.writerow(["Date","Game","gamep"])

req = requests.get('https://statsapi.web.nhl.com/api/v1/schedule?startDate=2017-10-04&endDate=2018-04-30&expand=schedule.teams,schedule.linescore,schedule.broadcasts.all,schedule.ticket,schedule.game.content.media.epg,schedule.radioBroadcasts,schedule.metadata,schedule.game.seriesSummary,seriesSummary.series&leaderCategories=&leaderGameTypes=R&site=en_nhl&teamId=&gameType=&timecode=')
data = req.json()['dates']

for item in data:
    Date = item['date']
    ##for item in games:
    Game = item['0']
    gamep = item['gamePk']
    print(Date, Game)
    writer.writerow([Date, Game, gamep])

outfile.close()
os.system("taskkill /f /im pythonw.exe")
I would like to pull the "gamePk" and "gameDate" from each game, along with the teamName values within "teams" and other categories. I eventually would like to put that into a CSV with the gamePk, gameDate, teams, score, etc. I'm just not sure how to get through the individual categories; any help would be greatly appreciated. Thanks!
It's normal JSON data, just a bit more complicated. You can get the date from data['dates'][i]['date']. For the teams, scores, etc., you have to iterate over data['dates'][i]['games'].
import csv
import requests

req = requests.get('https://statsapi.web.nhl.com/api/v1/schedule?startDate=2017-10-04&endDate=2018-04-30&expand=schedule.teams,schedule.linescore,schedule.broadcasts.all,schedule.ticket,schedule.game.content.media.epg,schedule.radioBroadcasts,schedule.metadata,schedule.game.seriesSummary,seriesSummary.series&leaderCategories=&leaderGameTypes=R&site=en_nhl&teamId=&gameType=&timecode=')
data = req.json()

my_data = []
for item in data['dates']:
    date = item['date']
    games = item['games']
    for game in games:
        gamePk = game['gamePk']
        gameDate = game['gameDate']
        team_away, team_home = game['teams']['away'], game['teams']['home']
        team_away_score = team_away['score']
        team_home_score = team_home['score']
        team_away_name = team_away['team']['name']
        team_home_name = team_home['team']['name']
        my_data.append([date, gamePk, gameDate, team_away_name, team_home_name, team_away_score, team_home_score])

# One header per column in my_data
headers = ["date", "gamePk", "gameDate", "team_away_name", "team_home_name", "team_away_score", "team_home_score"]
with open("my_file.csv", "a", newline='') as f:
    writer = csv.writer(f)
    writer.writerow(headers)
    writer.writerows(my_data)
As for your last question, you can get the 'pk' from data['gameData']['game']['pk']. The player, event, triCode and coordinates values are a little harder to get because some items don't have 'players' and 'team' keys, or the 'coordinates' dict is empty.
In this case the dict.get method can be helpful because it will return None (or you can set a default value) if you try to access a non-existent key.
Still you have to design your code according to the structure of the json data, example:
import requests

req = requests.get('https://statsapi.web.nhl.com/api/v1/game/2017020001/feed/live?site=en_nhl')
data = req.json()

my_data = []
pk = data['gameData']['game']['pk']
for item in data['liveData']['plays']['allPlays']:
    players = item.get('players')
    if players:
        player_a = players[0]['player']['fullName'] if len(players) > 0 else None
        player_b = players[1]['player']['fullName'] if len(players) > 1 else None
    else:
        player_a, player_b = None, None
    event = item['result']['event']
    triCode = item.get('team', {}).get('triCode')
    coordinates_x, coordinates_y = item['coordinates'].get('x'), item['coordinates'].get('y')
    my_data.append([pk, player_a, player_b, event, triCode, coordinates_x, coordinates_y])

for row in my_data:
    print(row)
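If you also want that play-by-play data in a CSV, the same csv.writer pattern from the schedule snippet should work. The file name and header labels here are just illustrative.

import csv

headers = ["pk", "player_a", "player_b", "event", "triCode", "coordinates_x", "coordinates_y"]
with open("plays.csv", "w", newline='') as f:
    writer = csv.writer(f)
    writer.writerow(headers)
    writer.writerows(my_data)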
