I am familiar with scraping data in this format.
{"data":[{"assists":0,"assistsPerGame":0.0000,"evAssists":0,"evPoints":0,"gamesPlayed":1,"goals":0,"penaltyMinutes":0,"playerBirthCity":"Windsor","playerBirthCountry":"CAN","playerBirthDate":"1996-02-07",
import csv
import requests

outfile = open("NHL_Recent.csv", "a", newline='')
writer = csv.writer(outfile)
writer.writerow(["Player","Pos","GP","G","A","P","+/-","PIM","PPG","PPP","SHG","SHP","GWG","OTG","S","S%","TOI","Shifts/PG","FOW%"])

req = requests.get('http://www.nhl.com/stats/rest/skaters?isAggregate=true&reportType=basic&isGame=true&reportName=skatersummary&sort=[{%22property%22:%22shots%22,%22direction%22:%22DESC%22}]&cayenneExp=gameDate%3E=%222017-11-4%22%20and%20gameDate%3C=%222017-11-10%22%20and%20gameTypeId=2')
data = req.json()['data']

for item in data:
    Player = item['playerName']
    Pos = item['playerPositionCode']
    GP = item['gamesPlayed']
But I am not familiar with data in this format:
"totalItems" : 600,
"totalEvents" : 0,
"totalGames" : 600,
"totalMatches" : 0,
"wait" : 10,
"dates" : [ {
"date" : "2017-10-04",
"totalItems" : 4,
"totalEvents" : 0,
"totalGames" : 4,
"totalMatches" : 0,
"games" : [ {
"gamePk" : 2017020001,
"link" : "/api/v1/game/2017020001/feed/live",
"gameType" : "R",
"season" : "20172018",
"gameDate" : "2017-10-04T23:00:00Z",
"status" : {
"abstractGameState" : "Final",
"codedGameState" : "7",
"detailedState" : "Final",
"statusCode" : "7",
"startTimeTBD" : false
},
"teams" : {
"away" : {
"leagueRecord" : {
"wins" : 1,
"losses" : 0,
"ot" : 0,
"type" : "league"
},
"score" : 7,
"team" : {
"id" : 10,
"name" : "Toronto Maple Leafs",
"link" : "/api/v1/teams/10",
"venue" : {
"name" : "Air Canada Centre",
"link" : "/api/v1/venues/null",
"city" : "Toronto",
"timeZone" : {
"id" : "America/Toronto",
"offset" : -5,
"tz" : "EST"
}
},
"abbreviation" : "TOR",
"teamName" : "Maple Leafs",
"locationName" : "Toronto",
"firstYearOfPlay" : "1926",
"division" : {
"id" : 17,
"name" : "Atlantic",
"link" : "/api/v1/divisions/17"
},
"conference" : {
"id" : 6,
"name" : "Eastern",
"link" : "/api/v1/conferences/6"
},
"franchise" : {
"franchiseId" : 5,
"teamName" : "Maple Leafs",
"link" : "/api/v1/franchises/5
This is what I have so far with no success.
import csv
import requests
import os

outfile = open("NHL DIF JSON.csv", "a", newline='')
writer = csv.writer(outfile)
writer.writerow(["Date","Game","gamep"])

req = requests.get('https://statsapi.web.nhl.com/api/v1/schedule?startDate=2017-10-04&endDate=2018-04-30&expand=schedule.teams,schedule.linescore,schedule.broadcasts.all,schedule.ticket,schedule.game.content.media.epg,schedule.radioBroadcasts,schedule.metadata,schedule.game.seriesSummary,seriesSummary.series&leaderCategories=&leaderGameTypes=R&site=en_nhl&teamId=&gameType=&timecode=')
data = req.json()['dates']

for item in data:
    Date = item['date']
    ##for item in games:
    Game = item['0']
    gamep = item['gamePk']
    print(Date, Game)
    writer.writerow([Date, Game, gamep])

outfile.close()
os.system("taskkill /f /im pythonw.exe")
I would like to pull the "gamePk" and "gameDate" from each of the games, along with the team names within "teams" and a few other categories, and eventually put that into a CSV with the gamePk, gameDate, teams, score, etc. I'm just not sure how to work through the individual categories; any help would be greatly appreciated. Thanks!
It's normal JSON data, just a bit nested. You can get the date from data['dates'][i]['date']. For the teams, scores, etc. you have to iterate over data['dates'][i]['games'].
import csv
import requests

req = requests.get('https://statsapi.web.nhl.com/api/v1/schedule?startDate=2017-10-04&endDate=2018-04-30&expand=schedule.teams,schedule.linescore,schedule.broadcasts.all,schedule.ticket,schedule.game.content.media.epg,schedule.radioBroadcasts,schedule.metadata,schedule.game.seriesSummary,seriesSummary.series&leaderCategories=&leaderGameTypes=R&site=en_nhl&teamId=&gameType=&timecode=')
data = req.json()

my_data = []
for item in data['dates']:
    date = item['date']
    games = item['games']
    for game in games:
        gamePk = game['gamePk']
        gameDate = game['gameDate']
        team_away, team_home = game['teams']['away'], game['teams']['home']
        team_away_score = team_away['score']
        team_home_score = team_home['score']
        team_away_name = team_away['team']['name']
        team_home_name = team_home['team']['name']
        my_data.append([date, gamePk, gameDate, team_away_name, team_home_name, team_away_score, team_home_score])

headers = ["date", "gamePk", "gameDate", "team_away_name", "team_home_name", "team_away_score", "team_home_score"]
with open("my_file.csv", "a", newline='') as f:
    writer = csv.writer(f)
    writer.writerow(headers)
    writer.writerows(my_data)
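One caveat: because the file is opened in append mode, running the script more than once will write the header row again. If you want the file rebuilt on each run, open it in write mode instead:

with open("my_file.csv", "w", newline='') as f:  # "w" replaces the file instead of appending
    writer = csv.writer(f)
    writer.writerow(headers)
    writer.writerows(my_data)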
As for your last question, you can get the 'pk' from data['gameData']['game']['pk']. The player, event, triCode and coordinates values are a little harder to get, because some items don't have 'players' or 'team' keys, or have an empty 'coordinates' dict.
In cases like this the dict.get method is helpful, because it returns None (or a default value of your choice) if you try to access a non-existent key.
Still you have to design your code according to the structure of the json data, example:
import requests

req = requests.get('https://statsapi.web.nhl.com/api/v1/game/2017020001/feed/live?site=en_nhl')
data = req.json()

my_data = []
pk = data['gameData']['game']['pk']
for item in data['liveData']['plays']['allPlays']:
    players = item.get('players')
    if players:
        player_a = players[0]['player']['fullName'] if len(players) > 0 else None
        player_b = players[1]['player']['fullName'] if len(players) > 1 else None
    else:
        player_a, player_b = None, None
    event = item['result']['event']
    triCode = item.get('team', {}).get('triCode')
    coordinates_x, coordinates_y = item['coordinates'].get('x'), item['coordinates'].get('y')
    my_data.append([pk, player_a, player_b, event, triCode, coordinates_x, coordinates_y])

for row in my_data:
    print(row)
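If you also want the play-by-play rows in a CSV, the same csv.writer pattern from above applies; a sketch (the file name and header labels are placeholders of my choosing):

import csv

headers = ["pk", "player_a", "player_b", "event", "triCode", "coordinates_x", "coordinates_y"]
with open("plays.csv", "w", newline='') as f:
    writer = csv.writer(f)
    writer.writerow(headers)
    writer.writerows(my_data)  # my_data as built in the loop above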
Related
I would like to convert my DataFrame into a specific JSON structure. I tried to use to_dict(), but so far I haven't found the right parameters to replicate the output.
Do you have any idea?
My code:
import pandas as pd

data = {
    'alt' : ["BeattheBeachmark NEW", "BeattheBeachmark NEW"],
    'Mod' : ["GA", "GA"],
    'Pers' : ["Movment", "Movment"],
    'Vie' : ["Inprogress", "Inprogress"],
    'Actions' : ["Clear", "Add"]
}
df = pd.DataFrame(data)
The output I would like:
result = {
    "alt" : {
        "BeattheBeachmark NEW" : {
            "Mod" : {
                "GA" : {
                    "Pers" : {
                        "Movment" : {
                            "Vie" : {
                                "Inprogress" : {
                                    'Actions' : ["Clear", "Add"]
                                }
                            }
                        }
                    }
                }
            }
        }
    }
}
You can group your dataframe by "alt", by "Mod"... and so on and create your dictionary along the way:
import pandas as pd
import json

data = {
    'alt' : ["BeattheBeachmark NEW", "BeattheBeachmark NEW"],
    'Mod' : ["GA", "GA"],
    'Pers' : ["Movment", "Movment"],
    'Vie' : ["Inprogress", "Inprogress"],
    'Actions' : ["Clear", "Add"]
}
df = pd.DataFrame(data)

output_dict = dict()
output_dict['alt'] = dict()
for alt in df.groupby("alt"):
    output_dict['alt'][alt[0]] = dict()
    output_dict['alt'][alt[0]]["Mod"] = dict()
    for mod in alt[1].groupby("Mod"):
        output_dict['alt'][alt[0]]["Mod"][mod[0]] = dict()
        output_dict['alt'][alt[0]]["Mod"][mod[0]]["Pers"] = dict()
        for pers in mod[1].groupby("Pers"):
            output_dict['alt'][alt[0]]["Mod"][mod[0]]["Pers"][pers[0]] = dict()
            output_dict['alt'][alt[0]]["Mod"][mod[0]]["Pers"][pers[0]]["Vie"] = dict()
            for vie in pers[1].groupby("Vie"):
                output_dict['alt'][alt[0]]["Mod"][mod[0]]["Pers"][pers[0]]["Vie"][vie[0]] = dict()
                output_dict['alt'][alt[0]]["Mod"][mod[0]]["Pers"][pers[0]]["Vie"][vie[0]]["Actions"] = list(vie[1].Actions)

print(json.dumps(output_dict, indent=4))
Output:
{
    "alt": {
        "BeattheBeachmark NEW": {
            "Mod": {
                "GA": {
                    "Pers": {
                        "Movment": {
                            "Vie": {
                                "Inprogress": {
                                    "Actions": [
                                        "Clear",
                                        "Add"
                                    ]
                                }
                            }
                        }
                    }
                }
            }
        }
    }
}
EDIT: for archival purposes, here is a recursive solution to this kind of problem, which makes it much more generic:
import pandas as pd
import json

data = {
    'alt' : ["BeattheBeachmark NEW", "BeattheBeachmark NEW"],
    'Mod' : ["GA", "GA"],
    'Pers' : ["Movment", "Movment"],
    'Vie' : ["Inprogress", "Inprogress"],
    'Actions' : ["Clear", "Add"]
}
df_in = pd.DataFrame(data)

output_dict = dict()

def extract_columns(df, col, output_dict):
    if col == len(df.columns) - 1:
        output_dict[df.columns[col]] = list(df[df.columns[col]])
    else:
        output_dict[df.columns[col]] = dict()
        for first_col_grp in df.groupby(df.columns[col]):
            output_dict[df.columns[col]][first_col_grp[0]] = dict()
            extract_columns(first_col_grp[1], col + 1, output_dict[df.columns[col]][first_col_grp[0]])

extract_columns(df_in, 0, output_dict)
print(json.dumps(output_dict, indent=4))
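Because the nesting simply follows the column order, reordering the DataFrame's columns changes the nesting order; a quick illustration, reusing df_in and extract_columns from above (the reordered column list is just an example):

# Hypothetical reordering: "Mod" becomes the outermost level instead of "alt".
reordered_dict = dict()
extract_columns(df_in[['Mod', 'alt', 'Pers', 'Vie', 'Actions']], 0, reordered_dict)
print(json.dumps(reordered_dict, indent=4))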
To get the same dictionary as in your example, you can iterate over your DataFrame's columns and build the dictionary as follows (using literal evaluation to help, since df.to_json returns a string and you want a list):
import ast

your_dict = {}
for col in df.columns:
    your_dict[col] = df[col].to_json(orient='records')
    your_dict[col] = ast.literal_eval(your_dict[col])

print(your_dict)
Giving you:
{'alt': ['BeattheBeachmark NEW', 'BeattheBeachmark NEW'],
'Mod': ['GA', 'GA'],
'Pers': ['Movment', 'Movment'],
'Vie': ['Inprogress', 'Inprogress'],
'Actions': ['Clear', 'Add']}
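If you want to avoid the string round-trip through to_json and ast.literal_eval, pandas' to_dict with orient='list' should give the same flat structure directly:

your_dict = df.to_dict(orient='list')
print(your_dict)  # {'alt': ['BeattheBeachmark NEW', 'BeattheBeachmark NEW'], 'Mod': ['GA', 'GA'], ...}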
I have a json file just like this:
{
    "CVE_data_type" : "CVE",
    "CVE_Items" : [ {
        "cve" : {
            "CVE_data_meta" : {
                "ID" : "CVE-2020-0001",
                "ASSIGNER" : "security#android.com"
            },
            ...
        "configurations" : {
            "CVE_data_version" : "4.0",
            "nodes" : [ {
                "operator" : "OR",
                "children" : [ ],
                "cpe_match" : [ {
                    "vulnerable" : true,
                    "cpe23Uri" : "cpe:2.3:o:google:android:8.0:*:*:*:*:*:*:*",
                    "cpe_name" : [ ]
                }, {
                    "vulnerable" : true,
                    "cpe23Uri" : "cpe:2.3:o:google:android:8.1:*:*:*:*:*:*:*",
                    "cpe_name" : [ ]
                }]
            } ]
        },
        ...
        "publishedDate" : "2020-01-08T19:15Z",
        "lastModifiedDate" : "2020-01-14T21:52Z"
    }]
}
And I want to extract the CVE ID and its corresponding CPEs, so I can locate the CVE ID through a CPE. Here is my code:
import ijson
import datetime

def parse_json(filename):
    with open(filename, 'rb') as input_file:
        CVEID = ijson.items(input_file, 'CVE_Items.item.cve.CVE_data_meta.ID')
        for id in CVEID:
            print("CVE id: %s" % id)
        # for prefix, event, value in parser:
        #     print('prefix={}, event={}, value={}'.format(prefix, event, value))
    with open(filename, 'rb') as input_file:
        cpes = ijson.items(input_file, 'CVE_Items.item.configurations.nodes.item.cpe_match.item')
        for cpe in cpes:
            print("cpe: %s" % cpe['cpe23Uri'])

def main():
    parse_json("cve.json")
    end = datetime.datetime.now()

if __name__ == '__main__':
    main()
Results:
CVE id: CVE-2020-0633
CVE id: CVE-2020-0631
cpe: cpe:2.3:o:google:android:8.0:*:*:*:*:*:*:*
cpe: cpe:2.3:o:google:android:10.0:*:*:*:*:*:*:*
cpe: cpe:2.3:o:microsoft:windows_10:1607:*:*:*:*:*:*:*
cpe: cpe:2.3:o:microsoft:windows_server_2016:-:*:*:*:*:*:*:*
But this just extracts the data, with no correspondence between the IDs and the CPEs.
Could anyone help? A little help would be appreciated.
I think if you need to keep track of CVE IDs and their corresponding CPEs, you'll need to iterate over whole CVE items and extract the bits of data you need, so you only make one pass through the file. This is not as memory-efficient as your original iteration, but as long as each item in CVE_Items is not too big it's not a problem:
with open(filename, 'rb') as input_file:
    for cve in ijson.items(input_file, 'CVE_Items.item'):
        cve_id = cve['cve']['CVE_data_meta']['ID']
        cpes = [match
                for node in cve['configurations']['nodes']
                for match in node['cpe_match']]
        print(cve_id, [match['cpe23Uri'] for match in cpes])
If you know nodes always contains a single element, you can replace the list comprehension with cve['configurations']['nodes'][0]['cpe_match'].
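Since the end goal is to locate CVE IDs through a CPE, one option (a sketch, not the only way) is to build a dictionary mapping each cpe23Uri to the CVE IDs that reference it during that single pass:

import ijson
from collections import defaultdict

cve_ids_by_cpe = defaultdict(list)
with open("cve.json", 'rb') as input_file:  # file name taken from the question's main()
    for cve in ijson.items(input_file, 'CVE_Items.item'):
        cve_id = cve['cve']['CVE_data_meta']['ID']
        for node in cve['configurations']['nodes']:
            for match in node['cpe_match']:
                cve_ids_by_cpe[match['cpe23Uri']].append(cve_id)

# Example lookup (this particular CPE appears in the sample above):
print(cve_ids_by_cpe["cpe:2.3:o:google:android:8.0:*:*:*:*:*:*:*"])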
I am pulling information from this website's API:
https://financialmodelingprep.com/
To be specific, I need the data from the income statements:
https://financialmodelingprep.com/developer/docs/#Company-Financial-Statements
What I get back from the API is a list containing 36 dictionaries with the following data:
[ {
"date" : "2019-09-28",
"symbol" : "AAPL",
"fillingDate" : "2019-10-31 00:00:00",
"acceptedDate" : "2019-10-30 18:12:36",
"period" : "FY",
"revenue" : 260174000000,
"costOfRevenue" : 161782000000,
"grossProfit" : 98392000000,
"grossProfitRatio" : 0.378178,
"researchAndDevelopmentExpenses" : 16217000000,
"generalAndAdministrativeExpenses" : 18245000000,
"sellingAndMarketingExpenses" : 0.0,
"otherExpenses" : 1807000000,
"operatingExpenses" : 34462000000,
"costAndExpenses" : 196244000000,
"interestExpense" : 3576000000,
"depreciationAndAmortization" : 12547000000,
"ebitda" : 81860000000,
"ebitdaratio" : 0.314636,
"operatingIncome" : 63930000000,
"operatingIncomeRatio" : 0.24572,
"totalOtherIncomeExpensesNet" : 422000000,
"incomeBeforeTax" : 65737000000,
"incomeBeforeTaxRatio" : 0.252666,
"incomeTaxExpense" : 10481000000,
"netIncome" : 55256000000,
"netIncomeRatio" : 0.212381,
"eps" : 2.97145,
"epsdiluted" : 2.97145,
"weightedAverageShsOut" : 18595652000,
"weightedAverageShsOutDil" : 18595652000,
"link" : "https://www.sec.gov/Archives/edgar/data/320193/000032019319000119/0000320193-19-000119-index.html",
"finalLink" : "https://www.sec.gov/Archives/edgar/data/320193/000032019319000119/a10-k20199282019.htm"
}, ...
]
What I don't need in the dictionaries are the keys:
fillingDate, acceptedDate, link, finalLink
I managed to remove them, but my problem is that the piece of code I wrote now prints those dictionaries far too often, and I am not able to understand why...
Here is what I tried:
import requests
import json

url = "https://financialmodelingprep.com/api/v3/income-statement/AAPL?apikey=b60bb3d1967bb15bfb9daaa4426e77dc"
response = requests.get(url)
data = response.text
dataList = json.loads(data)

entriesToRemove = {
    'fillingDate' : 0,
    'acceptedDate' : 0,
    'link' : 0,
    'finalLink' : 0
}

removedEntries = []
newDict = {}
for index in range(len(dataList)):
    for key in dataList[index]:
        newDict[key] = dataList[index].get(key)
        if key in entriesToRemove:
            removedEntries = newDict.pop(key)
        print(json.dumps(newDict, indent=4))
Thanks in advance
OP:
for each key in the dictionary, the dictionary gets printed again.
Reason:
for index in range(len(dataList)):
    for key in dataList[index]:
        newDict[key] = dataList[index].get(key)
        if key in entriesToRemove:
            removedEntries = newDict.pop(key)
        print(json.dumps(newDict, indent=4)) # notice this line
The dictionary is printed for each key because the print(json.dumps(newDict, indent=4)) statement sits inside the inner loop, so it runs once per key-value iteration over the dictionary.
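The minimal fix is therefore to dedent the print so it runs once, after both loops; a sketch reusing dataList and entriesToRemove from the question (note that because newDict is reused across all 36 statements, only the last statement's values survive, which the approach below avoids):

newDict = {}
for index in range(len(dataList)):
    for key in dataList[index]:
        newDict[key] = dataList[index].get(key)
        if key in entriesToRemove:
            newDict.pop(key)
print(json.dumps(newDict, indent=4))  # printed once, after the loops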
To remove the unwanted keys from a list of dicts, you can iterate over the list and build another list of dicts without those keys:
s = [ {
"date" : "2019-09-28",
"symbol" : "AAPL",
"fillingDate" : "2019-10-31 00:00:00",
"acceptedDate" : "2019-10-30 18:12:36",
"period" : "FY",
"revenue" : 260174000000,
"costOfRevenue" : 161782000000,
"grossProfit" : 98392000000,
"grossProfitRatio" : 0.378178,
"researchAndDevelopmentExpenses" : 16217000000,
"generalAndAdministrativeExpenses" : 18245000000,
"sellingAndMarketingExpenses" : 0.0,
"otherExpenses" : 1807000000,
"operatingExpenses" : 34462000000,
"costAndExpenses" : 196244000000,
"interestExpense" : 3576000000,
"depreciationAndAmortization" : 12547000000,
"ebitda" : 81860000000,
"ebitdaratio" : 0.314636,
"operatingIncome" : 63930000000,
"operatingIncomeRatio" : 0.24572,
"totalOtherIncomeExpensesNet" : 422000000,
"incomeBeforeTax" : 65737000000,
"incomeBeforeTaxRatio" : 0.252666,
"incomeTaxExpense" : 10481000000,
"netIncome" : 55256000000,
"netIncomeRatio" : 0.212381,
"eps" : 2.97145,
"epsdiluted" : 2.97145,
"weightedAverageShsOut" : 18595652000,
"weightedAverageShsOutDil" : 18595652000,
"link" : "https://www.sec.gov/Archives/edgar/data/320193/000032019319000119/0000320193-19-000119-index.html",
"finalLink" : "https://www.sec.gov/Archives/edgar/data/320193/000032019319000119/a10-k20199282019.htm"
}
]
res = []
ignored_keys = ['fillingDate', 'acceptedDate', 'link', 'finalLink']
for dd in s:
    res.append({k: v for k, v in dd.items() if k not in ignored_keys})

print(res)
EDIT:
one-liner:
print([{k: v for k, v in dd.items() if k not in ignored_keys} for dd in s])
I have a Python program calling an API and receiving the result below:
{
    "result": [
        {
            "company" : "BMW",
            "model" : "5"
        },
        {
            "company" : "BMW",
            "model" : "5"
        },
        {
            "company" : "BMW",
            "model" : "5"
        },
        {
            "company" : "BMW",
            "model" : "3"
        },
        {
            "company" : "BMW",
            "model" : "7"
        },
        {
            "company" : "AUDI",
            "model" : "A3"
        },
        {
            "company" : "AUDI",
            "model" : "A7"
        }
    ]
}
Now my task is to count the occurrences of each model in the JSON output and group them by company. The expected output should look like this:
{
    "BMW" : {
        "5series" : 3,
        "3series" : 1,
        "7series" : 1
    },
    "AUDI" : {
        "A3" : 1,
        "A7" : 1
    },
    "MERCEDES" : {
        "EClass" : 0,
        "SClass" : 0
    }
}
I need to find each "company" from a list of company names. That list may include names that are sometimes missing from the JSON response, and in that case the expected output should still include them with a count of 0. The "model" names (3, 5, 7, A3, etc.) are fixed, so we know those are the only ones that may or may not appear in the JSON API response.
For example: the list has 3 company names in the code below, companyname = ["BMW", "AUDI", "MERCEDES"]. However, the JSON API response may sometimes be missing one or more of them. In this case "MERCEDES" is missing, but the final output should still include "MERCEDES" with all values as 0.
Here is what I have tried so far:
def modelcount():
    companyname = ["BMW", "AUDI", "MERCEDES"]
    url = apiurl
    # Send request
    apiresponse = requests.get(url, auth=(user, password), headers=headers, proxies=proxies)
    # Decode the JSON response into a dictionary and use the data
    data = apiresponse.json()
    print(len(data['result']))
    series3 = 0
    series5 = 0
    series7 = 0
    A3 = 0
    A7 = 0
    EClass = 0
    SClass = 0
    modelcountjson = {}
    for name in companyname:
        for item in data['result']:
            models = {}
            if item['company'] == name:
                if item['model'] == '3':
                    series3 = series3 + 1
                elif item['model'] == '5':
                    series5 = series5 + 1
                elif item['model'] == '7':
                    series7 = series7 + 1
            models['3series'] = series3
            models['5series'] = series5
            models['7series'] = series7
            # I still haven't written AUDI, MERCEDES above. This is where I feel I am writing inefficiently.
        modelcountjson[name] = models
    return jsonify(modelcountjson)
As the number of models grows, I am worried the code will become redundant, with many for loops that may cause performance overhead. I am looking for help on achieving the end result in the most efficient way.
Thank you so much for your help.
A useful package for working directly with JSON-style dictionaries and lists is toolz (see its documentation for more details). With it you can concisely group the data and count the occurrences of each model, while handling potentially missing companies separately:
from toolz import itertoolz

result = {
    "result": [
        {"company": "BMW", "model": "5"},
        {"company": "BMW", "model": "5"},
        {"company": "BMW", "model": "5"},
        {"company": "BMW", "model": "3"},
        {"company": "BMW", "model": "7"},
        {"company": "AUDI", "model": "A3"},
        {"company": "AUDI", "model": "A7"},
    ]
}

final_output = {}
grouped_result = itertoolz.groupby('company', result['result'])

if 'MERCEDES' not in grouped_result:
    final_output['MERCEDES'] = {
        'EClass': 0,
        'SClass': 0
    }

for key, value in grouped_result.items():
    models = itertoolz.pluck('model', value)
    final_output[key] = itertoolz.frequencies(models)
The output results in:
{'AUDI': {'A3': 1, 'A7': 1}, 'BMW': {'3': 1, '5': 3, '7': 1}, 'MERCEDES': {'EClass': 0, 'SClass': 0}}
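If the list of companies that must always appear is longer or changes, the hard-coded 'MERCEDES' check can be generalised; a sketch, where the known_models mapping is my own addition (it is not part of toolz or the question):

known_models = {
    "BMW": ["3", "5", "7"],
    "AUDI": ["A3", "A7"],
    "MERCEDES": ["EClass", "SClass"],
}
for company, model_names in known_models.items():
    # Any company missing from the API response gets an all-zero entry.
    final_output.setdefault(company, {m: 0 for m in model_names})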
You could go for a bit of a separation of code and config:
conf = {
    'BMW': {'format': '{}series', 'keys': ['3', '5', '7']},
    'AUDI': {'format': '{}', 'keys': ['A3', 'A7']},
    'MERCEDES': {'format': '{}Class', 'keys': ['E', 'S']},
}

def modelcount():
    # retrieve `data`
    # ...
    result = {
        k: {
            v['format'].format(key): 0 for key in v['keys']
        } for k, v in conf.items()
    }
    for car in data['result']:
        com = car['company']
        mod = car['model']
        key = conf[com]['format'].format(mod)
        result[com][key] += 1
    for com in result:
        result[com]['Total'] = sum(result[com].values())
    return result
>>> modelcount()
{'BMW': {'3series': 1, '5series': 3, '7series': 1, 'Total': 5},
 'AUDI': {'A3': 1, 'A7': 1, 'Total': 2},
 'MERCEDES': {'EClass': 0, 'SClass': 0, 'Total': 0}}
This way, for more companies and models, you will only have to touch the conf, not the code. The time complexity of this is O(m+n) with m the total number of distinct models and n the number of cars in the API response.
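For example, supporting a new company is a one-line change to the conf (the TESLA entry below is purely illustrative, not from the question):

conf['TESLA'] = {'format': 'Model {}', 'keys': ['3', 'S']}  # hypothetical entry
# modelcount() now also reports {'Model 3': 0, 'Model S': 0, 'Total': 0} for TESLA
# (zero counts, since no TESLA rows appear in the sample response) without touching the loop.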
I have tried to use the online Jsonify It tool, which can create nested JSON data from my data, but I can't seem to get it to work. I have also tried Python code from other posts here, but that does not seem to work either. If you know an easier method than using Python, that would be good too.
Here is my .CSV data:
ID,Name,Date,Subject,Start,Finish
0,Ladybridge High School,01/11/2019,Maths,05:28,0
0,Ladybridge High School,02/11/2019,Maths,05:30,06:45
0,Ladybridge High School,01/11/2019,Economics,11:58,12:40
0,Ladybridge High School,02/11/2019,Economics,11:58,12:40
1,Loreto Sixth Form,01/11/2019,Maths,05:28,06:45
1,Loreto Sixth Form,02/11/2019,Maths,05:30,06:45
1,Loreto Sixth Form,01/11/2019,Economics,11:58,12:40
1,Loreto Sixth Form,02/11/2019,Economics,11:58,12:40
This is the nested JSON structure I would like:
{
    "Timetable" : [ {
        "Date" : {
            "01-11-2019" : {
                "Maths" : {
                    "Start" : "05:28",
                    "Finish" : "06:45"
                },
                "Economics" : {
                    "Start" : "11:58",
                    "Finish" : "12:40"
                }
            },
            "02-11-2019" : {
                "Maths" : {
                    "Start" : "05:30",
                    "Finish" : "06:45"
                },
                "Economics" : {
                    "Start" : "11:58",
                    "Finish" : "12:40"
                }
            }
        },
        "Name" : "Ladybridge High School"
    }, {
        "Date" : {
            "01-11-2019" : {
                "Maths" : {
                    "Start" : "05:28",
                    "Finish" : "06:45"
                },
                "Economics" : {
                    "Start" : "11:58",
                    "Finish" : "12:40"
                }
            },
            "02-11-2019" : {
                "Maths" : {
                    "Start" : "05:30",
                    "Finish" : "06:45"
                },
                "Economics" : {
                    "Start" : "11:58",
                    "Finish" : "12:40"
                }
            }
        },
        "Name" : "Loreto Sixth Form"
    } ]
}
Something like this?
[EDIT]
I refactored it to handle arbitrary top-level keys for each entry in the timetable. I also made it first create a dict and then convert the dict to a list so that it can run in O(N) time, in case the input is very large.
import csv

timetable = {}

with open('data.csv') as f:
    csv_data = [{k: v for k, v in row.items()} for row in csv.DictReader(f, skipinitialspace=True)]

for row in csv_data:
    if not timetable.get(row["ID"]):
        timetable[row["ID"]] = {"ID": row["ID"], "Date": {}}
    for k in row.keys():
        # Date has to be handled as a special case
        if k == "Date":
            # setdefault so a second subject on the same date doesn't wipe out the first
            timetable[row["ID"]]["Date"].setdefault(row["Date"], {})
            timetable[row["ID"]]["Date"][row["Date"]][row["Subject"]] = {
                "Start": row["Start"],
                "Finish": row["Finish"]
            }
        # Ignore these keys because they are only used for 'Date'
        elif k == "Start" or k == "Finish" or k == "Subject":
            continue
        # Use everything else
        else:
            timetable[row["ID"]][k] = row[k]

timetable = {"Timetable": [v for k, v in timetable.items()]}
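To actually write the nested structure out as JSON, json.dump works on the resulting dict; the output file name below is just a placeholder:

import json

with open("timetable.json", "w") as out:
    json.dump(timetable, out, indent=4)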
A variation on the above answer that groups by "Name" rather than "ID", so each entry nests "Name" and "Date" as in the desired output:
import csv

timetable = {"Timetable": []}
print(timetable)

with open("C:/Users/kspv914/Downloads/data.csv") as f:
    csv_data = [{k: v for k, v in row.items()} for row in csv.DictReader(f, skipinitialspace=True)]

name_array = []
for name in [row["Name"] for row in csv_data]:
    name_array.append(name)
name_set = set(name_array)

for name in name_set:
    timetable["Timetable"].append({"Name": name, "Date": {}})

for row in csv_data:
    for entry in timetable["Timetable"]:
        if entry["Name"] == row["Name"]:
            # setdefault so a second subject on the same date doesn't wipe out the first
            entry["Date"].setdefault(row["Date"], {})
            entry["Date"][row["Date"]][row["Subject"]] = {
                "Start": row["Start"],
                "Finish": row["Finish"]
            }

print(timetable)