3 CSV files into a single JSON - python

I have 3 CSV files: loan, customer, and security. Each file is quite large (800k+ rows), and they are linked by a single column, UniqueID. My aim is to create a single JSON file. The code below achieves this, but it is very slow.
My question is: how can I achieve this result faster?
import csv
import json

def multicsvtojson():
    loanscsvfile = open('C:\\***\\loan.csv', 'r')
    custcsvfile = open('C:\\***\\customer.csv', 'r')
    securcsvfile = open('C:\\***\\security.csv', 'r')
    loansreader = csv.DictReader(loanscsvfile, delimiter=',')
    custreader = csv.DictReader(custcsvfile, delimiter=',')
    securreader = csv.DictReader(securcsvfile, delimiter=',')
    jsonfile = open('test.json', 'w')  # ready json file

    output = []
    loanscount = 0  # total loan count

    for loansrow in loansreader:
        loansrow['customers'] = []
        loansrow['securities'] = []
        output.append(loansrow)
        custcsvfile.seek(0)
        securcsvfile.seek(0)
        for custrow in custreader:
            if loansrow["UniqueID"] == custrow["UniqueID"]:
                loansrow['customers'].append(custrow)
        for securrow in securreader:
            if loansrow["UniqueID"] == securrow["UniqueID"]:
                loansrow['securities'].append(securrow)
        loanscount = loanscount + 1  # increment the loan counter
        print(loanscount)

    total = {}
    total['total'] = loanscount
    output.insert(0, total)
    json.dump(output, jsonfile, indent=4)
The current output is as follows:
[
    {
        "total": 2
    },
    {
        "uniqueID": "",
        "uniqueID2": "",
        "colA": "",
        "colB": "",
        "colC": "",
        "colD": "",
        "customers": [
            {
                "uniqueID": "",
                "custID": "",
                "colA": "",
                "colB": ""
            }
        ],
        "securities": [
            {
                "uniqueID": "",
                "secuID": "",
                "colA": "",
                "colB": ""
            }
        ]
    },
    {
        "uniqueID": "",
        "uniqueID2": "",
        "colA": "",
        "colB": "",
        "colC": "",
        "colD": "",
        "customers": [
            {
                "uniqueID": "",
                "custID": "",
                "colA": "",
                "colB": ""
            },
            {
                "uniqueID": "",
                "secuID": "",
                "colA": "",
                "colB": ""
            }
        ],
        "securities": [
            {
                "uniqueID": "",
                "secuID": "",
                "colA": "",
                "colB": ""
            },
            {
                "uniqueID": "",
                "secuID": "",
                "colA": "",
                "colB": ""
            }
        ]
    }
]

What probably costs you performance is the repeated reading of the customer and security files: you re-read both files in full for every loan row, which makes the run time quadratic in the number of rows.
Try grouping customers and securities by ID before you walk the loans file, so each file is read only once.
import csv
import json

def multicsvtojson():
    loanscsvfile = open('C:\\***\\loan.csv', 'r')
    custcsvfile = open('C:\\***\\customer.csv', 'r')
    securcsvfile = open('C:\\***\\security.csv', 'r')
    loansreader = csv.DictReader(loanscsvfile, delimiter=',')
    custreader = csv.DictReader(custcsvfile, delimiter=',')
    securreader = csv.DictReader(securcsvfile, delimiter=',')
    jsonfile = open('test.json', 'w')  # ready json file

    output = []
    loanscount = 0

    # regroup customers by ID
    customers = {}
    for custrow in custreader:
        id = custrow["UniqueID"]
        if id not in customers:
            customers[id] = []
        customers[id].append(custrow)

    # regroup securities by ID
    securities = {}
    for securrow in securreader:
        id = securrow["UniqueID"]
        if id not in securities:
            securities[id] = []
        securities[id].append(securrow)

    # single pass over the loans file, looking up the pre-built groups
    for loansrow in loansreader:
        loansrow['customers'] = customers.get(loansrow["UniqueID"], [])
        loansrow['securities'] = securities.get(loansrow["UniqueID"], [])
        output.append(loansrow)
        loanscount = loanscount + 1  # increment the loan counter
        print(loanscount)

    total = {}
    total['total'] = loanscount
    output.insert(0, total)
    json.dump(output, jsonfile, indent=4)
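As a side note, collections.defaultdict(list) expresses the grouping step a little more idiomatically (a sketch of just that step, equivalent to the loops above):

from collections import defaultdict

# missing keys are created with an empty list automatically
customers = defaultdict(list)
for custrow in custreader:
    customers[custrow["UniqueID"]].append(custrow)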
If the CSV files are too big to handle in memory, you can also try to use a temporary database such as tinydb.
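A minimal sketch of that idea with TinyDB (the file and field names here just mirror the CSVs above; this is the general pattern, not something tested against your data):

from tinydb import TinyDB, Query

# insert each customer row into a file-backed table, once
custdb = TinyDB('customers_db.json')
for custrow in custreader:
    custdb.insert(custrow)

# later, while walking the loans file, look up all customers for one loan
Row = Query()
matches = custdb.search(Row.UniqueID == loansrow["UniqueID"])

TinyDB keeps its storage as plain JSON without indexes, so at 800k+ rows the stdlib sqlite3 module may be the more practical choice.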

Related

How to read a .json file in pandas to export it as a readable .csv file

I have created a .json file by appending a number of JSON strings retrieved with GET requests. My aim is to convert the appended .json file into a readable .csv file. The .json file has the following format:
[
    {
        "trades": [
            {
                "id": 20995465,
                "unique_identifier": null,
                "transaction_type": "BUY",
                "transaction_date": "2016-11-08",
                "symbol": "RDSA",
                "market": "LSE",
                "quantity": 10,
                "price": 20.84,
                "exchange_rate": 0.5525,
                "brokerage": 3.619909502,
                "brokerage_currency_code": "GBP",
                "value": 380.81,
                "comments": null,
                "portfolio_id": 293304,
                "holding_id": 6258682,
                "instrument_id": 32021,
                "confirmed": true,
                "links": {
                    "portfolio": "https://api.sharesight.com/api/v3/portfolios/293304"
                }
            }
        ],
        "links": {
            "self": "https://api.sharesight.com/api/v3/portfolios/2/trades"
        }
    },
    {
        "trades": [
            {
                "id": 20995425,
                "unique_identifier": null,
                "transaction_type": "BUY",
                "transaction_date": "2018-11-08",
                "symbol": "PDSA",
                "market": "LSE",
                "quantity": 1,
                "price": 2.84,
                "exchange_rate": 0.25,
                "brokerage": 7.619909502,
                "brokerage_currency_code": "GBP",
                "value": 80.81,
                "comments": null,
                "portfolio_id": 293604,
                "holding_id": 6258635,
                "instrument_id": 32023,
                "confirmed": true,
                "links": {
                    "portfolio": "https://api.sharesight.com/api/v3/portfolios/293604"
                }
            }
        ],
        "links": {
            "self": "https://api.sharesight.com/api/v3/portfolios/2/trades"
        }
    }
]
My attempt
import csv
import json

import requests
from oauthlib.oauth2 import BackendApplicationClient
from requests_oauthlib import OAuth2Session

client_id = 'ClientID'
client_secret = 'ClientSecret'
access_token_url = 'https://api.sharesight.com/oauth2/token'
client = BackendApplicationClient(client_id=client_id)
oauth = OAuth2Session(client=client)
token = oauth.fetch_token(token_url=access_token_url, client_id=client_id,
                          client_secret=client_secret)
access_token = token['access_token']
head = {'Authorization': f'Bearer {access_token}'}

# Get the portfolios
r = requests.get('https://api.sharesight.com/api/v2/portfolios.json', headers=head)
j = r.json()

rjs = []
for p in j['portfolios']:
    name = p['name']
    pid = p['id']
    print(f'Retrieving {name} - {pid}')
    vurl = f'https://api.sharesight.com/api/v2/portfolios/{pid}/trades.json'
    r = requests.get(vurl, headers=head)
    rj = r.json()
    rjs.append(rj)

with open('/Users/Filename.json', 'w') as json_file:
    json.dump(rjs, json_file)

# Opening JSON file and loading the data into the variable data
with open('/Users/Filename.json') as json_file:
    data = json.load(json_file)

trades_data = data['trades']

# now we will open a file for writing - create a blank .csv file
data_file = open('/Users/Filename.csv', 'w')

# create the csv writer object
csv_writer = csv.writer(data_file)

# Counter variable used for writing headers to the CSV file
count = 0
for emp in trades_data:
    if count == 0:
        # Writing headers of CSV file
        header = emp.keys()
        csv_writer.writerow(header)
        count += 1
    # Writing data of CSV file
    csv_writer.writerow(emp.values())
data_file.close()
Error Code
trades_data = data['trades']
TypeError: list indices must be integers or slices, not str
I think I get this error because 'trades' is replicated twice in my .json file and thus might be viewed as a string. Is there a workaround for this issue? I'm new to Python, so I would greatly appreciate your help!
Desired Output
A .csv file with the following structure:
Answer by @dsillman2000: data is a list, so iterate over its entries:

for entry in data:
    trades_data = entry['trades']
    # ... etc
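Building on that, a sketch of the full flatten-to-CSV step using pandas (json_normalize handles the nested "links" object; the file paths are the ones from the question):

import json
import pandas as pd

with open('/Users/Filename.json') as f:
    data = json.load(f)

# each top-level entry is one portfolio's response; pool all trades
all_trades = []
for entry in data:
    all_trades.extend(entry['trades'])

# json_normalize flattens nested dicts into dotted columns (links.portfolio)
df = pd.json_normalize(all_trades)
df.to_csv('/Users/Filename.csv', index=False)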

Converting excel spreadsheet to json

I want to convert an excel spreadsheet data to a JSON file. Here is the code I currently have:
Data
(Screenshot of the source spreadsheet: row 1 holds the field names (fileid, filetype, stusab, ...) and row 2 holds their descriptions (FILEID, FILETYPE, STUSAB, ...).)
Code
import xlrd
from collections import OrderedDict
import json

wb = xlrd.open_workbook('./file1.xlsx')
sh = wb.sheet_by_index(0)
data_list = []

for rownum in range(1, sh.nrows):
    data = OrderedDict()
    row_values = sh.row_values(rownum)
    data['name'] = row_values[0]
    data['description'] = row_values[1]
    data_list.append(data)

data_list = {'columns': data_list}
j = json.dumps(data_list)
with open('seq1.json', 'w') as f:
    f.write(j)
Output
{"columns": [{"name": "FILEID", "description": "FILETYPE"}]}
Expected output
{
"columns": [
{
"name": "fileid",
"description": "FILEID"
},
{
"name": "filetype",
"description": "FILETYPE"
},
{
"name": "stusab",
"description": "STUSAB"
},
{
"name": "chariter",
"description": "CHARITER"
},
{
"name": "sequence",
"description": "SEQUENCE"
},
{
"name": "logrecno",
"description": "LOGRECNO"
}
],
The "name" column should be displaying the first row while the "description" column should be displaying the second row.
What modification can I do in my function to get the output I am looking for?
You need to iterate over columns, not rows:
import xlrd
from collections import OrderedDict
import json

wb = xlrd.open_workbook('./file1.xlsx')
sh = wb.sheet_by_index(0)
data_list = []
data = OrderedDict()

for colnum in range(0, sh.ncols):
    data['name'] = sh.row_values(0)[colnum]
    data['description'] = sh.row_values(1)[colnum]
    data_list.append(data.copy())

data_list = {'columns': data_list}
j = json.dumps(data_list)
with open('seq1.json', 'w') as f:
    f.write(j)
You could also give excel2json a try:
import excel2json

excel2json.convert_from_file('file.xlsx')
You can use pandas:
import pandas as pd

df = pd.read_excel('./file1.xlsx')
with open('seq1.json', 'w') as f:
    f.write(df.to_json())
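Note that df.to_json() defaults to a column-oriented layout, so it will not match the expected {"columns": [...]} shape by itself. A sketch that does (assuming, as above, that row 1 of the sheet holds the names and row 2 the descriptions):

import json
import pandas as pd

# header=None keeps the two header-like rows as data rows 0 and 1
df = pd.read_excel('./file1.xlsx', header=None)
columns = [
    {'name': name, 'description': desc}
    for name, desc in zip(df.iloc[0], df.iloc[1])
]
with open('seq1.json', 'w') as f:
    json.dump({'columns': columns}, f, indent=4)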

python why in my json array each json element have the same value

So I have a JSON template, and I am reading from a CSV to update some of the values of the JSON properties. I then put all the JSON objects in an array to write to a file, but in my file all the JSON elements have the same value.
The issue is that the old values are somehow being overwritten. How should I fix that?
import json
import pandas as pd

def main():
    df = pd.read_csv("Daily_EXRATE.csv")
    df = df.loc[df['Field1'] == '04']
    opdb = {
        "sell_rate": 1.2676,
        "type": "currency_exchange",
        "version": "1"
    }
    opdbarray = []

    for index, rowsr in df.iterrows():
        data = {}
        data = rowsr.to_json()
        data = json.loads(data)
        opdb["sell_rate"] = data["Field11"]
        opdbarray.append(opdb)
        print(json.dumps(opdb, indent=4))

    # now write output to a file
    jsonDataFile = open("ccData_1.json", "w")
    jsonDataFile.write(json.dumps(opdbarray, indent=4, sort_keys=True))
    jsonDataFile.close()
The outputs are all the same:
[
    {
        "sell_rate": "2.1058000000",
        "type": "currency_exchange",
        "version": "1"
    },
    {
        "sell_rate": "2.1058000000",
        "type": "currency_exchange",
        "version": "1"
    },
    {
        "sell_rate": "2.1058000000",
        "type": "currency_exchange",
        "version": "1"
    }
]
You're appending the same opdb dictionary to opdbarray each time through the loop, just replacing its sell_rate element. You need to create a new dictionary each time.
import json
import pandas as pd

def main():
    df = pd.read_csv("Daily_EXRATE.csv")
    df = df.loc[df['Field1'] == '04']
    opdbarray = []

    for index, rowsr in df.iterrows():
        data = {}
        data = rowsr.to_json()
        data = json.loads(data)
        # build a fresh dict on every iteration
        opdb = {
            "type": "currency_exchange",
            "version": "1",
            "sell_rate": data["Field11"]
        }
        opdbarray.append(opdb)
        print(json.dumps(opdb, indent=4))

    # now write output to a file
    jsonDataFile = open("ccData_1.json", "w")
    jsonDataFile.write(json.dumps(opdbarray, indent=4, sort_keys=True))
    jsonDataFile.close()
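If you prefer to keep a single template dict around, an equivalent pattern (a sketch, assuming the same Field11 column) unpacks it into a fresh dict per row:

template = {"type": "currency_exchange", "version": "1"}
opdbarray = [
    # {**template, ...} copies the template, then adds sell_rate; note the
    # question's code round-trips each row through to_json, which stringifies values
    {**template, "sell_rate": row["Field11"]}
    for _, row in df.iterrows()
]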

how to store JSON list data into python variables?

I have a JSON file that contains some data.
testJSON.json
{
    "hourlyData": [
        {
            "frequency": "49.96",
            "actual": " 2,240.43 ",
            "schedule": " 2,223.85 ",
            "acp": "325"
        },
        {
            "frequency": "50.04",
            "actual": " 1,862.88 ",
            "schedule": " 1,881.09 ",
            "acp": "275"
        },
        {
            "frequency": "50.04",
            "actual": " 1,882.17 ",
            "schedule": " 1,885.94 ",
            "acp": "275"
        }
    ],
    "storageData": [
        {
            "config": "ESS1",
            "name": "BRPL",
            "Power Rating": "20",
            "Energy Rating": "20",
            "socLow": "0",
            "socHigh": "0.8",
            "Charge Eff": "0.9273",
            "Discharge Eff": "0.9273",
            "Round Trip Eff": "0.922",
            "Lower Limit": "57",
            "Mid Limit": "76",
            "High Limit": "95",
            "Thrushold": "5",
            "Discharging Price": "6"
        }
    ]
}
I want to store these values in Python variables. First I created dictionaries containing variables of different types, then I created a function that opens the JSON file and tries to store the JSON values into the declared variables:
test.py
import json

# declaring variables
storageData = {
    "name": 'No name specified',
    "powerRating": -1,
    "energyRating": -1,
    "annualMaxCycles": 365,
    "socLow": -1,
    "socHigh": -1,
    "chargeEff": -1,
    "dChargeEff": -1,
    "lowerLimit": -1,
    "midLimit": -1,
    "highLimit": -1,
    "thrushold": 5,
    "dischargingPrice": 6
}

marketData = {
    "marketProducts": {
        "dsm": {
            "frequency": [],
            "schedule": [],
            "actual": [],
            "acp": [],
        }
    }
}

inputMode = 'JSON'
JSONfileName = "testJSON.json"

def inputJSON():
    if inputMode == 'JSON':
        fileName = JSONfileName
        # Import data from JSON files
        with open(JSONfileName, 'r') as myfile:
            dataJSON = myfile.read().replace('\n', '')
        inputJSON = json.loads(dataJSON)
        # Assigning the storageData data
        storageData['powerRating'] = inputJSON['storageData']['Power Rating']
        storageData['energyRating'] = inputJSON['storageData']['energyRating']
        storageData['warranty'] = inputJSON['storageData']['powerRating']
        storageData['annualMaxCycles'] = inputJSON['storageData']['maxAnnualCycles']
        storageData['socLow'] = inputJSON['storageData']['socLow']
        storageData['socHigh'] = inputJSON['storageData']['socHigh']
        storageData['chargeEff'] = inputJSON['storageData']['chargeEff']
        storageData['dChargeEff'] = inputJSON['storageData']['dChargeEff']
        storageData['lowerLimit'] = inputJSON['storageData']['lowerLimit']
        storageData['midLimit'] = inputJSON['storageData']['midLimit']
        storageData['highLimit'] = inputJSON['storageData']['highLimit']
        storageData['thrushold'] = inputJSON['storageData']['thrushold']
        storageData['dischargingPrice'] = inputJSON['storageData']['dischargingPrice']
        marketData['marketProducts']['dsm']['frequency'] = inputJSON['hourlyData']['frequency']
        marketData['marketProducts']['dsm']['acp'] = inputJSON['hourlyData']['acp']
        marketData['marketProducts']['dsm']['actual'] = inputJSON['hourlyData']['actual']
        marketData['marketProducts']['dsm']['schedule'] = inputJSON['hourlyData']['schedule']

inputJSON()
The error it gives me:
Traceback (most recent call last):
  File "C:/Users/nvats/PycharmProjects/dsm-final/test2.py", line 113, in <module>
    inputJSON()
  File "C:/Users/nvats/PycharmProjects/dsm-final/test2.py", line 80, in inputJSON
    storageData['powerRating'] = inputJSON['storageData']['Power Rating']
TypeError: list indices must be integers or slices, not str
Instead of

# Assigning the storageData data
storageData['powerRating'] = inputJSON['storageData']['Power Rating']
storageData['energyRating'] = inputJSON['storageData']['energyRating']
storageData['warranty'] = inputJSON['storageData']['powerRating']
storageData['annualMaxCycles'] = inputJSON['storageData']['maxAnnualCycles']
storageData['socLow'] = inputJSON['storageData']['socLow']
storageData['socHigh'] = inputJSON['storageData']['socHigh']
storageData['chargeEff'] = inputJSON['storageData']['chargeEff']
storageData['dChargeEff'] = inputJSON['storageData']['dChargeEff']
storageData['lowerLimit'] = inputJSON['storageData']['lowerLimit']
storageData['midLimit'] = inputJSON['storageData']['midLimit']
storageData['highLimit'] = inputJSON['storageData']['highLimit']
storageData['thrushold'] = inputJSON['storageData']['thrushold']
storageData['dischargingPrice'] = inputJSON['storageData']['dischargingPrice']

do

# Assigning the storageData data
storageData['powerRating'] = inputJSON['storageData'][0]['Power Rating']
storageData['energyRating'] = inputJSON['storageData'][0]['energyRating']
storageData['warranty'] = inputJSON['storageData'][0]['powerRating']
storageData['annualMaxCycles'] = inputJSON['storageData'][0]['maxAnnualCycles']
storageData['socLow'] = inputJSON['storageData'][0]['socLow']
storageData['socHigh'] = inputJSON['storageData'][0]['socHigh']
storageData['chargeEff'] = inputJSON['storageData'][0]['chargeEff']
storageData['dChargeEff'] = inputJSON['storageData'][0]['dChargeEff']
storageData['lowerLimit'] = inputJSON['storageData'][0]['lowerLimit']
storageData['midLimit'] = inputJSON['storageData'][0]['midLimit']
storageData['highLimit'] = inputJSON['storageData'][0]['highLimit']
storageData['thrushold'] = inputJSON['storageData'][0]['thrushold']
storageData['dischargingPrice'] = inputJSON['storageData'][0]['dischargingPrice']

because inputJSON['storageData'] is a list whose first element holds the JSON object.
For example, for "hourlyData" (append, since each dsm entry is a list):

for data in inputJSON['hourlyData']:
    marketData['marketProducts']['dsm']['frequency'].append(data['frequency'])

Hope that helps.
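A compact way to fill all four dsm lists in one pass (a sketch, assuming the JSON shape shown in the question):

dsm = marketData['marketProducts']['dsm']
for field in ('frequency', 'schedule', 'actual', 'acp'):
    # one list per field, in file order
    dsm[field] = [row[field] for row in inputJSON['hourlyData']]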

Iterate through a list of dictionaries and save duplicate data

I would like to iterate through a list of dictionaries and save the values of certain keys (in my case "consumerKey" and "consumerSecret") into another dictionary as many times as they are present.
Problem: I am able to iterate through the list, but my code is not saving the second consumer key and consumer secret; instead it saves the first consumer key and consumer secret twice.
Input:
{
    "accessType": "",
    "apiProducts": [],
    "appFamily": "default",
    "appId": "ac56c8b2-6ac1-4971-a1d3-4bf97893c067",
    "attributes": [
        {
            "name": "DisplayName",
            "value": "quotaapp"
        },
        {
            "name": "Notes",
            "value": ""
        }
    ],
    "callbackUrl": "",
    "createdAt": 1549274952045,
    "createdBy": "suraj.pai.airody@sap.com",
    "credentials": [
        {
            "apiProducts": [
                {
                    "apiproduct": "apiprod",
                    "status": "approved"
                }
            ],
            "attributes": [],
            "consumerKey": "xyz",
            "consumerSecret": "abc",
            "expiresAt": -1,
            "issuedAt": 1549274952051,
            "scopes": [],
            "status": "approved"
        },
        {
            "apiProducts": [
                {
                    "apiproduct": "ouathTest-Product",
                    "status": "approved"
                }
            ],
            "attributes": [],
            "consumerKey": "pqr",
            "consumerSecret": "wmn",
            "expiresAt": -1,
            "issuedAt": 1554802431452,
            "scopes": [],
            "status": "approved"
        }
    ],
    "developerId": "xyz",
    "lastModifiedAt": 1554802431662,
    "lastModifiedBy": "suraj.pai.airody@sap.com",
    "name": "quotaapp",
    "scopes": [],
    "status": "approved"
}
Code:
import requests
import json
from requests.auth import HTTPBasicAuth
import csv

def get_v2details():
    a = 'orgID1'
    b = 'appID1'
    c = 'ConKey1'
    d = 'ConSecret1'
    e = 'appName1'
    org_lst = []
    some_dict = {}
    con_blst = []  # variable to append the dictionary app level

    n = int(input("Enter number of orgs from Landscape 1: "))
    for i in range(0, n):
        ele = str(input())
        org_lst.append(ele)
    cmp_orglst = list(org_lst)
    print(cmp_orglst)

    for j in cmp_orglst:
        url = "https://canarydevmgmtsrv.dmzmo.sap.corp/v1/o/" + str(j) + "/apps/"
        headers = {'Content-Type': 'application/json'}
        response = requests.get(url, auth=HTTPBasicAuth('xyz', 'xyz'), headers=headers, verify=False)
        app_data = json.loads(response.text)
        print(app_data)

        for k in app_data:
            url1 = "https://canarydevmgmtsrv.dmzmo.sap.corp/v1/o/" + str(j) + "/apps/" + str(k)
            headers = {'Content-Type': 'application/json'}
            response1 = requests.get(url1, auth=HTTPBasicAuth('xyz', 'xyz'), headers=headers, verify=False)
            consumer_data = json.loads(response1.text)
            print(" Consumer Data is ", consumer_data)

            for l in range(len(consumer_data['credentials'])):
                some_dict[a] = str(j)
                some_dict[b] = consumer_data['appId']
                some_dict[e] = consumer_data['name']
                some_dict[c] = consumer_data['credentials'][0]['consumerKey']
                some_dict[d] = consumer_data['credentials'][0]['consumerSecret']
                print(some_dict)  # Print dictionary of each app ID
                con_blst.append(some_dict.copy())

    print(con_blst)

    csv_columns = ['orgID1', 'appName1', 'appID1', 'ConKey1', 'ConSecret1']
    csv_file = "Names1.csv"
    try:
        with open(csv_file, 'w', newline='') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=csv_columns)
            writer.writeheader()
            for data in con_blst:
                writer.writerow(data)
    except IOError:
        print("I/O error")
Expected result:
orgID1 appName1 appID1 ConKey1 ConSecret1
VALIDATE quotaapp 4bf97893c067 xyz abc
VALIDATE quotaapp 4bf97893c067 pqr wmn
Actual result:
orgID1 appName1 appID1 ConKey1 ConSecret1
VALIDATE quotaapp 4bf97893c067 xyz abc
VALIDATE quotaapp 4bf97893c067 xyz abc
It seems you just made a small error.
for l in range(len(consumer_data['credentials'])):
    some_dict[a] = str(j)
    some_dict[b] = consumer_data['appId']
    some_dict[e] = consumer_data['name']
    some_dict[c] = consumer_data['credentials'][0]['consumerKey']     # this line
    some_dict[d] = consumer_data['credentials'][0]['consumerSecret']  # and this line
    print(some_dict)  # Print dictionary of each app ID
    con_blst.append(some_dict.copy())

should be

for l in range(len(consumer_data['credentials'])):
    some_dict[a] = str(j)
    some_dict[b] = consumer_data['appId']
    some_dict[e] = consumer_data['name']
    some_dict[c] = consumer_data['credentials'][l]['consumerKey']     # Here
    some_dict[d] = consumer_data['credentials'][l]['consumerSecret']  # Here
    print(some_dict)  # Print dictionary of each app ID
    con_blst.append(some_dict.copy())
You weren't looping through consumer_data['credentials']; you were just storing consumer_data['credentials'][0] on every pass.
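As a follow-up style note, iterating over the credentials directly avoids the index bookkeeping altogether (a sketch using the same names as the code above):

for cred in consumer_data['credentials']:
    some_dict[a] = str(j)
    some_dict[b] = consumer_data['appId']
    some_dict[e] = consumer_data['name']
    some_dict[c] = cred['consumerKey']
    some_dict[d] = cred['consumerSecret']
    con_blst.append(some_dict.copy())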
