Are there any way to read JSON files in pieces? - python

I am trying to read some JSON file from web and create a SQL database with the data. I am using ijson to read data as stream. But when the code fails I need to start over to retrieve data. Are there any way to continue reading JSON file from where the program is failed?
I can read the whole document with json.loads but I am assuming the data is too big to read at a time.
You can see my code below.
import sqlite3
import ssl
import urllib.request
import json
import ijson
import time
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE
conn = sqlite3.connect('rawdata.sqlite')
cur = conn.cursor()
cur.execute('''DROP TABLE IF EXISTS DailyData ''')
cur.execute('''DROP TABLE IF EXISTS Countries ''')
cur.execute('''DROP TABLE IF EXISTS Continents ''')
cur.execute('''CREATE TABLE IF NOT EXISTS DailyData
(id INTEGER, Day TEXT, Month TEXT, Year TEXT, country_id INTEGER, continent_id INTEGER, Cases TEXT, Deaths TEXT)''')
cur.execute('''CREATE TABLE IF NOT EXISTS Countries
(id INTEGER, CountryCode TEXT UNIQUE, Country TEXT UNIQUE, Population TEXT, continent_id INTEGER)''')
cur.execute('''CREATE TABLE IF NOT EXISTS Continents
(id INTEGER, Continent TEXT UNIQUE)''')
url = "https://opendata.ecdc.europa.eu/covid19/casedistribution/json/"
f = urllib.request.urlopen(url, context=ctx)
reports = ijson.items(f, 'records.item')
sum = 0
count = 0
# error = 0
for item in reports :
iDataRep = item.get('dateRep')
iCases = item.get('cases')
iDeaths = item.get('deaths')
iCountryCode = item.get('countryterritoryCode')
iCountry = item.get('countriesAndTerritories')
iPopulation = item.get('popData2018')
iContinent = item.get('continentExp')
if len(iDataRep) < 0: iDataRep = 0
if len(iCases) < 0: iCases = 0
if len(iDeaths) < 0: iDeaths = 0
if len(iCountryCode) < 0: iCountryCode = 0
if len(iCountry) < 0: iCountry = 0
if len(iPopulation) < 0: iPopulation = 0
if len(iContinent) < 0: iContinent = 0
Spl = iDataRep.split('/')
iDay = Spl[0]
iMonth = Spl[1]
iYear = Spl[2]
id = count + 1
cur.execute('''INSERT OR IGNORE INTO Continents (id, Continent)
VALUES ( ?, ? )''', (id, iContinent))
cur.execute('''SELECT id FROM Continents WHERE Continent = ? ''', (iContinent, ))
continent_id = cur.fetchone()[0]
cur.execute('''INSERT OR IGNORE INTO Countries (id, CountryCode, Country, Population, continent_id)
VALUES ( ?, ?, ?, ?, ? )''', (id, iCountryCode, iCountry, iPopulation, continent_id) )
cur.execute('''SELECT id FROM Countries WHERE Country = ? ''', (iCountry, ))
country_id = cur.fetchone()[0]
cur.execute('''INSERT OR IGNORE INTO DailyData (id, Day, Month, Year, country_id, continent_id, Cases, Deaths)
VALUES ( ?, ?, ?, ?, ?, ?, ? ,?)''', (id, iDay, iMonth, iYear, country_id, continent_id, iCases, iDeaths) )
conn.commit()
# except:
# error = error + 1
# print(error)
# continue
count = count + 1
print(count, 'data retrieved...')
if count % 95 == 0:
time.sleep(1)
print('Program slept a second.')
numCountry = cur.execute('SELECT max(id) FROM Countries' )
numContinent = cur.execute('SELECT max(id) FROM Continents' )
print('From', numCountry, 'different countries and', numContinent, 'continents', count, 'data retrieved.')
cur.close()

Related

What does this traceback mean: sqlite3.InterfaceError: Error binding parameter 0 - probably unsupported type.?

I am trying to pull data the CDC and put the data into a database. My most recent traceback is: sqlite3.InterfaceError: Error binding parameter 0 - probably unsupported type. And do not know what that means or how to fix my code. Any help will be greatly appreciated.
Here is my code:
import json
from urllib.request import urlopen
import ssl
import sqlite3
conn=sqlite3.connect('cdcdata.sqlite')
cur=conn.cursor()
cur.execute('''CREATE TABLE IF NOT EXISTS CState (id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE,
state_id INTEGER, State TEXT, Count INTEGER)''')
cur.execute('''CREATE TABLE IF NOT EXISTS CDate(id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE,
date_id INTEGER, state_id INTEGER, Date INTEGER, Count INTEGER)''')
cur.execute('''CREATE TABLE State_COIVD_Numbers (State TEXT, state_id INTEGER, date_id INTEGER, count INTEGER)''')
#
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE
data = open('rows.json', encoding='utf8').read()
info = json.loads(data)
for data in info['meta']['view']['columns']:
data1 = data['position']
if data1 != 1:
continue
date_data = data['cachedContents']['top']
cur.execute("SELECT Date FROM CDate WHERE Count= ?", (date_data,))
cur.fectone()
#print(date_data)
for data in info['meta']['view']['columns']:
data1 = data['position']
if data1 != 2:
continue
state_data = data['cachedContents']['top']
cur.execute("SELECT State FROM CState WHERE Count= ?", (state_data,))
cur.fetchone()
# print(state_data)
cur.execute('''INSERT INTO CState(State, Count) VALUES(?,?)''',(state_data,))
cur.execute('''INSERT INTO CDate(Date, Count) VALUES(?,?)''', (date_data,))
cur.execute('''select CState.State, CDate.Date FROM CState join CDate on State_COVID_Numbers.state_id=state.id''' )
cur.execute('''select CDate.Date, CState.State FROM CDate join CState on State_COVID_Numbers.date_id=date.id''' )
conn.commit()
conn.close()
Print('You Did It!')
I am new to coding and I have tried many things, but I do not know what to try to solve this problem.

ON CONFLICT DO UPDATE syntax and EXCLUDED error on cursor.executemany

I have a simplified postgres (ver 13) table below with updated rows generated in python with psycopg2.
My question is when I update the price field in the rows, I can't complete the update because of the following errors of ON CONFLICT DO UPDATE. If I don't use ON CONFLICT DO UPDATE , I can update the chart but I would like ON CONFLICT DO UPDATE because it eliminates duplicate rows.
With ON CONFLICT DO UPDATE , I only need to update the fields "price" and "last_updated" but update only when the rows match the "id,item,original_price_date"
The following errors I get ON CONFLICT DO UPDATE :
Error : syntax error at or near "="
# update the prices within the existing data
df = pd.DataFrame(np.array([['5/3/2010', 'rock', 15],
['4/15/2010', 'paper', 11],
['2/3/2015', 'scissor', 13]]),
columns = ['original_price_date', 'item', 'price'])
tuples_for_dB = [tuple(x) for x in df.to_numpy()]
sql_script = '''INSERT INTO ''' + TABLE_ + ''' (
original_price_date, item, price, created_date, last_updated)
VALUES (%s, %s, %s, transaction_timestamp(), transaction_timestamp())
ON CONFLICT (id, item, original_price_date)
DO UPDATE SET (price, last_updated = EXCLUDED.price, EXCLUDED.transaction_timestamp());'''
Error : relation "price_data" does not exist
sql_script = '''INSERT INTO ''' + TABLE_ + ''' (
original_price_date, item, price, created_date, last_updated)
VALUES (%s, %s, %s, transaction_timestamp(), transaction_timestamp())
ON CONFLICT (id, item, original_price_date)
DO UPDATE SET (price, last_updated) = (EXCLUDED.price, EXCLUDED.transaction_timestamp());'''
My original creation of the data :
# postGRESQL connection details
DATABASE_INITIAL_ = 'postgres'
DATABASE_ = 'data'
TABLE_ = 'price_data'
USER_ = 'postgres'
SERVERNAME_ = 'localhost'
PASSWORD_ = password_
HOST_ = '127.0.0.1'
PORT_ = '5432'
#establishing the connection
conn = psycopg2.connect(database = DATABASE_,
user = USER_,
password = PASSWORD_,
host = HOST_,
port = PORT_);
conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT);
conn.autocommit = True
# Creating a cursor object using the cursor() method
cursor = conn.cursor()
sql = "SELECT 1 FROM pg_catalog.pg_database WHERE datname = " + "'" + DATABASE_ + "'"
cursor.execute(sql)
# If dB does not exist create the dB
exists = cursor.fetchone()
print(exists)
if not exists:
print('does not exist')
#Preparing query to create a database
sql = '''CREATE database '''+DATABASE_;
#Creating a database
cursor.execute(sql)
# Creating the table
sql = '''CREATE TABLE IF NOT EXISTS ''' + TABLE_ + ''' (
id SERIAL PRIMARY KEY,
original_price_date DATE NOT NULL,
item TEXT NOT NULL,
price NUMERIC NULL DEFAULT NULL,
created_date TIMESTAMPTZ NULL DEFAULT TRANSACTION_TIMESTAMP(),
last_updated TIMESTAMPTZ NULL DEFAULT TRANSACTION_TIMESTAMP());'''
cursor.execute(sql)
# update the table with data
df = pd.DataFrame(np.array([['5/3/2010', 'rock', 0.9],
['4/15/2010', 'paper', 6.5],
['2/3/2015', 'scissor', 3.9],
['3/23/2017', 'ball', 1.1],
['4/7/2013', 'tire', 5.4]]),
columns = ['original_price_date', 'item', 'price'])
tuples_for_dB = [tuple(x) for x in df.to_numpy()]
sql_script = '''INSERT INTO ''' + TABLE_ + ''' (
original_price_date, item, price, created_date, last_updated)
VALUES (%s, %s, %s, transaction_timestamp(), transaction_timestamp());'''
try:
cursor.executemany(sql_script, tuples_for_dB);
success = True
except psycopg2.Error as e:
error = e.pgcode
print(f'Error : {e.args[0]}')
success = False
if success:
print(f'\nData inserted successfully........')
print(f'Table INSERT sql commit comment :\n"{sql_script}"\n')
elif success == False:
print(f'\nData NOT inserted successfully XXXXXX')
# Preparing query to drop a table
sql = '''DROP TABLE IF EXISTS ''' + TABLE_ + ";"
# Creating the table
cursor.execute(sql)
conn.close()
I added a constraint row (CONSTRAINT com UNIQUE (original_price_date,item))) where I created the table.
sql = '''CREATE TABLE IF NOT EXISTS ''' + TABLE_ + ''' (
id SERIAL PRIMARY KEY,
original_price_date DATE NOT NULL,
item TEXT NOT NULL,
price NUMERIC NULL DEFAULT NULL,
created_date TIMESTAMPTZ NULL DEFAULT TRANSACTION_TIMESTAMP(),
last_updated TIMESTAMPTZ NULL DEFAULT TRANSACTION_TIMESTAMP(),
CONSTRAINT com UNIQUE (original_price_date,item));'''
Then I could insert the data NOT creating duplicate rows of (original_price_date,item) by the following statement.
sql = '''INSERT INTO ''' + TABLE_ + '''(original_price_date, item, price)
VALUES (%s, %s, %s)
ON CONFLICT (original_price_date, item)
DO UPDATE
SET (price, last_updated) = (EXCLUDED.price,TRANSACTION_TIMESTAMP());'''

Python for loop in Jupyter notebook doesn't stop at the end of the list

SOLVED: It was the magic function %%timeit which I put at the beginning of the cell. I even forgot to mention it in the question below. I removed it and everything started to work fine.
I have a list of CSV files, which I intend to loop and send to a function. The list of the files are named 1404.csv, 1405.csv, ....., 1905.csv, 1906.csv
When I use for loop without the function, everything works fine.
for i in os.listdir("dispensing_csv"):
db_file = "05.db"
csv_file = "dispensing_csv/" + i
month = "01/" + i[2:4] + "/20" + i[0:2]
# dispensing_csv_to_sql(db_file, csv_file, month)
print("Period " + month + " written.")
The outputs become:
Period 01/04/2014 written.
Period 01/05/2014 written.
....
....
Period 01/05/2019 written.
Period 01/06/2019 written.
But when I try to use the function inside the loop, the loop never ends.
for i in os.listdir("dispensing_csv"):
db_file = "05.db"
csv_file = "dispensing_csv/" + i
month = "01/" + i[2:4] + "/20" + i[0:2]
dispensing_csv_to_sql(db_file, csv_file, month)
print("Period " + month + " written.")
The outputs become:
Period 01/04/2014 written.
Period 01/05/2014 written.
....
....
Period 01/05/2019 written.
Period 01/06/2019 written.
Period 01/04/2014 written.
Period 01/05/2014 written.
....
....
....
I am using Python 3.7.3 in Jupyter Lab. As the name implies the function tries to write some CSV to a sqlite file.
Thanks in advance for your help.
ADDED dispensing_csv_to_sql()
def dispensing_csv_to_sql(db_file, csv_file, month):
# Same as in ContractorTypes in SQL db
ContractorTypes = {"Abated": 1, "Appliance": 2, "Pharmacy": 3}
# Connecting to the db
conn = sqlite3.connect(db_file)
c = conn.cursor()
# Reading the CSV file in dataframe
df = pd.read_csv(csv_file)
df.columns = df.columns.str.strip() # to remove possible whitespaces
# Checking if period exists in the db, update if not. Return period id
query_period = c.execute("SELECT 1 FROM Periods WHERE Period = ?", (month, ))
query_period = list(query_period)
try:
query_period = query_period[0][0]
except:
c.execute("INSERT INTO Periods (Period) VALUES (?)", (month, ))
# Getting the period_id to use in DispensingNumbers table
period_id = c.execute("SELECT PeriodID FROM Periods WHERE Period = ?", (month, ))
period_id = list(period_id)
period_id = period_id[0][0] # period_id
# Now iterating rows in df
for index, rows in df.iterrows():
# Checking if contractor exists in the db, update if not. Return contractor id
query_contractor = c.execute("SELECT 1 FROM Contractors WHERE ContractorCode = ?", (rows["ContractorCode"], ))
query_contractor = list(query_contractor)
try:
query_contractor = query_contractor[0][0]
except:
# Checking if area exists in the db, update if not. Return area id
query_area = c.execute("SELECT 1 FROM Areas WHERE AreaCode = ?", (rows["AreaCode"], ))
query_area = list(query_area)
try:
query_area = query_area[0][0]
except:
# Inserting non-existing area data
insert_area = "INSERT INTO Areas (AreaCode, AreaName) VALUES (?, ?)"
c.execute(insert_area, (rows["AreaCode"], rows["Area"]))
# Getting the area_id to use in Contractors table
area_id = c.execute("SELECT AreaID FROM Areas WHERE AreaCode = ?", (rows["AreaCode"], ))
area_id = list(area_id)
area_id = area_id[0][0] # area_id
# Inserting non-existing Contractor data
ContractorCode = rows["ContractorCode"]
ContractorName = rows["ContractorName"]
ContractorAddress = " ".join(rows[5:9].dropna().tolist())
ContractorPostcode = rows["Postcode"]
AreaID = area_id
ContractorTypeID = ContractorTypes[rows["Contractor Type"].strip()]
insert_contractor = "INSERT INTO Contractors (ContractorCode, ContractorName, ContractorAddress, \
ContractorPostcode, AreaID, ContractorTypeID) VALUES (?, ?, ?, ?, ?, ?)"
contractor_data_to_be_inserted = (ContractorCode, ContractorName, ContractorAddress, ContractorPostcode, AreaID, ContractorTypeID)
c.execute(insert_contractor, contractor_data_to_be_inserted)
# Getting the contractor_id to use in DispensingNumbers table
contractor_id = c.execute("SELECT ContractorID FROM Contractors WHERE ContractorCode = ?", (rows["ContractorCode"], ))
contractor_id = list(contractor_id)
contractor_id = contractor_id[0][0] # contractor_id
# Inserting dispensing numbers data
ContractorID = contractor_id
PeriodID = period_id
NumberofForms = rows["NumberofForms"] or 0
NumberofItems = rows["NumberofItems"] or 0
TotalNofPresPF = rows["TotalnumberofPrescriptions(ProfessionalFees)"] or 0
NofPresPFStdDis = rows["NumberofPrescriptions(ProfessionalFees)(Standarddiscountrate)"] or 0
NofPresPFZeroDis = rows["NumberofPrescriptions(ProfessionalFees)(Zerodiscountrate)"] or 0
NofFormEPS = rows["NumberofformsforElectronicPrescriptionService(EPS)"] or 0
NofItemEPS = rows["NumberofItemsprocessedviaElectronicPrescriptionService(EPS)"] or 0
MUR = rows["NumberofMedicineUseReviews(MURs)declared"] or 0
NMS = rows["NumberofNewMedicineService(NMS)interventionsdeclared"] or 0
AURHome = rows["NumberofApplianceUseReviews(AURs)conductedinusershomes"] or 0
AURPremise = rows["NumberofApplianceUseReviews(AURs)conductedatpremises"] or 0
Stoma = rows["NumberofStomaCustomisationFees"] or 0
insert_dispensingno = "INSERT INTO DispensingNumbers (ContractorID, PeriodID, NumberofForms, \
NumberofItems, TotalNofPresPF, NofPresPFStdDis, NofPresPFZeroDis, NofFormEPS, NofItemEPS, MUR, \
NMS, AURHome, AURPremise, Stoma) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"
dispensingno_data_to_be_inserted = (ContractorID, PeriodID, NumberofForms, NumberofItems, TotalNofPresPF, NofPresPFStdDis,
NofPresPFZeroDis, NofFormEPS, NofItemEPS, MUR, NMS, AURHome, AURPremise, Stoma)
c.execute(insert_dispensingno, dispensingno_data_to_be_inserted)
conn.commit()
conn.close()
I solved the problem. It was the magic function %%timeit which I put at the beginning of the cell. I even forgot to mention it in the question below. I removed it and everything started to work fine.
I added this answer to be able to mark it solved.

python script is not saving into database

I am currently learning how to modify data with python using visual studios and sqlite. My assignment is to count how many times emails are found in a file, organize them in a way that each email is then counted. Then I must input these into SQLite as a table named Counts with two rows (org,count). I have wrote a code that runs the program and outputs it onto the visual studios output screen but not the database.
this is my program:
import sqlite3
conn = sqlite3.connect('database3.db')
cur = conn.cursor()
cur.execute('DROP TABLE IF EXISTS Counts')
cur.execute('''CREATE TABLE Counts (email TEXT, count INTEGER)''')
#cur.execute("INSERT INTO Counts Values('mlucygray#gmail.com',1)")
# Save (commit) the changes
conn.commit()
fname = input('Enter file name: ')
if (len(fname) < 1): fname = 'mbox-short.txt'
fh = open(fname)
for line in fh:
if not line.startswith('From: '): continue
pieces = line.split()
email = pieces[1]
cur.execute('SELECT count FROM Counts WHERE email = ? ', (email,))
row = cur.fetchone()
if row is None:
cur.execute('''INSERT INTO Counts (email, count) VALUES (?, 1)''', (email,))
else:
cur.execute('UPDATE Counts SET count = count + 1 WHERE email = ?',(email,))
cur.execute('SELECT * FROM Counts')
# https://www.sqlite.org/lang_select.html
sqlstr = 'SELECT email, count FROM Counts ORDER BY count DESC LIMIT 10'
conn.commit()
for row in cur.execute(sqlstr):
print(str(row[0]), row[1])
conn.commit()
cur.close()
click here for the link to the output of the above code
Thank you for any suggestions
You need to commit changes with insert/update and DONT need to commit after executing select statements.
for line in fh:
if not line.lower().startswith('from: '): continue
pieces = line.split()
email = pieces[1]
cur.execute('SELECT count FROM Counts WHERE email = ?', (email,))
row = cur.fetchone()
if row is None:
cur.execute('''INSERT INTO Counts (email, count) VALUES (?, 1)''', (email,))
else:
cur.execute('UPDATE Counts SET count = count + 1 WHERE email = ?',(email,))
conn.commit()
sqlstr = 'SELECT email, count FROM Counts ORDER BY count DESC LIMIT 10'
for row in cur.execute(sqlstr):
print(str(row[0]), row[1])
cur.close()

python not correctly save data if I use pypyodbc

I wrote a simple script for parsing csv and insert data into SQL Server.
So, the very strange issue is that some variables are lost if I call them in a if condition.
This is the script:
# DB connection
conn = pypyodbc.connect('DRIVER={SQL Server};SERVER=xxx.xxx.xxx.xxx;DATABASE=SCAN;UID=user;PWD=password')
cursor = conn.cursor()
def main() :
reader = csv.reader(file(filename, "rb"), delimiter=';')
for row in reader :
ip = row[0]
host = row[1]
domain = row[2]
# get Operating System ID
os_id = getOperatingSystem(row[3])
manufacturer = row[4]
model = row[5]
# get computer_manufacturer ID
computer_manufacturer = getManufacturer(manufacturer, computer_model)
arch = getArch(row[6])
values = [ip, host, domain, os_id, manufacturer, arch]
hostIP = getHostIP(ip)
print "hostIP: " +str(hostIP)
if hostIP == 0:
print values
# insert values in DB
cursor.execute(
"""
INSERT INTO dbo.hosts (ip, host, domain, os_id, manufacturer, arch_id)
VALUES (?, ?, ?, ?, ?, ?)
""", values)
cursor.commit()
# return host IP ID
def getHostIP(hostIP) :
cursor.execute("SELECT id FROM mytable WHERE ip = ?", [hostIP])
row = cursor.fetchone()
if row is not None :
return row[0]
return 0
# return ID of Computer Manufacturer
def getComputerManufacturer(manufacturer, computer_model) :
cursor.execute("SELECT id FROM manufacturer WHERE manufacturer = ? AND computer_model = ?", [manufacturer, computer_model])
row = cursor.fetchone()
if row is not None:
return row[0]
else :
return setComputerManufacturer(manufacturer, computer_model)
If I commented cursor_execute and cursor_commit lines the print values correctly shows data, else it shows only the same csv line.
Can you give me a little help?
Thanks

Categories