I have a 10 GB CSV file of userIDs and genders, which are sometimes duplicated.
userID,gender
372,f
37261,m
23,m
4725,f
...
Here's my code for importing the CSV and writing it to an SQLite database:
import sqlite3
import csv

path = 'genders.csv'
user_table = 'Users'

conn = sqlite3.connect('db.sqlite')
cur = conn.cursor()

cur.execute(f'''DROP TABLE IF EXISTS {user_table}''')
cur.execute(f'''CREATE TABLE {user_table} (
                    userID INTEGER NOT NULL,
                    gender INTEGER,
                    PRIMARY KEY (userID))''')

with open(path) as csvfile:
    datareader = csv.reader(csvfile)
    # skip header
    next(datareader, None)
    for counter, line in enumerate(datareader):
        # change gender string to integer
        line[1] = 1 if line[1] == 'f' else 0
        cur.execute(f'''INSERT OR IGNORE INTO {user_table} (userID, gender)
                        VALUES ({int(line[0])}, {int(line[1])})''')

conn.commit()
conn.close()
Right now it takes about 10 seconds to process a 1 MB file (in reality I have more columns and also create more tables).
I don't think pd.to_sql can be used, because I want to have a primary key.
Instead of using cursor.execute for every line, use cursor.executemany and insert all the data at once.
Store your values in the format _list = [(a, b, c, ...), (a2, b2, c2, ...), (a3, b3, c3, ...), ...] and then call:
cursor.executemany('''INSERT OR IGNORE INTO {user_table} (userID, gender, ...)
                      VALUES (?, ?, ...)''', _list)
conn.commit()
Info:
https://docs.python.org/2/library/sqlite3.html#module-sqlite3
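For example, applied to the Users table created by the question's code, a batched version might look like the sketch below. The 100,000-row batch size and the chunking with itertools.islice are my additions for illustration, not part of the original answer:

import csv
import sqlite3
from itertools import islice

conn = sqlite3.connect('db.sqlite')
cur = conn.cursor()

with open('genders.csv', newline='') as csvfile:
    reader = csv.reader(csvfile)
    next(reader, None)  # skip header
    while True:
        # read the next chunk so the whole 10 GB file never has to sit in memory
        batch = [(int(row[0]), 1 if row[1] == 'f' else 0)
                 for row in islice(reader, 100000)]
        if not batch:
            break
        cur.executemany(
            'INSERT OR IGNORE INTO Users (userID, gender) VALUES (?, ?)', batch)
        conn.commit()  # commit once per batch rather than once per row

conn.close()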
Related
I'm trying to create tables using Python, but when I inspect the data structure in SQLite, the primary keys aren't being assigned. Here's the code for one of the tables; it seems to work as intended except for the primary-key part. I'm new to Python and SQLite, so I'm probably missing something very obvious, but I can't find any answers.
import sqlite3 as sql
import pandas as pd

# Create a database and connect
conn = sql.connect('Coursework.db')
c = conn.cursor()

# Create the tables from the normalised schema
c.execute('CREATE TABLE IF NOT EXISTS room_host (room_ID integer PRIMARY KEY, host_ID integer)')
c.execute("SELECT count(name) from sqlite_master WHERE type='table' AND name='room_host'")
if c.fetchone()[0] == 1:
    c.execute("DROP TABLE room_host")
else:
    c.execute('CREATE TABLE room_host (room_ID integer PRIMARY KEY, host_ID integer)')
conn.commit()

# read data from csv
read_listings = pd.read_csv('listings.csv')
room_host = pd.DataFrame(read_listings, columns=['id', 'host_id'])
room_host.set_index('id')
room_host.to_sql("room_host", conn, if_exists='append', index=False)

c.execute("""INSERT INTO room_host (id, host_ID)
             SELECT room_host.id, room_host.host_ID
             FROM room_host
          """)
I can't reproduce the issue with the primary key; the table is created as expected when I run that SQL statement.
Other than that, the detour through Pandas is not really necessary; the csv module plus .executemany() seems to me like a much more straightforward way of loading data from a CSV into a table.
import csv
import sqlite3 as sql

conn = sql.connect('Coursework.db')
conn.executescript('CREATE TABLE IF NOT EXISTS room_host (room_ID integer PRIMARY KEY, host_ID integer)')
conn.commit()

with open('listings.csv', encoding='utf8', newline='') as f:
    reader = csv.reader(f, delimiter=',')
    conn.executemany('INSERT INTO room_host (room_ID, host_ID) VALUES (?, ?)', reader)
conn.commit()
I have a Python class readCSVintoDB that reads from a CSV file and stores the data in an SQLite 3 database.
Note:
the CSV file includes many fields, so I only need 3 of them.
So far I am able to read the CSV file and store it in a dataframe using pandas, but how do I store the dataframe in the database?
Error displayed:
File "C:\Users\test\Documents\Python_Projects\readCSV_DB.py", line 15,
in init self.importCSVintoDB() File
"C:\Users\test\Documents\Python_Projects\readCSV_DB.py", line 60, in
importCSVintoDB INSERT INTO rduWeather VALUES (?,?,?,?)''', i)
sqlite3.IntegrityError: datatype mismatch
When I tried to print i in the for loop, it displayed the header name date.
readCSV_DB :
import sqlite3
import pandas as pd
import os

class readCSVintoDB():
    def __init__(self):
        '''
        self.csvobj = csvOBJ
        self.dbobj = dbOBJ
        '''
        self.importCSVintoDB()

    def importCSVintoDB(self):
        userInput = input("enter the path of the csv file: ")
        csvfile = userInput
        df = pd.read_csv(csvfile, sep=';')
        #print("dataFrame Headers is {0}".format(df.columns))  # display the Headers
        dp = (df[['date', 'temperaturemin', 'temperaturemax']])
        print(dp)
        '''
        check if DB file exist
        if no create an empty db file
        '''
        if not(os.path.exists('./rduDB.db')):
            open('./rduDB.db', 'w').close()
        '''
        connect to the DB and get a connection cursor
        '''
        myConn = sqlite3.connect('./rduDB.db')
        dbCursor = myConn.cursor()
        '''
        Assuming i need to create a table of (Name,FamilyName,age,work)
        '''
        dbCreateTable = '''CREATE TABLE IF NOT EXISTS rduWeather
                           (id INTEGER PRIMARY KEY,
                            Date varchar(256),
                            TemperatureMin FLOAT,
                            TemperatureMax FLOAT)'''
        dbCursor.execute(dbCreateTable)
        myConn.commit()
        '''
        insert data into the database
        '''
        for i in dp:
            print(i)
            dbCursor.execute('''
            INSERT INTO rduWeather VALUES (?,?,?,?)''', i)
        #myInsert=dbCursor.execute('''insert into Info ('Name','FA','age','work')
        #VALUES('georges','hateh',23,'None')''')
        myConn.commit()
        mySelect = dbCursor.execute('''SELECT * from rduWeather WHERE (id = 10)''')
        print(list(mySelect))
        myConn.close()

test1 = readCSVintoDB()
If you want to write a single row (e.g. reg = (...)), try this function:
def write_datarow(conn, cols, reg):
    ''' Create a new entry (reg) in the rduWeather table
    input: conn (class SQLite connection)
    input: cols (list)
        Table column names
    input: reg (tuple)
        Data to be written as a row
    '''
    sql = 'INSERT INTO rduWeather({}) VALUES({})'.format(', '.join(cols), '?, ' * (len(cols) - 1) + '?')
    cur = conn.cursor()
    # Execute the SQL query
    cur.execute(sql, reg)
    # Confirm
    conn.commit()
    return
But if you have multiple rows, reg = [(...), ..., (...)], then use:
def write_datarow(conn, cols, reg):
    ''' Create new entries (reg) in the rduWeather table
    input: conn (class SQLite connection)
    input: cols (list)
        Table column names
    input: reg (list of tuples)
        List of rows to be written
    '''
    sql = 'INSERT INTO rduWeather({}) VALUES({})'.format(', '.join(cols), '?, ' * (len(cols) - 1) + '?')
    cur = conn.cursor()
    # Execute the SQL query
    cur.executemany(sql, reg)
    # Confirm
    conn.commit()
    return
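A usage sketch under the question's setup follows; the CSV file name, the itertuples conversion, and reusing the rduDB.db connection are assumptions for illustration, not part of the original answer:

import sqlite3
import pandas as pd

# usage sketch: select the three columns from the question and write them in one batch
df = pd.read_csv('rdu-weather-history.csv', sep=';')   # file name is illustrative
dp = df[['date', 'temperaturemin', 'temperaturemax']]

conn = sqlite3.connect('./rduDB.db')
cols = ['Date', 'TemperatureMin', 'TemperatureMax']
rows = list(dp.itertuples(index=False, name=None))     # list of (date, tmin, tmax) tuples
write_datarow(conn, cols, rows)
conn.close()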
After your edit I now see the problem: you commit the SQL query outside the for-loop.
Code this:
for i in dp:
    dbCursor.execute(''' INSERT INTO rduWeather VALUES (?,?,?,?)''', i)
    myConn.commit()
I have to query this table with a query of the form:
query(id, mtype, competition, gender, team1, team2, venue, date)
If every parameter in the query is given, we can use if statements to filter the results. But some of the parameters may not be provided, and in that case we have to consider all of the column values.
Also, I have this data in a CSV file. I want to read the CSV file into a list and then query it. The only catch is that if the user doesn't provide a parameter in the query, it should consider all the values in that column.
Can someone suggest a way to do this with only a few if-else statements, or suggest some other way?
You can use pandas with read_csv and query, i.e.:
import pandas as pd
# csv file should have the field names on the first row
# id,mtype,competition,gender,team1,team2,venue,date
df = pd.read_csv("the_file.csv", sep=",")
df['date'] = pd.to_datetime(df['date']) # convert date to a datetime object
mtype = "ODM"
sd = "2017-02-18"
ed = "2017-02-20"
df_query = df.query("mtype == '{}' and date > '{}' and date < '{}'".format(mtype, sd, ed))
print(df_query)
Option 2:
You can also convert the csv file into an sqlite db and issue the queries there, something like:
Import csv to sqlite:
import csv
import sqlite3
import os.path

csv_file = "csv_to_db.csv"
db_file = "csv_to_db.sqlite"

if not os.path.exists(db_file):  # if no db_file we create one
    con = sqlite3.Connection(db_file, detect_types=sqlite3.PARSE_DECLTYPES)
    cur = con.cursor()
    # csv fields: id,mtype,competition,gender,team1,team2,venue,date
    cur.execute('CREATE TABLE "venues" ("id" int primary key, "mtype" text,'
                ' "competition" text, "gender" text, "team1" text, '
                '"team2" text, "venue" text, "venue_date" date);')
    f = open(csv_file)
    csv_reader = csv.reader(f, delimiter=',')
    cur.executemany('INSERT INTO venues VALUES (?, ?, ?, ?, ?, ?, ?, ?)', csv_reader)
    cur.close()
    con.commit()
    con.close()
    f.close()
Now we can start querying the db. You've asked:
Can you provide an example of a query of the type query(mtype, start_date, end_date)
with all other parameters missing?
For that you can use:
conn = sqlite3.connect(db_file,detect_types=sqlite3.PARSE_DECLTYPES)
c = conn.cursor()
start_date = "2017-02-15"
end_date = "2017-02-20"
c.execute("SELECT * FROM {table} WHERE mtype='{query}' AND venue_date BETWEEN date('{start_date}') AND date('{end_date}')".format(table="venues", query="ODM", start_date=start_date, end_date=end_date))
all_rows = c.fetchall()
print(all_rows)
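As a side note, the same SELECT can be issued with bound parameters instead of building the SQL string with str.format(); a minimal sketch reusing the cursor and dates from above:

c.execute("SELECT * FROM venues "
          "WHERE mtype = ? AND venue_date BETWEEN date(?) AND date(?)",
          ("ODM", start_date, end_date))
print(c.fetchall())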
Grab the complete gist
You can use Pandas, which provides a way to filter rows.
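As a rough sketch of that idea (the query helper and the example filters below are illustrative, not from the answer), you can build a boolean mask only for the parameters the user actually passed:

import pandas as pd

def query(df, **filters):
    # keep every row by default; AND in a condition only for parameters that were passed
    mask = pd.Series(True, index=df.index)
    for col, value in filters.items():
        if value is not None:
            mask &= df[col] == value
    return df[mask]

# usage sketch: only mtype and gender are constrained, all other columns are left open
df = pd.read_csv("the_file.csv")
result = query(df, mtype="ODM", gender="male")
print(result)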
I have written the following snippet to import a CSV file into an MS SQL Server database, but it gives me an error. It is based on code written for SQLite in Python and changed for MSSQL.
import csv, pyodbc
import logging
def _get_col_datatypes(fin):
    dr = csv.DictReader(fin)  # comma is default delimiter
    fieldTypes = {}
    for entry in dr:
        feildslLeft = [f for f in dr.fieldnames if f not in fieldTypes.keys()]
        if not feildslLeft: break  # We're done
        for field in feildslLeft:
            data = entry[field]
            # Need data to decide
            if len(data) == 0:
                continue
            if data.isdigit():
                fieldTypes[field] = "INTEGER"
            else:
                fieldTypes[field] = "TEXT"
            # TODO: Currently there's no support for DATE in sqllite
    if len(feildslLeft) > 0:
        raise Exception("Failed to find all the columns data types - Maybe some are empty?")
    return fieldTypes

def escapingGenerator(f):
    for line in f:
        yield line.encode("ascii", "xmlcharrefreplace").decode("ascii")

def csvToDb(csvFile, outputToFile=False):
    # TODO: implement output to file
    with open(csvFile, mode='r') as fin:
        dt = _get_col_datatypes(fin)
        fin.seek(0)
        reader = csv.DictReader(fin)
        # Keep the order of the columns name just as in the CSV
        fields = reader.fieldnames
        cols = []
        # Set field and type
        for f in fields:
            cols.append("%s %s" % (f, dt[f]))
        # Generate create table statement:
        stmt = "CREATE TABLE ads (%s)" % ",".join(cols)
        con = pyodbc.connect('DRIVER={SQL Server};SERVER=localhost;DATABASE=sd;UID=Test;PWD=11')
        cur = con.cursor()
        cur.execute(stmt)
        fin.seek(0)
        reader = csv.reader(escapingGenerator(fin))
        # Generate insert statement:
        stmt = "INSERT INTO ads VALUES(%s);" % ','.join('?' * len(cols))
        cur.executemany(stmt, reader)
        con.commit()
    return con

csvToDb('Books.csv')
The error I am getting is
pyodbc.DataError: ('22018', "[22018] [Microsoft][ODBC SQL Server Driver][SQL Server]Conversion failed when converting the varchar value 'a' to data type int. (245) (SQLExecDirectW)")
Also, please suggest if you think there are any other methods to dynamically import CSV or text files into an MSSQL database.
The error message
Conversion failed when converting the varchar value 'a' to data type int.
reveals that your code can be "fooled" into thinking that a column is integer when it is really text, presumably because it only looks at the first row of data. Testing shows that both
ID,txt1,txt2,int1
1,foo,123,3
2,bar,abc,4
and
"ID","txt1","txt2","int1"
1,"foo","123",3
2,"bar","abc",4
result in your code producing the CREATE TABLE statement:
CREATE TABLE ads (ID INTEGER,txt1 TEXT,txt2 INTEGER,int1 INTEGER)
which is wrong because the [txt2] column is not really INTEGER.
You could investigate tweaking your code to look at more than the first data row. (Microsoft's own import routines often default to the first eight rows when attempting to auto-detect data types.) You could also just import all columns as text and then convert them later in SQL server.
However, given that there must be hundreds – if not thousands – of examples out there for importing CSV data to SQL Server you should also consider doing a more exhaustive search for existing (debugged) code before you continue investing time and effort into "rolling your own solution".
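As a rough illustration of the first suggestion (a sketch of the idea, not a drop-in replacement for _get_col_datatypes; the function name is mine), a detector that scans every row would only call a column INTEGER if all of its non-empty values are numeric:

import csv

def detect_column_types(path):
    # assume INTEGER until any non-empty, non-numeric value is seen in the column
    with open(path, newline='') as f:
        reader = csv.DictReader(f)
        types = {name: "INTEGER" for name in reader.fieldnames}
        for row in reader:
            for name, value in row.items():
                if value and not value.isdigit():
                    types[name] = "TEXT"
    return types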
I am trying to write a CSV file into a table in a SQL Server database using Python. I face errors when I pass the parameters, but no error when I do it manually. Here is the code I am executing.
cur = cnxn.cursor()  # Get the cursor
csv_data = csv.reader(file('Samplefile.csv'))  # Read the csv
for rows in csv_data:  # Iterate through csv
    cur.execute("INSERT INTO MyTable(Col1,Col2,Col3,Col4) VALUES (?,?,?,?)", rows)
cnxn.commit()
Error:
pyodbc.DataError: ('22001', '[22001] [Microsoft][ODBC SQL Server Driver][SQL Server]String or binary data would be truncated. (8152) (SQLExecDirectW); [01000] [Microsoft][ODBC SQL Server Driver][SQL Server]The statement has been terminated. (3621)')
However, when I insert the values manually, it works fine:
cur.execute("INSERT INTO MyTable(Col1,Col2,Col3,Col4) VALUES (?,?,?,?)",'A','B','C','D')
I have ensured that the TABLE exists in the database and that the data types are consistent with the data I am passing. The connection and cursor are also correct. The data type of rows is "list".
Consider building the query dynamically to ensure the number of placeholders matches your table and CSV file format. Then it's just a matter of ensuring your table and CSV file are correct, instead of checking that you typed enough ? placeholders in your code.
The following example assumes
CSV file contains column names in the first line
Connection is already built
File name is test.csv
Table name is MyTable
Python 3
...
with open('test.csv', 'r') as f:
    reader = csv.reader(f)
    columns = next(reader)
    query = 'insert into MyTable({0}) values ({1})'
    query = query.format(','.join(columns), ','.join('?' * len(columns)))
    cursor = connection.cursor()
    for data in reader:
        cursor.execute(query, data)
    cursor.commit()
If column names are not included in the file:
...
with open('test.csv', 'r') as f:
    reader = csv.reader(f)
    data = next(reader)
    query = 'insert into MyTable values ({0})'
    query = query.format(','.join('?' * len(data)))
    cursor = connection.cursor()
    cursor.execute(query, data)
    for data in reader:
        cursor.execute(query, data)
    cursor.commit()
I modified the code written above by Brian as follows, since the version posted above wouldn't work on the delimited files that I was trying to upload. The line row.pop() can also be ignored, as it was necessary only for the set of files that I was trying to upload.
import csv

def upload_table(path, filename, delim, cursor):
    """
    Function to upload flat file to sqlserver
    """
    tbl = filename.split('.')[0]
    cnt = 0
    with open(path + filename, 'r') as f:
        reader = csv.reader(f, delimiter=delim)
        for row in reader:
            row.pop()  # can be commented out
            row = ['NULL' if val == '' else val for val in row]
            row = [x.replace("'", "''") for x in row]
            out = "'" + "', '".join(str(item) for item in row) + "'"
            out = out.replace("'NULL'", 'NULL')
            query = "INSERT INTO " + tbl + " VALUES (" + out + ")"
            cursor.execute(query)
            cnt = cnt + 1
            if cnt % 10000 == 0:
                cursor.commit()
        cursor.commit()
    print("Uploaded " + str(cnt) + " rows into table " + tbl + ".")
You can pass the columns as arguments. For example:
for rows in csv_data:  # Iterate through csv
    cur.execute("INSERT INTO MyTable(Col1,Col2,Col3,Col4) VALUES (?,?,?,?)", *rows)
If you are using MySqlHook in Airflow and cursor.execute() with params throws an error like
TypeError: not all arguments converted during string formatting
use %s instead of ?:
with open('/usr/local/airflow/files/ifsc_details.csv', 'r') as csv_file:
    csv_reader = csv.reader(csv_file)
    columns = next(csv_reader)
    query = '''insert into ifsc_details({0}) values({1});'''
    query = query.format(','.join(columns), ','.join(['%s'] * len(columns)))
    mysql = MySqlHook(mysql_conn_id='local_mysql')
    conn = mysql.get_conn()
    cursor = conn.cursor()
    for data in csv_reader:
        cursor.execute(query, data)
    cursor.commit()
I got it sorted out. The error was due to the size restriction of the table columns. I changed the column capacity, e.g. from col1 varchar(10) to col1 varchar(35), etc. Now it's working fine.
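For reference, widening a column can be done in place; a minimal sketch assuming the cnxn connection and cur cursor from the question (the table and column names are the illustrative ones used there):

# sketch: widen Col1 so longer CSV values fit without truncation
cur.execute("ALTER TABLE MyTable ALTER COLUMN Col1 varchar(35)")
cnxn.commit()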
Here is the script; hope this works for you:
import pandas as pd
import pyodbc as pc
connection_string = "Driver=SQL Server;Server=localhost;Database={0};Trusted_Connection=Yes;"
cnxn = pc.connect(connection_string.format("DataBaseNameHere"), autocommit=True)
cur=cnxn.cursor()
df= pd.read_csv("your_filepath_and_filename_here.csv").fillna('')
query = 'insert into TableName({0}) values ({1})'
query = query.format(','.join(df.columns), ','.join('?' * len(df.columns)))
cur.fast_executemany = True
cur.executemany(query, df.values.tolist())
cnxn.close()
You can also import data into SQL Server by using any of the following:
The SQL Server Import and Export Wizard
SQL Server Integration Services (SSIS)
The OPENROWSET function
More details can be found on this webpage:
https://learn.microsoft.com/en-us/sql/relational-databases/import-export/import-data-from-excel-to-sql?view=sql-server-2017