how to speed up transferring df to sql - python

EDIT: After I saw this thread I removed the multi and chunksize parameters. But now I get the error "ProgrammingError: (pyodbc.ProgrammingError) ('Unknown object type list during describe', 'HY000')". Is it because my dataframe contains lists with tuples in them?
I want to append all the rows from one dataframe (about 5000 rows and 12 columns) to a SQL Server database.
My code looks like this:
import sqlite3
import pyodbc
from sqlalchemy import create_engine, types

# connect to SQL database and open cursor
connection = pyodbc.connect("Driver={ODBC Driver 17 for SQL Server};"
                            r"Server=DEBXT-MSDB-03\STUDENTS;"
                            "Database=Studentsdb;"
                            "Trusted_Connection=yes;")
# write your logic and sql statements
engine = create_engine('mssql+pyodbc://@' + 'censored' + '/' + 'censored'
                       + '?trusted_connection=yes&driver=ODBC+Driver+17+for+SQL+Server',
                       fast_executemany=True)
dtypes = {
    'Body': types.Text,
    'sender': types.String(length=50),
    'received': types.DateTime,
    'account': types.String(length=20),
    'type': types.String(length=10),
    'email': types.Text,
    'BodySA': types.Text,
    'BOW': types.Text,
    'topic_distribution': types.String(length=200),
    'topic': types.String(length=50),
    'mapped_topic': types.String(length=50)
}
df.to_sql('guided_lda', con=engine, if_exists='replace', index=False, dtype=dtypes, method='multi', chunksize=150)
# commit the data to database and close connection and cursor
connection.commit()
connection.close()
I have been waiting more than 10 minutes and it still is not done; I don't think it should take that long. I added the multi method, a chunksize based on the 2100-parameter limit divided by 12 columns, and now fast_executemany=True in the create_engine call as well. How long is too long?
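One likely culprit for the ProgrammingError is that pyodbc cannot bind Python lists or tuples as query parameters, so any cell that still holds a list breaks the insert. A minimal sketch, assuming the offending values sit in the topic_distribution and BOW columns (adjust to whichever columns actually hold lists), that converts them to plain strings before calling to_sql:

# Hypothetical: stringify any list/tuple cells so pyodbc can bind them as text
for col in ['topic_distribution', 'BOW']:
    df[col] = df[col].apply(lambda v: str(v) if isinstance(v, (list, tuple)) else v)

df.to_sql('guided_lda', con=engine, if_exists='replace', index=False, dtype=dtypes)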

Related

Inserting huge pandas dataframe into SQL Server table

I am looking for a way to insert a big set of data into a SQL Server table in Python. The problem is that my dataframe has over 200 columns; currently I am using this code:
import pyodbc
import pandas as pd
server = 'yourservername'
database = 'AdventureWorks'
username = 'username'
password = 'yourpassword'
cnxn = pyodbc.connect('DRIVER={SQL Server};SERVER='+server+';DATABASE='+database+';UID='+username+';PWD='+password)
cursor = cnxn.cursor()
for index, row in df.iterrows():
    cursor.execute("INSERT INTO dbo.mytable (A,B,C) VALUES (?,?,?)", row.A, row.B, row.C)
cnxn.commit()
cursor.close()
The problem is the INSERT INTO dbo.mytable (A, B, C) VALUES (?,?,?) part: I need to insert data with over 200 columns, and spelling out each of those columns is not really time efficient :(
I would appreciate any help!
Create the connection with SQLAlchemy and use df.to_sql() with the chunksize parameter. Link to doc
PS: in my case, a connection that was not created through SQLAlchemy did not work with to_sql.
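A minimal sketch of that suggestion, using the server, database, and table names from the question as placeholders (df is the dataframe from the question), so the 200+ columns never have to be written out by hand:

import urllib.parse
from sqlalchemy import create_engine

# Build an SQLAlchemy engine on top of the same pyodbc connection string
params = urllib.parse.quote_plus(
    'DRIVER={ODBC Driver 17 for SQL Server};SERVER=yourservername;'
    'DATABASE=AdventureWorks;UID=username;PWD=yourpassword')
engine = create_engine('mssql+pyodbc:///?odbc_connect={}'.format(params))

# to_sql generates the INSERT for every dataframe column automatically
df.to_sql('mytable', con=engine, schema='dbo', if_exists='append',
          index=False, chunksize=1000)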
Ok, I finally found a way:
import urllib.parse
import sqlalchemy
from sqlalchemy.pool import NullPool

serverName = 'xxx'
dataBase = 'zzz'
conn_str = urllib.parse.quote_plus(r'DRIVER={SQL Server};SERVER=' + serverName + r';DATABASE=' + dataBase + r';TRUSTED_CONNECTION=yes')
conn = 'mssql+pyodbc:///?odbc_connect={}'.format(conn_str)
engine = sqlalchemy.create_engine(conn, poolclass=NullPool)
connection = engine.connect()
df.to_sql("TableName", engine, schema='SchemaName', if_exists='append', index=True, chunksize=200)
connection.close()
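If the append is still slow, the same engine can be built with pyodbc's fast_executemany flag switched on (supported by SQLAlchemy's mssql+pyodbc dialect), which is what the other answers in this thread rely on, for example:

engine = sqlalchemy.create_engine(conn, poolclass=NullPool, fast_executemany=True)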

Python: SQL Server insert multiple values with datetime (Conversion failed when converting date and/or time from character string)

I'm trying to insert multiple rows into a database table (SQL Server) in Python:
import datetime
import pyodbc

def write_to_database(values):
    conn = pyodbc.connect("Driver={SQL Server};"
                          "Server=xxxx.database.windows.net;"
                          f"Database={database};"
                          "UID=user;"
                          "PWD=password;")
    cursor = conn.cursor()
    cursor.executemany("insert into KeyFigures (DateTime, Parameter, CommulativeTime, Value) values (?,?,?,?)", values)
    conn.commit()
    return()

date = datetime.datetime.now()
atom = date.strftime("%Y-%M-%d") + " " + date.strftime("%H:%M:%S") + ".4526800"
values = ([datetime.datetime.now().isoformat(), 1, "NULL", 2], [datetime.datetime.now().isoformat(), 1, "NULL", 47], [datetime.datetime.now().isoformat(), 1, "NULL", 78])
write_to_database(values)
I tried multiple datetime formats and string combinations, e.g.:
datetime.datetime.now().isoformat()
atom
"2020-02-23 11:30:53.4526800"
"2020-02-23T11:30:53.4526800"
but I keep receiving the same error:
line 50, in write_to_database
cursor.executemany("insert into KeyFigures (DateTime, Parameter, CommulativeTime, Value) values (?,?,?,?)", values)
pyodbc.DataError: ('22007', '[22007] [Microsoft][ODBC SQL Server Driver][SQL Server]Conversion failed when converting date and/or time from character string. (241) (SQLExecDirectW)')
In SSMS the following works:
INSERT INTO KeyFigures (DateTime, Parameter, CommulativeTime, Value) VALUES ('2020-02-23 11:30:53.4526800',2,null,21)
how can I solve this error?
***** edit ****
@Mogo thank you very much. This was already very helpful, but it does not work in my code; I still receive the same error. I also tried to insert a single row and this piece of code works (with execute instead of executemany):
def write_to_database(date, parameter, cummulativeTime, value):
    conn = pyodbc.connect("Driver={ODBC Driver 17 for SQL Server};"
                          "Server=xxxx.database.windows.net;"
                          f"Database={database};"
                          "UID=user;"
                          "PWD=password;")
    with conn.cursor() as cursor:
        cursor.execute(f"INSERT INTO dbo.KeyFigures (DateTime, Parameter, CommulativeTime, Value) values ('{date}',{parameter},{cummulativeTime},{value})")
    conn.commit()
    return()

date = datetime.datetime.now()
write_to_database(date, 1, "NULL", 43)
It doesn't work without the date between quotes. Is this also the problem with executemany? When I put the question mark between quotes ('?' or "?") I get an error saying that only 3 parameters were given instead of 4.
As I mentioned in my comment, if you use a strongly typed value (so don't convert it to a string), Python and pyodbc will handle this gracefully. I also recommend updating to the ODBC Driver for SQL Server rather than using the old native SQL Server driver, and I put the cursor into a with block so that it is closed gracefully.
For a table I created with the definition below this worked fine, and inserted 2 rows, with the correct date and time values:
CREATE TABLE dbo.TestDateTable (i int,dt datetime2(7));
import datetime, pyodbc

def write_to_database(values):
    conn = pyodbc.connect("Driver={ODBC Driver 17 for SQL Server};"
                          "Server=xxxx.database.windows.net;"
                          f"Database={database};"
                          "UID=user;"
                          "PWD=password;")
    with conn.cursor() as cursor:
        cursor.executemany("INSERT INTO dbo.TestDateTable (i,dt) VALUES (?,?);", values)
    conn.commit()
    return()

date = datetime.datetime.now()
values = ([1, date], [2, date])
write_to_database(values)
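Applied to the KeyFigures insert from the question, the same idea looks roughly like this (a sketch that reuses the question's write_to_database function and assumes CommulativeTime is nullable): pass datetime objects and None rather than the strings "NULL" and pre-formatted date text, and let pyodbc bind the proper SQL types:

import datetime

now = datetime.datetime.now()
values = [
    (now, 1, None, 2),    # None is sent as SQL NULL, now as a real datetime
    (now, 1, None, 47),
    (now, 1, None, 78),
]
write_to_database(values)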

How to use Bulk insert to insert data from Dataframe to SQL Server table?

I'm new to Python, so I'm reaching out for help. I have a CSV file in an S3 bucket and I would like to use Python pyodbc to import it into a table in SQL Server. The file is 50 MB (400k records). As my code below shows, the CSV data is already in a dataframe; how can I use BULK INSERT to insert the dataframe data into a SQL Server table? If my approach does not work, please advise me of a different approach.
# Connection to S3
s3 = boto3.client(
    service_name = 's3',
    region_name = 'us-gov-west-1',
    aws_access_key_id = 'ZZZZZZZZZZZZZZZZZZ',
    aws_secret_access_key = 'AAAAAAAAAAAAAAAAA')

# Connection to SQL Server
server = 'myserver.amazonaws.com'
path = 'folder1/folder2/folder3/myCSVFile.csv'
cnxn = pyodbc.connect('DRIVER={SQL Server};SERVER='+server+';DATABASE=DB-staging;UID=User132;PWD=XXXXXX')
cursor = cnxn.cursor()

obj_sum = s3.get_object(Bucket = 'my_bucket', Key = path)
csv_data = pd.read_csv(obj_sum['Body'])
df = pd.DataFrame(csv_data, columns = ['SYSTEM_NAME', 'BUCKET_NAME', 'LOCATION', 'FILE_NAME', 'LAST_MOD_DATE', 'FILE_SIZE'])
#print(df.head(n=15).to_string(index=False))

# Insert DataFrame to table
cursor.execute("""truncate table dbo.table1""")
cursor.execute("""BULK INSERT dbo.table1 FROM """ + .....  # what do I put here since data is in dataframe??
I tried looping through the dataframe and it took 20 minutes to insert 5k records (code below). Looping through each record is an option, but a poor one; this is why I'm moving towards a bulk insert if possible.
for i in df.itertuples(index = False):
    if i.FILE_SIZE != 0:
        cursor.execute("""insert into dbo.table1 (SYSTEM_NAME, BUCKET_NAME, X_LOCATION, FILE_NAME, LAST_MOD_DATE, FILE_SIZE)
                          values (?,?,?,?,?,?)""", i.SYSTEM_NAME, i.BUCKET_NAME, i.LOCATION, i.FILE_NAME, i.LAST_MOD_DATE, i.FILE_SIZE)
Lastly, a bonus question: I would like to check whether the "FILE_SIZE" column in my dataframe equals 0; if it does, skip that record and move on to the next one.
Thank you in advance.
Thanks for the help.
Using fast_executemany=True did the job for me.
import sqlalchemy as sal

engine = sal.create_engine("mssql+pyodbc://username:password@" + server + ":1433/db-name?driver=ODBC+Driver+17+for+SQL+Server&Trusted_Connection=yes",
                           fast_executemany=True)
conn = engine.connect()
I had to change my code around to use sqlalchemy, but it is working great now.
The call that uploads the data to SQL Server is:
df.to_sql('table1', con=engine, index=False, if_exists='replace')
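For the bonus question, rows with FILE_SIZE equal to 0 can be dropped with a boolean filter before the upload instead of being checked inside a loop; a short sketch using the dataframe and table from the question:

# Keep only rows whose FILE_SIZE is non-zero, then push them in one to_sql call
df_nonzero = df[df['FILE_SIZE'] != 0]
df_nonzero.to_sql('table1', con=engine, index=False, if_exists='replace')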

How to insert a Pandas Dataframe into MySql using PyMySQL

I have got a DataFrame with around 30,000+ rows and 150+ columns. Currently I am using the following code to insert the data into MySQL, but since it inserts the rows one at a time, it takes too much time to insert all of them.
Is there any way in which I can insert the rows all at once or in batches? The constraint here is that I need to use only PyMySQL; I cannot install any other library.
import pymysql
import pandas as pd

# Create dataframe
data = pd.DataFrame({
    'book_id':[12345, 12346, 12347],
    'title':['Python Programming', 'Learn MySQL', 'Data Science Cookbook'],
    'price':[29, 23, 27]
})

# Connect to the database
connection = pymysql.connect(host='localhost',
                             user='root',
                             password='12345',
                             db='book')

# create cursor
cursor = connection.cursor()

# creating column list for insertion
cols = "`,`".join([str(i) for i in data.columns.tolist()])

# Insert DataFrame records one by one.
for i, row in data.iterrows():
    sql = "INSERT INTO `book_details` (`" + cols + "`) VALUES (" + "%s,"*(len(row)-1) + "%s)"
    cursor.execute(sql, tuple(row))
    # the connection is not autocommitted by default, so we must commit to save our changes
    connection.commit()

# Execute query
sql = "SELECT * FROM `book_details`"
cursor.execute(sql)

# Fetch all the records
result = cursor.fetchall()
for i in result:
    print(i)

connection.close()
Thank You.
Try using SQLAlchemy to create an engine that you can use later with the pandas df.to_sql function. This function writes rows from a pandas dataframe to a SQL database and is much faster than iterating over the DataFrame and using a MySQL cursor.
Your code would look something like this:
import pymysql
import pandas as pd
from sqlalchemy import create_engine

# Create dataframe
data = pd.DataFrame({
    'book_id':[12345, 12346, 12347],
    'title':['Python Programming', 'Learn MySQL', 'Data Science Cookbook'],
    'price':[29, 23, 27]
})

db_data = 'mysql+mysqldb://' + 'root' + ':' + '12345' + '@' + 'localhost' + ':3306/' \
          + 'book' + '?charset=utf8mb4'
engine = create_engine(db_data)

# Connect to the database
connection = pymysql.connect(host='localhost',
                             user='root',
                             password='12345',
                             db='book')

# create cursor
cursor = connection.cursor()

# Execute the to_sql for writing DF into SQL
data.to_sql('book_details', engine, if_exists='append', index=False)

# Execute query
sql = "SELECT * FROM `book_details`"
cursor.execute(sql)

# Fetch all the records
result = cursor.fetchall()
for i in result:
    print(i)

engine.dispose()
connection.close()
You can take a look at all the options this function has in the pandas doc.
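If the table grows, to_sql also accepts chunksize and method arguments so the rows are sent in batches instead of one statement per row; a sketch using the same engine (the batch size of 1000 is just an example):

# Each chunk of 1000 rows is sent as a single multi-row INSERT
data.to_sql('book_details', engine, if_exists='append', index=False,
            chunksize=1000, method='multi')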
It is faster to push a file to the MySQL server and let the server manage the import.
So first push the data to a CSV file.
data.to_csv("import-data.csv", header=False, index=False, quoting=2, na_rep="\\N")
And then load it at once into the SQL table.
sql = "LOAD DATA LOCAL INFILE \'import-data.csv\' \
INTO TABLE book_details FIELDS TERMINATED BY \',\' ENCLOSED BY \'\"\' \
(`" +cols + "`)"
cursor.execute(sql)
Possible improvements:
remove or disable indexes on the table(s)
take the commit out of the loop
Now try and load the data.
Generate a CSV file and load it using LOAD DATA INFILE - this would be issued from within MySQL.
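Since the question's constraint is PyMySQL only (no SQLAlchemy), the batching can also be done with cursor.executemany, which hands the whole batch to the driver in one call; a minimal sketch against the same book_details table and dataframe from the question:

import pymysql

connection = pymysql.connect(host='localhost', user='root',
                             password='12345', db='book')
cursor = connection.cursor()

cols = "`,`".join([str(c) for c in data.columns.tolist()])
placeholders = ",".join(["%s"] * len(data.columns))
sql = "INSERT INTO `book_details` (`" + cols + "`) VALUES (" + placeholders + ")"

# One executemany call replaces the per-row loop and per-row commit
rows = [tuple(r) for r in data.itertuples(index=False)]
cursor.executemany(sql, rows)
connection.commit()
connection.close()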

How to create a new table in a MySQL DB from a pandas dataframe

I recently transitioned from using SQLite for most of my data storage and management needs to MySQL. I think I've finally gotten the correct libraries installed to work with Python 3.6, but now I am having trouble creating a new table from a dataframe in the MySQL database.
Here are the libraries I import:
import pandas as pd
import mysql.connector
from sqlalchemy import create_engine
In my code, I first create a dataframe from a CSV file (no issues here).
def csv_to_df(infile):
    return pd.read_csv(infile)
Then I establish a connection to the MySQL database using this function:
def mysql_connection():
    user = 'root'
    password = 'abc'
    host = '127.0.0.1'
    port = '3306'
    database = 'a001_db'
    engine = create_engine("mysql://{0}:{1}@{2}:{3}/{4}?charset=utf8".format(user, password, host, port, database))
    return engine
Lastly, I use the pandas function "to_sql" to create the database table in the MySQL database:
def df_to_mysql(df, db_tbl_name, conn=mysql_connection(), index=False):
    df.to_sql(con=conn, name=db_tbl_name, if_exists='replace', index=False)
I run the code using this line:
df_to_mysql(csv_to_df(r'path/to/file.csv'), 'new_database_table')
This yields the following error:
InvalidRequestError: Could not reflect: requested table(s) not available in Engine(mysql://root:***@127.0.0.1:3306/a001_db?charset=utf8): (new_database_table)
I think this is telling me that I must first create a table in the database before passing the data in the dataframe to this table, but I'm not 100% positive about that. Regardless, I'm looking for a way to create a table in a MySQL database without manually creating the table first (I have many CSVs, each with 50+ fields, that have to be uploaded as new tables in a MySQL database).
Any suggestions?
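For reference, to_sql can create the table itself when it is handed a SQLAlchemy engine, so no manual CREATE TABLE is strictly required; a minimal sketch with the same credentials as the question (the mysql:// URL assumes the MySQLdb driver is installed):

import pandas as pd
from sqlalchemy import create_engine

engine = create_engine("mysql://root:abc@127.0.0.1:3306/a001_db?charset=utf8")
df = pd.read_csv(r'path/to/file.csv')

# if_exists='replace' drops and recreates the table using dtypes inferred from the dataframe
df.to_sql('new_database_table', con=engine, if_exists='replace', index=False)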
I took the approach suggested by aws_apprentice above, which was to create the table first and then write data to the table.
The code below first auto-generates a MySQL table from a df (auto-defining table names and data types) and then writes the df data to that table.
There were a couple of hiccups I had to overcome, such as unnamed CSV columns and determining the correct data type for each field in the MySQL table.
I'm sure there are multiple other (better?) ways to do this, but this seems to work.
import pandas as pd
from sqlalchemy import create_engine

infile = r'path/to/file.csv'
db = 'a001_db'
db_tbl_name = 'a001_rd004_db004'

'''
Load a csv file into a dataframe; if csv does not have headers, use the headers arg to
create a list of headers; rename unnamed columns to conform to mysql column requirements
'''
def csv_to_df(infile, headers = []):
    if len(headers) == 0:
        df = pd.read_csv(infile)
    else:
        df = pd.read_csv(infile, header = None)
        df.columns = headers
    for r in range(10):
        try:
            df.rename( columns={'Unnamed: {0}'.format(r):'Unnamed{0}'.format(r)}, inplace=True )
        except:
            pass
    return df

'''
Create a mapping of df dtypes to mysql data types (not perfect, but close enough)
'''
def dtype_mapping():
    return {'object' : 'TEXT',
            'int64' : 'INT',
            'float64' : 'FLOAT',
            'datetime64' : 'DATETIME',
            'bool' : 'TINYINT',
            'category' : 'TEXT',
            'timedelta[ns]' : 'TEXT'}

'''
Create a sqlalchemy engine
'''
def mysql_engine(user = 'root', password = 'abc', host = '127.0.0.1', port = '3306', database = 'a001_db'):
    engine = create_engine("mysql://{0}:{1}@{2}:{3}/{4}?charset=utf8".format(user, password, host, port, database))
    return engine

'''
Create a mysql connection from sqlalchemy engine
'''
def mysql_conn(engine):
    conn = engine.raw_connection()
    return conn

'''
Create sql input for table names and types
'''
def gen_tbl_cols_sql(df):
    dmap = dtype_mapping()
    sql = "pi_db_uid INT AUTO_INCREMENT PRIMARY KEY"
    df1 = df.rename(columns = {"" : "nocolname"})
    hdrs = df1.dtypes.index
    hdrs_list = [(hdr, str(df1[hdr].dtype)) for hdr in hdrs]
    for hl in hdrs_list:
        sql += " ,{0} {1}".format(hl[0], dmap[hl[1]])
    return sql

'''
Create a mysql table from a df
'''
def create_mysql_tbl_schema(df, conn, db, tbl_name):
    tbl_cols_sql = gen_tbl_cols_sql(df)
    sql = "USE {0}; CREATE TABLE {1} ({2})".format(db, tbl_name, tbl_cols_sql)
    cur = conn.cursor()
    cur.execute(sql)
    cur.close()
    conn.commit()

'''
Write df data to the newly created mysql table
'''
def df_to_mysql(df, engine, tbl_name):
    df.to_sql(tbl_name, engine, if_exists='replace')

df = csv_to_df(infile)
create_mysql_tbl_schema(df, mysql_conn(mysql_engine()), db, db_tbl_name)
df_to_mysql(df, mysql_engine(), db_tbl_name)
This
connection = engine.connect()
df.to_sql(con=connection, name='TBL_NAME', schema='SCHEMA', index=False, if_exists='replace')
works with an Oracle DB in a specific schema without errors, but will not work if you have limited permissions. Also note that table names are case sensitive.
