I am looking for a way to insert a big set of data into a SQL Server table in Python. The problem is that my dataframe in Python has over 200 columns, currently I am using this code:
import pyodbc
import pandas as pd
server = 'yourservername'
database = 'AdventureWorks'
username = 'username'
password = 'yourpassword'
cnxn = pyodbc.connect('DRIVER={SQL Server};SERVER='+server+';DATABASE='+database+'UID='+username+';PWD='+ password)
cursor = cnxn.cursor()
for index, row in df.iterrows():
cursor.execute("INSERT INTO dbo.mytable (A,B,C)values(?,?,?)", row.A, row.B, row.C)
cnxn.commit()
cursor.close()
The problem is in INSERT INTO dbo.mytable (A, B, C) VALUES (?,?,?)", row.A, row.B, row.C as I need to insert a data with over 200 columns and specifying each of these columns is not really time efficient :(
I would appreciate any help!
Create connection in SqlAlchemy
Use df.to_sql() with chunksize param. Link to doc
ps. in my cases connection not in sqlalchemy not working in to_sql - function
Ok, I finally found a way:
serverName = 'xxx'
dataBase = 'zzz'
conn_str = urllib.parse.quote_plus(r'DRIVER={SQL Server};SERVER=' + serverName + r';DATABASE=' + dataBase + r';TRUSTED_CONNECTION=yes')
conn = 'mssql+pyodbc:///?odbc_connect={}'.format(conn_str)
engine = sqlalchemy.create_engine(conn,poolclass=NullPool)
connection = engine.connect()
df.to_sql("TableName", engine, schema='SchemaName', if_exists='append', index= True, chunksize=200)
connection.close()
Related
I have been looking since yesterday about the way I could convert the output of an SQL Query into a Pandas dataframe.
For example a code that does this :
data = select * from table
I've tried so many codes I've found on the internet but nothing seems to work.
Note that my database is stored in Azure DataBricks and I can only access the table using its URL.
Thank you so much !
Hope this would help you out. Both insertion & selection are in this code for reference.
def db_insert_user_level_info(table_name):
#Call Your DF Here , as an argument in the function or pass directly
df=df_parameter
params = urllib.parse.quote_plus("DRIVER={SQL Server};SERVER=DESKTOP-ITAJUJ2;DATABASE=githubAnalytics")
engine = create_engine("mssql+pyodbc:///?odbc_connect=%s" % params)
engine.connect()
table_row_count=select_row_count(table_name)
df_row_count=df.shape[0]
if table_row_count == df_row_count:
print("Data Cannot Be Inserted Because The Row Count is Same")
else:
df.to_sql(name=table_name,con=engine, index=False, if_exists='append')
print("********************************** DONE EXECTUTED SUCCESSFULLY ***************************************************")
def select_row_count(table_name):
cnxn = pyodbc.connect("Driver={SQL Server Native Client 11.0};"
"Server=DESKTOP-ITAJUJ2;"
"Database=githubAnalytics;"
"Trusted_Connection=yes;")
cur = cnxn.cursor()
try:
db_cmd = "SELECT count(*) FROM "+table_name
res = cur.execute(db_cmd)
# Do something with your result set, for example print out all the results:
for x in res:
return x[0]
except:
print("Table is not Available , Please Wait...")
Using sqlalchemy to connect to the database, and the built-in method read_sql_query from pandas to go straight to a DataFrame:
import pandas as pd
from sqlalchemy import create_engine
engine = create_engine(url)
connection = engine.connect()
query = "SELECT * FROM table"
df = pd.read_sql_query(query,connection)
I have got a DataFrame which has got around 30,000+ rows and 150+ columns. So, currently I am using the following code to insert the data into MySQL. But since it is reading the rows one at a time, it is taking too much time to insert all the rows into MySql.
Is there any way in which I can insert the rows all at once or in batches? The constraint here is that I need to use only PyMySQL, I cannot install any other library.
import pymysql
import pandas as pd
# Create dataframe
data = pd.DataFrame({
'book_id':[12345, 12346, 12347],
'title':['Python Programming', 'Learn MySQL', 'Data Science Cookbook'],
'price':[29, 23, 27]
})
# Connect to the database
connection = pymysql.connect(host='localhost',
user='root',
password='12345',
db='book')
# create cursor
cursor=connection.cursor()
# creating column list for insertion
cols = "`,`".join([str(i) for i in data.columns.tolist()])
# Insert DataFrame recrds one by one.
for i,row in data.iterrows():
sql = "INSERT INTO `book_details` (`" +cols + "`) VALUES (" + "%s,"*(len(row)-1) + "%s)"
cursor.execute(sql, tuple(row))
# the connection is not autocommitted by default, so we must commit to save our changes
connection.commit()
# Execute query
sql = "SELECT * FROM `book_details`"
cursor.execute(sql)
# Fetch all the records
result = cursor.fetchall()
for i in result:
print(i)
connection.close()
Thank You.
Try using SQLALCHEMY to create an Engine than you can use later with pandas df.to_sql function. This function writes rows from pandas dataframe to SQL database and it is much faster than iterating your DataFrame and using the MySql cursor.
Your code would look something like this:
import pymysql
import pandas as pd
from sqlalchemy import create_engine
# Create dataframe
data = pd.DataFrame({
'book_id':[12345, 12346, 12347],
'title':['Python Programming', 'Learn MySQL', 'Data Science Cookbook'],
'price':[29, 23, 27]
})
db_data = 'mysql+mysqldb://' + 'root' + ':' + '12345' + '#' + 'localhost' + ':3306/' \
+ 'book' + '?charset=utf8mb4'
engine = create_engine(db_data)
# Connect to the database
connection = pymysql.connect(host='localhost',
user='root',
password='12345',
db='book')
# create cursor
cursor=connection.cursor()
# Execute the to_sql for writting DF into SQL
data.to_sql('book_details', engine, if_exists='append', index=False)
# Execute query
sql = "SELECT * FROM `book_details`"
cursor.execute(sql)
# Fetch all the records
result = cursor.fetchall()
for i in result:
print(i)
engine.dispose()
connection.close()
You can take a look to all the options this function has in pandas doc
It is faster to push a file to the SQL server and let the server manage the input.
So first push the data to a CSV file.
data.to_csv("import-data.csv", header=False, index=False, quoting=2, na_rep="\\N")
And then load it at once into the SQL table.
sql = "LOAD DATA LOCAL INFILE \'import-data.csv\' \
INTO TABLE book_details FIELDS TERMINATED BY \',\' ENCLOSED BY \'\"\' \
(`" +cols + "`)"
cursor.execute(sql)
Possible improvements.
remove or disable indexes on the table(s)
Take the commit out of the loop
Now try and load the data.
Generate a CSV file and load using ** LOAD DATA INFILE ** - this would be issued from within mysql.
I am looking to work in python with a table that I have in SQL. I want to store the entire table in a matrix called 'mat' and then get the output after the python code so I can read the table with SQL again. This is how I started:
import pyodbc
import pandas as pd
server = 'myserver'
database = 'mydatabase'
username = 'myuser'
password = 'mypassword'
cnxn = pyodbc.connect('DRIVER={ODBC Driver 13 for SQL Server};SERVER='+server+';DATABASE='+database+';UID='+username+';PWD='+ password)
******Python code*******
mat=pd.read_sql('select * from mytable order by time' , con = cnxn)
How should I read the table to store it in mat and then how do I send it back to SQL?
You have already read the data into a DataFrame. If you want to convert a dataframe to a matrix, do mat.values. If you want to write the data to a sql table, you will have to create a cursor and use it to insert the data.
cursor = cnxn.cursor()
cursor.execute(''' INSERT INTO myTable (FirstName, LastName) VALUES ('Wilsamson', 'Shiphrah') ''')
If you have multiple values, you should use the executemany command;
values = list(zip(mat['FirstName'].values.tolist(), mat['LastName'].values.tolist()))
cursor.executemany('''INSERT INTO myTable (FirstName, LastName) VALUES (?, ?)''', values);
At the end of the INSERT statement, you will need to commit the inserts before closing your cursor and connection.
cursor.commit()
cursor.close()
cnxn.close()
If you want to convert
This is how I do it.
import mysql.connector
import pandas as pd
import numpy as np
# use this to display ALL columns...useful, but definitely not required
pd.set_option('display.max_columns', None)
mydb = mysql.connector.connect(
host="localhost",
user="duser_name",
passwd="pswd",
database="db_naem"
)
mycursor = mydb.cursor()
mycursor.execute("SELECT * FROM YourTable")
myresult = mycursor.fetchall()
df = pd.DataFrame(myresult)
df.to_csv('C:\\path_here\\test.csv', sep=',')
You can easily convert a dataframe to a matrix.
np.array(df.to_records().view(type=np.matrix))
But I'm not sure why you want to do that. I think datframes are a lot more practical for most people's needs.
I recently transitioned from using SQLite for most of my data storage and management needs to MySQL. I think I've finally gotten the correct libraries installed to work with Python 3.6, but now I am having trouble creating a new table from a dataframe in the MySQL database.
Here are the libraries I import:
import pandas as pd
import mysql.connector
from sqlalchemy import create_engine
In my code, I first create a dataframe from a CSV file (no issues here).
def csv_to_df(infile):
return pd.read_csv(infile)
Then I establish a connection to the MySQL database using this def function:
def mysql_connection():
user = 'root'
password = 'abc'
host = '127.0.0.1'
port = '3306'
database = 'a001_db'
engine = create_engine("mysql://{0}:{1}#{2}:{3}/{4}?charset=utf8".format(user, password, host, port, database))
return engine
Lastly, I use the pandas function "to_sql" to create the database table in the MySQL database:
def df_to_mysql(df, db_tbl_name, conn=mysql_connection(), index=False):
df.to_sql(con = conn, name = db_tbl_name, if_exists='replace', index = False)
I run the code using this line:
df_to_mysql(csv_to_df(r'path/to/file.csv'), 'new_database_table')
The yields the following error:
InvalidRequestError: Could not reflect: requested table(s) not available in Engine(mysql://root:***#127.0.0.1:3306/a001_db?charset=utf8): (new_database_table)
I think this is telling me that I must first create a table in the database before passing the data in the dataframe to this table, but I'm not 100% positive about that. Regardless, I'm looking for a way to create a table in a MySQL database without manually creating the table first (I have many CSVs, each with 50+ fields, that have to be uploaded as new tables in a MySQL database).
Any suggestions?
I took an approach suggested by aws_apprentice above which was to create the table first, then write data to the table.
The code below first auto-generates a mysql table from a df (auto defining table names and datatypes) then writes the df data to that table.
There were a couple of hiccups I had to overcome, such as: unnamed csv columns, determining the correct data type for each field in the mysql table.
I'm sure there are multiple other (better?) ways to do this, but this seems to work.
import pandas as pd
from sqlalchemy import create_engine
infile = r'path/to/file.csv'
db = 'a001_db'
db_tbl_name = 'a001_rd004_db004'
'''
Load a csv file into a dataframe; if csv does not have headers, use the headers arg to create a list of headers; rename unnamed columns to conform to mysql column requirements
'''
def csv_to_df(infile, headers = []):
if len(headers) == 0:
df = pd.read_csv(infile)
else:
df = pd.read_csv(infile, header = None)
df.columns = headers
for r in range(10):
try:
df.rename( columns={'Unnamed: {0}'.format(r):'Unnamed{0}'.format(r)}, inplace=True )
except:
pass
return df
'''
Create a mapping of df dtypes to mysql data types (not perfect, but close enough)
'''
def dtype_mapping():
return {'object' : 'TEXT',
'int64' : 'INT',
'float64' : 'FLOAT',
'datetime64' : 'DATETIME',
'bool' : 'TINYINT',
'category' : 'TEXT',
'timedelta[ns]' : 'TEXT'}
'''
Create a sqlalchemy engine
'''
def mysql_engine(user = 'root', password = 'abc', host = '127.0.0.1', port = '3306', database = 'a001_db'):
engine = create_engine("mysql://{0}:{1}#{2}:{3}/{4}?charset=utf8".format(user, password, host, port, database))
return engine
'''
Create a mysql connection from sqlalchemy engine
'''
def mysql_conn(engine):
conn = engine.raw_connection()
return conn
'''
Create sql input for table names and types
'''
def gen_tbl_cols_sql(df):
dmap = dtype_mapping()
sql = "pi_db_uid INT AUTO_INCREMENT PRIMARY KEY"
df1 = df.rename(columns = {"" : "nocolname"})
hdrs = df1.dtypes.index
hdrs_list = [(hdr, str(df1[hdr].dtype)) for hdr in hdrs]
for hl in hdrs_list:
sql += " ,{0} {1}".format(hl[0], dmap[hl[1]])
return sql
'''
Create a mysql table from a df
'''
def create_mysql_tbl_schema(df, conn, db, tbl_name):
tbl_cols_sql = gen_tbl_cols_sql(df)
sql = "USE {0}; CREATE TABLE {1} ({2})".format(db, tbl_name, tbl_cols_sql)
cur = conn.cursor()
cur.execute(sql)
cur.close()
conn.commit()
'''
Write df data to newly create mysql table
'''
def df_to_mysql(df, engine, tbl_name):
df.to_sql(tbl_name, engine, if_exists='replace')
df = csv_to_df(infile)
create_mysql_tbl_schema(df, mysql_conn(mysql_engine()), db, db_tbl_name)
df_to_mysql(df, mysql_engine(), db_tbl_name)
This
connection = engine.connect()
df.to_sql(con=connection, name='TBL_NAME', schema='SCHEMA', index=False, if_exists='replace')
works with oracle DB in specific schema wothout errors, but will not work if you have limited permissions. And note that table names is case sensative.
Struggling to figure out why this isnt working. I don't get any errors but it will not write to the table.
import pyodbc
connprod = pyodbc.connect('DRIVER={SQL Server};SERVER=server;DATABASE=master;Trusted_Connection=yes')
cursorprod = connprod.cursor()
conndev = pyodbc.connect('DRIVER={SQL Server};SERVER=server;DATABASE=master;Trusted_Connection=yes')
cursordev = conndev.cursor()
connlocal=pyodbc.connect('DRIVER={SQL Server};SERVER=server;DATABASE=DBA;Trusted_Connection=yes')
cursorlocal = connlocal.cursor()
cursorprod.execute("SELECT Servername = ##servername ,Date = getdate() ,wait_type ,waiting_tasks_count ,wait_time_ms ,max_wait_time_ms ,signal_wait_time_ms FROM sys.dm_os_wait_stats GO")
rows = cursorprod.fetchall()
for row in rows:
cursorlocal.execute('insert into dba.dbo.dm_os_wait_stats values (?,?,?,?,?,?,?)', row)
cursorlocal.commit
If your example is accurate, you're not calling the commit method:
cursorlocal.commit()