Cannot drop table in pandas to_sql using SQLAlchemy - python

I'm trying to drop an existing table, run a query, and then recreate the table using the pandas to_sql function. This query works in pgAdmin, but not here. Any idea whether this is a pandas bug or whether my code is wrong?
The specific error is ValueError: Table 'a' already exists.
import pandas.io.sql as psql
from sqlalchemy import create_engine

engine = create_engine(r'postgresql://user@localhost:port/dbname')
c = engine.connect()
conn = c.connection
sql = """
drop table a;
select * from some_table limit 1;
"""
df = psql.read_sql(sql, con=conn)
print(df.head())
df.to_sql('a', engine)
conn.close()

Why are you doing it like that? There is a shorter way: the if_exists kwarg of to_sql. Try this:
import pandas.io.sql as psql
from sqlalchemy import create_engine

engine = create_engine(r'postgresql://user@localhost:port/dbname')
c = engine.connect()
conn = c.connection
sql = """
select * from some_table limit 1;
"""
df = psql.read_sql(sql, con=conn)
print(df.head())
# Note how the line below differs: if_exists='replace' does the drop for you
# (schema_name is your target schema, if you need one)
df.to_sql('a', con=engine, schema=schema_name, if_exists='replace')
conn.close()
According to docs:
replace: If table exists, drop it, recreate it, and insert data.
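For reference, a minimal sketch of the three if_exists modes (table 'a' and engine are the same placeholders as above):
# if_exists='fail'    -> raise ValueError if the table already exists (the default, and the error you hit)
# if_exists='replace' -> drop the table, recreate it, then insert the data
# if_exists='append'  -> keep the table and insert the new rows
df.to_sql('a', engine, if_exists='append')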
P.S. An additional tip:
This is a better way to handle the connection:
with engine.connect() as conn, conn.begin():
    sql = """select * from some_table limit 1"""
    df = psql.read_sql(sql, con=conn)
    print(df.head())
    df.to_sql('a', con=conn, schema=schema_name, if_exists='replace')
This is better because it ensures that your connection is always closed, even if your program exits with an error, which is important to prevent data corruption and leaked connections. Further, I would just use this:
import pandas as pd
...
pd.read_sql(sql, conn)
instead of the way you are doing it.
So, if I were in your place writing that code, it would look like this:
import pandas as pd
from sqlalchemy import create_engine

engine = create_engine(r'postgresql://user@localhost:port/dbname')

with engine.connect() as conn, conn.begin():
    df = pd.read_sql('select * from some_table limit 1', con=conn)
    print(df.head())
    df.to_sql('a', con=conn, schema=schema_name, if_exists='replace')
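If you really do need to run the DROP (or any other DDL) yourself, here is a minimal sketch of doing it separately with SQLAlchemy's text() inside a transaction, instead of bundling it into the read query (the connection URL is the same placeholder as above):
from sqlalchemy import create_engine, text

engine = create_engine(r'postgresql://user@localhost:port/dbname')
with engine.begin() as conn:                      # commits on success, rolls back on error
    conn.execute(text('drop table if exists a'))
df.to_sql('a', engine)                            # the plain to_sql call now succeeds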

Related

Execution failed on sql 'SELECT name FROM sqlite_master WHERE type='table' AND name=?;':

How can I easily write my pandas dataframe to a MySQL database using mysql.connector?
import mysql.connector as sql
import pandas as pd
db_connection = sql.connect(host='124685.eu-central-1.rds.amazonaws.com',
database="db_name", user='user', password='pw')
query = 'SELECT * FROM table_name'
df = pd.read_sql(sql=query, con=db_connection)
df["Person_Name"] = "xx"
df.to_sql(con=db_connection, name='table_name', if_exists='replace')
I tried this but it gives me the error:
pandas.io.sql.DatabaseError: Execution failed on sql 'SELECT name FROM sqlite_master WHERE type='table' AND name=?;': Not all parameters were used in the SQL statement
Does mysql.connector not have a df.to_sql function?
These are the col names:
Col names Index(['Person_ID', 'AirTable_ID_Person', 'Person_Name', 'Gender', 'Ethnicity',
'LinkedIn_Link_to_the_Profile_of_Person', 'Jensen_Analyst',
'Data_Source', 'Created_Time', 'Last_Modified_Time', 'Last refresh',
'createdTime', 'Gender_ID', 'Ethnicity_ID', 'Jensen_Analyst_ID',
'Data_Source_ID', 'Position_ID', 'Egnyte_File', 'Comment', 'Move',
'Right_Move', 'Bio-Import-Assistant', 'Diversity'],
dtype='object')
Pandas needs an SQLAlchemy engine (or connection) to write to most databases; when it gets a raw DBAPI connection such as mysql.connector's, it falls back to its sqlite code path, which is why the error mentions sqlite_master. You can take one of the following two approaches: the first writes with the connector's cursor and executemany, the second uses an SQLAlchemy engine with pandas.to_sql.
The reading part works very much like your existing pandas read call.
import pandas as pd
import mysql.connector as sql

db_connection = sql.connect(host='124685.eu-central-1.rds.amazonaws.com',
                            database="db_name", user='user', password='pw')
query = 'SELECT * FROM table_name'
df = pd.read_sql(sql=query, con=db_connection)
df["Person_Name"] = "xx"

df_temp = df[['Person_Name', 'Person_ID']]
# Update each row's Person_Name by Person_ID using the connector's cursor
query_update = 'UPDATE table_name SET Person_Name = %s WHERE Person_ID = %s'
pars = df_temp.values.tolist()
pars = list(map(tuple, pars))
cursor = db_connection.cursor()
cursor.executemany(query_update, pars)
db_connection.commit()
cursor.close()
Or you can establish an engine for uploading.
import pandas as pd
from sqlalchemy import create_engine
import mysql.connector as sql

# engine = create_engine('mysql+pymysql://username:password@host/database')
# or, in your case:
engine = create_engine('mysql+pymysql://user:pw@124685.eu-central-1.rds.amazonaws.com/db_name')

db_connection = sql.connect(host='124685.eu-central-1.rds.amazonaws.com',
                            database="db_name", user='user', password='pw')
query = 'SELECT * FROM table_name'
df = pd.read_sql(sql=query, con=db_connection)
df["Person_Name"] = "xx"
df.to_sql(con=engine, name='table_name', if_exists='replace')
For this method, be sure to install pymysql first (pip install pymysql) and you should be good to go.
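As a quick sanity check before writing, you can confirm that the engine and the pymysql driver are wired up correctly; a minimal sketch (the URL is the same placeholder as above):
from sqlalchemy import create_engine, text

engine = create_engine('mysql+pymysql://user:pw@124685.eu-central-1.rds.amazonaws.com/db_name')
with engine.connect() as conn:
    print(conn.execute(text('SELECT 1')).scalar())  # prints 1 if the connection works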

Convert SQL query output into pandas dataframe

I have been looking since yesterday for a way to convert the output of an SQL query into a pandas dataframe.
For example, code that does this:
data = select * from table
I've tried so many snippets I found on the internet, but nothing seems to work.
Note that my database is stored in Azure Databricks and I can only access the table using its URL.
Thank you so much !
Hope this helps you out. Both insertion and selection (a row count check) are in this code for reference.
import urllib
import pyodbc
from sqlalchemy import create_engine

def db_insert_user_level_info(table_name):
    # Call your DF here, as an argument of the function or pass it directly
    df = df_parameter
    params = urllib.parse.quote_plus("DRIVER={SQL Server};SERVER=DESKTOP-ITAJUJ2;DATABASE=githubAnalytics")
    engine = create_engine("mssql+pyodbc:///?odbc_connect=%s" % params)
    engine.connect()
    table_row_count = select_row_count(table_name)
    df_row_count = df.shape[0]
    if table_row_count == df_row_count:
        print("Data Cannot Be Inserted Because The Row Count Is the Same")
    else:
        df.to_sql(name=table_name, con=engine, index=False, if_exists='append')
        print("********************************** DONE, EXECUTED SUCCESSFULLY ***************************************************")

def select_row_count(table_name):
    cnxn = pyodbc.connect("Driver={SQL Server Native Client 11.0};"
                          "Server=DESKTOP-ITAJUJ2;"
                          "Database=githubAnalytics;"
                          "Trusted_Connection=yes;")
    cur = cnxn.cursor()
    try:
        db_cmd = "SELECT count(*) FROM " + table_name
        res = cur.execute(db_cmd)
        # Do something with your result set, for example return the first value:
        for x in res:
            return x[0]
    except:
        print("Table is not available, please wait...")
Using sqlalchemy to connect to the database, and the built-in method read_sql_query from pandas to go straight to a DataFrame:
import pandas as pd
from sqlalchemy import create_engine
engine = create_engine(url)
connection = engine.connect()
query = "SELECT * FROM table"
df = pd.read_sql_query(query,connection)
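A small follow-up on the same approach: wrapping the connection in a context manager ensures it is closed even if the query fails (url is the same placeholder):
import pandas as pd
from sqlalchemy import create_engine

engine = create_engine(url)  # url is your database connection string
with engine.connect() as connection:
    df = pd.read_sql_query("SELECT * FROM table", connection)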

Using fast_executemany = True with sqlalchemy and Pandas [duplicate]


Getting data from table in database

I want to extract data from a postgresql database and use that data (in a dataframe format) in a script. Here's my initial try:
from pandas import DataFrame
import psycopg2
conn = psycopg2.connect(host=host_address, database=name_of_database, user=user_name, password=user_password)
cur = conn.cursor()
cur.execute("SELECT * FROM %s;" % name_of_table)
the_data = cur.fetchall()
colnames = [desc[0] for desc in cur.description]
the_frame = DataFrame(the_data)
the_frame.columns = colnames
cur.close()
conn.close()
Note: I am aware that I should not use string parameter interpolation (%) to pass variables into a SQL query string, but this works well for me as it is.
Would there be a more direct approach to this?
Edit: Here's what I used from the selected answer:
import pandas as pd
import sqlalchemy as sq
engine = sq.create_engine("postgresql+psycopg2://username:password@host:port/database")
the_frame = pd.read_sql_table(name_of_table, engine)
Pandas can load data from Postgres directly:
import psycopg2
import pandas.io.sql as pdsql
conn = psycopg2.connect(...)
the_frame = pdsql.read_frame("SELECT * FROM %s;" % name_of_table, conn)
If you have a recent pandas (>=0.14), you should use read_sql_query/table (read_frame is deprecated) with an sqlalchemy engine:
import pandas as pd
import sqlalchemy
import psycopg2
engine = sqlalchemy.create_engine("postgresql+psycopg2://...")
the_frame = pd.read_sql_query("SELECT * FROM %s;" % name_of_table, engine)
the_frame = pd.read_sql_table(name_of_table, engine)
Here is an alternate method:
# run the sql query (conn is an open SQLAlchemy connection; on SQLAlchemy 1.4+ wrap the string in sqlalchemy.text())
result = conn.execute(sql)
# load the rows into a dataframe, reusing the result's column names
df = DataFrame(data=list(result), columns=result.keys())

How to write DataFrame to postgres table

There is a DataFrame.to_sql method, but it works only for mysql, sqlite and oracle databases. I can't pass a postgres connection or a sqlalchemy engine to this method.
Starting from pandas 0.14 (released end of May 2014), postgresql is supported. The sql module now uses sqlalchemy to support different database flavors. You can pass a sqlalchemy engine for a postgresql database (see docs). E.g.:
from sqlalchemy import create_engine
engine = create_engine('postgresql://username:password@localhost:5432/mydatabase')
df.to_sql('table_name', engine)
You are correct that in pandas up to version 0.13.1 postgresql was not supported. If you need to use an older version of pandas, here is a patched version of pandas.io.sql: https://gist.github.com/jorisvandenbossche/10841234.
I wrote this a while ago, so I cannot fully guarantee that it always works, but the basis should be there. If you put that file in your working directory and import it, then you should be able to do (where con is a postgresql connection):
import sql # the patched version (file is named sql.py)
sql.write_frame(df, 'table_name', con, flavor='postgresql')
Faster option:
The following code will copy your pandas DataFrame to a postgres DB much faster than the df.to_sql method, and you won't need any intermediate csv file to store the df.
Create an engine based on your DB specifications.
Create a table in your postgres DB that has the same number of columns as the DataFrame (df).
The data in the df will then be inserted into your postgres table.
from sqlalchemy import create_engine
import psycopg2
import io
If you want to replace the table, you can first recreate it as an empty table with the normal to_sql method, using just the headers from the df, and then load the entire (time-consuming) df into the DB with COPY:
engine = create_engine(
    'postgresql+psycopg2://username:password@host:port/database')

# Drop old table and create new empty table
df.head(0).to_sql('table_name', engine, if_exists='replace', index=False)

conn = engine.raw_connection()
cur = conn.cursor()
output = io.StringIO()
df.to_csv(output, sep='\t', header=False, index=False)
output.seek(0)
contents = output.getvalue()
cur.copy_from(output, 'table_name', null="")  # null values become ''
conn.commit()
cur.close()
conn.close()
Pandas 0.24.0+ solution
In pandas 0.24.0 a new to_sql parameter (method) was introduced that enables fast bulk writes, with a Postgres COPY example in the docs. You can learn more about it here: https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#io-sql-method
import csv
from io import StringIO
from sqlalchemy import create_engine

def psql_insert_copy(table, conn, keys, data_iter):
    # gets a DBAPI connection that can provide a cursor
    dbapi_conn = conn.connection
    with dbapi_conn.cursor() as cur:
        s_buf = StringIO()
        writer = csv.writer(s_buf)
        writer.writerows(data_iter)
        s_buf.seek(0)

        columns = ', '.join('"{}"'.format(k) for k in keys)
        if table.schema:
            table_name = '{}.{}'.format(table.schema, table.name)
        else:
            table_name = table.name

        sql = 'COPY {} ({}) FROM STDIN WITH CSV'.format(
            table_name, columns)
        cur.copy_expert(sql=sql, file=s_buf)

engine = create_engine('postgresql://myusername:mypassword@myhost:5432/mydatabase')
df.to_sql('table_name', engine, method=psql_insert_copy)
This is how I did it.
It may be faster because it is using execute_batch:
import psycopg2.extras

# df is the dataframe
if len(df) > 0:
    df_columns = list(df)
    # create (col1,col2,...)
    columns = ",".join(df_columns)

    # create VALUES('%s', '%s',...) one '%s' per column
    values = "VALUES({})".format(",".join(["%s" for _ in df_columns]))

    # create INSERT INTO table (columns) VALUES('%s',...)
    insert_stmt = "INSERT INTO {} ({}) {}".format(table, columns, values)

    cur = conn.cursor()
    psycopg2.extras.execute_batch(cur, insert_stmt, df.values)
    conn.commit()
    cur.close()
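As a hedged side note, not part of the original answer: psycopg2.extras also provides execute_values, which folds many rows into a single multi-row VALUES statement and is often faster still; a minimal sketch with the same table/conn placeholders:
import psycopg2.extras

insert_stmt = "INSERT INTO {} ({}) VALUES %s".format(table, ",".join(list(df)))
cur = conn.cursor()
psycopg2.extras.execute_values(cur, insert_stmt, df.values.tolist(), page_size=1000)
conn.commit()
cur.close()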
Faster way to write a df to a table in a custom schema with/without index:
"""
Faster way to write df to table.
Slower way is to use df.to_sql()
"""
from io import StringIO
from pandas import DataFrame
from sqlalchemy.engine.base import Engine
class WriteDfToTableWithIndexMixin:
#classmethod
def write_df_to_table_with_index(
cls,
df: DataFrame,
table_name: str,
schema_name: str,
engine: Engine
):
"""
Truncate existing table and load df into table.
Keep each column as string to avoid datatype conflicts.
"""
df.head(0).to_sql(table_name, engine, if_exists='replace',
schema=schema_name, index=True, index_label='id')
conn = engine.raw_connection()
cur = conn.cursor()
output = StringIO()
df.to_csv(output, sep='\t', header=False,
index=True, index_label='id')
output.seek(0)
contents = output.getvalue()
cur.copy_expert(f"COPY {schema_name}.{table_name} FROM STDIN", output)
conn.commit()
class WriteDfToTableWithoutIndexMixin:
#classmethod
def write_df_to_table_without_index(
cls,
df: DataFrame,
table_name: str,
schema_name: str,
engine: Engine
):
"""
Truncate existing table and load df into table.
Keep each column as string to avoid datatype conflicts.
"""
df.head(0).to_sql(table_name, engine, if_exists='replace',
schema=schema_name, index=False)
conn = engine.raw_connection()
cur = conn.cursor()
output = StringIO()
df.to_csv(output, sep='\t', header=False, index=False)
output.seek(0)
contents = output.getvalue()
cur.copy_expert(f"COPY {schema_name}.{table_name} FROM STDIN", output)
conn.commit()
If you have JSON values in a column of your df, the above method will still load all the data, but the JSON column will end up in an odd text format, so casting that column to ::json later may raise an error. In that case use to_sql(). Add method='multi' to speed things up and chunksize to prevent your machine from freezing:
df.to_sql(table_name, engine, if_exists='replace', schema=schema_name, index=False, method='multi', chunksize=1000)
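A possible refinement, assumed rather than taken from the original answer: to_sql's dtype argument lets you map the JSON column to a real JSON type so the values are stored as json/jsonb instead of text ('json_col' is a hypothetical column name):
import sqlalchemy

df.to_sql(table_name, engine, if_exists='replace', schema=schema_name, index=False,
          method='multi', chunksize=1000,
          dtype={'json_col': sqlalchemy.types.JSON})  # or sqlalchemy.dialects.postgresql.JSONB for Postgres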
Using psycopg2, you can use native SQL commands to write data into a postgres table.
import psycopg2
import pandas as pd

conn = psycopg2.connect("dbname='{db}' user='{user}' host='{host}' port='{port}' password='{passwd}'".format(
    user=pg_user,
    passwd=pg_pass,
    host=pg_host,
    port=pg_port,
    db=pg_db))
cur = conn.cursor()

def insertIntoTable(df, table):
    """
    Using cursor.executemany() to insert the dataframe
    """
    # Create a list of tuples from the dataframe values (set() drops duplicate rows)
    tuples = list(set([tuple(x) for x in df.to_numpy()]))
    # Comma-separated dataframe columns
    cols = ','.join(list(df.columns))
    # One %s placeholder per column (the original hard-coded four of them)
    placeholders = ','.join(['%s'] * len(df.columns))
    # SQL query to execute
    query = "INSERT INTO %s(%s) VALUES(%s)" % (table, cols, placeholders)
    try:
        cur.executemany(query, tuples)
        conn.commit()
    except (Exception, psycopg2.DatabaseError) as error:
        print("Error: %s" % error)
        conn.rollback()
        return 1
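For completeness, a small usage sketch under the same assumptions (conn and cur are the globals opened above; 'table_name' is a placeholder):
insertIntoTable(df, 'table_name')  # returns 1 only if the insert failed
cur.close()
conn.close()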
For Python 2.7 and Pandas 0.24.2 and using Psycopg2
Psycopg2 Connection Module
import io
import psycopg2
from psycopg2.extras import RealDictCursor

def dbConnect(db_parm, username_parm, host_parm, pw_parm):
    # Parse in connection information
    credentials = {'host': host_parm, 'database': db_parm, 'user': username_parm, 'password': pw_parm}
    conn = psycopg2.connect(**credentials)
    conn.autocommit = True  # auto-commit each entry to the database
    conn.cursor_factory = RealDictCursor
    cur = conn.cursor()
    print("Connected Successfully to DB: " + str(db_parm) + "@" + str(host_parm))
    return conn, cur
Connect to the database
conn, cur = dbConnect(databaseName, dbUser, dbHost, dbPwd)
Assuming the dataframe is already present as df:
output = io.BytesIO()  # For Python 3 use io.StringIO
df.to_csv(output, sep='\t', header=True, index=False)
output.seek(0)  # Required for rewinding the buffer
copy_query = "COPY mem_info FROM STDIN csv DELIMITER '\t' NULL '' ESCAPE '\\' HEADER "  # Replace mem_info with your table name
cur.copy_expert(copy_query, output)
conn.commit()
Create an engine (where dialect='postgresql' or 'mysql', etc.):
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

engine = create_engine(f'{dialect}://{user_name}@{host}:{port}/{db_name}')
Session = sessionmaker(bind=engine)

with Session() as session:
    df = pd.read_csv(path + f'/{file}')
    df.to_sql('table_name', con=engine, if_exists='append', index=False)
