Why can't I raw insert a list of dicts with SQLalchemy ?
import os
import sqlalchemy
import pandas as pd
def connect_unix_socket() -> sqlalchemy.engine:
db_user = os.environ["DB_USER"]
db_pass = os.environ["DB_PASS"]
db_name = os.environ["DB_NAME"]
unix_socket_path = os.environ["INSTANCE_UNIX_SOCKET"]
return sqlalchemy.create_engine(
sqlalchemy.engine.url.URL.create(
drivername="postgresql+pg8000",
username=db_user,
password=db_pass,
database=db_name,
query={"unix_sock": f"{unix_socket_path}/.s.PGSQL.5432"},
)
)
def _insert_ecoproduct(df: pd.DataFrame) -> None:
db = connect_unix_socket()
db_matching = {
'gtin': 'ecoproduct_id',
'ITEM_NAME_AS_IN_MARKETPLACE' : 'ecoproductname',
'ITEM_WEIGHT_WITH_PACKAGE_KG' : 'ecoproductweight',
'ITEM_HEIGHT_CM' : 'ecoproductlength',
'ITEM_WIDTH_CM' : 'ecoproductwidth',
'test_gtin' : 'gtin_test',
'batteryembedded' : 'batteryembedded'
}
df = df[db_matching.keys()]
df.rename(columns=db_matching, inplace=True)
data = df.to_dict(orient='records')
sql_query = """INSERT INTO ecoproducts(
ecoproduct_id,
ecoproductname,
ecoproductweight,
ecoproductlength,
ecoproductwidth,
gtin_test,
batteryembedded)
VALUES (%(ecoproduct_id)s, %(ecoproductname)s,%(ecoproductweight)s,%(ecoproductlength)s,
%(ecoproductwidth)s,%(gtin_test)s,%(batteryembedded)s)
ON CONFLICT(ecoproduct_id) DO NOTHING;"""
with db.connect() as conn:
result = conn.exec_driver_sql(sql_query, data)
print(f"{result.rowcount} new rows were inserted.")
I keep having this error :
Is it possible to map parameters with th dialect pg8000 ? Or maybe I should use psycopg2 ?
What is the problem here ?
EDIT 1: see variable data details :
print(data)
print(type(data))
[{'ecoproduct_id': '6941487202157', 'ecoproductname': 'HUAWEI FreeBuds Pro Bluetooth sans Fil ', 'ecoproductweight': '4', 'ecoproductlength': '0.220', 'ecoproductwidth': '0.99', 'gtin_test': False, 'batteryembedded': 0}]
<class 'list'>
Is it possible to map [named] parameters with th (sic) dialect pg8000 ?
Yes. Using a SQLAlchemy text() object allows us to use named parameters even if the underlying DBAPI does not.
import sqlalchemy as sa
sql_query = """\
INSERT INTO ecoproducts(
ecoproduct_id,
ecoproductname,
ecoproductweight,
ecoproductlength,
ecoproductwidth,
gtin_test,
batteryembedded)
VALUES (
:ecoproduct_id,
:ecoproductname,
:ecoproductweight,
:ecoproductlength,
:ecoproductwidth,
:gtin_test,
:batteryembedded)
ON CONFLICT(ecoproduct_id) DO NOTHING;
"""
result = conn.execute(sa.text(sql_query), data)
Related
Before asking this question, I have read many links about UPSERT operation on Postgres:
PostgreSQL Upsert Using INSERT ON CONFLICT statement
Anyway to Upsert database using PostgreSQL in Python
But the question is different from them, since the functionality is different. What I want is to implement something like pandas to_sql function which has the following features:
Automatically creates table
Keeps the data types of each column
The only drawback of to_sql is that it doesn't UPSERT operation on Postgres. Is there anyway to implement the expected functionality (automatically create table based on columns, perform UPSERT operation and keep data types) by passing dataframe to it?
Previously implemented code using Pandas to_sql function:
class PostgreSQL:
def __init__(self):
postgres_config = config_dict[Consts.POSTGRES.value]
self.host = postgres_config[Consts.HOST.value]
self.port = postgres_config[Consts.PORT.value]
self.db_name = postgres_config[Consts.DB_NAME.value]
self.username = postgres_config[Consts.USERNAME.value]
self.password = postgres_config[Consts.PASSWORD.value]
def get_connection(self) -> object:
url_schema = Consts.POSTGRES_URL_SCHEMA.value.format(
self.username, self.password, self.host, self.port, self.db_name
)
try:
engine = create_engine(url_schema)
return engine
except Exception as e:
logger.error('Make sure you have provided correct credentials for the DB connection.')
raise e
def save_df_to_db(self, df: object, table_name: str) -> None:
df.to_sql(table_name, con=self.get_connection(), if_exists='append')
I have written a very generic code that performs UPSERT which is not supported officially in Postgres (until December 2021), using Pandas dataframe and in an efficient way.
By using the following code, it will update the existing primary key otherwise it will create a new table (in case table name doesn't exist) and add new records to the table.
Code:
import numpy as np
import pandas as pd
from sqlalchemy import create_engine, Table
from sqlalchemy.dialects.postgresql import insert
from sqlalchemy.ext.automap import automap_base
class PostgreSQL:
def __init__(self):
postgres_config = config_dict[Consts.POSTGRES.value]
self.host = postgres_config[Consts.HOST.value]
self.port = postgres_config[Consts.PORT.value]
self.db_name = postgres_config[Consts.DB_NAME.value]
self.username = postgres_config[Consts.USERNAME.value]
self.password = postgres_config[Consts.PASSWORD.value]
def get_connection(self) -> object:
url_schema = 'postgresql://{}:{}#{}:{}/{}'.format(
self.username, self.password, self.host, self.port, self.db_name
)
try:
engine = create_engine(url_schema)
return engine
except Exception as e:
logger.error('Make sure you have provided correct credentials for the DB connection.')
raise e
def run_query(self, query: str) -> list:
engine = self.get_connection()
return engine.execute(query).fetchall()
def save_df_to_db(self, df: object, table_name: str) -> None:
root_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..')
engine = self.get_connection()
add_primary_key_query = get_query(root_dir, Directories.COMMON.value, 'add_primary_key.sql', table_name)
table_existence_query = get_query(root_dir, Directories.COMMON.value, 'table_existence.sql', table_name)
if not engine.execute(table_existence_query).first()[0]: # if table does not exist
logger.info('Create table automatically and from scratch!')
df.to_sql(table_name, con=self.get_connection(), if_exists='append')
engine.execute(add_primary_key_query)
else:
try:
df = df.replace("NaT", None)
df = df.replace(pd.NaT, None)
df = df.replace({pd.NaT: None})
df_dict = df.to_dict('records')
except AttributeError as e:
logger.error('Empty Dataframe!')
raise e
with engine.connect() as connection:
logger.info('Table already exists!')
base = automap_base()
base.prepare(engine, reflect=True,)
target_table = Table(table_name, base.metadata,
autoload=True, autoload_with=engine,)
chunks = [df_dict[i:i + 1000] for i in range(0, len(df_dict), 1000)]
for chunk in chunks:
stmt = insert(target_table).values(chunk)
update_dict = {c.name: c for c in stmt.excluded if not c.primary_key}
connection.execute(stmt.on_conflict_do_update(
constraint=f'{table_name}_pkey',
set_=update_dict)
)
logger.info('Saving data is successfully done.')
Table existence query:
SELECT EXISTS (
SELECT FROM information_schema.tables
WHERE table_schema = 'public'
AND table_name = '{}'
);
Add primary key query:
ALTER TABLE {} add primary key (id);
I have been looking since yesterday about the way I could convert the output of an SQL Query into a Pandas dataframe.
For example a code that does this :
data = select * from table
I've tried so many codes I've found on the internet but nothing seems to work.
Note that my database is stored in Azure DataBricks and I can only access the table using its URL.
Thank you so much !
Hope this would help you out. Both insertion & selection are in this code for reference.
def db_insert_user_level_info(table_name):
#Call Your DF Here , as an argument in the function or pass directly
df=df_parameter
params = urllib.parse.quote_plus("DRIVER={SQL Server};SERVER=DESKTOP-ITAJUJ2;DATABASE=githubAnalytics")
engine = create_engine("mssql+pyodbc:///?odbc_connect=%s" % params)
engine.connect()
table_row_count=select_row_count(table_name)
df_row_count=df.shape[0]
if table_row_count == df_row_count:
print("Data Cannot Be Inserted Because The Row Count is Same")
else:
df.to_sql(name=table_name,con=engine, index=False, if_exists='append')
print("********************************** DONE EXECTUTED SUCCESSFULLY ***************************************************")
def select_row_count(table_name):
cnxn = pyodbc.connect("Driver={SQL Server Native Client 11.0};"
"Server=DESKTOP-ITAJUJ2;"
"Database=githubAnalytics;"
"Trusted_Connection=yes;")
cur = cnxn.cursor()
try:
db_cmd = "SELECT count(*) FROM "+table_name
res = cur.execute(db_cmd)
# Do something with your result set, for example print out all the results:
for x in res:
return x[0]
except:
print("Table is not Available , Please Wait...")
Using sqlalchemy to connect to the database, and the built-in method read_sql_query from pandas to go straight to a DataFrame:
import pandas as pd
from sqlalchemy import create_engine
engine = create_engine(url)
connection = engine.connect()
query = "SELECT * FROM table"
df = pd.read_sql_query(query,connection)
I'm trying to connect to SQL server 2019 via sqlalchemy. I'm using both mssql+pyodbc and msql+pyodbc_mssql, but on both cases it cannot connect, always returns default_schema_name not defined.
Already checked database, user schema defined and everything.
Example:
from sqlalchemy import create_engine
import urllib
from sqlalchemy import create_engine
server = 'server'
database = 'db'
username = 'user'
password = 'pass'
#cnxn = 'DRIVER={ODBC Driver 17 for SQL Server};SERVER='+server+';DATABASE='+database+';UID='+username+';PWD='+ password+';Trusted_Connection=yes'
cnxn = 'DSN=SQL Server;SERVER='+server+';DATABASE='+database+';UID='+username+';PWD='+ password+';Trusted_Connection=yes'
params = urllib.parse.quote_plus(cnxn)
engine = create_engine('mssql+pyodbc:///?odbc_connect=%s' % params)
cnxn = engine.connect()
return None, dialect.default_schema_name
AttributeError: 'MSDialect_pyodbc' object has no attribute 'default_schema_name'
TIA.....
Hopefully the following provides enough for a minimum viable sample. I'm using it in a larger script to move 12m rows 3x a day, and for that reason I've included an example of chunking that I pinched from elsewhere.
#Set up enterprise DB connection
# Enterprise DB to be used
DRIVER = "ODBC Driver 17 for SQL Server"
USERNAME = "SQLUsername"
PSSWD = "SQLPassword"
SERVERNAME = "SERVERNAME01"
INSTANCENAME = "\SQL_01"
DB = "DATABASE_Name"
TABLE = "Table_Name"
#Set up SQL database connection variable / path
#I have included this as an example that can be used to chunk data up
conn_executemany = sql.create_engine(
f"mssql+pyodbc://{USERNAME}:{PSSWD}#{SERVERNAME}{INSTANCENAME}/{DB}?driver={DRIVER}", fast_executemany=True
)
#Used for SQL Loading from Pandas DF
def chunker(seq, size):
return (seq[pos : pos + size] for pos in range(0, len(seq), size))
#Used for SQL Loading from Pandas DF
def insert_with_progress(df, engine, table="", schema="dbo"):
con = engine.connect()
# Replace table
#engine.execute(f"DROP TABLE IF EXISTS {schema}.{table};") #This only works for SQL Server 2016 or greater
try:
engine.execute(f"DROP TABLE Temp_WeatherGrids;")
except:
print("Unable to drop temp table")
try:
engine.execute(f"CREATE TABLE [dbo].[Temp_WeatherGrids]([col_01] [int] NULL,[Location] [int] NULL,[DateTime] [datetime] NULL,[Counts] [real] NULL) ON [PRIMARY] TEXTIMAGE_ON [PRIMARY];")
except:
print("Unable to create temp table")
# Insert with progress
SQL_SERVER_CHUNK_LIMIT = 250000
chunksize = math.floor(SQL_SERVER_CHUNK_LIMIT / len(df.columns))
for chunk in chunker(df, chunksize):
chunk.to_sql(
name=table,
con=con,
if_exists="append",
index=False
)
if __name__ == '__main__':
# intialise data. Example - make your own dataframe. DateTime should be pandas datetime objects.
data = {'Col_01':[0, 1, 2, 3],
'Location':['Bar', 'Pub', 'Brewery', 'Bottleshop'],
'DateTime':["1/1/2018", "1/1/2019", "1/1/2020", "1/1/2021"],
'Counts':[1, 2, 3, 4}
# Create DataFrame
df = pd.DataFrame(data)
insert_with_progress(df, conn_executemany, table=TABLE)
del [df]
I have a python script that loads , transform and calculates data. In sql-server there's a stored procedure that requires a table valued parameter, 2 required parameters and 2 optional parameters. In sql server I can call this SP:
USE [InstName]
GO
DECLARE #return_value int
DECLARE #MergeOnColumn core.MatchColumnTable
INSERT INTO #MergeOnColumn
SELECT 'foo.ExternalInput','bar.ExternalInput'
EXEC #return_value = [core].[_TableData]
#Target = N'[dbname].[tablename1]',
#Source = N'[dbname].[table2]',
#MergeOnColumn = #MergeOnColumn,
#Opt1Param = False,
#Opt2Param = False
SELECT 'Return Value' = #return_value
GO
after a comprehensive search I found the following post:
How to call stored procedure with SQLAlchemy that requires a user-defined-type Table parameter
it suggests to use PYTDS and the sql-alchemy 's dialect 'sql alchemy pytds' to call a SP with table valued parameters.
with this post and the documentation I created the following Python script:
import pandas as pd
import pytds
from pytds import login
import sqlalchemy as sa
from sqlalchemy import create_engine
import sqlalchemy_pytds
def connect():
return pytds.connect(dsn='ServerName',database='DBName', auth=login.SspiAuth())
engine = sa.create_engine('mssql+pytds://[ServerName]', creator=connect)
conn = engine.raw_connection()
with conn.cursor() as cur:
arg = ("foo.ExternalInput","bar.ExternalInput")
tvp = pytds.TableValuedParam(type_name="MergeOnColumn", rows=(arg))
cur.execute('EXEC test_proc %s', ("[dbname].[table2]", "[dbname].[table1]", tvp,))
cur.fetchall()
When I run this code I get the following error message:
TypeError: not all arguments converted during string formatting
Doe anyone know how to pass in the multiple arguments correctly or has a suggestion how I could handle this call SP directly?
On the basis of the comments to my question i've managed to get the stored procedure running with table valued parameters (and get the return values from the SP)
The final script is as follows:
import pandas as pd
import pytds
from pytds import login
import sqlalchemy as sa
from sqlalchemy import create_engine
import sqlalchemy_pytds
def connect():
return pytds.connect(dsn='ServerName',database='DBName',autocommit=True, auth=login.SspiAuth())
engine = sa.create_engine('mssql+pytds://[ServerName]', creator=connect)
conn = engine.raw_connection()
with conn.cursor() as cur:
arg = [["foo.ExternalInput","bar.ExternalInput"]]
tvp = pytds.TableValuedParam(type_name="core.MatchColumnTable", rows=arg)
cur.execute("EXEC test_proc #Target = N'[dbname].[tablename1]', #Source = N'[dbname].[table2]', #CleanTarget = 0, #UseColumnsFromTarget = 0, #MergeOnColumn = %s", (tvp,))
result = cur.fetchall()
print(result)
The autocommit is added in the connection (to commit the transaction in the cursor), the table valued parameter (marchcolumntable) expects 2 columns, so the arg is modified to fit 2 columns.
The parameters that are required besides the tvp are included in the exec string. The last param in the execute string is the name of the tvp parameter(mergeoncolumn) that is filled with the tvp.
optionally you can add the result status or row count as descripted in the pytds documentation:
https://python-tds.readthedocs.io/en/latest/index.html
Note!: in the stored procedure you have to make sure that the
SET NOCOUNT ON is added otherwise you wont get any results back to Python
pytds
Python DBAPI driver for MSSQL using pure Python TDS (Tabular Data Stream) protocol implementation
I used pytds for merge / upsert via a stored procedure targeting a SQL Server.
Example
Here are a example of the basic functions, a row data is represented by Tuple:
def get_connection(instance: str, database: str, user: str, password: str):
return pytds.connect(
dsn=instance, database=database, user=user, password=password, autocommit=True
)
def execute_with_tvp(connection: pytds.Connection, procedure_name: str, rows: list):
with connection.cursor() as cursor:
tvp = pytds.TableValuedParam(type_name=my_type, rows=rows)
cursor.callproc(procedure_name, tvp)
mssql+pyodbc://
pyodbc added support for table-valued parameters (TVPs) in version 4.0.25, released 2018-12-13. Simply supply the TVP value as a list of tuples:
proc_name = "so51930062"
type_name = proc_name + "Type"
# set up test environment
with engine.begin() as conn:
conn.exec_driver_sql(f"""\
DROP PROCEDURE IF EXISTS {proc_name}
""")
conn.exec_driver_sql(f"""\
DROP TYPE IF EXISTS {type_name}
""")
conn.exec_driver_sql(f"""\
CREATE TYPE {type_name} AS TABLE (
id int,
txt nvarchar(50)
)
""")
conn.exec_driver_sql(f"""\
CREATE PROCEDURE {proc_name}
#prefix nvarchar(10),
#tvp {type_name} READONLY
AS
BEGIN
SET NOCOUNT ON;
SELECT id, #prefix + txt AS new_txt FROM #tvp;
END
""")
#run test
with engine.begin() as conn:
data = {"prefix": "new_", "tvp": [(1, "foo"), (2, "bar")]}
sql = f"{{CALL {proc_name} (:prefix, :tvp)}}"
print(conn.execute(sa.text(sql), data).fetchall())
# [(1, 'new_foo'), (2, 'new_bar')]
I recently transitioned from using SQLite for most of my data storage and management needs to MySQL. I think I've finally gotten the correct libraries installed to work with Python 3.6, but now I am having trouble creating a new table from a dataframe in the MySQL database.
Here are the libraries I import:
import pandas as pd
import mysql.connector
from sqlalchemy import create_engine
In my code, I first create a dataframe from a CSV file (no issues here).
def csv_to_df(infile):
return pd.read_csv(infile)
Then I establish a connection to the MySQL database using this def function:
def mysql_connection():
user = 'root'
password = 'abc'
host = '127.0.0.1'
port = '3306'
database = 'a001_db'
engine = create_engine("mysql://{0}:{1}#{2}:{3}/{4}?charset=utf8".format(user, password, host, port, database))
return engine
Lastly, I use the pandas function "to_sql" to create the database table in the MySQL database:
def df_to_mysql(df, db_tbl_name, conn=mysql_connection(), index=False):
df.to_sql(con = conn, name = db_tbl_name, if_exists='replace', index = False)
I run the code using this line:
df_to_mysql(csv_to_df(r'path/to/file.csv'), 'new_database_table')
The yields the following error:
InvalidRequestError: Could not reflect: requested table(s) not available in Engine(mysql://root:***#127.0.0.1:3306/a001_db?charset=utf8): (new_database_table)
I think this is telling me that I must first create a table in the database before passing the data in the dataframe to this table, but I'm not 100% positive about that. Regardless, I'm looking for a way to create a table in a MySQL database without manually creating the table first (I have many CSVs, each with 50+ fields, that have to be uploaded as new tables in a MySQL database).
Any suggestions?
I took an approach suggested by aws_apprentice above which was to create the table first, then write data to the table.
The code below first auto-generates a mysql table from a df (auto defining table names and datatypes) then writes the df data to that table.
There were a couple of hiccups I had to overcome, such as: unnamed csv columns, determining the correct data type for each field in the mysql table.
I'm sure there are multiple other (better?) ways to do this, but this seems to work.
import pandas as pd
from sqlalchemy import create_engine
infile = r'path/to/file.csv'
db = 'a001_db'
db_tbl_name = 'a001_rd004_db004'
'''
Load a csv file into a dataframe; if csv does not have headers, use the headers arg to create a list of headers; rename unnamed columns to conform to mysql column requirements
'''
def csv_to_df(infile, headers = []):
if len(headers) == 0:
df = pd.read_csv(infile)
else:
df = pd.read_csv(infile, header = None)
df.columns = headers
for r in range(10):
try:
df.rename( columns={'Unnamed: {0}'.format(r):'Unnamed{0}'.format(r)}, inplace=True )
except:
pass
return df
'''
Create a mapping of df dtypes to mysql data types (not perfect, but close enough)
'''
def dtype_mapping():
return {'object' : 'TEXT',
'int64' : 'INT',
'float64' : 'FLOAT',
'datetime64' : 'DATETIME',
'bool' : 'TINYINT',
'category' : 'TEXT',
'timedelta[ns]' : 'TEXT'}
'''
Create a sqlalchemy engine
'''
def mysql_engine(user = 'root', password = 'abc', host = '127.0.0.1', port = '3306', database = 'a001_db'):
engine = create_engine("mysql://{0}:{1}#{2}:{3}/{4}?charset=utf8".format(user, password, host, port, database))
return engine
'''
Create a mysql connection from sqlalchemy engine
'''
def mysql_conn(engine):
conn = engine.raw_connection()
return conn
'''
Create sql input for table names and types
'''
def gen_tbl_cols_sql(df):
dmap = dtype_mapping()
sql = "pi_db_uid INT AUTO_INCREMENT PRIMARY KEY"
df1 = df.rename(columns = {"" : "nocolname"})
hdrs = df1.dtypes.index
hdrs_list = [(hdr, str(df1[hdr].dtype)) for hdr in hdrs]
for hl in hdrs_list:
sql += " ,{0} {1}".format(hl[0], dmap[hl[1]])
return sql
'''
Create a mysql table from a df
'''
def create_mysql_tbl_schema(df, conn, db, tbl_name):
tbl_cols_sql = gen_tbl_cols_sql(df)
sql = "USE {0}; CREATE TABLE {1} ({2})".format(db, tbl_name, tbl_cols_sql)
cur = conn.cursor()
cur.execute(sql)
cur.close()
conn.commit()
'''
Write df data to newly create mysql table
'''
def df_to_mysql(df, engine, tbl_name):
df.to_sql(tbl_name, engine, if_exists='replace')
df = csv_to_df(infile)
create_mysql_tbl_schema(df, mysql_conn(mysql_engine()), db, db_tbl_name)
df_to_mysql(df, mysql_engine(), db_tbl_name)
This
connection = engine.connect()
df.to_sql(con=connection, name='TBL_NAME', schema='SCHEMA', index=False, if_exists='replace')
works with oracle DB in specific schema wothout errors, but will not work if you have limited permissions. And note that table names is case sensative.