I'm trying to connect to SQL Server 2019 via SQLAlchemy. I've tried both mssql+pyodbc and mssql+pyodbc_mssql, but in both cases it cannot connect and always fails with an error saying default_schema_name is not defined.
I have already checked the database: the user's default schema is defined, and everything else looks correct.
Example:
# Minimal connection attempt against SQL Server through pyodbc.
# `urllib.parse` must be imported explicitly: a bare `import urllib` does not
# guarantee that the `parse` submodule is loaded.
import urllib.parse

from sqlalchemy import create_engine

server = 'server'
database = 'db'
username = 'user'
password = 'pass'

# Alternative: name the driver explicitly instead of going through a DSN.
# odbc_connect_str = 'DRIVER={ODBC Driver 17 for SQL Server};SERVER='+server+';DATABASE='+database+';UID='+username+';PWD='+ password+';Trusted_Connection=yes'
# NOTE(review): the string combines UID/PWD with Trusted_Connection=yes;
# confirm which authentication mode is actually intended.
odbc_connect_str = 'DSN=SQL Server;SERVER='+server+';DATABASE='+database+';UID='+username+';PWD='+ password+';Trusted_Connection=yes'

# The raw ODBC string is passed through the URL, so it has to be URL-quoted.
params = urllib.parse.quote_plus(odbc_connect_str)
engine = create_engine('mssql+pyodbc:///?odbc_connect=%s' % params)

# Keep the connection under its own name (the original reused `cnxn` for both
# the ODBC string and the connection object).
cnxn = engine.connect()
return None, dialect.default_schema_name
AttributeError: 'MSDialect_pyodbc' object has no attribute 'default_schema_name'
TIA.....
Hopefully the following provides enough for a minimum viable sample. I'm using it in a larger script to move 12m rows 3x a day, and for that reason I've included an example of chunking that I pinched from elsewhere.
# Set up enterprise DB connection
# Enterprise DB to be used
DRIVER = "ODBC Driver 17 for SQL Server"
USERNAME = "SQLUsername"
PSSWD = "SQLPassword"
SERVERNAME = "SERVERNAME01"
# Raw string: "\S" is not a valid escape sequence, and the backslash that
# separates server and instance name must be kept literally.
INSTANCENAME = r"\SQL_01"
DB = "DATABASE_Name"
TABLE = "Table_Name"

# Set up SQL database connection variable / path.
# The credentials are separated from the host by "@" (user:password@host).
# fast_executemany=True is passed through to the pyodbc dialect and is used
# here together with the chunked insert below.
conn_executemany = sql.create_engine(
    f"mssql+pyodbc://{USERNAME}:{PSSWD}@{SERVERNAME}{INSTANCENAME}/{DB}?driver={DRIVER}",
    fast_executemany=True,
)
# Used for SQL loading from a pandas DataFrame
def chunker(seq, size):
    """Yield consecutive slices of *seq* holding at most *size* items each.

    Args:
        seq: Any sliceable object that supports ``len()`` (list, str,
            DataFrame, ...).
        size: Maximum number of items per slice; must be positive.

    Returns:
        A generator of slices ``seq[pos:pos + size]``.

    Raises:
        ValueError: If *size* is not positive. Without this check a negative
            size would silently produce no chunks at all.
    """
    if size <= 0:
        raise ValueError("size must be a positive integer")
    return (seq[pos:pos + size] for pos in range(0, len(seq), size))
# Used for SQL loading from a pandas DataFrame
def insert_with_progress(df, engine, table="", schema="dbo"):
    """Recreate the temp table and append *df* to *table* in chunks.

    Args:
        df: DataFrame to load.
        engine: SQLAlchemy engine used for the DDL and for the inserts.
        table: Name of the table that ``to_sql`` appends to.
        schema: Schema passed to ``to_sql`` (previously accepted but unused).

    Note:
        The DROP/CREATE statements are hard-coded to ``Temp_WeatherGrids``
        and do not depend on *table*. Failures of either statement are
        reported and then ignored (best effort), as before.
    """
    # Upper bound on the number of values (rows * columns) sent per chunk.
    SQL_SERVER_CHUNK_LIMIT = 250000

    # The context manager guarantees the connection is returned to the pool,
    # even when an insert fails halfway through.
    with engine.connect() as con:
        # Replace table
        # engine.execute(f"DROP TABLE IF EXISTS {schema}.{table};")  # only works for SQL Server 2016 or greater
        try:
            engine.execute("DROP TABLE Temp_WeatherGrids;")
        except Exception:
            print("Unable to drop temp table")
        try:
            engine.execute("CREATE TABLE [dbo].[Temp_WeatherGrids]([col_01] [int] NULL,[Location] [int] NULL,[DateTime] [datetime] NULL,[Counts] [real] NULL) ON [PRIMARY] TEXTIMAGE_ON [PRIMARY];")
        except Exception:
            print("Unable to create temp table")

        # Insert with progress. Guard against a frame without columns
        # (division by zero) and against a chunk size that rounds down to 0.
        column_count = max(len(df.columns), 1)
        chunksize = max(SQL_SERVER_CHUNK_LIMIT // column_count, 1)
        for chunk in chunker(df, chunksize):
            chunk.to_sql(
                name=table,
                con=con,
                schema=schema,
                if_exists="append",
                index=False,
            )
if __name__ == '__main__':
    # Initialise data. Example - make your own dataframe.
    # DateTime should be pandas datetime objects.
    data = {'Col_01': [0, 1, 2, 3],
            'Location': ['Bar', 'Pub', 'Brewery', 'Bottleshop'],
            'DateTime': ["1/1/2018", "1/1/2019", "1/1/2020", "1/1/2021"],
            'Counts': [1, 2, 3, 4]}  # list closed with "]" (was a stray "}")
    # Create DataFrame
    df = pd.DataFrame(data)
    # Convert the date strings so the column really holds datetime values.
    df['DateTime'] = pd.to_datetime(df['DateTime'], format="%d/%m/%Y")
    insert_with_progress(df, conn_executemany, table=TABLE)
    # Release the frame once it has been loaded.
    del df
Related
Before asking this question, I have read many links about UPSERT operation on Postgres:
PostgreSQL Upsert Using INSERT ON CONFLICT statement
Anyway to Upsert database using PostgreSQL in Python
But the question is different from them, since the functionality is different. What I want is to implement something like pandas to_sql function which has the following features:
Automatically creates table
Keeps the data types of each column
The only drawback of to_sql is that it doesn't support the UPSERT operation on Postgres. Is there any way to implement the expected functionality (automatically create the table based on the columns, perform the UPSERT operation and keep the data types) by passing a dataframe to it?
Previously implemented code using Pandas to_sql function:
class PostgreSQL:
    """Holds Postgres connection settings and appends DataFrames to tables."""

    def __init__(self):
        # All settings come from the POSTGRES section of the shared config.
        settings = config_dict[Consts.POSTGRES.value]
        self.host = settings[Consts.HOST.value]
        self.port = settings[Consts.PORT.value]
        self.db_name = settings[Consts.DB_NAME.value]
        self.username = settings[Consts.USERNAME.value]
        self.password = settings[Consts.PASSWORD.value]

    def get_connection(self) -> object:
        """Build and return an engine for the configured database.

        Any failure is logged and then re-raised.
        """
        template = Consts.POSTGRES_URL_SCHEMA.value
        url_schema = template.format(
            self.username,
            self.password,
            self.host,
            self.port,
            self.db_name,
        )
        try:
            return create_engine(url_schema)
        except Exception as e:
            logger.error('Make sure you have provided correct credentials for the DB connection.')
            raise e

    def save_df_to_db(self, df: object, table_name: str) -> None:
        """Append *df* to *table_name* through a freshly built engine."""
        target = self.get_connection()
        df.to_sql(table_name, con=target, if_exists='append')
I have written generic code that performs an UPSERT, which is not officially supported by pandas' to_sql (as of December 2021), using a Pandas dataframe and in an efficient way.
With the following code, rows whose primary key already exists are updated and new rows are inserted; if the table does not exist yet, it is created first and the records are added to it.
Code:
import numpy as np
import pandas as pd
from sqlalchemy import create_engine, Table
from sqlalchemy.dialects.postgresql import insert
from sqlalchemy.ext.automap import automap_base
class PostgreSQL:
    """Postgres helper that creates a table from a DataFrame or upserts into it."""

    def __init__(self):
        # All settings come from the POSTGRES section of the shared config.
        postgres_config = config_dict[Consts.POSTGRES.value]
        self.host = postgres_config[Consts.HOST.value]
        self.port = postgres_config[Consts.PORT.value]
        self.db_name = postgres_config[Consts.DB_NAME.value]
        self.username = postgres_config[Consts.USERNAME.value]
        self.password = postgres_config[Consts.PASSWORD.value]

    def get_connection(self) -> object:
        """Build and return an engine for the configured database.

        Raises:
            Exception: Whatever ``create_engine`` raises, after logging a hint.
        """
        # Credentials are separated from the host by "@": user:password@host:port/db
        url_schema = 'postgresql://{}:{}@{}:{}/{}'.format(
            self.username, self.password, self.host, self.port, self.db_name
        )
        try:
            return create_engine(url_schema)
        except Exception:
            logger.error('Make sure you have provided correct credentials for the DB connection.')
            raise

    def run_query(self, query: str) -> list:
        """Execute *query* and return all result rows."""
        engine = self.get_connection()
        return engine.execute(query).fetchall()

    def save_df_to_db(self, df: object, table_name: str) -> None:
        """Create *table_name* from *df*, or upsert *df* into the existing table.

        If the table does not exist it is created via ``to_sql`` and then the
        add-primary-key query is run. Otherwise the rows are written in chunks
        of 1000 with ``INSERT ... ON CONFLICT DO UPDATE`` against the
        ``<table_name>_pkey`` constraint.

        Raises:
            AttributeError: If *df* lacks the DataFrame methods used here
                (logged as 'Empty Dataframe!').
        """
        root_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..')
        engine = self.get_connection()
        add_primary_key_query = get_query(root_dir, Directories.COMMON.value, 'add_primary_key.sql', table_name)
        table_existence_query = get_query(root_dir, Directories.COMMON.value, 'table_existence.sql', table_name)

        if not engine.execute(table_existence_query).first()[0]:  # if table does not exist
            logger.info('Create table automatically and from scratch!')
            # Reuse the engine created above instead of building a second one.
            df.to_sql(table_name, con=engine, if_exists='append')
            engine.execute(add_primary_key_query)
        else:
            try:
                # Use the mapping form of replace(): the positional form
                # replace(x, None) is treated as a forward-fill on older
                # pandas versions instead of substituting None.
                df = df.replace({"NaT": None}).replace({pd.NaT: None})
                df_dict = df.to_dict('records')
            except AttributeError:
                logger.error('Empty Dataframe!')
                raise

            with engine.connect() as connection:
                logger.info('Table already exists!')
                base = automap_base()
                base.prepare(engine, reflect=True,)
                target_table = Table(table_name, base.metadata,
                                     autoload=True, autoload_with=engine,)
                # Send the records in batches of 1000 rows per statement.
                chunks = [df_dict[i:i + 1000] for i in range(0, len(df_dict), 1000)]
                for chunk in chunks:
                    stmt = insert(target_table).values(chunk)
                    # On conflict, overwrite every non-key column with the
                    # incoming ("excluded") value.
                    update_dict = {c.name: c for c in stmt.excluded if not c.primary_key}
                    connection.execute(stmt.on_conflict_do_update(
                        constraint=f'{table_name}_pkey',
                        set_=update_dict)
                    )
        logger.info('Saving data is successfully done.')
Table existence query:
/* Returns one boolean row: true when a table with the given name exists
   in the 'public' schema. The placeholder in the table_name filter is
   presumably filled in by the caller before execution - verify. */
SELECT EXISTS (
SELECT FROM information_schema.tables
WHERE table_schema = 'public'
AND table_name = '{}'
);
Add primary key query:
/* Declares the id column as the primary key of the target table; the table
   name placeholder is presumably substituted by the caller - verify. */
ALTER TABLE {} add primary key (id);
I have been looking since yesterday for a way to convert the output of an SQL query into a Pandas dataframe.
For example, code that does something like this:
data = select * from table
I've tried so many codes I've found on the internet but nothing seems to work.
Note that my database is stored in Azure DataBricks and I can only access the table using its URL.
Thank you so much !
Hope this would help you out. Both insertion & selection are in this code for reference.
def db_insert_user_level_info(table_name, df=None):
    """Append a DataFrame to *table_name* unless the row counts already match.

    Args:
        table_name: Target table on the SQL Server instance.
        df: DataFrame to insert. When omitted, the module-level
            ``df_parameter`` is used, as in the original version.

    The table's current row count is compared with the frame's row count; the
    frame is appended only when the two differ (this also covers the case
    where the table does not exist yet and the count is ``None``).
    """
    if df is None:
        # Backward-compatible fallback to the global the original relied on.
        df = df_parameter
    params = urllib.parse.quote_plus("DRIVER={SQL Server};SERVER=DESKTOP-ITAJUJ2;DATABASE=githubAnalytics")
    engine = create_engine("mssql+pyodbc:///?odbc_connect=%s" % params)
    try:
        # Fail fast if the server is unreachable, without leaking the
        # connection (the original opened one and never closed it).
        with engine.connect():
            pass
        table_row_count = select_row_count(table_name)
        df_row_count = df.shape[0]
        if table_row_count == df_row_count:
            print("Data Cannot Be Inserted Because The Row Count is Same")
        else:
            df.to_sql(name=table_name, con=engine, index=False, if_exists='append')
            print("********************************** DONE EXECTUTED SUCCESSFULLY ***************************************************")
    finally:
        # Release pooled connections held by this short-lived engine.
        engine.dispose()
def select_row_count(table_name):
    """Return the number of rows in *table_name*, or ``None`` if unavailable.

    Args:
        table_name: Name of the table to count. It is concatenated into the
            SQL text (identifiers cannot be bound as parameters), so it must
            come from a trusted source.

    Returns:
        The row count, or ``None`` when the query fails (for example because
        the table does not exist); a message is printed in that case.
    """
    cnxn = pyodbc.connect("Driver={SQL Server Native Client 11.0};"
                          "Server=DESKTOP-ITAJUJ2;"
                          "Database=githubAnalytics;"
                          "Trusted_Connection=yes;")
    try:
        cur = cnxn.cursor()
        db_cmd = "SELECT count(*) FROM " + table_name
        row = cur.execute(db_cmd).fetchone()
        return row[0] if row is not None else None
    except pyodbc.Error:
        print("Table is not Available , Please Wait...")
        return None
    finally:
        # Always release the connection; the original never closed it.
        cnxn.close()
Using sqlalchemy to connect to the database, and the built-in method read_sql_query from pandas to go straight to a DataFrame:
import pandas as pd
from sqlalchemy import create_engine
# Build the engine from the database URL, then let pandas run the query.
engine = create_engine(url)
query = "SELECT * FROM table"
# The context manager closes the connection once the frame has been read.
with engine.connect() as connection:
    df = pd.read_sql_query(query, connection)
I have a database file (.db) in SQLite3 format and I was attempting to open it to look at the data inside it. Below is my attempt in Python.
import sqlite3
# Open the SQLite database file and get a cursor on it
con = sqlite3.connect(dbfile)
cur = con.cursor()
# Executing a statement returns an iterable of result rows.
# The statement below still lacks the table name after FROM.
rows = cur.execute("SELECT * FROM ")
for row in rows:
    print(row)
# Close the connection when done
con.close()
For the line ("SELECT * FROM ") , I understand that you have to put in the header of the table after the word "FROM", however, since I can't even open up the file in the first place, I have no idea what header to put. Hence how can I code such that I can open up the data base file to read its contents?
So, you analyzed it all right. After the FROM you have to put in the tablenames. But you can find them out like this:
SELECT name FROM sqlite_master WHERE type = 'table'
In code this looks like this:
# loading in modules
import sqlite3
# creating file path
dbfile = '/home/niklas/Desktop/Stuff/StockData-IBM.db'
# Create a SQL connection to our SQLite database
con = sqlite3.connect(dbfile)
try:
    # creating cursor
    cur = con.cursor()
    # reading all table names; each entry is a one-element row tuple
    table_list = cur.execute("SELECT name FROM sqlite_master WHERE type = 'table'").fetchall()
    # here is your table list
    print(table_list)
finally:
    # Close the connection even if the query fails
    con.close()
That worked very well for me. The way you read the data is already right; just paste in the table names.
If you want to see data for visual analysis as pandas dataframe, the below approach could also be used.
import pandas as pd
import sqlite3
import sqlalchemy
# Let a connection failure propagate: the original printed the error and then
# carried on, only to fail with a NameError on the undefined `conn`.
conn = sqlite3.connect("file.db")
try:
    # To read into a pandas DataFrame we need to know the table name
    cursor = conn.cursor()
    cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
    print(f"Table Name : {cursor.fetchall()}")
    df = pd.read_sql_query('SELECT * FROM Table_Name', conn)
finally:
    # Close the connection whether or not the read succeeded
    conn.close()
# Application and database engine setup shared by the model class below.
from flask import Flask
app = Flask(__name__)
from sqlalchemy import create_engine, select, MetaData, Table
from sqlalchemy.sql import and_, or_
# NOTE(review): the URL separates the credentials from the host with '#';
# SQLAlchemy URLs normally use '@' there, and SQLite URLs are normally file
# paths (e.g. sqlite:///path/to/file.db) - confirm the intended backend/URL.
engine = create_engine('sqlite://username:password#host/databasename')
class UserModel():
    """Read access to the reflected ``users`` table."""

    def __init__(self):
        self.meta = MetaData()
        try:
            # Reflect the table definition from the database.
            self.users = Table("users", self.meta, autoload=True, autoload_with=engine)
        except Exception as e:
            print(e)
            # Keep the attribute defined so get() can report "no data"
            # instead of failing later with an AttributeError.
            self.users = None

    def get(self):
        """Return all users as a list of dicts, or ``None`` if unavailable.

        Each dict holds the ``name``, ``email`` and ``password`` columns.
        NOTE(review): the password column is selected and printed; confirm
        that this is intended.
        """
        if self.users is None:
            return None
        stmt = select([self.users.c.name, self.users.c.email, self.users.c.password])
        print(stmt)
        result = engine.execute(stmt)
        temp = [dict(r) for r in result] if result else None
        print(temp)
        return temp
I try to query some data from a postgres database and add the results into an excel with the below Python code (I am connecting to the server through ssh tunnel and connecting to database using sqlalchemy):
from sshtunnel import SSHTunnelForwarder
from sqlalchemy.orm import sessionmaker
from sqlalchemy import create_engine
import pandas as pd
from pandas import DataFrame
import xlsxwriter
import openpyxl
# Open an SSH tunnel to the server; the remote end is bound to
# localhost:5432 on that server.
with SSHTunnelForwarder(
('<server_ip>', 22),
ssh_username="<server_username>",
ssh_private_key='<private_key_path>',
remote_bind_address=('localhost', 5432)) as server:
server.start()
print "server connected"
# Connect to the DB through the tunnel's local port
local_port = str(server.local_bind_port)
# NOTE(review): the URL has no '@host' part between the password and the
# port - confirm the intended host (presumably the local end of the tunnel).
engine = create_engine('postgresql://<db_username>:<db_password>:' + local_port +'/<db_name>')
Session = sessionmaker(bind=engine)
s = Session()
print 'Database session created'
not_empty_query = False # flag for empty queries
arg_query = "SELECT * from portalpage where id not in (select entityid from sharepermissions where entitytype='PortalPage')"
query = s.execute(arg_query)
print(query)
# NOTE(review): this loop iterates the same result object that fetchall()
# is called on below; rows consumed here are presumably no longer returned
# by fetchall() - verify against the SQLAlchemy result documentation.
for row in query: # check if the query is empty
if (row[0] > 0):
not_empty_query = True
break
if not_empty_query == True: # if the query is not empty, add the response to the Excel file
# This empty frame is overwritten by the assignment on the next line.
df = pd.DataFrame(pd.np.empty((0, 8)))
df = DataFrame(query.fetchall())
print(df)
# Column labels are taken from the result's keys.
df.columns = query.keys()
df.to_excel("out.xlsx", engine="openpyxl", sheet_name="Worksheet_Name")
s.close()
It works for the most of the queries that I tried to execute, however with the above query it returns the below error:
ValueError: Length mismatch: Expected axis has 0 elements, new values have 8 elements
While I was troubleshooting I printed the df variable and got an "Empty DataFrame".
However when I run the same query in my database directly I get results.
I also noticed that in the response, on my database, some columns are empty (not sure if it makes any difference).
Please also find a print screen of the code execution.
The above will work if I remove the below piece of code:
for row in query: #check if the query is empty
if (row[0] > 0):
not_empty_query = True
break
if not_empty_query == True:
However, if I remove this 'for loop' then for other queries (mainly for queries which return empty results) I get the same error.
Please find an example below.
Any ideas?
Please try this. I found that the logic you are using to check if the query returns any data is the problem. I have modified it to have that check first. If there is any row returned then it builds the dataframe and then exports to excel. Please let me know if it works.
from sshtunnel import SSHTunnelForwarder
from sqlalchemy.orm import sessionmaker
from sqlalchemy import create_engine
import pandas as pd
from pandas import DataFrame
import xlsxwriter
import openpyxl
# Open an SSH tunnel to the server; the remote end is bound to
# localhost:5432 on that server.
with SSHTunnelForwarder(
        ('<server_ip>', 22),
        ssh_username="<server_username>",
        ssh_private_key='<private_key_path>',
        remote_bind_address=('localhost', 5432)) as server:
    server.start()
    print("server connected")

    # Connect to the DB through the local end of the tunnel. The URL needs an
    # explicit '@host' between the credentials and the port.
    local_port = str(server.local_bind_port)
    engine = create_engine('postgresql://<db_username>:<db_password>@127.0.0.1:' + local_port + '/<db_name>')
    Session = sessionmaker(bind=engine)
    s = Session()
    print('Database session created')
    try:
        arg_query = "SELECT * from portalpage where id not in (select entityid from sharepermissions where entitytype='PortalPage')"
        # Run the query on the session created above (`conn` was never defined).
        query = s.execute(arg_query)
        # Fetch everything once, then decide based on the fetched rows, so no
        # row is consumed by a separate emptiness check.
        rows = query.fetchall()
        columns = query.keys()
        if len(rows) > 0:
            df = DataFrame(rows)
            df.columns = columns
            df.to_excel("out.xlsx", engine="openpyxl", sheet_name="Worksheet_Name")
        else:
            print("no data")
    finally:
        # Always release the session, also when the query or export fails.
        s.close()
Try to create an empty data frame first.
# Excerpt: build the frame from the query result and export it to Excel.
if not_empty_query == True: # if the query is not empty, add the response to the Excel file
# Empty 0x8 frame created first; it is overwritten by the next assignment.
df = pd.DataFrame(pd.np.empty((0, 8)))
df = DataFrame(query.fetchall())
print(df)
# Column labels are taken from the result's keys.
df.columns = query.keys()
df.to_excel("out.xlsx", engine="openpyxl", sheet_name="Worksheet_Name")
I recently transitioned from using SQLite for most of my data storage and management needs to MySQL. I think I've finally gotten the correct libraries installed to work with Python 3.6, but now I am having trouble creating a new table from a dataframe in the MySQL database.
Here are the libraries I import:
import pandas as pd
import mysql.connector
from sqlalchemy import create_engine
In my code, I first create a dataframe from a CSV file (no issues here).
def csv_to_df(infile):
    """Load the CSV at *infile* and hand it back as a DataFrame."""
    frame = pd.read_csv(infile)
    return frame
Then I establish a connection to the MySQL database using this def function:
def mysql_connection():
    """Create and return a SQLAlchemy engine for the local MySQL database.

    Returns:
        An engine for ``a001_db`` on 127.0.0.1:3306 using utf8.
    """
    user = 'root'
    password = 'abc'
    host = '127.0.0.1'
    port = '3306'
    database = 'a001_db'
    # Credentials are separated from the host by "@": user:password@host:port/db
    engine = create_engine("mysql://{0}:{1}@{2}:{3}/{4}?charset=utf8".format(user, password, host, port, database))
    return engine
Lastly, I use the pandas function "to_sql" to create the database table in the MySQL database:
def df_to_mysql(df, db_tbl_name, conn=None, index=False):
    """Write *df* to the table *db_tbl_name*, replacing it if it exists.

    Args:
        df: DataFrame to store.
        db_tbl_name: Name of the target table.
        conn: Engine/connection for ``to_sql``. When omitted, one is created
            with ``mysql_connection()`` at call time (not at definition time,
            which would open an engine as soon as the module is imported).
        index: Whether to write the DataFrame index as a column. The original
            accepted this argument but always passed ``False``.
    """
    if conn is None:
        conn = mysql_connection()
    df.to_sql(con=conn, name=db_tbl_name, if_exists='replace', index=index)
I run the code using this line:
df_to_mysql(csv_to_df(r'path/to/file.csv'), 'new_database_table')
This yields the following error:
InvalidRequestError: Could not reflect: requested table(s) not available in Engine(mysql://root:***@127.0.0.1:3306/a001_db?charset=utf8): (new_database_table)
I think this is telling me that I must first create a table in the database before passing the data in the dataframe to this table, but I'm not 100% positive about that. Regardless, I'm looking for a way to create a table in a MySQL database without manually creating the table first (I have many CSVs, each with 50+ fields, that have to be uploaded as new tables in a MySQL database).
Any suggestions?
I took an approach suggested by aws_apprentice above which was to create the table first, then write data to the table.
The code below first auto-generates a mysql table from a df (auto defining table names and datatypes) then writes the df data to that table.
There were a couple of hiccups I had to overcome, such as: unnamed csv columns, determining the correct data type for each field in the mysql table.
I'm sure there are multiple other (better?) ways to do this, but this seems to work.
import pandas as pd
from sqlalchemy import create_engine
# Path of the CSV file to load
infile = r'path/to/file.csv'
# Name of the MySQL database that receives the table
db = 'a001_db'
# Name of the table to create and fill
db_tbl_name = 'a001_rd004_db004'
def csv_to_df(infile, headers=None):
    """Load a csv file into a dataframe.

    If the csv does not have headers, pass them via *headers*. Columns that
    pandas auto-named ``'Unnamed: N'`` are renamed to ``'UnnamedN'`` to
    conform to mysql column requirements.

    Args:
        infile: Path or file-like object accepted by ``pd.read_csv``.
        headers: Optional sequence of column names for a header-less csv.
            ``None`` or an empty sequence means the file has its own header.

    Returns:
        The loaded DataFrame with the unnamed columns renamed.
    """
    if headers is None or len(headers) == 0:
        df = pd.read_csv(infile)
    else:
        df = pd.read_csv(infile, header=None)
        df.columns = headers
    # Rename every 'Unnamed: N' column, not only the first ten.
    prefix = 'Unnamed: '
    renames = {
        col: 'Unnamed' + col[len(prefix):]
        for col in df.columns
        if isinstance(col, str)
        and col.startswith(prefix)
        and col[len(prefix):].isdigit()
    }
    df.rename(columns=renames, inplace=True)
    return df
def dtype_mapping():
    """Return a mapping of df dtype names to mysql data types.

    Not perfect, but close enough. The keys are the strings produced by
    ``str(series.dtype)``; datetime and timedelta columns report themselves
    as ``'datetime64[ns]'`` / ``'timedelta64[ns]'``, so those spellings are
    included. The original ``'datetime64'`` and ``'timedelta[ns]'`` keys are
    kept for backward compatibility.
    """
    return {'object': 'TEXT',
            'int64': 'INT',
            'float64': 'FLOAT',
            'datetime64': 'DATETIME',
            'datetime64[ns]': 'DATETIME',
            'bool': 'TINYINT',
            'category': 'TEXT',
            'timedelta[ns]': 'TEXT',
            'timedelta64[ns]': 'TEXT'}
def mysql_engine(user = 'root', password = 'abc', host = '127.0.0.1', port = '3306', database = 'a001_db'):
    """Create a sqlalchemy engine for the given MySQL server and database.

    Args:
        user: Database user name.
        password: Password of *user*.
        host: Server host name or address.
        port: Server port.
        database: Name of the database to connect to.

    Returns:
        The engine built from ``mysql://user:password@host:port/database``.
    """
    # Credentials are separated from the host by "@".
    engine = create_engine("mysql://{0}:{1}@{2}:{3}/{4}?charset=utf8".format(user, password, host, port, database))
    return engine
def mysql_conn(engine):
    """Return the raw connection obtained from the sqlalchemy *engine*."""
    return engine.raw_connection()
def gen_tbl_cols_sql(df):
    """Create the column-definition part of a CREATE TABLE statement for *df*.

    The result starts with an auto-increment primary key ``pi_db_uid``,
    followed by one definition per DataFrame column. A column with an empty
    name is called ``nocolname``.

    Args:
        df: DataFrame whose column names and dtypes define the table.

    Returns:
        A string such as ``"pi_db_uid INT AUTO_INCREMENT PRIMARY KEY ,`a` INT"``.
    """
    dmap = dtype_mapping()
    df1 = df.rename(columns={"": "nocolname"})
    parts = ["pi_db_uid INT AUTO_INCREMENT PRIMARY KEY"]
    # Iterating dtypes directly also copes with duplicate column names.
    for name, dtype in df1.dtypes.items():
        dtype_name = str(dtype)
        # Try the exact dtype name, then the name without its unit suffix
        # ('datetime64[ns]' -> 'datetime64'), and fall back to TEXT rather
        # than failing with a KeyError on an unmapped dtype.
        sql_type = dmap.get(dtype_name) or dmap.get(dtype_name.split('[', 1)[0], 'TEXT')
        # Backtick-quote the identifier so names with spaces or reserved
        # words stay valid; embedded backticks are doubled.
        quoted = "`{0}`".format(str(name).replace("`", "``"))
        parts.append("{0} {1}".format(quoted, sql_type))
    return " ,".join(parts)
def create_mysql_tbl_schema(df, conn, db, tbl_name):
    """Create a mysql table from a df.

    Args:
        df: DataFrame whose columns define the table layout.
        conn: Raw DB-API connection to the MySQL server.
        db: Name of the database to create the table in.
        tbl_name: Name of the table to create.
    """
    tbl_cols_sql = gen_tbl_cols_sql(df)
    cur = conn.cursor()
    try:
        # Run the two statements separately: sending several statements in
        # one execute() is not supported by every driver configuration.
        cur.execute("USE {0}".format(db))
        cur.execute("CREATE TABLE {0} ({1})".format(tbl_name, tbl_cols_sql))
    finally:
        # Close the cursor even when one of the statements fails.
        cur.close()
    conn.commit()
def df_to_mysql(df, engine, tbl_name, if_exists='replace', index=True):
    """Write df data to the mysql table *tbl_name*.

    Args:
        df: DataFrame to write.
        engine: Engine/connection handed to ``to_sql``.
        tbl_name: Name of the target table.
        if_exists: Passed to ``to_sql``. The default ``'replace'`` keeps the
            original behaviour, which drops and recreates the table; pass
            ``'append'`` to keep a table that was created beforehand.
        index: Passed to ``to_sql``; whether the DataFrame index is written
            as a column (default ``True``, as before).
    """
    df.to_sql(tbl_name, engine, if_exists=if_exists, index=index)
# Load the csv, create the table schema, then write the data.
df = csv_to_df(infile)
# Build one engine and reuse it instead of creating a new one per step.
engine = mysql_engine()
conn = mysql_conn(engine)
try:
    create_mysql_tbl_schema(df, conn, db, db_tbl_name)
finally:
    # Return the raw connection once the schema step is done.
    conn.close()
df_to_mysql(df, engine, db_tbl_name)
This
# Write the frame into SCHEMA.TBL_NAME; the context manager closes the
# connection afterwards.
with engine.connect() as connection:
    df.to_sql(con=connection, name='TBL_NAME', schema='SCHEMA', index=False, if_exists='replace')
works with an Oracle DB in a specific schema without errors, but it will not work if you have limited permissions. Also note that table names are case sensitive.