Assuming I have 30 databases in MySQL from db1 to db30. I have a python script that will create engine and connect to one db,
import pandas as pd
import MySQLdb
from sqlalchemy import create_engine, text

# Load the price list and keep only the columns that are written to MySQL.
df = pd.read_csv('pricelist.csv')
new_df = df[['date', 'time', 'new_price']]

# SQLAlchemy URLs separate the credentials from the host with '@':
# dialect+driver://user:password@host:port/database
engine = create_engine('mysql+mysqldb://root:python@localhost:3306/db1', echo=False)

# Stage the rows in a scratch table first, then merge them into newlist.
new_df.to_sql(name='temporary_table', con=engine, if_exists='append', index=False)

# engine.begin() opens a connection, commits when the block succeeds and
# releases the connection afterwards.
with engine.begin() as cnx:
    # text() keeps the raw SQL executable on SQLAlchemy versions that no
    # longer accept plain strings.
    sql_insert_query_new = text('REPLACE INTO newlist (SELECT * FROM temporary_table)')
    cnx.execute(sql_insert_query_new)
    cnx.execute(text("DROP TABLE temporary_table"))
Now with the above script, I will need to have 30 python scripts to create engine and connect each db to conduct the query. And to call these 30 scripts, I will need to use a batch file on a task scheduler.
Is there an optimized way of connecting to multiple databases with a single script? I read up on sessions and don't think they can take in multiple databases. And if I have 30 Python scripts each creating an engine and a connection, will there be any issue in terms of processing performance? Eventually, I will have hundreds of databases in MySQL.
Thanks!
Note: Each database has their own unique table names.
Using Python 3.7
I think maybe you can do something like this:
import pandas as pd
import MySQLdb
from sqlalchemy import create_engine, text

df = pd.read_csv('pricelist.csv')
new_df = df[['date', 'time', 'new_price']]

db_names = [f'db{i}' for i in range(1, 31)]
# One staging-table name per database, in the same order as db_names.
# The trailing ... is a placeholder for the remaining names.
table_names = ['temporary_table', 'table_name_2', 'table_name_3', ...]

for db, tb in zip(db_names, table_names):
    # Credentials and host are separated by '@' in a SQLAlchemy URL.
    engine = create_engine(f'mysql+mysqldb://root:python@localhost:3306/{db}', echo=False)
    try:
        new_df.to_sql(name=tb, con=engine, if_exists='append', index=False)
        with engine.begin() as cnx:
            cnx.execute(text(f'REPLACE INTO newlist (SELECT * FROM {tb})'))
            cnx.execute(text(f"DROP TABLE {tb}"))
    finally:
        # Each engine owns its own connection pool; release it before moving
        # on so pools do not pile up as the number of databases grows.
        engine.dispose()
Related
I am trying to create table in database as this is my connection as the below code:
# Raw pyodbc connection to the server, with autocommit switched on.
ODBC_CONNECTION_STRING = (
    "driver={SQL Server};server=xxxxxxxxxxx; database=master; trusted_connection=true"
)
conn = pyodbc.connect(ODBC_CONNECTION_STRING, autocommit=True, Trusted_Connection='Yes')
crsr = conn.cursor()

# Separate SQLAlchemy engine (and an open connection) for the same database.
SQLALCHEMY_URL = 'mssql+pyodbc://xxxxxxxxxxx/master?driver=SQL+Server+Native+Client+11.0'
engine = create_engine(SQLALCHEMY_URL)
connection = engine.connect()
It's just a pyodbc connection,
and this is the error I found:
Traceback (most recent call last):
File "C:/Users/haroo501/PycharmProjects/ToolUpdated/app.py", line 22, in <module>
dfeed_gsm_relation_m.push_dfeed_gsm_relation_sql()
File "C:\Users\haroo501\PycharmProjects\ToolUpdated\meta_data\dfeed_gsm_relation_m.py", line 31, in push_dfeed_gsm_relation_sql
if connec.crsr.dialect.has_table(connec.crsr, DATAF_GSM_RELATION):
AttributeError: 'pyodbc.Cursor' object has no attribute 'dialect'
and this is the code that creates the table in the database using MetaData():
from sqlalchemy import MetaData, Table, Column, Integer, String, Date, Float
from database import connec
import sqlalchemy as db
import pandas as pd
import numpy as np
from txt_to_csv import convert_to_csv
import os
def push_dfeed_gsm_relation_sql():
    """Recreate the df_gsm_relation table and bulk-insert the GSM relation rows.

    Steps, in order: define the table, drop it when it is reported as
    existing, create it, convert gsmrelation_mnm.txt into
    gsmrelation_mnm.xlsx, read gsmrelation_mnm.csv from this module's
    directory, and pass every row to connec.crsr.execute in one call.
    """
    # Maps each database column name to its header in the source file.
    column_map = {
        'cell_name': 'Cellname',
        'n_cell_name': 'Ncellname',
        'technology': 'Technology',
    }
    # table name in database 'df_gsm_relation'
    DATAF_GSM_RELATION = 'df_gsm_relation'
    first_col, second_col, third_col = list(column_map.keys())

    metadata = MetaData()
    relation_table = Table(
        DATAF_GSM_RELATION, metadata,
        Column('id', Integer, primary_key=True),
        Column(first_col, Integer),
        Column(second_col, Integer),
        Column(third_col, String),
    )

    # Start from a clean table: drop any existing one, then create it.
    if connec.crsr.dialect.has_table(connec.crsr, DATAF_GSM_RELATION):
        relation_table.drop(connec.crsr)
    metadata.create_all(connec.crsr)

    # Convert the ';'-separated text export into an Excel workbook.
    txt_frame = pd.read_csv('gsmrelation_mnm.txt', sep=';')
    original_headers = list(txt_frame.columns)
    txt_frame.reset_index(inplace=True)
    txt_frame.drop(columns=txt_frame.columns[-1], inplace=True)
    # Pair the column labels left after the reset/drop with the original headers.
    header_renames = dict(zip(list(txt_frame.columns), original_headers))
    txt_frame.rename(columns=header_renames, inplace=True)
    txt_frame.to_excel('gsmrelation_mnm.xlsx', 'Sheet1', index=False)

    # Load the CSV that sits next to this module, typing the three columns.
    csv_path = os.path.join(os.path.dirname(__file__), 'gsmrelation_mnm.csv')
    relation_frame = pd.read_csv(csv_path, dtype={
        column_map[first_col]: int,
        column_map[second_col]: int,
        column_map[third_col]: str,
    })

    insert_statement = db.insert(relation_table)
    integer_columns = (first_col, second_col)
    records = []
    # Row ids are assigned sequentially starting at 1.
    for row_id, row_label in enumerate(relation_frame.index, start=1):
        row = relation_frame.loc[row_label]
        record = {'id': row_id}
        for db_col, source_col in column_map.items():
            value = row[source_col]
            record[db_col] = int(value) if db_col in integer_columns else value
        records.append(record)

    connec.crsr.execute(insert_statement, records)
as the problem in this part:
if connec.crsr.dialect.has_table(connec.crsr, DATAF_GSM_RELATION):
dataf_gsm_relation_relations.drop(connec.crsr)
dataf_gsm_relation_cols_meta.create_all(connec.crsr)
I know dialect function is related to from sqlalchemy import create_engine and this is my old connection as I changed to new connection using import pyodbc .....
So how can I solve this case using pyodbc module?
Edited
The other way to solve this is how to CREATE and DROP table in existing database using SQL ALCHEMY
and this is the related code example:
from database import connec
def create_db():
    """Send a CREATE DATABASE statement for MyNewDatabase through connec.crsr."""
    connec.crsr.execute("CREATE DATABASE MyNewDatabase")
def delete_database():
    """Send a DROP DATABASE statement for MyNewDatabase through connec.crsr."""
    connec.crsr.execute("DROP DATABASE MyNewDatabase")
You cannot just import a completely different module and expect it to behave the same :)
Dialects are what SQLalchemy uses to communicate to different drivers.
In this instance Pyodbc IS the driver so it has no need for a dialect.
From SQLAlchemy:
Dialects
The dialect is the system SQLAlchemy uses to communicate with various types of DBAPI implementations and databases. The sections that follow contain reference documentation and notes specific to the usage of each backend, as well as notes for the various DBAPIs.
All dialects require that an appropriate DBAPI driver is installed.
Included Dialects
PostgreSQL
MySQL
SQLite
Oracle
Microsoft SQL Server
Microsoft SQL Server
Support for the Microsoft SQL Server database.
DBAPI Support
The following dialect/DBAPI options are available. Please refer to individual
DBAPI sections for connect information.
PyODBC
mxODBC
pymssql
zxJDBC for Jython
adodbapi
Judging from the error and by looking at the PyODBC Wiki Documentation
I think this line:
if connec.crsr.dialect.has_table(connec.crsr, DATAF_GSM_RELATION):
should read:
# Does table 'DATAF_GSM_RELATION' exist?
if connec.crsr.tables(table=DATAF_GSM_RELATION).fetchone():
...
I have created a database with pandas :
import numpy as np
import sqlite3
import pandas as pd
import sqlite3
import sqlalchemy
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
# Ten rows of normally distributed samples in two columns, A and B.
df = pd.DataFrame(np.random.normal(loc=0, scale=1, size=(10, 2)), columns=['A', 'B'])
path = 'sqlite:////home/username/Desktop/example.db'
engine = create_engine(path, echo=False)
# Append the rows to the "flows" table.
df.to_sql(name='flows', con=engine, if_exists='append', index=False)
# This is only to show I am able to read the database
df_l = pd.read_sql(sql="SELECT * FROM flows WHERE A>0 AND B<0", con=engine)
Now I would like to add one or more indexes to the database.
In this case I would like to first create an index on column A only, and then an index on both columns.
How can I do that?
If possible I would like a solution that uses only SqlAlchemy so that it is independent from the choice of the database.
You should use reflection to get hold of the table that pandas created for you.
With reference to:
SQLAlchemy Reflecting Database Objects
A Table object can be instructed to load information about itself from
the corresponding database schema object already existing within the
database. This process is called reflection. In the most simple case
you need only specify the table name, a MetaData object, and the
autoload=True flag. If the MetaData is not persistently bound, also
add the autoload_with argument:
you could try this:
# Reflect the existing schema so the table pandas created is available
# as a Table object.
meta = sqlalchemy.MetaData()
meta.reflect(bind=engine)
flows = meta.tables['flows']
# alternative of retrieving the table from meta:
#flows = sqlalchemy.Table('flows', meta, autoload=True, autoload_with=engine)
my_index = sqlalchemy.Index('flows_idx', flows.columns.get('A'))
my_index.create(bind=engine)
# lets confirm it is there
# sqlalchemy.inspect() returns an Inspector for the engine, so the
# `reflection` module does not have to be imported separately.
inspector = sqlalchemy.inspect(engine)
print(inspector.get_indexes('flows'))
This seems to work for me. You will have to define the variables psql_URI, table, and col yourself. Here I assume that the table name / column name may be in (partial) uppercase but you want the name of the index to be lowercase.
Derived from the answer here: https://stackoverflow.com/a/72976667/3406189
import sqlalchemy
from sqlalchemy.orm import Session

# psql_URI, table and col must be defined by the caller.
engine_psql = sqlalchemy.create_engine(psql_URI)
# Run in AUTOCOMMIT so the DDL takes effect without an explicit commit.
autocommit_engine = engine_psql.execution_options(isolation_level="AUTOCOMMIT")
with Session(autocommit_engine) as session:
    # Raw SQL has to be wrapped in text() before Session.execute() will
    # accept it on current SQLAlchemy versions.
    session.execute(
        sqlalchemy.text(
            f'CREATE INDEX IF NOT EXISTS idx_{table.lower()}_{col.lower()} ON sdi_ai."{table}" ("{col}");'
        )
    )
I would like to connect to MS SQL Server and execute a SQL command with python. I am familiar with using SQLAlchemy to create SQL tables, pandas DataFrames, etc., but how can I execute the SQL in a .sql file with python/pandas/SQLAlchemy? Is there a better way to do it?
For example I have the file 'update.sql' that contains the SQL text:
truncate table dev.dbo.jobs
truncate table dev.dbo.workers
go
insert into dev.dbo.jobs select * from test.dbo.jobs
insert into dev.dbo.workers select * from test.dbo.workers
You can use SQLAlchemy's connection.execute to run raw SQL queries. If you have the sql statements stored in a file then it might look something like this:
from sqlalchemy import create_engine
from sqlalchemy.sql import text

engine = create_engine('urltodb')

# engine.begin() commits when the block succeeds, rolls back on error and
# always releases the connection. The file is read one statement per line.
with engine.begin() as conn, open('file.sql', 'r') as f:
    for line in f:
        statement = line.strip()
        # Skip blank lines and "go": it is a batch separator understood by
        # client tools, not a statement the server can execute.
        if not statement or statement.lower() == 'go':
            continue
        conn.execute(text(statement))
I can connect to my local mysql database from python, and I can create, select from, and insert individual rows.
My question is: can I directly instruct mysqldb to take an entire dataframe and insert it into an existing table, or do I need to iterate over the rows?
In either case, what would the python script look like for a very simple table with ID and two data columns, and a matching dataframe?
Update:
There is now a to_sql method, which is the preferred way to do this, rather than write_frame:
df.to_sql(con=con, name='table_name_for_df', if_exists='replace', flavor='mysql')
Also note: the syntax may change in pandas 0.14...
You can set up the connection with MySQLdb:
from pandas.io import sql
import MySQLdb
con = MySQLdb.connect() # may need to add some other options to connect
Setting the flavor of write_frame to 'mysql' means you can write to mysql:
sql.write_frame(df, con=con, name='table_name_for_df',
if_exists='replace', flavor='mysql')
The argument if_exists tells pandas how to deal if the table already exists:
if_exists: {'fail', 'replace', 'append'}, default 'fail'
fail: If table exists, do nothing.
replace: If table exists, drop it, recreate it, and insert data.
append: If table exists, insert data. Create if does not exist.
Although the write_frame docs currently suggest it only works on sqlite, mysql appears to be supported and in fact there is quite a bit of mysql testing in the codebase.
Andy Hayden mentioned the correct function (to_sql). In this answer, I'll give a complete example, which I tested with Python 3.5 but should also work for Python 2.7 (and Python 3.x):
First, let's create the dataframe:
# Create dataframe
import pandas as pd
import numpy as np
# Seed NumPy's global generator so the sample values are reproducible.
np.random.seed(0)
number_of_samples = 10
column_order = ['feature1', 'feature2', 'class']
# The draws are made in column order: feature1, feature2, then class.
column_data = {
    'feature1': np.random.random(number_of_samples),
    'feature2': np.random.random(number_of_samples),
    'class': np.random.binomial(2, 0.1, size=number_of_samples),
}
frame = pd.DataFrame(column_data, columns=column_order)
print(frame)
Which gives:
feature1 feature2 class
0 0.548814 0.791725 1
1 0.715189 0.528895 0
2 0.602763 0.568045 0
3 0.544883 0.925597 0
4 0.423655 0.071036 0
5 0.645894 0.087129 0
6 0.437587 0.020218 0
7 0.891773 0.832620 1
8 0.963663 0.778157 0
9 0.383442 0.870012 0
To import this dataframe into a MySQL table:
# Import dataframe into MySQL
import sqlalchemy
database_username = 'ENTER USERNAME'
database_password = 'ENTER USERNAME PASSWORD'
database_ip = 'ENTER DATABASE IP'
database_name = 'ENTER DATABASE NAME'
# In a SQLAlchemy URL the credentials are separated from the host by '@':
# dialect+driver://user:password@host/database
database_connection = sqlalchemy.create_engine(
    'mysql+mysqlconnector://{0}:{1}@{2}/{3}'.format(
        database_username, database_password, database_ip, database_name))
# if_exists='replace' drops and recreates the table before inserting.
frame.to_sql(con=database_connection, name='table_name_for_df', if_exists='replace')
One trick is that MySQLdb doesn't work with Python 3.x. So instead we use mysqlconnector, which may be installed as follows:
pip install mysql-connector==2.1.4 # version avoids Protobuf error
Output:
Note that to_sql creates the table as well as the columns if they do not already exist in the database.
You can do it by using pymysql:
For example, let's suppose you have a MySQL database with the next user, password, host and port and you want to write in the database 'data_2', if it is already there or not.
import pymysql
user = 'root'
passw = 'my-secret-pw-for-mysql-12ud'
host = '172.17.0.2'
port = 3306
database = 'data_2'
If you already have the database created:
# Connect directly to the database named by `database`.
conn = pymysql.connect(host=host,
port=port,
user=user,
passwd=passw,
db=database,
charset='utf8')
# The table is named after the database (name=database).
# NOTE(review): pandas warns that the 'mysql' flavor with a DBAPI connection
# is deprecated in favor of SQLAlchemy connectables.
data.to_sql(name=database, con=conn, if_exists = 'replace', index=False, flavor = 'mysql')
If you do NOT have the database created, also valid when the database is already there:
# First connect without selecting a database so it can be created if missing.
conn = pymysql.connect(host=host, port=port, user=user, passwd=passw)
conn.cursor().execute("CREATE DATABASE IF NOT EXISTS {0} ".format(database))
# Reconnect, this time selecting the database that now exists.
conn = pymysql.connect(host=host,
port=port,
user=user,
passwd=passw,
db=database,
charset='utf8')
# The table is named after the database (name=database).
# NOTE(review): pandas warns that the 'mysql' flavor with a DBAPI connection
# is deprecated in favor of SQLAlchemy connectables.
data.to_sql(name=database, con=conn, if_exists = 'replace', index=False, flavor = 'mysql')
Similar threads:
Writing to MySQL database with pandas using SQLAlchemy, to_sql
Writing a Pandas Dataframe to MySQL
The to_sql method works for me.
However, keep in mind that passing a raw DBAPI connection this way looks like it is going to be deprecated in favor of SQLAlchemy:
FutureWarning: The 'mysql' flavor with DBAPI connection is deprecated and will be removed in future versions. MySQL will be further supported with SQLAlchemy connectables. chunksize=chunksize, dtype=dtype)
Python 2 + 3
Prerequisites
Pandas
MySQL server
sqlalchemy
pymysql: pure python mysql client
Code
from pandas.io import sql
from sqlalchemy import create_engine
# Credentials and host are separated by '@' in a SQLAlchemy URL.
engine = create_engine("mysql+pymysql://{user}:{pw}@localhost/{db}"
                       .format(user="root",
                               pw="your_password",
                               db="pandas"))
# if_exists='replace' drops and recreates the table before inserting.
df.to_sql(con=engine, name='table_name', if_exists='replace')
This should do the trick:
import pandas as pd
import pymysql
pymysql.install_as_MySQLdb()
from sqlalchemy import create_engine
# Create engine (credentials and host are separated by '@' in the URL)
engine = create_engine('mysql://USER_NAME_HERE:PASS_HERE@HOST_ADRESS_HERE/DB_NAME_HERE')
# Open a connection inside a transaction; it is committed on success, rolled
# back on failure, and released either way.
with engine.begin() as connection:
    df.to_sql(name='INSERT_TABLE_NAME_HERE/INSERT_NEW_TABLE_NAME', con=connection, if_exists='append', index=False)
You might output your DataFrame as a csv file and then use mysqlimport to import your csv into your mysql.
EDIT
It seems pandas's built-in SQL utilities provide a write_frame function, but it only works with SQLite.
I found something useful, you might try this
This has worked for me. At first I've created only the database, no predefined table I created.
from platform import python_version
print(python_version())
3.7.3
path='glass.data'
df=pd.read_csv(path)
df.head()
!conda install sqlalchemy
!conda install pymysql
pd.__version__
'0.24.2'
sqlalchemy.__version__
'1.3.20'
restarted the Kernel after installation.
from sqlalchemy import create_engine

# Credentials and host are separated by '@' in a SQLAlchemy URL.
engine = create_engine('mysql+pymysql://USER:PASSWORD@HOST:PORT/DATABASE_NAME', echo=False)
try:
    df.to_sql(name='glasstable', con=engine, index=False, if_exists='replace')
    print('Sucessfully written to Database!!!')
except Exception as e:
    # Report the failure instead of stopping the notebook cell.
    print(e)
# Write the frame to table "owner" in schema "aws", storing the index as column "id".
df.to_sql(name="owner", con=db_connection, schema='aws', if_exists='replace', index=True, index_label='id')
The documentation for Pandas has numerous examples of best practices for working with data stored in various formats.
However, I am unable to find any good examples for working with databases like MySQL for example.
Can anyone point me to links or give some code snippets of how to convert query results using mysql-python to data frames in Pandas efficiently ?
As Wes says, io/sql's read_sql will do it, once you've gotten a database connection using a DBI compatible library. We can look at two short examples using the MySQLdb and cx_Oracle libraries to connect to Oracle and MySQL and query their data dictionaries. Here is the example for cx_Oracle:
import pandas as pd
import cx_Oracle

ora_conn = cx_Oracle.connect('your_connection_string')
try:
    df_ora = pd.read_sql('select * from user_objects', con=ora_conn)
    print('loaded dataframe from Oracle. # Records: ', len(df_ora))
finally:
    # Close the connection even when the query fails.
    ora_conn.close()
And here is the equivalent example for MySQLdb:
import MySQLdb

mysql_cn = MySQLdb.connect(host='myhost',
                           port=3306, user='myusername', passwd='mypassword',
                           db='information_schema')
try:
    df_mysql = pd.read_sql('select * from VIEWS;', con=mysql_cn)
    print('loaded dataframe from MySQL. records:', len(df_mysql))
finally:
    # Close the connection even when the query fails.
    mysql_cn.close()
For recent readers of this question: pandas have the following warning in their docs for version 14.0:
Warning: Some of the existing functions or function aliases have been
deprecated and will be removed in future versions. This includes:
tquery, uquery, read_frame, frame_query, write_frame.
And:
Warning: The support for the ‘mysql’ flavor when using DBAPI connection objects has
been deprecated. MySQL will be further supported with SQLAlchemy
engines (GH6900).
This makes many of the answers here outdated. You should use sqlalchemy:
from sqlalchemy import create_engine
import pandas as pd
# URL template: dialect://user:password@host:port/database
engine = create_engine('dialect://user:pass@host:port/schema', echo=False)
# index_col makes the ID column the index of the resulting frame.
f = pd.read_sql_query('SELECT * FROM mytable', engine, index_col='ID')
For the record, here is an example using a sqlite database:
import pandas as pd
import sqlite3

with sqlite3.connect("whatever.sqlite") as con:
    sql = "SELECT * FROM table_name"
    df = pd.read_sql_query(sql, con)
    # print() call form, required on Python 3.
    print(df.shape)
I prefer to create queries with SQLAlchemy, and then make a DataFrame from it. SQLAlchemy makes it easier to combine SQL conditions Pythonically if you intend to mix and match things over and over.
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import Table
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from pandas import DataFrame
import datetime
# We are connecting to an existing service
# (URL template: dialect://user:password@host:port/database)
engine = create_engine('dialect://user:pwd@host:port/db', echo=False)
Session = sessionmaker(bind=engine)
session = Session()
Base = declarative_base()

# And we want to query an existing table
tablename = Table('tablename',
                  Base.metadata,
                  autoload=True,
                  autoload_with=engine,
                  schema='ownername')

# These are the "Where" parameters, but I could as easily
# create joins and limit results
us = tablename.c.country_code.in_(['US', 'MX'])
dc = tablename.c.locn_name.like('%DC%')
dt = tablename.c.arr_date >= datetime.date.today()  # Give me convenience or...

# Combine the three conditions with AND.
q = session.query(tablename).filter(us & dc & dt)  # That's where the magic happens!!!
def querydb(query):
    """
    Function to execute query and return DataFrame.

    `query` must provide `.all()` (the result rows) and
    `.column_descriptions` (a sequence of mappings with a 'name' key).
    Column names are passed to the constructor so that an empty result
    still yields a frame with the expected columns.
    """
    column_names = [x['name'] for x in query.column_descriptions]
    return DataFrame(query.all(), columns=column_names)
querydb(q)
MySQL example:
import MySQLdb as db
from pandas import DataFrame
from pandas.io.sql import frame_query
# Placeholder connection arguments; replace them with real values.
database = db.connect('localhost','username','password','database')
# NOTE(review): frame_query is deprecated; pandas.read_sql is the replacement.
data = frame_query("SELECT * FROM data", database)
The same syntax also works for MS SQL Server using pyodbc.
import pyodbc
import pandas.io.sql as psql
# Placeholder server, database and credentials; replace them with real values.
cnxn = pyodbc.connect('DRIVER={SQL Server};SERVER=servername;DATABASE=mydb;UID=username;PWD=password')
# The cursor is created but the query below runs on the connection itself.
cursor = cnxn.cursor()
sql = ("""select * from mytable""")
# NOTE(review): frame_query is deprecated; pandas.read_sql is the replacement.
df = psql.frame_query(sql, cnxn)
cnxn.close()
And this is how you connect to PostgreSQL using psycopg2 driver (install with "apt-get install python-psycopg2" if you're on Debian Linux derivative OS).
import pandas.io.sql as psql
import psycopg2
# Placeholder connection settings; replace them with real values.
conn = psycopg2.connect("dbname='datawarehouse' user='user1' host='localhost' password='uberdba'")
# NOTE(review): the query aggregates with sum() but has no GROUP BY on
# month_idx — confirm it runs as intended.
q = """select month_idx, sum(payment) from bi_some_table"""
# NOTE(review): frame_query is deprecated; pandas.read_sql is the replacement.
df3 = psql.frame_query(q, conn)
For Sybase the following works (with http://python-sybase.sourceforge.net)
import pandas.io.sql as psql
import Sybase
# "<Query>", "<dsn>", "<user>" and "<pwd>" are placeholders to fill in.
# NOTE(review): frame_query is deprecated; pandas.read_sql is the replacement.
df = psql.frame_query("<Query>", con=Sybase.connect("<dsn>", "<user>", "<pwd>"))
pandas.io.sql.frame_query is deprecated. Use pandas.read_sql instead.
import the module
import pandas as pd
import oursql
connect
conn = oursql.connect(host="localhost", user="me", passwd="mypassword", db="classicmodels")
sql = "Select customerName, city,country from customers order by customerName,country,city"
df_mysql = pd.read_sql(sql, conn)
# print() call form, required on Python 3.
print(df_mysql)
That works just fine, and so does pandas.io.sql.frame_query (with the deprecation warning). The database used is the sample database from the MySQL tutorial.
This should work just fine.
import MySQLdb as mdb
import pandas as pd

# Plain ASCII quotes are required; typographic quotes are a syntax error.
con = mdb.connect('127.0.0.1', 'root', 'password', 'database_name')
with con:
    cur = con.cursor()
    cur.execute("select random_number_one, random_number_two, random_number_three from randomness.a_random_table")
    rows = cur.fetchall()

# Turn each fetched row into a list so the frame gets one column per field.
df = pd.DataFrame([list(row) for row in rows])
df.rename(columns={0: 'Random Number One', 1: 'Random Number Two', 2: 'Random Number Three'}, inplace=True)
print(df.head(20))
This helped for me for connecting to AWS MYSQL(RDS) from python 3.x based lambda function and loading into a pandas DataFrame
import json
import boto3
import pymysql
import pandas as pd
user = 'username'
password = 'XXXXXXX'
client = boto3.client('rds')
def lambda_handler(event, context):
    """Read up to ten rows of TableName into a DataFrame and print it.

    `event` and `context` are accepted but not used. The module-level
    `user` and `password` values supply the credentials.
    """
    conn = pymysql.connect(host='xxx.xxxxus-west-2.rds.amazonaws.com', port=3306, user=user, passwd=password, db='database name', connect_timeout=5)
    try:
        df = pd.read_sql('select * from TableName limit 10', con=conn)
    finally:
        # Release the connection even when the query fails.
        conn.close()
    print(df)
    # TODO implement
    #return {
    #    'statusCode': 200,
    #    'df': df
    #}
For Postgres users
import psycopg2
import pandas as pd

# In a connection string the database is named with the "dbname" keyword;
# "database" is only accepted as a keyword argument to connect().
conn = psycopg2.connect("dbname='datawarehouse' user='user1' host='localhost' password='uberdba'")
customers = 'select * from customers'
customers_df = pd.read_sql(customers, conn)
# Bare expression: displays the frame when run in a notebook.
customers_df