Reading SQL table from Python with merge condition

import pandas as pd
import pyodbc

conn = pyodbc.connect("Driver={??};"
                      "Server=??;"
                      "Database=??;"
                      "Trusted_Connection=yes;")
df1 = pd.read_sql_query("SELECT TOP 10000 * FROM table1", conn)
# pseudocode -- what I want: the rows of table2 whose id_key matches an id from table1
df2 = pd.read_sql_query("SELECT * FROM table2 WHERE id_key = <id from table1>", conn)
Hello,
I have two tables in SQL Server. I want to pull the rows from table2 whose id_key matches an id in table1 (i.e. table2.id_key = table1.id).

First, get df1's ids as a tuple:
ids = tuple(df1['id'].to_list())
print(ids)
# (1, 2)
Then build the query with str.format and read it:
sql = 'SELECT * FROM table2 WHERE id_key IN {}'.format(ids)
print(sql)
# SELECT * FROM table2 WHERE id_key IN (1, 2)
df2 = pd.read_sql(sql, conn)
Full code:
import pandas as pd
import pyodbc

conn = pyodbc.connect("Driver={??};"
                      "Server=??;"
                      "Database=??;"
                      "Trusted_Connection=yes;")
df1 = pd.read_sql_query("SELECT TOP 10000 * FROM table1", conn)
ids = tuple(df1['id'].to_list())
df2_sql = 'SELECT * FROM table2 WHERE id_key IN {}'.format(ids)
df2 = pd.read_sql_query(df2_sql, conn)
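One caveat with the .format(tuple(...)) trick: if ids has exactly one element, str((1,)) renders as (1,), which SQL Server rejects, and interpolating raw values is open to SQL injection. A parameterized sketch (same table and column names as above, assuming the pyodbc connection conn) avoids both:
# Sketch: one '?' placeholder per id; the driver binds the values.
ids = df1['id'].to_list()
placeholders = ','.join('?' * len(ids))
df2_sql = 'SELECT * FROM table2 WHERE id_key IN ({})'.format(placeholders)
df2 = pd.read_sql_query(df2_sql, conn, params=ids)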

Related

How to execute multiple cursors and fetch the data into two different Pandas dataframes using Python?

I am connecting to Snowflake, and with the query below I can fetch the data for one table. I want to fetch data for multiple tables, so I tried a second cursor.execute for table2. However, the dataframe only printed the result of the last query. I need to fetch data from two tables into two different dataframes.
Thanks in advance!
Python Code:
import pandas as pd
import snowflake.connector
import os

tables = ['CUSTOMER', 'CALL_CENTER', 'CUSTOMER_ADDRESS', 'CATALOG_PAGE', 'CUSTOMER_DEMOGRAPHICS']
filename = 'output.txt'

def main():
    conn = snowflake.connector.connect(
        user="my_usdr",
        password="pswd",
        account="my_account",
        warehouse="my_WH",
        database="SNOWFLAKE_SAMPLE_DATA",
        schema="INFORMATION_SCHEMA",
        role="SYSADMIN")
    cur = conn.cursor()
    try:
        cur.execute(f"""SELECT TABLE_SCHEMA, TABLE_TYPE, TABLE_NAME, ROW_COUNT, DATE(CREATED) AS "TABLE_CREATED_DATE", DATE(LAST_ALTERED) AS "LAST_ALTERED_DATE"
            FROM TABLES
            WHERE TABLE_TYPE = 'BASE TABLE'
              AND TABLE_SCHEMA = 'TPCDS_SF100TCL'
              AND TABLE_NAME IN ({','.join("'" + x + "'" for x in tables)})
            UNION
            SELECT TABLE_SCHEMA, TABLE_TYPE, TABLE_NAME, ROW_COUNT, DATE(CREATED) AS "TABLE_CREATED_DATE", DATE(LAST_ALTERED) AS "LAST_ALTERED_DATE"
            FROM TABLES
            WHERE TABLE_TYPE = 'BASE TABLE'
              AND TABLE_SCHEMA = 'TPCDS_SF10TCL'
              AND TABLE_NAME IN ({','.join("'" + x + "'" for x in tables)})""")
        cur.execute("""select * from TPCDS_SF100TCL.CALL_CENTER""")
        df = cur.fetch_pandas_all()
        print(df)  # Print in same screen after execution
        cur.close()
        conn.close()
    except Exception as e:
        print(e)
        # sys.exit(1)
        cur.close()
        conn.close()

if __name__ == "__main__":
    main()
After the second cur.execute, the result set of the first query held by cur was overwritten. You can try the solution below, where I save the result of the first query in the variable df1 and the second in df2:
import pandas as pd
import snowflake.connector
import os

tables = ['CUSTOMER', 'CALL_CENTER', 'CUSTOMER_ADDRESS', 'CATALOG_PAGE', 'CUSTOMER_DEMOGRAPHICS']
filename = 'output.txt'

def main():
    conn = snowflake.connector.connect(
        user="my_usdr",
        password="pswd",
        account="my_account",
        warehouse="my_WH",
        database="SNOWFLAKE_SAMPLE_DATA",
        schema="INFORMATION_SCHEMA",
        role="SYSADMIN")
    cur = conn.cursor()
    try:
        cur.execute(f"""SELECT TABLE_SCHEMA, TABLE_TYPE, TABLE_NAME, ROW_COUNT, DATE(CREATED) AS "TABLE_CREATED_DATE", DATE(LAST_ALTERED) AS "LAST_ALTERED_DATE"
            FROM TABLES
            WHERE TABLE_TYPE = 'BASE TABLE'
              AND TABLE_SCHEMA = 'TPCDS_SF100TCL'
              AND TABLE_NAME IN ({','.join("'" + x + "'" for x in tables)})
            UNION
            SELECT TABLE_SCHEMA, TABLE_TYPE, TABLE_NAME, ROW_COUNT, DATE(CREATED) AS "TABLE_CREATED_DATE", DATE(LAST_ALTERED) AS "LAST_ALTERED_DATE"
            FROM TABLES
            WHERE TABLE_TYPE = 'BASE TABLE'
              AND TABLE_SCHEMA = 'TPCDS_SF10TCL'
              AND TABLE_NAME IN ({','.join("'" + x + "'" for x in tables)})""")
        df1 = cur.fetch_pandas_all()  # Save the first result before the next execute
        cur.execute("""select * from TPCDS_SF100TCL.CALL_CENTER""")
        df2 = cur.fetch_pandas_all()
        print(df1)  # Print first result
        print(df2)  # Print second result
        cur.close()
        conn.close()
    except Exception as e:
        print(e)
        # sys.exit(1)
        cur.close()
        conn.close()

if __name__ == "__main__":
    main()
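The same rule generalizes: consume each result set with fetch_pandas_all before issuing the next execute. A sketch that pulls several tables into a dict of DataFrames (table names are illustrative, reusing conn from above):
# Sketch: one DataFrame per table, fetched before the next execute.
dfs = {}
for table in ['TPCDS_SF100TCL.CALL_CENTER', 'TPCDS_SF100TCL.CUSTOMER']:
    cur = conn.cursor()
    cur.execute("SELECT * FROM " + table)
    dfs[table] = cur.fetch_pandas_all()
    cur.close()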

Search multiple values from a Python dataframe in an MS-SQL table

I have a string 'new_string' which contains a set of values to be searched in my MS-SQL table. I am getting an error "Could not parse rfc1738 URL from string ''2535488','2568394''"
new_string = "'2535488','2568394'"
cnxn = pyodbc.connect("DRIVER={SQL Server};SERVER=ABCDEF;DATABASE=my_db") #connection is successfully established.
data1 = pd.read_sql("""select * from my_table where my_col in (%s)""",new_string,cnxn)
But if I type the following query, I get my results.
data1 = pd.read_sql("""select * from my_table where my_col in ('2535488','2568394')""",cnxn)
How can I search for the values in my table?
You should use prepared statements (parameterized queries), as follows:
In [58]: parms = ['2535488','2568394']
In [59]: q = """select * from my_table where my_col in ({})""".format(','.join(['?'] * len(parms)))
In [60]: q
Out[60]: 'select * from my_table where my_col in (?,?)'
now you should be able to do:
data1 = pd.read_sql(q, cnxn, params=parms)
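Putting it together with the connection from the question (my_table and my_col as in the question), a full sketch:
import pandas as pd
import pyodbc

# Sketch reusing the question's connection string and names.
cnxn = pyodbc.connect("DRIVER={SQL Server};SERVER=ABCDEF;DATABASE=my_db")
parms = ['2535488', '2568394']
q = 'select * from my_table where my_col in ({})'.format(','.join(['?'] * len(parms)))
data1 = pd.read_sql(q, cnxn, params=parms)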

why doesn't pandas execute sql query?

import sqlite3
import pandas as pd
# load data
df = pd.read_csv('CurriculumAuditReport.csv')
# strip whitespace from headers
df.columns = df.columns.str.strip()
con = sqlite3.connect("sans.db")
# drop data into database
df.to_sql("MyTable", con, if_exists='replace')
df = pd.read_sql_query('SELECT Count (Department) FROM MyTable WHERE `CompletedTraining` LIKE 'Incomplete' GROUP BY Department', con)
print(df)
con.close()
The query produces the result I want just fine in DB Browser for SQLite.
The output:
C:\sans>C:\python34\python test2.py
File "test2.py", line 15
df = pd.read_sql_query('SELECT Count (Department) FROM MyTable WHERE
`CompletedTraining` LIKE 'Incomplete' GROUP BY Department', con)
^
SyntaxError: invalid syntax
My output should have 11 rows.
You have an issue with quoting - try this:
df = pd.read_sql_query("SELECT Department, count(*) as cnt FROM MyTable WHERE CompletedTraining = 'Incomplete' GROUP BY Department", con)
You can also use the following technique:
qry = """
SELECT department, count(*) as cnt
FROM MyTable
WHERE CompletedTraining = 'Incomplete'
GROUP BY department
"""
df = pd.read_sql_query(qry, con)
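If the status value may vary, it can also be bound as a parameter rather than quoted inline; a small sketch with the same sqlite3 connection:
# Sketch: bind 'Incomplete' through sqlite3's ? placeholder.
qry = """
SELECT Department, count(*) AS cnt
FROM MyTable
WHERE CompletedTraining = ?
GROUP BY Department
"""
df = pd.read_sql_query(qry, con, params=('Incomplete',))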

How to store mySQL query result into pandas DataFrame with pymysql?

I'm trying to store a mySQL query result in a pandas DataFrame using pymysql and am running into errors building the dataframe. Found a similar question here and here, but it looks like there are pymysql-specific errors being thrown:
import pandas as pd
import datetime
import pymysql

# dummy values
connection = pymysql.connect(user='username', password='password', database='database_name', host='host')

start_date = datetime.datetime(2017, 11, 15)
end_date = datetime.datetime(2017, 11, 16)

try:
    with connection.cursor() as cursor:
        query = "SELECT * FROM orders WHERE date_time BETWEEN %s AND %s"
        cursor.execute(query, (start_date, end_date))
        df = pd.DataFrame(data=cursor.fetchall(), index=None, columns=cursor.keys())
finally:
    connection.close()
returns: AttributeError: 'Cursor' object has no attribute 'keys'
If I drop the index and columns arguments:
try:
    with connection.cursor() as cursor:
        query = "SELECT * FROM orders WHERE date_time BETWEEN %s AND %s"
        cursor.execute(query, (start_date, end_date))
        df = pd.DataFrame(cursor.fetchall())
finally:
    connection.close()
returns ValueError: DataFrame constructor not properly called!
Thanks in advance!
Use pandas.read_sql() for this. Note that pymysql's placeholder style is %s (not the ? used by pyodbc):
query = "SELECT * FROM orders WHERE date_time BETWEEN %s AND %s"
df = pd.read_sql(query, connection, params=(start_date, end_date))
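A minimal end-to-end sketch using the question's dummy credentials and orders table:
import datetime
import pandas as pd
import pymysql

# Sketch: read the query result straight into a DataFrame.
connection = pymysql.connect(user='username', password='password',
                             database='database_name', host='host')
start_date = datetime.datetime(2017, 11, 15)
end_date = datetime.datetime(2017, 11, 16)
try:
    df = pd.read_sql("SELECT * FROM orders WHERE date_time BETWEEN %s AND %s",
                     connection, params=(start_date, end_date))
finally:
    connection.close()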
Thank you for the suggestion to use pandas.read_sql(). It works for executing a stored procedure as well (there with pyodbc's ? placeholders); I tested it in an MSSQL 2017 environment.
Below is an example (I hope it helps others):
def database_query_to_df(connection, stored_proc, start_date, end_date):
    # Define a query that executes the stored procedure with two parameters
    query = "SET NOCOUNT ON; EXEC " + stored_proc + " ?, ?" + "; SET NOCOUNT OFF"
    # Pass the parameters to the query, execute it, and store the results in a data frame
    df = pd.read_sql(query, connection, params=(start_date, end_date))
    return df
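For example, it might be called like this (the procedure name is illustrative, not from the original post):
# Hypothetical usage: 'dbo.GetOrders' stands in for your stored procedure.
df = database_query_to_df(connection, 'dbo.GetOrders', start_date, end_date)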
Try This:
import pandas as pd
import pymysql
mysql_connection = pymysql.connect(host='localhost', user='root', password='', db='test', charset='utf8')
sql = "SELECT * FROM `brands`"
df = pd.read_sql(sql, mysql_connection, index_col='brand_id')
print(df)

How to pass a data frame as parameter to a SQL query in Python?

I have a dataframe that consists of one column of values and I want to pass it as a parameter to execute the following sql query:
query = "SELECT ValueDate, Value"\
"FROM Table "\
"WHERE [ID] in ( ? ) "
So I tried (among so many other things) the following:
df = pd.read_sql_query(query, conn, params=[ df['ID'].values ])
df = pd.read_sql_query(query, conn, params=[ df['ID'].tolist ])
df = pd.read_sql_query(query, conn, params=[ list(df['ID'].values) ])
...
What is the correct way to pass the dataframe values ?
NB: I am using Microsoft SQL Server so the query needs to be formatted as I did.
Does this get you what you need?
import pandas as pd
your_column = pd.Series([1,2,3,4,5,6,7,8,9])
query = "SELECT ValueDate, Value"\
"FROM Table "\
"WHERE [ID] in {}".format(tuple(your_column))
print(query)
# 'SELECT ValueDate, ValueFROM Table WHERE [ID] in (1, 2, 3, 4, 5, 6, 7, 8, 9)'
Then you should be able to query without further parameters.
df = pd.read_sql_query(query, conn)
A related approach: generate one ? placeholder per value and let the driver bind them:
params = tuple(df['ID'].values)
sql = "SELECT COUNT(*) FROM foobar WHERE id IN (%s)" % (",".join(["?"] * len(params)),)
cursor.execute(sql, params)
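The same placeholder trick works with pandas directly, returning a DataFrame; a sketch assuming a pyodbc connection conn and the question's column names:
# Sketch: bind the ids and read the result into a DataFrame.
placeholders = ",".join(["?"] * len(params))
q = "SELECT ValueDate, Value FROM Table WHERE [ID] IN ({})".format(placeholders)
result = pd.read_sql_query(q, conn, params=params)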
