Extract dataframe value based on SQL query - python

I'm looking to store data into a dataframe field based on an SQL query. Here's my attempt:
import pyodbc
import pandas as pd
import pandasql as ps
from datetime import datetime
from pandasql import sqldf
conx_string = "driver={SQL SERVER}; server=mssql_db; database=db; UID=usr; PWD=my_pwd;"
conn = pyodbc.connect(conx_string)
crsr = conn.cursor()
query = "select OrderID, OrderDate, OrigOrderID from tbl_Orders"
'''
Example of return for my sql statement:
OrderID | OrderDate | OrigOrderID
-----------------------------------
23 | 15-02-2023 | NULL
24 | 16-02-2023 | 23
'''
data = crsr.execute(query)
rows = [list(x) for x in data]
columns = [column[0] for column in crsr.description]
df = pd.DataFrame(rows, columns=columns)
for i, row in df.iterrows():
    df.at[i, 'Date of receipt of OrigOrder'] = ps.sqldf(f"select OrderDate from tbl_Orders where OrderID='{df.at[i, 'OrigOrderID']}'")
    print(df.at[i, 'Date of receipt of OrigOrder'])
'''
Example of 'Date of receipt of OrigOrder' for OrderID = 24:
--> print(df.at[i, 'Date of receipt of OrigOrder']) should return '15-02-2023', since that is the OrderDate of its OrigOrderID = 23.
'''
I'm expecting a new column df['Date of receipt of OrigOrder'] (object) holding the df['OrderDate'] of each row's df['OrigOrderID'].
Any help please?
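A minimal sketch of one way to build that column without a per-row query, assuming the lookup can happen entirely in pandas once the single SELECT has been fetched into df: index OrderDate by OrderID and map OrigOrderID through it.

import pandas as pd

# Frame shaped like the query result above
df = pd.DataFrame({
    'OrderID': [23, 24],
    'OrderDate': ['15-02-2023', '16-02-2023'],
    'OrigOrderID': [None, 23],
})

# Series indexed by OrderID, so .map() translates OrigOrderID -> OrderDate
date_by_id = df.set_index('OrderID')['OrderDate']
df['Date of receipt of OrigOrder'] = df['OrigOrderID'].map(date_by_id)

print(df)  # OrderID 24 gets '15-02-2023'; OrderID 23 has no OrigOrderID, so NaN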

Related

Reading SQL table from Python with merge condition

import pyodbc
import pandas as pd
conn = pyodbc.connect("Driver={??};"
                      "Server=??;"
                      "Database=??;"
                      "Trusted_Connection=yes;")
df1 = pd.read_sql_query("SELECT TOP 10000 * FROM table1", conn)
df2 = pd.read_sql_query("SELECT * FROM table2 (((where id_key = id(from table1) ))) ", conn)
Hello,
I have two tables in SQL Server. I want to pull the rows from table2 that share an ID with table1, i.e. id_key = id (from table1).
Get df1's ids as a tuple:
ids = tuple(df1['id'].to_list())
print(ids)
'''
(1, 2)
'''
Then format them into the SQL and read it:
sql = 'SELECT * FROM table2 WHERE id_key IN {}'.format(ids)
print(sql)
'''
SELECT * FROM table2 WHERE id_key IN (1, 2)
'''
df2 = pd.read_sql(sql, conn)
Full code:
import pyodbc
import pandas as pd
conn = pyodbc.connect("Driver={??};"
                      "Server=??;"
                      "Database=??;"
                      "Trusted_Connection=yes;")
df1 = pd.read_sql_query("SELECT TOP 10000 * FROM table1", conn)
ids = tuple(df1['id'].to_list())
df2_sql = 'SELECT * FROM table2 WHERE id_key IN {}'.format(ids)
df2 = pd.read_sql_query(df2_sql, conn)
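One caveat with str.format on a tuple: a one-element tuple renders as (1,), and the trailing comma is not valid SQL. A sketch of a placeholder-based variant that avoids that edge case (and SQL injection), assuming the same df1 and conn as above:

ids = df1['id'].to_list()
# One "?" placeholder per id; pyodbc binds the values, so no trailing-comma issue
placeholders = ', '.join('?' * len(ids))
df2_sql = f'SELECT * FROM table2 WHERE id_key IN ({placeholders})'
df2 = pd.read_sql_query(df2_sql, conn, params=ids)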

PYODBC | Pandas : Write pandas dataframe from a nested JSON column SQL

I spent over two days trying to solve this but I'm still blocked.
I'm working with pyodbc and pandas to write specific data (ID and Role) from a column called ExtraData (a nested JSON) into a pandas DataFrame.
The JSON in ExtraData is the following:
{"Data":{"Person":[{"Source":"C","ID":"45","Role":43}],"NoID":2}}
Here is my attempt:
import pyodbc
import json
import pandas.io.json as pd_json
# Skip setting the connection string
crsr = conn.cursor()
query_json = "SELECT ExtraData FROM data"
test_data = crsr.execute(query_json).fetchall()
for row in test_data:
    test = json.dumps([x for x in row])
    data = pd_json.loads(test)
    print(data)  # ['{"Data":{"Person":[{"Source":"C","ID":"45","Role":43}],"NoID":2}}']
    df = pd_json.json_normalize(data,
                                record_path='Person',
                                meta=['ID', 'Role'])
    print(df)
I get the following error:
---> df = pd_json.json_normalize(data, record_path='Person', meta=['ID', 'Role'])
TypeError: string indices must be integers
Do you have an explanation, please? And how can I avoid this error?
I hope you can help!
You should change your code from:
import pyodbc
import json
import pandas.io.json as pd_json
crsr = conn.cursor()
query_json = "SELECT ExtraData FROM data"
test_data = crsr.execute(query_json).fetchall()
for row in test_data:
    test = json.dumps([x for x in row])
    data = pd_json.loads(test)
    print(data)
    df = pd_json.json_normalize(data,
                                record_path='Person',
                                meta=['ID', 'Role'])
    print(df)
to:
import pyodbc
import json
import pandas.io.json as pd_json
crsr = conn.cursor()
query_json = "SELECT ExtraData FROM data"
test_data = crsr.execute(query_json).fetchall()
for row in test_data:
    data = json.loads(row[0])  # <-- this line: parse the JSON string from the column into a dict
    print(data)
    df = pd_json.json_normalize(data,
                                record_path=['Data', 'Person'])
    print(df)
The dumps/loads round trip left data as a list containing one JSON string, so json_normalize iterated over the characters of that string, hence the TypeError. Parsing row[0] directly yields a dict, and record_path=['Data', 'Person'] walks down to the Person records, where ID and Role come back as ordinary columns (no meta needed).
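pandas.io.json.json_normalize was deprecated in pandas 1.0 in favour of the top-level function. A minimal sketch of the same normalization on recent pandas, assuming the ExtraData payload shown above:

import json
import pandas as pd

raw = '{"Data":{"Person":[{"Source":"C","ID":"45","Role":43}],"NoID":2}}'
record = json.loads(raw)
# record_path walks Data -> Person; each person dict becomes a row
df = pd.json_normalize(record, record_path=['Data', 'Person'])
print(df[['ID', 'Role']])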

Sending data from variable to sql query in pandas read_sql()

I want to send a start date and an end date into my SQL query from two separate variables. Suppose I have start_date = '2020-05-14' and end_date = '2020-07-08' stored in variables. My code is:
import pandas as pd
from sqlalchemy import create_engine

db_connection_str = 'mysql+pymysql://username:password@host/db_name'
db_connection = create_engine(db_connection_str)

def myfunc(start_date, end_date):
    sql_syn = "SELECT col_id, col_a, col_b, date FROM en_stat WHERE date BETWEEN :start_date AND :end_date"
    sql_df = pd.read_sql(sql_syn, con=db_connection, chunksize=100)

How do I pass the start_date and end_date values dynamically in this code?
Use Python 3 f-strings, quoting the interpolated dates since they are string literals in SQL:
def myfunc(start_date, end_date):
    sql_syn = f"SELECT col_id, col_a, col_b, date FROM en_stat WHERE date BETWEEN '{start_date}' AND '{end_date}'"
    sql_df = pd.read_sql(sql_syn, con=db_connection, chunksize=100)
    return sql_df
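f-strings splice the values straight into the SQL text, which is fragile and open to injection. A sketch of the bound-parameter route, assuming the same engine; the :start_date/:end_date placeholders from the question work as-is with sqlalchemy.text:

import pandas as pd
from sqlalchemy import create_engine, text

db_connection = create_engine('mysql+pymysql://username:password@host/db_name')

def myfunc(start_date, end_date):
    sql_syn = text("SELECT col_id, col_a, col_b, date FROM en_stat "
                   "WHERE date BETWEEN :start_date AND :end_date")
    # The driver binds the values, so no manual quoting is needed
    return pd.read_sql(sql_syn, con=db_connection,
                       params={'start_date': start_date, 'end_date': end_date},
                       chunksize=100)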

Create Dataframe with Cx_Oracle based on different query date

Below is a sample of the DB table:
date       id  name
01.02.11    4  aaaa
21.05.19    5  aaaa
31.12.12    5  aaaa
01.05.15    6  aaaa
In order to query the data the right way (avoiding duplicates), I have to set a 'reporting date', which is the first day of the month.
The code below gives me the requested results, but only for one month:
sql = "select * from db where date = '01.03.20'"
def oracle(user, pwd, dsn, sql, columns):
    # Connect to the database
    con = cx_Oracle.connect(user=user, password=pwd, dsn=dsn, encoding="UTF-8")
    con.outputtypehandler = OutputHandler
    # The cursor lets Python code execute Oracle commands in a database session
    cur = con.cursor()
    # Check connection
    print('Connected')
    # Create the DataFrame
    df = pd.DataFrame(cur.execute(sql).fetchall(), columns=columns, dtype='object')
    print('Shape:', df.shape)
    return df
Question: how can I query data with cx_Oracle for different reporting dates without doing it manually?
There are multiple ways to solve this directly in SQL.
However, the expected solution should use a for loop.
I was thinking about changing the reporting date with:
for i in [str(i).zfill(2) for i in range(1, 13)]:        # months 01..12
    for j in [str(j).zfill(2) for j in range(0, 21)]:    # years 00..20
        sql = f"select * from db where date = '01.{i}.{j}'"
For example, for date = 01.01.19:
the idea is to query the data for this date and store it in a DF,
then go to the next month, 01.02.19, and store that in the DF,
and so on, until the range is exhausted or the last (current) month is reached.
If someone has an idea how to query data in a loop with cx_Oracle and pandas for different dates, thanks for helping!
How about something like this:
from datetime import date, datetime, timedelta
import calendar
import cx_Oracle
import pandas as pd

# Choose the start month
start_month = date(2019, 1, 1)
# Get the current month
current_month = date(datetime.today().year, datetime.today().month, 1)
# Collect all successfully run queries
executed_sql_queries = []
# Collect failed queries
failed_queries = []
# Collect the DataFrames
dfs = []

while start_month <= current_month:
    query_date = start_month.strftime('%d.%m.%y')
    sql = f"""select * from db where date = '{query_date}' """
    try:
        df = oracle(user, pwd, dsn, sql=sql, columns=columns)
    except cx_Oracle.DatabaseError as e:
        print(e)
        failed_queries.append(sql)  # move on to the next query, or retry here
    else:
        executed_sql_queries.append(sql)
        dfs.append(df)
    finally:
        # Add one month to the date on each pass
        days_in_month = calendar.monthrange(start_month.year, start_month.month)[1]
        start_month = start_month + timedelta(days=days_in_month)

all_dfs = pd.concat(dfs)
After the loop, executed_sql_queries looks like this:
["select * from db where date = '01.01.19' ",
"select * from db where date = '01.02.19' ",
"select * from db where date = '01.03.19' ",
"select * from db where date = '01.04.19' ",
"select * from db where date = '01.05.19' ",
"select * from db where date = '01.06.19' ",
"select * from db where date = '01.07.19' ",
"select * from db where date = '01.08.19' ",
"select * from db where date = '01.09.19' ",
"select * from db where date = '01.10.19' ",
"select * from db where date = '01.11.19' ",
"select * from db where date = '01.12.19' ",
"select * from db where date = '01.01.20' ",
"select * from db where date = '01.02.20' ",
"select * from db where date = '01.03.20' ",
"select * from db where date = '01.04.20' "]

why doesn't pandas execute sql query?

import sqlite3
import pandas as pd
# load data
df = pd.read_csv('CurriculumAuditReport.csv')
# strip whitespace from headers
df.columns = df.columns.str.strip()
con = sqlite3.connect("sans.db")
# drop data into database
df.to_sql("MyTable", con, if_exists='replace')
df = pd.read_sql_query('SELECT Count (Department) FROM MyTable WHERE `CompletedTraining` LIKE 'Incomplete' GROUP BY Department', con)
print(df)
con.close()
The query produces the result I want just fine in DB Browser for SQLite.
The output:
C:\sans>C:\python34\python test2.py
File "test2.py", line 15
df = pd.read_sql_query('SELECT Count (Department) FROM MyTable WHERE
`CompletedTraining` LIKE 'Incomplete' GROUP BY Department', con)
^
SyntaxError: invalid syntax
My output should have 11 rows.
You have a quoting issue: the single quotes around 'Incomplete' terminate the outer string literal. Try this:
df = pd.read_sql_query("SELECT Department, count(*) as cnt FROM MyTable WHERE CompletedTraining = 'Incomplete' GROUP BY Department", con)
You can also use the following technique:
qry = """
SELECT department, count(*) as cnt
FROM MyTable
WHERE CompletedTraining = 'Incomplete'
GROUP BY department
"""
df = pd.read_sql_query(qry, con)
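A third option keeps the value out of the SQL string entirely: sqlite3 supports ? placeholders through the params argument of read_sql_query, so the quote clash cannot arise. A minimal sketch, assuming the same con:

df = pd.read_sql_query(
    "SELECT Department, COUNT(*) AS cnt FROM MyTable "
    "WHERE CompletedTraining = ? GROUP BY Department",
    con,
    params=('Incomplete',),
)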
