I have a data frame that looks like the image above. What I want to do is loop through the SQL statements under SQL_SCRIPT, execute them, and store each result in the next column over, which would be called 'RESULTS'. When I just try to execute the statements (without storing the results anywhere) it runs fine, but when I try to store the results in a new dataframe column it errors out with:
ValueError: cannot set a row with mismatched columns
Here is the code:
def run_tests(self):
    s = self.connection()
    df = self.retrieve_sql()
    df_type = df.loc[df['STEP_TYPE'] == 'T']
    df_to_list = df_type[['TABLE_NM', 'TEST_TABLE_NM', 'SQL_SCRIPT']]
    print(df_to_list)
    for sql_script in df_to_list['SQL_SCRIPT']:
        df_to_list.loc['RESULTS'] = pd.read_sql(sql_script, s)
    print(df_to_list)
Instead of read_sql I have also tried just using the session's execute, which also works, but I'm not sure how to store the results in the dataframe going that route:
def run_tests(self):
    s = self.connection()
    df = self.retrieve_sql()
    df_type = df.loc[df['STEP_TYPE'] == 'T']
    df_to_list = df_type[['TABLE_NM', 'TEST_TABLE_NM', 'SQL_SCRIPT']]
    print(df_to_list)
    for sql_script in df_to_list['SQL_SCRIPT']:
        s.execute(sql_script)
Here is the connection function, if needed:
def connection(self):
    con = self.load_json_file()
    cfg_dsn = con['config']['dsn']
    cfg_usr = con['config']['username']
    cfg_pwd = con['config']['password']
    udaExec = teradata.UdaExec(appName="DataAnalysis", version="1.0", logConsole=False)
    session = udaExec.connect(method="odbc", dsn=cfg_dsn, username=cfg_usr, password=cfg_pwd)
    return session
Consider running Series.apply on the column of SQL strings. (Your original code fails because df_to_list.loc['RESULTS'] tries to assign a row labeled 'RESULTS', not a column, hence the mismatched-columns error.)
def run_tests(self):
    s = self.connection()
    c = s.cursor()  # OPEN CURSOR
    df = self.retrieve_sql()
    df_type = df.loc[df['STEP_TYPE'] == 'T']
    df_to_list = df_type[['TABLE_NM', 'TEST_TABLE_NM', 'SQL_SCRIPT']]
    print(df_to_list)

    # NEW METHOD TO RUN QUERY
    def sql_run(x):
        c.execute(x)
        if c.rowcount > 0:
            res = c.fetchone()[0]
        else:
            res = np.nan
        return res

    df_to_list['RESULTS'] = df_to_list['SQL_SCRIPT'].apply(sql_run)
    print(df_to_list)
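If you'd rather keep pd.read_sql from your first attempt, a similar sketch should work, assuming each script returns a single scalar value (the helper name sql_run_read is hypothetical):

def sql_run_read(x):
    # read_sql returns a DataFrame; take its first cell if any rows came back
    res_df = pd.read_sql(x, s)
    return res_df.iat[0, 0] if not res_df.empty else np.nan

df_to_list['RESULTS'] = df_to_list['SQL_SCRIPT'].apply(sql_run_read)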
Related
How could I write unit tests for a function that returns a dataframe whose contents can change at any time? I am just getting started with testing and have no idea how to cover my code with unit tests; what do you advise?
def update_with_formulario(date):
    df_formulario = _get_formulario(date)
    assert df_formulario is not None, 'Dataframe df_formulario is null'
    df_csv = utilities.get_data_csv_with_pandas('df.csv')
    assert df_csv is not None, 'Dataframe df_csv is null'
    df = df_formulario.merge(df_csv, 'left')
    utilities.write_csv_dataframe(f"{ROOT_WRITE_PATH}df.csv", df)
def _get_formulario(date):
    global cnn
    try:
        repo = Repository()
        cnn = repo.connect()
        query = f"""SELECT DISTINCT fecha,
                        hora,
                        coalesce(valor_copia, valor) AS value
                    FROM table
                    WHERE VARIABLE IN ('variable')
                    AND fecha_data = '{date} 00:00:00'"""
        df_formulario = pd.read_sql_query(query, cnn)
        return df_formulario
    except:
        logging.exception(exc_msg.EXC_PROCEDURE_EXCEPT_MSG)
    finally:
        if cnn:
            cnn.close()
            logging.info(exc_msg.CNN_CLOSED_MSG)
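Since the live data can change at any time, one common approach is to stub out the I/O and test only your own logic. A minimal sketch with unittest.mock, assuming your functions live in a module called formulario (the module name and fixture values are hypothetical):

import unittest
from unittest.mock import patch
import pandas as pd

import formulario  # hypothetical module containing update_with_formulario

class TestUpdateWithFormulario(unittest.TestCase):
    @patch('formulario.utilities')
    @patch('formulario._get_formulario')
    def test_merge_is_written(self, mock_get, mock_utils):
        # fixed fixtures instead of live DB/CSV data
        mock_get.return_value = pd.DataFrame({'fecha': ['2020-01-01'], 'value': [1]})
        mock_utils.get_data_csv_with_pandas.return_value = pd.DataFrame({'fecha': ['2020-01-01'], 'extra': [2]})
        formulario.update_with_formulario('2020-01-01')
        # the merged frame should be written exactly once
        mock_utils.write_csv_dataframe.assert_called_once()

This way the test pins down the merge-and-write behaviour without depending on whatever the database returns that day.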
I'm working on a task where I have to get data from SQL Server, and because I'm running time series analysis, I need to specify a date field that can change with every table or query. I may also read either a simple query or a stored procedure. I want to generalize the code below, which is field- and database-specific. I thought I could define an empty dictionary in the class and then use it in the read_data method below, but I'm not sure that's the right approach.
class DataPrep:
    def __init__(self, conn):
        self.df = pd.DataFrame()
        self.mega_projects = set()
        self.mega_project_to_df = {}
        self.mega_project_to_df_pvt = {}
        self.conn = {}

    def read_data(self):
        self.conn = pyodbc.connect({'driver': None, 'server': None, 'database': None, 'uid': None, 'pwd': None})
        self.df = pd.read_sql_query('''exec [dbo].[ML_WorkLoad]''', self.conn, parse_dates={'CreatedDate': '%d/%m/%Y %H.%M.%S'})
        #self.df = self.df[['EstimateManDay', 'CreatedDate', 'MegaProject', 'ProjectName']]
        self.df['month'] = pd.DatetimeIndex(self.df['CreatedDate']).month
        self.df['year'] = pd.DatetimeIndex(self.df['CreatedDate']).year
        self.df['quarter'] = pd.DatetimeIndex(self.df['CreatedDate']).quarter
        self.df['week'] = pd.DatetimeIndex(self.df['CreatedDate']).week
        self.df['dayorg'] = pd.DatetimeIndex(self.df['CreatedDate']).day
        self.df['day'] = 1
        self.df['year_quarter'] = self.df['year'].astype(str) + "_" + self.df['quarter'].astype(str)
        self.df['year_month'] = self.df['year'].astype(str) + "_" + self.df['month'].astype(str)
        self.df['year_week'] = self.df['year'].astype(str) + "_" + self.df['week'].astype(str)
        self.df['date'] = pd.to_datetime(self.df[['year', 'month', 'day']])
        self.df = self.df[self.df['CreatedDate'] <= datetime.strptime("2020-01-01", "%Y-%m-%d")]
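One way to generalize this is to pass the query and the date column in as parameters instead of hard-coding them. A minimal sketch (the query, date_col, and date_format parameters are assumptions, not part of the original code):

def read_data(self, query, date_col, date_format=None):
    # query can be a plain SELECT or an 'exec ...' stored-procedure call
    parse = {date_col: date_format} if date_format else None
    self.df = pd.read_sql_query(query, self.conn, parse_dates=parse)
    dt = pd.DatetimeIndex(self.df[date_col])
    # derive the calendar columns from whatever date field was passed in
    self.df['year'] = dt.year
    self.df['month'] = dt.month
    self.df['quarter'] = dt.quarter
    self.df['year_month'] = self.df['year'].astype(str) + "_" + self.df['month'].astype(str)

Called as, e.g., prep.read_data('exec [dbo].[ML_WorkLoad]', 'CreatedDate', '%d/%m/%Y %H.%M.%S'), so each table or query can bring its own date field.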
I periodically (every 120 seconds) fetch data, but each batch overwrites the previous data in the SQL DB; I want all data to be kept. In addition, is the timer correct?
import sqlalchemy as sa
import psycopg2
import requests as rq
import pandas as pd
import json
import time
start_time = time.time()
while True:
    temp = pd.DataFrame()
    df = pd.DataFrame()
    vehicleList = {"SN63NBK", "YY67UTP"}
    for ids in vehicleList:
        r = rq.get('https://api.tfl.gov.uk/Vehicle/' + ids + '/Arrivals')
        r = r.text
        temp = pd.read_json(r)
        temp['Type'] = 'ids'
        df = pd.concat([df, temp], sort=False).reset_index(drop=True)
    engine = sa.create_engine('postgresql+psycopg2://postgres:3434@127.0.0.1/postgres')
    df['timing'] = list(map(lambda x: json.dumps(x), df['timing']))
    df.to_sql('tfl_bus_pg6', engine, if_exists='replace', index=False)
    time.sleep(120.0 - ((time.time() - start_time) % 120.0))
I changed your code slightly, but I think the main problem is the if_exists parameter, which you should set to 'append', as @K753 mentioned in the comments.
Also, the YY67UTP id returns nothing, so I replaced it with another random id from the site to illustrate how the code works.
def _data_gen(vehicles):
    """ Yields a dataframe for each request """
    for ids in vehicles:
        time.sleep(1)
        r = rq.get('https://api.tfl.gov.uk/Vehicle/' + ids + '/Arrivals')
        temp = pd.read_json(r.text)
        temp['Type'] = ids
        yield temp

while True:
    # how do you break from while loop if you need to?
    vehicleList = {"SN63NBK", "YY67UTP"}
    df = pd.concat(_data_gen(vehicleList), sort=False, ignore_index=True)
    engine = sa.create_engine('postgresql+psycopg2://postgres:3434@127.0.0.1/postgres')
    df['timing'] = list(map(lambda x: json.dumps(x), df['timing']))
    df.to_sql('tfl_bus_pg6', engine, if_exists='append', index=False)
    time.sleep(120)
I have a Pandas dataframe, numeric_df, with a bunch of columns. I have this function:
def textstat_stats(text):
    difficulty = textstat.flesch_reading_ease(text)
    grade_difficulty = textstat.flesch_kincaid_grade(text)
    gfog = textstat.gunning_fog(text)
    smog = textstat.smog_index(text)
    ari = textstat.automated_readability_index(text)
    cli = textstat.coleman_liau_index(text)
    lwf = textstat.linsear_write_formula(text)
    dcrs = textstat.dale_chall_readability_score(text)
    return pd.Series([difficulty, grade_difficulty, gfog, smog, ari, cli, lwf, dcrs])
which returns a Pandas Series. Now I'm trying this:
numeric_df[['difficulty', 'grade_difficulty','gfog','smog','ari','cli','lwf','dcrs']] = textstat_stats(text)
However, I get this Error:
KeyError: "['difficulty' 'grade_difficulty' 'gfog' 'smog' 'ari' 'cli' 'lwf' 'dcrs'] not in index"
What am I doing incorrectly?
Thanks!
It seems you need to add an index to the Series, which creates the column names:
def textstat_stats(text):
    difficulty = textstat.flesch_reading_ease(text)
    grade_difficulty = textstat.flesch_kincaid_grade(text)
    gfog = textstat.gunning_fog(text)
    smog = textstat.smog_index(text)
    ari = textstat.automated_readability_index(text)
    cli = textstat.coleman_liau_index(text)
    lwf = textstat.linsear_write_formula(text)
    dcrs = textstat.dale_chall_readability_score(text)
    idx = ['difficulty', 'grade_difficulty', 'gfog', 'smog', 'ari', 'cli', 'lwf', 'dcrs']
    return pd.Series([difficulty, grade_difficulty, gfog, smog, ari, cli, lwf, dcrs],
                     index=idx)

df = textstat_stats(text)
print (df)
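To fill those columns for every row of numeric_df, you can then apply the function to the text column and join the result. A small sketch, assuming the text lives in a column called 'text' (that column name is an assumption):

stats = numeric_df['text'].apply(textstat_stats)
# apply on a Series of texts yields a DataFrame whose columns come from the returned Series' index
numeric_df = numeric_df.join(stats)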
I am attempting to speed up calculations on a pandas DataFrame using multiprocessing. This goes really well, except that assigning the result of the calculation via df.ix does not work here the way it does in my code without multiprocessing.
I've added a #sanity check to the code, which outputs valid values and would make me think this should work fine, but the DataFrame doesn't get populated (it stays all NaN). Does anyone know why that may be, and more importantly, what changes are needed to plug the values into the DataFrame in the context of multiprocessing?
Output of sanity check:
should be setting df.ix[4][1] to: 23.2506112824
should be setting df.ix[0][0] to: 0.0
should be setting df.ix[7][0] to: 15.9574526264
code:
import mysql.connector
import numpy as np
from colormath.color_objects import LabColor
from colormath.color_diff import delta_e_cie2000
import pandas as pd
from mysql.connector.pooling import MySQLConnectionPool
from multiprocessing import Pool
pool = Pool()
cnx = mysql.connector.connect(user='user', password='pass',host='localhost', database='database')
cursor = cnx.cursor()
selectstmt = 'SELECT CIE_Lab, ID FROM `database`.`table`'
cursor.execute(selectstmt)
color = cursor.fetchall()
df = pd.DataFrame(columns = color, index = color)
sides = df.index
headers = df.dtypes.index
shape = df.shape[0]
def delta(cie_Lab1, cie_Lab2):
    # strip the surrounding brackets from the stored '[L,a,b]' strings
    cie_Lab1 = cie_Lab1[1:]
    cie_Lab1 = cie_Lab1[:-1]
    cie_Lab2 = cie_Lab2[1:]
    cie_Lab2 = cie_Lab2[:-1]
    CIE_list1 = cie_Lab1.split(",")
    CIE_list2 = cie_Lab2.split(",")
    #print CIE_list1
    CIE_L1 = CIE_list1[0]
    CIE_a1 = CIE_list1[1]
    CIE_b1 = CIE_list1[2]
    CIE_L2 = CIE_list2[0]
    CIE_a2 = CIE_list2[1]
    CIE_b2 = CIE_list2[2]
    color1 = LabColor(lab_l=CIE_L1, lab_a=CIE_a1, lab_b=CIE_b1)
    color2 = LabColor(lab_l=CIE_L2, lab_a=CIE_a2, lab_b=CIE_b2)
    deltae = delta_e_cie2000(color1, color2, Kl=1, Kc=1, Kh=1)
    return deltae

def deltas(nums):
    listoflists = []
    for num in range(nums):
        for mun in range(nums):
            listoflists.append([num, mun])
    return listoflists

def update(inp):
    sides = df.index
    headers = df.dtypes.index
    num = inp[0]
    mun = inp[1]
    res = delta(headers[num][0], sides[mun][0])
    # sanity check
    print "should be setting df.ix["+str(mun)+"]["+str(num)+"] to: "+str(res)
    df.ix[mun][num] = res

if __name__ == '__main__':
    pool = Pool(4)
    pool.map(update, deltas(shape))
    pool.close()
    pool.join()
    print df
Dataframe example:
([69.62248143012944, -54.15108764844451, 67.92070706614288], 1) \
([69.62248143012944, -54.15108764844451, 67.920... NaN
([58.17848217611454, -52.251714243997995, 56.77... NaN
([87.02539335188214, -32.15758725885986, 66.450... NaN
([86.86259502866965, -31.483524711078015, 75.14... NaN
([85.39154525710671, -31.683349117376856, 71.35... NaN
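For what it's worth, each worker process in a Pool gets its own copy of df, so assignments inside update never reach the parent's DataFrame. A minimal sketch of one possible fix, returning the results from the workers and assigning them in the parent (kept in the Python 2 style of the code above):

def compute(inp):
    num, mun = inp
    # same calculation as update(), but return the result instead of mutating df
    return mun, num, delta(headers[num][0], sides[mun][0])

if __name__ == '__main__':
    pool = Pool(4)
    for mun, num, res in pool.map(compute, deltas(shape)):
        df.ix[mun][num] = res  # assignment now happens in the parent process
    pool.close()
    pool.join()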