Converting a Python script (which has __name__ == '__main__') to a package/library - python

I have a Python script:
import pandas as pd
import numpy as np
import EA_Upload_config as cfg
import datetime

#%%
def clockPrint(sentence):
    now = datetime.datetime.now()
    date_time = now.strftime("%H:%M:%S")
    print(date_time + " : " + sentence)

def uploadToEA(df_, ds_api_name, operation_, instance, xmd_=None):  # operation: Upsert / Overwrite
    import SalesforceEinsteinAnalytics as EA
    clockPrint("Upload Process Initiated for " + instance + " instance...")
    if instance.lower() == 'commercial':
        EAS = EA.salesforceEinsteinAnalytics(env_url='https://spglobalratings.my.salesforce.com', browser='chrome')
    if instance.lower() == 'analytical':
        EAS = EA.salesforceEinsteinAnalytics(env_url='https://spglobalratingsae.my.salesforce.com', browser='chrome')
    EAS.load_df_to_EA(df_, dataset_api_name=ds_api_name, operation=operation_, xmd=xmd_, fillna=False)  # Error because of fillna=False
    clockPrint("Upload Process Completed successfully for " + instance + " instance. Navigate to (Einstein Analytics --> Data Manager --> Monitor) to check progress.")

def processDate(date):
    if pd.isnull(date):
        return np.nan
    else:
        date = pd.to_datetime(date)
        date = datetime.datetime.strftime(date, "%m/%d/%Y")
        return date

if __name__ == '__main__':
    df = pd.read_csv(cfg.FILE_PATH, dtype={"As of Date": str})
    if len(cfg.DATE_COLUMNS) != 0:
        for c in cfg.DATE_COLUMNS:
            df[c] = df[c].apply(lambda x: processDate(x))
    for c in df.columns:
        if df[c].dtype == "O":
            df[c].fillna('', inplace=True)
        elif np.issubdtype(df[c].dtype, np.number):
            df[c].fillna(0, inplace=True)
        elif df[c].dtype == "datetime64[ns]":
            df[c] = df[c].apply(lambda x: processDate(x))
            df[c].fillna("", inplace=True)
    df.fillna("", inplace=True)
    for instance in cfg.INSTANCES:
        if instance.lower() == 'commercial':
            uploadToEA(df, cfg.COM_DATASET_API_NAME, cfg.COM_OPERATION, instance, cfg.COM_XMD)
        elif instance.lower() == 'analytical':
            uploadToEA(df, cfg.ANA_DATASET_API_NAME, cfg.ANA_OPERATION, instance, cfg.ANA_XMD)
        else:
            clockPrint("Update INSTANCES variable as ['Commercial'] or ['Analytical'] or ['Commercial','Analytical'].")
It has three functions within it:
clockPrint
processDate
uploadToEA
Note that this script is linked to another script named EA_Upload_config, which is imported here as cfg.
Now I want to convert this entire script into a package, and I am confused about how I should handle the __name__ == '__main__' block.
The aim is to convert this entire Python script into a package.
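A common way to handle the __name__ == '__main__' part when packaging (a minimal sketch; the package name ea_upload and module name core are assumptions, not from the original): move the body of the guard into a main() function, keep the guard as a thin wrapper, and add a __main__.py so the package can still be run directly.

# ea_upload/core.py -- the imports and three functions above stay unchanged;
# only the script-level code moves into main()
def main():
    df = pd.read_csv(cfg.FILE_PATH, dtype={"As of Date": str})
    # ... date handling, fillna logic and the uploadToEA calls, exactly as above ...

if __name__ == '__main__':   # still works when run as a plain script
    main()

# ea_upload/__main__.py -- makes `python -m ea_upload` run the same thing
from ea_upload.core import main

main()

Importers then call ea_upload.core.main() (or the individual functions) explicitly, so nothing runs as a side effect of the import.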

Related

Optimize dataframe filtering on large datasets, pandas

I have a little challenge here and to be honest, I have absolutely no idea how to handle it.
I have this dataframe composed of 660,000 rows and about 50 columns. I need to filter this dataframe very frequently and retrieve the filtered result as fast as possible (the goal is a processing time under 1 second). I'd like to be able to run this locally on a laptop, so my processing power is limited.
I have multiple inputs to filter the dataframe: some are set manually (see inputs 1), some are retrieved from another script (see inputs 2; the other script is not included here for simplicity).
I was hoping to simply filter the dataframe using df[(df.column == filtervalue)]. However, the processing time seems to be way too long.
Therefore, I am wondering whether there are techniques to optimize such processing, or whether the only option is a server with better CPU/memory capacity?
Thanks for the help
import pandas as pd
df = pd.read_csv('xxxxxxxx', sep=";", dtype={
    "id": str, "dataset1": str, "dataset2": str, "myposition": str,
    "bet_1_preflop": float, "bet_2_preflop": float, "bet_3_preflop": float,
    "bet_1_flop": float, "bet_2_flop": float, "bet_3_flop": float,
    "bet_1_turn": float, "bet_2_turn": float, "bet_3_turn": float,
    "bet_1_river": float, "bet_2_river": float, "bet_3_river": float,
    "myhand": str, "cards_flop": str, "cards_turn": str, "cards_river": str,
    "action1_preflop": str, "action2_preflop": str, "action3_preflop": str, "action4_preflop": str,
    "action1_flop": str, "action2_flop": str, "action3_flop": str, "action4_flop": str,
    "action1_turn": str, "action2_turn": str, "action3_turn": str, "action4_turn": str,
    "action1_river": str, "action2_river": str, "action3_river": str, "action4_river": str,
    "action1_preflop_binary": 'Int64', "action2_preflop_binary": 'Int64',
    "action3_preflop_binary": 'Int64', "action4_preflop_binary": 'Int64',
    "action1_flop_binary": 'Int64', "action2_flop_binary": 'Int64',
    "action3_flop_binary": 'Int64', "action4_flop_binary": 'Int64',
    "action1_turn_binary": 'Int64', "action2_turn_binary": 'Int64',
    "action3_turn_binary": 'Int64', "action4_turn_binary": 'Int64',
    "action1_river_binary": 'Int64', "action2_river_binary": 'Int64',
    "action3_river_binary": 'Int64', "action4_river_binary": 'Int64',
    "tiers": 'Int64', "assorties": str,
    "besthand_flop": str, "checker_flop": float, "handtype_flop": str,
    "topsuite_flop": 'Int64', "topcolor_flop": 'Int64',
    "besthand_turn": str, "checker_turn": float, "handtype_turn": str,
    "topsuite_turn": 'Int64', "topcolor_turn": 'Int64',
    "besthand_river": str, "checker_river": float, "handtype_river": str,
    "topsuite_river": 'Int64', "topcolor_river": 'Int64'})
df = df.reset_index()
#Inputs for filters 1
myposition ="sb"
myhand = "ackc"
flop = "ad9d4h"
turn = "8d"
river = "th"
a1_preflop = "r"
a2_preflop = "r"
a3_preflop = "c"
a4_preflop = ""
a1_flop = "r"
a2_flop = "f"
a3_flop = ""
a4_flop = ""
a1_turn = ""
a2_turn = ""
a3_turn = ""
a4_turn = ""
a1_river = ""
a2_river = ""
a3_river = ""
a4_river = ""
#Inputs for filters 2 (from a different script)
tiers
assorties_status
best_allhands_flop[0]
best_allhands_flop[1]
best_allhands_flop[2]
highest_suite_flop
highest_color_flop
best_allhands_turn[0]
best_allhands_turn[1]
best_allhands_turn[2]
highest_suite_turn
highest_color_turn
best_allhands_river[0]
best_allhands_river[1]
best_allhands_river[2]
highest_suite_river
highest_color_river
#filtre_preflop_a1 = df[(df.myposition == myposition) & (df.tiers == tiers) & (df.assorties == assorties_status) & (df.action1_preflop == a1_preflop)]
#filtre_preflop_a2 = df[(df.myposition == myposition) & (df.tiers == tiers) & (df.assorties == assorties_status) & (df.action1_preflop == a1_preflop) & (df.action2_preflop == a2_preflop)]
#filtre_preflop_a3 = df[(df.myposition == myposition) & (df.tiers == tiers) & (df.assorties == assorties_status) & (df.action1_preflop == a1_preflop) & (df.action2_preflop == a2_preflop) & (df.action3_preflop == a3_preflop)]
#filtre_preflop_a4 = df[(df.myposition == myposition) & (df.tiers == tiers) & (df.assorties == assorties_status) & (df.action1_preflop == a1_preflop) & (df.action2_preflop == a2_preflop) & (df.action3_preflop == a3_preflop) & (df.action4_preflop == a4_preflop)]
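Not part of the original post, but one standard technique for this access pattern (many repeated equality filters on the same columns): build a sorted MultiIndex once up front, so each lookup is a binary search instead of a full 660,000-row boolean scan. A sketch, assuming the same column names as above; the filter values are placeholders.

import pandas as pd

# One-time cost: index and sort by the columns used in every filter.
df = pd.read_csv('xxxxxxxx', sep=";")
df = df.set_index(['myposition', 'tiers', 'assorties', 'action1_preflop']).sort_index()

# Each repeated filter becomes an indexed lookup rather than a column scan.
filtre_preflop_a1 = df.loc[('sb', 1, 'x', 'r')]   # placeholder key values

Converting the repetitive string columns to the 'category' dtype is another cheap win: comparisons become integer comparisons and memory usage drops sharply.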

How to apply threading to make it run faster?

I have this code to load data to a file, and I want to make it run concurrently using threads to speed it up. Some people recommended asyncio, but I couldn't really understand it. This code cleans a CSV file; for example, it reads dates in Arabic format and converts them to the English calendar. Can anyone provide a brief overview of how this can be done?
# -*- coding: utf-8 -*-
"""
Created on Sun Apr 12 00:03:38 2020
@author: siradmin
****** DATE ISSUES CODE ******
The purpose of this code is to correct date in different date columns
"""
import os
os.chdir("D://Medgulf Motor/2022/Code for date cleaning")
os.getcwd()

import pandas as pd
import datetime as dt

# df = pd.read_csv("D://Medgulf Motor/2022/Data/Pricing Data 11.05.2021/Tricast/TricastPolicyData.txt",
#                  engine='python', sep=';', chunksize=100000)
df = pd.read_csv("D://Medgulf Motor/2022/Data/Pricing Data 11.05.2021/Tricast/TricastPolicyData.csv",
                 engine='python', chunksize=100000)

columns = ['Issue Date', 'Inception Date', 'Expiry Date', 'Policy Status Date',
           'Vehicle Issue Date', 'Vehicle Inception Date', 'Vehicle Expiry Date',
           'Status Date', 'Insured Date of Birth', 'Main Driver DOB']
#          'Istemarah Exp.', 'Additional Driver DOB']
fmts2 = ['%d/%m/%Y', '%d/%m/%y', '%d-%m-%Y', '%d-%m-%y', '%m/%d/%Y', '%Y/%m/%d',
         '%Y-%m-%d', '%d|%m|%Y']
new_date = []
j = []
# Twelve separate empty DataFrames; `(pd.DataFrame,)*12` bound the class
# itself rather than instances, so the later .append calls would fail.
isd, ind, exd, psd, visd, vind, vexd, sd, ise, idb, mdd, add = (pd.DataFrame() for _ in range(12))
header_flag = True

## Actual Code ##
print(dt.datetime.now())
for cx, chunk in enumerate(df):
    for col in columns:
        new_date = []
        for idx, x in enumerate(chunk[col]):
            try:
                # Excel serial number -> date
                x = int(x)
                dd = dt.datetime(1900, 1, 1)
                da = dt.timedelta(days=int(x) - 2)
                nd = dd + da
                x = nd.date()
            except Exception:
                pass
            for fmt in fmts2:
                try:
                    x = str(x)
                    # x = str(x).replace("//0/", "/0")
                    # x = str(x).replace("//1/", "/1")
                    # x = str(x).replace("//2/", "/2")
                    x = x.replace(" 00:00:00", "")
                    x = x.replace("0/0/", "1/1/")
                    x = x.replace("/0/", "/01/")
                    x = x.replace("/2/", "/02/")
                    date_object = dt.datetime.strptime(x.strip(), fmt).date()
                    new_date.append(date_object)
                    break
                except Exception:
                    pass
            if len(new_date) != idx:
                pass
            elif "29/02" in x or "29-02" in x:
                new_date.append(x)
            else:
                # x = "None"
                new_date.append("")  # new_date.append(x)
                match col:
                    case "Issue Date":
                        isd = isd.append(chunk.iloc[[idx]])
                    case "Inception Date":
                        ind = ind.append(chunk.iloc[[idx]])
                    case "Expiry Date":
                        exd = exd.append(chunk.iloc[[idx]])
                    case "Policy Status Date":
                        psd = psd.append(chunk.iloc[[idx]])
                    case "Vehicle Issue Date":
                        visd = visd.append(chunk.iloc[[idx]])
                    case "Vehicle Inception Date":
                        vind = vind.append(chunk.iloc[[idx]])
                    case "Vehicle Expiry Date":
                        vexd = vexd.append(chunk.iloc[[idx]])
                    case "Istemarah Exp.":
                        ise = ise.append(chunk.iloc[[idx]])
                    case "Main Driver DOB":
                        mdd = mdd.append(chunk.iloc[[idx]])
                    case "Additional Driver DOB":
                        add = add.append(chunk.iloc[[idx]])
        chunk[col] = j = ['{}'.format(t) for idx, t in enumerate(new_date)]
        # chunk[col] = pd.to_datetime(chunk[col])
        print("Completed", col)
    print('we have completed ', cx, 'chunk\n')
    chunk.to_csv('Tricast Policy Data.csv', mode='a', index=False, header=header_flag)
    header_flag = False
print(dt.datetime.now())
if len(isd) != 0:
    isd.to_csv("Issuedate.csv")
if len(ind) != 0:
    ind.to_csv("Inceptiondate.csv")
if len(exd) != 0:
    exd.to_csv("Expirydate.csv")
if len(psd) != 0:
    psd.to_csv("policystatedate.csv")
if len(visd) != 0:
    visd.to_csv("vehicleissuedate.csv")
if len(vind) != 0:
    vind.to_csv("vehicleinceptiondate.csv")
if len(vexd) != 0:
    vexd.to_csv("vehicleexpirydate.csv")
if len(sd) != 0:
    sd.to_csv("statusdate.csv")
if len(ise) != 0:
    ise.to_csv("istemarhexpiry.csv")
if len(idb) != 0:
    idb.to_csv("insureddateofbirth.csv")
if len(mdd) != 0:
    mdd.to_csv("maindriverdob.csv")
if len(add) != 0:
    add.to_csv("adddriverdob.csv")
###############################################################################
Edit: this is the whole code.
My supervisor told me concurrency can be applied to the last part, where the data is written out to the CSV files.
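Not from the original code, but a minimal sketch of that last part using concurrent.futures.ThreadPoolExecutor: to_csv is mostly disk I/O, so threads can overlap the writes. (The per-row parsing loop above is CPU-bound and would need vectorisation or multiprocessing instead, because of the GIL.)

from concurrent.futures import ThreadPoolExecutor

# (frame, filename) pairs matching the end of the script above
outputs = [(isd, "Issuedate.csv"), (ind, "Inceptiondate.csv"),
           (exd, "Expirydate.csv"), (psd, "policystatedate.csv"),
           (visd, "vehicleissuedate.csv"), (vind, "vehicleinceptiondate.csv"),
           (vexd, "vehicleexpirydate.csv"), (sd, "statusdate.csv"),
           (ise, "istemarhexpiry.csv"), (idb, "insureddateofbirth.csv"),
           (mdd, "maindriverdob.csv"), (add, "adddriverdob.csv")]

def write_if_nonempty(frame, name):
    if len(frame) != 0:
        frame.to_csv(name)

# Each write runs in a worker thread; the context manager waits for all of them.
with ThreadPoolExecutor(max_workers=4) as pool:
    for frame, name in outputs:
        pool.submit(write_if_nonempty, frame, name)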

Run a background thread in an Apache-deployed Flask script

I have a Flask script which constantly runs a background loop and, at a given time, runs a process. When I ran this on my local machine for testing purposes it ran fine and the background thread worked; however, once I deployed it on Ubuntu using Apache, the background process does not seem to be running.
from flask import Flask, render_template, redirect, url_for
from flask_wtf import FlaskForm
from wtforms import StringField, SubmitField, DateField
from wtforms.validators import Optional
from flask_bootstrap import Bootstrap
from threading import Thread
from MetocSC import scraper
import pandas as pd
from datetime import date
import time
import os
import re

app = Flask(__name__)
app.config['SECRET_KEY'] = 'key'
Bootstrap(app)

if not os.path.exists('Database.csv'):
    df = pd.DataFrame(columns=['Type', 'Origin', 'Issue date', 'Subject', 'Date', 'Points', 'Upload date'])
    df.to_csv('Database.csv', index=False)

class SearchForm(FlaskForm):
    criteria = StringField('search')
    date = DateField('Date', validators=(Optional(),))
    submit = SubmitField('Search', render_kw={'class': 'btn btn-success'})

@app.route('/', methods=['GET', 'POST'])
def home():
    # Display data with a search bar and date search
    form = SearchForm()
    if form.validate_on_submit():
        return redirect(url_for('results', x=form.criteria.data, y=form.date.data))
    return render_template('home.html', form=form)

@app.route('/results', defaults={'x': None, 'y': None})
@app.route('/results/<x>', defaults={'y': None})
@app.route('/results/<x>/<y>')
def results(x, y):
    if x:
        if re.search("""^\d{4}\-\d{2}\-\d{2}""", x):
            date = x
            if y:
                criteria = y
            else:
                criteria = ''
    if y:
        if re.search("""^\d{4}\-\d{2}\-\d{2}""", y):
            date = y
            if x:
                criteria = x
            else:
                criteria = ''
    elif not x and not y:
        date = ''
        criteria = ''
    print('date: ', date)
    print('criteria', criteria)
    df = pd.read_csv('Database.csv', index_col=[0])
    if criteria != '' and date == '':
        df = df[df.apply(lambda row: row.astype(str).str.contains(criteria, case=False).any(), axis=1)]
        df.drop('Upload date', inplace=True, axis=1)
        return render_template('results.html', tables=[df.to_html()], titles=[''])
    elif date != '' and criteria == '':
        df = df.loc[df['Upload date'] == date]
        df.drop('Upload date', inplace=True, axis=1)
        return render_template('results.html', tables=[df.to_html()], titles=[''])
    elif date != '' and criteria != '':
        df = df[(df['Upload date'] == date)]
        df = df[df.apply(lambda row: row.astype(str).str.contains(criteria, case=False).any(), axis=1)]
        df.drop('Upload date', inplace=True, axis=1)
        return render_template('results.html', tables=[df.to_html()], titles=[''])
    else:
        return 'Error: Either a date or search criteria must be provided'

@app.route('/logs')
def logs():
    df = pd.read_csv('Database.csv', index_col=[0])
    df.drop('Upload date', inplace=True, axis=1)
    return render_template('logs.html', tables=[df.to_html()], titles=[''])

def threaded_scraper():
    while True:  # Permanently run this loop as a thread
        gmt = time.gmtime(time.time())  # Get the current GMT time
        if gmt.tm_hour == 6 and gmt.tm_min == 30:  # If half 6, scrape for the day
            # region Check that it hasn't already been scraped today
            f = open('check.txt', 'r')
            if int(f.read()) == gmt.tm_mday:
                time.sleep(1)
                f.close()
            # endregion
            # region If not already scraped today then scrape
            else:
                ABPW10_data = scraper()
                df = pd.read_csv('Database.csv', index_col=[0])
                new_row = pd.DataFrame(data={
                    'Type': ABPW10_data['Type'],
                    'Origin': ABPW10_data['Origin'],
                    'Issue date': ABPW10_data['Issue date'],
                    'Subject': ABPW10_data['Subject'],
                    'Date': ABPW10_data['Date'],
                    'Points': str(ABPW10_data['Points']),
                    'Upload date': date.today()
                }, index=[0])
                df = pd.concat([df, new_row], ignore_index=True)
                df.to_csv('Database.csv', index=False)
                # endregion
                # region Update text file to show that scraping has been done today
                with open('check.txt', 'w') as f:
                    f.write(str(gmt.tm_mday))
                f.close()
                # endregion
        else:
            time.sleep(1)

thread = Thread(target=threaded_scraper)
thread.daemon = True
thread.start()

if __name__ == '__main__':
    app.run()
I am aware there are other, potentially better ways to run background tasks in Flask; however, this method worked for me, so I stuck with it.
I followed this tutorial to get it set up on Apache: https://python.plainenglish.io/how-to-securely-deploy-flask-with-apache-in-a-linux-server-environment-7eacd4c69a73
Thanks for your help in advance
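For context (not from the post): under Apache/mod_wsgi the app usually runs in daemon processes that are started lazily and can be recycled at any time, so a thread started at import time may never run, or may be killed mid-loop. A common workaround is to move the scrape into a standalone script and let cron fire it at 06:30; the filename scraper_job.py below is an assumption.

# scraper_job.py -- hypothetical standalone script, scheduled by cron, e.g.:
#   30 6 * * * /usr/bin/python3 /path/to/scraper_job.py
import pandas as pd
from datetime import date
from MetocSC import scraper

def run_once():
    # Same scrape-and-append logic as threaded_scraper, minus the polling loop
    ABPW10_data = scraper()
    df = pd.read_csv('Database.csv', index_col=[0])
    new_row = pd.DataFrame(data={
        'Type': ABPW10_data['Type'],
        'Origin': ABPW10_data['Origin'],
        'Issue date': ABPW10_data['Issue date'],
        'Subject': ABPW10_data['Subject'],
        'Date': ABPW10_data['Date'],
        'Points': str(ABPW10_data['Points']),
        'Upload date': date.today()
    }, index=[0])
    df = pd.concat([df, new_row], ignore_index=True)
    df.to_csv('Database.csv', index=False)

if __name__ == '__main__':
    run_once()

The Flask routes keep reading Database.csv exactly as before, so nothing else has to change.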

How to create a library from self-written static functions in Python

I have a Python script which has multiple static functions, and I want to convert that complete script into a Python library:
import pandas as pd
import numpy as np
import EA_Upload_config as cfg
import datetime

#%%
def clockPrint(sentence):
    now = datetime.datetime.now()
    date_time = now.strftime("%H:%M:%S")
    print(date_time + " : " + sentence)

def uploadToEA(df_, ds_api_name, operation_, instance, xmd_=None):  # operation: Upsert / Overwrite
    import SalesforceEinsteinAnalytics as EA
    clockPrint("Upload Process Initiated for " + instance + " instance...")
    if instance.lower() == 'commercial':
        EAS = EA.salesforceEinsteinAnalytics(env_url='https://spglobalratings.my.salesforce.com', browser='chrome')
    if instance.lower() == 'analytical':
        EAS = EA.salesforceEinsteinAnalytics(env_url='https://spglobalratingsae.my.salesforce.com', browser='chrome')
    EAS.load_df_to_EA(df_, dataset_api_name=ds_api_name, operation=operation_, xmd=xmd_, fillna=False)  # Error because of fillna=False
    clockPrint("Upload Process Completed successfully for " + instance + " instance. Navigate to (Einstein Analytics --> Data Manager --> Monitor) to check progress.")

def processDate(date):
    if pd.isnull(date):
        return np.nan
    else:
        date = pd.to_datetime(date)
        date = datetime.datetime.strftime(date, "%m/%d/%Y")
        return date

if __name__ == '__main__':
    df = pd.read_csv(cfg.FILE_PATH)
    if len(cfg.DATE_COLUMNS) != 0:
        for c in cfg.DATE_COLUMNS:
            df[c] = df[c].apply(lambda x: processDate(x))
    for c in df.columns:
        if df[c].dtype == "O":
            df[c].fillna('', inplace=True)
        elif np.issubdtype(df[c].dtype, np.number):
            df[c].fillna(0, inplace=True)
        elif df[c].dtype == "datetime64[ns]":
            df[c] = df[c].apply(lambda x: processDate(x))
            df[c].fillna("", inplace=True)
    df.fillna("", inplace=True)
    for instance in cfg.INSTANCES:
        if instance.lower() == 'commercial':
            uploadToEA(df, cfg.COM_DATASET_API_NAME, cfg.COM_OPERATION, instance, cfg.COM_XMD)
        elif instance.lower() == 'analytical':
            uploadToEA(df, cfg.ANA_DATASET_API_NAME, cfg.ANA_OPERATION, instance, cfg.ANA_XMD)
        else:
            clockPrint("Update INSTANCES variable as ['Commercial'] or ['Analytical'] or ['Commercial','Analytical'].")
This is my complete Python script, which I want to convert into a library. How should I do it?
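One way to do it (a sketch only; every name below is an assumption): split the functions into a package directory and add a minimal setup.py, then install it in editable mode with pip install -e . so other scripts can import it.

# Layout (hypothetical names):
#   ea_uploader/
#       setup.py
#       ea_uploader/
#           __init__.py   # from .upload import clockPrint, processDate, uploadToEA
#           upload.py     # the functions above, without the __main__ block

# setup.py
from setuptools import setup, find_packages

setup(
    name='ea-uploader',
    version='0.1.0',
    packages=find_packages(),
    install_requires=['pandas', 'numpy'],
)

Configuration such as EA_Upload_config is better passed in as function arguments (or a config object) than imported at module level, so the library does not depend on a file that only exists on one machine.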

Reading different Excel sheets in Python with if-elif-else

I am trying to read different sheets from Excel with an if-elif-else statement, depending on the input, and have written the following code:
import numpy as np
import pandas as pd

def ABSMATDATA(a, b, c, d, Material, Tmpref):
    if Material == 2.016:
        df = pd.read_excel('F:\MAschinenbau\Bachelorarbeit\ABSMAT.xlsx', sheet_name='H2')
    elif Material == 28.016:
        df = pd.read_excel('F:\MAschinenbau\Bachelorarbeit\ABSMAT.xlsx', sheet_name='N2')
    elif Material == 32.000:
        df = pd.read_excel('F:\MAschinenbau\Bachelorarbeit\ABSMAT.xlsx', sheet_name='O2')
    elif Material == 32.065:
        df = pd.read_excel('F:\MAschinenbau\Bachelorarbeit\ABSMAT.xlsx', sheet_name='S')
    elif Material == 18.016:
        df = pd.read_excel('F:\MAschinenbau\Bachelorarbeit\ABSMAT.xlsx', sheet_name='H2O')
    elif Material == 64.065:
        df = pd.read_excel('F:\MAschinenbau\Bachelorarbeit\ABSMAT.xlsx', sheet_name='SO2')
    elif Material == 12.001:
        df = pd.read_excel('F:\MAschinenbau\Bachelorarbeit\ABSMAT.xlsx', sheet_name='C Graphite')
    elif Material == 28.011:
        df = pd.read_excel('F:\MAschinenbau\Bachelorarbeit\ABSMAT.xlsx', sheet_name='CO')
    elif Material == 44.011:
        df = pd.read_excel('F:\MAschinenbau\Bachelorarbeit\ABSMAT.xlsx', sheet_name='CO2')
    elif Material == 16.043:
        df = pd.read_excel('F:\MAschinenbau\Bachelorarbeit\ABSMAT.xlsx', sheet_name='CH4')
    elif Material == 30.070:
        df = pd.read_excel('F:\MAschinenbau\Bachelorarbeit\ABSMAT.xlsx', sheet_name='C2H6')
    elif Material == 44.097:
        df = pd.read_excel('F:\MAschinenbau\Bachelorarbeit\ABSMAT.xlsx', sheet_name='C3H8')
    elif Material == 58.124:
        df = pd.read_excel('F:\MAschinenbau\Bachelorarbeit\ABSMAT.xlsx', sheet_name='C4H10')
    else:
        print('No data for this material available')
        df = [list(np.arange(0, 1100, 100)), list(np.arange(0, 11, 1)), list(np.arange(0, 11, 1)),
              list(np.arange(0, 11, 1)), list(np.arange(0, 11, 1))]
    return df
I am trying to run the code by calling ABSMATDATA(1,2,3,4,28.011,100) in the IPython console, but it is not giving any output. I was expecting to see df in my Variable Explorer as a 2-dimensional array.
The result of your call is never assigned to a name, so nothing shows up in the Variable Explorer, and you can cut your code a bit:
import numpy as np
import pandas as pd

def ABSMATDATA(a, b, c, d, Material, Tmpref):
    material_map = {2.016: 'H2',
                    28.016: 'N2',
                    32.000: 'O2',
                    32.065: 'S',
                    18.016: 'H2O'}  # extend with the remaining materials/sheet names
    if Material in material_map:
        df = pd.read_excel(r'F:\MAschinenbau\Bachelorarbeit\ABSMAT.xlsx',
                           sheet_name=material_map[Material])
    else:
        df = [list(np.arange(0, 1100, 100)), list(np.arange(0, 11, 1)), list(np.arange(0, 11, 1)),
              list(np.arange(0, 11, 1)), list(np.arange(0, 11, 1))]
        print('No data for this material available')
    return df
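A usage note on the fixed function: bind the return value to a name, otherwise nothing appears in the Variable Explorer (the arguments are the poster's example call):

df = ABSMATDATA(1, 2, 3, 4, 28.011, 100)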
