I have a python script which has multiple static functions. I want to convert that complete python script into a python library
import pandas as pd
import numpy as np
import EA_Upload_config as cfg
import datetime
#%%
def clockPrint(sentence):
    """Print *sentence* to stdout, prefixed with the current time as HH:MM:SS."""
    timestamp = datetime.datetime.now().strftime("%H:%M:%S")
    print(f"{timestamp} : {sentence}")
def uploadToEA(df_, ds_api_name, operation_, instance, xmd_=None):  # operation_: 'Upsert' or 'Overwrite'
    """Upload *df_* to a Salesforce Einstein Analytics dataset.

    Parameters
    ----------
    df_ : pandas.DataFrame      data to load
    ds_api_name : str           EA dataset API name
    operation_ : str            load operation, e.g. 'Upsert' or 'Overwrite'
    instance : str              'Commercial' or 'Analytical' (case-insensitive)
    xmd_ : dict, optional       extended metadata passed through to EA

    Raises
    ------
    ValueError
        If *instance* is not a known environment.
        (BUG FIX: the original fell through with `EAS` unbound and raised a
        confusing NameError on the load call instead.)
    """
    import SalesforceEinsteinAnalytics as EA  # local import: heavy, browser-driven dependency
    clockPrint("Upload Process Initiated for " + instance + " instance...")
    env_urls = {
        'commercial': 'https://spglobalratings.my.salesforce.com',
        'analytical': 'https://spglobalratingsae.my.salesforce.com',
    }
    try:
        env_url = env_urls[instance.lower()]
    except KeyError:
        raise ValueError("Unknown instance %r: expected 'Commercial' or 'Analytical'." % instance)
    EAS = EA.salesforceEinsteinAnalytics(env_url=env_url, browser='chrome')
    # NOTE(review): the original carried the comment "Error because of
    # fillna=False" -- confirm whether fillna should be True for this loader.
    EAS.load_df_to_EA(df_, dataset_api_name=ds_api_name, operation=operation_, xmd=xmd_, fillna=False)
    clockPrint("Upload Process Completed successfully for " + instance + " instance. Navigate to (Einstein Analytics --> Data Manager --> Monitor) to check progress.")
def processDate(date):
    """Normalize a date-like value to an 'MM/DD/YYYY' string; nulls stay NaN."""
    if pd.isnull(date):
        return np.nan
    return pd.to_datetime(date).strftime("%m/%d/%Y")
if __name__ == '__main__':
    df = pd.read_csv(cfg.FILE_PATH)
    # Normalize the configured date columns to MM/DD/YYYY strings
    # (iterating an empty DATE_COLUMNS list is a harmless no-op).
    for c in cfg.DATE_COLUMNS:
        df[c] = df[c].apply(processDate)
    # Fill missing values per dtype.
    # BUG FIX: chained `df[c].fillna(..., inplace=True)` operates on a
    # temporary Series in recent pandas and can leave df unchanged; assign
    # the filled column back instead.
    for c in df.columns:
        if df[c].dtype == "O":
            df[c] = df[c].fillna('')
        elif np.issubdtype(df[c].dtype, np.number):
            df[c] = df[c].fillna(0)
        elif df[c].dtype == "datetime64[ns]":
            df[c] = df[c].apply(processDate)
            df[c] = df[c].fillna("")
    df.fillna("", inplace=True)  # catch-all for anything still missing
    # One upload per configured instance.
    for instance in cfg.INSTANCES:
        if instance.lower() == 'commercial':
            uploadToEA(df, cfg.COM_DATASET_API_NAME, cfg.COM_OPERATION, instance, cfg.COM_XMD)
        elif instance.lower() == 'analytical':
            uploadToEA(df, cfg.ANA_DATASET_API_NAME, cfg.ANA_OPERATION, instance, cfg.ANA_XMD)
        else:
            clockPrint("Update INSTANCES variable as ['Commercial'] or ['Analytical'] or ['Commercial','Analytical'].")
This is my complete python script, which I want to convert into a library. How should I do it?
Related
I have this code to load data to a file. I want to make it run concurrently using threads to make it faster. Some people recommended using asyncio, but I couldn't really understand it. This code is for cleaning a CSV file. For example, it reads dates in Arabic format and converts them to the English calendar. Can anyone provide a brief overview of how this can be done?
# -*- coding: utf-8 -*-
"""
Created on Sun Apr 12 00:03:38 2020
@author: siradmin
****** DATE ISSUES CODE ******
The purpose of this code is to correct date in different date columns
"""
import os

# Work relative to the cleaning-code folder so output CSVs land next to it.
os.chdir("D://Medgulf Motor/2022/Code for date cleaning")
os.getcwd()

import pandas as pd
import datetime as dt

# Source extract, read lazily in 100k-row chunks to bound memory use.
# df = pd.read_csv("D://Medgulf Motor/2022/Data/Pricing Data 11.05.2021/Tricast/TricastPolicyData.txt",
#                  engine='python', sep=';', chunksize=100000)
df = pd.read_csv("D://Medgulf Motor/2022/Data/Pricing Data 11.05.2021/Tricast/TricastPolicyData.csv",
                 engine='python', chunksize=100000)

# Date columns to repair.
columns = ['Issue Date', 'Inception Date', 'Expiry Date', 'Policy Status Date',
           'Vehicle Issue Date', 'Vehicle Inception Date', 'Vehicle Expiry Date',
           'Status Date', 'Insured Date of Birth', 'Main Driver DOB']
#          'Istemarah Exp.', 'Additional Driver DOB']

# Every date layout we are willing to parse.
fmts2 = ['%d/%m/%Y', '%d/%m/%y', '%d-%m-%Y', '%d-%m-%y', '%m/%d/%Y', '%Y/%m/%d',
         '%Y-%m-%d', '%d|%m|%Y']

new_date = []
j = []

# Per-column buckets collecting rows whose date could not be parsed.
# BUG FIX: the original `(pd.DataFrame,)*12` bound the DataFrame *class*
# (missing call parentheses) to all twelve names, so the later
# `isd.append(...)` / `len(isd)` calls fail. Bind a fresh empty DataFrame
# to each name instead.
isd, ind, exd, psd, visd, vind, vexd, sd, ise, idb, mdd, add = (
    pd.DataFrame() for _ in range(12))

header_flag = True  # write the output CSV header only for the first chunk
## Actual Code ##
# NOTE(review): the original paste lost its indentation; the structure below
# is reconstructed from the commented-out if/elif mirror of the match block.
print(dt.datetime.now())  # timestamp: start of the cleaning pass
for cx, chunk in enumerate(df):
    for col in columns:
        new_date = []  # cleaned values for this column, aligned with chunk rows
        for idx, x in enumerate(chunk[col]):
            # Pass 1: pure integers are treated as Excel serial day numbers
            # (day 1 = 1900-01-01; the -2 offsets Excel's off-by-one plus its
            # fictitious 1900-02-29).
            try:
                x = int(x)
                dd = dt.datetime(1900,1,1)
                da = dt.timedelta(days=int(x)-2)
                nd = dd + da
                x = nd.date()
            except:
                # Not an integer -- leave x as-is. NOTE(review): bare except
                # also hides unrelated errors; narrow to (ValueError, TypeError).
                pass
            # Pass 2: try every known string format after patching common
            # data-entry defects (trailing midnight time, missing digits).
            for fmt in fmts2:
                try:
                    x = str(x)
                    x = str(x).replace(" 00:00:00", "")
                    x = str(x).replace("0/0/", "1/1/")
                    x = str(x).replace("/0/", "/01/")
                    x = str(x).replace("/2/", "/02/")
                    date_object = dt.datetime.strptime(x.strip(), fmt).date()
                    new_date.append((date_object))
                    break
                except:
                    pass
            # If a format matched, len(new_date) == idx + 1 and nothing more
            # is needed; otherwise handle the failure cases:
            if len(new_date) != idx:
                pass
            elif "29/02" in x or "29-02" in x:
                # Keep leap-day strings verbatim (strptime rejects them when
                # the year part was mangled).
                new_date.append((x))
            else:
                new_date.append((""))  # unparseable: blank the cell...
                # ...and archive the offending row in its per-column bucket.
                # NOTE(review): match/case needs Python 3.10+, and
                # DataFrame.append was removed in pandas 2.0 (use pd.concat).
                # There is no case for 'Status Date' or 'Insured Date of
                # Birth', so sd and idb never fill -- confirm that is intended.
                match col:
                    case "Issue Date":
                        isd = isd.append(chunk.iloc[[idx]])
                    case "Inception Date":
                        ind = ind.append(chunk.iloc[[idx]])
                    case "Expiry Date":
                        exd = exd.append(chunk.iloc[[idx]])
                    case "Policy Status Date":
                        psd = psd.append(chunk.iloc[[idx]])
                    case "Vehicle Issue Date":
                        visd = visd.append(chunk.iloc[[idx]])
                    case "Vehicle Inception Date":
                        vind = vind.append(chunk.iloc[[idx]])
                    case "Vehicle Expiry Date":
                        vexd = vexd.append(chunk.iloc[[idx]])
                    case "Istemarah Exp.":
                        ise = ise.append(chunk.iloc[[idx]])
                    case "Main Driver DOB":
                        mdd = mdd.append(chunk.iloc[[idx]])
                    case "Additional Driver DOB":
                        add = add.append(chunk.iloc[[idx]])
        # Write the cleaned values back into the chunk as plain strings.
        chunk[col] = j = ['{}'.format(t) for idx, t in enumerate(new_date)]
        print ("Completed", col)
    print ('we have completed ', cx, 'chunk\n')
    # Append the cleaned chunk; only the first chunk writes the header row.
    chunk.to_csv('Tricast Policy Data.csv', mode='a', index =False, header = header_flag)
    header_flag = False
print(dt.datetime.now())  # timestamp: end of the cleaning pass
# Persist every non-empty bucket of problem rows to its own CSV.
# BUG FIX: the Inception Date file was named "Inceptiondatecsv" (the ".csv"
# extension was missing); the twelve copy-pasted if-blocks are also collapsed
# into a single data-driven loop.
_outputs = [
    (isd, "Issuedate.csv"),
    (ind, "Inceptiondate.csv"),
    (exd, "Expirydate.csv"),
    (psd, "policystatedate.csv"),
    (visd, "vehicleissuedate.csv"),
    (vind, "vehicleinceptiondate.csv"),
    (vexd, "vehicleexpirydate.csv"),
    (sd, "statusdate.csv"),
    (ise, "istemarhexpiry.csv"),
    (idb, "insureddateofbirth.csv"),
    (mdd, "maindriverdob.csv"),
    (add, "adddriverdob.csv"),
]
for _frame, _filename in _outputs:
    if len(_frame) != 0:
        _frame.to_csv(_filename)
###############################################################################
Edit: this is the whole code.
My supervisor told me concurrency can be applied to the last part where the data is being loaded to the csv files.
I have a flask script which runs a background loop constantly and at a given time runs a process. When I ran this on my local machine for testing purposes it ran fine and the background thread worked, however, once I deployed it on ubuntu using apache the background process seems to not be running
from flask import Flask, render_template, redirect, url_for
from flask_wtf import FlaskForm
from wtforms import StringField, SubmitField, DateField
from wtforms.validators import Optional
from flask_bootstrap import Bootstrap
from threading import Thread
from MetocSC import scraper
import pandas as pd
from datetime import date
import time
import os
import re
app = Flask(__name__)
app.config['SECRET_KEY'] = 'key'  # NOTE(review): hard-coded secret; load from an env var in production
Bootstrap(app)

# Create the CSV "database" on first run so later reads never fail.
if not os.path.exists('Database.csv'):
    df = pd.DataFrame(columns=['Type', 'Origin', 'Issue date', 'Subject', 'Date', 'Points', 'Upload date'])
    df.to_csv('Database.csv', index=False)
class SearchForm(FlaskForm):
    """Home-page search form: free-text criteria plus an optional date."""
    criteria = StringField('search')
    date = DateField('Date', validators=(Optional(),))  # Optional(): an empty date is allowed
    submit = SubmitField('Search', render_kw={'class': 'btn btn-success'})
# BUG FIX: the decorator was mangled to a comment ('#app.route'); without it
# the view is never registered with Flask.
@app.route('/', methods=['GET', 'POST'])
def home():
    """Render the search page; on submit, redirect to /results with the
    entered criteria and date."""
    form = SearchForm()
    if form.validate_on_submit():
        return redirect(url_for('results', x=form.criteria.data, y=form.date.data))
    return render_template('home.html', form=form)
# BUG FIX: all three route decorators were mangled to comments ('#app.route').
@app.route('/results', defaults={'x': None, 'y': None})
@app.route('/results/<x>', defaults={'y': None})
@app.route('/results/<x>/<y>')
def results(x, y):
    """Filter Database.csv by free-text criteria and/or an upload date.

    The URL pieces *x* and *y* are positional: either may be the date
    (matched by a YYYY-MM-DD prefix) with the other acting as the criteria.
    """
    # BUG FIX: default both names up front -- the original left `date` and
    # `criteria` unbound (NameError) when neither URL piece matched the
    # date pattern.
    date = ''
    criteria = ''
    date_re = re.compile(r"^\d{4}\-\d{2}\-\d{2}")
    if x and date_re.search(x):
        date = x
        criteria = y if y else ''
    elif y and date_re.search(y):
        date = y
        criteria = x if x else ''
    elif x:
        criteria = x  # plain criteria with no date component
    print('date: ', date)
    print('criteria', criteria)
    df = pd.read_csv('Database.csv', index_col=[0])
    if criteria != '' and date == '':
        # Case-insensitive substring match across every column.
        df = df[df.apply(lambda row: row.astype(str).str.contains(criteria, case=False).any(), axis=1)]
        df.drop('Upload date', inplace=True, axis=1)
        return render_template('results.html', tables=[df.to_html()], titles=[''])
    elif date != '' and criteria == '':
        df = df.loc[df['Upload date'] == date]
        df.drop('Upload date', inplace=True, axis=1)
        return render_template('results.html', tables=[df.to_html()], titles=[''])
    elif date != '' and criteria != '':
        df = df[(df['Upload date'] == date)]
        df = df[df.apply(lambda row: row.astype(str).str.contains(criteria, case=False).any(), axis=1)]
        df.drop('Upload date', inplace=True, axis=1)
        return render_template('results.html', tables=[df.to_html()], titles=[''])
    else:
        return 'Error: Either a date or search criteria must be provided'
# BUG FIX: the decorator was mangled to a comment ('#app.route').
@app.route('/logs')
def logs():
    """Render the full database, minus the internal 'Upload date' column."""
    df = pd.read_csv('Database.csv', index_col=[0])
    df.drop('Upload date', inplace=True, axis=1)
    return render_template('logs.html', tables=[df.to_html()], titles=[''])
def threaded_scraper():
    """Daemon loop: at 06:30 GMT, scrape once per day and append to Database.csv.

    check.txt stores the day-of-month of the last successful scrape so a
    restart within the same day does not scrape twice.
    """
    while True:  # runs forever as a daemon thread
        gmt = time.gmtime(time.time())
        if gmt.tm_hour == 6 and gmt.tm_min == 30:
            # BUG FIX: the original leaked the file handle on the scrape path
            # (f.close() only ran in the already-scraped branch) and crashed
            # with FileNotFoundError/ValueError when check.txt was missing or
            # empty. Default to "not done yet" in those cases.
            try:
                with open('check.txt', 'r') as f:
                    scraped_today = int(f.read()) == gmt.tm_mday
            except (FileNotFoundError, ValueError):
                scraped_today = False
            if scraped_today:
                time.sleep(1)
            else:
                ABPW10_data = scraper()
                df = pd.read_csv('Database.csv', index_col=[0])
                new_row = pd.DataFrame(data=
                    {
                        'Type': ABPW10_data['Type'],
                        'Origin': ABPW10_data['Origin'],
                        'Issue date': ABPW10_data['Issue date'],
                        'Subject': ABPW10_data['Subject'],
                        'Date': ABPW10_data['Date'],
                        'Points': str(ABPW10_data['Points']),
                        'Upload date': date.today()
                    },
                    index=[0]
                )
                df = pd.concat([df, new_row], ignore_index=True)
                df.to_csv('Database.csv', index=False)
                # Record today's day-of-month so we don't scrape again today.
                with open('check.txt', 'w') as f:
                    f.write(str(gmt.tm_mday))
        else:
            time.sleep(1)
# Start the scraper as a daemon thread at import time (not under __main__)
# so it also runs when a WSGI server such as apache/mod_wsgi imports this
# module without executing the block below.
thread = Thread(target=threaded_scraper)
thread.daemon = True  # do not block interpreter shutdown
thread.start()

if __name__ == '__main__':
    # Local development server only; in production apache serves `app`.
    app.run()
I am aware there are other potentially better ways to thread tasks in flask, however this method worked for me so I stuck with it.
I followed this tutorial to get it set up on apache https://python.plainenglish.io/how-to-securely-deploy-flask-with-apache-in-a-linux-server-environment-7eacd4c69a73
Thanks for your help in advance
I have a python script
import pandas as pd
import numpy as np
import EA_Upload_config as cfg
import datetime
#%%
def clockPrint(sentence):
    """Echo *sentence* to stdout, prefixed with the current HH:MM:SS time."""
    stamp = datetime.datetime.now()
    print(stamp.strftime("%H:%M:%S") + " : " + sentence)
def uploadToEA(df_, ds_api_name, operation_, instance, xmd_=None):  # operation_: 'Upsert' or 'Overwrite'
    """Upload *df_* to the Einstein Analytics dataset *ds_api_name*.

    *instance* selects the Salesforce environment ('Commercial' or
    'Analytical', case-insensitive); *xmd_* is optional extended metadata.

    Raises
    ------
    ValueError
        For an unknown *instance*. (BUG FIX: the original left `EAS`
        unbound in that case and crashed with a NameError on the load call.)
    """
    import SalesforceEinsteinAnalytics as EA  # local import: heavy, browser-driven dependency
    clockPrint("Upload Process Initiated for " + instance + " instance...")
    key = instance.lower()
    if key == 'commercial':
        env_url = 'https://spglobalratings.my.salesforce.com'
    elif key == 'analytical':
        env_url = 'https://spglobalratingsae.my.salesforce.com'
    else:
        raise ValueError("Unknown instance %r: expected 'Commercial' or 'Analytical'." % instance)
    EAS = EA.salesforceEinsteinAnalytics(env_url=env_url, browser='chrome')
    # NOTE(review): the original carried the comment "Error because of
    # fillna=False" -- confirm whether fillna should be True for this loader.
    EAS.load_df_to_EA(df_, dataset_api_name=ds_api_name, operation=operation_, xmd=xmd_, fillna=False)
    clockPrint("Upload Process Completed successfully for " + instance + " instance. Navigate to (Einstein Analytics --> Data Manager --> Monitor) to check progress.")
def processDate(date):
    """Coerce *date* to an 'MM/DD/YYYY' string; null-like inputs become NaN."""
    if pd.isnull(date):
        return np.nan
    parsed = pd.to_datetime(date)
    return datetime.datetime.strftime(parsed, "%m/%d/%Y")
if __name__ == '__main__':
    # Keep "As of Date" as text so the raw value survives the round trip.
    df = pd.read_csv(cfg.FILE_PATH, dtype={"As of Date": str})
    # Normalize the configured date columns to MM/DD/YYYY strings
    # (iterating an empty DATE_COLUMNS list is a harmless no-op).
    for c in cfg.DATE_COLUMNS:
        df[c] = df[c].apply(processDate)
    # Fill missing values per dtype.
    # BUG FIX: chained `df[c].fillna(..., inplace=True)` operates on a
    # temporary Series in recent pandas and can leave df unchanged; assign
    # the filled column back instead.
    for c in df.columns:
        if df[c].dtype == "O":
            df[c] = df[c].fillna('')
        elif np.issubdtype(df[c].dtype, np.number):
            df[c] = df[c].fillna(0)
        elif df[c].dtype == "datetime64[ns]":
            df[c] = df[c].apply(processDate)
            df[c] = df[c].fillna("")
    df.fillna("", inplace=True)  # catch-all for anything still missing
    # One upload per configured instance.
    for instance in cfg.INSTANCES:
        if instance.lower() == 'commercial':
            uploadToEA(df, cfg.COM_DATASET_API_NAME, cfg.COM_OPERATION, instance, cfg.COM_XMD)
        elif instance.lower() == 'analytical':
            uploadToEA(df, cfg.ANA_DATASET_API_NAME, cfg.ANA_OPERATION, instance, cfg.ANA_XMD)
        else:
            clockPrint("Update INSTANCES variable as ['Commercial'] or ['Analytical'] or ['Commercial','Analytical'].")
It has three functions within it -
clockPrint
ProcessDate
uploadtoEA
** There is a link between this python script with another python script named EA_Upload_config which is being imported in this python script as cfg.
Now I want to convert this entire python script into a package. I am confused about how I should handle the `if __name__ == '__main__':` block.
Aim is to convert this entire python script into a package
I am trying to read different sheets from Excel with if-elif-else statement depending upon the input and have written following code
import numpy as np
import pandas as pd
def ABSMATDATA(a, b, c, d, Material, Tmpref):
    """Load the thermodynamic property sheet for *Material* from ABSMAT.xlsx.

    *Material* is the molar mass used as the lookup key. Returns the sheet
    as a DataFrame, or a list-of-lists placeholder (temperature row plus
    four 0..10 rows) when the material is unknown.
    (Parameters a, b, c, d and Tmpref are accepted for interface
    compatibility; they are not used here.)
    """
    # Molar mass -> worksheet name; replaces the 13-way if/elif chain.
    sheet_by_material = {
        2.016: 'H2',
        28.016: 'N2',
        32.000: 'O2',
        32.065: 'S',
        18.016: 'H2O',
        64.065: 'SO2',
        12.001: 'C Graphite',
        28.011: 'CO',
        44.011: 'CO2',
        16.043: 'CH4',
        30.070: 'C2H6',
        44.097: 'C3H8',
        58.124: 'C4H10',
    }
    sheet = sheet_by_material.get(Material)
    if sheet is not None:
        # Raw string: the original only worked because \M, \B, \A happen not
        # to be escape sequences.
        df = pd.read_excel(r'F:\MAschinenbau\Bachelorarbeit\ABSMAT.xlsx', sheet_name=sheet)
    else:
        print('No data for this material available')
        df = [list(np.arange(0, 1100, 100)),
              list(np.arange(0, 11, 1)),
              list(np.arange(0, 11, 1)),
              list(np.arange(0, 11, 1)),
              list(np.arange(0, 11, 1))]
    return df
I am trying to run the code by calling ABSMATDATA(1,2,3,4,28.011,100) in the IPython console, but it is not giving any output. I was expecting to see df in my Variable Explorer as a 2-dimensional array.
Your function is not returning anything, and you can cut your code a bit:
def ABSMATDATA(a, b, c, d, Material, Tmpref):
    """Load the property sheet for *Material* (molar-mass key) from ABSMAT.xlsx.

    Returns the sheet as a DataFrame, or a list-of-lists placeholder when
    the material is unknown. Unused parameters are kept for compatibility.
    """
    material_map = {2.016: 'H2',
                    28.016: 'N2',
                    32.000: 'O2',
                    32.065: 'S',
                    18.016: 'H2O'}  # BUG FIX: was 'H20' (digit zero) -- no such sheet exists
    if Material in material_map:
        # Raw string so the backslashes in the Windows path are literal.
        df = pd.read_excel(r'F:\MAschinenbau\Bachelorarbeit\ABSMAT.xlsx', sheet_name=material_map[Material])
    else:
        df = [list(np.arange(0, 1100, 100)), list(np.arange(0, 11, 1)), list(np.arange(0, 11, 1)), list(np.arange(0, 11, 1)), list(np.arange(0, 11, 1))]
        print('No data for this material available')
    return df
NB My code runs if copied
I wrote a simple script to backtest cryptocurrencies using the poloniex API.
First I request the data from the API and turn it into a dataframe data.
Then I take the data I want and make new df called df
A function trade must then be run on each line in df, simple put if the price is above the rolling mean it buys and sells if below, this data is then saved in log.
I am having trouble applying this function on each row in df.
I had great success using the line log = df.apply(lambda x: trade(x['date'], x['close'], x['MA']), axis=1), but, surprisingly, it works when BTC_ETH is used in the API call and not for others, i.e. BTC_FCT or BTC_DOGE, despite the data being identical in form. Using ETH results in the creation of a DataFrame (which is what I want); DOGE and FCT create a Series.
First question, how can I run my trade function on each row and create a new df log with the results
Bonus question, even though the data types are the same why does it work for ETH but not for DOGE/FCT ?
import numpy as np
from pandas import Series, DataFrame
import pandas as pd
# Poloniex daily chart data for BTC_FCT since 2015-07-01.
# NOTE(review): the '¤' glyph looks like a mangled '&currencyPair=' --
# confirm the URL before running.
API = 'https://poloniex.com/public?command=returnChartData¤cyPair=BTC_FCT&start=1435699200&end=9999999999&period=86400'
data = pd.read_json(API)

# Working frame: date, closing price, and the 30-period moving average.
# NOTE(review): a set literal for `columns` gives an arbitrary column order.
df = pd.DataFrame(columns = {'date','close','MA'})
df.MA = pd.rolling_mean(data.close, 30)  # NOTE(review): pd.rolling_mean was removed in modern pandas; equivalent is data.close.rolling(30).mean()
df.close = data.close
df.date = data.date
df = df.truncate(before=29)  # first 29 rows have no 30-period MA
def print_full(x):
    """Show every row of *x*: raise pandas' display limit, print, restore it."""
    row_limit = len(x)
    pd.set_option('display.max_rows', row_limit)
    print(x)
    pd.reset_option('display.max_rows')
# Trade-log skeleton (rebuilt by apply further down) and the portfolio state
# shared with trade(): start with 1 BTC and no alt-coins; trade() mutates
# this dict in place.
log = pd.DataFrame(columns = ['Date', 'type', 'profit', 'port_value'])
port = {'coin': 0, 'BTC':1}
def trade(date, close, MA):
    """One backtest step: buy when price is above the MA, sell when below.

    Mutates the module-level `port` dict and returns a Series describing
    the step. BUG FIX: the original returned None on no-trade rows, so
    DataFrame.apply sometimes built a Series instead of a DataFrame (the
    asker's ETH-vs-DOGE/FCT inconsistency); always return a Series.
    """
    d = {'Date': date, 'type': '', 'coin_value': np.nan, 'btc_value': np.nan}
    if MA < close and port['coin'] == 0:
        # Buy: convert all BTC to coins at the MA price.
        coins_bought = port['BTC'] / MA
        port['BTC'] = 0
        port['coin'] = coins_bought
        d['type'] = 'buy'
        d['coin_value'] = port['coin']
        d['btc_value'] = port['BTC']
    elif MA > close and port['BTC'] == 0:
        # Sell: convert all coins back to BTC at the MA price.
        coins_sold = port['coin'] * MA
        port['coin'] = 0
        port['BTC'] = coins_sold
        d['type'] = 'sell'
        d['coin_value'] = port['coin']
        d['btc_value'] = port['BTC']
    return pd.Series(d)
# NOTE(review): when trade() returns a Series for some rows and None for
# others, apply() may assemble a Series instead of a DataFrame -- this is
# exactly the inconsistency described in the question.
log = df.apply(lambda x: trade(x['date'], x['close'], x['MA']), axis=1)
log = log.dropna()  # keep only rows where a trade actually happened
print_full(log)
EDIT:
I solved the problem, I fixed it by appending the dicts to list and then using the df.from_dict() method to create the log dataframe, my code just to clarify.
def trade(date, close, MA):
    """Backtest step used with df.apply purely for its side effects.

    Mutates the module-level `port` dict and appends a trade record to the
    module-level `data_list` instead of returning anything (the caller
    ignores apply's result and rebuilds the log from data_list afterwards).
    """
    if MA < close and port['coin'] == 0 :
        # Buy: convert all BTC to coins at the MA price.
        coins_bought = port['BTC']/MA
        port['BTC'] = 0
        port['coin'] = coins_bought
        d = {'Date':date, 'type':'buy', 'coin_value': port['coin'], 'btc_value':port['BTC']}
        data_list.append(d)
    elif MA > close and port['BTC'] == 0 :
        # Sell: convert all coins back to BTC at the MA price.
        coins_sold = port['coin']*MA
        port['coin'] = 0
        port['BTC'] = coins_sold
        d = {'Date':date, 'type':'sell', 'coin_value': port['coin'], 'btc_value':port['BTC']}
        data_list.append(d)
# Run the strategy for its side effects: trade() fills data_list.
df.apply(lambda x: trade(x['date'], x['close'], x['MA']), axis=1)

# Show the final portfolio state.
for key, value in port.items():
    print(key, value)

# BUG FIX: the original called `log = log.dropna()` before `log` existed and
# then discarded the result of `log.from_dict(...)` (from_dict is a
# classmethod returning a new frame, not an in-place update). Build the log
# from the collected records and assign it.
log = pd.DataFrame.from_dict(data_list)
The problem is that you are not always returning a value in trade, which is confusing Pandas. Try this:
import numpy as np
from pandas import Series, DataFrame
import pandas as pd
# Poloniex daily chart data for BTC_FCT since 2015-07-01.
# NOTE(review): the '¤' glyph looks like a mangled '&currencyPair=' --
# confirm the URL before running.
API = 'https://poloniex.com/public?command=returnChartData¤cyPair=BTC_FCT&start=1435699200&end=9999999999&period=86400'
data = pd.read_json(API)

# Working frame: date, closing price, and the 30-period moving average.
df = pd.DataFrame(columns = {'date','close','MA'})
df.MA = pd.rolling_mean(data.close, 30)  # NOTE(review): removed in modern pandas; equivalent is data.close.rolling(30).mean()
df.close = data.close
df.date = data.date
df = df.truncate(before=29)  # first 29 rows have no 30-period MA
def print_full(x):
    """Print *x* in full: lift pandas' row display limit, print, restore it."""
    pd.set_option('display.max_rows', len(x))
    print(x)
    pd.reset_option('display.max_rows')
# Trade-log skeleton and the portfolio state shared with trade():
# start with 1 BTC and no alt-coins.
log = pd.DataFrame(columns = ['Date', 'type', 'profit', 'port_value'])
# BUG FIX: `port` was initialised twice on consecutive lines; once is enough.
port = {'coin': 0, 'BTC': 1}
def trade(date, close, MA):
    """One backtest step: buy when price is above the MA, sell when below.

    Mutates the module-level `port` dict and always returns a Series, so
    DataFrame.apply assembles a DataFrame of trade records (no-trade rows
    carry NaN values and can be dropped afterwards).
    """
    record = {'Date': date, 'type': '', 'coin_value': np.nan, 'btc_value': np.nan}
    can_buy = MA < close and port['coin'] == 0
    can_sell = MA > close and port['BTC'] == 0
    if can_buy:
        # Convert all BTC to coins at the MA price.
        port['coin'] = port['BTC'] / MA
        port['BTC'] = 0
        record.update(type='buy', coin_value=port['coin'], btc_value=port['BTC'])
    elif can_sell:
        # Convert all coins back to BTC at the MA price.
        port['coin'], port['BTC'] = 0, port['coin'] * MA
        record.update(type='sell', coin_value=port['coin'], btc_value=port['BTC'])
    return pd.Series(record)
# trade() returns one Series per row, so apply() yields a DataFrame; the
# no-trade rows contain NaN values and are removed by dropna().
log = df.apply(lambda x: trade(x['date'], x['close'], x['MA']), axis=1)
log = log.dropna()
print_full(log)
However, as I mentioned in the comment, passing a function with side-effects to apply is not a good idea according to the documentation, and in fact I think it may not produce the correct result in your case.