How to write dynamic code to optimize my script in Python

I have this code and I want to do the same for 3 different dataframes: df1, df2, df3.
The column in df2 is 10_11 and in df3 is 5_6.
df1_30_31 = pd.DataFrame(df1_30_31['30_31'])
df2_10_11 = pd.DataFrame(df2_10_11['10_11'])
df3_5_6 = pd.DataFrame(df3_5_6['5_6'])
df1_30_31['30_31'] = df1_30_31['30_31'].replace(np.nan, ' ', regex=True)
df1_30_31['new_'] = df1_30_31['30_31'].apply(lambda x: '-' if 'nan' in x else x)

def Ok_fun(df):
    if (df['new_'] == 'T-S') or (df['new_'] == 'B-S') or (df['new_'] == 'B-M') or (df['new_'] == 'M-L'):
        val = 'NOK'
    elif (df['new_'] != '-') & (df['new_'] != 'NOK'):
        val = 'OK'
    else:
        val = 'NOO'
    return val

df1_30_31['new_'] = df1_30_31.apply(Ok_fun, axis=1)
df1_30_31['30_31'].replace(' ', np.nan, inplace=True)
df1_30_31.dropna(inplace=True)
Nbr_not_ok_30_31 = df1_30_31[df1_30_31["new_"] == 'NOK']['new_'].count()
Nbr_ok_30_31 = df1_30_31[df1_30_31["new_"] == 'OK']['new_'].count()  # fixed: was ['OK_NOTOK'], a column that does not exist here
total_30_31 = Nbr_not_ok_30_31 + Nbr_ok_30_31
ratio_30_31 = (Nbr_not_ok_30_31 / total_30_31) * 100

Use a function that takes the dataframe and the column name as input and returns the processed dataframe.
def change_df(df, col):
    # ...
    df[col] = df[col].replace(np.nan, ' ', regex=True)
    # ...
    return df

df1 = change_df(df=df1, col='30_31')
df2 = change_df(df=df2, col='10_11')
df3 = change_df(df=df3, col='5_6')
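A fuller sketch of change_df, assuming the per-column logic from the 30_31 code above generalizes unchanged (returning the ratio alongside the processed frame is one possible design, not part of the original answer):
def change_df(df, col):
    # Keep only the target column.
    out = pd.DataFrame(df[col])
    out[col] = out[col].replace(np.nan, ' ', regex=True)
    out['new_'] = out[col].apply(lambda x: '-' if 'nan' in x else x)

    def ok_fun(row):
        # Same classification rules as Ok_fun above.
        if row['new_'] in ('T-S', 'B-S', 'B-M', 'M-L'):
            return 'NOK'
        elif row['new_'] != '-' and row['new_'] != 'NOK':
            return 'OK'
        return 'NOO'

    out['new_'] = out.apply(ok_fun, axis=1)
    out[col] = out[col].replace(' ', np.nan)
    out = out.dropna()

    nbr_nok = (out['new_'] == 'NOK').sum()
    nbr_ok = (out['new_'] == 'OK').sum()
    ratio = nbr_nok / (nbr_nok + nbr_ok) * 100
    return out, ratio

df1_30_31, ratio_30_31 = change_df(df1, '30_31')
df2_10_11, ratio_10_11 = change_df(df2, '10_11')
df3_5_6, ratio_5_6 = change_df(df3, '5_6')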

Pandas how to search one df for a certain date and return that data

I have two data frames and I am trying to search each row by date: for each row in the user.csv file, find the corresponding date in the Raven.csv file, then return the Price from df1 and the date and amount from df2.
This is working, but my Price is returned as a value like [[0.11465]]. Is there a way to remove these brackets, or a better way to do this?
import pandas as pd

df1 = pd.read_csv('Raven.csv')
df2 = pd.read_csv('User.csv')
df1 = df1.reset_index(drop=False)
df1.columns = ['index', 'Date', 'Price']
df2['Timestamp'] = pd.to_datetime(df2['Timestamp'], format="%Y-%m-%d %H:%M:%S").dt.date
df1['Date'] = pd.to_datetime(df1['Date'], format="%Y-%m-%d").dt.date

Looper = 0
Date = []
Price = []
amount = []
total_value = []
for x in df2['Timestamp']:
    search = df2['Timestamp'].values[Looper]
    Date.append(search)
    price = df1.loc[df1['Date'] == search, ['index']]
    value = df1['Price'].values[price]
    Price.append(value)
    payout = df2['Amount'].values[Looper]
    amount.append(payout)
    payout_value = value * payout
    total_value.append(payout_value)
    Looper = Looper + 1

# renamed from `dict` to avoid shadowing the builtin
results = {'Date': Date, 'Price': Price, 'Payout': amount, "Total Value": total_value}
df = pd.DataFrame(results)
df.to_csv('out.csv')
You can index into the nested list to get the value:
value = [[0.11465]][0][0]
print(value)
You get:
0.11465
I hope this is what you need.
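Inside the loop itself, a minimal sketch of the same idea (assuming the df1 built above): select the Price column directly and take the first match as a scalar, so the nested brackets never appear in the first place. The variable name matches is mine, not from the original code.
# Hypothetical replacement for the two-step index/Price lookup in the loop:
matches = df1.loc[df1['Date'] == search, 'Price']
value = matches.iloc[0] if not matches.empty else None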

How to clean up older data for a dictionary of dataframes?

This is a snippet of my code. I need to clean up one day of older data. How do we do that for a dictionary of dataframes?
master_train_dict = {}
for id in list_of_id:
    temp_df = df.loc[df["id"] == id].copy(deep=False)
    temp_df.drop('id', axis=1, inplace=True)
    temp_df.reset_index(drop=True, inplace=True)
    alert_list = list(temp_df["title"])
    train_embedding = get_embeddings(alert_list, model)
    temp_df["train_embedding"] = train_embedding
    master_train_dict[parent_id] = temp_df[["title", "train_embedding", "#timestamp"]]
    #master_train_dict[parent_id] = temp_df

global master_dict
master_dict = master_train_dict
print(master_dict)

#clean up function
if len(master_dict) > 0:
    d = datetime.today() - timedelta(hours=1, minutes=0)
    master_dict = master_dict[id]['#timestamp'] > d.strftime("%Y-%m-%d %H:%M:%S")
    print(master_dict)
Consider working with defined functions and use groupby to build the dictionary of subsetted data frames, then call the functions via dictionary comprehensions.
def build_df(sub_df):
    sub_df.drop('id', axis=1, inplace=True)
    sub_df.reset_index(drop=True, inplace=True)
    alert_list = list(sub_df["title"])
    train_embedding = get_embeddings(alert_list, model)
    sub_df["train_embedding"] = train_embedding
    sub_df = sub_df.reindex(["title", "train_embedding", "#timestamp"], axis="columns")
    return sub_df

master_train_dict = {i: build_df(g) for i, g in df.groupby("id")}
def clean_df(df):
    d = datetime.today() - timedelta(hours=1, minutes=0)
    df = df[df['#timestamp'] > d.strftime("%Y-%m-%d %H:%M:%S")]
    return df

clean_master_train_dict = {k: clean_df(v) for k, v in master_train_dict.items()}
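One aside: the string comparison only works if '#timestamp' holds ISO-formatted strings. If the column is (or can be converted to) real datetimes, comparing datetimes directly is safer; a minimal sketch, assuming the column is parseable by pd.to_datetime:
from datetime import datetime, timedelta
import pandas as pd

def clean_df(df):
    # Compare datetimes directly instead of formatted strings.
    cutoff = datetime.today() - timedelta(hours=1)
    return df[pd.to_datetime(df['#timestamp']) > cutoff]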

Python SQL to pandas DataFrame

I have a SQL query:
pd.read_sql_query("""SELECT UvTab.A, UvTab.Uv,
IFNULL(DvTab.Dv, 0) AS Dv
FROM
(
SELECT A, COUNT(*) AS Uv FROM B
WHERE Vtype = 2 GROUP BY A
) AS UvTab
LEFT JOIN
(
SELECT A, COUNT(*) AS Dv FROM B
WHERE Vtype = 3 GROUP BY A
) AS DvTab
ON UvTab.A = DvTab.A
""", conn)
And my goal is to get the same result but using only pandas' methods. What I obtained is:
UvTab = B.loc[B.Vtype == 2].groupby("A").size()
UvTab = pd.DataFrame({'A' : UvTab.index, 'Uv' : UvTab.values})
DvTab = B.loc[B.Vtype == 3].groupby("A").size()
DvTab = pd.DataFrame({'A' : DvTab.index, 'Dv' : DvTab.values})
df = pd.merge(UvTab, DvTab, how='left', on='A')
df['Dv'] = df['Dv'].fillna(0)
And it seems to be fine. But is this the simplest and best way to represent the query?
One idea is to aggregate the sum of the boolean matches and then use DataFrame.join:
UvTab = (B.Vtype == 2).astype(int).groupby(B["A"]).sum().reset_index(name='Uv')
DvTab = (B.Vtype == 3).astype(int).groupby(B["A"]).sum().to_frame('Dv')
df = UvTab.join(DvTab, on='A').fillna({'Dv': 0})
Or an alternative with merge:
UvTab = (B.Vtype == 2).astype(int).groupby(B["A"]).sum().reset_index(name='Uv')
DvTab = (B.Vtype == 3).astype(int).groupby(B["A"]).sum().reset_index(name='Dv')
df = UvTab.merge(DvTab, on='A', how='left').fillna({'Dv': 0})
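A quick sanity check with toy data (the values are made up for illustration). Note one difference from the SQL subqueries: the aggregation keeps groups whose count is zero, which only matters if a value of A has no Vtype = 2 rows at all.
import pandas as pd

B = pd.DataFrame({
    'A':     [1, 1, 2, 2, 3],
    'Vtype': [2, 2, 2, 3, 3],
})

UvTab = (B.Vtype == 2).astype(int).groupby(B["A"]).sum().reset_index(name='Uv')
DvTab = (B.Vtype == 3).astype(int).groupby(B["A"]).sum().reset_index(name='Dv')
print(UvTab.merge(DvTab, on='A', how='left').fillna({'Dv': 0}))
#    A  Uv  Dv
# 0  1   2   0
# 1  2   1   1
# 2  3   0   1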

How to improve my pandas efficiency when there are many selections

I have a big dataframe which has two million rows. There are 60000 unique (store_id, product_id) pairs.
I need to select by each (store_id, product_id) pair, do some calculations such as resampling to H, sum, and average, and finally concat everything into a new dataframe.
The problem is that it is very, very slow, and becomes slower while running.
The main code is:
def process_df(df, func, *args, **kwargs):
    '''
    '''
    product_ids = df.product_id.unique()
    store_ids = df.store_id.unique()
    # uk = df.drop_duplicates(subset=['store_id','product_id'])
    # for idx, item in uk.iterrows():
    all_df = list()
    i = 1
    with tqdm(total=product_ids.shape[0]*store_ids.shape[0]) as t:
        for store_id in store_ids:
            sdf = df.loc[df['store_id'] == store_id]
            for product_id in product_ids:
                new_df = sdf.loc[sdf['product_id'] == product_id]
                if new_df.shape[0] < 14:
                    continue
                new_df = func(new_df, *args, **kwargs)
                new_df.loc[:, 'store_id'] = store_id
                new_df.loc[:, 'product_id'] = product_id
                all_df.append(new_df)
                t.update()
    all_df = pd.concat(all_df)
    return all_df
def process_order_items(df, store_id=None, product_id=None, freq='D'):
    if store_id and "store_id" in df.columns:
        df = df.loc[df['store_id'] == store_id]
    if product_id and "product_id" in df.columns:
        df = df.loc[df['product_id'] == product_id]
    # convert to datetime
    df.loc[:, "datetime_create"] = pd.to_datetime(df.time_create, unit='ms').dt.tz_localize('UTC').dt.tz_convert('Asia/Shanghai').dt.tz_localize(None)
    df = df[["price", "count", "fee_total", "fee_real", "price_real", "price_guide", "price_change_category", "datetime_create"]]
    df.loc[:, "has_discount"] = (df.price_change_category > 0).astype(int)
    df.loc[:, "clearance"] = df.price_change_category.apply(lambda x: x in (10, 20, 23)).astype(int)
    if not freq:
        df.loc[:, "date_create"] = df["datetime_create"]
    else:
        assert freq in ('D', 'H')
        df.index = df.loc[:, "datetime_create"]
        discount_order_count = df['has_discount'].resample(freq).sum()
        clearance_order_count = df['clearance'].resample(freq).sum()
        discount_sale_count = df.loc[df.has_discount > 0, 'count'].resample(freq).sum()
        clearance_sale_count = df.loc[df.clearance > 0, 'count'].resample(freq).sum()
        no_discount_price = df.loc[df.has_discount == 0, 'price'].resample(freq).sum()
        no_clearance_price = df.loc[df.clearance == 0, 'price'].resample(freq).sum()
        order_count = df['count'].resample(freq).count()
        day_count = df['count'].resample(freq).sum()
        price_guide = df['price_guide'].resample(freq).max()
        price_avg = (df['price'] * df['count']).resample(freq).sum() / day_count
        df = pd.DataFrame({
            "price": price_avg,
            "price_guide": price_guide,
            "sale_count": day_count,
            "order_count": order_count,
            "discount_order_count": discount_order_count,
            "clearance_order_count": clearance_order_count,
            "discount_sale_count": discount_sale_count,
            "clearance_sale_count": clearance_sale_count,
        })
        df = df.drop(df[df.order_count == 0].index)
    return df
I think the problem is that there are too many redundant selections.
Maybe I could use groupby(['store_id','product_id']).agg to avoid the redundancy, but I have no idea how to use process_order_items with it and merge the results together.
I think you can change:
df.loc[:,"clearance"] = df.price_change_category.apply(lambda x:x in(10, 20, 23)).astype(int)
to Series.isin:
df["clearance"] = df.price_change_category.isin([10, 20, 23]).astype(int)
There is also a solution with Resampler.aggregate:
d = {'has_discount': 'sum',
     'clearance': 'sum',
     'count': ['count', 'sum'],
     'price_guide': 'max'}
df1 = df.resample(freq).agg(d)
df1.columns = df1.columns.map('_'.join)
# keys fixed to match the flattened column names produced by agg above
d1 = {'has_discount_sum': 'discount_order_count',
      'clearance_sum': 'clearance_order_count',
      'count_count': 'order_count',
      'count_sum': 'day_count',
      'price_guide_max': 'price_guide'}
df1 = df1.rename(columns=d1)
Another idea is not to convert the boolean masks to integers, but to use the boolean columns directly for filtering:
df["has_discount"] = df.price_change_category > 0
df["clearance"] = df.price_change_category.isin([10, 20, 23])
discount_sale_count = df.loc[df.has_discount, 'count'].resample(freq).sum()
clearance_sale_count = df.loc[df.clearance, 'count'].resample(freq).sum()
# for filtering == 0, invert the boolean mask columns with ~
no_discount_price = df.loc[~df.has_discount, 'price'].resample(freq).sum()
no_clearance_price = df.loc[~df.clearance, 'price'].resample(freq).sum()
The first function should be simplified with GroupBy.apply instead of loops; then concat is not necessary:
def f(x):
    print(x)

df = df.groupby(['product_id', 'store_id']).apply(f)
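A sketch of how the pieces could be wired together (assuming the process_order_items above; per_group is my name, and the None return for small groups relies on GroupBy.apply excluding groups whose function returns None):
def per_group(g):
    # Same minimum-size guard as the manual loops in process_df.
    if g.shape[0] < 14:
        return None
    return process_order_items(g, freq='H')

result = df.groupby(['store_id', 'product_id']).apply(per_group)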

How to run a function on each row in DataFrame and append the result to a new DataFrame

NB My code runs if copied
I wrote a simple script to backtest cryptocurrencies using the poloniex API.
First I request the data from the API and turn it into a dataframe data.
Then I take the data I want and make a new df called df.
A function trade must then be run on each line in df; simply put, if the price is above the rolling mean it buys, and it sells if below. This data is then saved in log.
I am having trouble applying this function to each row in df.
I had great success using the line log = df.apply(lambda x: trade(x['date'], x['close'], x['MA']), axis=1), BUT surprisingly it works when BTC_ETH is used in the API call and not for others, i.e. BTC_FCT or BTC_DOGE, despite the data being identical in form. Using ETH results in the creation of a DataFrame (which is what I want); DOGE and FCT create a Series.
First question: how can I run my trade function on each row and create a new df log with the results?
Bonus question: even though the data types are the same, why does it work for ETH but not for DOGE/FCT?
import numpy as np
from pandas import Series, DataFrame
import pandas as pd

API = 'https://poloniex.com/public?command=returnChartData&currencyPair=BTC_FCT&start=1435699200&end=9999999999&period=86400'
data = pd.read_json(API)

df = pd.DataFrame(columns={'date', 'close', 'MA'})
df.MA = pd.rolling_mean(data.close, 30)  # note: removed in newer pandas; use data.close.rolling(30).mean()
df.close = data.close
df.date = data.date
df = df.truncate(before=29)

def print_full(x):
    pd.set_option('display.max_rows', len(x))
    print(x)
    pd.reset_option('display.max_rows')

log = pd.DataFrame(columns=['Date', 'type', 'profit', 'port_value'])
port = {'coin': 0, 'BTC': 1}

def trade(date, close, MA):
    if MA < close and port['coin'] == 0:
        coins_bought = port['BTC'] / MA
        port['BTC'] = 0
        port['coin'] = coins_bought
        d = {'Date': date, 'type': 'buy', 'coin_value': port['coin'], 'btc_value': port['BTC']}
        return pd.Series(d)
    elif MA > close and port['BTC'] == 0:
        coins_sold = port['coin'] * MA
        port['coin'] = 0
        port['BTC'] = coins_sold
        d = {'Date': date, 'type': 'sell', 'coin_value': port['coin'], 'btc_value': port['BTC']}
        return pd.Series(d)

log = df.apply(lambda x: trade(x['date'], x['close'], x['MA']), axis=1)
log = log.dropna()
print_full(log)
EDIT:
I solved the problem by appending the dicts to a list and then using the DataFrame.from_dict() method to create the log dataframe. My code, just to clarify:
data_list = []  # collected trade dicts (defined before trade() is applied)

def trade(date, close, MA):  #, port):
    #d = {'Data': close}
    #test_log = test_log.append(d, ignore_index=True)
    if MA < close and port['coin'] == 0:
        coins_bought = port['BTC'] / MA
        port['BTC'] = 0
        port['coin'] = coins_bought
        d = {'Date': date, 'type': 'buy', 'coin_value': port['coin'], 'btc_value': port['BTC']}
        data_list.append(d)
        #return pd.Series(d)
    elif MA > close and port['BTC'] == 0:
        coins_sold = port['coin'] * MA
        port['coin'] = 0
        port['BTC'] = coins_sold
        d = {'Date': date, 'type': 'sell', 'coin_value': port['coin'], 'btc_value': port['BTC']}
        data_list.append(d)
        #return pd.Series(d)

df.apply(lambda x: trade(x['date'], x['close'], x['MA']), axis=1)
for key, value in port.items():
    print(key, value)
log = pd.DataFrame.from_dict(data_list)  # fixed: from_dict must be assigned to build log
The problem is that you are not always returning a value in trade, which is confusing Pandas. Try this:
import numpy as np
from pandas import Series, DataFrame
import pandas as pd
API = 'https://poloniex.com/public?command=returnChartData&currencyPair=BTC_FCT&start=1435699200&end=9999999999&period=86400'
data = pd.read_json(API)
df = pd.DataFrame(columns = {'date','close','MA'})
df.MA = pd.rolling_mean(data.close, 30)
df.close = data.close
df.date = data.date
df = df.truncate(before=29)
def print_full(x):
    pd.set_option('display.max_rows', len(x))
    print(x)
    pd.reset_option('display.max_rows')
log = pd.DataFrame(columns = ['Date', 'type', 'profit', 'port_value'])
port = {'coin': 0, 'BTC': 1}
def trade(date, close, MA):
    d = {'Date': date, 'type': '', 'coin_value': np.nan, 'btc_value': np.nan}
    if MA < close and port['coin'] == 0:
        coins_bought = port['BTC'] / MA
        port['BTC'] = 0
        port['coin'] = coins_bought
        d['type'] = 'buy'
        d['coin_value'] = port['coin']
        d['btc_value'] = port['BTC']
    elif MA > close and port['BTC'] == 0:
        coins_sold = port['coin'] * MA
        port['coin'] = 0
        port['BTC'] = coins_sold
        d['type'] = 'sell'
        d['coin_value'] = port['coin']
        d['btc_value'] = port['BTC']
    return pd.Series(d)
log = df.apply(lambda x: trade(x['date'], x['close'], x['MA']), axis=1)
log = log.dropna()
print_full(log)
However, as I mentioned in the comment, passing a function with side-effects to apply is not a good idea according to the documentation, and in fact I think it may not produce the correct result in your case.
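For completeness, a side-effect-free sketch of the same backtest (my suggestion, not part of the original answer): iterate the rows with itertuples and build log from the returned results, so nothing passed to pandas mutates external state.
# Assumes the asker's original trade() that returns a Series on a buy/sell
# and None otherwise. A plain loop keeps the side effects out of apply.
records = []
for row in df.itertuples(index=False):
    result = trade(row.date, row.close, row.MA)
    if result is not None:
        records.append(result)
log = pd.DataFrame(records)
print_full(log)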
