Multiplying columns using a for-loop and pd.DataFrame - Python

I'm having trouble creating a new data table that will show annual energy use data. Basically, I'd like to multiply energy by different factors to show annual energy use.
The code is below.
# calculate energy amounts
energy_use_by_fuel = pd.DataFrame()
for hhid in energy_data.hhid.unique():
    tempdtf = pd.DataFrame({
        'hhid': hhid,
        'monthly_electricity': energy_data.loc[energy_data.hhid == hhid, 'estimated_kwh_monthly'] * 3,
        'monthly_gas': energy_data.loc[energy_data.hhid == hhid, 'monthly_gas_use_kg'] * 4,
        'monthly_charcoal': energy_data.loc[energy_data.hhid == hhid, 'monthly_charcoal_use_kg'] * 5})

    # join
    tempdtf = energy_use_by_fuel.append(tempdtf, ignore_index=True)
As you can see, I'd like to calculate different energy uses for electricity, gas and charcoal. But when I multiply the data by the numbers, the resulting dataframe energy_use_by_fuel is empty.

DataFrame.append() returns a new object with the appended rows rather than modifying the DataFrame in place, so your code is assigning the result to the wrong variable. Assign it back to energy_use_by_fuel instead:
# join
energy_use_by_fuel = energy_use_by_fuel.append(tempdtf, ignore_index=True)
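As a side note, DataFrame.append is deprecated in recent pandas versions (and removed in pandas 2.0), and the loop is not really needed here: the multiplications are plain column-wise operations, so you can build the whole result in one shot. A minimal sketch, assuming the column names from your question:

import pandas as pd

# Vectorized version: no per-household loop, no append
energy_use_by_fuel = pd.DataFrame({
    'hhid': energy_data['hhid'],
    'monthly_electricity': energy_data['estimated_kwh_monthly'] * 3,
    'monthly_gas': energy_data['monthly_gas_use_kg'] * 4,
    'monthly_charcoal': energy_data['monthly_charcoal_use_kg'] * 5,
})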

Related

I want to speed up a for loop in Python and use less memory

Hi, below is the function I am using to calculate the quantiles (25/50/75) and the mean for each column.
def newsummary(final_per, grp_lvl, col):
    new_col1 = '_'.join([j] + grp_lvl + ['25%'])
    new_col2 = '_'.join([j] + grp_lvl + ['50%'])
    new_col3 = '_'.join([j] + grp_lvl + ['75%'])
    new_col4 = '_'.join([j] + grp_lvl + ['mean'])

    final_per1 = pd.DataFrame()
    final_per1 = final_per.groupby(grp_lvl)[j].quantile(0.25).reset_index()
    final_per1.rename(columns={j: new_col1}, inplace=True)
    final_per2[new_col1] = final_per1[new_col1].copy()

    final_per1 = final_per.groupby(grp_lvl)[j].quantile(0.5).reset_index()
    final_per1.rename(columns={j: new_col2}, inplace=True)
    final_per2[new_col2] = final_per1[new_col2].copy()

    final_per1 = final_per.groupby(grp_lvl)[j].quantile(0.75).reset_index()
    final_per1.rename(columns={j: new_col3}, inplace=True)
    final_per2[new_col3] = final_per1[new_col3].copy()

    final_per1 = final_per.groupby(grp_lvl)[j].mean().reset_index()
    final_per1.rename(columns={j: new_col4}, inplace=True)
    final_per2[new_col4] = final_per1[new_col4].copy()
    return final_per2
Calling the function:
grp_lvl = ['ZIP_CODE', 'year']
for j in list_col:  # approximately 1400 columns to iterate
    per = newsummary(final_per, grp_lvl, j)
I want to find the quantiles (25/50/75) and mean for each column and retain those columns in a new dataframe. I have to do this for around 1400 columns.
The dataframes are pandas DataFrames.
While executing this loop, the .copy() call causes performance issues. Are there any alternative ways to reduce the performance issues and avoid the out-of-memory error?
Your help and suggestions are appreciated.
Note: I am using an Azure Databricks cluster to execute this.
Since you are creating four new columns for each column (the 0.25, 0.5, and 0.75 quantiles and the mean of the grouped data) with pandas DataFrames, the approach you are already following might be the better choice.
Grouped data in PySpark requires an aggregate function, and there is no built-in aggregate function for calculating quantiles.
There is no need to use copy or to return a value from the function, so modify your code as below:
import pandas as pd

final_per2 = pd.DataFrame()

def newsummary(final_per, grp_lvl, col):
    new_col1 = '_'.join([j] + grp_lvl + ['25%'])
    new_col2 = '_'.join([j] + grp_lvl + ['50%'])
    new_col3 = '_'.join([j] + grp_lvl + ['75%'])
    new_col4 = '_'.join([j] + grp_lvl + ['mean'])

    final_per1 = pd.DataFrame()
    final_per1 = final_per.groupby(grp_lvl)[j].quantile(0.25).reset_index()
    final_per1.rename(columns={j: new_col1}, inplace=True)
    final_per2[new_col1] = final_per1[new_col1]

    final_per1 = final_per.groupby(grp_lvl)[j].quantile(0.5).reset_index()
    final_per1.rename(columns={j: new_col2}, inplace=True)
    final_per2[new_col2] = final_per1[new_col2]

    final_per1 = final_per.groupby(grp_lvl)[j].quantile(0.75).reset_index()
    final_per1.rename(columns={j: new_col3}, inplace=True)
    final_per2[new_col3] = final_per1[new_col3]

    final_per1 = final_per.groupby(grp_lvl)[j].mean().reset_index()
    final_per1.rename(columns={j: new_col4}, inplace=True)
    final_per2[new_col4] = final_per1[new_col4]

for j in cols:  # approximately 1400 columns to iterate
    newsummary(pdf, grp_lvl, j)

final_per2
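If the per-column loop is still too slow or memory-hungry, an alternative worth trying is to let a single groupby pass compute all four statistics for every column at once and then flatten the column index. This is only a sketch (not tested on your data); final_per, grp_lvl, and list_col are the names from your post:

import pandas as pd

def q25(s): return s.quantile(0.25)
def q50(s): return s.quantile(0.50)
def q75(s): return s.quantile(0.75)

grp_lvl = ['ZIP_CODE', 'year']

# One groupby pass computes the three quantiles and the mean for every column
summary = final_per.groupby(grp_lvl)[list_col].agg([q25, q50, q75, 'mean'])

# Flatten the (column, statistic) MultiIndex into names like "COL_ZIP_CODE_year_25%"
stat_names = {'q25': '25%', 'q50': '50%', 'q75': '75%', 'mean': 'mean'}
summary.columns = ['_'.join([col] + grp_lvl + [stat_names[stat]])
                   for col, stat in summary.columns]
summary = summary.reset_index()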

Trying to figure out a more Pythonic way to write code that is still slow after using list comprehensions and pandas

I have two dataframes: one comprising a large data set, allprice_df, with time price series for all stocks; and the other, init_df, comprising selective stocks and trade entry dates. I am trying to find the highest price for each ticker symbol and its associated date.
The following code works but it is time consuming, and I am wondering if there is a better, more Pythonic way to accomplish this.
# Initial call
init_df = init_df.assign(HighestHigh=lambda x:
    highestHigh(x['DateIdentified'], x['Ticker'], allprice_df))

# HighestHigh function in lambda call
def highestHigh(date1, ticker, allp_df):
    if date1.size == ticker.size:
        temp_df = pd.DataFrame(columns=['DateIdentified', 'Ticker'])
        temp_df['DateIdentified'] = date1
        temp_df['Ticker'] = ticker
    else:
        print("dates and tickers size mismatching")
        sys.exit(1)
    counter = itertools.count(0)
    high_list = [getHigh(x, y, allp_df, next(counter))
                 for x, y in zip(temp_df['DateIdentified'], temp_df['Ticker'])]
    return high_list

# Getting high for each ticker
def getHigh(dateidentified, ticker, allp_df, count):
    print("trade %s" % count)
    currDate = datetime.datetime.now().date()
    allpm_df = allp_df.loc[((allp_df['Ticker'] == ticker)
                            & (allp_df['date'] > dateidentified)
                            & (allp_df['date'] <= currDate)), ['high', 'date']]
    hh = allpm_df.iloc[:, 0].max()
    hd = allpm_df.loc[(allpm_df['high'] == hh), 'date']
    hh = round(hh, 2)
    h_list = [hh, hd]
    return h_list

# Split the list into 2 columns, one with the price and the other with the corresponding date
init_df = split_columns(init_df, "HighestHigh")

# The function to split the list elements into different columns
def split_columns(orig_df, col):
    split_df = pd.DataFrame(orig_df[col].tolist(), columns=[col + "Mod", col + "Date"])
    split_df[col + "Date"] = split_df[col + "Date"].apply(lambda x: x.squeeze())
    orig_df = pd.concat([orig_df, split_df], axis=1)
    orig_df = orig_df.drop(col, axis=1)
    orig_df = orig_df.rename(columns={col + "Mod": col})
    return orig_df
There are a couple of obvious solutions that would help reduce your runtime.
First, in your getHigh function, instead of using loc with an equality comparison to find the date associated with the maximum value of high, use idxmax to get the index label of the row containing the maximum and read both values from that row directly:
hh, hd = allpm_df.loc[allpm_df['high'].idxmax(), ['high', 'date']]
This will replace two O(N) operations (finding the maximum in a list, and doing a list lookup using a comparison) with one O(N) operation and one O(1) operation.
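As a tiny illustration on toy data (the names here are made up for the example, not taken from the asker's frame):

import pandas as pd

prices = pd.DataFrame({
    'high': [10.2, 11.5, 11.1],
    'date': pd.to_datetime(['2022-01-03', '2022-01-04', '2022-01-05']),
})

# idxmax finds the row label of the maximum in one pass,
# then .loc reads both values from that row directly
best = prices.loc[prices['high'].idxmax()]
print(best['high'], best['date'])   # 11.5 2022-01-04 00:00:00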
Edit
In light of your information on the size of your dataframes, my best guess is that this line is probably where most of your time is being consumed:
allpm_df = allp_df.loc[((allp_df['Ticker']==ticker)&(allp_df['date']>dateidentified)&(allp_df['date']<=currDate)),['high','date']]
In order to make this faster, I would set up your dataframe with a multi-index when you first create it, and sort that index:
index = pd.MultiIndex.from_arrays(arrays=[ticker_symbols, dates], names=['Symbol', 'Date'])
allp_df = pd.DataFrame(data, index=index)
allp_df = allp_df.sort_index()
This should create a dataframe with a sorted, multi-level index associated with your ticker symbol and date. Doing this will reduce your search time tremendously. Once you do that, you should be able to access all the data associated with a ticker symbol and a given date-range by doing this:
allp_df.loc[(ticker, dateidentified):(ticker, currDate)]
which should return your data much more quickly. For more information on multi-indexing, check out this helpful Pandas tutorial.
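For illustration, here is a minimal sketch of that pattern on a toy frame (the column and level names are assumptions chosen to mirror the suggestion above, not the asker's actual data):

import pandas as pd

# Toy price data indexed by (Symbol, Date)
data = pd.DataFrame({
    'Symbol': ['AAPL', 'AAPL', 'AAPL', 'MSFT', 'MSFT'],
    'Date': pd.to_datetime(['2022-01-03', '2022-01-04', '2022-01-05',
                            '2022-01-03', '2022-01-04']),
    'high': [182.9, 183.0, 180.2, 338.0, 335.4],
})
allp_df = data.set_index(['Symbol', 'Date']).sort_index()

# Slice one ticker over a date range directly from the sorted index
start, end = pd.Timestamp('2022-01-03'), pd.Timestamp('2022-01-05')
window = allp_df.loc[('AAPL', start):('AAPL', end)]

# Highest high and its date via idxmax (one scan plus one label lookup)
hh_idx = window['high'].idxmax()          # (symbol, date) tuple
hh = window.loc[hh_idx, 'high']
print(hh_idx, hh)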

Grouped Time Series forecasting with scikit-hts

I am trying to forecast sales for multiple time series I took from kaggle's Store item demand forecasting challenge. It consists of a long format time series for 10 stores and 50 items resulting in 500 time series stacked on top of each other. And for each store and each item, I have 5 years of daily records with weekly and annual seasonalities.
In total there are: 365.2 days * 5 years * 10 stores * 50 items = 913,000 records.
From my understanding based on what I've read so far on Hierarchical and Grouped time series, the whole dataframe could be structured as a Grouped Time Series and not simply as a strict Hierarchical Time Series as aggregation could be done at the store or item levels interchangeably.
I want to find a way to forecast all 500 time series (for store1_item1, store1_item2,..., store10_item50) for the next year (from 01-jan-2015 to 31-dec-2015) using the scikit-hts library and its AutoArimaModel function which is a wrapper function of pmdarima's AutoArima function.
To handle the two levels of seasonality, I added Fourier terms as exogenous features to deal with the annual seasonality while auto_arima deals with the weekly seasonality.
My problem is that I get an error during the prediction step. Here's the error message:
ValueError: Provided exogenous values are not of the appropriate shape. Required (365, 4), got (365, 8).
I assume something is wrong with the exogenous dictionary but I do not know how to solve the issue as I'm using scikit-hts for the first time. To do this, I followed the official documentation of scikit-hts here.
EDIT:
I had not seen that a similar bug was reported on GitHub. After applying the proposed fix locally, I was able to get some results. However, even though the code now runs without error, some of the forecasts are negative, as raised in the comments below this post, and the positive ones are disproportionately large.
Here are the plots for all the combinations of store and item. You can see that this seems to work for only one combination.
df.loc['2014','store_1_item_1'].plot()
predictions.loc['2015','store_1_item_1'].plot()
df.loc['2014','store_1_item_2'].plot()
predictions.loc['2015','store_1_item_2'].plot()
df.loc['2014','store_2_item_1'].plot()
predictions.loc['2015','store_2_item_1'].plot()
df.loc['2014','store_2_item_2'].plot()
predictions.loc['2015','store_2_item_2'].plot()
Complete code:
# imports
import pandas as pd
from pmdarima.preprocessing import FourierFeaturizer
import hts
from hts.hierarchy import HierarchyTree
from hts.model import AutoArimaModel
from hts import HTSRegressor
# read data from the csv file
data = pd.read_csv('train.csv', index_col='date', parse_dates=True)
# Train/Test split with reduced size
train_data = data.query('store == [1,2] and item == [1, 2]').loc['2013':'2014']
test_data = data.query('store == [1,2] and item == [1, 2]').loc['2015']
# Create the stores time series
# For each timestamp group by store and apply sum
stores_ts = train_data.drop(columns=['item']).groupby(['date','store']).sum()
stores_ts = stores_ts.unstack('store')
stores_ts.columns = stores_ts.columns.droplevel(0)
stores_ts.columns = ['store_' + str(i) for i in stores_ts.columns]
# Create the items time series
# For each timestamp group by item and apply sum
items_ts = train_data.drop(columns=['store']).groupby(['date','item']).sum()
items_ts = items_ts.unstack('item')
items_ts.columns = items_ts.columns.droplevel(0)
items_ts.columns = ['item_' + str(i) for i in items_ts.columns]
# Create the stores_items time series
# For each timestamp group by store AND by item and apply sum
store_item_ts = train_data.pivot_table(index= 'date', columns=['store', 'item'], aggfunc='sum')
store_item_ts.columns = store_item_ts.columns.droplevel(0)
# Rename the columns as store_i_item_j
col_names = []
for i in store_item_ts.columns:
    col_name = 'store_' + str(i[0]) + '_item_' + str(i[1])
    col_names.append(col_name)
store_item_ts.columns = store_item_ts.columns.droplevel(0)
store_item_ts.columns = col_names
# Create a new dataframe and add the root level of the hierarchy as the sum of all stores (or all items)
df = pd.DataFrame()
df['total'] = stores_ts.sum(1)
# Concatenate all created dataframes into one df
# df is the dataframe that will be used for model training
df = pd.concat([df, stores_ts, items_ts, store_item_ts], axis=1)
# Build fourier terms for train and test sets
four_terms = FourierFeaturizer(365.2, 1)
# Build the exogenous features dataframe for training data
exog_train_df = pd.DataFrame()
for i in range(1, 3):
    for j in range(1, 3):
        _, exog = four_terms.fit_transform(train_data.query(f'store == {i} and item == {j}').sales)
        exog.columns = [f'store_{i}_item_{j}_' + x for x in exog.columns]
        exog_train_df = pd.concat([exog_train_df, exog], axis=1)
exog_train_df['date'] = df.index
exog_train_df.set_index('date', inplace=True)
# add the exogenous features dataframe to df before training
df = pd.concat([df, exog_train_df], axis= 1)
# Build the exogenous features dataframe for test set
# It will be used only when using model.predict()
exog_test_df = pd.DataFrame()
for i in range(1, 3):
    for j in range(1, 3):
        _, exog_test = four_terms.fit_transform(test_data.query(f'store == {i} and item == {j}').sales)
        exog_test.columns = [f'store_{i}_item_{j}_' + x for x in exog_test.columns]
        exog_test_df = pd.concat([exog_test_df, exog_test], axis=1)
# Build the hierarchy of the Grouped Time Series
stores = [i for i in stores_ts.columns]
items = [i for i in items_ts.columns]
store_items = col_names
# Exogenous features mapping
exog_store_items = {e: [v for v in exog_train_df.columns if v.startswith(e)] for e in store_items}
exog_stores = {e:[v for v in exog_train_df.columns if v.startswith(e)] for e in stores}
exog_items = {e:[v for v in exog_train_df.columns if v.find(e) != -1] for e in items}
exog_total = {'total':[v for v in exog_train_df.columns if v.find('FOURIER') != -1]}
# Merge all dictionaries
exog_to_merge = [exog_store_items, exog_stores, exog_items, exog_total]
exogenous = {k:v for x in exog_to_merge for k,v in x.items()}
# Build hierarchy
total = {'total': stores + items}
store_h = {k: [v for v in store_items if v.startswith(k)] for k in stores}
hierarchy = {**total, **store_h}
# Hierarchy tree automatically created by hts
ht = HierarchyTree.from_nodes(nodes=hierarchy, df=df, exogenous=exogenous)
# Instantiate the auto arima model using HTSRegressor
autoarima = HTSRegressor(model='auto_arima', D=1, m=7, seasonal=True, revision_method='OLS', n_jobs=12)
# Fit the model to the training df that includes time series and exog_train_df
# Set exogenous param to the previously built dictionary
model = autoarima.fit(df, hierarchy, exogenous=exogenous)
# Make predictions
# Set the exogenous_df param
predictions = model.predict(exogenous_df=exog_test_df, steps_ahead=365)
Other approaches I thought of, and that I have already implemented successfully for a single series (store 1, item 1, for example):
TBATS applied to each series independently inside a loop across all 500 time series
auto_arima (SARIMAX) with exogenous features (=Fourier terms to deal with the weekly and annual seasonalities) for each series independently + a loop across all 500 time series
What do you think of these approaches? Do you have other suggestions on how to scale ARIMA to multiple time series?
I also want to try LSTM but I'm new to data science and deep learning and do not know how to prepare the data. Should I keep the data in their original form (long format) and apply one hot encoding to train_data['store'] and train_data['item'] columns or should I start with the df I ended up with here?
I hope this helps you fix the issue with the exogenous regressors. To handle the negative forecasts, I would suggest trying a square-root transformation.
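In case it helps, here is a minimal sketch of what that transformation could look like around your existing fit/predict calls. The names df, hierarchy, exogenous, exog_test_df, and autoarima are taken from your code, and the 'FOURIER' filter relies on the column names produced by FourierFeaturizer in your script; whether this plays well with scikit-hts's reconciliation step is something you would need to verify:

import numpy as np

# Fit on square-root-transformed series so the model works on a scale
# where values cannot dip below zero as easily
df_sqrt = df.copy()
value_cols = [c for c in df.columns if 'FOURIER' not in c]  # leave exogenous terms untouched
df_sqrt[value_cols] = np.sqrt(df_sqrt[value_cols])

model = autoarima.fit(df_sqrt, hierarchy, exogenous=exogenous)
predictions_sqrt = model.predict(exogenous_df=exog_test_df, steps_ahead=365)

# Back-transform: square the forecasts to return to the original scale
# (apply this only to the forecast columns if the result also carries exogenous columns)
predictions = predictions_sqrt ** 2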

build indicator fractals with pandas

My DataFrame looks like this:
<DATE>,<TIME>,<PRICE>
20200702,110000,207.2400000
20200702,120000,207.4400000
20200702,130000,208.2400000
20200702,140000,208.8200000
20200702,150000,208.0700000
20200702,160000,208.8100000
20200702,170000,209.4300000
20200702,180000,208.8700000
20200702,190000,210.0000000
20200702,200000,209.6900000
20200702,210000,209.8700000
20200702,220000,209.8000000
20200702,230000,209.5900000
20200703,000000,209.6000000
20200703,110000,211.1800000
20200703,120000,209.3900000
20200703,130000,209.6400000
I want to add here 2 another boolean columns called 'Up Fractal' and 'Down Fractal'.
It is the stock market indicator Fractals, with period 5.
It means:
The script runs from the first row to the last.
It takes the current row and looks at PRICE.
It takes the 5 previous rows and the 5 next rows.
If the PRICE of the current row is the maximum of that window, it is an 'Up Fractal' (True in the 'Up Fractal' column).
If the PRICE of the current row is the minimum of that window, it is a 'Down Fractal' (True in the 'Down Fractal' column).
On stock market chart it looks something like this (this is an example from internet, not about my DataFrame)
It is easy for me to find fractals using standard Python methods, but I need the speed of pandas.
Please help me; I am very new to the pandas library.
from binance.spot import Spot
import pandas as pd
from pandas import DataFrame
import numpy as np

if __name__ == '__main__':
    cl = Spot()
    r = cl.klines("BTCUSDT", "5m", limit="100")
    df = DataFrame(r).iloc[:, :6]
    df.columns = list("tohlcv")

    # number of rows on each side used to calculate a fractal
    n = 10
    df = df.astype({'t': int})
    df = df.astype({'o': float})
    df = df.astype({'h': float})
    df = df.astype({'l': float})
    df = df.astype({'c': float})

    # the first way
    df['uf'] = (df['h'] == df['h'].rolling(n + n + 1, center=True).max())
    df['df'] = (df['l'] == df['l'].rolling(n + n + 1, center=True).min())

    # the second way
    df['upfractal'] = np.where(df['h'] == df['h'].rolling(n + n + 1, center=True).max(), True, False)
    df['downfractal'] = np.where(df['l'] == df['l'].rolling(n + n + 1, center=True).min(), True, False)

    print(df)
    df.to_csv('BTC_USD.csv')
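For reference, the same centered-rolling idea applied to the DataFrame from the question (a single price column, period 5) would look roughly like this. The column name PRICE and the 'Up Fractal' / 'Down Fractal' names are taken from the question (if the CSV header is literally <PRICE>, adjust the name accordingly), and n = 5 matches the stated period:

import pandas as pd

n = 5                 # 5 rows before and 5 rows after the current row
window = 2 * n + 1    # 11-row centered window

# True where the current PRICE is the max (min) of the centered window
df['Up Fractal'] = df['PRICE'] == df['PRICE'].rolling(window, center=True).max()
df['Down Fractal'] = df['PRICE'] == df['PRICE'].rolling(window, center=True).min()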

Output unique values from a pandas dataframe without reordering the output

I know that a few posts have been made regarding how to output the unique values of a dataframe without reordering the data.
I have tried many times to implement these methods, however, I believe that the problem relates to how the dataframe in question has been defined.
Basically, I want to look into the dataframe named "C", and output the unique values into a new dataframe named "C1", without changing the order in which they are stored at the moment.
The line that I use currently is:
C1 = pd.DataFrame(np.unique(C))
However, this returns an ascending order list (while, I simply want the list order preserved only with duplicates removed).
Once again, I apologise to the advanced users who will look at my code and shake their heads -- I'm still learning! And, yes, I have tried numerous methods to solve this problem (redefining the C dataframe, converting the output to a list, etc.), to no avail unfortunately, so this is my cry for help to the Python gods. I defined both C and C1 as dataframes, as I understand that these are pretty much the best data structures to house data in, such that they can be recalled and used later; plus, it is quite useful to name the columns without affecting the data contained in the dataframe.
Once again, your help would be much appreciated.
F0 = ('08/02/2018','08/02/2018',50)
F1 = ('08/02/2018','09/02/2018',52)
F2 = ('10/02/2018','11/02/2018',46)
F3 = ('12/02/2018','16/02/2018',55)
F4 = ('09/02/2018','28/02/2018',48)
F_mat = [[F0,F1,F2,F3,F4]]
F_test = pd.DataFrame(np.array(F_mat).reshape(5,3),columns=('startdate','enddate','price'))
#convert string dates into DateTime data type
F_test['startdate'] = pd.to_datetime(F_test['startdate'])
F_test['enddate'] = pd.to_datetime(F_test['enddate'])
#convert datetype to be datetime type for columns startdate and enddate
F['startdate'] = pd.to_datetime(F['startdate'])
F['enddate'] = pd.to_datetime(F['enddate'])
#create contract duration column
F['duration'] = (F['enddate'] - F['startdate']).dt.days + 1
#re-order the F matrix by column 'duration', ensure that the bootstrapping
#prioritises the shorter term contracts
F.sort_values(by=['duration'], ascending=[True])
# create prices P
P = pd.DataFrame()
for index, row in F.iterrows():
    new_P_row = pd.Series()
    for date in pd.date_range(row['startdate'], row['enddate']):
        new_P_row[date] = row['price']
    P = P.append(new_P_row, ignore_index=True)
P.fillna(0, inplace=True)
#create C matrix, which records the unique day prices across the observation interval
C = pd.DataFrame(np.zeros((1, intNbCalendarDays)))
C.columns = tempDateRange
#create the Repatriation matrix, which records the order in which contracts will be
#stored in the A matrix, which means that once results are generated
#from the linear solver, we know exactly which CalendarDays map to
#which columns in the results array
#this array contains numbers from 1 to NbContracts
R = pd.DataFrame(np.zeros((1, intNbCalendarDays)))
R.columns = tempDateRange
#define a zero filled matrix, P1, which will house the dominant daily prices
P1 = pd.DataFrame(np.zeros((intNbContracts, intNbCalendarDays)))
#rename columns of P1 to be the dates contained in matrix array D
P1.columns = tempDateRange
#create prices in correct rows in P
for i in list(range(0, intNbContracts)):
    for j in list(range(0, intNbCalendarDays)):
        if (P.iloc[i, j] != 0 and C.iloc[0, j] == 0):
            flUniqueCalendarMarker = P.iloc[i, j]
            C.iloc[0, j] = flUniqueCalendarMarker
            P1.iloc[i, j] = flUniqueCalendarMarker
            R.iloc[0, j] = i
            for k in list(range(j + 1, intNbCalendarDays)):
                if (C.iloc[0, k] == 0 and P.iloc[i, k] != 0):
                    C.iloc[0, k] = flUniqueCalendarMarker
                    P1.iloc[i, k] = flUniqueCalendarMarker
                    R.iloc[0, k] = i
        elif (C.iloc[0, j] != 0 and P.iloc[i, j] != 0):
            P1.iloc[i, j] = C.iloc[0, j]
#convert C dataframe into C_list, in prepataion for converting C_list
#into a unique, order preserved list
C_list = C.values.tolist()
#create C1 matrix, which records the unique day prices across unique days in the observation period
C1 = pd.DataFrame(np.unique(C))
Use DataFrame.duplicated() to check whether your dataframe contains any duplicates.
If it does, you can use DataFrame.drop_duplicates().
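For the specific goal of a unique list with the original order preserved, note that np.unique sorts its output, whereas Series.drop_duplicates and pd.unique keep first-appearance order. A small sketch, using the single-row dataframe C from the question:

import pandas as pd

# Flatten the single-row frame C to a Series, then drop duplicates,
# keeping the first occurrence so the original order is preserved
C1 = pd.DataFrame(C.iloc[0].drop_duplicates().reset_index(drop=True))

# Equivalent, using pd.unique (also order-preserving, unlike np.unique)
C1_alt = pd.DataFrame(pd.unique(C.values.ravel()))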
