Combining rolling and cumulative z-score functions into one - python

I have two functions:
First (z_score) to compute rolling z-score values given df column
Second (z_score_cum) to compute cumulative z-score without forward-looking bias
# rolling z_score
def z_score(df, window):
val_column = df.columns[0]
col_mean = df[val_column].rolling(window=window).mean()
col_std = df[val_column].rolling(window=window).std()
df['zscore' + '_'+ str(window)+'D'] = (df[val_column] - col_mean)/col_std
return df
# cumulative z_score
def z_score_cum(data_frame):
# calculating length of original data frame to standardize
len_ = len(data_frame)
# storing column name & making a copy of data frame
val_column = data_frame.columns[0]
data_frame_standardized_final = data_frame.copy()
# calculating statistics
data_frame_standardized_final['mean_past'] = [np.mean(data_frame_standardized_final[val_column][0:lv+1]) for lv in range(0,len_)]
data_frame_standardized_final['std_past'] = [np.std(data_frame_standardized_final[val_column][0:lv+1]) for lv in range(0,len_)]
data_frame_standardized_final['z_score_cum'] = (data_frame_standardized_final[val_column] - data_frame_standardized_final['mean_past']) / data_frame_standardized_final['std_past']
return data_frame_standardized_final[['z_score_cum']]
I would like to somehow combine those two into one z-score function, so that, no matter if I pass time window as parameter, it would compute z-score based on window and additionaly, will contain one column with cumulative z-score. Currently, I am creating a list of time windows (here in days), which I am passing in the loop while calling the function and joining this additional column separately, which I don't think is the optimal way of processing.
d_list = [n * 21 for n in range(1,13)]
df_zscore = df.copy()
for i in d_list:
df_zscore = z_score(df_zscore, i)
df_zscore_cum = z_score_cum(df)
df_z_scores = pd.concat([df_zscore, df_zscore_cum], axis=1)

Eventually, I made it this way:
def calculate_z_scores(self, list_of_windows, freq_flag='D'):
"""
Calculates rolling z-scores and cumulative z-scores based on given list
of time windows
Parameters
----------
list_of_windows : list
a list of time windows.
freq_flag : string
frequency flag. The default is 'D' (daily)
Returns
-------
data frame
a data frame with calculated rolling & cumulative z-score.
"""
z_scores_data_frame = self.original_data_frame.copy()
# get column with values (1st column)
val_column = z_scores_data_frame.columns[0]
len_ = len(z_scores_data_frame)
# calculating statistics for cumulative_zscore
z_scores_data_frame['mean_past'] = [np.mean(z_scores_data_frame[val_column][0:lv+1]) for lv in range(0,len_)]
z_scores_data_frame['std_past'] = [np.std(z_scores_data_frame[val_column][0:lv+1]) for lv in range(0,len_)]
z_scores_data_frame['zscore_cum'] = (z_scores_data_frame[val_column] - z_scores_data_frame['mean_past']) / z_scores_data_frame['std_past']
# taking care of rolling z_scores
for i in list_of_windows:
col_mean = z_scores_data_frame[val_column].rolling(window=i).mean()
col_std = z_scores_data_frame[val_column].rolling(window=i).std()
z_scores_data_frame['zscore' + '_' + str(i)+ freq_flag] = (z_scores_data_frame[val_column] - col_mean)/col_std
cols_to_leave = [c for c in z_scores_data_frame.columns if 'zscore' in c]
self.z_scores_data_frame = z_scores_data_frame[cols_to_leave]
return self.z_scores_data_frame
Just a sidenote: This is my class method, but after minor modifications, could be use as a standalone function.

Related

trying to figure out a pythonic way of code that is taking time even after using list comprehension and pandas

I have two dataframes: one comprising a large data set, allprice_df, with time price series for all stocks; and the other, init_df, comprising selective stocks and trade entry dates. I am trying to find the highest price for each ticker symbol and its associated date.
The following code works but it is time consuming, and I am wondering if there is a better, more Pythonic way to accomplish this.
# Initial call
init_df = init_df.assign(HighestHigh = lambda x:
highestHigh(x['DateIdentified'], x['Ticker'], allprice_df))
# HighestHigh function in lambda call
def highestHigh(date1,ticker,allp_df):
if date1.size == ticker.size:
temp_df = pd.DataFrame(columns = ['DateIdentified','Ticker'])
temp_df['DateIdentified'] = date1
temp_df['Ticker'] = ticker
else:
print("dates and tickers size mismatching")
sys.exit(1)
counter = itertools.count(0)
high_list = [getHigh(x,y,allp_df, next(counter)) for x, y in zip(temp_df['DateIdentified'],temp_df['Ticker'])]
return high_list
# Getting high for each ticker
def getHigh(dateidentified,ticker,allp_df, count):
print("trade %s" % count)
currDate = datetime.datetime.now().date()
allpm_df = allp_df.loc[((allp_df['Ticker']==ticker)&(allp_df['date']>dateidentified)&(allp_df['date']<=currDate)),['high','date']]
hh = allpm_df.iloc[:,0].max()
hd = allpm_df.loc[(allpm_df['high']==hh),'date']
hh = round(hh,2)
h_list = [hh,hd]
return h_list
# Split the list in to 2 columns one with price and the other with the corresponding date
init_df = split_columns(init_df,"HighestHigh")
# The function to split the list elements in to different columns
def split_columns(orig_df,col):
split_df = pd.DataFrame(orig_df[col].tolist(),columns=[col+"Mod", col+"Date"])
split_df[col+"Date"] = split_df[col+"Date"].apply(lambda x: x.squeeze())
orig_df = pd.concat([orig_df,split_df], axis=1)
orig_df = orig_df.drop(col,axis=1)
orig_df = orig_df.rename(columns={col+"Mod": col})
return orig_df
There are a couple of obvious solutions that would help reduce your runtime.
First, in your getHigh function, instead of using loc to get the date associated with the maximum value for high, use idxmax to get the index of the row associated with the high and then access that row:
hh, hd = allpm_df[allpm_df['high'].idxmax()]
This will replace two O(N) operations (finding the maximum in a list, and doing a list lookup using a comparison) with one O(N) operation and one O(1) operation.
Edit
In light of your information on the size of your dataframes, my best guess is that this line is probably where most of your time is being consumed:
allpm_df = allp_df.loc[((allp_df['Ticker']==ticker)&(allp_df['date']>dateidentified)&(allp_df['date']<=currDate)),['high','date']]
In order to make this faster, I would setup your data frame to include a multi-index when you first create the data frame:
index = pd.MultiIndex.from_arrays(arrays = [ticker_symbols, dates], names = ['Symbol', 'Date'])
allp_df = pd.Dataframe(data, index = index)
allp_df.index.sortlevel(level = 0, sort_remaining = True)
This should create a dataframe with a sorted, multi-level index associated with your ticker symbol and date. Doing this will reduce your search time tremendously. Once you do that, you should be able to access all the data associated with a ticker symbol and a given date-range by doing this:
allp_df[ticker, (dateidentified: currDate)]
which should return your data much more quickly. For more information on multi-indexing, check out this helpful Pandas tutorial.

Grouped Time Series forecasting with scikit-hts

I am trying to forecast sales for multiple time series I took from kaggle's Store item demand forecasting challenge. It consists of a long format time series for 10 stores and 50 items resulting in 500 time series stacked on top of each other. And for each store and each item, I have 5 years of daily records with weekly and annual seasonalities.
In total there are : 365.2days * 5years * 10stores *50items = 913000 records.
From my understanding based on what I've read so far on Hierarchical and Grouped time series, the whole dataframe could be structured as a Grouped Time Series and not simply as a strict Hierarchical Time Series as aggregation could be done at the store or item levels interchangeably.
I want to find a way to forecast all 500 time series (for store1_item1, store1_item2,..., store10_item50) for the next year (from 01-jan-2015 to 31-dec-2015) using the scikit-hts library and its AutoArimaModel function which is a wrapper function of pmdarima's AutoArima function.
To handle the two levels of seasonality, I added Fourier terms as exogenous features to deal with the annual seasonality while auto_arima deals with the weekly seasonality.
My problem is that I got an error at during prediction step.
Here's the error message :
ValueError: Provided exogenous values are not of the appropriate shape. Required (365, 4), got (365, 8).
I assume something is wrong with the exogenous dictionary but I do not know how to solve the issue as I'm using scikit-hts for the first time. To do this, I followed the official documentation of scikit-hts here.
EDIT :______________________________________________________________
I have not seen that a similar bug was reported on Github. Following the proposed fix that I implemented locally, I could have some results. However, even though there is no error when running the code, some of the forecasts are negative as raised in the comments below this post. And we even get disproportionate values for the positive ones.
Here are the plots for all the combinations of store and item. You can see that this seems to work for only one combination.
df.loc['2014','store_1_item_1'].plot()
predictions.loc['2015','store_1_item_1'].plot()
df.loc['2014','store_1_item_2'].plot()
predictions.loc['2015','store_1_item_2'].plot()
df.loc['2014','store_2_item_1'].plot()
predictions.loc['2015','store_2_item_1'].plot()
df.loc['2014','store_2_item_2'].plot()
predictions.loc['2015','store_2_item_2'].plot()
_____________________________________________________________________
Complete code:
# imports
import pandas as pd
from pmdarima.preprocessing import FourierFeaturizer
import hts
from hts.hierarchy import HierarchyTree
from hts.model import AutoArimaModel
from hts import HTSRegressor
# read data from the csv file
data = pd.read_csv('train.csv', index_col='date', parse_dates=True)
# Train/Test split with reduced size
train_data = data.query('store == [1,2] and item == [1, 2]').loc['2013':'2014']
test_data = data.query('store == [1,2] and item == [1, 2]').loc['2015']
# Create the stores time series
# For each timestamp group by store and apply sum
stores_ts = train_data.drop(columns=['item']).groupby(['date','store']).sum()
stores_ts = stores_ts.unstack('store')
stores_ts.columns = stores_ts.columns.droplevel(0)
stores_ts.columns = ['store_' + str(i) for i in stores_ts.columns]
# Create the items time series
# For each timestamp group by item and apply sum
items_ts = train_data.drop(columns=['store']).groupby(['date','item']).sum()
items_ts = items_ts.unstack('item')
items_ts.columns = items_ts.columns.droplevel(0)
items_ts.columns = ['item_' + str(i) for i in items_ts.columns]
# Create the stores_items time series
# For each timestamp group by store AND by item and apply sum
store_item_ts = train_data.pivot_table(index= 'date', columns=['store', 'item'], aggfunc='sum')
store_item_ts.columns = store_item_ts.columns.droplevel(0)
# Rename the columns as store_i_item_j
col_names = []
for i in store_item_ts.columns:
col_name = 'store_' + str(i[0]) + '_item_' + str(i[1])
col_names.append(col_name)
store_item_ts.columns = store_item_ts.columns.droplevel(0)
store_item_ts.columns = col_names
# Create a new dataframe and add the root level of the hierarchy as the sum of all stores (or all items)
df = pd.DataFrame()
df['total'] = stores_ts.sum(1)
# Concatenate all created dataframes into one df
# df is the dataframe that will be used for model training
df = pd.concat([df, stores_ts, items_ts, store_item_ts], 1)
# Build fourier terms for train and test sets
four_terms = FourierFeaturizer(365.2, 1)
# Build the exogenous features dataframe for training data
exog_train_df = pd.DataFrame()
for i in range(1, 3):
for j in range(1, 3):
_, exog = four_terms.fit_transform(train_data.query(f'store == {i} and item == {j}').sales)
exog.columns= [f'store_{i}_item_{j}_'+ x for x in exog.columns]
exog_train_df = pd.concat([exog_train_df, exog], axis=1)
exog_train_df['date'] = df.index
exog_train_df.set_index('date', inplace=True)
# add the exogenous features dataframe to df before training
df = pd.concat([df, exog_train_df], axis= 1)
# Build the exogenous features dataframe for test set
# It will be used only when using model.predict()
exog_test_df = pd.DataFrame()
for i in range(1, 3):
for j in range(1, 3):
_, exog_test = four_terms.fit_transform(test_data.query(f'store == {i} and item == {j}').sales)
exog_test.columns= [f'store_{i}_item_{j}_'+ x for x in exog_test.columns]
exog_test_df = pd.concat([exog_test_df, exog_test], axis=1)
# Build the hierarchy of the Grouped Time Series
stores = [i for i in stores_ts.columns]
items = [i for i in items_ts.columns]
store_items = col_names
# Exogenous features mapping
exog_store_items = {e: [v for v in exog_train_df.columns if v.startswith(e)] for e in store_items}
exog_stores = {e:[v for v in exog_train_df.columns if v.startswith(e)] for e in stores}
exog_items = {e:[v for v in exog_train_df.columns if v.find(e) != -1] for e in items}
exog_total = {'total':[v for v in exog_train_df.columns if v.find('FOURIER') != -1]}
# Merge all dictionaries
exog_to_merge = [exog_store_items, exog_stores, exog_items, exog_total]
exogenous = {k:v for x in exog_to_merge for k,v in x.items()}
# Build hierarchy
total = {'total': stores + items}
store_h = {k: [v for v in store_items if v.startswith(k)] for k in stores}
hierarchy = {**total, **store_h}
# Hierarchy tree automatically created by hts
ht = HierarchyTree.from_nodes(nodes=hierarchy, df=df, exogenous=exogenous)
# Instanciate the auto arima model using HTSRegressor
autoarima = HTSRegressor(model='auto_arima', D=1, m=7, seasonal=True, revision_method='OLS', n_jobs=12)
# Fit the model to the training df that includes time series and exog_train_df
# Set exogenous param to the previously built dictionary
model = autoarima.fit(df, hierarchy, exogenous=exogenous)
# Make predictions
# Set the exogenous_df param
predictions = model.predict(exogenous_df=exog_test_df, steps_ahead=365)
Other approaches I thought of and that I already implemented successfully for one series (for store 1 and item 1 for example) :
TBATS applied to each series independently inside a loop across all 500 time series
auto_arima (SARIMAX) with exogenous features (=Fourier terms to deal with the weekly and annual seasonalities) for each series independently + a loop across all 500 time series
What do you think of these approaches? Do you have other suggestions on how to scale ARIMA to multiple time series?
I also want to try LSTM but I'm new to data science and deep learning and do not know how to prepare the data. Should I keep the data in their original form (long format) and apply one hot encoding to train_data['store'] and train_data['item'] columns or should I start with the df I ended up with here?
I Hope this helped you in fixing the issue with exogenous regressors. To handle negative forecasts I would suggest you to try square root transformation.

Why does performance decrease with the size of the data frame?

I'm working with a relatively large dataset (approx 5m observations, made up of about 5.5k firms).
I needed to run OLS regressions with a 60 month rolling window for each firm. I noticed that the performance was insanely slow when I ran the following code:
for idx, sub_df in master_df.groupby("firm_id"):
# OLS code
However, when I first split my dataframe into about 5.5k dfs and then iterated over each of the dfs, the performance improved dramatically.
grouped_df = master_df.groupby("firm_id")
df_list = [group for group in grouped_df]
for df in df_list:
my_df = df[1]
# OLS code
I'm talking 1-2 weeks of time (24/7) to complete in the first version compared to 8-9 hours tops.
Can anyone please explain why splitting the master df into N smaller dfs and then iterating over each smaller df performs better than iterating over the same number of groups within the master df?
Thanks ever so much!
I'm unable to reproduce your observation. Here's some code that generates data and then times the direct and indirect methods separately. The time taken is very similar in either case.
Is it possible that you accidentally sorted the dataframe by the group key between the runs? Sorting by group key results in a noticeable difference in run time.
Otherwise, I'm beginning to think that there might be some other differences in your code. It would be great if you could post the full code.
import numpy as np
import pandas as pd
from datetime import datetime
def generate_data():
''' returns a Pandas DF with columns 'firm_id' and 'score' '''
# configuration
np.random.seed(22)
num_groups = 50000 # number of distinct groups in the DF
mean_group_length = 200 # how many records per group?
cov_group_length = 0.10 # throw in some variability in the num records per group
# simulate group lengths
stdv_group_length = mean_group_length * cov_group_length
group_lengths = np.random.normal(
loc=mean_group_length,
scale=stdv_group_length,
size=(num_groups,)).astype(int)
group_lengths[group_lengths <= 0] = mean_group_length
# final length of DF
total_length = sum(group_lengths)
# compute entries for group key column
firm_id_list = []
for i, l in enumerate(group_lengths):
firm_id_list.extend([(i + 1)] * l)
# construct the DF; data column is 'score' populated with Numpy's U[0, 1)
result_df = pd.DataFrame(data={
'firm_id': firm_id_list,
'score': np.random.rand(total_length)
})
# Optionally, shuffle or sort the DF by group keys
# ALTERNATIVE 1: (badly) unsorted df
result_df = result_df.sample(frac=1, random_state=13).reset_index(drop=True)
# ALTERNATIVE 2: sort by group key
# result_df.sort_values(by='firm_id', inplace=True)
return result_df
def time_method(df, method):
''' time 'method' with 'df' as its argument '''
t_start = datetime.now()
method(df)
t_final = datetime.now()
delta_t = t_final - t_start
print(f"Method '{method.__name__}' took {delta_t}.")
return
def process_direct(df):
''' direct for-loop over groupby object '''
for group, df in df.groupby('firm_id'):
m = df.score.mean()
s = df.score.std()
return
def process_indirect(df):
''' indirect method: generate groups first as list and then loop over list '''
grouped_df = df.groupby('firm_id')
group_list = [pair for pair in grouped_df]
for pair in group_list:
m = pair[1].score.mean()
s = pair[1].score.std()
df = generate_data()
time_method(df, process_direct)
time_method(df, process_indirect)

Construct one row for each Index value from multiple by appending

I have got some data(42 features) collected from people during some months(maximum - 6; varies for different entries), every month's value is represented in its own row:
There are 9267 unique ID values(set as index) and as many as 50 000 rows in the df.
I want to convert it to 42 * 6 feature vectors for each ID(even though some will have a lot of NaNs there), so that i can train on them, here is how it should look like:
Here is my solution:
def flatten_features(f_matrix, ID):
'''constructs a 1x(6*n) vector from 6xn matrix'''
#check wether it is a series, not dataframe
if(len(f_matrix.shape) == 1):
f_matrix['ID'] = ID
return f_matrix
flattened_vector = f_matrix.iloc[0]
for i in range(1, f_matrix.shape[0]):
vector_append = f_matrix.iloc[i]
vector_append.index = (lambda month, series_names : series_names.map(lambda name : name + '_' + str(month)))\
(i, vector_append.index)
flattened_vector = flattened_vector.append(vector_append)
flattened_vector['ID'] = ID
return flattened_vector
#construct dataframe of flattened vectors for numerical features
new_indices = flatten_features(numerical_f.iloc[:6], 1).index
new_indices
flattened_num_f = pd.DataFrame(columns=new_indices)
flattened_num_f
for label in numerical_f.index.unique():
matr = numerical_f.loc[label]
flattened_num_f = flattened_num_f.append(flatten_features(matr, label))
It yields needed results, however it runs very slow. I wonder, is there a more elegant and fast solution?
if you want to transpose df, you could cam T function.
I assume you have id stored in unique_id variable
new_f = numerical_f.T
new_f.columns = unique_id

How to find median and quantiles using Spark

How can I find median of an RDD of integers using a distributed method, IPython, and Spark? The RDD is approximately 700,000 elements and therefore too large to collect and find the median.
This question is similar to this question. However, the answer to the question is using Scala, which I do not know.
How can I calculate exact median with Apache Spark?
Using the thinking for the Scala answer, I am trying to write a similar answer in Python.
I know I first want to sort the RDD. I do not know how. I see the sortBy (Sorts this RDD by the given keyfunc) and sortByKey (Sorts this RDD, which is assumed to consist of (key, value) pairs.) methods. I think both use key value and my RDD only has integer elements.
First, I was thinking of doing myrdd.sortBy(lambda x: x)?
Next I will find the length of the rdd (rdd.count()).
Finally, I want to find the element or 2 elements at the center of the rdd. I need help with this method too.
EDIT:
I had an idea. Maybe I can index my RDD and then key = index and value = element. And then I can try to sort by value? I don't know if this is possible because there is only a sortByKey method.
Ongoing work
SPARK-30569 - Add DSL functions invoking percentile_approx
Spark 2.0+:
You can use approxQuantile method which implements Greenwald-Khanna algorithm:
Python:
df.approxQuantile("x", [0.5], 0.25)
Scala:
df.stat.approxQuantile("x", Array(0.5), 0.25)
where the last parameter is a relative error. The lower the number the more accurate results and more expensive computation.
Since Spark 2.2 (SPARK-14352) it supports estimation on multiple columns:
df.approxQuantile(["x", "y", "z"], [0.5], 0.25)
and
df.approxQuantile(Array("x", "y", "z"), Array(0.5), 0.25)
Underlying methods can be also used in SQL aggregation (both global and groped) using approx_percentile function:
> SELECT approx_percentile(10.0, array(0.5, 0.4, 0.1), 100);
[10.0,10.0,10.0]
> SELECT approx_percentile(10.0, 0.5, 100);
10.0
Spark < 2.0
Python
As I've mentioned in the comments it is most likely not worth all the fuss. If data is relatively small like in your case then simply collect and compute median locally:
import numpy as np
np.random.seed(323)
rdd = sc.parallelize(np.random.randint(1000000, size=700000))
%time np.median(rdd.collect())
np.array(rdd.collect()).nbytes
It takes around 0.01 second on my few years old computer and around 5.5MB of memory.
If data is much larger sorting will be a limiting factor so instead of getting an exact value it is probably better to sample, collect, and compute locally. But if you really want a to use Spark something like this should do the trick (if I didn't mess up anything):
from numpy import floor
import time
def quantile(rdd, p, sample=None, seed=None):
"""Compute a quantile of order p ∈ [0, 1]
:rdd a numeric rdd
:p quantile(between 0 and 1)
:sample fraction of and rdd to use. If not provided we use a whole dataset
:seed random number generator seed to be used with sample
"""
assert 0 <= p <= 1
assert sample is None or 0 < sample <= 1
seed = seed if seed is not None else time.time()
rdd = rdd if sample is None else rdd.sample(False, sample, seed)
rddSortedWithIndex = (rdd.
sortBy(lambda x: x).
zipWithIndex().
map(lambda (x, i): (i, x)).
cache())
n = rddSortedWithIndex.count()
h = (n - 1) * p
rddX, rddXPlusOne = (
rddSortedWithIndex.lookup(x)[0]
for x in int(floor(h)) + np.array([0L, 1L]))
return rddX + (h - floor(h)) * (rddXPlusOne - rddX)
And some tests:
np.median(rdd.collect()), quantile(rdd, 0.5)
## (500184.5, 500184.5)
np.percentile(rdd.collect(), 25), quantile(rdd, 0.25)
## (250506.75, 250506.75)
np.percentile(rdd.collect(), 75), quantile(rdd, 0.75)
(750069.25, 750069.25)
Finally lets define median:
from functools import partial
median = partial(quantile, p=0.5)
So far so good but it takes 4.66 s in a local mode without any network communication. There is probably way to improve this, but why even bother?
Language independent (Hive UDAF):
If you use HiveContext you can also use Hive UDAFs. With integral values:
rdd.map(lambda x: (float(x), )).toDF(["x"]).registerTempTable("df")
sqlContext.sql("SELECT percentile_approx(x, 0.5) FROM df")
With continuous values:
sqlContext.sql("SELECT percentile(x, 0.5) FROM df")
In percentile_approx you can pass an additional argument which determines a number of records to use.
Here is the method I used using window functions (with pyspark 2.2.0).
from pyspark.sql import DataFrame
class median():
""" Create median class with over method to pass partition """
def __init__(self, df, col, name):
assert col
self.column=col
self.df = df
self.name = name
def over(self, window):
from pyspark.sql.functions import percent_rank, pow, first
first_window = window.orderBy(self.column) # first, order by column we want to compute the median for
df = self.df.withColumn("percent_rank", percent_rank().over(first_window)) # add percent_rank column, percent_rank = 0.5 coressponds to median
second_window = window.orderBy(pow(df.percent_rank-0.5, 2)) # order by (percent_rank - 0.5)^2 ascending
return df.withColumn(self.name, first(self.column).over(second_window)) # the first row of the window corresponds to median
def addMedian(self, col, median_name):
""" Method to be added to spark native DataFrame class """
return median(self, col, median_name)
# Add method to DataFrame class
DataFrame.addMedian = addMedian
Then call the addMedian method to calculate the median of col2:
from pyspark.sql import Window
median_window = Window.partitionBy("col1")
df = df.addMedian("col2", "median").over(median_window)
Finally you can group by if needed.
df.groupby("col1", "median")
Adding a solution if you want an RDD method only and dont want to move to DF.
This snippet can get you a percentile for an RDD of double.
If you input percentile as 50, you should obtain your required median.
Let me know if there are any corner cases not accounted for.
/**
* Gets the nth percentile entry for an RDD of doubles
*
* #param inputScore : Input scores consisting of a RDD of doubles
* #param percentile : The percentile cutoff required (between 0 to 100), e.g 90%ile of [1,4,5,9,19,23,44] = ~23.
* It prefers the higher value when the desired quantile lies between two data points
* #return : The number best representing the percentile in the Rdd of double
*/
def getRddPercentile(inputScore: RDD[Double], percentile: Double): Double = {
val numEntries = inputScore.count().toDouble
val retrievedEntry = (percentile * numEntries / 100.0 ).min(numEntries).max(0).toInt
inputScore
.sortBy { case (score) => score }
.zipWithIndex()
.filter { case (score, index) => index == retrievedEntry }
.map { case (score, index) => score }
.collect()(0)
}
There are two ways that can be used. One is using approxQuantile method and the other percentile_approx method. However, both the methods might not give accurate results when there are even number of records.
importpyspark.sql.functions.percentile_approx as F
# df.select(F.percentile_approx("COLUMN_NAME_FOR_WHICH_MEDIAN_TO_BE_COMPUTED", 0.5).alias("MEDIAN)) # might not give proper results when there are even number of records
((
df.select(F.percentile_approx("COLUMN_NAME_FOR_WHICH_MEDIAN_TO_BE_COMPUTED", 0.5) + df.select(F.percentile_approx("COLUMN_NAME_FOR_WHICH_MEDIAN_TO_BE_COMPUTED", 0.51)
)*.5).alias("MEDIAN))
I have written the function which takes data frame as an input and returns a dataframe which has median as an output over a partition and order_col is the column for which we want to calculate median for part_col is the level at which we want to calculate median for :
from pyspark.sql import Window
import pyspark.sql.functions as F
def calculate_median(dataframe, part_col, order_col):
win = Window.partitionBy(*part_col).orderBy(order_col)
# count_row = dataframe.groupby(*part_col).distinct().count()
dataframe.persist()
dataframe.count()
temp = dataframe.withColumn("rank", F.row_number().over(win))
temp = temp.withColumn(
"count_row_part",
F.count(order_col).over(Window.partitionBy(part_col))
)
temp = temp.withColumn(
"even_flag",
F.when(
F.col("count_row_part") %2 == 0,
F.lit(1)
).otherwise(
F.lit(0)
)
).withColumn(
"mid_value",
F.floor(F.col("count_row_part")/2)
)
temp = temp.withColumn(
"avg_flag",
F.when(
(F.col("even_flag")==1) &
(F.col("rank") == F.col("mid_value"))|
((F.col("rank")-1) == F.col("mid_value")),
F.lit(1)
).otherwise(
F.when(
F.col("rank") == F.col("mid_value")+1,
F.lit(1)
)
)
)
temp.show(10)
return temp.filter(
F.col("avg_flag") == 1
).groupby(
part_col + ["avg_flag"]
).agg(
F.avg(F.col(order_col)).alias("median")
).drop("avg_flag")
For exact median computation you can use the following function and use it with PySpark DataFrame API:
def median_exact(col: Union[Column, str]) -> Column:
"""
For grouped aggregations, Spark provides a way via pyspark.sql.functions.percentile_approx("col", .5) function,
since for large datasets, computing the median is computationally expensive.
This function manually computes the median and should only be used for small to mid sized datasets / groupings.
:param col: Column to compute the median for.
:return: A pyspark `Column` containing the median calculation expression
"""
list_expr = F.filter(F.collect_list(col), lambda x: x.isNotNull())
sorted_list_expr = F.sort_array(list_expr)
size_expr = F.size(sorted_list_expr)
even_num_elements = (size_expr % 2) == 0
odd_num_elements = ~even_num_elements
return F.when(size_expr == 0, None).otherwise(
F.when(odd_num_elements, sorted_list_expr[F.floor(size_expr / 2)]).otherwise(
(
sorted_list_expr[(size_expr / 2 - 1).cast("long")]
+ sorted_list_expr[(size_expr / 2).cast("long")]
)
/ 2
)
)
Apply it like this:
output_df = input_spark_df.groupby("group").agg(
median_exact("elems").alias("elems_median")
)
We can calculate the median and quantiles in spark using the following code:
df.stat.approxQuantile(col,[quantiles],error)
For example, finding the median in the following dataframe [1,2,3,4,5]:
df.stat.approxQuantile(col,[0.5],0)
The lesser the error, the more accurate the results.
From version 3.4+ (and also already in 3.3.1) the median function is directly available
https://github.com/apache/spark/blob/e170a2eb236a376b036730b5d63371e753f1d947/python/pyspark/sql/functions.py#L633
import pyspark.sql.functions as f
df.groupBy("grp").agg(f.median("val"))
I guess the respective documentation will be added if the version is finally released.

Categories