Issue with .dropna() in Pandas - python

In the below function I am working with a Pandas dataframe. I am bringing in a data frame, and immediately resetting the index. I then make a copy of that dataframe so I avoid any Chained Assignment issues.
I then want to use .dropna(inplace=True, subset = [header], axis=0) to remove any rows where my column of interest (header) is nan. However, once I start into the for loop it is clear that the nan values haven't dropped because I keep getting warnings like:
RuntimeWarning: Mean of empty slice
which is a result of my array neighbors having all nan values.
My Question: In the line where I use df_copy.dropna(inplace=True, subset=[header], axis=0) does it seem like I am not actually getting a permanent drop of those rows?
n_samples = 10
tolerance = 1.5
dataframe = pd.read_csv('my_file.csv')
def removeOutliers(dataframe, header):
dataframe.reset_index(inplace=True, drop=True)
df_copy = dataframe.copy()
#Why doesn't the below actually drop the NaNs?
df_copy.dropna(inplace=True, subset=[header], axis=0)
for ii in range(len(df_copy['Lng'])):
a = df_copy.iloc[ii]['Lng'] - df_copy.iloc[:]['Lng']
b = df_copy.iloc[ii]['Lat'] - df_copy.iloc[:]['Lat']
c = np.array((a**2 + b**2)**0.5 )
d = np.zeros((len(df_copy['Lng'])))
e = np.zeros((len(df_copy['Lng'])))
d[:] = df_copy.iloc[:]['Well']
e[:] = df_copy.iloc[:][header]
idx = np.argpartition(c, n_samples+1)
max_loc = np.where(e[idx[0:n_samples+1]] == e[ii])
neighbors = np.delete(e[idx[0:n_samples+1]], max_loc)
avg = np.nanmean(neighbors)
std = np.nanstd(neighbors)
if df_copy.iloc[ii][header] > (avg + tolerance*std) or df_copy.iloc[ii][header] < (avg - tolerance*std):
df_copy.iloc[ii, df_copy.columns.get_loc(header)] = np.nan
return df_copy
test_data = removeOutliers(dataframe, 'myColumn')

Related

How to iterate over rows of each column in a dataframe

My current code functions and produces a graph if there is only 1 sensor, i.e. if col2, and col3 are deleted in the example data provided below, leaving one column.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
d = {'col1': [-2587.944231, -1897.324231,-2510.304231,-2203.814231,-2105.734231,-2446.964231,-2963.904231,-2177.254231, 2796.354231,-2085.304231], 'col2': [-3764.468462,-3723.608462,-3750.168462,-3694.998462,-3991.268462,-3972.878462,3676.608462,-3827.808462,-3629.618462,-1841.758462,], 'col3': [-166.1357692,-35.36576923, 321.4157692,108.9257692,-123.2257692, -10.84576923, -100.7457692, 89.27423077, -211.0857692, 101.5342308]}
df = pd.DataFrame(data=d)
sensors = 3
window_size = 5
dfn = df.rolling(window_size).corr(pairwise = True)
index = df.index #index of values in the data frame.
rows = len(index) #len(index) returns number of rows in the data.
sensors = 3
baseline_num = [0]*(rows) #baseline numerator, by default zero
baseline = [0]*(rows) #initialize baseline value
baseline = DataFrame(baseline)
baseline_num = DataFrame(baseline_num)
v = [None]*(rows) # Initialize an empty array v[] equal to amount of rows in .csv file
s = [None]*(rows) #Initialize another empty array for the slope values for detecting when there is an exposure
d = [0]*(rows)
sensors_on = True #Is the sensor detecting something (True) or not (False).
off_count = 0
off_require = 8 # how many offs until baseline is updated
sensitivity = 1000
for i in range(0, (rows)): #This iterates over each index value, i.e. each row, and sums the values and returns them in list format.
v[i] = dfn.loc[i].to_numpy().sum() - sensors
for colname,colitems in df.iteritems():
for rownum,rowitem in colitems.iteritems():
#d[rownum] = dfone.loc[rownum].to_numpy()
#d[colname][rownum] = df.loc[colname][rownum]
if v[rownum] >= sensitivity:
sensors_on = True
off_count = 0
baseline_num[rownum] = 0
else:
sensors_on = False
off_count += 1
if off_count == off_require:
for x in range(0, (off_require)):
baseline_num[colname][rownum] += df[colname][rownum - x]
elif off_count > off_require:
baseline_num[colname][rownum] += baseline_num[colname][rownum - 1] + df[colname][rownum] - (df[colname][rownum - off_require]) #this loop is just an optimization, one calculation per loop once the first calculation is established
baseline[colname][rownum] = ((baseline_num[colname][rownum])//(off_require)) #mean of the last "off_require" points
dfx = DataFrame(v, columns =['Sensor Correlation']) #converts the summed correlation tables back from list format to a DataFrame, with the sole column name 'Sensor Correlation'
dft = pd.DataFrame(baseline, columns =['baseline'])
dft = dft.astype(float)
dfx.plot(figsize=(50,25), linewidth=5, fontsize=40) # plots dfx dataframe which contains correlated and summed data
dft.plot(figsize=(50,25), linewidth=5, fontsize=40)
Basically, instead of 1 graph as this produces, I would like to iterate over each column only for this loop:
for colname,colitems in df.iteritems():
for rownum,rowitem in colitems.iteritems():
#d[rownum] = dfone.loc[rownum].to_numpy()
#d[colname][rownum] = df.loc[colname][rownum]
if v[rownum] >= sensitivity:
sensors_on = True
off_count = 0
baseline_num[rownum] = 0
else:
sensors_on = False
off_count += 1
if off_count == off_require:
for x in range(0, (off_require)):
baseline_num[colname][rownum] += df[colname][rownum - x]
elif off_count > off_require:
baseline_num[colname][rownum] += baseline_num[colname][rownum - 1] + df[colname][rownum] - (df[colname][rownum - off_require]) #this loop is just an optimization, one calculation per loop once the first calculation is established
I've tried some other solutions from other questions but none of them seem to solve this case.
As of now, I've tried multiple conversions to things like lists and tuples, and then calling them something like this:
baseline_num[i,column] += d[i - x,column]
as well as
baseline_num[i][column += d[i - x][column]
while iterating over the loop using
for column in columns
However no matter how I seem to arrange the solution, there is always some keyerror of expecting integer or slice indices, among other errors.
See pictures for expected/possible outputs of one column on actual data.with varying input parameters (sensitivity value, and off_require is varied in different cases.)
One such solution which didn't work was the looping method from this link:
https://www.geeksforgeeks.org/iterating-over-rows-and-columns-in-pandas-dataframe/
I've also tried creating a loop using iteritems as the outer loop. This did not function as well.
Below are links to possible graph outputs for various sensitivity values, and windows in my actual dataset, with only one column. (i.e i manually deleted other columns, and plotted just the one using the current program)
sensitivity 1000, window 8
sensitivity 800, window 5
sensitivity 1500, window 5
If there's anything I've left out that would be helpful to solving this, please let me know so I can rectify it immediately.
See this picture for my original df.head:
df.head
Did you try,
for colname,colitems in df.iteritems():
for rownum,rowitem in colitems.iteritems():
print(df[colname][rownum])
The first loop iterates over all the columns, and the 2nd loops iterates over all the rows for that column.
edit:
From our conversation below, I think that your baseline and df dataframes don't have the same column names because of how you created them and how you are accessing the elements.
My suggestion is that you create the baseline dataframe to be a copy of your df dataframe and edit the information within it from there.
Edit:
I have managed to make your code work for 1 loop, but I run into an index error, I am not sure what your optimisation function does but i think that is what is causing it, take a look.
It is this part baseline_num[colname][rownum - 1], in the second loop i guess because you do rownum (0) -1, you get index -1. You need to change it so that in the first loop rownum is 1 or something, I am not sure what you are trying to do there.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
d = {'col1': [-2587.944231, -1897.324231,-2510.304231,-2203.814231,-2105.734231,-2446.964231,-2963.904231,-2177.254231, 2796.354231,-2085.304231], 'col2': [-3764.468462,-3723.608462,-3750.168462,-3694.998462,-3991.268462,-3972.878462,3676.608462,-3827.808462,-3629.618462,-1841.758462,], 'col3': [-166.1357692,-35.36576923, 321.4157692,108.9257692,-123.2257692, -10.84576923, -100.7457692, 89.27423077, -211.0857692, 101.5342308]}
df = pd.DataFrame(data=d)
sensors = 3
window_size = 5
dfn = df.rolling(window_size).corr(pairwise = True)
index = df.index #index of values in the data frame.
rows = len(index) #len(index) returns number of rows in the data.
sensors = 3
baseline_num = [0]*(rows) #baseline numerator, by default zero
baseline = [0]*(rows) #initialize baseline value
baseline = pd.DataFrame(df)
baseline_num = pd.DataFrame(df)
#print(baseline_num)
v = [None]*(rows) # Initialize an empty array v[] equal to amount of rows in .csv file
s = [None]*(rows) #Initialize another empty array for the slope values for detecting when there is an exposure
d = [0]*(rows)
sensors_on = True #Is the sensor detecting something (True) or not (False).
off_count = 0
off_require = 8 # how many offs until baseline is updated
sensitivity = 1000
for i in range(0, (rows)): #This iterates over each index value, i.e. each row, and sums the values and returns them in list format.
v[i] = dfn.loc[i].to_numpy().sum() - sensors
for colname,colitems in df.iteritems():
#print(colname)
for rownum,rowitem in colitems.iteritems():
#print(rownum)
#display(baseline[colname][rownum])
#d[rownum] = dfone.loc[rownum].to_numpy()
#d[colname][rownum] = df.loc[colname][rownum]
if v[rownum] >= sensitivity:
sensors_on = True
off_count = 0
baseline_num[rownum] = 0
else:
sensors_on = False
off_count += 1
if off_count == off_require:
for x in range(0, (off_require)):
baseline_num[colname][rownum] += df[colname][rownum - x]
elif off_count > off_require:
baseline_num[colname][rownum] += baseline_num[colname][rownum - 1] + df[colname][rownum] - (df[colname][rownum - off_require]) #this loop is just an optimization, one calculation per loop once the first calculation is established
baseline[colname][rownum] = ((baseline_num[colname][rownum])//(off_require)) #mean of the last "off_require" points
print(baseline[colname][rownum])
dfx = pd.DataFrame(v, columns =['Sensor Correlation']) #converts the summed correlation tables back from list format to a DataFrame, with the sole column name 'Sensor Correlation'
dft = pd.DataFrame(baseline, columns =['baseline'])
dft = dft.astype(float)
dfx.plot(figsize=(50,25), linewidth=5, fontsize=40) # plots dfx dataframe which contains correlated and summed data
dft.plot(figsize=(50,25), linewidth=5, fontsize=40)
My output looks like this,
-324.0
-238.0
-314.0
-276.0
-264.0
-306.0
-371.0
-806.0
638.0
-412.0
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
/usr/local/lib/python3.7/dist-packages/pandas/core/indexes/range.py in get_loc(self, key, method, tolerance)
354 try:
--> 355 return self._range.index(new_key)
356 except ValueError as err:
ValueError: -1 is not in range
The above exception was the direct cause of the following exception:
KeyError Traceback (most recent call last)
3 frames
/usr/local/lib/python3.7/dist-packages/pandas/core/indexes/range.py in get_loc(self, key, method, tolerance)
355 return self._range.index(new_key)
356 except ValueError as err:
--> 357 raise KeyError(key) from err
358 raise KeyError(key)
359 return super().get_loc(key, method=method, tolerance=tolerance)
KeyError: -1
I don't have enough rep to comment, but below is what I was able to work out. Hope it helps!
I tried to use the to_list() function while working out an answer, and it threw me an error:
AttributeError: 'DataFrame' object has no attribute 'to_list'
So, I decided to circumvent that method and came up with this:
indexes = [x for x in df.index]
row_vals = []
for index in indexes :
for val in df.iloc[i].values:
row_vals.append(val)
The object row_vals will contain all values in row order.
If you only want to get the row values for a particular row or set of rows, you would need to do this:
indx_subset = [`list of row indices`] #(Ex. [1, 2, 5, 6, etc...])
row_vals = []
for indx in indx_subset:
for val in df.loc[indx].values:
row_vals.append(val)
row_vals will then have all the row values from the specified indices.

Pandas Dataframe masking issues: referring to previous rows and selecting values

I am new to Pandas, and I'm trying to avoid iterating over a DataFrame, and attempting to use vectorisation instead. I am not able to get the results I want; I need help in the more complicated masking and selection statements
This is my code:
import random
from datetime import datetime, timedelta
import pandas as pd
dates = []
temp = []
press = []
vel = []
fmt = '%Y-%m-%d %H:%M:%S'
stime = datetime.strptime('2020-01-06 10:28:16', fmt)
etime = datetime.strptime('2020-04-10 03:43:12', fmt)
td = etime - stime
l = set([random.random() for x in range(0, 1000)])
dates = [((td * x) + stime) for x in random.sample(l, 100)]
for i in range(100):
press.append(random.uniform(14,95.5))
temp.append(random.uniform(-15,45))
vel.append(random.uniform(50,153))
measurements = {
'date' : dates,
'pressure' : press,
'velocity' : vel,
'temperature': temp
}
df = pd.DataFrame(measurements)
df['date'] = pd.to_datetime(df['date'])
df.set_index('date', inplace=True)
df = df.sort_index()
df2 = pd.DataFrame()
# if temp increased from previous row, set flag
df2['temp_inc'] = df['temperature'] - df.shift(1)['temperature'] > 0
df2['temp_inc'] = df2['temp_inc'].replace({True: 1, False: 0})
# need to fetch velocity where pressure has increased from previous row, else 0
press_up_mask = df.where( (df['pressure'] - df.shift(1)['pressure']) > 0)
#df2['press_spike_velocity'] = df[press_up_mask]['velocity']
# Need to perform calc based on 'temp_inc' column: if 'temp_inc' column is 1: calculate pressure * velocity, else 0
temp_inc_mask = df2['temp_inc'] == 1
df2['boyle_fact'] = df[temp_inc_mask]['pressure'] * df[temp_inc_mask]['velocity']
# Get some stats
df2['short_max_temp'] = df['temperature'].rolling(3).max()
df2['long_min_pressure'] = df['pressure'].rolling(30).min()
print(df.head())
print(df2.head())
How do I correctly calculate columns 'press_spike_velocity' and 'boyle_fact' ?
Starting from the computations:
# if temp increased from previous row, set flag
df2['temp_inc'] = df['temperature'] - df.shift(1)['temperature'] > 0
# setting int type instead of replace
df2['temp_inc'] = df2['temp_inc'].astype(int)
# need to fetch velocity where pressure has increased from previous row, else 0
press_up_mask = df.where( (df['pressure'] - df['pressure'].shift(1)) > 0)
# set column to velocity then mask in zeros via assignment
df2['press_spike_velocity'] = df['velocity'].copy()
df2['press_spike_velocity'][~press_up_mask] = 0
# Need to perform calc based on 'temp_inc' column: if 'temp_inc' column is 1: calculate pressure * velocity, else 0
temp_inc_mask = df2['temp_inc'] == 1
# same masking approach as above
df2['boyle_fact'] = df['pressure'] * df['velocity']
df2['boyle_fact'][~temp_inc_mask] = 0
This is the simplest way to solve your problem with minimal changes to the code itself. If you dig into pandas more you could probably find methods to do this in 1-2 fewer lines via inplace operations but I don't know how much performance or readability you would gain from that.

Calculate column in Pandas Dataframe using adjacent rows without iterating through each row

I would like to see if there is a way to calculate a column in a dataframe that uses something similar to a moving average without iterating through each row.
Current working code:
def create_candles(ticks, instrument, time_slice):
candlesticks = ticks.price.resample(time_slice, base=00).ohlc().bfill()
volume = ticks.amount.resample(time_slice, base=00).sum()
candlesticks['volume'] = volume
candlesticks['instrument'] = instrument
candlesticks['ttr'] = 0
# candlesticks['vr_7'] = 0
candlesticks['vr_10'] = 0
candlesticks = calculate_indicators(candlesticks, instrument, time_slice)
return candlesticks
def calculate_indicators(candlesticks, instrument):
candlesticks.sort_index(inplace=True)
# candlesticks['rsi_14'] = talib.RSI(candlesticks.close, timeperiod=14)
candlesticks['lr_50'] = talib.LINEARREG(candlesticks.close, timeperiod=50)
# candlesticks['lr_150'] = talib.LINEARREG(candlesticks.close, timeperiod=150)
# candlesticks['ema_55'] = talib.EMA(candlesticks.close, timeperiod=55)
# candlesticks['ema_28'] = talib.EMA(candlesticks.close, timeperiod=28)
# candlesticks['ema_18'] = talib.EMA(candlesticks.close, timeperiod=18)
# candlesticks['ema_9'] = talib.EMA(candlesticks.close, timeperiod=9)
# candlesticks['wma_21'] = talib.WMA(candlesticks.close, timeperiod=21)
# candlesticks['wma_12'] = talib.WMA(candlesticks.close, timeperiod=12)
# candlesticks['wma_11'] = talib.WMA(candlesticks.close, timeperiod=11)
# candlesticks['wma_5'] = talib.WMA(candlesticks.close, timeperiod=5)
candlesticks['cmo_9'] = talib.CMO(candlesticks.close, timeperiod=9)
for row in candlesticks.itertuples():
current_index = candlesticks.index.get_loc(row.Index)
if current_index >= 1:
previous_close = candlesticks.iloc[current_index - 1, candlesticks.columns.get_loc('close')]
candlesticks.iloc[current_index, candlesticks.columns.get_loc('ttr')] = max(
row.high - row.low,
abs(row.high - previous_close),
abs(row.low - previous_close))
if current_index > 10:
candlesticks.iloc[current_index, candlesticks.columns.get_loc('vr_10')] = candlesticks.iloc[current_index, candlesticks.columns.get_loc('ttr')] / (
max(candlesticks.high[current_index - 9: current_index].max(), candlesticks.close[current_index - 11]) -
min(candlesticks.low[current_index - 9: current_index].min(), candlesticks.close[current_index - 11]))
candlesticks['timestamp'] = pd.to_datetime(candlesticks.index)
candlesticks['instrument'] = instrument
candlesticks.fillna(0, inplace=True)
return candlesticks
in the iteration, i am calculating the True Range ('TTR') and then the Volatility Ratio ('VR_10')
TTR is calculated on every row in the DF except for the first one. It uses the previous row's close column, and the current row's high and low column.
VR_10 is calculated on every row except for the first 10. it uses the high and low column of the previous 9 rows and the close of the 10th row back.
EDIT 2
I have tried many ways to add a text based data frame in this question, there just doesnt seem to be a solution with the width of my frame. there is no difference in the input and output dataframes other than the column TTR and VR_10 have all 0s in the input, and have non-zero values in the output.
an example would be this dataframe:
Is there a way I can do this without iteration?
With the nudge from Andreas to use rolling, I came to an answer:
first, I had to find out how to use rolling with multiple columns. found that here.
I made a modification because I need to roll up, not down
def roll(df, w, **kwargs):
df.sort_values(by='timestamp', ascending=0, inplace=True)
v = df.values
d0, d1 = v.shape
s0, s1 = v.strides
a = stride(v, (d0 - (w - 1), w, d1), (s0, s0, s1))
rolled_df = pd.concat({
row: pd.DataFrame(values, columns=df.columns)
for row, values in zip(df.index, a)
})
return rolled_df.groupby(level=0, **kwargs)
after that, I created 2 functions:
def calculate_vr(window):
return window.iloc[0].ttr / (max(window.high[1:9].max(), window.iloc[10].close) - min(window.low[1:9].min(), window.iloc[10].close))
def calculate_ttr(window):
return max(window.iloc[0].high - window.iloc[0].low, abs(window.iloc[0].high - window.iloc[1].close), abs(window.iloc[0].low - window.iloc[1].close))
and called those functions like this:
candlesticks['ttr'] = roll(candlesticks, 3).apply(calculate_ttr)
candlesticks['vr_10'] = roll(candlesticks, 11).apply(calculate_vr)
added timers to both ways and this way is roughly 3X slower than iteration.

Efficient (fast) way to group continuous data in one DataFrame based on ranges taken from another DataFrame in Python Pandas?

I have experimental data produced by different programs. One is logging the start and end time of a trial as well as the type of trial (a category).
start trial type end
0 6.002987 2 c 7.574240
1 7.967054 3 b 19.084946
2 21.864419 5 b 23.298480
3 23.656995 7 c 24.087210
4 24.194764 9 c 27.960752
The other one records a continous datastream and logs the time for each observation.
X Y Z
0.0000 0.324963 -0.642636 -2.305040
0.0333 0.025089 -0.480412 -0.637273
0.0666 0.364149 0.966594 0.789467
0.0999 -0.087334 -0.761769 0.399813
0.1332 0.841872 2.306711 -1.059608
I have the 2 tables as pandas DataFrames and want to retrieve only those parts of the continuous data that is between the start to end ranges found in the rows of the trials DataFrame. I managed that by using a for-loop that iterates over the rows, but I was thinking that there must be more of a "pandas way" of doing this. So I looked into apply, but what I came up with so far was even considerably slower than the loop.
As I'm working on a lot of large datasets I'm looking for the most efficient way in terms of execution time to solve this.
This is a slice of the expected result for the continous DataFrame:
X Y Z trial type
13.6863 0.265358 0.116529 1.196689 NaN NaN
13.7196 -0.715096 -0.413416 0.696454 NaN NaN
13.7529 0.714897 -0.158183 1.735958 4.0 b
13.7862 -0.259513 0.194762 -0.531482 4.0 b
13.8195 -0.929080 -1.200593 -1.233834 4.0 b
[EDIT:] Here I test performance of different approaches. I found a way using apply(), but it isn't much faster than using iterrows.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
def create_trials_df(num_trials=360, max_start=1400.0):
# First df holds start and end times (as seconds) of a trial as well as type of trial.
d = {'trial': pd.Series(np.sort(np.random.choice(np.arange(1, 400), replace=False, size=(360,)))),
'type': pd.Series(np.random.choice(('a', 'b', 'c', 'd'),size=num_trials)),
'start': pd.Series(np.sort(np.random.random_sample((num_trials,))) * max_start)}
trials_df = pd.DataFrame(d)
# Create column for when the trial ended.
trials_df['end'] = trials_df['start'].shift(-1)
trials_df.loc[num_trials-1, 'end'] = trials_df['start'].iloc[-1] + 2.0
trials_df['diff'] = trials_df['end'] - trials_df['start']
trials_df['end'] = trials_df['end'] - trials_df['diff'] * 0.2
del trials_df['diff']
return trials_df
def create_continuous_df(num_trials=360, max_start=1400.0):
# Second df has continuously recorded data with time as index.
time_delta = 1.0/30.0
rows = int((max_start+2) * 1/time_delta)
idx_time = pd.Index(np.arange(rows) * time_delta)
continuous_df = pd.DataFrame(np.random.randn(rows, 3), index=idx_time, columns=list('XYZ'))
print("continuous rows:", continuous_df.index.size)
print("continuous last time:", continuous_df.last_valid_index())
return continuous_df
# I want to group the continuous data by trial and type later on.
def iterrows_test(trials_df, continuous_df):
for index, row in trials_df.iterrows():
continuous_df.loc[row['start']:row['end'], 'trial'] = row['trial']
continuous_df.loc[row['start']:row['end'], 'type'] = row['type']
def itertuples_test(trials_df, continuous_df):
continuous_df['trial'] = np.NaN
continuous_df['type'] = np.NaN
for row in trials_df.itertuples():
continuous_df.loc[slice(row[1],row[4]), ['trial','type']] = [row[2],row[3]]
def apply_test(trials_df, continuous_df):
trial_series = pd.Series([x[0] for x in zip(trials_df.values)])
continuous_df['trial'] = np.NaN
continuous_df['type'] = np.NaN
def insert_trial_data_to_continuous(vals, con_df):
con_df.loc[slice(vals[0], vals[3]), ['trial','type']] = [vals[1],vals[2]]
trial_series.apply(insert_trial_data_to_continuous, args=(continuous_df,))
def real_slow_index_map(trials_df, continuous_df):
# Transform trial_data to new df: merge start and end ordered, make it float index.
trials_df['pre-start'] = trials_df['start'] - 0.0001
trials_df['post-end'] = trials_df['end'] + 0.0001
start_df = pd.DataFrame(data={'type': trials_df['type'].values, 'trial': trials_df['trial'].values},
index=trials_df['start'])
end_df = pd.DataFrame(data={'type': trials_df['type'].values, 'trial': trials_df['trial'].values},
index=trials_df['end'])
# Fill inbetween trials with NaN.
pre_start_df = pd.DataFrame({'trial': np.NaN, 'type': np.NaN}, index=trials_df['pre-start'])
post_end_df = pd.DataFrame({'trial': np.NaN, 'type': np.NaN}, index=trials_df['post-end'])
new_df = start_df.append([end_df, pre_start_df, post_end_df])
new_df.sort_index(inplace=True)
# Each start/end index in new_df has corresponding value in type and trial column.
def get_tuple(idx):
res = new_df.iloc[new_df.index.get_loc(idx, method='nearest')]
# return trial and type column values.
return tuple(res.values)
# Apply this to all indices.
idx_series = continuous_df.index.to_series()
continuous_df['trial'] = idx_series.apply(get_tuple).values
continuous_df[['trial', 'type']] = continuous_df['trial'].apply(pd.Series)
def jp_data_analysis_answer(trials_df, continuous_df):
ranges = trials_df[['trial', 'type', 'start', 'end']].values
def return_trial(n):
for i, r in enumerate(ranges):
if r[2] <= n <= r[3]:
return tuple((i, r[1]))
else:
return np.nan, np.nan
continuous_df['trial'], continuous_df['type'] = list(zip(*continuous_df.index.map(return_trial)))
def performance_test(func, trials_df, continuous_df):
return_df = continuous_df.copy()
time_ref = time.perf_counter()
func(trials_df, return_df)
time_delta = time.perf_counter() - time_ref
print("time delta for {}:".format(func.__name__), time_delta)
return return_df
# Just to illustrate where this is going:
def plot_trial(continuous_df):
continuous_df['type'] = continuous_df['type'].astype('category')
continuous_df = continuous_df.groupby('type').filter(lambda x: x is not np.NaN)
# Without the NaNs in column, let's set the trial column to dtype integer.
continuous_df['trial'] = continuous_df['trial'].astype('int64')
# Plot the data by trial.
for key, group in continuous_df.groupby('trial'):
group.drop(['trial', 'type'], axis=1).plot()
plt.title('Trial {}, Type: {}'.format(key, group['type'].iloc[0]))
plt.show()
break
if __name__ == '__main__':
import time
num_trials = 360
max_start_time = 1400
trials_df = create_trials_df(max_start=max_start_time)
data_df = create_continuous_df(max_start=max_start_time)
# My original approach with a for-loop over iterrows.
iterrows_df = performance_test(iterrows_test,trials_df, data_df)
# itertuples test
itertuples_df = performance_test(itertuples_test,trials_df, data_df)
# apply() on trial data, continuous data is manipulated therein
apply_df = performance_test(apply_test,trials_df, data_df)
# Mapping on index of continuous data. SLOW!
map_idx_df = performance_test(real_slow_index_map,trials_df, data_df)
# method by jp_data_analysis' answer. Works well with small continuous_df, but doesn't scale well.
jp_df = performance_test(jp_data_analysis_answer,trials_df, data_df)
plot_trial(apply_df)
I see a factor ~7x improvement with below logic. The trick is to use an index.map(custom_function) on continuous_df and unpack the results, together with (in my opinion) underused for..else.. construct. This is still sub-optimal, but may be sufficient for your purposes, and certainly better than iterating rows.
import numpy as np
import pandas as pd
def test2():
# First df holds start and end times (as seconds) of a trial as well as type of trial.
num_trials = 360
max_start = 1400.0
d = {'trial': pd.Series(np.sort(np.random.choice(np.arange(1, 400), replace=False, size=(360,)))),
'type': pd.Series(np.random.choice(('a', 'b', 'c', 'd'),size=num_trials)),
'start': pd.Series(np.sort(np.random.random_sample((num_trials,))) * max_start)}
trials_df = pd.DataFrame(d)
# Create column for when the trial ended.
trials_df['end'] = trials_df['start'].shift(-1)
trials_df.loc[num_trials-1, 'end'] = trials_df['start'].iloc[-1] + 2.0
trials_df['diff'] = trials_df['end'] - trials_df['start']
trials_df['end'] = trials_df['end'] - trials_df['diff'] * 0.2
del trials_df['diff']
# Second df has continuously recorded data with time as index.
time_delta = 0.0333
rows = int(max_start+2/time_delta)
idx_time = pd.Index(np.arange(rows) * time_delta)
continuous_df = pd.DataFrame(np.random.randn(rows,3), index=idx_time, columns=list('XYZ'))
ranges = trials_df[['trial', 'type', 'start', 'end']].values
def return_trial(n):
for r in ranges:
if r[2] <= n <= r[3]:
return tuple(r[:2])
else:
return (np.nan, '')
continuous_df['trial'], continuous_df['type'] = list(zip(*continuous_df.index.map(return_trial)))
return trials_df, continuous_df

pandas dataframe find nth non isnull row

I want to know how many points in a pandas dataframe where index is a series of dates that I need to have in order to end up with X points after doing a dropna(). I want the latest points. Example:
window = 504
s1 = pd.DataFrame(stuff)
len(s1.index) --> 600
dropped_series = s1.dropna()
len(dropped_series.index) --> 480
diff_points_count = len(s1.index) - len(dropped_series.index)
final_series = s1.tail(window + diff_points_count).dropna()
--> len(final_series.index) does not necessarily equal the window. Depends on where the NaN's are.
I need it to work where s1 is either a pandas.Series or a pandas.DataFrame
Here is my solution, but I'm sure there's a more elegant way to do it:
all_series_df = pd.concat([harmonized_series_set[i] for i in series_indices], axis=1)
all_series_df['is_valid'] = all_series_df.apply(lambda x: 0 if np.any(np.isnan(x)) else 1, raw=True, axis=1)
valid_point_count = all_series_df['is_valid'].sum()
all_series_df['count_valid'] = valid_point_count - all_series_df['is_valid'].cumsum() + 1
matching_row_array = all_series_df.loc[all_series_df['count_valid'] == (window + output_length - 1)]
matching_row_index = 0
if isinstance(matching_row_array, pd.DataFrame) and len(matching_row_array.index) > 0:
matching_row_index = all_series_df.index.get_loc(matching_row_array.index[0])
tail_amount = len(all_series_df.index) - matching_row_index
for i, arg in enumerate(args):
if i in series_indices:
tailed_series = harmonized_series_set[i].tail(tail_amount)
harmonized_args.append(tailed_series)
else:
harmonized_args.append(arg)
return tuple(harmonized_args)

Categories