Find pandas quartiles based on another column - python

I have a dataframe:
Av_Temp Tot_Precip
278.001 0
274 0.0751864
270.294 0.631634
271.526 0.229285
272.246 0.0652201
273 0.0840059
270.463 0.0602944
269.983 0.103563
268.774 0.0694555
269.529 0.010908
270.062 0.043915
271.982 0.0295718
I want to find the percentile values (25%, 50%, 75%) for the column: 'Tot_Precip' for each decile (top 10%, next 10% ...) of values from the column: Av_Temp. Currently, I am doing this:
import numpy, pandas, pdb

expl_var = 'Av_Temp'
cname = 'Tot_Precip'
num_samples = 10
max_val = df[expl_var].max()
min_val = df[expl_var].min()
expl_bins = numpy.linspace(min_val, max_val, num=num_samples)
for index, val in enumerate(expl_bins):
    print(index)
    if index < (len(expl_bins) - 1):
        cur_val = val
        nxt_val = expl_bins[index+1]
        # Subset dataframe to rows with values of expl_var between
        # cur_val and nxt_val
        sub_ind_df = df[(df[expl_var] >= cur_val) & (df[expl_var] <= nxt_val)]
        sub_ind_df[cname+'_quartiles'] = pandas.qcut(sub_ind_df[cname], 4)
        # Merge with sub_df
        pdb.set_trace()
Not sure how to proceed after this.
The answer could be something like:
Av_Temp_decile Tot_Precip_25 Tot_Precip_50 Tot_Precip_75
270 - 272 0.03 0.05 0.08

I'm only splitting your data into halves rather than deciles here due to the small example dataset, but everything should work the same if you just increase the number of bins in the initial cut:
# Change this to 10 to get deciles
df['Temp_Halves'] = pd.qcut(df['Av_Temp'], 2)

def get_quartiles(group):
    # Add retbins=True to get the bin edges
    qs, bins = pd.qcut(group['Tot_Precip'], [.25, .5, .75], retbins=True)
    # Returning a Series from a function means groupby.apply() will
    # expand it into separate columns
    return pd.Series(bins, index=['Precip_25', 'Precip_50', 'Precip_75'])

df.groupby('Temp_Halves').apply(get_quartiles)
Out[21]:
Precip_25 Precip_50 Precip_75
Temp_Halves
[268.774, 270.995] 0.048010 0.064875 0.095036
(270.995, 278.001] 0.038484 0.070203 0.081801
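For reference, a hedged sketch (not part of the original answer) that computes the same three percentiles per Av_Temp bin more directly with groupby().quantile(); note that with this 12-row sample some deciles hold only one or two rows, and the names deciles and result are illustrative:
import pandas as pd

deciles = pd.qcut(df['Av_Temp'], 10)                # bin Av_Temp into deciles
result = (df.groupby(deciles)['Tot_Precip']
            .quantile([.25, .5, .75])               # one row per (decile, quantile)
            .unstack()                              # quantiles become columns
            .rename(columns={0.25: 'Tot_Precip_25', 0.5: 'Tot_Precip_50', 0.75: 'Tot_Precip_75'}))
print(result)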

Related

Remove following rows that are above or under by X amount from the current row['x']

I am calculating correlations and the data frame I have needs to be filtered.
I am looking to remove the rows under the current row from the data frame that are above or under by X amount starting with the first row and looping through the dataframe all the way until the last row.
example:
df['y'] has the values 50,51,52,53,54,55,70,71,72,73,74,75
If X = 10, it would start at 50, see 51, 52, 53, 54, 55 as within that +-10 range, and delete those rows. 70 would stay because it is not within that range, and the same test would start again at 70, where 71, 72, 73, 74, 75 and their respective rows would be deleted.
The filter with X = 10 would thus leave us with the rows containing 50 and 70 in df.
It would leave me with a clean dataframe that drops the instances linked to the first instance of what is essentially the same observed period. I tried coding a loop to do that, but I am left with the wrong result and am desperate at this point. Hopefully someone can correct the mistake or point me in the right direction.
df6['index'] = df6.index
df6.sort_values('index')
boom = len(dataframe1.index)/3
#Taking initial comparison values from first row
c = df6.iloc[0]['index']
#Including first row in result
filters = [True]
#Skipping first row in comparisons
for index, row in df6.iloc[1:].iterrows():
    if c-boom <= row['index'] <= c+boom:
        filters.append(False)
    else:
        filters.append(True)
        # Updating values to compare based on latest accepted row
        c = row['index']
df2 = df6.loc[filters].sort_values('correlation').drop('index', 1)
df2
OUTPUT BEFORE
OUTPUT AFTER
IIUC, your main issue is filtering consecutive values within a threshold.
You can use a custom function that acts on a Series (i.e. a column) and returns the list of valid indices:
def consecutive(s, threshold=10):
    prev = float('-inf')
    idx = []
    for i, val in s.items():
        if val - prev > threshold:
            idx.append(i)
            prev = val
    return idx
Example of use:
import pandas as pd
df = pd.DataFrame({'y': [50,51,52,53,54,55,70,71,72,73,74,75]})
df2 = df.loc[consecutive(df['y'])]
Output:
y
0 50
6 70
Variant
If you prefer the function to return a boolean indexer, here is a variant:
def consecutive(s, threshold=10):
    prev = float('-inf')
    idx = [False]*len(s)
    for i, val in s.items():
        if val - prev > threshold:
            idx[i] = True
            prev = val
    return idx
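Usage is the same as before; since this variant returns a boolean mask, df.loc can take it directly (a minimal sketch reusing the example df):
import pandas as pd

df = pd.DataFrame({'y': [50,51,52,53,54,55,70,71,72,73,74,75]})
mask = consecutive(df['y'])   # one boolean per row
df2 = df.loc[mask]            # keeps the rows with y = 50 and y = 70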

How to iterate over rows of each column in a dataframe

My current code works and produces a graph if there is only 1 sensor, i.e. if col2 and col3 are deleted in the example data provided below, leaving one column.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

d = {'col1': [-2587.944231, -1897.324231,-2510.304231,-2203.814231,-2105.734231,-2446.964231,-2963.904231,-2177.254231, 2796.354231,-2085.304231], 'col2': [-3764.468462,-3723.608462,-3750.168462,-3694.998462,-3991.268462,-3972.878462,3676.608462,-3827.808462,-3629.618462,-1841.758462,], 'col3': [-166.1357692,-35.36576923, 321.4157692,108.9257692,-123.2257692, -10.84576923, -100.7457692, 89.27423077, -211.0857692, 101.5342308]}
df = pd.DataFrame(data=d)
sensors = 3
window_size = 5
dfn = df.rolling(window_size).corr(pairwise = True)
index = df.index #index of values in the data frame.
rows = len(index) #len(index) returns number of rows in the data.
sensors = 3
baseline_num = [0]*(rows) #baseline numerator, by default zero
baseline = [0]*(rows) #initialize baseline value
baseline = pd.DataFrame(baseline)
baseline_num = pd.DataFrame(baseline_num)
v = [None]*(rows) # Initialize an empty array v[] equal to amount of rows in .csv file
s = [None]*(rows) #Initialize another empty array for the slope values for detecting when there is an exposure
d = [0]*(rows)
sensors_on = True #Is the sensor detecting something (True) or not (False).
off_count = 0
off_require = 8 # how many offs until baseline is updated
sensitivity = 1000

for i in range(0, (rows)): #This iterates over each index value, i.e. each row, and sums the values and returns them in list format.
    v[i] = dfn.loc[i].to_numpy().sum() - sensors

for colname, colitems in df.iteritems():
    for rownum, rowitem in colitems.iteritems():
        #d[rownum] = dfone.loc[rownum].to_numpy()
        #d[colname][rownum] = df.loc[colname][rownum]
        if v[rownum] >= sensitivity:
            sensors_on = True
            off_count = 0
            baseline_num[rownum] = 0
        else:
            sensors_on = False
            off_count += 1
            if off_count == off_require:
                for x in range(0, (off_require)):
                    baseline_num[colname][rownum] += df[colname][rownum - x]
            elif off_count > off_require:
                baseline_num[colname][rownum] += baseline_num[colname][rownum - 1] + df[colname][rownum] - (df[colname][rownum - off_require]) #this loop is just an optimization, one calculation per loop once the first calculation is established
            baseline[colname][rownum] = ((baseline_num[colname][rownum])//(off_require)) #mean of the last "off_require" points

dfx = pd.DataFrame(v, columns =['Sensor Correlation']) #converts the summed correlation tables back from list format to a DataFrame, with the sole column name 'Sensor Correlation'
dft = pd.DataFrame(baseline, columns =['baseline'])
dft = dft.astype(float)
dfx.plot(figsize=(50,25), linewidth=5, fontsize=40) # plots dfx dataframe which contains correlated and summed data
dft.plot(figsize=(50,25), linewidth=5, fontsize=40)
Basically, instead of 1 graph as this produces, I would like to iterate over each column only for this loop:
for colname, colitems in df.iteritems():
    for rownum, rowitem in colitems.iteritems():
        #d[rownum] = dfone.loc[rownum].to_numpy()
        #d[colname][rownum] = df.loc[colname][rownum]
        if v[rownum] >= sensitivity:
            sensors_on = True
            off_count = 0
            baseline_num[rownum] = 0
        else:
            sensors_on = False
            off_count += 1
            if off_count == off_require:
                for x in range(0, (off_require)):
                    baseline_num[colname][rownum] += df[colname][rownum - x]
            elif off_count > off_require:
                baseline_num[colname][rownum] += baseline_num[colname][rownum - 1] + df[colname][rownum] - (df[colname][rownum - off_require]) #this loop is just an optimization, one calculation per loop once the first calculation is established
I've tried some other solutions from other questions but none of them seem to solve this case.
As of now, I've tried multiple conversions to things like lists and tuples, and then calling them something like this:
baseline_num[i,column] += d[i - x,column]
as well as
baseline_num[i][column] += d[i - x][column]
while iterating over the loop using
for column in columns
However, no matter how I arrange the solution, there is always some KeyError about expecting integer or slice indices, among other errors.
See the pictures for expected/possible outputs of one column on actual data, with varying input parameters (the sensitivity value and off_require are varied in the different cases).
One such solution which didn't work was the looping method from this link:
https://www.geeksforgeeks.org/iterating-over-rows-and-columns-in-pandas-dataframe/
I've also tried creating a loop using iteritems as the outer loop. That did not work either.
Below are links to possible graph outputs for various sensitivity values and windows in my actual dataset, with only one column (i.e. I manually deleted the other columns and plotted just the one using the current program):
sensitivity 1000, window 8
sensitivity 800, window 5
sensitivity 1500, window 5
If there's anything I've left out that would be helpful to solving this, please let me know so I can rectify it immediately.
See this picture for my original df.head:
df.head
Did you try,
for colname, colitems in df.iteritems():
    for rownum, rowitem in colitems.iteritems():
        print(df[colname][rownum])
The first loop iterates over all the columns, and the second loop iterates over all the rows of that column.
Edit:
From our conversation below, I think that your baseline and df dataframes don't have the same column names because of how you created them and how you are accessing the elements.
My suggestion is that you create the baseline dataframe to be a copy of your df dataframe and edit the information within it from there.
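A minimal sketch of that suggestion (hedged, setup only): copying df guarantees baseline and baseline_num start with the same columns and index, without sharing data the way pd.DataFrame(df) can.
# hypothetical setup following the suggestion above
baseline = df.copy()        # same columns/index as df, independent data
baseline_num = df.copy()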
Edit:
I have managed to make your code work for one loop, but I run into an index error. I am not sure what your optimisation step does, but I think that is what is causing it; take a look.
It is this part: baseline_num[colname][rownum - 1]. In the second loop, I guess, because you do rownum (0) - 1, you get index -1. You need to change it so that in the first iteration rownum is 1 or something; I am not sure what you are trying to do there.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

d = {'col1': [-2587.944231, -1897.324231,-2510.304231,-2203.814231,-2105.734231,-2446.964231,-2963.904231,-2177.254231, 2796.354231,-2085.304231], 'col2': [-3764.468462,-3723.608462,-3750.168462,-3694.998462,-3991.268462,-3972.878462,3676.608462,-3827.808462,-3629.618462,-1841.758462,], 'col3': [-166.1357692,-35.36576923, 321.4157692,108.9257692,-123.2257692, -10.84576923, -100.7457692, 89.27423077, -211.0857692, 101.5342308]}
df = pd.DataFrame(data=d)
sensors = 3
window_size = 5
dfn = df.rolling(window_size).corr(pairwise = True)
index = df.index #index of values in the data frame.
rows = len(index) #len(index) returns number of rows in the data.
sensors = 3
baseline_num = [0]*(rows) #baseline numerator, by default zero
baseline = [0]*(rows) #initialize baseline value
baseline = pd.DataFrame(df)
baseline_num = pd.DataFrame(df)
#print(baseline_num)
v = [None]*(rows) # Initialize an empty array v[] equal to amount of rows in .csv file
s = [None]*(rows) #Initialize another empty array for the slope values for detecting when there is an exposure
d = [0]*(rows)
sensors_on = True #Is the sensor detecting something (True) or not (False).
off_count = 0
off_require = 8 # how many offs until baseline is updated
sensitivity = 1000

for i in range(0, (rows)): #This iterates over each index value, i.e. each row, and sums the values and returns them in list format.
    v[i] = dfn.loc[i].to_numpy().sum() - sensors

for colname, colitems in df.iteritems():
    #print(colname)
    for rownum, rowitem in colitems.iteritems():
        #print(rownum)
        #display(baseline[colname][rownum])
        #d[rownum] = dfone.loc[rownum].to_numpy()
        #d[colname][rownum] = df.loc[colname][rownum]
        if v[rownum] >= sensitivity:
            sensors_on = True
            off_count = 0
            baseline_num[rownum] = 0
        else:
            sensors_on = False
            off_count += 1
            if off_count == off_require:
                for x in range(0, (off_require)):
                    baseline_num[colname][rownum] += df[colname][rownum - x]
            elif off_count > off_require:
                baseline_num[colname][rownum] += baseline_num[colname][rownum - 1] + df[colname][rownum] - (df[colname][rownum - off_require]) #this loop is just an optimization, one calculation per loop once the first calculation is established
            baseline[colname][rownum] = ((baseline_num[colname][rownum])//(off_require)) #mean of the last "off_require" points
            print(baseline[colname][rownum])

dfx = pd.DataFrame(v, columns =['Sensor Correlation']) #converts the summed correlation tables back from list format to a DataFrame, with the sole column name 'Sensor Correlation'
dft = pd.DataFrame(baseline, columns =['baseline'])
dft = dft.astype(float)
dfx.plot(figsize=(50,25), linewidth=5, fontsize=40) # plots dfx dataframe which contains correlated and summed data
dft.plot(figsize=(50,25), linewidth=5, fontsize=40)
My output looks like this,
-324.0
-238.0
-314.0
-276.0
-264.0
-306.0
-371.0
-806.0
638.0
-412.0
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
/usr/local/lib/python3.7/dist-packages/pandas/core/indexes/range.py in get_loc(self, key, method, tolerance)
354 try:
--> 355 return self._range.index(new_key)
356 except ValueError as err:
ValueError: -1 is not in range
The above exception was the direct cause of the following exception:
KeyError Traceback (most recent call last)
3 frames
/usr/local/lib/python3.7/dist-packages/pandas/core/indexes/range.py in get_loc(self, key, method, tolerance)
355 return self._range.index(new_key)
356 except ValueError as err:
--> 357 raise KeyError(key) from err
358 raise KeyError(key)
359 return super().get_loc(key, method=method, tolerance=tolerance)
KeyError: -1
I don't have enough rep to comment, but below is what I was able to work out. Hope it helps!
I tried to use the to_list() function while working out an answer, and it threw me an error:
AttributeError: 'DataFrame' object has no attribute 'to_list'
So, I decided to circumvent that method and came up with this:
indexes = [x for x in df.index]
row_vals = []
for index in indexes:
    for val in df.iloc[index].values:
        row_vals.append(val)
The object row_vals will contain all values in row order.
If you only want to get the row values for a particular row or set of rows, you would need to do this:
indx_subset = [`list of row indices`] #(Ex. [1, 2, 5, 6, etc...])
row_vals = []
for indx in indx_subset:
    for val in df.loc[indx].values:
        row_vals.append(val)
row_vals will then have all the row values from the specified indices.

Calculate column in Pandas Dataframe using adjacent rows without iterating through each row

I would like to see if there is a way to calculate a column in a dataframe that uses something similar to a moving average without iterating through each row.
Current working code:
def create_candles(ticks, instrument, time_slice):
    candlesticks = ticks.price.resample(time_slice, base=00).ohlc().bfill()
    volume = ticks.amount.resample(time_slice, base=00).sum()
    candlesticks['volume'] = volume
    candlesticks['instrument'] = instrument
    candlesticks['ttr'] = 0
    # candlesticks['vr_7'] = 0
    candlesticks['vr_10'] = 0
    candlesticks = calculate_indicators(candlesticks, instrument, time_slice)
    return candlesticks

def calculate_indicators(candlesticks, instrument, time_slice):
    candlesticks.sort_index(inplace=True)
    # candlesticks['rsi_14'] = talib.RSI(candlesticks.close, timeperiod=14)
    candlesticks['lr_50'] = talib.LINEARREG(candlesticks.close, timeperiod=50)
    # candlesticks['lr_150'] = talib.LINEARREG(candlesticks.close, timeperiod=150)
    # candlesticks['ema_55'] = talib.EMA(candlesticks.close, timeperiod=55)
    # candlesticks['ema_28'] = talib.EMA(candlesticks.close, timeperiod=28)
    # candlesticks['ema_18'] = talib.EMA(candlesticks.close, timeperiod=18)
    # candlesticks['ema_9'] = talib.EMA(candlesticks.close, timeperiod=9)
    # candlesticks['wma_21'] = talib.WMA(candlesticks.close, timeperiod=21)
    # candlesticks['wma_12'] = talib.WMA(candlesticks.close, timeperiod=12)
    # candlesticks['wma_11'] = talib.WMA(candlesticks.close, timeperiod=11)
    # candlesticks['wma_5'] = talib.WMA(candlesticks.close, timeperiod=5)
    candlesticks['cmo_9'] = talib.CMO(candlesticks.close, timeperiod=9)
    for row in candlesticks.itertuples():
        current_index = candlesticks.index.get_loc(row.Index)
        if current_index >= 1:
            previous_close = candlesticks.iloc[current_index - 1, candlesticks.columns.get_loc('close')]
            candlesticks.iloc[current_index, candlesticks.columns.get_loc('ttr')] = max(
                row.high - row.low,
                abs(row.high - previous_close),
                abs(row.low - previous_close))
            if current_index > 10:
                candlesticks.iloc[current_index, candlesticks.columns.get_loc('vr_10')] = candlesticks.iloc[current_index, candlesticks.columns.get_loc('ttr')] / (
                    max(candlesticks.high[current_index - 9: current_index].max(), candlesticks.close[current_index - 11]) -
                    min(candlesticks.low[current_index - 9: current_index].min(), candlesticks.close[current_index - 11]))
    candlesticks['timestamp'] = pd.to_datetime(candlesticks.index)
    candlesticks['instrument'] = instrument
    candlesticks.fillna(0, inplace=True)
    return candlesticks
In the iteration, I am calculating the True Range ('TTR') and then the Volatility Ratio ('VR_10').
TTR is calculated on every row in the DF except for the first one. It uses the previous row's close column and the current row's high and low columns.
VR_10 is calculated on every row except for the first 10. It uses the high and low columns of the previous 9 rows and the close of the 10th row back.
EDIT 2
I have tried many ways to add a text-based data frame to this question; there just doesn't seem to be a solution given the width of my frame. There is no difference between the input and output dataframes other than that the columns TTR and VR_10 are all 0s in the input and have non-zero values in the output.
An example would be this dataframe:
Is there a way I can do this without iteration?
With the nudge from Andreas to use rolling, I came to an answer:
First, I had to find out how to use rolling with multiple columns; I found that here.
I made a modification because I need to roll up, not down:
from numpy.lib.stride_tricks import as_strided as stride

def roll(df, w, **kwargs):
    df.sort_values(by='timestamp', ascending=0, inplace=True)
    v = df.values
    d0, d1 = v.shape
    s0, s1 = v.strides
    a = stride(v, (d0 - (w - 1), w, d1), (s0, s0, s1))
    rolled_df = pd.concat({
        row: pd.DataFrame(values, columns=df.columns)
        for row, values in zip(df.index, a)
    })
    return rolled_df.groupby(level=0, **kwargs)
After that, I created two functions:
def calculate_vr(window):
    return window.iloc[0].ttr / (max(window.high[1:9].max(), window.iloc[10].close) - min(window.low[1:9].min(), window.iloc[10].close))

def calculate_ttr(window):
    return max(window.iloc[0].high - window.iloc[0].low, abs(window.iloc[0].high - window.iloc[1].close), abs(window.iloc[0].low - window.iloc[1].close))
and called those functions like this:
candlesticks['ttr'] = roll(candlesticks, 3).apply(calculate_ttr)
candlesticks['vr_10'] = roll(candlesticks, 11).apply(calculate_vr)
I added timers to both approaches, and this way is roughly 3X slower than iteration.
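Since the stride-based roll() ended up slower than iteration, here is a hedged sketch (not from the original answers) that vectorizes the two definitions from the question directly with shift() and rolling(); it assumes the frame is sorted ascending by time and has the high/low/close columns used above, and the function name is hypothetical:
import pandas as pd

def add_ttr_vr10(candlesticks):
    # True Range: max of (high - low, |high - prev_close|, |low - prev_close|)
    prev_close = candlesticks['close'].shift(1)
    tr = pd.concat([
        candlesticks['high'] - candlesticks['low'],
        (candlesticks['high'] - prev_close).abs(),
        (candlesticks['low'] - prev_close).abs(),
    ], axis=1).max(axis=1)
    candlesticks['ttr'] = tr

    # Volatility Ratio: ttr / (range of the previous 9 highs/lows, widened by the close 11 rows back)
    hi9 = candlesticks['high'].shift(1).rolling(9).max()
    lo9 = candlesticks['low'].shift(1).rolling(9).min()
    c11 = candlesticks['close'].shift(11)
    candlesticks['vr_10'] = tr / (pd.concat([hi9, c11], axis=1).max(axis=1)
                                  - pd.concat([lo9, c11], axis=1).min(axis=1))
    return candlesticks
The early rows come out as NaN instead of 0, which matches the loop's behaviour once fillna(0) is applied afterwards.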

Issue with .dropna() in Pandas

In the below function I am working with a Pandas dataframe. I am bringing in a data frame, and immediately resetting the index. I then make a copy of that dataframe so I avoid any Chained Assignment issues.
I then want to use .dropna(inplace=True, subset = [header], axis=0) to remove any rows where my column of interest (header) is nan. However, once I start into the for loop it is clear that the nan values haven't dropped because I keep getting warnings like:
RuntimeWarning: Mean of empty slice
which is a result of my array neighbors having all nan values.
My Question: In the line where I use df_copy.dropna(inplace=True, subset=[header], axis=0) does it seem like I am not actually getting a permanent drop of those rows?
import numpy as np
import pandas as pd

n_samples = 10
tolerance = 1.5
dataframe = pd.read_csv('my_file.csv')

def removeOutliers(dataframe, header):
    dataframe.reset_index(inplace=True, drop=True)
    df_copy = dataframe.copy()
    # Why doesn't the below actually drop the NaNs?
    df_copy.dropna(inplace=True, subset=[header], axis=0)
    for ii in range(len(df_copy['Lng'])):
        a = df_copy.iloc[ii]['Lng'] - df_copy.iloc[:]['Lng']
        b = df_copy.iloc[ii]['Lat'] - df_copy.iloc[:]['Lat']
        c = np.array((a**2 + b**2)**0.5)
        d = np.zeros((len(df_copy['Lng'])))
        e = np.zeros((len(df_copy['Lng'])))
        d[:] = df_copy.iloc[:]['Well']
        e[:] = df_copy.iloc[:][header]
        idx = np.argpartition(c, n_samples+1)
        max_loc = np.where(e[idx[0:n_samples+1]] == e[ii])
        neighbors = np.delete(e[idx[0:n_samples+1]], max_loc)
        avg = np.nanmean(neighbors)
        std = np.nanstd(neighbors)
        if df_copy.iloc[ii][header] > (avg + tolerance*std) or df_copy.iloc[ii][header] < (avg - tolerance*std):
            df_copy.iloc[ii, df_copy.columns.get_loc(header)] = np.nan
    return df_copy

test_data = removeOutliers(dataframe, 'myColumn')
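If the goal is just to confirm whether the dropna call itself is the problem, a minimal check (a hedged sketch, not from the original post) is to count the remaining NaNs and reset the index after dropping:
df_copy = dataframe.copy()
df_copy = df_copy.dropna(subset=[header]).reset_index(drop=True)  # header = column of interest
print(df_copy[header].isna().sum())  # prints 0 if the drop worked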

List Highest Correlation Pairs from a Large Correlation Matrix in Pandas?

How do you find the top correlations in a correlation matrix with Pandas? There are many answers on how to do this with R (Show correlations as an ordered list, not as a large matrix or Efficient way to get highly correlated pairs from large data set in Python or R), but I am wondering how to do it with pandas? In my case the matrix is 4460x4460, so can't do it visually.
You can use DataFrame.values to get a NumPy array of the data and then use NumPy functions such as argsort() to get the most correlated pairs.
But if you want to do this in pandas, you can unstack and sort the DataFrame:
import pandas as pd
import numpy as np
shape = (50, 4460)
data = np.random.normal(size=shape)
data[:, 1000] += data[:, 2000]
df = pd.DataFrame(data)
c = df.corr().abs()
s = c.unstack()
so = s.sort_values(kind="quicksort")
print(so[-4470:-4460])
Here is the output:
2192 1522 0.636198
1522 2192 0.636198
3677 2027 0.641817
2027 3677 0.641817
242 130 0.646760
130 242 0.646760
1171 2733 0.670048
2733 1171 0.670048
1000 2000 0.742340
2000 1000 0.742340
dtype: float64
@HYRY's answer is perfect. Just building on that answer by adding a bit more logic to avoid duplicate and self correlations and to get proper sorting:
import pandas as pd

d = {'x1': [1, 4, 4, 5, 6],
     'x2': [0, 0, 8, 2, 4],
     'x3': [2, 8, 8, 10, 12],
     'x4': [-1, -4, -4, -4, -5]}
df = pd.DataFrame(data=d)
print("Data Frame")
print(df)
print()
print("Correlation Matrix")
print(df.corr())
print()

def get_redundant_pairs(df):
    '''Get diagonal and lower triangular pairs of correlation matrix'''
    pairs_to_drop = set()
    cols = df.columns
    for i in range(0, df.shape[1]):
        for j in range(0, i+1):
            pairs_to_drop.add((cols[i], cols[j]))
    return pairs_to_drop

def get_top_abs_correlations(df, n=5):
    au_corr = df.corr().abs().unstack()
    labels_to_drop = get_redundant_pairs(df)
    au_corr = au_corr.drop(labels=labels_to_drop).sort_values(ascending=False)
    return au_corr[0:n]

print("Top Absolute Correlations")
print(get_top_abs_correlations(df, 3))
That gives the following output:
Data Frame
x1 x2 x3 x4
0 1 0 2 -1
1 4 0 8 -4
2 4 8 8 -4
3 5 2 10 -4
4 6 4 12 -5
Correlation Matrix
x1 x2 x3 x4
x1 1.000000 0.399298 1.000000 -0.969248
x2 0.399298 1.000000 0.399298 -0.472866
x3 1.000000 0.399298 1.000000 -0.969248
x4 -0.969248 -0.472866 -0.969248 1.000000
Top Absolute Correlations
x1 x3 1.000000
x3 x4 0.969248
x1 x4 0.969248
dtype: float64
A few-line solution without redundant pairs of variables:
corr_matrix = df.corr().abs()

# the matrix is symmetric so we need to extract upper triangle matrix without diagonal (k = 1)
sol = (corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
                  .stack()
                  .sort_values(ascending=False))

# first element of sol series is the pair with the biggest correlation
Then you can iterate through the names of the variable pairs (which are pandas.Series multi-indexes) and their values like this:
for index, value in sol.items():
    # do some stuff
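For example, to print the three strongest pairs from the sol series above (a minimal sketch):
for (var_a, var_b), value in sol.head(3).items():
    print(var_a, var_b, round(value, 3))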
Combining some features of @HYRY and @arun's answers, you can print the top correlations for dataframe df in a single line using:
df.corr().unstack().sort_values().drop_duplicates()
Note: the one downside is that if you have 1.0 correlations that are not a variable with itself, the drop_duplicates() addition would remove them.
I liked Addison Klinke's post the most, as it is the simplest, but I used Wojciech Moszczyńsk's suggestion for filtering and charting, and extended the filter to avoid absolute values. So, given a large correlation matrix, filter it, chart it, and then flatten it:
Created, Filtered and Charted
import matplotlib.pyplot as plt
import seaborn as sn

dfCorr = df.corr()
filteredDf = dfCorr[((dfCorr >= .5) | (dfCorr <= -.5)) & (dfCorr != 1.000)]
plt.figure(figsize=(30,10))
sn.heatmap(filteredDf, annot=True, cmap="Reds")
plt.show()
Function
In the end, I created a small function to create the correlation matrix, filter it, and then flatten it. As an idea, it could easily be extended, e.g., asymmetric upper and lower bounds, etc.
def corrFilter(x: pd.DataFrame, bound: float):
    xCorr = x.corr()
    xFiltered = xCorr[((xCorr >= bound) | (xCorr <= -bound)) & (xCorr != 1.000)]
    xFlattened = xFiltered.unstack().sort_values().drop_duplicates()
    return xFlattened

corrFilter(df, .7)
Follow-Up
Eventually, I refined the functions:
# Returns correlation matrix
def corrFilter(x: pd.DataFrame, bound: float):
    xCorr = x.corr()
    xFiltered = xCorr[((xCorr >= bound) | (xCorr <= -bound)) & (xCorr != 1.000)]
    return xFiltered

# Flattens correlation matrix with bounds
def corrFilterFlattened(x: pd.DataFrame, bound: float):
    xFiltered = corrFilter(x, bound)
    xFlattened = xFiltered.unstack().sort_values().drop_duplicates()
    return xFlattened

# Returns correlation for a variable from flattened correlation matrix
def filterForLabels(df: pd.DataFrame, label):
    try:
        sideLeft = df[label,]
    except:
        sideLeft = pd.DataFrame()
    try:
        sideRight = df[:,label]
    except:
        sideRight = pd.DataFrame()
    if sideLeft.empty and sideRight.empty:
        return pd.DataFrame()
    elif sideLeft.empty:
        concat = sideRight.to_frame()
        concat.rename(columns={0:'Corr'}, inplace=True)
        return concat
    elif sideRight.empty:
        concat = sideLeft.to_frame()
        concat.rename(columns={0:'Corr'}, inplace=True)
        return concat
    else:
        concat = pd.concat([sideLeft, sideRight], axis=1)
        concat["Corr"] = concat[0].fillna(0) + concat[1].fillna(0)
        concat.drop(columns=[0,1], inplace=True)
        return concat
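A possible usage sketch (hedged; 'x1' refers to the small example frame earlier in this thread): flatten first, then pull out every pair involving one variable:
flat = corrFilterFlattened(df, .5)   # flattened, bounded correlations
filterForLabels(flat, 'x1')          # correlations that involve 'x1'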
You can do this graphically, using the simple code below, by substituting your data.
import matplotlib.pyplot as plt
import seaborn as sns

corr = df.corr()
kot = corr[corr >= .9]
plt.figure(figsize=(12,8))
sns.heatmap(kot, cmap="Greens")
Use the code below to view the correlations in the descending order.
# See the correlations in descending order
corr = df.corr() # df is the pandas dataframe
c1 = corr.abs().unstack()
c1.sort_values(ascending = False)
Combining most of the answers above into a short snippet:
def top_entries(df):
    mat = df.corr().abs()

    # Remove duplicate and identity entries
    mat.loc[:,:] = np.tril(mat.values, k=-1)
    mat = mat[mat > 0]

    # Unstack, sort ascending, and reset the index, so features are in columns
    # instead of indexes (allowing e.g. a pretty print in Jupyter).
    # Also rename them for good measure.
    return (mat.unstack()
               .sort_values(ascending=False)
               .reset_index()
               .rename(columns={
                   "level_0": "feature_a",
                   "level_1": "feature_b",
                   0: "correlation"
               }))
Lots of good answers here. The easiest way I found was a combination of some of the answers above.
corr = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
corr = corr.unstack().transpose()\
    .sort_values(ascending=False)\
    .dropna()
The following function should do the trick. This implementation:
Removes self correlations
Removes duplicates
Enables the selection of the top N highest correlated features
It is also configurable, so that you can keep both the self correlations and the duplicates, and you can report as many feature pairs as you wish.
def get_feature_correlation(df, top_n=None, corr_method='spearman',
                            remove_duplicates=True, remove_self_correlations=True):
    """
    Compute the feature correlation and sort feature pairs based on their correlation

    :param df: The dataframe with the predictor variables
    :type df: pandas.core.frame.DataFrame
    :param top_n: Top N feature pairs to be reported (if None, all of the pairs will be returned)
    :param corr_method: Correlation computation method
    :type corr_method: str
    :param remove_duplicates: Indicates whether duplicate features must be removed
    :type remove_duplicates: bool
    :param remove_self_correlations: Indicates whether self correlations will be removed
    :type remove_self_correlations: bool
    :return: pandas.core.frame.DataFrame
    """
    corr_matrix_abs = df.corr(method=corr_method).abs()
    corr_matrix_abs_us = corr_matrix_abs.unstack()
    sorted_correlated_features = corr_matrix_abs_us \
        .sort_values(kind="quicksort", ascending=False) \
        .reset_index()

    # Remove comparisons of the same feature
    if remove_self_correlations:
        sorted_correlated_features = sorted_correlated_features[
            (sorted_correlated_features.level_0 != sorted_correlated_features.level_1)
        ]

    # Remove duplicates
    if remove_duplicates:
        sorted_correlated_features = sorted_correlated_features.iloc[:-2:2]

    # Create meaningful names for the columns
    sorted_correlated_features.columns = ['Feature 1', 'Feature 2', 'Correlation (abs)']

    if top_n:
        return sorted_correlated_features[:top_n]

    return sorted_correlated_features
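A quick usage sketch (hedged; df is any dataframe of numeric features):
top_pairs = get_feature_correlation(df, top_n=10, corr_method='spearman')
print(top_pairs)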
Use itertools.combinations to get all unique correlations from pandas' own correlation matrix .corr(), generate a list of lists, and feed it back into a DataFrame in order to use .sort_values(). Set ascending=True to display the lowest correlations on top.
corrank takes a DataFrame as its argument because it requires .corr().
def corrank(X: pd.DataFrame):
    import itertools
    df = pd.DataFrame([[(i, j), X.corr().loc[i, j]] for i, j in list(itertools.combinations(X.corr(), 2))],
                      columns=['pairs', 'corr'])
    print(df.sort_values(by='corr', ascending=False))

corrank(X) # prints a descending list of correlation pairs (max on top)
I didn't want to unstack or over-complicate this issue, since I just wanted to drop some highly correlated features as part of a feature selection phase.
So I ended up with the following simplified solution:
# map features to their absolute correlation values
corr = features.corr().abs()
# set equality (self correlation) as zero
corr[corr == 1] = 0
# of each feature, find the max correlation
# and sort the resulting array in ascending order
corr_cols = corr.max().sort_values(ascending=False)
# display the highly correlated features
display(corr_cols[corr_cols > 0.8])
In this case, if you want to drop correlated features, you may map through the filtered corr_cols array and remove the odd-indexed (or even-indexed) ones.
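A hedged sketch of that last step, assuming corr_cols and features from the snippet above; it relies on the two members of each highly correlated pair landing next to each other after sorting:
to_drop = corr_cols[corr_cols > 0.8].index[1::2]   # or [::2] to keep the other member of each pair
reduced = features.drop(columns=to_drop)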
I tried some of the solutions here but then came up with my own. I hope this might be useful to the next person, so I share it here:
def sort_correlation_matrix(correlation_matrix):
    cor = correlation_matrix.abs()
    top_col = cor[cor.columns[0]][1:]
    top_col = top_col.sort_values(ascending=False)
    ordered_columns = [cor.columns[0]] + top_col.index.tolist()
    return correlation_matrix[ordered_columns].reindex(ordered_columns)
This is improved code based on @MiFi's answer. It orders by absolute value without excluding the negative values.
def top_correlation(df, n):
    corr_matrix = df.corr()
    correlation = (corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
                              .stack()
                              .sort_values(ascending=False))
    correlation = pd.DataFrame(correlation).reset_index()
    correlation.columns = ["Variable_1", "Variable_2", "Correlacion"]
    correlation = correlation.reindex(correlation.Correlacion.abs().sort_values(ascending=False).index).reset_index().drop(["index"], axis=1)
    return correlation.head(n)

top_correlation(ANYDATA, 10)
simple is better
from collections import defaultdict
res = defaultdict(dict)
corr = returns.corr().replace(1, -1)
names = list(corr)
for name in names:
    idx = corr[name].argmax()
    max_pairwise_name = names[idx]
    res[name][max_pairwise_name] = corr.loc[max_pairwise_name, name]
Now res contains the maximum pairwise correlation for each variable.
