Pandas flexible determination of metrics - python

Imagine we have dataframes with different structures in Pandas:
import pandas as pd

# creating the first dataframe
df1 = pd.DataFrame({
    "width": [1, 5],
    "height": [5, 8]})
# creating the second dataframe
df2 = pd.DataFrame({
    "a": [7, 8],
    "b": [11, 23],
    "c": [1, 3]})
# creating the third dataframe
df3 = pd.DataFrame({
    "radius": [7, 8],
    "height": [11, 23]})
In general there might be more than two dataframes. Now, I want to create logic that maps column names to specific functions in order to create a new column "metric" (think of it as area for two columns and volume for three columns). I want to specify column-name ensembles:
def area(width, height):
    return width * height

def volume_cube(a, b, c):
    return a * b * c

def volume_cylinder(radius, height):
    return (3.14159 * radius ** 2) * height

column_name_ensembles = {
    "1": {
        "ensemble": ['height', 'width'],
        "method": area},
    "2": {
        "ensemble": ['a', 'b', 'c'],
        "method": volume_cube},
    "3": {
        "ensemble": ['radius', 'height'],
        "method": volume_cylinder}}
Now, the area function creates a new column for the dataframe, df1['metric'] = df1['height'] * df1['width'], and the volume function creates a new column for the dataframe, df2['metric'] = df2['a'] * df2['b'] * df2['c']. Note that the functions can have an arbitrary form, but they take the ensemble as parameters. The desired function metric(df, column_name_ensembles) should take an arbitrary dataframe as input and decide, by inspecting the column names, which function should be applied.
Example input/output behaviour:
df1_with_metric = metric(df1, column_name_ensembles)
print(df1_with_metric)
# output
#    width  height  metric
# 0      1       5       5
# 1      5       8      40
df2_with_metric = metric(df2, column_name_ensembles)
print(df2_with_metric)
# output
#    a   b  c  metric
# 0  7  11  1      77
# 1  8  23  3     552
df3_with_metric = metric(df3, column_name_ensembles)
print(df3_with_metric)
# output
#    radius  height      metric
# 0       7      11  1693.31701
# 1       8      23  4624.42048
The perfect solution would be a function that takes the dataframe and column_name_ensembles as parameters and returns the dataframe with the appropriate 'metric' column added to it.
I know this can be achieved with multiple if and else statements, but that does not seem to be the most intelligent solution. Maybe there is a design pattern that can solve this problem, but I am not an expert in design patterns.
Thank you for reading my question! I am looking forward to your answers.

You can use the inspect module to extract parameter names automatically and then map frozensets of parameter names to metric functions directly:
import inspect

metrics = {
    frozenset(inspect.signature(f).parameters): f
    for f in (area, volume_cube, volume_cylinder)
}
Then for a given data frame, if all columns are guaranteed to be arguments to the relevant metric, you can simply query that dictionary:
def apply_metric(df, metrics):
    metric = metrics[frozenset(df.columns)]
    args = tuple(df[p] for p in inspect.signature(metric).parameters)
    df['metric'] = metric(*args)
    return df
In case the input data frame has more columns than are required by the metric function you can use set intersection for finding the relevant metric:
def apply_metric(df, metrics):
    for parameters, metric in metrics.items():
        if parameters & set(df.columns) == parameters:
            args = tuple(df[p] for p in inspect.signature(metric).parameters)
            df['metric'] = metric(*args)
            break
    else:
        raise ValueError(f'No metric found for columns {df.columns}')
    return df
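For example, a quick usage sketch (assuming the dataframes and the original keyword-argument metric functions from the question are defined):
# build the lookup once, then apply it to each dataframe
df1_with_metric = apply_metric(df1, metrics)
df2_with_metric = apply_metric(df2, metrics)
df3_with_metric = apply_metric(df3, metrics)
print(df1_with_metric)  # expected: a 'metric' column with 5 and 40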

The function that runs the calculation should be a fairly flexible apply. Assuming the calculations will always be limited to the data in a single row, this would probably work.
First, I modified the functions to use a common input. I added a triangle area calc to be sure this was extensible.
# def area(width, height):
#     return width * height
def area(row):
    return row['width'] * row['height']

# def volume_cube(a, b, c):
#     return a * b * c
def volume_cube(row):
    return row['a'] * row['b'] * row['c']

# def volume_cylinder(radius, height):
#     return (3.14159 * radius ** 2) * height
def volume_cylinder(row):
    return (3.14159 * row['radius'] ** 2) * row['height']

def area_triangle(row):
    return 0.5 * row['width'] * row['height']
This allows us to use the same apply for all of the functions. Because I'm a bit OCD, I also changed the names of the keys in the reference dictionary.
column_name_ensembles = {
    "area": {
        "ensemble": ['width', 'height'],
        "method": area},
    "volume_cube": {
        "ensemble": ['a', 'b', 'c'],
        "method": volume_cube},
    "volume_cylinder": {
        "ensemble": ['radius', 'height'],
        "method": volume_cylinder},
    "area_triangle": {
        "ensemble": ['width', 'height'],
        "method": area_triangle},
}
The metric function is then an apply over the df. You have to specify the function you are targeting in this version, but you could infer the ensemble method based on the columns (a sketch of that inference follows the examples below). This version makes sure the required columns are available.
def metric(df, method_id):
    source_columns = list(df.columns)
    calc_columns = column_name_ensembles[method_id]['ensemble']
    if all(factor in source_columns for factor in calc_columns):
        df['metric'] = df.apply(lambda row: column_name_ensembles[method_id]['method'](row), axis=1)
        return df
    else:
        print('Column Mismatch')
You can then specify the dataframe and the ensemble method.
df1_with_metric = metric(df1,'area')
df2_with_metric = metric(df2,'volume_cube')
df3_with_metric = metric(df3,'volume_cylinder')
df1_with_triangle_metric = metric(df1,'area_triangle')
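A hedged sketch of the inference mentioned above (infer_method_id is a hypothetical helper, not part of the original answer):
def infer_method_id(df, ensembles):
    # return the key of the first ensemble whose columns are all present in df
    for method_id, spec in ensembles.items():
        if set(spec['ensemble']).issubset(df.columns):
            return method_id
    return None

# note: df1 matches both 'area' and 'area_triangle'; the first match in dict order wins
df1_with_metric = metric(df1, infer_method_id(df1, column_name_ensembles))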

def metric(df, column_name_ensembles):
    df_cols_set = set(df.columns)
    # in case there is a need to overwrite a previously calculated 'metric' column
    df_cols_set.discard('metric')
    for column_name_ensemble in column_name_ensembles.items():
        # pick up the first `column_name_ensemble` dictionary
        # with 'ensemble' matching the df columns
        # (excluding the 'metric' column, if present);
        # compare `set`s if the order of column names
        # in the ensemble does not matter (as per your df1 example),
        # else compare `list`s
        if df_cols_set == set(column_name_ensemble[1]['ensemble']):
            df['metric'] = column_name_ensemble[1]['method'](**{col: df[col] for col in df_cols_set})
            break
    # if there is a match, return df with 'metric' calculated;
    # else, return the original df untouched
    return df
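For example (a sketch, assuming the dataframes, the keyword-argument metric functions, and column_name_ensembles from the question are defined):
df1_with_metric = metric(df1, column_name_ensembles)
print(df1_with_metric)  # expected: 'metric' values 5 and 40, as in the question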

Solution
The idea is to make the function as generic as possible. To do that, you should rely on df.apply with axis=1 to apply the function row-wise.
The function would be:
def method(df_in, ensembles):
    # work on a copy to avoid modifying the original dataframe
    df = df_in.copy()
    for data in ensembles.values():
        if set(df.columns) == set(data["ensemble"]):
            df["metric"] = df.apply(lambda row: data["method"](**row), axis=1)
    return df
Why does it always work?
This approach works even for functions that cannot operate on whole columns at once.
For example:
df = pd.DataFrame({
    "a": [1, 2],
    "b": [[1, 2], [3, 4]],
})

def a_in_b(a, b):
    return a in b

# This will work
df.apply(lambda row: a_in_b(**row), axis=1)
# This won't
a_in_b(df["a"], df["b"])
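A quick usage sketch with the question's data (assuming the keyword-argument metric functions and column_name_ensembles from the question are defined):
df1_with_metric = method(df1, column_name_ensembles)
print(df1_with_metric)  # expected: a 'metric' column with 5 and 40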

Here is an interesting way of doing this using pandas methods (details below).
def metric(dataframe, column_name_ensembles):
    func_df = pd.DataFrame(column_name_ensembles).T
    func_to_apply = func_df.loc[func_df['ensemble'].map(dataframe.columns.difference)
                                .str.len().eq(0), 'method'].iat[0]
    return dataframe.assign(metric=dataframe.apply(lambda x: func_to_apply(**x), axis=1))
print(metric(df1,column_name_ensembles),'\n')
print(metric(df2,column_name_ensembles),'\n')
print(metric(df3,column_name_ensembles))
   width  height  metric
0      1       5       5
1      5       8      40

   a   b  c  metric
0  7  11  1      77
1  8  23  3     552

   radius  height      metric
0       7      11  1693.31701
1       8      23  4624.42048
More details:
func_df = pd.DataFrame(column_name_ensembles).T
This creates a dataframe of column names and associated methods like below:
           ensemble                                            method
1   [height, width]            <function area at 0x000002809540F9D8>
2         [a, b, c]     <function volume_cube at 0x000002809540F950>
3  [radius, height]  <function volume_cylinder at 0x000002809540FF28>
Using this dataframe, we find the row where the difference between the column names of the passed dataframe and the list of columns in the ensemble is 0, using pd.Index.difference, Series.map, Series.str.len and Series.eq():
func_df['ensemble'].map(df1.columns.difference)
1 Index([], dtype='object') <- Row matches the df columns completely
2 Index(['height', 'width'], dtype='object')
3 Index(['width'], dtype='object')
Name: ensemble, dtype: object
func_df['ensemble'].map(df1.columns.difference).str.len().eq(0)
1 True
2 False
3 False
Next, where True, we pick the function in the method column:
func_df.loc[func_df['ensemble'].map(df1.columns.difference)
            .str.len().eq(0), 'method'].iat[0]
#<function __main__.area(width, height)>
and using apply and df.assign we create a new 'metric' column, returning a copy of the passed dataframe.

Related

How to select list elements based on criteria from other lists

I am new to Python, coming from SciLab (an open source MatLab ersatz), which I am using as a toolbox for my analyses (test data analysis, reliability, acoustics, ...); I am definitely not a computer science lad.
I have data in the form of lists of same length (vectors of same size in SciLab).
I use some of them as parameters in order to select data from another one; e.g.
t_v = [1:10]; // a parameter vector
p_v = [20:29]; // another parameter vector
res_v(t_v > 5 & p_v < 28); // selects the res_v vector elements whose "corresponding" p_v and t_v values comply with my criteria; I can use it for analyses.
This is very direct and simple in SciLab; I did not find a way to achieve the same with Python, either "Pythonically" or simply translated.
Any idea that could help me, please?
Have a nice day,
Patrick.
You could use numpy arrays. It's easy:
import numpy as np
par1 = np.array([1,1,5,5,5,1,1])
par2 = np.array([-1,1,1,-1,1,1,1])
data = np.array([1,2,3,4,5,6,7])
print(par1)
print(par2)
print(data)
bool_filter = (par1[:]>1) & (par2[:]<0)
# example to do it directly in the array
filtered_data = data[ par1[:]>1 ]
print( filtered_data )
#filtering with the two parameters
filtered_data_twice = data[ bool_filter==True ]
print( filtered_data_twice )
output:
[1 1 5 5 5 1 1]
[-1 1 1 -1 1 1 1]
[1 2 3 4 5 6 7]
[3 4 5]
[4]
Note that it does not keep the same number of elements.
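For reference, a direct NumPy translation of the SciLab snippet from the question might look like this (res_v here is just an assumed example vector of the same length):
import numpy as np

t_v = np.arange(1, 11)     # 1..10, like [1:10] in SciLab
p_v = np.arange(20, 30)    # 20..29
res_v = np.arange(30, 40)  # assumed example result vector

print(res_v[(t_v > 5) & (p_v < 28)])  # [35 36 37]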
Here's my modified solution according to your last comment.
t_v = list(range(1, 10))
p_v = list(range(20, 29))
res_v = list(range(30, 39))

def first_idex_greater_than(search_number, lst):
    for count, number in enumerate(lst):
        if number > search_number:
            return count

def first_idex_lower_than(search_number, lst):
    for count, number in enumerate(lst[::-1]):
        if number < search_number:
            return len(lst) - count  # since I searched lst from top to bottom,
                                     # I need to also reverse count

t_v_index = first_idex_greater_than(5, t_v)
p_v_index = first_idex_lower_than(28, p_v)
print(res_v[min(t_v_index, p_v_index):max(t_v_index, p_v_index)])
It returns the list [35, 36, 37].
I'm sure you can optimize it better according to your needs.
The problem statement is not clearly defined, but this is what I interpret to be a likely solution.
import pandas as pd

tv = list(range(1, 11))
pv = list(range(20, 30))
res = list(range(30, 40))
df = pd.DataFrame({'tv': tv, 'pv': pv, 'res': res})
print(df)

def criteria(row, col1, a, col2, b):
    if (row[col1] > a) & (row[col2] < b):
        return True
    else:
        return False

df['select'] = df.apply(lambda row: criteria(row, 'tv', 5, 'pv', 28), axis=1)
selected_res = df.loc[df['select']]['res'].tolist()
print(selected_res)

# ... or another way ..
print(df.loc[(df.tv > 5) & (df.pv < 28)]['res'])
This produces a dataframe where each column is one of the original lists, applies a selection criterion based on the tv and pv columns, and creates a new boolean column identifying the rows where the criterion, applied jointly to the two lists, is satisfied.
[35, 36, 37]
5 35
6 36
7 37

Pandas: How to highlight a cell value based on a Z-score value?

In my df below, I want to:
identify and flag the outliers in col_E using z-scores
separately explain how to identify and flag the outliers using z-scores in two or more columns, for example col_D & col_E
See below for the dataset
import pandas as pd
import numpy as np
from scipy import stats

# initialise data of lists
df = {
    'col_A': ['P0', 'P1', 'P2', 'P4', 'P5'],
    'col_B': [1, 1, 1, 1, 1],
    'col_C': [1, 2, 3, 5, 9],
    'col_D': [120.05, 181.90, 10.34, 153.10, 311.17],
    'col_E': [110.21, 191.12, 190.21, 12.00, 245.09],
    'col_F': [100.22, 199.10, 191.13, 199.99, 255.19],
    'col_G': [140.29, 291.07, 390.22, 245.09, 4122.62],
}
# Create DataFrame
df = pd.DataFrame(df)
# Print the output.
df
Desired: flag all outliers in col_E first, and then in col_D and col_E together (note: in my image below 10.34 and 12.00 were randomly highlighted)
Q1
Attempt:
#Q1
exclude_cols = ['col_A', 'col_B', 'col_C', 'col_D', 'col_F', 'col_G']
include_cols = ['col_E']  # desired column

def flag_outliers(s, exclude_cols):
    if s.name in exclude_cols:
        print(s.name)
        return ''
    else:
        s = df[(np.abs(stats.zscore(df['col_E'])) > 3)]  # not sure of this part of the code
        return ['background-color: yellow' if v else '' for v in indexes]

df.style.apply(lambda s: flag_outliers(s, exclude_cols), axis=1, subset=include_cols)
#Q2
exclude_cols = ['col_A', 'col_B', 'col_C', 'col_F', 'col_G']
include_cols = ['col_D', 'col_E']  # desired columns

def flag_outliers(s, exclude_cols):
    if s.name in exclude_cols:
        print(s.name)
        return ''
    else:
        s = df[(np.abs(stats.zscore(df['col_E'])) > 3)]  # not sure of this part of the code
        return ['background-color: yellow' if v else '' for v in indexes]

df.style.apply(lambda s: flag_outliers(s, exclude_cols), axis=1, subset=include_cols)
Thanks!
I assume the following meanings to demonstrate a broader range of usage:
Q1 stands for calculating a single column.
Q2 stands for calculating over multiple columns pooled together.
If Q2 is meant to be calculated on multiple columns separately, you can simply loop your Q1 solution over those columns, which should be trivial (a brief sketch of that case follows).
Keys
Q1 is quite straightforward, as one can return a list of values via a list comprehension.
Q2 is a little more complicated because the z-score is applied over a DataFrame subset (i.e. axis=None must be used). According to the official docs, when applying a style over a DataFrame, the returned object must also be a DataFrame with the same index and columns as the subset. This is what necessitates the reshaping and DataFrame construction below.
Single Column (Q1)
Note that z=3 is lowered to 1.5 for demonstration purposes.
# desired column
include_cols = ['col_E']
# additional control
outlier_threshold = 1.5  # 3 won't work!
ddof = 0  # degree of freedom correction. Sample = 1 and population = 0.

def flag_outliers(s: pd.Series):
    outlier_mask = np.abs(stats.zscore(s, ddof=ddof)) > outlier_threshold
    # replace boolean values with corresponding strings
    return ['background-color: yellow' if val else '' for val in outlier_mask]

df.style.apply(flag_outliers, subset=include_cols)
Result
Multiple Column Pooled (Q2, Assumed)
# Q2
include_cols = ['col_D', 'col_E']  # desired columns
outlier_threshold = 1.5
ddof = 0

def flag_outliers(s: pd.DataFrame) -> pd.DataFrame:
    outlier_mask = np.abs(stats.zscore(s.values.reshape(-1), axis=None, ddof=ddof)) > outlier_threshold
    # prepare the array of string to be returned
    arr = np.array(['background-color: yellow' if val else '' for val in outlier_mask], dtype=object).reshape(s.shape)
    # cast the array into dataframe
    return pd.DataFrame(arr, columns=s.columns, index=s.index)

df.style.apply(flag_outliers, axis=None, subset=include_cols)
Result
Based on this answer, just map the z-score condition to a dict storing the background color for each column index.
include_cols = ['col_D', 'col_E']

def color_outliers_yellow(row, include, color='yellow', z_score=1):
    styles = {col: '' for col in row.index}
    if row.name in include:
        scores = stats.zscore(list(row))
        scores = [(f'background-color: {color}' if score > z_score else '') for score in scores]
        return {k: v for k, v in zip(styles.keys(), scores)}
    else:
        return styles

df.style.apply(lambda x: color_outliers_yellow(x, include=include_cols), axis=0)
Results in:

Cover all columns using the least amount of rows in a pandas dataframe

I have a pandas dataframe looking like the following picture:
The goal here is to select the least amount of rows to have a "1" in all columns. In this scenario, the final selection should be these two rows:
The algorithm should work even if I add columns and rows. It should also work if I change the combination of 1 and 0 in any given row.
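Since the pictures are not reproduced here, a small 0/1 dataframe of the kind described might look like this (values chosen purely for illustration):
import pandas as pd

df = pd.DataFrame({
    'A': [1, 0, 1, 0],
    'B': [0, 1, 0, 0],
    'C': [1, 0, 0, 1],
    'D': [0, 1, 1, 0],
})
# here rows 0 and 1 together already have a 1 in every column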
Sum per row, then compare with Series.ge (>=) for greater or equal, and filter by boolean indexing:
df[df.sum(axis=1).ge(2)]
If you want to test for 1 or 0 values, first compare with DataFrame.eq for equality (==):
df[df.eq(1).sum(axis=1).ge(2)]
df[df.eq(0).sum(axis=1).ge(2)]
For those interested, this is how I managed to do it:
from itertools import combinations

def _getBestRowsFinalSelection(self, df, cols):
    """
    Get the selected rows for the final selection

    Parameters:
    1. df: Dataframe to use
    2. cols: Columns of the binary variables in the Dataframe object (df)

    RETURNS -> DataFrame : dfSelected
    """
    isOne = df.loc[df[df.loc[:, cols] == 1].sum(axis=1) > 0, :]
    lstIsOne = isOne.loc[:, cols].values.tolist()
    lstIsOne = [(x, lstItem) for x, lstItem in zip(isOne.index.values.tolist(), lstIsOne)]
    winningComb = None
    stopFlag = False
    for i in range(1, isOne.shape[0] + 1):
        if stopFlag:
            break
        combs = combinations(lstIsOne, i)
        for c in combs:
            data = [x[1] for x in c]
            index = [x[0] for x in c]
            dfTmp = pd.DataFrame(data=data, columns=cols, index=index)
            if (dfTmp.sum() > 0).all():
                dfTmp["Final Selection"] = "Yes"
                winningComb = dfTmp
                stopFlag = True
                break
    return winningComb

Advanced groupby column creation in pandas Dataframe

I have a catalogue of groups of galaxies in a DataFrame, 'compact', which consists mainly of
a group id ('CG', int),
a magnitude ('R', negative float)
and a morphology ('Morph', string, for example 'S' or 'E').
I'm trying to construct a second pandas DataFrame with the following properties of the groups:
'Morph' of the object having the lowest 'R' in the group
Difference between the second lowest and the lowest 'R' in the group
Difference between the lowest 'R' in the group and R of the group, defined as -2.5*log10(sum(10**(-0.4*R)))
Proportions of objects having a given 'Morph' (one column for 'S', one for other morphologies, for example) in the group, NOT COUNTING THE ONE HAVING THE LOWEST 'R'.
I'm having trouble with the last one; could you help me write it? The other ones work, but, as a secondary question, I would like to know whether I'm doing it cleanly or whether there is a better way.
Here is my code (with a line for my last column which works but doesn't give exactly what I want, and an attempt in comments which doesn't work):
GroupBy = compact.sort_values('R').groupby('CG', as_index=False)
R2 = GroupBy.head(2).groupby('CG', as_index=False).last().R
R1 = GroupBy.first().sort_values('CG').R
DeltaR12 = R2 - R1
MorphCen = GroupBy.first().sort_values('CG').Morph
Group = GroupBy.first().sort_values('CG').CG
RGroup = GroupBy.apply(lambda x: -2.5*np.log10((10**(-0.4*x.R)).sum()))
DeltaR1gr = R1 - RGroup

# Works, but counts the object with the lowest R:
PropS = GroupBy.apply(lambda x: 1.0*x.loc[x['Morph'] == 'S'].shape[0]/x.shape[0])

# Tries to leave aside the lowest R, but doesn't work:
# PropS = GroupBy.apply(lambda x: 1.0*x.loc[x['Morph'] == 'S' &
#                                           x['R']>x['R'].min()].shape[0]/x.shape[0])
# PropRed = same as PropS, but for 'Morph' != 'S'

CompactML = pd.DataFrame([Group, MorphCen, DeltaR12, DeltaR1gr]).transpose()
CompactML.columns = ['CG', 'MorphCen', 'DeltaR12', 'DeltaR1gr']
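For what it's worth, the commented-out attempt fails mainly because & binds tighter than the comparisons, so each condition needs its own parentheses. A hedged sketch of the corrected line (dividing by x.shape[0] - 1 to exclude the brightest object from the denominator is an assumption about the intended proportion):
PropS = GroupBy.apply(lambda x: 1.0*x.loc[(x['Morph'] == 'S') &
                                          (x['R'] > x['R'].min())].shape[0]/(x.shape[0] - 1))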
First, it's nice if you provide actual data or create some fake data. Below I have created some fake data with 5 different integer CG groups, 2 types of morphology (S and E) and random negative numbers for 'R'.
I have then redone all your aggregations in a custom function that computes each of the four aggregations in one line and returns the results as a Series, whose entries become the columns of the aggregated DataFrame.
# create fake data
df = pd.DataFrame({'CG': np.random.randint(0, 5, 100),
                   'Morph': np.random.choice(['S', 'E'], 100),
                   'R': np.random.rand(100) * -100})
print(df.head())
   CG Morph          R
0   3     E -72.377887
1   2     E -26.126565
2   0     E  -4.428494
3   0     E  -2.055434
4   4     E -93.341489
# define custom aggregation function
def my_agg(x):
    x = x.sort_values('R')
    morph = x.head(1)['Morph'].values[0]
    diff = x.iloc[0]['R'] - x.iloc[1]['R']
    diff2 = -2.5*np.log10(sum(10**(-0.4*x['R'])))
    prop = (x['Morph'].iloc[1:] == 'S').mean()
    return pd.Series([morph, diff, diff2, prop], index=['morph', 'diff', 'diff2', 'prop'])

# apply custom agg function
df.groupby('CG').apply(my_agg)
   morph       diff      diff2      prop
CG
0      E  -1.562630 -97.676934  0.555556
1      S  -3.228845 -98.398337  0.391304
2      S  -6.537937 -91.092164  0.307692
3      E  -0.023813 -99.919336  0.500000
4      E -11.943842 -99.815734  0.705882
So, here is the final code, thanks to Ted Pertou:
# define custom aggregation function
def my_agg(x):
    x = x.sort_values('R')
    morph = x.head(1)['Morph'].values[0]
    diff = x.iloc[1]['R'] - x.iloc[0]['R']
    diff2 = x.iloc[0]['R'] + 2.5*np.log10(sum(10**(-0.4*x['R'])))
    prop = (x['Morph'].iloc[1:] == 'S').mean()
    return pd.Series([morph, diff, diff2, prop], index=['MorphCen', 'DeltaR12', 'DeltaRGrp1', 'PropS'])

# apply custom agg function
compact.groupby('CG').apply(my_agg)

List Highest Correlation Pairs from a Large Correlation Matrix in Pandas?

How do you find the top correlations in a correlation matrix with Pandas? There are many answers on how to do this with R (Show correlations as an ordered list, not as a large matrix or Efficient way to get highly correlated pairs from large data set in Python or R), but I am wondering how to do it with pandas. In my case the matrix is 4460x4460, so I can't do it visually.
You can use DataFrame.values to get a NumPy array of the data and then use NumPy functions such as argsort() to get the most correlated pairs.
But if you want to do this in pandas, you can unstack and sort the DataFrame:
import pandas as pd
import numpy as np
shape = (50, 4460)
data = np.random.normal(size=shape)
data[:, 1000] += data[:, 2000]
df = pd.DataFrame(data)
c = df.corr().abs()
s = c.unstack()
so = s.sort_values(kind="quicksort")
print(so[-4470:-4460])
Here is the output:
2192 1522 0.636198
1522 2192 0.636198
3677 2027 0.641817
2027 3677 0.641817
242 130 0.646760
130 242 0.646760
1171 2733 0.670048
2733 1171 0.670048
1000 2000 0.742340
2000 1000 0.742340
dtype: float64
@HYRY's answer is perfect. Just building on that answer by adding a bit more logic to avoid duplicate and self-correlations and to sort properly:
import pandas as pd

d = {'x1': [1, 4, 4, 5, 6],
     'x2': [0, 0, 8, 2, 4],
     'x3': [2, 8, 8, 10, 12],
     'x4': [-1, -4, -4, -4, -5]}
df = pd.DataFrame(data=d)
print("Data Frame")
print(df)
print()
print("Correlation Matrix")
print(df.corr())
print()

def get_redundant_pairs(df):
    '''Get diagonal and lower triangular pairs of correlation matrix'''
    pairs_to_drop = set()
    cols = df.columns
    for i in range(0, df.shape[1]):
        for j in range(0, i+1):
            pairs_to_drop.add((cols[i], cols[j]))
    return pairs_to_drop

def get_top_abs_correlations(df, n=5):
    au_corr = df.corr().abs().unstack()
    labels_to_drop = get_redundant_pairs(df)
    au_corr = au_corr.drop(labels=labels_to_drop).sort_values(ascending=False)
    return au_corr[0:n]

print("Top Absolute Correlations")
print(get_top_abs_correlations(df, 3))
That gives the following output:
Data Frame
x1 x2 x3 x4
0 1 0 2 -1
1 4 0 8 -4
2 4 8 8 -4
3 5 2 10 -4
4 6 4 12 -5
Correlation Matrix
x1 x2 x3 x4
x1 1.000000 0.399298 1.000000 -0.969248
x2 0.399298 1.000000 0.399298 -0.472866
x3 1.000000 0.399298 1.000000 -0.969248
x4 -0.969248 -0.472866 -0.969248 1.000000
Top Absolute Correlations
x1 x3 1.000000
x3 x4 0.969248
x1 x4 0.969248
dtype: float64
A few-line solution without redundant pairs of variables:
corr_matrix = df.corr().abs()

# the matrix is symmetric, so we need to extract the upper triangle without the diagonal (k=1)
sol = (corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
                  .stack()
                  .sort_values(ascending=False))
# the first element of the sol Series is the pair with the biggest correlation
Then you can iterate through the names of the variable pairs (which are pandas.Series multi-indexes) and their values like this:
for index, value in sol.items():
    # do some stuff
Combining some features of @HYRY's and @arun's answers, you can print the top correlations for dataframe df in a single line using:
df.corr().unstack().sort_values().drop_duplicates()
Note: the one downside is that if you have 1.0 correlations that are not between a variable and itself, the drop_duplicates() call will remove them.
I liked Addison Klinke's post the most, as the simplest, but used Wojciech Moszczyński's suggestion for filtering and charting, and extended the filter to avoid absolute values. So, given a large correlation matrix, filter it, chart it, and then flatten it:
Created, Filtered and Charted
import matplotlib.pyplot as plt
import seaborn as sn

dfCorr = df.corr()
filteredDf = dfCorr[((dfCorr >= .5) | (dfCorr <= -.5)) & (dfCorr != 1.000)]
plt.figure(figsize=(30, 10))
sn.heatmap(filteredDf, annot=True, cmap="Reds")
plt.show()
Function
In the end, I created a small function to create the correlation matrix, filter it, and then flatten it. As an idea, it could easily be extended, e.g. with asymmetric upper and lower bounds, etc.
def corrFilter(x: pd.DataFrame, bound: float):
    xCorr = x.corr()
    xFiltered = xCorr[((xCorr >= bound) | (xCorr <= -bound)) & (xCorr != 1.000)]
    xFlattened = xFiltered.unstack().sort_values().drop_duplicates()
    return xFlattened

corrFilter(df, .7)
Follow-Up
Eventually, I refined the functions
# Returns correlation matrix
def corrFilter(x: pd.DataFrame, bound: float):
    xCorr = x.corr()
    xFiltered = xCorr[((xCorr >= bound) | (xCorr <= -bound)) & (xCorr != 1.000)]
    return xFiltered

# Flattens correlation matrix with bounds
def corrFilterFlattened(x: pd.DataFrame, bound: float):
    xFiltered = corrFilter(x, bound)
    xFlattened = xFiltered.unstack().sort_values().drop_duplicates()
    return xFlattened

# Returns correlations for a variable from a flattened correlation matrix
def filterForLabels(df: pd.DataFrame, label):
    try:
        sideLeft = df[label,]
    except:
        sideLeft = pd.DataFrame()
    try:
        sideRight = df[:, label]
    except:
        sideRight = pd.DataFrame()
    if sideLeft.empty and sideRight.empty:
        return pd.DataFrame()
    elif sideLeft.empty:
        concat = sideRight.to_frame()
        concat.rename(columns={0: 'Corr'}, inplace=True)
        return concat
    elif sideRight.empty:
        concat = sideLeft.to_frame()
        concat.rename(columns={0: 'Corr'}, inplace=True)
        return concat
    else:
        concat = pd.concat([sideLeft, sideRight], axis=1)
        concat["Corr"] = concat[0].fillna(0) + concat[1].fillna(0)
        concat.drop(columns=[0, 1], inplace=True)
        return concat
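A brief usage sketch of the refined functions (assuming df holds your feature columns; 'x1' stands in for any column name in your data):
flattened = corrFilterFlattened(df, .7)  # flattened Series of correlations beyond +/- 0.7
filterForLabels(flattened, 'x1')         # correlations involving the 'x1' variable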
You can do this graphically with the following simple code, substituting your data.
import matplotlib.pyplot as plt
import seaborn as sns

corr = df.corr()
kot = corr[corr >= .9]
plt.figure(figsize=(12, 8))
sns.heatmap(kot, cmap="Greens")
Use the code below to view the correlations in descending order.
# See the correlations in descending order
corr = df.corr() # df is the pandas dataframe
c1 = corr.abs().unstack()
c1.sort_values(ascending = False)
Combining most of the answers above into a short snippet:
def top_entries(df):
    mat = df.corr().abs()

    # Remove duplicate and identity entries
    mat.loc[:, :] = np.tril(mat.values, k=-1)
    mat = mat[mat > 0]

    # Unstack, sort ascending, and reset the index, so features are in columns
    # instead of indexes (allowing e.g. a pretty print in Jupyter).
    # Also rename the columns for good measure.
    return (mat.unstack()
               .sort_values(ascending=False)
               .reset_index()
               .rename(columns={
                   "level_0": "feature_a",
                   "level_1": "feature_b",
                   0: "correlation"
               }))
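For example (a sketch, assuming df is the small x1..x4 example DataFrame used earlier):
print(top_entries(df).head(3))
# expected: x1/x3 on top, followed by the two pairs with |corr| close to 0.97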
Lots of good answers here. The easiest way I found was a combination of some of the answers above.
corr = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
corr = corr.unstack().sort_values(ascending=False).dropna()
The following function should do the trick. This implementation:
Removes self correlations
Removes duplicates
Enables the selection of top N highest correlated features
It is also configurable, so that you can keep both the self correlations and the duplicates. You can also report as many feature pairs as you wish.
def get_feature_correlation(df, top_n=None, corr_method='spearman',
                            remove_duplicates=True, remove_self_correlations=True):
    """
    Compute the feature correlation and sort feature pairs based on their correlation

    :param df: The dataframe with the predictor variables
    :type df: pandas.core.frame.DataFrame
    :param top_n: Top N feature pairs to be reported (if None, all of the pairs will be returned)
    :param corr_method: Correlation computation method
    :type corr_method: str
    :param remove_duplicates: Indicates whether duplicate features must be removed
    :type remove_duplicates: bool
    :param remove_self_correlations: Indicates whether self correlations will be removed
    :type remove_self_correlations: bool

    :return: pandas.core.frame.DataFrame
    """
    corr_matrix_abs = df.corr(method=corr_method).abs()
    corr_matrix_abs_us = corr_matrix_abs.unstack()
    sorted_correlated_features = corr_matrix_abs_us \
        .sort_values(kind="quicksort", ascending=False) \
        .reset_index()

    # Remove comparisons of the same feature
    if remove_self_correlations:
        sorted_correlated_features = sorted_correlated_features[
            (sorted_correlated_features.level_0 != sorted_correlated_features.level_1)
        ]

    # Remove duplicates
    if remove_duplicates:
        sorted_correlated_features = sorted_correlated_features.iloc[:-2:2]

    # Create meaningful names for the columns
    sorted_correlated_features.columns = ['Feature 1', 'Feature 2', 'Correlation (abs)']

    if top_n:
        return sorted_correlated_features[:top_n]

    return sorted_correlated_features
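A brief usage sketch (assuming df holds your predictor variables):
get_feature_correlation(df, top_n=10, corr_method='pearson')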
Use itertools.combinations to get all unique correlations from pandas' own correlation matrix .corr(), generate a list of lists, and feed it back into a DataFrame in order to use .sort_values. Set ascending=True to display the lowest correlations on top.
corrank takes a DataFrame as an argument because it requires .corr().
def corrank(X: pd.DataFrame):
    import itertools
    df = pd.DataFrame([[(i, j), X.corr().loc[i, j]] for i, j in list(itertools.combinations(X.corr(), 2))],
                      columns=['pairs', 'corr'])
    print(df.sort_values(by='corr', ascending=False))

corrank(X)  # prints a descending list of correlation pairs (max on top)
I didn't want to unstack or over-complicate this issue, since I just wanted to drop some highly correlated features as part of a feature selection phase.
So I ended up with the following simplified solution:
# map features to their absolute correlation values
corr = features.corr().abs()
# set equality (self correlation) as zero
corr[corr == 1] = 0
# of each feature, find the max correlation
# and sort the resulting array in ascending order
corr_cols = corr.max().sort_values(ascending=False)
# display the highly correlated features
display(corr_cols[corr_cols > 0.8])
In this case, if you want to drop correlated features, you may map through the filtered corr_cols array and remove the odd-indexed (or even-indexed) ones.
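A hedged sketch of that dropping step (which member of each correlated pair to keep is an assumption; inspect the pairs before dropping anything):
high = corr_cols[corr_cols > 0.8]
to_drop = high.index[::2]  # assumed: drop every other feature of the sorted, paired entries
features_reduced = features.drop(columns=to_drop)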
I was trying some of the solutions here, but then I came up with my own. I hope this might be useful to the next person, so I share it here:
def sort_correlation_matrix(correlation_matrix):
    cor = correlation_matrix.abs()
    top_col = cor[cor.columns[0]][1:]
    top_col = top_col.sort_values(ascending=False)
    ordered_columns = [cor.columns[0]] + top_col.index.tolist()
    return correlation_matrix[ordered_columns].reindex(ordered_columns)
This is improved code from @MiFi. It orders by absolute value without excluding the negative values.
def top_correlation(df, n):
    corr_matrix = df.corr()
    correlation = (corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
                              .stack()
                              .sort_values(ascending=False))
    correlation = pd.DataFrame(correlation).reset_index()
    correlation.columns = ["Variable_1", "Variable_2", "Correlacion"]
    correlation = correlation.reindex(correlation.Correlacion.abs().sort_values(ascending=False).index).reset_index().drop(["index"], axis=1)
    return correlation.head(n)

top_correlation(ANYDATA, 10)
Simple is better:
from collections import defaultdict

res = defaultdict(dict)
corr = returns.corr().replace(1, -1)
names = list(corr)
for name in names:
    idx = corr[name].argmax()
    max_pairwise_name = names[idx]
    res[name][max_pairwise_name] = corr.loc[max_pairwise_name, name]
Now res contains, for each variable, its most correlated partner and the corresponding correlation.
