return value to df after several operations - python

I run IPR outlier control on a relatively big dataframe df.
I perform IPR within subsets of the data, so I use a for loop.
How can I return the values to the original df (>1,000,000 rows)?
   months product  brick  units  is_outlier
0  202104     abc      3   1.00       False
1  202104     abc      6   3.00       False
for product in df['product'].unique():
    for brick in df['brick'].unique():
        try:
            # Extract the units for the current product and brick
            data = df.loc[(df['product'] == product) & (df['brick'] == brick)]['units'].values
            # Scale the data
            scaler = StandardScaler()
            data_scaled = scaler.fit_transform(data.reshape(-1, 1))
            # Fit a linear regression model to the data
            reg = LinearRegression()
            reg.fit(np.arange(len(data_scaled)).reshape(-1, 1), data_scaled)
            # Calculate the residuals of the regression
            residuals = data_scaled - reg.predict(np.arange(len(data_scaled)).reshape(-1, 1))
            # Identify any observations with a residual larger than 2 standard deviations from the mean
            threshold = 2 * residuals.std()
            outliers = np.where(np.abs(residuals) > threshold)
            # Set the "is_outlier" column to True for the outliers in the current product
            df.loc[(df['product'] == product) & (df['brick'] == brick) & (df.index.isin(outliers[0])), 'is_outlier'] = True
        except:
            pass

As @QuangHoang suggested, use groupby and apply your custom function:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

def outlier(units):
    # 'units' is the Series of values for one (product, brick) group
    data = units.to_numpy().reshape((-1, 1))
    # Scale the data
    scaler = StandardScaler()
    data_scaled = scaler.fit_transform(data)
    # Fit a linear regression model to the data
    reg = LinearRegression()
    reg.fit(np.arange(len(data_scaled)).reshape(-1, 1), data_scaled)
    # Calculate the residuals of the regression
    residuals = data_scaled - reg.predict(np.arange(len(data_scaled)).reshape(-1, 1))
    # Identify any observations with a residual
    # larger than 2 standard deviations from the mean
    threshold = 2 * residuals.std()
    return np.ravel(np.abs(residuals) > threshold)

df['is_outlier'] = df.groupby(['product', 'brick'])['units'].transform(outlier)
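For a quick sanity check, here is a toy run under the question's column names (the values are made up; which rows come back True depends on each group's residual spread):
import pandas as pd

toy = pd.DataFrame({
    'months':  [202101, 202102, 202103, 202104] * 2,
    'product': ['abc'] * 4 + ['xyz'] * 4,
    'brick':   [3] * 4 + [6] * 4,
    'units':   [1.0, 1.1, 0.9, 50.0, 2.0, 2.1, 1.9, 2.0],
})
toy['is_outlier'] = toy.groupby(['product', 'brick'])['units'].transform(outlier)
print(toy)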

Related

Calculate standard deviations of estimation errors for ensemble models

I have a model in which I would like to analyse the residuals. Ultimately, I would like to identify extreme residuals that lie outside the confidence interval for each day, but I am having trouble calculating the pointwise standard deviation of residuals for each model in the bagging regressor.
My sample code is below:
import datetime
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.ensemble import BaggingRegressor

# Sample DataFrame
df = pd.DataFrame(np.random.randint(0, 200, size=(500, 4)), columns=list('ABCD'))

# Add dates to sample data
base = datetime.datetime.today()
date_list = [base - datetime.timedelta(days=x) for x in range(500)]
df['date'] = date_list
df['date'] = df['date'].astype('str')

# Split dataset into training (first 80%) and testing (last 20%)
train = df[:int(len(df) * 0.80)]
test = df[int(len(df) * 0.80):]
X_train = train[['B', 'C', 'D', 'date']]
X_test = test[['B', 'C', 'D', 'date']]
y_train = train[['A']]
y_test = test[['A']]

# Function to one-hot encode a feature and bind it back to the data
def encode_and_bind(data_in, feature_to_encode):
    dummies = pd.get_dummies(data_in[[feature_to_encode]])
    data_out = pd.concat([data_in, dummies], axis=1)
    data_out = data_out.drop([feature_to_encode], axis=1)
    return data_out

for feature in ['date']:
    X_train_final = encode_and_bind(X_train, feature)
    X_test_final = encode_and_bind(X_test, feature)

# Define model
svr_lin = SVR(kernel="linear", C=100, gamma="auto")
regr = BaggingRegressor(base_estimator=svr_lin, random_state=5).fit(X_train_final, y_train.values.ravel())

# Predictions
y_pred = regr.predict(X_test_final)

# Join the predictions back into the original dataframe
y_test['predict'] = y_pred

# Calculate residuals
y_test['residuals'] = y_test['A'] - y_test['predict']
I found this method online:
raw_pred = [x.predict([[0, 0, 0, 0]]) for x in regr.estimators_]
but I am not sure what to use for the x.predict([[0, 0, 0, 0]]) part, since I have far more than 4 features.
EDIT:
Building off of @2MuchC0ff33's answer, I tried
stdevs = []
for dates in X_test_final.columns[3:]:
    test = X_test_final[X_test_final[dates] == 1]
    raw_pred = [x.predict([test.iloc[0]]) for x in regr.estimators_]
    sdev = np.std(raw_pred)
    sdev = sdev.astype('str')
    stdevs.append(dates + "," + sdev)
it seems to be correct, but I don't know enough about how these calculations are being done to judge if this is working in the way I think it is.
F, thanks for sharing your attempt based on my answer.
I am going to try to break everything down and hopefully provide the solution you need. Apologies in advance if I repeat some of your code, but that is how my brain works, haha.
You can group the residuals by date and calculate the standard deviation for each group to calculate the pointwise standard deviation of residuals for each day. Here's how to go about it:
y_test['date'] = test['date'].apply(lambda x: x[:10])  # bring the date over from the test split; y_test itself only holds column A
grouped = y_test.groupby(['date'])
residual_groups = grouped['residuals']
residual_stds = residual_groups.std()
This will give you the residual standard deviation for each day. For each day, multiply the standard deviation by a constant such as 1.96 (for a 95% confidence interval) and add/subtract it from the mean of the residuals.
residual_means = residual_groups.mean()
CI = 1.96 * residual_stds
upper_bound = residual_means + CI
lower_bound = residual_means - CI
Finally, by comparing the residuals with the lower and upper bounds, you can identify the extreme residuals that lie outside the confidence interval for each day. Since the bounds are indexed by date, map them onto each row first:
ub = y_test['date'].map(upper_bound)
lb = y_test['date'].map(lower_bound)
extreme_residuals = y_test[(y_test['residuals'] > ub) | (y_test['residuals'] < lb)]
You can extend this method to find the standard deviation for each day.
# Group the test data by the date feature
# (note: this assumes a test frame that still has a raw 'date' column,
# i.e. before one-hot encoding)
grouped = X_test_final.groupby(['date'])
stdevs = []
for name, group in grouped:
    raw_pred = [x.predict(group) for x in regr.estimators_]
    # Calculate the standard deviation of the predictions for each group
    sdev = np.std(raw_pred)
    stdevs.append((name, sdev))
I think we could replace 0, 0, 0, 0 with X_test_final. Let me know your thoughts on my updated method below:
raw_pred = [x.predict([X_test_final.iloc[0]]) for x in regr.estimators_]

Drawing equal samples from each class in stratified sampling

So I have 1000 samples of class 1 and 2500 of class 2. Naturally, when using sklearn's train_test_split(test_size=200, stratify=y), I get an imbalanced test set, since it preserves the class distribution of the original data. However, I would like the split to have 100 class-1 and 100 class-2 samples in the test set.
How would I do it? Any suggestions would be appreciated.
Split Manually
A manual solution isn't that scary. Main steps explained:
Isolate the index of class-1 and class-2 rows.
Use np.random.permutation() to select random n1 and n2 test samples for class 1 and 2 respectively.
Use df.index.difference() to perform inverse selection for the train samples.
The code can be easily generalized to an arbitrary number of classes and arbitrary numbers to be selected as test data (just put n1/n2, idx1/idx2, etc. into lists and process them in loops). But that's out of the scope of the question itself.
Code
import numpy as np
import pandas as pd

# data
df = pd.DataFrame(
    data={
        "label": np.array([1]*1000 + [2]*2500),
        # label 1 has value > 0, label 2 has value < 0
        "value": np.hstack([np.random.uniform(0, 1, 1000),
                            np.random.uniform(-1, 0, 2500)])
    }
)
df = df.sample(frac=1).reset_index(drop=True)
# sampling number for each class
n1 = 100
n2 = 100
# 1. get indexes and lengths for the classes respectively
idx1 = df.index.values[df["label"] == 1]
idx2 = df.index.values[df["label"] == 2]
len1 = len(idx1) # 1000
len2 = len(idx2) # 2500
# 2. draw index for test dataset
draw1 = np.random.permutation(len1)[:n1] # keep the first n1 entries to be selected
idx1_test = idx1[draw1]
draw2 = np.random.permutation(len2)[:n2]
idx2_test = idx2[draw2]
# combine the drawn indexes
idx_test = np.hstack([idx1_test, idx2_test])
# 3. derive index for train dataset
idx_train = df.index.difference(idx_test)
# split
df_train = df.loc[idx_train, :] # optional: .reset_index(drop=True)
df_test = df.loc[idx_test, :]
# len(df_train) = 3300
# len(df_test) = 200
# verify that no row was missing
idx_merged = np.hstack([df_train.index.values, df_test.index.values])
assert len(np.unique(idx_merged)) == 3500
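For comparison, newer pandas (1.1+) can draw the per-class test sample directly with groupby().sample; a minimal sketch on the same df:
# draw 100 test rows per class, then take the complement as the train set
df_test = df.groupby("label").sample(n=100, random_state=42)
df_train = df.loc[df.index.difference(df_test.index)]
assert len(df_test) == 200 and len(df_train) == 3300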

Not able to store output from a custom function in a dataframe in Python

I have written a function to compute KS values for various iterations of a logistic regression model. When I run the code, the KS value is printed on the screen but it is not getting stored in the table.
Custom KS Function
def ks_value(bad_flag=None, predicted_prob=None):
    ## Data Prep
    ksdf = pd.DataFrame([])
    ksdf['bad_flag'] = bad_flag
    ksdf['probability'] = predicted_prob
    ksdf = ksdf.reset_index()
    ksdf.drop(columns=['index'], inplace=True)
    ksdf['decile'] = pd.qcut(ksdf['probability'], 10, labels=['1','2','3','4','5','6','7','8','9','10'])
    ksdf['good_flag'] = 1 - ksdf['bad_flag']
    ## Pivot
    ksdf1 = pd.pivot_table(data=ksdf, index=['decile'], values=['bad_flag','good_flag','probability'],
                           aggfunc={'bad_flag': [np.sum],
                                    'good_flag': [np.sum],
                                    'probability': [np.min, np.max]})
    ## Add Columns
    ksdf1['total_counts'] = ksdf1['bad_flag'] + ksdf1['good_flag']
    ksdf1 = ksdf1.reset_index()
    ksdf1.columns = ['Decile','Defaulter_Count','Non-Defaulter_Count','max_score','min_score','Total_Count']
    ksdf1 = ksdf1.sort_values(by='min_score', ascending=False)
    ksdf1['Default_Rate'] = (ksdf1['Defaulter_Count'] / ksdf1['Total_Count']).apply('{0:.2%}'.format)
    default_sum = ksdf1['Defaulter_Count'].sum()
    non_default_sum = ksdf1['Non-Defaulter_Count'].sum()
    ksdf1['Default %'] = (ksdf1['Defaulter_Count'] / default_sum).apply('{0:.2%}'.format)
    ksdf1['Non_Default %'] = (ksdf1['Non-Defaulter_Count'] / non_default_sum).apply('{0:.2%}'.format)
    ## Compute KS
    ksdf1['ks_stats'] = np.round(((ksdf1['Defaulter_Count'] / ksdf1['Defaulter_Count'].sum()).cumsum()
                                  - (ksdf1['Non-Defaulter_Count'] / ksdf1['Non-Defaulter_Count'].sum()).cumsum()), 4) * 100
    return ksdf1['ks_stats'].max()
Code for iterating on Logistic Regression
# Iterating to find the optimal value of C for model overfitting - checks on test data
C_param_range = [0.001, 0.01, 0.1, 1, 10, 100]
table1 = pd.DataFrame(columns=['C_parameter','Test Accuracy','Train Accuracy','Test KS','Train KS'])
table1['C_parameter'] = C_param_range
j = 0
for i in C_param_range:
    # Apply logistic regression model to training data
    lr = LogisticRegression(penalty='l2', C=i, random_state=0, max_iter=1000)
    lr.fit(X_train, y_train)
    # Predict class (0, 1) using model
    y_pred = lr.predict(X_test)
    y_pred2 = lr.predict(X_train)
    y_prob = lr.predict_proba(X_test)[:, 1]
    y_prob2 = lr.predict_proba(X_train)[:, 1]
    # KS value
    table1.iloc[j, 3] = ks_value(bad_flag=y_test, predicted_prob=y_prob)
    table1.iloc[j, 4] = ks_value(bad_flag=y_train, predicted_prob=y_prob2)
    # Saving accuracy score in table
    table1.iloc[j, 1] = accuracy_score(y_test, y_pred)
    table1.iloc[j, 2] = accuracy_score(y_train, y_pred2)
    j += 1
Output is something like this:
KS is 35.49
KS is 34.25
   C_parameter  Test Accuracy  Train Accuracy  Test KS  Train KS
0        0.001       0.919911        0.919056      NaN       NaN

Odd Results on Entropy Calculation

I am trying to write a function that properly calculates the entropy of a given dataset. However, I am getting very weird entropy values.
I am following the understanding that all entropy calculations must fall between 0 and 1, yet I am consistently getting values above 2.
Note: I must use log base 2 for this
Can someone explain why I am yielding incorrect entropy results?
The dataset I am testing is the ecoli dataset from the UCI Machine Learning Repository.
import numpy
import math

#################### DATA HANDLING LIBRARY ####################
def csv_to_array(file):
    # Open the file, and load it in delimiting on the ',' for a comma separated value file
    data = open(file, 'r')
    data = numpy.loadtxt(data, delimiter=',')
    # Loop through the data in the array
    for index in range(len(data)):
        # Utilize a try catch to try and convert to float; if it can't convert to float, convert to 0
        try:
            data[index] = [float(x) for x in data[index]]
        except ValueError:
            data[index] = 0
    # Return the now type-formatted data
    return data

# Function that utilizes the numpy library to randomize the dataset.
def randomize_data(csv):
    # numpy.random.shuffle shuffles in place and returns None,
    # so its result must not be reassigned
    numpy.random.shuffle(csv)
    return csv

# Function to split the data into test, training set, and validation sets
def split_data(csv):
    # Call the randomize data function
    randomize_data(csv)
    # Grab the number of rows and calculate where to split
    num_rows = csv.shape[0]
    validation_split = int(num_rows * 0.10)
    training_split = int(num_rows * 0.72)
    testing_split = int(num_rows * 0.18)
    # Validation set as the first 10% of the data
    validation_set = csv[:validation_split]
    # Training set as the next 72%
    training_set = csv[validation_split:training_split + validation_split]
    # Testing set as the last 18%
    testing_set = csv[training_split + validation_split:]
    # Split the data into classes vs actual data
    training_cols = training_set.shape[1]
    testing_cols = testing_set.shape[1]
    validation_cols = validation_set.shape[1]
    training_classes = training_set[:, training_cols - 1]
    testing_classes = testing_set[:, testing_cols - 1]
    validation_classes = validation_set[:, validation_cols - 1]
    # Take the sets and remove the last (classification) column
    # (slice columns, not rows)
    training_set = training_set[:, :-1]
    testing_set = testing_set[:, :-1]
    validation_set = validation_set[:, :-1]
    # Return the datasets
    return testing_set, testing_classes, training_set, training_classes, validation_set, validation_classes
#################### DATA HANDLING LIBRARY ####################

# This function returns the list of classes, and their associated weights (i.e. distributions)
# for a given dataset
def class_distribution(dataset):
    # Ensure the dataset is a numpy array
    dataset = numpy.asarray(dataset)
    # Collect # of total rows and columns, using numpy
    num_total_rows = dataset.shape[0]
    num_columns = dataset.shape[1]
    # Create a numpy array of just the classes
    classes = dataset[:, num_columns - 1]
    # Use numpy.unique to remove duplicates
    classes = numpy.unique(classes)
    # Create an empty list for the class weights
    class_weights = []
    # Loop through the classes one by one
    for aclass in classes:
        # Create storage variables
        total = 0
        weight = 0
        # Now loop through the dataset
        for row in dataset:
            # If the class of the row is equal to the current class you are evaluating, increase the total
            if numpy.array_equal(aclass, row[-1]):
                total = total + 1
            # If not, continue
            else:
                continue
        # Divide the # of occurrences by total rows
        weight = float(total / num_total_rows)
        # Add that weight to the list of class weights
        class_weights.append(weight)
    # Turn the weights into a numpy array
    class_weights = numpy.asarray(class_weights)
    # Return the array
    return classes, class_weights

# This function returns the entropy for a given dataset
# Can be used across an entire csv, or just for a column of data (feature)
def get_entropy(dataset):
    # Set initial entropy
    entropy = 0.0
    # Determine the classes and their frequencies (weights) of the dataset
    classes, class_freq = class_distribution(dataset)
    # Sort the frequencies so the most occurring class is tested first
    # (numpy.sort returns a new array; the result must be kept)
    class_freq = numpy.sort(class_freq)[::-1]
    # Determine the max entropy for the dataset
    max_entropy = math.log(len(classes), 2)
    print("MAX ENTROPY FOR THIS DATASET: ", max_entropy)
    # Loop through the frequencies and use given formula to calculate entropy
    # For...Each simulates the sequence operator
    for freq in class_freq:
        entropy += float(-freq * math.log(freq, 2))
    # Return the entropy value
    return entropy

def main():
    ecol = csv_to_array('ecoli.csv')
    testing_set, testing_classes, training_set, training_classes, validation_set, validation_classes = split_data(ecol)
    entropy = get_entropy(ecol)
    print(entropy)

main()
The following function was used to calculate Entropy:
# Function to return Shannon's Entropy
def entropy(attributes, dataset, targetAttr):
    freq = {}
    entropy = 0.0
    index = 0
    # Find the column index of the target attribute
    for item in attributes:
        if targetAttr == item:
            break
        else:
            index = index + 1
    index = index - 1
    # Count the frequency of each value of the target attribute
    for item in dataset:
        if item[index] in freq:
            # Increase the count
            freq[item[index]] += 1.0
        else:
            # Initialize it by setting it to 1
            freq[item[index]] = 1.0
    # Sum -p * log2(p) over the value frequencies
    for f in freq.values():
        entropy = entropy + (-f / len(dataset)) * math.log(f / len(dataset), 2)
    return entropy
As @MattTimmermans had indicated, entropy's value is actually contingent on the number of classes. For strictly 2 classes, it is contained in the 0 to 1 (inclusive) range. However, for more than 2 classes (which is what was being tested), entropy is calculated with a different formula (converted to Pythonic code above). This post here explains those mathematics and calculations a bit more in detail.
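To make the bound concrete, here is a small standalone check: a uniform distribution over k classes has entropy log2(k), so with 8 classes (as in the ecoli data) values up to 3 bits are perfectly valid:
import math

def shannon_entropy(probs):
    # probs are class probabilities that sum to 1
    return -sum(p * math.log(p, 2) for p in probs if p > 0)

print(shannon_entropy([0.5, 0.5]))   # 1.0 -> 2 classes cap out at 1 bit
print(shannon_entropy([1/8] * 8))    # 3.0 -> 8 classes cap out at log2(8) = 3 bits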

How to calculate correlation between all columns and remove highly correlated ones using pandas?

I have a huge data set, and prior to machine learning modeling it is always suggested that you first remove highly correlated descriptors (columns). How can I calculate the column-wise correlation and remove columns above a threshold value, say remove all the columns or descriptors having >0.8 correlation? It should also retain the headers in the reduced data.
Example data set
Example data set
GA     PN      PC      MBP    GR     AP
0.033  6.652   6.681   0.194  0.874  3.177
0.034  9.039   6.224   0.194  1.137  3.4
0.035  10.936  10.304  1.015  0.911  4.9
0.022  10.11   9.603   1.374  0.848  4.566
0.035  2.963   17.156  0.599  0.823  9.406
0.033  10.872  10.244  1.015  0.574  4.871
0.035  21.694  22.389  1.015  0.859  9.259
0.035  10.936  10.304  1.015  0.911  4.5
Please help....
The method here worked well for me, only a few lines of code: https://chrisalbon.com/machine_learning/feature_selection/drop_highly_correlated_features/
import numpy as np
# Create correlation matrix
corr_matrix = df.corr().abs()
# Select upper triangle of correlation matrix
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))  # np.bool is removed in newer NumPy; use plain bool
# Find features with correlation greater than 0.95
to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]
# Drop features
df.drop(to_drop, axis=1, inplace=True)
Here is the approach which I have used:
def correlation(dataset, threshold):
    col_corr = set()  # Set of all the names of deleted columns
    corr_matrix = dataset.corr()
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if (corr_matrix.iloc[i, j] >= threshold) and (corr_matrix.columns[j] not in col_corr):
                colname = corr_matrix.columns[i]  # getting the name of the column
                col_corr.add(colname)
                if colname in dataset.columns:
                    del dataset[colname]  # deleting the column from the dataset
    print(dataset)
Hope this helps!
Here is an Auto ML class I created to eliminate multicollinearity between features.
What makes my code unique is that, out of two features that have high correlation, I eliminate the feature that is least correlated with the target! I got the idea from this seminar by Vishal Patel Sir - https://www.youtube.com/watch?v=ioXKxulmwVQ&feature=youtu.be
#Feature selection class to eliminate multicollinearity
class MultiCollinearityEliminator():

    #Class Constructor
    def __init__(self, df, target, threshold):
        self.df = df
        self.target = target
        self.threshold = threshold

    #Method to create and return the feature correlation matrix dataframe
    def createCorrMatrix(self, include_target=False):
        #Checking whether we should include the target in the correlation matrix
        if (include_target == False):
            df_temp = self.df.drop([self.target], axis=1)
            #Setting method to Pearson to prevent issues in case the default method for df.corr() gets changed
            #Setting min_periods to 30 for the sample size to be statistically significant (normal) according to
            #central limit theorem
            corrMatrix = df_temp.corr(method='pearson', min_periods=30).abs()
        #Target is included for creating the series of feature to target correlation - Please refer the notes under the
        #print statement to understand why we create the series of feature to target correlation
        elif (include_target == True):
            corrMatrix = self.df.corr(method='pearson', min_periods=30).abs()
        return corrMatrix

    #Method to create and return the feature to target correlation matrix dataframe
    def createCorrMatrixWithTarget(self):
        #After obtaining the list of correlated features, this method will help to view which variables
        #(in the list of correlated features) are least correlated with the target
        #This way, out of the list of correlated features, we can ensure to eliminate the feature that is
        #least correlated with the target
        #This not only helps to sustain the predictive power of the model but also helps in reducing model complexity
        #Obtaining the correlation matrix of the dataframe (along with the target)
        corrMatrix = self.createCorrMatrix(include_target=True)
        #Creating the required dataframe, then dropping the target row
        #and sorting by the value of correlation with target (in ascending order)
        corrWithTarget = pd.DataFrame(corrMatrix.loc[:, self.target]).drop([self.target], axis=0).sort_values(by=self.target)
        print(corrWithTarget, '\n')
        return corrWithTarget

    #Method to create and return the list of correlated features
    def createCorrelatedFeaturesList(self):
        #Obtaining the correlation matrix of the dataframe (without the target)
        corrMatrix = self.createCorrMatrix(include_target=False)
        colCorr = []
        #Iterating through the columns of the correlation matrix dataframe
        for column in corrMatrix.columns:
            #Iterating through the values (row wise) of the correlation matrix dataframe
            for idx, row in corrMatrix.iterrows():
                if (row[column] > self.threshold) and (row[column] < 1):
                    #Adding the features that are not already in the list of correlated features
                    if (idx not in colCorr):
                        colCorr.append(idx)
                    if (column not in colCorr):
                        colCorr.append(column)
        print(colCorr, '\n')
        return colCorr

    #Method to eliminate the least important features from the list of correlated features
    def deleteFeatures(self, colCorr):
        #Obtaining the feature to target correlation matrix dataframe
        corrWithTarget = self.createCorrMatrixWithTarget()
        for idx, row in corrWithTarget.iterrows():
            print(idx, '\n')
            if (idx in colCorr):
                self.df = self.df.drop(idx, axis=1)
                break
        return self.df

    #Method to run and automatically eliminate multicollinearity
    def autoEliminateMulticollinearity(self):
        #Obtaining the list of correlated features
        colCorr = self.createCorrelatedFeaturesList()
        while colCorr != []:
            #Obtaining the dataframe after deleting the feature (from the list of correlated features)
            #that is least correlated with the target
            self.df = self.deleteFeatures(colCorr)
            #Obtaining the list of correlated features
            colCorr = self.createCorrelatedFeaturesList()
        return self.df
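A minimal usage sketch for the class above (the dataframe df and the target column name 'label' are placeholders for your own data):
# df: your features plus the target column, here called 'label'
mce = MultiCollinearityEliminator(df=df, target='label', threshold=0.8)
df_reduced = mce.autoEliminateMulticollinearity()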
You can use the following for a given data frame df:
corr_matrix = df.corr().abs()
high_corr_var=np.where(corr_matrix>0.8)
high_corr_var=[(corr_matrix.columns[x],corr_matrix.columns[y]) for x,y in zip(*high_corr_var) if x!=y and x<y]
You can test the code below:
# Load libraries
import pandas as pd
import numpy as np

# Create feature matrix with two highly correlated features
X = np.array([[1, 1, 1],
              [2, 2, 0],
              [3, 3, 1],
              [4, 4, 0],
              [5, 5, 1],
              [6, 6, 0],
              [7, 7, 1],
              [8, 7, 0],
              [9, 7, 1]])
# Convert feature matrix into DataFrame
df = pd.DataFrame(X)
# View the data frame
df
# Create correlation matrix
corr_matrix = df.corr().abs()
# Select upper triangle of correlation matrix
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
# Find index of feature columns with correlation greater than 0.95
to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]
# Drop features
df.drop(to_drop, axis=1)
I found the answer provided by TomDobbs quite useful; however, it doesn't work as intended. It has two problems:
it misses the last pair of variables in each of correlation matrix rows/columns.
it fails to remove one of each pair of collinear variables from the returned dataframe.
My revised version below corrects these issues:
def remove_collinear_features(x, threshold):
    '''
    Objective:
        Remove collinear features in a dataframe with a correlation coefficient
        greater than the threshold. Removing collinear features can help a model
        to generalize and improves the interpretability of the model.
    Inputs:
        x: features dataframe
        threshold: features with correlations greater than this value are removed
    Output:
        dataframe that contains only the non-highly-collinear features
    '''
    # Calculate the correlation matrix
    corr_matrix = x.corr()
    iters = range(len(corr_matrix.columns) - 1)
    drop_cols = []
    # Iterate through the correlation matrix and compare correlations
    for i in iters:
        for j in range(i+1):
            item = corr_matrix.iloc[j:(j+1), (i+1):(i+2)]
            col = item.columns
            row = item.index
            val = abs(item.values)
            # If correlation exceeds the threshold
            if val >= threshold:
                # Print the correlated features and the correlation value
                print(col.values[0], "|", row.values[0], "|", round(val[0][0], 2))
                drop_cols.append(col.values[0])
    # Drop one of each pair of correlated columns
    drops = set(drop_cols)
    x = x.drop(columns=drops)
    return x
Firstly, I'd suggest using something like PCA as a dimensionality reduction method, but if you have to roll your own then your question is insufficiently constrained. Where two columns are correlated, which one do you want to remove? What if column A is correlated with column B, while column B is correlated with column C, but not column A?
You can get a pairwise matrix of correlations by calling DataFrame.corr() (docs) which might help you with developing your algorithm, but eventually you need to convert that into a list of columns to keep.
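As a minimal sketch of that last step, one greedy policy among many (keeping the first column seen of each highly correlated pair; the helper name is mine):
import pandas as pd

def columns_to_keep(df: pd.DataFrame, threshold: float = 0.8) -> list:
    # Greedy pass: keep a column unless it correlates above the
    # threshold with a column that was already kept
    corr = df.corr().abs()
    keep = []
    for col in corr.columns:
        if all(corr.loc[col, kept] <= threshold for kept in keep):
            keep.append(col)
    return keep

reduced = df[columns_to_keep(df, threshold=0.8)]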
I took the liberty to modify TomDobbs' answer. The bug reported in the comments is fixed now. Also, the new function filters out negative correlations, too.
def corr_df(x, corr_val):
    '''
    Obj: Drops features that are strongly correlated to other features.
         This lowers model complexity, and aids in generalizing the model.
    Inputs:
        x: features dataframe
        corr_val: columns are dropped relative to the corr_val input (e.g. 0.8)
    Output: df that only includes uncorrelated features
    '''
    # Creates correlation matrix
    corr_matrix = x.corr()
    iters = range(len(corr_matrix.columns) - 1)
    drop_cols = []
    # Iterates through correlation matrix table to find correlated columns
    for i in iters:
        for j in range(i):
            item = corr_matrix.iloc[j:(j+1), (i+1):(i+2)]
            col = item.columns
            row = item.index
            val = item.values
            if abs(val) >= corr_val:
                # Prints the correlated feature set and the corr val
                print(col.values[0], "|", row.values[0], "|", round(val[0][0], 2))
                drop_cols.append(i)
    drops = sorted(set(drop_cols))[::-1]
    # Drops the correlated columns
    for i in drops:
        col = x.iloc[:, (i+1):(i+2)].columns.values
        x = x.drop(col, axis=1)
    return x
Plug your features dataframe in this function and just set your correlation threshold. It'll auto drop columns, but will also give you a diagnostic of the columns it drops if you want to do it manually.
def corr_df(x, corr_val):
    '''
    Obj: Drops features that are strongly correlated to other features.
         This lowers model complexity, and aids in generalizing the model.
    Inputs:
        x: features dataframe
        corr_val: columns are dropped relative to the corr_val input (e.g. 0.8)
    Output: df that only includes uncorrelated features
    '''
    # Creates correlation matrix
    corr_matrix = x.corr()
    iters = range(len(corr_matrix.columns) - 1)
    drop_cols = []
    # Iterates through correlation matrix table to find correlated columns
    for i in iters:
        for j in range(i):
            item = corr_matrix.iloc[j:(j+1), (i+1):(i+2)]
            col = item.columns
            row = item.index
            val = item.values
            if val >= corr_val:
                # Prints the correlated feature set and the corr val
                print(col.values[0], "|", row.values[0], "|", round(val[0][0], 2))
                drop_cols.append(i)
    drops = sorted(set(drop_cols))[::-1]
    # Drops the correlated columns
    for i in drops:
        col = x.iloc[:, (i+1):(i+2)].columns.values
        df = x.drop(col, axis=1)
    return df
First of all, thanks to TomDobbs and Synergix for their code. Below I am sharing my modified version with some additions:
Between two correlated variables, this function drops the one which has the least correlation with the target variable
Added some useful logs (set verbose to True for log printing)
def remove_collinear_features(df_model, target_var, threshold, verbose):
    '''
    Objective:
        Remove collinear features in a dataframe with a correlation coefficient
        greater than the threshold and which have the least correlation with the
        target (dependent) variable. Removing collinear features can help a model
        to generalize and improves the interpretability of the model.
    Inputs:
        df_model: features dataframe
        target_var: target (dependent) variable
        threshold: features with correlations greater than this value are removed
        verbose: set to "True" for log printing
    Output:
        dataframe that contains only the non-highly-collinear features
    '''
    # Calculate the correlation matrix
    corr_matrix = df_model.drop(target_var, axis=1).corr()
    iters = range(len(corr_matrix.columns) - 1)
    drop_cols = []
    dropped_feature = ""
    # Iterate through the correlation matrix and compare correlations
    for i in iters:
        for j in range(i+1):
            item = corr_matrix.iloc[j:(j+1), (i+1):(i+2)]
            col = item.columns
            row = item.index
            val = abs(item.values)
            # If correlation exceeds the threshold
            if val >= threshold:
                # Print the correlated features and the correlation value
                if verbose:
                    print(col.values[0], "|", row.values[0], "|", round(val[0][0], 2))
                col_value_corr = df_model[col.values[0]].corr(df_model[target_var])
                row_value_corr = df_model[row.values[0]].corr(df_model[target_var])
                if verbose:
                    print("{}: {}".format(col.values[0], np.round(col_value_corr, 3)))
                    print("{}: {}".format(row.values[0], np.round(row_value_corr, 3)))
                if col_value_corr < row_value_corr:
                    drop_cols.append(col.values[0])
                    dropped_feature = "dropped: " + col.values[0]
                else:
                    drop_cols.append(row.values[0])
                    dropped_feature = "dropped: " + row.values[0]
                if verbose:
                    print(dropped_feature)
                    print("-----------------------------------------------------------------------------")
    # Drop one of each pair of correlated columns
    drops = set(drop_cols)
    df_model = df_model.drop(columns=drops)
    print("dropped columns: ")
    print(list(drops))
    print("-----------------------------------------------------------------------------")
    print("used columns: ")
    print(df_model.columns.tolist())
    return df_model
I know that there are already a lot of answers on this, but one way I found very simple and short is the following:
# Get correlation matrix
corr = X.corr()
# Create a mask for values above 90%
# but also below 100%, since a variable is always fully correlated with itself
mask = (X.corr() > 0.9) & (X.corr() < 1.0)
high_corr = corr[mask]
# Create a new column mask using any() and ~
col_to_filter_out = ~high_corr[mask].any()
# Apply new mask
X_clean = X[high_corr.columns[col_to_filter_out]]
# Visualize cleaned dataset
X_clean
If you run out of memory due to pandas .corr() you may find the following solution useful:
import numpy as np
from numba import jit

@jit(nopython=True)
def corr_filter(X, threshold):
    n = X.shape[1]
    columns = np.ones((n,))
    for i in range(n-1):
        for j in range(i+1, n):
            if columns[j] == 1:
                correlation = np.abs(np.corrcoef(X[:, i], X[:, j])[0, 1])
                if correlation >= threshold:
                    columns[j] = 0
    return columns

columns = corr_filter(df.values, 0.7).astype(bool)
selected_columns = df.columns[columns]
A small revision to the solution posted by user3025698 that resolves an issue where the correlation between the first two columns is not captured, and adds some data type checking.
def filter_df_corr(inp_data, corr_val):
    '''
    Returns an array or dataframe (based on type(inp_data)) adjusted to drop
    columns with high correlation to one another. Takes second arg corr_val
    that defines the cutoff.
    ----------
    inp_data : np.array, pd.DataFrame
        Values to consider
    corr_val : float
        Value [0, 1] on which to base the correlation cutoff
    '''
    # Creates correlation matrix
    if isinstance(inp_data, np.ndarray):
        inp_data = pd.DataFrame(data=inp_data)
        array_flag = True
    else:
        array_flag = False
    corr_matrix = inp_data.corr()
    # Iterates through correlation matrix table to find correlated columns
    drop_cols = []
    n_cols = len(corr_matrix.columns)
    for i in range(n_cols):
        for k in range(i+1, n_cols):
            val = corr_matrix.iloc[k, i]
            col = corr_matrix.columns[i]
            row = corr_matrix.index[k]
            if abs(val) >= corr_val:
                # Prints the correlated feature set and the corr val
                print(col, "|", row, "|", round(val, 2))
                drop_cols.append(col)
    # Drops the correlated columns
    drop_cols = set(drop_cols)
    inp_data = inp_data.drop(columns=drop_cols)
    # Return same type as input
    if array_flag:
        return inp_data.values
    else:
        return inp_data
The question here refers to a HUGE dataset. However, all of the answers I see are dealing with dataframes. I present an answer for a scipy sparse matrix which runs in parallel. Rather than returning a giant correlation matrix, this returns a feature mask of fields to keep after checking all fields for both positive and negative Pearson correlations.
I also try to minimize calculations using the following strategy:
Process each column
Start at the current column + 1 and calculate correlations moving to the right.
For any abs(correlation) >= threshold, mark the current column for removal and calculate no further correlations.
Perform these steps for each column in the dataset except the last.
This might be sped up further by keeping a global list of columns marked for removal and skipping further correlation calculations for such columns, since columns will execute out of order. However, I do not know enough about race conditions in python to implement this tonight.
Returning a column mask will obviously allow the code to handle much larger datasets than returning the entire correlation matrix.
Check each column using this function:
from scipy.stats import pearsonr

def get_corr_row(idx_num, sp_mat, thresh):
    # Slice the column at idx_num
    cols = sp_mat.shape[1]
    x = sp_mat[:, idx_num].toarray().ravel()
    start = idx_num + 1
    # Now slice each column to the right of idx_num
    for i in range(start, cols):
        y = sp_mat[:, i].toarray().ravel()
        # Check the Pearson correlation
        corr, pVal = pearsonr(x, y)
        # Pearson ranges from -1 to 1;
        # check both positive and negative correlations >= thresh using abs(corr)
        if abs(corr) >= thresh:
            # Stop checking after finding the 1st correlation > thresh;
            # False marks the column at idx_num for removal
            return False
    # True keeps the column at idx_num in the mask
    return True
Run the column level correlation checks in parallel:
from joblib import Parallel, delayed
import multiprocessing

def Get_Corr_Mask(sp_mat, thresh, n_jobs=-1):
    # We must make sure the matrix is in csc format
    # before we start doing all these column slices!
    sp_mat = sp_mat.tocsc()
    cols = sp_mat.shape[1]
    if n_jobs == -1:
        # Process the work on all available CPU cores
        num_cores = multiprocessing.cpu_count()
    else:
        # Process the work on the specified number of CPU cores
        num_cores = n_jobs
    # Return a mask of all columns to keep by calling get_corr_row()
    # once for each column in the matrix
    return Parallel(n_jobs=num_cores, verbose=5)(delayed(get_corr_row)(i, sp_mat, thresh) for i in range(cols))
General Usage:
#Get the mask using your sparse matrix and threshold.
corr_mask = Get_Corr_Mask(X_t_fpr, 0.95)
# Remove features that are >= 95% correlated
X_t_fpr_corr = X_t_fpr[:,corr_mask]
If you want to return a breakdown of correlated columns, you can use this function to look at them, see what you are dropping, and adjust your threshold:
def corr_cols(df, thresh):
    # Create correlation matrix
    corr_matrix = df.corr().abs()
    # Select upper triangle of correlation matrix
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool_))
    dic = {'Feature_1': [], 'Feature_2': [], 'val': []}
    for col in upper.columns:
        corl = list(filter(lambda x: x >= thresh, upper[col]))
        if len(corl) > 0:
            inds = [round(x, 4) for x in corl]
            for ind in inds:
                col2 = upper[col].index[list(upper[col].apply(lambda x: round(x, 4))).index(ind)]
                dic['Feature_1'].append(col)
                dic['Feature_2'].append(col2)
                dic['val'].append(ind)
    return pd.DataFrame(dic).sort_values(by="val", ascending=False)
And then remove them from the dataframe:
corr = corr_cols(star, 0.5)
df.drop(columns=corr.iloc[:, 0].unique())
This is the approach I used on my job last month. Perhaps it is not the best or quickest way, but it works fine. Here, df is my original Pandas dataframe:
dropvars = []
threshold = 0.95
df_corr = df.corr().stack().reset_index().rename(columns={'level_0': 'Var 1', 'level_1': 'Var 2', 0: 'Corr'})
df_corr = df_corr[(df_corr['Corr'].abs() >= threshold) & (df_corr['Var 1'] != df_corr['Var 2'])]
while len(df_corr) > 0:
    var = df_corr['Var 1'].iloc[0]
    df_corr = df_corr[(df_corr['Var 1'] != var) & (df_corr['Var 2'] != var)]
    dropvars.append(var)
df.drop(columns=dropvars, inplace=True)
My idea is as follows: first, I create a dataframe containing columns Var 1, Var 2 and Corr, where I keep only those pairs of variables whose correlation is higher than or equal to my threshold (in absolute value). Then, I iteratively choose the first variable (Var 1 value) in this correlations dataframe, add it to the dropvars list, and remove all lines of the correlations dataframe where it appears, until my correlations dataframe is empty. In the end, I remove the columns in my dropvars list from my original dataframe.
I had a similar question today and came across this post. This is what I ended up with.
def uncorrelated_features(df, threshold=0.7):
    """
    Returns a subset of df columns with Pearson correlations
    below threshold.
    """
    corr = df.corr().abs()
    keep = []
    for i in range(len(corr.iloc[:, 0])):
        above = corr.iloc[:i, i]
        if len(keep) > 0:
            above = above[keep]
        if len(above[above < threshold]) == len(above):
            keep.append(corr.columns.values[i])
    return df[keep]
I wrote my own way, without any for loop, to delete high-covariance columns from a pandas dataframe:
# Get the correlation matrix of the data
coVar = df.corr()  # or df.corr().abs()
threshold = 0.5
"""
1. .where(coVar != 1.0) sets NaN where the column and the index are the same
2. .where(coVar >= threshold) sets NaN where the value is not greater than the threshold
3. .fillna(0) fills the NaNs with 0
4. .sum() collapses the frame to a Series; only correlations above the threshold contribute
5. > 0 converts the Series to Boolean
"""
coVarCols = coVar.where(coVar != 1.0).where(coVar >= threshold).fillna(0).sum() > 0
# Negate, because we want to delete the columns whose correlation is greater than the threshold
coVarCols = ~coVarCols
# Select the columns you want to keep
df[coVarCols[coVarCols].index]
I hope this helps; working with pandas' own functions, without any for loop, can improve your speed on big datasets.
correlatedColumns = []
corr = df.corr()
indices = corr.index
columns = corr.columns
posthreshold = 0.7
negthreshold = -0.7
for c in columns:
    for r in indices:
        if c != r and (corr[c][r] > posthreshold or corr[c][r] < negthreshold):
            correlatedColumns.append({"column": c, "row": r, "val": corr[c][r]})
print(correlatedColumns)
In my code I needed to remove columns with low correlation to the dependent variable, and I got this code:
# to_drop is assumed to already hold each column's correlation with SalePrice from an earlier step
to_drop = pd.DataFrame(to_drop).fillna(True)
to_drop = list(to_drop[to_drop['SalePrice'] < .4].index)
df_h1.drop(to_drop, axis=1)
df_h1 is my dataframe and SalePrice is the dependent variable; I think changing the threshold value may suit all other problems.
The snippet below drops the most correlated feature recursively.
def get_corr_feature(df):
    corr_matrix = df.corr().abs()
    # Select upper triangle of correlation matrix
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool_))
    upper['score'] = upper.max(axis=1)
    # Find the most correlated feature and return it for dropping
    column_name = upper.sort_values(by=['score'], ascending=False).index[0]
    max_score = upper.loc[column_name, 'score']
    return column_name, max_score

while True:
    column_name, max_score = get_corr_feature(df)
    # Only drop while the highest remaining correlation exceeds the threshold
    if max_score <= 0.5:
        break
    df.drop(column_name, axis=1, inplace=True)
I wrote a notebook that uses partial correlations
https://gist.github.com/thistleknot/ce1fc38ea9fcb1a8dafcfe6e0d8af475
the gist of it (pun intended)
for train_index, test_index in kfold.split(all_data):
    max_pvalue = 1
    subset = all_data.iloc[train_index].loc[:, ~all_data.columns.isin([exclude])]
    # Skip y and states
    set_ = subset.loc[:, ~subset.columns.isin([target])].columns.tolist()
    n = len(subset)
    while max_pvalue >= .05:
        dist = scipy.stats.beta(n/2 - 1, n/2 - 1, loc=-1, scale=2)
        p_values = pd.DataFrame(2 * dist.cdf(-abs(subset.pcorr()[target]))).T
        p_values.columns = list(subset.columns)
        max_pname = p_values.idxmax(axis=1)[0]
        max_pvalue = p_values[max_pname].values[0]
        if max_pvalue > .05:
            set_.remove(max_pname)
            temp = [target]
            temp.extend(set_)
            subset = subset[temp]
    winners = p_values.loc[:, ~p_values.columns.isin([target])].columns.tolist()
    sig_table = (sig_table + np.where(all_data.columns.isin(winners), 1, 0)).copy()
    signs_table[all_data.columns.get_indexer(winners)] += np.where(subset.pcorr()[target][winners] < 0, -1, 1)

significance = pd.DataFrame(sig_table).T
significance.columns = list(all_data.columns)
display(significance)

sign = pd.DataFrame(signs_table).T
sign.columns = list(all_data.columns)
display(sign)

purity = abs((sign/num_folds) * (sign/significance)).T.replace([np.inf, -np.inf, np.NaN], 0)
display(purity.T)
I believe this has to be done in an iterative way:
uncorrelated_features = features.copy()
# Loop until there's nothing to drop
while True:
    # Calculate the correlation matrix for the remaining list of features
    cor = uncorrelated_features.corr().abs()
    # Generate a square matrix with all 1s except for the main diagonal
    zero_main = np.triu(np.ones(cor.shape), k=1) + np.tril(np.ones(cor.shape), k=-1)
    # Use the zero_main matrix to filter out the main diagonal of the correlation matrix
    except_main = cor.where(zero_main.astype(bool))
    # Calculate some metrics for each column, including the max correlation,
    # mean correlation and the name of the column
    metrics = [(except_main[column].max(), except_main[column].mean(), column) for column in except_main.columns]
    # Sort the list to find the most suitable candidate to drop at index 0
    metrics.sort(key=lambda x: (x[0], x[1]), reverse=True)
    # Check and see if there's anything to drop from the list of features
    if metrics[0][0] > 0.5:
        uncorrelated_features.drop(metrics[0][2], axis=1, inplace=True)
    else:
        break
It's worth mentioning that you might want to customize the way I sorted the metrics list and/or how I detected whether I want to drop the column or not.
I managed to do it this way; kindly have a try. However, what I did only serves display purposes, as I wanted to capture the result in my report. If you want to drop features, you can choose any column from the dataframe below, since you only need to drop one of each pair.
row_index = 0
corrDict = {}
row_name = []
col_name = []
corr_val = []
while row_index < len(df.corr().index.tolist()):
    for index, x in enumerate(df.corr().iloc[row_index, :]):
        if abs(x) >= 0.8 and index != row_index:
            if abs(x) in corr_val:
                if (df.corr().index.tolist()[row_index] in col_name) and (df.corr().columns.tolist()[index] in row_name):
                    continue
            row_name.append(df.corr().index.tolist()[row_index])
            col_name.append(df.corr().columns.tolist()[index])
            corr_val.append(x)
    row_index += 1

corrDict = {"First Feature (FF)": row_name, "Second Feature (SF)": col_name, "Correlation (FF x SF)": corr_val}
corr_df2 = pd.DataFrame(corrDict)
corr_df2
My output is a dataframe listing each highly correlated pair.
You can choose to drop either the First Feature (FF) or the Second Feature (SF) of each pair.
To drop highly correlated features from your original dataset:
your_df.drop(corr_df2['First Feature (FF)'].tolist(), axis=1, inplace=True)
There are three challenges to this problem. First, if features x and y are correlated, you don't want to use an algorithm that drops both. Second, if x and y are pairwise correlated and features y and z are also pairwise correlated, you want the algorithm to only remove y. In this sense, you want it to remove the minimum number of features so that no remaining features have correlations above your threshold. Third, from an efficiency standpoint, you do not want to have to compute the correlation matrix more than once.
Here's an option:
from collections import Counter

def corr_cleaner(df, corr_cutoff):
    '''
    df: pandas dataframe with column headers.
    corr_cutoff: float between 0 and 1.
    '''
    abs_corr_matrix = df.corr().abs()
    filtered_cols = []
    while True:
        offenders = []
        for i in range(len(abs_corr_matrix)):
            for j in range(len(abs_corr_matrix)):
                if i != j:
                    if abs_corr_matrix.iloc[i, j] > corr_cutoff:
                        offenders.append(df.columns[i])
        if len(offenders) > 0:  # if at least one high correlation remains
            c = Counter(offenders)
            worst_offender = c.most_common(1)[0][0]  # var name of worst offender
            del df[worst_offender]
            filtered_cols.append(worst_offender)
            abs_corr_matrix.drop(worst_offender, axis=0, inplace=True)  # drop from rows
            abs_corr_matrix.drop(worst_offender, axis=1, inplace=True)  # drop from columns
        else:  # if no high correlations remain, break
            break
    return df, filtered_cols
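A quick usage sketch (df is a placeholder for your features dataframe; copy it first, since the function mutates its argument):
cleaned_df, dropped = corr_cleaner(df.copy(), corr_cutoff=0.9)
print(dropped)  # removed columns, worst offenders first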
You could use the following function; you'll also get the elements sorted:
def correlation(dataset, threshold=0.3):
    c = dataset.corr().abs()
    s = c.unstack()
    so = s.sort_values(kind="quicksort")
    results = []
    for index, row in so.items():
        if index[0] != index[1] and row > threshold:
            results.append({index: row})
    return results
You could invoke the function sending a pandas dataset that you want to find the correlation and the threshold as follows:
highly_correlated_features = correlation(dataset=data_train_val_without_label, threshold=0.35)
highly_correlated_features
It would result in something like this for a dataset with the following columns and a threshold of 0.35:
Input columns:
0 HighBP 202944 non-null float64
1 HighChol 202944 non-null float64
2 CholCheck 202944 non-null float64
3 BMI 202944 non-null float64
4 Smoker 202944 non-null float64
5 Stroke 202944 non-null float64
6 HeartDiseaseorAttack 202944 non-null float64
7 PhysActivity 202944 non-null float64
8 Fruits 202944 non-null float64
9 Veggies 202944 non-null float64
10 HvyAlcoholConsump 202944 non-null float64
11 AnyHealthcare 202944 non-null float64
12 NoDocbcCost 202944 non-null float64
13 GenHlth 202944 non-null float64
14 MentHlth 202944 non-null float64
15 PhysHlth 202944 non-null float64
16 DiffWalk 202944 non-null float64
17 Sex 202944 non-null float64
18 Age 202944 non-null float64
19 Education 202944 non-null float64
20 Income 202944 non-null float64
Output:
[{('Income', 'Education'): 0.38083797089605675},
{('Education', 'Income'): 0.38083797089605675},
{('DiffWalk', 'PhysHlth'): 0.38145172573435343},
{('PhysHlth', 'DiffWalk'): 0.38145172573435343},
{('DiffWalk', 'GenHlth'): 0.385707943062701},
{('GenHlth', 'DiffWalk'): 0.385707943062701},
{('PhysHlth', 'GenHlth'): 0.3907082729122655},
{('GenHlth', 'PhysHlth'): 0.3907082729122655}]
You can use statsmodels' variance_inflation_factor to detect multicollinearity in the dataframe.
from statsmodels.stats.outliers_influence import variance_inflation_factor
import pandas as pd

def vif(X):
    vif = pd.DataFrame()
    vif['Variables'] = X.columns
    vif["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
    return vif
Where X is the DataFrame. The VIF for columns that involve multicollinearity will be more than 10. For a column that can be perfectly reproduced by a linear combination of the other available columns, its VIF value will be infinity. So remove columns one by one, until all infinite and high VIF values are gone.
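A minimal sketch of that removal loop, using the vif() helper above (the cutoff of 10 follows the rule of thumb just stated; drop the worst column, recompute, repeat):
def drop_high_vif(X, max_vif=10.0):
    X = X.copy()
    while True:
        scores = vif(X)
        worst = scores.loc[scores['VIF'].idxmax()]
        if worst['VIF'] <= max_vif:
            break
        # Drop the single worst column, then recompute all VIFs
        X = X.drop(columns=[worst['Variables']])
    return X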
You can use the following code:
l = []
corr_matrix = df.corr().abs()
for ci in corr_matrix.columns:
    for cj in corr_matrix.columns:
        if corr_matrix[ci][cj] > 0.8 and ci != cj:
            l.append(ci)
l = np.array(l)
to_drop = np.unique(l)
df.drop(to_drop, axis=1, inplace=True)
