Confused about random_state in sklearn - python

So, basically, I'm using an RF (random forest) for descriptive modelling as follows:
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import class_weight

class_weights = class_weight.compute_class_weight('balanced', np.unique(y), y)
class_weights = dict(enumerate(class_weights))
class_weights
# {0: 0.5561096747856852, 1: 4.955559597429368}

clf = RandomForestClassifier(class_weight=class_weights, random_state=0)
cross_val_score(clf, X, y, cv=10, scoring='f1').mean()
And I'm plotting the variable importances as:
import matplotlib.pyplot as plt

def plot_importances(clf, features, n):
    importances = clf.feature_importances_
    indices = np.argsort(importances)[::-1]
    if n:
        indices = indices[:n]
    plt.figure(figsize=(10, 5))
    plt.title("Feature importances")
    plt.bar(range(len(indices)), importances[indices], align='center')
    plt.xticks(range(len(indices)), features[indices], rotation=90)
    plt.xlim([-1, len(indices)])
    plt.show()
    return features[indices]
imp = plot_importances(clf, X.columns, 30)
I was expecting the variable importances to be the same across multiple runs. However, they change whenever I re-run the notebook.
I don't understand why that is. Is it somehow related to the cross_val_score method?

I cannot reproduce the problem. For me the variable importances do remain the same across multiple runs when I produce some data using:
from sklearn.datasets import make_classification
import pandas as pd

X, y = make_classification(n_samples=1000,
                           n_features=10,
                           n_informative=3,
                           n_redundant=0,
                           n_repeated=0,
                           n_classes=2,
                           random_state=0,
                           shuffle=False)
X = pd.DataFrame(X)
Also, changing the data to have an uneven class balance by selecting only the first 750 X/y data points does not lead to differences in importances.
What data are you using?
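For reference, here is a minimal, self-contained sketch (using the synthetic data above, not the asker's real data) showing that the importances come out identical across repeated fits once random_state is fixed on the classifier:
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

# Synthetic data, as in the answer above
X, y = make_classification(n_samples=1000, n_features=10, n_informative=3,
                           n_classes=2, random_state=0, shuffle=False)
X = pd.DataFrame(X)

# Fit the same configuration twice; with a fixed random_state the bootstrap
# samples and feature sub-sampling are identical, so the importances match.
imp_runs = []
for _ in range(2):
    clf = RandomForestClassifier(random_state=0)
    clf.fit(X, y)
    imp_runs.append(clf.feature_importances_)

print(np.allclose(imp_runs[0], imp_runs[1]))  # expected: True
If the importances still differ between notebook runs, the usual suspects are the data itself changing between runs or an unseeded step earlier in the pipeline.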

Related

Scikit-learn: My linear regression is not a straight line, it is messy

I'm trying to simply plot a regression line; however, I get messy lines. Is it because I fitted the model with two features, so the only appropriate visualization would be a 3D plane?
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_boston
from sklearn.linear_model import LinearRegression
# prepare data
boston = load_boston()
X = pd.DataFrame(boston.data, columns=boston.feature_names)[['AGE','RM']]
y = boston.target
# split dataset into training and test data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=33)
# apply linear regression on dataset
lm = LinearRegression()
lm.fit(X_train, y_train)
pred_train = lm.predict(X_train)
pred_test = lm.predict(X_test)
#plot relationship between RM and price
plt.scatter(X_train['RM'],
            y_train,
            c='g',
            s=40,
            alpha=0.5)
plt.plot(X_train['RM'], pred_train, color='r')
plt.title('Relationship between RM and Price')
plt.ylabel('Price')
plt.xlabel('RM')
You are right. You are training on multiple features, i.e. AGE and RM, but you are plotting a 2D plot with only one feature, i.e. RM. Try a 3D plot instead. In general, linear regression with two features results in a plane; it is still a linear regression, which is why we use the term "hyperplane". It reduces to a line for a single feature, a plane for two features, and so on.
Here is the output in 3D:
from mpl_toolkits.mplot3d import Axes3D  # enables the 3D projection
plt3d = plt.figure().gca(projection='3d')
plt3d.view_init(azim=135)
plt3d.plot_trisurf(X_train['RM'].values, X_train['AGE'].values, pred_train, alpha=0.7, antialiased=True)
plt.show()
The problem is that when you plot, you have to sort the values first:
plt.plot(np.sort(X_train['RM']), np.sort(pred_train), color='r')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_boston
from sklearn.linear_model import LinearRegression
# prepare data
boston = load_boston()
X = pd.DataFrame(boston.data, columns=boston.feature_names)[['AGE','RM']]
y = boston.target
# split dataset into training and test data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=33)
# apply linear regression on dataset
lm = LinearRegression()
lm.fit(X_train, y_train)
pred_train = lm.predict(X_train)
pred_test = lm.predict(X_test)
#plot relationship between RM and price
plt.scatter(X_train['RM'],
            y_train,
            c='g',
            s=40,
            alpha=0.5)
plt.plot(np.sort(X_train['RM']), np.sort(pred_train), color='r')
plt.title('Relationship between RM and Price')
plt.ylabel('Price')
plt.xlabel('RM')
plt.show()
The result:
[output plot]
A 3D plot would probably make it easier to visualize the relationship between the covariates RM and AGE.
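For completeness, a minimal sketch of such a 3D view, assuming the question's X_train, y_train, and fitted lm are already defined: the fitted model is the plane price ≈ intercept + coef_AGE * AGE + coef_RM * RM, which can be evaluated on a grid and drawn as a surface.
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401, enables 3D axes

# Evaluate the fitted plane on a grid spanning the training data.
age_grid, rm_grid = np.meshgrid(
    np.linspace(X_train['AGE'].min(), X_train['AGE'].max(), 20),
    np.linspace(X_train['RM'].min(), X_train['RM'].max(), 20))
price_grid = (lm.intercept_
              + lm.coef_[0] * age_grid   # coefficient for AGE (first column of X)
              + lm.coef_[1] * rm_grid)   # coefficient for RM (second column of X)

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(X_train['AGE'], X_train['RM'], y_train, c='g', alpha=0.5)
ax.plot_surface(age_grid, rm_grid, price_grid, color='r', alpha=0.4)
ax.set_xlabel('AGE')
ax.set_ylabel('RM')
ax.set_zlabel('Price')
plt.show()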

Plot feature importance in RandomForestRegressor sklearn

I am new to data science. I am trying to find the feature importance ranking for my dataset. I have already applied a random forest and got the output.
Here is my code:
# importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
# importing dataset
dataset=pd.read_csv('Churn_Modelling.csv')
X = dataset.iloc[:,3:12].values
Y = dataset.iloc[:,13].values
# encoding categorical data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
#country
labelencoder_X_1= LabelEncoder()
X[:,1]=labelencoder_X_1.fit_transform(X[:,1])
#gender
labelencoder_X_2= LabelEncoder()
X[:,2]=labelencoder_X_2.fit_transform(X[:,2])
onehotencoder = OneHotEncoder(categorical_features=[0])
X = onehotencoder.fit_transform(X).toarray()
# splitting dataset into train set and test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.20)
from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor(n_estimators=20, random_state=0)
regressor.fit(X_train, y_train)
In the importance part, I almost copied the example shown in:
https://scikit-learn.org/stable/auto_examples/ensemble/plot_forest_importances.html
Here is the code:
#feature importance
from sklearn.ensemble import ExtraTreesClassifier
importances = regressor.feature_importances_
std = np.std([tree.feature_importances_ for tree in regressor.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]
print("Feature ranking:")
for f in range(X.shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))
# Plot the feature importances of the forest
plt.figure()
plt.title("Feature importances")
plt.bar(range(X.shape[1]), importances[indices],
        color="r", yerr=std[indices], align="center")
plt.xticks(range(X.shape[1]), indices)
plt.xlim([-1, X.shape[1]])
plt.show()
I am expecting the output shown in the documentation. Can anyone help me please? Thanks in advance.
My dataset is here:
You have a lot of features, and they cannot all be seen in a single plot.
Just plot some of them.
Here I plot the 20 most important:
# Plot the feature importances of the forest
plt.figure(figsize=(18,9))
plt.title("Feature importances")
n=20
_ = plt.bar(range(n), importances[indices][:n], color="r", yerr=std[indices][:n])
plt.xticks(range(n), indices)
plt.xlim([-1, n])
plt.show()
My code in case you need it: https://filebin.net/be4h27swglqf3ci3
Output:
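If you also want readable labels on the x-axis instead of bare column indices, a small sketch along these lines could work; note that feature_names is a hypothetical list you would have to build yourself, since after OneHotEncoder the columns of X no longer match the original CSV headers one-to-one:
import matplotlib.pyplot as plt

n = 20
top = indices[:n]  # indices of the n most important features

# feature_names is hypothetical: a list of labels in the same order as the
# columns of X after the encoding step.
labels = [feature_names[i] for i in top]

plt.figure(figsize=(18, 9))
plt.title("Feature importances")
plt.bar(range(n), importances[top], color="r", yerr=std[top])
plt.xticks(range(n), labels, rotation=90)
plt.xlim([-1, n])
plt.tight_layout()
plt.show()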

How to get the prediction probabilities using cross validation in scikit-learn

I am using RandomForestClassifier as follows, with cross-validation, for a binary classification problem (class labels are 0 and 1).
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score
clf=RandomForestClassifier(random_state = 42, class_weight="balanced")
k_fold = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
accuracy = cross_val_score(clf, X, y, cv=k_fold, scoring = 'accuracy')
print("Accuracy: " + str(round(100*accuracy.mean(), 2)) + "%")
f1 = cross_val_score(clf, X, y, cv=k_fold, scoring = 'f1_weighted')
print("F Measure: " + str(round(100*f1.mean(), 2)) + "%")
Now I want to order my data by the predicted probability of class 1, using the cross-validation results. For that I tried the following two ways.
pred = clf.predict_proba(X)[:,1]
print(pred)
probs = clf.predict_proba(X)
best_n = np.argsort(probs, axis=1)[:,-6:]
I get the following error in both cases:
NotFittedError: This RandomForestClassifier instance is not fitted yet. Call 'fit' with appropriate arguments before using this method.
I am just wondering where I am going wrong.
I am happy to provide more details if needed.
In case you want to use the CV models for unseen data points, use the following approach.
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate
iris = datasets.load_iris()
X = iris.data
y = iris.target
clf = RandomForestClassifier(n_estimators=10, random_state = 42, class_weight="balanced")
cv_results = cross_validate(clf, X, y, cv=3, return_estimator=True)
clf_fold_0 = cv_results['estimator'][0]
clf_fold_0.predict_proba([iris.data[133]])
# array([[0. , 0.5, 0.5]])
Have a look at the documentation; it specifies that the probability is calculated from the mean predicted class probabilities of the trees.
In your case, you first need to call the fit() method to grow the trees in the model: cross_val_score fits clones of the estimator internally and leaves the clf you passed in untouched, which is why it is still unfitted. Once you fit the model on the training data, you can call the predict_proba() method.
This is also what the error says.
# Fit model
model = RandomForestClassifier(...)
model.fit(X_train, Y_train)
# Probability
model.predict_proba(X)[:,1]
I solved my problem using the following code:
from sklearn.model_selection import cross_val_predict

proba = cross_val_predict(clf, X, y, cv=k_fold, method='predict_proba')
print(proba[:, 1])
print(np.argsort(proba[:, 1]))
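A short note, not from the original post: cross_val_predict returns out-of-fold predictions, i.e. each sample's probability comes from the fold in which that sample was held out, so the rows line up one-to-one with X and y. To actually order the data by the probability of class 1 (highest first), something like the following should work, assuming X is a pandas DataFrame:
import numpy as np
from sklearn.model_selection import cross_val_predict

# Out-of-fold class probabilities; row i corresponds to sample i of X / y.
proba = cross_val_predict(clf, X, y, cv=k_fold, method='predict_proba')

# Indices of the samples sorted by class-1 probability, highest first.
order = np.argsort(proba[:, 1])[::-1]
X_ordered = X.iloc[order]        # use X[order] instead if X is a NumPy array
y_ordered = np.asarray(y)[order]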

Why do I get "X_train_std is not defined"?

May I know why I get the error message:
NameError: name 'X_train_std' is not defined
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(C=1000.0, random_state=0)
lr.fit(X_train_std, y_train)
plot_decision_regions(X_combined_std,
                      y_combined, classifier=lr,
                      test_idx=range(105, 150))
plt.xlabel('petal length [standardized]')
plt.ylabel('petal width [standardized]')
plt.legend(loc='upper left')
plt.tight_layout()
plt.show()
lr.predict_proba(X_test_std[0,:])
weights, params = [], []
for c in np.arange(-5, 5):
    lr = LogisticRegression(C=10**c, random_state=0)
    lr.fit(X_train_std, y_train)
    weights.append(lr.coef_[1])
    params.append(10**c)
weights = np.array(weights)
plt.plot(params, weights[:, 0],
         label='petal length')
plt.plot(params, weights[:, 1], linestyle='--',
         label='petal width')
plt.ylabel('weight coefficient')
plt.xlabel('C')
plt.legend(loc='upper left')
plt.xscale('log')
plt.show()
Please see the links:
https://www.freecodecamp.org/forum/t/how-to-modify-my-python-logistic-regression/265795
https://bytes.com/topic/python/answers/972352-why-i-get-x_train_std-not-defined#post3821849
https://www.researchgate.net/post/Why_I_get_the_X_train_std_is_not_defined
Well, X_train_std is simply not defined/declared anywhere. You need to declare the variable and give it a value before using it, e.g.:
X_train_std = 3
You didn't copy enough of the sample code; somewhere above this there is likely a call to train_test_split.
Basically, to do what you want, you need a set of X variables and your Y variable (what will be predicted). You normally split them into a training set and a test set, and in addition many algorithms work better on standardized data (zero mean, unit standard deviation), which is probably what the _std suffix in your variable names means.
The code that comes before your snippet probably looks something like:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
my_df = pd.DataFrame(...)  # this is your data for the test
X = my_df[[X_variable_column_names_here]]
Y = my_df[Y_variable_column_name]
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.3)
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.transform(X_test)
Edit: It looks from the axis labels on your plot that you're trying to do logistic regression against the Iris dataset. The fully worked example is here:
https://scikit-learn.org/stable/auto_examples/linear_model/plot_iris_logistic.html
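For what it's worth, here is a self-contained sketch of the kind of preamble that is probably missing in this case, assuming the Iris dataset and the two petal features the axis labels suggest (the exact split and scaling in the original source may differ):
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Load Iris and keep petal length / petal width, as the axis labels suggest.
iris = datasets.load_iris()
X = iris.data[:, [2, 3]]
y = iris.target

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0, stratify=y)

# Standardize: fit the scaler on the training data only.
sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)

# X_combined_std / y_combined are what the plotting code expects.
X_combined_std = np.vstack((X_train_std, X_test_std))
y_combined = np.hstack((y_train, y_test))

lr = LogisticRegression(C=1000.0, random_state=0)
lr.fit(X_train_std, y_train)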

What is the Python code to show the feature importance in SVM?

How can I show the important features that contribute to the SVM model, along with the feature names?
My code is shown below.
First I imported the modules:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
Then I divided my data into features and target:
y = df_new[['numeric values']]
X = df_new.drop('numeric values', axis=1).values
Then I set up the pipeline:
steps = [('scalar', StandardScaler()),
         ('SVM', SVC(kernel='linear'))]
pipeline = Pipeline(steps)
Then I specified the hyperparameter space:
parameters = {'SVM__C': [1, 10, 100],
              'SVM__gamma': [0.1, 0.01]}
Then I created train and test sets:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2, random_state=21)
Instantiate the GridSearchCV object: cv
cv = GridSearchCV(pipeline,param_grid = parameters,cv=5)
Fit to the training set
cv.fit(X_train,y_train.values.ravel())
Predict the labels of the test set: y_pred
y_pred = cv.predict(X_test)
feature_importances = cv.best_estimator_.feature_importances_
The error message I get is:
'Pipeline' object has no attribute 'feature_importances_'
What I understood is: suppose you are building a model with 100 features and you want to know which features are more important and which are less so. Is that the case?
Just try a univariate feature selection method. It's a very basic method, and you can play with it before moving on to more advanced methods for your data. Sample code is provided by scikit-learn itself; you can modify it as per your requirements.
print(__doc__)
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets, svm
from sklearn.feature_selection import SelectPercentile, f_classif
###############################################################################
# import some data to play with
# The iris dataset
iris = datasets.load_iris()
# Some noisy data not correlated
E = np.random.uniform(0, 0.1, size=(len(iris.data), 20))
# Add the noisy data to the informative features
X = np.hstack((iris.data, E))
y = iris.target
###############################################################################
plt.figure(1)
plt.clf()
X_indices = np.arange(X.shape[-1])
###############################################################################
# Univariate feature selection with F-test for feature scoring
# We use the default selection function: the 10% most significant features
selector = SelectPercentile(f_classif, percentile=10)
selector.fit(X, y)
scores = -np.log10(selector.pvalues_)
scores /= scores.max()
plt.bar(X_indices - .45, scores, width=.2,
        label=r'Univariate score ($-Log(p_{value})$)', color='g')
###############################################################################
# Compare to the weights of an SVM
clf = svm.SVC(kernel='linear')
clf.fit(X, y)
svm_weights = (clf.coef_ ** 2).sum(axis=0)
svm_weights /= svm_weights.max()
plt.bar(X_indices - .25, svm_weights, width=.2, label='SVM weight', color='r')
clf_selected = svm.SVC(kernel='linear')
clf_selected.fit(selector.transform(X), y)
svm_weights_selected = (clf_selected.coef_ ** 2).sum(axis=0)
svm_weights_selected /= svm_weights_selected.max()
plt.bar(X_indices[selector.get_support()] - .05, svm_weights_selected,
        width=.2, label='SVM weights after selection', color='b')
plt.title("Comparing feature selection")
plt.xlabel('Feature number')
plt.yticks(())
plt.axis('tight')
plt.legend(loc='upper right')
plt.show()
Code ref.:
http://scikit-learn.org/0.15/auto_examples/plot_feature_selection.html
Note:
For each feature, this example plots the p-value from the univariate feature selection and the corresponding weight of an SVM; it keeps those features which show larger SVM weights.
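Relating this back to the pipeline in the question: a Pipeline has no feature_importances_, but since the question uses SVC(kernel='linear'), one possible sketch (assuming the fitted cv grid search, the 'SVM' step name from the pipeline, and that the remaining columns of df_new are the features, in order) is to read the linear SVM's coefficients:
import numpy as np

# The fitted pipeline found by the grid search (refit=True by default).
best_pipeline = cv.best_estimator_

# For a linear kernel, SVC exposes one weight per feature in coef_
# (one row per class pair for multi-class problems).
svm_step = best_pipeline.named_steps['SVM']
weights = np.abs(svm_step.coef_).sum(axis=0)

# feature_names is an assumption: the columns of df_new after dropping the
# target, in the same order as the columns of X.
feature_names = df_new.drop('numeric values', axis=1).columns
for name, w in sorted(zip(feature_names, weights), key=lambda t: -t[1]):
    print(f"{name}: {w:.4f}")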
