f_importances function in sklearn - python

I found this question here, which seems to address my problem (Determining the most contributing features for SVM classifier in sklearn).
However, as my understanding of the Python language is limited, I need some help.
I have a dependent variable which is 'Group' that has two levels 'Group1' and 'Group2'.
This is the code I found, adapted to my data:
import pandas as pd
df = pd.read_csv('C:/Users/myPC/OneDrive/Desktop/analysis/dataframe6.csv')
X = df.drop('Group', axis=1)
y = df['Group']
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20)
from sklearn.svm import SVC
svclassifier = SVC(kernel='linear')
svclassifier.fit(X_train, y_train)
y_pred = svclassifier.predict(X_test)
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
from matplotlib import pyplot as plt
from sklearn import svm
def f_importances(coef, names):
    imp = coef
    imp, names = zip(*sorted(zip(imp, names)))
    plt.barh(range(len(names)), imp, align='center')
    plt.yticks(range(len(names)), names)
    plt.show()
features_names = ['input1', 'input2']
svclassifier = SVC(kernel='linear')
svclassifier.fit(X_train, y_train)
f_importances(svclassifier.coef_, features_names)
It produces just a blank plot.
I think there is something I should change in features_names = ['input1', 'input2'], but I am not sure what.

The code you used to plot expects a one-dimensional array. According to the documentation, the attribute coef_ will be:
coef_ ndarray of shape (n_classes * (n_classes - 1) / 2, n_features)
Weights assigned to the features when kernel="linear".
Using an example:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
np.random.seed(123)
df = pd.DataFrame(np.random.uniform(0,1,(400,3)),columns=['input1','input2','input3'])
df['Group'] = np.random.choice(['Group1','Group2'],400)
X = df.drop('Group', axis=1)
y = df['Group']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20)
svclassifier = SVC(kernel='linear')
svclassifier.fit(X_train, y_train)
We check the shape of the array:
print(svclassifier.coef_.shape)
(1, 3)
Because you have only 2 classes, there is only 1 row. We can do:
from matplotlib import pyplot as plt
from sklearn import svm
def f_importances(coef, names):
    imp = coef
    imp, names = zip(*sorted(zip(imp, names)))
    plt.barh(range(len(names)), imp, align='center')
    plt.yticks(range(len(names)), names)
    plt.show()
features_names = ['input1', 'input2','input3']
svclassifier = SVC(kernel='linear')
svclassifier.fit(X_train, y_train)
f_importances(svclassifier.coef_[0], features_names)
This is the plot I got:
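As a side note, linear-SVM coefficients can be negative, so you may prefer to rank features by magnitude. A small variation of the function above (my addition, not part of the original answer) sorts on the absolute values while still plotting the signed coefficients:

import numpy as np
from matplotlib import pyplot as plt

def f_importances_abs(coef, names):
    # Sort by |coef| so the most influential features end up at the top,
    # but plot the signed values so the direction of each effect stays visible.
    order = np.argsort(np.abs(coef))
    plt.barh(range(len(order)), coef[order], align='center')
    plt.yticks(range(len(order)), [names[i] for i in order])
    plt.show()

f_importances_abs(svclassifier.coef_[0], features_names)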

Related

How to visualize cluster boundaries

I generated several datasets and, using classifiers, predicted the distribution of clusters. I need to draw boundaries between the clusters on the chart, either as lines or as filled areas; it does not matter which. Please let me know if there is a way to do this.
My code:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import make_moons, make_circles
from sklearn.model_selection import train_test_split
n_sample = 2000
def make_square(n_sample):
    # Points in the unit square, labelled 1 in the lower-left and
    # upper-right quadrants, 0 otherwise.
    data = [np.random.sample((n_sample, 2)), []]
    for i in range(n_sample):
        if data[0][i][0] > 0.5 and data[0][i][1] > 0.5 or data[0][i][0] < 0.5 and data[0][i][1] < 0.5:
            data[1].append(1)
        else:
            data[1].append(0)
    return data
datasets = [
    make_circles(n_samples=n_sample, noise=0.09, factor=0.5),
    make_square(n_sample),
    make_moons(n_samples=n_sample, noise=0.12),
]
ks = []
for data in datasets:
    X, y = data[0], data[1]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=33)
    classifier = KNeighborsClassifier(n_neighbors=1)
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    acc = classifier.score(X_test, y_test)
    accs = []
    for i in range(1, 8):
        knn = KNeighborsClassifier(n_neighbors=i)
        knn.fit(X_train, y_train)
        pred_i = knn.predict(X_test)
        acc0 = knn.score(X_test, y_test)
        accs.append(acc0)
    plt.figure(figsize=(12, 6))
    plt.plot(range(1, 8), accs, color='red', linestyle='dashed', marker='o',
             markerfacecolor='blue', markersize=10)
    plt.title('accs Score K Value')
    plt.xlabel('K Value')
    plt.ylabel('accs Score')
    print("Max Score:", max(accs), "k=", accs.index(max(accs)) + 1)
    ks.append(accs.index(max(accs)) + 1)
for i in range(3):
    data = datasets[i]
    k = ks[i]
    X, y = data[0], data[1]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=33)
    classifier = KNeighborsClassifier(n_neighbors=k)
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    plt.figure(figsize=(9, 9))
    plt.title("Test")
    plt.scatter(X_test[:, 0], X_test[:, 1], c=y_test)
    plt.figure(figsize=(9, 9))
    plt.title("Predict")
    plt.scatter(X_test[:, 0], X_test[:, 1], c=y_pred)
Example output: [scatter plots of the test and predicted labels omitted]
scikit-learn 1.1 introduced the DecisionBoundaryDisplay to assist with this sort of task.
Following the use of make_moons and the KNeighborsClassifier in the question, we can fit the classifier on the dataset, invoke the DecisionBoundaryDisplay.from_estimator() method, then scatter the X data on the returned axis:
import matplotlib.pyplot as plt
from sklearn.datasets import make_moons
from sklearn.neighbors import KNeighborsClassifier
from sklearn.inspection import DecisionBoundaryDisplay
X, y = make_moons(noise=0.2)
clf = KNeighborsClassifier().fit(X, y)
disp = DecisionBoundaryDisplay.from_estimator(clf, X, response_method="predict", alpha=0.3)
disp.ax_.scatter(X[:, 0], X[:, 1], c=y)
plt.show()
Resulting in something like this:
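On scikit-learn versions older than 1.1, the same picture can be produced manually by evaluating the classifier on a mesh grid and drawing filled contours (a sketch of the classic approach, not part of the original answer):

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_moons
from sklearn.neighbors import KNeighborsClassifier

X, y = make_moons(noise=0.2)
clf = KNeighborsClassifier().fit(X, y)

# Evaluate the classifier on a grid that covers the data range.
xx, yy = np.meshgrid(
    np.linspace(X[:, 0].min() - 0.5, X[:, 0].max() + 0.5, 300),
    np.linspace(X[:, 1].min() - 0.5, X[:, 1].max() + 0.5, 300))
Z = clf.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)

plt.contourf(xx, yy, Z, alpha=0.3)  # filled decision regions
plt.scatter(X[:, 0], X[:, 1], c=y)  # data points on top
plt.show()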

Logistic Regression- multiclass-multioutput is not supported + errors

I am new to Python, and while doing a logistic regression I am running into a few issues, shown below. Here is my code, followed by the error messages:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve, roc_auc_score
X = dataset_df
Y = dataset_df
X_train, X_test, y_train, y_test \
    = train_test_split(X, y, test_size=0.3, random_state=1)
X_train, X_validation, y_train, y_validation \
    = train_test_split(X_train, y_train, test_size=0.3, random_state=1)
sc = StandardScaler()
sc.fit(X_train)
X_train_Std = sc.transform(X_train)
lr_classifier = LogisticRegression(C = 1000, random_state= 1)
rf_classifier = RandomForestClassifier(max_depth=5, random_state= 1)
rf_classifier.fit(X_train_Std, y_train)
rf_classifier.predict_proba(sc.transform(X_validation))
Then here:
roc_auc_score(y_true=y_test, y_score=lr_2.predict(X_test_std_pca_1))
NameError: name 'lr_2' is not defined
And there:
max_depth_params = [2, 3, 5, 10]
for max_depth in max_depth_params:
    rf_classifier = RandomForestClassifier(max_depth=max_depth, random_state=1)
    rf_classifier.fit(X_train_Std, y_train)
    y_pred2 = rf_classifier.predict(sc.transform(X_validation))
    print('max depth param:', max_depth, 'accuracy:', accuracy_score(y_true=y_validation, y_pred=y_pred2))
ValueError: multiclass-multioutput is not supported
And there:
lr_classifier.fit(X_train_Std, y_train)
y_pred = lr_classifier.predict(sc.transform(X_validation))
ValueError: y should be a 1d array, got an array of shape (3876, 16) instead.
And finally:
y_pred2 = rf_classifier.predict(sc.transform(X_validation))
print('Misclassified samples {0} out of {1}, i.e. {2:.2f}% accurate'.\
format((y_validation != y_pred).sum(), len(y_validation), (1 - (y_validation != y_pred).sum()/len(y_validation))*100))
TypeError: unsupported format string passed to Series.format
So many error messages that I feel as if my head is going to explode; if someone could help, I'd be very grateful 🙏
The NameError: lr_2 is not a variable you have defined; you named the classifier lr_classifier.
The multiclass-multioutput error: your target y is not a single column but a 2D array, because you assigned the whole DataFrame to it.
The shape error states the issue plainly: y must be a 1D array, not an array of shape (3876, 16).
The TypeError: an unsupported data type, probably a pandas Series, is being passed to the format string.
Try to debug the errors one at a time, and learn the Python basics before moving on to machine learning.
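A minimal sketch of the shape fix (the column name 'target' is hypothetical; substitute the actual label column of your dataset):

from sklearn.model_selection import train_test_split

# dataset_df is assumed to be loaded already; 'target' is a hypothetical
# column name, replace it with your real label column.
X = dataset_df.drop(columns=['target'])   # features only
y = dataset_df['target']                  # a single column -> 1-D Series

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1)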

Can I fit a VAR model using the LASSO method in Python?

I have to fit 40 time series in a vector autoregressive (VAR) model. The enormous number of variables suggests using a selection method. I would love to use the LASSO method, but I am using statsmodels for the fitting, and the only way to implement LASSO with that library is for a linear regression model. Can someone help?
You can try using fit_regularized; it is like fitting an OLS, and you set L1_wt to 1 so that it is a lasso:
sm.OLS(..,..).fit_regularized(alpha=..,L1_wt=1)
We can check with an example; first, load the Boston dataset:
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
import numpy as np
import statsmodels.api as sm
scaler = StandardScaler()
data = load_boston()
data_scaled = scaler.fit_transform(data.data)
X_train, X_test, y_train, y_test = train_test_split(data_scaled, data.target, test_size=0.33, random_state=42)
Below we show that the two behave similarly; you will need to tune the shrinkage parameter alpha in your model anyway:
alphas = [0.0001,0.001, 0.01, 0.1,0.2, 0.5, 1]
mse_sklearn = []
mse_sm = []
for a in alphas:
    clf = linear_model.Lasso(alpha=a)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    mse_sklearn.append(mean_squared_error(y_test, y_pred))
    mdl = sm.OLS(y_train, sm.add_constant(X_train)).fit_regularized(alpha=a, L1_wt=1)
    y_pred = mdl.predict(sm.add_constant(X_test))
    mse_sm.append(mean_squared_error(y_test, y_pred))
Visualize the results:
import matplotlib.pyplot as plt
fig, ax = plt.subplots()
ax.plot(alphas,mse_sm,label="sm")
ax.plot(alphas,mse_sklearn,label="sklearn")
ax.legend()
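To apply this to the VAR itself, note that each equation of a VAR(p) is an ordinary linear regression of one series on the lagged values of all series, so you can build the lagged design matrix yourself and run the same fit_regularized call once per equation. A rough sketch (the random data is purely illustrative, standing in for your 40 series):

import numpy as np
import pandas as pd
import statsmodels.api as sm

rng = np.random.default_rng(0)
data = pd.DataFrame(rng.standard_normal((200, 40)))  # stand-in for the 40 series

p = 2  # VAR order
# Regressors: lags 1..p of every series, aligned with the dependent values.
lagged = pd.concat([data.shift(lag) for lag in range(1, p + 1)], axis=1).iloc[p:]
X = sm.add_constant(lagged.to_numpy())

coefs = {}
for col in data.columns:
    y = data[col].iloc[p:].to_numpy()
    res = sm.OLS(y, X).fit_regularized(alpha=0.1, L1_wt=1)  # lasso per equation
    coefs[col] = res.params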

Plot feature importance in RandomForestRegressor sklearn

I am new to data science. I am trying to find the feature-importance ranking for my dataset. I have already applied a random forest and got the output.
Here is my code:
# importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
# importing dataset
dataset=pd.read_csv('Churn_Modelling.csv')
X = dataset.iloc[:,3:12].values
Y = dataset.iloc[:,13].values
#encoding catagorical data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
#country
labelencoder_X_1= LabelEncoder()
X[:,1]=labelencoder_X_1.fit_transform(X[:,1])
#gender
labelencoder_X_2= LabelEncoder()
X[:,2]=labelencoder_X_2.fit_transform(X[:,2])
onehotencoder = OneHotEncoder(categorical_features=[0])
X = onehotencoder.fit_transform(X).toarray()
#spliting dataset into test set and train set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.20)
from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor(n_estimators=20, random_state=0)
regressor.fit(X_train, y_train)
In the importance part, I almost copied the example shown at:
https://scikit-learn.org/stable/auto_examples/ensemble/plot_forest_importances.html
Here is the code:
#feature importance
from sklearn.ensemble import ExtraTreesClassifier
importances = regressor.feature_importances_
std = np.std([tree.feature_importances_ for tree in regressor.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]
print("Feature ranking:")
for f in range(X.shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))
# Plot the feature importances of the forest
plt.figure()
plt.title("Feature importances")
plt.bar(range(X.shape[1]), importances[indices],
        color="r", yerr=std[indices], align="center")
plt.xticks(range(X.shape[1]), indices)
plt.xlim([-1, X.shape[1]])
plt.show()
I am expecting the output shown in the documentation. Can anyone help me, please? Thanks in advance.
My dataset is here:
You have a lot of features, and they cannot all be seen in a single plot.
Just plot some of them.
Here I plot the 20 most important:
# Plot the feature importances of the forest
plt.figure(figsize=(18,9))
plt.title("Feature importances")
n=20
_ = plt.bar(range(n), importances[indices][:n], color="r", yerr=std[indices][:n])
plt.xticks(range(n), indices)
plt.xlim([-1, n])
plt.show()
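If you would rather see feature names than column indices on the x-axis, you can map the sorted indices back to labels. A sketch (the feature_names list here is hypothetical; after one-hot encoding the columns no longer match the CSV headers, so build the list yourself):

# Hypothetical labels, one per column of X; replace with your real names.
feature_names = ['feature_%d' % i for i in range(X.shape[1])]

plt.figure(figsize=(18, 9))
plt.title("Feature importances")
n = 20
plt.bar(range(n), importances[indices][:n], color="r", yerr=std[indices][:n])
plt.xticks(range(n), [feature_names[i] for i in indices[:n]], rotation=90)
plt.xlim([-1, n])
plt.show()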
My code in case you need it: https://filebin.net/be4h27swglqf3ci3
Output:

ValueError("x and y must be the same size")

import pandas as pd
import numpy as np
import re
import seaborn as sns
import matplotlib.pyplot as plt
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import mean_squared_error
from sklearn.ensemble.partial_dependence import partial_dependence, plot_partial_dependence
from sklearn import metrics, model_selection
import warnings
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import ensemble
warnings.filterwarnings('ignore')
train = pd.read_csv('E:/vishnu vardhan/EM19424AVN1007WRPYTH/data/data/train.csv (6)/train.csv')
test = pd.read_csv('E:/vishnu vardhan/EM19424AVN1007WRPYTH/data/data/test.csv (2)/test.csv')
test_labels=pd.read_csv('E:/vishnu vardhan/EM19424AVN1007WRPYTH/data/data/test_labels.csv/test_labels.csv')
print("\nTrain Data")
print("==========\n",train)
print("\nTest Data")
print("==========\n",test)
print("\nTest_labels Data")
print("================\n",test_labels)
sns.barplot(x='toxic', y='identity_hate', data=train);
plt.show()
print("\n\nTrain data shape:",train.shape)
print("\nTest data shape:",test.shape)
print("\nTestLabels data shape:",test_labels.shape)
print("\nCorrelation matrix")
print("==================")
plt.title('Correlation Matrix')
sns.heatmap(train.corr())
plt.show()
print("\n Data Descriptive")
print("================\n",train.describe())
xg_reg = xgb.XGBRegressor(objective='reg:linear', colsample_bytree=0.3, learning_rate=0.1,
                          max_depth=5, alpha=10, n_estimators=10)
print("\nRegressor")
print("===========\n",xg_reg)
X = test_labels.iloc[:,1:6].values
Y = test_labels.iloc[:,6].values
#print("X value\n",X,"\n\nY value \n",Y)
# split data into train and test sets
seed = 7
test_size = 0.33
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)
model = XGBClassifier()
model.fit(X_train, y_train)
print("\n Classifier")
print("============\n",model)
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
params = {
    'n_estimators': 1,
    'max_depth': 1,
    'learning_rate': 1,
    'criterion': 'mse'
}
gradient_boosting_regressor = ensemble.GradientBoostingRegressor(**params)
gradient_boosting_regressor.fit(X, Y)
plt.figure(figsize=(10, 5))
plt.title('Gradient Boosting model (1 estimators, Single tree split)')
plt.scatter(X, Y)
plt.plot(X, gradient_boosting_regressor.predict(X), color='r')
plt.show()
While executing the above code, this error occurs:
"raise ValueError("x and y must be the same size")"
I have a .csv file with 1398 rows and 2 columns. I have taken 33% as the test set (test_size=0.33), as is visible in the above code.
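For reference, plt.scatter requires x and y to be one-dimensional arrays of the same length, and X here has five columns, so plt.scatter(X, Y) raises this error. Plotting a single feature column against Y avoids it; a sketch (the choice of column 0 is illustrative only):

import numpy as np
import matplotlib.pyplot as plt

# X has shape (n, 5) and Y has shape (n,); scatter needs 1-D inputs of
# equal length, so pick one column of X to plot against Y.
col = 0
order = np.argsort(X[:, col])  # sort so the prediction line is drawn cleanly
plt.scatter(X[:, col], Y)
plt.plot(X[order, col], gradient_boosting_regressor.predict(X)[order], color='r')
plt.show()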
