I generated several datasets and, using classifiers, predicted which cluster each point belongs to. I need to draw the boundaries between clusters on the chart, either as lines or as filled areas - it does not matter which. Please let me know if there is a way to do this.
My code:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import make_moons, make_circles
from sklearn.model_selection import train_test_split
n_sample = 2000
def make_square(n_sample):
    # two-class checkerboard over the unit square: label 1 in the
    # top-right and bottom-left quadrants, label 0 elsewhere
    X = np.random.sample((n_sample, 2))
    y = []
    for i in range(n_sample):
        if (X[i][0] > 0.5 and X[i][1] > 0.5) or (X[i][0] < 0.5 and X[i][1] < 0.5):
            y.append(1)
        else:
            y.append(0)
    return X, np.array(y)
datasets = [
    make_circles(n_samples=n_sample, noise=0.09, factor=0.5),
    make_square(n_sample),
    make_moons(n_samples=n_sample, noise=0.12),
]
ks = []
for data in datasets:
    X, y = data[0], data[1]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=33)

    # try k = 1..7 and record the test accuracy of each k-NN model
    accs = []
    for i in range(1, 8):
        knn = KNeighborsClassifier(n_neighbors=i)
        knn.fit(X_train, y_train)
        accs.append(knn.score(X_test, y_test))

    plt.figure(figsize=(12, 6))
    plt.plot(range(1, 8), accs, color='red', linestyle='dashed', marker='o',
             markerfacecolor='blue', markersize=10)
    plt.title('Accuracy vs. K Value')
    plt.xlabel('K Value')
    plt.ylabel('Accuracy')
    print("Max Score:", max(accs), "k =", accs.index(max(accs)) + 1)
    ks.append(accs.index(max(accs)) + 1)
for i in range(3):
    data = datasets[i]
    k = ks[i]
    X, y = data[0], data[1]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=33)
    classifier = KNeighborsClassifier(n_neighbors=k)
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)

    # plot the true labels and the predicted labels side by side
    plt.figure(figsize=(9, 9))
    plt.title("Test")
    plt.scatter(X_test[:, 0], X_test[:, 1], c=y_test)
    plt.figure(figsize=(9, 9))
    plt.title("Predict")
    plt.scatter(X_test[:, 0], X_test[:, 1], c=y_pred)
Example output: scatter plots of the test set colored by the true labels ("Test") and by the predicted labels ("Predict") - images omitted.
scikit-learn 1.1 introduced the DecisionBoundaryDisplay to assist with this sort of task.
Following the use of make_moons and the KNeighborsClassifier in the question, we can fit the classifier on the dataset, invoke the DecisionBoundaryDisplay.from_estimator() method, then scatter the X data on the returned axis:
import matplotlib.pyplot as plt
from sklearn.datasets import make_moons
from sklearn.neighbors import KNeighborsClassifier
from sklearn.inspection import DecisionBoundaryDisplay
X, y = make_moons(noise=0.2)
clf = KNeighborsClassifier().fit(X, y)
disp = DecisionBoundaryDisplay.from_estimator(clf, X, response_method="predict", alpha=0.3)
disp.ax_.scatter(X[:, 0], X[:, 1], c=y)
plt.show()
Resulting in something like this: the moons scatter plot drawn over shaded decision regions (image omitted).
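If you are on a scikit-learn version older than 1.1, a common alternative (a minimal sketch, assuming the fitted two-feature classifier clf and the data X, y from above) is to evaluate the classifier on a dense grid built with np.meshgrid and shade the predicted regions with plt.contourf:

import numpy as np
import matplotlib.pyplot as plt

# build a dense grid covering the data range
x_min, x_max = X[:, 0].min() - 0.5, X[:, 0].max() + 0.5
y_min, y_max = X[:, 1].min() - 0.5, X[:, 1].max() + 0.5
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.01),
                     np.arange(y_min, y_max, 0.01))

# predict a label for every grid point, then shade the regions
Z = clf.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
plt.contourf(xx, yy, Z, alpha=0.3)
plt.scatter(X[:, 0], X[:, 1], c=y)
plt.show()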
I found this question, which seems to address my problem (Determining the most contributing features for SVM classifier in sklearn).
However, as my understanding of the Python language is limited, I need some help.
I have a dependent variable, 'Group', which has two levels: 'Group1' and 'Group2'.
This is the code I found, adapted to my data:
import pandas as pd
df = pd.read_csv('C:/Users/myPC/OneDrive/Desktop/analysis/dataframe6.csv')
X = df.drop('Group', axis=1)
y = df['Group']
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20)
from sklearn.svm import SVC
svclassifier = SVC(kernel='linear')
svclassifier.fit(X_train, y_train)
y_pred = svclassifier.predict(X_test)
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
from matplotlib import pyplot as plt
from sklearn import svm
def f_importances(coef, names):
    imp = coef
    imp, names = zip(*sorted(zip(imp, names)))
    plt.barh(range(len(names)), imp, align='center')
    plt.yticks(range(len(names)), names)
    plt.show()
features_names = ['input1', 'input2']
svclassifier = SVC(kernel='linear')
svclassifier.fit(X_train, y_train)
f_importances(svclassifier.coef_, features_names)
It produces just a blank plot.
I think there is something I should change in features_names = ['input1', 'input2'], but I am not sure what.
The plotting code you used expects a one-dimensional array. According to the documentation, the attribute coef_ is:
coef_ ndarray of shape (n_classes * (n_classes - 1) / 2, n_features)
Weights assigned to the features when kernel="linear".
Using an example:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
np.random.seed(123)
df = pd.DataFrame(np.random.uniform(0,1,(400,3)),columns=['input1','input2','input3'])
df['Group'] = np.random.choice(['Group1','Group2'],400)
X = df.drop('Group', axis=1)
y = df['Group']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20)
svclassifier = SVC(kernel='linear')
svclassifier.fit(X_train, y_train)
We check the shape of the array:
print(svclassifier.coef_.shape)
(1, 3)
Because you have only two classes, there is only one row. We can do:
from matplotlib import pyplot as plt
from sklearn import svm
def f_importances(coef, names):
    imp = coef
    imp, names = zip(*sorted(zip(imp, names)))
    plt.barh(range(len(names)), imp, align='center')
    plt.yticks(range(len(names)), names)
    plt.show()
features_names = ['input1', 'input2','input3']
svclassifier = SVC(kernel='linear')
svclassifier.fit(X_train, y_train)
f_importances(svclassifier.coef_[0], features_names)
This is the plot I got: a horizontal bar chart of the three coefficients (image omitted).
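As a side note (a small sketch, not part of the original answer): the raw coefficients can be negative, so if you want to rank features by the strength of their contribution rather than by signed weight, sort by absolute value instead:

import numpy as np

# rank features by |coefficient|; assumes svclassifier and
# features_names from the example above
abs_imp = np.abs(svclassifier.coef_[0])
order = np.argsort(abs_imp)
plt.barh(range(len(features_names)), abs_imp[order], align='center')
plt.yticks(range(len(features_names)), np.array(features_names)[order])
plt.show()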
I encountered the problem shown in the title and don't know how to fix it. I want to use cross-validation to analyze chronic disease data. Could someone help me? Thanks.
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

from sklearn.svm import SVC
classifier = SVC(kernel='rbf', random_state=0)
classifier.fit(X_train, y_train)

# Predicting the test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
ckd = confusion_matrix(y_test, y_pred)
ckd

from matplotlib.colors import ListedColormap
X_set, y_set = X_train, y_train
X1, X2 = np.meshgrid(
    np.arange(start=X_set[:, 0].min() - 1, stop=X_set[:, 0].max() + 1, step=0.01),
    np.arange(start=X_set[:, 1].min() - 1, stop=X_set[:, 1].max() + 1, step=0.01))
plt.contourf(X1, X2,
             classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),  # this line raises the error
             alpha=0.15, cmap=ListedColormap(('red', 'green')))
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                c=ListedColormap(('red', 'green'))(i), label=j)
plt.title('Kernel SVM (Training set)')
plt.xlabel('Age')
plt.ylabel('Estimated Salary')
plt.legend()
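As an aside, the snippet never actually performs cross-validation, even though that is the stated goal. A minimal sketch (assuming the classifier and scaled training data above) using scikit-learn's cross_val_score:

from sklearn.model_selection import cross_val_score

# 10-fold cross-validation accuracy of the RBF-kernel SVM on the training set
scores = cross_val_score(classifier, X_train, y_train, cv=10)
print("Mean accuracy: %.3f (+/- %.3f)" % (scores.mean(), scores.std()))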
This is my code:
from sklearn.datasets import load_boston  # note: load_boston was removed in scikit-learn 1.2; this snippet assumes an older version
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import train_test_split
%matplotlib inline
boston_properties = load_boston()
l_distance = boston_properties['data'][:, np.newaxis, 7]
linreg = LinearRegression()
X_train, X_test, y_train, y_test = train_test_split(l_distance, boston_properties['target'], test_size = 0.3)
y_pred = cross_val_predict(linreg, l_distance, boston_properties.target, cv=5)
plt.scatter(X_test, y_test, color='black')
plt.plot(X_test, y_pred, color='blue', linewidth=2)
plt.show()
print(y_pred.shape)
The error I'm receiving is the following:
ValueError: x and y must have same first dimension, but have shapes (152, 1) and (506,)
How can I make this work?
You made a train_test_split, but you're not using it to train the model: cross_val_predict runs on the entire dataset, so y_pred has 506 values while X_test only has 152 rows, and the plot call fails. Use these lines instead:
l_distance = boston_properties['data'][:, np.newaxis, 7]
linreg = LinearRegression()
X_train, X_test, y_train, y_test = train_test_split(
    l_distance, boston_properties['target'], test_size=0.3)  # now you have a train/test set
y_pred = cross_val_predict(linreg, X_train, y_train, cv=5)
plt.scatter(X_train, y_train, color='black')
plt.plot(X_train, y_pred, color='blue', linewidth=2)
plt.show()
Edit: You can also use this line to make a straight line through your points:
plt.scatter(X_train, y_train, color='black')
plt.plot([X_train[np.argmin(X_train)], X_train[np.argmax(X_train)]],
[y_pred[np.argmin(X_train)], y_pred[np.argmax(X_train)]],
color='blue')
plt.show()
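Equivalently (a small sketch, not part of the original answer), you can sort the points by their x value before plotting, so the line does not zigzag through the unsorted training data:

import numpy as np

order = np.argsort(X_train[:, 0])  # indices that sort the single feature
plt.scatter(X_train, y_train, color='black')
plt.plot(X_train[order], y_pred[order], color='blue', linewidth=2)
plt.show()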
I need to check these criteria for a multiple linear regression inside a loop:
p-value < 0.05
p-value of the F-statistic < 0.05
R^2 >= 0.8
I use this code:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

X = df.iloc[:, 2:].values
Y = df.iloc[:, 1].values

from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, Y_train)
Y_pred = regressor.predict(X_test)

df1 = pd.DataFrame({'Actual': Y_test.flatten(), 'Predicted': Y_pred.flatten()})
df1.plot(kind='bar')
plt.grid(which='major', linestyle='-', linewidth='0.5', color='green')
plt.grid(which='minor', linestyle=':', linewidth='0.5', color='black')
plt.show()

import statsmodels.api as sm  # statsmodels.formula.api no longer exposes OLS
X = np.append(arr=np.ones((X.shape[0], 1)).astype(int), values=X, axis=1)  # add the intercept column
X_opt = X[:, [0, 1, 2, 3, 4, 5]]
regressor_OLS = sm.OLS(endog=Y, exog=X_opt).fit()
regressor_OLS.summary()
The output is the statsmodels OLS regression summary table (image omitted).
How can I extract these statistics programmatically so I can compare them to the thresholds? They take different values in every loop iteration.
Thanks :)
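One way to do this (a sketch, assuming the fitted regressor_OLS above): the fitted OLSResults object exposes the numbers from the summary as attributes, so each criterion can be written as a plain boolean check inside the loop:

# per-coefficient p-values, the F-test p-value, and R^2 as attributes
pvals_ok = (regressor_OLS.pvalues < 0.05).all()   # every coefficient significant
f_ok = regressor_OLS.f_pvalue < 0.05              # overall F-test significant
r2_ok = regressor_OLS.rsquared >= 0.8             # goodness of fit

if pvals_ok and f_ok and r2_ok:
    print("model meets all three criteria")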
import pandas as pd
import numpy as np
import re
import seaborn as sns
import matplotlib.pyplot as plt
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn import ensemble, metrics, model_selection
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.model_selection import train_test_split
import warnings

warnings.filterwarnings('ignore')
train = pd.read_csv('E:/vishnu vardhan/EM19424AVN1007WRPYTH/data/data/train.csv (6)/train.csv')
test = pd.read_csv('E:/vishnu vardhan/EM19424AVN1007WRPYTH/data/data/test.csv (2)/test.csv')
test_labels=pd.read_csv('E:/vishnu vardhan/EM19424AVN1007WRPYTH/data/data/test_labels.csv/test_labels.csv')
print("\nTrain Data")
print("==========\n",train)
print("\nTest Data")
print("==========\n",test)
print("\nTest_labels Data")
print("================\n",test_labels)
sns.barplot(x='toxic', y='identity_hate', data=train);
plt.show()
print("\n\nTrain data shape:",train.shape)
print("\nTest data shape:",test.shape)
print("\nTestLabels data shape:",test_labels.shape)
print("\nCorrelation matrix")
print("==================")
plt.title('Correlation Matrix')
sns.heatmap(train.corr())
plt.show()
print("\n Data Descriptive")
print("================\n",train.describe())
xg_reg = xgb.XGBRegressor(objective='reg:squarederror',  # 'reg:linear' is deprecated in recent XGBoost
                          colsample_bytree=0.3, learning_rate=0.1,
                          max_depth=5, alpha=10, n_estimators=10)
print("\nRegressor")
print("===========\n",xg_reg)
X = test_labels.iloc[:,1:6].values
Y = test_labels.iloc[:,6].values
#print("X value\n",X,"\n\nY value \n",Y)
# split data into train and test sets
seed = 7
test_size = 0.33
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)
model = XGBClassifier()
model.fit(X_train, y_train)
print("\n Classifier")
print("============\n",model)
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
params = {
    'n_estimators': 1,
    'max_depth': 1,
    'learning_rate': 1,
    'criterion': 'mse'  # renamed to 'squared_error' in newer scikit-learn
}
gradient_boosting_regressor = ensemble.GradientBoostingRegressor(**params)
gradient_boosting_regressor.fit(X, Y)
plt.figure(figsize=(10, 5))
plt.title('Gradient Boosting model (1 estimator, single tree split)')
plt.scatter(X, Y)  # this raises: X has 5 columns but Y is one-dimensional
plt.plot(X, gradient_boosting_regressor.predict(X), color='r')
plt.show()
While executing the above code, this error occurs:
ValueError: x and y must be the same size
I have a .csv file with 1398 rows and 2 columns. I have taken 33% as the test set (test_size=0.33), as is visible in the above code.
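For what it's worth (a sketch based on the code above, not a confirmed fix): plt.scatter requires x and y to be the same size, but X here has five columns while Y is one-dimensional, which is exactly what the ValueError reports. Plotting one feature column at a time avoids the error:

plt.figure(figsize=(10, 5))
plt.title('Gradient Boosting model, first feature only')
plt.scatter(X[:, 0], Y)  # one column: same length as Y
plt.scatter(X[:, 0], gradient_boosting_regressor.predict(X), color='r')  # predictions still use all five features
plt.show()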