from sklearn.neighbors import KNeighborsClassifier
import pandas as pd
from sklearn import metrics
# sklearn.cross_validation was removed; train_test_split now lives in model_selection
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
r = pd.read_csv("vitalsign_test.csv")
clm_list = []
for column in r.columns:
    clm_list.append(column)
X = r[clm_list[1:len(clm_list)-1]].values
y = r[clm_list[len(clm_list)-1]].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=4)
k_range = range(1,25)
scores = []
for k in k_range:
    clf = KNeighborsClassifier(n_neighbors=k)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
scores.append(metrics.accuracy_score(y_test, y_pred))
plt.plot(k_range,scores)
plt.xlabel('value of k for clf')
plt.ylabel('testing accuracy')
The response that I am getting is:
ValueError: x and y must have same first dimension
My feature and response shapes are:
y.shape
Out[60]: (500,)
X.shape
Out[61]: (500, 6)
It has nothing to do with your X and y; it is about the x and y arguments to plot. Your scores list ends up with a single element, while k_range has 24. The cause is incorrect indentation:
for k in k_range:
    clf = KNeighborsClassifier(n_neighbors=k)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
scores.append(metrics.accuracy_score(y_test, y_pred))
should be
for k in k_range:
    clf = KNeighborsClassifier(n_neighbors=k)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    scores.append(metrics.accuracy_score(y_test, y_pred))
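With the whole body indented, scores accumulates one accuracy per value of k, so its length matches k_range and the plt.plot call succeeds.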
I generated several datasets and predicted the cluster assignments with classifiers. I need to draw the boundaries between clusters on the chart, either as lines or as filled areas; it does not matter which. Please let me know if there is a way to do this.
My code:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import make_moons, make_circles
from sklearn.model_selection import train_test_split
n_sample = 2000
def make_square(n_sample):
    # A plain list avoids np.array([0, []]), which modern NumPy rejects
    # as a ragged array.
    data = [np.random.sample((n_sample, 2)), []]
    for i in range(n_sample):
        if (data[0][i][0] > 0.5 and data[0][i][1] > 0.5) or (data[0][i][0] < 0.5 and data[0][i][1] < 0.5):
            data[1].append(1)
        else:
            data[1].append(0)
    return data
datasets = [
    make_circles(n_samples=n_sample, noise=0.09, factor=0.5),
    make_square(n_sample),
    make_moons(n_samples=n_sample, noise=0.12),
]
ks=[]
for data in datasets:
    X, y = data[0], data[1]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=33)
    classifier = KNeighborsClassifier(n_neighbors=1)
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    acc = classifier.score(X_test, y_test)
    accs = []
    for i in range(1, 8):
        knn = KNeighborsClassifier(n_neighbors=i)
        knn.fit(X_train, y_train)
        pred_i = knn.predict(X_test)
        acc0 = knn.score(X_test, y_test)
        accs.append(acc0)
    plt.figure(figsize=(12, 6))
    plt.plot(range(1, 8), accs, color='red', linestyle='dashed', marker='o',
             markerfacecolor='blue', markersize=10)
    plt.title('accs Score K Value')
    plt.xlabel('K Value')
    plt.ylabel('accs Score')
    print("Max Score:", max(accs), "k=", accs.index(max(accs)) + 1)
    ks.append(accs.index(max(accs)) + 1)
for i in range(3):
    data = datasets[i]
    k = ks[i]
    X, y = data[0], data[1]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=33)
    classifier = KNeighborsClassifier(n_neighbors=k)
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    plt.figure(figsize=(9, 9))
    plt.title("Test")
    plt.scatter(X_test[:, 0], X_test[:, 1], c=y_test)
    plt.figure(figsize=(9, 9))
    plt.title("Predict")
    plt.scatter(X_test[:, 0], X_test[:, 1], c=y_pred)
Example output: [two scatter plots of the test set, one colored by the true labels and one by the predicted labels]
scikit-learn 1.1 introduced the DecisionBoundaryDisplay to assist with this sort of task.
Following the use of make_moons and the KNeighborsClassifier in the question, we can fit the classifier on the dataset, invoke the DecisionBoundaryDisplay.from_estimator() method, then scatter the X data on the returned axis:
import matplotlib.pyplot as plt
from sklearn.datasets import make_moons
from sklearn.neighbors import KNeighborsClassifier
from sklearn.inspection import DecisionBoundaryDisplay
X, y = make_moons(noise=0.2)
clf = KNeighborsClassifier().fit(X, y)
disp = DecisionBoundaryDisplay.from_estimator(clf, X, response_method="predict", alpha=0.3)
disp.ax_.scatter(X[:, 0], X[:, 1], c=y)
plt.show()
Resulting in something like this: [the moons data scattered over shaded decision regions]
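For scikit-learn versions older than 1.1 that lack DecisionBoundaryDisplay, a common alternative (a sketch, not from the thread) is to evaluate the classifier on a dense meshgrid and shade the regions with contourf:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_moons
from sklearn.neighbors import KNeighborsClassifier

X, y = make_moons(noise=0.2)
clf = KNeighborsClassifier().fit(X, y)

# Evaluate the classifier on a dense grid spanning the data
xx, yy = np.meshgrid(
    np.linspace(X[:, 0].min() - 0.5, X[:, 0].max() + 0.5, 200),
    np.linspace(X[:, 1].min() - 0.5, X[:, 1].max() + 0.5, 200),
)
Z = clf.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)

plt.contourf(xx, yy, Z, alpha=0.3)   # filled decision regions
plt.scatter(X[:, 0], X[:, 1], c=y)
plt.show()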
import numpy as np
import pandas as pd
import sklearn
from sklearn import linear_model
from sklearn.utils import shuffle
import matplotlib.pyplot as pyplot
import pickle
from matplotlib import style
data = pd.read_csv("student-mat.csv", sep=";")
data = data[["G1", "G3", "G3", "studytime", "failures", "absences", "freetime"]]
predict = "G3"
X = np.array(data.drop(columns=[predict]))
Y = np.array(data[predict])
x_train, x_test, y_train, y_test = sklearn.model_selection.train_test_split(X, Y, test_size = 0.1)
best = 0
for _ in range(3000):
    x_train, x_test, y_train, y_test = sklearn.model_selection.train_test_split(X, Y, test_size=0.1)
    linear = linear_model.LinearRegression()
    linear.fit(x_train, y_train)
    acc = linear.score(x_test, y_test)
    print(acc)
    if acc > best:
        best = acc
        with open("studentmodel.pickle", "wb") as f:
            pickle.dump(linear, f)
pickle_in = open("studentmodel.pickle", "rb")
linear = pickle.load(pickle_in)
print('Co: \n', linear.coef_)
print('Intercept: \n', linear.intercept_)
predictions = linear.predict(x_test)
for x in range(len(predictions)):
    print(predictions[x], x_test[x], y_test[x])
p = 'G1'
style.use("ggplot")
pyplot.scatter(data[p],data["G3"])
pyplot.xlabel(p)
pyplot.ylabel("Final Grade")
pyplot.show()
Error: raise ValueError("X and y must be the same size")
Can anyone please explain what I have done wrong? I am new to programming and was following a tutorial; everything up to the last five lines worked fine, but when I try to make a graph it raises ValueError("X and y must be the same size"). It only lets me make a graph if I write the code like this:
style.use("ggplot")
pyplot.scatter(data["G3"],data["G3"])
pyplot.xlabel(p)
pyplot.ylabel("Final Grade")
pyplot.show()
That, however, only gives me a straight line on the graph.
Thank you for any help!
I have run the following code using this data.
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from matplotlib import pyplot as plt
import pickle
from matplotlib import style
data = pd.read_csv("student-mat.csv")
# Here, I have changed the columns because "G3" was occurring twice,
# which made data["G3"] a two-column DataFrame.
data = data[["G1", "G2", "G3", "studytime", "failures", "absences", "freetime"]]
predict = "G3"
print(data.head())
X = np.array(data.drop(columns=[predict]))  # the positional axis argument was removed in pandas 2.0
print(X)
y = np.array(data[predict])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
best = 0
for _ in range(3000):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
    linear = LinearRegression()
    linear.fit(X_train, y_train)
    acc = linear.score(X_test, y_test)
    print(acc)
    if acc > best:
        best = acc
        with open("studentmodel.pickle", "wb") as f:
            pickle.dump(linear, f)
pickle_in = open("studentmodel.pickle", "rb")
linear = pickle.load(pickle_in)
print('Co: \n', linear.coef_)
print('Intercept: \n', linear.intercept_)
predictions = linear.predict(X_test)
for x in range(len(predictions)):
    print(predictions[x], X_test[x], y_test[x])
p = 'G1'
style.use("ggplot")
plt.scatter(data[p], data["G3"])
plt.xlabel(p)
plt.ylabel("Final Grade")
plt.show()
This will produce the following image: [scatter plot of G1 against the final grade G3]
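The reason the original code failed is worth spelling out: when a column label appears twice, selecting it returns a two-column DataFrame rather than a Series, so pyplot.scatter received x of shape (n,) and y of shape (n, 2). A minimal sketch of the effect:
import pandas as pd

df = pd.DataFrame({"G1": [10, 12, 14], "G3": [11, 13, 15]})
df = df[["G1", "G3", "G3"]]    # "G3" selected twice, as in the question
print(df["G3"].shape)          # (3, 2): a two-column DataFrame, not a Series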
I need to check these criteria in multiple linear regression in order to make a loop:
p-value < 0.05
p-value of the F-statistic < 0.05
R^2 >= 0.8
I use this code:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# df is the DataFrame loaded earlier (not shown in the question)
X = df.iloc[:, 2:].values
Y = df.iloc[:, 1].values
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, Y_train)
Y_pred = regressor.predict(X_test)
df1 = pd.DataFrame({'Actual': Y_test.flatten(), 'Predicted': Y_pred.flatten()})
df1.plot(kind='bar')
plt.grid(which='major', linestyle='-', linewidth='0.5', color='green')
plt.grid(which='minor', linestyle=':', linewidth='0.5', color='black')
plt.show()
# The array-based OLS lives in statsmodels.api, not statsmodels.formula.api
import statsmodels.api as sm
X = np.append(arr=np.ones((141, 1)).astype(int), values=X, axis=1)
X_opt = X[:, [0, 1, 2, 3, 4, 5]]
regressor_OLS = sm.OLS(endog=Y, exog=X_opt).fit()
regressor_OLS.summary()
The output is the OLS regression summary table. [summary image omitted]
How could I define the criteria so they become comparable? They need to take different values in every loop iteration.
Thanks :)
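For reference, the fitted statsmodels results object already exposes all three quantities as attributes, so the criteria can be evaluated directly inside a loop. A minimal sketch using the regressor_OLS from the question:
# Each attribute below is part of the statsmodels regression results API.
p_ok  = (regressor_OLS.pvalues < 0.05).all()  # p-value of every coefficient
f_ok  = regressor_OLS.f_pvalue < 0.05         # p-value of the overall F-test
r2_ok = regressor_OLS.rsquared >= 0.8         # coefficient of determination
if p_ok and f_ok and r2_ok:
    print("All three criteria are satisfied")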
I am trying logistic regression classification using k-fold cross-validation in Python.
My code:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix, roc_auc_score

data = pd.read_csv('xxx.csv')
X = data[["a","b","c",...]]
y = data["Class"]

def get_predictions(clf, X_train, y_train, X_test):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    y_pred_prob = clf.predict_proba(X_test)
    train_pred = clf.predict(X_train)
    print('train-set confusion matrix:\n', confusion_matrix(y_train, train_pred))
    return y_pred, y_pred_prob

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
pred_test_full = 0
cv_score = []
i = 1
for train_index, test_index in skf.split(X, y):
    X_train, y_train = X.loc[train_index], y.loc[train_index]
    X_test, y_test = X.loc[test_index], y.loc[test_index]
    log_cfl = LogisticRegression(C=2)
    log_cfl.fit(X_train, y_train)
    y_pred, y_pred_prob = get_predictions(LogisticRegression(C=2), X_train, y_train, X_test)
    score = roc_auc_score(y_test, log_cfl.predict(X_test))
    print('ROC AUC score: ', score)
    cv_score.append(score)
    pred_test_full = pred_test_full + y_pred_prob
    i += 1
I get the error at this line of code:
pred_test_full = pred_test_full + y_pred_prob
The for loop runs two times; then on the third iteration I get the error:
'operands could not be broadcast together with shapes (56962,2) (5696..'
I couldn't understand what is wrong; could you help me figure it out?
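A likely cause (an observation, not confirmed in the thread): when the number of samples is not divisible by n_splits, StratifiedKFold produces test folds of slightly different sizes, so the per-fold y_pred_prob arrays cannot be added element-wise. A minimal sketch of the mismatch, with made-up data:
import numpy as np
from sklearn.model_selection import StratifiedKFold

# 103 samples cannot be split into 5 equal folds
y_demo = np.array([0, 1] * 51 + [0])
X_demo = np.zeros((103, 1))
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
print([len(test) for _, test in skf.split(X_demo, y_demo)])
# -> e.g. [21, 21, 21, 20, 20]; arrays with first dimensions 21 and 20
#    cannot be broadcast together, which matches the reported error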
Below is an example of using scikit-learn to get cross-validated predictions from k-nearest neighbors, with k chosen by cross-validation. The code seems to work, but how can I also print the k that was selected in each of the outer folds?
import numpy as np, sklearn

n = 100
X = np.random.randn(n, 2)
y = np.where(np.sum(X, axis = 1) + np.random.randn(n) > 0, "blue", "red")
preds = sklearn.model_selection.cross_val_predict(
    X = X,
    y = y,
    estimator = sklearn.model_selection.GridSearchCV(
        estimator = sklearn.neighbors.KNeighborsClassifier(),
        param_grid = {'n_neighbors': range(1, 7)},
        # shuffle=True is required when passing random_state in recent scikit-learn
        cv = sklearn.model_selection.KFold(10, shuffle = True, random_state = 133),
        scoring = 'accuracy'),
    cv = sklearn.model_selection.KFold(10, shuffle = True, random_state = 144))
You can't get this directly from that function, so you would need to replace cross_val_predict with cross_validate and set the return_estimator flag to True. You can then select the estimators used from the returned dictionary with the key 'estimator'. The selected parameters of each estimator are stored in the attribute best_params_. So:
import numpy as np
import sklearn
# sklearn 0.20.3 doesn't seem to import submodules in __init__,
# so importing them directly is required.
import sklearn.model_selection
import sklearn.neighbors

n = 100
X = np.random.randn(n, 2)
y = np.where(np.sum(X, axis = 1) + np.random.randn(n) > 0, "blue", "red")
scores = sklearn.model_selection.cross_validate(
    X = X,
    y = y,
    estimator = sklearn.model_selection.GridSearchCV(
        estimator = sklearn.neighbors.KNeighborsClassifier(),
        param_grid = {'n_neighbors': range(1, 7)},
        cv = sklearn.model_selection.KFold(10, shuffle = True, random_state = 133),
        scoring = 'accuracy'),
    cv = sklearn.model_selection.KFold(10, shuffle = True, random_state = 144),
    return_estimator=True)
# Selected hyper-parameters for the estimator from the first fold
print(scores['estimator'][0].best_params_)
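To see the k selected in every outer fold rather than just the first, loop over the returned estimators:
for i, est in enumerate(scores['estimator']):
    print(f"outer fold {i}: {est.best_params_}")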
Unfortunately you can't get the actual predictions AND the hyper-parameters selected from the same function. If you want that, you will have to do the nested cross-validation manually:
cv = sklearn.model_selection.KFold(10, shuffle = True, random_state = 144)
estimator = sklearn.model_selection.GridSearchCV(
    estimator = sklearn.neighbors.KNeighborsClassifier(),
    param_grid = {'n_neighbors': range(1, 7)},
    cv = sklearn.model_selection.KFold(10, shuffle = True, random_state = 133),
    scoring = 'accuracy')

for train, test in cv.split(X, y):
    X_train, y_train = X[train], y[train]
    X_test, y_test = X[test], y[test]
    m = estimator.fit(X_train, y_train)
    print(m.best_params_)
    y_pred = m.predict(X_test)
    print(y_pred)
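Each iteration refits the grid search on that fold's training data, so m.best_params_ is the k chosen for that outer fold and y_pred contains the matching out-of-fold predictions.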