I just learned about machine learning and wrote some KNN code, but I can't figure out how to add weight = 1/distances to it. Can anyone help me, please?
Thank you very much!
I'm sorry about the wording; I used Google Translate.
Below is the code I am referring to:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import neighbors, datasets
import pandas as pd
from sklearn.model_selection import train_test_split

class KNearestNeighbor():
    def __init__(self, k):
        self.k = k

    def train(self, X, y):
        self.X_train = X
        self.y_train = y

    def predict(self, X_test):
        distances = self.euclidean_distance(X_test)
        return self.predict_labels(distances)

    def euclidean_distance(self, X_test):
        num_test = X_test.shape[0]
        num_train = self.X_train.shape[0]
        distances = np.zeros((num_test, num_train))
        for i in range(num_test):
            for j in range(num_train):
                distances[i, j] = np.sqrt(np.sum((X_test[i, :] - self.X_train[j, :])**2))
        return distances

    def predict_labels(self, distances):
        num_test = distances.shape[0]
        y_pred = np.zeros(num_test)
        for i in range(num_test):
            y_indices = np.argsort(distances[i, :])
            k_closest_classes = self.y_train[y_indices[:self.k]].astype(int)
            y_pred[i] = np.argmax(np.bincount(k_closest_classes))
        return y_pred
if __name__ == '__main__':
    iris = datasets.load_iris()
    X = iris.data
    y = iris.target
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=2)
    KNN = KNearestNeighbor(k=10)
    KNN.train(X_train, y_train)
    y_pred = KNN.predict(X_test)
    print(f'Accuracy: {sum(y_pred == y_test) / y_test.shape[0]}')
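For the weighting itself, a minimal sketch of how weight = 1/distances could replace the plain majority vote in predict_labels, using the weights argument of np.bincount (the small epsilon guarding against division by zero is my addition, not part of the original code):

    def predict_labels(self, distances):
        num_test = distances.shape[0]
        y_pred = np.zeros(num_test)
        for i in range(num_test):
            y_indices = np.argsort(distances[i, :])
            k_nearest = y_indices[:self.k]
            k_closest_classes = self.y_train[k_nearest].astype(int)
            # weight each neighbour's vote by the inverse of its distance
            weights = 1.0 / (distances[i, k_nearest] + 1e-8)
            y_pred[i] = np.argmax(np.bincount(k_closest_classes, weights=weights))
        return y_pred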
Related
I want to get the top 10 features from an XGBRegressor. With ft_weights_xgb_reg.sort_values(by='weight', ascending=False).head(10) I get the top 10 features, but how can I use this inside my pipeline?
I have the class FeatureSelector_Only_Top_10 below; how can I make it keep only the top 10 features so that I can print them out later, for example with print(grid.feature_selection_top_10.top10features)?
Imports:
import time
import xgboost as xgb
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.manifold import TSNE
from sklearn.datasets import make_classification
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Lasso
XGB:
xgb_reg_start = time.time()
xgb_reg = xgb.XGBRegressor()
xgb_reg.fit(X_train_nor, y_train)
training_preds_xgb_reg = xgb_reg.predict(X_train_nor)
val_preds_xgb_reg = xgb_reg.predict(X_test_nor)
xgb_reg_end = time.time()
print(f"Time taken to run: {round((xgb_reg_end - xgb_reg_start)/60,1)} minutes")
print("\nTraining MSE:", round(metrics.mean_squared_error(y_train, training_preds_xgb_reg),4))
print("Validation MSE:", round(metrics.mean_squared_error(y_test, val_preds_xgb_reg),4))
print("\nTraining r2:", round(metrics.r2_score(y_train, training_preds_xgb_reg),4))
print("Validation r2:", round(metrics.r2_score(y_test, val_preds_xgb_reg),4))
ft_weights_xgb_reg = pd.DataFrame(xgb_reg.feature_importances_, columns=['weight'], index=X_train.columns)
ft_weights_xgb_reg.sort_values('weight', inplace=True)
ft_weights_xgb_reg.sort_values(by='weight', ascending=False).head(10)
Pipeline:
class FeatureSelector_Only_Top_10(BaseEstimator, TransformerMixin):
    def __init__(self, n_components=10):
        self.n_components = n_components

    def fit(self, X, y=None):
        # Don't know
        return self

    def transform(self, X, y=None):
        # Don't know
        return X

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)

steps = [#('feature_selection_top_10', FeatureSelector_Only_Top_10()),
         #('feature_selection', SelectFromModel(estimator=LogisticRegression(max_iter=100))),
         ('lasso', Lasso(alpha=0.03))]

pipeline = Pipeline(steps)
parameteres = { }
grid = GridSearchCV(pipeline, param_grid=parameteres, cv=5)
grid.fit(X_train, y_train)
print(grid.best_params_)
print("score = %3.2f" % (grid.score(X_test, y_test)))
If you want to select the N best features of your dataset in your Pipeline, you should define a custom Transformer.
This object should train an xgboost model and pick the N best features during the fit() method. Then, during the transform() method, it should filter your dataset accordingly.
I would do it as follows:
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import Lasso
import pandas as pd
import xgboost as xgb

class FeatureSelector_Only_Top_10(BaseEstimator, TransformerMixin):
    def __init__(self, n_components=10):
        self.n_components = n_components
        self.top_n_features = None

    def fit(self, X, y=None):
        # train an XGBRegressor and remember the n_components most important features
        X = pd.DataFrame(X)
        xgb_reg = xgb.XGBRegressor()
        xgb_reg.fit(X, y)
        self.top_n_features = (pd.DataFrame(
                xgb_reg.feature_importances_,
                columns=['weight'],
                index=X.columns)
            .sort_values(by='weight', ascending=False)
            .head(self.n_components)
        )
        return self

    def transform(self, X, y=None):
        # keep only the columns selected during fit()
        return pd.DataFrame(X).filter(self.top_n_features.index)

X, y = make_regression(n_features=50)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)

steps = [('feature_selection_top_10', FeatureSelector_Only_Top_10()),
         ('lasso', Lasso(alpha=0.03))]

pipeline = Pipeline(steps)
pipeline.fit(X_train, y_train)
print("score = %3.2f" % (pipeline.score(X_test, y_test)))

# retrieve the top N features and their weights
print(pipeline['feature_selection_top_10'].top_n_features)
You can include SelectFromModel in the pipeline to extract the top 10 features based on their importance weights; there is no need to create a custom transformer. As explained in the documentation, if you want to select 10 features you need to set max_features=10 and threshold=-np.inf.
import numpy as np
import pandas as pd
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline
from sklearn.datasets import make_regression
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LinearRegression
X, y = make_regression(n_features=100, n_samples=1000, random_state=42)
X = pd.DataFrame(data=X, columns=['x' + str(i) for i in range(X.shape[1])])
y = pd.Series(y, name='y')
pipeline = Pipeline([
    ('selector', SelectFromModel(estimator=XGBRegressor(), max_features=10, threshold=-np.inf)),
    ('regressor', LinearRegression())
])
pipeline.fit(X, y)
selected_features = pipeline['selector'].get_support()
print(selected_features.sum())
# 10
selected_features_names = X.columns[selected_features].tolist()
print(selected_features_names)
# ['x0', 'x14', 'x17', 'x35', 'x42', 'x43', 'x57', 'x71', 'x84', 'x95']
selected_features_importances = pipeline['selector'].estimator_.feature_importances_[selected_features]
print(selected_features_importances)
# [0.09361505 0.18474296 0.14420615 0.01952794 0.10946904 0.02192107 0.03307951 0.02948984 0.02851948 0.1216883]
selected_features_coefficients = pipeline['regressor'].coef_
print(selected_features_coefficients)
# [49.43000693 83.91437854 78.25242596 -0.76411769 56.67970515 0.16829694 28.81967319 0.50277914 24.55006237 68.17120687]
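If the selector is used inside a grid search, as in the question, the fitted step can be pulled out of the best estimator afterwards. A small sketch building on the pipeline above (the empty param_grid mirrors the question's parameteres = { }):

from sklearn.model_selection import GridSearchCV

grid = GridSearchCV(pipeline, param_grid={}, cv=5)
grid.fit(X, y)

# the fitted SelectFromModel step lives inside the refit best estimator
selector = grid.best_estimator_['selector']
print(X.columns[selector.get_support()].tolist())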
import numpy as np
import pandas as pd
import sklearn
from sklearn import linear_model
from sklearn.utils import shuffle
import matplotlib.pyplot as pyplot
import pickle
from matplotlib import style
data = pd.read_csv("student-mat.csv", sep=";")
data = data[["G1", "G3", "G3", "studytime", "failures", "absences", "freetime"]]
predict = "G3"
X = np.array(data.drop([predict], 1))
Y = np.array(data[predict])
x_train, x_test, y_train, y_test = sklearn.model_selection.train_test_split(X, Y, test_size = 0.1)
best = 0
for _ in range(3000):
    x_train, x_test, y_train, y_test = sklearn.model_selection.train_test_split(X, Y, test_size=0.1)
    linear = linear_model.LinearRegression()
    linear.fit(x_train, y_train)
    acc = linear.score(x_test, y_test)
    print(acc)
    if acc > best:
        best = acc
        with open("studentmodel.pickle", "wb") as f:
            pickle.dump(linear, f)

pickle_in = open("studentmodel.pickle", "rb")
linear = pickle.load(pickle_in)
print('Co: \n', linear.coef_)
print('Intercept: \n', linear.intercept_)
predictions = linear.predict(x_test)
for x in range(len(predictions)):
    print(predictions[x], x_test[x], y_test[x])
p = 'G1'
style.use("ggplot")
pyplot.scatter(data[p],data["G3"])
pyplot.xlabel(p)
pyplot.ylabel("Final Grade")
pyplot.show()
Error: raise ValueError ("X and y must be the same size")
Can anyone please explain what I have done wrong? I am new to programming and was following a tutorial. Everything up to the last 5 lines was working fine, but when I try to make a graph it gives me the error "ValueError: X and y must be the same size". It only lets me make a graph if I write the code like this:
style.use("ggplot")
pyplot.scatter(data["G3"],data["G3"])
pyplot.xlabel(p)
pyplot.ylabel("Final Grade")
pyplot.show()
This only gives me a straight line on the graph, though.
Thank you for any help!
I have run the following code using this data.
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from matplotlib import pyplot as plt
import pickle
from matplotlib import style
data = pd.read_csv("student-mat.csv")
# Here, I have changed the columns because "G3" was occurring twice in your selection (not "G2").
data = data[["G1", "G2", "G3", "studytime", "failures", "absences", "freetime"]]
predict = "G3"
print(data.head())
X = np.array(data.drop([predict], 1))
print(X)
y = np.array(data[predict])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
best = 0
for _ in range(3000):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
    linear = LinearRegression()
    linear.fit(X_train, y_train)
    acc = linear.score(X_test, y_test)
    print(acc)
    if acc > best:
        best = acc
        with open("studentmodel.pickle", "wb") as f:
            pickle.dump(linear, f)

pickle_in = open("studentmodel.pickle", "rb")
linear = pickle.load(pickle_in)
print('Co: \n', linear.coef_)
print('Intercept: \n', linear.intercept_)
predictions = linear.predict(X_test)
for x in range(len(predictions)):
    print(predictions[x], X_test[x], y_test[x])
p = 'G1'
style.use("ggplot")
plt.scatter(data[p], data["G3"])
plt.xlabel(p)
plt.ylabel("Final Grade")
plt.show()
This will produce the following image.
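As a side note, a tiny illustration (toy values, not the real dataset) of why the duplicated "G3" label broke the original scatter call: with a duplicated column, data["G3"] returns a two-column DataFrame, so matplotlib receives twice as many y values as x values.

import pandas as pd

# toy frame standing in for student-mat.csv (illustrative values only)
data = pd.DataFrame({"G1": [5, 6, 7], "G3": [10, 11, 12]})

# selecting the same label twice duplicates the column ...
data = data[["G1", "G3", "G3"]]

# ... so a single-label lookup now returns a DataFrame with two columns
print(data["G3"].shape)  # (3, 2) -- twice as many values as data["G1"], hence the ValueError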
I am working on KNN without using any library. The problem is that the labels are numeric:
label = [1.5171, 1.7999, 2.4493, 2.8622, 2.9961, 3.6356, 3.7742, 5.8069, 7.1357, etc.]
Each label has only one value.
I want to predict the label for a new data point, but how should I choose the winning label if there is only one value per label?
prediction = max(set(label_neighbors), key=label_neighbors.count)
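Since every label occurs only once, the target is effectively continuous, so a majority vote cannot pick a winner. A common alternative (KNN regression) is to average the labels of the k nearest neighbours, optionally weighted by 1/distance. A minimal sketch of that idea, assuming label_neighbors holds the k nearest labels and distance_neighbors their distances (the second name is mine, not from the original code):

# unweighted: average the labels of the k nearest neighbours
prediction = sum(label_neighbors) / len(label_neighbors)

# distance-weighted: closer neighbours count more (the epsilon avoids division by zero)
weights = [1.0 / (d + 1e-8) for d in distance_neighbors]
prediction = sum(w * lbl for w, lbl in zip(weights, label_neighbors)) / sum(weights)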
I'm guessing that you want to learn the mechanics of KNN, right? See the sample code below; it should do what you want.
import numpy as np
import scipy.spatial
from collections import Counter
# loading the Iris flower dataset from sklearn
from sklearn import datasets
from sklearn.model_selection import train_test_split

iris = datasets.load_iris()
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, random_state=42, test_size=0.2)

class KNN:
    def __init__(self, k):
        self.k = k

    def fit(self, X, y):
        self.X_train = X
        self.y_train = y

    def distance(self, X1, X2):
        return scipy.spatial.distance.euclidean(X1, X2)

    def predict(self, X_test):
        final_output = []
        for i in range(len(X_test)):
            d = []
            votes = []
            # distance from this test point to every training point
            for j in range(len(self.X_train)):
                dist = self.distance(self.X_train[j], X_test[i])
                d.append([dist, j])
            d.sort()
            d = d[0:self.k]
            # majority vote among the k nearest neighbours
            for dist, j in d:
                votes.append(self.y_train[j])
            ans = Counter(votes).most_common(1)[0][0]
            final_output.append(ans)
        return final_output

    def score(self, X_test, y_test):
        predictions = self.predict(X_test)
        return (predictions == y_test).sum() / len(y_test)

clf = KNN(3)
clf.fit(X_train, y_train)
prediction = clf.predict(X_test)
for i in prediction:
    print(i, end=' ')
print(prediction == y_test)
print(clf.score(X_test, y_test))
# Result:
# 1.0
Well, look at that! We got 100%! Not bad, not bad at all!!
Reference:
https://medium.com/analytics-vidhya/implementing-k-nearest-neighbours-knn-without-using-scikit-learn-3905b4decc3c
I am playing a bit with different regression models on the Boston housing dataset. I found that if I use a plain linear model or ridge regression, the predicted values have shape (102, 1), while with identical code Lasso returns shape (102,). Why is that? It then makes the code crash in the pearsonr line with ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all().
Any idea on how to make the code below run smoothly?
from sklearn.datasets import load_boston
import numpy as np
import pandas as pd
import sys
def evalOneModel(model, name, X, y, nRuns):
    allMse = []
    allR2 = []
    all_rho_P = []
    ################ OLS ################
    for i in range(nRuns):
        x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=None)
        model = model.fit(x_train, y_train)
        predictions = model.predict(x_test)
        mse = mean_squared_error(y_test, predictions)
        r2 = r2_score(y_test, predictions)
        allMse.append(mse)
        allR2.append(r2)
        print(type(y_test))
        print(y_test.shape)
        print(type(predictions))
        print(predictions.shape)
        rhoP, pval = pearsonr(y_test, predictions)
        rhoP = rhoP[0]
        all_rho_P.append(rhoP)
        print("run{}={:0.3f}; ".format(i, rhoP), end="")
        print(model.coef_)
    myTitle = "{} mean={:0.3f}".format(name, np.mean(all_rho_P))
    print("")
    print(myTitle)
    print("")
    sys.stdout.flush()
####### MAIN #####
pd.set_option('expand_frame_repr', False)
bosten_data = load_boston()
df = pd.DataFrame(bosten_data.data, columns=bosten_data.feature_names)
df['MEDV'] = bosten_data.target # add the target to the data frame
target = pd.DataFrame(bosten_data.target, columns=["MEDV"])
norm_df = (df - df.mean()) / df.std()
norm_target = (target - target.mean()) / target.std()
X = norm_df[["RM", "AGE", "PTRATIO", "LSTAT"]]
y = norm_target
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from scipy.stats import spearmanr, pearsonr
print("\n\nstarting runs ...\n")
from sklearn import linear_model
model = linear_model.LinearRegression()
evalOneModel (model, "OLS", X, y, 1)
from sklearn.linear_model import Ridge # L2
model = linear_model.Ridge(alpha=1.0)
evalOneModel (model, "Ridge (alpha=1)", X, y, 1)
from sklearn.linear_model import Lasso # L1
model = linear_model.Lasso(alpha=1.0)
evalOneModel (model, "Lasso (alpha=1)", X, y, 1)
This is the custom code
#Custom model for multiple linear regression
import numpy as np
import pandas as pd
dataset = pd.read_csv("50s.csv")
x = dataset.iloc[:,:-1].values
y = dataset.iloc[:,4:5].values
from sklearn.preprocessing import LabelEncoder
lb = LabelEncoder()
x[:,3] = lb.fit_transform(x[:,3])
from sklearn.preprocessing import OneHotEncoder
on = OneHotEncoder(categorical_features=[3])
x = on.fit_transform(x).toarray()
x = x[:,1:]
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=1/5, random_state=0)
con = np.matrix(X_train)
z = np.matrix(y_train)
#training model
result1 = con.transpose()*con
result1 = np.linalg.inv(result1)
p = con.transpose()*z
f = result1*p
l = []
for i in range(len(X_test)):
    temp = f[0]*X_test[i][0] + f[1]*X_test[i][1] + f[2]*X_test[i][2] + f[3]*X_test[i][3] + f[4]*X_test[i][4]
    l.append(temp)

import matplotlib.pyplot as plt
plt.scatter(y_test, l)
plt.show()
Then I created a model with scikit-learn and compared its results with y_test and l (the predicted values from the code above).
The comparisons are as follows:
for i in range(len(prediction)):
    print(y_test[i], prediction[i], l[i], sep=' ')
103282.38 103015.20159795816 [[116862.44205399]]
144259.4 132582.27760816005 [[118661.40080974]]
146121.95 132447.73845175043 [[124952.97891882]]
77798.83 71976.09851258533 [[60680.01036438]]
This is the comparison between y_test, the scikit-learn model's predictions, and the custom code's predictions.
Please help me improve the accuracy of the custom model.
In the plot: blue = custom model predictions, yellow = scikit-learn model predictions.
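One likely cause of the gap is that the normal-equation code fits no intercept, while scikit-learn's LinearRegression includes one by default. A hedged sketch of adding a bias column before solving the normal equation (X_train_b, X_test_b, and theta are illustrative names; the rest follows the variables above):

import numpy as np

# prepend a column of ones so the normal equation also estimates an intercept
X_train_b = np.hstack([np.ones((X_train.shape[0], 1)), X_train])
X_test_b = np.hstack([np.ones((X_test.shape[0], 1)), X_test])

# theta = (X^T X)^(-1) X^T y ; theta[0] is the intercept, theta[1:] are the slopes
theta = np.linalg.inv(X_train_b.T @ X_train_b) @ X_train_b.T @ y_train

# predictions for the test set, now including the bias term
l = X_test_b @ theta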