This is the custom code:
#Custom model for multiple linear regression
import numpy as np
import pandas as pd
dataset = pd.read_csv("50s.csv")
x = dataset.iloc[:,:-1].values
y = dataset.iloc[:,4:5].values
from sklearn.preprocessing import LabelEncoder
lb = LabelEncoder()
x[:,3] = lb.fit_transform(x[:,3])
from sklearn.preprocessing import OneHotEncoder
# note: the categorical_features argument was removed in newer scikit-learn versions; ColumnTransformer is the modern replacement
on = OneHotEncoder(categorical_features=[3])
x = on.fit_transform(x).toarray()
x = x[:,1:]  # drop one dummy column to avoid the dummy variable trap
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=1/5, random_state=0)
con = np.matrix(X_train)
z = np.matrix(y_train)
# training the model via the normal equation: f = (X^T X)^-1 X^T y
result1 = con.transpose()*con
result1 = np.linalg.inv(result1)
p = con.transpose()*z
f = result1*p
l = []
for i in range(len(X_test)):
    temp = f[0]*X_test[i][0] + f[1]*X_test[i][1] + f[2]*X_test[i][2] + f[3]*X_test[i][3] + f[4]*X_test[i][4]
    l.append(temp)
import matplotlib.pyplot as plt
plt.scatter(y_test,l)
plt.show()
Then I created a model with scikit-learn and compared its results with y_test and l (the predicted values from the code above).
The comparisons are as follows:
for i in range(len(prediction)):
    print(y_test[i], prediction[i], l[i], sep=' ')
103282.38 103015.20159795816 [[116862.44205399]]
144259.4 132582.27760816005 [[118661.40080974]]
146121.95 132447.73845175043 [[124952.97891882]]
77798.83 71976.09851258533 [[60680.01036438]]
These are the comparisons between y_test, the scikit-learn model predictions, and the custom-code predictions.
Please help me with the accuracy of the model.
Blue: custom model predictions
Yellow: scikit-learn model predictions
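A likely source of the gap: scikit-learn's LinearRegression fits an intercept by default, while the normal-equation code above never adds a bias column. A minimal sketch of the same closed-form solution with an intercept column, reusing X_train, X_test and y_train from above:
import numpy as np
# prepend a column of ones so the first coefficient acts as the intercept
Xb_train = np.hstack([np.ones((X_train.shape[0], 1)), X_train])
Xb_test = np.hstack([np.ones((X_test.shape[0], 1)), X_test])
# normal equation: f = (X^T X)^{-1} X^T y
f = np.linalg.inv(Xb_train.T @ Xb_train) @ Xb_train.T @ y_train
preds = Xb_test @ f  # one prediction per test row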
Related
Given this model:
import numpy as np
import pandas as pd
import xgboost
from xgboost import XGBClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
import graphviz
X, y = make_classification(n_samples=1000, n_features=10,n_informative=3, n_redundant=5, random_state=42)
df = pd.DataFrame(data=X)
df.columns = 'X' + (df.columns+1).astype(str)
df[df.columns[-3:]] = df[df.columns[-3:]].astype(int)
df['Y'] = y
X_train, X_test, y_train, y_test = train_test_split(df.drop('Y', axis=1), df['Y'], test_size=0.3, random_state=42)
n_negative_class = y_train.value_counts().sort_index()[0]
n_positive_class = y_train.value_counts().sort_index()[1]
xgb = XGBClassifier(random_state=42, n_estimators=50,
                    scale_pos_weight=n_negative_class/n_positive_class,
                    use_label_encoder=False)
xgb.fit(X_train, y_train, eval_metric="auc")
y_train_scores = xgb.predict_proba(X_test)[:,1]
xgboost.to_graphviz(xgb, num_trees=49)
How can I plot the final tree used by xgb.predict_proba(X_test)[:,1]? Is it necessarily the last one (since each XGBoost tree learns from the previous ones)? Or does XGBoost choose one tree among those 50 estimators based on the loss or the given eval_metric?
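For reference, predict_proba does not pick a single tree: for a binary logistic objective it sums the leaf outputs of all 50 boosting rounds and passes the sum through the sigmoid. A minimal sketch (not from the original post) that renders every round with to_graphviz, assuming the fitted xgb above:
import xgboost
# write tree_0 ... tree_49 as PNG files, one per boosting round
for i in range(xgb.n_estimators):
    graph = xgboost.to_graphviz(xgb, num_trees=i)
    graph.render(filename=f"tree_{i}", format="png", cleanup=True)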
Trying to fit a linear kernel ridge regression model on a dataset with 8 features.
import pandas as pd
import urllib.request
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/concrete/compressive/Concrete_Data.xls'
urllib.request.urlretrieve(url, './Concrete_Data.xls')
data = pd.read_excel('./Concrete_Data.xls')
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
new_col_names = ["Cement", "BlastFurnaceSlag", "FlyAsh", "Water", "Superplasticizer","CoarseAggregate", "FineAggregate", "Age", "CC_Strength"]
curr_col_names = list(data.columns)
mapper = {}
for i, name in enumerate(curr_col_names):
    mapper[name] = new_col_names[i]
data = data.rename(columns=mapper)
X = data.iloc[:,:-1]
y = data.iloc[:,-1]
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)
from sklearn.kernel_ridge import KernelRidge
kr = KernelRidge(alpha=1.0)
kr.fit(x_train, y_train)
y_pred_kr = kr.predict(y_test)
When I try to run the code, I get an error saying that a 2D array was expected but a 1D array was passed. Could someone let me know what I am possibly doing wrong?
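A minimal sketch of the likely fix, reusing the variables above: predict expects the scaled feature matrix x_test of shape (n_samples, 8), not the 1-D target vector y_test:
y_pred_kr = kr.predict(x_test)  # predict from the 8 scaled features, not the target
from sklearn.metrics import mean_squared_error, r2_score
print("MSE:", mean_squared_error(y_test, y_pred_kr))
print("R^2:", r2_score(y_test, y_pred_kr))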
I have been writing this code and I have gotten to the point where it runs, but unfortunately it does not converge. Could someone please have a look, because I have checked many things and I am not sure why it isn't converging. The data set is from here: https://github.com/nshomron/covidpred/blob/master/data/corona_tested_individuals_ver_006.english.csv.zip
I have split the code up to make it a bit clearer:
#---------- IMPORTS ----------
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import tree
from sklearn import svm
#---------- PREPROCESSING ----------
#---------- Import data ----------
data = pd.read_csv(r'C:\Users\Saaqib\Documents\Python\PythonProjects\Covidproject\corona_tested_individuals.csv')
X = data.loc[:, data.columns != 'corona_result']
X = X.loc[:, X.columns != 'test_date']
y = data.iloc[:,6]
#---------- Encode data ----------
Le_X = LabelEncoder()
X['age_60_and_above'] = Le_X.fit_transform(X['age_60_and_above'])
X['gender'] = Le_X.fit_transform(X['gender'])
X['test_indication'] = Le_X.fit_transform(X['test_indication'])
# print('data=',X)
y = Le_X.fit_transform(y)
y = np.array(y)
Hot_enc_X = OneHotEncoder()
enc_X = pd.DataFrame(Hot_enc_X.fit_transform(X[['gender','test_indication']]).toarray())
X = X.join(enc_X)
X = X.drop(columns=['gender','test_indication'])
X = X.replace("None", float('nan'))
X["cough"] = X["cough"].fillna(0)
X["fever"] = X["fever"].fillna(0)
X["sore_throat"] = X["sore_throat"].fillna(0)
X["shortness_of_breath"] = X["shortness_of_breath"].fillna(0)
X["head_ache"] = X["head_ache"].fillna(0)
X["age_60_and_above"] = X["age_60_and_above"].fillna(0)
X['cough'] = X['cough'].astype(float)
X['fever'] = X['fever'].astype(float)
X['sore_throat'] = X['sore_throat'].astype(float)
X['shortness_of_breath'] = X['shortness_of_breath'].astype(float)
X['head_ache'] = X['head_ache'].astype(float)
X['age_60_and_above'] = X['age_60_and_above'].astype(float)
#---------- Split data set ----------
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=0)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
#---------- Train Model ----------
covid_model = svm.SVC(kernel='linear')
covid_model.fit(X_train, y_train)
predictions = covid_model.predict(X_test)
acc = accuracy_score(y_test,predictions)
print("pred:", predictions)
print("acc:", acc)
I am playing a bit with different regression models on the Boston housing dataset. I found that if I use a plain linear model or ridge regression, the predicted values have shape (102, 1), while the identical code with Lasso gives shape (102,). Why is that? It then crashes in the pearsonr line with ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all().
Any idea how to make the code below run smoothly?
from sklearn.datasets import load_boston  # note: load_boston was removed in scikit-learn 1.2
import numpy as np
import pandas as pd
import sys
def evalOneModel (model, name, X, y, nRuns):
    allMse = []
    allR2 = []
    all_rho_P = []
    ################ OLS ################
    for i in range(nRuns):
        x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=None)
        model = model.fit(x_train, y_train)
        predictions = model.predict(x_test)
        mse = mean_squared_error(y_test, predictions)
        r2 = r2_score(y_test, predictions)
        allMse.append(mse)
        allR2.append(r2)
        print(type(y_test))
        print(y_test.shape)
        print(type(predictions))
        print(predictions.shape)
        rhoP, pval = pearsonr(y_test, predictions)
        rhoP = rhoP[0]
        all_rho_P.append(rhoP)
        print("run{}={:0.3f}; ".format(i, rhoP), end="")
        print(model.coef_)
    myTitle = "{} mean={:0.3f}".format(name, np.mean(all_rho_P))
    print("")
    print(myTitle)
    print("")
    sys.stdout.flush()
####### MAIN #####
pd.set_option('expand_frame_repr', False)
bosten_data = load_boston()
df = pd.DataFrame(bosten_data.data, columns=bosten_data.feature_names)
df['MEDV'] = bosten_data.target # add the target to the data frame
target = pd.DataFrame(bosten_data.target, columns=["MEDV"])
norm_df = (df - df.mean()) / df.std()
norm_target = (target - target.mean()) / target.std()
X = norm_df[["RM", "AGE", "PTRATIO", "LSTAT"]]
y = norm_target
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from scipy.stats import spearmanr, pearsonr
print("\n\nstarting runs ...\n")
from sklearn import linear_model
model = linear_model.LinearRegression()
evalOneModel (model, "OLS", X, y, 1)
from sklearn.linear_model import Ridge # L2
model = linear_model.Ridge(alpha=1.0)
evalOneModel (model, "Ridge (alpha=1)", X, y, 1)
from sklearn.linear_model import Lasso # L1
model = linear_model.Lasso(alpha=1.0)
evalOneModel (model, "Lasso (alpha=1)", X, y, 1)
I have a simple KNN classification problem. The output of the code below is the accuracy of the classifier after training it and splitting the dataset into "train" and "test" sets.
What I want my system to be like is:
First, train the classifier using dataset;
Upload an image from URL;
Classify it according to the dataset.
For example, the output should be "class 1". I believe it's simple, but I am pretty new to Python.
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=5)
dataset = pd.read_csv(fdes)
X = dataset.iloc[:,:20].values
y = dataset['target'].values
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20)
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
neigh.fit(X_train, y_train)
# Predicting the Test set results
y_pred = neigh.predict(X_test)
y_compare = np.vstack((y_test,y_pred)).T
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
#finding accuracy from the confusion matrix.
a = cm.shape
corrPred = 0
falsePred = 0
# printing results
for row in range(a[0]):
    for c in range(a[1]):
        if row == c:
            corrPred += cm[row,c]
        else:
            falsePred += cm[row,c]
kernelRbfAccuracy = corrPred/(cm.sum())
print ('Accuracy of knn : ', corrPred/(cm.sum()))
After all those steps, you can continue with:
from io import BytesIO
import numpy as np
import requests
from PIL import Image
response = requests.get(url)
img = Image.open(BytesIO(response.content))
img = np.array(img).reshape(1, -1)  # flatten the image into a single sample row
img = sc.transform(img)             # apply the same scaling that was fitted on X_train
output_class = neigh.predict(img)[0]
print(output_class)
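One caveat, assuming the setup above: neigh was trained on the 20 columns of the CSV, so the downloaded image has to be turned into exactly that many feature values before prediction. In the sketch below, img_to_features is a hypothetical helper standing in for whatever feature extraction produced those 20 columns:
raw_image = Image.open(BytesIO(response.content))
features = np.asarray(img_to_features(raw_image), dtype=float).reshape(1, -1)  # img_to_features is hypothetical
assert features.shape[1] == X_train.shape[1]  # must match the 20 training columns
print(neigh.predict(sc.transform(features))[0])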