Pickling an XGBoost model makes it give different results when predicting - python

I've tried to reproduce the issue with synthetic data with no luck; it happens with a particular (private) dataset.
Data generation
import pickle as pkl
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor

NOISE = 25
samples = 1000

def real_vs_fitted(reg, X, y):
    fitted = reg.predict(X)
    real = y
    residual = real - fitted
    plt.scatter(fitted, residual)

np.random.seed(42)

# Data
A = np.random.randint(1, 100, samples)
B = np.random.randint(1, 100, samples)
C = np.random.randint(1, 100, samples)
X = np.stack([A, B, C]).T
y = 3*A + 10*np.log(B) + A*B*C + np.random.normal(0, NOISE, len(X))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
Training and serializing
reg = XGBRegressor()
reg.fit(X_train, y_train)
real_vs_fitted(reg, X_test, y_test)
print(f'Reg object Hash: {hash(reg)}')

with open('model.pkl', 'wb') as f:
    pkl.dump(reg, f)
Deserializing
with open("model.pkl", "rb") as f:
reg = pkl.load(f)
real_vs_fitted(reg,X_test,y_test)
print(f'Reg object Hash: {hash(reg)}')
The hashes are different (I'm assuming it's because some attributes are not saved when serializing, but maybe not).
The per-row hash of X_test (pd.util.hash_pandas_object) is the same too.
reg.get_booster().trees_to_dataframe() seems to be the same too.
Any clues?
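One note that may help anyone debugging the same thing: hash(reg) is not a meaningful comparison here, because the original object and the unpickled copy are distinct Python objects and will generally hash differently even if they wrap identical boosters. A minimal sketch of a more direct check, reusing the reg and X_test objects from the snippets above, is to compare the raw booster bytes and the actual predictions:

import numpy as np
import pickle as pkl

# Predictions from the in-memory model
pred_before = reg.predict(X_test)

# Round-trip through pickle in memory
blob = pkl.dumps(reg)
reg_loaded = pkl.loads(blob)
pred_after = reg_loaded.predict(X_test)

# Compare the serialized booster bytes and the predictions themselves
same_bytes = reg.get_booster().save_raw() == reg_loaded.get_booster().save_raw()
same_preds = np.allclose(pred_before, pred_after)
print(same_bytes, same_preds)

If both come back True, the difference you are seeing is not caused by pickling itself and is more likely in the data fed to predict.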

Related

Make a function that loads the model, takes a vector of data, and makes a prediction

I'm trying to train a regression model on the Boston Housing dataset, save the model to disk, and then write the function described in the title.
The code below is working:
import pandas as pd
import numpy as np
import os
import pickle
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from joblib import dump, load

boston = load_boston()
boston_df = pd.DataFrame(boston.data, columns=boston.feature_names)
y = pd.DataFrame(boston.target)
x_train, x_test, y_train, y_test = train_test_split(boston_df, y, test_size=0.2, random_state=17)

model = LinearRegression()
model.fit(x_train, y_train)
y_predict = model.predict(x_test)
mse = mean_squared_error(y_test, y_predict)

with open('model.pkl', 'wb') as file:
    pickle.dump(model, file)

del model

with open('model.pkl', 'rb') as file:
    lin_model = pickle.load(file)
But when I add the function below, it's supposed to work:
def predicting(x):
    x = pd.DataFrame(x)
    with open('model.pkl', 'rb') as file:
        lin_model = pickle.load(file)
    pred = lin_model.predict(x)
    print(pred)
But I don't know what structure 'x' needs to have for this function to work!
x = [1,2,4,5,6,3,5,6,7,3,5,6,4]
predicting(x)
Called like that, it says ValueError: matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 13 is different from 1)
Please help?
import pandas as pd
import numpy as np
import os
import pickle
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from joblib import dump, load
from sklearn.datasets import make_blobs

boston = load_boston()
boston_df = pd.DataFrame(boston.data, columns=boston.feature_names)
y = pd.DataFrame(boston.target)
x_train, x_test, y_train, y_test = train_test_split(boston_df, y, test_size=0.2, random_state=17)

model = LinearRegression()
model.fit(x_train, y_train)
y_predict = model.predict(x_test)
mse = mean_squared_error(y_test, y_predict)

with open('model.pkl', 'wb') as file:
    pickle.dump(model, file)

del model

# New data with the same number of features (13) as the training set
x, _ = make_blobs(n_samples=3, centers=2, n_features=13, random_state=1)

def predicting(x):
    with open('model.pkl', 'rb') as file:
        lin_model = pickle.load(file)
    ynew = lin_model.predict(x)
    for i in range(len(x)):
        print("X=%s, Predicted=%s" % (x[i], ynew[i]))

predicting(x)
I ended up using it without the final print, but I saw it and thought it was pretty. This is what I needed! :)
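For context on the original error (a sketch of my own, not part of the accepted answer): the Boston model is fitted on 13 features, so predict expects a 2D array of shape (n_samples, 13). Wrapping a flat list of 13 values in pd.DataFrame produces a 13x1 frame instead of 1x13, which is exactly the matmul dimension mismatch reported. Passing the list as a single row avoids it; the function name predicting_one below is just illustrative:

import pickle
import pandas as pd

def predicting_one(x):
    # x is a flat list of 13 feature values; make it one row with 13 columns
    x = pd.DataFrame([x])          # equivalently: np.array(x).reshape(1, -1)
    with open('model.pkl', 'rb') as file:
        lin_model = pickle.load(file)
    print(lin_model.predict(x))

predicting_one([1, 2, 4, 5, 6, 3, 5, 6, 7, 3, 5, 6, 4])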

100% error rate on test set with one class svm

I am trying to detect outlier images. But I'm getting bizarre results from the model.
I've read in the images with cv2, flattened them into 1d-arrays, and turned them into a pandas dataframe and then fed that into the SVM.
import numpy as np
import cv2
import glob
import pandas as pd
import sys, os
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import *
import seaborn as sns
Load the labels and files:
labels_wt = np.loadtxt("labels_wt.txt", delimiter="\t", dtype="str")
files_wt = np.loadtxt("files_wt.txt", delimiter="\t", dtype="str")
Load and flatten the images:
wt_images_tmp = [cv2.imread(file) for file in files_wt]
wt_images = [image.flatten() for image in wt_images_tmp]
tmp3 = np.array(wt_images)
mutant_images_tmp = [cv2.imread(file) for file in files_mut]
mutant_images = [image.flatten() for image in mutant_images_tmp]
tmp4 = np.array(mutant_images)
X = pd.DataFrame(tmp3) #load the wild-type images
y = pd.Series(labels_wt)
X_train, X_test, y_train, y_test= train_test_split(X, y, test_size=0.2, random_state=42)
X_outliers = pd.DataFrame(tmp4)
clf = svm.OneClassSVM(nu=0.15, kernel="rbf", gamma=0.0001)
clf.fit(X_train)
Then I evaluate the results according to the scikit-learn tutorial on one-class SVM.
y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)
y_pred_outliers = clf.predict(X_outliers)
n_error_train = y_pred_train[y_pred_train == -1].size
n_error_test = y_pred_test[y_pred_test == -1].size
n_error_outliers = y_pred_outliers[y_pred_outliers == 1].size
print(n_error_train / len(y_pred_train))
print(float(n_error_test) / float(len(y_pred_test)))
print(n_error_outliers / len(y_pred_outliers))
My error rates on the training set have been variable (10-30%), but on the test set they have never gone below 100%. Am I doing this wrong?
My guess is that you are setting random_state=42; this biases your train_test_split to always produce the same splitting pattern. You can read more about it in this answer. Don't specify any state and run the code again:
X_train, X_test, y_train, y_test= train_test_split(X, y, test_size=0.2)
This will show different results. Once you are sure this works, make sure you then do cross-validation, possibly using k-fold validation. Let us know if this helps.
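A minimal sketch of the k-fold style check suggested above (the fold count and variable names are illustrative, not from the original answer): refit the one-class SVM on each training fold and look at the error rate on the held-out fold, so a single unlucky split can be ruled out.

from sklearn.model_selection import KFold
from sklearn import svm

kf = KFold(n_splits=5, shuffle=True)
for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    clf = svm.OneClassSVM(nu=0.15, kernel="rbf", gamma=0.0001)
    clf.fit(X.iloc[train_idx])
    val_pred = clf.predict(X.iloc[val_idx])
    err = (val_pred == -1).mean()   # fraction of held-out inliers flagged as outliers
    print(f"fold {fold}: held-out error rate = {err:.2f}")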

import csv to OrderedDict and predict using regression

I built a regression model to predict energy (1 column) from 5 variables (5 columns). I used my experimental data to train and fit the model, and it works with a good score.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

data = pd.read_csv('new.csv')
X = data.drop(['E'], axis=1)
y = data['E']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=2)

from sklearn import ensemble
clf1 = ensemble.GradientBoostingRegressor(n_estimators=400, max_depth=5,
                                          min_samples_split=2, loss='ls',
                                          learning_rate=0.1)
clf1.fit(X_train, y_train)
clf1.score(X_test, y_test)
But now I want to read a new CSV file containing new data for the 5 variables mentioned above into an OrderedDict and use the model to predict energy.
With the code below I manually insert a row and it predicts energy correctly:
from collections import OrderedDict
new_data = OrderedDict([('H', 48.52512), ('A', 169.8379), ('P', 55.52512),
                        ('R', 3.058758), ('Q', 2038.055)])
new_data = pd.Series(new_data)
data = new_data.values.reshape(1, -1)
clf1.predict(data)
But I can't do this with huge datasets and need to import a CSV file. I tried the code below but can't figure it out:
data_2 = pd.read_csv('new2.csv')
X_new = OrderedDict(data_2)
new_data = pd.Series(X_new)
data = new_data.values.reshape(1, -1)
clf1.predict(data)
But it gives me: ValueError: setting an array element with a sequence.
Can anyone help me?
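A minimal sketch of what likely resolves this (my suggestion, not from the original thread): wrapping a multi-column DataFrame in an OrderedDict/Series and then calling reshape(1, -1) turns many rows into a single ragged row of sequences, which is what triggers the "setting an array element with a sequence" error. The fitted model can take the new DataFrame's values directly, giving one prediction per row. The column names and file name below are assumed to match the training data:

import pandas as pd

# Assumes new2.csv has the same 5 feature columns (H, A, P, R, Q) used for training
data_2 = pd.read_csv('new2.csv')

# Keep the same column order the model was trained with
data_2 = data_2[['H', 'A', 'P', 'R', 'Q']]

predictions = clf1.predict(data_2.values)   # shape (n_rows,), one energy value per row
print(predictions)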

How to increase the model accuracy of multiple linear regression

This is the custom code
# Custom model for multiple linear regression
import numpy as np
import pandas as pd

dataset = pd.read_csv("50s.csv")
x = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 4:5].values

from sklearn.preprocessing import LabelEncoder
lb = LabelEncoder()
x[:, 3] = lb.fit_transform(x[:, 3])

from sklearn.preprocessing import OneHotEncoder
# Note: categorical_features works on older scikit-learn; newer versions use ColumnTransformer
on = OneHotEncoder(categorical_features=[3])
x = on.fit_transform(x).toarray()
x = x[:, 1:]   # drop one dummy column to avoid the dummy variable trap

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=1/5, random_state=0)

con = np.matrix(X_train)
z = np.matrix(y_train)

# Training: solve the normal equations f = (X^T X)^-1 X^T y
result1 = con.transpose() * con
result1 = np.linalg.inv(result1)
p = con.transpose() * z
f = result1 * p

l = []
for i in range(len(X_test)):
    temp = f[0]*X_test[i][0] + f[1]*X_test[i][1] + f[2]*X_test[i][2] + f[3]*X_test[i][3] + f[4]*X_test[i][4]
    l.append(temp)

import matplotlib.pyplot as plt
plt.scatter(y_test, l)
plt.show()
Then I created a model with scikit-learn and compared its results with y_test and l (the predicted values from the code above).
The comparisons are as follows:
for i in range(len(prediction)):
    print(y_test[i], prediction[i], l[i], sep=' ')
103282.38 103015.20159795816 [[116862.44205399]]
144259.4 132582.27760816005 [[118661.40080974]]
146121.95 132447.73845175043 [[124952.97891882]]
77798.83 71976.09851258533 [[60680.01036438]]
These are the comparisons between y_test, the scikit-learn model predictions, and the custom-code predictions.
Please help me improve the accuracy of the model.
Blue: custom model predictions
Yellow: scikit-learn model predictions
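One likely cause of the gap (my observation, not from the original thread): the normal-equation solution above has no intercept term, while scikit-learn's LinearRegression fits one by default. A minimal sketch of adding it, reusing X_train, X_test, and y_train from the code above:

import numpy as np

# Prepend a column of ones so the first coefficient acts as the intercept
X_train_b = np.hstack([np.ones((len(X_train), 1)), X_train])
X_test_b = np.hstack([np.ones((len(X_test), 1)), X_test])

# Least-squares solution of the normal equations (lstsq is more stable than an explicit inverse)
f, *_ = np.linalg.lstsq(X_train_b, y_train, rcond=None)

l = X_test_b @ f   # predictions, one per test row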

Python scikit-learn to JSON

I have a model built with Python scikit-learn. I understand that models can be saved in Pickle or Joblib formats. Are there any existing methods out there to save the models in JSON format? Please see the model build code below for reference:
import pandas
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
import pickle
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data"
names =['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = pandas.read_csv(url, names=names)
array = dataframe.values
X = array[:,0:8]
Y = array[:,8]
test_size = 0.33
seed = 7
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(X, Y, test_size=test_size, random_state=seed)
# Fit the model on 33%
model = LogisticRegression()
model.fit(X_train, Y_train)
filename = 'finalized_model.sav'
pickle.dump(model, open(filename, 'wb'))
You'll have to cook up your own serialization/deserialization recipe. Fortunately, logistic regression can basically be captured by its coefficients and intercept. However, the LogisticRegression object keeps some other metadata around which we might as well capture. I threw together the following functions that do the dirty work. Keep in mind, this is still rough:
import numpy as np
import json
from sklearn.linear_model import LogisticRegression

def logistic_regression_to_json(lrmodel, file=None):
    if file is not None:
        serialize = lambda x: json.dump(x, file)
    else:
        serialize = json.dumps
    data = {}
    data['init_params'] = lrmodel.get_params()
    data['model_params'] = mp = {}
    for p in ('coef_', 'intercept_', 'classes_', 'n_iter_'):
        mp[p] = getattr(lrmodel, p).tolist()
    return serialize(data)

def logistic_regression_from_json(jstring):
    data = json.loads(jstring)
    model = LogisticRegression(**data['init_params'])
    for name, p in data['model_params'].items():
        setattr(model, name, np.array(p))
    return model
Note: with just 'coef_', 'intercept_', and 'classes_' you could do the predictions yourself; since logistic regression is a straightforward linear model, it's simply matrix multiplication.
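A quick round-trip check of the functions above, assuming the model and X_test fitted in the question (the JSON file name is just illustrative):

import numpy as np

# Serialize the fitted model to a JSON string and rebuild it
jstring = logistic_regression_to_json(model)
restored = logistic_regression_from_json(jstring)

# The restored model should predict identically on the held-out data
print(np.array_equal(model.predict(X_test), restored.predict(X_test)))

# Writing to / reading from a file works the same way
with open('finalized_model.json', 'w') as f:
    logistic_regression_to_json(model, file=f)
with open('finalized_model.json') as f:
    restored = logistic_regression_from_json(f.read())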
