I want to select a model using the following code:
import numpy as np
import pandas as pd
from math import log
from sklearn import model_selection
from sklearn.model_selection import train_test_split as split
from sklearn.metrics import accuracy_score as accuracy
import xgboost as xgb
dataframe_training = pd.read_csv("train.csv")
train_tag=dataframe_training['tags']
train_dummies_tags= train_tag.str.get_dummies(",")
dataframe_training=dataframe_training.filter(items=['review_ratio','log_date_difference','log_price'])
x_train=pd.concat([dataframe_training,train_dummies_tags], axis=1, sort=False)
y_train=dataframe_training.filter(items=['playtime_forever'])
Xtrain, Xtest, Ytrain, Ytest = split(x_train,y_train, test_size=0.25, random_state=7)
from sklearn.ensemble import *
AllRegressorModel = [xgb.XGBRegressor,AdaBoostRegressor,BaggingRegressor,ExtraTreesRegressor,GradientBoostingRegressor,RandomForestRegressor]
def Model_Selection_By_Cross_Valid():
    ThisRound_SelectedModel = None,
    ThisRound_SelectedModel_Name = None,
    ThisRound_SelectedModel_Score = None,
    for temp_select_model_name in AllRegressorModel:
        kfold = model_selection.KFold(n_splits=10, random_state=7),
        print (kfold),
        temp_model= temp_select_model_name(),
        temp_model.fit(Xtrain, Ytrain.ravel()),
        results = model_selection.cross_val_score(temp_model, X_train, Y_train.ravel(), cv=kfold, scoring='neg_mean_squared_error'),
        print(temp_select_model_name,results.mean()),
        if (ThisRound_SelectedModel == None) or (abs(results.mean()) < ThisRound_SelectedModel_Score):
            ThisRound_SelectedModel = temp_model,
            ThisRound_SelectedModel_Name = temp_select_model_name,
            ThisRound_SelectedModel_Score = abs(results.mean()),
        print ("This round Model Name: ", temp_model,"MSE Score: ",abs(results.mean())),
        print ("This Model Feature Importance",temp_model.feature_importances_),
        print("This Model Do No Have Feature Importance......"),
        print ("<----------------------------------->"),
    print ("Selected Model Name:", ThisRound_SelectedModel, "MSE Score:",ThisRound_SelectedModel_Score),
    return {"ModelName": ThisRound_SelectedModel_Name,"Model": ThisRound_SelectedModel}
When I run the program with
SelectedModel = Model_Selection_By_Cross_Valid()
the error AttributeError: 'tuple' object has no attribute 'fit' is shown.
How can I solve the problem?
Thank you very much.
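For what it's worth, the traceback points at the trailing commas: almost every statement in the function ends with a comma, which wraps the assigned value in a one-element tuple. In particular, temp_model = temp_select_model_name(), binds temp_model to a tuple, so temp_model.fit raises exactly this AttributeError. A minimal sketch of the function with the commas removed; it also uses Xtrain/Ytrain consistently (X_train/Y_train are never defined) and calls .values.ravel(), since a DataFrame has no .ravel():

def Model_Selection_By_Cross_Valid():
    selected_model, selected_name, selected_score = None, None, None
    for model_cls in AllRegressorModel:
        # shuffle=True so random_state actually takes effect
        kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=7)
        temp_model = model_cls()  # a regressor instance, not a tuple
        # cross_val_score fits clones internally, so no separate fit is needed
        results = model_selection.cross_val_score(
            temp_model, Xtrain, Ytrain.values.ravel(),
            cv=kfold, scoring='neg_mean_squared_error')
        print(model_cls, results.mean())
        if selected_score is None or abs(results.mean()) < selected_score:
            selected_model, selected_name, selected_score = temp_model, model_cls, abs(results.mean())
    print("Selected Model Name:", selected_name, "MSE Score:", selected_score)
    return {"ModelName": selected_name, "Model": selected_model}

Separately, note that dataframe_training is overwritten with only three columns before y_train is taken from it, so 'playtime_forever' should be pulled out of the original DataFrame before that filter.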
In my program,
from sklearn import linear_model
from sklearn.preprocessing import StandardScaler
from keras.wrappers.scikit_learn import KerasClassifier,KerasRegressor
import eli5
from eli5.sklearn import PermutationImportance
def MLR_pImportance(sc_mlr_tube_par_X_train, sc_mlr_tube_par_X_test, sc_mlr_tube_eff_Y_train, sc_mlr_tube_eff_Y_test):
    mlr = linear_model.LinearRegression()
    mlr.fit(sc_mlr_tube_par_X_train, sc_mlr_tube_eff_Y_train)
    perm = PermutationImportance(mlr, random_state=1).fit(sc_mlr_tube_par_X_test, sc_mlr_tube_eff_Y_test)
    print(perm.feature_importances_)
    print(perm.feature_importances_std_)
    eli5.show_weights(perm)

if __name__ == '__main__':
    (cnn_tube_par_X_train, cnn_tube_par_X_test, cnn_tube_Y_train, cnn_tube_Y_test) = read_file()
    sc_X = StandardScaler()
    sc_Y = StandardScaler()
    sc_cnn_tube_par_X_train = sc_X.fit_transform(cnn_tube_par_X_train.iloc[:, 1:6].values)
    sc_cnn_tube_par_X_test = sc_X.transform(cnn_tube_par_X_test.iloc[:, 1:6].values)
    sc_cnn_tube_eff_Y_train = sc_Y.fit_transform(cnn_tube_Y_train.iloc[:, -1:].values)
    sc_cnn_tube_eff_Y_test = sc_Y.transform(cnn_tube_Y_test.iloc[:, -1:].values)
    MLR_pImportance(sc_cnn_tube_par_X_train, sc_cnn_tube_par_X_test, sc_cnn_tube_eff_Y_train, sc_cnn_tube_eff_Y_test)
The results show that:
[0.63895352 0.1270582 0.06904505 0.32131836 0.02549574]
[0.02766096 0.01535046 0.01789114 0.02761288 0.01048179]
These are the results of
print(perm.feature_importances_)
print(perm.feature_importances_std_)
but the statement
eli5.show_weights(perm)
shows nothing. Could you tell me the reason, and how to solve it?
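A likely reason: eli5.show_weights returns an IPython HTML display object, which only renders when it is the last expression of a Jupyter notebook cell. In a plain Python script nothing is printed, which matches what you are seeing. A minimal sketch of two workarounds:

# In a script, format the explanation as text and print it:
print(eli5.format_as_text(eli5.explain_weights(perm)))

# Or dump the HTML to a file and open it in a browser
# (.data is the raw HTML string held by the IPython HTML object):
with open('perm_importance.html', 'w') as f:
    f.write(eli5.show_weights(perm).data)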
I am a newbie in machine learning model deployment and have been trying to deploy a simple machine learning model for car price prediction with Flask and Heroku. I made the model using an sklearn pipeline and transformer, and the code runs perfectly in the Jupyter Notebook.
Deploying it to Heroku through GitHub shows that the build succeeded, but on launching the application it shows an application error.
It seems something is wrong with the app.py code for Flask. Any help or insight would be really helpful. Thank you.
Jupyter Notebook Code:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
data = pd.read_csv('car data.csv')
data['current_year'] = 2020
data['car_age'] = data['current_year'] - data['Year']
num_features = [col for col in data.columns if data[col].dtype != 'O' and col != 'Selling_Price']
cat_features = [col for col in data.columns if data[col].dtype == 'O']
X= data.drop(['Selling_Price'], axis=1)
y = data['Selling_Price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
num_transformer = Pipeline(steps = [('scaler', StandardScaler())])
cat_transformer = Pipeline(steps =[('OneHot', OneHotEncoder(handle_unknown='ignore'))])
preprocessor = ColumnTransformer(transformers = [('numerical_transformer', num_transformer, num_features),
('categorical_transformer', cat_transformer, cat_features)])
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor()
grid_param = {'n_estimators' :[100, 200, 500, 800, 1000]}
grid = GridSearchCV(estimator = rf, param_grid= grid_param)
model = Pipeline(steps= [('preprocessor',preprocessor), ('grid_regressor', grid)])
model.fit(X_train, y_train)
y_predict = model.predict(X_test)
from sklearn.metrics import mean_absolute_error, mean_squared_error
MAE = mean_absolute_error(y_test, y_predict)
MSE = mean_squared_error(y_test, y_predict)
RMSE = np.sqrt(MSE)
import pickle
# Use a context manager so the pickle file is flushed and closed
with open('regression_model.pkl', 'wb') as file:
    pickle.dump(model, file)
Code for the app.py file for Flask:
from flask import Flask, render_template, request, jsonify
import pickle
import numpy as np
import sklearn
model = pickle.load(open('regression_model.pkl', 'rb'))
app = Flask(__name__)
@app.route('/', methods=['GET'])
def Home():
    return render_template('index.html')

@app.route("/predict", methods=['POST'])
def predict():
    if request.method == 'POST':
        Year = int(request.form['Year'])
        Selling_Price = float(request.form['Selling_Price'])
        Present_Price = float(request.form['Present_Price'])
        Kms_Driven = int(request.form['Kms_Driven'])
        Owner = int(request.form['Owner'])
        car_age = 2020 - Year
        Fuel_Type_Petrol = request.form['Fuel_Type_Petrol']
        if Fuel_Type_Petrol == 'Petrol':
            Fuel_Type_Petrol = 1
            Fuel_Type_Diesel = 0
        elif Fuel_Type_Petrol == 'Diesel':
            Fuel_Type_Petrol = 0
            Fuel_Type_Diesel = 1
        else:
            Fuel_Type_Petrol = 0
            Fuel_Type_Diesel = 0
        Seller_Type_Individual = request.form['Seller_Type_Individual']
        if Seller_Type_Individual == 'Individual':
            Seller_Type_Individual = 1
        else:
            Seller_Type_Individual = 0
        Transmission_Mannual = request.form['Transmission_Mannual']
        if Transmission_Mannual == 'Mannual':
            Transmission_Mannual = 1
        else:
            Transmission_Mannual = 0
        data = [Selling_Price, Present_Price, Kms_Driven, Owner, Fuel_Type_Petrol,
                Fuel_Type_Diesel, Seller_Type_Individual, Transmission_Mannual, car_age]
        prediction = model.predict([data])
        output = round(prediction[0], 2)
        if output < 0:
            return render_template('index.html', prediction_texts="Sorry, you cannot sell this car")
        else:
            return render_template('index.html', prediction_texts="You can sell the car at {}".format(output))
    else:
        return render_template('index.html')

if __name__ == '__main__':
    app.run(debug=True)
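Two things worth checking. First, heroku logs --tail will show the actual traceback behind the generic application-error page. Second, the pickled model is a Pipeline whose ColumnTransformer selects columns by name, so it expects a one-row pandas DataFrame with the original training columns; model.predict([data]) passes a bare list of numbers and will likely fail. A minimal sketch, assuming the usual columns of this car dataset (the exact names must match whatever X_train contained):

import pandas as pd

# Hypothetical mapping from form fields to the columns the pipeline was
# fitted on (every training column except the 'Selling_Price' target).
row = pd.DataFrame([{
    'Car_Name': request.form.get('Car_Name', ''),            # assumption: dataset has this column
    'Year': Year,
    'Present_Price': Present_Price,
    'Kms_Driven': Kms_Driven,
    'Fuel_Type': request.form['Fuel_Type_Petrol'],            # raw category; OneHotEncoder handles it
    'Seller_Type': request.form['Seller_Type_Individual'],    # e.g. 'Individual' or 'Dealer'
    'Transmission': request.form['Transmission_Mannual'],
    'Owner': Owner,
    'current_year': 2020,
    'car_age': car_age,
}])
prediction = model.predict(row)

With DataFrame input, the manual 0/1 encoding in the route becomes unnecessary, since the pipeline's OneHotEncoder already performs it.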
Recently I was working with a dataset in Python and got an unexpected error: ValueError: could not convert string to float. The dataset contains text data, which I converted to integers with LabelEncoder. But when I get to the training part, where I fit the model, I still get this error, which makes no sense to me.
code:
import sklearn
from sklearn import model_selection
from sklearn import linear_model
from sklearn import preprocessing
import pandas as pd
import pickle
import numpy as np
data = pd.read_csv("house_train.csv")
data = data.fillna(value=0)
dataX_train = data.drop(["SalePrice"], axis = 1)
dataX_test = data.SalePrice
le = preprocessing.LabelEncoder()
text_cols = ["MSZoning", "Street", "Alley", "LotShape", "LandContour", "Utilities",
             "LotConfig", "LandSlope", "Neighborhood", "Condition1", "Condition2",
             "BldgType", "HouseStyle", "RoofStyle", "RoofMatl", "Exterior1st",
             "Exterior2nd", "MasVnrType", "ExterQual", "ExterCond", "Foundation",
             "BsmtQual", "BsmtExposure", "BsmtFinType1", "BsmtFinType2", "Heating",
             "HeatingQC", "CentralAir", "Electrical", "KitchenQual", "Functional",
             "FireplaceQu", "GarageType", "GarageFinish", "GarageQual", "GarageCond",
             "PavedDrive", "PoolQC", "Fence", "MiscFeature", "SaleType", "SaleCondition"]
# Same encoding as before, one column at a time
for col in text_cols:
    dataX_train[col] = le.fit_transform(list(data[col]))
best = 0
x_train, x_test, y_train, y_test = model_selection.train_test_split(dataX_train, dataX_test, test_size=0.2)
clf = linear_model.LinearRegression()
clf.fit(x_train, y_train)
acc = clf.score(x_test, y_test)
if acc > best:
    best = acc
    with open("housingmodel.pickle", "wb") as f:
        pickle.dump(clf, f)
print(acc)
First of all, check whether you encoded all of your features in dataX_train; I think you missed something there.
Try dataX_train.dtypes and check whether any columns are still non-numeric, then use to_numeric on the non-numeric columns. For example:
dataX_train['NonNumericCol'] = dataX_train['NonNumericCol'].apply(pd.to_numeric)
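To find the columns that slipped through, a minimal sketch:

# Every column pandas still treats as text (dtype 'object') needs
# encoding or dropping before the model can be fitted.
non_numeric = dataX_train.select_dtypes(include='object').columns
print(non_numeric.tolist())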
Going through the second chapter of Hands-On Machine Learning with Scikit-Learn & TensorFlow, I am running into the error stated above. It happens when I try to execute the following line:
linReg.fit(housingPrepared, housing_labels)
Researching online, it looks like it has something to do with the dimensions of my features and my labels not matching up. Printing the shapes of housingPrepared (X) and housing_labels (Y) yields the following result:
(16512, 16) (4128,)
I've spent the last hour going through the chapter line by line and can't find anything I missed. I'm wondering whether someone here has an intuition about where a potential solution to this problem could be.
Thank you so much in advance. All my code up to the problem line is posted below:
import os
import tarfile
from six.moves import urllib
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from zlib import crc32
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from pandas.plotting import scatter_matrix
from sklearn.preprocessing import Imputer, OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from CategoricalEncoder import CategoricalEncoder
from sklearn.linear_model import LinearRegression
from sklearn.utils.validation import check_array
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
HOUSING_PATH = os.path.join("datasets","housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"
def fetchHousingData(housingUrl=HOUSING_URL, housingPath=HOUSING_PATH):
    if not os.path.isdir(housingPath):
        os.makedirs(housingPath)
    tgzPath = os.path.join(housingPath, "housing.tgz")
    urllib.request.urlretrieve(housingUrl, tgzPath)
    housingTgz = tarfile.open(tgzPath)
    housingTgz.extractall(path=housingPath)
    housingTgz.close()

def loadHousingData(housingPath=HOUSING_PATH):
    return pd.read_csv("https://raw.githubusercontent.com/ageron/handson-ml/master/datasets/housing/housing.csv")
housing = loadHousingData()
#plt.hist(housing['longitude'],bins=50)
#plt.show()
def splitTrainTesT(data, testRatio):
    shuffled_indices = np.random.permutation(len(data))
    testSetSize = int(len(data) * testRatio)
    testIndices = shuffled_indices[:testSetSize]
    trainIndices = shuffled_indices[testSetSize:]
    return data.iloc[trainIndices], data.iloc[testIndices]

def testSetCheck(identifier, testRatio):
    return crc32(np.int64(identifier)) & 0xffffffff < testRatio * 2 ** 32

def splitTrainTestByID(data, testRatio, idColumn):
    ids = data[idColumn]
    inTestSet = ids.apply(lambda id_: testSetCheck(id_, testRatio))
    return data.loc[~inTestSet], data.loc[inTestSet]
#housingWithID = housing.reset_index()
#trainSet, testSet = splitTrainTestByID(housingWithID,0.2,"index")
trainSet, testSet = train_test_split(housing,test_size=0.2,random_state=42)
housing["income_cat"] = np.ceil(housing["median_income"]/1.5)
housing["income_cat"].where(housing["income_cat"] < 5, 5.0, inplace=True)
#plt.hist(housing["income_cat"])
#plt.show()
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for trainIndex, testIndex in split.split(housing, housing["income_cat"]):
    stratTrainSet = housing.loc[trainIndex]
    stratTestSet = housing.loc[testIndex]
for set in (stratTrainSet, stratTestSet):
    set.drop("income_cat", axis=1, inplace=True)
housing = stratTrainSet.copy()
#print(housing)
#plt.scatter(x=housing["latitude"],y=housing["longitude"], alpha=0.4)
#plt.show()
corr_matrix = housing.corr()
#print(corr_matrix["median_house_value"].sort_values(ascending=False))
#attribues = ["median_house_value", "median_income", "total_rooms", "housing_median_age"]
#scatter_matrix(housing[attribues], figsize=(12,8))
#plt.show()
""" PREPARING DATA FOR MACHINE LEARNING ALGORITHMS"""
housing = stratTrainSet.drop("median_house_value", axis=1)
housing_labels = stratTestSet["median_house_value"].copy()
housing.dropna(subset=["total_bedrooms"])
imputer = Imputer(strategy="median")
housingNum = housing.drop("ocean_proximity", axis=1)
imputer.fit(housingNum)
X = imputer.transform(housingNum)
housingTr = pd.DataFrame(X, columns=housingNum.columns)
housingCat = housing["ocean_proximity"]
housingCatEncoded, housingCategories = housingCat.factorize()
encoder = OneHotEncoder()
housingCat1Hot = encoder.fit_transform(housingCatEncoded.reshape(-1,1))
"""Custom Transformers For Rooms Per Household, etc"""
roomsIX, bedroomsIX, populationIX, householdsIX = 3,4,5,6
class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, addBedroomsPerRoom=True):
        self.addBedroomsPerRoom = addBedroomsPerRoom
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        roomsPerHousehold = X[:, roomsIX] / X[:, householdsIX]
        populationPerHousehold = X[:, populationIX] / X[:, householdsIX]
        if self.addBedroomsPerRoom:
            bedroomsPerRoom = X[:, bedroomsIX] / X[:, roomsIX]
            return np.c_[X, roomsPerHousehold, populationPerHousehold, bedroomsPerRoom]
        else:
            return np.c_[X, roomsPerHousehold, populationPerHousehold]
attrAdder = CombinedAttributesAdder(addBedroomsPerRoom=False)
housingExtraAttribs = attrAdder.transform(housing.values)
numPipeline = Pipeline([('imputer', Imputer(strategy='median')),
('attribs_adder', CombinedAttributesAdder()),
('std_scaler', StandardScaler()),
])
housingNumTr = numPipeline.fit_transform(housingNum)
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attributeNames):
        self.attributeNames = attributeNames
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attributeNames].values
numAttribs = list(housingNum)
catAttribs = ["ocean_proximity"]
numPipeline = Pipeline([('selector', DataFrameSelector(numAttribs)),
('imputer', Imputer(strategy='median')),
('attribs_adder', CombinedAttributesAdder()),
('std_scaler', StandardScaler()),])
"""UPDATE SKLEARN TO INCLUDE CATEGORICAL ENCODER LIBRARY"""
catPipeline = Pipeline([('selector', DataFrameSelector(catAttribs)),
('cat_encoder', CategoricalEncoder(encoding='onehot-dense')),
])
fullPipeline = FeatureUnion(transformer_list=[("num_pipeline", numPipeline), ("cat_pipeline", catPipeline),])
housingPrepared = fullPipeline.fit_transform(housing)
linReg = LinearRegression()
print(housingPrepared.shape, housing_labels.shape)
linReg.fit(housingPrepared, housing_labels)
I believe the problem is in these two lines:
housing = stratTrainSet.drop("median_house_value", axis=1)
housing_labels = stratTestSet["median_house_value"].copy()
The features come from the training set (16512 rows) while the labels come from the test set (4128 rows), which is exactly the shape mismatch you printed. Change it to:
housing = stratTrainSet.drop("median_house_value", axis=1)
housing_labels = stratTrainSet["median_house_value"].copy()
and you're good to go.
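As a quick sanity check after the change:

print(housingPrepared.shape[0], housing_labels.shape[0])  # both should now be 16512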
I was trying to load a CSV in my Python code and use LR, LDA, and various other algorithms to test their score results.
Then I wanted to store the results as a table in SQL Server.
I get the error "Invalid Parameter Type. param-index=0 and param-type=numpy.ndarray HY105".
Any added knowledge would be appreciated.
Here is the code:
import matplotlib.pyplot as plt
import numpy as np
import pandas
from pandas.tools.plotting import scatter_matrix
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
import sqlalchemy
import pypyodbc
import pyodbc
# Load dataset
url = "AirPassengers.csv"
names = ['Serial Number', 'Time', 'Air Passengers']
dataset = pandas.read_csv(url, names=names)
print(dataset.describe())
print(dataset)
#Creating Validation Sets
array = dataset.values
X = array[1:, 0:4]
Y = array[1: ,2]
validation_size = 0.30
seed = 5
X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(
    X, Y, test_size=validation_size, random_state=seed)
#Test Harness and Evaluation Metrics
seed = 5
scoring = 'accuracy'
#Building Models
models = []
models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC()))
# evaluate each model in turn
results = []
names = []
for name, model in models:
    kfold = model_selection.KFold(n_splits=10, random_state=seed)
    cv_results = model_selection.cross_val_score(model, X_train, Y_train,
                                                 cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)
conn = pyodbc.connect(
r'DRIVER={ODBC Driver 11 for SQL Server};'
r'SERVER=Q3GN0570\MSSQLSERVER1;'
r'DATABASE=AdventureWorksDW2014;'
r'Trusted_Connection=yes;'
)
cursorexec = conn.cursor()
cursorexec.execute(
    "INSERT INTO pythonTest(LR,LDA,KNN,CART,NB,SVM) VALUES (?,?,?,?,?,?)",
    results)
cursorexec.commit()
conn.close()
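The HY105 error is pyodbc refusing to bind a numpy.ndarray to a ? placeholder: results is a list of six cross-validation score arrays, not six scalars. A minimal sketch of one fix, passing each model's mean score as a plain Python float:

# pyodbc can bind Python floats, but not numpy arrays or numpy scalars
mean_scores = [float(cv.mean()) for cv in results]
cursorexec.execute(
    "INSERT INTO pythonTest(LR,LDA,KNN,CART,NB,SVM) VALUES (?,?,?,?,?,?)",
    mean_scores)
cursorexec.commit()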