Machine Learning Web Application error in Flask - python

I am a newbie in machine learning model deployment and have been trying to deploy a simple machine learning model on car price prediction in Flask and Heroku. Have made the model using sklearn pipeline and transformer. The code runs perfectly on the Jupyter Notebook.
Deploying it in Heroku through Github, shows Build succeeded. However on launching the application, its shows application error.
It seems there something is wrong with the app.py file code for Flask. Any help or insight would be really helpful. Thank You
Jupyter Notebook Code:
import numpy as np
import pandas as pd
import matplotlib as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
data = pd.read_csv('car data.csv')
data['current_year'] = 2020
data['car_age'] = data['current_year'] - data['Year']
num_features = [col for col in data.columns if data[col].dtype != 'O' and col != 'Selling_Price']
cat_features = [col for col in data.columns if data[col].dtype == 'O']
X= data.drop(['Selling_Price'], axis=1)
y = data['Selling_Price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
num_transformer = Pipeline(steps = [('scaler', StandardScaler())])
cat_transformer = Pipeline(steps =[('OneHot', OneHotEncoder(handle_unknown='ignore'))])
preprocessor = ColumnTransformer(transformers = [('numerical_transformer', num_transformer, num_features),
('categorical_transformer', cat_transformer, cat_features)])
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor()
grid_param = {'n_estimators' :[100, 200, 500, 800, 1000]}
grid = GridSearchCV(estimator = rf, param_grid= grid_param)
model = Pipeline(steps= [('preprocessor',preprocessor), ('grid_regressor', grid)])
model.fit(X_train, y_train)
y_predict = model.predict(X_test)
from sklearn.metrics import mean_absolute_error, mean_squared_error
MAE = mean_absolute_error(y_test, y_predict)
MSE = mean_squared_error(y_test, y_predict)
RMSE = np.sqrt(MSE)
import pickle
file = open('regression_model.pkl', 'wb')
pickle.dump(model, file)
Code for the app.py file for Flask:
from flask import Flask, render_template, request
import jsonify
import pickle
import numpy as np
import sklearn
model = pickle.load(open('regression_model.pkl', 'rb'))
app = Flask(__name__)
#app.route('/', methods=['GET'])
def Home():
return render_template('index.html')
#app.route("/predict", methods= ['POST']
def predict():
if request.method == 'POST':
Year = int(request.form['Year'])
Selling_Price = float(request.form['Selling_Price'])
Present_Price = float(request.form['Present_Price'])
Kms_Driven = int(request.form['Kms_Driven'])
Owner = int(request.form['Owner'])
car_age = 2020 - Year
Fuel_Type_Petrol = request.form['Fuel_Type_Petrol']
if(Fuel_Type_Petrol=='Petrol'):
Fuel_Type_Petrol=1
Fuel_Type_Diesel=0
elif (Fuel_Type_Petrol =='Diesel'):
Fuel_Type_Petrol=0
Fuel_Type_Diesel=1
else:
Fuel_Type_Petrol=0
Fuel_Type_Diesel=0
Seller_Type_Individual = request.form['Seller_Type_Individual']
if(Seller_Type_Individual =='Individual'):
Seller_Type_Individual=1
else:
Seller_Type_Individual=0
Transmission_Mannual=request.form['Transmission_Mannual']
if(Transmission_Mannual=='Mannual'):
Transmission_Mannual=1
else:
Transmission_Mannual=0
data = [Selling_Price, Present_Price, Kms_Driven, Owner, Fuel_Type_Petrol, Fuel_Type_Diesel, Seller_Type_Individual, Transmission_Mannual, car_age]
prediction = model.predict([data])
output = round(prediction[0], 2)
if output <0:
return render_template('index.html', prediction_texts="Sorry you cannot sell this car")
else:
return render_template('index.html', prediction_texts= "You can sell the car at {}".format(output))
else:
render_template('index.html')
if __name__ == '__main__':
app.run(debug=True)

Related

python eli5 show_weights() show nothing

In my program,
from sklearn import linear_model
from keras.wrappers.scikit_learn import KerasClassifier,KerasRegressor
import eli5
from eli5.sklearn import PermutationImportance
if __name__ == '__main__':
(cnn_tube_par_X_train, cnn_tube_par_X_test, cnn_tube_Y_train, cnn_tube_Y_test) = read_file()
sc_X = StandardScaler()
sc_Y = StandardScaler()
sc_cnn_tube_par_X_train = sc_X.fit_transform(cnn_tube_par_X_train.iloc[:, 1:6].values)
sc_cnn_tube_par_X_test = sc_X.transform(cnn_tube_par_X_test.iloc[:, 1:6].values)
sc_cnn_tube_eff_Y_train = sc_Y.fit_transform(cnn_tube_Y_train.iloc[:, -1:].values)
sc_cnn_tube_eff_Y_test = sc_Y.transform(cnn_tube_Y_test.iloc[:, -1:].values)
MLR_pImportance(sc_cnn_tube_par_X_train,sc_cnn_tube_par_X_test,sc_cnn_tube_eff_Y_train,sc_cnn_tube_eff_Y_test)
def MLR_pImportance(sc_mlr_tube_par_X_train,sc_mlr_tube_par_X_test,sc_mlr_tube_eff_Y_train,sc_mlr_tube_eff_Y_test):
mlr = linear_model.LinearRegression()
mlr.fit(sc_mlr_tube_par_X_train,sc_mlr_tube_eff_Y_train)
perm = PermutationImportance(mlr,random_state=1).fit(sc_mlr_tube_par_X_test,sc_mlr_tube_eff_Y_test)
print(perm.feature_importances_)
print(perm.feature_importances_std_)
eli5.show_weights(perm)
The results show that :
[0.63895352 0.1270582 0.06904505 0.32131836 0.02549574]
[0.02766096 0.01535046 0.01789114 0.02761288 0.01048179]
these are the result of
print(perm.feature_importances_)
print(perm.feature_importances_std_)
but the sebtance:
eli5.show_weights(perm)
show nothing
could you tell the reason,and how to solve it

AttributeError: 'tuple' object has no attribute when selecting model

I want to select model using the following code.
import numpy as np
import pandas as pd
from math import log
from sklearn import model_selection
from sklearn.model_selection import train_test_split as split
from sklearn.metrics import accuracy_score as accuracy
import xgboost as xgb
dataframe_training = pd.read_csv("train.csv")
train_tag=dataframe_training['tags']
train_dummies_tags= train_tag.str.get_dummies(",")
dataframe_training=dataframe_training.filter(items=['review_ratio','log_date_difference','log_price'])
x_train=pd.concat([dataframe_training,train_dummies_tags], axis=1, sort=False)
y_train=dataframe_training.filter(items=['playtime_forever'])
Xtrain, Xtest, Ytrain, Ytest = split(x_train,y_train, test_size=0.25, random_state=7)
from sklearn.ensemble import *
AllRegressorModel = [xgb.XGBRegressor,AdaBoostRegressor,BaggingRegressor,ExtraTreesRegressor,GradientBoostingRegressor,RandomForestRegressor]
def Model_Selection_By_Cross_Valid():
ThisRound_SelectedModel = None,
ThisRound_SelectedModel_Name = None,
ThisRound_SelectedModel_Score = None,
for temp_select_model_name in AllRegressorModel:
kfold = model_selection.KFold(n_splits=10, random_state=7),
print (kfold),
temp_model= temp_select_model_name(),
temp_model.fit(Xtrain, Ytrain.ravel()),
results = model_selection.cross_val_score(temp_model, X_train, Y_train.ravel(), cv=kfold, scoring='neg_mean_squared_error'),
print(temp_select_model_name,results.mean()),
if (ThisRound_SelectedModel == None) or (abs(results.mean()) < ThisRound_SelectedModel_Score):
ThisRound_SelectedModel = temp_model,
ThisRound_SelectedModel_Name = temp_select_model_name,
ThisRound_SelectedModel_Score = abs(results.mean()),
print ("This round Model Name: ", temp_model,"MSE Score: ",abs(results.mean())),
print ("This Model Feature Importance",temp_model.feature_importances_),
print("This Model Do No Have Feature Importance......"),
print ("<----------------------------------->"),
print ("Selected Model Name:", ThisRound_SelectedModel, "MSE Score:",ThisRound_SelectedModel_Score),
return {"ModelName": ThisRound_SelectedModel_Name,"Model": ThisRound_SelectedModel}
When I run the program,
SelectedModel = Model_Selection_By_Cross_Valid()
AttributeError: 'tuple' object has no attribute 'fit' is shown.
How can I solve the problem?
Thank you very much.

Sklearn - Found input variables with inconsistent numbers of samples: [16512, 4128]

Going through the second chapter of Hands-On Machine Learning with Scikit-Learn & TensorFlow, running to the error stated above. This happens when I try to implement the following line:
linReg.fit(housingPrepared, housing_labels)
Researching online it looks like it has to do something with the dimensions of my features and my labels not matching up. Printing the shape of housingPrepared (X) and housing_labels (Y) yields the following result:
(16512, 16) (4128,)
I've spent the last hour going through line by line to see if I missed a line in this chapter, can't find anything. Wondering if someone here might have an intuition on where a potential solution for this problem could be.
Thank you so much in advance. All my code up to the problem line is posted below:
import os
import tarfile
from six.moves import urllib
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from zlib import crc32
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from pandas.plotting import scatter_matrix
from sklearn.preprocessing import Imputer, OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from CategoricalEncoder import CategoricalEncoder
from sklearn.linear_model import LinearRegression
from sklearn.utils.validation import check_array
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
HOUSING_PATH = os.path.join("datasets","housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"
def fetchHousingData(housingUrl=HOUSING_URL, housingPath=HOUSING_PATH):
if not os.path.isdir(housingPath):
os.makedirs(housingPath)
tgzPath = os.path.join(housingPath, "housing.tgz")
urllib.request.urlretrieve(housingUrl, tgzPath)
housingTgz = tarfile.open(tgzPath)
housingTgz.extractall(path=housingPath)
housingTgz.close()
def loadHousingData(housingPath=HOUSING_PATH):
return pd.read_csv("https://raw.githubusercontent.com/ageron/handson-ml/master/datasets/housing/housing.csv")
housing = loadHousingData()
#plt.hist(housing['longitude'],bins=50)
#plt.show()
def splitTrainTesT(data, testRatio):
shuffled_indices = np.random.permutation(len(data))
testSetSize = int(len(data)* testRatio)
testIndices = shuffled_indices[:testSetSize]
trainIndices = shuffled_indices[testSetSize:]
return data.iloc[trainIndices], data.iloc[testIndices]
def testSetCheck(identifier, testRatio):
return crc32(np.int64(identifier)) & 0xffffffff < testRatio * 2 ** 32
def splitTrainTestByID(data, testRatio, idColumn):
ids = data[idColumn]
inTestSet = ids.apply(lambda id_: testSetCheck(id_, testRatio))
return data.loc[~inTestSet], data.loc[inTestSet]
#housingWithID = housing.reset_index()
#trainSet, testSet = splitTrainTestByID(housingWithID,0.2,"index")
trainSet, testSet = train_test_split(housing,test_size=0.2,random_state=42)
housing["income_cat"] = np.ceil(housing["median_income"]/1.5)
housing["income_cat"].where(housing["income_cat"] < 5, 5.0, inplace=True)
#plt.hist(housing["income_cat"])
#plt.show()
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for trainIndex, testIndex in split.split(housing, housing["income_cat"]):
stratTrainSet = housing.loc[trainIndex]
stratTestSet = housing.loc[testIndex]
for set in (stratTrainSet, stratTestSet):
set.drop("income_cat", axis=1, inplace=True)
housing = stratTrainSet.copy()
#print(housing)
#plt.scatter(x=housing["latitude"],y=housing["longitude"], alpha=0.4)
#plt.show()
corr_matrix = housing.corr()
#print(corr_matrix["median_house_value"].sort_values(ascending=False))
#attribues = ["median_house_value", "median_income", "total_rooms", "housing_median_age"]
#scatter_matrix(housing[attribues], figsize=(12,8))
#plt.show()
""" PREPARING DATA FOR MACHINE LEARNING ALGORITHMS"""
housing = stratTrainSet.drop("median_house_value", axis=1)
housing_labels = stratTestSet["median_house_value"].copy()
housing.dropna(subset=["total_bedrooms"])
imputer = Imputer(strategy="median")
housingNum = housing.drop("ocean_proximity", axis=1)
imputer.fit(housingNum)
X = imputer.transform(housingNum)
housingTr = pd.DataFrame(X, columns=housingNum.columns)
housingCat = housing["ocean_proximity"]
housingCatEncoded, housingCategories = housingCat.factorize()
encoder = OneHotEncoder()
housingCat1Hot = encoder.fit_transform(housingCatEncoded.reshape(-1,1))
"""Custom Transformers For Rooms Per Household, etc"""
roomsIX, bedroomsIX, populationIX, householdsIX = 3,4,5,6
class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
def __init__(self, addBedroomsPerRoom = True):
self.addBedroomsPerRoom = addBedroomsPerRoom
def fit(self, X, y=None):
return self
def transform(self, X, y=None):
roomsPerHousehold = X[:,roomsIX]/X[:,householdsIX]
populationPerHousehold = X[:,populationIX]/X[:,householdsIX]
if self.addBedroomsPerRoom:
bedroomsPerRoom = X[:,bedroomsIX]/X[:,roomsIX]
return np.c_[X, roomsPerHousehold, populationPerHousehold, bedroomsPerRoom]
else:
return np.c_[X, roomsPerHousehold, populationPerHousehold]
attrAdder = CombinedAttributesAdder(addBedroomsPerRoom=False)
housingExtraAttribs = attrAdder.transform(housing.values)
numPipeline = Pipeline([('imputer', Imputer(strategy='median')),
('attribs_adder', CombinedAttributesAdder()),
('std_scaler', StandardScaler()),
])
housingNumTr = numPipeline.fit_transform(housingNum)
class DataFrameSelector(BaseEstimator, TransformerMixin):
def __init__(self, attributeNames):
self.attributeNames = attributeNames
def fit(self, X, y=None):
return self
def transform(self, X):
return X[self.attributeNames].values
numAttribs = list(housingNum)
catAttribs = ["ocean_proximity"]
numPipeline = Pipeline([('selector', DataFrameSelector(numAttribs)),
('imputer', Imputer(strategy='median')),
('attribs_adder', CombinedAttributesAdder()),
('std_scaler', StandardScaler()),])
"""UPDATE SKLEARN TO INCLUDE CATEGORICAL ENCODER LIBRARY"""
catPipeline = Pipeline([('selector', DataFrameSelector(catAttribs)),
('cat_encoder', CategoricalEncoder(encoding='onehot-dense')),
])
fullPipeline = FeatureUnion(transformer_list=[("num_pipeline", numPipeline), ("cat_pipeline", catPipeline),])
housingPrepared = fullPipeline.fit_transform(housing)
linReg = LinearRegression()
print(housingPrepared.shape, housing_labels.shape)
linReg.fit(housingPrepared, housing_labels)
I believe the problem is in these two lines:
housing = stratTrainSet.drop("median_house_value", axis=1)
housing_labels = stratTestSet["median_house_value"].copy()
Change it to:
housing = stratTrainSet.drop("median_house_value", axis=1)
housing_labels = stratTrainSet["median_house_value"].copy()
and you're good to go.

tf.feature_column.categorical_column_with_identity Error: object has no attribute 'name' or no 'get_sparse_tensors'

Updated post:
I looked into the initial Suggestion to get to the root of my Problem.
The orginal Problem was that several feature columns produced an Attribute error ('no Name' or 'get_sparse_tensors')
This is an example code for an 'tuple' object has no attribute 'name':
metro = tf.feature_column.indicator_column(tf.feature_column.categorical_column_with_identity("metro",94)),
tf.feature_column.indicator_column(metro),
Data from Metro column Looks like this:
-"(not set)"
-"Abilene-Sweetwater TX"
-"Albany-Schenectady-Troy NY"
-"Atlanta GA"
-...
Here's an example Code for _get_sparse_tensor Error:
browser = tf.feature_column.indicator_column(tf.feature_column.categorical_column_with_identity("browser",54))
tf.feature_column.indicator_column(browser),
I can solve both error by replacing categorical_column_with_identity with categorical_column_with_vocabulary_list
metro = tf.feature_column.categorical_column_with_vocabulary_list('metro',
vocabulary_list=['(not set)','Abilene-Sweetwater TX','Albany-Schenectady-Troy NY','Atlanta GA'])
Since using tf.feature_column.categorical_column_with_identity is way faster than writing Long vocabulary list, I'd love to know why this Errors occur?
Here's the MCVE:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import pandas as pd
import argparse
import tensorflow as tf
# specify col.names.
names = [
'browser',
'metro',
'transactionRevenue'
]
# specify dtypes.
dtypes = {
'browser': str,
'metro': str,
'transactionRevenue': np.float32
}
df = pd.read_csv('dropped_train.csv', names=names, dtype=dtypes, n a_values='?',encoding ="ISO-8859-1")
def load_data(y_name="transactionRevenue", train_fraction=0.7, seed=None):
# Load the raw data columns.
data = df
# Shuffle the data
np.random.seed(seed)
# Split the data into train/test subsets.
x_train = data.sample(frac=train_fraction, random_state=seed)
x_test = data.drop(x_train.index)
# Extract the label from the features DataFrame.
y_train = x_train.pop(y_name)
y_test = x_test.pop(y_name)
return (x_train, y_train), (x_test, y_test)
load_data()
def features_columns():
metro = tf.feature_column.indicator_column (tf.feature_column.categorical_column_with_identity("metro",94)),
browser = tf.feature_column.indicator_column(tf.feature_column.categorical_column_with_identity("browser",54))
feature_columns = [
tf.feature_column.indicator_column(browser),
tf.feature_column.indicator_column(metro)
]
return feature_columns
features_columns()
log_dir = ("C:\\…\\gs sales\\model")
parser = argparse.ArgumentParser()
parser.add_argument('--batch_size', default=500, type=int, help='batch size')
parser.add_argument('--train_steps', default=10000, type=int, help='number of training steps')
parser.add_argument('--norm_factor', default=10., type=float, help='normalization factor')
def main(argv):
"""Builds, trains, and evaluates the model."""
args = parser.parse_args(argv[1:])
(train_x, train_y), (test_x, test_y) = load_data()
train_y /= args.norm_factor
test_y /= args.norm_factor
# Build the training dataset.
training_input_fn = tf.estimator.inputs.pandas_input_fn(x=train_x, y=train_y, batch_size=64,
shuffle=True, num_epochs=None)
# Build the Estimator.
model = tf.estimator.DNNRegressor(hidden_units=[50,30,10], feature_columns=features_columns(),
model_dir=log_dir)
# Train the model.
model.train(input_fn=training_input_fn, steps=args.train_steps)
if __name__ == "__main__":
tf.logging.set_verbosity(tf.logging.INFO)
tf.app.run(main=main)

Python Connected With SQL Server (HY105 Error) Invalid Parameter Type HY105

I was trying to add a CSV to my python code and use LR, LDA and various other Algorithms to test their score results.
and Then I wanted to show the results in the form of a table in SQL server.
I get the error "Invalid Parameter Type. param-index= 0 and param-type = numpy.ndarray HY105"
Added Knowledge would be appreciated.
Here is the code:
import matplotlib.pyplot as plt
import numpy as np
import pandas
from pandas.tools.plotting import scatter_matrix
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
import sqlalchemy
import pypyodbc
import pyodbc
# Load dataset
url = "AirPassengers.csv"
names = ['Serial Number', 'Time', 'Air Passengers']
dataset = pandas.read_csv(url, names=names)
print(dataset.describe())
print(dataset)
#Creating Validation Sets
array = dataset.values
X = array[1:, 0:4]
Y = array[1: ,2]
validation_size = 0.30
seed = 5
X_train, X_validation, Y_train, Y_validation =
model_selection.train_test_split(X,Y,test_size=validation_size,
random_state=seed)
#Test Harness and Evaluation Metrics
seed = 5
scoring = 'accuracy'
#Building Models
models = []
models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC()))
# evaluate each model in turn
results = []
names = []
for name, model in models:
kfold = model_selection.KFold(n_splits=10, random_state=seed)
cv_results = model_selection.cross_val_score(model, X_train, Y_train,
cv=kfold, scoring=scoring)
results.append(cv_results)
names.append(name)
msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
print(msg)
conn = pyodbc.connect(
r'DRIVER={ODBC Driver 11 for SQL Server};'
r'SERVER=Q3GN0570\MSSQLSERVER1;'
r'DATABASE=AdventureWorksDW2014;'
r'Trusted_Connection=yes;'
)
cursorexec = conn.cursor()
cursorexec.execute("INSERT INTO pythonTest(LR,LDA,KNN,CART,NB,SVM) VALUES
(?,?,?,?,?,?)", results)
cursorexec.commit()
conn.close()

Categories