What values to put in the future rows of a dataframe when making predictions? - python

I have a dataset "group_by_df" which has a "day" column and an "o3" column. I want to make predictions for the future, for example for the next 5 days. I have managed to extend the dataframe's "day" column with the following 5 days using this code:
days_predicted = 5
rng = pd.date_range(group_by_df['day'].min(), periods=len(group_by_df) + days_predicted, freq='D')
df = pd.DataFrame({'day': rng})
df[sensor_name] = group_by_df["o3"]  # the currently existing values for o3
df[sensor_name][len(group_by_df):] = ""  # the future values
The problem is that I cannot run the linear regression with this format for the "o3" column, whether I leave the future cells as NaN or as "": linear regression throws this error: ValueError: could not convert string to float:
What values should I give to the "o3" column so that I can make a future prediction? In my updated code I have replaced the unknown "o3" values with the average of the known days. Is this a good approach?
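One alternative I considered (a rough sketch, untested; it uses the group_by_df and days_predicted from above): fit only on the rows that actually have measurements and predict on the future ordinals, so the unknown "o3" cells never need any placeholder value.
import datetime as dt
import pandas as pd
from sklearn.linear_model import LinearRegression

# keep only the rows with real measurements
known = group_by_df.dropna(subset=['o3'])
X_known = known['day'].map(dt.datetime.toordinal).values.reshape(-1, 1)
# the next 5 calendar days after the last measured one
future_days = pd.date_range(known['day'].max(), periods=days_predicted + 1, freq='D')[1:]
X_future = future_days.map(dt.datetime.toordinal).values.reshape(-1, 1)
model = LinearRegression().fit(X_known, known['o3'].values)
future_o3 = model.predict(X_future)  # no placeholder targets needed
The full updated code with the mean-fill approach: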
import datetime
import datetime as dt
import numpy as np
import pandas as pd
import plotly.graph_objs as go
from distributed.deploy.ssh import bcolors
from flask_babel import _
from pandas.plotting import register_matplotlib_converters
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
pd.options.mode.chained_assignment = None # default='warn'
register_matplotlib_converters()
def create_columns(data):
    data["readable time"] = ""
    data["day"] = ""
    for i in range(0, len(data)):
        data.loc[i, ['readable time']] = datetime.datetime.fromtimestamp(
            data["time"][i]).strftime('%d/%m/%Y %H:%M:%S')
        data.loc[i, ['day']] = datetime.datetime.fromtimestamp(
            data["time"][i]).strftime('%d/%m/%Y')
def calculate_linear_regression(data, sensor_name):
    create_columns(data)  # from timeseries
    data['day'] = pd.to_datetime(data['day'], dayfirst=True)
    data = data.sort_values(by=['readable time'])
    group_by_df = pd.DataFrame(
        [name, group.mean()[sensor_name]] for name, group in data.groupby('day'))
    group_by_df.columns = ['day', sensor_name]
    print("group by df ", group_by_df)
    group_by_df['day'] = pd.to_datetime(group_by_df['day'])
    # initial length of dataframe (before future prediction)
    initial_len_df = len(group_by_df)
    days_predicted = 5
    rng = pd.date_range(group_by_df['day'].min(),
                        periods=len(group_by_df) + days_predicted, freq='D')
    df = pd.DataFrame({'day': rng})
    df[sensor_name] = group_by_df[sensor_name]
    df[sensor_name][len(group_by_df):] = group_by_df[sensor_name].mean()  # ""
    print("after... \n", df)
    group_by_df = df
    print("GROUP BY DF\n", group_by_df)
    group_by_df['day'] = group_by_df['day'].map(dt.datetime.toordinal)
    def split(group_by_df):
        X = group_by_df[['day']].values
        y = group_by_df[[sensor_name]].values
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=False)
        return X_train, X_test, y_train, y_test

    X_train, X_test, y_train, y_test = split(group_by_df)
    def analyse_forecast():
        print("MSE linear regression (mean squared error)",
              mean_squared_error(group_by_df[sensor_name], group_by_df['predicted']))
        print("r2 score ", r2_score(group_by_df[sensor_name], group_by_df['predicted']))
        rmse = np.sqrt(mean_squared_error(group_by_df[sensor_name], group_by_df['predicted']))
        print("RMSE for linear regression=", rmse)
        print("MSE TEST ", mean_squared_error(y_test, group_by_df['predicted'][len(X_train):]))
        print("MSE TRAIN ", mean_squared_error(y_train, group_by_df['predicted'][:len(X_train)]))
        print("r2 score TEST", r2_score(y_test, group_by_df['predicted'][len(X_train):]))
        return mean_squared_error(group_by_df[sensor_name], group_by_df['predicted'])
    def calculate_linear_reg():
        group_by_df.reset_index(inplace=True)
        mse_list = []
        model = LinearRegression()
        model.fit(X_train, y_train)
        y_pred = model.predict(group_by_df[['day']])
        group_by_df['predicted'] = y_pred
        mse_list.append(analyse_forecast())
        print(group_by_df)  # print predicted values

    calculate_linear_reg()
    return group_by_df, X_train, sensor_name, initial_len_df
def create_figure(group_by_df, X_train, sensor_name, initial_len_df):
    print("INITIAL LEN DF IS", initial_len_df)
    linear_regression_fig = go.Figure()
    # plot predicted values
    linear_regression_fig.add_trace(go.Scatter(
        x=group_by_df['day'].map(dt.datetime.fromordinal),
        y=group_by_df['predicted'],
        name=_("Linear Regression"),
        mode='lines+markers',
        marker=dict(
            color=np.where(group_by_df['day'].index < len(X_train), 'red', 'green'))))
    # plot actual values
    linear_regression_fig.add_trace(go.Scatter(
        x=group_by_df['day'].map(dt.datetime.fromordinal)[:initial_len_df],
        y=group_by_df[sensor_name][:initial_len_df],
        name=_('Actual values'),
        mode='lines+markers'))
    linear_regression_fig.update_layout(
        height=700,
        font=dict(color="grey"),
        paper_bgcolor='rgba(0,0,0,0)',
        title=_('Linear Regression for ') + _(sensor_name),
        yaxis_title=_(sensor_name),
        xaxis_title=_('Day'),
        showlegend=True)
    linear_regression_fig.show()
data="https://raw.githubusercontent.com/iulianastroia/csv_data/master/final_dataframe.csv"
data = pd.read_csv(data)
group_by_df, X_train, sensor_name, initial_len_df = calculate_linear_regression(data, "o3")
linear_reg_fig = create_figure(group_by_df, X_train, sensor_name, initial_len_df)
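(A note on the metrics above, sketched with the returned variables: because the split uses shuffle=False, the test portion contains the five mean-filled future rows, so "MSE TEST" partly compares predictions against the filled-in mean rather than against real measurements. Restricting the error to the first initial_len_df rows avoids that.)
# evaluate only on days that have real measurements
real = group_by_df[sensor_name][:initial_len_df]
pred = group_by_df['predicted'][:initial_len_df]
print("MSE on real measurements only:", mean_squared_error(real, pred))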

Related

How to improve the knn model?

I built a knn model for classification. Unfortunately, my model's accuracy is only just over 80%, and I would like to get a better result. Can I ask for some tips? Maybe I used too many predictors?
My data = https://www.openml.org/search?type=data&sort=runs&id=53&status=active
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score
from sklearn.model_selection import GridSearchCV
heart_disease = pd.read_csv('heart_disease.csv', sep=';', decimal=',')
y = heart_disease['heart_disease']
X = heart_disease.drop(["heart_disease"], axis=1)
correlation_matrix = heart_disease.corr()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=123)
scaler = MinMaxScaler(feature_range=(-1,1))
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
knn_3 = KNeighborsClassifier(3, n_jobs = -1)
knn_3.fit(X_train, y_train)
y_train_pred = knn_3.predict(X_train)
labels = ['0', '1']
print('Training set')
print(pd.DataFrame(confusion_matrix(y_train, y_train_pred), index = labels, columns = labels))
print(accuracy_score(y_train, y_train_pred))
print(f1_score(y_train, y_train_pred))
y_test_pred = knn_3.predict(X_test)
print('Test set')
print(pd.DataFrame(confusion_matrix(y_test, y_test_pred), index = labels, columns = labels))
print(accuracy_score(y_test, y_test_pred))
print(f1_score(y_test, y_test_pred))
hyperparameters = {'n_neighbors' : range(1, 15), 'weights': ['uniform','distance']}
knn_best = GridSearchCV(KNeighborsClassifier(), hyperparameters, n_jobs = -1, error_score = 'raise')
knn_best.fit(X_train,y_train)
knn_best.best_params_
y_train_pred_best = knn_best.predict(X_train)
y_test_pred_best = knn_best.predict(X_test)
print('Training set')
print(pd.DataFrame(confusion_matrix(y_train, y_train_pred_best), index = labels, columns = labels))
print(accuracy_score(y_train, y_train_pred_best))
print(f1_score(y_train, y_train_pred_best))
print('Test set')
print(pd.DataFrame(confusion_matrix(y_test, y_test_pred_best), index = labels, columns = labels))
print(accuracy_score(y_test, y_test_pred_best))
print(f1_score(y_test, y_test_pred_best))
Just a small part of an answer: how to find the best value for n_neighbors.
import numpy as np
import matplotlib.pyplot as plt

errlist = []  # an error list to append to
for i in range(1, 40):  # candidate n_neighbors values from 1 to 39
    knn_i = KNeighborsClassifier(n_neighbors=i)  # the keyword is n_neighbors, not k_neighbors
    knn_i.fit(X_train, y_train)
    errlist.append(np.mean(knn_i.predict(X_test) != y_test))  # append the mean rate of failed predictions
Plot a line to see the best n_neighbors:
plt.plot(range(1, 40), errlist)
plt.show()
Feel free to change the numbers in range. A cross-validated variant is sketched below.
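(A sketch of the same search with 5-fold cross-validation instead of the single train/test split, so the chosen k depends less on one particular split; it assumes the X_train, y_train and imports from the question above.)
from sklearn.model_selection import cross_val_score

cv_scores = []
for i in range(1, 40):
    knn_i = KNeighborsClassifier(n_neighbors=i, n_jobs=-1)
    # mean accuracy over 5 folds for this candidate k
    cv_scores.append(cross_val_score(knn_i, X_train, y_train, cv=5).mean())
best_k = int(np.argmax(cv_scores)) + 1  # +1 because candidates start at k=1
print("best k by cross-validation:", best_k)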

I'm having problems converting my forecast_out from daily to a minute forecast

Hi, I'm working on a learning model and I seem to be stuck on the forecast_out logic in my scikit-learn script.
I need help creating forecasts not just daily, but hourly and even by the minute as well.
Thanks!
import pandas as pd
import datetime as dt
import pandas_datareader as reader
import fbprophet
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
end = dt.datetime.now()
start = dt.datetime(end.year-1,end.month,end.day,end.hour,end.minute)
start
df = reader.get_data_yahoo('BTC-USD',start,end)
print(df.head())
df = df[['Adj Close']]
print(df.head())
forecast_out = 30  # (I want to be able to predict hourly and even by the minute as well)
df['Prediction'] = df[['Adj Close']].shift(-forecast_out)
print(df.tail())
X = np.array (df.drop(['Prediction'],1))
X = X[:-forecast_out]
print (X)
y = np.array (df['Prediction'])
y = y[:-forecast_out]
print(y)
x_train, x_test, y_train, y_test = train_test_split (X,y, test_size=0.2)
svr_rbf = SVR(kernel='rbf' , C=1e3, gamma=0.1)
svr_rbf.fit(x_train, y_train)
svm_confidence = svr_rbf.score(x_test, y_test)
print ("svm confidence: ", svm_confidence)
lr = LinearRegression()
lr.fit(x_train, y_train)
lr_confidence = lr.score(x_test, y_test)
print ("lr confidence: ", lr_confidence)
x_forecast = np.array(df.drop(['Prediction'],1))[-forecast_out:]
print(x_forecast)
lr_prediction = lr.predict (x_forecast)
print(lr_prediction)
svm_prediction = svr_rbf.predict (x_forecast)
print(svm_prediction)
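(A note and sketch, not from the original post: the shift above works in rows, so the forecast horizon follows whatever frequency the rows have; with minute bars, forecast_out = 30 means 30 minutes. One way to get minute-level rows is the yfinance package; this assumes yfinance is installed, that Yahoo serves 1-minute bars only for roughly the last week, and that the exact column layout can vary by version.)
import yfinance as yf

# one row per minute instead of per day; auto_adjust=False keeps the 'Adj Close' column
df_min = yf.download('BTC-USD', period='7d', interval='1m', auto_adjust=False)
df_min = df_min[['Adj Close']]
forecast_out = 30  # now a 30-minute horizon, because each row is one minute
df_min['Prediction'] = df_min[['Adj Close']].shift(-forecast_out)
print(df_min.tail())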

How to use a fitted model to predict tomorrow's value?

I am learning time series forecasting. Here is the code, and it works fine, but the prediction is always compared against test data. Now, given today's values of the X features, how can I predict tomorrow's stock Y (MSFT) price? This is the main goal of prediction. Thanks
import numpy as np
import pandas as pd
import pandas_datareader.data as web
# Error Metrics
from sklearn.metrics import mean_squared_error
# Time series Models
from statsmodels.tsa.arima_model import ARIMA
from matplotlib import pyplot
from datetime import datetime, timedelta
# Disable the warnings
import warnings
warnings.filterwarnings('ignore')
stk_tickers = ['MSFT', 'IBM', 'GOOGL']
ccy_tickers = ['DEXJPUS', 'DEXUSUK']
idx_tickers = ['SP500', 'DJIA', 'VIXCLS']
# start_date and end_date were not defined in the original post; example values
start_date = datetime(2015, 1, 1)
end_date = datetime(2020, 1, 1)
stk_data = web.DataReader(stk_tickers, 'yahoo', start_date, end_date)
ccy_data = web.DataReader(ccy_tickers, 'fred')
idx_data = web.DataReader(idx_tickers, 'fred')
return_period = 5
Y = np.log(stk_data.loc[:, ('Adj Close', 'MSFT')]).diff(return_period).shift(-return_period)
Y.name = Y.name[-1]+'_pred'
X1 = np.log(stk_data.loc[:, ('Adj Close', ('GOOGL', 'IBM'))]).diff(return_period)
X1.columns = X1.columns.droplevel()
X2 = np.log(ccy_data).diff(return_period)
X3 = np.log(idx_data).diff(return_period)
X4 = pd.concat([np.log(stk_data.loc[:, ('Adj Close', 'MSFT')]).diff(i) for i in [return_period, return_period*3, return_period*6, return_period*12]], axis=1).dropna()
X4.columns = ['MSFT_DT', 'MSFT_3DT', 'MSFT_6DT', 'MSFT_12DT']
X = pd.concat([X1, X2, X3, X4], axis=1)
dataset = pd.concat([Y, X], axis=1).dropna().iloc[::return_period, :]
Y = dataset.loc[:, Y.name]
X = dataset.loc[:, X.columns]
validation_size = 0.2
#In case the data is not dependent on the time series, then train and test split randomly
# seed = 7
# X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=validation_size, random_state=seed)
#In case the data is dependent on the time series, the train and test split should be done on sequential samples
#This can be done by selecting an arbitrary split point in the ordered list of observations and creating two new datasets.
train_size = int(len(X) * (1-validation_size))
X_train, X_test = X[0:train_size], X[train_size:len(X)]
Y_train, Y_test = Y[0:train_size], Y[train_size:len(X)]
X_train_ARIMA=X_train.loc[:, ['GOOGL', 'IBM', 'DEXJPUS', 'SP500', 'DJIA', 'VIXCLS']]
X_test_ARIMA=X_test.loc[:, ['GOOGL', 'IBM', 'DEXJPUS', 'SP500', 'DJIA', 'VIXCLS']]
tr_len = len(X_train_ARIMA)
te_len = len(X_test_ARIMA)
to_len = len (X)
modelARIMA=ARIMA(endog=Y_train,exog=X_train_ARIMA,order=[2,0,1])
model_fit = modelARIMA.fit()
error_Training_ARIMA = mean_squared_error(Y_train, model_fit.fittedvalues)
predicted = model_fit.predict(start=tr_len - 1, end=to_len - 1, exog=X_test_ARIMA)[1:]
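(To directly answer the title question, a sketch building on the code above: once the model is fit, a one-step-ahead forecast takes the newest exogenous row; the call below uses the old statsmodels ARIMAResults.forecast API matching the import above. Note that Y here is a return_period-ahead log return, not a price, so converting the forecast into a price needs the current price.)
# one-step-ahead forecast from the most recent exogenous observations
latest_exog = X_test_ARIMA.iloc[[-1]]  # the latest GOOGL/IBM/FX/index features
forecast, stderr, conf_int = model_fit.forecast(steps=1, exog=latest_exog)
print("predicted next-period MSFT log return:", forecast[0])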

LSTM neural network for multiple steps time series prediction

I tried to develop a model that forecasts two time-steps ahead.
To that end I modified a GitHub code for the single-step forecast, coding a load_data function that takes n steps backward in the X_train/test series and sets them against a y_train/test 2-array.
I set the neurons list so that the final Dense layer outputs a 2-vector.
And last I wrote a predict function and a plot function for the 2-step forecast.
I have not yet sorted out normalizing the features, labels and forecasts; I will do that in the future.
After a bit of hyperparameter tuning it returns a good score for the MSE and RMSE:
Train Score: 0.00000 MSE (0.00 RMSE)
Test Score: 0.00153 MSE (0.04 RMSE)
It can find the trend quite well, but it returns all forecasts with negative direction.
Does anyone have a suggestion?
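(For orientation, a minimal standalone sketch of the two-step windowing described above; it is a simplification of the load_data function in the listing below, on a made-up toy series.)
import numpy as np

series = np.arange(10, dtype=float)  # toy single-feature series
seq_len = 2
X_w, y_w = [], []
for i in range(len(series) - seq_len - 2):
    X_w.append(series[i:i + seq_len])                # seq_len steps backward as input
    y_w.append(series[i + seq_len:i + seq_len + 2])  # the next 2 steps as targets
X_w, y_w = np.array(X_w), np.array(y_w)
print(X_w.shape, y_w.shape)  # (6, 2) (6, 2)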
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt2
import pandas as pd
import math, time
import itertools
from sklearn import preprocessing
import datetime
from sklearn.metrics import mean_squared_error
from math import sqrt
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.recurrent import LSTM
from keras.models import load_model
import keras
from numpy import newaxis
pd.core.common.is_list_like = pd.api.types.is_list_like
import pandas_datareader.data as web
import h5py
from keras import backend as K
import quandl
quandl.ApiConfig.api_key = 'myQuandlKey'
seq_len = 2
shape = [seq_len, 9, 2]
neurons = [256, 256, 64, 2]
dropout = 0.2
decay = 0.5
epochs = 100
stock_name = 'AAPL'
global_start_time = time.time()
def get_stock_data(stock_name, normalize=True, ma=[]):
    """
    Return a dataframe of that stock and normalize all the values.
    (Optional: create moving average)
    """
    df = quandl.get_table('WIKI/PRICES', ticker=stock_name)
    df.drop(['ticker', 'open', 'high', 'low', 'close', 'ex-dividend', 'volume', 'split_ratio'], 1, inplace=True)
    df.set_index('date', inplace=True)
    # Renaming all the columns so that we can use the old version code
    df.rename(columns={'adj_open': 'Open', 'adj_high': 'High', 'adj_low': 'Low', 'adj_volume': 'Volume', 'adj_close': 'Adj Close'}, inplace=True)
    # Percentage change
    df['Pct'] = df['Adj Close'].pct_change()
    df.dropna(inplace=True)
    # Moving Average
    if ma != []:
        for moving in ma:
            df['{}ma'.format(moving)] = df['Adj Close'].rolling(window=moving).mean()
        df.dropna(inplace=True)
    if normalize:
        min_max_scaler = preprocessing.MinMaxScaler()
        df['Open'] = min_max_scaler.fit_transform(df.Open.values.reshape(-1, 1))
        df['High'] = min_max_scaler.fit_transform(df.High.values.reshape(-1, 1))
        df['Low'] = min_max_scaler.fit_transform(df.Low.values.reshape(-1, 1))
        df['Volume'] = min_max_scaler.fit_transform(df.Volume.values.reshape(-1, 1))
        df['Adj Close'] = min_max_scaler.fit_transform(df['Adj Close'].values.reshape(-1, 1))
        df['Pct'] = min_max_scaler.fit_transform(df['Pct'].values.reshape(-1, 1))
        if ma != []:
            for moving in ma:
                df['{}ma'.format(moving)] = min_max_scaler.fit_transform(df['{}ma'.format(moving)].values.reshape(-1, 1))
    # Move Adj Close to the rightmost for the ease of training
    adj_close = df['Adj Close']
    df.drop(labels=['Adj Close'], axis=1, inplace=True)
    df = pd.concat([df, adj_close], axis=1)
    # df.to_csv('aap.csv')
    return df
df = get_stock_data(stock_name, ma=[50, 100, 200])
def plot_stock(df):
    print(df.head())
    plt.subplot(211)
    plt.plot(df['Adj Close'], color='red', label='Adj Close')
    plt.legend(loc='best')
    plt.subplot(212)
    plt.plot(df['Pct'], color='blue', label='Percentage change')
    plt.legend(loc='best')
    plt.show()
#plot_stock(df)
def load_data(stock, seq_len):
    amount_of_features = len(stock.columns)
    data = stock.values
    sequence_length = seq_len + 2  # index starting from 0
    result = []
    for index in range(len(data) - sequence_length):  # maximum date = latest date - sequence length
        result.append(data[index: index + sequence_length])
    result = np.array(result)
    row = round(0.8 * result.shape[0])  # 80% split
    train = result[:int(row), :, :]  # 80% of the dates
    X_train = train[:, :-2, :]  # all data until day m
    y_train = train[:, -2:, :][:, :, -1]  # days m+1 and m+2 adjusted close price
    X_test = result[int(row):, :-2, :]
    y_test = result[int(row):, -2:, :][:, :, -1]
    X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], amount_of_features))
    X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], amount_of_features))
    print("________________________________________________________________")
    print("X_train shape = {}".format(X_train.shape))
    print("y_train shape = {}".format(y_train.shape))
    print("")
    print("X_test shape = {}".format(X_test.shape))
    print("y_test shape = {}".format(y_test.shape))
    print("________________________________________________________________")
    return [X_train, y_train, X_test, y_test]
X_train, y_train, X_test, y_test = load_data(df, seq_len)
def build_model(shape, neurons, dropout, decay):
    model = Sequential()
    model.add(LSTM(neurons[0], input_shape=(shape[0], shape[1]), return_sequences=True))
    model.add(Dropout(dropout))
    model.add(LSTM(neurons[1], input_shape=(shape[0], shape[1]), return_sequences=False))
    model.add(Dropout(dropout))
    model.add(Dense(neurons[2], kernel_initializer="uniform", activation='relu'))
    model.add(Dense(neurons[3], kernel_initializer="uniform", activation='linear'))
    # model = load_model('my_LSTM_stock_model1000.h5')
    adam = keras.optimizers.Adam(decay=decay)
    # pass the Adam instance (with decay) rather than the string 'adam',
    # otherwise the decay setting is silently ignored
    model.compile(loss='mse', optimizer=adam, metrics=['accuracy'])
    model.summary()
    return model
model = build_model(shape, neurons, dropout, decay)
model.fit(
    X_train,
    y_train,
    batch_size=512,
    epochs=epochs,
    validation_split=0.01,
    verbose=1)
def model_score(model, X_train, y_train, X_test, y_test):
    trainScore = model.evaluate(X_train, y_train, verbose=0)
    print('Train Score: %.5f MSE (%.2f RMSE)' % (trainScore[0], math.sqrt(trainScore[0])))
    testScore = model.evaluate(X_test, y_test, verbose=0)
    print('Test Score: %.5f MSE (%.2f RMSE)' % (testScore[0], math.sqrt(testScore[0])))
    return trainScore[0], testScore[0]

model_score(model, X_train, y_train, X_test, y_test)
def percentage_difference(model, X_test, y_test):
    percentage_diff = []
    p = model.predict(X_test)
    for u in range(len(y_test)):  # for each data index in test data
        pr = p[u][0]  # pr = prediction on day u
        # parentheses fixed: the original computed pr - (y_test[u] / pr)
        percentage_diff.append(((pr - y_test[u]) / pr) * 100)
    print('Prediction duration: ', str(datetime.timedelta(seconds=(time.time() - global_start_time))))
    return p
def denormalize(stock_name, normalized_value):
    """
    Re-download the stock data, fit the scaler on its Adj Close values,
    and inverse-transform the given normalized values.
    """
    df = quandl.get_table('WIKI/PRICES', ticker=stock_name)
    df.drop(['ticker', 'open', 'high', 'low', 'close', 'ex-dividend', 'volume', 'split_ratio'], 1, inplace=True)
    df.set_index('date', inplace=True)
    # Renaming all the columns so that we can use the old version code
    df.rename(columns={'adj_open': 'Open', 'adj_high': 'High', 'adj_low': 'Low', 'adj_volume': 'Volume', 'adj_close': 'Adj Close'}, inplace=True)
    df.dropna(inplace=True)
    df = df['Adj Close'].values.reshape(-1, 1)
    normalized_value = normalized_value.reshape(-1, 1)
    # return df.shape, p.shape
    min_max_scaler = preprocessing.MinMaxScaler()
    a = min_max_scaler.fit_transform(df)
    new = min_max_scaler.inverse_transform(normalized_value)
    return new
def plot_result(stock_name, normalized_value_p, normalized_value_y_test):
    newp = denormalize(stock_name, normalized_value_p)
    newy_test = denormalize(stock_name, normalized_value_y_test)
    # newy_test = np.roll(newy_test, 1, 0)
    plt2.plot(newp, color='red', label='Prediction')
    plt2.plot(newy_test, color='blue', label='Actual')
    plt2.legend(loc='best')
    plt2.title('Global run time {}'.format(str(datetime.timedelta(seconds=(time.time() - global_start_time)))))
    plt2.xlabel('Days')
    plt2.ylabel('Adjusted Close')
    plt2.show()
def predict_sequences_multiple(model, data, window_size, prediction_len):
    # Predict prediction_len steps, then shift the prediction run forward by prediction_len steps
    prediction_seqs = []
    for i in range(int(len(data) / prediction_len)):
        curr_frame = data[i * prediction_len]
        predicted = []
        for j in range(prediction_len):
            predicted.append(model.predict(curr_frame[newaxis, :, :])[0, 0])
            curr_frame = curr_frame[1:]
            curr_frame = np.insert(curr_frame, [window_size - 1], predicted[-1], axis=0)
        prediction_seqs.append(predicted)
    print('Prediction duration: ', str(datetime.timedelta(seconds=(time.time() - global_start_time))))
    return prediction_seqs
def plot_results_multiple(predicted_data, true_data, prediction_len):
    fig = plt.figure(facecolor='white')
    ax = fig.add_subplot(111)
    ax.plot(true_data, label='True Data')
    # Pad the list of predictions to shift it in the graph to its correct start
    for i, data in enumerate(predicted_data):
        padding = [None for p in range(i * prediction_len)]
        plt.plot(padding + data, label='Prediction')
        plt.legend()
    plt.title('Global run time {}'.format(str(datetime.timedelta(seconds=(time.time() - global_start_time)))))
    plt.show()
#Single step prediction
#p = percentage_difference(model, X_test, y_test)
#plot_result(stock_name, p, y_test)
#Multiple step prediction
predictions = predict_sequences_multiple(model, X_test, seq_len, 2)
plot_results_multiple(predictions, y_test, 2)

Items of feature_columns must be a _FeatureColumn

I am getting this error:
ValueError: Items of feature_columns must be a _FeatureColumn. Given
(type ): Index(['CreditScore',
'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
'IsActiveMember', 'EstimatedSalary', 'Exited'],
dtype='object').
I am using the TensorFlow library. I want to get prediction results, but I cannot run m.train(input_fn=get_input_fn, steps=5000). I always get the same error whatever I do. I used the input functions below, but nothing changed.
def input_fn_train():
    x = tf.constant(df_train.astype(np.float64))
    y = tf.constant(df_train[LABEL].astype(np.float64))
    return x, y
and
def get_input_fn(data_set, num_epochs=None, shuffle=False):
    return tf.estimator.inputs.pandas_input_fn(
        x=pd.DataFrame({k: data_set[k].values for k in data_set.columns}),
        y=pd.Series(data_set[LABEL].values),
        num_epochs=num_epochs,
        shuffle=shuffle)
I cannot understand what I should do. What is the error about? I've been googling but haven't found anything useful. How can I handle this error? The code is below. Thanks!
import pandas as pd
import tensorflow as tf
import numpy as np
import tempfile
COLS = ["RowNumber", "CustomerId", "Surname", "CreditScore", "Geography",
        "Gender", "Age", "Tenure", "Balance", "NumOfProducts", "HasCrCard",
        "IsActiveMember", "EstimatedSalary", "Exited"]
FEATURES = ["CreditScore", "Age", "Tenure", "Balance", "NumOfProducts",
            "HasCrCard", "IsActiveMember", "EstimatedSalary"]
LABEL = "Exited"
df_train = pd.read_csv("Churn_Modelling.csv", skipinitialspace=True, header=0)
df_test = pd.read_csv("Churn_Modelling.csv", skipinitialspace=True, header=0)
test_label = df_test[LABEL].astype(float)
df_test.drop("Surname", axis = 1, inplace=True)
df_test.drop("RowNumber", axis = 1, inplace=True)
df_test.drop("CustomerId", axis = 1, inplace=True)
df_train.drop("CustomerId", axis = 1, inplace=True)
df_train.drop("Surname", axis = 1, inplace=True)
df_train.drop("RowNumber", axis = 1, inplace=True)
df_train.drop("Geography", axis = 1, inplace=True)
df_train.drop("Gender", axis = 1, inplace=True)
def get_input_fn():
    return {'x': tf.constant(df_train[FEATURES].as_matrix(), tf.float32, df_train.shape),
            'y': tf.constant(df_train[LABEL].as_matrix(), tf.float32, df_train.shape)}

df = df_train.select_dtypes(exclude=['object'])
numeric_cols = df.columns
m = tf.estimator.LinearClassifier(model_dir=model_dir, feature_columns=[numeric_cols])
m.train(input_fn=get_input_fn, steps=5000)
results = m.evaluate(input_fn=get_input_fn(df_test, num_epochs=1, shuffle=False), steps=None)
y = m.predict(input_fn=get_input_fn(df_test, num_epochs=1, shuffle=False))
pred = list(y)
rowNumber = 0
for i in pred:
    print(str(rowNumber) + ': ' + str(pred[i]))
    rowNumber = rowNumber + 1
Your first mistake is how you create the tf.estimator.LinearClassifier. You're passing the dataframe index df.columns into feature_columns, but you should pass a list of TensorFlow feature columns. The columns should define whether they're numerical or categorical, and in the latter case the encoding type.
Secondly, the input function can be simplified a lot, since you're reading a pandas dataframe: just use tf.estimator.inputs.pandas_input_fn.
Your .csv is most likely different; I've made a dummy one with some values. So here's a way to read the input and fit the model correctly:
import pandas as pd
import tensorflow as tf
FEATURES = ["CreditScore", "Age", "Tenure", "Balance", "NumOfProducts",
"HasCrCard", "IsActiveMember", "EstimatedSalary", "Exited"]
credit_score = tf.feature_column.numeric_column("CreditScore")
age = tf.feature_column.numeric_column("Age")
tenure = tf.feature_column.numeric_column("Tenure")
balance = tf.feature_column.numeric_column("Balance")
num_of_products = tf.feature_column.numeric_column("NumOfProducts")
has_card = tf.feature_column.categorical_column_with_vocabulary_list("HasCrCard", ["True", "False"])
is_active_member = tf.feature_column.categorical_column_with_vocabulary_list("IsActiveMember", ["True", "False"])
estimated_salary = tf.feature_column.numeric_column("EstimatedSalary")
feature_columns = [credit_score, age, tenure, balance, num_of_products, has_card, is_active_member, estimated_salary]
def input_fn(num_epochs=None, shuffle=True, batch_size=100):
    df = pd.read_csv('Churn_Modelling.csv',
                     names=FEATURES,
                     dtype={'HasCrCard': str, 'IsActiveMember': str},
                     skipinitialspace=True,
                     header=0)
    df = df.dropna(how='any', axis=0)  # remove NaN elements
    labels = df["Exited"]
    return tf.estimator.inputs.pandas_input_fn(x=df,
                                               y=labels,
                                               batch_size=batch_size,
                                               num_epochs=num_epochs,
                                               shuffle=shuffle,
                                               num_threads=5)

model = tf.estimator.LinearClassifier(model_dir=None,
                                      feature_columns=feature_columns)
model.train(input_fn=input_fn(), steps=100)
The following is working cleanly:
import pandas as pd
import tensorflow as tf
import tempfile
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
def split_data(data, rate, label):
    data = data.dropna()
    train_data, test_data = train_test_split(data, test_size=rate)
    train_label = train_data[label]
    train_data = train_data.drop(label, 1)
    test_label = test_data[label]
    test_data = test_data.drop(label, 1)
    return train_data, train_label, test_data, test_label
LABEL = "Exited"
data = pd.read_csv("Churn_Modelling.csv", skipinitialspace=True, header=0)
data.drop("Surname", axis=1, inplace=True)
data.drop("RowNumber", axis=1, inplace=True)
data.drop("CustomerId", axis=1, inplace=True)
data.drop("Geography", axis=1, inplace=True)
data.drop("Gender", axis=1, inplace=True)
x_train, y_train, x_test, y_test = split_data(data, 0.20, LABEL)
def get_input_fn_train():
    input_fn = tf.estimator.inputs.pandas_input_fn(
        x=x_train,
        y=y_train,
        shuffle=False
    )
    return input_fn

def get_input_fn_test():
    input_fn = tf.estimator.inputs.pandas_input_fn(
        x=x_test,
        y=y_test,
        shuffle=False
    )
    return input_fn

feature_columns = tf.contrib.learn.infer_real_valued_columns_from_input_fn(get_input_fn_train())
model_dir = tempfile.mkdtemp()
m = tf.estimator.LinearClassifier(model_dir=model_dir, feature_columns=feature_columns)
# train data
m.train(input_fn=get_input_fn_train(), steps=5000)
# you can get accuracy, accuracy_baseline, auc, auc_precision_recall,
# average_loss, global_step, label/mean, loss, prediction/mean
results = m.evaluate(input_fn=get_input_fn_test(), steps=None)
print("model directory = %s" % model_dir)
for key in sorted(results):
    print("%s: %s" % (key, results[key]))
# get prediction results
y = m.predict(input_fn=get_input_fn_test())
predictions = list(y)
pred1=pd.DataFrame(data=predictions)
prediction=pd.DataFrame(data=pred1['class_ids'])
pred = []
for row in prediction["class_ids"]:
    pred.append(row[0])

rowNumber = 0
for i in pred:
    print(str(rowNumber) + ': ' + str(i))
    rowNumber = rowNumber + 1
def calculate(prediction, LABEL):
    arr = {"accuracy": accuracy_score(prediction, LABEL),
           "report": classification_report(prediction, LABEL),
           "Confusion_Matrix": confusion_matrix(prediction, LABEL),
           "F1 score": f1_score(prediction, LABEL),
           "Recall Score": recall_score(prediction, LABEL),
           "cohen_kappa": cohen_kappa_score(prediction, LABEL)
           }
    return arr
pred2 = pd.DataFrame(data=pred)
print(calculate(pred2.round(), y_test))
I'm going to make some small changes to @Maxim's answer (thanks, btw) and post a minimum working example with random numpy data. This seems to run fine on my Windows machine. Note the suppressed warning due to my particular hardware.
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
import pandas as pd
import numpy as np
import tensorflow as tf
FEATURES = ["CreditScore", "Age", "Tenure", "Balance", "NumOfProducts", "EstimatedSalary", "Exited"]
credit_score = tf.feature_column.numeric_column("CreditScore")
age = tf.feature_column.numeric_column("Age")
tenure = tf.feature_column.numeric_column("Tenure")
balance = tf.feature_column.numeric_column("Balance")
num_of_products = tf.feature_column.numeric_column("NumOfProducts")
estimated_salary = tf.feature_column.numeric_column("EstimatedSalary")
feature_columns = [credit_score, age, tenure, balance, num_of_products, estimated_salary]
def input_fn(num_epochs=None, shuffle=True, batch_size=100):
    N_features = len(FEATURES)
    print(N_features)
    N_examples = 5000
    X_train = np.random.rand(N_examples, N_features)
    Y_train = np.random.rand(N_examples)
    columns = [str(i) for i in range(N_features)]
    columns = FEATURES
    df = pd.DataFrame(data=X_train, columns=columns)
    labels = df["Exited"]
    return tf.estimator.inputs.pandas_input_fn(x=df,
                                               y=labels,
                                               batch_size=batch_size,
                                               num_epochs=num_epochs,
                                               shuffle=shuffle,
                                               num_threads=5)

model = tf.estimator.LinearClassifier(model_dir='model_dir',
                                      feature_columns=feature_columns)
model.train(input_fn=input_fn(), steps=100)
