How can I plug a prediction of X into BernoulliNB.predict_proba?

I have a Python script that, given an observed class (X) and some columns of binary features (Y), predicts a class (Pred_X). It then predicts the probability of each class (Prob_1, etc.). How could I get the probability of only the observed class (Prob_X), please?
import pandas as pd
from sklearn.naive_bayes import BernoulliNB

BNB = BernoulliNB()
# Data
df_1 = pd.DataFrame({'X' : [1,2,1,1,1,2,1,2,2,1],
                     'Y1': [1,0,0,1,0,0,1,1,0,1],
                     'Y2': [0,0,1,0,0,1,0,0,1,0],
                     'Y3': [1,0,0,0,0,0,1,0,0,0]})
# Split the data into features and target
df_I = df_1.loc[:, ['Y1', 'Y2', 'Y3']]
S_O = df_1['X']
# Bernoulli Naive Bayes Classifier
A_F = BNB.fit(df_I, S_O)
# Predict X
A_P = BNB.predict(df_I)
df_P = pd.DataFrame(A_P)
df_P.columns = ['Pred_X']
# Predict the probability of each class
A_R = BNB.predict_proba(df_I)
df_R = pd.DataFrame(A_R)
df_R.columns = ['Prob_1', 'Prob_2']
# Join the predictions back onto the original frame
df_1 = df_1.join(df_P)
df_1 = df_1.join(df_R)

Thanks @jezrael:
# Rename the columns after the classes of X
# (BNB.classes_ gives the sorted class order that predict_proba's columns follow,
# which is safer than df_1['X'].unique(), whose order is order of appearance)
classes = BNB.classes_
df_R.columns = classes
# Look up the predicted probability of the observed class X
df_1['Prob_X'] = df_R.lookup(df_R.index, df_1.X)
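Note that DataFrame.lookup was deprecated in pandas 1.2 and removed in 2.0. On newer pandas, a NumPy sketch of the same lookup, reusing the arrays already computed above:
import numpy as np
# predict_proba's column j corresponds to BNB.classes_[j]; classes_ is sorted,
# so searchsorted maps each observed class to its probability column
col_idx = np.searchsorted(BNB.classes_, df_1['X'])
df_1['Prob_X'] = A_R[np.arange(len(A_R)), col_idx]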

Related

error when using SKforecast: UserWarning: `y` has DatetimeIndex index but no frequency. Index is overwritten with a RangeIndex of step 1

I am trying to use skforecast for time series analysis, but I am getting a warning telling me that the df has no frequency because the index is not a DatetimeIndex, when in fact it is.
Here is the code:
import yfinance as yf
import datetime as dt
# imports needed by the forecaster code further down
from lightgbm import LGBMRegressor
from skforecast.ForecasterAutoreg import ForecasterAutoreg
from skforecast.model_selection import grid_search_forecaster

spxl = yf.Ticker("SPXL")
hist = spxl.history(start="2015-01-01")
hist = hist.asfreq("D")
data = hist.dropna()
type(data.index)
#Output: pandas.core.indexes.datetimes.DatetimeIndex
#Split data into train-val-test
#==============================================================================
data = data.loc['2015-01-01': '2022-12-31']
end_train = '2019-12-31'
end_validation = '2020-12-31'
data_train = data.loc[: end_train, :].copy()
data_val = data.loc[end_train:end_validation, :].copy()
data_test = data.loc[end_validation:, :].copy()
#Create forecaster
#==============================================================================
forecaster = ForecasterAutoreg(
    regressor = LGBMRegressor(),
    lags = 7
)
#Grid search of hyper-parameters and lags
#==============================================================================
#Regressor hyper-parameters
param_grid = {
    'n_estimators': [100, 500],
    'max_depth': [3, 5, 10],
    'learning_rate': [0.01, 0.1]
}
#Lags used as predictors
lags_grid = [7]
Here is where the warning is triggered, when running the grid search:
results_grid_q10 = grid_search_forecaster(
    forecaster = forecaster,
    y = data.loc[:end_validation, 'Close'],
    param_grid = param_grid,
    lags_grid = lags_grid,
    steps = 7,
    refit = True,
    metric = 'mean_squared_error',
    initial_train_size = int(len(data_train)),
    fixed_train_size = False,
    return_best = True,
    verbose = False
)
I cannot seem to understand what I am doing wrong!
In case someone is facing the same problem with skforecast or any other time series forecasting library, here is the solution from the library creators:
https://github.com/JoaquinAmatRodrigo/skforecast/issues/329
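In short (my reading of the code above, consistent with the warning text): asfreq("D") gives the index a daily frequency, but the subsequent dropna() removes the non-trading days, and pandas clears the index's freq attribute when rows are dropped. The index is still a DatetimeIndex; what it lacks is a frequency, which is exactly what the warning says:
print(type(data.index))  # <class 'pandas.core.indexes.datetimes.DatetimeIndex'>
print(data.index.freq)   # None -- rows were dropped, so the daily freq was lost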

How to predict price for a future day given historical data

So currently this is the code I have. Not attached are various graphs I have made that show the actual stock price from the CSV and then my projections. I want to simply predict tomorrow's stock price given all of this historical data, but I'm having a difficult time. The "df.loc[len(df.index)] = ['2022-04-05',0,0,0,0,0,0]" lines are where I was trying to put the predictions for future days, although I am open to other ways.
# Machine learning
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
# For data manipulation
import pandas as pd
import numpy as np
# To plot
import matplotlib.pyplot as plt
plt.style.use('seaborn-darkgrid')
# To ignore warnings
import warnings
warnings.filterwarnings("ignore")
# Read the historical data from CSV
df = pd.read_csv('data_files/MSFT.csv')
#add extra row of blank data for future prediction
df.loc[len(df.index)] = ['2022-04-05',0,0,0,0,0,0]
df.loc[len(df.index)] = ['2022-04-06',0,0,0,0,0,0]
df.loc[len(df.index)] = ['2022-04-07',0,0,0,0,0,0]
df.loc[len(df.index)] = ['2022-04-08',0,0,0,0,0,0]
# Changes The Date column as index columns
df.index = pd.to_datetime(df['Date'])
# drop The original date column
df = df.drop(['Date'], axis='columns')
print(df)
# Create predictor variables
df['Open-Close'] = df.Open - df.Close
df['High-Low'] = df.High - df.Low
# Store all predictor variables in a variable X
X = df[['Open-Close', 'High-Low']]
X.head()
# Target variables
y = np.where(df['Close'].shift(-1) > df['Close'], 1, 0)
print(y)
split_percentage = 0.8
split = int(split_percentage*len(df))
# Train data set
X_train = X[:split]
y_train = y[:split]
# Test data set
X_test = X[split:]
y_test = y[split:]
# Support vector classifier
cls = SVC().fit(X_train, y_train)
df['Predicted_Signal'] = cls.predict(X)
# Calculate daily returns
df['Return'] = df.Close.pct_change()
# Calculate strategy returns
df['Strategy_Return'] = df.Return * df.Predicted_Signal.shift(1)
# Calculate cumulative returns
df['Cum_Ret'] = df['Return'].cumsum()
# Plot Strategy Cumulative returns
df['Cum_Strategy'] = df['Strategy_Return'].cumsum()
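A minimal sketch of turning this into a next-day prediction. Note the classifier above predicts direction, not price, so "predict tomorrow" here means tomorrow's up/down signal (an assumption), and row -5 is the last real trading day before the four appended blank rows:
# Signal for the last real trading day: 1 = tomorrow's close expected above today's
last_real_day = X.iloc[[-5]]
print(cls.predict(last_real_day))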

How do I create a linear regression model for a file that has about 500 columns as y variables? Working with Python

This code manually selects a column from the y table and then joins it to the X table. The program then performs linear regression. Any idea how to do this for every single column from the y table?
import pandas as pd
from sklearn import linear_model

yDF = pd.read_csv('ytable.csv')
yDF.drop('Dates', axis=1, inplace=True)
XDF = pd.read_csv('Xtable.csv')
ycolumnDF = yDF.iloc[:, 0].to_frame()
regressionDF = pd.concat([XDF, ycolumnDF], axis=1)
X = regressionDF.iloc[:, 1:20]
y = regressionDF.iloc[:, 20:].squeeze()
lm = linear_model.LinearRegression()
lm.fit(X, y)
cf = lm.coef_
print(cf)
You can regress multiple y's on the same X's at the same time. Something like this should work:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

df_X = pd.DataFrame(columns=['x1', 'x2', 'x3'], data=np.random.normal(size=(10, 3)))
df_y = pd.DataFrame(columns=['y1', 'y2'], data=np.random.normal(size=(10, 2)))
X = df_X.iloc[:, :]
y = df_y.iloc[:, :]
lm = LinearRegression().fit(X, y)
print(lm.coef_)
produces
[[ 0.16115884 0.08471495 0.39169592]
[-0.51929011 0.29160846 -0.62106353]]
The first row here ([ 0.16115884 0.08471495 0.39169592]) contains the regression coefficients of y1 on the x's, and the second row contains the coefficients of y2 on the x's.
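Applied to the question's files, the same multi-output fit would look like this (a sketch; the file and column names are taken from the question above):
import pandas as pd
from sklearn.linear_model import LinearRegression

# Assumes Xtable.csv holds the predictors and ytable.csv the ~500 targets plus Dates
XDF = pd.read_csv('Xtable.csv')
yDF = pd.read_csv('ytable.csv').drop('Dates', axis=1)
lm = LinearRegression().fit(XDF, yDF)
print(lm.coef_.shape)  # (n_targets, n_features): one row of coefficients per y column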

How to get feature importance in RF

I am trying to get RF feature importance. I fit the random forest on the data like this:
model = RandomForestRegressor()
n = model.fit(self.X_train, self.y_train)
if n is not None:
    df = pd.DataFrame(data=n, columns=["Feature", "Importance_Score"])
    df["Feature_Name"] = np.array(self.X_Headers)
    df = df.drop(["Feature"], axis=1)
    df[["Feature_Name", "Importance_Score"]].to_csv("RF_Importances.csv", index=False)
    del df
However, the n variable returns None. Why is this happening?
I'm not very sure how model.fit(self.X_train, self.y_train) is supposed to work here; we would need more information about how you set up the model.
If we set this up using simulated data, it works:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

np.random.seed(111)
X = pd.DataFrame(np.random.normal(0, 1, (100, 5)), columns=['A', 'B', 'C', 'D', 'E'])
y = np.random.normal(0, 1, 100)
model = RandomForestRegressor()
n = model.fit(X, y)
if n is not None:
    df = pd.DataFrame({'features': X.columns, 'importance': n.feature_importances_})
df
  features  importance
0        A    0.176091
1        B    0.183817
2        C    0.169927
3        D    0.267574
4        E    0.202591
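As to why n should not be None: in scikit-learn, fit() returns the estimator itself, so n is simply the fitted model. A quick check:
n = model.fit(X, y)
print(n is model)  # True: fit() returns self, so n is never None after a successful fit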

Do you have to create dummy variables for ordinal variables? Also getting error with conversion

For the dataset that I am working with, the categorical variables are ordinal, ranging from 1 to 5 for three columns. I am going to be feeding this into XGBoost.
Would I be okay to just run this command and skip creating dummy variables:
ser = pd.Series([1, 2, 3], dtype='category')
ser = ser.to_frame()
ser = ser.T
I would like to know conceptually: since the categorical data is ordinal, would simply converting it to type category be adequate for the model? I tried creating dummy variables, but all the values become 1.
As for the code now, it runs, but this command returns 'numpy.int64':
type(ser[0][0])
Am I going about this correctly? Any help would be great!
Edit: updated code
Edit 2: Normalizing the numerical data values. Is this logic correct?
from sklearn import preprocessing

r = [1, 2, 3, 100, 200]
scaler = preprocessing.StandardScaler()
r = preprocessing.scale(r)
r = pd.Series(r)
r = r.to_frame()
r = r.T
Edit 3: This is the dataset.
Just setting categorical variables to dtype="category" is not sufficient, and it won't work.
You need to convert the categorical values to numerical labels, e.g. with pd.factorize(), which assigns each category an integer label.
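For instance, pd.factorize() returns the integer codes together with the unique labels (a small illustrative example):
import pandas as pd
codes, uniques = pd.factorize(pd.Series(['low', 'high', 'mid', 'low']))
print(codes)    # [0 1 2 0] -- one integer label per row, in order of appearance
print(uniques)  # Index(['low', 'high', 'mid'], dtype='object')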
Let's say df is your pandas dataframe. Then in general you could use this boilerplate code:
df_numeric = df.select_dtypes(exclude=['object'])
df_obj = df.select_dtypes(include=['object']).copy()
# factorize categorical columns columnwise
for c in df_obj:
    df_obj[c] = pd.factorize(df_obj[c])[0]
# if you want to one hot encode then add this line:
df_obj = pd.get_dummies(df_obj, prefix_sep='_', drop_first=True)
# merge the dataframes back into one dataframe
df_final = pd.concat([df_numeric, df_obj], axis=1)
Since your categorical variables are already factorized (as far as I understand), you can skip the factorization and just try one-hot encoding.
See also this post on stats.stackexchange.
If you want to standardize/normalize your numerical data (not the categorical) use this function:
from sklearn import preprocessing

def scale_data(data, scale="robust"):
    x = data.values
    if scale == "minmax":
        scaler = preprocessing.MinMaxScaler()
        x_scaled = scaler.fit_transform(x)
    elif scale == "standard":
        scaler = preprocessing.StandardScaler()
        x_scaled = scaler.fit_transform(x)
    elif scale == "quantile":
        scaler = preprocessing.QuantileTransformer()
        x_scaled = scaler.fit_transform(x)
    elif scale == "robust":
        scaler = preprocessing.RobustScaler()
        x_scaled = scaler.fit_transform(x)
    data = pd.DataFrame(x_scaled, columns=data.columns)
    return data

scaled_df = scale_data(df_numeric, "robust")
Putting it all together for your dataset:
from sklearn import preprocessing

df = pd.read_excel("default of credit card clients.xls", skiprows=1)
y = df['default payment next month']  # target variable
del df['default payment next month']
c = [2, 3, 4]  # indices of the categorical data columns
r = list(range(0, 24))
r = [x for x in r if x not in c]  # indices of all other columns
df_cat = df.iloc[:, [2, 3, 4]].copy()
df_con = df.iloc[:, r].copy()
# factorize categorical data
for col in df_cat:
    df_cat[col] = pd.factorize(df_cat[col])[0]
# scale continuous data
scaler = preprocessing.MinMaxScaler()
df_scaled = scaler.fit_transform(df_con)
df_scaled = pd.DataFrame(df_scaled, columns=df_con.columns)
df_final = pd.concat([df_cat, df_scaled], axis=1)
# reorder the columns back to the original order
cols = df.columns
df_final = df_final[cols]
To further improve the code, do the train/test split before normalization, call fit_transform() on the training data, and use just transform() on the test data. Otherwise you will have a data leak.
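A minimal sketch of that split-then-scale order, using the continuous columns from above (the 0.2 test size and random_state are illustrative assumptions):
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df_con, y, test_size=0.2, random_state=42)
scaler = preprocessing.MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)  # learn min/max from the training data only
X_test_scaled = scaler.transform(X_test)        # reuse the training statistics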
