I'm at a loss as to what's happening here.
I'm downloading historical stock data with Pandas Datareader, and after some small manipulations (ie. re-arranging the dataframe, adding moving averages, etc.), I pass the dataframe to FeatureTools to do a quick Auto Feature Engineering, which it does fine by adding new columns to the dataframe...
BUT then I pass it to FeatureSelector (to remove all columns that are highly correlated, have no importance, etc.) but I receive an issue where FeatureSelector cannot find the "label" column in the dataset that I'm trying to point it to anymore (Adj Close). I'm new to FeatureSelector so I'm not entirely sure how to use it yet. From there, it will pass the data on to TPOT to do an Auto Regression.
I have included my full code here, I know you're not supposed to, but it will be a working code for anyone to be able to try and see my issue on their side. The error I get is:
KeyError: "labels ['Adj Close'] not contained in axis"
It would appear that FeatureSelector is removing the "Adj Close" label/column during the removal step, but I thought that was why we assign it to the internal "label=" part? Any suggestions would be great. Would love to get this working. Just type in a ticker symbol to get started (ex. CLVS). Thanks!
ticker_input = input('Which stock ticker would you like to predict?') # Start with CLVS for testing
print('Getting the historical data for: ',ticker_input)
# Downloading historical data as dataframe
from datetime import datetime
from pandas_datareader import data as web
import pandas as pd
ex = 'yahoo'
start = datetime(2010, 1, 1)
end = datetime.now()
df = web.DataReader(ticker_input, ex, start, end).reset_index()
# Create the prediction dataset
df = df.drop(['Close'],axis=1)
df['PrevHi'] = df['High'].shift(1)
df['PrevLo'] = df['Low'].shift(1)
df['PrevClose'] = df['Adj Close'].shift(1)
df['PrevVol'] = df['Volume'].shift(1)
df['PrevOpen'] = df['Open'].shift(1)
df = df.drop(['High','Low','Volume'],axis=1)
# Get the 9 and 20 MA values
df['9MA'] = df['Open'].rolling(window=9).mean()
df['20MA'] = df['Open'].rolling(window=20).mean()
import time
# Reshape the df and drop the warm-up rows.
# shift(1) and the rolling means above leave NaN in the first rows of the frame.
# dropna() returns a NEW frame rather than modifying df2 in place, so its result
# has to be assigned. how='all' only drops rows in which EVERY column is NaN;
# the default (how='any') drops rows that have at least one missing value.
df2 = df[['Date','Open','PrevOpen','PrevHi','PrevLo','PrevClose','PrevVol','9MA','20MA','Adj Close']].dropna()
# Auto Feature Engineering using Feature Tools
import featuretools as ft
#print(ft.list_primitives().to_string()) # To get full list of primitives that could be used
print('Adding the engineered features to the dataframe. This may take a while...')
es = ft.EntitySet(id = 'stockdata')
es.entity_from_dataframe(entity_id = 'data', dataframe = df2,
make_index = False,index = 'Date')
# Run deep feature synthesis with transformation primitives
feature_matrix, feature_defs = ft.dfs(entityset = es, target_entity = 'data', max_depth=2,verbose=True,
agg_primitives = ['skew','mean','median',
'all','count','num_unique','trend','max','mode',
'std','sum','min'],
trans_primitives = ['divide_numeric'])
# 'diff',
# 'greater_than',
# 'less_than_equal_to',
# 'cum_mean',
# 'time_since',
# 'cum_sum',
# 'add_numeric',
# 'multiply_numeric',
# 'greater_than_equal_to',
# 'negate',
# 'cum_min',
# 'subtract_numeric',
# 'not',
# 'cum_count',
# 'modulo_numeric',
# 'less_than'])
print(feature_matrix.head())
df2 = feature_matrix
df2.to_csv('FeatureMatrix.csv')
# Trying to now name all the feature columns and label for FeatureSelector...
features = df2.drop(['Adj Close'],axis=1)
label = df2['Adj Close'].values
# Now, drop all columns of low importance
from feature_selector import FeatureSelector
fs = FeatureSelector(data = features, labels = label)
fs.identify_all(selection_params = {'missing_threshold': 0.6,
'correlation_threshold': 0.98,
'task': 'regression',
'eval_metric': 'mse',
'cumulative_importance': 0.99})
df2 = fs.remove(methods = 'all')
# Somewhere above it's not recognizing my Adj Close label anymore?
# Training dataset
df = df2.iloc[:-90] # subtracting 90 rows/days to use as the predictions dataset later
print('Printing training dataframe...')
print(df)
# Prediction dataset for later use
prediction_df = df2.iloc[-90:]
print('Printing prediction dataframe for later use...')
print(prediction_df)
# Can keep adding to the dataset with things like PrevIndustryHi,Lo,Close,Open and other metrics
print('Pausing for 20 seconds to review before training...')
time.sleep(20)
# Now, train a TPOT Regressor
from tpot import TPOTRegressor
from sklearn.model_selection import train_test_split
import os
features = df.drop(['Adj Close'],axis=1)
label = df['Adj Close']
X_train, X_test, y_train, y_test = train_test_split(features, label,
train_size=0.75, test_size=0.25)
# Create a folder to cache the pipeline work (use if not using auto)
# if os.path.exists('./PipelineCache'):
# pass
# else:
# os.mkdir('./PipelineCache')
tpot = TPOTRegressor(generations=10, population_size=40, verbosity=2) #memory='./PipelineCache', memory='auto',
tpot.fit(X_train, y_train)
predictions = (tpot.predict(X_test))
actuals = y_test
last_row = df.tail(1)
print('The last closing price was :')
print(last_row['Adj Close'])
print("TPOT's final score on training data is : ")
print(tpot.score(X_test, y_test))
# Make sure the export folder exists. exist_ok=True makes this a no-op when the
# folder is already there and avoids the check-then-create race.
os.makedirs('./Exported Pipelines', exist_ok=True)
tpot.export('./Exported Pipelines/1day-prediction-pipeline.py')
# Now, use the TPOT model to predict on the held out predictions dataset
from sklearn.metrics import mean_squared_error
features = prediction_df.drop(['Adj Close'], axis=1)
labels = prediction_df['Adj Close']
# Fit the model to the prediction_df and predict the labels
#tpot.fit(features, labels)
results = tpot.predict(features)
# predict() already yields one value per row of `features`, so a manual copy
# loop is not needed.
predictions_list = list(results)
# prediction_df is a slice of df2 (df2.iloc[-90:]); assign() builds a new frame
# instead of writing into that slice, which avoids SettingWithCopyWarning.
prediction_df = prediction_df.assign(Predictions=predictions_list)
prediction_df.to_csv('PredictionsPerformance.csv', index=True)
print('The Mean Square Error of the predictions is :')
print(mean_squared_error(labels,results))
print('DONE!')
# Clear the cache directory when you don't need it anymore.
# If you're testing the same dataset over and over, use the
# same cache file
#from shutil import rmtree
#rmtree('./PipelineCache')
As a workaround, I just re-added the df column with the adj close in it, after the removal process, like so:
# Split the frame into the feature columns and the label for FeatureSelector.
# Note that 'Adj Close' is removed from `features` here, before the selector
# ever sees the data.
features = df.drop("Adj Close", axis=1)
label = df["Adj Close"]
# Now, drop all columns of low importance
from feature_selector import FeatureSelector
fs = FeatureSelector(data = features, labels = label)
fs.identify_all(selection_params = {'missing_threshold': 0.6,
'correlation_threshold': 0.98,
'task': 'regression',
'eval_metric': 'mse',
'cumulative_importance': 0.99})
# Show which columns are about to be removed
all_to_remove = fs.check_removal()
print(all_to_remove[:])
df = fs.remove(methods = 'all')
# Re-add 'Adj Close': FeatureSelector was only given `features`, which does not
# contain the label column, so the frame it hands back cannot contain it either.
# NOTE(review): assumes remove() returns the pruned `data` frame - confirm.
df['Adj Close'] = label
Related
so currently this is the code I have. Not attached are various graphs that I have made that show the actual stock price from the CSV and then my projections. I'm wanting to make it where I simply predict tomorrow's stock price given all of this historical data but I'm having a difficult time. The "df.loc[len(df.index)] = ['2022-04-05',0,0,0,0,0,0]" was where I was trying to put the predictions for future days although I am open to other ways.
# Machine learning
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
# For data manipulation
import pandas as pd
import numpy as np
# To plot
import matplotlib.pyplot as plt
plt.style.use('seaborn-darkgrid')
# To ignore warnings
import warnings
warnings.filterwarnings("ignore")
# method of pandas
df = pd.read_csv('data_files/MSFT.csv')
#add extra row of blank data for future prediction
df.loc[len(df.index)] = ['2022-04-05',0,0,0,0,0,0]
df.loc[len(df.index)] = ['2022-04-06',0,0,0,0,0,0]
df.loc[len(df.index)] = ['2022-04-07',0,0,0,0,0,0]
df.loc[len(df.index)] = ['2022-04-08',0,0,0,0,0,0]
# Changes The Date column as index columns
df.index = pd.to_datetime(df['Date'])
# drop The original date column
df = df.drop(['Date'], axis='columns')
print(df)
# Create predictor variables
df['Open-Close'] = df.Open - df.Close
df['High-Low'] = df.High - df.Low
# Store all predictor variables in a variable X
X = df[['Open-Close', 'High-Low']]
X.head()
# Target variables
y = np.where(df['Close'].shift(-1) > df['Close'], 1, 0)
print(y)
split_percentage = 0.8
split = int(split_percentage*len(df))
# Train data set
X_train = X[:split]
y_train = y[:split]
# Test data set
X_test = X[split:]
y_test = y[split:]
# Support vector classifier
cls = SVC().fit(X_train, y_train)
df['Predicted_Signal'] = cls.predict(X)
# Calculate daily returns
df['Return'] = df.Close.pct_change()
# Calculate strategy returns
df['Strategy_Return'] = df.Return * df.Predicted_Signal.shift(1)
# Calculate Cumulutive returns
df['Cum_Ret'] = df['Return'].cumsum()
# Plot Strategy Cumulative returns
df['Cum_Strategy'] = df['Strategy_Return'].cumsum()
I'm trying to standardize a dataset in Python as part of Principal Component Analysis. I've managed to do the following so far:
column_names = ['Sample code', 'Clump Thickness', 'Uniformity of Cell Size', 'Uniformity of Cell Shape',
                'Marginal Adhesion', 'Single Epithelial Cell Size', 'Bare Nuclei', 'Bland Chromatin',
                'Normal Nucleoli', 'Mitoses','Class']
# '?' marks a missing value in this file. Declaring it via na_values lets pandas
# parse every column as numeric. Replacing '?' after loading leaves the affected
# column as strings, and median() then either skips that column or raises,
# depending on the pandas version, so its gaps are never imputed.
cancer_data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data',
                          header=None, names=column_names, na_values='?')
# Impute missing entries with the per-column median
cancer_data = cancer_data.fillna(cancer_data.median())
classDF = cancer_data['Class']
cancer_data = cancer_data.drop(['Class' ,'Sample code'], axis = 1)
# Standardization of data
# After scaling, every column has mean 0 and unit variance up to floating-point
# rounding, so column means on the order of 1e-17 are expected.
standardized = StandardScaler().fit_transform(cancer_data)
x = pd.DataFrame(standardized, columns = cancer_data.columns)
However when I check the Mean values, I get the following output:
array([-5.08256606e-17, -9.14861892e-17, -3.04953964e-17, 5.08256606e-17,
5.08256606e-17, -8.13210570e-17, 3.04953964e-17, -1.32146718e-16,
-8.13210570e-17])
I'm not sure what I'm doing wrong to get these values, so any help is much appreciated (I'm new to data mining).
Use the standardization formula:
# `column` is the name of the column to standardize.
# pandas' std() defaults to the sample standard deviation (ddof=1); pass ddof=0
# if the result has to match StandardScaler exactly.
df_std[column] = (df_std[column] - df_std[column].mean()) / df_std[column].std()
or:
from sklearn.preprocessing import StandardScaler
# create a scaler object
std_scaler = StandardScaler()
std_scaler
# fit and transform the data
df_std = pd.DataFrame(std_scaler.fit_transform(df_cars), columns=column)
Read for more information :
https://towardsdatascience.com/data-normalization-with-pandas-and-scikit-learn-7c1cc6ed6475
I am following the tutorial here; https://www.analyticsvidhya.com/blog/2018/10/predicting-stock-price-machine-learningnd-deep-learning-techniques-python/#comment-155692
Instead of using the provided dataset I am using one needed for my assignment.
The code used is
#import packages
import pandas as pd
import numpy as np
#to plot within notebook
import matplotlib.pyplot as plt
%matplotlib inline
#setting figure size
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 20,10
#for normalizing data
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(0, 1))
#read the file
df = pd.read_csv('C:/Users/Usert/Downloads/stock-20050101-to-20171231/stock-20050101-to-20171231/IBM_2006-01-01_to_2018-01-01.csv')
#print the head
df.head()
#setting index as date
df['Date'] = pd.to_datetime(df.Date,format='%Y-%m-%d')
df.index = df['Date']
#plot
plt.figure(figsize=(16,8))
plt.plot(df['Close'], label='Close Price history')
#creating dataframe with date and the target variable
data = df.sort_index(ascending=True, axis=0)
# Select both columns in one step instead of copying cell by cell. Element-wise
# chained assignment (new_data['Date'][i] = ...) is slow and is not guaranteed
# to modify new_data. reset_index(drop=True) gives the same 0..n-1 integer
# index the hand-built frame had.
new_data = data[['Date', 'Close']].reset_index(drop=True)
#splitting into train and validation
train = new_data[:987]
valid = new_data[987:]
new_data.shape, train.shape, valid.shape
((1235, 2), (987, 2), (248, 2))
train['Date'].min(), train['Date'].max(), valid['Date'].min(), valid['Date'].max()
#make predictions
preds = []
for i in range(0,248):
a = train['Close'][len(train)-248+i:].sum() + sum(preds)
b = a/248
preds.append(b)
#calculate rmse
rms=np.sqrt(np.mean(np.power((np.array(valid['Close'])-preds),2)))
rms
#plot
valid['Predictions'] = 0
valid['Predictions'] = preds
plt.plot(train['Close'])
plt.plot(valid[['Close', 'Predictions']])
This runs fine until "#Calculate RMSE" when it hits the error.
File "<ipython-input-92-1256d885493e>", line 65, in <module>
rms=np.sqrt(np.mean(np.power((np.array(valid['Close'])-preds),2)))
ValueError: operands could not be broadcast together with shapes (2033,) (248,)
Using "print(valid.shape)" and "print(len(preds))" as requested returns "(604, 3)" and "248".
Any idea how I change the numbers to fit my dataset as each time I change the numbers I create more errors?
Just FYI;
The dataset I am using has 7 columns named "Date, Open, High, Low, Close, Volume and Name" with 3021 rows of data including headers.
Whilst the one in the tutorial has 8 columns being "date, open, high, low, last, close, total_trade_quantity, and turnover" with 1236 rows including headers.
All,
My dataset looks like the following. I am trying to predict the 'amount' for the next 6 months using either fbprophet or another model. My issue is that I would like to predict the amount for each group (i.e. A, B, C, D) for the next 6 months, and I am not sure how to do that in Python using fbprophet or another model. I consulted the official fbprophet page, but the only information I found is that Prophet takes only two columns: one is "Date" and the other is "amount".
I am new to python, so any help with code explanation is greatly appreciated!
import pandas as pd
data = {'Date':['2017-01-01', '2017-02-01', '2017-03-01', '2017-04-01','2017-05-01','2017-06-01','2017-07-01'],'Group':['A','B','C','D','C','A','B'],
'Amount':['12.1','13','15','10','12','9.0','5.6']}
df = pd.DataFrame(data)
print (df)
output:
Date Group Amount
0 2017-01-01 A 12.1
1 2017-02-01 B 13
2 2017-03-01 C 15
3 2017-04-01 D 10
4 2017-05-01 C 12
5 2017-06-01 A 9.0
6 2017-07-01 B 5.6
fbprophet requires two columns ds and y, so you need to first rename the two columns
df = df.rename(columns={'Date': 'ds', 'Amount':'y'})
Assuming that your groups are independent from each other and you want to get one prediction for each group, you can group the dataframe by "Group" column and run forecast for each group
from fbprophet import Prophet

# Fit one independent model per group and print the tail of each forecast.
grouped = df.groupby('Group')
for name, group in grouped:
    # Prophet cannot be fitted on fewer than 2 non-NaN rows, so skip groups
    # that are too small instead of letting fit() raise.
    if group['y'].notna().sum() < 2:
        print(f"Skipping group {name}: fewer than 2 data points")
        continue
    m = Prophet()
    m.fit(group)
    future = m.make_future_dataframe(periods=365)
    forecast = m.predict(future)
    print(forecast.tail())
Take note that the input dataframe that you supply in the question is not sufficient for the model because group D only has a single data point. fbprophet's forecast needs at least 2 non-NaN rows.
EDIT: if you want to merge all predictions into one dataframe, the idea is to name the yhat for each observations differently, do pd.merge() in the loop, and then cherry-pick the columns that you need at the end:
final = pd.DataFrame()
for g in grouped.groups:
    group = grouped.get_group(g)
    m = Prophet()
    m.fit(group)
    future = m.make_future_dataframe(periods=365)
    forecast = m.predict(future)
    # Keep only the prediction column, renamed per group, before merging.
    # Merging the full forecast frames makes every shared column (trend,
    # yhat_lower, ...) collide: pandas suffixes them with _x/_y, and later
    # merges can then fail on the duplicated suffixed names.
    forecast = forecast[['ds', 'yhat']].rename(columns={'yhat': f'yhat_{g}'})
    final = pd.merge(final, forecast.set_index('ds'), how='outer', left_index=True, right_index=True)
# One yhat_<group> column per group, indexed by ds
final = final[[f'yhat_{g}' for g in grouped.groups]]
import pandas as pd
import numpy as np
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.stattools import adfuller
from matplotlib import pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_log_error
# Before doing any modeling with ARIMA, SARIMAX, etc., confirm that
# your time series is stationary by using the Augmented Dickey-Fuller test
# or other tests.
# Create a list of all groups or get from Data using np.unique or other methods
groups_iter = ['A', 'B', 'C', 'D']
dict_org = {}    # group -> observed test values
dict_pred = {}   # group -> one-step-ahead predictions for those values
group_accuracy = {}  # group -> error on the test split
# Iterate over all groups and get data
# from Dataframe by filtering for specific group
# NOTE(review): `data` must be a DataFrame with 'Group' and numeric 'Amount'
# columns here (the question builds that frame as `df`) - confirm at the caller.
for group_name in groups_iter:
    X = data[data['Group'] == group_name]['Amount'].values
    size = int(len(X) * 0.70)
    train, test = X[0:size], X[size:len(X)]
    # Walk-forward validation: refit on everything seen so far, forecast one
    # step, then add the true observation to the history.
    history = list(train)
    predictions = []  # reset per group so errors are not mixed across groups
    # Using ARIMA model here you can also do grid search for best parameters
    # NOTE(review): fit(disp=0) and indexing forecast() with [0] follow the
    # legacy statsmodels.tsa.arima_model.ARIMA API imported above.
    for obs in test:
        model = ARIMA(history, order = (5, 1, 0))
        model_fit = model.fit(disp = 0)
        output = model_fit.forecast()
        yhat = output[0]
        predictions.append(yhat)
        history.append(obs)
        print("Predicted:%f, expected:%f" %(yhat, obs))
    error = mean_squared_log_error(test, predictions)
    dict_org.update({group_name: test})
    dict_pred.update({group_name: predictions})
    # The metric computed above is the mean squared *log* error
    print("Group: ", group_name, "Test MSLE:%f"% error)
    group_accuracy.update({group_name: error})
    plt.plot(test)
    plt.plot(predictions, color = 'red')
    plt.show()
I know this is old but I was trying to predict outcomes for different clients and I tried to use Aditya Santoso solution above but got into some errors, so I added a couple of modifications and finally this worked for me:
df = pd.read_csv('file.csv')
df = df.rename(columns={'date': 'ds', 'amount': 'y'})
# Keep only clients with at least 3 records: Prophet cannot be fitted on a
# group that has fewer than 2 rows.
df = df.groupby('client_id').filter(lambda x: len(x) > 2)
df.client_id = df.client_id.astype(str)
lead_cols = ['client', 'ds', 'yhat']
grouped = df.groupby('client_id')
forecasts = []
for g, group in grouped:
    m = Prophet()
    m.fit(group)
    future = m.make_future_dataframe(periods=365)
    forecast = m.predict(future)
    #I added a column with client id
    forecast['client'] = g
    forecasts.append(forecast)
#I used concat instead of merge
# Concatenate once after the loop: growing a frame with concat on every
# iteration re-copies all previous rows each time.
if forecasts:
    final = pd.concat(forecasts, ignore_index=True)
    # client / ds / yhat first, then the remaining forecast columns
    final = final[lead_cols + [col for col in final.columns if col not in lead_cols]]
else:
    final = pd.DataFrame(columns=lead_cols)
final.head(10)
For the dataset that I am working with, the categorical variables are ordinal, ranging from 1 to 5 for three columns. I am going to be feeding this into XGBoost.
Would I be okay to just run this command and skip creating dummy variables:
ser = pd.Series([1, 2, 3], dtype='category')
ser = ser.to_frame()
ser = ser.T
I would like to know conceptually, since the categorical data is ordinal, would simply converting that to type category be adequate for the model? I tried creating dummy variables but all the values become a 1.
As for the code now, it runs but this command returns: 'numpy.int64'.
type(ser[0][0])
Am I going about this correctly? Any help would be great!
Edit: updated code
Edit2: Normalizing the numerical data values. Is this logic correct?:
r = [1, 2, 3, 100 ,200]
scaler = preprocessing.StandardScaler()
r = preprocessing.scale(r)
r = pd.Series(r)
r = r.to_frame()
r = r.T
Edit3: This is the dataset.
Just setting categorical variables as dtype="category" is not sufficient and won't work.
You need to convert categorical values to true categorical values with pd.factorize(), where each category is assigned a numerical label.
Let's say df is your pandas dataframe. Then in general you could use this boilerplate code:
# Split the frame into numeric columns and string (object) columns
df_numeric = df.select_dtypes(exclude=['object'])
df_obj = df.select_dtypes(include=['object']).copy()
# factorize categoricals columnwise
# (each distinct value becomes an integer code; missing values become -1)
for c in df_obj:
    df_obj[c] = pd.factorize(df_obj[c])[0]
# if you want to one hot encode then add this line:
# columns= is required here: after factorize the columns hold integers, and
# get_dummies only expands object/category columns by default, so without it
# the call would hand df_obj back unchanged.
df_obj = pd.get_dummies(df_obj, columns=list(df_obj.columns), prefix_sep='_', drop_first = True)
# merge dataframes back to one dataframe
df_final = pd.concat([df_numeric, df_obj], axis=1)
Since your categorical variables already are factorized (as far as I understand), you can skip the factorization and just try one hot encoding.
See also this post on stats.stackexchange.
If you want to standardize/normalize your numerical data (not the categorical) use this function:
from sklearn import preprocessing
def scale_data(data, scale="robust"):
    """Scale every column of a DataFrame with a scikit-learn scaler.

    Args:
        data: DataFrame to scale; its values are passed to the scaler as-is,
            so the columns are expected to be numeric.
        scale: Name of the scaler to use: "minmax", "standard", "quantile"
            or "robust".

    Returns:
        A new DataFrame holding the scaled values, with the same column
        labels and the same index as ``data``.

    Raises:
        ValueError: If ``scale`` is not one of the supported names.
    """
    # Map each option to the scaler class name in sklearn.preprocessing
    scaler_names = {
        "minmax": "MinMaxScaler",
        "standard": "StandardScaler",
        "quantile": "QuantileTransformer",
        "robust": "RobustScaler",
    }
    # Reject unknown names up front; previously they fell through every branch
    # and failed later with an UnboundLocalError on x_scaled.
    if scale not in scaler_names:
        raise ValueError(
            f"Unknown scale {scale!r}; expected one of {sorted(scaler_names)}"
        )
    scaler = getattr(preprocessing, scaler_names[scale])()
    x_scaled = scaler.fit_transform(data.values)
    # Keep the original index so the result still aligns row by row when it is
    # concatenated with other frames derived from the same source.
    return pd.DataFrame(x_scaled, columns=data.columns, index=data.index)
scaled_df = scale_data(df_numeric, "robust")
Putting it all together for your dataset:
from sklearn import preprocessing
df = pd.read_excel("default of credit card clients.xls", skiprows=1)
y = df['default payment next month'] #target variable
del df['default payment next month']
c = [2,3,4] # index of categorical data columns
# every other column position is treated as continuous; take the column count
# from the frame itself instead of hard-coding it
r = [i for i in range(df.shape[1]) if i not in c]
df_cat = df.iloc[:, c].copy()
df_con = df.iloc[:, r].copy()
# factorize categorical data
# (separate loop name so the index list `c` is not overwritten)
for col in df_cat:
    df_cat[col] = pd.factorize(df_cat[col])[0]
# scale continuous data
scaler = preprocessing.MinMaxScaler()
df_scaled = scaler.fit_transform(df_con)
# keep df_con's index so the concat below aligns rows with df_cat
df_scaled = pd.DataFrame(df_scaled, columns=df_con.columns, index=df_con.index)
df_final = pd.concat([df_cat, df_scaled], axis=1)
#reorder columns back to original order
cols = df.columns
df_final = df_final[cols]
To further improve the code, do the train/test split before normalization, fit_transform() on the training data and just transform() on the test data. Otherwise you will have a data leak.