Summarize Loop Results in Pandas Table - python

I got the code that downloads tickers and runs the linear regression for each stock in the downloaded list. I am stuck on the last step: showing Prediction & Residual values for each stock, for the last date in the data.
import pandas as pd
import numpy as np
import yfinance as yf
import datetime as dt
from sklearn import linear_model
tickers = ['EXPE','MSFT']
data = yf.download(tickers, start="2012-04-03", end="2017-07-07")['Close']
data = data.reset_index()
data = data.dropna()
df = pd.DataFrame(data, columns = ["Date"])
df["Date"]=df["Date"].apply(lambda x: x.toordinal())
for ticker in tickers:
data[ticker] = pd.DataFrame(data, columns = [ticker])
X = df
y = data[ticker]
lm = linear_model.LinearRegression()
model = lm.fit(X,y)
predictions = lm.predict(X)
residuals = y-lm.predict(X)
print (predictions[-1:])
print(residuals[-1:])
The current output looks like this:
[136.28856636]
1323 13.491432
Name: EXPE, dtype: float64
[64.19943648]
1323 5.260563
Name: MSFT, dtype: float64
But I would like it to show like this (as pandas table):
Predictions Residuals
EXPE 136.29 13.49
MSFT 64.20 5.26

You could do something like this where you store values in a list:
import pandas as pd
import numpy as np
import yfinance as yf
import datetime as dt
from sklearn import linear_model
tickers = ['EXPE','MSFT']
data = yf.download(tickers, start="2012-04-03", end="2017-07-07")['Close']
data = data.reset_index()
data = data.dropna()
df = pd.DataFrame(data, columns = ["Date"])
df["Date"]=df["Date"].apply(lambda x: x.toordinal())
predictions_output = []
residuals_output = []
for ticker in tickers:
data[ticker] = pd.DataFrame(data, columns = [ticker])
X = df
y = data[ticker]
lm = linear_model.LinearRegression()
model = lm.fit(X,y)
predictions = lm.predict(X)
residuals = y-lm.predict(X)
predictions_output.append(float(predictions[-1:]))
residuals_output.append(float(residuals[-1:]))
expectation_df = pd.DataFrame(list(zip(predictions_output, residuals_output)),
columns =['Predictions', 'Residuals']).set_index([tickers])
print(expectation_df)
with the output being:
Predictions Residuals
EXPE 136.288566 13.491432
MSFT 64.199436 5.260563
EDIT: I went too quickly and looked back and realized tickers was already defined, so you can use that to set your index here and lose the Tickers index heading to match your desired output.
Also if you want those values rounded, you can just append these two lines in your loop:
predictions_output.append(round(float(predictions[-1:]), 2))
residuals_output.append(round(float(residuals[-1:]), 2))

Related

How to predict price for a future day given historical data

so currently this is the code I have. Not attached are various graphs that I have made that show the actual stock price from the CSV and then my projections. I'm wanting to make it where I simply predict tomorrow's stock price given all of this historical data but I'm having a difficult time. The "df.loc[len(df.index)] = ['2022-04-05',0,0,0,0,0,0]" was where I was trying to put the predictions for future days although I am open to other ways.
# Machine learning
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
# For data manipulation
import pandas as pd
import numpy as np
# To plot
import matplotlib.pyplot as plt
plt.style.use('seaborn-darkgrid')
# To ignore warnings
import warnings
warnings.filterwarnings("ignore")
# method of pandas
df = pd.read_csv('data_files/MSFT.csv')
#add extra row of blank data for future prediction
df.loc[len(df.index)] = ['2022-04-05',0,0,0,0,0,0]
df.loc[len(df.index)] = ['2022-04-06',0,0,0,0,0,0]
df.loc[len(df.index)] = ['2022-04-07',0,0,0,0,0,0]
df.loc[len(df.index)] = ['2022-04-08',0,0,0,0,0,0]
# Changes The Date column as index columns
df.index = pd.to_datetime(df['Date'])
# drop The original date column
df = df.drop(['Date'], axis='columns')
print(df)
# Create predictor variables
df['Open-Close'] = df.Open - df.Close
df['High-Low'] = df.High - df.Low
# Store all predictor variables in a variable X
X = df[['Open-Close', 'High-Low']]
X.head()
# Target variables
y = np.where(df['Close'].shift(-1) > df['Close'], 1, 0)
print(y)
split_percentage = 0.8
split = int(split_percentage*len(df))
# Train data set
X_train = X[:split]
y_train = y[:split]
# Test data set
X_test = X[split:]
y_test = y[split:]
# Support vector classifier
cls = SVC().fit(X_train, y_train)
df['Predicted_Signal'] = cls.predict(X)
# Calculate daily returns
df['Return'] = df.Close.pct_change()
# Calculate strategy returns
df['Strategy_Return'] = df.Return * df.Predicted_Signal.shift(1)
# Calculate Cumulutive returns
df['Cum_Ret'] = df['Return'].cumsum()
# Plot Strategy Cumulative returns
df['Cum_Strategy'] = df['Strategy_Return'].cumsum()

Split data frame in python based on one parameter shape

I have a data frame which is like the following :
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import csv
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
df_input = pd.read_csv('combine_input.csv', delimiter=',')
df_output = pd.read_csv('combine_output.csv', delimiter=',')
In this data frame, there are many repeated rows for example the first row is repeated more than 1000 times, and so on for the other rows
when I plot the time distribution I got that figure which shows that the frequency of the time parameter
df_input.plot(y='time',kind = 'hist',figsize=(10,10))
plt.grid()
plt.show()
My question is how can I take the data only in the following red rectangular for example at time = 0.006 and frequency = 0.75 1e6 ( check the following pic )
Note: InPlace of target you have to write time as your column name Is time,or change column name to target
def calRows(df,x,y):
#df For consideration
df1 = pd.DataFrame(df.target[df.target<=x])
minCount = len(df1)
targets = df1.target.unique()
for i in targets:
count = int(df1[df1.target == i].count())
if minCount > count:
minCount = count
if minCount > y:
minCount = int(y)
return minCount
You have To pass your data frame, x-intercept of the graph, y-intercept of graph to calRows(df,x,y) function which will return the number of rows to take for each target.
rows = CalRows(df,6,75)
print(rows)
takeFeatures(df,rows,x) function will take dataframe, rows (result of first function), x-intercept of graph and will return you the final dataframe.
def takeFeatures(df,rows,x):
finalDf = pd.DataFrame(columns = df.columns)
df1 = df[df.target<=x]
targets = df1.target.unique()
for i in targets:
targeti = df1[df1.target==i]
sample = targeti.sample(rows)
finalDf = pd.concat([finalDf,sample])
return finalDf
Calling takeFeature() Function
final = takeFeatures(df,rows,6)
print(final)
Your Final DataFrame will have the Values ThatYou expected in Graph
And After Plotting this final dataframe you will get like this graph

Error calculating r squared with statsmodels for multiple yfinance data in a DataFrame

I recently began learning Python, but rather with a complex project I had already started in Excel. I have used different guides for the code I have used so far, tweaked to my needs.
I am using 'yfinance' to gather data for multiple cryptocurrencies in a specific time period from Yahoo! Finance. Also, 'stats models' to obtain alpha, beta and r squared using a DataFrame created with all cryptocurrencies and an additional column with the mkt. return (x variable).
I am having the following error: ValueError: endog and exog matrices are different sizes. I saw another question/answer regarding this error, but it did not seem to relate to my issue.
The error takes place in line 87 [model = sm.OLS(Y2,X_)] of the following code:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import datetime
from pandas_datareader import data as pdr
import yfinance as yf
yf.pdr_override()
df1 = pdr.get_data_yahoo("BTC-USD", start="2015-01-01", end="2020-01-01")
df2 = pdr.get_data_yahoo("ETH-USD", start="2015-01-01", end="2020-01-01")
df3 = pdr.get_data_yahoo("XRP-USD", start="2015-01-01", end="2020-01-01")
df4 = pdr.get_data_yahoo("BCH-USD", start="2015-01-01", end="2020-01-01")
df5 = pdr.get_data_yahoo("USDT-USD", start="2015-01-01", end="2020-01-01")
df6 = pdr.get_data_yahoo("BSV-USD", start="2015-01-01", end="2020-01-01")
df7 = pdr.get_data_yahoo("LTC-USD", start="2015-01-01", end="2020-01-01")
df8 = pdr.get_data_yahoo("BNB-USD", start="2015-01-01", end="2020-01-01")
df9 = pdr.get_data_yahoo("EOS-USD", start="2015-01-01", end="2020-01-01")
df10 = pdr.get_data_yahoo("LINK-USD", start="2015-01-01", end="2020-01-01")
df11 = pdr.get_data_yahoo("XMR-USD", start="2015-01-01", end="2020-01-01")
df12 = pdr.get_data_yahoo("BTG-USD", start="2015-01-01", end="2020-01-01")
return_btc = df1.Close.pct_change()[1:]
return_eth = df2.Close.pct_change()[1:]
return_xrp = df3.Close.pct_change()[1:]
return_bch = df4.Close.pct_change()[1:]
return_usdt = df5.Close.pct_change()[1:]
return_bsv = df6.Close.pct_change()[1:]
return_ltc = df7.Close.pct_change()[1:]
return_bnb = df8.Close.pct_change()[1:]
return_eos = df9.Close.pct_change()[1:]
return_link = df10.Close.pct_change()[1:]
return_xmr = df11.Close.pct_change()[1:]
return_btg = df12.Close.pct_change()[1:]
d = {"BTC Return":return_btc, "ETH Return":return_eth, "XRP Return":return_xrp, "BCH Return":return_bch,
"USDT Return":return_usdt, "BSV Return":return_bsv, "LTC Return":return_ltc, "BNB Return":return_bnb,
"EOS Return":return_eos, "LINK Return":return_link, "XMR Return":return_xmr, "BTG Return":return_btg}
df = pd.DataFrame(d) # new data frame with all returns data
df = pd.DataFrame(d, columns=["Date", "BTC Return", "ETH Return", "XRP Return", "BCH Return", "USDT Return", "BSV Return",
"LTC Return", "BNB Return", "EOS Return", "LINK Return", "XMR Return", "BTG Return"])
avg_row = df.mean(axis=1)
return_mkt = avg_row
d1 = {"BTC Return":return_btc, "ETH Return":return_eth, "XRP Return":return_xrp, "BCH Return":return_bch,
"USDT Return":return_usdt, "BSV Return":return_bsv, "LTC Return":return_ltc, "BNB Return":return_bnb,
"EOS Return":return_eos, "LINK Return":return_link, "XMR Return":return_xmr, "BTG Return":return_btg, "MKT Return":return_mkt}
df = pd.DataFrame(d1)
print(df)
import statsmodels.api as sm
from statsmodels import regression
X = return_mkt.values
Y1 = return_btc
Y2 = return_eth
#Y3 = return_xrp
def linreg(x,y):
x = sm.add_constant(x)
model = regression.linear_model.OLS(y,x).fit()
# we are removing the constant
x = x[:, 1]
return model.params[0], model.params[1]
X_ = sm.add_constant(X) # artificially add intercept to x, as advised in the docs
model = sm.OLS(Y1,X_)
results = model.fit()
rsquared = results.rsquared
alpha, beta = linreg(X,Y1)
def linreg(x,y):
x = sm.add_constant(x)
model = regression.linear_model.OLS(y,x).fit()
# we are removing the constant
x = x[:, 1]
return model.params[0], model.params[1]
X_ = sm.add_constant(X) # artificially add intercept to x, as advised in the docs
model = sm.OLS(Y2,X_)
results = model.fit()
rsquared = results.rsquared
alpha, beta = linreg(X,Y2)
The error is located in the second def, as I am trying to compute the previously mentioned statistics for each cryptocurrency. Thus, the 1st def is for BTC (Y1), the 2nd def is for ETH (Y2), and so on (Y3,...).
The entire code was fine when I had only the function for BTC at the end, the error occurred when I tried to add more of the same function for the others.
Fundamentally, the problem is that because Ethereum (and all other cryptos) started later than bitcoin, there are null values for the price every day for the first few years, which can't be handled. So you have to take just the values where they are not null.
However, there are many things in your code which you could factor out so that you don't repeat yourself unnecessarily. You made an attempt at that with the linreg function, but then you re-defined it for the second crypto, which shouldn't be necessary.
Here is a quick re-write which addresses both the fundamental problem and hopefully illustrates what I mean above. The output is a dataframe with the statistics you're looking for, by cryptocurrency. The goal is to write as much of the code 'generically', and then just provide a list of cryptos that you are interested in.
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from pandas_datareader import data as pdr
import datetime
import yfinance as yf
import statsmodels.api as sm
from statsmodels import regression
yf.pdr_override()
cryptos = ["BTC", "ETH", "XRP"] # Here you can specify the cryptos you want. I just used 3 for demonstration
# The rest of the code is not specific to any one crypto
def get_and_process_data(c):
raw_data = pdr.get_data_yahoo(c + '-USD', start="2015-01-01", end="2020-01-01")
return raw_data.Close.pct_change()[1:]
df = pd.DataFrame({c: get_and_process_data(c) for c in cryptos})
df['avg_return'] = df.mean(axis=1) # avg market return
print(df)
def model(x, y):
# Calculate r-squared
X = sm.add_constant(x) # artificially add intercept to x, as advised in the docs
model = sm.OLS(y,X).fit()
rsquared = model.rsquared
# Fit linear regression and calculate alpha and beta
X = sm.add_constant(x)
model = regression.linear_model.OLS(y,X).fit()
alpha = model.params[0]
beta = model.params[1]
return rsquared, alpha, beta
results = pd.DataFrame({c: model(df[df[c].notnull()]['avg_return'], df[df[c].notnull()][c]) for c in cryptos}).transpose()
results.columns = ['rsquared', 'alpha', 'beta']
print(results)

loop through dataframe columns to do simple linear regression?

import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
df = pd.read_excel("Book1.xlsx")
for column in df:
X = df["Row Labels"]
Y = df[column]
y1 =Y.values.reshape(-1,1)
x1 =X.values.reshape(-1,1)
regressor = LinearRegression()
regressor.fit(x1, y1)
y_new = []
y_i = []
for i in range(12,24):
y_new.append(regressor.predict([[i]]))
y_i.append(i)
df2 = pd.DataFrame({'column':y_new})
i write this code to loop through the dataframe columns to do simple linear regression and put all the predicted value in dataframe. but it is predicting only the last columns value.
df2 = pd.DataFrame({'column':y_new}) creates a column named 'column' verbatim (not the name saved in the variable column. Moreover, df2 is recreated in every iteration, each iteration it only saves the last y_new.
I think what you want is to create a new column in df2 in each iteration:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
df = pd.read_excel("Book1.xlsx")
df2 = pd.DataFrame()
for column in df:
X = df["Row Labels"]
Y = df[column]
y1 =Y.values.reshape(-1,1)
x1 =X.values.reshape(-1,1)
regressor = LinearRegression()
regressor.fit(x1, y1)
y_new = []
y_i = []
for i in range(12,24):
y_new.append(regressor.predict([[i]]))
y_i.append(i)
df2[column] = y_new

How to perform time series analysis that contains multiple groups in Python using fbProphet or other models?

All,
My dataset looks like following. I am trying to predict the 'amount' for next 6 months using either the fbProphet or other model. But my issue is that I would like to predict amount based on each groups i.e A,B,C,D for next 6 months. I am not sure how to do that in python using fbProphet or other model ? I referenced official page of fbprophet, but the only information I found is that "Prophet" takes two columns only One is "Date" and other is "amount" .
I am new to python, so any help with code explanation is greatly appreciated!
import pandas as pd
data = {'Date':['2017-01-01', '2017-02-01', '2017-03-01', '2017-04-01','2017-05-01','2017-06-01','2017-07-01'],'Group':['A','B','C','D','C','A','B'],
'Amount':['12.1','13','15','10','12','9.0','5.6']}
df = pd.DataFrame(data)
print (df)
output:
Date Group Amount
0 2017-01-01 A 12.1
1 2017-02-01 B 13
2 2017-03-01 C 15
3 2017-04-01 D 10
4 2017-05-01 C 12
5 2017-06-01 A 9.0
6 2017-07-01 B 5.6
fbprophet requires two columns ds and y, so you need to first rename the two columns
df = df.rename(columns={'Date': 'ds', 'Amount':'y'})
Assuming that your groups are independent from each other and you want to get one prediction for each group, you can group the dataframe by "Group" column and run forecast for each group
from fbprophet import Prophet
grouped = df.groupby('Group')
for g in grouped.groups:
group = grouped.get_group(g)
m = Prophet()
m.fit(group)
future = m.make_future_dataframe(periods=365)
forecast = m.predict(future)
print(forecast.tail())
Take note that the input dataframe that you supply in the question is not sufficient for the model because group D only has a single data point. fbprophet's forecast needs at least 2 non-Nan rows.
EDIT: if you want to merge all predictions into one dataframe, the idea is to name the yhat for each observations differently, do pd.merge() in the loop, and then cherry-pick the columns that you need at the end:
final = pd.DataFrame()
for g in grouped.groups:
group = grouped.get_group(g)
m = Prophet()
m.fit(group)
future = m.make_future_dataframe(periods=365)
forecast = m.predict(future)
forecast = forecast.rename(columns={'yhat': 'yhat_'+g})
final = pd.merge(final, forecast.set_index('ds'), how='outer', left_index=True, right_index=True)
final = final[['yhat_' + g for g in grouped.groups.keys()]]
import pandas as pd
import numpy as np
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.stattools import adfuller
from matplotlib import pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_log_error
# Before doing any modeling using ARIMA or SARIMAS etc Confirm that
# your time-series is stationary by using Augmented Dick Fuller test
# or other tests.
# Create a list of all groups or get from Data using np.unique or other methods
groups_iter = ['A', 'B', 'C', 'D']
dict_org = {}
dict_pred = {}
group_accuracy = {}
# Iterate over all groups and get data
# from Dataframe by filtering for specific group
for i in range(len(groups_iter)):
X = data[data['Group'] == groups_iter[i]]['Amount'].values
size = int(len(X) * 0.70)
train, test = X[0:size], X[size:len(X)]
history = [x for in train]
# Using ARIMA model here you can also do grid search for best parameters
for t in range(len(test)):
model = ARIMA(history, order = (5, 1, 0))
model_fit = model.fit(disp = 0)
output = model_fit.forecast()
yhat = output[0]
predictions.append(yhat)
obs = test[t]
history.append(obs)
print("Predicted:%f, expected:%f" %(yhat, obs))
error = mean_squared_log_error(test, predictions)
dict_org.update({groups_iter[i]: test})
dict_pred.update({group_iter[i]: test})
print("Group: ", group_iter[i], "Test MSE:%f"% error)
group_accuracy.update({group_iter[i]: error})
plt.plot(test)
plt.plot(predictions, color = 'red')
plt.show()
I know this is old but I was trying to predict outcomes for different clients and I tried to use Aditya Santoso solution above but got into some errors, so I added a couple of modifications and finally this worked for me:
df = pd.read_csv('file.csv')
df = pd.DataFrame(df)
df = df.rename(columns={'date': 'ds', 'amount': 'y', 'client_id': 'client_id'})
#I had to filter first clients with less than 3 records to avoid errors as prophet only works for 2+ records by group
df = df.groupby('client_id').filter(lambda x: len(x) > 2)
df.client_id = df.client_id.astype(str)
final = pd.DataFrame(columns=['client','ds','yhat'])
grouped = df.groupby('client_id')
for g in grouped.groups:
group = grouped.get_group(g)
m = Prophet()
m.fit(group)
future = m.make_future_dataframe(periods=365)
forecast = m.predict(future)
#I added a column with client id
forecast['client'] = g
#I used concat instead of merge
final = pd.concat([final, forecast], ignore_index=True)
final.head(10)

Categories