I load data from yahoo finance using the motor_daily function. It takes in a list of tickers and gets me the data.
Here are the used libs:
import pandas as pd
import numpy as np
import datetime as dt
import yfinance as yf
Here is the function definition:
def motor_daily(ticker_file):
    """Download daily closing prices for every ticker in *ticker_file*.

    Parameters
    ----------
    ticker_file : list of str
        Ticker symbols (e.g. the SP100 constituents).

    Returns
    -------
    pandas.DataFrame
        A 'Date' column plus one closing-price column per ticker.
    """
    tickers_list = ticker_file  # SP100
    # The original relied on globals `start` and `tomorrow` that were never
    # defined; default to the trailing week, ending tomorrow (YYYY-MM-DD).
    today = pd.Timestamp('now').normalize()
    start = today - pd.offsets.Day(7)
    tomorrow = today + pd.offsets.Day(1)
    stocks = yf.download(tickers_list, start=start, end=tomorrow)
    # The loop that rebuilt ticker codes into a `finance` frame was dead code:
    # its result was never returned or used, so it has been removed.
    stocks_close = stocks.Close.reset_index()
    return stocks_close
def ticker_data(ticker_lists):
    """Download closing prices for each sub-list of tickers.

    Parameters
    ----------
    ticker_lists : list of list of str
        One list of ticker symbols per market/portfolio.

    Returns
    -------
    list of pandas.DataFrame
        One closing-price frame per sub-list, from motor_daily.
    """
    # Parameter renamed from `list`, which shadowed the builtin type.
    data = []
    for ticks in ticker_lists:
        data.append(motor_daily(ticks))
    return data
The above function loads closing prices for each ticker / stock name in the list (therefore the loop) and stores this in data.
list_of_lists includes:
[['VOW3.DE', 'BMW.DE', 'BEI.DE', 'DPW.DE', 'FME.DE'],
['ISS.CO', 'LUN.CO', 'CARL-B.CO', 'TRYG.CO', 'SIM.CO']]
Output of print(ticker_data(list_of_list))
[ Date BEI.DE BMW.DE DPW.DE FME.DE VOW3.DE
0 2021-03-10 86.860001 81.339996 43.650002 60.840000 196.020004
1 2021-03-11 86.139999 78.519997 44.549999 61.340000 192.039993
2 2021-03-12 87.080002 77.480003 45.060001 60.939999 190.220001
3 2021-03-15 86.959999 77.800003 44.919998 60.759998 194.779999
4 2021-03-16 87.680000 80.500000 45.580002 61.259998 207.850006
5 2021-03-17 88.260002 85.459999 45.419998 60.779999 230.800003,
Date CARL-B.CO ISS.CO LUN.CO SIM.CO TRYG.CO
0 2021-03-10 1012.0 122.599998 243.600006 768.0 135.399994
1 2021-03-11 1009.0 120.300003 235.300003 780.0 143.500000
2 2021-03-12 1006.0 121.150002 237.000000 772.5 143.699997
3 2021-03-15 1006.5 124.250000 236.300003 783.0 145.100006
4 2021-03-16 983.0 125.550003 236.100006 795.5 147.399994
5 2021-03-17 982.0 121.949997 230.300003 778.0 143.899994]
When I try to convert the output to a dataframe using:
df = pd.DataFrame(ticker_data(list_of_list)) output is
ValueError: Must pass 2-d input. shape=(2, 6, 6)
I cannot convert this to a pandas dataframe, how should I go about doing this?
Your motor_daily has a bunch of unused elements. Also, I had to define the start and end times.
import pandas as pd
import numpy as np
import datetime as dt
import yfinance as yf
def motor_daily(ticker_list):
    """Fetch the trailing week of daily closing prices for *ticker_list*."""
    today = pd.Timestamp('now').normalize()
    week_ago = today - pd.offsets.Day(7)
    # BusinessDay(0) rolls a weekend/holiday forward to the next business day.
    end_day = today + pd.offsets.BusinessDay(0)
    return yf.download(ticker_list, start=week_ago, end=end_day).Close
list_of_lists = [
['VOW3.DE', 'BMW.DE', 'BEI.DE', 'DPW.DE', 'FME.DE'],
['ISS.CO', 'LUN.CO', 'CARL-B.CO', 'TRYG.CO', 'SIM.CO']
]
df = pd.concat(map(motor_daily, list_of_lists), axis=1)
# I transposed for prettier printing
df.T
Date 2021-03-10 2021-03-11 2021-03-12 2021-03-15 2021-03-16
BEI.DE 86.860001 86.139999 87.080002 86.959999 87.680000
BMW.DE 81.339996 78.519997 77.480003 77.800003 80.500000
DPW.DE 43.650002 44.549999 45.060001 44.919998 45.580002
FME.DE 60.840000 61.340000 60.939999 60.759998 61.259998
VOW3.DE 196.020004 192.039993 190.220001 194.779999 207.850006
CARL-B.CO 1012.000000 1009.000000 1006.000000 1006.500000 983.000000
ISS.CO 122.599998 120.300003 121.150002 124.250000 125.550003
LUN.CO 243.600006 235.300003 237.000000 236.300003 236.100006
SIM.CO 768.000000 780.000000 772.500000 783.000000 795.500000
TRYG.CO 135.399994 143.500000 143.699997 145.100006 147.399994
You can iterate through ticker_data(list_of_list) and make multiple dataframes:
lol = [['VOW3.DE', 'BMW.DE', 'BEI.DE', 'DPW.DE', 'FME.DE'],
['ISS.CO', 'LUN.CO', 'CARL-B.CO', 'TRYG.CO', 'SIM.CO']]
res = ticker_data(lol)
dataframes = [pd.DataFrame(lst) for lst in res]
print(dataframes[0])
#
Date BEI.DE BMW.DE DPW.DE FME.DE VOW3.DE
0 1996-11-08 NaN 18.171000 NaN NaN NaN
1 1996-11-11 NaN 18.122000 NaN NaN NaN
2 1996-11-12 NaN 18.259001 NaN NaN NaN
3 1996-11-13 NaN 18.230000 NaN NaN NaN
4 1996-11-14 NaN 18.289000 NaN NaN NaN
... ... ... ... ... ... ...
6241 2021-03-11 86.139999 78.519997 44.549999 61.340000 192.039993
6242 2021-03-12 87.080002 77.480003 45.060001 60.939999 190.220001
6243 2021-03-15 86.959999 77.800003 44.919998 60.759998 194.779999
6244 2021-03-16 87.680000 80.500000 45.580002 61.259998 207.850006
6245 2021-03-17 88.260002 85.459999 45.419998 60.779999 230.800003
Related
How can I modify the output from its current form into the arrangement described at the bottom? I've tried stacking and un-stacking, but I can't seem to hit the nail on the head. Help would be highly appreciated.
My code:
portfolio_count = 0
Equity_perportfolio = []
Portfolio_sequence = []
while portfolio_count < 1:
# declaring list
list = Tickers
portfolio_count = portfolio_count + 1
# initializing the value of n (Number of assets in portfolio)
n = 5
# printing n elements from list (add number while printing the potential portfolio)
potential_portfolio = random.sample(list, n)
print("Portfolio number", portfolio_count)
print(potential_portfolio)
#Pull 'relevant data' about the selected stocks. (Yahoo API?) # 1. df with Index Date and Closing
price_data_close = web.get_data_yahoo(potential_portfolio,
start = '2012-01-01',
end = '2021-03-31')['Close']
price_data = web.get_data_yahoo(potential_portfolio,
start = '2012-01-01',
end = '2021-03-31')
print(price_data)
Which gives me the following structure:(IGNORE NaNs)
Attributes Adj Close ... Volume
Symbols D HOLX PSX ... PSX MGM PG
Date ...
2012-01-03 36.209511 17.840000 NaN ... NaN 25873300.0 11565900.0
2012-01-04 35.912926 17.910000 NaN ... NaN 14717400.0 10595400.0
2012-01-05 35.837063 18.360001 NaN ... NaN 12437500.0 10085300.0
2012-01-06 35.471519 18.570000 NaN ... NaN 9079700.0 8421200.0
2012-01-09 35.423241 18.520000 NaN ... NaN 15750100.0 7836100.0
... ... ... ... ... ... ... ...
2021-03-25 75.220001 71.050003 82.440002 ... 2613300.0 9601500.0 7517300.0
2021-03-26 75.779999 73.419998 84.309998 ... 2368900.0 7809100.0 10820100.0
2021-03-29 76.699997 74.199997 82.529999 ... 1880600.0 7809700.0 11176000.0
2021-03-30 75.529999 73.870003 82.309998 ... 1960600.0 5668500.0 8090600.0
2021-03-31 75.959999 74.379997 81.540001 ... 2665200.0 7029900.0 9202600.0
However, I wanted it to output in this format:
Date Symbols Open High Low Close Volume Adjusted
04/12/2020 MMM 172.130005 173.160004 171.539993 172.460007 2663600 171.050461
07/12/2020 MMM 171.720001 172.5 169.179993 170.149994 2526800 168.759323
08/12/2020 MMM 169.740005 172.830002 169.699997 172.460007 1730800 171.050461
08/12/2020 MMM 169.740005 172.830002 169.699997 172.460007 1730800 171.050461
11/12/2020 D 172.300003 174.649994 172.169998 174.020004 1875700 172.597702
11/12/2020 D 172.300003 174.649994 172.169998 174.020004 1875700 172.597702
11/12/2020 D 172.300003 174.649994 172.169998 174.020004 1875700 172.597702
14/12/2020 D 175.669998 176.199997 172.990005 173.080002 3700100 171.66539
14/12/2020 D 175.669998 176.199997 172.990005 173.080002 3700100 171.66539
14/12/2020 PSX 175.669998 176.199997 172.990005 173.080002 3700100 171.66539
14/12/2020 PSX 175.669998 176.199997 172.990005 173.080002 3700100 171.66539
15/12/2020 PSX 174.389999 175.059998 172.550003 174.679993 2270600 173.252304
18/12/2020 PSX 176.759995 177.460007 175.110001 176.419998 4682000 174.978088
18/12/2020 PSX 176.759995 177.460007 175.110001 176.419998 4682000 174.978088
23/12/2020 PG 175.300003 175.809998 173.960007 173.990005 1762600 172.567963
28/12/2020 PG 175.309998 176.399994 174.389999 174.710007 1403000 173.282074
29/12/2020 PG 175.550003 175.639999 173.149994 173.850006 1218900 172.429108
31/12/2020 PG 174.119995 174.869995 173.179993 174.789993 1841300 173.361404
05/01/2021 PG 172.009995 173.25 170.649994 171.580002 2295300 170.177643
07/01/2021 MMM 171.559998 173.460007 166.160004 169.720001 5863400 168.332855
07/01/2021 MMM 171.559998 173.460007 166.160004 169.720001 5863400 168.332855
07/01/2021 MMM 171.559998 173.460007 166.160004 169.720001 5863400 168.332855
08/01/2021 MMM 169.169998 169.539993 164.610001 166.619995 4808100 165.258179
13/01/2021 MMM 167.270004 167.740005 166.050003 166.279999 2098000 164.920959
15/01/2021 MMM 165.630005 166.259995 163.380005 165.550003 3550700 164.19693
19/01/2021 MMM 167.259995 169.550003 166.800003 169.119995 3903200 167.737747
I have a list of shares that make up an ETF. I have formatted the tickers into a list and have named this variable assets
print(assets)
['AUD', 'CRWD', 'SPLK', 'OKTA', 'AVGO', 'CSCO', 'NET', 'ZS', 'AKAM', 'FTNT', 'BAH', 'CYBR', 'CHKP', 'BA/', 'VMW', 'PFPT', 'PANW', 'VRSN', 'FFIV', 'JNPR', 'LDOS', '4704', 'FEYE', 'QLYS', 'SAIC', 'RPD', 'HO', 'MIME', 'SAIL', 'VRNS', 'ITRI', 'AVST', 'MANT', 'TENB', '053800', 'ZIXI', 'OSPN', 'RDWR', 'ULE', 'MOBL', 'ATEN', 'TUFN', 'RBBN', 'NCC', 'KRW', 'EUR', 'JPY', 'GBP', 'USD']
I use the following for loop to iterate through the list and pull historical data from yahoo
for i in assets:
try:
df[i] = web.DataReader(i, data_source='yahoo', start=start, end=end)['Adj Close']
except RemoteDataError:
print(f'{i}')
continue
I am returned with:
BA/
4704
HO
053800
KRW
JPY
Suggesting these assets cannot be found on yahoo finance. I understand this is the case and accept that.
When I look for the stocks that have theoretically been found (e.g. df['FEYE']) on yahoo finance I get the following.
0 NaN 1 NaN 2 NaN 3 NaN 4 NaN 5 NaN 6 NaN 7 NaN 8 NaN 9 NaN 10 NaN 11 NaN 12 NaN 13 NaN 14 NaN 15 NaN 16 NaN 17 NaN 18 NaN 19 NaN 20 NaN 21 NaN 22 NaN 23 NaN 24 NaN 25 NaN 26 NaN 27 NaN 28 NaN 29 NaN 30 NaN 31 NaN 32 NaN 33 NaN 34 NaN 35 NaN 36 NaN 37 NaN 38 NaN 39 NaN 40 NaN 41 NaN 42 NaN 43 NaN 44 NaN 45 NaN 46 NaN 47 NaN 48 NaN
Name: FEYE, dtype: float64
When I proceed normally with just one share
(e.g. CSCO = web.DataReader(assets[5], data_source='yahoo', start=start, end=end)['Adj Close'])
It is all ok.
Any help is greatly appreciated,
Thank you!
Here is reproducible testing example of code and output.
If you have an existing dataframe named df, then the new data is incompatible in terms of index and possibly column names.
Creating a new dataframe is needed, but outside the loop. Each iteration creates a new column with ticker data.
import pandas as pd
import pandas_datareader.data as web
from pandas_datareader._utils import RemoteDataError
assets=['AUD', 'CRWD', 'SPLK', 'OKTA', 'AVGO', 'CSCO', 'NET', 'ZS', 'AKAM', 'FTNT', 'BAH', 'CYBR', 'CHKP', 'BA/', 'VMW', 'PFPT', 'PANW', 'VRSN', 'FFIV', 'JNPR', 'LDOS', '4704', 'FEYE', 'QLYS', 'SAIC', 'RPD', 'HO', 'MIME', 'SAIL', 'VRNS', 'ITRI', 'AVST', 'MANT', 'TENB', '053800', 'ZIXI', 'OSPN', 'RDWR', 'ULE', 'MOBL', 'ATEN', 'TUFN', 'RBBN', 'NCC', 'KRW', 'EUR', 'JPY', 'GBP', 'USD']
df = pd.DataFrame()
for i in assets:
try:
print(f'Try: {i}')
df[i] = web.DataReader(i, data_source='yahoo')['Adj Close']
except RemoteDataError as r:
print(f'Try: {i}: {r}')
continue
result:
Try: AUD
Try: CRWD
Try: SPLK
Try: OKTA
Try: AVGO
Try: CSCO
Try: NET
Try: ZS
Try: AKAM
Try: FTNT
Try: BAH
Try: CYBR
Try: CHKP
Try: BA/
Try: BA/: Unable to read URL: https://finance.yahoo.com/quote/BA//history?period1=1435975200&period2=1593741599&interval=1d&frequency=1d&filter=history
Response Text:
b'<html>\n<meta charset=\'utf-8\'>\n<script>\nvar u=\'https://www.yahoo.com/?err=404&err_url=https%3a%2f%2ffinance.yahoo.com%2fquote%2fBA%2f%2fhistory%3fperiod1%3d1435975200%26period2%3d1593741599%26interval%3d1d%26frequency%3d1d%26filter%3dhistory\';\nif(window!=window.top){\n document.write(\'<p>Content is currently unavailable.</p><img src="//geo.yahoo.com/p?s=1197757039&t=\'+new Date().getTime()+\'&_R=\'+encodeURIComponent(document.referrer)+\'&err=404&err_url=\'+u+\'" width="0px" height="0px"/>\');\n}else{\n window.location.replace(u);\n}\n</script>\n<noscript><META http-equiv="refresh" content="0;URL=\'https://www.yahoo.com/?err=404&err_url=https%3a%2f%2ffinance.yahoo.com%2fquote%2fBA%2f%2fhistory%3fperiod1%3d1435975200%26period2%3d1593741599%26interval%3d1d%26frequency%3d1d%26filter%3dhistory\'"></noscript>\n</html>\n'
Try: VMW
Try: PFPT
Try: PANW
Try: VRSN
Try: FFIV
Try: JNPR
Try: LDOS
Try: 4704
Try: 4704: No data fetched for symbol 4704 using YahooDailyReader
Try: FEYE
Try: QLYS
Try: SAIC
Try: RPD
Try: HO
Try: HO: No data fetched for symbol HO using YahooDailyReader
Try: MIME
Try: SAIL
Try: VRNS
Try: ITRI
Try: AVST
Try: MANT
Try: TENB
Try: 053800
Try: 053800: No data fetched for symbol 053800 using YahooDailyReader
Try: ZIXI
Try: OSPN
Try: RDWR
Try: ULE
Try: MOBL
Try: ATEN
Try: TUFN
Try: RBBN
Try: NCC
Try: KRW
Try: KRW: No data fetched for symbol KRW using YahooDailyReader
Try: EUR
Try: JPY
Try: JPY: No data fetched for symbol JPY using YahooDailyReader
Try: GBP
Please note there are 2 types of error:
when ticker does not exists, for example "HO"
when resulting URL is wrong due to "/" in "BA/"
Head of result set dataframe df.head():
AUD CRWD SPLK OKTA ... NCC EUR GBP USD
Date ...
2015-11-03 51.500000 NaN 57.139999 NaN ... 3.45 NaN 154.220001 13.608685
2015-12-22 55.189999 NaN 54.369999 NaN ... 3.48 NaN 148.279999 13.924644
2015-12-23 55.560001 NaN 56.509998 NaN ... 3.48 NaN 148.699997 14.146811
2015-12-24 55.560001 NaN 56.779999 NaN ... 3.48 NaN 149.119995 14.324224
2015-12-28 56.270000 NaN 57.660000 NaN ... 3.48 NaN 148.800003 14.057305
[5 rows x 43 columns]
Hope this helps.
I have a pandas data frame that looks like:
High Low ... Volume OpenInterest
2018-01-02 983.25 975.50 ... 8387 67556
2018-01-03 986.75 981.00 ... 7447 67525
2018-01-04 985.25 977.00 ... 8725 67687
2018-01-05 990.75 984.00 ... 7948 67975
I calculate the Average True Range and save it into a series:
i = 0
TR_l = [0]
while i < (df.shape[0]-1):
#TR = max(df.loc[i + 1, 'High'], df.loc[i, 'Close']) - min(df.loc[i + 1, 'Low'], df.loc[i, 'Close'])
TR = max(df['High'][i+1], df['Close'][i]) - min(df['Low'][i+1], df['Close'][i])
TR_l.append(TR)
i = i + 1
TR_s = pd.Series(TR_l)
ATR = pd.Series(TR_s.ewm(span=n, min_periods=n).mean(), name='ATR_' + str(n))
With a 14-period rolling window ATR looks like:
0 NaN
1 NaN
2 NaN
3 NaN
4 NaN
5 NaN
6 NaN
7 NaN
8 NaN
9 NaN
10 NaN
11 NaN
12 NaN
13 8.096064
14 7.968324
15 8.455205
16 9.046418
17 8.895405
18 9.088769
19 9.641879
20 9.516764
But when I do:
df = df.join(ATR)
The ATR column in df is all NaN. It's because the indexes are different between the data frame and ATR. Is there any way to add the ATR column into the data frame?
Consider shift to avoid the while loop across rows and list building. Below uses Union Pacific (UNP) railroad stock data to demonstrate:
import pandas as pd
import pandas_datareader as pdr
stock_df = pdr.get_data_yahoo('UNP').loc['2019-01-01':'2019-03-29']
# SHIFT DATA ONE DAY BACK AND JOIN TO ORIGINAL DATA
stock_df = stock_df.join(stock_df.shift(-1), rsuffix='_future')
# CALCULATE TR DIFFERENCE BY ROW
stock_df['TR'] = stock_df.apply(lambda x: max(x['High_future'], x['Close']) - min(x['Low_future'], x['Close']), axis=1)
# CALCULATE EWM MEAN
n = 14
stock_df['ATR'] = stock_df['TR'].ewm(span=n, min_periods=n).mean()
Output
print(stock_df.head(20))
# High Low Open Close Volume Adj Close High_future Low_future Open_future Close_future Volume_future Adj Close_future TR ATR
# Date
# 2019-01-02 138.320007 134.770004 135.649994 137.779999 3606300.0 137.067413 136.750000 132.169998 136.039993 132.679993 5684500.0 131.993790 5.610001 NaN
# 2019-01-03 136.750000 132.169998 136.039993 132.679993 5684500.0 131.993790 138.580002 134.520004 134.820007 137.789993 5649900.0 137.077362 5.900009 NaN
# 2019-01-04 138.580002 134.520004 134.820007 137.789993 5649900.0 137.077362 139.229996 136.259995 137.330002 138.649994 4034200.0 137.932907 2.970001 NaN
# 2019-01-07 139.229996 136.259995 137.330002 138.649994 4034200.0 137.932907 152.889999 149.039993 151.059998 150.750000 10558800.0 149.970337 14.240005 NaN
# 2019-01-08 152.889999 149.039993 151.059998 150.750000 10558800.0 149.970337 151.059998 148.610001 150.289993 150.360001 4284600.0 149.582352 2.449997 NaN
# 2019-01-09 151.059998 148.610001 150.289993 150.360001 4284600.0 149.582352 155.289993 149.009995 149.899994 154.660004 6444600.0 153.860123 6.279999 NaN
# 2019-01-10 155.289993 149.009995 149.899994 154.660004 6444600.0 153.860123 155.029999 153.089996 153.639999 153.210007 3845200.0 152.417618 1.940002 NaN
# 2019-01-11 155.029999 153.089996 153.639999 153.210007 3845200.0 152.417618 154.240005 151.649994 152.229996 153.889999 3507100.0 153.094101 2.590012 NaN
# 2019-01-14 154.240005 151.649994 152.229996 153.889999 3507100.0 153.094101 154.360001 151.740005 153.789993 152.479996 4685100.0 151.691391 2.619995 NaN
# 2019-01-15 154.360001 151.740005 153.789993 152.479996 4685100.0 151.691391 153.729996 150.910004 152.910004 151.970001 4053200.0 151.184021 2.819992 NaN
# 2019-01-16 153.729996 150.910004 152.910004 151.970001 4053200.0 151.184021 154.919998 150.929993 151.110001 154.639999 4075400.0 153.840210 3.990005 NaN
# 2019-01-17 154.919998 150.929993 151.110001 154.639999 4075400.0 153.840210 158.800003 155.009995 155.539993 158.339996 5003900.0 157.521072 4.160004 NaN
# 2019-01-18 158.800003 155.009995 155.539993 158.339996 5003900.0 157.521072 157.199997 154.410004 156.929993 155.020004 6052900.0 154.218262 3.929993 NaN
# 2019-01-22 157.199997 154.410004 156.929993 155.020004 6052900.0 154.218262 156.020004 152.429993 155.449997 154.330002 4858000.0 153.531830 3.590012 4.011254
# 2019-01-23 156.020004 152.429993 155.449997 154.330002 4858000.0 153.531830 160.759995 156.009995 160.039993 160.339996 9222400.0 159.510742 6.429993 4.376440
# 2019-01-24 160.759995 156.009995 160.039993 160.339996 9222400.0 159.510742 162.000000 160.220001 161.460007 160.949997 7770700.0 160.117584 1.779999 3.991223
# 2019-01-25 162.000000 160.220001 161.460007 160.949997 7770700.0 160.117584 160.789993 159.339996 160.000000 159.899994 3733800.0 159.073013 1.610001 3.643168
# 2019-01-28 160.789993 159.339996 160.000000 159.899994 3733800.0 159.073013 160.929993 158.750000 160.039993 160.169998 3436900.0 159.341614 2.179993 3.432011
# 2019-01-29 160.929993 158.750000 160.039993 160.169998 3436900.0 159.341614 161.889999 159.440002 161.089996 160.820007 4112200.0 159.988266 2.449997 3.291831
# 2019-01-30 161.889999 159.440002 161.089996 160.820007 4112200.0 159.988266 160.990005 157.020004 160.750000 159.070007 7438600.0 158.247314 3.970001 3.387735
I'm trying to find an efficient way to generate rolling counts or sums in pandas given a grouping and a date range. Eventually, I want to be able to add conditions, ie. evaluating a 'type' field, but I'm not there just yet. I've written something to get the job done, but feel that there could be a more direct way of getting to the desired result.
My pandas data frame currently looks like this, with the desired output being put in the last column 'rolling_sales_180'.
name date amount rolling_sales_180
0 David 2015-01-01 100 100.0
1 David 2015-01-05 500 600.0
2 David 2015-05-30 50 650.0
3 David 2015-07-25 50 100.0
4 Ryan 2014-01-04 100 100.0
5 Ryan 2015-01-19 500 500.0
6 Ryan 2016-03-31 50 50.0
7 Joe 2015-07-01 100 100.0
8 Joe 2015-09-09 500 600.0
9 Joe 2015-10-15 50 650.0
My current solution and environment can be sourced below. I've been modeling my solution from this R Q&A in stackoverflow. Efficient way to perform running total in the last 365 day window
import pandas as pd
import numpy as np
def trans_date_to_dist_matrix(date_col):
    """Build a symmetric matrix of absolute distances, in whole days,
    between every pair of dates in *date_col*.

    Parameters
    ----------
    date_col : pandas.Series of datetime64
        The dates to compare pairwise.

    Returns
    -------
    pandas.DataFrame
        Square frame indexed and labelled by the date values, where cell
        (i, j) is abs(days between date i and date j).
    """
    # Vectorized with numpy broadcasting instead of the original O(n^2)
    # Python double loop; same values, same labels.
    dates = date_col.values
    diffs = np.abs(dates[:, None] - dates[None, :])        # timedelta64[ns]
    days = (diffs / np.timedelta64(1, 'D')).astype(int)    # whole-day counts
    return pd.DataFrame(data=days, index=date_col.values, columns=date_col.values)
def lower_tri(x_col, date_col, win):
    """Rolling sum of *x_col* over a trailing time window of *win* days.

    Parameters
    ----------
    x_col : pandas.Series
        Values the caller wants a rolling sum of.
    date_col : pandas.Series of datetime64
        Dates corresponding to x_col.
    win : int
        Window length in days.

    Returns
    -------
    list of float
        Per-row sums of all values dated within `win` days at or before
        each row's date.
    """
    dm = trans_date_to_dist_matrix(date_col=date_col)  # dm = distance matrix
    dm = dm.where(dm <= win)  # keep only pairs inside the time window
    # Lower triangle only, so each row sums over current and earlier dates.
    # np.bool was removed from NumPy; the builtin bool is the replacement.
    lt = dm.where(np.tril(np.ones(dm.shape)).astype(bool))
    lt[lt >= 0.0] = 1.0  # include same-day events (distance 0) in the mask
    lt = lt.fillna(0)  # NaN -> 0 so the mask can be used in multiplication
    return pd.DataFrame(x_col.values * lt.values).sum(axis=1).tolist()
def flatten(x):
    """Flatten *x* by two levels when possible, otherwise by one level.

    A doubly-nested input like [[[1], [2]], [[3]]] comes back fully flat;
    a singly-nested input like [[1, 2], [3]] is flattened once.
    """
    once = [v for sl in x for v in sl]
    try:
        return [v for sl in once for v in sl]
    except TypeError:
        # Elements of `once` are not iterable: input was only singly nested.
        # (The original used a bare `except:`; TypeError is what actually
        # fires here, and reusing `once` avoids re-iterating x.)
        return once
data = [
['David', '1/1/2015', 100], ['David', '1/5/2015', 500], ['David', '5/30/2015', 50], ['David', '7/25/2015', 50],
['Ryan', '1/4/2014', 100], ['Ryan', '1/19/2015', 500], ['Ryan', '3/31/2016', 50],
['Joe', '7/1/2015', 100], ['Joe', '9/9/2015', 500], ['Joe', '10/15/2015', 50]
]
list_of_vals = []
dates_df = pd.DataFrame(data=data, columns=['name', 'date', 'amount'], index=None)
dates_df['date'] = pd.to_datetime(dates_df['date'])
list_of_vals.append(dates_df.groupby('name', as_index=False).apply(
lambda x: lower_tri(x_col=x.amount, date_col=x.date, win=180)))
new_data = flatten(list_of_vals)
dates_df['rolling_sales_180'] = new_data
print dates_df
Your time and feedback are appreciated.
Pandas has support for time-aware rolling via the rolling method, so you can use that instead of writing your own solution from scratch:
def get_rolling_amount(grp, freq):
    """Time-windowed rolling sum of the 'amount' column, anchored on 'date'.

    `freq` is an offset alias such as '180D'; rows whose 'date' falls inside
    the trailing window contribute to each row's sum.
    """
    windowed = grp.rolling(freq, on='date')
    return windowed['amount'].sum()
df['rolling_sales_180'] = df.groupby('name', as_index=False, group_keys=False) \
.apply(get_rolling_amount, '180D')
The resulting output:
name date amount rolling_sales_180
0 David 2015-01-01 100 100.0
1 David 2015-01-05 500 600.0
2 David 2015-05-30 50 650.0
3 David 2015-07-25 50 100.0
4 Ryan 2014-01-04 100 100.0
5 Ryan 2015-01-19 500 500.0
6 Ryan 2016-03-31 50 50.0
7 Joe 2015-07-01 100 100.0
8 Joe 2015-09-09 500 600.0
9 Joe 2015-10-15 50 650.0
I have a function saved and defined in a different script called TechAnalisys.py This function just outputs a scalar, so I plan to use pd.rolling_apply() to generate a new column into the original dataframe (df).
The function works fine when executed on its own, but I have problems when using it with rolling_apply(). This link, Passing arguments to rolling_apply, shows how you should do it, and that is how I think my code is written, but the error "TypeError: int object is not iterable" still appears.
This is the function (located in the script TechAnalisys.py)
def hurst(df,days):
import pandas as pd
import numpy as np
df2 = pd.DataFrame()
df2 = df[-days:]
rango = lambda x: x.max() - x.min()
df2['ret'] = 1 - df.PX_LAST/df.PX_LAST.shift(1)
df2 = df2.dropna()
ave = pd.expanding_mean(df2.ret)
df2['desvdeprom'] = df2.ret - ave
df2['acum'] = df2['desvdeprom'].cumsum()
df2['rangorolled'] = pd.expanding_apply(df2.acum, rango)
df2['datastd'] = pd.expanding_std(df2.ret)
df2['rango_rangostd'] = np.log(df2.rangorolled/df2.datastd)
df2['tiempo1'] = np.log(range(1,len(df2.index)+1))
df2 = df2.dropna()
model1 = pd.ols(y=df2['rango_rangostd'], x=df2['tiempo1'], intercept=False)
return model1.beta
and now this is the main script:
import pandas as pd
import numpy as np
import TechAnalysis as ta
df = pd.DataFrame(np.log(np.cumsum(np.random.randn(100000)+1)+1000),columns =['PX_LAST'])
The following works:
print ta.hurst(df,50)
This doesn't work:
df['hurst_roll'] = pd.rolling_apply(df, 15 , ta.hurst, args=(50))
Whats wrong in the code?
If you check the type of df within the hurst function, you'll see that rolling_apply passes it as numpy.array.
If you create a DataFrame from this numpy.array inside rolling_apply, it works. I also used a longer window because there were only 15 values per array but you seemed to be planning on using the last 50 days.
def hurst(df, days):
# Estimate the Hurst exponent via rescaled-range analysis; returns the
# no-intercept OLS slope of log(R/S) on log(t).
# Rebuild a DataFrame: rolling_apply hands this function a bare numpy array.
df = pd.DataFrame(df, columns=['PX_LAST'])
df2 = pd.DataFrame()
# NOTE(review): .loc[-days:, :] is label-based; on a default RangeIndex the
# label -days does not exist, so on a monotonic index this selects ALL rows,
# not the last `days` -- presumably .iloc[-days:] was intended. Confirm.
df2 = df.loc[-days:, :]
rango = lambda x: x.max() - x.min()  # range (max - min) of a window
df2['ret'] = 1 - df.loc[:, 'PX_LAST']/df.loc[:, 'PX_LAST'].shift(1)  # simple returns
df2 = df2.dropna()
# NOTE(review): pd.expanding_mean / expanding_apply / expanding_std and
# pd.ols were removed from pandas; this snippet only runs on old pandas.
ave = pd.expanding_mean(df2.ret)
df2['desvdeprom'] = df2.ret - ave  # deviation from the expanding mean
df2['acum'] = df2['desvdeprom'].cumsum()  # cumulative deviation
df2['rangorolled'] = pd.expanding_apply(df2.acum, rango)  # expanding range R
df2['datastd'] = pd.expanding_std(df2.ret)  # expanding std S
df2['rango_rangostd'] = np.log(df2.rangorolled/df2.datastd)  # log(R/S)
df2['tiempo1'] = np.log(range(1, len(df2.index)+1))  # log time axis
df2 = df2.dropna()
# No-intercept OLS of log(R/S) on log(t); the slope estimates Hurst.
model1 = pd.ols(y=df2['rango_rangostd'], x=df2['tiempo1'], intercept=False)
return model1.beta
def rol_apply():
    """Demo: rolling 100-row Hurst estimate over a synthetic log-price series."""
    df = pd.DataFrame(np.log(np.cumsum(np.random.randn(1000) + 1) + 1000),
                      columns=['PX_LAST'])
    # pd.rolling_apply was removed from pandas; Series.rolling().apply is the
    # replacement. raw=True passes each window to `hurst` as a numpy array,
    # matching what the old rolling_apply supplied.
    df['hurst_roll'] = df['PX_LAST'].rolling(100).apply(
        lambda w: hurst(w, 50), raw=True)
PX_LAST hurst_roll
0 6.907911 NaN
1 6.907808 NaN
2 6.907520 NaN
3 6.908048 NaN
4 6.907622 NaN
5 6.909895 NaN
6 6.911281 NaN
7 6.911998 NaN
8 6.912245 NaN
9 6.912457 NaN
10 6.913794 NaN
11 6.914294 NaN
12 6.915157 NaN
13 6.916172 NaN
14 6.916838 NaN
15 6.917235 NaN
16 6.918061 NaN
17 6.918717 NaN
18 6.920109 NaN
19 6.919867 NaN
20 6.921309 NaN
21 6.922786 NaN
22 6.924173 NaN
23 6.925523 NaN
24 6.926517 NaN
25 6.928552 NaN
26 6.930198 NaN
27 6.931738 NaN
28 6.931959 NaN
29 6.932111 NaN
.. ... ...
970 7.562284 0.653381
971 7.563388 0.630455
972 7.563499 0.577746
973 7.563686 0.552758
974 7.564105 0.540144
975 7.564428 0.541411
976 7.564351 0.532154
977 7.564408 0.530999
978 7.564681 0.532376
979 7.565192 0.536758
980 7.565359 0.538629
981 7.566112 0.555789
982 7.566678 0.553163
983 7.566364 0.577953
984 7.567587 0.634843
985 7.568583 0.679807
986 7.569268 0.662653
987 7.570018 0.630447
988 7.570375 0.659497
989 7.570704 0.622190
990 7.571009 0.485458
991 7.571886 0.551147
992 7.573148 0.459912
993 7.574134 0.463146
994 7.574478 0.463158
995 7.574671 0.535014
996 7.575177 0.467705
997 7.575374 0.531098
998 7.575620 0.540611
999 7.576727 0.465572
[1000 rows x 2 columns]