Trying to scrape a table from www.fangraphs.com using BeautifulSoup (Python)

I have successfully scraped a leaderboard table from said site, at this URL:
https://www.fangraphs.com/leaders.aspx?pos=all&stats=bat&lg=all&qual=0&type=8&season=2022&month=1000&season1=2022&ind=0&team=0,ts&rost=0&age=0&filter=&players=0&startdate=2022-01-01&enddate=2022-09-13&sort=17,d
using the following code:
import pandas as pd
import requests
from datetime import date, timedelta
from bs4 import BeautifulSoup
import lxml
import numpy as np

def parse_array_from_fangraphs_html(start_date, end_date, URL_1):
    """
    Take an HTML stats page from fangraphs and parse it out to a dataframe.
    """
    # parse input
    PITCHERS_URL = URL_1
    # request the data
    pitchers_html = requests.get(PITCHERS_URL).text
    soup = BeautifulSoup(pitchers_html, "lxml")
    table = soup.find("table", {"class": "rgMasterTable"})
    # get headers
    headers_html = table.find("thead").find_all("th")
    headers = []
    for header in headers_html:
        headers.append(header.text)
    # get rows
    rows = []
    rows_html = table.find("tbody").find_all("tr")
    for row in rows_html:
        row_data = []
        for cell in row.find_all("td"):
            row_data.append(cell.text)
        rows.append(row_data)
    return pd.DataFrame(rows, columns=headers)

sdate = '2022-01-01'
enddate = date.today()
enddate = enddate.strftime("%Y-%m-%d")
#date.today() - timedelta(1)
#enddate = enddate.strftime("%Y-%m-%d")
PITCHERS = "https://www.fangraphs.com/leaders.aspx?pos=all&stats=bat&lg=all&qual=0&type=8&season=2022&month=1000&season1=2022&ind=0&team=0,ts&rost=0&age=0&filter=&players=0&startdate={}&enddate={}&sort=17,d".format(sdate, enddate)
wRC1 = parse_array_from_fangraphs_html(sdate, enddate, PITCHERS)
where the table is successfully assigned to the dataframe wRC1.
I'm trying to do something similar but with the following link:
https://www.fangraphs.com/players/trevor-rogers/22286/game-log?type=0&gds=2022-04-10&gde=2022-09-12&season=&position=P
using the following code:
import pandas as pd
import requests
from datetime import date, timedelta
from bs4 import BeautifulSoup
import lxml
import numpy as np

def parse_array_from_fangraphs_html(start_date, end_date, URL_1):
    """
    Take an HTML stats page from fangraphs and parse it out to a dataframe.
    """
    # parse input (the first two assignments below are leftovers and are
    # immediately overwritten by URL_1)
    PITCHERS_URL = "https://www.fangraphs.com/leaders.aspx?pos=all&stats=pit&lg=all&qual=0&type=c%2C13%2C7%2C8%2C120%2C121%2C331%2C105%2C111%2C24%2C19%2C14%2C329%2C324%2C45%2C122%2C6%2C42%2C43%2C328%2C330%2C322%2C323%2C326%2C332&season=2021&month=1000&season1=2015&ind=0&team=&rost=&age=&filter=&players=&startdate={}&enddate={}&page=1_2000".format(start_date, end_date)
    PITCHERS_URL = "https://www.fangraphs.com/leaders.aspx?pos=all&stats=bat&lg=all&qual=0&type=8&season=2022&month=1000&season1=2022&ind=0&team=0,ts&rost=0&age=0&filter=&players=0&startdate={}&enddate={}&sort=17,d".format(start_date, end_date)
    PITCHERS_URL = URL_1
    # request the data
    pitchers_html = requests.get(PITCHERS_URL).text
    soup = BeautifulSoup(pitchers_html, "lxml")
    table = soup.find("table", {"class": "rgMasterTable"})
    # get headers
    headers_html = table.find("thead").find_all("th")
    headers = []
    for header in headers_html:
        headers.append(header.text)
    # get rows
    rows = []
    rows_html = table.find("tbody").find_all("tr")
    for row in rows_html:
        row_data = []
        for cell in row.find_all("td"):
            row_data.append(cell.text)
        rows.append(row_data)
    return pd.DataFrame(rows, columns=headers)

sdate = '2022-01-01'
enddate = date.today()
enddate = enddate.strftime("%Y-%m-%d")
#date.today() - timedelta(1)
#enddate = enddate.strftime("%Y-%m-%d")
PITCHERS = "https://www.fangraphs.com/players/trevor-rogers/22286/game-log?type=0&gds=2022-04-10&gde=2022-09-12&season=&position=P"
df = parse_array_from_fangraphs_html(sdate, enddate, PITCHERS)
But the program ends up producing the following error:
AttributeError: 'NoneType' object has no attribute 'find'
What could be producing this problem? Thanks in advance!

That data is being pulled in dynamically by the page's JavaScript from an API endpoint. You can see that endpoint by inspecting the Network tab of your browser's dev tools. Here is one way to do it:
import pandas as pd
import requests
r = requests.get('https://cdn.fangraphs.com/api/players/game-log?playerid=22286&position=P&type=0&season=&gds=2022-04-10&gde=2022-09-12&z=1663107181')
df = pd.json_normalize(r.json()['mlb'])
print(df)
Result printed in terminal:
Date Opp teamid season Team HomeAway Age W L ERA G GS CG ShO SV HLD BS IP TBF H R ER HR BB IBB HBP WP BK SO K/9 BB/9 H/9 K/BB IFH% BUH% GB FB LD IFFB IFH BU BUH K% BB% K-BB% SIERA HR/9 AVG WHIP BABIP LOB% FIP E-F xFIP ERA- FIP- xFIP- GB/FB LD% GB% FB% IFFB% HR/FB RS RS/9 Balls Strikes Pitches WPA -WPA +WPA RE24 REW pLI inLI gmLI exLI Pulls Games WPA/LI Clutch SD MD FB%1 FBv SL% SLv CH% CHv wFB wSL wCH wFB/C wSL/C wCH/C O-Swing% Z-Swing% Swing% O-Contact% Z-Contact% Contact% Zone% F-Strike% SwStr% Pull Cent Oppo Soft Med Hard bipCount Pull% Cent% Oppo% Soft% Med% Hard% PlayerName playerid tERA GSv2 pfxFA% pfxSI% pfxSL% pfxCH% pfxvFA pfxvSI pfxvSL pfxvCH pfxFA-X pfxSI-X pfxSL-X pfxCH-X pfxFA-Z pfxSI-Z pfxSL-Z pfxCH-Z pfxwFA pfxwSI pfxwSL pfxwCH pfxwFA/C pfxwSI/C pfxwSL/C pfxwCH/C pfxO-Swing% pfxZ-Swing% pfxSwing% pfxO-Contact% pfxZ-Contact% pfxContact% pfxZone% pfxPace piCH% piFA% piSI% piSL% piXX% pivCH pivFA pivSI pivSL pivXX piCH-X piFA-X piSI-X piSL-X piXX-X piCH-Z piFA-Z piSI-Z piSL-Z piXX-Z piwCH piwFA piwSI piwSL piwXX piwCH/C piwFA/C piwSI/C piwSL/C piwXX/C piO-Swing% piZ-Swing% piSwing% piO-Contact% piZ-Contact% piContact% piZone% Events EV LA Barrels Barrel% maxEV HardHit HardHit% gamedate dh
0 2050-01-01 - - - 20 2022 - - - A 24 4.0 11.0 5.349057 22.0 22.0 0.0 0.0 0.0 0.0 0.0 106.0 470.0 113.0 67.0 63.0 14.0 44.0 0.0 5.0 5.0 0.0 105.0 8.915095 3.735849 9.594340 2.386364 0.067669 0.5 133.0 112.0 69.0 5.0 9.0 2.0 1.0 0.223404 0.093617 0.129787 4.186245 1.188679 0.268409 1.481132 0.327815 0.667135 4.243406 1.105651 4.094434 134.238531 107.477754 101.526711 1.187500 0.219745 0.423567 0.356688 0.044643 0.125000 50.0 4.245283 685.0 1223.0 1908.0 -1.919216 -9.791034 7.871818 -15.8036 -1.638929e+00 0.998163 0.882337 0.867627 1.019309 22.0 22.0 -1.502681 -0.420067 0.0 0.0 0.527254 94.645129 0.179769 80.728863 0.292977 85.706619 -8.878565 -1.319738 -2.106202 -0.882561 -0.384763 -0.376780 0.322813 0.715539 0.487153 0.695531 0.826620 0.776103 0.418458 0.612766 0.109015 133.0 109.0 74.0 62.0 162.0 92.0 316.0 0.420886 0.344937 0.234177 0.196203 0.512658 0.291139 Trevor Rogers 22286 4.566455 45.0 0.525157 0.002096 0.179769 0.292977 94.608583 94.849997 80.678718 85.654563 8.065439 11.7475 -3.389592 9.090555 8.385319 3.2725 3.808251 1.335760 -7.785403 -0.614196 -1.246096 -1.899808 -0.776986 -15.354905 -0.363293 -0.339858 0.298838 0.674298 0.487945 0.660777 0.824074 0.774436 0.503669 23.385816 0.292453 0.524633 0.002096 0.179769 0.001048 85.611075 94.589424 94.797028 80.623514 91.021751 8.324599 7.148858 10.731025 -4.301155 5.42764 0.261684 7.435025 2.438555 2.763215 6.392905 -1.972312 -7.847004 -0.614196 -1.246096 0.134105 -0.353461 -0.783916 -15.354905 -0.363293 6.705273 0.298319 0.675732 0.487421 0.665493 0.823529 0.775269 0.501048 316.0 88.299055 11.857416 23.0 0.072785 113.824 116.0 0.367089 2050-01-01 0
1 2022-09-12 TEX 20 2022 MIA H 24 0.0 0.0 2.842105 1.0 1.0 0.0 0.0 0.0 0.0 0.0 6.1 24.0 4.0 2.0 2.0 0.0 2.0 0.0 0.0 0.0 0.0 9.0 12.789474 2.842105 5.684211 4.500000 0.000000 0.0 7.0 3.0 3.0 0.0 0.0 0.0 0.0 0.375000 0.083333 0.291667 2.394518 0.000000 0.181818 0.947368 0.307692 0.666667 1.226027 1.616078 1.928979 71.324734 32.517111 48.896813 2.333333 0.230769 0.538462 0.230769 0.000000 0.000000 2.0 2.842105 29.0 60.0 89.0 0.034852 -0.393320 0.428172 0.9269 9.934618e-02 0.979617 1.082800 0.870000 3.073200 1.0 1.0 0.203277 -0.167700 0.0 0.0 0.573034 94.980392 0.067416 83.500000 0.359551 86.156250 0.518355 0.342558 0.944681 1.016382 5.709294 2.952127 0.301887 0.750000 0.483146 0.500000 0.740741 0.651163 0.404494 0.583333 0.168539 4.0 5.0 4.0 1.0 8.0 4.0 13.0 0.307692 0.384615 0.307692 0.076923 0.615385 0.307692 Trevor Rogers 22286 1.677209 68.0 0.573034 NaN 0.067416 0.359551 94.907839 NaN 83.483337 86.134377 9.537647 NaN -1.148333 10.087188 8.466666 NaN 4.296667 1.082500 0.518355 NaN 0.342558 0.944681 1.016382 NaN 5.709294 2.952127 0.325581 0.652174 0.494382 0.500000 0.700000 0.636364 0.516854 20.453125 0.359551 0.573034 NaN 0.067416 NaN 86.203064 94.993490 NaN 83.560450 NaN 9.029422 8.369133 NaN -2.768143 NaN -0.289749 7.267294 NaN 2.612752 NaN 0.944681 0.518355 NaN 0.342558 NaN 2.952127 1.016382 NaN 5.709294 NaN 0.325581 0.652174 0.494382 0.500000 0.700000 0.636364 0.516854 13.0 92.482628 2.117006 1.0 0.076923 105.379 6.0 0.461538 2022-09-12 1
2 2022-09-07 #PHI 20 2022 MIA A 24 0.0 1.0 4.500000 1.0 1.0 0.0 0.0 0.0 0.0 0.0 6.0 23.0 5.0 3.0 3.0 2.0 0.0 0.0 0.0 1.0 0.0 8.0 12.000000 0.000000 7.500000 8.000000 0.166667 0.0 6.0 6.0 3.0 0.0 1.0 0.0 0.0 0.347826 0.000000 0.347826 2.000855 3.000000 0.217391 0.833333 0.230769 0.909091 4.787431 -0.287431 1.938107 112.930823 120.992954 49.118658 1.000000 0.200000 0.400000 0.400000 0.000000 0.333333 2.0 3.000000 30.0 57.0 87.0 -0.059300 -0.440200 0.380900 0.0000 -3.725290e-09 0.804167 0.915000 0.920000 0.750000 1.0 1.0 -0.103264 0.029523 0.0 0.0 0.620690 94.185185 0.160920 82.428571 0.218391 85.947368 2.034562 -0.678451 -1.764662 3.767707 -4.846082 -9.287697 0.318182 0.674419 0.494253 0.642857 0.862069 0.790698 0.494253 0.608696 0.103448 9.0 4.0 2.0 2.0 8.0 5.0 15.0 0.600000 0.266667 0.133333 0.133333 0.533333 0.333333 Trevor Rogers 22286 4.851247 52.0 0.620690 NaN 0.160920 0.218391 94.181478 NaN 82.321429 85.836843 9.481296 NaN -0.274286 10.177368 8.612408 NaN 4.684286 1.065789 2.034562 NaN -0.678451 -1.764662 3.767707 NaN -4.846082 -9.287697 0.238095 0.733333 0.494253 0.600000 0.848485 0.790698 0.517241 22.000000 0.218391 0.620690 NaN 0.160920 NaN 86.182418 94.539867 NaN 82.674308 NaN 8.901917 8.122533 NaN -1.927560 NaN -0.384359 7.333927 NaN 2.989588 NaN -1.764662 2.034562 NaN -0.678451 NaN -9.287697 3.767707 NaN -4.846082 NaN 0.250000 0.702128 0.494253 0.600000 0.848485 0.790698 0.540230 15.0 89.261637 23.314498 2.0 0.133333 109.308 7.0 0.466667 2022-09-07 0
[...]
The data returned is quite extensive: you can slice and dice it further to get what you want from there.
For relevant pandas documentation, see https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.json_normalize.html
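For example, here is a minimal sketch of trimming the wide result down to a handful of familiar columns; the column names are taken from the printout above, and the API call is unchanged:
import pandas as pd
import requests

r = requests.get('https://cdn.fangraphs.com/api/players/game-log?playerid=22286&position=P&type=0&season=&gds=2022-04-10&gde=2022-09-12&z=1663107181')
df = pd.json_normalize(r.json()['mlb'])

# keep only a few columns; these names all appear in the full printout above
cols = ['Date', 'Opp', 'IP', 'TBF', 'H', 'R', 'ER', 'HR', 'BB', 'SO', 'ERA']
print(df[cols].head())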

Does the auto_arima predict function return a Series?

I have been following this tutorial:
https://www.analyticsvidhya.com/blog/2021/11/basic-understanding-of-time-series-modelling-with-auto-arimax/
and fitting the model on my dataset.
It was working fine, but when I ran it again the next day the model.predict function started returning a Series rather than a list, and it no longer gives proper results like it did previously.
model = pm.auto_arima(df_train["farm rate"], exogenous=df_train[exogenous_features], trace=True, error_action="ignore", suppress_warnings=True)
model.fit(df_train["farm rate"], exogenous=df_train[exogenous_features])
forecast = model.predict(n_periods=len(df_valid), exogenous=df_valid[exogenous_features])
df_valid["Forecast_ARIMAX"] = forecast
My df looks like this
date day doc farm rate open close price month year day_of_week week doc_mean_3 open_mean_3 close_mean_3 price_mean_3
date
2015-01-01 2015-01-01 1.0 51.5 165.0 170.0 170.0 2375.0 1.0 2015.0 3.0 1.0 38.574580 155.158973 150.355472 3418.872350
2015-01-02 2015-01-02 2.0 48.5 165.0 170.0 160.0 2375.0 1.0 2015.0 4.0 1.0 38.574580 155.158973 150.355472 3418.872350
2015-01-03 2015-01-03 3.0 44.5 150.0 180.0 167.5 2375.0 1.0 2015.0 5.0 1.0 48.166667 173.333333 165.833333 2375.000000
2015-01-05 2015-01-05 5.0 40.5 150.0 165.0 155.0 2375.0 1.0 2015.0 0.0 2.0 44.500000 171.666667 160.833333 2375.000000
2015-01-06 2015-01-06 6.0 36.5 140.0 155.0 152.5 2375.0 1.0 2015.0 1.0 2.0 40.500000 166.666667 158.333333 2375.000000
2015-01-07 2015-01-07 7.0 35.5 140.0 155.0 150.0 2375.0 1.0 2015.0 2.0 2.0 37.500000 158.333333 152.500000 2375.000000
2015-01-08 2015-01-08 8.0 35.5 140.0 162.5 155.0 2375.0 1.0 2015.0 3.0 2.0 35.833333 157.500000 152.500000 2375.000000
2015-01-09 2015-01-09 9.0 37.5 145.0 165.0 157.5 2375.0 1.0 2015.0 4.0 2.0 36.166667 160.833333
I have data from 2015 to 2022. The exogenous features are: ["doc", "open", "close", "doc_mean_3", "open_mean_3", "close_mean_3", "price_mean_3"]
They are calculated using this method:
def calc_lag_features(df, feature, window):
    df[f"{feature}_mean_{window}"] = df[feature].rolling(window=window).mean()
    return df
Edit:
These are the results I get: a completely flat graph. Adding or removing exogenous variables isn't making any difference. Previously, the model was able to capture the true graph to quite an extent.
[plot omitted: the forecast is a flat line]
Depending on the exogenous variables, the straight line only shifts up or down and does not capture the true trend.
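No answer is recorded here, but one plausible cause (an assumption, not something confirmed by the question) is the pmdarima 2.0 release, which changed predict() to return a pandas Series and renamed the exogenous keyword to X. A minimal sketch of making the assignment robust under that assumption:
import numpy as np

# Assumption: pmdarima >= 2.0, where the keyword is `X` and predict()
# returns a pandas Series with a positional integer index.
forecast = model.predict(n_periods=len(df_valid), X=df_valid[exogenous_features])

# Strip the Series' index before assigning, so pandas doesn't try to align
# it against df_valid's DatetimeIndex (which would fill the column with NaN):
df_valid["Forecast_ARIMAX"] = np.asarray(forecast)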

View data after transformation

Is it possible to see the data after Altair applies transformations and aggregations?
For example, can you access the underlying data after the following transformations?
import altair as alt
from vega_datasets import data
source = data.seattle_weather.url
step = 20
overlap = 1
alt.Chart(source, height=step).transform_timeunit(
    Month='month(date)'
).transform_joinaggregate(
    mean_temp='mean(temp_max)', groupby=['Month']
).transform_bin(
    ['bin_max', 'bin_min'], 'temp_max'
).transform_aggregate(
    value='count()', groupby=['Month', 'mean_temp', 'bin_min', 'bin_max']
).transform_impute(
    impute='value', groupby=['Month', 'mean_temp'], key='bin_min', value=0
).mark_area(...
)
The code above is from the Ridgeline plot example.
Transforms are evaluated in JavaScript, and there is no built-in way to access data in the JavaScript frontend from the Python backend. However, there is an experimental package called altair_transform that is able to evaluate most Vega expressions in Python.
For your chart you can use it like this:
import altair as alt
from vega_datasets import data
source = data.seattle_weather()
step = 20
overlap = 1
chart = alt.Chart(source, height=step).transform_timeunit(
    Month='month(date)'
).transform_joinaggregate(
    mean_temp='mean(temp_max)', groupby=['Month']
).transform_bin(
    ['bin_max', 'bin_min'], 'temp_max'
).transform_aggregate(
    value='count()', groupby=['Month', 'mean_temp', 'bin_min', 'bin_max']
).transform_impute(
    impute='value', groupby=['Month', 'mean_temp'], key='bin_min', value=0
).mark_area().encode(
    x='Month:T',
    y='value:Q'
)

import altair_transform
data = altair_transform.extract_data(chart)
print(data)
bin_min Month mean_temp bin_max value
0 0.0 1900-01-01 8.229032 -5.0 2.0
1 5.0 1900-01-01 8.229032 0.0 19.0
2 10.0 1900-01-01 8.229032 5.0 72.0
3 15.0 1900-01-01 8.229032 10.0 29.0
4 20.0 1900-01-01 8.229032 15.0 2.0
.. ... ... ... ... ...
103 20.0 1900-12-01 8.194355 15.0 4.0
104 25.0 1900-12-01 8.194355 NaN 0.0
105 30.0 1900-12-01 8.194355 NaN 0.0
106 35.0 1900-12-01 8.194355 NaN 0.0
107 40.0 1900-12-01 8.194355 NaN 0.0
[108 rows x 5 columns]
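Note one subtle change from the snippet in the question: the answer passes the actual DataFrame (data.seattle_weather()) rather than the URL (data.seattle_weather.url), presumably because altair_transform evaluates the transforms in Python and therefore needs the data itself, not a reference that only the JavaScript frontend would fetch.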

How to disable calculating with NaNs in pandas resample().mean() and resample().sum()?

I need to calculate the annual mean from monthly data. If there is a nan value in my monthly data, I want the whole year to be nan as well.
This is my code so far:
station_data = pd.read_csv(station_data_files[0], sep=';', header=0)
station_data = station_data.replace(-999, np.nan)
station_data = station_data.set_index("MESS_DATUM_BEGINN") # this column holds the dates
station_data_anual = pd.DataFrame()
station_data_anual["Y_TT"] = station_data["MO_TT"].resample("A").mean()
station_data_anual["Y_RR"] = station_data["MO_RR"].resample("A").sum()
The problem is that this ignores the NaNs, which means, e.g., that the station_data_anual["Y_RR"] values are too low. For years in which I have only NaNs as monthly values, it returns 0.
Note: There are some questions similar to mine, but they didn't help me.
Note: Python
Some clarifications:
Input Data:
station_data
Out[235]:
STATIONS_ID MESS_DATUM_ENDE QN_4 ... MO_RR MX_RS eor
MESS_DATUM_BEGINN ...
1981-01-01 403.0 1981-01-31 10.0 ... 51.5 10.0 eor
1981-02-01 403.0 1981-02-28 10.0 ... 23.8 5.4 eor
1981-03-01 403.0 1981-03-31 10.0 ... 116.5 28.0 eor
1981-04-01 403.0 1981-04-30 10.0 ... 24.1 9.5 eor
1981-05-01 403.0 1981-05-31 10.0 ... 29.4 8.4 eor
... ... ... ... ... ... ...
2010-08-01 403.0 2010-08-31 10.0 ... NaN 29.1 eor
2010-09-01 403.0 2010-09-30 10.0 ... NaN 29.8 eor
2010-10-01 403.0 2010-10-31 10.0 ... NaN 5.5 eor
2010-11-01 403.0 2010-11-30 10.0 ... NaN 17.5 eor
2010-12-01 403.0 2010-12-31 10.0 ... NaN 8.2 eor
[360 rows x 16 columns]
Have a closer look:
station_data["MO_RR"][276:288]
Out[242]:
MESS_DATUM_BEGINN
2004-01-01 66.3
2004-02-01 NaN
2004-03-01 NaN
2004-04-01 NaN
2004-05-01 NaN
2004-06-01 NaN
2004-07-01 NaN
2004-08-01 NaN
2004-09-01 NaN
2004-10-01 NaN
2004-11-01 NaN
2004-12-01 NaN
Name: MO_RR, dtype: float64
Output Data:
station_data_anual
Out[238]:
Y_TT Y_RR
MESS_DATUM_BEGINN
...
2003-12-31 9.866667 430.5
2004-12-31 9.620833 66.3
2005-12-31 9.665833 0.0
2006-12-31 10.158333 0.0
2007-12-31 10.555000 0.0
2008-12-31 10.361667 0.0
2009-12-31 9.587500 0.0
2010-12-31 8.207500 0.0
My result has to look like:
                        Y_TT   Y_RR
MESS_DATUM_BEGINN
...
2003-12-31 9.866667 430.5
2004-12-31 9.620833 nan # getting nan instead of 66.3 is especially important
2005-12-31 9.665833 nan
2006-12-31 10.158333 nan
2007-12-31 10.555000 nan
2008-12-31 10.361667 nan
2009-12-31 9.587500 nan
2010-12-31 8.207500 nan
I have never used resampling, and there might be better solutions out there which could simply ignore the "group" based on a "condition", but a very simple solution could be to use a custom mean function after resample:
def very_mean(array_like):
    if any(pd.isnull(array_like)):
        return np.nan
    else:
        return array_like.mean()

station_data_anual["Y_TT"] = station_data["MO_TT"].resample("A").apply(very_mean)
Could you try to remove the NaN values first, using dropna?
station_data_anual = pd.DataFrame()
station_data_anual["Y_TT"] = station_data["MO_TT"].dropna().resample("A").mean()
station_data_anual["Y_RR"] = station_data["MO_RR"].dropna().resample("A").sum()
It seems that NaN values are not included in the mean, considering the following experiment:
df_ = pd.DataFrame(index=pd.date_range("2022", "2023", periods=12))
df_['a'] = np.ones(12)
df_.iloc[1, 0] = np.nan  # direct iloc assignment; chained indexing may silently fail
df_.resample("2M").mean()
All of the averaged two-month periods still have 1.0 as the mean value in the output of the mean() calculation above:
a
2022-01-31 1.0
2022-03-31 1.0
2022-05-31 1.0
2022-07-31 1.0
2022-09-30 1.0
2022-11-30 1.0
2023-01-31 1.0
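To see the NaN actually propagate in this experiment, the same skipna=False idea from the answer above can be applied (a sketch, not part of the original experiment):
df_.resample("2M").apply(lambda s: s.mean(skipna=False))
# the two-month bucket containing the NaN row now comes out as NaN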

Problems understanding the logic when creating code using groupby, list comprehensions and custom functions

I want to calculate a rolling mean of different window sizes for each ticker in my dataframe. Ideally I could pass a list of window sizes and for each ticker I would get new columns (one for each rolling mean size). So if I wanted a rolling mean of 2 and one of 3, the output would be two columns for each ticker.
import datetime as dt
import numpy as np
import pandas as pd
Dt_df = pd.DataFrame({"Date":pd.date_range('2018-07-01', periods=5, freq='D')})
Tick_df = pd.DataFrame({"Ticker":['ABC',"HIJ","XYZ"]})
Mult_df = pd.merge(Tick_df.assign(key='x'), Dt_df.assign(key='x'), on='key').drop(columns='key')
df2 = pd.DataFrame(np.random.randint(low=5, high=10, size=(15, 1)), columns=['Price'])
df3 = Mult_df.join(df2, how='outer')
df3.set_index(['Ticker','Date'],inplace = True)
Here is the example dataset: [image of df3 omitted]
When I try to apply this function:
def my_RollMeans(x):
    w = [1, 2, 3]
    s = pd.Series(x)
    Bob = pd.DataFrame([s.rolling(w1).mean() for w1 in w]).T
    return Bob
to my dataframe df3 using various versions of apply or transform I get errors.
NewDF = df3.groupby('Ticker').Price.transform(my_RollMeans).fillna(0)
The latest error is:
Data must be 1-dimensional
IIUC, try using apply; I made a modification to your custom function:
def my_RollMeans(x):
    w = [1, 2, 3]
    s = pd.Series(x)
    Bob = pd.DataFrame([s.rolling(w1).mean().rename('Price_' + str(w1)) for w1 in w]).T
    return Bob

df3.groupby('Ticker').apply(lambda x: my_RollMeans(x.Price)).fillna(0)
Output:
Price_1 Price_2 Price_3
Ticker Date
ABC 2018-07-01 9.0 0.0 0.000000
2018-07-02 8.0 8.5 0.000000
2018-07-03 7.0 7.5 8.000000
2018-07-04 8.0 7.5 7.666667
2018-07-05 8.0 8.0 7.666667
HIJ 2018-07-01 8.0 0.0 0.000000
2018-07-02 9.0 8.5 0.000000
2018-07-03 5.0 7.0 7.333333
2018-07-04 6.0 5.5 6.666667
2018-07-05 7.0 6.5 6.000000
XYZ 2018-07-01 9.0 0.0 0.000000
2018-07-02 5.0 7.0 0.000000
2018-07-03 9.0 7.0 7.666667
2018-07-04 8.0 8.5 7.333333
2018-07-05 6.0 7.0 7.666667
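For what it's worth, the "Data must be 1-dimensional" error comes from transform itself: transform expects the function to return a one-dimensional, same-length result for each group, while my_RollMeans returns a whole DataFrame, which is why apply is the right tool above. An alternative sketch that does work with transform is one call per window size (the Price_1/Price_2/Price_3 names mirror the output above):
# One transform per window: each call returns a 1-D Series of the same
# length as the group, which is exactly what transform requires.
for w in [1, 2, 3]:
    df3[f"Price_{w}"] = (
        df3.groupby("Ticker")["Price"]
           .transform(lambda s: s.rolling(w).mean())
    )
df3 = df3.fillna(0)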

How do I merge 2 dataframes with an index in only 1 dataframe?

I have created 2 pandas data frames. The first, called 'dfmas', has an index 'Date', then dates, data, and 3 moving-average columns:
OPEN HIGH LOW LAST ma5 ma8 ma21
Date
11/23/2009 88.84 89.19 88.58 88.97 NaN NaN NaN
11/24/2009 88.97 89.07 88.36 88.50 NaN NaN NaN
11/25/2009 88.50 88.63 87.22 87.35 NaN NaN NaN
11/26/2009 87.35 87.48 86.30 86.59 NaN NaN NaN
11/27/2009 86.59 87.02 84.83 86.53 87.588 NaN NaN
11/30/2009 87.17 87.17 85.87 86.41 87.076 NaN NaN
12/1/2009 86.41 87.53 86.17 86.68 86.712 NaN NaN
12/2/2009 86.68 87.49 86.59 87.39 86.720 87.302 NaN
12/3/2009 87.39 88.48 87.32 88.26 87.054 87.214 NaN
12/4/2009 88.26 90.77 88.00 90.56 87.860 87.471 NaN
The second dataframe is made from the above data by looking at when the moving averages cross over:
ma = [0,]
ma5Last = ma5[0]
ma8Last = ma8[0]
for ma5Curr, ma8Curr in zip(ma5[1:], ma8[1:]):
    if ma5Curr > ma5Last and ma8Curr > ma8Last:
        ma.append(1)
    elif ma5Curr < ma5Last and ma8Curr < ma8Last:
        ma.append(-1)
    else:
        ma.append(0)
    ma5Last = ma5Curr
    ma8Last = ma8Curr

maX = pd.DataFrame(ma).astype('float')
maX.columns = ['maX']
and is called 'maX' below:
maX
0 0.0
1 0.0
2 0.0
3 0.0
4 0.0
5 0.0
6 0.0
7 0.0
8 0.0
9 1.0
However, I'm unable to merge/concat the 2 data frames.
How do I add the 'Date' index to the second 'maX' dataframe and then merge/concat/combine the two dataframes together? Many thanks in advance.
Is this what you are after?
df['maX'] = maX.maX.values
df
Out[1263]:
OPEN HIGH LOW LAST ma5 ma8 ma21 maX
Date
11/23/2009 88.84 89.19 88.58 88.97 NaN NaN NaN 0.0
11/24/2009 88.97 89.07 88.36 88.50 NaN NaN NaN 0.0
11/25/2009 88.50 88.63 87.22 87.35 NaN NaN NaN 0.0
11/26/2009 87.35 87.48 86.30 86.59 NaN NaN NaN 0.0
11/27/2009 86.59 87.02 84.83 86.53 87.588 NaN NaN 0.0
11/30/2009 87.17 87.17 85.87 86.41 87.076 NaN NaN 0.0
12/1/2009 86.41 87.53 86.17 86.68 86.712 NaN NaN 0.0
12/2/2009 86.68 87.49 86.59 87.39 86.720 87.302 NaN 0.0
12/3/2009 87.39 88.48 87.32 88.26 87.054 87.214 NaN 0.0
12/4/2009 88.26 90.77 88.00 90.56 87.860 87.471 NaN 1.0
If the dataframes have the same length, simply build the second one with the index of the original DataFrame so the indexes align:
maX = pd.DataFrame(ma, index=df.index).astype('float')
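With the Date index in place, the two frames can then be combined by index; here is a small sketch building on the line above (join is just one option among merge/concat):
# Build maX on df's index, then combine on that shared index:
maX = pd.DataFrame({'maX': ma}, index=df.index).astype('float')
df_combined = df.join(maX)  # index-aligned join
# equivalently: pd.concat([df, maX], axis=1)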
