View data after transformation - python

Is it possible to see the data after Altair applies transformations and aggregations?
For example, can you access the underlying data after the following transformations?
import altair as alt
from vega_datasets import data
source = data.seattle_weather.url
step = 20
overlap = 1
alt.Chart(source, height=step).transform_timeunit(
    Month='month(date)'
).transform_joinaggregate(
    mean_temp='mean(temp_max)', groupby=['Month']
).transform_bin(
    ['bin_max', 'bin_min'], 'temp_max'
).transform_aggregate(
    value='count()', groupby=['Month', 'mean_temp', 'bin_min', 'bin_max']
).transform_impute(
    impute='value', groupby=['Month', 'mean_temp'], key='bin_min', value=0
).mark_area(...
)
The code above is from the Ridgeline plot example.

Transforms are evaluated in JavaScript, and there is no built-in way to access data in the JavaScript frontend from the Python backend. However, there is an experimental package called altair_transform (installable with pip install altair_transform) that is able to evaluate most Vega expressions in Python.
For your chart you can use it like this:
import altair as alt
from vega_datasets import data
source = data.seattle_weather()
step = 20
overlap = 1
chart = alt.Chart(source, height=step).transform_timeunit(
    Month='month(date)'
).transform_joinaggregate(
    mean_temp='mean(temp_max)', groupby=['Month']
).transform_bin(
    ['bin_max', 'bin_min'], 'temp_max'
).transform_aggregate(
    value='count()', groupby=['Month', 'mean_temp', 'bin_min', 'bin_max']
).transform_impute(
    impute='value', groupby=['Month', 'mean_temp'], key='bin_min', value=0
).mark_area().encode(
    x='Month:T',
    y='value:Q'
)
import altair_transform
data = altair_transform.extract_data(chart)
print(data)
     bin_min      Month  mean_temp  bin_max  value
0        0.0 1900-01-01   8.229032     -5.0    2.0
1        5.0 1900-01-01   8.229032      0.0   19.0
2       10.0 1900-01-01   8.229032      5.0   72.0
3       15.0 1900-01-01   8.229032     10.0   29.0
4       20.0 1900-01-01   8.229032     15.0    2.0
..       ...        ...        ...      ...    ...
103     20.0 1900-12-01   8.194355     15.0    4.0
104     25.0 1900-12-01   8.194355      NaN    0.0
105     30.0 1900-12-01   8.194355      NaN    0.0
106     35.0 1900-12-01   8.194355      NaN    0.0
107     40.0 1900-12-01   8.194355      NaN    0.0
[108 rows x 5 columns]
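As an alternative: recent Altair releases (5.0 and up) ship a built-in Chart.transformed_data() method, backed by the vegafusion package, which evaluates the transform pipeline in Python directly. A minimal sketch, assuming Altair >= 5 with the vegafusion dependency installed:
# Assumes Altair >= 5 and vegafusion are installed; transformed_data()
# evaluates the chart's transforms and returns a pandas DataFrame
transformed = chart.transformed_data()
print(transformed.head())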

Related

Trying to scrape table from www.fangraphs.com using BeautifulSoup (Python)

I have successfully scraped a leaderboard table from said site, at this URL:
https://www.fangraphs.com/leaders.aspx?pos=all&stats=bat&lg=all&qual=0&type=8&season=2022&month=1000&season1=2022&ind=0&team=0,ts&rost=0&age=0&filter=&players=0&startdate=2022-01-01&enddate=2022-09-13&sort=17,d
using the following code:
import pandas as pd
import requests
from datetime import date, timedelta
from bs4 import BeautifulSoup
import lxml
import numpy as np
def parse_array_from_fangraphs_html(start_date, end_date, URL_1):
    """
    Take a HTML stats page from fangraphs and parse it out to a dataframe.
    """
    # parse input
    PITCHERS_URL = URL_1
    # request the data
    pitchers_html = requests.get(PITCHERS_URL).text
    soup = BeautifulSoup(pitchers_html, "lxml")
    table = soup.find("table", {"class": "rgMasterTable"})
    # get headers
    headers_html = table.find("thead").find_all("th")
    headers = []
    for header in headers_html:
        headers.append(header.text)
    # get rows
    rows = []
    rows_html = table.find("tbody").find_all("tr")
    for row in rows_html:
        row_data = []
        for cell in row.find_all("td"):
            row_data.append(cell.text)
        rows.append(row_data)
    return pd.DataFrame(rows, columns=headers)

sdate = '2022-01-01'
enddate = date.today()
enddate = enddate.strftime("%Y-%m-%d")
# date.today() - timedelta(1)
# enddate = enddate.strftime("%Y-%m-%d")
PITCHERS = "https://www.fangraphs.com/leaders.aspx?pos=all&stats=bat&lg=all&qual=0&type=8&season=2022&month=1000&season1=2022&ind=0&team=0,ts&rost=0&age=0&filter=&players=0&startdate={}&enddate={}&sort=17,d".format(sdate, enddate)
wRC1 = parse_array_from_fangraphs_html(sdate, enddate, PITCHERS)
where the table is successfully assigned to the dataframe wRC1.
I'm trying to do something similar but with the following link:
https://www.fangraphs.com/players/trevor-rogers/22286/game-log?type=0&gds=2022-04-10&gde=2022-09-12&season=&position=P
using the following code:
import pandas as pd
import requests
from datetime import date, timedelta
from bs4 import BeautifulSoup
import lxml
import numpy as np
def parse_array_from_fangraphs_html(start_date, end_date, URL_1):
    """
    Take a HTML stats page from fangraphs and parse it out to a dataframe.
    """
    # parse input
    PITCHERS_URL = "https://www.fangraphs.com/leaders.aspx?pos=all&stats=pit&lg=all&qual=0&type=c%2C13%2C7%2C8%2C120%2C121%2C331%2C105%2C111%2C24%2C19%2C14%2C329%2C324%2C45%2C122%2C6%2C42%2C43%2C328%2C330%2C322%2C323%2C326%2C332&season=2021&month=1000&season1=2015&ind=0&team=&rost=&age=&filter=&players=&startdate={}&enddate={}&page=1_2000".format(start_date, end_date)
    PITCHERS_URL = "https://www.fangraphs.com/leaders.aspx?pos=all&stats=bat&lg=all&qual=0&type=8&season=2022&month=1000&season1=2022&ind=0&team=0,ts&rost=0&age=0&filter=&players=0&startdate={}&enddate={}&sort=17,d".format(start_date, end_date)
    PITCHERS_URL = URL_1
    # request the data
    pitchers_html = requests.get(PITCHERS_URL).text
    soup = BeautifulSoup(pitchers_html, "lxml")
    table = soup.find("table", {"class": "rgMasterTable"})
    # get headers
    headers_html = table.find("thead").find_all("th")
    headers = []
    for header in headers_html:
        headers.append(header.text)
    # get rows
    rows = []
    rows_html = table.find("tbody").find_all("tr")
    for row in rows_html:
        row_data = []
        for cell in row.find_all("td"):
            row_data.append(cell.text)
        rows.append(row_data)
    return pd.DataFrame(rows, columns=headers)

sdate = '2022-01-01'
enddate = date.today()
enddate = enddate.strftime("%Y-%m-%d")
# date.today() - timedelta(1)
# enddate = enddate.strftime("%Y-%m-%d")
PITCHERS = "https://www.fangraphs.com/players/trevor-rogers/22286/game-log?type=0&gds=2022-04-10&gde=2022-09-12&season=&position=P"
df = parse_array_from_fangraphs_html(sdate, enddate, PITCHERS)
But the program ends up producing the following error:
AttributeError: 'NoneType' object has no attribute 'find'
What could be producing this problem? Thanks in advance!
That data is being pulled dynamically by the page's JavaScript from an API endpoint. You can see that endpoint by inspecting the Dev tools - Network tab in your browser. Here is one way to do it:
import pandas as pd
import requests
r = requests.get('https://cdn.fangraphs.com/api/players/game-log?playerid=22286&position=P&type=0&season=&gds=2022-04-10&gde=2022-09-12&z=1663107181')
df = pd.json_normalize(r.json()['mlb'])
print(df)
Result printed in terminal (the frame is very wide, with 200+ columns, so it is truncated here to the first and last few):
         Date   Opp teamid season Team HomeAway Age    W     L       ERA ...    gamedate dh
0  2050-01-01     -     20   2022    -        A  24  4.0  11.0  5.349057 ...  2050-01-01  0
1  2022-09-12   TEX     20   2022  MIA        H  24  0.0   0.0  2.842105 ...  2022-09-12  1
2  2022-09-07  #PHI     20   2022  MIA        A  24  0.0   1.0  4.500000 ...  2022-09-07  0
[...]
The data returned is quite extensive: you can slice and dice it further to get what you want from there.
For relevant pandas documentation, see https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.json_normalize.html
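For example, to slice out a few familiar columns from the wide frame (the column names below are taken from the printed output above):
import pandas as pd
import requests

r = requests.get('https://cdn.fangraphs.com/api/players/game-log?playerid=22286&position=P&type=0&season=&gds=2022-04-10&gde=2022-09-12&z=1663107181')
df = pd.json_normalize(r.json()['mlb'])
# keep only a handful of the 200+ columns
print(df[['Date', 'Opp', 'IP', 'TBF', 'SO', 'ERA']])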

Smart way to plot (very skewed) wealth distribution across different years?

I have multiple dataframes (different years) that look like the dataframe below. Each dataframe contains the share of wealth each id holds, across 1,000 equally sized x-axis bins (so, for instance, if there are 4,000,000 individuals, each bin represents the sum over 4,000 individuals, in descending order). What I want is to plot this in one chart. I am lacking creativity as to the best way to show these very skewed wealth distributions across different years...
When I look at my dataframe from year 2021, the top 0.1% holds 92% of all wealth. So when I plot it using a bar chart, it looks like just one straight vertical line, and if I use a line chart, it is an L-shaped graph. I was thinking maybe I should use different x-axis bin widths; that is, instead of 1,000 equal-sized bins on the x-axis, maybe top 0.1%, top 0.1-0.5%, top 0.5-1%, 1-5%, 5-10%, 10-20%, ... etc.
If anyone has a good idea, I'd really appreciate it!
x wealth_share_2016
1 0.33430437283205316
2 0.08857907028903435
3 0.05827083476711605
4 0.03862747269456592
5 0.034995688078949164
6 0.025653645763917113
7 0.021026627708501285
8 0.018026751734878957
9 0.01642864468243111
10 0.015728925648574896
11 0.013588290634843092
12 0.01227954727973525
13 0.011382643296594532
14 0.010141965617682762
15 0.008819245941582449
..
1000 0.000000000011221421
x wealth_share_2017
0.0 0.901371131515615
1.0 0.029149650261610725
2.0 0.01448219525035078
3.0 0.00924941242097224
4.0 0.006528547368042855
5.0 0.004915282901262396
6.0 0.0038227195841958007
7.0 0.003202422960559232
8.0 0.0027194902152005056
9.0 0.002256081738439025
10.0 0.001913906326353021
11.0 0.001655920262049755
12.0 0.001497315358785623
13.0 0.0013007783674694787
14.0 0.0011483994993211357
15.0 0.0010006446573525651
16.0 0.0009187314949837794
17.0 0.0008060306765341464
18.0 0.0007121683663280601
19.0 0.0006479765506981805
20.0 0.0006209618807503557
21.0 0.0005522371927723867
22.0 0.0004900821167110386
23.0 0.0004397140637940455
24.0 0.00039311806560654995
25.0 0.0003568253540177216
26.0 0.00033181209459040074
27.0 0.0003194446403240109
28.0 0.0003184084588259308
29.0 0.0003182506069381648
30.0 0.0003148797013444408
31.0 0.0002961487376129427
32.0 0.00027052175379974156
33.0 0.00024743766685454786
34.0 0.0002256857592625916
35.0 0.00020579998427225097
36.0 0.000189038268813506
37.0 0.00017386965729266948
38.0 0.0001613485014690905
39.0 0.0001574132034911388
40.0 0.0001490677750078641
41.0 0.00013790177558791725
42.0 0.0001282878615396144
43.0 0.00012095612436994448
44.0 0.00011214167633915717
45.0 0.00010421673782294511
46.0 9.715626623684205e-05
47.0 9.282271063116496e-05
48.0 8.696571645233427e-05
49.0 8.108410275243205e-05
50.0 7.672762907247785e-05
51.0 7.164556991989368e-05
52.0 6.712091046340094e-05
53.0 6.402983760430654e-05
54.0 6.340827259447476e-05
55.0 6.212579456204865e-05
56.0 6.0479432395632356e-05
57.0 5.871255187231619e-05
58.0 5.6732218205513816e-05
59.0 5.469844909188562e-05
60.0 5.272638831110061e-05
61.0 5.082941624023762e-05
62.0 4.9172657560503e-05
63.0 4.7723292856953955e-05
64.0 4.640794539328976e-05
65.0 4.4830504104868853e-05
66.0 4.33432435988776e-05
67.0 4.17840819038174e-05
68.0 4.0359335324500254e-05
69.0 3.890539627505912e-05
70.0 3.773843593447448e-05
71.0 3.650676651396156e-05
72.0 3.528219096983737e-05
73.0 3.440527767945646e-05
74.0 3.350747980104347e-05
75.0 3.26561659597071e-05
76.0 3.19802966664897e-05
77.0 3.1835209823474306e-05
78.0 3.183429293715699e-05
79.0 3.183429293715699e-05
80.0 3.179465449554639e-05
81.0 3.1754468203569435e-05
82.0 3.1704945367497785e-05
83.0 3.1660515386167146e-05
84.0 3.161204511239972e-05
85.0 3.160031088406889e-05
86.0 3.160031088406889e-05
87.0 3.159054611415194e-05
88.0 3.1527283185355765e-05
89.0 3.1443493604304305e-05
90.0 3.1323353389521874e-05
91.0 3.117894171029721e-05
92.0 3.0954278315859144e-05
93.0 3.057844960395481e-05
94.0 3.014447137763062e-05
95.0 2.9597164606371073e-05
96.0 2.887863910263771e-05
97.0 2.8423195872524498e-05
98.0 2.7793813070448293e-05
99.0 2.7040901735687525e-05
100.0 2.619028564470109e-05
101.0 2.5450004510283205e-05
102.0 2.4855217140189223e-05
103.0 2.403822662596923e-05
104.0 2.3244772756237742e-05
... ...
1000.0 0.000000023425324
Binning the data across irregular percentage ranges is a common way to present such distributions. You can categorize and aggregate the data using pd.cut() with a subsequent groupby():
import pandas as pd
import matplotlib.pyplot as plt
#sample data generation
import numpy as np
rng = np.random.default_rng(123)
n = 1000
df = pd.DataFrame({"x": range(n), "wealth_share_2017": np.sort(rng.pareto(a=100, size=n))[::-1]})
df.loc[0, "wealth_share_2017"] = 50
df["wealth_share_2017"] /= df["wealth_share_2017"].sum()
n = len(df)
#define bins in percent
#the last value is slightly above 100% to ensure that the final bin is included
bins = [0, 0.1, 0.5, 1.0, 10.0, 50.0, 100.01]
#create figure labels for intervals from bins
labels = [f"[{start:.1f}, {stop:.1f})" for start, stop in zip(bins[:-1], bins[1:])]
#categorize data
df["cats"] = pd.cut(df["x"], bins=[n*i/100 for i in bins], include_lowest=True, right=False, labels=labels)
#and aggregate
df_plot = df.groupby(by="cats")["wealth_share_2017"].sum().mul(100)
df_plot.plot.bar(rot=45, xlabel="Income percentile", ylabel="Wealth share (%)", title=df_plot.name)
plt.tight_layout()
plt.show()
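Two quick sanity checks on the result (df_plot comes from the snippet above). Note that pd.cut() here bins by row position (the x column), so this assumes the rows are already sorted in descending wealth order, as in the question's data; and since the shares were normalized to sum to 1, the aggregated bands should add back up to roughly 100:
# each row of df_plot is one percentile band, in percent
print(df_plot)
print(df_plot.sum())  # ~100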

pandas dataframe interpolate for Nans with groupby using window of discrete days of the year

The small reproducible example below sets up a dataframe that is 100 yrs in length containing some randomly generated values. It then inserts 3 100-day stretches of missing values. Using this small example, I am attempting to sort out the pandas commands that will fill in the missing days using average values for that day of the year (hence the use of .groupby), with a condition. For example, if April 12th is missing, how can the last line of code be altered such that only the 10 nearest April 12ths are used to fill in the missing value? In other words, a missing April 12th value in 1920 would be filled in using the mean of the April 12th values between 1915 and 1925; a missing April 12th value in 2000 would be filled in with the mean of the April 12th values between 1995 and 2005, etc. I tried playing around with adding a .rolling() to the lambda function in the last line of the script, but was unsuccessful.
Bonus question: The example below extends from 1918 to 2018. If a value is missing on April 12th 1919, for example, it would still be nice if ten April 12ths were used to fill in the missing value even though the window couldn't be 'centered' on the missing day because of its proximity to the beginning of the time series. Is there a solution to the first question above that would be flexible enough to still use a minimum of 10 values when missing values are close to the beginning and ending of the time series?
import pandas as pd
import numpy as np
import random
# create 100 yr time series
dates = pd.date_range(start="1918-01-01", end="2018-12-31").strftime("%Y-%m-%d")
vals = [random.randrange(1, 50, 1) for i in range(len(dates))]
# Create some arbitrary gaps
vals[100:200] = vals[9962:10062] = vals[35895:35995] = [np.nan] * 100
# Create dataframe
df = pd.DataFrame(dict(
    list(
        zip(["Date", "vals"],
            [dates, vals])
    )
))
# confirm missing vals
df.iloc[95:105]
df.iloc[35890:35900]
# set a date index (for use by groupby)
df.index = pd.DatetimeIndex(df['Date'])
df['Date'] = df.index
# Need help restricting the mean to the 10 nearest same-days-of-the-year:
df['vals'] = df.groupby([df.index.month, df.index.day])['vals'].transform(lambda x: x.fillna(x.mean()))
This answers both parts:
- build a DF dfr that holds the calculation you want
  - the lambda function returns a dict {year: val, ...}
  - make sure the indexes are named in a reasonable way
  - expand out the dict with apply(pd.Series)
  - reshape by putting the year columns back into the index
- merge() the built DF with the original DF; the vals column contains the NaNs and column 0 contains the values to fill with
- finally fillna()
import pandas as pd
import numpy as np
import random

# create 100 yr time series
dates = pd.date_range(start="1918-01-01", end="2018-12-31")
vals = [random.randrange(1, 50, 1) for i in range(len(dates))]
# Create some arbitrary gaps
vals[100:200] = vals[9962:10062] = vals[35895:35995] = [np.nan] * 100
# Create dataframe - simplified from question...
df = pd.DataFrame({"Date": dates, "vals": vals})
df[df.isna().any(axis=1)]

ystart = df.Date.dt.year.min()
# generate rolling means for month/day. bfill for when it's the start of the series
dfr = (df.groupby([df.Date.dt.month, df.Date.dt.day])["vals"]
       .agg(lambda s: {y+ystart: v for y, v in enumerate(s.dropna().rolling(5).mean().bfill())})
       .to_frame().rename_axis(["month", "day"])
)
# expand dict into columns and reshape to be indexed by month,day,year
dfr = dfr.join(dfr.vals.apply(pd.Series)).drop(columns="vals").rename_axis("year", axis=1).stack().to_frame()
# get df index back, plus vals & the fill values (column 0) can be seen alongside each other
dfm = df.merge(dfr, left_on=[df.Date.dt.month, df.Date.dt.day, df.Date.dt.year], right_index=True)
# finally what we really want to do - fill the NaNs
# (fill the vals Series so alignment is on the row index; DataFrame.fillna
# with a Series would try to align on column labels instead)
df["vals"] = df["vals"].fillna(dfm[0])
analysis
Taking the NaN for 11-Apr-1918: the default fill is 22, since it's backfilled from the first full five-year rolling window (the 1919-1923 values):
(12+2+47+47+2)/5 == 22
dfm.query("key_0==4 & key_1==11").head(7)
      key_0  key_1  key_2                Date  vals     0
100       4     11   1918 1918-04-11 00:00:00   nan    22
465       4     11   1919 1919-04-11 00:00:00    12    22
831       4     11   1920 1920-04-11 00:00:00     2    22
1196      4     11   1921 1921-04-11 00:00:00    47    27
1561      4     11   1922 1922-04-11 00:00:00    47    36
1926      4     11   1923 1923-04-11 00:00:00     2  34.6
2292      4     11   1924 1924-04-11 00:00:00    37  29.4
I'm not sure how fully I've captured the intent of your question. The approach I've taken satisfies two requirements:
- build an arbitrary number of yearly averages
- use those averages to fill in the NaNs
Simply put, instead of filling in the NaNs from the dates immediately before and after, I fill them in with averages extracted from a sample of years (note that .sample below draws 10 random year columns, not the 10 nearest).
import pandas as pd
import numpy as np
import random
# create 100 yr time series
dates = pd.date_range(start="1918-01-01", end="2018-12-31").strftime("%Y-%m-%d")
vals = [random.randrange(1, 50, 1) for i in range(len(dates))]
# Create some arbitrary gaps
vals[100:200] = vals[9962:10062] = vals[35895:35995] = [np.nan] * 100
# Create dataframe
df = pd.DataFrame(dict(
    list(
        zip(["Date", "vals"],
            [dates, vals])
    )
))
df['Date'] = pd.to_datetime(df['Date'])
df['mm-dd'] = df['Date'].apply(lambda x:'{:02}-{:02}'.format(x.month, x.day))
df['yyyy'] = df['Date'].apply(lambda x:'{:04}'.format(x.year))
df = df.iloc[:,1:].pivot(index='mm-dd', columns='yyyy')
df.columns = df.columns.droplevel(0)
df['nans'] = df.isnull().sum(axis=1)
df['10n_mean'] = df.iloc[:,:-1].sample(n=10, axis=1).mean(axis=1)
df['10n_mean'] = df['10n_mean'].round(1)
df.loc[df['nans'] >= 1]
yyyy 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 ... 2011 2012 2013 2014 2015 2016 2017 2018 nans 10n_mean
mm-dd
02-29 NaN NaN 34.0 NaN NaN NaN 2.0 NaN NaN NaN ... NaN 49.0 NaN NaN NaN 32.0 NaN NaN 76 21.6
04-11 NaN 43.0 12.0 28.0 29.0 28.0 1.0 38.0 11.0 3.0 ... 17.0 35.0 8.0 17.0 34.0 NaN 5.0 33.0 3 29.7
04-12 NaN 19.0 38.0 34.0 48.0 46.0 28.0 29.0 29.0 14.0 ... 41.0 16.0 9.0 39.0 8.0 NaN 1.0 12.0 3 21.3
04-13 NaN 33.0 26.0 47.0 21.0 26.0 20.0 16.0 11.0 7.0 ... 5.0 11.0 34.0 28.0 27.0 NaN 2.0 46.0 3 21.3
04-14 NaN 36.0 19.0 6.0 45.0 41.0 24.0 39.0 1.0 11.0 ... 30.0 47.0 45.0 14.0 48.0 NaN 16.0 8.0 3 24.7
df_mean = df.T.fillna(df['10n_mean'], downcast='infer').T
df_mean.loc[df_mean['nans'] >= 1]
yyyy 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 ... 2011 2012 2013 2014 2015 2016 2017 2018 nans 10n_mean
mm-dd
02-29 21.6 21.6 34.0 21.6 21.6 21.6 2.0 21.6 21.6 21.6 ... 21.6 49.0 21.6 21.6 21.6 32.0 21.6 21.6 76.0 21.6
04-11 29.7 43.0 12.0 28.0 29.0 28.0 1.0 38.0 11.0 3.0 ... 17.0 35.0 8.0 17.0 34.0 29.7 5.0 33.0 3.0 29.7
04-12 21.3 19.0 38.0 34.0 48.0 46.0 28.0 29.0 29.0 14.0 ... 41.0 16.0 9.0 39.0 8.0 21.3 1.0 12.0 3.0 21.3
04-13 21.3 33.0 26.0 47.0 21.0 26.0 20.0 16.0 11.0 7.0 ... 5.0 11.0 34.0 28.0 27.0 21.3 2.0 46.0 3.0 21.3
04-14 24.7 36.0 19.0 6.0 45.0 41.0 24.0 39.0 1.0 11.0 ... 30.0 47.0 45.0 14.0 48.0 24.7 16.0 8.0 3.0 24.7
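A note on the df.T.fillna(...).T step above: DataFrame.fillna() with a Series aligns the Series index against the frame's columns, so the frame is transposed to turn the mm-dd labels into columns, filled, and transposed back. A sketch of the same fill without the double transpose, assuming the pivoted df from above (Series.fillna() aligns on the row index, so each year column can be filled directly):
# fill each year column from the row-wise 10n_mean Series
df_mean = df.apply(lambda col: col.fillna(df['10n_mean']))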

Python: Array-based equation

I have a dataframe 500 rows long by 4 columns. I need to find the proper Python code that would divide the current row by the row below, and then multiply that value by the value in the last row, for every value in each column. Basically, I need to replicate this Excel formula.
It's not clear whether your data is stored in a NumPy array. If it is, with the original data contained in a, you'd write:
b = a[-1]*(a[:-1]/a[1:])
where a[-1] is the last row, a[:-1] is the array without the last row, and a[1:] is the array without the first (index zero, that is) row.
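A minimal runnable sketch of the same idea, using a small made-up array in place of the 500x4 data:
import numpy as np

# hypothetical 5x4 data; the values are arbitrary
a = np.arange(1.0, 21.0).reshape(5, 4)
# divide each row by the row below it, then scale by the last row
b = a[-1] * (a[:-1] / a[1:])
print(b)  # shape (4, 4): one row fewer than the input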
Assuming you are talking about a pandas DataFrame:
import pandas as pd
import random
# sample DataFrame object
df = pd.DataFrame((float(random.randint(1, 100)),
                   float(random.randint(1, 100)),
                   float(random.randint(1, 100)),
                   float(random.randint(1, 100)))
                  for _ in range(10))

def function(col):
    for i in range(len(col)-1):
        col[i] = (col[i]/col[i+1])*col[len(col)-1]

print(df)  # before formula apply
df.apply(function)
print(df)  # after formula apply
>>>
0 1 2 3
0 10.0 78.0 27.0 23.0
1 72.0 42.0 77.0 86.0
2 82.0 12.0 58.0 98.0
3 27.0 92.0 19.0 86.0
4 48.0 83.0 14.0 43.0
5 55.0 18.0 58.0 77.0
6 20.0 58.0 20.0 22.0
7 76.0 19.0 63.0 82.0
8 23.0 99.0 58.0 15.0
9 60.0 57.0 89.0 100.0
0 1 2 3
0 8.333333 105.857143 31.207792 26.744186
1 52.682927 199.500000 118.155172 87.755102
2 182.222222 7.434783 271.684211 113.953488
3 33.750000 63.180723 120.785714 200.000000
4 52.363636 262.833333 21.482759 55.844156
5 165.000000 17.689655 258.100000 350.000000
6 15.789474 174.000000 28.253968 26.829268
7 198.260870 10.939394 96.672414 546.666667
8 23.000000 99.000000 58.000000 15.000000
9 60.000000 57.000000 89.000000 100.000000
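The same transformation can also be written without an explicit loop. A vectorized sketch, reusing the df built above, where shift(-1) lines each row up with the row below it:
# row_i / row_{i+1}, scaled column-wise by the last row
result = (df / df.shift(-1)) * df.iloc[-1]
# the last row comes out NaN (there is no row below it); restore the originals
result = result.fillna(df.iloc[-1])
print(result)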

Interpolation of a dataframe with immediate data appearing before and after it - Pandas

Let's say I've a dataframe like this -
ID Weight Height
1 80.0 180.0
2 60.0 170.0
3 NaN NaN
4 NaN NaN
5 82.0 185.0
I want the dataframe to be transformed to -
ID Weight Height
1 80.0 180.0
2 60.0 170.0
3 71.0 177.5
4 76.5 181.25
5 82.0 185.0
It takes the average of the immediate data available before and after a NaN and updates the missing/NaN value accordingly.
You can use interpolation from the pandas library:
df['Weight'], df['Height'] = df.Weight.interpolate(), df.Height.interpolate()
Check the arguments on the documentation for the method of interpolation to tune this to your problem case: https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.interpolate.html
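A self-contained sketch reproducing the frame from the question. One caveat: the default linear interpolation spaces the filled values evenly between the neighbours (about 67.33 and 74.67 for Weight), which is close to, but not identical to, the iterated before-and-after averaging shown in the expected output (71.0 and 76.5):
import pandas as pd
import numpy as np

df = pd.DataFrame({"ID": [1, 2, 3, 4, 5],
                   "Weight": [80.0, 60.0, np.nan, np.nan, 82.0],
                   "Height": [180.0, 170.0, np.nan, np.nan, 185.0]})
# interpolate both columns at once; the default method is linear
df[["Weight", "Height"]] = df[["Weight", "Height"]].interpolate()
print(df)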
