I created a script to load data, check NA values, and fill all NA values. Here is my code:
import pandas as pd
def filter_df(merged_df, var_list):
    """Return the rows of merged_df whose 'Name' value appears in var_list."""
    keep_rows = merged_df['Name'].isin(var_list)
    return merged_df.loc[keep_rows]
def pivot_df(df):
    """Reshape df to one row per 'Date' and one column per (price field, 'Name') pair."""
    price_fields = ['Open', 'High', 'Low', 'Close']
    wide = df.pivot(index='Date', columns='Name', values=price_fields)
    return wide
def _missing_summary(df):
    """Build the per-column table of missing-value totals and percentages."""
    missing = df.isna().sum().sort_values(ascending=False)
    percent_missing = ((missing / df.isnull().count()) * 100).sort_values(ascending=False)
    return pd.concat([missing, percent_missing], axis=1, keys=['Total', 'Percent'], sort=False)


def _outlier_summary(df):
    """Count, per numeric column, the values more than 2 standard deviations from the mean."""
    # Only numeric columns have a mean/std; a date or text column would raise here.
    numeric = df.select_dtypes(include='number')
    counts = []
    for _, series in numeric.items():
        center = series.mean()
        spread = 2 * series.std()
        counts.append(int(((series > center + spread) | (series < center - spread)).sum()))
    return pd.DataFrame({'Columns': numeric.columns, 'Count': counts}).sort_values(by='Count')


def validation_df(input, summary = False):
    """Return a copy of ``input`` with its missing values filled.

    Every missing value is replaced by the closest earlier value in the same
    column; values that are missing at the very top of a column become 0.
    The frame passed in is not modified.

    Parameters
    ----------
    input : pandas.DataFrame
        Frame to check and fill.
    summary : bool, default False
        When true, print the missing-value table (measured before filling)
        and the outlier counts (measured after filling).

    Returns
    -------
    pandas.DataFrame
        The filled copy.
    """
    df = input.copy()

    # na check: taken before filling so the report shows the original gaps
    missing_df = _missing_summary(df) if summary else None

    # fill na: forward fill works on whole columns at once, so it does not
    # depend on the row labels being 0..n-1 or on how the columns are named
    df = df.ffill().fillna(0)

    if summary:
        print('missing value check:\n')
        print(missing_df)
        print('\n outliers check:\n')
        print(_outlier_summary(df))
    return df
def join_df(price_df, transaction_df, var_list):
    """Left-join the pivoted prices of the chosen tokens onto the transactions.

    price_df is narrowed to the names in var_list with filter_df, reshaped
    with pivot_df, and merged onto transaction_df on 'Date' so that every
    transaction row is kept.
    """
    pivoted_prices = pivot_df(filter_df(price_df, var_list))
    return transaction_df.merge(pivoted_prices, on='Date', how='left')
# Locations of the raw CSV files: token prices and transactions.
token_path = 'https://raw.githubusercontent.com/Carloszone/Cryptocurrency_Research_project/main/datasets/1_token_df.csv'
transaction_path = 'https://raw.githubusercontent.com/Carloszone/Cryptocurrency_Research_project/main/datasets/transaction_df.csv'
# Token names to keep from the price data.
var_list = ['Bitcoin', 'Ethereum', 'Golem', 'Solana']
token_df = pd.read_csv(token_path)
transaction_df = pd.read_csv(transaction_path)
# Join the selected token prices onto the transactions, then run the NA check/fill.
df = join_df(token_df, transaction_df, var_list)
df = validation_df(df)
But it did not work. I checked my code and found that the issue comes from `loc`. For example:
df = join_df(token_df, transaction_df, var_list)
# Selecting the whole column by its label prints the Series shown below ...
print(df[df.columns[15]])
# ... but reading a single cell of that same column through .loc raises the AssertionError.
print(df.loc[1,df.columns[15]])
what I got is:
0 NaN
1 NaN
2 NaN
3 NaN
4 NaN
..
2250 NaN
2251 NaN
2252 NaN
2253 NaN
2254 NaN
Name: (High, Solana), Length: 2255, dtype: float64
AssertionError Traceback (most recent call last)
<ipython-input-19-75f01cc22c9c> in <module>()
2
3 print(df[df.columns[15]])
----> 4 print(df.loc[1,df.columns[15]])
2 frames
/usr/local/lib/python3.7/dist-packages/pandas/core/indexing.py in __getitem__(self, key)
923 with suppress(KeyError, IndexError):
924 return self.obj._get_value(*key, takeable=self._takeable)
--> 925 return self._getitem_tuple(key)
926 else:
927 # we by definition only have the 0th axis
/usr/local/lib/python3.7/dist-packages/pandas/core/indexing.py in _getitem_tuple(self, tup)
1107 return self._multi_take(tup)
1108
-> 1109 return self._getitem_tuple_same_dim(tup)
1110
1111 def _get_label(self, label, axis: int):
/usr/local/lib/python3.7/dist-packages/pandas/core/indexing.py in _getitem_tuple_same_dim(self, tup)
807 # We should never have retval.ndim < self.ndim, as that should
808 # be handled by the _getitem_lowerdim call above.
--> 809 assert retval.ndim == self.ndim
810
811 return retval
AssertionError:
I don't understand why df[column_name] works but df.loc[index, column_name] raises an error.
You can check my code on Colab: https://colab.research.google.com/drive/1Yg280JRwFayW1tdp4OJqTO5-X3dGsItB?usp=sharing
The issue is that you're merging two DataFrames on a column they don't share in common (because you pivoted price_df, Date column became the index). Also the Date columns don't have a uniform format, so you have to make them the same. Replace your join_df function with the one below and it will work as expected.
I added comments on the lines that had to be added.
def join_df(price_df, transaction_df, var_list):
    """Left-join the prices of the chosen tokens onto the transactions by date.

    Parameters
    ----------
    price_df : pandas.DataFrame
        Long-format prices; filtered with filter_df and reshaped with pivot_df.
    transaction_df : pandas.DataFrame
        Transactions with a 'Date' column. It is not modified.
    var_list : list
        Token names to keep from price_df.

    Returns
    -------
    pandas.DataFrame
        One row per transaction row, with a datetime 'Date' column and one
        'Field.Token' column per price field and token.
    """
    price_df = filter_df(price_df, var_list)
    price_df = pivot_df(price_df)
    # After pivot the Date column is the index, and price_df has MultiIndex columns
    # since we want to merge it with transaction_df, we need to first flatten the columns
    price_df.columns = price_df.columns.map('.'.join)
    # and reset_index so that we have the index as the Date column
    price_df = price_df.reset_index()
    # the Dates are formatted differently across the two DataFrames;
    # one has the following format: '2016-01-01' and the other '2016/1/1'
    # to have a uniform format, we convert both Date columns to datetime objects
    price_df['Date'] = pd.to_datetime(price_df['Date'])
    # assign() builds a new frame, so the caller's transaction_df keeps its original Date column
    transaction_df = transaction_df.assign(Date=pd.to_datetime(transaction_df['Date']))
    joined_df = transaction_df.merge(price_df, how='left', on='Date')
    return joined_df
Output:
Date total_transaction_count Volume gas_consumption \
0 2016-01-01 2665 NaN NaN
1 2016-01-02 4217 NaN NaN
2 2016-01-03 4396 NaN NaN
3 2016-01-04 4776 NaN NaN
4 2016-01-05 26649 NaN NaN
... ... ... ... ...
2250 2022-02-28 1980533 1.968686e+06 8.626201e+11
2251 2022-03-01 2013145 2.194055e+06 1.112079e+12
2252 2022-03-02 1987934 2.473327e+06 1.167615e+12
2253 2022-03-03 1973190 3.093248e+06 1.260826e+12
2254 2022-03-04 1861286 4.446204e+06 1.045814e+12
old_ave_gas_fee new_avg_gas_fee new_avg_base_fee \
0 0.000000e+00 0.000000e+00 0.000000e+00
1 0.000000e+00 0.000000e+00 0.000000e+00
2 0.000000e+00 0.000000e+00 0.000000e+00
3 0.000000e+00 0.000000e+00 0.000000e+00
4 0.000000e+00 0.000000e+00 0.000000e+00
... ... ... ...
2250 6.356288e-08 6.356288e-08 5.941877e-08
2251 5.368574e-08 5.368574e-08 4.982823e-08
2252 5.567472e-08 5.567472e-08 4.782055e-08
2253 4.763823e-08 4.763823e-08 4.140883e-08
2254 4.566440e-08 4.566440e-08 3.547666e-08
new_avg_priority_fee Open.Bitcoin Open.Ethereum ... High.Golem \
0 0.000000e+00 430.0 NaN ... NaN
1 0.000000e+00 434.0 NaN ... NaN
2 0.000000e+00 433.7 NaN ... NaN
3 0.000000e+00 430.7 NaN ... NaN
4 0.000000e+00 433.3 NaN ... NaN
... ... ... ... ... ...
2250 4.144109e-09 37707.2 2616.34 ... 0.48904
2251 3.857517e-09 43187.2 2922.44 ... 0.48222
2252 7.854179e-09 44420.3 2975.80 ... 0.47550
2253 6.229401e-09 NaN NaN ... NaN
2254 1.018774e-08 NaN NaN ... NaN
High.Solana Low.Bitcoin Low.Ethereum Low.Golem Low.Solana \
0 NaN 425.9 NaN NaN NaN
1 NaN 430.7 NaN NaN NaN
2 NaN 423.1 NaN NaN NaN
3 NaN 428.6 NaN NaN NaN
4 NaN 428.9 NaN NaN NaN
... ... ... ... ... ...
2250 NaN 37458.9 2574.12 0.41179 NaN
2251 NaN 42876.6 2858.54 0.45093 NaN
2252 NaN 43361.3 2914.70 0.43135 NaN
2253 NaN NaN NaN NaN NaN
2254 NaN NaN NaN NaN NaN
Close.Bitcoin Close.Ethereum Close.Golem Close.Solana
0 434.0 NaN NaN NaN
1 433.7 NaN NaN NaN
2 430.7 NaN NaN NaN
3 433.3 NaN NaN NaN
4 431.2 NaN NaN NaN
... ... ... ... ...
2250 43188.2 2922.50 0.47748 NaN
2251 44420.3 2975.81 0.47447 NaN
2252 43853.2 2952.47 0.43964 NaN
2253 NaN NaN NaN NaN
2254 NaN NaN NaN NaN
[2255 rows x 24 columns]
Related
I have the following dataframe:
ID Datetime Y
0 1000 00:29:59 0.117
1 1000 00:59:59 0.050
2 1000 01:29:59 0.025
3 1000 01:59:59 0.025
4 1000 02:29:59 0.049
... ... ...
48973133 2999 21:59:59 0.618
48973134 2999 22:29:59 0.495
48973135 2999 22:59:59 0.745
48973136 2999 23:29:59 0.514
48973137 2999 23:59:59 0.419
The Datetime column is not actually in that format, here it is:
0 00:29:59
1 00:59:59
2 01:29:59
3 01:59:59
4 02:29:59
...
48973133 21:59:59
48973134 22:29:59
48973135 22:59:59
48973136 23:29:59
48973137 23:59:59
Name: Datetime, Length: 48973138, dtype: object
I am trying to run the following pivot code:
print(df.assign(group=df.index//48).pivot(index='group', values='Y', columns=df['Datetime'][0:48]))
But I am getting the following error:
KeyError: '00:29:59'
How can I fix it? I expect to get 48 columns (1 day of half-hourly measured data) in the pivoted dataframe, so my columns should be:
00:29:59 00:59:59 01:29:59 ... 23:29:59 23:59:59
The first row should have the first 48 values of Y, the second row should have the next 48, and so on.
EDIT: Picture of the cumcount()issue:
Based on your comment, it seems you have the same ID for multiple days. I would therefore suggest to keep track of the day with cumcount before pivoting:
# Number the repeats of each (ID, Datetime) pair 0, 1, 2, ... and store that as 'Day'.
df['Day'] = df.groupby(['ID', 'Datetime']).cumcount()
# One row per (ID, Day), one column per Datetime value, cells taken from Y.
df.pivot(index=['ID', 'Day'], values='Y', columns='Datetime')
Edit: based on your comment under my answer, it seems that not all days have all timestamps. A solution could be to generate the right number of timestamps (repeating [00:29:59 00:59:59 01:29:59 ... 23:29:59 23:59:59]) and add missing values to df. This would be quite CPU intensive though:
import math
from itertools import cycle

# gapless list of Datetime: the 48 half-hour labels of one day, zero-padded to HH:MM:SS
dt = (
    stamp
    for hour in range(24)
    for stamp in (f"{hour}:29:59".zfill(8), f"{hour}:59:59".zfill(8))
)

# Walk the rows against the endlessly repeating label sequence; len(df) is
# re-read on every pass because rows are inserted while looping.
for i, t in enumerate(cycle(dt)):
    if i == len(df):
        break
    if df.loc[i, 'Datetime'] == t:
        continue
    # filling missing IDs: the first label of a day takes the ID of the row
    # at this position, any other label takes the ID of the row before it
    id_ = df.loc[i, 'ID'] if t == "00:29:59" else df.loc[i-1, 'ID']
    filler = pd.DataFrame({'ID': id_, 'Datetime': [t]})
    df = pd.concat([df.loc[0:i-1], filler, df.loc[i:]], ignore_index=True)
Then apply groupby and pivot like shown above.
Edit2: using cycle instead of chain + tee
You can use DataFrame.pivot_table to avoid `ValueError: Index contains duplicate entries, cannot reshape`: when there are several values for the same ID and Datetime, they are aggregated - here with the default function, mean:
# Half-hour steps up to 24:00:00; [1:] drops the leading 00:00:00 step.
td = pd.timedelta_range('00:00:00','24:00:00', freq='30Min')[1:]
# Move each step back by one second and keep the last 8 characters of its text form (HH:MM:SS).
td = [f'{x - pd.Timedelta("1 sec")}'[-8:] for x in td]
print (td)
['00:29:59', '00:59:59', '01:29:59', '01:59:59', '02:29:59', '02:59:59', '03:29:59', '03:59:59', '04:29:59', '04:59:59', '05:29:59', '05:59:59', '06:29:59', '06:59:59', '07:29:59', '07:59:59', '08:29:59', '08:59:59', '09:29:59', '09:59:59', '10:29:59', '10:59:59', '11:29:59', '11:59:59', '12:29:59', '12:59:59', '13:29:59', '13:59:59', '14:29:59', '14:59:59', '15:29:59', '15:59:59', '16:29:59', '16:59:59', '17:29:59', '17:59:59', '18:29:59', '18:59:59', '19:29:59', '19:59:59', '20:29:59', '20:59:59', '21:29:59', '21:59:59', '22:29:59', '22:59:59', '23:29:59', '23:59:59']
# One row per ID, one column per Datetime; repeated (ID, Datetime) pairs are averaged.
df1 = df.pivot_table(index='ID', columns='Datetime', values='Y', aggfunc='mean')
print (df1)
Datetime 00:29:59 00:59:59 01:29:59 01:59:59 02:29:59 21:59:59 \
ID
1000 0.117 0.05 0.025 0.025 0.049 NaN
2999 NaN NaN NaN NaN NaN 0.618
Datetime 22:29:59 22:59:59 23:59:59
ID
1000 NaN NaN NaN
2999 0.495 0.745 0.4665
If you need all the times as columns, add DataFrame.reindex:
# Same pivot, then reindex the columns on td so every label in td is present;
# labels that do not occur in the data become all-NaN columns.
df1 = (df.pivot_table(index='ID', columns='Datetime', values='Y', aggfunc='mean')
.reindex(td, axis=1))
print (df1)
Datetime 00:29:59 00:59:59 01:29:59 01:59:59 02:29:59 02:59:59 \
ID
1000 0.117 0.05 0.025 0.025 0.049 NaN
2999 NaN NaN NaN NaN NaN NaN
Datetime 03:29:59 03:59:59 04:29:59 04:59:59 05:29:59 05:59:59 \
ID
1000 NaN NaN NaN NaN NaN NaN
2999 NaN NaN NaN NaN NaN NaN
Datetime 06:29:59 06:59:59 07:29:59 07:59:59 08:29:59 08:59:59 \
ID
1000 NaN NaN NaN NaN NaN NaN
2999 NaN NaN NaN NaN NaN NaN
Datetime 09:29:59 09:59:59 10:29:59 10:59:59 11:29:59 11:59:59 \
ID
1000 NaN NaN NaN NaN NaN NaN
2999 NaN NaN NaN NaN NaN NaN
Datetime 12:29:59 12:59:59 13:29:59 13:59:59 14:29:59 14:59:59 \
ID
1000 NaN NaN NaN NaN NaN NaN
2999 NaN NaN NaN NaN NaN NaN
Datetime 15:29:59 15:59:59 16:29:59 16:59:59 17:29:59 17:59:59 \
ID
1000 NaN NaN NaN NaN NaN NaN
2999 NaN NaN NaN NaN NaN NaN
Datetime 18:29:59 18:59:59 19:29:59 19:59:59 20:29:59 20:59:59 \
ID
1000 NaN NaN NaN NaN NaN NaN
2999 NaN NaN NaN NaN NaN NaN
Datetime 21:29:59 21:59:59 22:29:59 22:59:59 23:29:59 23:59:59
ID
1000 NaN NaN NaN NaN NaN NaN
2999 NaN 0.618 0.495 0.745 NaN 0.4665
I am scraping the stock prices, and names from Yahoo's finance website. After making a dataframe with three columns "Name", "Code", and "Price" and representing the passed index variable. I want to go to another loop and add a column to the original dataframe with updated prices. But when I add the column it creates NaN values for my original data. What do I need to do to correctly place the indexes and not disturb the original dataframe data?
class Stocks():
    """Scrape quote names and prices from Yahoo Finance into one table.

    The table built by ``Yahoo_Finance(1)`` is kept on the class, so a later
    call with another ``index`` adds its ``Price{index}`` column to that table
    instead of to a fresh, empty frame (which is what filled the original
    Name/Code/Price columns with NaN).
    """

    URL_LIST = (
        'https://finance.yahoo.com/quote/GOOG', 'https://finance.yahoo.com/quote/DOGE-USD',
        'https://finance.yahoo.com/quote/AAPL', 'https://finance.yahoo.com/quote/HMC',
        'https://finance.yahoo.com/quote/TM', 'https://finance.yahoo.com/quote/DKS',
        'https://finance.yahoo.com/quote/SHIB-USD', 'https://finance.yahoo.com/quote/BTC-USD',
        'https://finance.yahoo.com/quote/WMT', 'https://finance.yahoo.com/quote/AMZN',
        'https://finance.yahoo.com/quote/NKE', 'https://finance.yahoo.com/quote/KO',
        'https://finance.yahoo.com/quote/PEP', 'https://finance.yahoo.com/quote/DAL',
        'https://finance.yahoo.com/quote/SAVE', 'https://finance.yahoo.com/quote/BLL',
        'https://finance.yahoo.com/quote/KMB', 'https://finance.yahoo.com/quote/GIS',
    )

    # Table shared between calls; None until a call has created it.
    _structure = None

    def __init__(self):
        # __init__ has to return None; returning self makes Stocks() raise TypeError.
        pass

    @staticmethod
    def Stock_ABV(str):
        """Return the ticker code that ends a quote URL, e.g. 'GOOG' or 'DOGE-USD'.

        Raises ValueError when the text contains no '/' to anchor the code on.
        """
        pattern = re.compile(r'/([A-Z]*-[A-Z]*|[A-Z]*)')
        code = None
        # Every '/' in the URL matches; the code is the capture of the last one.
        for match in pattern.finditer(str):
            code = match.group(1)
        if code is None:
            raise ValueError(f'no ticker code found in {str!r}')
        return code

    @staticmethod
    def _fetch_soup(url):
        """Download one quote page and parse it."""
        req = requests.get(url)
        return BeautifulSoup(req.text, 'html.parser')

    @staticmethod
    def _read_price(soup):
        """Read the price text from a parsed quote page."""
        return soup.find('fin-streamer', {'class': 'Fw(b) Fz(36px) Mb(-4px) D(ib)'}).text

    @classmethod
    def Yahoo_Finance(cls, index):
        """Scrape all quotes and return the updated table.

        ``index == 1`` rebuilds the table with the columns Name, Code and
        Price. Any other ``index`` scrapes the prices again and stores them in
        a ``Price{index}`` column of the table kept from earlier calls.

        Returns a copy of the table, so callers cannot alter the stored one.
        """
        if index == 1:
            rows = []
            for url in cls.URL_LIST:
                soup = cls._fetch_soup(url)
                name = soup.find('h1', {'class': 'D(ib) Fz(18px)'}).text
                rows.append([name, cls.Stock_ABV(url), cls._read_price(soup)])
            cls._structure = pd.DataFrame(rows, columns = ['Name', 'Code', 'Price'])
        else:
            prices = [cls._read_price(cls._fetch_soup(url)) for url in cls.URL_LIST]
            if cls._structure is None:
                # No base table yet: start from the empty three-column frame,
                # as a stand-alone call did before.
                cls._structure = pd.DataFrame({'Name': [], 'Code': [], 'Price': []})
            cls._structure[f'Price{index}'] = prices
        pd.set_option('display.max_rows', None)
        return cls._structure.copy()

    @classmethod
    def AFK_Runner(cls):
        """Build the base table, then return it with a second price column added."""
        cls.Yahoo_Finance(1)
        return cls.Yahoo_Finance(2)
# Run the scraper: one call with index 1, then one with index 2.
Stocks.AFK_Runner()
Do you know the yfinance package?
# pip install yfinance
import yfinance as yf
# A single call takes all the tickers as one space-separated string.
data = yf.download('GOOG DOGE-USD AAPL HMC')
Output:
Adj Close Close ... Open Volume
AAPL DOGE-USD GOOG HMC AAPL DOGE-USD GOOG HMC ... AAPL DOGE-USD GOOG HMC AAPL DOGE-USD GOOG HMC
Date ...
1980-03-17 NaN NaN NaN 0.718973 NaN NaN NaN 0.893750 ... NaN NaN NaN 0.893750 NaN NaN NaN 26000.0
1980-03-18 NaN NaN NaN 0.731542 NaN NaN NaN 0.909375 ... NaN NaN NaN 0.909375 NaN NaN NaN 2000.0
1980-03-19 NaN NaN NaN 0.724001 NaN NaN NaN 0.900000 ... NaN NaN NaN 0.900000 NaN NaN NaN 2000.0
1980-03-20 NaN NaN NaN 0.724001 NaN NaN NaN 0.900000 ... NaN NaN NaN 0.900000 NaN NaN NaN 0.0
1980-03-21 NaN NaN NaN 0.724001 NaN NaN NaN 0.900000 ... NaN NaN NaN 0.900000 NaN NaN NaN 2000.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2022-02-09 176.279999 0.159123 2829.060059 30.500000 176.279999 0.159123 2829.060059 30.500000 ... 176.050003 0.158357 2816.995117 30.120001 71285000.0 7.786708e+08 1431400.0 1554600.0
2022-02-10 172.119995 0.151889 2772.050049 30.760000 172.119995 0.151889 2772.050049 30.760000 ... 174.139999 0.159145 2790.000000 31.000000 90865900.0 1.053631e+09 1650900.0 1398400.0
2022-02-11 168.639999 0.144847 2682.600098 30.459999 168.639999 0.144847 2682.600098 30.459999 ... 172.330002 0.151895 2775.000000 30.760000 98566000.0 7.767306e+08 1937700.0 1004200.0
2022-02-12 NaN 0.144405 NaN NaN NaN 0.144405 NaN NaN ... NaN 0.144856 NaN NaN NaN 6.026994e+08 NaN NaN
2022-02-13 NaN 0.153793 NaN NaN NaN 0.153793 NaN NaN ... NaN 0.144308 NaN NaN NaN 1.346092e+09 NaN NaN
[11055 rows x 24 columns]
Here is my definition:
def fill(df_name):
"""
Function to fill rows and dates.
"""
# Fill Down
for row in df_name[0]:
if 'Unnamed' in row:
df_name[0] = df_name[0].replace(row, np.nan)
df_name[0] = df_name[0].ffill(limit=2)
df_name[1] = df_name[1].ffill(limit=2)
# Fill in Dates
for col in df_name.columns:
if col >= 3:
old_dt = datetime(1998, 11, 15)
add_dt = old_dt + relativedelta(months=col - 3)
new_dt = add_dt.strftime('%#m/%d/%Y')
df_name = df_name.rename(columns={col: new_dt})
and then I call:
fill(df_cars)
The first half of the formula works (columns 0 and 1 have filled in correctly). However, as you can see, the columns are labeled 0-288. When I delete this function and simply run the code (changing df_name to df_cars) it runs correctly and the column names are the dates specified in the second half of the function.
What could be causing this to not execute the # Fill in Dates portion when defined in a function? Does it have to do with local variables?
0 1 2 3 4 5 ... 287 288 289 290 291 292
0 France NaN Market 3330 7478 2273 ... NaN NaN NaN NaN NaN NaT
1 France NaN World 362 798 306 ... NaN NaN NaN NaN NaN NaT
2 France NaN % 0.108709 0.106713 0.134624 ... NaN NaN NaN NaN NaN NaT
3 Germany NaN Market 1452 2025 1314 ... NaN NaN NaN NaN NaN NaT
4 Germany NaN World 209 246 182 ... NaN NaN NaN NaN NaN NaT
.. ... ... ... ... ... ... ... ... ... ... ... ... ..
349 Slovakia 0 World 1 1 0 ... NaN NaN NaN NaN NaN NaT
350 Slovakia 0 % 0.5 0.5 0 ... NaN NaN NaN NaN NaN NaT
I tried to concatenate two Pandas DataFrames, but it concatenates wrong.
Initial dataset looks like:
df
>>>
well qoil cum_oil wct top_perf bot_perf st x y
5233 101 259 3.684131e+05 97 -2352.13 -2359.12 0 517228 5931024
12786 102 3495 1.369303e+06 5.47 -2352.92 -2566.81 0 517192 5927187
13062 103 2691 1.353718e+06 0.5 -2377.93 -2581.73 0 517731 5926430
. . . .
65 rows × 9 columns
Then I generate a Euclidean distance between every well from x and y coordinates (last two columns):
from sklearn.neighbors import DistanceMetric
dist = DistanceMetric.get_metric('euclidean')
# Pairwise distances between the (x, y) rows, labelled with the well values on both axes.
# NOTE(review): the row labels of this frame are well values, not the row labels of df.
loc = pd.DataFrame(dist.pairwise(df[['x','y']].to_numpy()),
columns=df.well.unique(), index=df.well.unique())
and receive a 65x65 matrix (of type pandas.core.frame.DataFrame) that contains the distance between every pair of wells
loc
>>>
101 102 103 . . .
101 0.000000 152.278917 270.835312 . . .
102 151.278917 0.000000 326.310146 . . .
103 270.835312 346.310146 0.000000 . . .
. . .
Then I drop extra columns and concatenate two dataframes:
# Remove the 'well', 'wct', 'x' and 'y' columns.
df_train_prep = df.drop(['well', 'wct', 'x', 'y'], axis=1)
# Put the two frames side by side; axis=1 matches rows by their index labels.
df2 = pd.concat([df_train_prep, loc], axis=1)
As a result I receive not 65 rows x (9 + 65) columns dataframe but 130 rows × 70 columns df like:
df2
>>>
qoil cum_oil top_perf bot_perf st 101 102 103 . . .
236 0.001 542681.0 -2427.66 -2539.25 0.0 NaN NaN NaN NaN NaN ...
258 2291 292356.0 -2537.38 -2657.02 1.0 NaN NaN NaN NaN NaN ...
537 3290 237163.0 -2714.32 -2741.49 0.0 NaN NaN NaN NaN NaN ...
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
101 NaN NaN NaN NaN NaN 0.000000 157.278917 280.835312 323.423701 ...
102 NaN NaN NaN NaN NaN 154.278917 0.000000 356.310146 210.348200 518.786999 ...
It looks like some data concatenate in the right but some moved to the bottom. Moreover, strange NaN value popped up.
Please, help me to understand what I am doing wrong.
# Dummy Data
# Dummy data: five points on the diagonal.
df = pd.DataFrame({'x': range(5), 'y': range(5)})
# Pairwise Euclidean distances; no index is given, so the frame gets the default labels.
from sklearn.metrics.pairwise import euclidean_distances
distance = pd.DataFrame(euclidean_distances(df[['x', 'y']]))
# Concatenate side by side; rows are matched by index label.
df = pd.concat([df, distance], axis=1)
print (df)
Output:
x y 0 1 2 3 4
0 0 0 0.000000 1.414214 2.828427 4.242641 5.656854
1 1 1 1.414214 0.000000 1.414214 2.828427 4.242641
2 2 2 2.828427 1.414214 0.000000 1.414214 2.828427
3 3 3 4.242641 2.828427 1.414214 0.000000 1.414214
4 4 4 5.656854 4.242641 2.828427 1.414214 0.000000
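As you can see, the pairwise distance is a symmetric matrix. Note that `pd.concat(..., axis=1)` aligns rows on the index, so both frames must share the same index (here the default 0..4). A distance frame indexed by well numbers does not line up with the original row labels, which is what produces the extra rows and the NaN values.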
I have a pandas dataframe with a two level hierarchical index ('item_id' and 'date'). Each row has columns for a variety of metrics for a particular item in a particular month. Here's a sample:
total_annotations unique_tags
date item_id
2007-04-01 2 30 14
2007-05-01 2 32 16
2007-06-01 2 36 19
2008-07-01 2 81 33
2008-11-01 2 82 34
2009-04-01 2 84 35
2010-03-01 2 90 35
2010-04-01 2 100 36
2010-11-01 2 105 40
2011-05-01 2 106 40
2011-07-01 2 108 42
2005-08-01 3 479 200
2005-09-01 3 707 269
2005-10-01 3 980 327
2005-11-01 3 1176 373
2005-12-01 3 1536 438
2006-01-01 3 1854 497
2006-02-01 3 2206 560
2006-03-01 3 2558 632
2007-02-01 3 5650 1019
As you can see, there are not observations for all consecutive months for each item. What I want to do is reindex the dataframe such that each item has rows for each month in a specified range. Now, this is easy to accomplish for any given item. So, for item_id 99, for example:
# Month-start dates from 2005-07-01 through 2013-01-01.
baseDateRange = pd.date_range('2005-07-01','2013-01-01',freq='MS')
# Take the rows of item 99 and reindex them on the full range, forward-filling the gaps.
data.xs(99,level='item_id').reindex(baseDateRange,method='ffill')
But with this method, I'd have to iterate through all the item_ids, then merge everything together, which seems woefully over-complicated.
So how can I apply this to the full dataframe, ffill-ing the observations (but also the item_id index) such that each item_id has properly filled rows for all the dates in baseDateRange?
Essentially for each group you want to reindex and ffill. The apply gets passed a data frame that has the item_id and date still in the index, so reset, then set and reindex with filling.
idx is your baseDateRange from above.
In [33]: df.groupby(level='item_id').apply(
lambda x: x.reset_index().set_index('date').reindex(idx,method='ffill')).head(30)
Out[33]:
item_id annotations tags
item_id
2 2005-07-01 NaN NaN NaN
2005-08-01 NaN NaN NaN
2005-09-01 NaN NaN NaN
2005-10-01 NaN NaN NaN
2005-11-01 NaN NaN NaN
2005-12-01 NaN NaN NaN
2006-01-01 NaN NaN NaN
2006-02-01 NaN NaN NaN
2006-03-01 NaN NaN NaN
2006-04-01 NaN NaN NaN
2006-05-01 NaN NaN NaN
2006-06-01 NaN NaN NaN
2006-07-01 NaN NaN NaN
2006-08-01 NaN NaN NaN
2006-09-01 NaN NaN NaN
2006-10-01 NaN NaN NaN
2006-11-01 NaN NaN NaN
2006-12-01 NaN NaN NaN
2007-01-01 NaN NaN NaN
2007-02-01 NaN NaN NaN
2007-03-01 NaN NaN NaN
2007-04-01 2 30 14
2007-05-01 2 32 16
2007-06-01 2 36 19
2007-07-01 2 36 19
2007-08-01 2 36 19
2007-09-01 2 36 19
2007-10-01 2 36 19
2007-11-01 2 36 19
2007-12-01 2 36 19
Building on Jeff's answer, I consider this to be somewhat more readable. It is also considerably more efficient since only the droplevel and reindex methods are used.
df = df.set_index(['item_id', 'date'])


def fill_missing_dates(x, idx=all_dates):
    """Reindex one item's rows on ``idx``, forward-filling the missing dates.

    ``x`` is the group for a single item, indexed by (item_id, date). The
    result is indexed by the dates in ``idx``; dates before the item's first
    observation stay NaN.
    """
    # droplevel returns a new frame, so the group handed in by groupby.apply
    # is not modified while pandas is still working through the groups.
    return x.droplevel('item_id').reindex(idx, method='ffill')


filled_df = (df.groupby('item_id')
               .apply(fill_missing_dates))