Let's start with a simple DataFrame:
import pandas as pd
import numpy as np

df = pd.DataFrame({"a": [100, 100, 105, 110, 100, 106, 120, 110, 105, 70, 90, 100]})
df:
a
0 100
1 100
2 105
3 110
4 100
5 106
6 120
7 110
8 105
9 70
10 90
11 100
Now, I want to calculate the returns on a 7-day rolling basis. So I apply the following:
df['delta_rol_a_last_first'] = np.nan
for i in range(7, len(df)):
    df['delta_rol_a_last_first'].iloc[i] = (df['a'].iloc[i] - df['a'].iloc[i-7]) / df['a'].iloc[i-6]
df.dropna(inplace=True)
df:
a delta_rol_a_last_first
7 110 0.100000
8 105 0.047619
9 70 -0.318182
10 90 -0.200000
11 100 0.000000
Now I want to keep only the negative returns, apply quantiles to them, and add identifying columns to the rows as follows:
df_quant = df['delta_rol_a_last_first'][df['delta_rol_a_last_first'] <0].quantile([0.01,0.03,0.05,0.1])
df_quant.index.names = ['quantile']
df_quant=df_quant.to_frame()
df_quant['Type'] = 'pct'
df_quant['timeframe'] = 'weekly'
df_quant:
delta_rol_a_last_first Type timeframe
quantile
0.01 -0.317000 pct weekly
0.03 -0.314636 pct weekly
0.05 -0.312273 pct weekly
0.10 -0.306364 pct weekly
So that works perfectly.
Now imagine I want to do the same but more dynamically. So consider a DataFrame with multiple columns as follows:
data = [[99330,12,122],[1123,1230,1287],[123,101,812739],[1143,1230123,252],[234,342,4546],[2445,3453,3457],[7897,8657,5675],[46,5675,453],[76,484,3735],[363,93,4568],[385,568,367],[458,846,4847],[574,45747,658468],[57457,46534,4675]]
df1 = pd.DataFrame(data, index=['2022-01-01', '2022-01-02', '2022-01-03', '2022-01-04',
'2022-01-05', '2022-01-06', '2022-01-07', '2022-01-08',
'2022-01-09', '2022-01-10', '2022-01-11', '2022-01-12',
'2022-01-13', '2022-01-14'],
columns=['col_A', 'col_B', 'col_C'])
df1.index = pd.to_datetime(df1.index)
df1:
col_A col_B col_C
2022-01-01 99330 12 122
2022-01-02 1123 1230 1287
2022-01-03 123 101 812739
2022-01-04 1143 1230123 252
2022-01-05 234 342 4546
2022-01-06 2445 3453 3457
2022-01-07 7897 8657 5675
2022-01-08 46 5675 453
2022-01-09 76 484 3735
2022-01-10 363 93 4568
2022-01-11 385 568 367
2022-01-12 458 846 4847
2022-01-13 574 45747 658468
2022-01-14 57457 46534 4675
I will create a dictionary for the periods over which I want to calculate my rolling returns:
periodicity_dict = {'1D':'daily', '1W':'weekly'}
Now I want to create the same DataFrame as df_quant above. So my DataFrame should look something like this:
col_A_rolling col_B_rolling col_C_rolling Type timeframe
quantile
0.01 -0.317000 -0.234 -0.0443 pct weekly
0.03 -0.314636 -0.022 ... pct weekly
0.05 ... ... ... ...
0.10 ... ...
0.01 ... ...
0.03 ... ...
0.05 ... ...
0.10 -0.306364 -.530023 pct daily
(NOTE: the numbers in this DataFrame are hypothetical)
EDIT:
My attempt is this:
periodicity_dict = {'1D':'daily', '1W':'weekly'}
df_columns = df1.columns
for key in periodicity_dict:
    for col in df_columns:
        df1[col + '_rolling'] = np.nan
        for i in pd.date_range(start=df1[col].first_valid_index(), end=df1[col].last_valid_index(), freq=key):
            df1[col + '_rolling'].iloc[i] = (df1[col].iloc[i] - df1[col].iloc[i-key]) / df1[col].iloc[i-key]
What is the best way to do this? Any help would be appreciated.
I didn't test all the code, but the first part can be replaced by DataFrame.rolling. A window of 8 rows covers the current row plus the 7 before it, which matches the loop above:
df = pd.DataFrame({"a":[100,100,105,110,100,106,120,110,105,70,90, 100]})
# ---
def convert(data):
    return (data.iloc[-1] - data.iloc[0]) / data.iloc[1]
df[['delta_rol_a_last_first']] = df.rolling(8).apply(convert)
# ---
print(df)
or using a lambda:
df[['delta_rol_a_last_first']] = df.rolling(8).apply(lambda data: ((data.iloc[-1] - data.iloc[0])/data.iloc[1]))
The same for many columns:
import pandas as pd
data = [
[99330,12,122], [1123,1230,1287], [123,101,812739], [1143,1230123,252],
[234,342,4546], [2445,3453,3457], [7897,8657,5675], [46,5675,453],
[76,484,3735], [363,93,4568], [385,568,367], [458,846,4847],
[574,45747,658468], [57457,46534,4675]
]
df = pd.DataFrame(
data,
index=['2022-01-01', '2022-01-02', '2022-01-03', '2022-01-04',
'2022-01-05', '2022-01-06', '2022-01-07', '2022-01-08',
'2022-01-09', '2022-01-10', '2022-01-11', '2022-01-12',
'2022-01-13', '2022-01-14'],
columns=['col_A', 'col_B', 'col_C']
)
df.index = pd.to_datetime(df.index)
# ---
def convert(data):
    return (data.iloc[-1] - data.iloc[0]) / data.iloc[1]
#df[['col_A_weekly', 'col_B_weekly', 'col_C_weekly']] = df.rolling(8).apply(convert)
new_columns = [name+'_weekly' for name in df.columns]
df[new_columns] = df.rolling(8).apply(convert)
# ---
print(df)
Result:
col_A col_B col_C col_A_weekly col_B_weekly col_C_weekly
2022-01-01 99330 12 122 NaN NaN NaN
2022-01-02 1123 1230 1287 NaN NaN NaN
2022-01-03 123 101 812739 NaN NaN NaN
2022-01-04 1143 1230123 252 NaN NaN NaN
2022-01-05 234 342 4546 NaN NaN NaN
2022-01-06 2445 3453 3457 NaN NaN NaN
2022-01-07 7897 8657 5675 NaN NaN NaN
2022-01-08 46 5675 453 -88.409617 4.604065 0.257187
2022-01-09 76 484 3735 -8.512195 -7.386139 0.003012
2022-01-10 363 93 4568 0.209974 -0.000007 -3207.027778
2022-01-11 385 568 367 -3.239316 -3595.190058 0.025297
2022-01-12 458 846 4847 0.091616 0.145960 0.087070
2022-01-13 574 45747 658468 -0.236925 4.885526 115.420441
2022-01-14 57457 46534 4675 1077.391304 6.674361 -2.207506
EDIT:
Using two ranges, daily and weekly:
old_columns = df.columns
new_columns = [name+'_weekly' for name in old_columns]
df[new_columns] = df[old_columns].rolling(8).apply(convert)
new_columns = [name+'_daily' for name in old_columns]
df[new_columns] = df[old_columns].rolling(2).apply(convert)
or using a loop:
old_columns = df.columns
for days, suffix in ((1, 'daily'), (7, 'weekly')):
    new_columns = [name + '_' + suffix for name in old_columns]
    df[new_columns] = df[old_columns].rolling(days + 1).apply(convert)
or
for days, suffix in ((1, 'daily'), (7, 'weekly')):
    for name in old_columns:
        new_name = name + '_' + suffix
        df[new_name] = df[name].rolling(days + 1).apply(convert)
Result:
col_A col_B col_C col_A_weekly col_B_weekly col_C_weekly col_A_daily col_B_daily col_C_daily
2022-01-01 99330 12 122 NaN NaN NaN NaN NaN NaN
2022-01-02 1123 1230 1287 NaN NaN NaN -87.450579 0.990244 0.905206
2022-01-03 123 101 812739 NaN NaN NaN -8.130081 -11.178218 0.998416
2022-01-04 1143 1230123 252 NaN NaN NaN 0.892388 0.999918 -3224.154762
2022-01-05 234 342 4546 NaN NaN NaN -3.884615 -3595.850877 0.944567
2022-01-06 2445 3453 3457 NaN NaN NaN 0.904294 0.900956 -0.315013
2022-01-07 7897 8657 5675 NaN NaN NaN 0.690389 0.601132 0.390837
2022-01-08 46 5675 453 -88.409617 4.604065 0.257187 -170.673913 -0.525463 -11.527594
2022-01-09 76 484 3735 -8.512195 -7.386139 0.003012 0.394737 -10.725207 0.878715
2022-01-10 363 93 4568 0.209974 -0.000007 -3207.027778 0.790634 -4.204301 0.182356
2022-01-11 385 568 367 -3.239316 -3595.190058 0.025297 0.057143 0.836268 -11.446866
2022-01-12 458 846 4847 0.091616 0.145960 0.087070 0.159389 0.328605 0.924283
2022-01-13 574 45747 658468 -0.236925 4.885526 115.420441 0.202091 0.981507 0.992639
2022-01-14 57457 46534 4675 1077.391304 6.674361 -2.207506 0.990010 0.016912 -139.848770
EDIT:
Quantile:
final_df = pd.DataFrame()
for days, suffix in ((1, 'daily'), (7, 'weekly')):
    df_quant = pd.DataFrame()
    for name in old_columns:
        new_name = name + '_' + suffix
        df_quant[name] = df[new_name][df[new_name] < 0].quantile([0.01, 0.03, 0.05, 0.1])
    df_quant.index.names = ['quantile']
    df_quant['Type'] = 'pct'
    df_quant['timeframe'] = suffix
    print(df_quant.to_string())
    #final_df = final_df.append(df_quant)  # DataFrame.append is deprecated, use concat
    final_df = pd.concat([final_df, df_quant])
print(final_df)
Result:
col_A col_B col_C Type timeframe
quantile
0.01 -168.177213 -3452.463971 -3100.782522 pct daily
0.03 -163.183813 -3165.690158 -2854.038043 pct daily
0.05 -158.190413 -2878.916345 -2607.293564 pct daily
0.10 -145.706913 -2161.981813 -1990.432365 pct daily
0.01 -86.012694 -3523.433980 -3174.979575 pct weekly
0.03 -81.218849 -3379.921823 -3110.883170 pct weekly
0.05 -76.425004 -3236.409666 -3046.786764 pct weekly
0.10 -64.440391 -2877.629275 -2886.545751 pct weekly
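As a side note: convert above divides by the second value in the window, matching the question's original loop. If what you actually want is the standard return (last - first) / first, pct_change can express the same daily/weekly comparison more compactly, and the periodicity_dict from the question can drive the shift sizes. A minimal sketch under those assumptions (the numbers will not match the tables above because the denominator differs):

# pct_change(periods=n) computes (x[t] - x[t-n]) / x[t-n]
# treating '1W' as 7 rows assumes a gap-free daily index, as in the example
old_columns = ['col_A', 'col_B', 'col_C']
for key, suffix in periodicity_dict.items():    # {'1D': 'daily', '1W': 'weekly'}
    days = pd.Timedelta(key).days               # '1D' -> 1, '1W' -> 7
    for name in old_columns:
        df[name + '_' + suffix] = df[name].pct_change(periods=days)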
Related
I have two DataFrames:
df1:
ticker A B C
date
2022-01-01 NaN NaN 100
2022-01-02 NaN 200 NaN
2022-01-03 100 NaN NaN
2022-01-04 NaN NaN 120
df2:
ticker A B C
date
2022-01-02 145 233 100
2022-01-03 231 200 241
2022-01-04 100 200 422
2022-01-05 424 324 222
2022-01-06 400 421 320
I want to set the values in df2 to np.nan at every index/column position where the value in df1 is not null, to get the following:
df3:
ticker A B C
date
2022-01-02 145 NaN 100
2022-01-03 NaN 200 241
2022-01-04 100 200 NaN
2022-01-05 424 324 222
2022-01-06 400 421 320
How can this be done Pythonically without going into many loops?
Use this (assuming date is a regular column in both frames, as in the output below):
df2.columns = df2.columns + '2'
final = df1.merge(df2, left_on='date', right_on='date2')
final['A2'] = np.where(final['A'].notnull(), np.nan, final['A2'])
final['B2'] = np.where(final['B'].notnull(), np.nan, final['B2'])
final['C2'] = np.where(final['C'].notnull(), np.nan, final['C2'])
final = final[df2.columns]
final = pd.concat([final, df2]).drop_duplicates(subset='date2', keep='first')
final.columns = df1.columns
print(final)
'''
date A B C
0 2022-01-02 145.0 nan 100.0
1 2022-01-03 nan 200.0 241.0
2 2022-01-04 100.0 200.0 nan
3 2022-01-05 424.0 324.0 222.0
4 2022-01-06 400.0 421.0 320.0
'''
Or loop over the columns and blank out the matching positions in df2, intersecting with df2's index so dates that exist only in df1 are skipped:
for col in df1:
    idx = df1.index[df1[col].notna()].intersection(df2.index)
    df2.loc[idx, col] = np.nan
Suppose I have two DataFrames:
df1:
ticker A B C
date
2022-01-01 NaN NaN 100
2022-01-02 NaN 200 NaN
2022-01-03 100 NaN NaN
2022-01-04 NaN NaN 120
df2:
ticker A B C
date
2022-01-02 145 233 100
2022-01-03 231 200 241
2022-01-04 100 200 422
2022-01-05 424 324 222
2022-01-06 400 421 320
I want to set the values in df2 to np.nan at every index/column position where the value in df1 is not null, to get the following:
df3:
ticker A B C
date
2022-01-02 145 NaN 100
2022-01-03 NaN 200 241
2022-01-04 100 200 NaN
2022-01-05 424 324 222
2022-01-06 400 421 320
I am applying the following code:
for col in df1.columns:
    idx = df1[df1[col].notna()].index
    if df2[col][idx] == df1[col][idx]:
        df2[col][idx] = np.nan
However, this gives the error: ValueError: The truth value of a Series is ambiguous. Use a.empty(), a.bool(), a.item(), a.any() or a.all().
How can I re-write the above loop?
You can use reindex_like to align df1 with df2, then mask the values of df2 wherever the matching df1 values are not NaN:
out = df2.mask(df1.reindex_like(df2).notna())
To modify df2 in place:
df2[df1.reindex_like(df2).notna()] = float('nan')
Output:
A B C
date
2022-01-02 145.0 NaN 100.0
2022-01-03 NaN 200.0 241.0
2022-01-04 100.0 200.0 NaN
2022-01-05 424.0 324.0 222.0
2022-01-06 400.0 421.0 320.0
If you need to combine several conditions:
df1b = df1.reindex_like(df2)
out = df2.mask(df1b.notna()&df2.ne(df1b), df2-df1b)
Output:
A B C
date
2022-01-02 145 33 100
2022-01-03 131 200 241
2022-01-04 100 200 302
2022-01-05 424 324 222
2022-01-06 400 421 320
I have a table with 3 main columns. I would like to first group the data by Company ID, then get the highest Post Valuation per Company ID and its corresponding Deal Date.
Question: How do I add the corresponding Deal Date in?
The data:
     Company ID  Post Valuation   Deal Date
60    119616-85             NaN  2022-03-01
80    160988-50            6.77  2022-02-10
85    108827-47             NaN  2022-02-01
89    154876-33            1.40  2022-01-27
104   435509-92            6.16  2022-01-05
107   186777-73           17.26  2022-01-03
111   232001-47             NaN  2022-01-01
113   160988-50             NaN  2021-12-31
119   114196-78             NaN  2021-12-15
128   481375-00            2.82  2021-12-01
130   128348-20             NaN  2021-11-25
131   166855-60          658.36  2021-11-25
150   113503-87             NaN  2021-10-20
156   178448-68           21.75  2021-10-07
170   479007-64             NaN  2021-09-13
182   128479-51             NaN  2021-09-01
185   113503-87             NaN  2021-08-31
186   128348-20             NaN  2021-08-30
191   108643-42            8.02  2021-08-13
192   186272-74             NaN  2021-08-12
The attempt
df_X.sort_values('Post Valuation', ascending=True).groupby('Company ID', as_index=False)['Post Valuation'].first()
Sort and drop duplicates (na_position='first' so a NaN valuation never wins over a real one):
result = df.sort_values('Post Valuation', na_position='first').drop_duplicates(subset='Company ID', keep='last')
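For reference, an alternative sketch using groupby().idxmax(): it returns the index label of the highest Post Valuation per Company ID, and .loc then pulls the whole row, so the corresponding Deal Date comes along automatically. The small frame below is only a hypothetical subset of the table above to keep the sketch runnable; note that companies whose valuations are all NaN are dropped by this approach, unlike the sort/drop_duplicates version:

import pandas as pd

# hypothetical subset of the table above
df = pd.DataFrame({
    'Company ID': ['160988-50', '160988-50', '128348-20', '166855-60'],
    'Post Valuation': [6.77, None, None, 658.36],
    'Deal Date': ['2022-02-10', '2021-12-31', '2021-11-25', '2021-11-25'],
})

with_val = df.dropna(subset=['Post Valuation'])                  # ignore rows with no valuation
idx = with_val.groupby('Company ID')['Post Valuation'].idxmax()  # index label of the max per company
result = with_val.loc[idx]                                       # full rows, Deal Date included
print(result)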
I have a dataframe like this:
ID Date Value
783 C 2018-02-23 0.704
580 B 2018-08-04 -1.189
221 A 2018-08-10 -0.788
228 A 2018-08-17 0.038
578 B 2018-08-02 1.188
What I want is to expand the DataFrame based on the Date column back to one month earlier for each row, filling ID with the same person and filling Value with NaN until the last observation.
The expected result is similar to this:
ID Date Value
0 C 2018/01/24 nan
1 C 2018/01/25 nan
2 C 2018/01/26 nan
3 C 2018/01/27 nan
4 C 2018/01/28 nan
5 C 2018/01/29 nan
6 C 2018/01/30 nan
7 C 2018/01/31 nan
8 C 2018/02/01 nan
9 C 2018/02/02 nan
10 C 2018/02/03 nan
11 C 2018/02/04 nan
12 C 2018/02/05 nan
13 C 2018/02/06 nan
14 C 2018/02/07 nan
15 C 2018/02/08 nan
16 C 2018/02/09 nan
17 C 2018/02/10 nan
18 C 2018/02/11 nan
19 C 2018/02/12 nan
20 C 2018/02/13 nan
21 C 2018/02/14 nan
22 C 2018/02/15 nan
23 C 2018/02/16 nan
24 C 2018/02/17 nan
25 C 2018/02/18 nan
26 C 2018/02/19 nan
27 C 2018/02/20 nan
28 C 2018/02/21 nan
29 C 2018/02/22 nan
30 C 2018/02/23 1.093
31 B 2018/07/05 nan
32 B 2018/07/06 nan
33 B 2018/07/07 nan
34 B 2018/07/08 nan
35 B 2018/07/09 nan
36 B 2018/07/10 nan
37 B 2018/07/11 nan
38 B 2018/07/12 nan
39 B 2018/07/13 nan
40 B 2018/07/14 nan
41 B 2018/07/15 nan
42 B 2018/07/16 nan
43 B 2018/07/17 nan
44 B 2018/07/18 nan
45 B 2018/07/19 nan
46 B 2018/07/20 nan
47 B 2018/07/21 nan
48 B 2018/07/22 nan
49 B 2018/07/23 nan
50 B 2018/07/24 nan
51 B 2018/07/25 nan
52 B 2018/07/26 nan
53 B 2018/07/27 nan
54 B 2018/07/28 nan
55 B 2018/07/29 nan
56 B 2018/07/30 nan
57 B 2018/07/31 nan
58 B 2018/08/01 nan
59 B 2018/08/02 nan
60 B 2018/08/03 nan
61 B 2018/08/04 0.764
62 A 2018/07/11 nan
63 A 2018/07/12 nan
64 A 2018/07/13 nan
65 A 2018/07/14 nan
66 A 2018/07/15 nan
67 A 2018/07/16 nan
68 A 2018/07/17 nan
69 A 2018/07/18 nan
70 A 2018/07/19 nan
71 A 2018/07/20 nan
72 A 2018/07/21 nan
73 A 2018/07/22 nan
74 A 2018/07/23 nan
75 A 2018/07/24 nan
76 A 2018/07/25 nan
77 A 2018/07/26 nan
78 A 2018/07/27 nan
79 A 2018/07/28 nan
80 A 2018/07/29 nan
81 A 2018/07/30 nan
82 A 2018/07/31 nan
83 A 2018/08/01 nan
84 A 2018/08/02 nan
85 A 2018/08/03 nan
86 A 2018/08/04 nan
87 A 2018/08/05 nan
88 A 2018/08/06 nan
89 A 2018/08/07 nan
90 A 2018/08/08 nan
91 A 2018/08/09 nan
92 A 2018/08/10 2.144
93 A 2018/07/18 nan
94 A 2018/07/19 nan
95 A 2018/07/20 nan
96 A 2018/07/21 nan
97 A 2018/07/22 nan
98 A 2018/07/23 nan
99 A 2018/07/24 nan
100 A 2018/07/25 nan
101 A 2018/07/26 nan
102 A 2018/07/27 nan
103 A 2018/07/28 nan
104 A 2018/07/29 nan
105 A 2018/07/30 nan
106 A 2018/07/31 nan
107 A 2018/08/01 nan
108 A 2018/08/02 nan
109 A 2018/08/03 nan
110 A 2018/08/04 nan
111 A 2018/08/05 nan
112 A 2018/08/06 nan
113 A 2018/08/07 nan
114 A 2018/08/08 nan
115 A 2018/08/09 nan
116 A 2018/08/10 nan
117 A 2018/08/11 nan
118 A 2018/08/12 nan
119 A 2018/08/13 nan
120 A 2018/08/14 nan
121 A 2018/08/15 nan
122 A 2018/08/16 nan
123 A 2018/08/17 0.644
124 B 2018/07/03 nan
125 B 2018/07/04 nan
126 B 2018/07/05 nan
127 B 2018/07/06 nan
128 B 2018/07/07 nan
129 B 2018/07/08 nan
130 B 2018/07/09 nan
131 B 2018/07/10 nan
132 B 2018/07/11 nan
133 B 2018/07/12 nan
134 B 2018/07/13 nan
135 B 2018/07/14 nan
136 B 2018/07/15 nan
137 B 2018/07/16 nan
138 B 2018/07/17 nan
139 B 2018/07/18 nan
140 B 2018/07/19 nan
141 B 2018/07/20 nan
142 B 2018/07/21 nan
143 B 2018/07/22 nan
144 B 2018/07/23 nan
145 B 2018/07/24 nan
146 B 2018/07/25 nan
147 B 2018/07/26 nan
148 B 2018/07/27 nan
149 B 2018/07/28 nan
150 B 2018/07/29 nan
151 B 2018/07/30 nan
152 B 2018/07/31 nan
153 B 2018/08/01 nan
154 B 2018/08/02 -0.767
The source data can be created as below:
import pandas as pd
from itertools import chain
import numpy as np
df_1 = pd.DataFrame({
    'ID': list(chain.from_iterable([['A'] * 365, ['B'] * 365, ['C'] * 365])),
    'Date': pd.date_range(start='2018-01-01', end='2018-12-31').tolist() * 3,  # same range repeated for A, B, C
    'Value': np.random.randn(365 * 3)
})
df_1 = df_1.sample(5, random_state = 123)
Thanks for the advice!
You can create another DataFrame shifted to the previous month, join the two together with concat, create a DatetimeIndex, and then use groupby with resample('D') to fill in all the dates in between:
df_2 = df_1.assign(Date=df_1['Date'] - pd.DateOffset(months=1) + pd.DateOffset(days=1),
                   Value=np.nan)
df = (pd.concat([df_2, df_1], sort=False)
        .reset_index()
        .set_index('Date')
        .groupby('index', sort=False)
        .resample('D')
        .ffill()
        .reset_index(level=1)
        .drop(columns='index')
        .rename_axis(None))
print(df)
print (df)
Date ID Value
783 2018-01-24 C NaN
783 2018-01-25 C NaN
783 2018-01-26 C NaN
783 2018-01-27 C NaN
783 2018-01-28 C NaN
.. ... .. ...
578 2018-07-29 B NaN
578 2018-07-30 B NaN
578 2018-07-31 B NaN
578 2018-08-01 B NaN
578 2018-08-02 B 0.562684
[155 rows x 3 columns]
Another solution with a list comprehension and concat; at the end it is necessary to back-fill the index and ID columns, so this works only if the original ID column has no missing values:
offset = pd.DateOffset(months=1) + pd.DateOffset(days=1)
df = pd.concat([df_1.iloc[[i]].reset_index().set_index('Date').reindex(pd.date_range(d - offset, d))
                for i, d in enumerate(df_1['Date'])], sort=False)
df = (df.assign(index=df['index'].bfill().astype(int), ID=df['ID'].bfill())
        .rename_axis('Date')
        .reset_index()
        .set_index('index')
        .rename_axis(None))
print (df)
Date ID Value
783 2018-01-24 C NaN
783 2018-01-25 C NaN
783 2018-01-26 C NaN
783 2018-01-27 C NaN
783 2018-01-28 C NaN
.. ... .. ...
578 2018-07-29 B NaN
578 2018-07-30 B NaN
578 2018-07-31 B NaN
578 2018-08-01 B NaN
578 2018-08-02 B 1.224345
[155 rows x 3 columns]
We can create a date range in the "Date" column, then explode it.
Then group the "Value" column by the index and set all values to NaN except the last.
Finally, reset the index.
def drange(t):
    return pd.date_range(t - pd.DateOffset(months=1) + pd.DateOffset(days=1), t, freq="D", normalize=True)

df["Date"] = df["Date"].transform(drange)
ID Date Value
index
783 C DatetimeIndex(['2018-01-24', '2018-01-25', '20... 0.704
580 B DatetimeIndex(['2018-07-05', '2018-07-06', '20... -1.189
221 A DatetimeIndex(['2018-07-11', '2018-07-12', '20... -0.788
228 A DatetimeIndex(['2018-07-18', '2018-07-19', '20... 0.038
578 B DatetimeIndex(['2018-07-03', '2018-07-04', '20... 1.188
df= df.reset_index(drop=True).explode(column="Date")
ID Date Value
0 C 2018-01-24 0.704
0 C 2018-01-25 0.704
0 C 2018-01-26 0.704
0 C 2018-01-27 0.704
0 C 2018-01-28 0.704
.. .. ... ...
4 B 2018-07-29 1.188
4 B 2018-07-30 1.188
4 B 2018-07-31 1.188
4 B 2018-08-01 1.188
4 B 2018-08-02 1.188
df["Value"]= df.groupby(level=0)["Value"].transform(lambda v: [np.nan]*(len(v)-1)+[v.iloc[0]])
df= df.reset_index(drop=True)
ID Date Value
0 C 2018-01-24 NaN
1 C 2018-01-25 NaN
2 C 2018-01-26 NaN
3 C 2018-01-27 NaN
4 C 2018-01-28 NaN
.. .. ... ...
150 B 2018-07-29 NaN
151 B 2018-07-30 NaN
152 B 2018-07-31 NaN
153 B 2018-08-01 NaN
154 B 2018-08-02 1.188
I have data like this, but without z1. What I need is to add a column z1 to the DataFrame so that, for each row, it holds the z value from one day earlier for the same Start date, as in the example.
I was thinking it could be done with apply and a lambda in pandas, but I'm not sure how to define the lambda function:
data = pd.read_csv("....")
data["Z"] = data[["Start", "Z"]].apply(lambda x:
You can use DataFrameGroupBy.shift with merge:
#if not datetime
df['date'] = pd.to_datetime(df.date)
df.set_index('date', inplace=True)
df1 = df.groupby('start')['z'].shift(freq='1D',periods=1).reset_index()
print (pd.merge(df.reset_index(),df1, on=['start','date'], how='left', suffixes=('','1')))
date start z z1
0 2012-12-01 324 564545 NaN
1 2012-12-01 384 5555 NaN
2 2012-12-01 349 554 NaN
3 2012-12-02 855 635 NaN
4 2012-12-02 324 56 564545.0
5 2012-12-01 341 98 NaN
6 2012-12-03 324 888 56.0
EDIT:
Find the duplicates and fill the NaN with 0:
df['date'] = pd.to_datetime(df.date)
df.set_index('date', inplace=True)
df1 = df.groupby('start')['z'].shift(freq='1D', periods=1).reset_index()
df2 = pd.merge(df.reset_index(), df1, on=['start', 'date'], how='left', suffixes=('', '1'))
mask = df2.start.duplicated(keep=False)
df2.loc[mask, 'z1'] = df2.loc[mask, 'z1'].fillna(0)  # .ix is removed in current pandas; use .loc
print(df2)
date start z z1
0 2012-12-01 324 564545 0.0
1 2012-12-01 384 5555 NaN
2 2012-12-01 349 554 NaN
3 2012-12-02 855 635 NaN
4 2012-12-02 324 56 564545.0
5 2012-12-01 341 98 NaN
6 2012-12-03 324 888 56.0
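On current pandas versions, an alternative sketch without groupby.shift is to build a copy of z dated one day later and merge it back as z1. This assumes the frame has plain date, start and z columns (as in the output above) and that each start/date pair occurs only once:

df['date'] = pd.to_datetime(df['date'])
prev = (df[['date', 'start', 'z']]
        .assign(date=lambda d: d['date'] + pd.Timedelta(days=1))   # re-date each z one day later
        .rename(columns={'z': 'z1'}))
out = df.merge(prev, on=['start', 'date'], how='left')             # z1 = z of the same start on the previous day
print(out)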