I have a dataframe that consists of hourly data for a whole year. I want to calculate the monthly means and show them in a time series plot. I have one variable which is NO2 values.
# Cleaning data: skip the 4-line preamble and the footer row, keep the first
# three columns, and treat the literal 'No data' as a missing value.
ck_2000 = pd.read_csv(
    '2000-CamdenKerbside.csv',
    header=0,
    skiprows=4,
    usecols=range(0, 3),
    skipfooter=1,
    na_values='No data',
    engine='python',
)
colnames = ['Date', 'Time', 'NO2']
ck_2000.columns = colnames

# Reformat date/time. The time column uses '24:00:00' for the last hour of a
# day; that instant is midnight of the *following* day. Adding the time as a
# timedelta to the parsed date rolls the date over, whereas rewriting the
# string to '00:00:00' would place the reading at the start of the same day.
dtw = pd.to_datetime(ck_2000['Date'], format='%d/%m/%Y') + pd.to_timedelta(ck_2000['Time'])
ck_2000.index = dtw

# Index dataframe by date: reindex onto a gap-free hourly range so that hours
# absent from the file show up as NaN rows.
firstDate = ck_2000.index[0]
lastDate = ck_2000.index[-1]
ck2000 = ck_2000.reindex(index=pd.date_range(start=firstDate, end=lastDate, freq='1H'))

# Change data type to float (NaN cannot be stored in an integer column).
ck2000['NO2'] = ck2000['NO2'].astype('float64')

# Interpolation: fill the gaps of the *reindexed* frame; only the numeric NO2
# column is interpolated, the text columns are left untouched.
ck_2000_int = ck2000.copy()
ck_2000_int['NO2'] = ck_2000_int['NO2'].interpolate()

# df's for all months (partial-string row selection must go through .loc).
ck_2000_jan = ck_2000_int.loc['2000-01']
ck_2000_feb = ck_2000_int.loc['2000-02']
ck_2000_mar = ck_2000_int.loc['2000-03']
ck_2000_apr = ck_2000_int.loc['2000-04']
ck_2000_may = ck_2000_int.loc['2000-05']
ck_2000_jun = ck_2000_int.loc['2000-06']
ck_2000_jul = ck_2000_int.loc['2000-07']
ck_2000_aug = ck_2000_int.loc['2000-08']
ck_2000_sept = ck_2000_int.loc['2000-09']
ck_2000_oct = ck_2000_int.loc['2000-10']
ck_2000_nov = ck_2000_int.loc['2000-11']
ck_2000_dec = ck_2000_int.loc['2000-12']
You should be able to use `resample`.
Consider the following example:
# One timestamp per hour for the whole of the year 2000.
tidx = pd.date_range(start='2000-01-01', end='2000-12-31 23:00', freq='H')
# Random values stand in for the interpolated NO2 readings.
ck_2000_int = pd.DataFrame({'NO2': np.random.randn(len(tidx))}, index=tidx)
# Collapse the hourly values to one mean per calendar month and plot them.
monthly_mean = ck_2000_int.resample('M').mean()
monthly_mean.plot()
Related
I have two data frames and I am trying to search each row by date in the user.csv file and find the corresponding date in the Raven.csv file and then return the Price from the df1 and the date and amount from df2.
This is working, but my Price is returned as a value like [[0.11465]]. Is there a way to remove these brackets, or a better way to do this?
import pandas as pd
# Load the price history (Raven.csv) and the user's payouts (User.csv).
df1 = pd.read_csv('Raven.csv')
df2 = pd.read_csv('User.csv')
df1 = df1.reset_index(drop=False)
df1.columns = ['index', 'Date', 'Price']

# Reduce both timestamps to plain calendar dates so they can be matched.
df2['Timestamp'] = pd.to_datetime(df2['Timestamp'], format="%Y-%m-%d %H:%M:%S").dt.date
df1['Date'] = pd.to_datetime(df1['Date'], format="%Y-%m-%d").dt.date

# Look prices up by date in one vectorised step. Mapping through a
# date -> price Series yields one scalar per payout row (NaN when the date has
# no price), instead of the nested [[price]] arrays produced by indexing
# ``values`` with a DataFrame. The first price is kept if a date repeats.
price_by_date = df1.drop_duplicates(subset='Date').set_index('Date')['Price']
Price = df2['Timestamp'].map(price_by_date)

df = pd.DataFrame({
    'Date': df2['Timestamp'],
    'Price': Price,
    'Payout': df2['Amount'],
    'Total Value': Price * df2['Amount'],
})
df.to_csv('out.csv')
You can do indexing to get the value:
# A 1x1 nested list: peel off the outer and then the inner level to reach the
# scalar it wraps.
nested = [[0.11465]]
value = nested[0][0]
print(value)
You get:
0.11465
I hope this is what you need.
I am pretty new to Dash and have read as much as I can to understand what the issue might be. In a nutshell, I have a single date picker which is the input to both the DataTable callback and the Graph callback. The Graph callback works fine, so it is only the DataTable that is causing problems. I also tried a single callback with one input and multiple outputs, but that didn't work either. My code is below:
app = JupyterDash()
folder = os.getcwd()

# Daily portfolio returns. ``parse_dates`` is an option of ``read_csv``; it was
# previously passed to ``Path(...)`` by a misplaced parenthesis, so the date
# column was never parsed on load.
portfolio_returns_table = pd.read_csv(
    Path(folder, 'portfolioreturns_maria.csv'),
    parse_dates=[0],
)
portfolio_returns_table = portfolio_returns_table.set_index('Unnamed: 0')

# Lookup table used to attach investor names to portfolio IDs.
name_portfolioID_table = pd.read_csv(Path(folder, 'name_portfolioID.csv'))

# Calculate portfolio cumulative returns: compound (1 + r) along the dates.
df_cumret = (portfolio_returns_table + 1).cumprod().round(5)
df_cumret.index = pd.to_datetime(df_cumret.index)
# Page layout: a date picker bounded by the available dates, a placeholder
# div, the leaderboard table, and the performance graph.
app.layout = html.Div(html.Div([
    dcc.DatePickerSingle(
        id='my-date-picker-single',
        min_date_allowed=dt.date(df_cumret.index.min()),
        max_date_allowed=dt.date(df_cumret.index.max()),
        initial_visible_month=dt.date(df_cumret.index.max()),
        date=dt.date(df_cumret.index.max()),
        display_format='Y-MM-DD',
        clearable=True,
    ),
    html.Div(id='output-container-date-picker-single'),
    html.Div(dash_table.DataTable(
        id='data_table',
        # DataTable rows are a list of record dicts; start with no rows. The
        # rows and columns are filled in by the table callback.
        data=[],
        fixed_rows={'headers': True},
        style_cell={'textAlign': 'left'},
        style_table={'height': 400},
    )),
    html.Div(dcc.Graph(id='my_graph')),
]))
@app.callback(
    [Output('data_table', 'data'), Output('data_table', 'columns')],
    [Input('my-date-picker-single', 'date')],
)
def update_leader_table(date):
    """Build the top-10 leaderboard for the date chosen in the date picker.

    Args:
        date: The picker's ``date`` value. It is ``None`` when the picker has
            been cleared (the picker is declared ``clearable``).

    Returns:
        A ``(data, columns)`` tuple: ``data`` is a list of row dicts and
        ``columns`` a list of column specs. Both are empty when no date is
        selected or the selected date has no row in ``df_cumret``.
    """
    if date is None:
        return ([], [])

    selected = pd.Timestamp(date)
    if selected not in df_cumret.index:
        # Nothing to rank for this date; show an empty table rather than
        # failing the callback with a KeyError.
        return ([], [])

    # Get data for the selected date and transpose (portfolios become rows).
    df_T = df_cumret.loc[[selected]].T
    # Sort the table to reveal the top leaders.
    df_Top = df_T.sort_values(df_T.columns[0], ascending=False)[:10]
    # Convert the index to an integer so it can be joined on the ID table's index.
    df_Top.index = df_Top.index.astype(int)
    # Generate the leaderboard for the given date.
    df_leader = pd.merge(
        df_Top,
        name_portfolioID_table,
        left_index=True,
        right_index=True,
        how='left',
    )
    # Create the rank column (rows are already sorted best-first).
    df_leader['Rank'] = range(1, len(df_leader) + 1)
    df_leader.columns = ['Cum Return', 'Investor', 'Rank']
    df_leader.reset_index(drop=True, inplace=True)

    data = df_leader.to_dict('records')
    columns = [{'id': c, 'name': c, "selectable": True} for c in df_leader.columns]
    return (data, columns)
# Callback to link calendar to graph
@app.callback(Output('my_graph', 'figure'), [Input('my-date-picker-single', 'date')])
def update_graph(date):
    """Plot cumulative returns up to ``date`` for the ten leading portfolios.

    Args:
        date: The picker's ``date`` value. ``None`` (picker cleared) selects
            the whole history.

    Returns:
        A line figure, or an empty figure dict when there is no data on or
        before ``date``.
    """
    # Date filter (an open-ended slice when ``date`` is None), transposed so
    # that portfolios are rows.
    df_T = df_cumret.loc[:date].T
    if df_T.shape[1] == 0:
        # No dates on or before the selection: nothing to rank or draw.
        return {}

    # Sort by the latest available date to reveal the top leaders and keep
    # the leaderboard's ten entries.
    df_Top = df_T.sort_values(df_T.columns[-1], ascending=False)[:10]
    # Transpose to have date as index.
    df_top_graph = df_Top.T
    # Set the columns as int so they match the ID table's index.
    df_top_graph.columns = df_top_graph.columns.astype(int)
    # Rename columns from portfolio ID to investor name.
    df_top_graph.rename(
        columns=dict(zip(name_portfolioID_table.index, name_portfolioID_table.name)),
        inplace=True,
    )

    # A cleared picker yields None, which cannot be concatenated to the title;
    # fall back to the last date actually plotted.
    date_label = date if date is not None else str(df_top_graph.index[-1].date())

    # Generate graph.
    fig = px.line(
        df_top_graph,
        x=df_top_graph.index,
        y=df_top_graph.columns,
        title='ETF LEADERBOARD PERFORMANCE: ' + date_label,
        labels={'Unnamed: 0': 'Date', 'value': 'Cumulative Returns'},
    )
    fig.update_layout(hovermode='x unified')
    fig.update_traces(hovertemplate='Return: %{y} <br>Date: %{x}')
    fig.update_layout(legend_title_text='Investor')
    return fig
# Start the app inline (notebook output cell) when run as a script.
if __name__ == '__main__':
    app.run_server(
        mode='inline',
        debug=True,
        port=65398,
    )
The data frame shows the date with the amount of import and export
and it is further bifurcated into coastal and regional data per day
of one month.
What I wish to achieve is to club, i.e. sum, all the data presented (one month
in this case), so that the end result shows only one entry, dated at the
month end, with all the corresponding fields added up.
This is the following code:
df = pd.read_csv('output.csv', encoding="utf-8", skipinitialspace=True, engine='python')

# Drop incomplete rows, then give the columns their working dtypes. The
# datetime dtype carries an explicit unit: a bare 'datetime64' is rejected by
# newer pandas versions.
datadf = df
datadf = datadf.dropna(axis=0, how='any')
datadf = datadf.astype({
    'ForeignType': 'category',
    'ImportType': 'category',
    'ArrDate': 'datetime64[ns]',
    'DepDate': 'datetime64[ns]',
})

# Quantity per arrival date, import type and foreign type.
datadf1 = datadf.groupby(['ArrDate', 'ImportType', 'ForeignType'])['ComoQty'].sum()
datadf2 = datadf1.to_frame()
datadf2.fillna(value=0, inplace=True)

# From here on ``datadf1`` is the cleaned frame without the saved index column.
datadf1 = datadf.drop(columns='Unnamed: 0')
prac = datadf1
prac = prac.set_index('ArrDate')
prac_dates = prac.copy()

# Daily view: number of ships and total quantity per arrival day.
prac = prac.resample('D').agg({'ShipName': 'count', 'ComoQty': 'sum'}).reset_index()

# Monthly view: total quantity per month (divided by 1000), labelled by
# month name instead of the month-end date.
prac_dates = (prac_dates.resample('M').agg({'ComoQty': 'sum'}) / 1000).reset_index()
prac_dates['Month'] = pd.DatetimeIndex(prac_dates['ArrDate']).strftime('%B')
del prac_dates['ArrDate']

# Daily view gets the month name, and ArrDate is reduced to the day of month.
prac['Month'] = pd.DatetimeIndex(prac['ArrDate']).strftime('%B')
prac['ArrDate'] = pd.DatetimeIndex(prac['ArrDate']).strftime('%d')
I have the below two functions :
def create_base_df(start_date, end_date):
    """Build a calendar frame with one row per day and its basic date keys.

    Args:
        start_date: First day of the range, in any form ``pd.date_range``
            accepts.
        end_date: Last day of the range (inclusive), same forms as
            ``start_date``.

    Returns:
        A DataFrame with columns ``dt`` (the date), ``dt_num_key`` (integer
        YYYYMMDD), ``cal_yr_nkey`` (year as text), ``cal_mon_ofyr_nkey`` and
        ``cal_qtr_ofyr_nkey`` (two-digit, zero-padded text) and
        ``cal_wk_ofyr_nkey`` (ISO week number as text, not padded).
    """
    base_df = pd.DataFrame({"dt": pd.date_range(start_date, end_date)})
    dates = base_df["dt"]

    # Derive every key from the date components; this is vectorised and does
    # not depend on strftime.
    year = dates.dt.year.astype(int)
    month = dates.dt.month.astype(int)
    day = dates.dt.day.astype(int)

    base_df["dt_num_key"] = year * 10000 + month * 100 + day
    base_df["cal_yr_nkey"] = year.astype(str)
    base_df["cal_mon_ofyr_nkey"] = month.astype(str).str.zfill(2)
    base_df["cal_qtr_ofyr_nkey"] = dates.dt.quarter.astype(str).str.zfill(2)
    # ``Series.dt.week`` no longer exists in current pandas; the ISO calendar
    # accessor provides the same week number.
    base_df["cal_wk_ofyr_nkey"] = dates.dt.isocalendar().week.astype(str)
    return base_df
def _format_date_key(dates):
    """Return *dates* as zero-padded ``YYYYMMDD`` strings (missing stays missing)."""
    return dates.map(
        lambda ts: "{:04d}{:02d}{:02d}".format(ts.year, ts.month, ts.day),
        na_action="ignore",
    )


def month_operations(df):
    """Add month-level calendar columns to ``df`` in place and return it.

    Args:
        df: Frame with a datetime column ``dt`` and the text columns
            ``cal_yr_nkey`` and ``cal_mon_ofyr_nkey``.

    Returns:
        The same ``df`` object, extended with month names, a year-month key,
        a dense month sequence id, and first/last-day-of-month, previous-month
        and previous-year dates together with their ``*_nkey`` text keys.
    """
    dates = df["dt"]

    df["cal_mon_nm"] = dates.dt.strftime("%B")
    df["cal_mon_shrt_nm"] = dates.dt.strftime("%b")
    df["cal_yr_mon_nkey"] = df["cal_yr_nkey"] + df["cal_mon_ofyr_nkey"]

    # Dense rank of the year-month key, computed on the column itself so the
    # result lines up with each row whatever the row order or index labels.
    df["mon_seq_id"] = df["cal_yr_mon_nkey"].rank(method="dense").astype(int)

    # The *_nkey columns are built from the date components rather than
    # strftime: strftime was reported to return raw timestamps such as
    # '1899-12-01 00:00:00' for these (pre-1900) dates.
    df["dt_frst_dayof_mon"] = dates.dt.to_period("M").dt.to_timestamp()
    df["dt_frst_dayof_mon_nkey"] = _format_date_key(df["dt_frst_dayof_mon"])

    # Last day of the month = first day of the next month minus one day.
    df["dt_lst_dayof_mon"] = (
        df["dt_frst_dayof_mon"] + pd.DateOffset(months=1) - pd.DateOffset(days=1)
    )
    df["dt_lst_dayof_mon_nkey"] = _format_date_key(df["dt_lst_dayof_mon"])

    df["dt_frst_dayof_lst_mon"] = df["dt_frst_dayof_mon"] - pd.DateOffset(months=1)
    df["dt_frst_dayof_lst_mon_nkey"] = _format_date_key(df["dt_frst_dayof_lst_mon"])

    df["dt_lst_mon"] = dates - pd.DateOffset(months=1)
    df["dt_lst_mon_nkey"] = _format_date_key(df["dt_lst_mon"])

    df["dt_lst_yr_lst_mon"] = df["dt_lst_mon"] - pd.DateOffset(years=1)
    df["dt_lst_yr_lst_mon_nkey"] = _format_date_key(df["dt_lst_yr_lst_mon"])
    return df
The columns dt_lst_yr_lst_mon_nkey, dt_lst_mon_nkey and dt_frst_dayof_lst_mon_nkey are returning values in datetime format ('1899-12-01 00:00:00') and I can't seem to figure out why. All the other *key columns return integers as expected.
my main looks like below:
# Build the daily calendar, then extend it with the month-level columns.
# NOTE(review): "01/12/1900" is ambiguous (12 January vs 1 December) — confirm
# which end date is intended.
base_df = create_base_df("01/01/1900", "01/12/1900")
month_df = month_operations(df=base_df)
The expected output : if the value of dt_lst_yr_lst_mon is "1900-12-01 00:00:00" then dt_lst_yr_lst_mon_nkey will be "19001201"
Any pointers on where I am going wrong are appreciated.
Thanks.
I am trying to calculate the duration of the drawdowns and the time to recovery for a stock series. I can calculate the drawdowns but am struggling to get the durations and recovery time for each drawdown. So far I have this code:
import pandas as pd
import pickle
import xlrd
import numpy as np
# Seed the generator so the simulated series, and therefore the drawdown
# table, is reproducible.
np.random.seed(0)

# Simulate 2500 daily percentage returns and compound them into a price path.
df = pd.Series(
    np.random.randn(2500) * 0.7 + 0.05,
    index=pd.date_range('1/1/2000', periods=2500, freq='D'),
)
df = 100 * (1 + df / 100).cumprod()
df = pd.DataFrame(df)
df.columns = ['close']

# Price relative to the first observation (explicitly positional: the index
# holds dates, so a bare [0] is not a valid label).
df['ret'] = df['close'] / df['close'].iloc[0]
# Running peak, and drawdown as the fractional distance below that peak.
df['modMax'] = df['ret'].cummax()
df['modDD'] = 1 - df['ret'].div(df['modMax'])

# Every stretch that shares one running peak is one drawdown episode; keep the
# row(s) where each episode is deepest. Columns are selected with a list, as
# selecting several groupby columns with a bare tuple is not supported.
groups = df.groupby(df['modMax'])
dd = groups[['modMax', 'modDD']].apply(lambda g: g[g['modDD'] == g['modDD'].max()])

# The ten deepest drawdowns, indexed by (peak value, trough date).
top10dd = dd.sort_values('modDD', ascending=False).head(10)
This gives the 10 highest drawdowns of the series but I also want the duration of the drawdown and time to recovery.
I solved the problem as follows:
def drawdown_group(df, index_list):
    """Summarise one drawdown episode: its span, depth and recovery length.

    Args:
        df: Frame with a ``modMax`` column (running peak) and a drawdown
            column named ``dd`` or ``modDD``.
        index_list: ``(group_max, dd_date)`` pair: the peak value identifying
            the episode and the index label of its deepest point.

    Returns:
        Tuple ``(group_start, group_end, group_max, group_dd, dd_date,
        group_dd_length, group_rec, group_length)``. Lengths are row counts:
        rows up to and including ``dd_date``, rows after it, and all rows of
        the episode.
    """
    group_max, dd_date = index_list
    # The frame built alongside this function calls the drawdown column
    # 'modDD'; 'dd' is still honoured when a frame provides it.
    dd_column = 'dd' if 'dd' in df.columns else 'modDD'

    ddGroup = df[df['modMax'] == group_max]
    group_length = len(ddGroup)
    group_dd = ddGroup[dd_column].max()
    # Rows from the peak down to (and including) the deepest point.
    group_dd_length = len(ddGroup[ddGroup.index <= dd_date])
    group_start = ddGroup.index[0]
    group_end = ddGroup.index[-1]
    # Whatever remains of the episode after the deepest point.
    group_rec = group_length - group_dd_length
    return (
        group_start,
        group_end,
        group_max,
        group_dd,
        dd_date,
        group_dd_length,
        group_rec,
        group_length,
    )
# One summary row per top drawdown. Each index entry of ``top10dd`` is read as
# the (peak, trough date) pair that ``drawdown_group`` expects. All entries are
# processed (the previous ``range(1, 10)`` stopped after nine), and
# ``drawdown_group`` runs once per entry.
dd_col = ('start', 'end', 'peak', 'dd', 'dd_date', 'dd_length', 'dd_rec', 'tot_length')
df_dd = pd.DataFrame(
    [drawdown_group(df, index_entry) for index_entry in top10dd.index],
    columns=list(dd_col),
)
Produces this table: