How to group by the datetime index of a MultiIndex by month - python

The data frame shows the date along with the amount of import and export, further broken down into coastal and regional data for each day of one month.
What I want to achieve is to aggregate, i.e. sum, all the data for a month, so that in the end there is only one entry per month, dated at the month's end, with all the corresponding fields added up.
This is the code so far:
import pandas as pd

df = pd.read_csv('output.csv', encoding='utf-8', skipinitialspace=True, engine='python')
datadf = df
datadf = datadf.dropna(axis=0, how='any')
datadf = datadf.astype({'ForeignType': 'category', 'ImportType': 'category',
                        'ArrDate': 'datetime64', 'DepDate': 'datetime64'})
# datadf = datadf.groupby(datadf['ArrDate'].dt.strftime('%B'))['ComoQty'].sum()
datadf1 = datadf.groupby(['ArrDate', 'ImportType', 'ForeignType'])['ComoQty'].sum()
datadf2 = datadf1.to_frame()
datadf2.fillna(value=0, inplace=True)
# datadf2 = datadf2.reset_index('ImportType')
# datadf2 = datadf2.reset_index('ForeignType')
# datadf2 = datadf2.reset_index('ArrDate')
datadf2

datadf1 = datadf.drop(columns='Unnamed: 0')
prac = datadf1
prac = prac.set_index('ArrDate')
prac_dates = prac.copy()
prac = prac.resample('D').apply({'ShipName': 'count', 'ComoQty': 'sum'}).reset_index()
prac_dates = ((prac_dates.resample('M').apply({'ComoQty': 'sum'})) / 1000).reset_index()
prac_dates['Month'] = pd.DatetimeIndex(prac_dates['ArrDate']).strftime('%B')
del prac_dates['ArrDate']
# prac_dates
prac['Month'] = pd.DatetimeIndex(prac['ArrDate']).strftime('%B')
# prac['Month'] = pd.to_datetime(prac['Month'], format='%B')
prac['ArrDate'] = pd.DatetimeIndex(prac['ArrDate']).strftime('%d')
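For the monthly roll-up itself, one approach is to bucket ArrDate with pd.Grouper before grouping, which keeps the ImportType/ForeignType levels intact. A minimal sketch, assuming the cleaned datadf from above; freq='M' labels each bucket with the month-end date, which is what the question asks for:

# Sum ComoQty per month while keeping the ImportType/ForeignType breakdown;
# pd.Grouper(freq='M', key='ArrDate') buckets the dates by month-end.
monthly = (datadf
           .groupby([pd.Grouper(key='ArrDate', freq='M'),
                     'ImportType', 'ForeignType'])['ComoQty']
           .sum()
           .reset_index())
print(monthly)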

Related

Pandas how to search one df for a certain date and return that data

I have two data frames. For each row in User.csv I search for the corresponding date in Raven.csv, then return the Price from df1 and the date and amount from df2.
This works, but my Price comes back as a nested value like [[0.11465]]. Is there a way to remove the brackets, or a better way to do this altogether?
import pandas as pd

df1 = pd.read_csv('Raven.csv')
df2 = pd.read_csv('User.csv')
df1 = df1.reset_index(drop=False)
df1.columns = ['index', 'Date', 'Price']
df2['Timestamp'] = pd.to_datetime(df2['Timestamp'], format="%Y-%m-%d %H:%M:%S").dt.date
df1['Date'] = pd.to_datetime(df1['Date'], format="%Y-%m-%d").dt.date
Looper = 0
Date = []
Price = []
amount = []
total_value = []
for x in df2['Timestamp']:
    search = df2['Timestamp'].values[Looper]
    Date.append(search)
    price = df1.loc[df1['Date'] == search, ['index']]
    value = df1['Price'].values[price]
    Price.append(value)
    payout = df2['Amount'].values[Looper]
    amount.append(payout)
    payout_value = value * payout
    total_value.append(payout_value)
    Looper = Looper + 1
dict = {'Date': Date, 'Price': Price, 'Payout': amount, "Total Value": total_value}
df = pd.DataFrame(dict)
df.to_csv('out.csv')
You can index into the nested list to get the scalar:
value = [[0.11465]][0][0]
print(value)
You get:
0.11465
I hope this is what you need.
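As for the "better way" part of the question: the whole loop can usually be replaced by a single merge on the date column, which avoids the nested indexing entirely. A sketch under the column names assumed above (Date/Price in Raven.csv, Timestamp/Amount in User.csv), an outline rather than a tested drop-in:

import pandas as pd

df1 = pd.read_csv('Raven.csv')   # assumed to hold Date and Price columns
df2 = pd.read_csv('User.csv')    # assumed to hold Timestamp and Amount columns
df1['Date'] = pd.to_datetime(df1['Date']).dt.date
df2['Timestamp'] = pd.to_datetime(df2['Timestamp']).dt.date

# Left-join each user row to its price by date; Price arrives as a plain scalar column.
merged = df2.merge(df1, left_on='Timestamp', right_on='Date', how='left')
merged['Total Value'] = merged['Price'] * merged['Amount']
merged.to_csv('out.csv', index=False)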

Additional columns added to saved CSV

I have the following code, which generates features from a CSV:
import statistics
import pandas as pd

def gen_features_per_id(file_name, label):
    df = pd.read_csv(file_name, delimiter=',')
    df['dt'] = pd.to_datetime(df['datetime'], unit='s')
    row = []
    column_names = ['group_timestamp', 'label',
                    'x_mean', 'x_median', 'x_stdev', 'x_raw_min', 'x_raw_max', 'x_abs_min', 'x_abs_max',
                    'y_mean', 'y_median', 'y_stdev', 'y_raw_min', 'y_raw_max', 'y_abs_min', 'y_abs_max',
                    'z_mean', 'z_median', 'z_stdev', 'z_raw_min', 'z_raw_max', 'z_abs_min', 'z_abs_max']
    group_df = pd.DataFrame(columns=column_names)
    for group_name, g in df.groupby(pd.Grouper(freq='10s', key='dt')):
        print(f'Start time {group_name} has {len(g)} records within 10 secs')
        group_timestamp = group_name
        label = label
        x = g['x'].head(50)
        x_mean = x.mean()
        x_median = x.median()
        x_std_dev = statistics.stdev(x)
        x_raw_min = min(x)
        x_raw_max = max(x)
        x_abs_min = min(abs(x))
        x_abs_max = max(abs(x))
        # print(
        #     f'Mean : {x_mean}, Median : {x_median}, Stdev : {x_std_dev}, '
        #     f'X raw Min : {x_raw_min}, X raw Max : {x_raw_max}, '
        #     f'X abs Min : {x_abs_min}, X abs Max : {x_abs_max}'
        # )
        y = g['y'].head(50)
        y_mean = y.mean()
        y_median = y.median()
        y_std_dev = statistics.stdev(y)
        y_raw_min = min(y)
        y_raw_max = max(y)
        y_abs_min = min(abs(y))
        y_abs_max = max(abs(y))
        z = g['z'].head(50)
        z_mean = z.mean()
        z_median = z.median()
        z_std_dev = statistics.stdev(z)
        z_raw_min = min(z)
        z_raw_max = max(z)
        z_abs_min = min(abs(z))
        z_abs_max = max(abs(z))
        row.append(group_timestamp)
        row.append(label)
        row.append(x_mean)
        row.append(x_median)
        row.append(x_std_dev)
        row.append(x_raw_min)
        row.append(x_raw_max)
        row.append(x_abs_min)
        row.append(x_abs_max)
        row.append(y_mean)
        row.append(y_median)
        row.append(y_std_dev)
        row.append(y_raw_min)
        row.append(y_raw_max)
        row.append(y_abs_min)
        row.append(y_abs_max)
        row.append(z_mean)
        row.append(z_median)
        row.append(z_std_dev)
        row.append(z_raw_min)
        row.append(z_raw_max)
        row.append(z_abs_min)
        row.append(z_abs_max)
        group_df = group_df.append([row], ignore_index=True)
        group_df.to_csv("some.csv", index=False)
        row = []
But the saved CSV file has additional columns added to the start of the header, equal in number to the supplied columns.
Sample CSV
datetime,x,y,z,label
1493740845,0.0004,-0.0001,0.0045,bad
1493740846,0.0003,0.0002,0.0047,bad
1493740847,0.0005,0.0001,0.0049,bad
1493740848,0.0006,0.0004,0.005,bad
1493740849,0.0006,-0.0003,0.005,bad
1493740851,0.0001,-0.0003,0.0039,bad
1493740852,-0.0006,0.0003,0.0046,bad
1493740853,0.0001,0.0,0.0048,bad
Output:
0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,group_timestamp,label,x_abs_max,x_abs_min,x_mean,x_median,x_raw_max,x_raw_min,x_stdev,y_abs_max,y_abs_min,y_mean,y_median,y_raw_max,y_raw_min,y_stdev,z_abs_max,z_abs_min,z_mean,z_median,z_raw_max,z_raw_min,z_stdev
# data ... ,,,,,,,,,,,,,,,,,,,,,,,
# data ... ,,,,,,,,,,,,,,,,,,,,,,,
How do I fix this?
Additionally: could you help me simplify the code?
The problem is that, for each loop iteration of the groupby, the values need to be appended to a row list, and each row then appended to a rows list outside the loop; the resulting nested lists can be passed to the DataFrame constructor in the last step:
# added for nested lists (outside the loops)
rows = []
df['dt'] = pd.to_datetime(df['datetime'], unit='s')
for group_name, g in df.groupby(pd.Grouper(freq='10s', key='dt')):
    # added: one row list per loop iteration
    row = []
    print(f'Start time {group_name} has {len(g)} records within 10 secs')
    group_timestamp = group_name
    label = label
    x = g['x'].head(50)
    x_mean = x.mean()
    ....
    row.append(z_abs_max)
    rows.append(row)

# DataFrame built outside the loops
group_df = pd.DataFrame(rows, columns=column_names)
print(group_df)
Your solution can be improved with GroupBy.agg:
# custom aggregate functions
def std_dev(x):
    return statistics.stdev(x)

def abs_min(x):
    return x.abs().min()

def abs_max(x):
    return x.abs().max()

d = ['mean', 'median', std_dev, 'min', 'max', abs_min, abs_max]
cols = ['x', 'y', 'z']

# filter to the first 50 rows of each group
df[cols] = df.groupby(pd.Grouper(freq='10s', key='dt'))[cols].head(50)

# aggregate functions
group_df = df.groupby(pd.Grouper(freq='10s', key='dt'))[cols].agg(d)
group_df.columns = group_df.columns.map('_'.join)
print(group_df)
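On pandas 0.25+ the same result can also be had with named aggregation, which yields the flat column names directly instead of joining a MultiIndex afterwards. A sketch reusing the std_dev/abs_max helpers and the 10-second grouper from above:

# Same grouping as above; each keyword becomes a flat output column name.
group_df = df.groupby(pd.Grouper(freq='10s', key='dt')).agg(
    x_mean=('x', 'mean'),
    x_median=('x', 'median'),
    x_stdev=('x', std_dev),
    x_abs_max=('x', abs_max),
    # ...repeat the pattern for the remaining x, y and z statistics
)
print(group_df)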

DashTable not updating with DatePickerSingle input in Callback

I am pretty new to Dash and I have tried to read as much as I can to understand what the issue might be. In a nutshell, I have a single date picker that serves as input to both the DataTable and the Graph callbacks. The graph callback works fine, so it is just the DataTable that is causing problems. I also tried a single-input, multiple-output callback, but that didn't work. My code is below:
app = JupyterDash()
folder = os.getcwd()
portfolio_returns_table = pd.read_csv(Path(folder, 'portfolioreturns_maria.csv'),
                                      parse_dates=[0])
portfolio_returns_table = portfolio_returns_table.set_index('Unnamed: 0')
name_portfolioID_table = pd.read_csv(Path(folder, 'name_portfolioID.csv'))

# Calculate portfolio cumulative returns
df_cumret = (portfolio_returns_table + 1).cumprod().round(5)
df_cumret.index = pd.to_datetime(df_cumret.index)

app.layout = html.Div(html.Div([
    dcc.DatePickerSingle(
        id='my-date-picker-single',
        min_date_allowed=dt.date(df_cumret.index.min()),
        max_date_allowed=dt.date(df_cumret.index.max()),
        initial_visible_month=dt.date(df_cumret.index.max()),
        date=dt.date(df_cumret.index.max()),
        display_format='Y-MM-DD',
        clearable=True),
    html.Div(id='output-container-date-picker-single'),
    html.Div(dash_table.DataTable(
        id='data_table',
        data={},
        fixed_rows={'headers': True},
        style_cell={'textAlign': 'left'},
        style_table={'height': 400})),
    html.Div(dcc.Graph('my_graph'))
]))

@app.callback([Output('data_table', 'data'), Output('data_table', 'columns')],
              [Input('my-date-picker-single', 'date')])
def update_leader_table(date):
    # Get data for the selected date and transpose
    df_T = df_cumret.loc[[date]].T
    # Sort the table to reveal the top leaders
    df_Top = df_T.sort_values(df_T.columns[0], ascending=False)[:10]
    # Convert the index to an integer
    df_Top.index = df_Top.index.astype(int)
    # Generate the leaderboard for the given date
    df_leader = pd.merge(df_Top, name_portfolioID_table,
                         left_index=True, right_index=True, how='left')
    # Create the Rank column
    df_leader['Rank'] = range(1, len(df_leader) + 1)
    df_leader.columns = ['Cum Return', 'Investor', 'Rank']
    df_leader.reset_index(drop=True, inplace=True)
    data = df_leader.to_dict('records')
    columns = [{'id': c, 'name': c, "selectable": True} for c in df_leader.columns]
    return (data, columns)

# callback to link calendar to graph
@app.callback(Output('my_graph', 'figure'),
              [Input('my-date-picker-single', 'date')])
def update_graph(date):
    # date filter
    df_T = df_cumret.loc[:date].T
    # Sort the table to reveal the top leaders & filter for leaderboard
    df_Top = df_T.sort_values(df_T.columns[-1], ascending=False)[:10]
    # Transpose to have date as index
    df_top_graph = df_Top.T
    # set the columns as an integer
    df_top_graph.columns = df_top_graph.columns.astype(int)
    # Rename columns
    df_top_graph.rename(columns=dict(zip(name_portfolioID_table.index,
                                         name_portfolioID_table.name)),
                        inplace=True)
    # Generate graph
    fig = px.line(df_top_graph, x=df_top_graph.index, y=df_top_graph.columns,
                  title='ETF LEADERBOARD PERFORMANCE: ' + date,
                  labels={'Unnamed: 0': 'Date', 'value': 'Cumulative Returns'})
    fig.update_layout(hovermode='x unified')
    fig.update_traces(hovertemplate='Return: %{y} <br>Date: %{x}')
    fig.update_layout(legend_title_text='Investor')
    return fig

if __name__ == '__main__':
    app.run_server(mode='inline', debug=True, port=65398)
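One detail worth checking in the layout above: the DataTable is initialised with data = {}, but the data property of dash_table.DataTable is documented as a list of row dicts, so an empty list is the safer placeholder until the callback fires. A minimal sketch of that one change, with everything else as posted:

html.Div(dash_table.DataTable(
    id='data_table',
    data=[],        # a list of row dicts, filled in by the callback
    columns=[],     # likewise supplied by the callback
    fixed_rows={'headers': True},
    style_cell={'textAlign': 'left'},
    style_table={'height': 400}))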

Resampling and regrouping using pivot table

edited --- code added
I'm trying to group all the values of essaie['Night_Cons(+)'] by day (and by year), but the result just gives me NaN.
colss = {'Date_Time': ['2017-11-10', '2017-11-11', '2017-11-12', '2017-11-13', '2017-11-14',
                       '2017-11-15', '2017-11-16', '2017-11-17', '2017-11-18', '2017-11-19'],
         'Night_Cons(+)': [4470.76, 25465.72, 25465.72, 25465.72, 21480.59,
                           20024.53, 19613.29, 28015.18, 28394.20, 29615.69]}
dataframe = pd.DataFrame(colss, columns=['Date_Time', 'Night_Cons(+)'])
# print(dataframe)
dataframe['Date_Time'] = pd.to_datetime(dataframe['Date_Time'], errors='coerce')

# Create new columns
dataframe['Day'] = dataframe['Date_Time'].dt.day
dataframe['Month'] = dataframe['Date_Time'].dt.month
dataframe['Year'] = dataframe['Date_Time'].dt.year

# Set index
# essaie = essaie.set_index('Date_Time')
dataframe = dataframe[['Night_Cons(+)', 'Day', 'Month', 'Year']]
# dataframe
# daily_data = pd.pivot_table(essaie, values="Night_Cons(+)", columns=["Month"], index="Day")
daily_data = pd.pivot_table(dataframe, values="Night_Cons(+)", columns=["Year"], index="Day")
daily_data = daily_data.reindex(index=['Montag', 'Dienstag', 'Mittwoch', 'Donnerstag',
                                       'Freitag', 'Samstag', 'Sonntag'])
daily_data
DataFrame and results: please see the image in the original post.
Sample:
colss = {'Date_Time': ['2017-11-10', '2017-11-11', '2017-11-12', '2017-11-13', '2017-11-14',
                       '2017-11-15', '2017-11-16', '2017-11-17', '2017-11-18', '2017-11-19'],
         'Night_Cons(+)': [4470.76, 25465.72, 25465.72, 25465.72, 21480.59,
                           20024.53, 19613.29, 28015.18, 28394.20, 29615.69]}
dataframe = pd.DataFrame(colss, columns=['Date_Time', 'Night_Cons(+)'])
First convert the Date_Time column with Series.dt.dayofweek, then pivot, and finally rename the index values:
dataframe['Date_Time'] = pd.to_datetime(dataframe['Date_Time'], errors='coerce')
dataframe['Year'] = dataframe['Date_Time'].dt.year
dataframe['Date'] = dataframe['Date_Time'].dt.dayofweek

daily_data = dataframe.pivot_table(values="Night_Cons(+)",
                                   columns="Year",
                                   index="Date")

days = ['Montag', 'Dienstag', 'Mittwoch', 'Donnerstag', 'Freitag', 'Samstag', 'Sonntag']
daily_data = daily_data.rename(dict(enumerate(days)))
print(daily_data)
Year 2017
Date
Montag 25465.720
Dienstag 21480.590
Mittwoch 20024.530
Donnerstag 19613.290
Freitag 16242.970
Samstag 26929.960
Sonntag 27540.705
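This also explains the all-NaN result in the question: dt.dayofweek yields the integers 0-6 (Monday = 0), so the pivoted index is numeric, and reindexing it with day-name strings matches no labels, filling everything with NaN. rename instead maps the existing integers to names via dict(enumerate(days)):

days = ['Montag', 'Dienstag', 'Mittwoch', 'Donnerstag', 'Freitag', 'Samstag', 'Sonntag']
print(dict(enumerate(days)))
# {0: 'Montag', 1: 'Dienstag', 2: 'Mittwoch', 3: 'Donnerstag',
#  4: 'Freitag', 5: 'Samstag', 6: 'Sonntag'}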

I want to create a time series of monthly means in Pandas

I have a dataframe that consists of hourly data for a whole year. I want to calculate the monthly means and show them in a time series plot. The only variable is NO2.
import pandas as pd

# Cleaning data
ck_2000 = pd.read_csv('2000-CamdenKerbside.csv', header=0, skiprows=4, usecols=range(0, 3),
                      skipfooter=1, na_values='No data', engine='python')
colnames = ['Date', 'Time', 'NO2']
ck_2000.columns = colnames

# Reformat date/time
ck_2000.Time.replace(to_replace='24:00:00', value='00:00:00', inplace=True)
dtw = pd.to_datetime(ck_2000.Date + ck_2000.Time, format='%d/%m/%Y%H:%M:%S')
ck_2000.index = dtw

# Index dataframe by date
firstDate = ck_2000.index[0]
lastDate = ck_2000.index[len(ck_2000.Date) - 1]
ck2000 = ck_2000.reindex(index=pd.date_range(start=firstDate, end=lastDate, freq='1H'),
                         fill_value=None)

# Change data type to float
ck2000['NO2'] = ck2000['NO2'].dropna().astype('int64')

# Interpolation
ck_2000_int = ck_2000.interpolate()

# df's for all months
ck_2000_jan = ck_2000_int['2000-01']
ck_2000_feb = ck_2000_int['2000-02']
ck_2000_mar = ck_2000_int['2000-03']
ck_2000_apr = ck_2000_int['2000-04']
ck_2000_may = ck_2000_int['2000-05']
ck_2000_jun = ck_2000_int['2000-06']
ck_2000_jul = ck_2000_int['2000-07']
ck_2000_aug = ck_2000_int['2000-08']
ck_2000_sept = ck_2000_int['2000-09']
ck_2000_oct = ck_2000_int['2000-10']
ck_2000_nov = ck_2000_int['2000-11']
ck_2000_dec = ck_2000_int['2000-12']
You should be able to use resample.
Consider the following example:
import numpy as np
import pandas as pd

tidx = pd.date_range('2000-01-01', '2000-12-31 23:00', freq='H')
ck_2000_int = pd.DataFrame(dict(NO2=np.random.randn(len(tidx))), tidx)
ck_2000_int.resample('M').mean().plot()
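Applied to the question's own data, this replaces all twelve per-month slices with a single line; a sketch assuming ck_2000_int is the interpolated hourly frame with the NO2 column from the question:

# Monthly mean NO2 as one time series (index = month-end timestamps).
monthly_no2 = ck_2000_int['NO2'].resample('M').mean()
monthly_no2.plot(title='Monthly mean NO2, 2000')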
