Resampling and regrouping using pivot table - python

edited --- code added
I'm trying to group all the values of the dataframe essaie['night_cons'] by day (and by year), but the result contains nothing but NaN.
# Question script: build a small frame of daily readings, derive calendar
# columns from the date, pivot by day/year and relabel the rows.
colss = {'Date_Time': ['2017-11-10','2017-11-11','2017-11-12','2017-11-13', '2017-11-14', '2017-11-15', '2017-11-16', '2017-11-17', '2017-11-18', '2017-11-19'],
'Night_Cons(+)': [4470.76,25465.72,25465.72,25465.72, 21480.59, 20024.53, 19613.29, 28015.18, 28394.20, 29615.69]
}
dataframe = pd.DataFrame(colss, columns = ['Date_Time', 'Night_Cons(+)'])
#print (dataframe)
# errors='coerce' turns unparseable date strings into NaT instead of raising.
dataframe['Date_Time'] = pd.to_datetime(dataframe['Date_Time'], errors = 'coerce')
# Create new columns
# 'Day' is the day of the month (an integer), not the weekday.
dataframe['Day'] = dataframe['Date_Time'].dt.day
dataframe['Month'] = dataframe['Date_Time'].dt.month
dataframe['Year'] = dataframe['Date_Time'].dt.year
# Set index
#essaie = essaie.set_index('Date_Time')
# Keep only the value column and the derived calendar columns.
dataframe = dataframe[['Night_Cons(+)', 'Day', 'Month', 'Year']]
#dataframe
#daily_data = pd.pivot_table(essaie, values = "Night_Cons(+)", columns = ["Month"], index = "Day")
# Rows: day of month, columns: year, cells: aggregated 'Night_Cons(+)'.
daily_data = pd.pivot_table(dataframe, values = "Night_Cons(+)", columns = ["Year"], index = "Day")
# NOTE(review): the pivot index holds integer day-of-month values, so none of
# these weekday-name labels exist in it; reindex() then produces rows that are
# entirely NaN, which matches the symptom described in the question.
daily_data = daily_data.reindex(index = ['Montag','Dienstag','Mittwoch', 'Donnerstag', 'Freitag', 'Samstag', 'Sonntag'])
daily_data
DataFrame and Results
please see the image below.

Sample:
# Sample data: ten consecutive daily readings of night-time consumption.
colss = {
    'Date_Time': [
        '2017-11-10', '2017-11-11', '2017-11-12', '2017-11-13', '2017-11-14',
        '2017-11-15', '2017-11-16', '2017-11-17', '2017-11-18', '2017-11-19',
    ],
    'Night_Cons(+)': [
        4470.76, 25465.72, 25465.72, 25465.72, 21480.59,
        20024.53, 19613.29, 28015.18, 28394.20, 29615.69,
    ],
}
# Column order is given explicitly so it does not depend on dict ordering.
dataframe = pd.DataFrame(data=colss, columns=['Date_Time', 'Night_Cons(+)'])
First derive the weekday number from the date column with Series.dt.dayofweek, then pivot, and finally rename the index values:
# Parse the date strings once; values that cannot be parsed become NaT.
timestamps = pd.to_datetime(dataframe['Date_Time'], errors='coerce')
dataframe['Date_Time'] = timestamps
dataframe['Year'] = timestamps.dt.year
# dayofweek numbers the weekdays 0 (Monday) through 6 (Sunday).
dataframe['Date'] = timestamps.dt.dayofweek

# Rows: weekday number, columns: year, cells: aggregated consumption.
daily_data = pd.pivot_table(
    dataframe,
    values="Night_Cons(+)",
    index="Date",
    columns="Year",
)

# Replace the weekday numbers in the index with their German names.
days = ['Montag', 'Dienstag', 'Mittwoch', 'Donnerstag', 'Freitag', 'Samstag', 'Sonntag']
weekday_names = {number: name for number, name in enumerate(days)}
daily_data = daily_data.rename(index=weekday_names)
print(daily_data)
Year 2017
Date
Montag 25465.720
Dienstag 21480.590
Mittwoch 20024.530
Donnerstag 19613.290
Freitag 16242.970
Samstag 26929.960
Sonntag 27540.705

Related

pandas merge dataframes with same columns and reference its id

I have a problem with Python pandas. I have several different dataframes which I want to split up into an SQLite database. My first dataframe is to_country:
# Build the Country table: one row per distinct country name, keyed by a
# 1-based surrogate Id, and append it to the 'Country' SQL table.
to_country = df[['country']]
to_country = to_country.rename(columns={'country': 'Name'})
to_country = to_country.drop_duplicates()
# Add a 1..N Id column in first position.
to_country.insert(0, 'Id', range(1, 1 + len(to_country)))
to_country = to_country.reindex(columns=['Id', 'Name'])
# Id becomes the index; index=True below writes it out as a column.
to_country = to_country.set_index('Id')
to_country.to_sql('Country', con=con, if_exists='append', index=True)
This part works fine.
Now i have another Dataframe to_state which looks like that:
# State table: distinct (state, country) pairs, with the state column renamed
# to 'Name' and a 1-based Id used as the index.
to_state = (
    df[['state', 'country']]
    .rename(columns={'state': 'Name'})
    .drop_duplicates()
)
to_state.insert(0, 'Id', range(1, len(to_state) + 1))
to_state = to_state.reindex(columns=['Id', 'Name', 'country']).set_index('Id')
Now I want to replace the country (e.g. USA) with its Id from the previous dataframe, so that it looks like this:
Note the CountryId should be the attribute Id from the dataframe to_country
Id___Name___CountryId
1_____CA_________1
I tried the following statement, but it did not produce that result:
# Join each state row to the country row whose 'Name' equals its 'country'.
# NOTE(review): 'Id' is the index of both frames at this point (each had
# set_index('Id') applied), and a column-on-column merge ignores the indexes
# of its inputs, so neither Id appears in the result. Calling reset_index()
# on both frames before merging would keep them as columns.
to_state = pd.merge(to_state, to_country, left_on='country', right_on="Name")
I really do not know how I should solve this. What is even more irritating is that I don't know why the Id columns from both dataframes disappear.
As I don't have your example dataframe, test this
import pandas as pd
# Example frames standing in for the asker's data.
to_country = pd.DataFrame({"id": [20, 30],
                           "country": ['USA', 'CHINA']})
to_state = pd.DataFrame({"id": [90, 80],
                         "state": ['CA', 'AB'],
                         "country": ['USA', 'CHINA']})
print(f'__________ORIGINALS DATAFRAMES__________ \n##STATE##\n{to_state}\n\n###COUNTRY###\n{to_country}')


def func(line):
    """Return the ``id`` of the ``to_country`` row matching ``line['country']``.

    Args:
        line: A row of ``to_state`` (or any mapping with a ``'country'`` key).

    Returns:
        The ``id`` value of the first ``to_country`` row with that country.

    Raises:
        IndexError: If no row of ``to_country`` has that country.
    """
    # The mask must be built from to_country itself: a mask built from
    # to_state only selects the right row when both frames happen to list
    # the countries in the same order.
    matches = to_country.loc[to_country['country'] == line['country'], 'id']
    return matches.tolist()[0]


print(f'\n_________FINAL DATAFRAME__________\n')
# axis=1 calls func once per row of to_state.
to_state['ID_NEW_country'] = to_state.apply(func, axis=1)
print(f' \n{to_state}')
I solved it like that in the end:
# --- Countries ---------------------------------------------------------------
# Distinct country names, renumbered with a 1-based ID, appended to 'Country'.
to_country = (
    df[['country']]
    .rename(columns={'country': 'Name'})
    .drop_duplicates()
    .reset_index()
    .rename(columns={"index": "ID"})
)
# Overwrite the old row labels with consecutive IDs starting at 1.
to_country['ID'] = to_country.index + 1
to_country.set_index('ID').to_sql('Country', con=con, if_exists='append', index=True)

# --- States ------------------------------------------------------------------
# Distinct (state, country) pairs, renumbered the same way.
to_state = (
    df[['state', 'country']]
    .drop_duplicates()
    .reset_index()
    .rename(columns={"index": "ID", 'state': 'Name'})
)
to_state['ID'] = to_state.index + 1
# Look up each state's country row; overlapping column names get _x/_y
# suffixes, and the country name columns are dropped once the ID is attached.
merged = to_state.merge(to_country, how='left', left_on='country', right_on='Name')
to_state = merged.drop(['country', 'Name_y'], axis=1)
print(to_state)
to_state = to_state.rename(columns={'ID_x': 'ID', 'Name_x': 'Name', 'ID_y': 'Country_ID'})
print(to_state)
to_state.set_index('ID').to_sql('State', con=con, if_exists='append', index=True)

Pandas how to search one df for a certain date and return that data

I have two data frames. For each row of the User.csv file I am trying to look up its date in the Raven.csv file, and then return the Price from df1 together with the date and amount from df2.
This works, but my Price comes back as a value like [[0.11465]]. Is there a way to remove these brackets, or a better way to do this?
import pandas as pd
# Question script: for every row of User.csv, look up the price recorded in
# Raven.csv for the same calendar date and write date/price/payout/total to
# out.csv.
df1 = pd.read_csv('Raven.csv',)
df2 = pd.read_csv('User.csv')
# Turn the row labels into an explicit 'index' column, then name all columns.
df1 = df1.reset_index(drop=False)
df1.columns = ['index', 'Date', 'Price']
# Reduce both date columns to plain dates so they can be compared with ==.
df2['Timestamp'] = pd.to_datetime(df2['Timestamp'], format="%Y-%m-%d %H:%M:%S").dt.date
df1['Date'] = pd.to_datetime(df1['Date'], format="%Y-%m-%d").dt.date
# Looper is a manual position counter into df2, advanced once per iteration.
Looper = 0
Date = []
Price = []
amount = []
total_value = []
# NOTE(review): the indentation of the loop body is missing in this listing;
# the lines from `search = ...` through `Looper = Looper + 1` presumably
# belong to the loop.
for x in df2['Timestamp']:
search = df2['Timestamp'].values[Looper]
Date.append(search)
# Selecting with the list ['index'] yields a DataFrame, not a scalar.
price =(df1.loc[df1['Date'] == search,['index']] )
# NOTE(review): indexing the Price array with that DataFrame is presumably
# what produces the nested [[...]] value described in the question.
value = df1['Price'].values[price]
Price.append(value)
payout = df2['Amount'].values[Looper]
amount.append(payout)
payout_value = value * payout
total_value.append(payout_value)
Looper = Looper + 1
# NOTE(review): this name shadows the built-in `dict` for the rest of the script.
dict = {'Date': Date, 'Price': Price, 'Payout': amount, "Total Value": total_value}
df = pd.DataFrame(dict)
df.to_csv('out.csv')
You can do indexing to get the value:
# Unwrap a doubly nested single-element list by indexing one level at a time.
nested = [[0.11465]]
inner = nested[0]
value = inner[0]
print(value)
You get:
0.11465
I hope this is what you need.

Add new column to DataFrame with same default value

I would like to add a name column based on the 'lNames' list, but my code overwrites the whole column on every iteration, so only the last name survives:
import pandas as pd
def consulta_bc(codigo_bcb):
    """Fetch one series from the api.bcb.gov.br JSON endpoint, indexed by date.

    Args:
        codigo_bcb: Series code substituted into the request URL.

    Returns:
        A DataFrame read from the JSON response, with its 'data' column
        parsed as day-first dates and used as the index.
    """
    endpoint = 'http://api.bcb.gov.br/dados/serie/bcdata.sgs.{}/dados?formato=json'
    serie = pd.read_json(endpoint.format(codigo_bcb))
    serie['data'] = pd.to_datetime(serie['data'], dayfirst=True)
    return serie.set_index('data')
# Series codes to download and, position by position, the name for each one.
lCodigos = [12, 11, 1, 21619, 21623, 12466]
lNames = ['CDI', 'SELIC', 'USD', 'EUR', 'GPB', 'IMAB']
iter_len = len(lCodigos)
# Accumulator for all downloaded series.
saida = pd.DataFrame()
# NOTE(review): the indentation of the loop body is missing in this listing;
# the two lines after the `for` presumably belong to the loop.
for i in range(iter_len):
saida = saida.append(consulta_bc(lCodigos[i]))
# This assigns the current name to the 'nome' column of *every* row collected
# so far, not only the rows just appended, so the last name ends up everywhere.
saida['nome']= lNames[i]
# Written with ';' as separator and the index included.
saida.to_csv('Indice', sep=';', index=True)
saida
Any help will be fully appreciated
Change the for loop in this way:
# Tag each downloaded series with its name *before* adding it to the result,
# so the label only touches the rows of that series.
for codigo, nome in zip(lCodigos, lNames):
    df = consulta_bc(codigo)
    df['nome'] = nome
    # DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
    saida = pd.concat([saida, df])

How to groupby specifically datetime index in a multiindex column by month

The data frame shows, for each date, the amount of imports and exports, further split into coastal and regional data, for every day of one month.
What I wish to achieve is to sum all of this data (one month in this case), so that the result has a single entry, dated at the month end, containing the totals of all the corresponding fields.
This is my code so far:
# Load the data, clean it, and build daily and monthly aggregates of it.
df = pd.read_csv('output.csv',
                 encoding="utf-8", skipinitialspace=True, engine='python')
datadf = df
# Drop every row that has at least one missing value.
datadf = datadf.dropna(axis=0, how='any')
# An explicit unit is required here: recent pandas versions reject the
# unit-less 'datetime64' dtype in astype().
datadf = datadf.astype({'ForeignType': 'category', 'ImportType': 'category',
                        'ArrDate': 'datetime64[ns]', 'DepDate': 'datetime64[ns]'})
# datadf = datadf.groupby(datadf['ArrDate'].dt.strftime('%B'))['ComoQty'].sum()

# Total ComoQty per (arrival date, import type, foreign type) combination.
datadf1 = datadf.groupby(['ArrDate', 'ImportType', 'ForeignType'])['ComoQty'].sum()
datadf2 = datadf1.to_frame()
datadf2.fillna(value=0, inplace=True)
# datadf2 = datadf2.reset_index('ImportType')
# datadf2 = datadf2.reset_index('ForeignType')
# datadf2 = datadf2.reset_index('ArrDate')
datadf2

# From here on datadf1 is rebound to the cleaned frame without the
# 'Unnamed: 0' column; the grouped series above lives on only in datadf2.
datadf1 = datadf.drop(columns='Unnamed: 0')
prac = datadf1
prac = prac.set_index('ArrDate')
prac_dates = prac.copy()
# Daily view: number of ShipName entries and summed ComoQty per calendar day.
prac = prac.resample('D').apply({'ShipName': 'count', 'ComoQty': 'sum'}).reset_index()
# Monthly view: summed ComoQty per month, divided by 1000.
prac_dates = ((prac_dates.resample('M').apply({'ComoQty': 'sum'})) / 1000).reset_index()
# Replace the month-end timestamp with the month's name.
prac_dates['Month'] = pd.DatetimeIndex(prac_dates['ArrDate']).strftime('%B')
del prac_dates['ArrDate']
# prac_dates
prac['Month'] = pd.DatetimeIndex(prac['ArrDate']).strftime('%B')
# prac['Month'] = pd.to_datetime(prac['Month'], format='%B')
# Keep only the two-digit day of the month in ArrDate (as text).
prac['ArrDate'] = pd.DatetimeIndex(prac['ArrDate']).strftime('%d')

I want to create a time series of monthly means in Pandas

I have a dataframe that consists of hourly data for a whole year. I want to calculate the monthly means and show them in a time series plot. I have one variable which is NO2 values.
# Question script: load one year of hourly NO2 readings, index them by
# timestamp, interpolate, and slice out one frame per month.
#Cleaning data
# Skips 4 leading lines and 1 trailing line, keeps the first three columns,
# and treats the text 'No data' as missing.
ck_2000 = pd.read_csv('2000-CamdenKerbside.csv', header=0,skiprows=4,usecols=range(0,3),skipfooter = 1, na_values = 'No data',engine = 'python')
colnames = ['Date', 'Time', 'NO2']
ck_2000.columns = colnames
#Reformat date/time
# NOTE(review): '24:00:00' is rewritten to '00:00:00' without changing the
# Date value, so such rows land at the start of the same day - confirm that
# this is intended.
ck_2000.Time.replace(to_replace = '24:00:00', value = '00:00:00', inplace = True)
# Date and Time strings are concatenated with no separator, matching the format.
dtw = pd.to_datetime(ck_2000.Date + ck_2000.Time,format='%d/%m/%Y%H:%M:%S')
ck_2000.index = dtw
#Index dataframe by date
firstDate = ck_2000.index[0]
lastDate = ck_2000.index[len(ck_2000.Date) - 1]
# The hourly-reindexed frame is stored under a different name: ck2000
# (no underscore).
ck2000 = ck_2000.reindex(index=pd.date_range(start = firstDate, end =lastDate, freq = '1H'), fill_value= None)
# NOTE(review): the non-missing values are cast to int64 here (not float);
# assigning them back aligns on the index, so dropped rows are missing again.
ck2000['NO2'] = ck2000['NO2'].dropna().astype('int64')
#Interpolation
# NOTE(review): this interpolates ck_2000, not the reindexed ck2000 built
# above - confirm which frame was meant.
ck_2000_int = ck_2000.interpolate()
#df's for all months
# Each line selects the rows of one month by partial date string.
# NOTE(review): newer pandas versions presumably need .loc['2000-01'] for this
# kind of row selection - verify against the installed version.
ck_2000_jan = ck_2000_int['2000-01']
ck_2000_feb = ck_2000_int['2000-02']
ck_2000_mar = ck_2000_int['2000-03']
ck_2000_apr = ck_2000_int['2000-04']
ck_2000_may = ck_2000_int['2000-05']
ck_2000_jun = ck_2000_int['2000-06']
ck_2000_jul = ck_2000_int['2000-07']
ck_2000_aug = ck_2000_int['2000-08']
ck_2000_sept = ck_2000_int['2000-09']
ck_2000_oct = ck_2000_int['2000-10']
ck_2000_nov = ck_2000_int['2000-11']
ck_2000_dec = ck_2000_int['2000-12']
you should be able to use resample
Consider the following example
# Hourly timestamps for the whole of 2000, paired with random stand-in values.
tidx = pd.date_range(start='2000-01-01', end='2000-12-31 23:00', freq='H')
ck_2000_int = pd.DataFrame({'NO2': np.random.randn(len(tidx))}, index=tidx)
# Resample to monthly bins, average each bin, and plot the result.
monthly_mean = ck_2000_int.resample('M').mean()
monthly_mean.plot()

Categories