I have the below two functions :
def create_base_df(start_date, end_date):
base_df = pd.DataFrame({"dt": pd.date_range(start_date, end_date)})
base_df["dt_num_key"] = base_df.dt.apply(lambda x: datetime.datetime.strftime(x, "%Y%m%d")).astype(int)
base_df["cal_yr_nkey"] = base_df.dt.dt.strftime("%Y")
base_df["cal_mon_ofyr_nkey"] = base_df.dt.dt.strftime("%m")
base_df["cal_qtr_ofyr_nkey"] = base_df.dt.dt.quarter.astype(str).apply(lambda x: x.rjust(2, '0'))
base_df["cal_wk_ofyr_nkey"] = base_df.dt.dt.week.astype(str)
return base_df
def month_operations(df):
df["cal_mon_nm"] = df.dt.dt.strftime("%B")
df["cal_mon_shrt_nm"] = df.dt.dt.strftime("%b")
df["cal_yr_mon_nkey"] = df["cal_yr_nkey"] + df["cal_mon_ofyr_nkey"]
df["mon_seq_id"] = df.cal_yr_mon_nkey.sort_values().reset_index() ["cal_yr_mon_nkey"].rank(method='dense').astype(int)
df["dt_frst_dayof_mon"] = df.dt.apply(lambda x: datetime.datetime(x.year, x.month, 1))
df["dt_frst_dayof_mon_nkey"] = df["dt_frst_dayof_mon"].dt.strftime("%Y%m%d")
df["dt_lst_dayof_mon"] = df["dt_frst_dayof_mon"] + pd.tseries.offsets.DateOffset(
months=1) - pd.tseries.offsets.DateOffset(days=1)
df["dt_lst_dayof_mon_nkey"] = df["dt_lst_dayof_mon"].dt.strftime("%Y%m%d")
df["dt_frst_dayof_lst_mon"] = df["dt_frst_dayof_mon"] - pd.DateOffset(months=1)
df["dt_frst_dayof_lst_mon_nkey"] = df["dt_frst_dayof_lst_mon"].dt.strftime("%Y%m%d")
df["dt_lst_mon"] = df.dt - pd.tseries.offsets.DateOffset(months=1)
df["dt_lst_mon_nkey"] = df["dt_lst_mon"].dt.strftime("%Y%m%d")
df["dt_lst_yr_lst_mon"] = df.dt_lst_mon - pd.tseries.offsets.DateOffset(years=1)
df["dt_lst_yr_lst_mon_nkey"] = df["dt_lst_yr_lst_mon"].dt.strftime("%Y%m%d")
return df
The columns dt_lst_yr_lst_mon_nkey, dt_lst_mon_nkey and dt_frst_dayof_lst_mon_nkey are returning values in datetime format ('1899-12-01 00:00:00') and I cant seem to figure out why. All the other *key columns return integers as expected
my main looks like below:
base_df = create_base_df(start_date="01/01/1900", end_date="01/12/1900")
month_df = month_operations(base_df)
The expected output : if the value of dt_lst_yr_lst_mon is "1900-12-01 00:00:00" then dt_lst_yr_lst_mon_nkey will be "19001201"
Any pointers on where I am going wrong is appreciated.
Thanks.
Related
Wondering if any experienced pandas users can point me along the way? For the following code, python doesn't accept that df is defined. Output -> "NameError: name 'df' is not defined"
It seems like maybe there is a merge and/or replace function required to setup df, but what I've tested has not been successful.
Thankful for all feedback and/or suggestions!
TICKERS = ['A', 'AA', 'AAPL', 'ABNB', 'ADBE','AMAT', 'AMD', 'AMC', 'AMGN', 'AMZN']
LOOK_BACK_PERIOD = 100
def last_business_day():
test_date = date.today()
diff = 1
if test_date.weekday() == 0:
diff = 3
elif test_date.weekday() == 6:
diff = 2
else:
diff = 1
res = test_date - timedelta(days=diff)
return str(res)
def get_symbol_prices(symbol, start_date, end_date):
session = requests.Session()
request = f"https://financialmodelingprep.com/api/v3/historical-price-full/{symbol}\
?apikey=YOURAPI\
&from={start_date}&to={end_date}".replace(" ", "")
r = session.get(request)
if r.status_code == requests.codes.ok:
df = pd.io.json.read_json(r.text)
if not df.empty:
df = pd.DataFrame(df['historical'].to_list())
df['date'] = pd.to_datetime(df['date'])
df = df.set_index('date').sort_index()
return df
def prepare_data(symbol, look_back_period):
start_date = date.today() - timedelta(days=look_back_period)
end_date = date.today()
prices_df = get_symbol_prices(symbol=symbol, start_date=start_date, end_date=end_date)
prices_df = prices_df[['open', 'high', 'low', 'close', 'volume']]
# {missing merge function here???}
df.index.name = 'datetime'
df['symbol'] = symbol
return df
def get_final_df(tickers, look_back_period):
df = pd.DataFrame()
for symbol in tickers:
df = df.append(prepare_data(symbol=symbol, look_back_period=look_back_period))
# {missing replace function here???}
return df.loc[last_business_day()]
def main():
historical_df = get_final_df(tickers=TICKERS, look_back_period=LOOK_BACK_PERIOD)
output_folder = 'E:/'
file_name = 'HISTORICALPORTFOLIO.csv'
historical_df.to_csv(os.path.join(output_folder, file_name))
if __name__ == '__main__':
main()
I am trying to do a function where I check if a date is in my excel file, and if unfortunately it is not. I retrieve the date before.
I succeeded with the after date and here is my code.
Only with the date before, I really can't do it.
i tried this for the day before:
def get_all_dates_between_2_dates_with_special_begin_substraction(Class, date_départ, date_de_fin, date_debut_analyse, exclus=False):
date_depart = date_départ
date_fin = date_de_fin
result_dates = []
inFile = "database/Calendar_US_Target.xlsx"
inSheetName = "Sheet1"
df =(pd.read_excel(inFile, sheet_name = inSheetName))
date_depart = datetime.datetime.strptime(date_depart, '%Y-%m-%d')
date_fin = datetime.datetime.strptime(date_fin, '%Y-%m-%d')
date_calcul_depart = datetime.datetime.strptime(date_debut_analyse, '%Y-%m-%d')
var_date_depart = date_depart
time_to_add = ""
if (Class.F0 == "mois"):
time_to_add = relativedelta(months=1)
if (Class.F0 == "trimestre"):
time_to_add = relativedelta(months=3)
if (Class.F0 == "semestre"):
time_to_add = relativedelta(months=6)
if (Class.F0 == "année"):
time_to_add = relativedelta(years=1)
while var_date_depart <= date_fin:
-------------------------------------------------------------
df['mask'] = (var_date_depart <= df['TARGETirs_holi']) # daybefore
print(df.head())
print(df[df.mask =="True"].head(1)) #want to check the last true value
------------------------------------------------------------------------------
if (result >= date_calcul_depart):
result = (str(result)[0:10])
result = result[8:10] + "/" + result[5:7] + "/" + result[0:4]
result_dates.append(str(result))
var_date_depart = var_date_depart + time_to_add
if (exclus == True):
result_dates = result_dates[1:-1]
return(result_dates)
I want to say, do a column (or a dataframe) where the first date is true where the first date smaller than the second then i take the last value who is true.
for example:
I have this array [12-05-2022,15-05-2022,16-05-2022 and 19-05-2022]
if i put 15-05-2022, it gives me 15-05-2022, but if i put 18-05-2022, its gives me 16-05-2022
Thanks!
I tried to convert it over like this but it still doesn't work as intended.
ts = pd.Timestamp('2022-01-02T12')
ts_utc = ts.replace(tzinfo=timezone.utc)
x = pd.Timestamp.utcnow()
ts_delta = x - ts_utc
ts_new = ts_delta.total_seconds()
time_yesterday = ts - pd.Timedelta(days=1)
ts_y_utc = time_yesterday.replace(tzinfo=timezone.utc)
ts_y_delta = x - ts_y_utc
ts_y_new = ts_y_delta.total_seconds()
This code works and returns UTC seconds from the Pandas Timestamp.
ts = pd.Timestamp('2022-01-02T12')
timestamp_og = time.mktime(ts.timetuple())
dt = datetime.fromtimestamp(timestamp_og)
timestamp = dt.replace(tzinfo=timezone.utc).timestamp()
time_yesterday = ts - pd.Timedelta(hours=1)
timestamp2_og = time.mktime(time_yesterday.timetuple())
dt_2 = datetime.fromtimestamp(timestamp2_og)
timestamp2 = dt_2.replace(tzinfo=timezone.utc).timestamp()
This is a snippet of my code. I need to clean up one day of older data. how do we do that for a dictionary of dataframes?
master_train_dict = {}
for id in list_of_id:
temp_df = df.loc[df["id"] == id].copy(deep=False)
temp_df.drop('id', axis=1, inplace=True)
temp_df.reset_index(drop=True, inplace=True)
alert_list = list(temp_df["title"])
train_embedding = get_embeddings(alert_list, model)
temp_df["train_embedding"] = train_embedding
master_train_dict[parent_id] =
temp_df[["title","train_embedding","#timestamp"]]
#master_train_dict[parent_id] = temp_df
global master_dict
master_dict = master_train_dict
print(master_dict)
#clean up function
if len(master_dict)>0:
d = datetime.today() - timedelta(hours=1, minutes= 0)
master_dict=master_dict[id]['#timestamp']>d.strftime("%Y-%m-%d %H:%M:%S")
print(master_dict)
Consider working in defined methods and use groupby for building list or dict of subsetted data frames. Then call functions via dictionary comprehensions.
def build_df(sub):
sub_df.drop('id', axis=1, inplace=True)
sub_df.reset_index(drop=True, inplace=True)
alert_list = list(sub_df["title"])
train_embedding = get_embeddings(alert_list, model)
sub_df["train_embedding"] = train_embedding
sub_df = sub_df.reindex(["title","train_embedding","#timestamp"], axis="columns")
return sub_df
master_train_dict = {i:build_df(g) for i, g in df.groupby(["id"])}
def clean_df(df):
d = datetime.today() - timedelta(hours=1, minutes= 0)
df = df[df['#timestamp'] > d.strftime("%Y-%m-%d %H:%M:%S")]
return df
clean_master_train_dict = {k:clean_df(v) for k, v in master_train_dict.items()}
I have a dataframe that consists of hourly data for a whole year. I want to calculate the monthly means and show them in a time series plot. I have one variable which is NO2 values.
#Cleaning data
ck_2000 = pd.read_csv('2000-CamdenKerbside.csv', header=0,skiprows=4,usecols=range(0,3),skipfooter = 1, na_values = 'No data',engine = 'python')
colnames = ['Date', 'Time', 'NO2']
ck_2000.columns = colnames
#Reformat date/time
ck_2000.Time.replace(to_replace = '24:00:00', value = '00:00:00', inplace = True)
dtw = pd.to_datetime(ck_2000.Date + ck_2000.Time,format='%d/%m/%Y%H:%M:%S')
ck_2000.index = dtw
#Index dataframe by date
firstDate = ck_2000.index[0]
lastDate = ck_2000.index[len(ck_2000.Date) - 1]
ck2000 = ck_2000.reindex(index=pd.date_range(start = firstDate, end =lastDate, freq = '1H'), fill_value= None)
#Change data type to float
ck2000['NO2'] = ck2000['NO2'].dropna().astype('int64')
#Interpolation
ck_2000_int = ck_2000.interpolate()
#df's for all months
ck_2000_jan = ck_2000_int['2000-01']
ck_2000_feb = ck_2000_int['2000-02']
ck_2000_mar = ck_2000_int['2000-03']
ck_2000_apr = ck_2000_int['2000-04']
ck_2000_may = ck_2000_int['2000-05']
ck_2000_jun = ck_2000_int['2000-06']
ck_2000_jul = ck_2000_int['2000-07']
ck_2000_aug = ck_2000_int['2000-08']
ck_2000_sept = ck_2000_int['2000-09']
ck_2000_oct = ck_2000_int['2000-10']
ck_2000_nov = ck_2000_int['2000-11']
ck_2000_dec = ck_2000_int['2000-12']
you should be able to use resample
Consider the following example
tidx = pd.date_range('2000-01-01', '2000-12-31 23:00', freq='H')
ck_2000_int = pd.DataFrame(dict(NO2=np.random.randn(len(tidx))), tidx)
ck_2000_int.resample('M').mean().plot()