pandas - get a dataframe for every day - python

I have a DataFrame with dates in the index. I make a Subset of the DataFrame for every Day. Is there any way to write a function or a loop to generate these steps automatically?
import json
import requests
import pandas as pd
from pandas.io.json import json_normalize
import datetime as dt
#Get the channel feeds from ThingSpeak
response = requests.get("https://api.thingspeak.com/channels/518038/feeds.json?api_key=XXXXXX&results=500")
#Convert the JSON response body to Python objects
response_data = response.json()
channel_head = response_data["channel"]
channel_bottom = response_data["feeds"]
#Create DataFrame with pandas from the feed entries
df = pd.DataFrame(channel_bottom)
#Rename the generic field columns to the measured parameters
df = df.rename(columns={"field1":"PM 2.5","field2":"PM 10"})
#Drop all entries with at least one NaN
df = df.dropna(how="any")
#Convert the time strings to datetime objects
df["created_at"] = df["created_at"].apply(lambda x:dt.datetime.strptime(x,"%Y-%m-%dT%H:%M:%SZ"))
#Set dates as index
df = df.set_index(keys="created_at")
#Make a DataFrame for every day (one hand-written selection per date)
df_2018_12_07 = df.loc['2018-12-07']
df_2018_12_06 = df.loc['2018-12-06']
df_2018_12_05 = df.loc['2018-12-05']
df_2018_12_04 = df.loc['2018-12-04']
df_2018_12_03 = df.loc['2018-12-03']
df_2018_12_02 = df.loc['2018-12-02']

Supposing that you do that on the first day of the next week (so, exporting Monday to Sunday on the following Monday), you can do it as follows:
from datetime import date, timedelta

today = date.today()
day = today - timedelta(days=7)  # so, if today is monday, we start monday before
# Export one CSV per day for the seven days before today (today itself is
# excluded). df keeps the full data set; each day is selected from it.
while day < today:
    df1 = df.loc[str(day)]
    df1.to_csv('mypath' + str(day) + '.csv')  # so that export files have different names
    day = day + timedelta(days=1)

you can use:
from datetime import date
# today's date as a 'YYYY-MM-DD' string
today = str(date.today())
# keep only the rows selected by that date string (assumes df is indexed by datetimes)
df = df.loc[today]
and schedule the script using any scheduler such as crontab.

You can create dictionary of DataFrames - then select by keys for DataFrame:
# One sub-frame per calendar day, keyed by the day formatted as 'YYYY-MM-DD'
day_labels = df.index.strftime('%Y-%m-%d')
dfs = {label: day_frame for label, day_frame in df.groupby(day_labels)}
print(dfs['2018-12-07'])

Related

Python - Remove lines prior to current month and year

I have a dataframe that contains arrival dates for vessels, and I'd like Python to detect the current year and month and remove all entries that are prior to the current month and year.
I have a column with the date itself in the format '%d/%b/%Y' and columns for month and year separately if needed.
For instance, if today is 01/01/2022. I'd like to remove everything that is from dec/2021 and prior.
Using pandas periods and boolean indexing:
# sample data
df = pd.DataFrame({'date': ['01/01/2022', '08/02/2022', '09/03/2022'], 'other_col': list('ABC')})
# monthly period of every row, and the monthly period of today
row_months = pd.to_datetime(df['date'], dayfirst=False).dt.to_period('M')
current_month = pd.Timestamp('today').to_period('M')
# rows dated in the current month or later
keep = row_months >= current_month
# apply the mask
out = df[keep]
Output:
date other_col
1 08/02/2022 B
2 09/03/2022 C
from datetime import datetime
import pandas as pd
df = ...
# assuming your date column is named 'date'
t = datetime.utcnow()
# Compare against the first day of the current month, so that only entries
# prior to the current month and year are removed (entries from earlier days
# of this month are kept).
df = df[pd.to_datetime(df.date) >= datetime(t.year, t.month, 1)]
Let us consider this example dataframe:
import pandas as pd
import datetime
# Example vessels with arrival dates written as day/abbreviated-month/year
data = [['nao victoria', '21/Feb/2012'], ['argo', '6/Jun/2022'], ['kon tiki', '23/Aug/2022']]
df = pd.DataFrame(data, columns=['Vessel', 'Date'])
You can convert your dates to datetimes, by using pandas' to_datetime method; for instance, you may save the output into a new Series (column):
df['Datetime']=pd.to_datetime(df['Date'], format='%d/%b/%Y')
You end up with the following dataframe:
Vessel Date Datetime
0 nao victoria 21/Feb/2012 2012-02-21
1 argo 6/Jun/2022 2022-06-06
2 kon tiki 23/Aug/2022 2022-08-23
You can then reject rows containing datetime values that are smaller than today's date, defined using datetime's now method:
df = df[df.Datetime > datetime.datetime.now()]
This returns:
Vessel Date Datetime
2 kon tiki 23/Aug/2022 2022-08-23

Remove the weekend days from the event log - Pandas

Could you please help me with the following problem?
I need to remove the weekend days from the dataframe (attached link: dataframe_running_example). I can get a list of all the weekend days between the min and max dates pulled out of the event log; however, I cannot filter the df based on the "list_excluded" list.
from datetime import timedelta, date
import pandas as pd
#Data Loading
df= pd.read_csv("running-example.csv", delimiter=";")
df["timestamp"] = pd.to_datetime(df["timestamp"])
df["timestamp_date"] = df["timestamp"].dt.date
def daterange(date1, date2):
    """Yield every day from date1 to date2, both ends included.

    :param date1: first value of the range (date or datetime-like)
    :param date2: last value of the range; nothing is yielded when it
        lies a full day or more before date1
    """
    for n in range(int((date2 - date1).days) + 1):
        yield date1 + timedelta(n)
#start_dt & end_dt
start_dt = df["timestamp"].min()
end_dt = df["timestamp"].max()
print("Start_dt: {} & end_dt: {}".format(start_dt, end_dt))
# ISO weekday numbers to exclude: 6 = Saturday, 7 = Sunday
weekdays = [6,7]
# Walk whole calendar days (midnight to midnight) so the last day is not
# skipped when end_dt's time of day is earlier than start_dt's.
list_excluded = [day for day in daterange(start_dt.normalize(), end_dt.normalize())
                 if day.isoweekday() in weekdays]
df.info()
# Building the column by name also works when no weekend day was found
df_excluded = pd.DataFrame({'timestamp_excluded': pd.to_datetime(list_excluded)})
df_excluded["ts_excluded"] = df_excluded["timestamp_excluded"].dt.date
# Keep the filtered frame; a bare expression is discarded when run as a script
df_weekdays_only = df[~df["timestamp_date"].isin(df_excluded["ts_excluded"])]
Oh, the issue has been resolved: I used the pd.bdate_range() function.
from datetime import timedelta, date
import pandas as pd
import numpy as np
#Data loading
df= pd.read_csv("running-example.csv", delimiter=";")
df["timestamp"] = pd.to_datetime(df["timestamp"])
df["timestamp_date"] = df["timestamp"].dt.date
#Timestamp range: start_dt & end_dt
start_dt = df["timestamp"].min()
end_dt = df["timestamp"].max()
print("Start_dt: {} & end_dt: {}".format(start_dt, end_dt))
# Business days (Monday to Friday) between the first and last timestamp
bus_days = pd.bdate_range(start_dt, end_dt)
# Back to datetime64 so the values can be compared with the DatetimeIndex above
df["timestamp_date"] = pd.to_datetime(df["timestamp_date"])
df['Is_Business_Day'] = df['timestamp_date'].isin(bus_days)
# The boolean column is used directly as the mask; the result is kept,
# since a bare expression is discarded when run as a script
df_business_days = df[df["Is_Business_Day"]]

Adding rows to pandas dataframe with date range, created_at and today, python

I have a dataframe dataframe consisting of two columns, customer_id and a date column, created_at.
I wish to add another row for each month the customer remains in the customer base.
For example, if the customer_id was created during July, the dataframe would add 4 additional rows for that customer, covering the range between "created_at" and "today". For example: for customer1 I would have 9 rows, one for each month up to today; for customer2, 7 rows; and for customer3, 4 rows. I was thinking of maybe something like what I've copied below, with the idea of merging df with seqDates...
import pandas as pd
import numpy as np
df = pd.DataFrame([("customer1", "05-02-2020"), ("customer2","05-04-2020"), ("customer3","04-07-2020")], index=["1","2","3"], columns= ("customer_id","created_at"))
# The dates are written day-month-year, so state the format explicitly
# instead of letting pandas guess (it would read them month-first).
df["created_at"] = pd.to_datetime(df["created_at"], format="%d-%m-%Y")
# create month expansion column
start = min(df["created_at"])
end = pd.to_datetime("today")
seqDates = pd.date_range(start, end, freq="D")
seqDates = pd.DataFrame(seqDates)
columns = ["created_at"]
Try this:
import pandas as pd
import datetime
from dateutil.relativedelta import relativedelta
from dateutil import rrule, parser
outList = []
operations_date = datetime.datetime.now().date()
dfDict = df.to_dict(orient='records')
for aDict in dfDict:
    # created_at is parsed with strptime, so it must still be a
    # 'dd-mm-YYYY' string here (not an already converted timestamp).
    created_at = aDict['created_at']
    # Step back one month so that the recurrence below, which only emits
    # dates on or after dtstart, still covers the first day of the
    # creation month.
    start_date = (datetime.datetime.strptime(created_at, '%d-%m-%Y').date()
                  - relativedelta(months=1))
    end_date = parser.parse(str(operations_date))
    # One entry per first-of-month from start_date up to today
    date_range = list(rrule.rrule(rrule.MONTHLY, bymonthday=1, dtstart=start_date,
                                  until=end_date))
    for aDate in date_range:
        outList.append({'customer_id' : aDict['customer_id'], 'created_at' : aDate})
df = pd.DataFrame(outList)

Sort by date with Excel file and Pandas

I am trying to sort my Excel file by the date column. When the code runs it turns the cells from a text string to a time date and it sorts, but only within the same month. That is, when I have dates from October and September it completes by the month.
I have been all over Google and YouTube.
import pandas as pd
import datetime
from datetime import timedelta
# timestamp of this run; its month and day go into the output file name
x = datetime.datetime.now()
excel_workbook = 'data.xlsx'
sheet1 = pd.read_excel(excel_workbook, sheet_name='RAW DATA')
# convert the Call_DateTime cells to datetimes
sheet1['Call_DateTime'] = pd.to_datetime(sheet1['Call_DateTime'])
# NOTE(review): the column's values (not its name) are passed as `by`, together
# with axis=1 - this is the sort call the question is about
sheet1.sort_values(sheet1['Call_DateTime'], axis=1, ascending=True, inplace=True)
sheet1['SegmentDuration'] = pd.to_timedelta(sheet1['SegmentDuration'], unit='s')
# NOTE(review): this assigns one constant timedelta to the whole column,
# replacing the values converted on the previous line - confirm this is intended
sheet1['SegmentDuration'] = timedelta(hours=0.222)
# NOTE(review): the object returned here is not kept, so this line presumably
# has no effect on the file written below - confirm
sheet1.style.apply('h:mm:ss', column=['SegmentDuration'])
# write the result to "S4x Output<MM-DD>.xlsx" without the index column
sheet1.to_excel("S4x Output"+x.strftime("%m-%d")+".xlsx", index = False)
print("All Set!!")
I would like it to sort oldest to newest.
Updated code; this works:
import pandas as pd
import datetime
from datetime import timedelta
# timestamp of this run; its month and day go into the output file name
x = datetime.datetime.now()
excel_workbook = 'data.xlsx'
sheet1 = pd.read_excel(excel_workbook, sheet_name='RAW DATA')
# convert the Call_DateTime cells to datetimes
sheet1['Call_DateTime'] = pd.to_datetime(sheet1['Call_DateTime'])
# sort the rows (axis=0) by the Call_DateTime column, oldest first
sheet1.sort_values(['Call_DateTime'], axis=0, ascending=True, inplace=True)
sheet1['SegmentDuration'] = pd.to_timedelta(sheet1['SegmentDuration'], unit='s')
# NOTE(review): this assigns one constant timedelta to the whole column,
# replacing the values converted on the previous line - confirm this is intended
sheet1['SegmentDuration'] = timedelta(hours=0.222)
# NOTE(review): the object returned here is not kept, so this line presumably
# has no effect on the file written below - confirm
sheet1.style.apply('h:mm:ss', column=['SegmentDuration'])
# write the result to "S4x Output<MM-DD>.xlsx" without the index column
sheet1.to_excel("S4x Output"+x.strftime("%m-%d")+".xlsx", index = False)
print("All Set!!")

Get csv into pandas dataframe from current date

I have a problem with automatically importing csv and creating pandas dataframe. The code I've got:
from datetime import time
from datetime import date
from datetime import datetime
import os
import fnmatch
def get_local_file(pdate, hour, path='/apps/dev_data/data/'):
    """Get date+hour processing file from local drive

    :param pdate: str Processing date
    :param hour: str Processing hour
    :param path: str Path to file location
    :return: str Full path of a matching file, or None if no file matches
    """
    sdate = pdate + '-' + str(hour)
    pattern = 'RSRAN098_IP_R*' + sdate + '*.csv'
    for p_file in os.listdir(path):
        if fnmatch.fnmatch(p_file, pattern):
            # os.path.join also copes with a path given without trailing slash
            return os.path.join(path, p_file)
    return None
def get_files(pdate, path='/apps/dev_data/data/'):
    """Collect the processing file of every hour of one day

    :param pdate: str Processing date
    :param path: str Path to file location
    :return: list One entry per hour '00'..'23', each as returned by
        get_local_file for that hour
    """
    hours = [time(i).strftime('%H') for i in range(24)]
    # forward path so that a non-default directory is actually searched
    return [get_local_file(pdate, hour, path) for hour in hours]
import pandas as pd  # used below; not imported anywhere else in this snippet

processing_date = datetime.strptime('20170614', '%Y%m%d').date()
a = get_files(str(processing_date).replace('-', '_'))
print(a)
# get_local_file gives None for hours without a matching file; skip those
list_ = [pd.read_csv(file_, index_col=None, header=0, delimiter=';')
         for file_ in a if file_ is not None]
# concatenate once after all files are read; fall back to an empty frame
# because pd.concat raises on an empty list
frame = pd.concat(list_) if list_ else pd.DataFrame()
The only problem is that I have a fixed date, I can't find a way to put the current date,
you can get current date with datetime module.
replace this
processing_date = datetime.strptime('20170614', '%Y%m%d').date()
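with something like date.today() (date is already imported from datetime in your script; datetime.now().date() is equivalent). Note that datetime.datetime.now() would fail here, because the script imports the datetime class rather than the module.
But maybe I am missing your point, because the answer seems too straightforward.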

Categories