KeyError: 'Date' - python

import pandas as pd
import numpy as np
from nsepy import get_history
import datetime as dt
start = dt.datetime(2015, 1, 1)
end = dt.datetime.today()
infy = get_history(symbol='INFY', start = start, end = end)
infy.index = pd.to_datetime(infy.index)
infy.head()
infy_volume = infy.groupby(infy['Date'].dt.year).reset_index().Volume.sum()
This line raises KeyError: 'Date'. The expectation is that infy_volume should be a multi-index series with two levels of index - Year and Month.

Here the date column is the index, so use
infy.groupby(infy.index.year).Volume.sum().reset_index()
If you want to group by both year and month, use
infy_volume = infy.groupby([infy.index.year, infy.index.month]).Volume.sum()
infy_volume.index = infy_volume.index.rename('Month', level=1)
print(infy_volume)
# infy_volume.reset_index()
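For reference, a minimal sketch with synthetic data (since get_history needs a live NSE connection) of the same two-level grouping, renaming both levels and flattening back to columns:
import pandas as pd
import numpy as np
# Synthetic stand-in for the nsepy result: daily volumes with a DatetimeIndex
idx = pd.date_range('2015-01-01', '2015-12-31', freq='D')
infy = pd.DataFrame({'Volume': np.random.randint(1_000, 10_000, size=len(idx))}, index=idx)
# Group on two levels: year (level 0) and month (level 1)
infy_volume = infy.groupby([infy.index.year, infy.index.month]).Volume.sum()
infy_volume.index = infy_volume.index.set_names(['Year', 'Month'])
# Flatten back to ordinary Year/Month/Volume columns if needed
print(infy_volume.reset_index().head())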

Related

Remove the weekend days from the event log - Pandas

Could you please help me with the following task?
I need to remove the weekend days from the dataframe (attached link: dataframe_running_example). I can build a list of all the weekend days between the min and max dates pulled from the event log, but I cannot filter the df based on the "list_excluded" list.
from datetime import timedelta, date
import pandas as pd
#Data Loading
df= pd.read_csv("running-example.csv", delimiter=";")
df["timestamp"] = pd.to_datetime(df["timestamp"])
df["timestamp_date"] = df["timestamp"].dt.date
def daterange(date1, date2):
    for n in range(int((date2 - date1).days) + 1):
        yield date1 + timedelta(n)
#start_dt & end_dt
start_dt = df["timestamp"].min()
end_dt = df["timestamp"].max()
print("Start_dt: {} & end_dt: {}".format(start_dt, end_dt))
weekdays = [6,7]
#List comprehension
list_excluded = [dt for dt in daterange(start_dt, end_dt) if dt.isoweekday() in weekdays]
df.info()
df_excluded = pd.DataFrame(list_excluded).rename({0: 'timestamp_excluded'}, axis='columns')
df_excluded["ts_excluded"] = df_excluded["timestamp_excluded"].dt.date
df[~df["timestamp_date"].isin(df_excluded["ts_excluded"])]
The issue has been resolved. I used the pd.bdate_range() function.
from datetime import timedelta, date
import pandas as pd
import numpy as np
#Data loading
df= pd.read_csv("running-example.csv", delimiter=";")
df["timestamp"] = pd.to_datetime(df["timestamp"])
df["timestamp_date"] = df["timestamp"].dt.date
#Timestamp range: start_dt & end_dt
start_dt = df["timestamp"].min()
end_dt = df["timestamp"].max()
print("Start_dt: {} & end_dt: {}".format(start_dt, end_dt))
bus_days = pd.bdate_range(start_dt, end_dt)
df["timestamp_date"] = pd.to_datetime(df["timestamp_date"])
df['Is_Business_Day'] = df['timestamp_date'].isin(bus_days)
df[df["Is_Business_Day"]!=False]

manipulate Date from yfinance

When I pull stock data from yfinance, can I create other columns of data that manipulate the 'date' column? I am new to Python and still learning a lot. I have created other columns using the stock price data, but I cannot figure out how to manipulate the 'date' column.
For example, 10/26/2020, I would like to create columns with the following data:
day_of_week, Monday = 1
year = 2020
month = 10
day = 26
week = 44
trade_day = 207
import pandas as pd
import numpy as np
import yfinance as yf
import pandas_datareader as pdr
import datetime as dt
import matplotlib.pyplot as plt
##Get stock price data
ticker = 'NVDA'
#Data time period
now = dt.datetime.now()
startyear = 2017
startmonth=1
startday=1
start = dt.datetime(startyear, startmonth, startday)
#get data from YFinance
df = pdr.get_data_yahoo(ticker, start, now)
#create a column
df['% Change'] = (df['Adj Close'] / df['Adj Close'].shift(1))-1
df['Range'] = df['High'] - df['Low']
df
You want to use the index of your dataframe, which is of type pd.DatetimeIndex.
To split the date into new columns:
new_df = df.copy()
new_df['year'], new_df['month'], new_df['day'] = df.index.year, df.index.month, df.index.day
To carry out arithmetic operations relative to the first trade date:
start_date = df.index.min()
new_df['trade_day'] = df.index.day - start_date.day
new_df['trade_week'] = df.index.week - start_date.week
new_df['trade_year'] = df.index.year - start_date.year
new_df['day_of_week'] = df.index.weekday
new_df['days_in_month'] = df.index.days_in_month
new_df['day_name'] = df.index.day_name()
new_df['month_name'] = df.index.month_name()
To choose another start date:
start_date = pd.to_datetime('2017-01-01')
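Note that DatetimeIndex.week is deprecated and removed in recent pandas releases; a small sketch of an equivalent, continuing from new_df and start_date above, uses isocalendar():
# isocalendar() on a DatetimeIndex returns a DataFrame with year/week/day columns
iso = df.index.isocalendar()
new_df['week'] = iso['week'].to_numpy()
new_df['trade_week'] = iso['week'].to_numpy() - start_date.isocalendar()[1]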
I did figure out most of the problem. I cannot figure out how to calculate the 'trade date'.
#Convert the 'Date' Index to 'Date' Column
df.reset_index(inplace=True)
#Create columns manipulating 'Date'
df['Year'] = df['Date'].dt.year
df['Month'] = df['Date'].dt.month
df['Day'] = df['Date'].dt.day
df['Week of Year'] = df['Date'].dt.isocalendar().week
df['Day of Week'] = df['Date'].dt.dayofweek
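If 'trade_day' is meant as the running count of trading sessions since the first row (an assumption based on the example value 207), a minimal sketch continuing from the reset-index DataFrame above:
import numpy as np
# Each row is one trading session, so the running session count is the row position + 1
df['Trade Day'] = np.arange(1, len(df) + 1)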

Adding rows to pandas dataframe with date range, created_at and today, python

I have a dataframe consisting of two columns, customer_id and a date column, created_at.
I wish to add another row for each month the customer remains in the customer base.
For example, if the customer_id was created during July, the dataframe would add 4 additional rows for that customer, covering the range between "created_at" and "today". For example: for customer1 I would have 9 rows, one for each month up to today, for customer2: 7 rows, and for customer3: 4 rows. I was thinking of maybe something like I've copied below, with the idea of merging df with seqDates...
import pandas as pd
import numpy as np
df = pd.DataFrame([("customer1", "05-02-2020"), ("customer2","05-04-2020"), ("customer3","04-07-2020")], index=["1","2","3"], columns= ("customer_id","created_at"))
df["created_at"] = pd.to_datetime(df["created_at"])
# create month expansion column
start = min(df["created_at"])
end = pd.to_datetime("today")
seqDates = pd.date_range(start, end, freq="D")
seqDates = pd.DataFrame(seqDates)
columns = ["created_at"]
Try this:
import pandas as pd
import datetime
from dateutil.relativedelta import relativedelta
from dateutil import rrule, parser
outList = []
operations_date = datetime.datetime.now().date()
dfDict = df.to_dict(orient='records')
for aDict in dfDict:
    created_at = aDict['created_at']
    # assumes created_at is still the raw 'dd-mm-YYYY' string, i.e. before the pd.to_datetime conversion
    start_date = datetime.datetime.strptime(created_at, '%d-%m-%Y').date() - relativedelta(months=1)
    end_date = parser.parse(str(operations_date))
    date_range = list(rrule.rrule(rrule.MONTHLY, bymonthday=1, dtstart=start_date, until=end_date))
    for aDate in date_range:
        outList.append({'customer_id': aDict['customer_id'], 'created_at': aDate})
df = pd.DataFrame(outList)
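An alternative sketch that stays within pandas (my suggestion, not part of the answer above): build a month-start date range per customer and explode it into rows:
import pandas as pd
df = pd.DataFrame(
    [("customer1", "05-02-2020"), ("customer2", "05-04-2020"), ("customer3", "04-07-2020")],
    columns=("customer_id", "created_at"),
)
df["created_at"] = pd.to_datetime(df["created_at"], dayfirst=True)
today = pd.Timestamp.today()
# One month-start timestamp per month between created_at and today, then explode to rows
df["month"] = df["created_at"].apply(lambda d: pd.date_range(d.replace(day=1), today, freq="MS"))
expanded = df.explode("month").drop(columns="created_at")
print(expanded)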

Sort by date with Excel file and Pandas

I am trying to sort my Excel file by the date column. When the code runs it converts the cells from a text string to a datetime and it sorts, but only within the same month. That is, when I have dates from both October and September, each month is sorted separately.
I have been all over Google and YouTube.
import pandas as pd
import datetime
from datetime import timedelta
x = datetime.datetime.now()
excel_workbook = 'data.xlsx'
sheet1 = pd.read_excel(excel_workbook, sheet_name='RAW DATA')
sheet1['Call_DateTime'] = pd.to_datetime(sheet1['Call_DateTime'])
sheet1.sort_values(sheet1['Call_DateTime'], axis=1, ascending=True, inplace=True)
sheet1['SegmentDuration'] = pd.to_timedelta(sheet1['SegmentDuration'], unit='s')
sheet1['SegmentDuration'] = timedelta(hours=0.222)
sheet1.style.apply('h:mm:ss', column=['SegmentDuration'])
sheet1.to_excel("S4x Output"+x.strftime("%m-%d")+".xlsx", index = False)
print("All Set!!")
I would like it to sort oldest to newest.
Updated the code and this works:
import pandas as pd
import datetime
from datetime import timedelta
x = datetime.datetime.now()
excel_workbook = 'data.xlsx'
sheet1 = pd.read_excel(excel_workbook, sheet_name='RAW DATA')
sheet1['Call_DateTime'] = pd.to_datetime(sheet1['Call_DateTime'])
sheet1.sort_values(['Call_DateTime'], axis=0, ascending=True, inplace=True)
sheet1['SegmentDuration'] = pd.to_timedelta(sheet1['SegmentDuration'], unit='s')
sheet1['SegmentDuration'] = timedelta(hours=0.222)
sheet1.style.apply('h:mm:ss', column=['SegmentDuration'])
sheet1.to_excel("S4x Output"+x.strftime("%m-%d")+".xlsx", index = False)
print("All Set!!")

pandas - get a dataframe for every day

I have a DataFrame with dates in the index. I make a subset of the DataFrame for every day. Is there any way to write a function or a loop to generate these steps automatically?
import json
import requests
import pandas as pd
from pandas.io.json import json_normalize
import datetime as dt
#Get the channel feeds from Thinkspeak
response = requests.get("https://api.thingspeak.com/channels/518038/feeds.json?api_key=XXXXXX&results=500")
#Convert Json object to Python object
response_data = response.json()
channel_head = response_data["channel"]
channel_bottom = response_data["feeds"]
#Create DataFrame with Pandas
df = pd.DataFrame(channel_bottom)
#rename Parameters
df = df.rename(columns={"field1":"PM 2.5","field2":"PM 10"})
#Drop all entries with at least one NaN
df = df.dropna(how="any")
#Convert time to datetime object
df["created_at"] = df["created_at"].apply(lambda x:dt.datetime.strptime(x,"%Y-%m-%dT%H:%M:%SZ"))
#Set dates as Index
df = df.set_index(keys="created_at")
#Make a DataFrame for every day
df_2018_12_07 = df.loc['2018-12-07']
df_2018_12_06 = df.loc['2018-12-06']
df_2018_12_05 = df.loc['2018-12-05']
df_2018_12_04 = df.loc['2018-12-04']
df_2018_12_03 = df.loc['2018-12-03']
df_2018_12_02 = df.loc['2018-12-02']
Supposing that you run this on the first day of the next week (so, exporting Monday through Sunday on the following Monday), you can do it as follows:
from datetime import date, timedelta
today = date.today()
day = today - timedelta(days=7) # so, if today is Monday, we start from the Monday before
while day < today:
    df1 = df.loc[str(day)]
    df1.to_csv('mypath' + str(day) + '.csv') # so that export files have different names
    day = day + timedelta(days=1)
you can use:
from datetime import date
today = str(date.today())
df = df.loc[today]
and schedule the script using any scheduler such as crontab.
You can create a dictionary of DataFrames and then select each day's DataFrame by key:
dfs = dict(tuple(df.groupby(df.index.strftime('%Y-%m-%d'))))
print (dfs['2018-12-07'])
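A short usage sketch (my addition, with a hypothetical feeds_ filename prefix) looping over that dictionary to export one CSV per day:
# Write each day's subset to its own CSV file
for day, frame in dfs.items():
    frame.to_csv("feeds_" + day + ".csv")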
