I have a pandas DataFrame from which I extract local minima and maxima values. It works well so far, but the problem is that the values end up in two separate lists, and I want a single list of price values in chronological (Date) order. How can I merge them by Date?
import pandas as pd
import numpy as np
import yfinance
from scipy.signal import argrelextrema
import matplotlib.dates as mpl_dates
def extract_data():
    ticker = 'GBPJPY=X'
    ticker = yfinance.Ticker(ticker)
    start_date = '2022-09-25'
    end_date = '2022-10-08'
    df = ticker.history(interval='1h', start=start_date, end=end_date)
    df['Date'] = pd.to_datetime(df.index)
    df['Date'] = df['Date'].apply(mpl_dates.date2num)
    df = df.loc[:, ['Date', 'Open', 'High', 'Low', 'Close']]
    # Call function to find min/max extrema
    find_extrema(df)

def find_extrema(df):
    n = 10  # number of points to be checked before and after
    # Find local minima and maxima
    df['min'] = df.iloc[argrelextrema(df.Close.values, np.less_equal,
                                      order=n)[0]]['Close']
    df['max'] = df.iloc[argrelextrema(df.Close.values, np.greater_equal,
                                      order=n)[0]]['Close']
    min_values_list = []
    max_values_list = []
    # Add min values to the list, skipping rows that are not local minima (NaN)
    for item in df['min']:
        if not np.isnan(item):
            min_values_list.append(item)
    # Add max values to the list, skipping rows that are not local maxima (NaN)
    for item in df['max']:
        if not np.isnan(item):
            max_values_list.append(item)
    print(f"Min: {min_values_list}")
    print(f"Max: {max_values_list}")

extract_data()
Option 1
First, use df.to_numpy to convert the min and max columns to a np.array.
Then get rid of all the NaN values by selecting from the array with a boolean mask: np.logical_not applied to np.isnan(arr).
arr = df[['min','max']].to_numpy()
value_list = arr[np.logical_not(np.isnan(arr))].tolist()
print(value_list)
[159.7030029296875,
154.8979949951172,
160.7830047607422,
165.43800354003906,
149.55799865722656,
162.80499267578125,
156.6529998779297,
164.31900024414062,
156.125,
153.13499450683594,
161.3520050048828,
156.9340057373047,
162.52200317382812,
155.7740020751953,
160.98500061035156,
161.83700561523438]
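Note that this comes out in chronological order because boolean indexing flattens a 2D array in C (row-major) order: both cells of an earlier row, i.e. an earlier date, are read before any cell of a later row. A tiny sketch of that behavior:

import numpy as np

a = np.array([[1.0, np.nan],
              [np.nan, 2.0],
              [3.0, np.nan]])
# Boolean masking traverses row by row, so earlier rows come out first
print(a[np.logical_not(np.isnan(a))].tolist())  # [1.0, 2.0, 3.0]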
Option 2
Rather more cumbersome:
n = 10
# get the indices for `min` and `max` in two arrays
_min = argrelextrema(df.Close.values, np.less_equal, order=n)[0]
_max = argrelextrema(df.Close.values, np.greater_equal, order=n)[0]
# create columns (assuming you need this for other purposes as well)
df['min'] = df.iloc[_min]['Close']
df['max'] = df.iloc[_max]['Close']
# create lists for `min` and `max`
min_values_list = df['min'].dropna().tolist()
max_values_list = df['max'].dropna().tolist()
# join the lists
value_list2 = min_values_list + max_values_list
value_idxs = _min.tolist() + _max.tolist()
# finally, sort `value_list2` based on `value_idxs`
value_list2 = [x for _, x in sorted(zip(value_idxs, value_list2))]
# check if result is the same:
value_list2 == value_list
# True
Assuming that you have max and min columns, what about something like this?
df['max_or_min'] = np.where(df['max'].notna(), df['max'], df['min'])
min_max_values = df['max_or_min'].dropna().values.tolist()
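Equivalently, Series.combine_first expresses the same NaN fallback without np.where:

min_max_values = df['max'].combine_first(df['min']).dropna().tolist()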
I'm new to Python and created the following function for calculating the Relative Strength Index (an indicator for stock markets). I have a pandas Series containing averages and want to modify it using values from another Series, where each step needs the previously updated value from the Series itself.
Right now I use a for loop that iterates through all the values in the Series with access to the index, which does the job. But I was wondering whether there is a way to get the same result with an apply method or something similar that would make the code cleaner.
import pandas as pd
from datetime import date
import yfinance as yf
def yahoo_rsi(data: pd.DataFrame, period: int = 14, column: str = "Close"):
    delta = data[column].diff()
    delta = delta[1:]
    up = delta.clip(lower=0)
    down = delta.clip(upper=0).abs()
    avg_ups: pd.Series = up.ewm(span=period).mean()
    avg_downs: pd.Series = down.ewm(span=period).mean()
    alpha: float = 1 / period
    # This is the for loop I would like to make cleaner
    for x in range(period + 1, len(avg_ups)):
        avg_ups[x] = up[x] * alpha + (1 - alpha) * avg_ups[x - 1]
        avg_downs[x] = down[x] * alpha + (1 - alpha) * avg_downs[x - 1]
    rsis: pd.Series = pd.Series(100 - (100 / (1 + avg_ups / avg_downs)), avg_ups.index)
    return rsis

def run():
    ticker = yf.Ticker("AAPL")
    df = ticker.history("1d", "1d", date(2022, 2, 2), date(2022, 8, 2),
                        prepost=True, auto_adjust=False, back_adjust=False)
    yahoo_rsi(df)

run()
I have tried the following, but it did not give the desired result. I believe that is because it uses the old values of the series instead of the updated ones, which the recursion needs.
avg_ups[period+1:] = up[period+1:]*alpha + (1-alpha)*avg_ups[period+1:].shift(1)
avg_downs[period+1:] = down[period+1:]*alpha + (1-alpha)*avg_downs[period+1:].shift(1)
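For what it's worth, the update inside the loop is exactly the exponentially weighted recurrence y[t] = alpha*x[t] + (1-alpha)*y[t-1], which pandas computes natively with ewm(alpha=..., adjust=False). A minimal sketch of that approach (note the seeding differs: ewm starts the recursion at the first observation rather than at index period+1 as the loop does, so roughly the first period values will not match the loop exactly):

def rsi_ewm(data: pd.DataFrame, period: int = 14, column: str = "Close") -> pd.Series:
    delta = data[column].diff()
    up = delta.clip(lower=0)
    down = (-delta).clip(lower=0)
    # adjust=False gives the recursive form y[t] = alpha*x[t] + (1-alpha)*y[t-1]
    avg_ups = up.ewm(alpha=1 / period, adjust=False).mean()
    avg_downs = down.ewm(alpha=1 / period, adjust=False).mean()
    return 100 - (100 / (1 + avg_ups / avg_downs))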
I am attempting to roll up rows from a data set with matching measures into a consolidated row. There are two conditions that must be met for the roll-up:
1. The measures (measure1 through measure5) should remain the same across the rows for them to be rolled up into a single row.
2. The dates should be continuous (no gaps between one row's end_date and the next row's begin_date).
If these conditions are not met, the code should generate a separate row.
This is the sample data that I am using:
id,measure1,measure2,measure3,measure4,measure5,begin_date,end_date
ABC123XYZ789,1,1,1,1,1,1/1/2019,3/31/2019
ABC123XYZ789,1,1,1,1,1,4/23/2019,6/30/2019
ABC123XYZ789,1,1,1,1,1,7/1/2019,9/30/2019
ABC123XYZ789,1,1,1,1,1,10/12/2019,12/31/2019
FGH589J6U88SW,1,1,1,1,1,1/1/2019,3/31/2019
FGH589J6U88SW,1,1,1,1,1,4/1/2019,6/30/2019
FGH589J6U88SW,1,1,1,2,1,7/1/2019,9/30/2019
FGH589J6U88SW,1,1,1,2,1,10/1/2019,12/31/2019
253DRWQ85AT2F334B,1,2,1,3,1,1/1/2019,3/31/2019
253DRWQ85AT2F334B,1,2,1,3,1,4/1/2019,6/30/2019
253DRWQ85AT2F334B,1,2,1,3,1,7/1/2019,9/30/2019
253DRWQ85AT2F334B,1,2,1,3,1,10/1/2019,12/31/2019
The expected result should be:
id,measure1,measure2,measure3,measure4,measure5,begin_date,end_date
ABC123XYZ789,1,1,1,1,1,1/1/2019,3/31/2019
ABC123XYZ789,1,1,1,1,1,4/23/2019,9/30/2019
ABC123XYZ789,1,1,1,1,1,10/12/2019,12/31/2019
FGH589J6U88SW,1,1,1,1,1,1/1/2019,6/30/2019
FGH589J6U88SW,1,1,1,2,1,7/1/2019,12/31/2019
253DRWQ85AT2F334B,1,2,1,3,1,1/1/2019,12/31/2019
I have implemented the code below, which seems to address condition #1, but I am looking for ideas on how to incorporate condition #2 into the solution.
import pandas as pd
import time

startTime = time.time()

data = pd.read_csv('C:\\Users\\usertemp\\Data\\Rollup2.csv')
data['end_date'] = pd.to_datetime(data['end_date'])
data['begin_date'] = pd.to_datetime(data['begin_date'])

data = data.groupby(['id', 'measure1', 'measure2', 'measure3', 'measure4', 'measure5'])[
    ['begin_date', 'end_date']].agg({'begin_date': 'min', 'end_date': 'max'}).reset_index()

print(data)
print("It took %s seconds for the collapse process" % (time.time() - startTime))
Any help is appreciated.
You can do the following.
import pandas as pd
import numpy as np
from datetime import timedelta

# Convert begin_date and end_date to datetime
df['begin_date'] = pd.to_datetime(df['begin_date'], format='%m/%d/%Y')
df['end_date'] = pd.to_datetime(df['end_date'], format='%m/%d/%Y')

# Create a new column containing the previous row's end_date + 1 day
df['end_date_prev'] = df['end_date'].iloc[:-1] + timedelta(days=1)
df['end_date_prev'] = np.roll(df['end_date_prev'], 1)

# Build a counter that increments whenever begin_date does not continue
# directly from the previous row's end_date; continuous runs share a value
df['cont'] = (~(df['begin_date'] == df['end_date_prev'])).astype(int).cumsum()

# Since all measures need to match, combine them into a single string column
df['comb_measure'] = df['measure1'].astype(str).str.cat(
    df[['measure{}'.format(i) for i in range(2, 6)]].astype(str))

# Get the final df: one row per id / measure combination / continuous run
new_df = df.groupby(['id', 'comb_measure', 'cont']).agg(
    {'measure1': 'first', 'measure2': 'first', 'measure3': 'first',
     'measure4': 'first', 'measure5': 'first',
     'begin_date': 'first', 'end_date': 'last'})
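As a side note, the iloc[:-1] / np.roll pair can be written more directly with Series.shift, which performs the same one-row offset in a single, index-aligned step:

# Equivalent to the slice-and-roll combination above
df['end_date_prev'] = df['end_date'].shift(1) + timedelta(days=1)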
I have 2 DataFrames indexed by Time.
import datetime as dt
import pandas as pd

rng1 = pd.date_range("11:00:00", "11:00:30", freq="500ms")
df1 = pd.DataFrame({'A': range(1, 62), 'B': range(1000, 62000, 1000)}, index=rng1)
rng2 = pd.date_range("11:00:03", "11:01:03", freq="700ms")
df2 = pd.DataFrame({'Z': range(10, 870, 10)}, index=rng2)
I am trying to assign to 'C' in df1 the last value of 'Z' in df2 closest to each time index of df1. The following code seems to work now (though each cell gets a one-element array):
df1['C'] = None
for tidx, a, b, c in df1.itertuples():
    df1.loc[tidx, 'C'] = df2[:tidx].tail(1).Z.values
    # df1.loc[tidx, 'C'] = df2[:tidx].Z  --> was trying this, which didn't work
df1
Is it possible to avoid iterating?
TIL: pandas Index instances have a map method.
http://pandas.pydata.org/pandas-docs/stable/generated/pandas.Index.map.html
def fn(df):
    def inner(dt):
        # position of the timestamp in df nearest to dt
        return df['Z'].iloc[abs(df.index - dt).argmin()]
    return inner

df1['C'] = df1.index.map(fn(df2))
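On a reasonably modern pandas you can also skip the per-element lookups entirely with pd.merge_asof. direction='backward' picks the last df2 value at or before each df1 timestamp (the df2[:tidx].tail(1) semantics from the question), while direction='nearest' matches the argmin approach above:

# Both indexes are sorted (they come from date_range), as merge_asof requires
df1['C'] = pd.merge_asof(df1[['A', 'B']], df2,
                         left_index=True, right_index=True,
                         direction='backward')['Z'].to_numpy()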