I have a pandas DataFrame from which I extract local minima and maxima values. It works well so far, but the problem is that the values end up in two separate lists, and I want a single list of price values in chronological (Date) order. How can I merge them by Date?
import pandas as pd
import numpy as np
import yfinance
from scipy.signal import argrelextrema
import matplotlib.dates as mpl_dates
def extract_data():
    ticker = 'GBPJPY=X'
    ticker = yfinance.Ticker(ticker)
    start_date = '2022-09-25'
    end_date = '2022-10-08'
    df = ticker.history(interval='1h', start=start_date, end=end_date)
    df['Date'] = pd.to_datetime(df.index)
    df['Date'] = df['Date'].apply(mpl_dates.date2num)
    df = df.loc[:, ['Date', 'Open', 'High', 'Low', 'Close']]
    # Call function to find min/max extrema
    find_extrema(df)

def find_extrema(df):
    n = 10  # number of points to be checked before and after
    # Find local minima and maxima
    df['min'] = df.iloc[argrelextrema(df.Close.values, np.less_equal,
                                      order=n)[0]]['Close']
    df['max'] = df.iloc[argrelextrema(df.Close.values, np.greater_equal,
                                      order=n)[0]]['Close']
    min_values_list = []
    max_values_list = []
    # Add min values to the list, skipping rows that are not local minima (NaN)
    for item in df['min']:
        if not np.isnan(item):
            min_values_list.append(item)
    # Add max values to the list, skipping rows that are not local maxima (NaN)
    for item in df['max']:
        if not np.isnan(item):
            max_values_list.append(item)
    print(f"Min: {min_values_list}")
    print(f"Max: {max_values_list}")

extract_data()
Option 1
First, use df.to_numpy to convert the min and max columns to a np.array.
Then get rid of all the NaN values by selecting from the array with a boolean mask: np.logical_not applied to np.isnan(arr).
arr = df[['min','max']].to_numpy()
value_list = arr[np.logical_not(np.isnan(arr))].tolist()
print(value_list)
[159.7030029296875,
154.8979949951172,
160.7830047607422,
165.43800354003906,
149.55799865722656,
162.80499267578125,
156.6529998779297,
164.31900024414062,
156.125,
153.13499450683594,
161.3520050048828,
156.9340057373047,
162.52200317382812,
155.7740020751953,
160.98500061035156,
161.83700561523438]
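Note that this comes out in chronological order because boolean indexing flattens a 2D array in C (row-major) order: both cells of an earlier row, i.e. an earlier date, are read before any cell of a later row. A tiny sketch of that behavior:

import numpy as np

a = np.array([[1.0, np.nan],
              [np.nan, 2.0],
              [3.0, np.nan]])
# Boolean masking traverses row by row, so earlier rows come out first
print(a[np.logical_not(np.isnan(a))].tolist())  # [1.0, 2.0, 3.0]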
Option 2
Rather more cumbersome:
n = 10
# get the indices for `min` and `max` in two arrays
_min = argrelextrema(df.Close.values, np.less_equal, order=n)[0]
_max = argrelextrema(df.Close.values, np.greater_equal, order=n)[0]
# create columns (assuming you need this for other purposes as well)
df['min'] = df.iloc[_min]['Close']
df['max'] = df.iloc[_max]['Close']
# create lists for `min` and `max`
min_values_list = df['min'].dropna().tolist()
max_values_list = df['max'].dropna().tolist()
# join the lists
value_list2 = min_values_list + max_values_list
value_idxs = _min.tolist() + _max.tolist()
# finally, sort `value_list2` based on `value_idxs`
value_list2 = [x for _, x in sorted(zip(value_idxs, value_list2))]
# check if result is the same:
value_list2 == value_list
# True
Assuming that you have max and min columns, what about something like this?
df['max_or_min'] = np.where(df['max'].notna(), df['max'], df['min'])
min_max_values = df['max_or_min'].dropna().values.tolist()
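Equivalently, Series.combine_first expresses the same NaN fallback without np.where:

min_max_values = df['max'].combine_first(df['min']).dropna().tolist()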
I'm new to Python and created the following function for calculating the Relative Strength Index (an indicator for stock markets). I have a pandas Series containing averages and want to modify it using values from another Series, where each step needs the previously updated value from the Series itself.
Right now I use a for loop that iterates through all the values in the Series with access to the index, which does the job. But I was wondering whether there is a way to get the same result with an apply method or something similar that would make the code cleaner.
import pandas as pd
from datetime import date
import yfinance as yf
def yahoo_rsi(data: pd.DataFrame, period: int = 14, column: str = "Close"):
    delta = data[column].diff()
    delta = delta[1:]
    up = delta.clip(lower=0)
    down = delta.clip(upper=0).abs()
    avg_ups: pd.Series = up.ewm(span=period).mean()
    avg_downs: pd.Series = down.ewm(span=period).mean()
    alpha: float = 1 / period
    # This is the for loop I would like to make cleaner
    for x in range(period + 1, len(avg_ups)):
        avg_ups[x] = up[x] * alpha + (1 - alpha) * avg_ups[x - 1]
        avg_downs[x] = down[x] * alpha + (1 - alpha) * avg_downs[x - 1]
    rsis: pd.Series = pd.Series(100 - (100 / (1 + avg_ups / avg_downs)), avg_ups.index)
    return rsis

def run():
    ticker = yf.Ticker("AAPL")
    df = ticker.history("1d", "1d", date(2022, 2, 2), date(2022, 8, 2),
                        prepost=True, auto_adjust=False, back_adjust=False)
    yahoo_rsi(df)

run()
I have tried the following, but it did not give the desired result. I believe that is because it uses the old values of the series instead of the updated ones, which the recursion needs.
avg_ups[period+1:] = up[period+1:]*alpha + (1-alpha)*avg_ups[period+1:].shift(1)
avg_downs[period+1:] = down[period+1:]*alpha + (1-alpha)*avg_downs[period+1:].shift(1)
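For what it's worth, the update inside the loop is exactly the exponentially weighted recurrence y[t] = alpha*x[t] + (1-alpha)*y[t-1], which pandas computes natively with ewm(alpha=..., adjust=False). A minimal sketch of that approach (note the seeding differs: ewm starts the recursion at the first observation rather than at index period+1 as the loop does, so roughly the first period values will not match the loop exactly):

def rsi_ewm(data: pd.DataFrame, period: int = 14, column: str = "Close") -> pd.Series:
    delta = data[column].diff()
    up = delta.clip(lower=0)
    down = (-delta).clip(lower=0)
    # adjust=False gives the recursive form y[t] = alpha*x[t] + (1-alpha)*y[t-1]
    avg_ups = up.ewm(alpha=1 / period, adjust=False).mean()
    avg_downs = down.ewm(alpha=1 / period, adjust=False).mean()
    return 100 - (100 / (1 + avg_ups / avg_downs))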
I am attempting to roll up rows from a data set with matching measures into a consolidated row. There are two conditions that must be met for the roll-up:
1. The measures (measure1 through measure5) should remain the same across the rows for them to be rolled up into a single row.
2. The dates should be continuous (no gaps between one row's end_date and the next row's begin_date).
If these conditions are not met, the code should generate a separate row.
This is the sample data that I am using:
id,measure1,measure2,measure3,measure4,measure5,begin_date,end_date
ABC123XYZ789,1,1,1,1,1,1/1/2019,3/31/2019
ABC123XYZ789,1,1,1,1,1,4/23/2019,6/30/2019
ABC123XYZ789,1,1,1,1,1,7/1/2019,9/30/2019
ABC123XYZ789,1,1,1,1,1,10/12/2019,12/31/2019
FGH589J6U88SW,1,1,1,1,1,1/1/2019,3/31/2019
FGH589J6U88SW,1,1,1,1,1,4/1/2019,6/30/2019
FGH589J6U88SW,1,1,1,2,1,7/1/2019,9/30/2019
FGH589J6U88SW,1,1,1,2,1,10/1/2019,12/31/2019
253DRWQ85AT2F334B,1,2,1,3,1,1/1/2019,3/31/2019
253DRWQ85AT2F334B,1,2,1,3,1,4/1/2019,6/30/2019
253DRWQ85AT2F334B,1,2,1,3,1,7/1/2019,9/30/2019
253DRWQ85AT2F334B,1,2,1,3,1,10/1/2019,12/31/2019
The expected result should be:
id,measure1,measure2,measure3,measure4,measure5,begin_date,end_date
ABC123XYZ789,1,1,1,1,1,1/1/2019,3/31/2019
ABC123XYZ789,1,1,1,1,1,4/23/2019,9/30/2019
ABC123XYZ789,1,1,1,1,1,10/12/2019,12/31/2019
FGH589J6U88SW,1,1,1,1,1,1/1/2019,6/30/2019
FGH589J6U88SW,1,1,1,2,1,7/1/2019,12/31/2019
253DRWQ85AT2F334B,1,2,1,3,1,1/1/2019,12/31/2019
I have implemented the code below, which seems to address condition #1, but I am looking for ideas on how to incorporate condition #2 into the solution.
import pandas as pd
import time

startTime = time.time()

data = pd.read_csv('C:\\Users\\usertemp\\Data\\Rollup2.csv')
data['end_date'] = pd.to_datetime(data['end_date'])
data['begin_date'] = pd.to_datetime(data['begin_date'])

data = data.groupby(['id', 'measure1', 'measure2', 'measure3', 'measure4', 'measure5'])[
    ['begin_date', 'end_date']].agg({'begin_date': 'min', 'end_date': 'max'}).reset_index()

print(data)
print("It took %s seconds for the collapse process" % (time.time() - startTime))
Any help is appreciated.
You can do the following.
import pandas as pd
import numpy as np
from datetime import timedelta

# Convert begin_date and end_date to datetime
df['begin_date'] = pd.to_datetime(df['begin_date'], format='%m/%d/%Y')
df['end_date'] = pd.to_datetime(df['end_date'], format='%m/%d/%Y')

# Create a new column containing the previous row's end_date + 1 day
df['end_date_prev'] = df['end_date'].iloc[:-1] + timedelta(days=1)
df['end_date_prev'] = np.roll(df['end_date_prev'], 1)

# Build a counter that increments whenever begin_date does not continue
# directly from the previous row's end_date; continuous runs share a value
df['cont'] = (~(df['begin_date'] == df['end_date_prev'])).astype(int).cumsum()

# Since all measures need to match, combine them into a single string column
df['comb_measure'] = df['measure1'].astype(str).str.cat(
    df[['measure{}'.format(i) for i in range(2, 6)]].astype(str))

# Get the final df: one row per id / measure combination / continuous run
new_df = df.groupby(['id', 'comb_measure', 'cont']).agg(
    {'measure1': 'first', 'measure2': 'first', 'measure3': 'first',
     'measure4': 'first', 'measure5': 'first',
     'begin_date': 'first', 'end_date': 'last'})
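As a side note, the iloc[:-1] / np.roll pair can be written more directly with Series.shift, which performs the same one-row offset in a single, index-aligned step:

# Equivalent to the slice-and-roll combination above
df['end_date_prev'] = df['end_date'].shift(1) + timedelta(days=1)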
I have 2 DataFrames indexed by Time.
import datetime as dt
import pandas as pd

rng1 = pd.date_range("11:00:00", "11:00:30", freq="500ms")
df1 = pd.DataFrame({'A': range(1, 62), 'B': range(1000, 62000, 1000)}, index=rng1)
rng2 = pd.date_range("11:00:03", "11:01:03", freq="700ms")
df2 = pd.DataFrame({'Z': range(10, 870, 10)}, index=rng2)
I am trying to assign to 'C' in df1 the last value of 'Z' in df2 closest to each time index of df1. The following code seems to work now (though each cell gets a one-element array):
df1['C'] = None
for tidx, a, b, c in df1.itertuples():
    df1.loc[tidx, 'C'] = df2[:tidx].tail(1).Z.values
    # df1.loc[tidx, 'C'] = df2[:tidx].Z  --> was trying this, which didn't work
df1
Is it possible to avoid iterating?
TIL: pandas Index instances have a map method.
http://pandas.pydata.org/pandas-docs/stable/generated/pandas.Index.map.html
def fn(df):
    def inner(dt):
        # position of the timestamp in df nearest to dt
        return df['Z'].iloc[abs(df.index - dt).argmin()]
    return inner

df1['C'] = df1.index.map(fn(df2))
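On a reasonably modern pandas you can also skip the per-element lookups entirely with pd.merge_asof. direction='backward' picks the last df2 value at or before each df1 timestamp (the df2[:tidx].tail(1) semantics from the question), while direction='nearest' matches the argmin approach above:

# Both indexes are sorted (they come from date_range), as merge_asof requires
df1['C'] = pd.merge_asof(df1[['A', 'B']], df2,
                         left_index=True, right_index=True,
                         direction='backward')['Z'].to_numpy()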