Here's some code that will generate some random data and chart it, plus lines representing the 30th & 90th percentiles.
import pandas as pd
import numpy as np
from numpy.random import randint
import matplotlib.pyplot as plt
%matplotlib inline
np.random.seed(10) # added for reproducibility
rng = pd.date_range('10/9/2018 00:00', periods=10, freq='1H')
df = pd.DataFrame({'Random_Number':randint(1, 100, 10)}, index=rng)
df.plot()
plt.axhline(df.quantile(0.3).iloc[0], linestyle="--", color="g")
plt.axhline(df.quantile(0.90).iloc[0], linestyle="--", color="r")
plt.show()
Outputs: (minus the highlighted part of the chart)
I'm trying to figure out whether it's possible to calculate the time it takes in the data (highlighted yellow) to go from the green line to the red line.
I can enter the threshold values manually:
minStart = df.loc[df['Random_Number'] < 18].index[0]
maxStart = df.loc[df['Random_Number'] > 90].index[0]
hours = maxStart - minStart
hours
Which will output:
Timedelta('0 days 05:00:00')
But if I attempt to use:
minStart = df.loc[df['Random_Number'] < df.quantile(0.3)].index[0]
maxStart = df.loc[df['Random_Number'] > df.quantile(0.90)].index[0]
hours = maxStart - minStart
hours
This throws a ValueError: Can only compare identically-labeled Series objects.
Is there a better method to this madness? Ideally I'd like some sort of algorithm that calculates the delta time it takes to go from the 30th to the 90th percentile, and then the delta back from the 90th to the 30th. But I may have to put some thought into how that could be accomplished.
minStart = df.loc[df['Random_Number'] < df.quantile(0.3).iloc[0]].index[0]
maxStart = df.loc[df['Random_Number'] > df.quantile(0.90).iloc[0]].index[0]
hours = maxStart - minStart
hours
df.quantile doesn't return a number, it returns a Series, so you need to take its first entry.
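A minimal sketch of the round trip you describe, assuming the first crossing of each threshold is what matters. Note that calling .quantile on the column itself returns a scalar rather than a Series, which avoids the comparison problem entirely:
low = df['Random_Number'].quantile(0.3)   # scalar, not a Series
high = df['Random_Number'].quantile(0.9)
# first time the series is below the 30th percentile
t_low = df.index[df['Random_Number'] < low][0]
# first time after that it is above the 90th percentile
above = df.loc[t_low:, 'Random_Number'] > high
t_rise = above[above].index[0]
rise_time = t_rise - t_low  # Timedelta for the 30th -> 90th leg
# and back down: first time after t_rise it is below the 30th percentile again
below = df.loc[t_rise:, 'Random_Number'] < low
fall_time = below[below].index[0] - t_rise if below.any() else None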
I have some data of an owl being present in the nest box. In a previous question you helped me visualize when the owl is in the box:
In addition I created a plot of the hours per day spent in the box with the code below (probably this can be done more efficiently):
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# raw data indicating time spent in box (each row represents start and end time)
time = pd.DatetimeIndex(["2021-12-01 18:08","2021-12-01 18:11",
"2021-12-02 05:27","2021-12-02 05:29",
"2021-12-02 22:40","2021-12-02 22:43",
"2021-12-03 19:24","2021-12-03 19:27",
"2021-12-06 18:04","2021-12-06 18:06",
"2021-12-07 05:28","2021-12-07 05:30",
"2021-12-10 03:05","2021-12-10 03:10",
"2021-12-10 07:11","2021-12-10 07:13",
"2021-12-10 20:40","2021-12-10 20:41",
"2021-12-12 19:42","2021-12-12 19:45",
"2021-12-13 04:13","2021-12-13 04:17",
"2021-12-15 04:28","2021-12-15 04:30",
"2021-12-15 05:21","2021-12-15 05:25",
"2021-12-15 17:40","2021-12-15 17:44",
"2021-12-15 22:31","2021-12-15 22:37",
"2021-12-16 04:24","2021-12-16 04:28",
"2021-12-16 19:58","2021-12-16 20:09",
"2021-12-17 17:42","2021-12-17 18:04",
"2021-12-17 22:19","2021-12-17 22:26",
"2021-12-18 05:41","2021-12-18 05:44",
"2021-12-19 07:40","2021-12-19 16:55",
"2021-12-19 20:39","2021-12-19 20:52",
"2021-12-19 21:56","2021-12-19 23:17",
"2021-12-21 04:53","2021-12-21 04:59",
"2021-12-21 05:37","2021-12-21 05:39",
"2021-12-22 08:06","2021-12-22 17:22",
"2021-12-22 20:04","2021-12-22 21:24",
"2021-12-22 21:44","2021-12-22 22:47",
"2021-12-23 02:20","2021-12-23 06:17",
"2021-12-23 08:07","2021-12-23 16:54",
"2021-12-23 19:36","2021-12-23 23:59:59",
"2021-12-24 00:00","2021-12-24 00:28",
"2021-12-24 07:53","2021-12-24 17:00",
])
# create dataframe with column indicating presence (1) or absence (0)
time_df = pd.DataFrame(data={'present':[1,0]*int(len(time)/2)}, index=time)
# calculate interval length and add to time_df
time_df['interval'] = time_df.index.to_series().diff().dt.total_seconds() / 60  # minutes
# add column with day to time_df
time_df['day'] = time.day
#select only intervals where owl is present
timeinbox = time_df.iloc[1::2, :]
interval = timeinbox.interval
day = timeinbox.day
# sum multiple intervals per day
interval_tot = [interval.iloc[0]]
day_tot = [day.iloc[0]]
for i in range(1, len(day)):
    if day.iloc[i] == day.iloc[i-1]:
        interval_tot[-1] += interval.iloc[i]
    else:
        day_tot.append(day.iloc[i])
        interval_tot.append(interval.iloc[i])
# recalculate to hours
for i in range(len(interval_tot)):
    interval_tot[i] = interval_tot[i] / 60
plt.figure(figsize=(15, 5))
plt.grid(zorder=0)
plt.bar(day_tot, interval_tot, color='g', zorder=3)
plt.xlim([1,31])
plt.xlabel('day in December')
plt.ylabel('hours per day in nest box')
plt.xticks(np.arange(1,31,1))
plt.ylim([0, 24])
Now I would like to combine all the data in one plot by making a stacked bar chart, where each day is represented by a bar and each bar indicates, for each of the 24*60 minutes, whether the owl is present or not. Is this possible from the current data structure?
The data seems to have been entered manually, so I have changed its format to a start/end table. My approach: first build a frame with a 1-minute index covering every stay, flagged 1 for presence; then build a continuous 1-minute index from the first date to the last date + 1 day and reindex onto it, filling the gaps with 0. That is the data for the graph. For the plot, take each day's slice of the frame, build a color list (red for present, green for absent), and stack bars of height one. It may be worth grouping the data into hourly units to reduce the number of bars.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import timedelta
import io
data = '''
start_time,end_time
"2021-12-01 18:08","2021-12-01 18:11"
"2021-12-02 05:27","2021-12-02 05:29"
"2021-12-02 22:40","2021-12-02 22:43"
"2021-12-03 19:24","2021-12-03 19:27"
"2021-12-06 18:04","2021-12-06 18:06"
"2021-12-07 05:28","2021-12-07 05:30"
"2021-12-10 03:05","2021-12-10 03:10"
"2021-12-10 07:11","2021-12-10 07:13"
"2021-12-10 20:40","2021-12-10 20:41"
"2021-12-12 19:42","2021-12-12 19:45"
"2021-12-13 04:13","2021-12-13 04:17"
"2021-12-15 04:28","2021-12-15 04:30"
"2021-12-15 05:21","2021-12-15 05:25"
"2021-12-15 17:40","2021-12-15 17:44"
"2021-12-15 22:31","2021-12-15 22:37"
"2021-12-16 04:24","2021-12-16 04:28"
"2021-12-16 19:58","2021-12-16 20:09"
"2021-12-17 17:42","2021-12-17 18:04"
"2021-12-17 22:19","2021-12-17 22:26"
"2021-12-18 05:41","2021-12-18 05:44"
"2021-12-19 07:40","2021-12-19 16:55"
"2021-12-19 20:39","2021-12-19 20:52"
"2021-12-19 21:56","2021-12-19 23:17"
"2021-12-21 04:53","2021-12-21 04:59"
"2021-12-21 05:37","2021-12-21 05:39"
"2021-12-22 08:06","2021-12-22 17:22"
"2021-12-22 20:04","2021-12-22 21:24"
"2021-12-22 21:44","2021-12-22 22:47"
"2021-12-23 02:20","2021-12-23 06:17"
"2021-12-23 08:07","2021-12-23 16:54"
"2021-12-23 19:36","2021-12-24 00:00"
"2021-12-24 00:00","2021-12-24 00:28"
"2021-12-24 07:53","2021-12-24 17:00"
'''
df = pd.read_csv(io.StringIO(data), sep=',')
df['start_time'] = pd.to_datetime(df['start_time'])
df['end_time'] = pd.to_datetime(df['end_time'])
frames = []
for idx, row in df.iterrows():
    rng = pd.date_range(row['start_time'], row['end_time'] - timedelta(minutes=1), freq='1min')
    frames.append(pd.DataFrame({'present': [1] * len(rng)}, index=rng))
time_df = pd.concat(frames)  # DataFrame.append was removed in pandas 2.0
date_add = pd.date_range(time_df.index[0].date(), time_df.index[-1].date()+timedelta(days=1), freq='1min')
time_df = time_df.reindex(date_add, fill_value=0)
time_df['day'] = time_df.index.day
fig, ax = plt.subplots(figsize=(8,15))
ax.set_yticks(np.arange(0,1500,60))
ax.set_ylim(0,1440)
ax.set_xticks(np.arange(1,25,1))
days = time_df['day'].unique()
for d in days:
    day_df = time_df.query('day == @d')  # note: @d, not #d, references the local variable
    # red minute-bars where the owl is present, green where absent
    colors = ['r' if p == 1 else 'g' for p in day_df['present']]
    for i in range(len(day_df)):
        ax.bar(d, height=1, width=0.5, bottom=i+1, color=colors[i])
plt.show()
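As a sketch of the hourly-grouping idea mentioned above (my assumption being that minutes of presence per hour is an acceptable summary), resampling shrinks the 1440 one-minute bars per day to a 24-column grid, which imshow can draw in a single call instead of one bar per minute:
# minutes of presence per hour (reuses time_df built above)
hourly = time_df['present'].resample('1h').sum()
# rows = day of month, columns = hour of day
grid = hourly.groupby([hourly.index.day, hourly.index.hour]).sum().unstack()
fig, ax = plt.subplots(figsize=(10, 6))
im = ax.imshow(grid, aspect='auto', origin='lower', cmap='Greens',
               extent=[0, 24, grid.index.min(), grid.index.max()])
ax.set_xlabel('hour of day')
ax.set_ylabel('day in December')
fig.colorbar(im, label='minutes present per hour')
plt.show()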
I want to compare the 50 day moving average and 50 day weighted moving average of a company.
import yfinance as yf
import datetime as dt
start = '2021-05-01' # format: YYYY-MM-DD
end = dt.datetime.now() # today
stock='AMD'
df = yf.download(stock, start, end, interval='1h')
This is just to set up the data frame.
The code below adds a column to the data frame with the moving average, but I have been unsuccessful trying to do the same for a weighted moving average.
df['50MA']= df.iloc[:, 4].rolling(window=50).mean()
This is what I have, which is incorrect:
for i in range(len(df.index)):
    df['W50MA'] = (df.iloc[i, 4]) * (df.iloc[i, 5] / sum(df.iloc[:, 5]))
You could try something like this:
import numpy as np

weights = np.arange(1, 51) / 100
sum_weights = np.sum(weights)

def weighted_ma(value):
    # linear weights: the most recent of the 50 window values counts most
    return np.sum(weights * value) / sum_weights

df['50WMA'] = df.iloc[:, 4].rolling(window=50).apply(weighted_ma)
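Since .rolling().apply() calls a Python function once per window, it can be slow on hourly data. A vectorized sketch of the same linear weighting (weighted_ma_vectorized is just an illustrative name; it assumes the 1..50 weights above and produces the same values):
from numpy.lib.stride_tricks import sliding_window_view

def weighted_ma_vectorized(series, window=50):
    weights = np.arange(1, window + 1, dtype=float)
    weights /= weights.sum()                              # normalize once
    values = series.to_numpy(dtype=float)
    wma = sliding_window_view(values, window) @ weights   # one dot product per window
    # pad the first window-1 positions with NaN to align with .rolling()
    return np.concatenate([np.full(window - 1, np.nan), wma])

df['50WMA_fast'] = weighted_ma_vectorized(df.iloc[:, 4])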
Wondering if there is a fast way of getting the biggest rise in a time series within a window.
Intended code is...
import datetime
import numpy as np
import pandas as pd
base = datetime.datetime.today()
date_list = [base - datetime.timedelta(days=x) for x in range(0, 365)]
data = np.random.randint(low=1, high=10, size=len(date_list))
df = pd.DataFrame({'date': date_list, 'value': data})
def biggest_rise(df, windowsize=10):
    '''gets the biggest rise within a window size specified'''
    # Some magic code here
    return df.rolling_max(window=10, ...)
I don't really get what you mean by 'biggest rise', but using rolling may be helpful. For example, with this code you can get the difference between the maximum and minimum values within a 10-day window:
df.sort_values(['date']).set_index('date').rolling('10d').max() \
    - df.sort_values(['date']).set_index('date').rolling('10d').min()
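If you want that rolling range as a column alongside the original data, a small sketch:
s = df.sort_values('date').set_index('date')['value']
df_range = (s.rolling('10d').max() - s.rolling('10d').min()).rename('range_10d')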
I think I found the answer... as per code below. Upped the high to 10K to really see the changes:
import datetime
import numpy as np
import pandas as pd
base = datetime.datetime.today()
date_list = [base - datetime.timedelta(days=x) for x in range(0, 365)]
data = np.random.randint(low=1, high=10000, size=len(date_list))
df = pd.DataFrame({'date': date_list, 'value': data})
window = 10
dfs = [df.iloc[i: i+window] for i in range(0, len(df)) if i+window <= len(df)]  # <= keeps the final full window
biggest_rise = max([d.value.max()-d.value.min() for d in dfs])
Takes 112 ms for 365 datapoints. Anything better is welcome.
The biggest_rise could be the biggest_fall in the window. Don't know how to differentiate.
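One way to differentiate, as a sketch: only count a gain where the low comes before the high, which a running minimum inside each window gives you (biggest_rise_only is an illustrative name, and the data is sorted by date first since the frame above is built newest-first):
def biggest_rise_only(values, window=10):
    '''largest upward move within any window where the low precedes the high'''
    best = 0.0
    for start in range(len(values) - window + 1):
        seg = values[start:start + window]
        low = seg[0]
        for v in seg[1:]:
            low = min(low, v)          # lowest point seen so far in this window
            best = max(best, v - low)  # rise from that low up to the current point
    return best

print(biggest_rise_only(df.sort_values('date')['value'].to_numpy()))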
Here is a better answer to get the maximum rise, using @TywinLannister88's suggestion:
import datetime
import numpy as np
import pandas as pd
base = datetime.datetime.today()
date_list = [base - datetime.timedelta(days=x) for x in range(0, 365)]
data = np.random.randint(low=1, high=10000, size=len(date_list))
df = pd.DataFrame({'date': date_list, 'value': data})
# 10-day rolling window
df1 = df.sort_values(['date']).set_index('date').rolling('10d').max() - \
      df.sort_values(['date']).set_index('date').rolling('10d').min()
# percent change to see if there is a rise or fall
df2 = df.sort_values(['date']).set_index('date').value.pct_change(periods=10)
# filter out the rises (pctchange > 0) and find the maximum rise
df3 = df.sort_values(['date']).set_index('date').assign(delta=df1, pctchange=df2)
biggest_rise = df3[df3.pctchange>0].pctchange.max()
I have a pandas data frame with two columns: one is temperature, the other is time.
I would like to make third and fourth columns, called min and max, filled with NaNs except where there is a local min or max; there they would hold the value of that extremum.
Here is a sample of what the data looks like, essentially I am trying to identify all the peaks and low points in the figure.
Are there any built in tools with pandas that can accomplish this?
The solution offered by fuglede is great, but if your data is very noisy (like the one in the picture) you will end up with lots of misleading local extrema. I suggest you use the scipy.signal.argrelextrema() method. It has its own limitations, but it has a useful feature where you can specify the number of points to be compared, kind of like a noise-filtering algorithm. For example:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from scipy.signal import argrelextrema
# Generate a noisy AR(1) sample
np.random.seed(0)
rs = np.random.randn(200)
xs = [0]
for r in rs:
    xs.append(xs[-1] * 0.9 + r)
df = pd.DataFrame(xs, columns=['data'])
n = 5 # number of points to be checked before and after
# Find local peaks
df['min'] = df.iloc[argrelextrema(df.data.values, np.less_equal, order=n)[0]]['data']
df['max'] = df.iloc[argrelextrema(df.data.values, np.greater_equal, order=n)[0]]['data']
# Plot results
plt.scatter(df.index, df['min'], c='r')
plt.scatter(df.index, df['max'], c='g')
plt.plot(df.index, df['data'])
plt.show()
Some points:
you might need to check the points afterwards to ensure there are no twin points very close to each other
you can play with n to filter the noisy points (see also the find_peaks sketch below)
argrelextrema returns a tuple and the [0] at the end extracts a numpy array
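In the same noise-filtering spirit, scipy.signal.find_peaks may also be worth a look: its prominence argument filters out shallow wiggles directly. A sketch reusing the df above (the prominence value 1.0 is an arbitrary threshold to tune):
from scipy.signal import find_peaks

# keep only peaks that stand out from their surroundings by at least 1.0
peaks, _ = find_peaks(df['data'].values, prominence=1.0)
troughs, _ = find_peaks(-df['data'].values, prominence=1.0)  # minima via negation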
Assuming that the column of interest is labelled data, one solution would be
df['min'] = df.data[(df.data.shift(1) > df.data) & (df.data.shift(-1) > df.data)]
df['max'] = df.data[(df.data.shift(1) < df.data) & (df.data.shift(-1) < df.data)]
For example:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# Generate a noisy AR(1) sample
np.random.seed(0)
rs = np.random.randn(200)
xs = [0]
for r in rs:
    xs.append(xs[-1] * 0.9 + r)
df = pd.DataFrame(xs, columns=['data'])
# Find local peaks
df['min'] = df.data[(df.data.shift(1) > df.data) & (df.data.shift(-1) > df.data)]
df['max'] = df.data[(df.data.shift(1) < df.data) & (df.data.shift(-1) < df.data)]
# Plot results
plt.scatter(df.index, df['min'], c='r')
plt.scatter(df.index, df['max'], c='g')
df.data.plot()
Using NumPy:
ser = np.random.randint(-40, 40, 100)  # 100 points
# every index where the next value is lower (a superset of the peaks)
peak = np.where(np.diff(ser) < 0)[0]
or, more precisely:
# a peak is where the first difference changes sign from + to -
double_difference = np.diff(np.sign(np.diff(ser)))
peak = np.where(double_difference == -2)[0] + 1  # +1 realigns indices after the double diff
Using pandas:
ser = pd.Series(np.random.randint(2, 5, 100))
peak_df = ser[(ser.shift(1) < ser) & (ser.shift(-1) < ser)]
peak = peak_df.index
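And the symmetric version for troughs, for completeness:
trough_df = ser[(ser.shift(1) > ser) & (ser.shift(-1) > ser)]
trough = trough_df.index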
You can do something similar to Foad's .argrelextrema() solution, but with the Pandas .rolling() function:
# Find local peaks
n = 5 #rolling period
local_min_vals = df.loc[df['data'] == df['data'].rolling(n, center=True).min()]
local_max_vals = df.loc[df['data'] == df['data'].rolling(n, center=True).max()]
plt.scatter(local_min_vals.index, local_min_vals, c='r')
plt.scatter(local_max_vals.index, local_max_vals, c='g')
Fitting a linear trend to a set of data is straightforward. But how can I fit multiple trend lines to one time series? I define up and down trends as prices above or below an exponential moving average: when the price is above the EMA I need to fit a positive trend, and when the trend turns negative, a new negative trend line, and so forth. In my code below, market_data['Signal'] in my pandas dataframe tells me if the trend is up (+1) or down (-1).
I'm guessing I need some kind of a loop, but I cannot work out the logic...
import pandas as pd
import pandas_datareader.data as web
import datetime as dt
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import matplotlib.dates as mdates
# Collecting data
market = '^DJI'
end = dt.datetime(2016, 12, 31)
start = dt.date(end.year-10, end.month, end.day)
market_data = web.DataReader(market, 'yahoo', start, end)
#Calculating EMA and difference
market_data['ema'] = market_data['Close'].ewm(200).mean()
market_data['diff_pc'] = (market_data['Close'] / market_data['ema']) - 1
#Defining bull/bear signal
TH = 0
market_data['Signal'] = np.where(market_data['diff_pc'] > TH, 1, 0)
market_data['Signal'] = np.where(market_data['diff_pc'] < -TH, -1, market_data['Signal'])
To fit the trend lines I want to use numpy polyfit:
x = np.array(mdates.date2num(market_data.index.to_pydatetime()))
fit = np.polyfit(x, market_data['Close'], 1)
Ideally I would like to only plot the trends where the signal last more than n periods.
The result should look something like this:
Here is a solution. min_signal is the number of consecutive signals needed to change the trend. I imported Seaborn to get a better-looking plot, but it works all the same without that line:
import pandas as pd
import pandas_datareader.data as web
import datetime as dt
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import matplotlib.dates as mdates
# Collecting data
market = '^DJI'
end = dt.datetime(2016, 12, 31)
start = dt.date(end.year-10, end.month, end.day)
market_data = web.DataReader(market, 'yahoo', start, end)
#Calculating EMA and difference
market_data['ema'] = market_data['Close'].ewm(200).mean()
market_data['diff_pc'] = (market_data['Close'] / market_data['ema']) - 1
#Defining bull/bear signal
TH = 0
market_data['Signal'] = np.where(market_data['diff_pc'] > TH, 1, 0)
market_data['Signal'] = np.where(market_data['diff_pc'] < -TH, -1, market_data['Signal'])
# Plot data and fits
import seaborn as sns # This is just to get nicer plots
signal = market_data['Signal']
# How many consecutive signals are needed to change trend
min_signal = 2
# Find segments bounds
bounds = (np.diff(signal) != 0) & (signal.iloc[1:] != 0).to_numpy()
bounds = np.concatenate(([signal.iloc[0] != 0], bounds))
bounds_idx = np.where(bounds)[0]
# Keep only significant bounds
relevant_bounds_idx = np.array([idx for idx in bounds_idx if np.all(signal.iloc[idx] == signal.iloc[idx:idx + min_signal])])
# Make sure start and end are included
if relevant_bounds_idx[0] != 0:
    relevant_bounds_idx = np.concatenate(([0], relevant_bounds_idx))
if relevant_bounds_idx[-1] != len(signal) - 1:
    relevant_bounds_idx = np.concatenate((relevant_bounds_idx, [len(signal) - 1]))
# Iterate segments
for start_idx, end_idx in zip(relevant_bounds_idx[:-1], relevant_bounds_idx[1:]):
    # Slice segment
    segment = market_data.iloc[start_idx:end_idx + 1, :]
    x = np.array(mdates.date2num(segment.index.to_pydatetime()))
    # Plot data
    data_color = 'green' if signal.iloc[start_idx] > 0 else 'red'
    plt.plot(segment.index, segment['Close'], color=data_color)
    # Plot fit
    coef, intercept = np.polyfit(x, segment['Close'], 1)
    fit_val = coef * x + intercept
    fit_color = 'yellow' if coef > 0 else 'blue'
    plt.plot(segment.index, fit_val, color=fit_color)
This is the result: