How to make a Pandas Timestamp object subscriptable? - python

Getting an error Timestamp object is not subscriptable. I understand what it means, but can't figure out how to solve the code.
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from mplfinance.original_flavor import candlestick_ohlc
API_URL = 'https://api.coingecko.com/api/v3'
r = requests.get(API_URL + '/coins/bitcoin/market_chart?vs_currency=usd&days=3&interval=hourly')
d = r.json()
df = pd.DataFrame(d['prices'], columns = ['dateTime', 'price'])
df['date'] = pd.to_datetime(df['dateTime'], unit='ms')
ohlc = df.set_index('date')['price'].resample('1h').ohlc()
stock_data = ohlc
class CandlesIndexes():
    """Hold OHLC data and try to draw it as a candlestick chart."""

    def __init__(self, stock_data):
        """Store *stock_data* plus shortcuts to its high/low/close columns and index."""
        self.high = stock_data.high
        self.low = stock_data.low
        self.close = stock_data.close
        self.stock_data = stock_data
        self.date = stock_data.index

    def display_candle_bars(self):
        """Create a figure and pass the stored dates to candlestick_ohlc."""
        fig, self.ax = plt.subplots()
        #Describe candle properties
        # NOTE: only the index (self.date) is handed over as the quotes argument,
        # so every element is a single Timestamp. This is the call that raises
        # "TypeError: 'Timestamp' object is not subscriptable".
        candle_ohlc = candlestick_ohlc(self.ax, self.date, width= 0.6, colorup= 'green', colordown= 'red', alpha= 0.8)
The error happens at the very last candle_ohlc = candlestick_ohlc() line.
Please help a brotha out. Thanks, much appreciated.
EDIT: full error:
File "/Users/teo/.local/share/virtualenvs/trend-nriNAUCq/lib/python3.8/site-packages/mplfinance/original_flavor.py", line 234, in candlestick_ohlc
return _candlestick(ax, quotes, width=width, colorup=colorup,
File "/Users/teo/.local/share/virtualenvs/trend-nriNAUCq/lib/python3.8/site-packages/mplfinance/original_flavor.py", line 283, in _candlestick
t, open, high, low, close = q[:5]
TypeError: 'Timestamp' object is not subscriptable

There are a few issues with your code as is.
First, it is generally a bad idea to name a variable with the same name as a method/function (ohlc).
Second, it appears you are pulling data from the server in hourly intervals, and then trying to resample the data for ohlc() in the same hourly intervals - this will give you equal values for open, high, close and low. You need to either increase your frequency for the data pulled from the API, or decrease the frequency for the resampling.
Third, you are using candlestick_ohlc() incorrectly. You need to first create a column (not an index) with the datetimes (in a format usable by candlestick_ohlc()), and then pass the values only of that data frame to the function.
This sample works as described, but can still be improved upon:
import matplotlib.dates as dates
import matplotlib.pyplot as plt
import pandas as pd
import requests
from mplfinance.original_flavor import candlestick_ohlc
API_URL = 'https://api.coingecko.com/api/v3'
r = requests.get(API_URL + '/coins/bitcoin/market_chart?vs_currency=usd&days=3&interval=hourly')
d = r.json()
df = pd.DataFrame(d['prices'], columns=['dateTime', 'price'])
df['dateTime'] = pd.to_datetime(df['dateTime'], unit='ms')
ohla = df.set_index('dateTime')['price'].resample('1d').ohlc()
stock_data = ohla.reset_index()
stock_data["dateTime"] = stock_data["dateTime"].apply(dates.date2num)
class CandlesIndexes():
    """Hold an OHLC frame and draw it as a candlestick chart."""

    def __init__(self, stock_data):
        """Store *stock_data* plus shortcuts to its high/low/close columns and index."""
        self.high = stock_data.high
        self.low = stock_data.low
        self.close = stock_data.close
        self.stock_data = stock_data
        self.date = stock_data.index

    def display_candle_bars(self):
        """Plot every row of the stored frame as one candle and show the figure."""
        fig, self.ax = plt.subplots()
        # Pass the raw row values (not the index): candlestick_ohlc unpacks each
        # row as (time, open, high, low, close).
        candlestick_ohlc(self.ax, self.stock_data.values, width= 0.6, colorup= 'green', colordown= 'red', alpha= 0.8)
        plt.show()
x = CandlesIndexes(stock_data)
x.display_candle_bars()

#Teo
The plotting portion of your code is far more complicated than it needs to be. Since you already have your data in a DataFrame with a Timestamp index, I recommend you use the new mplfinance module (instead of the old one that you are using). It does a lot of the work for you automatically. Thus, your code can be a lot simpler:
import requests
import pandas as pd
import mplfinance as mpf
API_URL = 'https://api.coingecko.com/api/v3'
r = requests.get(API_URL + '/coins/bitcoin/market_chart?vs_currency=usd&days=3&interval=hourly')
d = r.json()
df = pd.DataFrame(d['prices'], columns = ['dateTime', 'price'])
df['date'] = pd.to_datetime(df['dateTime'], unit='ms')
ohlcdf = df.set_index('date')['price'].resample('4h').ohlc()
mpf.plot(ohlcdf,type='candle',style='yahoo')
Result:
Notice that I changed your resample from '1h' to '4h'. This is because your data is already approximately hourly. So if you resample at 1h then your Open, High, Low, and Close will be all equal. You can easily see the difference between various resample frequencies by changing the last two lines of the above code to:
# Draw one chart per resampling frequency so the candles can be compared.
for freq in ('1h','2h','3h','4h'):
    ohlcdf = df.set_index('date')['price'].resample(freq).ohlc()
    mpf.plot(ohlcdf,type='candle',style='yahoo')
Notice that for '1h' you get flat lines instead of candles because o,h,l,c are all approximately the same value.
HTH.

Related

Plot the graph with moving x-axis that contains datetime values changing during real time plot

I would like to plot the graph for a 30-minute interval. I have data at a 1-minute interval, so plotting it directly results in contraction of the x-axis. I have also reviewed a similar question about datetime values, but there the date interval was constant; in my case the dates change every time, as it is a real-time stock market graph.
Following is the small part of my code
import yfinance as yf
import pandas as pd
import datetime as dt
import matplotlib.pyplot as plt
import matplotlib.animation
class real_time_graph:
    """Live line chart of today's one-minute closing prices for one ticker."""

    def __init__(self):
        """Download today's one-minute bars and keep only the 'Close' column."""
        self.stock_ticker = "RELIANCE.BO"
        # "Now" expressed as UTC shifted by +5:30.
        today=dt.datetime.utcnow() + dt.timedelta(hours=5, minutes=30)
        self.todays_data=yf.download(self.stock_ticker,start=today.strftime("%Y-%m-%d"),interval="1m")
        self.todays_data=pd.DataFrame(self.todays_data)
        # Remove the columns that are not used below.
        del self.todays_data['Open']
        del self.todays_data['High']
        del self.todays_data['Low']
        del self.todays_data['Adj Close']
        del self.todays_data['Volume']
        self.todays_data.reset_index(inplace=True)
        # Round-trip the timestamps through "%Y-%m-%d %H:%M:%S" text, so anything
        # that format cannot express is dropped before the column becomes the index.
        self.todays_data['Datetime'] = self.todays_data['Datetime'].apply(lambda x: x.strftime("%Y-%m-%d %H:%M:%S")).astype(str)
        self.todays_data['Datetime'] = pd.DataFrame(map(lambda x: dt.datetime.strptime(x,"%Y-%m-%d %H:%M:%S"),self.todays_data['Datetime']))
        self.todays_data['Datetime']=pd.to_datetime(self.todays_data['Datetime'],format="%Y-%m-%d %H:%M:%S")
        self.todays_data=self.todays_data.set_index('Datetime')

    def current_price(self):
        """Return ``{'Datetime': [now], 'Close': [price]}`` for the ticker."""
        current_info=yf.Ticker(self.stock_ticker).info
        today=dt.datetime.utcnow() + dt.timedelta(hours=5, minutes=30)
        cp_dict={'Datetime':[],'Close': []}
        # Formatting and re-parsing truncates the timestamp to whole seconds.
        cp_dict['Datetime'].append(dt.datetime.strptime(today.strftime("%Y-%m-%d %H:%M:%S"),"%Y-%m-%d %H:%M:%S"))
        cp_dict['Close'].append(round(current_info.get('currentPrice'),2))
        return cp_dict

    def animate_real_tm_g(self,i):
        """Animation callback: append the current price and redraw the line."""
        CP_dict=self.current_price()
        current_df=pd.DataFrame.from_dict(CP_dict)
        current_df['Datetime']=pd.to_datetime(current_df['Datetime'],format="%Y-%m-%d %H:%M:%S")
        current_df=current_df.set_index('Datetime')
        self.todays_data = pd.concat([self.todays_data,current_df])
        plt.clf()
        plt.plot(self.todays_data['Close'], label = 'RELIANCE.BO', linewidth = 2, alpha = 0.3)

    def plot_graph(self):
        """Open a figure, refresh it every 1000 ms, and close it once the window is dismissed."""
        fig1 = plt.figure()
        # Keep a reference to the animation while the figure is shown; matplotlib's
        # documentation warns that an unreferenced animation may be garbage-collected.
        g_anim=matplotlib.animation.FuncAnimation(fig1, self.animate_real_tm_g,interval = 1000)
        plt.show()
        plt.close(fig1)
if __name__=='__main__':
    # Run the live chart, then save everything collected to a CSV named after today's date.
    obj1 = real_time_graph()
    today = dt.datetime.utcnow() + dt.timedelta(hours=5, minutes=30)
    obj1.plot_graph()
    obj1.todays_data.to_csv(f'{today.strftime("%d_%m_%Y")}Data{"RELIANCE.BO"}.csv',index=True)
I have written the above code for the Indian stock of the Reliance company, so it will run only on stock market working days, based on Indian time.
For a reference I have shared the csv file too
01_06_2022DataRELIANCE.BO.csv
Thank you in advance for your solutions

animation.FuncAnimation mplfinance (candlestick and line together)? (python)

I created a reproducible example of random data for candlestick chart ohlc that is working correctly.
Now I need, in the same plot, to plot a random line (in the real application it will be a function of the ohlc data, not a moving average), so I created a random variable y0 that I "concat" on every loop through the ani = animation.FuncAnimation() function.
The objects that mpf "will use" to plot are df and y0_arr and they have the "same format" (they are pd.DataFrame, have DateIndex as index, same dates, same dtypes=float).
if you comment(exclude) the part of the ##random line the code will work with no problem, only for the candlestick chart, but it won't if you include the random line. I've also tried to plot just/only the random line but it won't work also when it's only the random line.
this below is the code:
import numpy as np
import pandas as pd
import datetime
from datetime import timedelta
from datetime import date, datetime
import mplfinance as mpf
import matplotlib.animation as animation
import time
i=0
sign=1
def genData():
    """Append one random OHLC bar to ``df`` and one random point to ``y0_arr``.

    Works entirely through module-level globals: ``i`` counts generated bars,
    ``sign`` flips on every call, and ``close`` carries the previous close
    forward as the next open.
    """
    global df, i, close, sign, y0_arr
    sign = sign*-1
    ## random ohlc generation
    if i==0: #variable initialization
        y0_arr = pd.DataFrame()
        df = pd.DataFrame()
        close = 0
        open_ = np.round(np.random.normal(10, 4), decimals=2) ##initial open
    print("i:",i)
    # Each new bar is dated one day after the previous one.
    dt = datetime.now() + timedelta(days=i)
    dt = dt.date()
    if i > 0:
        open_ = close
    high = np.round(open_ + np.random.normal(0.5, 2), decimals=2)
    low = np.round(open_ - np.random.normal(0.5, 2), decimals=2)
    close = np.round(open_ + sign*np.random.normal(0.2, 0.4), decimals=2)
    dfi = np.column_stack((dt, open_, high, low, close))
    dfi = pd.DataFrame(dfi)
    dfi.columns = ['date', 'open', 'high', 'low', 'close']
    dfi['date'] = pd.to_datetime(dfi['date'], format="%Y/%m/%d")
    dfi.set_index('date', inplace=True)
    dfi = dfi.convert_dtypes(float)
    df = pd.concat([df, dfi])
    ## random ohlc generation
    ##random line (exclude from here...)
    y0 = np.round(np.random.normal(1,2), decimals=2)
    y0i = np.column_stack((dt,y0))
    y0i = pd.DataFrame(y0i)
    y0i.columns = ['date','open']
    y0i['date'] = pd.to_datetime(y0i['date'],format="%Y/%m/%d")
    y0i.set_index('date', inplace=True)
    y0i = y0i.convert_dtypes(float)
    y0_arr = pd.concat([y0_arr,y0i])
    ##random line (...to here)
    time.sleep(1)
    i=i+1
#### plotting
fig = mpf.figure(style="charles",figsize=(7,8))
ax1 = fig.add_subplot(1,1,1)
def animate(ival):
    """FuncAnimation callback: generate one more bar and redraw the axes.

    *ival* is the frame value supplied by FuncAnimation; it is not used.
    """
    global df, y0_arr
    print("animate()")
    genData() ##create new data
    # clear() has to be called; a bare `ax1.clear` only names the method and
    # leaves the previous frame on the axes.
    ax1.clear()
    mpf.plot(df, ax=ax1, type='candle', ylabel='Price US$') ##ohlc
    mpf.plot(y0_arr, ax=ax1, type='line',ylabel='Price US$') ##random line (...exclude this line)
ani = animation.FuncAnimation(fig, animate, interval=250)
mpf.show()
This below are the error messages that I'm getting:
How can I solve this problem? Where to look?
Thank You
Notice that the error message is KeyError: 'Open'. This is because mpf.plot() expects the first argument to be a DataFrame with columns 'Open', 'High', 'Low', and 'Close' (or with OHLC column names that you specify using kwarg columns=).
Apparently your y0_arr is not such a dataframe.
The correct way to add a line to a candlestick plot is with the mpf.make_addplot() call. <-Click here to see the documentation for addplot.
See also https://github.com/matplotlib/mplfinance/blob/master/examples/mpf_animation_macd.py for an example of how to add lines to an animated candlestick plot.

Unable to plot histogram with time on x-axis using Matplotlib and Python

I am trying to plot the number of times a user tweeted at specific times of the day. I plan to plot these on a histogram/bar chart with 24 "bins" - one for each hour.
I have the data in a Pandas dataframe in two columns - the tweet and the time of the tweet (as a datetime object).
I have converted the Time column into a Pandas time, however I am having a hard time plotting correctly. If I set the value of bins to be 24, then I get the following chart (here) which doesn't look correct. Firstly the chart looks wrong, but secondly the x-axis has horrible formatting.
I would like to try to resolve these two issues. Firstly the data isn't being plotted correctly and secondly the horizontal axis formatting is incorrect.
I have plotted the data using Google Sheets and the correct chart should look like this. I don't mind if the values are % of total or absolute volume.
Code to generate plots can be found here. generate_data.py and plot_data.py
Any help is hugely appreciated.
plot_data.py
import datetime
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import pandas as pd
import random
import generate_data
screen_name = "#joebiden"
data = generate_data.get_data(screen_name, save_output=True)
df = pd.DataFrame(data)
df["Time"]= pd.to_datetime(data["Time"], format="%H:%M")
fig, ax = plt.subplots(1,1)
bin_list = [datetime.time(x) for x in range(24)]
ax.hist(df["Time"], bins=24, color='lightblue')
plt.show()
generate_data.py
import json
import re
from datetime import datetime
import tweepy
import common_words
import twitter_auth
def create_connection():
    """Return a tweepy API client authenticated with the keys in ``twitter_auth``."""
    auth = tweepy.OAuthHandler(twitter_auth.CONSUMER_KEY, twitter_auth.CONSUMER_SECRET)
    auth.set_access_token(twitter_auth.ACCESS_KEY, twitter_auth.ACCESS_SECRET)
    return tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)
def retrieve_next_set_of_tweets(screen_name, api, count,max_id):
    '''Return up to *count* tweets of *screen_name* with an id of at most *max_id*.'''
    return api.user_timeline(screen_name=screen_name,count=count, tweet_mode='extended', max_id=max_id)
def get_tweet_times(screen_name, api):
    """Collect the text and creation time of every retrievable tweet.

    Pages backwards through the timeline, 200 tweets at a time, until the API
    returns an empty page. A timeline with no tweets yields two empty lists
    instead of raising IndexError.

    Returns a dict ``{'Tweet': [full_text, ...], 'Time': [created_at, ...]}``.
    """
    all_tweets = {'Tweet':[], 'Time':[]}
    block_of_tweets = api.user_timeline(screen_name=screen_name,count=200, tweet_mode='extended')
    while block_of_tweets:
        all_tweets["Tweet"].extend(tweet.full_text for tweet in block_of_tweets)
        all_tweets["Time"].extend(tweet.created_at for tweet in block_of_tweets)
        # Ask only for tweets strictly older than the last one already seen.
        oldest = block_of_tweets[-1].id - 1
        block_of_tweets = retrieve_next_set_of_tweets(screen_name, api, 200, oldest)
    return all_tweets
def get_all_tweets(screen_name, api):
    """Return the full text of every retrievable tweet of *screen_name*.

    Pages backwards through the timeline, 200 tweets at a time, until the API
    returns an empty page. A timeline with no tweets yields an empty list
    instead of raising IndexError.
    """
    all_tweets = []
    block_of_tweets = api.user_timeline(screen_name=screen_name,count=200, tweet_mode='extended')
    while block_of_tweets:
        all_tweets.extend(tweet.full_text for tweet in block_of_tweets)
        # Ask only for tweets strictly older than the last one already seen.
        oldest = block_of_tweets[-1].id - 1
        block_of_tweets = retrieve_next_set_of_tweets(screen_name, api, 200, oldest)
    return all_tweets
def parse_all_tweets(tweet_list, max_words_to_show=50):
    """Count word occurrences across *tweet_list*.

    Each tweet is stripped of everything except ASCII letters and spaces,
    lower-cased and split on whitespace. Words listed in
    ``common_words.MOST_COMMON_ENGLISH_WORDS`` are ignored.

    Counting stops, and the counts gathered so far are returned, as soon as a
    new word would push the number of distinct words past *max_words_to_show*.
    """
    tweet_dict = {}
    regex = re.compile('[^a-zA-Z ]')
    for tweet in tweet_list:
        text = regex.sub("", tweet).lower().strip().split()
        for word in text:
            if word in common_words.MOST_COMMON_ENGLISH_WORDS:
                continue
            if word in tweet_dict:
                tweet_dict[word] += 1
            elif len(tweet_dict) == max_words_to_show:
                return tweet_dict
            else:
                tweet_dict[word] = 1
    return tweet_dict
def get_data(screen_name, words_or_times="t", save_output=False):
    """Fetch tweet data for *screen_name* and optionally save it as JSON.

    words_or_times:
        "t" returns the dict produced by get_tweet_times; "w" returns word
        counts from parse_all_tweets, ordered from most to least frequent.
        Any other value returns an error message string.
    save_output:
        When true, the result is also written to
        ``{screen_name}_{suffix}.json`` in the current directory.
    """
    api = create_connection()
    print(f"...Getting max of 3245 tweets for {screen_name}...")
    if words_or_times == "t":
        result = get_tweet_times(screen_name, api)
        suffix = "tweet_times"
    elif words_or_times == "w":
        suffix = "ranked_words"
        counts = parse_all_tweets(get_all_tweets(screen_name, api))
        # Dicts keep insertion order, so this ranks the words by frequency.
        result = dict(sorted(counts.items(), key=lambda item: item[1], reverse=True))
    else:
        return "...Error. Please enter 't' or 'w' to signify 'times' or 'words'."
    if save_output:
        f_name = f"{screen_name}_{suffix}.json"
        with open(f_name, "w") as f:
            # default=str turns values json cannot encode itself (such as the
            # tweet timestamps) into strings.
            json.dump(result, f, indent=4, default=str)
        print(f"...Complete! File saved as '{f_name}'")
    return result
if __name__ == "__main__":
    # Script entry point: fetch the tweet times and save them to JSON.
    get_data(screen_name="#joebiden", save_output=True)
OK. So you want to have time only from your date time. Try replacing
df["Time"]= pd.to_datetime(data["Time"], format="%H:%M")
With
df['Time'] = pd.to_datetime(df['Time'],format= '%H:%M' ).dt.time
I have tried plotting the data you have shared in the comment to the answer by Serge de Gosson de Varennes. The only thing I needed to change in your plot_data.py script was the date format where I added seconds. The rest worked as expected, the times are processed correctly for the x-axis.
Here is an example where the histogram is created with pd.Series.hist for convenience. The weights argument is included to produce a graph with percentages. Most of the code is for formatting:
import numpy as np # v 1.19.2
import pandas as pd # v 1.1.3
import matplotlib.pyplot as plt # v 3.3.2
import matplotlib.dates as mdates
# Import data from html table into pandas dataframe
url = 'https://docs.google.com/spreadsheets/d/e/2PACX-1vTc97VEzlfDP_jEkjC7dTbJzcLBLDQeFwPMg6E36BaiH5qkhnedSz8wsVGUMyW6kt85rD20BcTMbvqp/pubhtml'
table, = pd.read_html(url, header=[1], index_col=1)
df = table.iloc[:, 1:]

# Save time variable as a pandas series of datetime dtype
time_var = pd.to_datetime(df['Time'], format='%H:%M:%S')

# Plot variable with pandas histogram function
# (equal weights of 1/N make the bar heights sum to 1, i.e. relative frequencies)
ax = time_var.hist(bins=24, figsize=(10,5), grid=False, edgecolor='white', zorder=2,
                   weights=np.ones(time_var.size)/time_var.size)

# Format x and y-axes tick labels
ax.xaxis.set_major_formatter(mdates.DateFormatter('%H:%M'))
ax.yaxis.set_major_formatter('{x:.1%}')

# Additional formatting
alpha = 0.3
ax.grid(axis='y', zorder=1, color='black', alpha=alpha)
for spine in ['top', 'right', 'left']:
    ax.spines[spine].set_visible(False)
ax.spines['bottom'].set_alpha(alpha)
ax.tick_params(axis='x', which='major', color=[0, 0, 0, alpha])
ax.tick_params(axis='y', which='major', length=0)
ax.set_title('Tweets sent per hour of day in UTC', fontsize=14, pad=20)
ax.set_ylabel('Relative frequency (% of total)', size=12, labelpad=10)

plt.show()
Because the counts are spread over 24 hours in this histogram, you may notice that the height of the bars are slightly different from those in the histogram in the image you have shared as a reference where the counts seem to be grouped into 23 bins instead of 24.
Reference: this answer by ImportanceOfBeingErnest

Plotly: How to handle missing dates for a financial time series?

Financial time series are often fraught with missing data. And out of the box, plotly handles a series with missing timestamps visually by just displaying a line like below. But the challenge here is that plotly interprets the timestamps as a value, and inserts all missing dates in the figure.
Most of the time, I find that the plot would look better by just completely leaving those dates out.
An example from the plotly docs under https://plotly.com/python/time-series/#hiding-weekends-and-holidays shows how to handle missing dates for some date categories like weekends or holidays using:
fig.update_xaxes(
rangebreaks=[
dict(bounds=["sat", "mon"]), #hide weekends
dict(values=["2015-12-25", "2016-01-01"]) # hide Christmas and New Year's
]
)
The downside here is that your dataset may just as well be missing some data for any other weekday. And of course you would have to specify given dates for holidays for different countries, so are there any other approaches?
Reproducible code:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
# data
np.random.seed(1234)
n_obs = 15
frequency = 'D'
daterange = pd.date_range('2020', freq=frequency, periods=n_obs)
values = np.random.randint(low=-5, high=6, size=n_obs).tolist()
df = pd.DataFrame({'time':daterange, 'value':values})
df = df.set_index('time')
df.iloc[0]=100; df['value']=df.value.cumsum()
# Missing timestamps
df.iloc[2:5] = np.nan; df.iloc[8:13] = np.nan
df.dropna(inplace = True)
# plotly figure
fig=go.Figure(go.Scatter(x=df.index, y =df['value']))
fig.update_layout(template = 'plotly_dark')
fig.show()
The key here is still to use the rangebreaks attribute. But if you were to follow the approach explained in the linked example, you'd have to include each missing date manually. But the solution to missing data in this case is actually more missing data. And this is why:
1. You can retrieve the timestamps from the beginning and the end of your series, and then
2. build a complete timeline within that period (with possibly more missing dates) using:
dt_all = pd.date_range(start=df.index[0],
end=df.index[-1],
freq = 'D')
3. Next you can isolate the timestamps in that complete timeline that are missing from df.index using:
dt_breaks = [d for d in dt_all_py if d not in dt_obs_py]
4. And finally you can include those timestamps in rangebreaks like so:
fig.update_xaxes(
rangebreaks=[dict(values=dt_breaks)]
)
Plot:
Complete code:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
# data
np.random.seed(1234)
n_obs = 15
frequency = 'D'
daterange = pd.date_range('2020', freq=frequency, periods=n_obs)
values = np.random.randint(low=-5, high=6, size=n_obs).tolist()
df = pd.DataFrame({'time':daterange, 'value':values})
df = df.set_index('time')
df.iloc[0]=100; df['value']=df.value.cumsum()

# Missing timestamps
df.iloc[2:5] = np.nan; df.iloc[8:13] = np.nan
df.dropna(inplace = True)

# plotly figure
fig=go.Figure(go.Scatter(x=df.index, y =df['value']))
fig.update_layout(template = 'plotly_dark')

# complete timeline between first and last timestamps
dt_all = pd.date_range(start=df.index[0],
                       end=df.index[-1],
                       freq = frequency)

# make sure input and synthetic time series are of the same types
dt_all_py = [d.to_pydatetime() for d in dt_all]
# a set gives constant-time membership tests in the comparison below
dt_obs_py = {d.to_pydatetime() for d in df.index}

# find which timestamps are missing in the complete timeline
dt_breaks = [d for d in dt_all_py if d not in dt_obs_py]

# remove missing timestamps from visualization
fig.update_xaxes(
    rangebreaks=[dict(values=dt_breaks)] # hide timestamps with no values
)
#fig.update_layout(title=dict(text="Some dates are missing, but still displayed"))
fig.update_layout(title=dict(text="Missing dates are excluded by rangebreaks"))
fig.update_xaxes(showgrid=False)
fig.show()
When handling large datasets, the 'rangebreaks' method works but performance is low; changing the x-axis type to 'category' also works.
fig.update_xaxes(type='category')
You can use dtick property. change the tick interval to be one day which should be defined in Milliseconds like 86400000 via the dtick property. Refer the following code:
fig.update_xaxes(dtick=86400000)

Set maximum of datapoints per plot

Im using the following code:
import matplotlib.pyplot as pyplot
import pandas as pandas
from datetime import datetime
dataset = pandas.read_csv("HugLog_17.01.11.csv", sep=",", header=0)
print('filter data for SrcAddr')
dataset_filtered = dataset[dataset['SrcAddr']=='0x1FD3']
print('get Values')
varY = dataset_filtered.Battery_Millivolt.values
varX = dataset_filtered.Timestamp.values
print('Convert the date-strings in date-objects.')
dates_list = [datetime.strptime(date, '%y-%m-%d %H:%M:%S') for date in varX]
fig = pyplot.figure()
ax1 = fig.add_subplot(1,1,1)
ax1.set_xlabel('Time')
ax1.set_ylabel('Millivolt')
ax1.bar(dates_list, varY)
pyplot.locator_params(axis='x',nbins=10)
pyplot.show()
The problem I have is that it's a large data collection with 180k datapoints.
And pyplot displays all points on the graph, which makes it slow, and the bars overlap. Is there a way to set a maximum limit on how many datapoints are displayed in a "view"?
What I mean by that is that as soon as the graph is rendered there are only 50 datapoints, and when I zoom in I only get a maximum of 50 datapoints again.
Resampling can be done with the resample function from pandas.
Note that the resample syntax has changed between version 0.17 and 0.19 of pandas. The example below uses the old style. See e.g. this tutorial for the new style.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# generate some data for every second over a whole day
times = pd.date_range(start='2017-01-11',periods=86400, freq='1s')
df = pd.DataFrame(index = times)
df['data'] = np.sort(np.random.randint(low=1300, high=1600, size=len(df.index)) )[::-1] + \
             np.random.rand(len(df.index))*100

# resample the data, taking the mean over 1 hours ("h")
t = "h" # for hours, try "min" for minutes as well
width=1./24 #matplotlib default uses a width of 1 day per bar
            # try width=1./(24*60) for minutes
df_resampled = pd.DataFrame()
# New-style call: the aggregation is a method on the resampler. The old
# resample(t, how="mean") keyword form is no longer accepted by pandas.
df_resampled['data'] = df.data.resample(t).mean()
fig, ax = plt.subplots()
#ax.bar(df.index, df['data'], width=1./(24*60*60)) # original data, takes too long to plot
ax.bar(df_resampled.index, df_resampled['data'], width=width)
ax.xaxis_date()
plt.show()
Automatic adaption of the resampling when zooming would indeed require some manual work. There is a resampling example on the matplotlib event handling page, which does not work out of the box but could be adapted accordingly.
This is how it would look like:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
import matplotlib.dates
class Sampler():
    """Resample a time-indexed frame to a bar resolution that suits the visible x-range."""

    def __init__(self,df):
        """Keep *df*; its ``data`` column is what gets resampled."""
        self.df = df

    def resample(self, limits):
        """Return ``(index, values, bar_width)`` for the window *limits*.

        *limits* is a ``[start, stop]`` pair, given either as timestamps or as
        matplotlib date numbers (floats, in days). The wider the window, the
        coarser the resampling rule. ``bar_width`` is the rule's length in days.
        """
        print(limits)
        dt = limits[1] - limits[0]
        # A difference of two date numbers is a plain number of days.
        if not isinstance(dt, datetime.timedelta):
            dt = datetime.timedelta(days=dt)
        print(dt)
        #see #http://pandas.pydata.org/pandas-docs/stable/timeseries.html#offset-aliases
        if dt > datetime.timedelta(hours=5):
            t = "h"; width=1./24
        elif dt > datetime.timedelta(minutes=60):
            t = "15min"; width=15./(24.*60)
        elif dt > datetime.timedelta(minutes=5):
            t = "min"; width=1./(24.*60)
        elif dt > datetime.timedelta(seconds=60):
            t = "15s"; width=15./(24.*60*60)
        else:
            #dt < datetime.timedelta(seconds=60):
            t = "s"; width=1./(24.*60*60)

        self.resampled = pd.DataFrame()
        self.resampled['data'] = self.df.data.resample(t).mean()
        print(t, len(self.resampled['data']))
        print("indextype", type(self.resampled.index[0]))
        print("limitstype", type(limits[1]))

        if isinstance(limits[1], float):
            dlowlimit = matplotlib.dates.num2date(limits[0])
            duplimit = matplotlib.dates.num2date(limits[1])
            if getattr(self.resampled.index, "tz", None) is None:
                # num2date returns timezone-aware datetimes, which cannot be
                # compared with a timezone-naive index.
                dlowlimit = dlowlimit.replace(tzinfo=None)
                duplimit = duplimit.replace(tzinfo=None)
            print(type(duplimit), duplimit)
        else:
            dlowlimit, duplimit = limits[0], limits[1]

        # Keep only the bars inside the requested window.
        self.resampled = self.resampled.loc[self.resampled.index <= duplimit]
        self.resampled = self.resampled.loc[self.resampled.index >= dlowlimit]
        return self.resampled.index,self.resampled['data'],width

    def update(self, ax):
        """Callback for 'xlim_changed': redraw the bars at a resolution matching the new view."""
        print("update")
        lims = ax.viewLim
        start, stop = lims.intervalx
        ax.clear()
        x,y,width = self.resample([start, stop])
        ax.bar(x,y, width=width)
        ax.set_xlim([start, stop])
        # Re-register after set_xlim so that call does not trigger this callback;
        # presumably ax.clear() dropped the earlier registration (confirm).
        ax.callbacks.connect('xlim_changed', self.update)
        ax.figure.canvas.draw()
# One value per second over a whole day.
times = pd.date_range(start='2017-01-11',periods=86400, freq='1s')
df = pd.DataFrame(index = times)
df['data'] = np.sort(np.random.randint(low=1300, high=1600, size=len(df.index)) )[::-1] + \
             np.random.rand(len(df.index))*500

sampler = Sampler(df)
# Initial view: the whole day.
x,y,width = sampler.resample( [df.index[0],df.index[-1] ] )

fig, ax = plt.subplots()
ax.bar(x,y, width=width)
ax.xaxis_date()
# connect to limits changes
ax.callbacks.connect('xlim_changed', sampler.update)
plt.show()
One thing you can do is plot a random subset of the data by using the sample method on your pandas DataFrame. Use the frac argument to determine the fraction of points you want to use. It ranges from 0 to 1.
After you get your dataset_filtered DataFrame, take a sample of it like this
dataset_filtered_sample = dataset_filtered.sample(frac=.001)

Categories