Set maximum of datapoints per plot - python

Im using the following code:
import matplotlib.pyplot as pyplot
import pandas as pandas
from datetime import datetime
dataset = pandas.read_csv("HugLog_17.01.11.csv", sep=",", header=0)
print('filter data for SrcAddr')
dataset_filtered = dataset[dataset['SrcAddr']=='0x1FD3']
print('get Values')
varY = dataset_filtered.Battery_Millivolt.values
varX = dataset_filtered.Timestamp.values
print('Convert the date-strings in date-objects.')
dates_list = [datetime.strptime(date, '%y-%m-%d %H:%M:%S') for date in varX]
fig = pyplot.figure()
ax1 = fig.add_subplot(1,1,1)
ax1.set_xlabel('Time')
ax1.set_ylabel('Millivolt')
ax1.bar(dates_list, varY)
pyplot.locator_params(axis='x',nbins=10)
pyplot.show()
The problem i have is, its a large datacollection with 180k datapoints.
And pyplot displays all points an the graph which makes it slow and the bars overlap. Is there a way to set a maximum-limit on how much datapoints a displayed at a "view".
What i mean by that is, that as soon as the graph is render ther are only 50 datapoints and when i zoomm in i only get a maximum of 50 datapoints again.

Resampling can be done with the resample function from pandas.
Note that the resample syntax has changed between version 0.17 and 0.19 of pandas. The example below uses the old style. See e.g. this tutorial for the new style.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# generate some data for every second over a whole day
times = pd.date_range(start='2017-01-11',periods=86400, freq='1S')
df = pd.DataFrame(index = times)
df['data'] = np.sort(np.random.randint(low=1300, high=1600, size=len(df.index)) )[::-1] + \
np.random.rand(len(df.index))*100
# resample the data, taking the mean over 1 hours ("H")
t = "H" # for hours, try "T" for minutes as well
width=1./24 #matplotlib default uses a width of 1 day per bar
# try width=1./(24*60) for minutes
df_resampled = pd.DataFrame()
df_resampled['data'] = df.data.resample(t, how="mean")
fig, ax = plt.subplots()
#ax.bar(df.index, df['data'], width=1./(24*60*60)) # original data, takes too long to plot
ax.bar(df_resampled.index, df_resampled['data'], width=width)
ax.xaxis_date()
plt.show()
Automatic adaption of the resampling when zooming would indeed require some manual work. There is a resampling example on the matplotlib event handling page, which does not work out of the box but could be adapted accordingly.
This is how it would look like:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
import matplotlib.dates
class Sampler():
def __init__(self,df):
self.df = df
def resample(self, limits):
print limits
dt = limits[1] - limits[0]
if (type(dt) != pd.tslib.Timedelta) and (type(dt) != datetime.timedelta):
dt = datetime.timedelta(days=dt)
print dt
#see #http://pandas.pydata.org/pandas-docs/stable/timeseries.html#offset-aliases
if dt > datetime.timedelta(hours=5):
t = "H"; width=1./24
elif dt > datetime.timedelta(minutes=60):
t = "15T"; width=15./(24.*60)
elif dt > datetime.timedelta(minutes=5):
t = "T"; width=1./(24.*60)
elif dt > datetime.timedelta(seconds=60):
t = "15S"; width=15./(24.*60*60)
else:
#dt < datetime.timedelta(seconds=60):
t = "S"; width=1./(24.*60*60)
self.resampled = pd.DataFrame()
self.resampled['data'] = self.df.data.resample(t, how="mean")
print t, len(self.resampled['data'])
print "indextype", type(self.resampled.index[0])
print "limitstype", type(limits[1])
if type(limits[1]) == float or type(limits[1]) == np.float64 :
dlowlimit = matplotlib.dates.num2date(limits[0])
duplimit = matplotlib.dates.num2date(limits[1])
print type(duplimit), duplimit
self.resampled = self.resampled.loc[self.resampled.index <= duplimit]
self.resampled = self.resampled.loc[self.resampled.index >= dlowlimit]
else:
self.resampled = self.resampled.loc[self.resampled.index <= limits[1]]
self.resampled = self.resampled.loc[self.resampled.index >= limits[0]]
return self.resampled.index,self.resampled['data'],width
def update(self, ax):
print "update"
lims = ax.viewLim
start, stop = lims.intervalx
ax.clear()
x,y,width = self.resample([start, stop])
ax.bar(x,y, width=width)
ax.set_xlim([start, stop])
ax.callbacks.connect('xlim_changed', self.update)
ax.figure.canvas.draw()
times = pd.date_range(start='2017-01-11',periods=86400, freq='1S')
df = pd.DataFrame(index = times)
df['data'] = np.sort(np.random.randint(low=1300, high=1600, size=len(df.index)) )[::-1] + \
np.random.rand(len(df.index))*500
sampler = Sampler(df)
x,y,width = sampler.resample( [df.index[0],df.index[-1] ] )
fig, ax = plt.subplots()
ax.bar(x,y, width=width)
ax.xaxis_date()
# connect to limits changes
ax.callbacks.connect('xlim_changed', sampler.update)
plt.show()

One thing you can do is plot a random subset of the data by using the sample method on your pandas DataFrame. Use the frac argument to determine the fraction of points you want to use. It ranges from 0 to 1.
After you get your dataset_filtered DataFrame, take a sample of it like this
dataset_filtered_sample = dataset_filtered.sample(frac=.001)

Related

Plot datetime data in 24 hour window on x axis

I have a dataframe with datetime data:
Start_time: eg(2013-09-21 00:14:00) - the timestamp a task has started
End_time: eg(2013-09-22 11:04:00) - the timestamp a task has ended
Time_diff:eg(0 days 06:07:00) - the time the task took.
I want to plot a histogram of the time events start and end, without considering the date (so only the 24 clock).
I have tried to use:
df['Start_time'].dt.time
to just get the time and plot.
However I am then unable afterwards to BIN the timestamps (now objects) in 20 bins.
This is my result so far:
This is what I am trying to get, a plot with 24hours on the x axis, and the binned distribution of start time & end_time for the y
Here is the code
from random import randrange
import datetime
import pandas as pd
import plotly.express as px
# make the EXAMPLE dataset
startDate = datetime.datetime(2013, 9, 20,13,00)
start_lst = []
end_lst = []
for i in range(200):
start_time= startDate + datetime.timedelta(hours=randrange(23), minutes= randrange(60))
end_time = start_time + datetime.timedelta(hours=randrange(2,7), minutes= randrange(60))
startDate = startDate + datetime.timedelta(days=randrange(4))
start_lst.append(start_time)
end_lst.append(end_time)
df = pd.DataFrame({'Start_time': start_lst,
'End_time': end_lst
})
df['Time_diff'] = df['End_time']-df['Start_time']
#start of code
#tried just using histogram, but sicne the date changes, it wont plot over 24hours
fig = px.histogram(df, x=['Start_time', 'End_time'], nbins=20)
fig.show()
#so tried removing the date part, and just leaving time, however now it wont properly bin
df['Start_time_nodate'] = df['Start_time'].dt.time
df['End_time_nodate'] = df['End_time'].dt.time
fig = px.histogram(df, x=['Start_time_nodate', 'End_time_nodate'], nbins=20)
fig.show()
If I understand correctly, with your example dataframe, here is one way to do it with Matplotlib:
from matplotlib import pyplot as plt
# Setup
df["Start_time_nodate"] = df["Start_time"].dt.hour
df["End_time_nodate"] = df["End_time"].dt.hour
fig, ax = plt.subplots(figsize=(8, 4))
# Plot frequencies
ax.plot(df["Start_time_nodate"].value_counts(sort=False).sort_index())
ax.plot(df["End_time_nodate"].value_counts(sort=False).sort_index())
# Style plot
ax.legend(["Start time", "End time"])
ax.set_xticks(ticks=[i for i in range(0, 25)])
ax.set_xticklabels([i for i in range(0, 25)])
plt.xlabel("24 hours")
plt.ylabel("Frequency")
ax.margins(x=0)
In a Jupyter notebook, this code outputs the following image:

Candlesticks in matplotlib

I'm trying to make a cryptoscanner but I'm struggling a bit. The code right now can loop through the different coins in symbols.csv and print plots for all of them. The plots include close price, SMA and bollinger bands. Now I really want the close price to be candlesticks and not a line. I've found that there are other plots like mpf to make candlesticks. The problem is that I don't know how to make the bollinger bands work with the mpf plots and I don't know how to make the candlesticks work with matplotlib. Can someone help me making candlesticks in matplotlib orrr make the bollingerbands in the mpf plots.
Thanks in advance!
The graph looks like this right now
import yfinance as yf
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import matplotlib
with open('symbols.csv') as f:
symbols = f.read().splitlines()
for symbol in symbols:
df = yf.download(symbol, start='2020-01-01')
# df = yf.download(symbol, period = '22h', interval = '15m')
print(df)
# df = yf.download('ADA-USD', start='2021-01-01')
df['SMA'] = df.Close.rolling(window=20).mean()
df['stddev'] = df.Close.rolling(window=20).std()
df['Upper'] = df.SMA + 2* df.stddev
df['Lower'] = df.SMA - 2* df.stddev
df['Buy_Signal'] = np.where(df.Lower > df.Close, True, False)
df['Sell_Signal'] = np.where(df.Upper < df.Close, True, False)
buys = []
sells = []
open_pos = False
for i in range(len(df)):
if df.Lower[i] > df.Close[i]:
if open_pos == False:
buys.append(i)
open_pos = True
elif df.Upper[i] < df.Close[i]:
if open_pos:
sells.append(i)
open_pos = False
plt.figure(figsize=(12, 6))
plt.scatter(df.iloc[buys].index, df.iloc[buys].Close, marker = '^', color ='g')
plt.scatter(df.iloc[sells].index, df.iloc[sells].Close, marker = '^', color ='r')
plt.plot(df[['Close', 'SMA', 'Upper', 'Lower']])
plt.fill_between(df.index, df.Upper, df.Lower, color='grey', alpha=0.3)
plt.legend(['Close', 'SMA', 'Upper', 'Lower'])
plt.show()
merged = pd.concat([df.iloc[buys].Close, df.iloc[sells].Close], axis=1)
merged.columns = ['Buys', 'Sells']
print(merged)
totalprofit = merged.shift(-1).Sells - merged.Buys
print(totalprofit)
relprofits = (merged.shift(-1).Sells - merged.Buys) / merged.Buys
print(relprofits.mean())
The links in the comments provide a wealth of examples. Since you want to graph candlesticks, Bollinger Bands, and SMAs in mpf, I have modified the additional plot examples from the previous examples to suit your needs. The graphs were created from data obtained in stocks instead of currencies.
import yfinance as yf
import pandas as pd
import mplfinance as mpf
df = yf.download("AAPL", start="2020-01-01")
df['SMA'] = df.Close.rolling(window=20).mean()
df['stddev'] = df.Close.rolling(window=20).std()
df['Upper'] = df.SMA + 2* df.stddev
df['Lower'] = df.SMA - 2* df.stddev
df['Buy_Signal'] = np.where(df.Lower > df.Close, True, False)
df['Sell_Signal'] = np.where(df.Upper < df.Close, True, False)
tcdf = df[['Lower','Upper','SMA']]
apd = mpf.make_addplot(tcdf)
mpf.plot(df, figratio=(8,4), type='candle', addplot=apd, volume=False, style='yahoo')

Unable to plot histogram with time on x-axis using Matplotlib and Python

I am trying to plot the number of times a user tweeted at specific times of the day. I plan to plot these on a histogram/bar chart with 24 "bins" - one for each hour.
I have the data in a Pandas dataframe in two columns - the tweet and the time of the tweet (as a datetime object).
I have converted the Time column into a Pandas time, however I am having a hard time plotting correctly. If I set the value of bins to be 24, then I get the following chart (here) which doesn't look correct. Firstly the chart looks wrong, but secondly the x-axis has horrible formatting.
I would like to try to resolve these two issues. Firstly the data isn't being plotted correctly and secondly the horizontal axis formatting is incorrect.
I have plotted the data using Google Sheets and the correct chart should look like this. I don't mind if the values are % of total or absolute volume.
Code to generate plots can be found here. generate_data.py and plot_data.py
Any help is hugely appreciated.
plot_data.py
import datetime
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import pandas as pd
import random
import generate_data
screen_name = "#joebiden"
data = generate_data.get_data(screen_name, save_output=True)
df = pd.DataFrame(data)
df["Time"]= pd.to_datetime(data["Time"], format="%H:%M")
fig, ax = plt.subplots(1,1)
bin_list = [datetime.time(x) for x in range(24)]
ax.hist(df["Time"], bins=24, color='lightblue')
plt.show()
generate_data.py
import json
import re
from datetime import datetime
import tweepy
import common_words
import twitter_auth
def create_connection():
auth = tweepy.OAuthHandler(twitter_auth.CONSUMER_KEY, twitter_auth.CONSUMER_SECRET)
auth.set_access_token(twitter_auth.ACCESS_KEY, twitter_auth.ACCESS_SECRET)
return tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)
def retrieve_next_set_of_tweets(screen_name, api, count,max_id):
'''Return next 200 user tweets'''
return api.user_timeline(screen_name=screen_name,count=count, tweet_mode='extended', max_id=max_id)
def get_tweet_times(screen_name, api):
user_tweet_count = api.get_user(screen_name).statuses_count
all_tweets = {'Tweet':[], 'Time':[]}
block_of_tweets = api.user_timeline(screen_name=screen_name,count=200, tweet_mode='extended')
all_tweets["Tweet"].extend([tweet.full_text for tweet in block_of_tweets])
all_tweets["Time"].extend([tweet.created_at for tweet in block_of_tweets])
oldest = block_of_tweets[-1].id - 1
while block_of_tweets:
try:
block_of_tweets = retrieve_next_set_of_tweets(screen_name, api, 200, oldest)
oldest = block_of_tweets[-1].id - 1
except IndexError: #Reached limit of 3245
pass
# all_tweets.update({tweet.full_text: tweet.created_at.time() for tweet in block_of_tweets})
all_tweets["Tweet"].extend([tweet.full_text for tweet in block_of_tweets])
all_tweets["Time"].extend([tweet.created_at for tweet in block_of_tweets])
return all_tweets
def get_all_tweets(screen_name, api):
user_tweet_count = api.get_user(screen_name).statuses_count
all_tweets = []
block_of_tweets = api.user_timeline(screen_name=screen_name,count=200, tweet_mode='extended')
all_tweets.extend([tweet.full_text for tweet in block_of_tweets])
oldest = block_of_tweets[-1].id - 1
while block_of_tweets:
try:
block_of_tweets = retrieve_next_set_of_tweets(screen_name, api, 200, oldest)
oldest = block_of_tweets[-1].id - 1
except IndexError: #Reached limit of 3245
pass
all_tweets.extend([tweet.full_text for tweet in block_of_tweets])
return all_tweets
def parse_all_tweets(tweet_list, max_words_to_show=50):
tweet_dict = {}
regex = re.compile('[^a-zA-Z ]')
for tweet in tweet_list:
text = regex.sub("", tweet).lower().strip().split()
for word in text:
if word in common_words.MOST_COMMON_ENGLISH_WORDS: continue
if word in tweet_dict.keys():
tweet_dict[word] += 1
else:
if len(tweet_dict.items()) == max_words_to_show:
return tweet_dict
tweet_dict[word] = 1
return tweet_dict
def get_data(screen_name, words_or_times="t", save_output=False):
api = create_connection()
print(f"...Getting max of 3245 tweets for {screen_name}...")
if words_or_times == "t":
all_tweets = get_tweet_times(screen_name, api)
suffix = "tweet_times"
elif words_or_times == "w":
suffix = "ranked_words"
parsed_tweets = parse_all_tweets(get_all_tweets(screen_name, api))
parsed_tweets = {k:v for k,v in sorted(parsed_tweets.items(), key=lambda item: item[1], reverse=True)}
else:
return "...Error. Please enter 't' or 'w' to signify 'times' or 'words'."
if save_output:
f_name = f"{screen_name}_{suffix}.json"
with open(f_name, "w") as f:
json.dump(all_tweets if words_or_times == "t" else parsed_tweets, f, indent=4, default=str)
print(f"...Complete! File saved as '{f_name}'")
return all_tweets if words_or_times == "t" else parsed_tweets
if __name__ == "__main__":
get_data(screen_name="#joebiden", save_output=True)
OK. So you want to have time only from your date time. Try replacing
df["Time"]= pd.to_datetime(data["Time"], format="%H:%M")
With
df['Time'] = pd.to_datetime(df['Time'],format= '%H:%M' ).dt.time
I have tried plotting the data you have shared in the comment to the answer by Serge de Gosson de Varennes. The only thing I needed to change in your plot_data.py script was the date format where I added seconds. The rest worked as expected, the times are processed correctly for the x-axis.
Here is an example where the histogram is created with pd.Series.hist for convenience. The weights argument is included to produce a graph with percentages. Most of the code is for formatting:
import numpy as np # v 1.19.2
import pandas as pd # v 1.1.3
import matplotlib.pyplot as plt # v 3.3.2
import matplotlib.dates as mdates
# Import data from html table into pandas dataframe
url = 'https://docs.google.com/spreadsheets/d/e/2PACX-1vTc97VEzlfDP_jEkjC7dTbJzcLBLDQeFwPMg6E36BaiH5qkhnedSz8wsVGUMyW6kt85rD20BcTMbvqp/pubhtml'
table, = pd.read_html(url, header=[1], index_col=1)
df = table.iloc[:, 1:]
# Save time variable as a pandas series of datetime dtype
time_var = pd.to_datetime(df['Time'], format='%H:%M:%S')
# Plot variable with pandas histogram function
ax = time_var.hist(bins=24, figsize=(10,5), grid=False, edgecolor='white', zorder=2,
weights=np.ones(time_var.size)/time_var.size)
# Format x and y-axes tick labels
ax.xaxis.set_major_formatter(mdates.DateFormatter('%H:%M'))
ax.yaxis.set_major_formatter('{x:.1%}')
# Additional formatting
alpha = 0.3
ax.grid(axis='y', zorder=1, color='black', alpha=alpha)
for spine in ['top', 'right', 'left']:
ax.spines[spine].set_visible(False)
ax.spines['bottom'].set_alpha(alpha)
ax.tick_params(axis='x', which='major', color=[0, 0, 0, alpha])
ax.tick_params(axis='y', which='major', length=0)
ax.set_title('Tweets sent per hour of day in UTC', fontsize=14, pad=20)
ax.set_ylabel('Relative frequency (% of total)', size=12, labelpad=10)
plt.show()
Because the counts are spread over 24 hours in this histogram, you may notice that the height of the bars are slightly different from those in the histogram in the image you have shared as a reference where the counts seem to be grouped into 23 bins instead of 24.
Reference: this answer by ImportanceOfBeingErnest

Skipping certain values in Python with Matplotlib

I am currently working on an intra-day stock chart using the Alpha Vantage API. The data frame contains values from 4:00 to 20:00. In my matplotlib.pyplot chart however, the x-Axis also includes values from 20:00 to 4:00 over night. I dont want this as it messes up the aesthetics and also the Volume subplot.
Q: Is there any way to skip x-Axis values which dont exist in the actual Data Frame (the values from 20:00 to 04:00)?
As you can see, the Data Frame clearly jumps from 20:00 to 04:00
However in the Matplotlib chart, the x-Axis contains the values from 20:00 to 4:00, messing with the chart
Code so far. I believe so far everything is right:
import pandas as pd
import matplotlib.pyplot as plt
from alpha_vantage.timeseries import TimeSeries
import time
import datetime as dt
from datetime import timedelta as td
from dateutil.relativedelta import relativedelta
#Accessing and Preparing API
ts = TimeSeries(key=api_key, output_format='pandas')
ticker_input = "TSLA"
interval_input = "15min"
df, meta_data = ts.get_intraday(symbol = ticker_input, interval = interval_input, outputsize = 'full')
slice_date = 16*4*5
df = df[0:slice_date]
df = df.iloc[::-1]
df["100ma"] = df["4. close"].rolling(window = 50, min_periods = 0).mean()
df["Close"] = df["4. close"]
df["Date"] = df.index
#Plotting all as 2 different subplots
ax1 = plt.subplot2grid((7,1), (0,0), rowspan = 5, colspan = 1)
ax1.plot(df["Date"], df['Close'])
ax1.plot(df["Date"], df["100ma"], linewidth = 0.5)
plt.xticks(rotation=45)
ax2 = plt.subplot2grid((6,1), (5,0), rowspan = 2, colspan = 2, sharex = ax1)
ax2.bar(df["Date"], df["5. volume"])
ax2.axes.xaxis.set_visible(False)
plt.tight_layout()
plt.show()
It would be great if anybody could help. Im still a complete beginner and only started Python 2 weeks ago.
We got the data from the same place, although the data acquisition method is different. After extracting it in 15 units, I created a graph by excluding the data after 8pm and before 4pm. I created the code with the understanding that your skip would open up the pause. What you want it to skip is skipped once the NaN is set.
import datetime
import pandas as pd
import numpy as np
import pandas_datareader.data as web
import mplfinance as mpf
# import matplotlib.pyplot as plt
with open('./alpha_vantage_api_key.txt') as f:
api_key = f.read()
now_ = datetime.datetime.today()
start = datetime.datetime(2019, 1, 1)
end = datetime.datetime(now_.year, now_.month, now_.day - 1)
symbol = 'TSLA'
df = web.DataReader(symbol, 'av-intraday', start, end, api_key=api_key)
df.columns = ['Open', 'High', 'Low', 'Close', 'Volume']
df.index = pd.to_datetime(df.index)
df["100ma"] = df["Close"].rolling(window = 50, min_periods = 0).mean()
df["Date"] = df.index
df_15 = df.asfreq('15min')
df_15 = df_15[(df_15.index.hour >= 4)&(df_15.index.hour <= 20) ]
import matplotlib.pyplot as plt
fig = plt.figure(figsize=(8,4.5),dpi=144)
#Plotting all as 2 different subplots
ax1 = plt.subplot2grid((7,1), (0,0), rowspan = 5, colspan = 1)
ax1.plot(df_15["Date"], df_15['Close'])
ax1.plot(df_15["Date"], df_15["100ma"], linewidth = 0.5)
plt.xticks(rotation=20)
ax2 = plt.subplot2grid((6,1), (5,0), rowspan = 2, colspan = 2, sharex = ax1)
ax2.bar(df_15["Date"], df_15["Volume"])
ax2.axes.xaxis.set_visible(False)
# plt.tight_layout()
plt.show()
I fixed it using matplotlib.ticker.formatter.
I first created a class and using:
class MyFormatter(Formatter):
def __init__(self, dates, fmt='%Y-%m-%d %H:%M'):
self.dates = dates
self.fmt = fmt
def __call__(self, x, pos=0):
'Return the label for time x at position pos'
ind = int(np.round(x))
if ind >= len(self.dates) or ind < 0:
return ''
return self.dates[ind].strftime(self.fmt)
formatter = MyFormatter(df.index)
ax1 = plt.subplot2grid((7,1), (0,0), rowspan = 5, colspan = 1)
ax1.xaxis.set_major_formatter(formatter)
ax1.plot(np.arange(len(df)), df["Close"])
ax1.plot(np.arange(len(df)), df["100ma"], linewidth = 0.5)
ax1.xticks(rotation=45)
ax1.axis([xmin,xmax,ymin,ymax])
ax2 = plt.subplot2grid((6,1), (5,0), rowspan = 2, colspan = 2, sharex = ax1)
ax2.bar(np.arange(len(df)), df["5. volume"])
plt.show()
This gave me a smoother graph than the one before and also that recommended by r-beginner.
The only issue that I have is that if I zoom in the x-axis doesnt really change. it always has teh year, month, date, hour, and minute. Obviously I only want hour and minute when Im zoomed in further. I am yet to figure out how to do that

Cannot prepare proper labels in Matplotlib

I have very simple code:
from matplotlib import dates
import matplotlib.ticker as ticker
my_plot=df_h.boxplot(by='Day',figsize=(12,5), showfliers=False, rot=90)
I've got:
but I would like to have fewer labels on X axis. To do this I've add:
my_plot.xaxis.set_major_locator(ticker.MaxNLocator(12))
It generates fewer labels but values of labels have wrong values (=first of few labels from whole list)
What am I doing wrong?
I have add additional information:
I've forgoten to show what is inside DataFrame.
I have three columns:
reg_Date - datetime64 (index)
temperature - float64
Day - date converted from reg_Date to string, it looks like '2017-10' (YYYY-MM)
Box plot group date by 'Day' and I would like to show values 'Day" as a label but not all values
, for example every third one.
You were almost there. Just set ticker.MultipleLocator.
The pandas.DataFrame.boxplot also returns axes, which is an object of class matplotlib.axes.Axes. So you can use this code snippet to customize your labels:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
center = np.random.randint(50,size=(10, 20))
spread = np.random.rand(10, 20) * 30
flier_high = np.random.rand(10, 20) * 30 + 30
flier_low = np.random.rand(10, 20) * -30
y = np.concatenate((spread, center, flier_high, flier_low))
fig, ax = plt.subplots(figsize=(10, 5))
ax.boxplot(y)
x = ['Label '+str(i) for i in range(20)]
ax.set_xticklabels(x)
ax.set_xlabel('Day')
# Set a tick on each integer multiple of a base within the view interval.
ax.xaxis.set_major_locator(ticker.MultipleLocator(5))
plt.xticks(rotation=90)
I think there is a compatibility issue with Pandas plots and Matplotlib formatters.
With the following code:
df = pd.read_csv('lt_stream-1001-full.csv', header=0, encoding='utf8')
df['reg_date'] = pd.to_datetime(df['reg_date'] , format='%Y-%m-%d %H:%M:%S')
df.set_index('reg_date', inplace=True)
df_h = df.resample(rule='H').mean()
df_h['Day']=df_h.index.strftime('%Y-%m')
print(df_h)
f, ax = plt.subplots()
my_plot = df_h.boxplot(by='Day',figsize=(12,5), showfliers=False, rot=90, ax=ax)
locs, labels = plt.xticks()
i = 0
new_labels = list()
for l in labels:
if i % 3 == 0:
label = labels[i]
i += 1
new_labels.append(label)
else:
label = ''
i += 1
new_labels.append(label)
ax.set_xticklabels(new_labels)
plt.show()
You get this chart:
But I notice that this is grouped by month instead of by day. It may not be what you wanted.
Adding the day component to the string 'Day' messes up the chart as there seems to be too many boxes.
df = pd.read_csv('lt_stream-1001-full.csv', header=0, encoding='utf8')
df['reg_date'] = pd.to_datetime(df['reg_date'] , format='%Y-%m-%d %H:%M:%S')
df.set_index('reg_date', inplace=True)
df_h = df.resample(rule='H').mean()
df_h['Day']=df_h.index.strftime('%Y-%m-%d')
print(df_h)
f, ax = plt.subplots()
my_plot = df_h.boxplot(by='Day',figsize=(12,5), showfliers=False, rot=90, ax=ax)
locs, labels = plt.xticks()
i = 0
new_labels = list()
for l in labels:
if i % 15 == 0:
label = labels[i]
i += 1
new_labels.append(label)
else:
label = ''
i += 1
new_labels.append(label)
ax.set_xticklabels(new_labels)
plt.show()
The for loop creates the tick labels every as many periods as desired. In the first chart they were set every 3 months. In the second one, every 15 days.
If you would like to see less grid lines:
df = pd.read_csv('lt_stream-1001-full.csv', header=0, encoding='utf8')
df['reg_date'] = pd.to_datetime(df['reg_date'] , format='%Y-%m-%d %H:%M:%S')
df.set_index('reg_date', inplace=True)
df_h = df.resample(rule='H').mean()
df_h['Day']=df_h.index.strftime('%Y-%m-%d')
print(df_h)
f, ax = plt.subplots()
my_plot = df_h.boxplot(by='Day',figsize=(12,5), showfliers=False, rot=90, ax=ax)
locs, labels = plt.xticks()
i = 0
new_labels = list()
new_locs = list()
for l in labels:
if i % 3 == 0:
label = labels[i]
loc = locs[i]
i += 1
new_labels.append(label)
new_locs.append(loc)
else:
i += 1
ax.set_xticks(new_locs)
ax.set_xticklabels(new_labels)
ax.grid(axis='y')
plt.show()
I've read about x_compat in Pandas plot in order to apply Matplotlib formatters, but I get an error when trying to apply it. I'll give it another shot later.
Old unsuccesful answer
The tick labels seem to be dates. If they are set as datetime in your dataframe, you can:
months = mdates.MonthLocator(1,4,7,10) #Choose the months you like the most
ax.xaxis.set_major_locator(months)
Otherwise, you can let Matplotlib know they are dates by:
ax.xaxis_date()
Your comment:
I have add additional information:
I've forgoten to show what is inside DataFrame.
I have three columns:
reg_Date - datetime64 (index)
temperature - float64
Day - date converted from reg_Date to string, it looks like '2017-10' *(YYYY-MM) *
Box plot group date by 'Day' and I would like to show values 'Day" as a label but not all values
, for example every third one.
Based on your comment in italic above, I would use reg_Date as the input and the following lines:
days = mdates.DayLocator(interval=3)
daysFmt = mdates.DateFormatter('%Y-%m') #to format display
ax.xaxis.set_major_locator(days)
ax.xaxis.set_major_formatter(daysFmt)
I forgot to mention that you will need to:
import matplotlib.dates as mdates
Does this work?

Categories