Shifting the origin in ruptures.display - python

I am trying to perform change point detection using the ruptures package. When I use ruptures.display for plotting, the x axis starts off with 0 as the start point.
And here is how the plot looks like:
However, I would like to start with an offset. Therefore I have tried to create a custom display function using the ruptures.display source code. But, I am not able to figure out how to shift the origin.
Below is the main code:
# Load the recording and define the analysis window (epoch seconds).
data = pd.read_csv("test_flooding.csv")
Start_time = pd.to_datetime(81028520.26, unit='s')
End_time = pd.to_datetime(81113495.41, unit='s')
# Convert the 'Time' column (epoch seconds) into datetime objects.
# The values are kept numeric on purpose: parsing *strings* together with
# `unit=` is deprecated in pandas.
data['Time'] = pd.to_datetime(pd.to_numeric(data['Time']), unit='s')
# Selecting a specific range (both bounds inclusive)
data = data[(Start_time <= data['Time']) & (data['Time'] <= End_time)]
data = data.loc[data['ID'] == "id1"]
# Convert the time series values to a numpy 1D array
points = np.array(data['Signal1_of_ID'])
# RUPTURES PACKAGE
# Changepoint detection with the Pelt search method
# Timestamp.value is nanoseconds since the epoch, so this yields whole epoch
# seconds independent of the machine's local timezone (time.mktime would
# interpret the naive timestamp as local time and shift the offset).
start_timestamp = Start_time.value // 10**9
model = "rbf"
algo = rpt.Pelt(model=model).fit(points)
result = algo.predict(pen=10)
display(points, start_timestamp, result, figsize=(10, 6))
plt.title('Change Point Detection: Pelt Search Method')
plt.show()
And here is the custom display code:
from itertools import cycle
import matplotlib.pyplot as plt
import numpy as np
from ruptures.utils import pairwise
COLOR_CYCLE = ["#4286f4", "#f44174"]
def display(signal, Start_time, true_chg_pts, computed_chg_pts=None, **kwargs):
    """
    Display a signal and the change points provided in alternating colors. If another set of
    change points is provided, they are displayed with dashed vertical lines.

    The whole plot is shifted along the x axis: sample ``i`` is drawn at ``Start_time + i``,
    and the colored regimes and the dashed lines are shifted by the same amount.

    Args:
        signal (array): signal array, shape (n_samples,) or (n_samples, n_features).
        Start_time (int or float): x coordinate at which the first sample is drawn.
        true_chg_pts (list): list of change point indexes, relative to the start of the signal.
        computed_chg_pts (list, optional): list of change point indexes, relative to the start
            of the signal.
        **kwargs: optional overrides ``figsize``, ``alpha``, ``color``, ``linewidth`` and
            ``linestyle``.

    Returns:
        tuple: (figure, axarr) with a :class:`matplotlib.figure.Figure` object and an array of
        Axes objects.
    """
    if signal.ndim == 1:
        signal = signal.reshape(-1, 1)
    n_samples, n_features = signal.shape

    # let's set all options
    figsize = kwargs.get("figsize", (10, 2 * n_features))  # figure size
    alpha = kwargs.get("alpha", 0.2)  # transparency of the colored background
    color = kwargs.get("color", "k")  # color of the lines indicating the computed_chg_pts
    linewidth = kwargs.get("linewidth", 3)  # linewidth of those lines
    linestyle = kwargs.get("linestyle", "--")  # linestyle of those lines

    fig, axarr = plt.subplots(n_features, figsize=figsize, sharex=True)
    if n_features == 1:
        axarr = [axarr]

    # x coordinates of the samples, shifted by the requested origin
    x_values = Start_time + np.arange(n_samples)
    # regime boundaries are sample indexes; the shift is applied when drawing
    bkps = [0] + sorted(true_chg_pts)

    for axe, sig in zip(axarr, signal.T):
        color_cycle = cycle(COLOR_CYCLE)
        # plot s
        axe.plot(x_values, sig)

        # color each (true) regime; both edges must carry the offset, otherwise
        # the spans stay near x=0 and drag the axis origin back to zero
        for (start, end), col in zip(pairwise(bkps), color_cycle):
            axe.axvspan(Start_time + max(0, start - 0.5),
                        Start_time + end - 0.5,
                        facecolor=col, alpha=alpha)

        # vertical lines to mark the computed_chg_pts
        if computed_chg_pts is not None:
            for bkp in computed_chg_pts:
                if bkp != 0 and bkp < n_samples:
                    axe.axvline(x=Start_time + bkp - 0.5,
                                color=color,
                                linewidth=linewidth,
                                linestyle=linestyle)

    fig.tight_layout()
    return fig, axarr
Here is how the image looks with my custom display trial, which still plots with the origin as 0:
Any help is highly appreciated.

It looks like your code might include zero, since you're prepending zero to the break points list:
axe.axvspan(max(0, start - 0.5),
end - 0.5,
facecolor=col, alpha=alpha)
should probably use your Start_time instead of zero:
# start/end are sample indexes, so BOTH span edges have to be shifted by
# Start_time (shifting only the left edge leaves the spans reaching back to 0)
axe.axvspan(Start_time + max(0, start - 0.5),
            Start_time + end - 0.5,
            facecolor=col, alpha=alpha)
# the dashed lines need the same shift: axe.axvline(x=Start_time + bkp - 0.5, ...)

Related

Adding a colorbar whose color corresponds to the different lines in an existing plot

My dataset is in the form of :
Data[0] = [headValue,x0,x1,..xN]
Data[1] = [headValue_ya,ya0,ya1,..yaN]
Data[2] = [headValue_yb,yb0,yb1,..ybN]
...
Data[n] = [headvalue_yz,yz0,yz1,..yzN]
I want to plot f(y*) = x, so I can visualize all Lineplots in the same figure with different colors, each color determined by the headervalue_y*.
I also want to add a colorbar whose color matching the lines and therefore the header values, so we can link visually which header value leads to which behaviour.
Here is what I am aiming for :(Plot from Lacroix B, Letort G, Pitayu L, et al. Microtubule Dynamics Scale with Cell Size to Set Spindle Length and Assembly Timing. Dev Cell. 2018;45(4):496–511.e6. doi:10.1016/j.devcel.2018.04.022)
I have trouble adding the colorbar, I have tried to extract N colors from a colormap (N is my number of different headValues, or column -1) and then adding for each line plot the color corresponding here is my code to clarify:
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
Data = [['Time',0,0.33,..200],[0.269,4,4.005,...11],[0.362,4,3.999,...16.21],...[0.347,4,3.84,...15.8]]
headValues = [0.269,0.362,0.335,0.323,0.161,0.338,0.341,0.428,0.245,0.305,0.305,0.314,0.299,0.395,0.32,0.437,0.203,0.41,0.392,0.347]
# the differents headValues_y* of each column here in a list but also in Data
# with headValue[0] = Data[1][0], headValue[1] = Data[2][0] ...
cmap = mpl.cm.get_cmap('rainbow') # I choose my colormap
rgba = [] # the color container
for value in headValues:
rgba.append(cmap(value)) # so rgba will contain a different color for each headValue
fig, (ax,ax1) = plt.subplots(2,1) # creating my figure and two axes to put the Lines and the colorbar
c = 0 # index for my colors
for i in range(1, len(Data)):
ax.plot( Data[0][1:], Data[i][1:] , color = rgba[c])
# Data[0][1:] is x, Data[i][1:] is y, and the color associated with Data[i][0]
c += 1
fig.colorbar(mpl.cm.ScalarMappable(cmap= mpl.colors.ListedColormap(rgba)), cax=ax1, orientation='horizontal')
# here I create my scalarMappable for my lineplot and with the previously selected colors 'rgba'
plt.show()
The current result:
How to add the colorbar on the side or the bottom of the first axis ?
How to properly add a scale to this colorbar corresponding to the different headValues ?
How to make the colorbar scale and colors match to the different lines on the plot with the link One color = One headValue ?
I have tried to work with scatterplot which are more convenient to use with scalarMappable but no solution allows me to do all these things at once.
Here is a possible approach. As the 'headValues' aren't sorted, nor equally spaced and one is even used twice, it is not fully clear what the most-desired result would be.
Some remarks:
The standard way of creating a colorbar in matplotlib doesn't need a separate subplot. Matplotlib will reduce the existing plot a bit and put the colorbar next to it (or below it for a horizontal bar).
Converting the 'headValues' to a numpy array allows for compact code, e.g. writing rgba = cmap(headValues) directly calculates the complete array.
Calling cmap on unchanged values will map 0 to the lowest color and 1 to the highest color, so for values only between 0.16 and 0.44 they all will be mapped to quite similar colors. One approach is to create a norm to map 0.16 to the lowest color and 0.44 to the highest. In code: norm = plt.Normalize(headValues.min(), headValues.max()) and then calculate rgba = cmap(norm(headValues)).
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
headValues = np.array([0.269, 0.362, 0.335, 0.323, 0.161, 0.338, 0.341, 0.428, 0.245, 0.305, 0.305, 0.314, 0.299, 0.395, 0.32, 0.437, 0.203, 0.41, 0.392, 0.347])
x = np.linspace(0, 200, 500)
# create Data similar to the data in the question
Data = [['Time'] + list(x)] + [[val] + list(np.sqrt(4 * x) * val + 4) for val in headValues]
headValues = np.array([d[0] for d in Data[1:]])
# order[k] is the index of the k-th smallest head value;
# inverse_order[i] is the rank of head value i, i.e. its slot in the colorbar
order = np.argsort(headValues)
inverse_order = np.argsort(order)
# plt.get_cmap replaces mpl.cm.get_cmap, which was removed in Matplotlib 3.9
cmap = plt.get_cmap('rainbow')
rgba = cmap(np.linspace(0, 1, len(headValues)))  # evenly spaced colors
fig, ax = plt.subplots(1, 1)
for i in range(1, len(Data)):
    # Data[0][1:] is x, Data[i][1:] is y, and the color is chosen by the rank of Data[i][0]
    ax.plot(Data[0][1:], Data[i][1:], color=rgba[inverse_order[i-1]])
# a standalone ScalarMappable is not attached to any Axes, so the Axes to take
# the space from has to be passed explicitly
cbar = fig.colorbar(mpl.cm.ScalarMappable(cmap=mpl.colors.ListedColormap(rgba)), ax=ax, orientation='vertical',
                    ticks=np.linspace(0, 1, len(rgba) * 2 + 1)[1::2])
cbar.set_ticklabels(headValues[order])
plt.show()
Alternatively, the colors can be assigned using their position in the colormap, but without creating a ListedColormap of evenly spaced colors; a norm maps each head value directly to its color:
# plt.get_cmap replaces mpl.cm.get_cmap, which was removed in Matplotlib 3.9
cmap = plt.get_cmap('rainbow')
# map the smallest head value to the first color and the largest to the last
norm = plt.Normalize(headValues.min(), headValues.max())
fig, ax = plt.subplots(1, 1)
for i in range(1, len(Data)):
    ax.plot(Data[0][1:], Data[i][1:], color=cmap(norm(Data[i][0])))
# the ScalarMappable is not attached to an Axes, so name the Axes explicitly
cbar = fig.colorbar(mpl.cm.ScalarMappable(cmap=cmap, norm=norm), ax=ax)
To get ticks for each of the 'headValues', these ticks can be set explicitly. As putting a label for each tick will result in overlapping text, labels that are too close to other labels can be replaced by an empty string:
headValues.sort()
# the ScalarMappable is not attached to an Axes, so name the Axes explicitly
cbar2 = fig.colorbar(mpl.cm.ScalarMappable(cmap=cmap, norm=norm), ax=ax, ticks=headValues)
# blank out a label when the following tick is closer than 0.007, to avoid overlapping text
cbar2.set_ticklabels([val if val < nxt - 0.007 else '' for val, nxt in zip(headValues[:-1], headValues[1:])]
                     + [headValues[-1]])
At the left the result of the first approach (colors in segments), at the right the alternative colorbars (color depending on value):

How to change the size of markers and line widths in boxplots (pyplot)

I have following code defining a function:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns;sns.set()
from Utils import cmp_iso_forest_od_params
## agg backend is used to create plot as a .png file
mpl.use('agg')
def print_boxplots(data: pd.DataFrame,
                   graph_filename: str,
                   col_examined: str,
                   col_related: str,
                   sort_func,
                   title: str,
                   x_title: str,
                   y_title: str,
                   min_val=None,
                   max_val=None
                   ) -> None:
    """
    Save one boxplot per group of ``col_related`` to ``graph_filename + '.png'``,
    with the group means drawn on top as a connected line.

    Line widths and marker sizes are derived from the internal ``scale`` factor,
    the same factor that already drives the figure size and the font sizes.

    Args:
        data: source table.
        graph_filename: output path without the ``.png`` extension.
        col_examined: column whose values are summarised by each box.
        col_related: column the rows are grouped by (one box per group).
        sort_func: ``key`` function for ``sorted``; it receives ``(values, label)``
            tuples and determines the left-to-right order of the boxes.
        title: figure title.
        x_title: x axis label.
        y_title: y axis label.
        min_val: lower bound used to build the y ticks; defaults to the column minimum.
        max_val: upper bound used to build the y ticks; defaults to the column maximum.
    """
    # grouping by the plain label (not a one-element list) keeps the group keys scalar
    g = data.groupby(col_related)

    # graph parameters
    scale = 2
    show_fliers = True
    mean_color = 'b'
    mean_marker = 'o'
    line_width = 1.5 * scale   # used for every line drawn in the graph
    marker_size = 6 * scale    # used for the mean markers, the fliers and the legend

    labels = []
    data_to_plot_arr = []
    for group, group_df in g:
        data_to_plot_arr.append(group_df[col_examined])
        labels.append(group)

    # dynamically set parameters of the graphs so that they are uniform across all graphs, but are minimalised
    figsize = (len(g) * scale, 25 * scale)  # originally (60, 30)
    if max_val is None:
        max_val = data[col_examined].max()
    if min_val is None:
        min_val = data[col_examined].min()
    tick = (max_val - min_val) / 40

    # this sorts times and labels for display in the boxplot by the parameters of the boxplots
    data_to_plot_arr, labels = zip(*sorted(zip(data_to_plot_arr, labels), key=sort_func))
    positions = list(range(len(labels)))

    # Create a figure instance with a single axes
    _fig = plt.figure(figsize=figsize)
    try:
        _ax = _fig.add_subplot(111)

        # Create the boxplot
        line_props = {'linewidth': line_width}
        _ax.boxplot(data_to_plot_arr, positions=positions, showfliers=show_fliers,
                    boxprops=dict(line_props), whiskerprops=dict(line_props),
                    capprops=dict(line_props), medianprops=dict(line_props),
                    flierprops={'markersize': marker_size, 'markeredgewidth': line_width})
        # mean of every group, connected by a line
        _ax.plot(positions, [series.mean() for series in data_to_plot_arr],
                 marker=mean_marker, color=mean_color,
                 linewidth=line_width, markersize=marker_size)

        _ax.set_title(title, fontsize=25 * scale)
        _ax.set_xlabel(x_title, fontsize=25 * scale)
        _ax.set_ylabel(y_title, rotation=90, fontsize=25 * scale)
        _ax.set_xticklabels(labels, rotation=90)
        if tick > 0:
            # ticks every `tick` units, anchored at 0, with some headroom above max_val;
            # a zero step (constant column) would make np.arange fail, so the
            # default ticks are kept in that case
            y_labels = np.concatenate([np.arange(0, min_val - tick, -tick)[::-1],
                                       np.arange(0, max_val + 6 * tick, tick)])
            _ax.set_yticks(y_labels)
        _ax.tick_params(axis='x', labelsize=22 * scale)
        _ax.tick_params(axis='y', labelsize=22 * scale)

        # proxy artist (no data) so the legend shows the mean marker only
        mean_handle = _ax.plot([], [], marker=mean_marker, ms=marker_size, ls="", mec=None,
                               color=mean_color, label="Mean")[0]
        legend = _ax.legend(handles=[mean_handle],
                            bbox_to_anchor=[0.5, -0.08],
                            loc='center',
                            title="Boxplots show first and third quartile,\n with variability represented with whiskers",
                            ncol=2,
                            prop={'size': 16 * scale})
        legend.get_title().set_fontsize(16 * scale)
        _ax.grid(True)

        # Save the figure
        _fig.savefig(graph_filename + '.png', bbox_inches='tight')
    finally:
        # pyplot keeps every figure alive until it is closed; release it so that
        # repeated calls do not accumulate figures in memory
        plt.close(_fig)
What it does is it saves a series of boxplots with plotted means to a file.
What I need is to adjust the thickness of lines and size of all markers in the graph in relation to the scale parameter so I can control the definition of the final picture.
I was not able to find any useful parameters in the documentation so I ended up here.
Current example picture looks like this:

Adding quantitative values to differentiate data through colours in a scatterplot's legend in Python?

Currently, I'm working on an introductory paper on data manipulation and such; however... the CSV I'm working on has some things I wish to do a scatter graph on!
I want a scatter graph to show me the volume sold on certain items as well as their average price, differentiating all data according to their region (Through colours I assume).
So what I want is to know if I can add the region column as a quantitative value
or if there's a way to make this possible...
It's my first time using Python and I'm confused way too often
I'm not sure if this is what you mean, but here is some working code, assuming you have data in the format of [(country, volume, price), ...]. If not, you can change the inputs to the scatter method as needed.
import random
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
# number of example countries (not referenced again in the code shown here)
n_countries = 50
# get the data into "countries", for example
# (the Ellipsis below is a placeholder: replace it with the real list of records)
countries = ...
# in this example: countries is [('BS', 21, 25), ('WZ', 98, 25), ...]
# built from tuples without column names, so the columns are labelled 0, 1, 2
df = pd.DataFrame(countries)
# arbitrary method to get a color
def get_color(i, max_i):
    """Return the 'Spectral' colormap color at position ``i / max_i``.

    ``plt.get_cmap`` is used because ``matplotlib.cm.get_cmap`` was removed in
    Matplotlib 3.9.
    """
    cmap = plt.get_cmap('Spectral')
    return cmap(i / max_i)
# build the figure and its single axes - the figure is made large so that
# more points fit - and label both axes with the metric names
def get_fig_ax():
    """Return a 14x14 inch figure and its only axes, labelled 'volume' / 'price'."""
    figure = plt.figure(figsize=(14, 14))
    axes = figure.add_subplot(1, 1, 1)
    for setter, text in ((axes.set_xlabel, 'volume'), (axes.set_ylabel, 'price')):
        setter(text)
    return figure, axes
# change which column feeds which role depending on your data
def get_x_y_labels():
    """Return (x, y, labels) taken from columns 1, 2 and 0 of the module-level ``df``."""
    x_column, y_column, label_column = 1, 2, 0
    return df[x_column], df[y_column], df[label_column]
# shift applied to every annotation so the text does not sit on top of its point
offset = 1
x, y, labels = get_x_y_labels()
fig, ax = get_fig_ax()
n_points = len(x)
# one annotated point per label/region
for i, region in enumerate(labels):
    x_i, y_i = x[i], y[i]
    ax.annotate(region, (x_i + offset, y_i + offset))
    # "label" is what makes the point show up in the legend
    ax.scatter(x_i, y_i, color=get_color(i, n_points), label=region)
# Put the legend just outside of the plot:
# the trailing .1, 0 of the anchor box pushes it past the right edge
ax.legend(loc='upper right', bbox_to_anchor=(1, 1, .1, 0))
plt.show()

Can I render only part of a scatter_matrix?

On Python, using the Pandas library, I'm trying to generate the scatter plot of a DataFrame using scatter_matrix as follows:
scatter_matrix(df, alpha=0.5, figsize=(14,14), diagonal='kde')
That program takes very long to run and eventually crashes, possibly because there are too many (26) columns, and the resulting image would be too big. Nevertheless, I noticed I'm able to render 13 variables just fine. That way, one solution would be to generate 4 plots instead, one for each quadrant of the resulting scatter matrix, i.e., the ranges [[0,0],[13,13]], [[13,0],[26,13]], [[0,13],[13,26]], [[13,13],[26,26]]. Notice those don't refer to ranges on the source DataFrame, but of the target scatter matrix I'm rendering. Is it possible?
I couldn't find any official way to do it, so I modified the scatter_matrix implementation to receive 2 additional params, cols and rows, which are arrays with the labels you want to compare:
"""
This is a modification of the scatter_matrix method;
it allows plotting sub-sections of the scatter plot
matrix. With the official method you can only plot
the entire matrix. This method allows selecting the
`cols` and `rows` you're interested in plotting.
Note that this wouldn't be possible even by calling
scatter_matrix on a subset of the dataframe, because
this wouldn't allow comparing different `cols`/`rows'.
"""
import numpy as np
import matplotlib.pyplot as plt
import pandas
import pandas.tools.plotting
from pandas.compat import range, lrange, lmap, map, zip, string_types
def scatter_matrix(frame, cols, rows, figsize=None, ax=None, grid=False,
                   diagonal='hist', marker='.', density_kwds=None,
                   hist_kwds=None, range_padding=0.05, **kwds):
    """
    Draw a rectangular sub-section of a matrix of scatter plots.

    Only public numpy / matplotlib calls are used for the layout and the tick
    styling, so the function does not depend on the private helpers of
    ``pandas.tools.plotting``.

    Parameters
    ----------
    frame : DataFrame
    cols : [str]
        labels plotted on the y axes; ``cols[i]`` fills grid row ``i``
    rows : [str]
        labels plotted on the x axes; ``rows[j]`` fills grid column ``j``
    alpha : float, optional
        amount of transparency applied (forwarded through ``kwds``)
    figsize : (float,float), optional
        a tuple (width, height) in inches
    ax : Matplotlib axis object, optional
        if given, its figure is cleared and reused for the grid
    grid : bool, optional
        accepted for signature compatibility; not used
    diagonal : {'hist', 'kde'}
        pick between 'kde' and 'hist' for
        either Kernel Density Estimation or Histogram
        plot in the panels where the row and column label coincide
    marker : str, optional
        Matplotlib marker type, default '.'
    hist_kwds : other plotting keyword arguments
        To be passed to hist function
    density_kwds : other plotting keyword arguments
        To be passed to kernel density estimate plot
    range_padding : float, optional
        relative extension of axis range in x and y
        with respect to (x_max - x_min) or (y_max - y_min),
        default 0.05
    kwds : other plotting keyword arguments
        To be passed to scatter function

    Returns
    -------
    axes : 2-D numpy array of Axes with shape (len(cols), len(rows))

    Raises
    ------
    ValueError
        if ``cols`` or ``rows`` is empty

    Examples
    --------
    >>> df = DataFrame(np.random.randn(1000, 4), columns=['A','B','C','D'])
    >>> scatter_matrix(df, cols=['A', 'B'], rows=['C', 'D'], alpha=0.2)
    """
    import matplotlib.lines as mlines
    import matplotlib.pyplot as plt

    df = frame._get_numeric_data()
    n_grid_rows = len(cols)
    n_grid_cols = len(rows)
    if n_grid_rows == 0 or n_grid_cols == 0:
        raise ValueError("`cols` and `rows` must each contain at least one label")

    # Build an explicit len(cols) x len(rows) grid so that axes[i, j] is valid
    # for non-square selections as well.
    if ax is None:
        fig = plt.figure(figsize=figsize)
    else:
        fig = ax.get_figure()
        fig.clear()
    axes = fig.subplots(n_grid_rows, n_grid_cols, squeeze=False)

    # no gaps between subplots
    fig.subplots_adjust(wspace=0, hspace=0)

    # fall back to a circle when the marker is not a known line marker
    if marker not in mlines.lineMarkers:
        marker = 'o'

    hist_kwds = hist_kwds or {}
    density_kwds = density_kwds or {}

    def _padded_range(label):
        # (min, max) of one column, widened on both sides by range_padding / 2
        values = df[label]
        low, high = np.min(values), np.max(values)
        pad = (high - low) * range_padding / 2.
        return low - pad, high + pad

    y_limits = [_padded_range(label) for label in cols]
    x_limits = [_padded_range(label) for label in rows]

    for i, a in enumerate(cols):
        for j, b in enumerate(rows):
            panel = axes[i, j]

            if a == b:
                # same variable on both axes: draw its distribution instead;
                # missing values are dropped because gaussian_kde cannot handle NaN
                values = df[a].dropna()
                if diagonal == 'hist':
                    panel.hist(values, **hist_kwds)
                elif diagonal in ('kde', 'density'):
                    from scipy.stats import gaussian_kde
                    gkde = gaussian_kde(values)
                    ind = np.linspace(values.min(), values.max(), 1000)
                    panel.plot(ind, gkde.evaluate(ind), **density_kwds)
                panel.set_xlim(y_limits[i])
            else:
                panel.scatter(df[b], df[a], marker=marker, **kwds)
                panel.set_xlim(x_limits[j])
                panel.set_ylim(y_limits[i])

            panel.set_xlabel(b)
            panel.set_ylabel(a)

            # only the left-most column keeps its y axis, only the bottom row its x axis
            if j != 0:
                panel.yaxis.set_visible(False)
            if i != n_grid_rows - 1:
                panel.xaxis.set_visible(False)

    # small tick labels; x labels rotated so they do not overlap
    for panel in axes.ravel():
        plt.setp(panel.get_xticklabels(), fontsize=8, rotation=90)
        plt.setp(panel.get_yticklabels(), fontsize=8, rotation=0)

    return axes
I was quite careless modifying that code so it is probably broken but served my needs.
The 4 quadrants will be:
# integer midpoints of the row index and of the column axis
mid_ind = len(df.index)//2
mid_col = len(df.columns)//2
# the four positional sub-blocks of df (rows are split as well as columns):
df.iloc[:mid_ind,:mid_col]  # first half of the rows, first half of the columns
df.iloc[mid_ind:,:mid_col]  # second half of the rows, first half of the columns
df.iloc[mid_ind:,mid_col:]  # second half of the rows, second half of the columns
df.iloc[:mid_ind,mid_col:]  # first half of the rows, second half of the columns

Trying to visualize a sorted table with matplotlib (parallel coordinates?)

I'm trying to visualize a sorted table (sorted on a column). My ideal result should be something like
visualization of a sorted table
Any suggestion on how to reach this goal with matplotlib?
I have already tried the suggestions given here and here, but I'm looking for something fancier, like the one in the attached image.
Thanks in advance,
Matplotlib does not support this directly, but it is fairly easy to replicate the plot that you have linked to.
The function below does something similar given a 2d array of data. It can be sorted or not, the function doesn't really care.
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import numpy as np
def sorted_table_plot(data, labels, categories, cmap=None, ax=None):
    """
    Draw a 2d table as circled values and connect equal values across columns.

    Every cell of ``data`` is annotated at (column, row); for each distinct
    value a line joins its occurrences from left to right. Everything is drawn
    on ``ax`` and coloured with ``cmap``.

    Args:
        data: 2d array of values; it may be sorted or not.
        labels: one y tick label per row of ``data``.
        categories: one x tick label per column of ``data``.
        cmap: colormap used for the circles and the lines; defaults to ``plt.cm.jet``.
        ax: axes to draw on; defaults to the current axes.
    """
    # check if an axes was supplied
    if ax is None:
        ax = plt.gca()
    # check if a colormap was supplied
    if cmap is None:
        cmap = plt.cm.jet

    data = np.asarray(data)
    # generate the grid arrays with the coordinates for the annotations
    yy, xx = np.mgrid[:data.shape[0], :data.shape[1]]
    x = xx.flatten()
    y = yy.flatten()
    d = data.flatten()

    # a norm object mapping the data range onto the colormap
    norm = plt.Normalize(d.min(), d.max())

    # iterate over the data points and draw the labels
    for di, xi, yi in zip(d, x, y):
        color = cmap(norm(di))
        hsv = mcolors.rgb_to_hsv(color[:3])
        # text color is chosen from the brightness (HSV value) of the circle color
        fc = 'w' if hsv[2] < 0.7 else 'k'
        ax.annotate(str(di), xy=(xi, yi), xycoords="data",
                    va="center", ha="center", color=fc,
                    bbox=dict(boxstyle="circle", fc=color))

    # iterate over all the appearing values and draw the lines
    # (on the supplied axes and with the supplied colormap)
    for value in np.unique(data):
        xi, yi = x[d == value], y[d == value]
        idx = np.argsort(xi)
        ax.plot(xi[idx], yi[idx], color=cmap(norm(value)), lw=2)

    # add the axes labels; iterators such as map objects are materialised first
    ax.set_xticks(xx[0, :])
    ax.set_xticklabels(list(categories))
    ax.set_yticks(yy[:, 0])
    ax.set_yticklabels(list(labels))

    # adjust the axes ranges (y axis inverted so the first row is at the top)
    ax.set_xlim(xx[0, 0] - 0.5, xx[-1, -1] + 0.5)
    ax.set_ylim(yy[-1, -1] + 0.5, yy[0, 0] - 0.5)
Now, you can simply call it on a data array. In the following I created a random array, since you didn't care to supply an example data set.
# fix the seed for reproducibility
np.random.seed(2)
# create random data
data = np.tile(np.arange(1, 8), (3, 1)).T
# build the tick labels as lists right away: a lazy map() would read the
# column only after it has been shuffled below
labels = ['label ' + str(value) for value in data[:, 1]]
categories = ['cat ' + str(number) for number in np.arange(data.shape[1]) + 1]
for i in range(1, data.shape[1]):
    # shuffle all but the first column (in place, through the column view)
    np.random.shuffle(data[:, i])
# call the function and show the plot
sorted_table_plot(data, labels, categories)
plt.show()
Result:

Categories