Number of observations on violin plot is not correct - Python

I used this tip https://python-graph-gallery.com/58-show-number-of-observation-on-violinplot/ to add the number of observations to a violin plot.
Here is my code:
# Calculate number of obs per group & median to position labels
medians = dataset.groupby([x_attrib])[y_attrib].median().values
nobs = dataset[x_attrib].value_counts().values
nobs = [str(x) for x in nobs.tolist()]
#nobs = ["Nb: " + i for i in nobs]
nobs = [i for i in nobs]

# Add it to the plot
pos = range(len(nobs))
for tick, label in zip(pos, ax.get_xticklabels()):
    ax.text(pos[tick], medians[tick] + 0.03, nobs[tick], horizontalalignment='center',
            size='x-large', color='black', weight='semibold')
I plot a variable with these value counts:
0 355
1 174
2 36
-1 19
3 15
4 5
...
As you can see on the plot, for the -1 value the real count is 19, but the plot shows 355 (the count for the 0 value).
How can I modify the code to get the correct counts on the plot, please?
Thanks a lot.
Theo
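For reference, the mismatch comes from value_counts(), which sorts counts by frequency, while groupby().median() (and the violins on the x axis) follow the sorted category order. A minimal sketch of a fix, assuming seaborn's default sorted category order:

# Compute medians and counts together so both share the sorted category order.
stats = dataset.groupby(x_attrib)[y_attrib].agg(['median', 'count'])

for xpos, (median, count) in enumerate(zip(stats['median'], stats['count'])):
    ax.text(xpos, median + 0.03, str(count),
            horizontalalignment='center', size='x-large',
            color='black', weight='semibold')

If an explicit order= is passed to the violinplot, reindex stats with that same order before the loop.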

Related

Getting the center point of a cluster for latitude and longitude in Python

I have a list of coordinates that have areas mapped out as follows:
import pandas as pd
import numpy as np
from sklearn.cluster import DBSCAN
df=pd.DataFrame({'user_id':[55,55,356,356,356,356,632,752,938,963,963,1226,2663,2663,2663,2663,2663,3183,3197,3344,3387,3387,3387,3387,3396,3515,3536,3570,3819,3883,3883,3883,3883,3883,3883,3883,3883,3883,3883,3883,3883,4584,4594,4713,4931,4931,5026,5487,5487,5575,5575,5575,5602,5639,5639,5639,5639,5783,5783,5783,5783,5783,5801,6373,6718,6886,6886,7055,7055,7608,7608,7777,8186,8186,8307,8712,9271,9896,9991,9991,9991,],
'latitude':[13.2633943,13.2633964,12.809677124,12.8099212646,12.8100585938,12.810065981,12.9440132,12.2958104,12.5265661,13.0767648,13.0853577,12.6301221,12.8558120728,12.8558349609,12.8558654785,12.8558807373,12.8558959961,12.9141417,13.0696411133,13.0708333,10.7904833,10.7904833,10.7904833,12.884091,13.0694428,13.204637,12.6922086,13.0767648,13.3489958,12.8653798,12.8654014,12.8654124,12.8654448,12.8654521,12.8654658,12.8654733,12.8654815,12.8654844,12.8655367,12.8655376,12.865576,12.4025539,13.1986348,12.9548317,11.664325,11.6690603,13.0656551,13.1137554,13.1137978,12.770418,12.9141417,12.9141417,15.3530727,12.8285405054,12.8285925,12.8288406,12.829668,12.2958104,12.5583190918,12.7367172241,12.7399597168,12.7422103882,12.8631981,13.3378762,12.5638375681,13.1961683,13.1993678,12.1210997,12.5265661,13.1332778931,13.13331604,12.1210997,13.0649372,13.0658797,12.6955714,12.8213806152,13.0641708374,13.2013835,13.1154662,13.1957473755,13.2329025269,],
'longitude':[75.4341412,75.4341377,77.6955155017,77.6952344177,77.6952628334,77.6952629697,75.7926285,76.6393805,78.2149575,77.6397007,77.6445166,77.1145378,77.7985897361,77.7985953164,77.798622112,77.7985610742,77.7986275271,74.8559568,77.6520116309,77.6519444,78.7046725,78.7046725,78.7046725,74.8372421,77.6523596,77.6506622,78.6181131,77.6397007,74.7855559,77.7972191,77.7971733,77.7971429,77.7971621,77.7970823,77.7970327,77.7970371,77.7972272,77.7970335,77.7969649,77.796956,77.7971244,75.9811564,77.7065928,77.4739615,78.1460142,78.139311,77.4380296,77.5732437,77.573201,74.8609332,74.8559568,74.8559568,75.1386825,77.6891233027,77.6899376,77.6892531,77.6902955,76.6393805,77.7842363745,77.7841222429,77.7837989946,77.7830295359,77.4336428,77.117325,75.5833357573,77.7053231,77.7095658,78.1582143,78.2149575,77.5728687166,77.5729374436,78.1582143,77.7435873,77.7444939,78.0620963,77.6606095672,77.746332751,77.7082838,77.6069977,77.7055573788,77.6956690934,],
})
I am using DBSCAN to cluster these latitude/longitude pairs:
X=np.array(df[['latitude', 'longitude']])
kms_per_radian = 6371.0088
epsilon = 1 / kms_per_radian
db = DBSCAN(eps=epsilon, min_samples=5)
model=db.fit(np.radians(X))
cluster_labels = db.labels_
num_clusters = len(set(cluster_labels))
cluster_labels = cluster_labels.astype(float)
cluster_labels[cluster_labels == -1] = np.nan
clusters = pd.Series( [X[cluster_labels==n] for n in range(num_clusters)] )
labels = pd.DataFrame(db.labels_,columns=['CLUSTER_LABEL'])
dfnew=pd.concat([df,labels],axis=1,sort=False)
How do I get the center point of these clusters and map it back to the dataset, so that when I display the data in folium the marker (and its summary) is placed at the cluster center?
So far I have tried
from shapely.geometry import MultiPoint
from geopy.distance import great_circle

def get_centermost_point(cluster):
    centroid = (MultiPoint(cluster).centroid.x, MultiPoint(cluster).centroid.y)
    centermost_point = min(cluster, key=lambda point: great_circle(point, centroid).m)
    return tuple(centermost_point)

centermost_points = clusters.map(get_centermost_point)
which gives me an IndexError: list index out of range.
To get the coordinates of each cluster's centroid:
for ea in clusters:
    print(MultiPoint(ea).centroid)
Outcome:
POINT (12.85585784912 77.79859915316)
POINT (12.86547048333333 77.79709629166666)
POINT (13.1982603551 77.70706457576)
POINT EMPTY
To create a GeoDataFrame from the centroids and plot it (assuming the coordinates are long/lat):
# To create a geodataframe of the centroids
import geopandas as gpd

clusters_centroids = [MultiPoint(ea).centroid for ea in clusters]
crs = {'init': 'epsg:4326'}
cgdf = gpd.GeoDataFrame(clusters, crs=crs, geometry=clusters_centroids)
# Eliminate some empty row(s)
good_cdgf = cgdf[ ~cgdf['geometry'].is_empty ]
# plot to see the centroids
good_cdgf.plot()
The output plot:
To add the center points back into the original dataframe df.
Here I start with checking dfnew which is simply df with added column CLUSTER_LABEL.
print(dfnew)
user_id latitude longitude CLUSTER_LABEL
0 55 13.263394 75.434141 -1
1 55 13.263396 75.434138 -1
2 356 12.809677 77.695516 -1
3 356 12.809921 77.695234 -1
4 356 12.810059 77.695263 -1
.. ... ... ... ...
76 9271 13.064171 77.746333 -1
77 9896 13.201384 77.708284 2
78 9991 13.115466 77.606998 -1
79 9991 13.195747 77.705557 2
80 9991 13.232903 77.695669 -1
[81 rows x 4 columns]
The column CLUSTER_LABEL will be used to join and get values from cgdf dataframe.
Add a new CLUSTER_LABEL column with proper cluster's label values to cgdf
cgdf["CLUSTER_LABEL"] = [0,1,2, -1]
Drop column 0 of cgdf
cgdf.drop(columns=[0], axis=1, inplace=True)
Check current cgdf
print(cgdf)
geometry CLUSTER_LABEL
0 POINT (12.856 77.799) 0
1 POINT (12.865 77.797) 1
2 POINT (13.198 77.707) 2
3 POINT EMPTY -1
Merge two dataframes into new dataframe dfnew2.
dfnew2 = dfnew.merge(cgdf, on='CLUSTER_LABEL')
Check current status of dfnew2, it should look like this:
user_id latitude longitude CLUSTER_LABEL geometry
0 55 13.263394 75.434141 -1 POINT EMPTY
1 55 13.263396 75.434138 -1 POINT EMPTY
2 356 12.809677 77.695516 -1 POINT EMPTY
3 356 12.809921 77.695234 -1 POINT EMPTY
4 356 12.810059 77.695263 -1 POINT EMPTY
.. ... ... ... ... ...
76 4594 13.198635 77.706593 2 POINT (13.198 77.707)
77 6886 13.196168 77.705323 2 POINT (13.198 77.707)
78 6886 13.199368 77.709566 2 POINT (13.198 77.707)
79 9896 13.201384 77.708284 2 POINT (13.198 77.707)
80 9991 13.195747 77.705557 2 POINT (13.198 77.707)
[81 rows x 5 columns]
'dfnew2' should be equivalent to the original dataframe, with 2 additional columns: 'CLUSTER_LABEL' and 'geometry' (the cluster's center point).
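The original question also asked about folium. A minimal sketch of placing a marker at each cluster center, assuming folium is available and using dfnew2 from above (the centroids were built from (latitude, longitude) pairs, so .x is latitude and .y is longitude):

import folium

# Center the map roughly on the data (sketch).
m = folium.Map(location=[dfnew2['latitude'].mean(), dfnew2['longitude'].mean()], zoom_start=8)

# One marker per non-noise cluster; 'geometry' holds the cluster's centroid point.
centers = dfnew2.loc[dfnew2['CLUSTER_LABEL'] != -1, ['CLUSTER_LABEL', 'geometry']]
centers = centers.drop_duplicates(subset='CLUSTER_LABEL')
for _, row in centers.iterrows():
    folium.Marker(location=[row['geometry'].x, row['geometry'].y],
                  popup=f"Cluster {row['CLUSTER_LABEL']}").add_to(m)

m.save('cluster_centers.html')  # open in a browser, or display `m` in a notebook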
import pandas as pd
from sklearn.cluster import KMeans

def kmeans_centers(list_of_lats_lngs):  # input: a list of [lat, lng] pairs
    try:
        data = pd.DataFrame(list_of_lats_lngs, columns=['lat', 'lng'])
        data['eventType'] = "test"
        data.dropna(axis=0, how='any', subset=['lat', 'lng'], inplace=True)
        X = data.loc[:, ['eventType', 'lat', 'lng']]
        kmeans = KMeans(n_clusters=3, init='k-means++')
        X['cluster_label'] = kmeans.fit_predict(X[X.columns[1:3]])
        centers = kmeans.cluster_centers_  # coordinates of the cluster centers
        # labels = kmeans.predict(X[X.columns[1:3]])  # labels of each point
        return centers
    except Exception as e:
        print("kmeans - clustering exception", e)
        return None
Ready to use. Example input:
[[12.02, 12.34], [12.12, 12.04], [12.092, 12.74], [22.02, 13.34]]
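A hypothetical call with that input (a sketch; with only four points and three clusters the result is trivial, but it shows the shape of the output):

pts = [[12.02, 12.34], [12.12, 12.04], [12.092, 12.74], [22.02, 13.34]]
centers = kmeans_centers(pts)
print(centers)  # a (3, 2) array of [lat, lng] cluster centers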

How to make plots with small whitespace separations in Matplotlib or Seaborn?

I'd like to make this type of plot: multiple columns separated by small whitespace, each column being a different category with 3-5 (5 in this example) observations at varying values on the y axis:
Actually, you can make this kind of plot with ggplot2. For example:
head(mtcars)
# mpg cyl disp hp drat wt qsec vs am gear carb
# Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
# Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
# Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
# Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
# Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
# Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
library(dplyr)
library(ggplot2)
mtcars %>% reshape2::melt() %>%
ggplot(aes(x = variable, y = value)) +
geom_point() + facet_grid(~ variable) +
theme(axis.text.x = element_blank())
You set a categorical variable in your dataset, then use facet_grid(~ variable). This function splits your plot into multiple panels by that categorical variable.
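Since the question asks for matplotlib or seaborn, a roughly equivalent sketch in Python uses seaborn's catplot with column faceting (assuming the data is in long format, with hypothetical column names variable and value):

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Hypothetical long-format data: one row per (category, value) observation.
rng = np.random.default_rng(0)
long_df = pd.DataFrame({
    'variable': np.repeat([f'Cat_{i:02d}' for i in range(1, 8)], 5),
    'value': rng.uniform(1, 10, 35),
})

# One narrow facet per category, mimicking facet_grid(~ variable) in ggplot2.
g = sns.catplot(data=long_df, x='variable', y='value', col='variable',
                kind='strip', sharex=False, height=4, aspect=0.3)
g.set_titles('{col_name}')
g.set(xticks=[], xlabel='')
plt.show()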
Here is an approach to draw a similar plot using Python's matplotlib. The plot has a grey background and white major and minor gridlines to delimit the zones. Getting the dots in the center of each little cell is somewhat tricky: divide each unit interval into n cells and shift by half a cell (1/(2n)). A secondary x-axis can be used to set the labels. A zorder has to be set to have the dots on top of the gridlines.
import numpy as np
from matplotlib import pyplot as plt
from matplotlib import ticker
n = 5
cols = 7
values = [np.random.uniform(1, 10, n) for c in range(cols)]
fig, ax = plt.subplots()
ax.set_facecolor('lightgrey')
ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
ax.xaxis.set_minor_locator(ticker.MultipleLocator(1 / (n)))
ax.yaxis.set_major_locator(ticker.MultipleLocator(1))
ax.grid(True, which='both', axis='both', color='white')
ax.set_xticklabels([])
ax.tick_params(axis='x', which='both', length=0)
ax.grid(which='major', axis='both', lw=3)
ax.set_xlim(1, cols + 1)
for i in range(1, cols + 1):
    ax.scatter(np.linspace(i, i + 1, n, endpoint=False) + 1 / (2 * n), values[i - 1], c='crimson', zorder=2)
ax2 = ax.twiny()
ax2.set_xlim(0.5, cols + 0.5)
ticks = range(1, cols + 1)
ax2.set_xticks(ticks)
ax2.set_xticklabels([f'Cat_{t:02d}' for t in ticks])
bbox = dict(boxstyle="round", ec="limegreen", fc="limegreen", alpha=0.5)
plt.setp(ax2.get_xticklabels(), bbox=bbox)
ax2.tick_params(axis='x', length=0)
plt.show()

How to calculate p-values for pairwise correlation of columns in Pandas?

Pandas has the very handy DataFrame.corr() function for pairwise correlation of columns.
That means it is possible to compare correlations between columns of any length. For instance:
df = pd.DataFrame(np.random.randint(0,100,size=(100, 10)))
0 1 2 3 4 5 6 7 8 9
0 9 17 55 32 7 97 61 47 48 46
1 8 83 87 56 17 96 81 8 87 0
2 60 29 8 68 56 63 81 5 24 52
3 42 76 6 75 7 59 19 17 3 63
...
Now it is possible to test correlation between all 10 columns with df.corr(method='pearson'):
0 1 2 3 4 5 6 7 8 9
0 1.000000 0.082789 -0.094096 -0.086091 0.163091 0.013210 0.167204 -0.002514 0.097481 0.091020
1 0.082789 1.000000 0.027158 -0.080073 0.056364 -0.050978 -0.018428 -0.014099 -0.135125 -0.043797
2 -0.094096 0.027158 1.000000 -0.102975 0.101597 -0.036270 0.202929 0.085181 0.093723 -0.055824
3 -0.086091 -0.080073 -0.102975 1.000000 -0.149465 0.033130 -0.020929 0.183301 -0.003853 -0.062889
4 0.163091 0.056364 0.101597 -0.149465 1.000000 -0.007567 -0.017212 -0.086300 0.177247 -0.008612
5 0.013210 -0.050978 -0.036270 0.033130 -0.007567 1.000000 -0.080148 -0.080915 -0.004612 0.243713
6 0.167204 -0.018428 0.202929 -0.020929 -0.017212 -0.080148 1.000000 0.135348 0.070330 0.008170
7 -0.002514 -0.014099 0.085181 0.183301 -0.086300 -0.080915 0.135348 1.000000 -0.114413 -0.111642
8 0.097481 -0.135125 0.093723 -0.003853 0.177247 -0.004612 0.070330 -0.114413 1.000000 -0.153564
9 0.091020 -0.043797 -0.055824 -0.062889 -0.008612 0.243713 0.008170 -0.111642 -0.153564 1.000000
Is there a simple way to also get the corresponding p-values (ideally in pandas), as it is returned e.g. by scipy's kendalltau()?
Why not use the "method" argument of pandas.DataFrame.corr()? It accepts:
pearson : standard correlation coefficient.
kendall : Kendall Tau correlation coefficient.
spearman : Spearman rank correlation.
callable: callable with input two 1d ndarrays and returning a float.
from scipy.stats import kendalltau, pearsonr, spearmanr

def kendall_pval(x, y):
    return kendalltau(x, y)[1]

def pearsonr_pval(x, y):
    return pearsonr(x, y)[1]

def spearmanr_pval(x, y):
    return spearmanr(x, y)[1]
and then
corr = df.corr(method=pearsonr_pval)
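One caveat worth adding: with a callable method, DataFrame.corr() still fills the diagonal with 1.0 regardless of the callable, so the "p-value" of a column with itself shows up as 1 instead of 0. A small sketch of a correction:

import numpy as np

pvals = df.corr(method=pearsonr_pval)
# corr() forces the diagonal to 1.0 for callable methods, so subtract the
# identity to make each column's p-value with itself 0 instead of 1.
pvals = pvals - np.eye(len(pvals))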
Probably just loop. It's basically what pandas does in the source code to generate the correlation matrix anyway:
import pandas as pd
import numpy as np
from scipy import stats
df_corr = pd.DataFrame()  # Correlation matrix
df_p = pd.DataFrame()     # Matrix of p-values

for x in df.columns:
    for y in df.columns:
        corr = stats.pearsonr(df[x], df[y])
        df_corr.loc[x, y] = corr[0]
        df_p.loc[x, y] = corr[1]
If you want to leverage the fact that this is symmetric, so you only need to calculate this for roughly half of them, then do:
mat = df.values.T
K = len(df.columns)
correl = np.empty((K, K), dtype=float)
p_vals = np.empty((K, K), dtype=float)

for i, ac in enumerate(mat):
    for j, bc in enumerate(mat):
        if i > j:
            continue
        else:
            corr = stats.pearsonr(ac, bc)
            # corr = stats.kendalltau(ac, bc)
            correl[i, j] = corr[0]
            correl[j, i] = corr[0]
            p_vals[i, j] = corr[1]
            p_vals[j, i] = corr[1]

df_p = pd.DataFrame(p_vals)
df_corr = pd.DataFrame(correl)
# pd.concat([df_corr, df_p], keys=['corr', 'p_val'])
This will work:
from scipy.stats import pearsonr
column_values = [column for column in df.columns.tolist()]
df['Correlation_coefficient'], df['P-value'] = zip(*df.T.apply(lambda x: pearsonr(x[column_values], x[column_values])))
df_result = df[['Correlation_coefficient', 'P-value']]
Does this work for you?
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import pearsonr

# call the correlation function; you could round the values if needed
df_c = df.corr().round(1)
# get the p-values (subtract the identity so the diagonal is 0 rather than 1)
pval = df.corr(method=lambda x, y: pearsonr(x, y)[1]) - np.eye(*df_c.shape)
# flag the p-values: *** for less than 0.001, ** for less than 0.01, * for less than 0.05
p = pval.applymap(lambda x: ''.join(['*' for t in [0.001, 0.01, 0.05] if x <= t]))
# df_c2 below gives the dataframe with correlation coefficients and p-value flags
df_c2 = df_c.astype(str) + p
# you could also plot the correlation matrix using sns.heatmap if you want
# mask the upper triangle
matrix = np.triu(df_c)
# convert to array for the heatmap annotations
df_c3 = df_c2.to_numpy()
# plot the heatmap
plt.figure(figsize=(13, 8))
sns.heatmap(df_c, annot=df_c3, fmt='', vmin=-1, vmax=1, center=0, cmap='coolwarm', mask=matrix)

Different box plots on the same oX position

I am trying to combine box plots with a scatter plot for an algorithm scoring visualization. My data is divided as follows:
oX - information about the time period (1 year, 2 years, etc.)
oY - information about the score
2 algorithms for each period with different simulation results (plotted as boxplots)
2 heuristics with a single value (plotted as a point)
I'm trying to easily compare method efficiency for each period of time.
Small sample data:
           1 year                   2 years
    A1    A2    H1    H2     A1    A2    H1    H2
   124   168   155   167    130   130   150   164
   102   155                100   172
   103   153                117   145
   102   132                145   143
   145   170                133   179
   136   125                115   153
   116   150                136   131
   146   192                106   148
   124   122                127   158
   128   123                149   200
   141   158                137   156
I'm trying to get something that looks like this:
So far I've cleaned up my data so that I have the observations for each algorithm (RS, EA) and for each period (52, 104, 156, etc.) separately, but I can't figure out how to group them per period while drawing two different boxplots at the same x tick. I assume that once I sort out the boxplot dataframe and plot it, I can just plot the scatter on top.
Managed to solve this meanwhile, in case it helps anyone else out:
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import matplotlib.lines as mlines

ax1 = sns.boxplot(data=meta, x='Time', y='PRS', color='#880BDD', linewidth=0.8)
ax1 = sns.boxplot(data=meta, x='Time', y='EA', color='#0BC9DD', linewidth=0.8)
ax1 = sns.boxplot(data=meta, x='Time', y='ERS', color='#9BD19D', linewidth=0.8)
ax1 = sns.pointplot(data=simple, x='Time', y='Greedy Average', color='#FFC48C', markers='s', join=False)
ax1 = sns.pointplot(data=simple, x='Time', y='Greedy Total', color='#FF9F80', markers='o', join=False)
ax1 = sns.pointplot(data=simple, x='Time', y='Greedy Weeks', color='#F56991', markers='*', join=False)
ax1.set(xlabel="Planning Horizon (weeks)")
ax1.set(ylabel="Hypervolume")

EA = mpatches.Patch(color='#0BC9DD', label='EA')
PRS = mpatches.Patch(color='#880BDD', label='PRS')
ERS = mpatches.Patch(color='#9BD19D', label='ERS')
GA = mlines.Line2D([], [], color='#FFC48C', marker='s', label='Greedy Average')
GT = mlines.Line2D([], [], color='#FF9F80', label='Greedy Total', marker='o')
GW = mlines.Line2D([], [], color='#F56991', label='Greedy Weeks', marker='*')
plt.legend(handles=[EA, ERS, PRS, GA, GT, GW], loc='lower left', title="Algorithm")
ax1.set_title("Algorithm Comparison")
Results in this:
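A more idiomatic alternative (a sketch; it assumes the meta dataframe above with the Time, PRS, EA and ERS columns) is to melt the data into long format and let seaborn draw the side-by-side boxes at each x tick via hue:

import seaborn as sns
import matplotlib.pyplot as plt

# Reshape to long format: one row per (Time, Algorithm, Hypervolume) observation.
long_meta = meta.melt(id_vars='Time', value_vars=['PRS', 'EA', 'ERS'],
                      var_name='Algorithm', value_name='Hypervolume')

ax = sns.boxplot(data=long_meta, x='Time', y='Hypervolume', hue='Algorithm', linewidth=0.8)
ax.set(xlabel='Planning Horizon (weeks)', title='Algorithm Comparison')
plt.show()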

Hierarchical clustering of time series in Python scipy/numpy/pandas?

I have a DataFrame with some time series. I created a correlation matrix from those time series and I'd like to create a hierarchical clustering on this correlation matrix. How can I do that?
#
# let't pretend this DataFrame contains some time series
#
df = pd.DataFrame((np.random.randn(150)).reshape(10,15))
          0         1         2             13        14
0  0.369746  0.093882 -0.656211  ...  -0.596936  0.095960
1  0.641457  1.120405 -0.468639  ...  -2.070802 -1.254159
2  0.360756 -0.222554  0.367893  ...   0.566299  0.932898
3  0.733130  0.666270 -0.624351  ...  -0.377017  0.340360
4 -0.263967  1.143818  0.554947  ...   0.220406 -0.585353
5  0.082964 -0.311667  1.323161  ...  -1.190672 -0.828039
6  0.173685  0.719818 -0.881854  ...  -1.048066 -1.388395
7  0.118301 -0.268945  0.909022  ...   0.094301  1.111376
8 -1.341381  0.599435 -0.318425  ...   1.053272 -0.763416
9 -1.146692  0.453125  0.150241  ...   0.454584  1.506249
#
# I can create a correlation matrix like this
#
correlation_matrix = df.corr(method='spearman')
0 1 ... 13 14
0 1.000000 -0.139394 ... 0.090909 0.309091
1 -0.139394 1.000000 ... -0.636364 0.115152
2 0.175758 0.733333 ... -0.515152 -0.163636
3 0.309091 0.163636 ... -0.248485 -0.127273
4 0.600000 -0.103030 ... 0.151515 0.175758
5 -0.078788 0.054545 ... -0.296970 -0.187879
6 -0.175758 -0.272727 ... 0.151515 -0.139394
7 0.163636 -0.042424 ... 0.187879 0.248485
8 0.030303 0.915152 ... -0.430303 0.296970
9 -0.696970 0.321212 ... -0.236364 -0.151515
10 0.163636 0.115152 ... -0.163636 0.381818
11 0.321212 -0.236364 ... -0.127273 -0.224242
12 -0.054545 -0.200000 ... 0.078788 0.236364
13 0.090909 -0.636364 ... 1.000000 0.381818
14 0.309091 0.115152 ... 0.381818 1.000000
Now, how can I build the hierarchical clustering on this matrix?
Here is a step-by-step guide on how to build the hierarchical clustering and dendrogram from our time series using SciPy. Please note that scikit-learn (a powerful data analysis library built on top of SciPy) also has many other clustering algorithms implemented.
First we build some synthetic time series to work with. We'll build 6 groups of correlated time series and we expect the hierarchical clustering to detect those six groups.
import numpy as np
import seaborn as sns
import pandas as pd
from scipy import stats
import scipy.cluster.hierarchy as hac
import matplotlib.pyplot as plt
#
# build 6 time series groups for testing, called: a, b, c, d, e, f
#
num_samples = 61
group_size = 10
#
# create the main time series for each group
#
x = np.linspace(0, 5, num_samples)
scale = 4
a = scale * np.sin(x)
b = scale * (np.cos(1+x*3) + np.linspace(0, 1, num_samples))
c = scale * (np.sin(2+x*6) + np.linspace(0, -1, num_samples))
d = scale * (np.cos(3+x*9) + np.linspace(0, 4, num_samples))
e = scale * (np.sin(4+x*12) + np.linspace(0, -4, num_samples))
f = scale * np.cos(x)
#
# from each main series build 'group_size' series
#
timeSeries = pd.DataFrame()
ax = None
for arr in [a, b, c, d, e, f]:
    arr = arr + np.random.rand(group_size, num_samples) + np.random.randn(group_size, 1)
    df = pd.DataFrame(arr)
    timeSeries = pd.concat([timeSeries, df])  # DataFrame.append is removed in newer pandas
    # We use seaborn to plot what we have
    #ax = sns.tsplot(ax=ax, data=df.values, ci=[68, 95])
    ax = sns.tsplot(ax=ax, data=df.values, err_style="unit_traces")  # note: tsplot is removed in newer seaborn; lineplot is its replacement
plt.show()
Now we do the clustering and plot it:
# Do the clustering
Z = hac.linkage(timeSeries, method='single', metric='correlation')

# Plot dendrogram
plt.figure(figsize=(25, 10))
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('sample index')
plt.ylabel('distance')
hac.dendrogram(
    Z,
    leaf_rotation=90.,  # rotates the x axis labels
    leaf_font_size=8.,  # font size for the x axis labels
)
plt.show()
If we want to decide what kind of correlation to apply, or to use another distance metric, we can provide a custom metric function:
# Example: Pearson correlation turned into a distance (swap in stats.spearmanr for Spearman)
def my_metric(x, y):
    r = stats.pearsonr(x, y)[0]
    return 1 - r  # correlation to distance: range 0 to 2
# Do the clustering
Z = hac.linkage(timeSeries, method='single', metric=my_metric)

# Plot dendrogram
plt.figure(figsize=(25, 10))
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('sample index')
plt.ylabel('distance')
hac.dendrogram(
    Z,
    leaf_rotation=90.,  # rotates the x axis labels
    leaf_font_size=8.,  # font size for the x axis labels
)
plt.show()
To retrieve the Clusters we can use the fcluster function. It can be run in multiple ways (check the documentation) but in this example we'll give it as target the number of clusters we want:
from scipy.cluster.hierarchy import fcluster

def print_clusters(timeSeries, Z, k, plot=False):
    # k: number of clusters to extract
    results = fcluster(Z, k, criterion='maxclust')
    # check the results
    s = pd.Series(results)
    clusters = s.unique()
    for c in clusters:
        cluster_indices = s[s == c].index
        print("Cluster %d number of entries %d" % (c, len(cluster_indices)))
        if plot:
            timeSeries.T.iloc[:, cluster_indices].plot()
            plt.show()

print_clusters(timeSeries, Z, 6, plot=False)
Output:
Cluster 2 number of entries 10
Cluster 5 number of entries 10
Cluster 3 number of entries 10
Cluster 6 number of entries 10
Cluster 1 number of entries 10
Cluster 4 number of entries 10
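For completeness, since the question starts from a precomputed correlation matrix: linkage can also be fed a condensed distance matrix directly, so the correlation matrix itself can be clustered (a sketch, using correlation_matrix from the question):

import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import linkage, dendrogram
from scipy.spatial.distance import squareform

# Turn correlations into distances (0 = perfectly correlated) and condense
# the square, symmetric matrix into the 1-D form that linkage expects.
distance_matrix = 1 - correlation_matrix
condensed = squareform(distance_matrix.values, checks=False)

Z = linkage(condensed, method='average')
dendrogram(Z, labels=correlation_matrix.columns.tolist())
plt.show()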
