Editing a Heatmap in Python via Scipy

I am trying to plot a heatmap using some code I found online, but I am having difficulty with it. I am doing hierarchical clustering to compare gene methylation (beta values) across genes. I made a DataFrame using pandas with Betavalues and Genes as separate columns (df4), and later converted it to a matrix, as scipy prefers. When I ran the code with my matrix it gave me a ValueError: "could not convert string to float: 'tAKR'", even though I had already removed the N/A rows and anything that is not a gene or a valid beta value.
I was wondering if you may have any suggestions?
Below I have attached a picture of what my DataFrame looks like before converting it into a matrix.
import scipy
import pylab
import scipy.cluster.hierarchy as sch
df5 = df4.as_matrix()
# Generate random features and distance matrix.
x = scipy.rand(40)
D = scipy.zeros([40,40])
for i in range(40):
    for j in range(40):
        D[i,j] = abs(x[i] - x[j])
# Compute and plot first dendrogram.
fig = pylab.figure(figsize=(8,8))
ax1 = fig.add_axes([0.09,0.1,0.2,0.6])
Y = sch.linkage(df5, method='centroid')
Z1 = sch.dendrogram(Y, orientation='right')
ax1.set_xticks([])
ax1.set_yticks([])
# Compute and plot second dendrogram.
ax2 = fig.add_axes([0.3,0.71,0.6,0.2])
Y = sch.linkage(df5, method='single')
Z2 = sch.dendrogram(Y)
ax2.set_xticks([])
ax2.set_yticks([])
# Plot distance matrix.
axmatrix = fig.add_axes([0.3,0.1,0.6,0.6])
idx1 = Z1['leaves']
idx2 = Z2['leaves']
D = D[idx1,:]
D = D[:,idx2]
im = axmatrix.matshow(D, aspect='auto', origin='lower', cmap=pylab.cm.YlGnBu)
axmatrix.set_xticks([])
axmatrix.set_yticks([])
# Plot colorbar.
axcolor = fig.add_axes([0.91,0.1,0.02,0.6])
pylab.colorbar(im, cax=axcolor)
fig.show()
fig.savefig('dendrogram.png')
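The traceback suggests that sch.linkage is being handed the gene-name strings (such as 'tAKR') along with the beta values. A minimal sketch of passing it a purely numeric array; the column names 'Genes' and 'Betavalues' are taken from the description above, so adjust them to match your actual frame:

df5 = (df4.dropna()
          .set_index('Genes')[['Betavalues']]   # keep the gene-name strings out of the values
          .astype(float)
          .to_numpy())
Y = sch.linkage(df5, method='centroid')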

You may be interested in an out-of-the-box solution if you're not comfortable with this. Check out seaborn's clustermap, which accepts pandas DataFrames as input.
>>> import seaborn as sns; sns.set()
>>> flights = sns.load_dataset("flights")
>>> flights = flights.pivot(index="month", columns="year", values="passengers")
>>> g = sns.clustermap(flights)
I am not affiliated with Seaborn.
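For the gene data in the question, a minimal sketch of the same idea, assuming the frame can be pivoted into a genes × samples matrix of beta values (the 'Sample' column here is hypothetical; substitute whatever identifies your samples):

import seaborn as sns

# Hypothetical wide layout: one row per gene, one column per sample, values = beta values.
mat = df4.pivot(index="Genes", columns="Sample", values="Betavalues")
g = sns.clustermap(mat.dropna(), cmap="YlGnBu")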

Related

Create a Seaborn style histogram / kernel density plot using the actual density function

I really like the look of Seaborn's KDE plot:
I was wondering how I can replicate this for a line plot.
In my case I actually have the function that generates the density, instead of samples of the data.
So assuming I have the data in a data frame:
x - The value of x per sample.
y - The value of the density function at the corresponding x.
μσ - Categorical variable to group data from the same density (in the code, I use the mean and standard deviation of a normal distribution).
I can use Seaborn's lineplot to get what I want, but without the area below the curve as in the image above.
I'm after achieving the look above for the data I have.
Is there a way to replicate this theme, area under the curve included, for lineplot?
The code below shows what I got so far:
import numpy as np
import scipy as sp
import pandas as pd
from scipy.stats import norm
import matplotlib.pyplot as plt
import seaborn as sns
num_grid_pts = 1000
val_μ = [0, -1, 1, 0]
val_σ = [1, 2, 3, 4]
num_var = len(val_μ) # variations
x = np.linspace(-10, 10, num_grid_pts)
P = np.zeros((num_grid_pts, num_var)) # PDF
μσ = [f'μ = {μ}, σ = {σ}' for μ, σ in zip(val_μ, val_σ)]
for ii, (μ, σ) in enumerate(zip(val_μ, val_σ)):
    randVar = norm(μ, σ)
    P[:, ii] = randVar.pdf(x)
df_P = pd.DataFrame(data = {'x': np.tile(x, num_var), 'PDF': P.flatten('F'), 'μσ': np.repeat(μσ, len(x))})
f, ax = plt.subplots(figsize=(15, 10))
sns.lineplot(data=df_P, x='x', y='PDF', hue='μσ', ax=ax)
plot_lines = ax.get_lines()
for ii in range(num_var):
    ax.fill_between(x=plot_lines[ii].get_xdata(), y1=plot_lines[ii].get_ydata(), alpha=0.25, color=plot_lines[ii].get_color())
ax.set_title(f'Normal Distribution')
ax.set_xlabel(f'Value')
ax.set_ylabel(f'Probability')
plt.show()
I used the lineplot to create the lines and then created the fills. But this is a hack; I was wondering if I can do it more naturally within Seaborn.
I found a way to manually build this using the Area object from the seaborn.objects interface:
(
so.Plot(healthexp, "Year", "Spending_USD", color="Country")
.add(so.Area(alpha=.7), so.Stack())
)
The result is:
Yet for some reason the example code doesn't work.
What I did was use Seaborn's lineplot() and then manually add a fill_between() polygon:
ax = sns.lineplot(data=data_frame, x='data_x', y='data_y', hue='data_color')
plot_lines = ax.get_lines()
for i in range(num_unique_colors):
    ax.fill_between(x=plot_lines[i].get_xdata(), y1=plot_lines[i].get_ydata(), alpha=0.25, color=plot_lines[i].get_color())
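If a more built-in route is acceptable and your seaborn is recent enough (the objects interface requires seaborn >= 0.12), the Area mark can draw the filled regions directly, without the fill_between loop. A sketch against the df_P frame built in the question above (no stacking, since these are independent densities):

import seaborn.objects as so

(
    so.Plot(df_P, x="x", y="PDF", color="μσ")
    .add(so.Area(alpha=.25))   # translucent fill from the baseline up to each curve
    .add(so.Line())            # crisp outline on top, close to the KDE-plot look
    .show()
)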

Plotting a fancy diagonal correlation matrix in python with coefficients in upper triangle

I have the following synthetic dataframe, including numerical and categorical columns as well as the label column.
I want to plot a diagonal correlation matrix and display correlation coefficients in the upper part as the following:
expected output:
Note that the categorical columns within the synthetic dataset/dataframe df need to be converted into numerical ones first. So far I have used this seaborn example with the 'titanic' dataset, which is synthetic and fits my task, but I added a label column as follows:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_theme(style="white")
# Generate a large random dataset with synthetic nature (categorical + numerical)
data = sns.load_dataset("titanic")
df = pd.DataFrame(data=data)
# Generate label column randomly '0' or '1'
df['label'] = np.random.randint(0,2, size=len(df))
# Compute the correlation matrix
corr = df.corr()
# Generate a mask for the upper triangle
mask = np.triu(np.ones_like(corr, dtype=bool))
# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))
# Generate a custom diverging colormap
cmap = sns.diverging_palette(230, 20, as_cmap=True)
# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, mask=mask, cmap=cmap, vmin=-1.0, vmax=1.0, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})
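As an aside on the conversion point above: on recent pandas (>= 2.0), df.corr() no longer silently drops non-numeric columns, so the categoricals have to be handled explicitly. A minimal sketch of two common options:

# Option 1: correlate the numeric columns only (numeric_only was added in pandas 1.5).
corr = df.corr(numeric_only=True)

# Option 2: integer-encode object/category columns first, then correlate everything.
df_enc = df.copy()
for col in df_enc.select_dtypes(include=["object", "category"]).columns:
    df_enc[col] = df_enc[col].factorize()[0]
corr = df_enc.corr()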
I checked a related post but couldn't figure out how to do this task with it. The best I could find so far is this workaround, based on a package that can be installed as shown below, which gives me the following output:
#!pip install heatmapz
# Import the two methods from heatmap library
from heatmap import heatmap, corrplot
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_theme(style="white")
# Generate a large random dataset
data = sns.load_dataset("titanic")
df = pd.DataFrame(data=data)
# Generate label column randomly '0' or '1'
df['label'] = np.random.randint(0,2, size=len(df))
# Compute the correlation matrix
corr = df.corr()
# Generate a mask for the upper triangle
mask = np.triu(np.ones_like(corr, dtype=bool))
mask[np.diag_indices_from(mask)] = False
np.fill_diagonal(mask, True)
# Set up the matplotlib figure
plt.figure(figsize=(8, 8))
# Draw the heatmap using "Heatmapz" package
corrplot(corr[mask], size_scale=300)
Sadly, corr[mask] doesn't mask the upper triangle in this package.
I also noticed that reaching this fancy plot is much easier in R, so I'm open to a more straightforward way of converting a pandas DataFrame to an R dataframe. It seems there is a package called rpy2 that lets us use Python and R together, even in a Google Colab notebook: Ref.1
from rpy2.robjects import pandas2ri
pandas2ri.activate()
If that is the route, I found post1 & post2 on visualizing a correlation matrix in R.
So, in short, my first priority is a solution using Python and its packages (Matplotlib, seaborn, Plotly Express), and only then R and its packages, to reach the expected output.
Note
I have provided executable R code in a Google Colab notebook using the dataset, so that you can form/test your final answer if your solution uses rpy2; otherwise I'd be interested in a Pythonic solution.
I'm not an expert in rpy2, so I can't help there, but here is how I would build it out in R. Since I don't have your data, I can't promise that everything will work perfectly for your dataset, but here is a general outline:
library(tidyverse)

# get some data
df <- as_tibble(mtcars) |>
  (\(d) select(d, order(colnames(d))))()

# calculate correlation matrix
cor_mat <- cor(df)

# make 2 "blank" matrices
low <- matrix(NA, nrow = nrow(cor_mat), ncol = ncol(cor_mat))
up <- matrix(NA, nrow = nrow(cor_mat), ncol = ncol(cor_mat))

# populate upper and lower matrices
up[upper.tri(up)] <- cor_mat[upper.tri(cor_mat)]
low[lower.tri(low)] <- cor_mat[lower.tri(cor_mat)]

# pivot upper and lower for plotting
lower_dat <- low |>
  as.data.frame() |>
  `colnames<-`(colnames(df)) |>
  mutate(xvar = colnames(df)) |>
  pivot_longer(cols = -xvar, names_to = "yvar")

upper_dat <- up |>
  as.data.frame() |>
  `colnames<-`(colnames(df)) |>
  mutate(xvar = colnames(df)) |>
  pivot_longer(cols = -xvar, names_to = "yvar")

# plot
lower_dat |> # lower matrix data
  ggplot(aes(xvar, yvar)) +
  geom_tile(fill = NA, color = "grey") + # background grid
  geom_point(aes(fill = value, size = value), pch = 22) + # different sized points
  geom_text(data = upper_dat, aes(color = value, label = round(value, 2))) + # plot cor in upper right
  scale_size_continuous(breaks = seq(-1, 1, by = 0.5)) + # define size breaks
  labs(x = "", y = "") + # remove unnecessary labels
  scale_fill_gradient2(low = "darkred", mid = "white", high = "darkblue", midpoint = 0) + # define square colors
  scale_color_gradient2(low = "darkred", mid = "white", high = "darkblue", midpoint = 0) + # define text colors
  scale_x_discrete(limits = rev) + # rev to put the triangle on a certain side
  # make it look pretty
  theme(panel.background = element_blank(),
        panel.border = element_rect(fill = NA, color = "black"),
        axis.text = element_text(color = "black", size = 10),
        axis.title = element_text(size = 12))
Another option is creating two corrplots from the corrplot package in R. You can specify one plot with add=TRUE to combine both plots. Here is a reproducible example with mtcars dataset:
library(corrplot)
M<-cor(mtcars)
diag(M) <- 0
corrplot(M, method="number", type = "upper", tl.pos = "t")
corrplot(M, method="square", type = "lower", tl.pos = "l", cl.pos = "n", add = TRUE)
Output:
I'd be interested in a Pythonic solution.
Use a seaborn scatter plot with matplotlib text/line annotations:
Plot the lower triangle via sns.scatterplot with square markers
Annotate the upper triangle via plt.text
Draw the heatmap grid via plt.vlines and plt.hlines
Full code using the titanic sample:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme(style="white")
# generate sample correlation matrix
df = sns.load_dataset("titanic")
df["label"] = np.random.randint(0, 2, size=len(df))
corr = df.corr()
# mask and melt correlation matrix
mask = np.tril(np.ones_like(corr, dtype=bool)) | corr.abs().le(0.1)
melt = corr.mask(mask).melt(ignore_index=False).reset_index()
melt["size"] = melt["value"].abs()
fig, ax = plt.subplots(figsize=(8, 6))
# normalize colorbar
cmap = plt.cm.RdBu
norm = plt.Normalize(-1, 1)
sm = plt.cm.ScalarMappable(norm=norm, cmap=cmap)
cbar = plt.colorbar(sm, ax=ax)
cbar.ax.tick_params(labelsize="x-small")
# plot lower triangle (scatter plot with normalized hue and square markers)
sns.scatterplot(ax=ax, data=melt, x="index", y="variable", size="size",
                hue="value", hue_norm=norm, palette=cmap,
                style=0, markers=["s"], legend=False)
# format grid
xmin, xmax = (-0.5, corr.shape[0] - 0.5)
ymin, ymax = (-0.5, corr.shape[1] - 0.5)
ax.vlines(np.arange(xmin, xmax + 1), ymin, ymax, lw=1, color="silver")
ax.hlines(np.arange(ymin, ymax + 1), xmin, xmax, lw=1, color="silver")
ax.set(aspect=1, xlim=(xmin, xmax), ylim=(ymax, ymin), xlabel="", ylabel="")
ax.tick_params(labelbottom=False, labeltop=True)
plt.xticks(rotation=90)
# annotate upper triangle
for y in range(corr.shape[0]):
    for x in range(corr.shape[1]):
        value = corr.mask(mask).to_numpy()[y, x]
        if pd.notna(value):
            plt.text(x, y, f"{value:.2f}", size="x-small",
                     # color=sm.to_rgba(value), weight="bold",
                     ha="center", va="center")
Note that since most of these titanic correlations are low, I disabled the text coloring for readability.
If you want color-coded text, uncomment the color=sm.to_rgba(value) line at the end:
I cannot set up the heatmap package on Windows, but have you tried setting the upper-triangle elements to nan?
corr_masked = corr.copy()
corr_masked[mask] = np.nan
corrplot(corr_masked, size_scale=300)
plt.plot, for example, does not plot nan samples, so the same trick may work here. If not, just setting the upper-triangle elements to 0 may suffice (or to whatever value corresponds to white on the color scale).

Python 2D interpolation with scipy.interpolate.RBFInterpolator

Last week I asked a question about finding a way to interpolate a surface from multiple curves (data from multiple Excel files) and someone referred me to a question which explains how to use scipy.interpolate.RBFInterpolator (How can I perform two-dimensional interpolation using scipy?).
I tried this method but I am getting a bad surface fit (see the pictures below). Does anyone understand what is wrong with my code? I tried changing the kernel parameter, but "linear" seems to be the best. Am I making an error in how I use np.meshgrid? Thanks for the help.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
from scipy.interpolate import RBFInterpolator
fig = plt.figure(figsize=(15,10),dpi=400)
ax = fig.gca(projection='3d')
# List all the results files in the folder (here 'Strain_Stress') to plot them.
results_list = os.listdir(r"C:/Users/bdhugu/Desktop/Strain_Stress")
for i in range(len(results_list)):
    if i == 0:
        results = pd.read_excel(r"C:/Users/bdhugu/Desktop/Strain_Stress/"+results_list[i])
        strain = results["Strain (mm/mm)"]
        stress = results["Stress (MPa)"]
        strain_rate = results["Strain rate (s^-1)"]
    if i > 0:
        new_results = pd.read_excel(r"C:/Users/bdhugu/Desktop/Strain_Stress/"+results_list[i])
        new_strain = new_results["Strain (mm/mm)"]
        new_stress = new_results["Stress (MPa)"]
        new_strain_rate = new_results["Strain rate (s^-1)"]
        strain = strain.append(new_strain, ignore_index=False)
        stress = stress.append(new_stress, ignore_index=False)
        strain_rate = strain_rate.append(new_strain_rate, ignore_index=False)
# RBFINTERPOLATOR METHOD
# ----------------------------------------------------------------------------
x_scattered = strain
y_scattered = strain_rate
z_scattered = stress
scattered_points = np.stack([x_scattered.ravel(), y_scattered.ravel()],-1)
x_dense, y_dense = np.meshgrid(np.linspace(min(strain), max(strain), 20),np.linspace(min(strain_rate), max(strain_rate), 21))
dense_points = np.stack([x_dense.ravel(), y_dense.ravel()], -1)
interpolation = RBFInterpolator(scattered_points, z_scattered.ravel(), smoothing = 0, kernel='linear',epsilon=1, degree=0)
z_dense = interpolation(dense_points).reshape(x_dense.shape)
fig = plt.figure(figsize=(15,10),dpi=400)
ax = plt.axes(projection='3d')
ax.plot_surface(x_dense, y_dense, z_dense ,cmap='viridis', edgecolor='none')
ax.invert_xaxis()
ax.set_title('Surface plot')
plt.show()
Data to interpolate
Surface fitting with RBFInterpolator
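One thing worth checking, though this is an assumption on my part since I don't have the Excel data: strain is small (order 0.1–1) while strain rate can span a much larger range, and an RBF on unscaled features is dominated by whichever axis has the larger numbers. A minimal sketch with synthetic stand-in data showing the rescaling step:

import numpy as np
import matplotlib.pyplot as plt
from scipy.interpolate import RBFInterpolator

# Synthetic stand-in for the scattered (strain, strain_rate, stress) points.
rng = np.random.default_rng(0)
strain = rng.uniform(0.0, 0.5, 400)            # mm/mm, order 0.1
strain_rate = rng.uniform(0.001, 100.0, 400)   # s^-1, order 100
stress = 300 * strain**0.2 * np.log10(strain_rate + 1.1)

pts = np.column_stack([strain, strain_rate])
lo = pts.min(axis=0)
span = pts.max(axis=0) - pts.min(axis=0)
pts_scaled = (pts - lo) / span                 # both features now live in [0, 1]

rbf = RBFInterpolator(pts_scaled, stress, kernel='thin_plate_spline')

x_dense, y_dense = np.meshgrid(np.linspace(0.0, 0.5, 40),
                               np.linspace(0.001, 100.0, 40))
dense = np.column_stack([x_dense.ravel(), y_dense.ravel()])
z_dense = rbf((dense - lo) / span).reshape(x_dense.shape)

fig, ax = plt.subplots(subplot_kw={'projection': '3d'}, figsize=(10, 7))
ax.plot_surface(x_dense, y_dense, z_dense, cmap='viridis', edgecolor='none')
plt.show()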

Line-based heatmap or 2D line histogram

I have a synthetic dataset with 1000 noisy polynomials of various orders and sin/cos curves that I can plot as lines using python seaborn.
Since I have quite a few lines that are overlapping, I'd like to plot some sort of heatmap or histogram of my line graphs.
I've tried iterating over the columns and aggregating the counts to use seaborn's heatmap graph, but with many lines this takes quite a while.
The next best thing that comes close to what I want is a hexbin plot (with seaborn's jointplot).
But it's a compromise between runtime and granularity (the shown graph has gridsize 750). I couldn't find any other graph-type for my problem. But I also don't know exactly what it might be called.
I've also tried with line alpha set to 0.2. This results in a similar graph to what I want. But it's less precise (if more than 5 lines overlap at the same point I already have zero transparency left). Also, it misses the typical coloration of heatmaps.
(Moot search terms were: heatmap, 2D line histogram, line histogram, density plots...)
Does anybody know of packages that can plot this more efficiently and at higher quality, or know how to do it with the popular Python plotting libraries (i.e. the matplotlib family: matplotlib, seaborn, bokeh)? I'm really fine with any package though.
It took me a while, but I finally solved this using Datashader. If using a notebook, the plots can be embedded into interactive Bokeh plots, which looks really nice.
Anyhow, here is the code for static images, in case someone else is in need of something similar:
# coding: utf-8
import time
import numpy as np
from numpy.polynomial import polynomial
import pandas as pd
import matplotlib.pyplot as plt
import datashader as ds
import datashader.transfer_functions as tf
plt.style.use("seaborn-whitegrid")
def create_data():
    # ...

# Each column is one data sample
df = create_data()
# Following will append a nan-row and reshape the dataframe into two columns, with each sample stacked on top of each other
# THIS IS CRUCIAL TO OPTIMIZE SPEED: https://github.com/bokeh/datashader/issues/286
# Append row with nan-values
df = pd.concat([df, pd.DataFrame([np.array([np.nan] * len(df.columns))], columns=df.columns, index=[np.nan])])  # DataFrame.append was removed in newer pandas
# Reshape
x, y = df.shape
arr = df.to_numpy().reshape((x * y, 1), order='F')  # as_matrix() was removed in newer pandas
df_reshaped = pd.DataFrame(arr, columns=list('y'), index=np.tile(df.index.values, y))
df_reshaped = df_reshaped.reset_index()
df_reshaped.columns.values[0] = 'x'
# Plotting parameters
x_range = (min(df.index.values), max(df.index.values))
y_range = (df.min().min(), df.max().max())
w = 1000
h = 750
dpi = 150
cvs = ds.Canvas(x_range=x_range, y_range=y_range, plot_height=h, plot_width=w)
# Aggregate data
t0 = time.time()
aggs = cvs.line(df_reshaped, 'x', 'y', ds.count())
print("Time to aggregate line data: {}".format(time.time()-t0))
# One colored plot
t1 = time.time()
stacked_img = tf.Image(tf.shade(aggs, cmap=["darkblue", "darkblue"]))
print("Time to create stacked image: {}".format(time.time() - t1))
# Save
f0 = plt.figure(figsize=(w / dpi, h / dpi), dpi=dpi)
ax0 = f0.add_subplot(111)
ax0.imshow(stacked_img.to_pil())
ax0.grid(False)
f0.savefig("stacked.png", bbox_inches="tight", dpi=dpi)
# Heat map - This uses a equalized histogram (built-in default), there are other options, though.
t2 = time.time()
heatmap_img = tf.Image(tf.shade(aggs, cmap=plt.cm.Spectral_r))
print("Time to create stacked image: {}".format(time.time() - t2))
# Save
f1 = plt.figure(figsize=(w / dpi, h / dpi), dpi=dpi)
ax1 = f1.add_subplot(111)
ax1.imshow(heatmap_img.to_pil())
ax1.grid(False)
f1.savefig("heatmap.png", bbox_inches="tight", dpi=dpi)
With following run times (in seconds):
Time to aggregate line data: 0.7710442543029785
Time to create stacked image: 0.06000351905822754
Time to create stacked image: 0.05600309371948242
The resulting plots:
Although it seems you have tried this, plotting the counts seems to give a good representation of the data. However, it really depends on what you're trying to find in your data and what it is supposed to tell you.
The reason for the long run time is plotting so many lines; a heatmap based on the counts, however, will plot fairly quickly.
I created some dummy data for sine waves, based on noise, number of lines, amplitude and shift, and added both a boxplot and a heatmap.
import matplotlib.pyplot as plt
import numpy as np
import matplotlib as mpl
import random
import pandas as pd
np.random.seed(0)
#create dummy data
N = 200
sinuses = []
no_lines = 200
for i in range(no_lines):
    a = np.random.randint(5, 40)/5 #amplitude
    x = random.choice([int(N/5), int(N/(2/5))]) #random shift
    sinuses.append(np.roll(a * np.sin(np.linspace(0, 2 * np.pi, N)) + np.random.randn(N), x))
fig = plt.figure(figsize=(20 / 2.54, 20 / 2.54))
sins = pd.DataFrame(sinuses, )
ax1 = plt.subplot2grid((3,10), (0,0), colspan=10)
ax2 = plt.subplot2grid((3,10), (1,0), colspan=10)
ax3 = plt.subplot2grid((3,10), (2,0), colspan=9)
ax4 = plt.subplot2grid((3,10), (2,9))
# plot line data
sins.T.plot(ax=ax1, color='lightblue',linewidth=.3)
ax1.legend_.remove()
ax1.set_xlim(0, N)
# try boxplot
sins.plot.box(ax=ax2, showfliers=False)
xticks = ax2.xaxis.get_major_ticks()
for index, label in enumerate(ax2.get_xaxis().get_ticklabels()):
    xticks[index].set_visible(False) # hide ticks where labels are hidden
#make a list of bins
no_bins = 20
bins = list(np.arange(sins.min().min(), sins.max().max(), int(abs(sins.min().min())+sins.max().max())/no_bins))
bins.append(sins.max().max())
# calculate histogram
hists = []
for col in sins.columns:
    count, division = np.histogram(sins.iloc[:,col], bins=bins)
    hists.append(count)
hists = pd.DataFrame(hists, columns=[str(i) for i in bins[1:]])
print(hists.shape, '\n', hists.head())
cmap = mpl.colors.ListedColormap(['white', '#FFFFBB', '#C3FDB8', '#B5EAAA', '#64E986', '#54C571',
                                  '#4AA02C', '#347C17', '#347235', '#25383C', '#254117'])
#heatmap
im = ax3.pcolor(hists.T, cmap=cmap)
cbar = plt.colorbar(im, cax=ax4)
yticks = np.arange(0, len(bins))
yticklabels = hists.columns.tolist()
ax3.set_yticks(yticks)
ax3.set_yticklabels([round(i,1) for i in bins])
ax3.set_title('Count')
yticks = ax3.yaxis.get_major_ticks()
for index, label in enumerate(ax3.get_yaxis().get_ticklabels()):
    if index % 3 != 0: #make some labels invisible
        yticks[index].set_visible(False) # hide ticks where labels are hidden
plt.show()
Although the boxplot is easy to interpret, it doesn't show the actual distribution of the data very well, but knowing where the median and quantiles lie may be helpful.
Increasing the number of lines and the number of values per line will increase plotting time considerably for the line plots; the heatmap is still fairly quick to generate, though. The boxplot becomes indiscernible, however.
I couldn't exactly replicate your data (or know the actual size of it), but perhaps the heatmap may be helpful.
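For reference, a similar count-based picture can be built in a single call with np.histogram2d; a sketch against the sins frame from the code above (the number of y bins is arbitrary):

import numpy as np
import matplotlib.pyplot as plt

xs = np.tile(np.arange(sins.shape[1]), sins.shape[0])   # x index, repeated once per line
ys = sins.to_numpy().ravel()                             # every sampled y value
counts, xedges, yedges = np.histogram2d(xs, ys, bins=[sins.shape[1], 40])

plt.pcolormesh(xedges, yedges, counts.T, cmap='viridis')
plt.colorbar(label='count')
plt.show()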

Scipy dendrogram with names

I'm using the example dendrogram from this post in my work but would also like to keep track of which row / column is from which piece of data.
I've edited the code to record the names of the data points in a list called names, as shown below, and would like to print the names at the bottom and to the right of the distance matrix visualization. I've tried adding labels = names in the call to dendrogram, but this didn't help.
Does anyone know how to add labels to this?
import scipy
import pylab
import scipy.cluster.hierarchy as sch
# Generate random features and distance matrix.
x = scipy.rand(40)
D = scipy.zeros([40,40])
for i in range(40):
    for j in range(40):
        D[i,j] = abs(x[i] - x[j])
### new code
names = []
for i in range(40):
    names.append('str%i' % i)
    print(names[-1])
### end new code
# Compute and plot first dendrogram.
fig = pylab.figure(figsize=(8,8))
ax1 = fig.add_axes([0.09,0.1,0.2,0.6])
Y = sch.linkage(D, method='centroid')
Z1 = sch.dendrogram(Y, orientation='right')
ax1.set_xticks([])
ax1.set_yticks([])
# Compute and plot second dendrogram.
ax2 = fig.add_axes([0.3,0.71,0.6,0.2])
Y = sch.linkage(D, method='single')
Z2 = sch.dendrogram(Y)
ax2.set_xticks([])
ax2.set_yticks([])
# Plot distance matrix.
axmatrix = fig.add_axes([0.3,0.1,0.6,0.6])
idx1 = Z1['leaves']
idx2 = Z2['leaves']
D = D[idx1,:]
D = D[:,idx2]
im = axmatrix.matshow(D, aspect='auto', origin='lower', cmap=pylab.cm.YlGnBu)
axmatrix.set_xticks([])
axmatrix.set_yticks([])
# Plot colorbar.
#axcolor = fig.add_axes([0.91,0.1,0.02,0.6])
#pylab.colorbar(im, cax=axcolor)
fig.show()
fig.savefig('dendrogram.png')
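For reference, the distance-matrix axes can also be labelled directly with matplotlib, reordering names by the dendrogram leaf order; a sketch that would replace the empty set_xticks([]) / set_yticks([]) calls on axmatrix above:

# Columns were reordered by idx2 and rows by idx1, so reorder the labels the same way.
axmatrix.set_xticks(range(len(names)))
axmatrix.set_xticklabels([names[i] for i in idx2], rotation=90, fontsize=6)
axmatrix.xaxis.set_ticks_position('bottom')

axmatrix.set_yticks(range(len(names)))
axmatrix.set_yticklabels([names[i] for i in idx1], fontsize=6)
axmatrix.yaxis.set_ticks_position('right')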
The python package heatmapcluster (available on PyPI) that I wrote accepts (in fact, requires) labels.
Here's a simplified version of your script using heatmapcluster:
import numpy as np
import matplotlib.pyplot as plt
from heatmapcluster import heatmapcluster
# Generate random features and distance matrix.
x = np.random.rand(40)
D = np.abs(np.subtract.outer(x, x))
names = ['str%i' % i for i in range(len(x))]
h = heatmapcluster(D, names, names,
                   num_row_clusters=3, num_col_clusters=3,
                   label_fontsize=8,
                   xlabel_rotation=-75,
                   cmap=plt.cm.coolwarm,
                   show_colorbar=True,
                   top_dendrogram=True)
plt.show()
And here is the plot it generates:
(Note that, for a symmetric array like D, there is really no point in clustering both axes. By symmetry, they will generate the same dendrogram.)
