Scipy dendrogram with names - python

I'm using the example dendrogram from this post in my work but would also like to keep track of which row / column is from which piece of data.
I've edited the code to record the name of each data item in a list called names (see below), and I would like to print those names at the bottom and to the right of the distance matrix visualization. I've tried adding labels=names in the call to dendrogram, but this didn't help.
Does anyone know how to add labels to this?
import numpy as np
import pylab
import scipy
import scipy.cluster.hierarchy as sch
from scipy.spatial.distance import squareform

# Generate random features and the pairwise distance matrix
# D[i, j] = |x[i] - x[j]|.  NumPy is used directly because the
# scipy.rand / scipy.zeros aliases are deprecated.
x = np.random.rand(40)
D = np.abs(np.subtract.outer(x, x))

### new code
# One name per observation, in the original (unsorted) row/column order.
names = []
for i in range(40):
    names.append('str%i' % (i))
    print(names[-1])
### end new code

# linkage() interprets a 2-D array as observation vectors, not as
# distances, so hand it the condensed form of the square matrix.
condensed = squareform(D)

# Compute and plot first dendrogram.
fig = pylab.figure(figsize=(8, 8))
ax1 = fig.add_axes([0.09, 0.1, 0.2, 0.6])
Y = sch.linkage(condensed, method='centroid')
Z1 = sch.dendrogram(Y, orientation='right')
ax1.set_xticks([])
ax1.set_yticks([])

# Compute and plot second dendrogram.
ax2 = fig.add_axes([0.3, 0.71, 0.6, 0.2])
Y = sch.linkage(condensed, method='single')
Z2 = sch.dendrogram(Y)
ax2.set_xticks([])
ax2.set_yticks([])

# Plot distance matrix, with rows/columns reordered to follow the
# leaf order of the two dendrograms.
axmatrix = fig.add_axes([0.3, 0.1, 0.6, 0.6])
idx1 = Z1['leaves']
idx2 = Z2['leaves']
D = D[idx1, :]
D = D[:, idx2]
im = axmatrix.matshow(D, aspect='auto', origin='lower', cmap=pylab.cm.YlGnBu)
axmatrix.set_xticks([])
axmatrix.set_yticks([])

# Plot colorbar.
#axcolor = fig.add_axes([0.91,0.1,0.02,0.6])
#pylab.colorbar(im, cax=axcolor)
fig.show()
fig.savefig('dendrogram.png')

The python package heatmapcluster (available on PyPI) that I wrote accepts (in fact, requires) labels.
Here's a simplified version of your script using heatmapcluster:
import numpy as np
import matplotlib.pyplot as plt
from heatmapcluster import heatmapcluster
# Build 40 random 1-D features and the matrix of their pairwise
# absolute differences.
x = np.random.rand(40)
D = np.abs(x[:, np.newaxis] - x[np.newaxis, :])

# One name per feature; the same names label both rows and columns.
names = ['str%i' % idx for idx, _ in enumerate(x)]

# Keyword options forwarded unchanged to heatmapcluster.
plot_options = dict(
    num_row_clusters=3,
    num_col_clusters=3,
    label_fontsize=8,
    xlabel_rotation=-75,
    cmap=plt.cm.coolwarm,
    show_colorbar=True,
    top_dendrogram=True,
)
h = heatmapcluster(D, names, names, **plot_options)
plt.show()
And here is the plot it generates:
(Note that, for a symmetric array like D, there is really no point in clustering both axes. By symmetry, they will generate the same dendrogram.)

Related

Extracting error bars and point positions from a Python MatPlotLib figure

If I have a Python MatPlotLib figure (for example, a matplotlib.axes._subplots.AxesSubplot object), is there a way to extract from it positions of the points and the error bars? I.e. I want to get arrays containing x,y-coordinates and y-errors.
Example:
import numpy as np
import seaborn as sb
# 10000 x-values drawn uniformly between -2 and 2.
x = np.random.uniform(low=-2, high=2, size=10000)
# Normally distributed y-values with mean x**2 and standard deviation |x| + 1.
y = np.random.normal(loc=x**2, scale=np.abs(x) + 1)
# regplot with 10 x-bins and the regression fit switched off.
p = sb.regplot(x=x, y=y, x_bins=10, fit_reg=None)
How can I extract from 'p' positions of the points and the error bars?
Thank you for your help!
The errorbar data are stored in p.lines, since seaborn plots them using plt.plot.
You can access their positions using line.get_xdata() and line.get_ydata().
The point data are stored in p.collections, since they are plotted internally in seaborn using plt.scatter.
Getting at the point positions from the PathCollection object takes one extra step, as shown in this answer: Get positions of points in PathCollection created by scatter(): i.e. you have to set the offset_position first, before accessing the offsets.
Here's an example to get both the point data and the errorbar data from the
matplotlib Axes object, p.
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
x = np.random.uniform(-2, 2, 10000)
y = np.random.normal(x**2, np.abs(x) + 1)
p = sb.regplot(x=x, y=y, x_bins=10, fit_reg=None)

# First, get the positions of the points:
coll = p.collections[0]
# Older Matplotlib releases need the offsets switched to data coordinates
# before reading them; newer releases no longer provide this setter, so
# only call it when it exists.
if hasattr(coll, 'set_offset_position'):
    coll.set_offset_position('data')
points_xy = coll.get_offsets()
print(points_xy)
#[[-1.65295679 3.05723876]
# [-1.29981986 1.60258005]
# [-0.94417279 0.8999881 ]
# [-0.56964819 0.38035406]
# [-0.20253243 0.0774201 ]
# [ 0.15535504 0.024336 ]
# [ 0.5362322 0.30849082]
# [ 0.90482003 0.85788122]
# [ 1.26136841 1.66294418]
# [ 1.63048127 3.02934186]]

# Next, get the positions of the errorbars: each entry of p.lines is one
# error bar, so its first x value and its y values are collected.
xerr = []
yerr = []
for line in p.lines:
    xerr.append(line.get_xdata()[0])
    yerr.append(line.get_ydata().tolist())
print(xerr)
# [-1.6529567859649865, -1.2998198636006264, -0.94417278886439027, -0.56964818931133276, -0.20253243328132031, 0.15535504153419355, 0.53623219583456194, 0.90482002911787607, 1.2613684083224488, 1.6304812696399549]
print(yerr)
# [[2.908807029542707, 3.200571530218434], [1.4449980200239572, 1.751504207194087], [0.7633753040974505, 1.029774999216172], [0.26593411110949544, 0.4753543268237353], [-0.0030674495857816496, 0.15582564460187567], [-0.052610243112427575, 0.09899773706322114], [0.21019700161329888, 0.41120457637300634], [0.7328000635837721, 0.9826379405190817], [1.508513523393156, 1.8184617796582343], [2.885113765027557, 3.1670479251950376]]
plt.show()
Here points_xy is an array of (x, y) coordinates of the points, xerr is a list of the x-coordinates of the error bars (which are, of course, the same as the x-coordinates in points_xy), and yerr is a list of pairs of y-coordinates: the bottom and top of each error bar.
If you know the points are in the center of the error bars (it looks like they are, for this example), then this should do it:
import numpy as np
import seaborn as sb
# 10000 x-values drawn uniformly between -2 and 2.
x = np.random.uniform(low=-2, high=2, size=10000)
# Normally distributed y-values with mean x**2 and standard deviation |x| + 1.
y = np.random.normal(loc=x**2, scale=np.abs(x) + 1)
# regplot with 10 x-bins and the regression fit switched off.
p = sb.regplot(x=x, y=y, x_bins=10, fit_reg=None)
def get_data(p):
    """Return point positions and error-bar half-heights from an Axes.

    Assumes every entry of ``p.lines`` is one error bar whose y-data holds
    its two end points, and that the plotted point sits at the centre of
    the bar.

    Parameters
    ----------
    p : object with a ``lines`` attribute
        Each line must provide ``get_xdata()`` and ``get_ydata()``.

    Returns
    -------
    x : numpy.ndarray
        First x-value of each line.
    y : numpy.ndarray
        Midpoint of the two y-values of each line.
    y_error : numpy.ndarray
        Second y-value minus the midpoint, i.e. half the bar length.
    """
    x_list = []
    lower_list = []
    upper_list = []
    for line in p.lines:
        ydata = line.get_ydata()  # fetch once instead of once per end point
        x_list.append(line.get_xdata()[0])
        lower_list.append(ydata[0])
        upper_list.append(ydata[1])
    lower = np.asarray(lower_list)
    upper = np.asarray(upper_list)
    y = 0.5 * (lower + upper)
    y_error = upper - y
    x = np.asarray(x_list)
    return x, y, y_error
# get_data returns the tuple (x, y, y_error); the value is not bound to a name here.
get_data(p)
Here the returned y_error will be the magnitude of the error bars.

python KDE get contours and paths into specific json format leaflet-friendly

I am doing a Kernel Density Estimation in Python and getting the contours and paths as shown below. (here is my sample data: https://pastebin.com/193PUhQf).
from numpy import *
from math import *
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
x_2d = []
y_2d = []
data = {}
data['nodes'] = []
# here is the sample data:
# https://pastebin.com/193PUhQf
X = [.....]
for Picker in xrange(0, len(X)):
x_2d.append(X[Picker][0])
y_2d.append(X[Picker][1])
# convert to arrays
m1 = np.array([x_2d])
m2 = np.array([y_2d])
x_min = m1.min() - 30
x_max = m1.max() + 30
y_min = m2.min() - 30
y_max = m2.max() + 30
x, y = np.mgrid[x_min:x_max:200j, y_min:y_max:200j]
positions = np.vstack([x.ravel(), y.ravel()])
values = np.vstack([m1, m2])
kde = stats.gaussian_kde(values)
z = np.reshape(kde(positions).T, x.shape)
fig = plt.figure(2, dpi=200)
ax = fig.add_subplot(111)
pc = ax.pcolor(x, y, z)
cb = plt.colorbar(pc)
cb.ax.set_ylabel('Probability density')
c_s = plt.contour(x, y, z, 20, linewidths=1, colors='k')
ax.plot(m1, m2, 'o', mfc='w', mec='k')
ax.set_title("My Title", fontsize='medium')
plt.savefig("kde.png", dpi=200)
plt.show()
There is a similar way to get the contours using R, which is described here:
http://bl.ocks.org/diegovalle/5166482
Question: how can I achieve the same output using my python script or as a start point?
the desired output should be like contours_tj.json which can be used by leaflet.js lib.
UPDATE:
My input data structure is composed of three columns, comma separated:
first one is the X value
second one is the Y value
third one is the ID of my data, it has no numerical value, it is simply an identifier of the data point.
Update 2:
Question, if simply put, is that I want the same output as in the above link using my input file which is in numpy array format.
update 3:
my input data structure is of list type:
print type(X)
<type 'list'>
and here are the first few lines:
print X[0:5]
[[10.800584, 11.446064, 4478597], [10.576840,11.020229, 4644503], [11.434276,10.790881, 5570870], [11.156718,11.034633, 6500333], [11.054956,11.100243, 6513301]]
geojsoncontour is a python library to convert matplotlib contours to geojson
geojsoncontour.contour_to_geojson requires a contour_levels argument. The levels in pyplot.contour are chosen automatically, but you can access them with c_s._levels
So, for your example you could do:
import geojsoncontour
# your code here
c_s = plt.contour(x, y, z, 20, linewidths=1, colors='k')
# Convert matplotlib contour to geojson
geojsoncontour.contour_to_geojson(
    contour=c_s,
    geojson_filepath='out.geojson',
    # Read the automatically chosen levels from the public `levels`
    # attribute instead of the private `_levels`.
    contour_levels=c_s.levels,
    ndigits=3,
    unit='m'
)

Removing Data Below A Line In A Scatterplot (Python)

So I had code that plotted a 2D histogram of my dataset. I plotted it like so:
# 2D histogram of ofesc against fehsc, restricted to the given x and y ranges.
histogram = plt.hist2d(fehsc, ofesc, bins=nbins, range=[[-1,.5],[0.225,0.4]])
I wanted to only look at data above a certain line though, so I added the following and it worked just fine:
counts = histogram[0]
xpos = histogram[1]
ypos = histogram[2]
image = histogram[3]
# Work on a real copy: plain assignment would only alias `counts`, so
# zeroing entries would silently modify the original histogram as well.
newcounts = counts.copy()
# ypos holds bin edges (one more entry than a row of counts has bins), so
# compare against the lower edge of each y-bin to keep the shapes aligned
# and avoid indexing past the end of a row.
ylower = ypos[:-1]
for i in range(nbins):
    # height of the cut line at the left edge of x-bin i
    yline = m * xpos[i] + b
    # anything less than yline we want to be 0
    newcounts[i, ylower < yline] = 0
However, I now need to draw a regression line through that cut region. Doing so is not possible (AFAIK) with plt.hist2d, so I'm using plt.scatter. The problem is that I don't know how to make that cut anymore, because I can't index the scatterplot.
I have this now:
# Fix the visible window first, then draw every data point as a dot.
plt.xlim(-1, 0.5)
plt.ylim(0.225, 0.4)
scatter = plt.scatter(fehsc, ofesc, marker=".")
and I only want to retain the data above some line:
# Sample the cut line y = m*x + b across the plotted x-range and draw it in red.
xarr = np.arange(-1, 0.5, 0.015)
yarr = b + m * xarr
plt.plot(xarr, yarr, color='r')
I've tried running the loop with some variations of the variables but I don't actually understand or know how to get it to work.
You could define a mask for your data before you plot and then just plot the data points that actually meet your criteria. Below an example, where all data points above a certain line are plotted in green and all data points below the line are plotted in black.
from matplotlib import pyplot as plt
import numpy as np
# the scatterplot data: 100 random points in the unit square
xvals = np.random.rand(100)
yvals = np.random.rand(100)

# the line y = m*x + b, sampled for drawing
b = 0.1
m = 1
x = np.linspace(0, 1, num=100)
y = b + m * x

# True where a point lies strictly above the line
mask = yvals > m * xvals + b

# points above the line in green, the remaining points in black
for selector, colour in ((mask, 'g'), (~mask, 'k')):
    plt.scatter(xvals[selector], yvals[selector], color=colour)
plt.plot(x, y, 'r')
plt.show()
The result looks like this
Hope this helps.
EDIT:
If you want to create a 2D histogram, where the portion below the line is set to zero, you can do that by first generating the histogram using numpy (as an array) and then setting the values inside that array to zero, if the bins fall below the line. After that, you can plot the matrix using plt.pcolormesh:
from matplotlib import pyplot as plt
import numpy as np
#the scatterplot data
xvals = np.random.rand(1000)
yvals = np.random.rand(1000)
histogram, xbins, ybins = np.histogram2d(xvals, yvals, bins=50)
# np.histogram2d puts x along the FIRST axis, whereas meshgrid (default
# 'xy' indexing) and pcolormesh both expect rows to run along y.
# Transpose once so the mask and the plot line up with the data.
histogram = histogram.T

#computing the bin centers from the bin edges:
xcenters = 0.5 * (xbins[:-1] + xbins[1:])
ycenters = 0.5 * (ybins[:-1] + ybins[1:])

#the line
b = 0.1
m = 1
x = np.linspace(0, 1, num=100)
y = m * x + b

#hiding the part of the histogram below the line:
#a bin is cleared when its center lies below the line
xmesh, ymesh = np.meshgrid(xcenters, ycenters)
mask = m * xmesh + b > ymesh
histogram[mask] = 0

#making the plot; passing the bin edges lets every cell cover exactly its bin
mat = plt.pcolormesh(xbins, ybins, histogram)
line = plt.plot(x, y, 'r')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.show()
The result would be something like this:

Editing a Heatmap in Python via Scipy

I am trying to plot a heatmap and I found this code online, although I am having some difficulty using it. I am trying to do hierarchical clustering and test gene methylation of one another. I made a DataFrame using pandas where I have Betavalues and Genes as separate columns (df4). Later I converted it to a matrix, as scipy prefers. I tried running the code with my matrix, but it gave me a value error saying "ValueError: could not convert string to float: 'tAKR'". I have already removed the N/A values and anything that is not a gene or a valid Beta value.
I was wondering if you may have any suggestions?
Below I have attached a picture of what my dataframe looks like before changing into a matrix. [![enter image description here][1]][1]
import numpy as np
import pylab
import scipy
import scipy.cluster.hierarchy as sch

# DataFrame.as_matrix() was removed from pandas; .values yields the same array.
# NOTE(review): linkage() needs purely numeric input. The reported
# "could not convert string to float" suggests df4 still contains a
# string column (presumably the gene names) -- confirm and drop it first.
df5 = df4.values

# Generate random features and distance matrix.
# (NumPy is used directly; the scipy.rand / scipy.zeros aliases are deprecated.)
x = np.random.rand(40)
D = np.zeros([40, 40])
for i in range(40):
    for j in range(40):
        D[i, j] = abs(x[i] - x[j])

# Compute and plot first dendrogram.
fig = pylab.figure(figsize=(8, 8))
ax1 = fig.add_axes([0.09, 0.1, 0.2, 0.6])
Y = sch.linkage(df5, method='centroid')
Z1 = sch.dendrogram(Y, orientation='right')
ax1.set_xticks([])
ax1.set_yticks([])

# Compute and plot second dendrogram.
ax2 = fig.add_axes([0.3, 0.71, 0.6, 0.2])
Y = sch.linkage(df5, method='single')
Z2 = sch.dendrogram(Y)
ax2.set_xticks([])
ax2.set_yticks([])

# Plot distance matrix.
# NOTE(review): D is still the random 40x40 matrix from above, while the
# leaf orders come from clustering df5; the indexing below only makes
# sense if df5 has exactly 40 rows -- confirm which matrix should be shown.
axmatrix = fig.add_axes([0.3, 0.1, 0.6, 0.6])
idx1 = Z1['leaves']
idx2 = Z2['leaves']
D = D[idx1, :]
D = D[:, idx2]
im = axmatrix.matshow(D, aspect='auto', origin='lower', cmap=pylab.cm.YlGnBu)
axmatrix.set_xticks([])
axmatrix.set_yticks([])

# Plot colorbar.
axcolor = fig.add_axes([0.91, 0.1, 0.02, 0.6])
pylab.colorbar(im, cax=axcolor)
fig.show()
fig.savefig('dendrogram.png')
You may be interested in an out-of-the-box solution if you're not comfortable with this. Check out seaborn's clustermap, which accepts pandas data frames as inputs.
>>> import seaborn as sns; sns.set()
>>> flights = sns.load_dataset("flights")
>>> flights = flights.pivot("month", "year", "passengers")
>>> g = sns.clustermap(flights)
I am not affiliated with Seaborn.

Trying to visualize a sorted table with matplotlib (parallel coordinates?)

I'm trying to visualize a sorted table (sorted on a column). My ideal result should be something like
visualization of a sorted table
Any suggestion on how to reach this goal with matplotlib?
I have already tried the suggestions given here and here, but I'm looking for something fancier, like the one in the attached image.
Thanks in advance,
Matplotlib does not support this directly, but it is fairly easy to replicate the plot that you have linked to.
The function below does something similar given a 2d array of data. It can be sorted or not, the function doesn't really care.
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import numpy as np
def sorted_table_plot(data, labels, categories, cmap=None, ax=None):
    """Draw a 2-D table as coloured value bubbles joined by lines.

    Every cell of ``data`` is drawn as a circular annotation showing its
    value, coloured through ``cmap``. Cells holding the same value are then
    connected by a line of that colour, ordered by column.

    Parameters
    ----------
    data : 2-D numpy array
        Table values; rows run along y, columns along x.
    labels : iterable of str
        Tick labels for the rows (y axis).
    categories : iterable of str
        Tick labels for the columns (x axis).
    cmap : matplotlib colormap, optional
        Colormap for bubbles and lines; defaults to ``plt.cm.jet``.
    ax : matplotlib Axes, optional
        Axes to draw on; defaults to the current axes.

    Returns
    -------
    None
    """
    # check if an axes was supplied
    if ax is None:
        ax = plt.gca()
    # check if a colormap was supplied
    if cmap is None:
        cmap = plt.cm.jet
    # generate the grid arrays with the coordinates for the annotations
    yy, xx = np.mgrid[:data.shape[0], :data.shape[1]]
    x = xx.flatten()
    y = yy.flatten()
    d = data.flatten()
    # a norm object mapping the data range onto the colormap
    norm = plt.Normalize(d.min(), d.max())
    # iterate over the data points and draw the labels
    for di, xi, yi in zip(d, x, y):
        color = cmap(norm(di))
        # pick white text on dark bubbles and black text on bright ones
        hsv = mcolors.rgb_to_hsv(color[:3])
        fc = 'w' if hsv[2] < 0.7 else 'k'
        ax.annotate(str(di), xy=(xi, yi), xycoords="data",
                    va="center", ha="center", color=fc,
                    bbox=dict(boxstyle="circle", fc=color))
    # iterate over all the appearing values and draw the lines; use the
    # supplied axes and colormap so the lines match the bubbles
    for i in np.unique(d):
        xi, yi = x[d == i], y[d == i]
        idx = np.argsort(xi)
        ax.plot(xi[idx], yi[idx], color=cmap(norm(i)), lw=2)
    # add the axes labels (list() so one-shot iterables are accepted too)
    ax.set_xticks(xx[0, :])
    ax.set_xticklabels(list(categories))
    ax.set_yticks(yy[:, 0])
    ax.set_yticklabels(list(labels))
    # adjust the axes ranges; y limits are reversed so row 0 is at the top
    ax.set_xlim(xx[0, 0] - 0.5, xx[-1, -1] + 0.5)
    ax.set_ylim(yy[-1, -1] + 0.5, yy[0, 0] - 0.5)
Now, you can simply call it on a data array. In the following I created a random array, since you didn't care to supply an example data set.
# fix the seed for reproducibility
np.random.seed(2)
# create random data
data = np.tile(np.arange(1, 8), (3, 1)).T
# Build the labels eagerly. A lazy map() over the view data[:, 1] would be
# evaluated only after the in-place shuffle below on Python 3, producing
# labels in shuffled order.
labels = ['label ' + str(value) for value in data[:, 1]]
categories = ['cat ' + str(number) for number in np.arange(data.shape[1]) + 1]
for i in range(1, data.shape[1]):
    # shuffle all but the first column
    np.random.shuffle(data[:, i])
# call the function and show the plot
sorted_table_plot(data, labels, categories)
plt.show()
Result:

Categories