I want to plot scatter points corresponding to 6 different datasets over global maps of the Earth. The problem is that some of these quantities have negative values and they don't appear in the maps. I have tried to overcome this problem by taking absolute values of the data and multiplying (or taking the power of) them by some factors, but nothing seems to work the way I want. The problem is that the datasets have very different ranges. Ideally, I want them all to have the same scale so everything will be more organized, but I don't know how to do this.
I created some synthetic data to illustrate this issue
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
from mpl_toolkits.basemap import Basemap, addcyclic, shiftgrid
from matplotlib.pyplot import cm
# Fix the seed so the synthetic example is reproducible.
np.random.seed(100)
# Signed quantities: every range below spans negative and positive values.
VarReTx = np.random.uniform(low=-0.087, high=0.0798, size=(52,))
VarReTy = np.random.uniform(low=-0.076, high=0.1919, size=(52,))
VarImTx = np.random.uniform(low=-0.0331, high=0.0527, size=(52,))
VarImTy = np.random.uniform(low=-0.0311, high=0.2007, size=(52,))
# Error terms: strictly positive ranges. The second draw is bound to eTy
# (it previously overwrote eTx, leaving eTy undefined for the plotting code).
eTx = np.random.uniform(low=0.0019, high=0.0612, size=(52,))
eTy = np.random.uniform(low=0.0031, high=0.0258, size=(52,))
obslat = np.array([18.62, -65.25, -13.8, -7.95, -23.77, 51.84, 40.14, 58.07,
-12.1875, -35.32, 36.37, -46.43, 40.957, -43.474, 38.2 , 37.09,
48.17, 0.6946, 13.59, 28.32, 51., -25.88, -34.43, 21.32,
-12.05, 52.27, 36.23, -12.69, 31.42, 5.21, -22.22, 36.1,
14.38, -54.5, 43.91, 61.16, 48.27, 52.07, 54.85, 45.403,
52.971, -17.57, -51.7, 18.11, 39.55, 47.595, 22.79, -37.067,
-1.2, 32.18, 51.933, 48.52])
obslong = np.array([-287.13, -64.25, -171.78, -14.38, -226.12, -339.21, -105.24,
-321.77, -263.1664, -210.64, -233.146, -308.13, -359.667, -187.607,
-77.37, -119.72, -348.72, -287.8463, -215.13, -16.43, -4.48,
-332.29, -340.77, -158., -75.33, -255.55, -219.82, -227.53,
-229.12, -52.73, -245.9, -256.16, -16.97, -201.05, -215.81,
-45.442, -117.12, -347.32, -276.77, -75.552, -201.752, -149.58,
-57.89, -66.15, -4.35, -52.677, -354.47, -12.315, -48.5,
-110.73, -10.25, -123.42, ])
# One Robinson-projection map per dataset, laid out on a 3x2 grid.
fig, ([ax1, ax2], [ax3, ax4], [eax1, eax2]) = plt.subplots(3,2, figsize=(24,23))
matplotlib.rc('xtick', labelsize=12)
matplotlib.rc('ytick', labelsize=12)
plots = [ax1, ax2, ax3, ax4, eax1, eax2]
Vars = [VarReTx, VarReTy, VarImTx, VarImTy, eTx, eTy]
titles = [r'$\Delta$ ReTx', r'$\Delta$ ReTy', r'$\Delta$ ImTx', r'$\Delta$ ImTy', 'Error (X)', 'Error (Y)']
# One jet colour per panel; the (n, 1) reshape makes each item a (1, 4) RGBA row.
colors = iter(cm.jet(np.reshape(np.linspace(0.0, 1.0, len(plots)), ((len(plots), 1)))))
# The parallels are the same for every panel, so compute them once.
parallels = np.arange(-90, 90, 15)
for ax, values, title, c3 in zip(plots, Vars, titles, colors):
    ax.set_title(title, fontsize=48, y=1.05)
    condmap = Basemap(projection='robin', llcrnrlat=-90, urcrnrlat=90,
                      llcrnrlon=-180, urcrnrlon=180, resolution='c', lon_0=0, ax=ax)
    condmap.drawcoastlines()
    condmap.drawmapboundary(fill_color='white')
    condmap.drawparallels(parallels,labels=[False,True,True,False], fontsize=15)
    # Project the observatory coordinates into map space.
    x,y = condmap(obslong, obslat)
    # One value per observatory.
    w = np.asarray(values[:obslong.size])
    # The marker area is the raw value times 1e4, so negative values yield
    # negative sizes here.
    condmap.scatter(x, y, s = w*1e+4, c=c3)
    # Empty scatters at four evenly spaced values act as size-legend entries.
    for n in np.linspace(np.min(values), np.max(values), 4):
        condmap.scatter([], [], c=c3, s=n*1e+4, label=str(np.round(n, 4)))
    ax.legend(bbox_to_anchor=(0., -0.2, 1., .102), loc='lower left',
              ncol=4, mode="expand", borderaxespad=0., fontsize=16, frameon = False)
plt.show()
plt.close('all')
As you can see in the map, the negative data are not being displayed. I want them all to appear in the maps, and I want all the scatter plots to have the same scale in their respective ranges. Thanks!
It looks like you are trying to map your dataset to dot size. Obviously you cannot have negative size dots, so that won't work.
Instead, you need to normalize your dataset to a strictly positive range and use those normalized values for the size parameter. A simple way to do this would be to use matplotlib.colors.Normalize(vmin, vmax), which allows you to map any values in the interval [vmin, vmax] to the interval [0,1].
If you want to have a shared scale for all your datasets, first find the global min and max, and use that to instantiate your normalization, then normalize each dataset when plotting:
# NOTE(review): eTx appears twice in this list; the second entry is presumably
# meant to be the Y error array — confirm against how the data are defined.
datasets = [VarReTx,VarReTy,VarImTx,VarImTy,eTx,eTx]
# Global extremes across every dataset, so all maps share one scale.
min_val = min(d.min() for d in datasets)
max_val = max(d.max() for d in datasets)
# Maps [min_val, max_val] linearly onto [0, 1].
norm = matplotlib.colors.Normalize(vmin=min_val, vmax=max_val)
plt.scatter(x,y,s=norm(VarReTx)*100) # choose an appropriate scaling factor instead of 100 to get nicely sized dots
Related
I would like to see both the density and frequency on my histogram. For example, display density on the left side and frequency on the right side.
Here is my code:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
x = [6.950915827194559, 0.5704464713012669, -1.655326152372283, 5.867122206816244, -1.809359944941513, -6.164821482653027, -2.538999462076397, 0.2108693568484643, -8.740600769897465, 2.121232876712331, 7.967032967032961, 10.61701196601832, 1.847419201771516, 0.6858006670780847, -2.008695652173909, 2.86991153132885, 1.703131050506168, -1.346913193356314, 3.334927671049193, -15.64688995215311, 20.00022688856367, 10.05956454173731, 2.044936877124148, 3.06513409961684, -0.9973614775725559, 1.190631873030967, -1.509991311902692, -0.3333827233664155, 1.898473282442747, 1.618299899267539, -0.1897860593512823, 1.000000000000001, 3.03501945525293, -7.646697418593529, -0.9769069279216391, -2.918403811792736, -3.90929422276739, 9.609846259653532, 3.240690674452962, 10.08973134408675, 1.98356309650054, 1.915301127899549, -0.7792207792207684, -3.308682400714091, -3.312977099236647, 19.98101265822785, 3.661973444534827, -5.770676691729326, 0.5268044012063156, -1.573767040370533, 3.234974862888484, -1.514352732634994, 6.564849624060143, 9.956794019127146, 3.232590278195024, 2.042007001166857, 1.601164483260553, -2.384737678855331, -2.731242556570068, 0.6069707315088602, 1.40561881957264, -6.805306861851957, 2.492102492102499, -3.639688275501762, 0.7958485384154335, 2.799187725631769, 0.9195966872689088, -2.366608280379856, 0.797679477882518, -3.80380434782609]
df = pd.DataFrame(x, columns=["Returns"])
def plot_histogram():
    """Draw a density histogram of ``df.Returns`` with unit-wide bins.

    Reads the module-level DataFrame ``df``. Values are clipped to the
    outermost bin edges, so the first and last bins collect everything
    outside the range; those two edges are labelled "Under" and "Over".
    """
    bins = range(-11, 12, 1)
    bins_str = [str(i) + "%" for i in bins]
    fig, ax = plt.subplots(figsize=(9, 5))
    _, bins, patches = plt.hist(np.clip(df.Returns, bins[0], bins[-1]),
                                bins=bins, density=True, rwidth=0.8)
    # Relabel the two extreme edges to show they hold the clipped overflow.
    xlabels = bins_str[:]
    xlabels[-1] = "Over"
    xlabels[0] = "Under"
    plt.xlim([bins[0], bins[-1]])
    plt.xticks(bins)
    ax.set_xticklabels(xlabels)
    plt.title("Returns distribution")
    plt.grid(axis="y", linewidth=0.5)
plot_histogram()
I tried adding density=True in plt.hist() but it removes the count from the histogram. Is it possible to display both the frequency and density on the same histogram?
A density plot sets the heights of the bars such that the area of all the bars (taking rwidth=1 for that calculation) sums to 1. As such, the bar heights of a counting histogram get divided by (the number of values times the bar widths).
With that conversion factor, you can recalculate the counts from the density (or vice versa). The recalculation can be used to label the bars and/or set a secondary y-axis. Note that the ticks of both y axes are aligned, so the grid only works well for one of them. (A secondary y-axis is a bit different from ax.twinx(), as the former has a fixed conversion between both y axes).
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
x = [6.950915827194559, 0.5704464713012669, -1.655326152372283, 5.867122206816244, -1.809359944941513, -6.164821482653027, -2.538999462076397, 0.2108693568484643, -8.740600769897465, 2.121232876712331, 7.967032967032961, 10.61701196601832, 1.847419201771516, 0.6858006670780847, -2.008695652173909, 2.86991153132885, 1.703131050506168, -1.346913193356314, 3.334927671049193, -15.64688995215311, 20.00022688856367, 10.05956454173731, 2.044936877124148, 3.06513409961684, -0.9973614775725559, 1.190631873030967, -1.509991311902692, -0.3333827233664155, 1.898473282442747, 1.618299899267539, -0.1897860593512823, 1.000000000000001, 3.03501945525293, -7.646697418593529, -0.9769069279216391, -2.918403811792736, -3.90929422276739, 9.609846259653532, 3.240690674452962, 10.08973134408675, 1.98356309650054, 1.915301127899549, -0.7792207792207684, -3.308682400714091, -3.312977099236647, 19.98101265822785, 3.661973444534827, -5.770676691729326, 0.5268044012063156, -1.573767040370533, 3.234974862888484, -1.514352732634994, 6.564849624060143, 9.956794019127146, 3.232590278195024, 2.042007001166857, 1.601164483260553, -2.384737678855331, -2.731242556570068, 0.6069707315088602, 1.40561881957264, -6.805306861851957, 2.492102492102499, -3.639688275501762, 0.7958485384154335, 2.799187725631769, 0.9195966872689088, -2.366608280379856, 0.797679477882518, -3.80380434782609]
# Wrap the raw returns in a one-column DataFrame.
df = pd.DataFrame(x, columns=["Returns"])
# Integer bin edges from -11 to 11, plus a "<edge>%" label for every edge.
bins = range(-11, 12, 1)
bins_str = [str(i) + "%" for i in bins]
fig, ax = plt.subplots(figsize=(9, 5))
# Clip to the outermost edges so out-of-range returns land in the first/last bin.
values, bins, patches = ax.hist(np.clip(df["Returns"], bins[0], bins[-1]),
bins=bins, density=True, rwidth=0.8)
# conversion between counts and density: number of values times bin width
factor = len(df) * (bins[1] - bins[0])
# Label every non-empty bar with its count, recovered from the density.
ax.bar_label(patches, ['' if v == 0 else f'{v * factor:.0f}' for v in values])
# The two extreme edges are relabelled because they hold the clipped values.
xlabels = bins_str[:]
xlabels[-1] = "Over"
xlabels[0] = "Under"
ax.set_xlim([bins[0], bins[-1]])
ax.set_xticks(bins, xlabels)
ax.set_title("Returns distribution")
ax.grid(axis="y", linewidth=0.5)
# Right-hand axis shows counts: density * factor forward, count / factor back.
secax = ax.secondary_yaxis('right', functions=(lambda y: y * factor, lambda y: y / factor))
secax.set_ylabel('counts')
ax.set_ylabel('density')
plt.show()
To have the same grid positions for both y-axes, you can copy the ticks of one and convert them to set them at the other. For the ticks to be calculated, the plot needs to be drawn once (at the end of the code). Note that the converted values are only shown with a limited number of digits.
# Draw once so that the tick positions of the secondary axis are computed.
fig.canvas.draw()
# Place the density ticks exactly where the count ticks fall (count / factor).
ax.set_yticks(secax.get_yticks() / factor)
plt.show()
I have the following synthetic dataframe, including numerical and categorical columns as well as the label column.
I want to plot a diagonal correlation matrix and display correlation coefficients in the upper part as the following:
expected output:
Although the categorical columns in the synthetic dataset/dataframe df need to be converted to numerical ones, so far I have used this seaborn example with the 'titanic' dataset, which is synthetic and fits my task, but I added a label column as follows:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_theme(style="white")
# Generate a large random dataset with synthetic nature (categorical + numerical)
data = sns.load_dataset("titanic")
df = pd.DataFrame(data=data)
# Generate label column randomly '0' or '1'
df['label'] = np.random.randint(0,2, size=len(df))
# Compute the correlation matrix over the numeric and boolean columns only;
# selecting them explicitly avoids the error that recent pandas versions raise
# when corr() meets string/categorical columns.
corr = df.select_dtypes(include=["number", "bool"]).corr()
# Generate a mask for the upper triangle
mask = np.triu(np.ones_like(corr, dtype=bool))
# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))
# Generate a custom diverging colormap
cmap = sns.diverging_palette(230, 20, as_cmap=True)
# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, mask=mask, cmap=cmap, vmin=-1.0, vmax=1.0, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})
I checked a related post but couldn't figure it out to do this task. The best I could find so far is this workaround which can be installed using this package that gives me the following output:
#!pip install heatmapz
# Import the two methods from heatmap library
from heatmap import heatmap, corrplot
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_theme(style="white")
# Generate a large random dataset
data = sns.load_dataset("titanic")
df = pd.DataFrame(data=data)
# Generate label column randomly '0' or '1'
df['label'] = np.random.randint(0,2, size=len(df))
# Compute the correlation matrix
# NOTE(review): recent pandas versions reportedly raise here when the frame
# still holds non-numeric columns — confirm against the pandas version in use.
corr = df.corr()
# Generate a mask for the upper triangle
mask = np.triu(np.ones_like(corr, dtype=bool))
# The next two lines first clear and then set the diagonal again, so the net
# mask is still the upper triangle including the diagonal.
mask[np.diag_indices_from(mask)] = False
np.fill_diagonal(mask, True)
# Set up the matplotlib figure
plt.figure(figsize=(8, 8))
# Draw the heatmap using "Heatmapz" package
# corr[mask] is passed in place of a separate mask argument.
corrplot(corr[mask], size_scale=300)
Sadly, corr[mask] doesn't mask the upper triangle in this package.
I also noticed that in R, reaching this fancy plot is much easier, so I'm open if there is a more straightforward way to convert Python Pandas dataFrame to R dataframe since it seems there is a package, so-called rpy2 that we could use Python & R together even in Google Colab notebook: Ref.1
from rpy2.robjects import pandas2ri
pandas2ri.activate()
So if that is the case, I found post1 & post2, which use R for the visualization of a correlation matrix.
So, in short, my 1st priority is using Python and its packages Matplotlib, seaborn, Plotly Express, and then R and its packages to reach the expected output.
Note
I provided you with executable code in google Colab notebook with R using dataset so that you can form/test your final answer if your solution is by rpy2 otherwise I'd be interested in a Pythonic solution.
I'm not an expert in rpy2, so I can't help there, but here is how I would build it out in R. Since I don't have your data, I can't promise that everything will work perfectly for your dataset, but here is a general outline:
# Correlation plot with sized squares in one triangle and the coefficients
# as text in the other, built with ggplot2.
library(tidyverse)
# get some data (mtcars, with the columns reordered by column name)
df <- as_tibble(mtcars) |>
(\(d) select(d, order(colnames(d))))()
# calculate correlation matrix
cor_mat <- cor(df)
# make 2 "blank" (all-NA) matrices with the same shape as the correlation matrix
low <- matrix(NA, nrow = nrow(cor_mat), ncol = ncol(cor_mat))
up <- matrix(NA, nrow = nrow(cor_mat), ncol = ncol(cor_mat))
# populate upper and lower matrices; every other cell stays NA
up[upper.tri(up)] <- cor_mat[upper.tri(cor_mat)]
low[lower.tri(low)] <- cor_mat[lower.tri(cor_mat)]
# pivot upper and lower to long format (xvar, yvar, value) for plotting
lower_dat <- low|>
as.data.frame() |>
`colnames<-`(colnames(df)) |>
mutate(xvar = colnames(df)) |>
pivot_longer(cols = -xvar, names_to = "yvar")
upper_dat <- up|>
as.data.frame() |>
`colnames<-`(colnames(df)) |>
mutate(xvar = colnames(df)) |>
pivot_longer(cols = -xvar, names_to = "yvar")
# plot
lower_dat|> # lower matrix data
ggplot(aes((xvar), yvar))+
geom_tile(fill = NA, color = "grey")+ # background grid
geom_point(aes(fill = value, size = value), pch = 22)+ # different sized points
geom_text(data = upper_dat, aes(color = value, label = round(value, 2)))+ # plot cor in upper right
scale_size_continuous(breaks = seq(-1, 1, by = 0.5))+ # define size breaks
labs(x = "", y = "")+ # remove unnecessary labels
scale_fill_gradient2(low = "darkred",mid = "white", high = "darkblue", midpoint = 0)+ # define square colors
scale_color_gradient2(low = "darkred",mid = "white", high = "darkblue", midpoint = 0)+ # define text colors
scale_x_discrete(limits = rev)+ # rev to put the triangle on a certain side
# make it look pretty
theme(panel.background = element_blank(),
panel.border = element_rect(fill = NA, color = "black"),
axis.text = element_text(color = "black", size = 10),
axis.title = element_text(size = 12))
Another option is creating two corrplots from the corrplot package in R. You can specify one plot with add=TRUE to combine both plots. Here is a reproducible example with mtcars dataset:
library(corrplot)
# correlation matrix of mtcars
M<-cor(mtcars)
# zero the diagonal (the self-correlations)
diag(M) <- 0
# first layer: numbers in the upper triangle
corrplot(M, method="number", type = "upper", tl.pos = "t")
# second layer: squares in the lower triangle, added onto the first plot
corrplot(M, method="square", type = "lower", tl.pos = "l", cl.pos = "n", add = TRUE)
Output:
I'd be interested in a Pythonic solution.
Use a seaborn scatter plot with matplotlib text/line annotations:
Plot the lower triangle via sns.scatterplot with square markers
Annotate the upper triangle via plt.text
Draw the heatmap grid via plt.vlines and plt.hlines
Full code using the titanic sample:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme(style="white")
# generate sample correlation matrix
df = sns.load_dataset("titanic")
df["label"] = np.random.randint(0, 2, size=len(df))
# Correlate the numeric and boolean columns only; selecting them explicitly
# avoids the error that recent pandas versions raise when corr() meets
# string/categorical columns.
corr = df.select_dtypes(include=["number", "bool"]).corr()
# mask and melt correlation matrix
# (hide the lower triangle incl. diagonal and every |r| <= 0.1)
mask = np.tril(np.ones_like(corr, dtype=bool)) | corr.abs().le(0.1)
masked = corr.mask(mask)
melt = masked.melt(ignore_index=False).reset_index()
melt["size"] = melt["value"].abs()
fig, ax = plt.subplots(figsize=(8, 6))
# normalize colorbar
cmap = plt.cm.RdBu
norm = plt.Normalize(-1, 1)
sm = plt.cm.ScalarMappable(norm=norm, cmap=cmap)
cbar = plt.colorbar(sm, ax=ax)
cbar.ax.tick_params(labelsize="x-small")
# plot lower triangle (scatter plot with normalized hue and square markers)
sns.scatterplot(ax=ax, data=melt, x="index", y="variable", size="size",
                hue="value", hue_norm=norm, palette=cmap,
                style=0, markers=["s"], legend=False)
# format grid
xmin, xmax = (-0.5, corr.shape[0] - 0.5)
ymin, ymax = (-0.5, corr.shape[1] - 0.5)
ax.vlines(np.arange(xmin, xmax + 1), ymin, ymax, lw=1, color="silver")
ax.hlines(np.arange(ymin, ymax + 1), xmin, xmax, lw=1, color="silver")
ax.set(aspect=1, xlim=(xmin, xmax), ylim=(ymax, ymin), xlabel="", ylabel="")
ax.tick_params(labelbottom=False, labeltop=True)
plt.xticks(rotation=90)
# annotate upper triangle
# (the masked matrix is converted to an array once, not once per cell)
masked_values = masked.to_numpy()
for y in range(corr.shape[0]):
    for x in range(corr.shape[1]):
        value = masked_values[y, x]
        if pd.notna(value):
            plt.text(x, y, f"{value:.2f}", size="x-small",
                     # color=sm.to_rgba(value), weight="bold",
                     ha="center", va="center")
Note that since most of these titanic correlations are low, I disabled the text coloring for readability.
If you want color-coded text, uncomment the color=sm.to_rgba(value) line at the end:
I cannot setup heatmap package in Windows, but have you tried to set upper diagonal elements to nan?
# Build a copy of the correlation matrix with NaN wherever the mask is True,
# then hand that copy to corrplot.
corr_masked = corr.mask(mask)
corrplot(corr_masked, size_scale=300)
plt.plot for example does not plot nan samples, so the same trick may work here. If not, just setting the UD elements to 0 may suffice (or whatever color corresponds to the white on the scale).
I am using scipy.signal to calculate the width of the different peaks. I have 100 values wrt to different time points. I am using following code to calculate the peak, then width. The problem is it is not considering the time on x axis while calculating the width.
# Find the peak indices in x_control, using height=2100 as the height condition.
peaks_control, _ = find_peaks(x_control, height=2100)
# First 100 entries of the time axis.
time_control = time[:100]
# Widths at rel_height=0.9; only the signal and the peak indices are passed in,
# the time axis is not.
width_control = peak_widths(x_control, peaks_control, rel_height=0.9)
The output of width_control is
(array([12.84785714, 13.21299534, 13.4502381 , 12.71311143]),
array([2042.5, 2048.8, 2057.4, 2065. ]),
array([ 5.795 ,28.29469697, 51.245 , 74.17150396]),
array([18.64285714, 41.50769231, 64.6952381 , 86.88461538]))
I am using following to use time on x axis and show the signals, which is correct
# Signal against the time axis, with the detected peaks marked by crosses.
plt.plot(time_control, x_control)
plt.plot(time_control[peaks_control], x_control[peaks_control], "x")
#plt.plot(np.zeros_like(x_control), "--", color="gray")
#plt.xlim(time_control.tolist())
plt.title('Control')
plt.xlabel('Time (s)')
plt.ylabel('RFU')
plt.show()
I am using following code to show the width also, but not able to put the actual time on x axis.
# No x values are passed here, so the signal and peaks are drawn against the
# sample index rather than time_control.
plt.plot(x_control)
plt.plot(peaks_control, x_control[peaks_control], "x")
# Unpacks the last three arrays of width_control as the hlines arguments
# (y, xmin, xmax).
plt.hlines(*width_control[1:], color="C3")
plt.title('Control')
plt.xlabel('Time (s)')
plt.ylabel('RFU')
plt.show()
I had the same problem just now, so here's my solution (there are probably more elegant solutions but this worked for me):
peak_widths() returns the widths (in samples), height at which the widths were calculated, and the interpolated positions of left and right intersection points of a horizontal line at the respective evaluation height (also in samples).
To convert those values from samples back to our x-axis we can use scipy.interpolate.interp1d():
from scipy.signal import find_peaks, peak_widths
from scipy.interpolate import interp1d
import matplotlib.pyplot as plt
import numpy as np
def index_to_xdata(xdata, indices):
    """Interpolate fractional sample indices onto the values of ``xdata``.

    Args:
        xdata: 1-D sequence of x coordinates, one per sample.
        indices: sample positions (may be fractional), e.g. the left/right
            intersection points returned by ``scipy.signal.peak_widths``.

    Returns:
        Array of x coordinates, linearly interpolated between the
        neighbouring samples of ``xdata``.
    """
    ind = np.arange(len(xdata))
    f = interp1d(ind, xdata)
    return f(indices)
x = np.linspace(0, 1, 10)
y = np.sin(4*x-0.2)
peaks, _ = find_peaks(y)
# peak_widths reports everything in units of samples.
widths, width_heights, left_ips, right_ips = peak_widths(y, peaks)
# Convert the intersection positions from sample indices to x coordinates.
left_ips = index_to_xdata(x, left_ips)
right_ips = index_to_xdata(x, right_ips)
# A width is a length, not a position, so it must not be interpolated like an
# index (that only works when x starts at 0); take the difference of the
# converted end points instead.
widths = right_ips - left_ips
plt.plot(x,y)
plt.plot(x[peaks], y[peaks], "x")
plt.hlines(width_heights, left_ips, right_ips, color='r')
plt.xlabel('x values')
plt.ylabel('y values')
plt.show()
image of plot
I am preparing a graph of latency percentile results. This is my pd.DataFrame looks like:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
result = pd.DataFrame(np.random.randint(133000, size=(5,3)), columns=list('ABC'), index=[99.0, 99.9, 99.99, 99.999, 99.9999])
I am using this function (commented lines are different pyplot methods I have already tried to achieve my goal):
def plot_latency_time_bar(result):
    """Bar-plot the per-row mean latency with standard-deviation error bars.

    Args:
        result: DataFrame whose index holds the percentiles and whose first
            three columns hold the measurements; each value is floor-divided
            by 1000 before the mean and standard deviation are taken.

    The bars are placed at the numeric index values themselves.
    """
    ind = np.arange(4)  # only referenced by the commented-out xticks attempt below
    means = []
    stds = []
    for index, row in result.iterrows():
        # Positional access via .iloc: plain row[0] on a label-indexed Series
        # is deprecated in recent pandas.
        samples = [row.iloc[0]//1000, row.iloc[1]//1000, row.iloc[2]//1000]
        means.append(np.mean(samples))
        stds.append(np.std(samples))
    plt.bar(result.index.values, means, 0.2, yerr=stds, align='center')
    plt.xlabel('Percentile')
    plt.ylabel('Latency')
    plt.xticks(result.index.values)
    # plt.xticks(ind, ('99.0', '99.9', '99.99', '99.999', '99.99999'))
    # plt.autoscale(enable=False, axis='x', tight=False)
    # plt.axis('auto')
    # plt.margins(0.8, 0)
    # plt.semilogx(basex=5)
    plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
    fig = plt.gcf()
    fig.set_size_inches(15.5, 10.5)
And here is the figure:
As you can see bars for all percentiles above 99.0 overlaps and are completely unreadable. I would like to set some fixed space between ticks to have a same space between all of them.
Since you're using pandas, you can do all this from within that library:
# `result` is the DataFrame from the question (percentiles as the index).
# Row-wise mean and standard deviation, divided by 1000.
means = result.mean(axis=1)/1000
stds = result.std(axis=1)/1000
means.plot.bar(yerr=stds, fc='b')
# Make some room for the x-axis tick labels
plt.subplots_adjust(bottom=0.2)
plt.show()
Not wishing to take anything away from xnx's answer (which is the most elegant way to do things given that you're working in pandas, and therefore likely the best answer for you) but the key insight you're missing is that, in matplotlib, the x positions of the data you're plotting and the x tick labels are independent things. If you say:
nominalX = np.arange( 1, 6 ) ** 2
y = np.arange( 1, 6 ) ** 4
positionalX = np.arange(len(y))
# align='edge' makes each bar start at its x position, so that adding half the
# bar width to the tick positions (below) puts every tick mid-bar.
plt.bar( positionalX, y, align='edge' ) # graph y against the positions 0..n-1
plt.gca().set(xticks=positionalX + 0.4, xticklabels=nominalX) # ...but superficially label the X values as something else
then that's different from tying positions to your nominal X values:
plt.bar( nominalX, y )
Note that I added 0.4 to the x position of the ticks, because that's half the default width of the bars bar( ..., width=0.8 )—so the ticks end up in the middle of the bar.
I'm trying to visualize a sorted table (sorted on a column). My ideal result should be something like
visualization of a sorted table
Any suggestion on how to reach this goal with matplotlib?
I have already tried the suggestions given here and here, but I'm looking for something fancier, like the one in the attached image.
Thanks in advance,
Matplotlib does not support this directly, but it is fairly easy to replicate the plot that you have linked to.
The function below does something similar given a 2d array of data. It can be sorted or not, the function doesn't really care.
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import numpy as np
def sorted_table_plot(data, labels, categories, cmap=None, ax=None):
    """Draw a table of values as circled labels, linking equal values by lines.

    Args:
        data: 2-D array; every cell is drawn at (column, row) as a circled
            text label coloured by its value.
        labels: tick labels for the rows (y axis), one per row of ``data``.
        categories: tick labels for the columns (x axis), one per column.
        cmap: colormap used for circles and lines; defaults to ``plt.cm.jet``.
        ax: axes to draw into; defaults to the current axes.
    """
    # check if an axes was supplied
    if ax is None:
        ax = plt.gca()
    # check if a colormap was supplied
    if cmap is None:
        cmap = plt.cm.jet
    # generate the grid arrays with the coordinates for the annotations
    yy, xx = np.mgrid[:data.shape[0], :data.shape[1]]
    x = xx.flatten()
    y = yy.flatten()
    d = data.flatten()
    # a norm object mapping the data range onto the colormap
    norm = plt.Normalize(d.min(), d.max())
    # iterate over the data points and draw the labels
    for di, xi, yi in zip(d, x, y):
        color = cmap(norm(di))
        # white text on dark circles, black text on bright ones
        hsv = mcolors.rgb_to_hsv(color[:3])
        fc = 'w' if hsv[2] < 0.7 else 'k'
        ax.annotate(str(di), xy=(xi,yi), xycoords="data",
                    va="center", ha="center", color=fc,
                    bbox=dict(boxstyle="circle", fc=color))
    # iterate over all the appearing values and draw the lines
    for i in np.unique(data):
        xi, yi = x[d==i], y[d==i]
        idx = np.argsort(xi)
        # draw on the requested axes with the requested colormap
        ax.plot(xi[idx], yi[idx], color=cmap(norm(i)), lw=2)
    # add the axes labels
    ax.set_xticks(xx[0,:])
    ax.set_xticklabels(categories)
    ax.set_yticks(yy[:,0])
    ax.set_yticklabels(labels)
    # adjust the axes ranges (y axis inverted so the first row is on top)
    ax.set_xlim(xx[0,0] - 0.5, xx[-1,-1] + 0.5)
    ax.set_ylim(yy[-1,-1] + 0.5, yy[0,0] - 0.5)
Now, you can simply call it on a data array. In the following I created a random array, since you didn't care to supply an example data set.
# fix the seed for reproducibility
np.random.seed(2)
# create random data
data = np.tile(np.arange(1,8), (3,1)).T
# Build the tick labels eagerly as lists: a lazy map() would only be evaluated
# after the shuffle below, and tick-label setters expect a sized sequence.
# The first column is never shuffled, so it provides the row labels.
labels = ['label ' + str(v) for v in data[:,0]]
categories = ['cat ' + str(c) for c in np.arange(data.shape[1])+1]
for i in range(1,data.shape[1]):
    # shuffle all but the first column
    np.random.shuffle(data[:,i])
# call the function and show the plot
sorted_table_plot(data, labels, categories)
plt.show()
Result: