Adding labels to seaborn bars - python

I'm trying to create two, vertically aligned, horizontal grouped bar charts. I have a huge amount of data for several Machine Learning models and their corresponding runtimes and would like to display all this data in a meaningful way. My attempt so far looks as follows:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
labels = ['MLP','FCN','ResNet','ROCKET','1-NN DTW','LightGBM','XGBoost','CatBoost']
Data1_Accuracy = [20, 34, 30, 35, 27,77.83125,78.7204167,78.5354167]
Data2_Accuracy = [20, 34, 30, 35, 27,75.7979167,76.2520833,77.87]
Data3_Accuracy = [20, 34, 30, 35, 27,80.14625,81.5033333,81.4625]
Data4_Accuracy = [20, 34, 30, 35, 27,78.3841667,79.34875,80.5270833]
Data5_Accuracy = [20, 34, 30, 35, 27,79.2495833,77.5370833,79.2666667]
Data6_Accuracy = [20, 34, 30, 35, 27,77.03125,77.2429167,77.9960275]
Data7_Accuracy = [20, 34, 30, 35, 27,81.3241667,80.5408333,84.2083333]
Data8_Accuracy = [20, 34, 30, 35, 27,78.1470833,78.1225,80.2754167]
Data9_Accuracy = [20, 34, 30, 35, 27,80.7383333,79.9358333,79.6916667]
Data10_Accuracy = [20, 34, 30, 35, 27,74.1095833,73.0879167,73.0529167]
Data11_Accuracy = [20, 34, 30, 35, 27,78.4775,77.8658333,78.35]
Data12_Accuracy = [20, 34, 30, 35, 27,73.0991667,71.9683333,72.75625]
Data13_Accuracy = [20, 34, 30, 35, 27,79.03,79.575,80.3870833]
Data14_Accuracy = [20, 34, 30, 35, 27,81.0241667,81.455,80.5516667]
Data15_Accuracy = [20, 34, 30, 35, 27,79.4829167,80.01375,81.68]
Data16_Accuracy = [20, 34, 30, 35, 27,81.1158333,80.9795833,80.6541667]
Data1_Times = [20, 34, 30, 35, 27,829.0177925,58.6558111,8493.968922]
Data2_Times = [20, 34, 30, 35, 27,604.5935536,64.3871907,6833.585728]
Data3_Times = [20, 34, 30, 35, 27,1286.01507,92.4329714,6821.308612]
Data4_Times = [20, 34, 30, 35, 27,757.3903304,78.7253731,5455.483287]
Data5_Times = [20, 34, 30, 35, 27,401.3722335,30.4119882,5160.041989]
Data6_Times = [20, 34, 30, 35, 27,321.4673242,54.1971346,4465.557807]
Data7_Times = [20, 34, 30, 35, 27,2598.48826,193.1256487,10811.65574]
Data8_Times = [20, 34, 30, 35, 27,1545.059628,139.9638344,7784.332016]
Data9_Times = [20, 34, 30, 35, 27,663.416329,615.3660963,3560.337827]
Data10_Times = [20, 34, 30, 35, 27,670.1615828,621.8249994,3567.653313]
Data11_Times = [20, 34, 30, 35, 27,619.1959161,572.3292757,3493.582855]
Data12_Times = [20, 34, 30, 35, 27,626.107683,579.0746278,3528.605614]
Data13_Times = [20, 34, 30, 35, 27,2936.5633,2631.284413,6465.254111]
Data14_Times = [20, 34, 30, 35, 27,2967.02757,2672.068268,6551.57865]
Data15_Times = [20, 34, 30, 35, 27,4102.511475,3711.899848,7704.401239]
Data16_Times = [20, 34, 30, 35, 27,4075.485739,3726.896591,7737.482708]
Data1_TimesInHours = np.array(Data1_Times) / 3600
Data2_TimesInHours = np.array(Data2_Times) / 3600
Data3_TimesInHours = np.array(Data3_Times) / 3600
Data4_TimesInHours = np.array(Data4_Times) / 3600
Data5_TimesInHours = np.array(Data5_Times) / 3600
Data6_TimesInHours = np.array(Data6_Times) / 3600
Data7_TimesInHours = np.array(Data7_Times) / 3600
Data8_TimesInHours = np.array(Data8_Times) / 3600
Data9_TimesInHours = np.array(Data9_Times) / 3600
Data10_TimesInHours = np.array(Data10_Times) / 3600
Data11_TimesInHours = np.array(Data11_Times) / 3600
Data12_TimesInHours = np.array(Data12_Times) / 3600
Data13_TimesInHours = np.array(Data13_Times) / 3600
Data14_TimesInHours = np.array(Data14_Times) / 3600
Data15_TimesInHours = np.array(Data15_Times) / 3600
Data16_TimesInHours = np.array(Data16_Times) / 3600
# Wide table: one row per model, one column per dataset's accuracy.
# Each dict key must match its entry in `columns` exactly; a key that is
# not listed in `columns` is dropped and the listed name becomes an all-NaN
# column (this is what the former 'Data12_Accuracy)' key caused).
accuraciesDataFrame = pd.DataFrame(
    {
        'Index': labels,
        'Data1_Accuracy': Data1_Accuracy,
        'Data2_Accuracy': Data2_Accuracy,
        'Data3_Accuracy': Data3_Accuracy,
        'Data4_Accuracy': Data4_Accuracy,
        'Data5_Accuracy': Data5_Accuracy,
        'Data6_Accuracy': Data6_Accuracy,
        'Data7_Accuracy': Data7_Accuracy,
        'Data8_Accuracy': Data8_Accuracy,
        'Data9_Accuracy': Data9_Accuracy,
        'Data10_Accuracy': Data10_Accuracy,
        'Data11_Accuracy': Data11_Accuracy,
        'Data12_Accuracy': Data12_Accuracy,
        'Data13_Accuracy': Data13_Accuracy,
        'Data14_Accuracy': Data14_Accuracy,
        'Data15_Accuracy': Data15_Accuracy,
        'Data16_Accuracy': Data16_Accuracy,
    },
    columns=[
        'Index', 'Data1_Accuracy', 'Data2_Accuracy', 'Data3_Accuracy', 'Data4_Accuracy',
        'Data5_Accuracy', 'Data6_Accuracy', 'Data7_Accuracy', 'Data8_Accuracy',
        'Data9_Accuracy', 'Data10_Accuracy', 'Data11_Accuracy', 'Data12_Accuracy',
        'Data13_Accuracy', 'Data14_Accuracy', 'Data15_Accuracy', 'Data16_Accuracy',
    ],
)
timesDataFrame = pd.DataFrame({'Index': labels,
'Data1_TimesInHours': Data1_TimesInHours,
'Data2_TimesInHours': Data2_TimesInHours,
'Data3_TimesInHours': Data3_TimesInHours,
'Data4_TimesInHours': Data4_TimesInHours,
'Data5_TimesInHours': Data5_TimesInHours,
'Data6_TimesInHours': Data6_TimesInHours,
'Data7_TimesInHours': Data7_TimesInHours,
'Data8_TimesInHours': Data8_TimesInHours,
'Data9_TimesInHours': Data9_TimesInHours,
'Data10_TimesInHours': Data10_TimesInHours,
'Data11_TimesInHours': Data11_TimesInHours,
'Data12_TimesInHours': Data12_TimesInHours,
'Data13_TimesInHours': Data13_TimesInHours,
'Data14_TimesInHours': Data14_TimesInHours,
'Data15_TimesInHours': Data15_TimesInHours,
'Data16_TimesInHours': Data16_TimesInHours},
columns = [
'Index','Data1_TimesInHours','Data2_TimesInHours','Data3_TimesInHours','Data4_TimesInHours',
'Data5_TimesInHours','Data6_TimesInHours','Data7_TimesInHours','Data8_TimesInHours','Data9_TimesInHours','Data10_TimesInHours',
'Data11_TimesInHours','Data12_TimesInHours','Data13_TimesInHours','Data14_TimesInHours','Data15_TimesInHours','Data16_TimesInHours'
])
accuraciesDataFrameMelted = pd.melt(accuraciesDataFrame, id_vars=['Index'])
timesDataFrameMelted = pd.melt(timesDataFrame, id_vars=['Index'])
fig, axs = plt.subplots(1,2)
fig.set_size_inches(30,10)
xRangeFirstChart = list(range(0,101))
fig.suptitle('Rounded accuracies (%) and times for training and evaluation (h) for different data types and models',fontsize=26)
g1 = sns.barplot(x='value', y='Index', hue='variable', data=accuraciesDataFrameMelted, ax=axs[0])
axs[0].set_xlim([xRangeFirstChart[0],xRangeFirstChart[-1]])
axs[0].set_ylabel('Model',fontsize=24)
axs[0].set_xlabel('Rounded Accuracy (%)',fontsize=24)
axs[0].set_title('Rounded accuracies (%) for different data types and models',fontsize=22)
g2 = sns.barplot(x='value', y='Index', hue='variable', data=timesDataFrameMelted, ax=axs[1])
axs[0].get_legend().remove()
axs[1].get_legend().remove()
axs[1].get_yaxis().set_visible(False)
axs[1].set_xlabel('Training and evaluation time (h)',fontsize=24)
axs[1].set_title('Rounded training and evaluation time (h) for different data types and models',fontsize=22)
plt.savefig('PathToFigure/MyFigure.png', dpi=300, bbox_inches='tight', pad_inches=0)
What I'm missing is a way to write the labels "Data 1", "Data 2", "Data 3", etc. in every bar. Please refer to the image for a visualization of what I'm trying to achieve. Any help is highly appreciated!

Since there are so many bars in one graph, I would use sns.catplot to draw the different categories into a FacetGrid, which makes adding labels much easier. You can add them with the custom function add_labels (please note the different parameters -- feel free to remove some or add others; I have adapted it from this solution).
You could also make the x-axis more variable if you pass sharex=False when creating the catplots (see end of this solution)
Also, sns.catplot doesn't work well with adding to subplots, so that you can save as one figure. This is why I use plt.close(fig) to get rid of the blank figure we created, and this would also mean adding any formatting (such as adding a title) to that figure would be pointless, since we are getting rid of the figure at the end; however, there are hacks. One is to save as separate figures and use a solution from here: to combine into one .pdf. I think it would be better to have the extra space of one graph per page or image. Another option is to use somewhat of a hack to get into one figure:
fig, ax = plt.subplots(nrows=2)
sns.set_context('paper', font_scale=1.4)
plt.style.use('dark_background')
n_cols=4 #this is used later in a couple of places to make dynamic
g1 = sns.catplot(data=accuraciesDataFrameMelted, x='value', y='variable', col='Index', kind='bar',
col_wrap=n_cols, ax=ax[0])
g1.fig.suptitle('Rounded accuracies (%) for different data types and models',fontsize=22)
plt.subplots_adjust(top=0.9, bottom=-0.5)
g2 = sns.catplot(data=timesDataFrameMelted, x='value', y='variable', col='Index', kind='bar',
col_wrap=n_cols, ax=ax[1])
g2.fig.suptitle('Rounded training and evaluation time (h) for different data types and models',fontsize=22)
plt.subplots_adjust(top=0.9, bottom=-0.5)
def add_labels(graph, category_size, axis_number, omit_thresh, width_var, num_format):
    """Write each horizontal bar's length as a text label on the bar.

    Parameters
    ----------
    graph : object exposing ``facet_axis(row, col)``
        In this snippet, the grid returned by ``sns.catplot``.
    category_size : int
        Number of facets to label; facets ``0 .. category_size - 1`` are visited.
    axis_number : int
        Passed unchanged as the first argument of ``graph.facet_axis``.
    omit_thresh : float
        Bars whose width is not strictly greater than this get no label
        (avoids cramming text into very short bars).
    width_var : float
        Fraction of the bar length at which the label is centred
        (0.5 puts it in the middle of the bar).
    num_format : str
        ``str.format`` template applied to the bar length, e.g. ``'{:1.2f}'``.
    """
    for i in range(category_size):
        ax = graph.facet_axis(axis_number, i)
        for p in ax.patches:
            bar_length = p.get_width()
            if bar_length > omit_thresh:
                ax.text(bar_length * width_var,               # x: fraction of the bar length
                        p.get_y() + p.get_height() / 2,       # y: vertical middle of the bar
                        num_format.format(bar_length),        # label text: formatted bar length
                        ha='center',                          # horizontal alignment
                        va='center')                          # vertical alignment
l1 = len(accuraciesDataFrameMelted['Index'].unique())
l2 = len(timesDataFrame['Index'].unique())
add_labels(graph=g1, category_size=l1, axis_number=0, omit_thresh=1, width_var=0.5, num_format='{:1.0f}')
add_labels(graph=g2, category_size=l2, axis_number=1, omit_thresh=0.1, width_var=0.5, num_format='{:1.2f}')
for g, i in zip([g1,g2], [0, n_cols]):
g.axes[i].set_ylabel('Model')
for g in [g1,g2]:
g.set_titles("{col_name}", fontsize=12)
g1.set_axis_labels(x_var="Rounded Accuracy (%)", y_var="Model")
g2.set_axis_labels(x_var="Training and evaluation time (h)", y_var="Model")
plt.close(fig)
g1.fig.savefig('g1.pdf',dpi=300, bbox_inches = "tight")
g2.fig.savefig('g2.pdf',dpi=300, bbox_inches = "tight")
plt.show()
(Zoomed In to show first graph)
(Zoomed Out to show both graphs)
You could also make the x-axis more variable if you pass sharex=False when creating the catplot, by making the changes below (pass sharex=False and change the omit_thresh parameter of my function to omit_thresh=0):
g1 = sns.catplot(data=accuraciesDataFrameMelted, x='value', y='variable',
col='Index', kind='bar',
col_wrap=n_cols, ax=ax[0], sharex=False)
g2 = sns.catplot(data=timesDataFrameMelted, x='value', y='variable', col='Index', kind='bar',
col_wrap=n_cols, ax=ax[1], sharex=False)
add_labels(graph=g1, category_size=l1, axis_number=0, omit_thresh=0, width_var=0.5, num_format='{:1.0f}')
add_labels(graph=g2, category_size=l2, axis_number=1, omit_thresh=0, width_var=0.5, num_format='{:1.3f}')

Related

Matplotlib blank space with no color when use fill_between with where option

Update:
I sliced days into 100 points and then interpolated the corresponding values of min_temp and max_temp. The result is better, but some areas still have no color. How can I fix that?
days_vals=numpy.linspace(1,10,100)
min_interp=numpy.interp(days_vals,days,min_temp)
max_interp=numpy.interp(days_vals,days,max_temp)
plt.xticks(days)
plt.plot(days_vals,min_interp,c='b',marker='o')
plt.plot(days_vals,max_interp,c='g',marker='o')
plt.fill_between(days_vals,min_interp,max_interp,where=[i>35 for i in min_interp],
facecolor='lightgreen',alpha=0.7,interpolate=False)
plt.fill_between(days_vals,min_interp,max_interp,where=[i<=35 for i in min_interp],
facecolor='lightpink',alpha=0.7,interpolate=False)
==========================================================================
I am using fill_between with the where option to fill the color: green where min_temp > 35 and pink where min_temp <= 35. The result is not what I expected:
there are many blank areas with no color.
I found a question similar to my issue (link).
Its solution is to add additional data points to the series that lie on the axis, but that does not fix my issue.
How can I modify my code to make the color continuous with no blank space?
here's the codes:
from matplotlib import pyplot as plt
days=range(1,11)
max_temp=[37, 35, 42, 36, 39, 56, 50, 45, 41, 39]
min_temp=[32, 30, 37, 20, 34, 40, 37, 38, 32, 30]
fig=plt.figure(figsize=(10,8))
font={'weight':'normal',
'color':'cyan',
'fontsize':24,
}
# Title and axis labels (each needs to be set only once).
plt.title('Weather 2014',fontdict=font)
plt.xlabel('Month',fontdict=font)
plt.ylabel('Temperature',fontdict=font)
plt.xticks(days)
plt.plot(days,max_temp,marker='o',mfc='red',mec='None',markersize=3,label='Max Temp')
plt.plot(days,min_temp,marker='o',mfc='g',mec='None',markersize=3,label='Min Temp')
'''add additional data points'''
eta=1e-6
plt.fill_between(days,min_temp,max_temp,where=[i+eta>35 for i in min_temp],
facecolor='lightgreen',alpha=0.7)
plt.fill_between(days,min_temp,max_temp,where=[i-eta<=35 for i in min_temp],
facecolor='lightpink',alpha=0.7)
plt.legend(loc='upper left',bbox_to_anchor=(1,1))
fig.autofmt_xdate()
plt.grid(True)
plt.show()

Cluster datapoints using kmeans sklearn in python

I am using the following python code to cluster my datapoints using kmeans.
data = np.array([[30, 17, 10, 32, 32], [18, 20, 6, 20, 15], [10, 8, 10, 20, 21], [3, 16, 20, 10, 17], [3, 15, 21, 17, 20]])
kmeans_clustering = KMeans( n_clusters = 3 )
idx = kmeans_clustering.fit_predict( data )
#use t-sne
X = TSNE(n_components=2).fit_transform( data )
fig = plt.figure(1)
plt.clf()
#plot graph
colors = np.array([x for x in 'bgrcmykbgrcmykbgrcmykbgrcmyk'])
plt.scatter(X[:,0], X[:,1], c=colors[kmeans_clustering.labels_])
plt.title('K-Means (t-SNE)')
plt.show()
However, the plot of the clusters I get is wrong, as everything ends up in one point.
Could you please let me know where my code goes wrong? I want to view the kmeans clusters separately in my scatter plot.
EDIT
The t-SNE values I get are as follows.
[[ 1.12758575e-04 9.30458337e-05]
[ -1.82559784e-04 -1.06657936e-04]
[ -9.56485652e-05 -2.38951623e-04]
[ 5.56515580e-05 -4.42453191e-07]
[ -1.42039677e-04 -5.62548119e-05]]
Use the perplexity parameter of the TSNE. The default value of the perplexity is 30, it seems that's too much for your case, even though the documentation states that TSNE is quite insensitive to this parameter.
The perplexity is related to the number of nearest neighbors that is used in other manifold learning algorithms. Larger datasets usually require a larger perplexity. Consider selecting a value between 5 and 50. The choice is not extremely critical since t-SNE is quite insensitive to this parameter.
X = TSNE(n_components=2, perplexity=2.0).fit_transform( data )
You could also use PCA (Principal Components Analysis) instead of t-SNE to plot your clusters:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans  # the class is spelled KMeans; "Kmeans" raises ImportError
from sklearn.decomposition import PCA

data = np.array([[30, 17, 10, 32, 32], [18, 20, 6, 20, 15], [10, 8, 10, 20, 21],
                 [3, 16, 20, 10, 17], [3, 15, 21, 17, 20]])

# Cluster in the original 5-dimensional space.
kmeans = KMeans(n_clusters = 3)
labels = kmeans.fit_predict(data)

# Project onto the first two principal components purely for plotting.
pca = PCA(n_components=2)
data_reduced = pca.fit_transform(data)
data_reduced = pd.DataFrame(data_reduced)

ax = data_reduced.plot(kind='scatter', x=0, y=1, c=labels, cmap='rainbow')
ax.set_xlabel('PC1')
ax.set_ylabel('PC2')
ax.set_title('Projection of the clustering on a the axis of the PCA')

# Annotate every projected point with the cluster it was assigned to.
for x, y, label in zip(data_reduced[0], data_reduced[1], kmeans.labels_):
    ax.annotate('Cluster {0}'.format(label), (x,y))

How to make grid of the irregular data?

I have the numpy arrays of longitudes, latitudes, and the data.
I want to plot this data as a raster image using numpy, scipy, and matplotlib.
import numpy as np
from matplotlib.mlab import griddata
import matplotlib.pyplot as plt
longitudes = np.array([[139.79391479492188, 140.51760864257812, 141.19119262695312, 141.82083129882812, 142.41165161132812],
[139.79225158691406, 140.51416015625, 141.18606567382812, 141.8140869140625, 142.40338134765625],
[139.78591918945312, 140.50637817382812, 141.17694091796875, 141.80377197265625, 142.3919677734375],
[139.78387451171875, 140.50253295898438, 141.17147827148438, 141.79678344726562, 142.38360595703125],
[139.77781677246094, 140.4949951171875, 141.16250610351562, 141.78646850585938, 142.37196350097656]],dtype=float)
latitudes = np.array([[55.61929702758789, 55.621070861816406, 55.61888122558594, 55.613487243652344, 55.60547637939453],
[55.53120040893555, 55.532840728759766, 55.53053665161133, 55.525047302246094, 55.5169677734375],
[55.44305419921875, 55.444580078125, 55.44219207763672, 55.43663024902344, 55.42848587036133],
[55.35470199584961, 55.356109619140625, 55.353614807128906, 55.34796905517578, 55.33975601196289],
[55.26683807373047, 55.268131256103516, 55.26553726196289, 55.25981140136719, 55.25152587890625]],dtype=float)
data = np.array([[10, 10, 10, 10, 10],
[20, 20, 20, 20, 20],
[30, 30, 30, 30, 30],
[40, 40, 40, 40, 40],
[50, 50, 50, 50, 50]],dtype=float)
x = longitudes.ravel()
y = latitudes.ravel()
z = data.ravel()
xMin, xMax = np.min(x), np.max(x)
yMin, yMax = np.min(y), np.max(y)
# np.linspace's third argument is a number of samples, not a step, so a
# spacing of 0.005 must be expressed with np.arange (end point excluded).
xi = np.arange(xMin, xMax, 0.005) ##chosen spacing of 0.005
yi = np.arange(yMin, yMax, 0.005) ##chosen spacing of 0.005
The data are not exactly on a grid, and I could not work out how to proceed from here:
zi_matplotlib = griddata(x, y, z, xi, yi, interp='linear')
from scipy.interpolate import griddata ##Using scipy method
zi_scipy = griddata((x, y), z, (xi, yi), method='nearest')
plt.imshow(????)
Any ideas and solution please.
You can use interpolation to convert the distorted grid into a regular grid. The interpolation fits the original data points and returns a function that can be evaluated at any point of your choosing, and in this case, you would choose a regular grid of points.
Here's an example:
import numpy as np
from scipy.interpolate import interp2d
import matplotlib.pyplot as plt
# your data here, as posted in the question
f = interp2d(lon, lat, data, kind="cubic", bounds_error=False)
dlon, dlat = 1.2, .2
xlon = np.linspace(min(lon.flat), max(lon.flat), 20)
xlat = np.linspace(min(lat.flat), max(lat.flat), 20)
# the next few lines are because there seems to be a bug in interp2d
# instead one would just want to use r = interp2d(X.flat, Y.flat) (where X,Y are as below)
# but for the version of scipy I'm using ('0.13.3'), this throws an exception.
# r is indexed [i, j] with i running over latitudes and j over longitudes,
# so it must be shaped (len(xlat), len(xlon)); the two only coincide here
# because both axes have 20 samples.
r = np.zeros((len(xlat), len(xlon)))
for i, rlat in enumerate(xlat):
    for j, rlon in enumerate(xlon):
        r[i,j] = f(rlon, rlat)
X, Y = np.meshgrid(xlon, xlat)
plt.imshow(r, interpolation="nearest", origin="lower", extent=[min(xlon), max(xlon), min(xlat), max(xlat)], aspect=6.)
plt.scatter(lon.flat, lat.flat, color='k')
plt.show()
Here, I left the mesh fairly coarse (20x20) and used interpolation="nearest" so you could still see the colored squares representing each of the interpolated values, done, of course, on a regular grid (created using the two linspace calls). Note also the use of origin="lower", which sets the image and the scatter plot to have the same orientation.
To interpret this, the main issue is that changing of values from left-to-right. This is due to the data being specified as constant across the horizontal set of points, but because the points where these specified were warped, the interpolated values slowly change as they move across. For example, the lowest scatter point on the right should have approximately the same color as the highest one towards the left. Also, indicative of this is that there's not much color change between any of the two leftmost pairs, but a lot between the two right most, where the warping is largest.
Note that the interpolation could be done for any values, not only a regular grid, which is just being used for imshow as per the original question. Also note that I used bounds_error=False so I could evaluate a few points slightly outside of the original dataset, but be very careful with this as points outside of the original data will quickly become unreasonable due to the cubics being evaluated beyond the region where they were fit.
Assuming that longitudes and latitudes are equally spaced, you can use imshow directly as it features interpolation:
import numpy as np
import matplotlib.pyplot as plt
longitudes = np.array([[139.79391479492188, 140.51760864257812, 141.19119262695312, 141.82083129882812, 142.41165161132812],
[139.79225158691406, 140.51416015625, 141.18606567382812, 141.8140869140625, 142.40338134765625],
[139.78591918945312, 140.50637817382812, 141.17694091796875, 141.80377197265625, 142.3919677734375],
[139.78387451171875, 140.50253295898438, 141.17147827148438, 141.79678344726562, 142.38360595703125],
[139.77781677246094, 140.4949951171875, 141.16250610351562, 141.78646850585938, 142.37196350097656]],dtype=float)
latitudes = np.array([[55.61929702758789, 55.621070861816406, 55.61888122558594, 55.613487243652344, 55.60547637939453],
[55.53120040893555, 55.532840728759766, 55.53053665161133, 55.525047302246094, 55.5169677734375],
[55.44305419921875, 55.444580078125, 55.44219207763672, 55.43663024902344, 55.42848587036133],
[55.35470199584961, 55.356109619140625, 55.353614807128906, 55.34796905517578, 55.33975601196289],
[55.26683807373047, 55.268131256103516, 55.26553726196289, 55.25981140136719, 55.25152587890625]],dtype=float)
data = np.array([[10, 10, 10, 10, 10],
[20, 20, 20, 20, 20],
[30, 30, 30, 30, 30],
[40, 40, 40, 40, 40],
[50, 50, 50, 50, 50]],dtype=float)
# imshow's extent is (left, right, bottom, top). With the default
# origin='upper', row 0 of `data` is drawn at the top edge, so `top` must be
# the latitude of row 0 and `bottom` the latitude of the last row.
extent = (longitudes[0,0], longitudes[0,-1], latitudes[-1,0], latitudes[0,0])
plt.imshow(data, interpolation='bilinear', extent=extent, aspect='auto')
plt.show()
I'm aware that this does not exactly answer your question. But I think it is an easy solution to the underlying problem.
Edit
I just realized that your data is in fact not exactly a grid, but almost. You have to decide if you still want to use my solution...
Here's an example of a scatter 3d plot using your data, breaking out each set of lat/long data in its own series with respective colored markers.
import numpy as np
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
longitudes = np.array([[139.79391479492188, 140.51760864257812, 141.19119262695312, 141.82083129882812, 142.41165161132812],
[139.79225158691406, 140.51416015625, 141.18606567382812, 141.8140869140625, 142.40338134765625],
[139.78591918945312, 140.50637817382812, 141.17694091796875, 141.80377197265625, 142.3919677734375],
[139.78387451171875, 140.50253295898438, 141.17147827148438, 141.79678344726562, 142.38360595703125],
[139.77781677246094, 140.4949951171875, 141.16250610351562, 141.78646850585938, 142.37196350097656]],dtype=float)
latitudes = np.array([[55.61929702758789, 55.621070861816406, 55.61888122558594, 55.613487243652344, 55.60547637939453],
[55.53120040893555, 55.532840728759766, 55.53053665161133, 55.525047302246094, 55.5169677734375],
[55.44305419921875, 55.444580078125, 55.44219207763672, 55.43663024902344, 55.42848587036133],
[55.35470199584961, 55.356109619140625, 55.353614807128906, 55.34796905517578, 55.33975601196289],
[55.26683807373047, 55.268131256103516, 55.26553726196289, 55.25981140136719, 55.25152587890625]],dtype=float)
data = np.array([[10, 10, 10, 10, 10],
[20, 20, 20, 20, 20],
[30, 30, 30, 30, 30],
[40, 40, 40, 40, 40],
[50, 50, 50, 50, 50]],dtype=float)
colors = ['r','g','b','k','k']
markers = ['o','o','o','o','^']
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
# One scatter series per grid row, each with its own colour and marker.
for lon_row, lat_row, data_row, color, marker in zip(longitudes, latitudes, data, colors, markers):
    ax.scatter(lon_row, lat_row, data_row, c=color, marker=marker)
ax.set_xlabel('Longitude')
ax.set_ylabel('Latitude')
ax.set_zlabel('Data')
plt.show()
Which results in an image like

Python display specific values on x-axis using matplotlib

I'm querying data from a simple sqlite3 DB which is pulling a list of the number of connections per port observed on my system. I'm trying to graph this into a simple bar-chart using matplotlib.
Thus far, I'm using the following code:
import matplotlib as mpl
mpl.use('Agg') # force no x11
import matplotlib.pyplot as plt
import sqlite3
con = sqlite3.connect('test.db')
cur = con.cursor()
cur.execute('''
SELECT dst_port, count(dst_port) as count from logs
where dst_port != 0
group by dst_port
order by count desc;
'''
)
data = cur.fetchall()
dst_ports, dst_port_count = zip(*data)
#dst_ports = [22, 53223, 40959, 80, 3389, 23, 443, 35829, 8080, 4899, 21320, 445, 3128, 44783, 4491, 9981, 8001, 21, 1080, 8081, 3306, 8002, 8090]
#dst_port_count = [5005, 145, 117, 41, 34, 21, 17, 16, 15, 11, 11, 8, 8, 8, 6, 6, 4, 3, 3, 3, 1, 1, 1]
# Call form of print: valid on Python 3 and, for a single argument, prints
# the same output on Python 2.
print(dst_ports)
print(dst_port_count)
fig = plt.figure()
# aesthetics and data
plt.grid()
plt.bar(dst_ports, dst_port_count, align='center')
#plt.xticks(dst_ports)
# labels
plt.title('Number of connections to port')
plt.xlabel('Destination Port')
plt.ylabel('Connection Attempts')
# save figure
fig.savefig('temp.png')
When I run the above, the data is successfully retrieved from the DB and a graph is generated. However, the graph isn't what I was expecting. For example, on the x-axis, it plots all values between 0 and 5005. I'm looking for it to display only the values in dst_ports. I've tried using xticks but this doesn't work either.
I've included some sample data in the above code which I've commented out that may be useful.
In addition, here is an example of the graph output from the above code:
And also a graph when using xticks:
You need to create some xdata by np.arange():
import matplotlib as mpl
import numpy as np
import matplotlib.pyplot as plt
dst_ports = [22, 53223, 40959, 80, 3389, 23, 443, 35829, 8080, 4899, 21320, 445, 3128, 44783, 4491, 9981, 8001, 21, 1080, 8081, 3306, 8002, 8090]
dst_port_count = [5005, 145, 117, 41, 34, 21, 17, 16, 15, 11, 11, 8, 8, 8, 6, 6, 4, 3, 3, 3, 1, 1, 1]
fig = plt.figure(figsize=(12, 4))
# aesthetics and data
plt.grid()
x = np.arange(1, len(dst_ports)+1)
plt.bar(x, dst_port_count, align='center')
plt.xticks(x, dst_ports, rotation=45)
# labels
plt.title('Number of connections to port')
plt.xlabel('Destination Port')
plt.ylabel('Connection Attempts')
Here is the output:

Financial Charts / Graphs in Ruby or Python

What are my best options for creating a financial open-high-low-close (OHLC) chart in a high level language like Ruby or Python? While there seem to be a lot of options for graphing, I haven't seen any gems or eggs with this kind of chart.
http://en.wikipedia.org/wiki/Open-high-low-close_chart (but I don't need the moving average or Bollinger bands)
JFreeChart can do this in Java, but I'd like to make my codebase as small and simple as possible.
Thanks!
You can use matplotlib and the optional bottom parameter of matplotlib.pyplot.bar. You can then use a line plot to indicate the opening and closing prices:
For example:
#!/usr/bin/env python
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import lines
import random
deltas = [4, 6, 13, 18, 15, 14, 10, 13, 9, 6, 15, 9, 6, 1, 1, 2, 4, 4, 4, 4, 10, 11, 16, 17, 12, 10, 12, 15, 17, 16, 11, 10, 9, 9, 7, 10, 7, 16, 8, 12, 10, 14, 10, 15, 15, 16, 12, 8, 15, 16]
bases = [46, 49, 45, 45, 44, 49, 51, 52, 56, 58, 53, 57, 62, 63, 68, 66, 65, 66, 63, 63, 62, 61, 61, 57, 61, 64, 63, 58, 56, 56, 56, 60, 59, 54, 57, 54, 54, 50, 53, 51, 48, 43, 42, 38, 37, 39, 44, 49, 47, 43]
def rand_pt(bases, deltas):
    """Return one random integer per (base, delta) pair.

    Each value is drawn with ``random.randint`` from the inclusive range
    ``[base, base + delta]``. ``bases`` and ``deltas`` are paired with
    ``zip``, so the result has as many items as the shorter of the two.
    """
    return [random.randint(base, base + delta) for base, delta in zip(bases, deltas)]
# randomly assign opening and closing prices
openings = rand_pt(bases, deltas)
closings = rand_pt(bases, deltas)
# First we draw the bars which show the high and low prices
# bottom holds the low price while deltas holds the difference
# between high and low.
width = 0
ax = plt.axes()
rects1 = ax.bar(np.arange(50), deltas, width, color='r', bottom=bases)
# Now draw the ticks indicating the opening and closing price
# Tick length and line style are the same for every bar, so set them once.
w = 0.2
args = {
}  # NOTE(review): empty as given; presumably meant to hold line-style kwargs - confirm
for opening, closing, bar in zip(openings, closings, rects1):
    x = bar.get_x()
    # Opening price: tick to the left of the bar; closing price: tick to the right.
    ax.plot((x - w, x), (opening, opening), **args)
    ax.plot((x, x + w), (closing, closing), **args)
plt.show()
creates a plot like this:
Obviously, you'd want to package this up in a function that drew the plot using (open, close, min, max) tuples (and you probably wouldn't want to randomly assign your opening and closing prices).
You can use Pylab (matplotlib.finance) with Python. Here are some examples: http://matplotlib.sourceforge.net/examples/pylab_examples/plotfile_demo.html . There is some good material specifically on this problem in Beginning Python Visualization.
Update: I think you can use matplotlib.finance.candlestick for the Japanese candlestick effect.
Have you considered using R and the quantmod package? It likely provides exactly what you need.
Some examples about financial plots (OHLC) using matplotlib can be found here:
finance demo
#!/usr/bin/env python
from pylab import *
from matplotlib.dates import DateFormatter, WeekdayLocator, HourLocator, \
DayLocator, MONDAY
from matplotlib.finance import quotes_historical_yahoo, candlestick,\
plot_day_summary, candlestick2
# (Year, month, day) tuples suffice as args for quotes_historical_yahoo
date1 = ( 2004, 2, 1)
date2 = ( 2004, 4, 12 )
mondays = WeekdayLocator(MONDAY) # major ticks on the mondays
alldays = DayLocator() # minor ticks on the days
weekFormatter = DateFormatter('%b %d') # Eg, Jan 12
dayFormatter = DateFormatter('%d') # Eg, 12
quotes = quotes_historical_yahoo('INTC', date1, date2)
if len(quotes) == 0:
raise SystemExit
fig = figure()
fig.subplots_adjust(bottom=0.2)
ax = fig.add_subplot(111)
ax.xaxis.set_major_locator(mondays)
ax.xaxis.set_minor_locator(alldays)
ax.xaxis.set_major_formatter(weekFormatter)
#ax.xaxis.set_minor_formatter(dayFormatter)
#plot_day_summary(ax, quotes, ticksize=3)
candlestick(ax, quotes, width=0.6)
ax.xaxis_date()
ax.autoscale_view()
setp( gca().get_xticklabels(), rotation=45, horizontalalignment='right')
show()
finance work 2
Are you free to use JRuby instead of Ruby? That'd let you use JFreeChart, plus your code would still be in Ruby
Please look at the Open Flash Chart embedding for WHIFF
http://aaron.oirt.rutgers.edu/myapp/docs/W1100_1600.openFlashCharts
An example of a candle chart is right at the top. This would be especially
good for embedding in web pages.
Open Flash Chart is nice choice if you like the look of examples. I've moved to JavaScript/Canvas library like Flot for HTML embedded charts, as it is more customizable and I get desired effect without much hacking (http://itprolife.worona.eu/2009/08/scatter-chart-library-moving-to-flot.html).
This is the stock chart I draw just days ago using Matplotlib, I've posted the source too, for your reference: StockChart_Matplotlib

Categories