I'm triying to make a figure where the stem plot has the baseline on the data of dataframe_3_merged['TOTAL'].
import numpy as np
from eurostatapiclient import EurostatAPIClient
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import seaborn as sns
import pandas as pd
#Set versions and formats, so far only the ones used here are availeable and call client
VERSION = 'v2.1'
FORMAT = 'json'
LANGUAGE = 'en'
client = EurostatAPIClient(VERSION, FORMAT, LANGUAGE)
dataframe_3_query_total = 'ilc_peps01?precision=1&sex=T&geo=AT&geo=BE&geo=BG&geo=CH&geo=CY&geo=CZ&geo=DK&geo=EA19&geo=EE&geo=EL&geo=ES&geo=EU28&geo=FI&geo=FR&geo=HR&geo=HU&geo=IE&geo=IS&geo=IT&geo=LT&geo=LU&geo=LV&geo=ME&geo=MK&geo=MT&geo=NL&geo=NO&geo=PL&geo=PT&geo=RO&geo=RS&geo=SE&geo=SI&geo=SK&geo=TR&geo=UK&unit=PC&unitLabel=label&time=2018&age=TOTAL'
dataframe_3_query_urb = 'ilc_peps13?precision=1°_urb=DEG1°_urb=DEG2°_urb=DEG3&geo=AT&geo=BE&geo=BG&geo=CH&geo=CY&geo=CZ&geo=DE&geo=DK&geo=EA19&geo=EE&geo=EL&geo=ES&geo=EU28&geo=FI&geo=FR&geo=HR&geo=HU&geo=IE&geo=IS&geo=IT&geo=LT&geo=LU&geo=LV&geo=MK&geo=MT&geo=NL&geo=NO&geo=PL&geo=PT&geo=RO&geo=RS&geo=SE&geo=SI&geo=SK&geo=UK&unit=PC&unitLabel=label&time=2018'
dataframe_3_total = client.get_dataset(dataframe_3_query_total).to_dataframe().pivot(index = 'geo',columns = 'age',values = 'values')
dataframe_3_urb =client.get_dataset(dataframe_3_query_urb).to_dataframe().pivot(index = 'geo',columns = 'deg_urb',values = 'values')
dataframe_3_merged = dataframe_3_total.join(dataframe_3_urb).dropna()
fig, ax = plt.subplots(figsize=(15, 4))
plt.ylim(0,51)
x = range(0,32,1)
stem_1 =plt.stem(x,dataframe_3_merged['DEG1'])
stem_2=plt.stem(x, dataframe_3_merged['DEG2'])
stem_3=plt.stem(x, dataframe_3_merged['DEG3'])
plt.setp(stem_2, color = 'r')
plt.setp(stem_3, color = 'g')
scatterplot= sns.scatterplot(x=dataframe_3_merged.index, #We draw the scatterplot and specify the arguments
y = dataframe_3_merged['TOTAL'],
ax=ax ,
s = 100 ,
legend = False,
marker="_",
color = 'b')
The goal is to have a plot similar to this image:
I tried to use the list dataframe_3_merged['TOTAL'] as the parameter in the bottom argument of plt.stem but I have this traceback: ValueError: setting an array element with a sequence.
Thank you for your help!
You could replace each stem plot by a scatter plot and a plot of vertical lines (plt.vlines). Setting the zorder=0 ensures the lines are drawn behind the dots.
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
names = ['hydrogen', 'helium', 'lithium', 'beryllium', 'boron', 'carbon', 'nitrogen', 'oxygen', 'fluorine', 'neon', 'sodium', 'magnesium', 'aluminium', 'silicon', 'phosphorus', 'sulphur', 'chlorine', 'argon', 'potassium', 'calcium', 'scandium', 'titanium', 'vanadium', 'chromium', 'manganese', 'iron', 'cobalt', 'nickel', 'copper', 'zinc', 'gallium', 'germanium', 'arsenic', 'selenium', 'bromine', 'krypton']
N = len(names)
df = pd.DataFrame({'Deg1': 35 + np.random.normal(size=N).cumsum(),
'Deg2': 25 + np.random.normal(size=N).cumsum(),
'Deg3': 15 + np.random.normal(size=N).cumsum()},
index=names)
df['Total'] = df.mean(axis=1)
for deg, color, label in zip(['Deg1', 'Deg2', 'Deg3'], ['tomato', 'orange', 'palegreen'],
['label1', 'label2', 'label3']):
plt.vlines(df.index, df[deg], df['Total'], lw=0.2, color='k', zorder=0)
plt.scatter(df.index, df[deg], marker='o', color=color, label=label)
plt.scatter(df.index, df['Total'], marker='_', color='deepskyblue', s=100)
plt.xticks(rotation='vertical')
plt.ylim(0, 51)
plt.margins(x=0.02)
plt.legend(ncol=3, bbox_to_anchor=(0.5, -0.4), loc='upper center')
plt.grid(True, axis='y')
plt.tick_params(length=0)
for where in ['top', 'left', 'right']:
plt.gca().spines[where].set_visible(False)
plt.tight_layout()
plt.show()
Related
I would also like to indicate the "box plot all" for the monthly box plots too (for years box plot works) , but I can't: function axvspan() for axes[1].
Also, I can't edit (for example) xtick and ytick correctly and the image is blurry. What do you think of this code of mine? How to improve it?
The code:
import os
import numpy as np
import pandas as pd
import datetime as dt
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
import seaborn as sns
import calendar
df_air = pd.read_csv('https://raw.githubusercontent.com/AileenNielsen/TimeSeriesAnalysisWithPython/master/data/AirPassengers.csv',
parse_dates=['Month'], date_parser=lambda x: pd.to_datetime(x, format='%Y-%m',
errors = 'coerce'))
# data preparation
df_air['year'] = [d.year for d in df_air.Month]
df_air['month'] = [d.strftime('%b') for d in df_air.Month]
years = df_air['year'].unique()
# plot drawing
fig, axes = plt.subplots(2, 1, figsize=(20,22), dpi= 120)
plt.subplots_adjust(hspace=0.25) # problem with 'hspace' because doesn't work
axes[0] = sns.boxplot(x='year', y='#Passengers', data=pd.concat([df_air_all, df_air]),
ax=axes[0], showmeans=True, meanprops={"marker":"d"})
axes[1] = sns.boxplot(x='month', y='#Passengers', data=df_air.loc[~df_air.year.isin([1949, 1961]), :], ax=axes[1], showmeans=True,
meanprops={"marker":"d"})
# Set Title
axes[0].set_title('Yearly Boxplot', fontsize=20, fontweight="bold", pad=20)
axes[1].set_title('Monthly Boxplot', fontsize=20, fontweight="bold", pad=20)
axes[0].set_ylabel("Passengers")
axes[1].set_ylabel("Passengers")
axes[0].axvspan(-0.5, 0.5, color='0.85', zorder=-1)
sns.set_style('darkgrid')
palette = ['#9400D3'] + sns.color_palette('viridis', len(df_air['year'].unique()))
axes[0].get_xticklabels()[0].set_weight('bold')
axes[0].set_xlabel('')
axes[1].set_xlabel('')
axes[0].grid(color = 'green', linestyle = '--', linewidth = 0.5)
axes[1].grid(color = 'green', linestyle = '--', linewidth = 0.5)
plt.rcParams['axes.labelsize'] = 18
plt.rcParams['axes.titlesize'] = 18
plt.rc('xtick',labelsize=16)
plt.rc('ytick',labelsize=16)
plt.rcParams["figure.autolayout"] = True
plt.rcParams["axes.edgecolor"] = "black"
plt.rcParams["axes.linewidth"] = 1
plt.show()
[UPDATE: Sorry for not providing the piece where the author of the codes create example data. I have updated the codes]
I found an example of a 3D mesh line chart that satisfied what I need (colouring change with level on z dimension). However, instead of line, I want surface plot. How can I change the codes to have the 3d surface plot?
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import animation, rc
from matplotlib.cm import get_cmap
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.font_manager import FontProperties
from matplotlib.collections import LineCollection
from matplotlib.colors import ListedColormap
from mpl_toolkits.mplot3d.art3d import Line3DCollection
index_returns = np.random.normal(loc=1e-4, scale=5e-3, size=(783, 9))
index_returns = np.vstack((np.zeros(shape=(1, 9)) + 100, index_returns))
index_prices = np.cumprod(1 + index_returns, axis=0)
window = 261
df = np.zeros(shape=(index_prices.shape[0]-window, 9))
for i in range(window, index_prices.shape[0], 1):
df[i-window] = (index_prices[i]/index_prices[i-window]) - 1
index = pd.date_range('2019-01-01', periods=index_prices.shape[0]-window, freq='B')
columns = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I']
df = pd.DataFrame(df, index=index, columns=columns)
# create the figure
fig = plt.figure(figsize=(14.4, 9))
ax = fig.add_subplot(111, projection='3d')
fig.patch.set_alpha(1)
# get the cmap to use
cmap = get_cmap('RdYlGn')
# get the slice based on data frame
current_slice = df.values[:261, :]
index_names = df.columns
index_dates = df.index
# list holding the lines
lines = []
# for each index...
for i in range(current_slice.shape[1]):
# get the coordinates
x = np.array(np.arange(current_slice.shape[0]))
y = np.tile(i, current_slice.shape[0])
z = np.array(current_slice[:, i])
# crete points and segments to color
points = np.array([x, y, z]).T.reshape(-1, 1, 3)
segments = np.concatenate([points[:-1], points[1:]], axis=1)
# Create a continuous norm to map from data points to colors
norm = plt.Normalize(-0.19, 0.19)
lc = Line3DCollection(segments, cmap=cmap, norm=norm, zorder=current_slice.shape[1]-i)
# Set the values used for colormapping
lc.set_array(z)
lc.set_linewidth(2)
lc.set_color(cmap(z[-1] * 2.5 + 0.5))
lc.set_label(index_names[i])
lines.append(ax.add_collection(lc))
# add the grids
ax.legend(loc='center right', bbox_to_anchor=(1.1, 0.46), fancybox=True, facecolor=(.95,.95,.95,1), framealpha=1, shadow=False, frameon=True, ncol=1, columnspacing=0, prop={'family': 'DejaVu Sans Mono'})
ax.set_zlabel('Rolling Equity 1Y', labelpad=10)
ax.set_zlim(-0.39, 0.39)
ax.set_zticklabels([' '* 3 + '{:.0%}'.format(val) for val in ax.get_zticks()], fontdict={'verticalalignment': 'center', 'horizontalalignment': 'center'})
ax.set_xlabel('Date', labelpad=30)
ax.set_xlim(0, current_slice.shape[0]-1)
ax.set_xticklabels([index_dates[int(val)].strftime('%m/%y') for val in ax.get_xticks()[:-1]] + [''], rotation=0, fontdict={'verticalalignment': 'top', 'horizontalalignment': 'center'})
ax.set_yticks(np.arange(current_slice.shape[1]))
ax.set_yticklabels([index_names[i] for i in range(current_slice.shape[1])], rotation=-15, fontdict={'verticalalignment': 'center', 'horizontalalignment': 'left'})
# show the plot
plt.show()
I try to produce a plot and want to automatically add text (in this case is percentage) to each circle in correspond to each y axis types. Any help would be very helpful.
# import libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
# Make some data
index=['Stream flow',
'Soil moisture',
'Water indices',
'Others',
'Temperature',
'Precipitation',
'Vegetative indices']
value=[2.13, 6.38, 10.64, 12.77, 17.73, 21.99, 28.37]
# create dataframe
percentages = pd.Series(value,index=index)
df = pd.DataFrame({'percentage' : percentages})
df = df.sort_values(by='percentage')
# we first need a numeric placeholder for the y axis
my_range=list(range(1,len(df.index)+1))
fig, ax = plt.subplots(figsize=(15,8))
# create for each expense type an horizontal line that starts at x = 0 with the length
plt.hlines(y=my_range, xmin=0, xmax=df['percentage']-0.5, color='black', alpha=0.8, linewidth=1)
# create for each expense type a dot at the level of the expense percentage value
line=plt.plot(df['percentage'], my_range, "o", markersize=30, color='#fd8c00', alpha=0.6, linewidth=0.3)
# set labels
ax.set_xlabel('Percentage', fontsize=15)
ax.set_ylabel('')
# set axis
ax.tick_params(axis='both', which='major', labelsize=14)
plt.yticks(my_range, df.index)
ax.set_xlim(0,30)
You can use matplotlib.axes.Axes.text:
x_space = 0.4
y_space = 0.05
fontsize = 7
for y_i, val in enumerate(value, 1):
ax.text(x = val - x_space, y = y_i - y_space, s = f'{val}%', fontsize = fontsize)
You have to adjust x_space, y_space and fontsize in order to fit properly the text within the circles.
Complete code
# import libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
# Make some data
index=['Stream flow',
'Soil moisture',
'Water indices',
'Others',
'Temperature',
'Precipitation',
'Vegetative indices']
value=[2.13, 6.38, 10.64, 12.77, 17.73, 21.99, 28.37]
# create dataframe
percentages = pd.Series(value,index=index)
df = pd.DataFrame({'percentage' : percentages})
df = df.sort_values(by='percentage')
# we first need a numeric placeholder for the y axis
my_range=list(range(1,len(df.index)+1))
fig, ax = plt.subplots(figsize=(15,8))
# create for each expense type an horizontal line that starts at x = 0 with the length
plt.hlines(y=my_range, xmin=0, xmax=df['percentage']-0.5, color='black', alpha=0.8, linewidth=1)
# create for each expense type a dot at the level of the expense percentage value
line=plt.plot(df['percentage'], my_range, "o", markersize=30, color='#fd8c00', alpha=0.6, linewidth=0.3)
# set labels
ax.set_xlabel('Percentage', fontsize=15)
ax.set_ylabel('')
# set axis
ax.tick_params(axis='both', which='major', labelsize=14)
plt.yticks(my_range, df.index)
ax.set_xlim(0,30)
x_space = 0.4
y_space = 0.05
for y_i, val in enumerate(value, 1):
ax.text(x = val - x_space, y = y_i - y_space, s = f'{val:>5.2f}%', fontsize = 7)
plt.show()
Same code as above, but with increased circle radius and font, in order to improve readability.
# import libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
# Make some data
index=['Stream flow',
'Soil moisture',
'Water indices',
'Others',
'Temperature',
'Precipitation',
'Vegetative indices']
value=[2.13, 6.38, 10.64, 12.77, 17.73, 21.99, 28.37]
# create dataframe
percentages = pd.Series(value,index=index)
df = pd.DataFrame({'percentage' : percentages})
df = df.sort_values(by='percentage')
# we first need a numeric placeholder for the y axis
my_range=list(range(1,len(df.index)+1))
fig, ax = plt.subplots(figsize=(15,8))
# create for each expense type an horizontal line that starts at x = 0 with the length
plt.hlines(y=my_range, xmin=0, xmax=df['percentage']-0.85, color='black', alpha=0.8, linewidth=1)
# create for each expense type a dot at the level of the expense percentage value
line=plt.plot(df['percentage'], my_range, "o", markersize=50, color='#fd8c00', alpha=0.6, linewidth=0.3)
# set labels
ax.set_xlabel('Percentage', fontsize=15)
ax.set_ylabel('')
# set axis
ax.tick_params(axis='both', which='major', labelsize=14)
plt.yticks(my_range, df.index)
ax.set_xlim(0,30)
ax.set_ylim(0, len(value) + 1)
x_space = 0.75
y_space = 0.06
fontsize = 12
for y_i, val in enumerate(value, 1):
ax.text(x = val - x_space, y = y_i - y_space, s = f'{val:>5.2f}%', fontsize = fontsize)
plt.show()
Even better, you can use matplotlib.axes.Axes.annotate to get rid of x_space and y_space:
fontsize = 12
for y_i, x_i in enumerate(value, 1):
ax.annotate(f'{x_i:>5.2f}%', xy = (x_i, y_i), xytext = (0, 0), textcoords = 'offset points', ha = 'center', va = 'center', fontsize = fontsize)
You still have to adjust the fontsize to properly fit the radius of the circles.
I would like to put specific marker like second label bottom of the first label in a plot of matplotlib.
The format of my files is like this:
File 1.txt
3
4
6
.
.
etc
file 2.txt
5
12
8
.
.
etc
file 3.txt
230.45
345.65
342.3
.
.
etc.
My script is this:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
from numpy import *
from matplotlib.ticker import FormatStrFormatter
from matplotlib.ticker import MaxNLocator
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.ticker as tkr
import matplotlib.patches as patches
with open("1.txt") as f:
lstx = [int(x) for x in f.read().split()]
with open("2.txt") as f:
lsty = [int(x) for x in f.read().split()]
with open("3.txt") as f:
lstz = [float(x) for x in f.read().split()]
def numfmt(x, pos):
s = '{}'.format(int(x + 120))
return s
def numfmty(y, pos):
m = '{}'.format(int(y + 120))
return m
x=np.array(lstx)
y=np.array(lsty)
z=np.array(lstz)
df = pd.DataFrame.from_dict(np.array([y,x,z]).T)
df.columns = ['X_value','Y_value','Z_value']
df['Z_value'] = pd.to_numeric(df['Z_value'])
fig, ax = plt.subplots(figsize=(11,9))
pivotted= df.pivot('X_value','Y_value','Z_value')
ax = sns.heatmap(pivotted, cmap='plasma_r', vmin=0.0, vmax=234.525)
cbar = ax.collections[0].colorbar
cbar.ax.tick_params(labelsize=20)
plt.gca().invert_yaxis()
xfmt = tkr.FuncFormatter(numfmt)
plt.gca().xaxis.set_major_formatter(xfmt)
yfmt = tkr.FuncFormatter(numfmty)
plt.gca().yaxis.set_major_formatter(yfmt)
plt.xlabel('\n Number', fontsize=24)
plt.ylabel('Number \n', fontsize=24)
plt.xticks(size=16)
plt.yticks(size=16)
plt.tight_layout()
major_ticks = np.arange(0, 33, 1)
minor_ticks = np.arange(0, 33, 1)
ax.set_xticks(major_ticks)
ax.set_xticks(minor_ticks, minor=True)
ax.set_yticks(major_ticks)
ax.set_yticks(minor_ticks, minor=True)
ax.grid(which='both')
ax.grid(which='minor', alpha=0.5)
ax.grid(which='major', alpha=0.5)
rect3 = patches.Rectangle((5,5),13,13,linewidth=1.7,linestyle='--',edgecolor='black',facecolor='none')
ax2 = ax.twiny()
ax2.xaxis.set_ticks_position("bottom")
ax2.xaxis.set_label_position("bottom")
newpos=[2,4,6]
newlabel=['*', '*', '*']
ax2.set_xticks(newpos)
ax2.set_xticklabels(newlabel)
ax.add_patch(rect3)
plt.grid()
plt.show()
I would like to put a marker '*' in the positions 125, 128, 130, 133, 138, 142 and 143 in both axis, with a size of 16.
When I try to put them, these are very small, are up of the first label and the grid is move it. The output is this:
How can I fit that? Thanks a lot
The following code adds stars at the indicated columns and rows.
Something confusing about seaborn is that it is quite opinionated giving priority to how formatting looks like and not caring too much about the internal representation. For example, the real tick positions are at the halves, but shown as integers.
Note that plt.tight_layout() is preferably one of the last commands just before plt.show(). Also note that if you created the ax beforehand, it is recommended to pass it as a parameter to sns.heatmap().
In the code below, the major ticks are at the halves to position the tick labels, while the minor ticks are at the integer positions to show the grid.
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import numpy as np
import pandas as pd
import seaborn as sns
x = np.tile(np.arange(0, 33, dtype=int), 33)
y = np.repeat(np.arange(0, 33, dtype=int), 33)
z = np.random.randint(0, 50, len(x)).astype(float)
z[x == y] = np.nan
z[np.abs(x - y) == 1] = 200
z[np.abs(x - y) == 2] = 150
df = pd.DataFrame.from_dict({'X_value': x, 'Y_value': y, 'Z_value': z})
pivotted = df.pivot('X_value', 'Y_value', 'Z_value')
fig, ax = plt.subplots(figsize=(11, 9))
sns.heatmap(pivotted, cmap='plasma_r', vmin=0.0, vmax=234.525, square=True, ax=ax)
ax.invert_yaxis()
cbar = ax.collections[0].colorbar
cbar.ax.tick_params(labelsize=20)
ax.set_xlabel('\n Number', fontsize=24)
ax.set_ylabel('Number \n', fontsize=24)
major_tick_pos = np.arange(0.5, 33, 1)
special_ticks = [125, 128, 130, 133, 138, 142, 143]
major_tick_labels = [('★ ' if i + 120 in special_ticks else '') + f'{i + 120}' for i in range(33)]
minor_tick_pos = np.arange(0, 34, 1)
ax.set_xticks(major_tick_pos)
ax.set_xticks(minor_tick_pos, minor=True)
ax.set_xticklabels(major_tick_labels, size=16, rotation=90)
ax.set_yticks(major_tick_pos)
ax.set_yticks(minor_tick_pos, minor=True)
ax.set_yticklabels(major_tick_labels, size=16, rotation=0)
ax.grid(which='minor', color='black', ls=':', alpha=0.5, lw=2)
ax.tick_params(axis='both', length=0)
rect3 = patches.Rectangle((5, 5), 13, 13, linewidth=1.7, linestyle='--', edgecolor='black', facecolor='none')
ax.add_patch(rect3)
plt.tight_layout()
plt.show()
PS: If you'd like the stars at the other side of the grid, both twinx() and twiny() are needed, only using the '★ ' if i + 120 in special_ticks else '' part of the labels.
An alternative idea would be to use annotations inside the cells to mark the special rows and columns:
stars = [['☆' if x in special_ticks or y in special_ticks else '' for x in range(120, 153)]
for y in range(120, 153)]
sns.heatmap(pivotted, cmap='plasma_r', vmin=0.0, vmax=234.525,
annot=stars, fmt='s', annot_kws={'size':20}, square=True, ax=ax)
To change the tick label colors, an approach could be:
xticks = ax.set_xticklabels(major_tick_labels, size=16, rotation=90)
yticks = ax.set_yticklabels(major_tick_labels, size=16, rotation=0)
for t in xticks + yticks:
if t.get_text().startswith('★'):
t.set_color('crimson')
I am trying to recreate this scatterplot with my own code: https://www.machinelearningplus.com/wp-content/uploads/2018/11/25_Distributed_Dotplot_Matplotlib-min.png
The code to produce this is:
(https://www.machinelearningplus.com/plots/top-50-matplotlib-visualizations-the-master-plots-python/#25.-Distributed-Dot-Plot)
# !pip install brewer2mpl
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import warnings; warnings.filterwarnings(action='once')
large = 22; med = 16; small = 12
params = {'axes.titlesize': large,
'legend.fontsize': med,
'figure.figsize': (16, 10),
'axes.labelsize': med,
'axes.titlesize': med,
'xtick.labelsize': med,
'ytick.labelsize': med,
'figure.titlesize': large}
plt.rcParams.update(params)
plt.style.use('seaborn-whitegrid')
sns.set_style("white")
%matplotlib inline
# Version
print(mpl.__version__) #> 3.0.0
print(sns.__version__) #> 0.9.0
import matplotlib.patches as mpatches
# Prepare Data
df_raw = pd.read_csv("https://github.com/selva86/datasets/raw/master/mpg_ggplot2.csv")
cyl_colors = {4:'tab:red', 5:'tab:green', 6:'tab:blue', 8:'tab:orange'}
df_raw['cyl_color'] = df_raw.cyl.map(cyl_colors)
# Mean and Median city mileage by make
df = df_raw[['cty', 'manufacturer']].groupby('manufacturer').apply(lambda x: x.mean())
df.sort_values('cty', ascending=False, inplace=True)
df.reset_index(inplace=True)
df_median = df_raw[['cty', 'manufacturer']].groupby('manufacturer').apply(lambda x: x.median())
# Draw horizontal lines
fig, ax = plt.subplots(figsize=(16,10), dpi= 80)
ax.hlines(y=df.index, xmin=0, xmax=40, color='gray', alpha=0.5, linewidth=.5, linestyles='dashdot')
# Draw the Dots
for i, make in enumerate(df.manufacturer):
df_make = df_raw.loc[df_raw.manufacturer==make, :]
ax.scatter(y=np.repeat(i, df_make.shape[0]), x='cty', data=df_make, s=75, edgecolors='gray', c='w', alpha=0.5)
ax.scatter(y=i, x='cty', data=df_median.loc[df_median.index==make, :], s=75, c='firebrick')
# Annotate
ax.text(33, 13, "$red \; dots \; are \; the \: median$", fontdict={'size':12}, color='firebrick')
# Decorations
red_patch = plt.plot([],[], marker="o", ms=10, ls="", mec=None, color='firebrick', label="Median")
plt.legend(handles=red_patch)
ax.set_title('Distribution of City Mileage by Make', fontdict={'size':22})
ax.set_xlabel('Miles Per Gallon (City)', alpha=0.7)
ax.set_yticks(df.index)
ax.set_yticklabels(df.manufacturer.str.title(), fontdict={'horizontalalignment': 'right'}, alpha=0.7)
ax.set_xlim(1, 40)
plt.xticks(alpha=0.7)
plt.gca().spines["top"].set_visible(False)
plt.gca().spines["bottom"].set_visible(False)
plt.gca().spines["right"].set_visible(False)
plt.gca().spines["left"].set_visible(False)
plt.grid(axis='both', alpha=.4, linewidth=.1)
plt.show()
However, whenever I try to run this code I get ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
So, I added .any() to y=np.repeat(i, df_make.shape[0]) and reran
And now i am getting ValueError: x and y must be the same size
Just from googling I understand that somehow the x and y are different dimensions, but I am not sure how to fix this and make it work so i can then apply it to my own data.
Thanks!
The error is probably caused by the fact that the first 'y' is an array and not a list.
ax.scatter(y=[i]*df_make.shape[0], ...)
full code:
import matplotlib.patches as mpatches
# Prepare Data
df_raw = pd.read_csv("https://github.com/selva86/datasets/raw/master/mpg_ggplot2.csv")
cyl_colors = {4:'tab:red', 5:'tab:green', 6:'tab:blue', 8:'tab:orange'}
df_raw['cyl_color'] = df_raw.cyl.map(cyl_colors)
# Mean and Median city mileage by make
df = df_raw[['cty', 'manufacturer']].groupby('manufacturer').apply(lambda x: x.mean())
df.sort_values('cty', ascending=False, inplace=True)
df.reset_index(inplace=True)
df_median = df_raw[['cty', 'manufacturer']].groupby('manufacturer').apply(lambda x: x.median())
# Draw horizontal lines
fig, ax = plt.subplots(figsize=(16,10), dpi= 80)
ax.hlines(y=df.index, xmin=0, xmax=40, color='gray', alpha=0.5, linewidth=.5, linestyles='dashdot')
# Draw the Dots
for i, make in enumerate(df.manufacturer):
df_make = df_raw.loc[df_raw.manufacturer==make, :]
ax.scatter(y=[i]*df_make.shape[0], x='cty', data=df_make, s=75, edgecolors='gray', c='w', alpha=0.5)
ax.scatter(y=i, x='cty', data=df_median.loc[df_median.index==make, :], s=75, c='firebrick')
# Annotate
ax.text(33, 13, "$red \; dots \; are \; the \: median$", fontdict={'size':12}, color='firebrick')
# Decorations
red_patch = plt.plot([],[], marker="o", ms=10, ls="", mec=None, color='firebrick', label="Median")
plt.legend(handles=red_patch)
ax.set_title('Distribution of City Mileage by Make', fontdict={'size':22})
ax.set_xlabel('Miles Per Gallon (City)', alpha=0.7)
ax.set_yticks(df.index)
ax.set_yticklabels(df.manufacturer.str.title(), fontdict={'horizontalalignment': 'right'}, alpha=0.7)
ax.set_xlim(1, 40)
plt.xticks(alpha=0.7)
plt.gca().spines["top"].set_visible(False)
plt.gca().spines["bottom"].set_visible(False)
plt.gca().spines["right"].set_visible(False)
plt.gca().spines["left"].set_visible(False)
plt.grid(axis='both', alpha=.4, linewidth=.1)
plt.show()