How can I add jitter to my seaborn and matplot plots? - python

I am working on trying to add Jitter to my plots using seaborn and matplot plots. I am getting mixed information form what I am reading online. Some information is saying coding needs to be done and other information show it as being as simple as jitter = True. I there another library or something that I should be importing that I am not aware of? Below is the code that I am running and trying to add jitter to:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
filename = 'https://library.startlearninglabs.uw.edu/DATASCI410/Datasets/JitteredHeadCount.csv'
headcount_df = pd.read_csv(filename)
headcount_df.describe()
%matplotlib inline
ax = plt.figure(figsize=(12, 6)).gca() # define axis
headcount_df.plot.scatter(x = 'Hour', y = 'TablesOpen', ax = ax, alpha = 0.2)
# auto_price.plot(kind = 'scatter', x = 'city-mpg', y = 'price', ax = ax)
ax.set_title('Hour vs TablesOpen') # Give the plot a main title
ax.set_ylabel('TablesOpen')# Set text for y axis
ax.set_xlabel('Hour')
ax = sns.kdeplot(headcount_df.loc[:, ['TablesOpen', 'Hour']], shade = True, cmap = 'PuBu')
headcount_df.plot.scatter(x = 'Hour', y = 'TablesOpen', ax = ax, jitter = True)
ax.set_title('Hour vs TablesOpen') # Give the plot a main title
ax.set_ylabel('TablesOpen')# Set text for y axis
ax.set_xlabel('Hour')
I receive the error: AttributeError: 'PathCollection' object has no property 'jitter' when trying to add the jitter. Any help or more information on this would be much appreciated

To add jitter to a scatter plot, first get a handle to the collection that contains the scatter dots. When a scatter plot is just created on an ax, ax.collections[-1] will be the desired collection.
Calling get_offsets() on the collection gets all the xy coordinates of the dots. Add some small random number to each of them. As in this case all coordinates are integers, adding a random number between 0 and 1 spreads the dots out evenly.
In this case the number of dots is very huge. To better see where the dots are concentrated, they can be made very small (marker=',', linewidth=0, s=1,) and be very transparent (e.g.alpha=0.1).
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
filename = 'https://library.startlearninglabs.uw.edu/DATASCI410/Datasets/JitteredHeadCount.csv'
headcount_df = pd.read_csv(filename)
fig, ax = plt.subplots(figsize=(12, 6))
headcount_df.plot.scatter(x='Hour', y='TablesOpen', marker=',', linewidth=0, s=1, alpha=.1, color='crimson', ax=ax)
dots = ax.collections[-1]
offsets = dots.get_offsets()
jittered_offsets = offsets + np.random.uniform(0, 1, offsets.shape)
dots.set_offsets(jittered_offsets)
ax.set_title('Hour vs TablesOpen') # Give the plot a main title
ax.set_ylabel('TablesOpen') # Set text for y axis
ax.set_xlabel('Hour')
ax.set_xticks(range(25))
ax.autoscale(enable=True, tight=True)
plt.tight_layout()
plt.show()
As there are a huge number of points, drawing the 2D kde takes a long time. The time can be reduced by taking a random sample from the rows. Note that to draw a 2D kde, the latest versions of Seaborn want each column as a separate parameter.
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
filename = 'https://library.startlearninglabs.uw.edu/DATASCI410/Datasets/JitteredHeadCount.csv'
headcount_df = pd.read_csv(filename)
fig, ax = plt.subplots(figsize=(12, 6))
N = 5000
rand_sel_df = headcount_df.iloc[np.random.choice(range(len(headcount_df)), N)]
ax = sns.kdeplot(rand_sel_df['Hour'], rand_sel_df['TablesOpen'], shade=True, cmap='PuBu', ax=ax)
ax.set_title('Hour vs TablesOpen')
ax.set_xticks(range(25))
plt.tight_layout()
plt.show()

Related

Heatmap with multi-color y-axis and correspondend colorbar

I want to create a heatmap with seaborn, similar to this (with the following code):
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
# Create data
df = pd.DataFrame(np.random.random((5,5)), columns=["a","b","c","d","e"])
# Default heatmap
ax = sns.heatmap(df)
plt.show()
I'd also like to add a new variable (lets say new_var = pd.DataFrame(np.random.random((5,1)), columns=["new variable"])), such as that the values (and possibly the spine and ticks as well) of the y-axis are colored according to the new variable and a second color bar plotted in the same plot to represent the colors of the y-axis values. How can I do that?
This uses the new values to color the y-ticks and the y-tick labels and adds the associated colorbar.
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
import pandas as pd
import numpy as np
# Create data
df = pd.DataFrame(np.random.random((5,5)), columns=["a","b","c","d","e"])
# Default heatmap
ax = sns.heatmap(df)
new_var = pd.DataFrame(np.random.random((5,1)), columns=["new variable"])
# Create the colorbar for y-ticks and labels
norm = plt.Normalize(new_var.min(), new_var.max())
cmap = matplotlib.cm.get_cmap('turbo')
yticks_locations = ax.get_yticks()
yticks_labels = df.index.values
#hide original ticks
ax.tick_params(axis='y', left=False)
ax.set_yticklabels([])
for var, ytick_loc, ytick_label in zip(new_var.values, yticks_locations, yticks_labels):
color = cmap(norm(float(var)))
ax.annotate(ytick_label, xy=(1, ytick_loc), xycoords='data', xytext=(-0.4, ytick_loc),
arrowprops=dict(arrowstyle="-", color=color, lw=1), zorder=0, rotation=90, color=color)
# Add colorbar for y-tick colors
sm = plt.cm.ScalarMappable(cmap=cmap, norm=norm)
cb = ax.figure.colorbar(sm)
# Match the seaborn style
cb.outline.set_visible(False)
I found your problem interesting, and inspired by the unanswered comment above:
How do you change the second colorbar position? For example, one on top the other on bottom sides. - Py-ser
I decided to spend a while doing some tests. After a little digging i find that cbar_kws={"orientation": "horizontal"} is the argument for sns.heatmap that makes the colorbars horizontal.
Borrowing the code from the solution and making some changes, you can format your plot the way you want as in:
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
import pandas as pd
import numpy as np
# Create data
df = pd.DataFrame(np.random.random((5,5)), columns=["a","b","c","d","e"])
# Default heatmap
ax = sns.heatmap(df, cbar_kws={"orientation": "horizontal"}, square = False, annot = True)
new_var = pd.DataFrame(np.random.random((5,1)), columns=["new variable"])
# Create the colorbar for y-ticks and labels
norm = plt.Normalize(new_var.min(), new_var.max())
cmap = matplotlib.cm.get_cmap('turbo')
yticks_locations = ax.get_yticks()
yticks_labels = df.index.values
#hide original ticks
ax.tick_params(axis='y', left=False)
ax.set_yticklabels([])
for var, ytick_loc, ytick_label in zip(new_var.values, yticks_locations, yticks_labels):
color = cmap(norm(float(var)))
ax.annotate(ytick_label, xy=(1, ytick_loc), xycoords='data', xytext=(-0.4, ytick_loc),
arrowprops=dict(arrowstyle="-", color=color, lw=1), zorder=0, rotation=90, color=color)
# Add colorbar for y-tick colors
sm = plt.cm.ScalarMappable(cmap=cmap, norm=norm)
cb = ax.figure.colorbar(sm)
# Match the seaborn style
cb.outline.set_visible(False)
Also, you will notice that I listed the values ​​related to each cell in the heatmap, but just out of curiosity to make it clearer to check that everything was working as expected.
I'm still not very happy with the shape/size of the horizontal colorbar, but I'll keep testing and update any progress by editing this answer!
==========================================
EDIT
just to keep track of the updates, first i tried to change just some parameters of seaborn's heatmap function but wouldn't consider this a major improvement on the task... by adding
ax = sns.heatmap(df, cbar_kws = dict(use_gridspec=True, location="top", shrink =0.6), square = True, annot = True)
I end up with:
I did get to separate the colormap using the matplotlib subplot routine and honestly i believe this is the right way given the parameter control that is possible to get here, by:
# Define two rows for subplots
fig, (cax, ax) = plt.subplots(nrows=2, figsize=(5,5.025), gridspec_kw={"height_ratios":[0.025, 1]})
# Default heatmap
ax = sns.heatmap(df, cbar=False, annot = True)
# colorbar
fig.colorbar(ax.get_children()[0], cax=cax, orientation="horizontal")
plt.show()
I obtained:
Which is still not the prettiest graph I've ever made, but now the position and size of the heatmap can be edited normally within the plt.subplots subroutines that give absolute control over these parameters.

How to plot colors for two variables in scatterplot in python?

I have a dataset with two different variables, i want to give colors to each with different color, Can anyone help please? Link to my dataset : "https://github.com/mayuripandey/Data-Analysis/blob/main/word.csv"
import matplotlib.pyplot as plt
import pandas as pd
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(x = df['Friends Network-metrics'], y = df['Number of Followers'],cmap = "magma")
plt.xlabel("Friends Network-metrics")
plt.ylabel("Number of Followers")
plt.show()
Not very clear what you want to do here. But I'll provide a solution that may help you a bit.
Could use seaborn to implement the colors on the variables. Otherwise, you'd need to iterate through the points to set the color. Or create a new column that conditionally inputs a color for a value.
I don't know what your variable is, but you just want to put that in for the hue parameter:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
df = pd.read_csv('https://raw.githubusercontent.com/mayuripandey/Data-Analysis/main/word.csv')
# Use the 'hue' argument to provide a factor variable
sns.lmplot(x='Friends Network-metrics',
y='Number of Followers',
height=8,
aspect=.8,
data=df,
fit_reg=False,
hue='Sentiment',
legend=True)
plt.xlabel("Friends Network-metrics")
plt.ylabel("Number of Followers")
plt.show()
This can give you a view like this:
If you were looking for color scale for one of the variables though, you would do the below. However, the max value is so big that the range also doesn't make it really an effective visual:
import matplotlib.pyplot as plt
import pandas as pd
df = pd.read_csv('https://raw.githubusercontent.com/mayuripandey/Data-Analysis/main/word.csv')
fig, ax = plt.subplots(figsize=(10, 6))
g = ax.scatter(x = df['Friends Network-metrics'],
y = df['Number of Followers'],
c = df['Friends Network-metrics'],
cmap = "magma")
fig.colorbar(g)
plt.xlabel("Friends Network-metrics")
plt.ylabel("Number of Followers")
plt.show()
So you could adjust the scale (I'd also add edgecolors = 'black' as its hard to see the light plots):
import matplotlib.pyplot as plt
import pandas as pd
df = pd.read_csv('https://raw.githubusercontent.com/mayuripandey/Data-Analysis/main/word.csv')
fig, ax = plt.subplots(figsize=(10, 6))
g = ax.scatter(x = df['Friends Network-metrics'],
y = df['Number of Followers'],
c = df['Friends Network-metrics'],
cmap = "magma",
vmin=0, vmax=10000,
edgecolors = 'black')
fig.colorbar(g)
plt.xlabel("Friends Network-metrics")
plt.ylabel("Number of Followers")
plt.show()

Scatter plot with colorbar and datetime axis ticks

I am getting lost in different methods used in matplotlib.
I want to create a colour-coded scatter plot with a colorbar on the side and datetime on the x axis.
But depending on how I define my ax, I get different errors.
Below is the core of my code:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import matplotlib.cm as cm
import matplotlib.dates as mdates
#.....loading files etc.
norm = mcolors.Normalize(vmin=0,vmax=1000)
timerange = pd.date_range(start='2015-01-01', end='2016-01-01', freq='30D')
### PLOTTING
fig = plt.figure(figsize=(6.,5))
ax = fig.add_subplot(111)
for Af in Afiles:
for index, row in Af.iterrows():
time = pd.to_datetime(row['date'], format="%Y-%m-%d")
plt.scatter(time, row['A'], c=row['z'], norm=norm, cmap=colormap,edgecolor='k', lw=0.8, s=80)
plt.xticks(timerange, rotation=90)
ax.xaxis.set_major_formatter(mdates.DateFormatter("%d/%m/%Y"))
plt.xlabel('Time', fontsize=11, color='k')
clb = fig.colorbar(ax)
clb.ax.set_title('Value', y=-0.125, fontsize=11)
clb.ax.invert_yaxis()
fig.tight_layout()
this produces AttributeError: 'AxesSubplot' object has no attribute 'autoscale_None'
but if I specify my ax as the scatter plot so that I can get my colour-coding working, I then have trouble with the axis formatter.
Writing instead ax = plt.scatter generates AttributeError: 'PathCollection' object has no attribute 'xaxis'.
How can I have both the colorbar AND formatted axis ticks?
Don't call the scatter ax. (This overwrites the existinge axes ax.)
The colorbar expects as first argument a ScalarMappable (as e.g. the scatter). Since the scatters are all normalized, you can use it from the loop,
norm = plt.Normalize(...)
for bla in blubb:
scatter = plt.scatter(..., norm=norm)
Then,
clb = fig.colorbar(scatter)
The rest should stay the same.
The basic idea is that you need to add an extra axis for the colorbar.
It's hard to know if this is an exact match, as you haven't provided a working example with data. But this may at least serve as a template.
First, some example data:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import matplotlib.cm as cm
import matplotlib.dates as mdates
from mpl_toolkits.axes_grid1 import make_axes_locatable
vmin = 0
vmax = 1000
timerange = pd.date_range(start='2015-01-01', end='2016-01-01', freq='30D')
N = len(timerange)
data = np.random.randint(vmin, vmax, size=N)
# z contains the colorbar values for each point
cmap = plt.get_cmap('Reds')
z = [cmap((x-vmin)/(vmax-vmin))[:3] for x in data]
df = pd.DataFrame({"value":data, "datetime":timerange, "z":z})
Now plot:
fig = plt.figure(figsize=(6.,5))
ax = fig.add_subplot(111)
plt.scatter(x=df.datetime.values, y=df.value.values, c=df.z)
ax.set_xticklabels(timerange, rotation=90)
ax.xaxis.set_major_formatter(mdates.DateFormatter("%d/%m/%Y"))
ax.set_xlabel('Time')
Now add colorbar:
norm = mcolors.Normalize(vmin=vmin,vmax=vmax)
m = cm.ScalarMappable(cmap='Reds', norm=norm)
m.set_array([(x-vmin)/(vmax-vmin) for x in df.value.values])
divider = make_axes_locatable(ax)
cax = divider.append_axes("right", size="5%", pad=0.1)
clb = plt.colorbar(m, cax=cax)

Histogram at specific coordinates inside axes

What I want to achieve with Python 3.6 is something like this :
Obviously made in paint and missing some ticks on the xAxis. Is something like this possible? Essentially, can I control exactly where to plot a histogram (and with what orientation)?
I specifically want them to be on the same axes just like the figure above and not on separate axes or subplots.
fig = plt.figure()
ax2Handler = fig.gca()
ax2Handler.scatter(np.array(np.arange(0,len(xData),1)), xData)
ax2Handler.hist(xData,bins=60,orientation='horizontal',normed=True)
This and other approaches (of inverting the axes) gave me no results. xData is loaded from a panda dataframe.
# This also doesn't work as intended
fig = plt.figure()
axHistHandler = fig.gca()
axScatterHandler = fig.gca()
axHistHandler.invert_xaxis()
axHistHandler.hist(xData,orientation='horizontal')
axScatterHandler.scatter(np.array(np.arange(0,len(xData),1)), xData)
A. using two axes
There is simply no reason not to use two different axes. The plot from the question can easily be reproduced with two different axes:
import numpy as np
import matplotlib.pyplot as plt
plt.style.use("ggplot")
xData = np.random.rand(1000)
fig,(ax,ax2)= plt.subplots(ncols=2, sharey=True)
fig.subplots_adjust(wspace=0)
ax2.scatter(np.linspace(0,1,len(xData)), xData, s=9)
ax.hist(xData,bins=60,orientation='horizontal',normed=True)
ax.invert_xaxis()
ax.spines['right'].set_visible(False)
ax2.spines['left'].set_visible(False)
ax2.tick_params(axis="y", left=0)
plt.show()
B. using a single axes
Just for the sake of answering the question: In order to plot both in the same axes, one can shift the bars by their length towards the left, effectively giving a mirrored histogram.
import numpy as np
import matplotlib.pyplot as plt
plt.style.use("ggplot")
xData = np.random.rand(1000)
fig,ax= plt.subplots(ncols=1)
fig.subplots_adjust(wspace=0)
ax.scatter(np.linspace(0,1,len(xData)), xData, s=9)
xlim1 = ax.get_xlim()
_,__,bars = ax.hist(xData,bins=60,orientation='horizontal',normed=True)
for bar in bars:
bar.set_x(-bar.get_width())
xlim2 = ax.get_xlim()
ax.set_xlim(-xlim2[1],xlim1[1])
plt.show()
You might be interested in seaborn jointplots:
# Import and fake data
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
data = np.random.randn(2,1000)
# actual plot
jg = sns.jointplot(data[0], data[1], marginal_kws={"bins":100})
jg.ax_marg_x.set_visible(False) # remove the top axis
plt.subplots_adjust(top=1.15) # fill the empty space
produces this:
See more examples of bivariate distribution representations, available in Seaborn.

Adding labels in x y scatter plot with seaborn

I've spent hours on trying to do what I thought was a simple task, which is to add labels onto an XY plot while using seaborn.
Here's my code
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
df_iris=sns.load_dataset("iris")
sns.lmplot('sepal_length', # Horizontal axis
'sepal_width', # Vertical axis
data=df_iris, # Data source
fit_reg=False, # Don't fix a regression line
size = 8,
aspect =2 ) # size and dimension
plt.title('Example Plot')
# Set x-axis label
plt.xlabel('Sepal Length')
# Set y-axis label
plt.ylabel('Sepal Width')
I would like to add to each dot on the plot the text in "species" column.
I've seen many examples using matplotlib but not using seaborn.
Any ideas? Thank you.
One way you can do this is as follows:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline
df_iris=sns.load_dataset("iris")
ax = sns.lmplot('sepal_length', # Horizontal axis
'sepal_width', # Vertical axis
data=df_iris, # Data source
fit_reg=False, # Don't fix a regression line
size = 10,
aspect =2 ) # size and dimension
plt.title('Example Plot')
# Set x-axis label
plt.xlabel('Sepal Length')
# Set y-axis label
plt.ylabel('Sepal Width')
def label_point(x, y, val, ax):
a = pd.concat({'x': x, 'y': y, 'val': val}, axis=1)
for i, point in a.iterrows():
ax.text(point['x']+.02, point['y'], str(point['val']))
label_point(df_iris.sepal_length, df_iris.sepal_width, df_iris.species, plt.gca())
Here's a more up-to-date answer that doesn't suffer from the string issue described in the comments.
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
df_iris=sns.load_dataset("iris")
plt.figure(figsize=(20,10))
p1 = sns.scatterplot(x='sepal_length', # Horizontal axis
y='sepal_width', # Vertical axis
data=df_iris, # Data source
size = 8,
legend=False)
for line in range(0,df_iris.shape[0]):
p1.text(df_iris.sepal_length[line]+0.01, df_iris.sepal_width[line],
df_iris.species[line], horizontalalignment='left',
size='medium', color='black', weight='semibold')
plt.title('Example Plot')
# Set x-axis label
plt.xlabel('Sepal Length')
# Set y-axis label
plt.ylabel('Sepal Width')
Thanks to the 2 other answers, here is a function scatter_text that makes it possible to reuse these plots several times.
import seaborn as sns
import matplotlib.pyplot as plt
def scatter_text(x, y, text_column, data, title, xlabel, ylabel):
"""Scatter plot with country codes on the x y coordinates
Based on this answer: https://stackoverflow.com/a/54789170/2641825"""
# Create the scatter plot
p1 = sns.scatterplot(x, y, data=data, size = 8, legend=False)
# Add text besides each point
for line in range(0,data.shape[0]):
p1.text(data[x][line]+0.01, data[y][line],
data[text_column][line], horizontalalignment='left',
size='medium', color='black', weight='semibold')
# Set title and axis labels
plt.title(title)
plt.xlabel(xlabel)
plt.ylabel(ylabel)
return p1
Use the function as follows:
df_iris=sns.load_dataset("iris")
plt.figure(figsize=(20,10))
scatter_text('sepal_length', 'sepal_width', 'species',
data = df_iris,
title = 'Iris sepals',
xlabel = 'Sepal Length (cm)',
ylabel = 'Sepal Width (cm)')
See also this answer on how to have a function that returns a plot:
https://stackoverflow.com/a/43926055/2641825
Below is a solution that does not iterate over rows in the data frame using the dreaded for loop.
There are many issues regarding iterating over a data frame.
The answer is don't iterate! See this link.
The solution below relies on a function (plotlabel) within the petalplot function, which is called by df.apply.
Now, I know readers will comment on the fact that I use scatter and not lmplot, but that is a bit besides the point.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
df_iris=sns.load_dataset("iris")
def petalplot(df):
def plotlabel(xvar, yvar, label):
ax.text(xvar+0.002, yvar, label)
fig = plt.figure(figsize=(30,10))
ax = sns.scatterplot(x = 'sepal_length', y = 'sepal_width', data=df)
# The magic starts here:
df.apply(lambda x: plotlabel(x['sepal_length'], x['sepal_width'], x['species']), axis=1)
plt.title('Example Plot')
plt.xlabel('Sepal Length')
plt.ylabel('Sepal Width')
petalplot(df_iris)
Same idea with Scott Boston's answer, however with Seaborn v0.12+, you can leverage seaborn.FacetGrid.apply to add labels on plots and set up your figure in one go:
import seaborn as sns
import pandas as pd
%matplotlib inline
sns.set_theme()
df_iris = sns.load_dataset("iris")
(
sns.lmplot(
data=df_iris,
x="sepal_length",
y="sepal_width",
fit_reg=False,
height=8,
aspect=2
)
.apply(lambda grid: [
grid.ax.text(r["sepal_length"]+.02, r["sepal_width"], r["species"])
for r in df_iris.to_dict(orient="records")
])
.set(title="Example Plot")
.set_axis_labels("Sepal Length", "Sepal Width")
)
Or, if you don't need to use lmplot, also from v0.12, you can use the seaborn.objects interface. This way we don't need to manually iterate over the Iris dataframe nor refer to df_iris or column names sepal_... multiple times.
import seaborn.objects as so
(
so.Plot(df_iris, x="sepal_length", y="sepal_width", text="species")
.add(so.Dot())
.add(so.Text(halign="left"))
.label(title="Example plot", x="Sepal Length", y="Sepal Width")
.layout(size=(20, 10))
)
This produces the below figure:
Use the powerful declarative API to avoid loops (seaborn>=0.12).
Specifically, put x,y, and annotations into a pandas data frame and call plotting.
Here is an example from my own research work.
import seaborn.objects as so
import pandas as pd
df = pd.DataFrame(..,columns=['phase','P(X=1)','text'])
fig,ax = plt.subplots()
p = so.Plot(df,x='phase',y='P(X=1)',text='text').add(so.Dot(marker='+')).add(so.Text(halign='left'))
p.on(ax).show()

Categories