How to set x-axis size in seaborn Pairplot - python

I am using Seaborn to draw pairplots. The problem is that, for some variables, the x-axis is short and the data points are squeezed very close to each other, as can be seen below (first row of plots):
As you can see, plots in the second row are fine.
This is the code I am using:
import math
import matplotlib.pyplot as plt
import seaborn as sns
# Plot `y` against every feature column, `num_plots_x` features per row.
# `data` is a DataFrame defined elsewhere; every column except the last one
# is treated as a feature.
y_name = 'y'
features = data.iloc[:, :-1]
features_names = features.columns
plot_size = 7
num_plots_x = 10  # No. of plots in every row
num_plots_y = math.ceil(len(features_names) / num_plots_x)  # No. of plots in y direction
# figsize is (width, height): the width scales with the plots per row and
# the height with the number of rows.
fig = plt.figure(figsize=(plot_size * num_plots_x, plot_size * num_plots_y), facecolor='white')
axes = [fig.add_subplot(num_plots_y, 1, i + 1) for i in range(num_plots_y)]
# NOTE(review): sns.pairplot is a figure-level function that builds its own
# figure on every call, so the `ax` objects created above are never drawn on.
for i, ax in enumerate(axes):
    start_index = i * num_plots_x
    # Clamp the slice end so the last row may hold fewer features.
    end_index = min((i + 1) * num_plots_x, len(features_names))
    sns.pairplot(x_vars=features_names[start_index:end_index], y_vars=y_name, data=data)
# Saves the current figure, i.e. the one created by the last pairplot call.
plt.savefig('figure.png')
Is there any way that I can set size or scale of x-axis?

Related

Show density and frequency on the same histogram

I would like to see both the density and frequency on my histogram. For example, display density on the left side and frequency on the right side.
Here is my code:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
x = [6.950915827194559, 0.5704464713012669, -1.655326152372283, 5.867122206816244, -1.809359944941513, -6.164821482653027, -2.538999462076397, 0.2108693568484643, -8.740600769897465, 2.121232876712331, 7.967032967032961, 10.61701196601832, 1.847419201771516, 0.6858006670780847, -2.008695652173909, 2.86991153132885, 1.703131050506168, -1.346913193356314, 3.334927671049193, -15.64688995215311, 20.00022688856367, 10.05956454173731, 2.044936877124148, 3.06513409961684, -0.9973614775725559, 1.190631873030967, -1.509991311902692, -0.3333827233664155, 1.898473282442747, 1.618299899267539, -0.1897860593512823, 1.000000000000001, 3.03501945525293, -7.646697418593529, -0.9769069279216391, -2.918403811792736, -3.90929422276739, 9.609846259653532, 3.240690674452962, 10.08973134408675, 1.98356309650054, 1.915301127899549, -0.7792207792207684, -3.308682400714091, -3.312977099236647, 19.98101265822785, 3.661973444534827, -5.770676691729326, 0.5268044012063156, -1.573767040370533, 3.234974862888484, -1.514352732634994, 6.564849624060143, 9.956794019127146, 3.232590278195024, 2.042007001166857, 1.601164483260553, -2.384737678855331, -2.731242556570068, 0.6069707315088602, 1.40561881957264, -6.805306861851957, 2.492102492102499, -3.639688275501762, 0.7958485384154335, 2.799187725631769, 0.9195966872689088, -2.366608280379856, 0.797679477882518, -3.80380434782609]
df = pd.DataFrame(x, columns=["Returns"])


def plot_histogram():
    """Draw a density histogram of ``df.Returns`` with unit-wide bins.

    Values outside [-11, 11] are clipped into the outermost bins, whose
    tick labels read "Under" and "Over". The figure is created but not
    shown; the caller decides when to display or save it.
    """
    edges = range(-11, 12, 1)
    xlabels = [f"{edge}%" for edge in edges]
    xlabels[0] = "Under"
    xlabels[-1] = "Over"

    fig, ax = plt.subplots(figsize=(9, 5))
    # Clip first so out-of-range returns land in the first/last bin instead
    # of being ignored by hist().
    _, bins, patches = plt.hist(np.clip(df.Returns, edges[0], edges[-1]),
                                bins=edges, density=True, rwidth=0.8)
    plt.xlim([bins[0], bins[-1]])
    plt.xticks(bins)
    ax.set_xticklabels(xlabels)
    plt.title("Returns distribution")
    plt.grid(axis="y", linewidth=0.5)


plot_histogram()
I tried adding density=True in plt.hist() but it removes the count from the histogram. Is it possible to display both the frequency and density on the same histogram?
A density plot sets the heights of the bars such that the area of all the bars (taking rwidth=1 for that calculation) sums to 1. As such, the bar heights of a counting histogram get divided by (the number of values times the bar widths).
With that conversion factor, you can recalculate the counts from the density (or vice versa). The recalculation can be used to label the bars and/or set a secondary y-axis. Note that the ticks of both y axes are not aligned by default, so the grid only works well for one of them. (A secondary y-axis is a bit different from ax.twinx(), as the former has a fixed conversion between both y axes).
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
x = [6.950915827194559, 0.5704464713012669, -1.655326152372283, 5.867122206816244, -1.809359944941513, -6.164821482653027, -2.538999462076397, 0.2108693568484643, -8.740600769897465, 2.121232876712331, 7.967032967032961, 10.61701196601832, 1.847419201771516, 0.6858006670780847, -2.008695652173909, 2.86991153132885, 1.703131050506168, -1.346913193356314, 3.334927671049193, -15.64688995215311, 20.00022688856367, 10.05956454173731, 2.044936877124148, 3.06513409961684, -0.9973614775725559, 1.190631873030967, -1.509991311902692, -0.3333827233664155, 1.898473282442747, 1.618299899267539, -0.1897860593512823, 1.000000000000001, 3.03501945525293, -7.646697418593529, -0.9769069279216391, -2.918403811792736, -3.90929422276739, 9.609846259653532, 3.240690674452962, 10.08973134408675, 1.98356309650054, 1.915301127899549, -0.7792207792207684, -3.308682400714091, -3.312977099236647, 19.98101265822785, 3.661973444534827, -5.770676691729326, 0.5268044012063156, -1.573767040370533, 3.234974862888484, -1.514352732634994, 6.564849624060143, 9.956794019127146, 3.232590278195024, 2.042007001166857, 1.601164483260553, -2.384737678855331, -2.731242556570068, 0.6069707315088602, 1.40561881957264, -6.805306861851957, 2.492102492102499, -3.639688275501762, 0.7958485384154335, 2.799187725631769, 0.9195966872689088, -2.366608280379856, 0.797679477882518, -3.80380434782609]
# Density histogram whose bars are annotated with the equivalent counts and
# whose right-hand secondary axis is expressed in counts.
df = pd.DataFrame(x, columns=["Returns"])
bins = range(-11, 12, 1)
# Tick labels are built from the integer edges, before hist() replaces
# `bins` with an array of floats.
bins_str = [f"{edge}%" for edge in bins]
xlabels = list(bins_str)
xlabels[0] = "Under"
xlabels[-1] = "Over"

fig, ax = plt.subplots(figsize=(9, 5))
# Clip so that out-of-range returns end up in the outermost bins.
clipped_returns = np.clip(df["Returns"], bins[0], bins[-1])
values, bins, patches = ax.hist(clipped_returns, bins=bins, density=True, rwidth=0.8)

# conversion between counts and density: number of values times bin width
factor = len(df) * (bins[1] - bins[0])


def density_to_count(y):
    """Convert a density value to the matching count."""
    return y * factor


def count_to_density(y):
    """Convert a count to the matching density value."""
    return y / factor


# Empty bins get no label; the others show their count as a whole number.
count_labels = []
for density in values:
    count_labels.append(f'{density * factor:.0f}' if density != 0 else '')
ax.bar_label(patches, count_labels)

ax.set_xlim([bins[0], bins[-1]])
ax.set_xticks(bins, xlabels)
ax.set_title("Returns distribution")
ax.grid(axis="y", linewidth=0.5)
ax.set_ylabel('density')
secax = ax.secondary_yaxis('right', functions=(density_to_count, count_to_density))
secax.set_ylabel('counts')
plt.show()
To have the same grid positions for both y-axes, you can copy the ticks of one and convert them to set them at the other. For the ticks to be calculated, the plot needs to be drawn once (at the end of the code). Note that the converted values are only shown with a limited number of digits.
# Render once so the secondary (count) axis has computed its ticks, then
# place the primary (density) ticks at the converted count positions so
# both axes share the same grid lines.
fig.canvas.draw()
count_ticks = secax.get_yticks()
ax.set_yticks(count_ticks / factor)
plt.show()

last bar is hiding behind in pandas grouped bar plot

I am plotting a grouped bar plot out of the data:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Sample data: two series ('x', 'y') plus the values later used as their
# error bars ('x_err', 'y_err'); twelve rows each.
df = pd.DataFrame({'x':[0.716468, 0.799652, 0.611284, 0.700020, 0.745372, 0.717280, 0.212407, 0.225291, 0.443395, 0.649912, 0.756463, 0.588992],
'y':[1.891988, 2.750937, 4.495497, 5.260436, 6.100882, 6.262784, 7.339279, 6.877465, 6.349050, 4.797649, 3.290293, 2.106541],
'x_err':[0.022882, 0.021447, 0.009402, 0.011324, 0.008872, 0.015882, 0.009615, 0.007617, 0.012816, 0.010310, 0.009213, 0.020137],
'y_err':[0.156298, 0.151681, 0.178215, 0.143700, 0.137071, 0.133951, 0.209588, 0.185246, 0.214665, 0.214598, 0.163624, 0.132138]})
with the following code:
# Grouped bars on twin y-axes: 'x' in red against the left axis and 'y' in
# blue against the right axis, each with its own error bars.
fig, ax = plt.subplots()
width = 0.35
# Options shared by both bar series.
bar_options = dict(kind='bar', width=width, use_index=True)

# position=0 and position=1 align the two bars on opposite sides of a tick.
df['x'].plot(ax=ax, position=0, yerr=df['x_err'], color='red', **bar_options)
ax.set_ylabel('X')

ax1 = ax.twinx()
df['y'].plot(ax=ax1, position=1, yerr=df['y_err'], color='blue', **bar_options)
ax1.set_ylabel('Y')

plt.show()
and got the following plot:
The plot is okay, except that the red bar in the last group, i.e. group 11 (shown by the arrow), is only half visible. I know that I can make it fully visible by reducing the width. The problem is that the bars then become thin, which I do not want. As you can see, there is still plenty of space between two successive groups; I want to reduce that gap and show all bars clearly.
Any help would be highly appreciated.
That's due to the fact that you modify the position of the bars with position. Try adjusting xlims:
# other plotting functions
# ...
# Extend the x-range on the right by one bar width so that the last bar,
# shifted by `position`, is fully inside the view.
x_min, x_max = ax1.get_xlim()
ax1.set_xlim(x_min, x_max + width)
plt.show()
Output:

How to grid plot 2D categorical data

I have data that looks like:
Name X Y
A HIGH MID
B LOW LOW
C MID LOW
D HIGH MID
How can I plot this data in a 2-D diagram with a 3x3 grid, adding a random variation to the position of each data point (labelled with its name) so that the points have enough spacing between each other?
So it should look somewhat like that:
I tried the following, but I don't know how to plot the values in between the grid lines rather than exactly on them, so that they do not overlap.
import pandas as pd
import matplotlib.pyplot as plt
### Mock Data ###
data = """A0,LOW,LOW
A,MID,MID
B,LOW,MID
C,MID,HIGH
D,LOW,MID
E,HIGH,HIGH"""
# One row per line of the mock data: name, X category, Y category.
df = pd.DataFrame([row.split(',') for row in data.split('\n')],
                  columns=['name', 'X', 'Y'])

### Plotting ###
fig, axs = plt.subplots()
axs.scatter(df['X'], df['Y'], label=df['name'])
axs.set_xlabel('X')
axs.set_ylabel('Y')
# Write each point's name at the point's own (categorical) coordinates.
for name, x_cat, y_cat in zip(df['name'], df['X'], df['Y']):
    axs.annotate(name, (x_cat, y_cat))
axs.grid()
axs.set_axisbelow(True)
fig.tight_layout()
plt.show()
resulting:
You can directly control the positions and change the labels on the axes. There are a few problems with your drawing because you are not taking into account some issues, such as "what label will you have if you have more than one point at the same location?".
In any case here is a possible solution:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
### Mock Data ###
data = """A0,LOW,LOW
A,MID,MID
B,LOW,MID
C,MID,HIGH
D,LOW,MID
E,HIGH,HIGH"""
# One row per line of the mock data: name, X category, Y category.
df = pd.DataFrame([row.split(',') for row in data.split('\n')],
                  columns=['name', 'X', 'Y'])

lbls = ["LOW", "MID", "HIGH"]
pos = list(range(len(lbls)))
trans = dict(zip(lbls, pos))  # category label -> numeric grid coordinate
offset = 0.05  # shift of the text relative to its marker

# Collect the names that fall into each grid cell. A dict of lists keeps
# names of any length, whereas a fixed-width "U10" array silently truncates
# the joined label after 10 characters.
cell_names = {}
for name, x_cat, y_cat in zip(df['name'], df['X'], df['Y']):
    cell_names.setdefault((trans[x_cat], trans[y_cat]), []).append(name)

fig, axs = plt.subplots()
# One marker per occupied cell.
axs.scatter([xc for xc, _ in cell_names], [yc for _, yc in cell_names])
# One label per occupied cell; names sharing a cell are joined with ";".
for (xc, yc), names in cell_names.items():
    axs.text(xc + offset, yc + offset, ";".join(names))

axs.set_xticks(pos)
axs.set_xticklabels(lbls)
axs.set_yticks(pos)
axs.set_yticklabels(lbls)
# Cell borders lie half-way between neighbouring category positions.
for p in pos:
    axs.axhline(p - 0.5, color="black")
    axs.axvline(p - 0.5, color="black")
axs.set_xlim(-0.5, len(pos) - 0.5)
axs.set_ylim(-0.5, len(pos) - 0.5)
plt.show()
This results in the following image:

Python: Changing visual parameters of ptitprince repo derived from seaborn and matplotlib

I am using a github repository called ptitprince, which is derived from seaborn and matplotlib, to generate graphs.
For example, this is the code using the ptitprince repo:
# coding: utf8
import pandas as pd
import ptitprince as pt
import seaborn as sns
import os
import matplotlib.pyplot as plt
#sns.set(style="darkgrid")
#sns.set(style="whitegrid")
#sns.set_style("white")
sns.set(style="whitegrid",font_scale=2)
import matplotlib.collections as clt
# Load the scores and draw a vertical raincloud plot per treatment.
df = pd.read_csv("u118phag.csv", sep=",")
df.head()
savefigs = True
figs_dir = 'figs'
if savefigs:
    # Make the figures folder if it doesn't yet exist
    os.makedirs(figs_dir, exist_ok=True)
#automation
f, ax = plt.subplots(figsize=(4, 5))
#f.subplots_adjust(hspace=0,wspace=0)
dx = "Treatment"; dy = "score"; ort = "v"; pal = "Set2"; sigma = .2
ax = pt.RainCloud(x=dx, y=dy, data=df, palette=pal, bw=sigma,
                  width_viol=.6, ax=ax, move=.2, offset=.1, orient=ort, pointplot=True)
f.show()
if savefigs:
    # Write into the folder created above instead of a hard-coded path.
    f.savefig(os.path.join(figs_dir, 'figure20.png'), bbox_inches='tight', dpi=500)
which generates the following graph
The raw code not using ptitprince is as follows and produces the same graph as above:
# coding: utf8
import pandas as pd
import ptitprince as pt
import seaborn as sns
import os
import matplotlib.pyplot as plt
#sns.set(style="darkgrid")
#sns.set(style="whitegrid")
#sns.set_style("white")
sns.set(style="whitegrid",font_scale=2)
import matplotlib.collections as clt
# Load the scores and build a horizontal raincloud plot layer by layer.
df = pd.read_csv("u118phag.csv", sep=",")
df.head()
savefigs = True
figs_dir = 'figs'
if savefigs:
    # Make the figures folder if it doesn't yet exist
    os.makedirs(figs_dir, exist_ok=True)
dy = "Treatment"; dx = "score"; ort = "h"
#adding color
pal = "Set2"
# Create the figure once: a second plt.subplots() call would only leave an
# empty extra figure behind.
f, ax = plt.subplots(figsize=(7, 5))
# Layer 1: half violin showing the distribution.
ax = pt.half_violinplot(x=dx, y=dy, data=df, palette=pal, bw=.2, cut=0.,
                        scale="area", width=.6, inner=None, orient=ort)
# Layer 2: jittered raw points, drawn with the lowest zorder.
ax = sns.stripplot(x=dx, y=dy, data=df, palette=pal, edgecolor="white",
                   size=3, jitter=1, zorder=0, orient=ort)
# Layer 3: unfilled box plot, drawn on top (zorder 10).
ax = sns.boxplot(x=dx, y=dy, data=df, color="black", width=.15, zorder=10,
                 showcaps=True, boxprops={'facecolor': 'none', "zorder": 10},
                 showfliers=True, whiskerprops={'linewidth': 2, "zorder": 10},
                 saturation=1, orient=ort)
if savefigs:
    # Write into the folder created above instead of a hard-coded path.
    f.savefig(os.path.join(figs_dir, 'figure21.png'), bbox_inches='tight', dpi=500)
Now, what I'm trying to do is to figure out how to modify the graph so that I can (1) move the plots closer together, so there is not so much white space between them, and (2) shift the x-axis to the right, so that I can make the distribution (violin) plot wider without it getting cut in half by the y-axis.
I have tried to play around with subplots_adjust() as you can see in the first box of code, but I receive an error. I cannot figure out how to appropriately use this function, or even if that will actually bring the different graphs closer together.
I also know that I can increase the distribution size by increasing this value width = .6, but if I increase it too high, the distribution plot begins to being cut off by the y-axis. I can't figure out if I need to adjust the overall plot using the plt.subplots,or if I need to move each individual plot.
Any advice or recommendations on how to change the visuals of the graph? I've been staring at this for awhile, and I can't figure out how to make seaborn/matplotlib play nicely with ptitprince.
You may try to change the interval of X-axis being shown using ax.set_xbound (put a lower value than you currently have for the beginning).

Add either a density or box plot to the margins of a plot in Matplotlib

I have a scatter plot in linear scale. I want to add a box plot to the margins (left and bottom) of my scatter plot like this figure from Marginal Histograms and Box Charts?
Update
Here is my current working solution, share your thoughts on it or make a better suggestion.
# Scatter of 'ecnt' against 'vcnt' as translucent black circles.
# `ax` and `df` are defined outside this fragment.
ax.plot(df['vcnt'], df['ecnt'], 'ko', alpha=0.5)
# Save the default tick positions, so we can reset them..
tcksx = ax.get_xticks()
tcksy = ax.get_yticks()
# Marginal box plots drawn in the same axes: the 'ecnt' box is positioned at
# the smallest x tick, the horizontal 'vcnt' box at the smallest y tick.
ax.boxplot(df['ecnt'], positions=[min(tcksx)], notch=True, widths=1.)
ax.boxplot(df['vcnt'], positions=[min(tcksy)], vert=False, notch=True, widths=1.)
# Put back the tick positions that were saved before the boxplot() calls.
ax.set_yticks(tcksy) # pos = tcksy
ax.set_xticks(tcksx) # pos = tcksx
# Tick labels are the saved positions converted with int().
ax.set_yticklabels([int(j) for j in tcksy])
ax.set_xticklabels([int(j) for j in tcksx])
# Lower limits sit one unit below the smallest saved tick
# (presumably to leave room for the boxes).
ax.set_ylim([min(tcksy-1),max(tcksy)])
ax.set_xlim([min(tcksx-1),max(tcksx)])
You can achieve this by creating additional axes for the box plots.
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
# Random sample data; y is built from -x plus noise.
x_data = np.random.randn(100)
y_data = -x_data + np.random.randn(100)*0.5
df = pd.DataFrame({'vcnt': x_data, 'ecnt': y_data})

# Figure-fraction coordinates of the main axes' edges.
left, bottom = 0.1, 0.1
right, top = 0.8, 0.8
main_width = right - left
main_height = top - bottom

main_ax = plt.axes([left, bottom, main_width, main_height])
# create axes to the top and right of the main axes and hide them
top_ax = plt.axes([left, top, main_width, 1 - top])
top_ax.axis('off')
right_ax = plt.axes([right, bottom, 1 - right, main_height])
right_ax.axis('off')

main_ax.plot(df['vcnt'], df['ecnt'], 'ko', alpha=0.5)
# Save the default tick positions, so we can reset them..
tcksx = main_ax.get_xticks()
tcksy = main_ax.get_yticks()

# Both marginal boxes share the same position, notch and width settings.
box_options = dict(positions=[0], notch=True, widths=1.)
right_ax.boxplot(df['ecnt'], **box_options)
top_ax.boxplot(df['vcnt'], vert=False, **box_options)

main_ax.set_yticks(tcksy)
main_ax.set_xticks(tcksx)
main_ax.set_yticklabels([int(tick) for tick in tcksy])
main_ax.set_xticklabels([int(tick) for tick in tcksx])
# Lower limits sit one unit below the smallest saved tick.
main_ax.set_ylim([min(tcksy - 1), max(tcksy)])
main_ax.set_xlim([min(tcksx - 1), max(tcksx)])

# set the limits to the box axes
top_ax.set_xlim(main_ax.get_xlim())
top_ax.set_ylim(-1, 1)
right_ax.set_ylim(main_ax.get_ylim())
right_ax.set_xlim(-1, 1)
plt.show()

Categories