I have a problem. I want to show the mean and median inside my plot, but unfortunately the legend gets overwritten (see the second code snippet). When I call plt.legend([..., ...]) instead, the output is not what I want. How can I show the mean and median as lines and plot a legend for both of them with the correct color and line style?
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from numpy import random
# Reproduction from the question: distribution plot of random distances with
# vertical marker lines for the mean and the median.
# Fixed sample data, kept for reference:
#d = {'distance_km': [1, 100, 12, 14],}
# 20 random integers drawn from [0, 100)
d = {'distance_km': random.randint(100, size=(20)),}
df_calculation = pd.DataFrame(data=d)
sns.set_style("white")
# Calculation of mean and median
mean=df_calculation["distance_km"].mean()
median = df_calculation["distance_km"].median()
plt.figure(figsize=(11,5))
ax = sns.distplot(df_calculation["distance_km"],color='lightblue')
# Neither marker line is given a label= of its own.
ax.axvline(mean, color='r', linestyle='--')
ax.axvline(median, color='green', linestyle='--')
# NOTE(review): legend() receives a list of two dicts rather than label
# strings, and the 'Median = ...' text is computed with .mean(). This is the
# call the question reports as producing the wrong output.
plt.legend([{f'Mean = {round(df_calculation["distance_km"].mean(), 2)}':mean}, {f'Median = {round(df_calculation["distance_km"].mean(), 2)}':median}])
plt.box(False)
plt.xlabel("\n Km", fontsize = 12)
plt.ylabel("", fontsize = 12)
plt.show()
Here it works, but the first legend is overwritten by the second one, and the color is not correct:
# Second attempt from the question: one legend() call after each marker line.
#plt.style.use('seaborn-whitegrid')
mean=df_calculation["distance_km"].mean()
median = df_calculation["distance_km"].median()
plt.figure(figsize=(11,5))
sns.set_style("white")
ax = sns.distplot(df_calculation["distance_km"],color='lightblue')
ax.axvline(mean, color='r', linestyle='--')
# legend() is given a one-entry dict here; presumably its key ends up as the
# label text -- verify against the Matplotlib legend documentation.
plt.legend({f'Mean = {round(df_calculation["distance_km"].mean(), 2)}':mean})
ax.axvline(median, color='green', linestyle='--')
# NOTE(review): per the question, this second legend() call overwrites the
# legend created above, so only the median entry remains.
plt.legend({f'Median = {round(df_calculation["distance_km"].median(), 2)}':median})
plt.box(False)
plt.xlabel("\n Kilometer", fontsize = 12)
plt.ylabel("", fontsize = 12)
plt.show()
Use the label parameter inside ax.axvline. E.g. in your second code block, instead of:
# Original approach: one legend() call per line, each given a dict instead of
# labelled artists.
ax.axvline(mean, color='r', linestyle='--')
plt.legend({f'Mean = {round(df_calculation["distance_km"].mean(), 2)}':mean})
ax.axvline(median, color='green', linestyle='--')
plt.legend({f'Median = {round(df_calculation["distance_km"].median(), 2)}':median})
write:
# Attach the legend text to each line through label=; a single legend() call
# then collects both entries, each with its own colour and line style.
# Both labels use the same "Name = value" form, rounded to two decimals.
ax.axvline(mean, color='r', linestyle='--', label=f'Mean = {mean:.2f}')
ax.axvline(median, color='green', linestyle='--', label=f'Median = {median:.2f}')
plt.legend()
Result:
Related
I would like to see the standard deviation (or cumulative %) along with the values in a normal distribution, something like this.
The following code produces values in X axis.
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
# 1000 random integers in [0, 10), sorted in place so the curve is drawn
# from left to right.
vals = np.random.randint(0, 10, 1000)
vals.sort()
# Normal density evaluated at every sample, parameterised by the sample mean
# and standard deviation.
mean = np.mean(vals)
sd = np.std(vals)
pdf = stats.norm.pdf(vals, mean, sd)
fig = plt.figure(figsize=(6, 5))
ax = fig.add_subplot(111)
ax.plot(vals, pdf, c='blue', marker='.')
ax.set_xlabel("vals", fontsize=10)
ax.set_ylabel("p", fontsize=10)
# Grid visibility is passed positionally: the `b=` keyword was renamed to
# `visible=` in Matplotlib 3.5 and later removed, while the positional form
# works on old and new versions alike.
ax.grid(True, which='major', color='0.55', linestyle='--')
ax.grid(True, which='minor', color='0.85', linestyle='--')
ax.minorticks_on()
plt.show()
You can do something like this:
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
# 1000 random integers in [0, 10), sorted in place so the curve is drawn
# from left to right.
vals = np.random.randint(0, 10, 1000)
vals.sort()
na = np.array(vals, dtype='float')  # float copy used for the fill mask below
mean = np.mean(vals)
sd = np.std(vals)
pdf = stats.norm.pdf(vals, mean, sd)

# Collect mean, mean -/+ 1*sd, mean -/+ 2*sd, ... in ascending order. Stop
# after the first step at which either side reaches the smallest or largest
# sample (vals is sorted, so those are vals[0] and vals[-1]).
sigma_marks = [mean]
k = 1
while True:
    left = mean - (k * sd)
    right = mean + (k * sd)
    sigma_marks.insert(0, left)
    sigma_marks.append(right)
    if left <= vals[0] or right >= vals[-1]:
        break
    k += 1

fig = plt.figure(figsize=(6, 5))
ax = fig.add_subplot(111)
ax.plot(vals, pdf, c='blue', marker='.')
ax.set_xlabel("vals", fontsize=10)
ax.set_ylabel("p", fontsize=10)
# Grid visibility is passed positionally: the `b=` keyword was renamed to
# `visible=` in Matplotlib 3.5 and later removed.
ax.grid(True, which='major', color='0.55', linestyle='--')
ax.grid(True, which='minor', color='0.85', linestyle='--')
ax.minorticks_on()


def g(x):
    """Return the cumulative probability of the fitted normal at x."""
    return stats.norm.cdf(x, loc=mean, scale=sd)


# Walk over neighbouring marks: b is the lower and a the upper edge of one
# sd-wide band.
for a, b in zip(sigma_marks[1:], sigma_marks):
    plt.axvline(a, color='red', label='z=ε')
    plt.axvline(b, color='red', label='z=ε')
    # NOTE(review): with a > b this mask is True for every sample, so the
    # whole area under the curve is shaded on each pass; kept as in the
    # original answer so the resulting picture does not change.
    plt.fill_between(vals, pdf, 0, where=((b <= na) | (na <= a)), color='gray')
    # Probability mass of the band, written near its lower edge.
    plt.text(a - sd, 0.05, "{:.2f}".format(g(a) - g(b)), fontsize=12, rotation=45)
plt.show()
feel free to edit the code as per your requirements.
Thanks @Pygirl for your initial idea of the Z score, which suits my purpose. Having said that, I highly appreciate your effort in modifying the script to show the cumulative %, cheers!
Here's what did work for me.
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
# Fixed seed so the sample (1000 integers in [0, 10)) is reproducible.
np.random.seed(42)
vals = np.random.randint(0, 10, 1000)
vals.sort()
mean = np.mean(vals)
sd = np.std(vals)
pdf = stats.norm.pdf(vals, mean, sd)
# z-score of every sample; plotted against the same pdf on a second x-axis.
zScore = stats.zscore(vals)
fig = plt.figure(figsize=(6, 5))
ax = fig.add_subplot(111)
ax.plot(vals, pdf, c='b', marker='.')
ax.set_xlabel("vals", fontsize=10)
ax.set_ylabel("p", fontsize=10)
# Grid visibility is passed positionally: the `b=` keyword was renamed to
# `visible=` in Matplotlib 3.5 and later removed.
ax.grid(True, which='major', color='0.5', linestyle='--')
# Twin axes sharing the y-axis, with an independent x-axis for the z-scores.
ax2 = ax.twiny()
ax2.plot(zScore, pdf, c='b', marker='.')
ax2.grid(True, which='major', color='r', linestyle='-')
ax2.set_xlabel("zScore", fontsize=10)
plt.show()
My plotting function produces subplots. Sometimes it receives regular data to plot in numerical order, and sometimes (case == 'special') the bars' x-labels should be different, uniquely ordered numerical values. The problem is that I cannot seem to get rid of the x-axis spaces between the bars. Any idea how to solve this? Thanks.
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
# 'regular' numbers the bars 1..n; any other value uses the hand-picked ids.
case = 'special'
data = np.random.rand(10)
if case == 'regular':
    xvec0 = np.linspace(1, len(data), len(data), dtype='uint8')
else:
    xvec0 = np.array([3, 4, 5, 9, 10, 11, 12, 13, 18, 20])
fig, axs = plt.subplots(1, 2, figsize=(15, 8))
fig.suptitle('Features Importance Summary', fontsize=16)
# Constant vectors so the median and mean can be drawn as horizontal lines.
med_vec = np.ones(len(xvec0)) * np.median(data)
avg_vec = np.ones(len(xvec0)) * np.mean(data)
# The bars are positioned at the values in xvec0 themselves, so the x-axis
# keeps a gap wherever those values skip numbers.
axs[0].bar(xvec0, data, edgecolor='k')
axs[0].plot(xvec0, med_vec, 'r--', label='Median')
axs[0].plot(xvec0, avg_vec, 'k--', label='Average')
axs[0].set_xlabel('Feature IDX', size=18)
axs[0].set_ylabel('Normalized feature importance', size=18)
axs[0].xaxis.set_major_locator(mticker.MultipleLocator(1))
axs[0].set_xticks(xvec0)
axs[0].grid(which='major', linestyle=':', linewidth='0.3', color='k')
axs[0].grid(which='minor', linestyle=':', linewidth='0.1', color='k')
axs[0].legend()
fig.show()
Thanks to Mr.T, here is working code in case anyone else encounters the same issue in the future:
# Draw the bars at consecutive slots 0..n-1 and show the values of xvec0 only
# as tick labels, so no empty space is left between bars.
# NOTE(review): `fi_wl` is not defined in this snippet; presumably it is the
# importance array called `data` in the question -- confirm.
fig, axs = plt.subplots(1, 2, figsize=(15, 8))
fig.suptitle('Features Importance Summary', fontsize=16)

n_bars = len(xvec0)
slots = range(n_bars)
left_ax = axs[0]

# Constant vectors for the horizontal median / average lines.
med_vec = np.ones(n_bars) * np.median(fi_wl)
avg_vec = np.ones(n_bars) * np.mean(fi_wl)

left_ax.bar(slots, fi_wl, edgecolor='k')
left_ax.plot(slots, med_vec, 'r--', label='Median')
left_ax.plot(slots, avg_vec, 'k--', label='Average')

left_ax.set_xlabel('Feature IDX', size=18)
left_ax.set_ylabel('Normalized feature importance', size=18)

# One tick per slot, relabelled with the original feature ids.
left_ax.xaxis.set_major_locator(mticker.MultipleLocator(1))
left_ax.set_xticks(slots)
left_ax.set_xticklabels(xvec0)

left_ax.grid(which='major', linestyle=':', linewidth='0.3', color='k')
left_ax.grid(which='minor', linestyle=':', linewidth='0.1', color='k')
left_ax.legend()
How can I create a distplot from a countplot?
plt.rcdefaults()
%config InlineBackend.figure_format='retina'
sns.set_style('darkgrid')
ax = sns.countplot(x='Age',hue='Gender',data=df,edgecolor="None")
ax.tick_params(bottom=False, left=False)
ax.set_axisbelow(True)
for rect in ax.patches:
x = rect.get_x() + rect.get_width()/2.
y = rect.get_height()
try:
ax.annotate("{}".format(int(y)), (x,y), ha='center', va='bottom', clip_on=True)
except:
pass
ax.set_xlabel('Age', color='green')
ax.set_ylabel('Count', color='green')
ax.set_title('Countplot for Age(Gender)', color='tomato',weight='bold')
plt.legend(title='Gender', fontsize='large', loc='upper right').get_frame().set_facecolor('white')
plt.tight_layout()
plt.savefig('files\\Countplot_for_Age(Gender).jpg')
I want a distplot for the 2 genders, either in the same plot or separately.
Any suggestions or help would be highly appreciated.
The x-axis of a countplot is categorical: it puts one bar for each encountered age, skipping bars when there are no rows for a certain age (21 and 23 in the example). Internally the bars are numbered as 0, 1, 2, ...
The y-axis is the count, which is proportional to the number of rows.
For a distplot, the x-axis are the ages themselves, and the y-axis is a probability distribution, which usually are quite small numbers (the area under the curve is normalized to be 1).
So, as both the x-axis and the y-axis are different, it is better to use separate subplots.
A distplot can be generated directly from the given data. Passing the same ax results in two distplots in the same subplot. A distplot is a combination of a histogram and a kdeplot. If the histogram isn't needed, hist=False leaves it out, or the kdeplot can be called directly. The shade=True option adds shading to the plot.
from matplotlib import pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
# Synthetic sample: NF female and NM male rows.
NF = 50
NM = 10
female_ages = np.random.randint(13, 20, NF) + np.random.randint(2, 7, NF)
male_ages = np.random.randint(15, 23, NM)
df = pd.DataFrame({
    'Age': np.concatenate([female_ages, male_ages]),
    'Gender': np.repeat(['female', 'male'], (NF, NM)),
})
# Move every 21 and 23 to 20 so that those two ages have no rows at all.
age_is_kept = (df['Age'] != 21) & (df['Age'] != 23)
df['Age'] = df['Age'].where(age_is_kept, 20)

sns.set_style('darkgrid')
fig, axs = plt.subplots(ncols=2, figsize=(12, 4))

# Left subplot: count per age, split by gender, with the count above each bar.
ax = sns.countplot(x='Age', hue='Gender', data=df, edgecolor="None", ax=axs[0])
ax.tick_params(bottom=False, left=False)
ax.set_axisbelow(True)
for bar in ax.patches:
    bar_height = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2.
    ax.annotate(f"{bar_height:.0f}", (bar_center, bar_height), ha='center', va='bottom', clip_on=True)
ax.set_xlabel('Age', color='green')
ax.set_ylabel('Count', color='green')
ax.set_title('Countplot for Age(Gender)', color='tomato', weight='bold')
count_legend = ax.legend(title='Gender', fontsize='large', loc='upper right')
count_legend.get_frame().set_facecolor('white')

# Right subplot: one shaded density curve per gender, drawn on the same axes.
for gender in ('female', 'male'):
    gender_ages = df[df['Gender'] == gender]['Age']
    # Alternative: sns.kdeplot(gender_ages, shade=True, ax=axs[1], label=gender)
    ax2 = sns.distplot(gender_ages, hist=False, kde_kws={'shade': True}, ax=axs[1], label=gender)
ax2.set_axisbelow(True)
ax2.set_xlabel('Age', color='green')
ax2.set_ylabel('probability distribution', color='green')
ax2.set_title('Distplot for Age(Gender)', color='tomato', weight='bold')
density_legend = ax2.legend(title='Gender', fontsize='large', loc='upper right')
density_legend.get_frame().set_facecolor('white')

plt.tight_layout()
plt.show()
This is my function to create box plot. But it does not show the outliers. any help?
def box_plot_draw(data, box_colors):
    """Draw a box plot of ``data`` on a new figure.

    Args:
        data: The samples to plot; passed unchanged to ``Axes.boxplot``.
        box_colors: Accepted for interface compatibility; not used by the
            current body.

    Returns:
        None. The figure is created through pyplot and is left for the
        caller to display (e.g. with ``plt.show()``).
    """
    #plt.style.use('Solarize_Light2')
    fig, ax = plt.subplots()#figsize=(10, 6))
    # Set the window title through the figure manager:
    # FigureCanvas.set_window_title was deprecated in Matplotlib 3.4 and
    # later removed, whereas the manager method exists in old and new versions.
    fig.canvas.manager.set_window_title('A Boxplot Example')
    fig.subplots_adjust(left=0.075, right=0.95, top=0.9, bottom=0.25)
    # Fliers are requested as '+' markers; whiskers span 1.5 * IQR.
    bp = ax.boxplot(data, notch=0, sym='+', vert=1, whis=1.5)
    plt.setp(bp['boxes'], color='black')
    plt.setp(bp['whiskers'], color='black')
    plt.setp(bp['fliers'], color='red', marker='+')
    # which='major' restricts the horizontal grid lines to the major ticks.
    ax.yaxis.grid(True, linestyle='-', which='major', color='lightgrey', alpha=0.5)
    ax.set_axisbelow(True)
    # Title and axis labels for the distribution comparison.
    ax.set_title('Comparison of the duration of time for two Distributions')
    ax.set_xlabel('Distribution')
    ax.set_ylabel('Value')
You can test the code using this data.
# Reproducible demo input: two normal samples of 200 values each, drawn in
# the order listed in sample_specs.
np.random.seed(10)
sample_specs = ((100, 10), (80, 30))  # (mean, standard deviation) per sample
data_to_plot = [np.random.normal(center, spread, 200) for center, spread in sample_specs]
box_colors = ['black', 'green']
box_plot_draw(data_to_plot, box_colors)
plt.show()
I am using the Jupyter notebook and I used styles for other functions. But I did not use any style for this function.
I'm currently trying to animate a scatterplot of monthly data saved in a pandas dataframe. So far I have made a loop which generates one single plot after another. Now I would like to join them into a single gif (or mp4, I don't care). Is there an easy way to make use of matplotlib's animation function? I can't get my head around how to loop sliced data through FuncAnimation. So far I did this:
# One scatter plot per month: every unique value of df.monat gets its own
# figure, shown one after another.
time = df.monat.unique()
for month in time:
    # Rows of the current month whose xcol is below 4000.
    in_month = (df.monat == month) & (df.xcol < 4000)
    dft = df[in_month]
    plt.scatter(
        x=dft['xcol'],
        y=dft['ycol'],
        s=dft['scol'] / 25,
        c=dft['clr'],
        linewidth=0,
        alpha=0.8,
    )
    plt.title('Title ' + str(month), fontsize=10)
    plt.xlabel('x label', fontsize=9)
    plt.ylabel('y label', fontsize=9)
    # One round proxy marker per colour in clrdict, used as legend handles.
    legend1_line2d = [
        mlines.Line2D(
            [0], [0],
            linestyle='none',
            marker='o',
            alpha=0.6,
            markersize=6,
            markeredgecolor=None,
            markeredgewidth=0,
            markerfacecolor=val,
        )
        for val in clrdict.values()
    ]
    legend1 = plt.legend(
        legend1_line2d,
        names,
        frameon=False,
        numpoints=1,
        fontsize=8,
        loc='upper right',
    )
    plt.show()
I figured it out by myself:
Generate an empty plot (fig). As before, all unique time values are stored in a series (time). A simple counter (i) is used inside the update function to select the correct slice of the data (dft) for each month (df.monat == the i-th value of the series 'time'). FuncAnimation calls the update function once per frame, i.e. as many times as given by the frames parameter (frames=len(time)).
Hope, this will be helpful for somebody else (most of the explanations for the matplotlib FuncAnimation I've found worked with random numbers - not with specific pandas columns):
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.lines as mlines
import matplotlib.animation as anim
...
time = df.monat.unique()
fig = plt.figure()

# The legend handles do not depend on the frame, so they are built once:
# one round proxy marker per colour in clrdict.
legend1_line2d = [
    mlines.Line2D([0], [0],
                  linestyle='none',
                  marker='o',
                  alpha=0.6,
                  markersize=6,
                  markeredgecolor=None,
                  markeredgewidth=0,
                  markerfacecolor=val)
    for val in clrdict.values()
]


def update(i):
    """Redraw the figure for the i-th month in ``time``.

    FuncAnimation calls this once per frame and supplies the frame index
    ``i`` (0 .. frames-1) itself, so no counter is kept or incremented here.
    """
    plt.clf()
    dft = df[(df.monat == time[i]) & (df.xcol < 4000)]
    plt.scatter(x=dft['xcol'],
                y=dft['ycol'],
                s=dft['scol'] / 25,
                c=dft['clr'],
                linewidth=0,
                alpha=0.8)
    plt.title('Title ' + str(time[i]), fontsize=10)
    plt.xlabel('x label', fontsize=9)
    plt.ylabel('y label', fontsize=9)
    plt.xlim(0, 900)  # fixed dimensions x
    plt.ylim(-5, 100)  # fixed dimensions y
    # clf() removed the previous legend, so it is re-created on every frame.
    plt.legend(legend1_line2d,
               names,
               frameon=False,
               numpoints=1,
               fontsize=8,
               loc='upper right')


ani = anim.FuncAnimation(fig, update, frames=len(time), interval=500)
# plt.show() # this will show the ani over and over
ani.save("test.mp4", dpi=200, fps=1, codec="libx264", bitrate=5000, extra_args=['-pix_fmt', 'yuv420p'])