Legends are not printed fully when multiple plots are drawn on the same figure - python

I have the code as below to plot multiple plots on the same figure
# Question code, attempt 1: draw several wavelength-ratio scatter plots on one shared Axes.
fig, ax = plt.subplots(figsize=(25, 10))
def wl_ratioplot(wavelength1,wavelength2, dataframe, x1=0.1,x2=1.5,y1=-500,y2=25000):
    """Scatter layer_thickness against the ratio wavelength1/wavelength2.

    Plots through seaborn, then sets the x/y limits and the legend on the
    module-level ``ax`` created above. Returns nothing.
    """
    # Work on a copy restricted to the columns listed here
    a=dataframe[['asphalt_index','layer_thickness',wavelength1,wavelength2]].copy()
    # No ax= is passed; presumably seaborn draws on the current Axes, i.e. ``ax`` -- TODO confirm
    sns.scatterplot(x=a[wavelength1]/a[wavelength2],y=a['layer_thickness'],data=a)
    ax.set_xlim(x1,x2)
    ax.set_ylim(y1,y2)
    leg = "{} vs {}".format(wavelength1,wavelength2)
    print(leg) #this line is only to see the variable legend has the proper content
    # NOTE: ``leg`` is a plain string here; the asker reports that the legend then
    # shows its first characters as separate entries.
    ax.legend(leg)
# Five ratios, all drawn on the same Axes with the same x-limits
wl_ratioplot(wave_lengths[2],wave_lengths[0],dataframe=train_df_wo_outliers,x1=-.1,x2=3)
wl_ratioplot(wave_lengths[0],wave_lengths[1],dataframe=train_df_wo_outliers,x1=-.1,x2=3)
wl_ratioplot(wave_lengths[3],wave_lengths[1],dataframe=train_df_wo_outliers,x1=-.1,x2=3)
wl_ratioplot(wave_lengths[3],wave_lengths[0],dataframe=train_df_wo_outliers,x1=-.1,x2=3)
wl_ratioplot(wave_lengths[2],wave_lengths[1],dataframe=train_df_wo_outliers,x1=-.1,x2=3)
I get the plot shown in the picture below, where the legend entries are the first 5 letters of the string shown separately, even though the variable `leg` has the right content.
There was another similar question, and the solution there was to wrap the legend variable in square brackets. I tried this with the code below.
# Question code, attempt 2: same as before, but the label is wrapped in a list.
fig, ax = plt.subplots(figsize=(25, 10))
def wl_ratioplot(wavelength1,wavelength2, dataframe, x1=0.1,x2=1.5,y1=-500,y2=25000):
    """Scatter layer_thickness against the ratio wavelength1/wavelength2.

    Plots through seaborn, then sets the x/y limits and the legend on the
    module-level ``ax`` created above. Returns nothing.
    """
    # Work on a copy restricted to the columns listed here
    a=dataframe[['asphalt_index','layer_thickness',wavelength1,wavelength2]].copy()
    # No ax= is passed; presumably seaborn draws on the current Axes, i.e. ``ax`` -- TODO confirm
    sns.scatterplot(x=a[wavelength1]/a[wavelength2],y=a['layer_thickness'],data=a)
    ax.set_xlim(x1,x2)
    ax.set_ylim(y1,y2)
    leg = "{} vs {}".format(wavelength1,wavelength2)
    print(leg)#this line is only to see the variable legend has the proper content
    # NOTE: the legend is rebuilt on every call from a one-item list; the asker
    # reports that only a single entry ends up being shown.
    ax.legend([leg])
# Five ratios, all drawn on the same Axes with the same x-limits
wl_ratioplot(wave_lengths[2],wave_lengths[0],dataframe=train_df_wo_outliers,x1=-.1,x2=3)
wl_ratioplot(wave_lengths[0],wave_lengths[1],dataframe=train_df_wo_outliers,x1=-.1,x2=3)
wl_ratioplot(wave_lengths[3],wave_lengths[1],dataframe=train_df_wo_outliers,x1=-.1,x2=3)
wl_ratioplot(wave_lengths[3],wave_lengths[0],dataframe=train_df_wo_outliers,x1=-.1,x2=3)
wl_ratioplot(wave_lengths[2],wave_lengths[1],dataframe=train_df_wo_outliers,x1=-.1,x2=3)
Now I get the full legend text, but only the first legend entry is shown, as in the picture below.
Can someone let me know how to get the full legend for all the plots? Thanks.
dummy data (the plot in pic will NOT match)
14nm 15nm 16nm 17nm 18nm 19nm layer_thickness
1 2 3 4 5 6 0
1 2 3 4 5 6 0
3 5 7 9 11 13 5700
1 2 3 4 5 6 0
3 5 7 9 11 13 8600
1 2 3 4 5 6 0
3 5 7 9 11 13 5000
1 2 3 4 5 6 0
45 55 65 75 85 95 100
1 2 3 4 5 6 0
8 15 22 29 36 43 16600
wave_lengths=['15nm','16nm','14nm','18nm']
Answer Update
Based on the answer from Quang Hoang, here are the output pictures using the scatter plot from matplotlib and sns.scatterplot.

With plt it is pretty natural:
def wl_ratioplot(wavelength1, wavelength2, dataframe,
                 x1=0.1, x2=1.5, y1=-500, y2=25000,
                 ax=None):
    """Scatter ``layer_thickness`` against the ratio of two wavelength columns.

    The series is labelled "<wavelength1> vs <wavelength2>" so that a single
    ``ax.legend()`` call after all plots lists every series.

    Args:
        wavelength1: Column name used as the numerator of the ratio.
        wavelength2: Column name used as the denominator of the ratio.
        dataframe: Frame holding both wavelength columns and 'layer_thickness'.
        x1, x2: Lower and upper x-axis limits.
        y1, y2: Lower and upper y-axis limits.
        ax: Axes to draw on. When omitted, the current pyplot Axes is used.
    """
    if ax is None:
        # The default used to crash on ``None.scatter``; fall back to the
        # current Axes. Imported lazily so callers that pass ``ax`` do not
        # need pyplot at all.
        import matplotlib.pyplot as plt
        ax = plt.gca()
    leg = f"{wavelength1} vs {wavelength2}"
    # Set the label here and let matplotlib build the legend from it;
    # there is no need to copy the dataframe.
    ax.scatter(x=dataframe[wavelength1] / dataframe[wavelength2],
               y=dataframe['layer_thickness'], label=leg)
    ax.set_xlim(x1, x2)
    ax.set_ylim(y1, y2)
# One shared Axes for every ratio, so all labelled series land in one legend
fig, ax = plt.subplots(figsize=(25, 10))
# (numerator index, denominator index) into wave_lengths, in plotting order
ratio_pairs = [(2, 0), (0, 1), (3, 1), (3, 0), (2, 1)]
for numerator, denominator in ratio_pairs:
    wl_ratioplot(wave_lengths[numerator], wave_lengths[denominator],
                 dataframe=df, x1=-.1, x2=3, ax=ax)
# Build the legend once, from the labels attached to each scatter call
ax.legend()
Output:

Every time you call the function wl_ratioplot, the legend is reset, so only one entry is ever shown. Store all the labels in a list, then draw the legend once at the end.
ax.legend([leg]) # this resets the legend on every call
Create a list before the calls: legends = []
Inside the function, append the label itself (not a nested list): legends.append(leg)
After all function calls, draw the legend once:
ax.legend(legends)

Related

Bar plot not appearing normally using df.plot.bar()

I have the following code. I am trying to loop through variables (dataframe columns) and create bar plots. I have attached below an example of a graph for the column newerdf['age'].
I believe this should produce 3 bars (one for each option - male (value = 1), female (value = 2), other(value = 3)).
However, the graph below does not seem to show this.
I would be so grateful for a helping hand as to where I am going wrong!
listedvariables = ['age','gender-quantised','hours_of_sleep','frequency_of_alarm_usage','nap_duration_mins','frequency_of_naps','takes_naps_yes/no','highest_education_level_acheived','hours_exercise_per_week_in_last_6_months','drink_alcohol_yes/no','drink_caffeine_yes/no','hours_exercise_per_week','hours_of_phone_use_per_week','video_game_phone/tablet_hours_per_week','video_game_all_devices_hours_per_week']
# Question code: one bar chart per listed column, drawn from the raw (un-aggregated) column values.
for i in range(0,len(listedvariables)):
    # The result is used as an Axes below (tick_params), despite being named ``fig``
    fig = newerdf[[listedvariables[i]]].plot.bar(figsize=(30,20))
    fig.tick_params(axis='x',labelsize=40)
    fig.tick_params(axis='y',labelsize=40)
    plt.tight_layout()
newerdf['age']
age
0 2
1 2
2 4
3 3
5 2
... ...
911 2
912 1
913 2
914 3
915 2
The data are not grouped into categories yet, so a value count is needed before calling the plotting method:
# Draw one bar chart per variable, showing how often each category occurs.
for var in listedvariables:
    # A dedicated figure per variable, so charts never pile up on a reused Axes
    fig, ax = plt.subplots(figsize=(30, 20))
    # value_counts() aggregates the raw rows into one bar per distinct value
    newerdf[var].value_counts().plot.bar(ax=ax)
    ax.tick_params(axis='x', labelsize=40)
    ax.tick_params(axis='y', labelsize=40)
    fig.tight_layout()
    plt.show()

Colour by Category in scatterplot

My dataframe looks like this:
date index count weekday_num max_temperature_C
0 2019-04-01 0 1379 0 18
1 2019-04-02 1 1395 1 21
2 2019-04-03 2 1155 2 19
3 2019-04-04 3 342 3 18
4 2019-04-05 4 216 4 14
I would like to plot count vs max_temperature_C and colour by weekday_num
I have tried the below:
#create the scatter plot of trips vs Temp
# Question code: a single scatter call, with the point colour taken from weekday_num.
plt.scatter(comb2['count'], comb2['max_temperature_C'], c=comb2['weekday_num'])
# Label the axis
plt.xlabel('Daily Trip count')
plt.ylabel('Max Temp c')
# Only one label is supplied, so the legend cannot list one entry per weekday colour
plt.legend(['weekday_num'])
# Show it!
plt.show()
However, I am not quite sure how to get the legend to display all of the colours, one for each value of 'weekday_num'.
Thanks
You can use the automated legend creation like this:
fig, ax = plt.subplots()
# Keep the returned collection: it knows which colour maps to which weekday_num
scatter = ax.scatter(comb2['count'], comb2['max_temperature_C'], c=comb2['weekday_num'])
# produce a legend with the unique colors from the scatter
legend = ax.legend(*scatter.legend_elements(),
                   loc="upper right", title="Weekday num")
ax.add_artist(legend)
plt.show()

pandas DataFrame.value_counts().plot.bar() and DataFrame.value_counts().cumsum().plot() not using the same axis

I am trying to draw a frequency bar plot and a cumulative "ogive" in the same plot. If I draw them separately, both are shown correctly, but when they are drawn in the same figure, the cumulative curve is shifted. The code used is below.
# Question code: frequency bars and a cumulative line, plotted by two separate calls.
df = pd.DataFrame({'Correctas': [4,6,5,4,7,2,8,3,5,6,9,6,6,7,5,5,8,10,4,8,3,6,9,5,11,5,12,7,7,5,4,6]});
# Bar chart of how often each value occurs
df['Correctas'].value_counts(sort = False).plot.bar();
# Running total of the same counts, drawn as a line
df['Correctas'].value_counts(sort = False).cumsum().plot();
plt.show()
The cumulative frequency data is
2 1
3 3
4 7
5 14
6 20
7 24
8 27
9 29
10 30
11 31
12 32
So the cumulative curve should start at 2 on the x-axis, but it starts at 4.
image showing the error
This happens because a bar chart is plotted against a categorical x-axis, while the line plot uses the numeric index values. Here is a quick fix:
df = pd.DataFrame({'Correctas': [4,6,5,4,7,2,8,3,5,6,9,6,6,7,5,5,8,10,4,8,3,6,9,5,11,5,12,7,7,5,4,6]})
# sort=False does not promise ascending values, so order explicitly; otherwise
# the running total below would be accumulated in the wrong order.
df_counts = df['Correctas'].value_counts(sort=False).sort_index()
# String labels make the line plot use the same categorical positions as the bars
df_counts.index = df_counts.index.astype(str)
ax = df_counts.plot.bar(alpha=.8)
# Draw the ogive on the very same Axes as the bars
df_counts.cumsum().plot(ax=ax, color='k', kind='line')
plt.show()
Output:

Plotting three dimensions of categorical data in Python

My data has three categorical variables I'm trying to visualize:
City (one of five)
Occupation (one of four)
Blood type (one of four)
So far, I've succeeded in grouping the data in a way that I think will be easy to work with:
# Question code: build 500 random (City, Occupation, Blood Type) rows and count each combination.
import numpy as np, pandas as pd
# Make data
cities = ['Tijuana','Las Vegas','Los Angeles','Anaheim','Atlantis']
occupations = ['Doctor','Lawyer','Engineer','Drone security officer']
bloodtypes = ['A','B','AB','O']
# Each column is drawn independently from its list of categories
df = pd.DataFrame({'City': np.random.choice(cities,500),
'Occupation': np.random.choice(occupations,500),
'Blood Type':np.random.choice(bloodtypes,500)})
# You need to make a dummy column, otherwise the groupby returns an empty df
df['Dummy'] = np.ones(500)
# This is now what I'd like to plot
# (level=1 moves Occupation into the columns, as shown in the printed table)
df.groupby(by=['City','Occupation','Blood Type']).count().unstack(level=1)
Returns:
Dummy
Occupation Doctor Drone security officer Engineer Lawyer
City Blood Type
Anaheim A 7 7 7 7
AB 6 10 8 5
B 2 10 4 2
O 4 3 3 6
Atlantis A 6 5 5 7
AB 12 7 7 10
B 7 4 7 3
O 7 4 6 4
Las Vegas A 8 4 8 5
AB 5 6 8 9
B 6 10 6 6
O 6 9 5 9
Los Angeles A 7 4 8 8
AB 9 8 8 8
B 3 6 4 1
O 9 11 11 9
Tijuana A 3 4 5 3
AB 9 5 5 7
B 3 6 4 9
O 3 5 5 8
My goal is to create something like the Seaborn swarmplot shown below, which comes from the Seaborn documentation. Seaborn applies jitter to the quantitative data so that you can see the individual data points and their hues:
With my data, I'd like to plot City on the x-axis and Occupation on the y-axis, applying jitter to each, and then hue by Blood type. However, sns.swarmplot requires one of the axes to be quantitative:
sns.swarmplot(data=df,x='City',y='Occupation',hue='Blood Type')
returns an error.
An acceptable alternative might be to create 20 categorical bar plots, one for each intersection of City and Occupation, which I would do by running a for loop over each category, but I can't imagine how I'd feed that to matplotlib subplots to get them in a 4x5 grid.
The most similar question I could find was in R, and the asker only wanted to indicate the most common value for the third variable, so I didn't get any good ideas from there.
Thanks for any help you can provide.
Alright, I got to work on the "acceptable alternative" today and I have found a solution using basically pure matplotlib (but I stuck the Seaborn styling on top of it, just because).
# Grid of bar charts: one row per city, one column per occupation,
# one bar per blood type inside every cell.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
import seaborn as sns

# Make data
cities = ['Tijuana','Las Vegas','Los Angeles','Anaheim','Atlantis']
occupations = ['Doctor','Lawyer','Engineer','Drone security officer']
bloodtypes = ['A','B','AB','O']
df = pd.DataFrame({'City': np.random.choice(cities,500),
                   'Occupation': np.random.choice(occupations,500),
                   'Blood Type':np.random.choice(bloodtypes,500)})
# Make a dummy column, otherwise the groupby returns an empty df
df['Dummy'] = np.ones(500)
# Count rows per combination; unstack() moves Blood Type into the columns,
# leaving (City, Occupation) as the row index.
grouped = df.groupby(by=['City','Occupation','Blood Type']).count().unstack()
# List of blood types, to use later as categories in subplots
kinds = grouped.columns.levels[1]
# colors for bar graph; plt.get_cmap is used because matplotlib.cm.get_cmap
# was removed in matplotlib 3.9
cmap = plt.get_cmap('viridis')
colors = [cmap(v) for v in np.linspace(0, 1, len(kinds))]
sns.set(context="talk")
city_names = grouped.index.levels[0]
occupation_names = grouped.index.levels[1]
nxplots = len(city_names)        # number of subplot rows
nyplots = len(occupation_names)  # number of subplot columns
# squeeze=False keeps ``axes`` two-dimensional even for a single row or column
fig, axes = plt.subplots(nxplots,
                         nyplots,
                         sharey=True,
                         sharex=True,
                         figsize=(10,12),
                         squeeze=False)
fig.suptitle('City, occupation, and blood type')
# plot the data
for row, city in enumerate(city_names):
    for col, occupation in enumerate(occupation_names):
        # The explicit ``, :`` makes this a row lookup on the (City, Occupation)
        # index rather than an ambiguous row/column pair.
        axes[row, col].bar(kinds, grouped.loc[(city, occupation), :], color=colors)
        axes[row, col].xaxis.set_ticks([])
# Invisible full-figure axes that only carries the shared axis titles
axeslabels = fig.add_subplot(111, frameon=False)
plt.tick_params(labelcolor='none', top=False, bottom=False, left=False, right=False)
plt.grid(False)
axeslabels.set_ylabel('City',rotation='horizontal',y=1,weight="bold")
axeslabels.set_xlabel('Occupation',weight="bold")
# x- and y-axis labels
for col, occupation in enumerate(occupation_names):
    # Label the bottom row. Indexing with -1 is correct for any grid shape;
    # indexing the rows with the column count only worked because 4 == 5 - 1.
    axes[-1, col].set_xlabel(occupation)
for row, city in enumerate(city_names):
    axes[row, 0].set_ylabel(city)
# Tune this manually to make room for the legend
fig.subplots_adjust(right=0.82)
fig.legend([Patch(facecolor=color) for color in colors],
           kinds,
           title="Blood type",
           loc="center right")
Returns this:
I'd appreciate any feedback, and I'd still love it if someone could provide the preferred solution.

Xtick frequency in pandas boxplot

I am using pandas groupby to plot wind speed vs. direction as a box-and-whisker plot. However, the x-axis is not readable because so many wind direction values are close to each other.
I have tried the oc_params ax.set_xticks, but I end up with either an empty x-axis or a modified x-axis showing different values.
The head of my dataframe
Kvit_TIU dir_cat
0 0.064740 14
1 0.057442 15
2 0.056750 15
3 0.069002 17
4 0.068464 17
5 0.067057 17
6 0.071901 12
7 0.050464 5
8 0.066165 1
9 0.073993 27
10 0.090784 34
11 0.121366 33
12 0.087172 34
13 0.066197 30
14 0.073020 17
15 0.071784 16
16 0.081699 17
17 0.088014 14
18 0.076758 14
19 0.078574 14
I used groupby = dir_cat to create a box plot
# Question code: box plot of Kvit_TIU grouped by wind-direction category.
fig = plt.figure() # create the canvas for plotting
ax1 = plt.subplot(1,1,1)
# ax1 is rebound to whatever boxplot returns; the subplot above is not passed in via ax=
ax1 = df_KvTr10hz.boxplot(column='Kvit_TIU', by='dir_cat', showfliers=False, showmeans=True)
# NOTE(review): presumably the boxes sit at positions 1..N rather than at the dir_cat
# values, so these tick locations may not line up with any box -- TODO confirm
ax1.set_xticks([30,90, 180,270, 330])
I would like the x-axis to be plotted with a reduced tick frequency, so that the plot is readable.
ax1 = df_KvTr10hz.dropna().boxplot(column='Kvit_TIU', by='dir_cat', showfliers=False, showmeans=True)
EDIT: Using OP sample dataframe
However, if we substitute with NaNs the Kvit_TIU values for 'dir_cat'>=30

Categories