Line chart in matplotlib with a double axis(strings on the axis) - python

I am trying to create a chart using Python from data in an Excel sheet. The data looks like this:
Location Values
Trial 1 Edge 12
M-2 13
Center 14
M-4 15
M-5 12
Top 13
Trial 2 Edge 10
N-2 11
Center 11
N-4 12
N-5 13
Top 14
Trial 3 Edge 15
R-2 13
Center 12
R-4 11
R-5 10
Top 3
I want my graph to look like this:
Chart-1
The chart should have the Location column values (string objects) on the x-axis. This can be done easily (by using/creating Location as an array):
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Path of the workbook that holds the Location/Values table.
datalink=('/Users/Maxwell/Desktop/W1.xlsx')
# Load the sheet, skipping its first two rows.
df=pd.read_excel(datalink,skiprows=2)
# Single-column frames: x1 holds the labels (not used below), x2 the values to plot.
x1=df.loc[:,['Location']]
x2=df.loc[:,['Values']]
# Evenly spaced tick positions 1..len(x2), one per data row.
x3=np.linspace(1,len(x2),num=len(x2),endpoint=True)
# Hand-typed tick labels (15 entries, the first being the header text 'Location').
vals=['Location','Edge','M-2','Center','M-4','M-5','Top','Edge','N-2','Center','N-4','N-5','Top','Edge','R-2']
plt.figure(figsize=(12,8),dpi=300)
plt.subplot(1,1,1)
# Put the string labels at the numeric positions, then draw the values against those positions.
plt.xticks(x3,vals)
plt.plot(x3,x2)
plt.show()
But I also want to show Trial-1, Trial-2, ... on the x-axis. Up to now I have been using Excel to generate the chart, but I have a lot of similar data and want to use Python to automate the task.

With your excel sheet that has data as follows,
,
you can use matplotlib to create the plot you wanted. It is not straightforward but can be done. See below:
EDIT: earlier I suggested factorplot, but it is not applicable because your location values for each trial are not constant.
# Read the three data columns (D:F) and name them explicitly; the header sits on
# the second row of the sheet (header=1).
# `usecols` is the current keyword for selecting columns; the older `parse_cols`
# spelling was deprecated and later removed from pandas.
df = pd.read_excel(r'test_data.xlsx', header=1, usecols="D:F",
                   names=['Trial', 'Location', 'Values'])
'''
Trial Location Values
0 Trial 1 Edge 12
1 NaN M-2 13
2 NaN Center 14
3 NaN M-4 15
4 NaN M-5 12
5 NaN Top 13
6 Trial 2 Edge 10
7 NaN N-2 11
8 NaN Center 11
9 NaN N-4 12
10 NaN N-5 13
11 NaN Top 14
12 Trial 3 Edge 15
13 NaN R-2 13
14 NaN Center 12
15 NaN R-4 11
16 NaN R-5 10
17 NaN Top 3
'''
# this will replace the NaN with the corresponding trial number for each set of trials
# (DataFrame.ffill() is the supported spelling; fillna(method='ffill') is deprecated)
df = df.ffill()
'''
Trial Location Values
0 Trial 1 Edge 12
1 Trial 1 M-2 13
2 Trial 1 Center 14
3 Trial 1 M-4 15
4 Trial 1 M-5 12
5 Trial 1 Top 13
6 Trial 2 Edge 10
7 Trial 2 N-2 11
8 Trial 2 Center 11
9 Trial 2 N-4 12
10 Trial 2 N-5 13
11 Trial 2 Top 14
12 Trial 3 Edge 15
13 Trial 3 R-2 13
14 Trial 3 Center 12
15 Trial 3 R-4 11
16 Trial 3 R-5 10
17 Trial 3 Top 3
'''
from matplotlib import rcParams
from matplotlib import pyplot as plt
import matplotlib.ticker as ticker
rcParams.update({'font.size': 10})

# A single figure/axes pair is enough; a separate plt.figure() call would only
# leave an extra, empty figure behind.
f, ax1 = plt.subplots(1, figsize=(10, 3))
ax1.plot(list(df.Location.index), df['Values'], 'o-')
ax1.set_xticks(list(df.Location.index))
ax1.set_xticklabels(df.Location, rotation=90)
ax1.yaxis.set_label_text("Values")

# create a secondary axis
ax2 = ax1.twiny()

# hide all the spines that we don't need
for side in ('top', 'bottom', 'right', 'left'):
    ax2.spines[side].set_visible(False)

pos1 = ax2.get_position()  # get the original position
# create a new position by offsetting it downwards
pos2 = [pos1.x0 + 0, pos1.y0 - 0.2, pos1.width, pos1.height]
ax2.xaxis.set_ticks_position('bottom')
ax2.set_position(pos2)  # set a new position

# Number of rows per trial, kept in order of first appearance so that the
# cumulative sums below follow the order of the data (value_counts() would
# order by frequency, which misplaces the ticks when trials differ in size).
trial_counts = df.groupby('Trial', sort=False).size()

# group boundaries as fractions of the axis width (ax2 spans 0..1)
trials_ticks = 1.0 * trial_counts.cumsum() / len(df.Trial)
# add an additional zero; this makes a tick at zero
trials_ticks_positions = [0] + list(trials_ticks)
# offset for the tick label: we want each label halfway between two ticks
trials_labels_offset = 0.5 * trial_counts / len(df.Trial)
# position of the tick labels
trials_label_positions = trials_ticks - trials_labels_offset

# set the ticks and tick labels: major ticks mark the group boundaries (no text),
# minor ticks carry the trial names at the group centres
ax2.set_xticks(trials_ticks_positions)
ax2.xaxis.set_major_formatter(ticker.NullFormatter())
# the locator must receive the label positions (the name `trials` is not defined anywhere)
ax2.xaxis.set_minor_locator(ticker.FixedLocator(list(trials_label_positions)))
ax2.xaxis.set_minor_formatter(ticker.FixedFormatter(list(trials_label_positions.index)))
ax2.tick_params(axis='x', length=10, width=1)
plt.show()
results in

Related

legends not print fully when multiple plots are plotted on same figure

I have the code as below to plot multiple plots on the same figure
# Shared figure and axes; wl_ratioplot below uses this module-level `ax`.
fig, ax = plt.subplots(figsize=(25, 10))
# Scatter layer_thickness against the wavelength1/wavelength2 ratio, set the
# axis limits to (x1, x2) / (y1, y2) and set a legend on the shared axes.
def wl_ratioplot(wavelength1,wavelength2, dataframe, x1=0.1,x2=1.5,y1=-500,y2=25000):
# Work on a copy restricted to the columns listed here.
a=dataframe[['asphalt_index','layer_thickness',wavelength1,wavelength2]].copy()
# No ax= is passed, so seaborn picks the axes itself.
sns.scatterplot(x=a[wavelength1]/a[wavelength2],y=a['layer_thickness'],data=a)
ax.set_xlim(x1,x2)
ax.set_ylim(y1,y2)
leg = "{} vs {}".format(wavelength1,wavelength2)
print(leg) #this line is only to see the variable legend has the proper content
# NOTE: `leg` is passed as a bare string here; this is the call that produces
# the one-character-per-entry legend described in the question.
ax.legend(leg)
# Five ratio plots drawn into the same axes; each call sets the legend again.
wl_ratioplot(wave_lengths[2],wave_lengths[0],dataframe=train_df_wo_outliers,x1=-.1,x2=3)
wl_ratioplot(wave_lengths[0],wave_lengths[1],dataframe=train_df_wo_outliers,x1=-.1,x2=3)
wl_ratioplot(wave_lengths[3],wave_lengths[1],dataframe=train_df_wo_outliers,x1=-.1,x2=3)
wl_ratioplot(wave_lengths[3],wave_lengths[0],dataframe=train_df_wo_outliers,x1=-.1,x2=3)
wl_ratioplot(wave_lengths[2],wave_lengths[1],dataframe=train_df_wo_outliers,x1=-.1,x2=3)
I get the plot shown in the picture below, where the legend entries are the first 5 characters of the label, each shown separately, even though the variable leg has the right content.
There was another similar question & the solution was to put a square bracket to the variable legend. I tried this with the code as below.
# Shared figure and axes; wl_ratioplot below uses this module-level `ax`.
fig, ax = plt.subplots(figsize=(25, 10))
# Same plotting helper as in the first attempt; only the legend call differs.
def wl_ratioplot(wavelength1,wavelength2, dataframe, x1=0.1,x2=1.5,y1=-500,y2=25000):
# Work on a copy restricted to the columns listed here.
a=dataframe[['asphalt_index','layer_thickness',wavelength1,wavelength2]].copy()
sns.scatterplot(x=a[wavelength1]/a[wavelength2],y=a['layer_thickness'],data=a)
ax.set_xlim(x1,x2)
ax.set_ylim(y1,y2)
leg = "{} vs {}".format(wavelength1,wavelength2)
print(leg)#this line is only to see the variable legend has the proper content
# NOTE: the label is now wrapped in a one-element list, so the text is shown in
# full, but every call passes a single label only - the legend ends up with one entry.
ax.legend([leg])
wl_ratioplot(wave_lengths[2],wave_lengths[0],dataframe=train_df_wo_outliers,x1=-.1,x2=3)
wl_ratioplot(wave_lengths[0],wave_lengths[1],dataframe=train_df_wo_outliers,x1=-.1,x2=3)
wl_ratioplot(wave_lengths[3],wave_lengths[1],dataframe=train_df_wo_outliers,x1=-.1,x2=3)
wl_ratioplot(wave_lengths[3],wave_lengths[0],dataframe=train_df_wo_outliers,x1=-.1,x2=3)
wl_ratioplot(wave_lengths[2],wave_lengths[1],dataframe=train_df_wo_outliers,x1=-.1,x2=3)
Now I get the full legend but only the first legend is shown as the pic below
Can someone let me know how to get the full legend for all the plots? Thanks.
dummy data (the plot in pic will NOT match)
14nm 15nm 16nm 17nm 18nm 19nm layer_thickness
1 2 3 4 5 6 0
1 2 3 4 5 6 0
3 5 7 9 11 13 5700
1 2 3 4 5 6 0
3 5 7 9 11 13 8600
1 2 3 4 5 6 0
3 5 7 9 11 13 5000
1 2 3 4 5 6 0
45 55 65 75 85 95 100
1 2 3 4 5 6 0
8 15 22 29 36 43 16600
wave_lengths=['15nm','16nm','14nm','18nm']
Answer Update
Based on answer from Quang Hoang. The output pics using scatter plot from matplotlib & sns.scatterplot
With plt it is pretty natural:
def wl_ratioplot(wavelength1, wavelength2, dataframe,
                 x1=0.1, x2=1.5, y1=-500, y2=25000,
                 ax=None):
    """Scatter ``layer_thickness`` against the ratio of two wavelength columns.

    Args:
        wavelength1: Column name used as the numerator of the ratio.
        wavelength2: Column name used as the denominator of the ratio.
        dataframe: Table containing both wavelength columns and a
            ``'layer_thickness'`` column.
        x1, x2: Lower and upper x-axis limits applied after plotting.
        y1, y2: Lower and upper y-axis limits applied after plotting.
        ax: Axes to draw on. When omitted, the current pyplot axes are used.

    The series is labelled ``"<wavelength1> vs <wavelength2>"``; call
    ``ax.legend()`` once after all series are drawn to show the labels.
    """
    if ax is None:
        # Fall back to the current axes instead of failing on ``None.scatter``.
        import matplotlib.pyplot as plt
        ax = plt.gca()

    leg = "{} vs {}".format(wavelength1, wavelength2)
    # set the label here, and let matplotlib deal with it;
    # the dataframe is only read, so no copy is needed
    ratio = dataframe[wavelength1] / dataframe[wavelength2]
    ax.scatter(x=ratio, y=dataframe['layer_thickness'], label=leg)
    ax.set_xlim(x1, x2)
    ax.set_ylim(y1, y2)
fig, ax = plt.subplots(figsize=(25, 10))

# (numerator, denominator) positions in wave_lengths, in plotting order
ratio_pairs = [(2, 0), (0, 1), (3, 1), (3, 0), (2, 1)]
for num, den in ratio_pairs:
    wl_ratioplot(wave_lengths[num], wave_lengths[den],
                 dataframe=df, x1=-.1, x2=3, ax=ax)

# every series carries its own label, so a single bare legend() call is enough
ax.legend()
Output:
Every time you call the function wl_ratioplot, the legend is reset to the latest value. Use a list to store all the legend labels, then draw the legend once from that list after all the calls.
ax.legend([leg]) #it is resetting the legend after each call.
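Create an empty list before the function calls (legends = []) and, inside the function, replace the ax.legend call with: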
legends.append(leg)  # append the label string itself, not a one-element list, so ax.legend(legends) receives a flat list of labels
After all the function calls, draw the legend once from the collected labels:
ax.legend(legends)

panda DataFrame.value_counts().plot().bar() and DataFrame.value_counts().cumsum().plot() not using the same axis

I am trying to draw a frequency bar plot and a cumulative "ogive" in the same plot. If I draw them separately, both are shown correctly, but when they are shown in the same figure, the cumulative line is shifted. Below is the code used.
# Sample scores; 'Correctas' is the only column.
df = pd.DataFrame({'Correctas': [4,6,5,4,7,2,8,3,5,6,9,6,6,7,5,5,8,10,4,8,3,6,9,5,11,5,12,7,7,5,4,6]});
# Bar chart of how often each score occurs (sort=False: not ordered by frequency).
df['Correctas'].value_counts(sort = False).plot.bar();
# Running total of the same counts, drawn as a line.
df['Correctas'].value_counts(sort = False).cumsum().plot();
plt.show()
The frequency data is
2 1
3 3
4 7
5 14
6 20
7 24
8 27
9 29
10 30
11 31
12 32
So the cumulative line should start at 2, but it starts at 4 on the x-axis.
image showing the error
This has to do with the bar chart plotting on a categorical x-axis. Here is a quick fix:
df = pd.DataFrame({'Correctas': [4,6,5,4,7,2,8,3,5,6,9,6,6,7,5,5,8,10,4,8,3,6,9,5,11,5,12,7,7,5,4,6]})
# Count each score, then order by score: sort=False only disables ordering by
# frequency, it does not order the index, and the cumulative sum needs ascending scores.
df_counts = df['Correctas'].value_counts(sort=False).sort_index()
# String labels make the bar chart and the line share the same categorical positions.
df_counts.index = df_counts.index.astype('str')
df_counts.plot.bar(alpha=.8)
df_counts.cumsum().plot(color='k', kind='line')
plt.show()
Output:

Plotting three dimensions of categorical data in Python

My data has three categorical variables I'm trying to visualize:
City (one of five)
Occupation (one of four)
Blood type (one of four)
So far, I've succeeded in grouping the data in a way that I think will be easy to work with:
import numpy as np, pandas as pd
# Make data
# The three categorical variables and their possible values.
cities = ['Tijuana','Las Vegas','Los Angeles','Anaheim','Atlantis']
occupations = ['Doctor','Lawyer','Engineer','Drone security officer']
bloodtypes = ['A','B','AB','O']
# 500 rows, each column drawn at random from its list of values.
df = pd.DataFrame({'City': np.random.choice(cities,500),
'Occupation': np.random.choice(occupations,500),
'Blood Type':np.random.choice(bloodtypes,500)})
# You need to make a dummy column, otherwise the groupby returns an empty df
df['Dummy'] = np.ones(500)
# This is now what I'd like to plot
# (count rows per City/Occupation/Blood Type, then move Occupation into the columns)
df.groupby(by=['City','Occupation','Blood Type']).count().unstack(level=1)
Returns:
Dummy
Occupation Doctor Drone security officer Engineer Lawyer
City Blood Type
Anaheim A 7 7 7 7
AB 6 10 8 5
B 2 10 4 2
O 4 3 3 6
Atlantis A 6 5 5 7
AB 12 7 7 10
B 7 4 7 3
O 7 4 6 4
Las Vegas A 8 4 8 5
AB 5 6 8 9
B 6 10 6 6
O 6 9 5 9
Los Angeles A 7 4 8 8
AB 9 8 8 8
B 3 6 4 1
O 9 11 11 9
Tijuana A 3 4 5 3
AB 9 5 5 7
B 3 6 4 9
O 3 5 5 8
My goal is to create something like the Seaborn swarmplot shown below, which comes from the Seaborn documentation. Seaborn applies jitter to the quantitative data so that you can see the individual data points and their hues:
With my data, I'd like to plot City on the x-axis and Occupation on the y-axis, applying jitter to each, and then hue by Blood type. However, sns.swarmplot requires one of the axes to be quantitative:
sns.swarmplot(data=df,x='City',y='Occupation',hue='Blood Type')
returns an error.
An acceptable alternative might be to create 20 categorical bar plots, one for each intersection of City and Occupation, which I would do by running a for loop over each category, but I can't imagine how I'd feed that to matplotlib subplots to get them in a 4x5 grid.
The most similar question I could find was in R, and the asker only wanted to indicate the most common value for the third variable, so I didn't get any good ideas from there.
Thanks for any help you can provide.
Alright, I got to work on the "acceptable alternative" today and I have found a solution using basically pure matplotlib (but I stuck the Seaborn styling on top of it, just because).
import numpy as np, pandas as pd
import matplotlib.pyplot as plt
from matplotlib.cm import get_cmap
from matplotlib.patches import Patch
import seaborn as sns
# Make data
cities = ['Tijuana','Las Vegas','Los Angeles','Anaheim','Atlantis']
occupations = ['Doctor','Lawyer','Engineer','Drone security officer']
bloodtypes = ['A','B','AB','O']
df = pd.DataFrame({'City': np.random.choice(cities, 500),
                   'Occupation': np.random.choice(occupations, 500),
                   'Blood Type': np.random.choice(bloodtypes, 500)})

# Make a dummy column, otherwise the groupby returns an empty df
df['Dummy'] = np.ones(500)

# This is now what I'd like to plot:
# rows are (City, Occupation) pairs, columns are ('Dummy', Blood Type)
grouped = df.groupby(by=['City','Occupation','Blood Type']).count().unstack()

# List of blood types, to use later as categories in subplots
kinds = grouped.columns.levels[1]

# colors for bar graph, one per blood type
# (plt.get_cmap is used because matplotlib.cm.get_cmap is gone from recent matplotlib)
cmap = plt.get_cmap('viridis')
colors = [cmap(v) for v in np.linspace(0, 1, len(kinds))]

sns.set(context="talk")

# grid shape: one row of subplots per city, one column per occupation
city_levels = grouped.index.levels[0]
occupation_levels = grouped.index.levels[1]
nxplots = len(city_levels)        # number of subplot rows
nyplots = len(occupation_levels)  # number of subplot columns
fig, axes = plt.subplots(nxplots,
                         nyplots,
                         sharey=True,
                         sharex=True,
                         figsize=(10,12))

fig.suptitle('City, occupation, and blood type')

# plot the data
for row, city in enumerate(city_levels):
    for col, occupation in enumerate(occupation_levels):
        # a tuple key selects the (City, Occupation) row unambiguously
        axes[row, col].bar(kinds, grouped.loc[(city, occupation)], color=colors)
        axes[row, col].xaxis.set_ticks([])

# invisible full-figure axes that only carries the two overall axis titles
axeslabels = fig.add_subplot(111, frameon=False)
plt.tick_params(labelcolor='none', top=False, bottom=False, left=False, right=False)
plt.grid(False)
axeslabels.set_ylabel('City', rotation='horizontal', y=1, weight="bold")
axeslabels.set_xlabel('Occupation', weight="bold")

# x- and y-axis labels
# occupation names go under the bottom row; index it with -1, not with the
# column count (which only hits the last row when rows == columns + 1)
for col, occupation in enumerate(occupation_levels):
    axes[-1, col].set_xlabel(occupation)
for row, city in enumerate(city_levels):
    axes[row, 0].set_ylabel(city)

# Tune this manually to make room for the legend
fig.subplots_adjust(right=0.82)
fig.legend([Patch(facecolor=c) for c in colors],
           list(kinds),
           title="Blood type",
           loc="center right")
Returns this:
I'd appreciate any feedback, and I'd still love it if someone could provide the preferred solution.

Xtick frequency in pandas boxplot

I am using pandas groupby to plot wind speed vs. direction as a box-and-whisker plot. However, the x-axis is not readable because so many wind direction values are close to each other.
I have tried the oc_params ax.set_xticks but instead I am having empty x-axis or modified xaxis with different values
The head of my dataframe
Kvit_TIU dir_cat
0 0.064740 14
1 0.057442 15
2 0.056750 15
3 0.069002 17
4 0.068464 17
5 0.067057 17
6 0.071901 12
7 0.050464 5
8 0.066165 1
9 0.073993 27
10 0.090784 34
11 0.121366 33
12 0.087172 34
13 0.066197 30
14 0.073020 17
15 0.071784 16
16 0.081699 17
17 0.088014 14
18 0.076758 14
19 0.078574 14
I used groupby = dir_cat to create a box plot
fig = plt.figure() # create the canvas for plotting
ax1 = plt.subplot(1,1,1)
# ax1 is rebound to whatever DataFrame.boxplot returns; no ax= is passed to boxplot.
ax1 = df_KvTr10hz.boxplot(column='Kvit_TIU', by='dir_cat', showfliers=False, showmeans=True)
# Ask for ticks at these five x positions only.
ax1.set_xticks([30,90, 180,270, 330])
I would like to have the x-axis plotted with a reduced frequency. So that the plot can be readable
ax1 = df_KvTr10hz.dropna().boxplot(column='Kvit_TIU', by='dir_cat', showfliers=False, showmeans=True)
EDIT: Using OP sample dataframe
However, if we substitute with NaNs the Kvit_TIU values for 'dir_cat'>=30

pandas indexing error with seaborn

I have a pandas dataframe that I am trying to create a tsplot with seaborn and I am getting index duplicate errors. My question is two fold. First off, when I look at the data there are no duplicate indexes (sample df is given below):
BOLD campaign time type
0 2.735430 6041148 3 Default
1 1.356943 6041148 3 None
2 NaN 6041148 3 Vertical
3 9.550452 6013866 6 Default
4 1.000000 6013866 6 None
5 NaN 6013866 6 Vertical
6 322.675089 6086810 8 Default
7 1.508849 6086810 8 None
8 773.393385 6086810 8 Vertical
9 43.396084 6046619 10 Default
10 26.124405 6046619 10 None
11 NaN 6046619 10 Vertical
12 103.955111 6065909 10 Default
13 1.000000 6065909 10 None
14 NaN 6065909 10 Vertical
15 9.744664 6013866 9 Default
16 9.031970 6013866 9 None
17 NaN 6013866 9 Vertical
18 10.980322 6065742 8 Default
19 0.803821 6065742 8 None
However when I go to plot this dataframe with
sns.tsplot(df, time="time", unit="campaign", condition="type", value="BOLD")
I get
ValueError: Index contains duplicate entries, cannot reshape
Can someone explain why this is? I do not see any duplicate entries (or indices). I also tried using drop_duplicates and get the same result

Categories