Related
[UPDATE: Sorry for not providing the part where the author of the code creates the example data. I have updated the code.]
I found an example of a 3D mesh line chart that does what I need (the colouring changes with the level on the z dimension). However, instead of lines, I want a surface plot. How can I change the code to get a 3D surface plot?
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import animation, rc
from matplotlib.cm import get_cmap
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.font_manager import FontProperties
from matplotlib.collections import LineCollection
from matplotlib.colors import ListedColormap
from mpl_toolkits.mplot3d.art3d import Line3DCollection
# Simulate 783 rows of normally distributed returns for 9 series and prepend
# a row of 100s, which sets the first price row to 101 (1 + 100) once the
# cumulative product is taken.
index_returns = np.random.normal(loc=1e-4, scale=5e-3, size=(783, 9))
index_returns = np.vstack((np.zeros(shape=(1, 9)) + 100, index_returns))
index_prices = np.cumprod(1 + index_returns, axis=0)

# Rolling return over `window` rows: price[i] / price[i - window] - 1 for
# every i >= window, computed in one vectorized step instead of a row loop.
window = 261
df = index_prices[window:] / index_prices[:-window] - 1

# Label the rows with business days and the columns with single letters.
index = pd.date_range('2019-01-01', periods=index_prices.shape[0]-window, freq='B')
columns = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I']
df = pd.DataFrame(df, index=index, columns=columns)
# create the figure
fig = plt.figure(figsize=(14.4, 9))
ax = fig.add_subplot(111, projection='3d')
fig.patch.set_alpha(1)

# get the cmap to use
cmap = get_cmap('RdYlGn')

# get the slice based on data frame
current_slice = df.values[:261, :]
index_names = df.columns
index_dates = df.index

# list holding the lines
lines = []

# Create a continuous norm to map from data points to colors; it is the same
# for every line, so it is built once outside the loop
norm = plt.Normalize(-0.19, 0.19)

# the x coordinates (row positions) are shared by every line as well
x = np.array(np.arange(current_slice.shape[0]))

# for each index...
for i in range(current_slice.shape[1]):
    # get the coordinates: constant y = column position, z = column values
    y = np.tile(i, current_slice.shape[0])
    z = np.array(current_slice[:, i])

    # create points and segments to color
    points = np.array([x, y, z]).T.reshape(-1, 1, 3)
    segments = np.concatenate([points[:-1], points[1:]], axis=1)

    lc = Line3DCollection(segments, cmap=cmap, norm=norm, zorder=current_slice.shape[1]-i)

    # Set the values used for colormapping
    lc.set_array(z)
    lc.set_linewidth(2)
    # explicit color taken from the last value of the line; z * 2.5 + 0.5
    # rescales [-0.2, 0.2] onto the colormap's [0, 1] input range
    lc.set_color(cmap(z[-1] * 2.5 + 0.5))
    lc.set_label(index_names[i])
    lines.append(ax.add_collection(lc))

# add the grids
ax.legend(loc='center right', bbox_to_anchor=(1.1, 0.46), fancybox=True, facecolor=(.95,.95,.95,1), framealpha=1, shadow=False, frameon=True, ncol=1, columnspacing=0, prop={'family': 'DejaVu Sans Mono'})
ax.set_zlabel('Rolling Equity 1Y', labelpad=10)
ax.set_zlim(-0.39, 0.39)
ax.set_zticklabels([' '* 3 + '{:.0%}'.format(val) for val in ax.get_zticks()], fontdict={'verticalalignment': 'center', 'horizontalalignment': 'center'})
ax.set_xlabel('Date', labelpad=30)
ax.set_xlim(0, current_slice.shape[0]-1)
ax.set_xticklabels([index_dates[int(val)].strftime('%m/%y') for val in ax.get_xticks()[:-1]] + [''], rotation=0, fontdict={'verticalalignment': 'top', 'horizontalalignment': 'center'})
ax.set_yticks(np.arange(current_slice.shape[1]))
ax.set_yticklabels([index_names[i] for i in range(current_slice.shape[1])], rotation=-15, fontdict={'verticalalignment': 'center', 'horizontalalignment': 'left'})

# show the plot
plt.show()
This question already has answers here:
Color by Column Values in Matplotlib
(6 answers)
Closed 1 year ago.
I am trying to make a simple scatter plot in pyplot using a Pandas DataFrame object, and I want an efficient way of plotting two variables while having the symbols dictated by a third column (key). I have tried various ways using df.groupby, but without success. A sample df script is below. This colours the markers according to 'key1', but I'd like to see a legend with the 'key1' categories. Am I close? Thanks.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Ten rows of three N(10, 1) columns, indexed by a monthly date range.
values = np.random.normal(10, 1, 30).reshape(10, 3)
dates = pd.date_range('2010-01-01', freq='M', periods=10)
df = pd.DataFrame(values, index=dates, columns=('one', 'two', 'three'))
# Integer category key used to color the markers.
df['key1'] = (4, 4, 4, 6, 6, 6, 8, 8, 8, 8)

fig1 = plt.figure(1)
ax1 = fig1.add_subplot(111)
ax1.scatter(df['one'], df['two'], marker='o', c=df['key1'], alpha=0.8)
plt.show()
You can use scatter for this, but that requires having numerical values for your key1, and you won't have a legend, as you noticed.
It's better to just use plot for discrete categories like this. For example:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
np.random.seed(1974)

# Generate Data
num = 20
x, y = np.random.random((2, num))
labels = np.random.choice(['a', 'b', 'c'], num)
df = pd.DataFrame(dict(x=x, y=y, label=labels))

groups = df.groupby('label')

# Plot
fig, ax = plt.subplots()
ax.margins(0.05) # Optional, just adds 5% padding to the autoscaling
# One plot call per label, each carrying its own legend entry.
for name, group in groups:
    ax.plot(group.x, group.y, marker='o', linestyle='', ms=12, label=name)
ax.legend()

plt.show()
If you'd like things to look like the default pandas style, then just update the rcParams with the pandas stylesheet and use its color generator. (I'm also tweaking the legend slightly):
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
np.random.seed(1974)

# Generate Data
num = 20
x, y = np.random.random((2, num))
labels = np.random.choice(['a', 'b', 'c'], num)
df = pd.DataFrame(dict(x=x, y=y, label=labels))

groups = df.groupby('label')

# Plot
# pandas no longer ships pd.tools.plotting (its stylesheet and color helper),
# and Axes.set_color_cycle is gone from matplotlib. matplotlib's built-in
# 'ggplot' style sheet is the replacement and brings its own color cycle.
plt.style.use('ggplot')
fig, ax = plt.subplots()
ax.margins(0.05)
for name, group in groups:
    ax.plot(group.x, group.y, marker='o', linestyle='', ms=12, label=name)
ax.legend(numpoints=1, loc='upper left')

plt.show()
This is simple to do with Seaborn (pip install seaborn) as a oneliner
# scatterplot takes x= / y= (x_vars / y_vars belong to pairplot).
sns.scatterplot(x="one", y="two", data=df, hue="key1")
:
import seaborn as sns
import pandas as pd
import numpy as np
np.random.seed(1974)

# Ten rows of three N(10, 1) columns on a monthly date index.
samples = np.random.normal(10, 1, 30).reshape(10, 3)
months = pd.date_range('2010-01-01', freq='M', periods=10)
df = pd.DataFrame(samples, index=months, columns=('one', 'two', 'three'))
df['key1'] = (4, 4, 4, 6, 6, 6, 8, 8, 8, 8)

# hue= colors the points by the key1 column.
sns.scatterplot(x="one", y="two", data=df, hue="key1")
Here is the dataframe for reference:
Since you have three variable columns in your data, you may want to plot all pairwise dimensions with:
# Grid of pairwise plots over the three value columns, colored by key1.
sns.pairplot(vars=["one","two","three"], data=df, hue="key1")
https://rasbt.github.io/mlxtend/user_guide/plotting/category_scatter/ is another option.
With plt.scatter, I can only think of one: to use a proxy artist:
# Line2D was used without being imported; bring it in for the proxy artists.
from matplotlib.lines import Line2D

df = pd.DataFrame(np.random.normal(10,1,30).reshape(10,3), index = pd.date_range('2010-01-01', freq = 'M', periods = 10), columns = ('one', 'two', 'three'))
df['key1'] = (4,4,4,6,6,6,8,8,8,8)
fig1 = plt.figure(1)
ax1 = fig1.add_subplot(111)
x=ax1.scatter(df['one'], df['two'], marker = 'o', c = df['key1'], alpha = 0.8)
ccm=x.get_cmap()
# One proxy marker per key value. (key - 4) / 4 rescales the keys 4, 6, 8 to
# 0, 0.5, 1 before they are passed to the colormap.
key_colors = ccm((np.array([4,6,8])-4.0)/4)
circles=[Line2D(range(1), range(1), color='w', marker='o', markersize=10, markerfacecolor=item) for item in key_colors]
leg = plt.legend(circles, ['4','6','8'], loc = "center left", bbox_to_anchor = (1, 0.5), numpoints = 1)
And the result is:
You can use df.plot.scatter, and pass an array to c= argument defining the color of each point:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
df = pd.DataFrame(
    np.random.normal(10, 1, 30).reshape(10, 3),
    index=pd.date_range('2010-01-01', freq='M', periods=10),
    columns=('one', 'two', 'three'),
)
df['key1'] = (4, 4, 4, 6, 6, 6, 8, 8, 8, 8)

# Per-point color codes: 4 -> 'r', 6 -> 'g', 8 -> 'b', anything else -> '-'.
key = df["key1"].to_numpy()
colors = np.select([key == 4, key == 6, key == 8], ['r', 'g', 'b'], default='-')
print(colors)

df.plot.scatter(x="one", y="two", c=colors)
plt.show()
From matplotlib 3.1 onwards you can use .legend_elements(). An example is shown in Automated legend creation. The advantage is that a single scatter call can be used.
In this case:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
values = np.random.normal(10, 1, 30).reshape(10, 3)
df = pd.DataFrame(values,
                  index=pd.date_range('2010-01-01', freq='M', periods=10),
                  columns=('one', 'two', 'three'))
df['key1'] = (4, 4, 4, 6, 6, 6, 8, 8, 8, 8)

fig, ax = plt.subplots()
sc = ax.scatter(df['one'], df['two'], marker='o', c=df['key1'], alpha=0.8)
# legend_elements() yields the handles and labels that the legend needs.
handles, labels = sc.legend_elements()
ax.legend(handles, labels)
plt.show()
In case the keys were not directly given as numbers, it would look as
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
values = np.random.normal(10, 1, 30).reshape(10, 3)
df = pd.DataFrame(values,
                  index=pd.date_range('2010-01-01', freq='M', periods=10),
                  columns=('one', 'two', 'three'))
df['key1'] = list("AAABBBCCCC")

# Encode the string keys as integer codes; `labels` keeps the unique keys.
labels, index = np.unique(df["key1"], return_inverse=True)

fig, ax = plt.subplots()
sc = ax.scatter(df['one'], df['two'], marker='o', c=index, alpha=0.8)
# Keep the generated handles but label them with the original key strings.
handles, _ = sc.legend_elements()
ax.legend(handles, labels)
plt.show()
You can also try Altair or ggplot, which are focused on declarative visualisations.
import numpy as np
import pandas as pd
np.random.seed(1974)

# Generate Data: `num` random (x, y) points, each tagged with a random label.
num = 20
coords = np.random.random((2, num))
x, y = coords[0], coords[1]
labels = np.random.choice(['a', 'b', 'c'], num)
df = pd.DataFrame({'x': x, 'y': y, 'label': labels})
Altair code
from altair import Chart
# Bind the frame to a chart, then map columns to the x, y and color channels.
c = Chart(df)
c.mark_circle().encode(
    x='x',
    y='y',
    color='label',
)
ggplot code
from ggplot import *
# Parenthesized layer chain instead of backslash continuations.
(
    ggplot(aes(x='x', y='y', color='label'), data=df)
    + geom_point(size=50)
    + theme_bw()
)
It's rather hacky, but you could use the 'one' column as a Float64Index to do everything in one go:
# Index by 'one' and sort it, then plot 'two' once per key1 group.
by_one = df.set_index('one').sort_index()
by_one.groupby('key1')['two'].plot(style='--o', legend=True)
Note that as of 0.20.3, sorting the index is necessary, and the legend is a bit wonky.
seaborn has a wrapper function scatterplot that does it more efficiently.
# The grouping column goes in hue=; the original repeated data= and had a stray ']'.
sns.scatterplot(data=df, x='one', y='two', hue='key1')
I am preparing box plots with a whisker interval of [2,98]. The issue is that I am working with air quality data and have a large range of data points, so the outliers take up the entire figure and overshadow the boxplots. I would like to plot the max and min outliers only and have tried the method from Matplotlib boxplot show only max and min fliers, however, I get an error message that says TypeError: 'AxesSubplot' object is not subscriptable.
Here is my code:
fig,ax = plt.subplots(1, figsize=(8,6))
g = sns.boxplot(data=mda8, orient='v', width = 0.7, whis = (2,98))
# NOTE: this is the line that raises the reported TypeError, because `g` is
# an Axes object and cannot be indexed. It is left as posted since the
# question is about this error.
fliers = g['fliers']
for fly in fliers:
    fdata=fly.get_data
    fly.set_data([fdata[0][0],fdata[0][-1],fdata[1][0],fdata[1][-1]])
xvalues = ['Niland', 'El Centro', 'Calexico']
plt.xticks(np.arange(3), xvalues, fontsize=12)
ax.set_ylabel('Ozone MDA8 (ppb)',fontsize=15)
ax.set_ylim(0,105)
plt.show()
Here's some sample data:
# Sample MDA8 readings: three columns of ten values each.
_t1 = [35.0, 32.125, 32.0, 35.25, 28.875, 28.5, 29.375, 25.125, 34.166667, 35.25]
_t2 = [28.375, 30.75, 33.25, 34.0, 32.875, 30.25, 29.875, 100.409, 29.625, 1.232]
_t3 = [34.25, 102.232, 28.25, 33.0, 27.625, 21.5, 28.375, 30.25, 3.454, 33.75]
mda8 = pd.DataFrame({'T1': _t1, 'T2': _t2, 'T3': _t3})
I need help with plotting the max and min outliers only and am open to doing another method besides the one that I tried here.
EDIT here's the link to my csv file https://drive.google.com/file/d/1E3A0UAYCbSN53JXtfsbrA4i_Phci_JWf/view?usp=sharing
A possible approach could be:
hide the outliers plotted by seaborn.boxplot by passing showfliers = False parameter:
# Same boxplot as in the question, with showfliers=False added.
sns.boxplot(data=mda8, orient='v', width = 0.7, whis = (2,98), showfliers = False)
get the list of outliers for each column, find maximum and minimum and plot only them:
# Fliers per column, computed with the same (2, 98) percentile whiskers the
# boxplot uses; boxplot_stats would otherwise fall back to its own default.
outliers = {col: list(boxplot_stats(mda8[col], whis=(2, 98))[0]['fliers']) for col in mda8.columns}
# Keep only the smallest and largest flier of each column (empty if none).
min_max_outliers = {key: [np.min(value), np.max(value)] if value else [] for key, value in outliers.items()}
# Column position i is the x coordinate of that column's box.
for i, value in enumerate(min_max_outliers.values()):
    if value:
        ax.scatter([i, i], value, marker = 'd', facecolor = 'black')
Complete Code
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from matplotlib.cbook import boxplot_stats
mda8 = pd.DataFrame({'T1': [35.000000, 32.125000, 32.000000, 35.250000, 28.875000, 28.500000, 29.375000, 25.125000, 34.166667, 35.250000],
                     'T2': [28.375, 30.750, 33.250, 34.000, 32.875, 30.250, 29.875, 100.409, 29.625, 1.232],
                     'T3': [34.250, 102.232, 28.250, 33.000, 27.625, 21.500, 28.375, 30.250, 3.454, 33.750]})

fig,ax = plt.subplots(1, figsize=(8,6))

# Draw the boxes without seaborn's own outlier markers.
sns.boxplot(data=mda8, orient='v', width = 0.7, whis = (2,98), showfliers = False)

# Fliers per column, computed with the same (2, 98) percentile whiskers the
# boxplot uses; boxplot_stats would otherwise fall back to its own default.
outliers = {col: list(boxplot_stats(mda8[col], whis=(2, 98))[0]['fliers']) for col in mda8.columns}
# Keep only the smallest and largest flier of each column (empty if none).
min_max_outliers = {key: [np.min(value), np.max(value)] if value else [] for key, value in outliers.items()}

# Column position i is the x coordinate of that column's box.
for i, value in enumerate(min_max_outliers.values()):
    if value:
        ax.scatter([i, i], value, marker = 'd', facecolor = 'black')

xvalues = ['Niland', 'El Centro', 'Calexico']
plt.xticks(np.arange(3), xvalues, fontsize=12)
ax.set_ylabel('Ozone MDA8 (ppb)',fontsize=15)
ax.set_ylim(0,105)

plt.show()
EDIT
Working on the data you provided, if I plot them as they are:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Load the CSV and drop its three date columns in one chained expression.
mda8 = pd.read_csv(r'data/MDA8_allregions.csv').drop(columns=['date', 'date.1', 'date.2'])

fig, ax = plt.subplots(1, figsize=(8, 6))
# showfliers=True: seaborn draws the outliers itself.
sns.boxplot(data=mda8, orient='v', width=0.7, whis=(2, 98), showfliers=True)
plt.show()
I get:
In the code above I change the parameter showfliers = False, in order to hide outliers.
Then, as suggested by JohanC in the comment, a simpler way to plot outliers is to plot min and max for each column:
# Mark each column's minimum and maximum at that column's x position.
for i, col in enumerate(mda8.columns):
    ax.scatter([i, i], [mda8[col].min(), mda8[col].max()], marker = 'd', facecolor = 'black')
Complete Code
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
mda8 = pd.read_csv(r'data/MDA8_allregions.csv')
mda8 = mda8.drop(['date', 'date.1', 'date.2'], axis = 1)

fig, ax = plt.subplots(1, figsize = (8, 6))

# Draw the boxes without seaborn's own outlier markers.
sns.boxplot(data = mda8, orient = 'v', width = 0.7, whis = (2, 98), showfliers = False)

# Mark each column's minimum and maximum at that column's x position.
for i, col in enumerate(mda8.columns):
    ax.scatter([i, i], [mda8[col].min(), mda8[col].max()], marker = 'd', facecolor = 'black')

plt.show()
I'm wondering how can I do the following:
I have a DataFrame with points and classes. I'd like to draw all points and use one color for each class. How can I specify how classes refer to colors in the legend?
fig = plt.figure(figsize=(18, 10), dpi=1600)
# Two point coordinates plus the class of each point.
df = pd.DataFrame({
    'points1': data_plot[:, 0],
    'points2': data_plot[:, 1],
    'target': target[0:2000],
})
# Class id -> marker color.
colors = {
    1: 'green', 2: 'red', 3: 'blue', 4: 'yellow', 5: 'orange', 6: 'pink',
    7: 'brown', 8: 'black', 9: 'white',
}
fig, ax = plt.subplots()
# Look up the color of every point from its class.
point_colors = df['target'].apply(lambda x: colors[x])
ax.scatter(df['points1'], df['points2'], c=point_colors)
The easiest way to get your legend to have separate entries for each color (and therefore its target value) is to create a separate plot object for each target value.
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
x = np.random.rand(100)
y = np.random.rand(100)
# randint's upper bound is exclusive, so the targets drawn here are 1..8.
target = np.random.randint(1,9, size=100)

df = pd.DataFrame(dict(points1=x, points2=y, target=target))
colors = {1: 'green', 2:'red', 3:'blue', 4:'yellow', 5:'orange', 6:'pink',
          7:'brown', 8:'black', 9:'white'}

fig, ax = plt.subplots()

# One scatter call per target value so each gets its own legend entry.
for k,v in colors.items():
    series = df[df['target'] == k]
    scat = ax.scatter(series['points1'], series['points2'], c=v, label=k)

plt.legend()
This question already has answers here:
Color by Column Values in Matplotlib
(6 answers)
Closed 1 year ago.
I am trying to make a simple scatter plot in pyplot using a Pandas DataFrame object, and I want an efficient way of plotting two variables while having the symbols dictated by a third column (key). I have tried various ways using df.groupby, but without success. A sample df script is below. This colours the markers according to 'key1', but I'd like to see a legend with the 'key1' categories. Am I close? Thanks.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Ten rows of three N(10, 1) columns, indexed by a monthly date range.
values = np.random.normal(10, 1, 30).reshape(10, 3)
dates = pd.date_range('2010-01-01', freq='M', periods=10)
df = pd.DataFrame(values, index=dates, columns=('one', 'two', 'three'))
# Integer category key used to color the markers.
df['key1'] = (4, 4, 4, 6, 6, 6, 8, 8, 8, 8)

fig1 = plt.figure(1)
ax1 = fig1.add_subplot(111)
ax1.scatter(df['one'], df['two'], marker='o', c=df['key1'], alpha=0.8)
plt.show()
You can use scatter for this, but that requires having numerical values for your key1, and you won't have a legend, as you noticed.
It's better to just use plot for discrete categories like this. For example:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
np.random.seed(1974)

# Generate Data
num = 20
x, y = np.random.random((2, num))
labels = np.random.choice(['a', 'b', 'c'], num)
df = pd.DataFrame(dict(x=x, y=y, label=labels))

groups = df.groupby('label')

# Plot
fig, ax = plt.subplots()
ax.margins(0.05) # Optional, just adds 5% padding to the autoscaling
# One plot call per label, each carrying its own legend entry.
for name, group in groups:
    ax.plot(group.x, group.y, marker='o', linestyle='', ms=12, label=name)
ax.legend()

plt.show()
If you'd like things to look like the default pandas style, then just update the rcParams with the pandas stylesheet and use its color generator. (I'm also tweaking the legend slightly):
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
np.random.seed(1974)

# Generate Data
num = 20
x, y = np.random.random((2, num))
labels = np.random.choice(['a', 'b', 'c'], num)
df = pd.DataFrame(dict(x=x, y=y, label=labels))

groups = df.groupby('label')

# Plot
# pandas no longer ships pd.tools.plotting (its stylesheet and color helper),
# and Axes.set_color_cycle is gone from matplotlib. matplotlib's built-in
# 'ggplot' style sheet is the replacement and brings its own color cycle.
plt.style.use('ggplot')
fig, ax = plt.subplots()
ax.margins(0.05)
for name, group in groups:
    ax.plot(group.x, group.y, marker='o', linestyle='', ms=12, label=name)
ax.legend(numpoints=1, loc='upper left')

plt.show()
This is simple to do with Seaborn (pip install seaborn) as a oneliner
# scatterplot takes x= / y= (x_vars / y_vars belong to pairplot).
sns.scatterplot(x="one", y="two", data=df, hue="key1")
:
import seaborn as sns
import pandas as pd
import numpy as np
np.random.seed(1974)

# Ten rows of three N(10, 1) columns on a monthly date index.
samples = np.random.normal(10, 1, 30).reshape(10, 3)
months = pd.date_range('2010-01-01', freq='M', periods=10)
df = pd.DataFrame(samples, index=months, columns=('one', 'two', 'three'))
df['key1'] = (4, 4, 4, 6, 6, 6, 8, 8, 8, 8)

# hue= colors the points by the key1 column.
sns.scatterplot(x="one", y="two", data=df, hue="key1")
Here is the dataframe for reference:
Since you have three variable columns in your data, you may want to plot all pairwise dimensions with:
# Grid of pairwise plots over the three value columns, colored by key1.
sns.pairplot(vars=["one","two","three"], data=df, hue="key1")
https://rasbt.github.io/mlxtend/user_guide/plotting/category_scatter/ is another option.
With plt.scatter, I can only think of one: to use a proxy artist:
# Line2D was used without being imported; bring it in for the proxy artists.
from matplotlib.lines import Line2D

df = pd.DataFrame(np.random.normal(10,1,30).reshape(10,3), index = pd.date_range('2010-01-01', freq = 'M', periods = 10), columns = ('one', 'two', 'three'))
df['key1'] = (4,4,4,6,6,6,8,8,8,8)
fig1 = plt.figure(1)
ax1 = fig1.add_subplot(111)
x=ax1.scatter(df['one'], df['two'], marker = 'o', c = df['key1'], alpha = 0.8)
ccm=x.get_cmap()
# One proxy marker per key value. (key - 4) / 4 rescales the keys 4, 6, 8 to
# 0, 0.5, 1 before they are passed to the colormap.
key_colors = ccm((np.array([4,6,8])-4.0)/4)
circles=[Line2D(range(1), range(1), color='w', marker='o', markersize=10, markerfacecolor=item) for item in key_colors]
leg = plt.legend(circles, ['4','6','8'], loc = "center left", bbox_to_anchor = (1, 0.5), numpoints = 1)
And the result is:
You can use df.plot.scatter, and pass an array to c= argument defining the color of each point:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
df = pd.DataFrame(
    np.random.normal(10, 1, 30).reshape(10, 3),
    index=pd.date_range('2010-01-01', freq='M', periods=10),
    columns=('one', 'two', 'three'),
)
df['key1'] = (4, 4, 4, 6, 6, 6, 8, 8, 8, 8)

# Per-point color codes: 4 -> 'r', 6 -> 'g', 8 -> 'b', anything else -> '-'.
key = df["key1"].to_numpy()
colors = np.select([key == 4, key == 6, key == 8], ['r', 'g', 'b'], default='-')
print(colors)

df.plot.scatter(x="one", y="two", c=colors)
plt.show()
From matplotlib 3.1 onwards you can use .legend_elements(). An example is shown in Automated legend creation. The advantage is that a single scatter call can be used.
In this case:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
values = np.random.normal(10, 1, 30).reshape(10, 3)
df = pd.DataFrame(values,
                  index=pd.date_range('2010-01-01', freq='M', periods=10),
                  columns=('one', 'two', 'three'))
df['key1'] = (4, 4, 4, 6, 6, 6, 8, 8, 8, 8)

fig, ax = plt.subplots()
sc = ax.scatter(df['one'], df['two'], marker='o', c=df['key1'], alpha=0.8)
# legend_elements() yields the handles and labels that the legend needs.
handles, labels = sc.legend_elements()
ax.legend(handles, labels)
plt.show()
In case the keys were not directly given as numbers, it would look as
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
values = np.random.normal(10, 1, 30).reshape(10, 3)
df = pd.DataFrame(values,
                  index=pd.date_range('2010-01-01', freq='M', periods=10),
                  columns=('one', 'two', 'three'))
df['key1'] = list("AAABBBCCCC")

# Encode the string keys as integer codes; `labels` keeps the unique keys.
labels, index = np.unique(df["key1"], return_inverse=True)

fig, ax = plt.subplots()
sc = ax.scatter(df['one'], df['two'], marker='o', c=index, alpha=0.8)
# Keep the generated handles but label them with the original key strings.
handles, _ = sc.legend_elements()
ax.legend(handles, labels)
plt.show()
You can also try Altair or ggplot, which are focused on declarative visualisations.
import numpy as np
import pandas as pd
np.random.seed(1974)

# Generate Data: `num` random (x, y) points, each tagged with a random label.
num = 20
coords = np.random.random((2, num))
x, y = coords[0], coords[1]
labels = np.random.choice(['a', 'b', 'c'], num)
df = pd.DataFrame({'x': x, 'y': y, 'label': labels})
Altair code
from altair import Chart
# Bind the frame to a chart, then map columns to the x, y and color channels.
c = Chart(df)
c.mark_circle().encode(
    x='x',
    y='y',
    color='label',
)
ggplot code
from ggplot import *
# Parenthesized layer chain instead of backslash continuations.
(
    ggplot(aes(x='x', y='y', color='label'), data=df)
    + geom_point(size=50)
    + theme_bw()
)
It's rather hacky, but you could use the 'one' column as a Float64Index to do everything in one go:
# Index by 'one' and sort it, then plot 'two' once per key1 group.
by_one = df.set_index('one').sort_index()
by_one.groupby('key1')['two'].plot(style='--o', legend=True)
Note that as of 0.20.3, sorting the index is necessary, and the legend is a bit wonky.
seaborn has a wrapper function scatterplot that does it more efficiently.
# The grouping column goes in hue=; the original repeated data= and had a stray ']'.
sns.scatterplot(data=df, x='one', y='two', hue='key1')