How can I annotate a seaborn pointplot with the values of col1 "A" "B" or "C" as a text, next to the points where they are drawn.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Data
a = list("ABC") * 4
c = list("DE") * 6
score = np.random.randint(-5, 5, 12)
df = pd.DataFrame({"col1": a, "col2": c, "score": score})
print(df)
col1 col2 value
0 A D 3
1 B E 1
2 C D -3
3 A E -5
4 B D -4
5 C E -5
6 A D 2
7 B E -4
8 C D 4
9 A E 1
10 B D 3
11 C E -2
sns.pointplot(data=df, x="col2", y="value", hue='col1');
Desired outcome is with the labels A, B and C:
plt.text(x, y, 'a text') places a text in a plot. The main problem is to find the exact positions and colors. To get a consistent order, it helps to explicitly make the columns categorical. The categorical numbering (0, 1, 2, ...) is the same as matplotlib uses internally for its categorical axes.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
a = list("ABC") * 4
c = list("DE") * 6
score = np.random.randint(-5, 5, 12)
df = pd.DataFrame({"col1": a, "col2": c, "score": score})
df['col1'] = pd.Categorical(df['col1'])
df['col2'] = pd.Categorical(df['col2'])
palette = sns.color_palette("tab10")
ax = sns.pointplot(data=df, x="col2", y="score", hue='col1', palette=palette)
pos_after_last_x = len(df['col2'].cat.categories) - 1 + 0.05
last_x_cat = df['col2'].cat.categories[-1]
for cat, color in zip(df['col1'].cat.categories, palette):
mean_score = df[(df['col1'] == cat) & (df['col2'] == last_x_cat)]['score'].mean()
ax.text(pos_after_last_x, mean_score, cat, color=color)
plt.show()
Related
I am trying to change the color of each individual bar in my figure here. The code that I used it down below. Instead of each bar changing to the color that I have set in c, there are several colors within each bar. I have included a screenshot of this. How can I fix this? Thank you all in advance!
Clusters is just a categorical variable of 5 groups, ranging from 0 to 4. I have included a second screenshot of the dataframe.
So essentially, what I am trying to do is to plot each cluster for economic ideology and social ideology so I can have a visual comparison of the 5 different clusters over these two dimensions (economic and social ideology). Each cluster should be represented by one color. For example, cluster 0 should be red in color.
c = ['#bf1111', '#1c4975', '#278f36', '#47167a', '#de8314']
plt.subplot(1, 2, 1)
plt.bar(data = ANESdf_LatNEW, height = "EconIdeo",
x = "clusters", color = c)
plt.title('Economic Ideology')
plt.xticks([0, 1, 2, 3, 4])
plt.xlabel('Clusters')
plt.ylabel('')
plt.subplot(1, 2, 2)
plt.bar(data = ANESdf_LatNEW, height = "SocialIdeo",
x = "clusters", color = c)
plt.title('Social Ideology')
plt.xticks([0, 1, 2, 3, 4])
plt.xlabel('Clusters')
plt.ylabel('')
plt.show()
Bar graph here
Top 5 rows of dataframe
I have tried multiple ways of changing colors. For example, instead of having c, I had put in the colors directly at color = ... This did not work either.
Here is a script that does what you seem to be looking for based on your edits and comment.
Note that I do not assume that all clusters have the same size in this context; if that is the case, this approach can be simplified.
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
# sample dataframe
df = pd.DataFrame(
{
'EconIdeo':[1,2,3,4,3,5,7],
'Clusters':[2,3,0,1,3,0,3]
})
print(df)
# parameters: width for each cluster, colors for each cluster
# (if clusters are not sequential from zero, replace c with dictionary)
width = .75
c = ['#bf1111', '#1c4975', '#278f36', '#47167a', '#de8314']
df['xpos'] = df['Clusters']
df['width'] = width
df['color'] = ''
clusters = df['Clusters'].unique()
for k in clusters:
where = (df['Clusters'] == k)
n = where.sum()
df.loc[where,'xpos'] += np.linspace(-width/2,width/2,2*n+1)[1:-1:2]
df.loc[where,'width'] /=n
df.loc[where,'color'] = c[k]
plt.bar(data = df, height = "EconIdeo", x = 'xpos',
width = 'width', color = 'color')
plt.xticks(clusters,clusters)
plt.show()
Resulting plot:
Input dataframe:
EconIdeo Clusters
0 1 2
1 2 3
2 3 0
3 4 1
4 3 3
5 5 0
6 7 3
Dataframe after script applies changes (to include plotting specifications)
EconIdeo Clusters xpos width color
0 1 2 2.0000 0.750 #278f36
1 2 3 2.7500 0.250 #47167a
2 3 0 -0.1875 0.375 #bf1111
3 4 1 1.0000 0.750 #1c4975
4 3 3 3.0000 0.250 #47167a
5 5 0 0.1875 0.375 #bf1111
6 7 3 3.2500 0.250 #47167a
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
data = np.genfromtxt('file1.txt',delimiter=' ')
lats = data[:,0]
## lon => x
lons = data[:,1]
## values => z
values = data[:,2]
###
lat_uniq = list(set(lats.tolist()))
nlats = len(lat_uniq)
print(nlats)
print(lat_uniq)
lon_uniq = list(set(lons.tolist()))
print(lon_uniq)
nlons = len(lon_uniq)
print(nlons)
print (lats.shape, nlats, nlons)
yre = lats.reshape(nlats,nlons)
xre = lons.reshape(nlats,nlons)
zre = values.reshape(nlats,nlons)
#### later in the defined map
fig,ax=plt.subplots(1,1)
cp = ax.contourf(xre, yre, zre)
fig.colorbar(cp)
plt.savefig('f1.pdf')
file1.txt
1 2 3
4 5 6
7 8 9
10 11 12
..
First column - x values,
Second - y values,
third - z values
I'm using the code to make a contour plot in python, but getting the following error:
Traceback (most recent call last):
File "./yut.py", line 21, in
yre = lats.reshape(nlats,nlons)
ValueError: cannot reshape array of size 4 into shape (4,4)
Could you please help to fix this error? Thanks in advance!
Matplotlib expects a contour plot to receive data in a specific format. Your approach does not provide the data in this format; you have to transform your data like this:
import numpy as np
import matplotlib.pyplot as plt
#from matplotlib.colors import LogNorm
data = np.genfromtxt('test.txt', delimiter=' ')
#print(data)
lats = data[:,0]
## lon => x
lons = data[:,1]
## values => z
values = data[:,2]
###
#get unique lat lon values and their index positions
lat_uniq, lat_idx = np.unique(lats, return_inverse=True)
lon_uniq, lon_idx = np.unique(lons, return_inverse=True)
#create 2D array necessary for the contour plot
xre, yre = np.meshgrid(lon_uniq, lat_uniq)
zre = np.full(xre.shape, np.nan)
#or if you know the standard value of the array, fill it with that
#zre = np.full(xre.shape, 0)
zre[lat_idx, lon_idx] = values
print(zre)
#you can fill in missing data with interpolation
from scipy.interpolate import griddata
zre_interpolated = griddata((lons, lats), values, (xre, yre), method = "linear")
print(zre_interpolated)
#### later in the defined map
fig, (ax1, ax2) = plt.subplots(1,2, figsize = (10, 5))
cp1 = ax1.contourf(xre, yre, zre, levels=4)
plt.colorbar(cp1, ax=ax1)
ax1.set_title("data are not interpolated")
cp2 = ax2.contourf(xre, yre, zre_interpolated, levels=4)
plt.colorbar(cp2, ax=ax2)
ax2.set_title("interpolated data")
plt.show()
Example output:
The example output was generated using the following data in the txt file:
1 1 1
1 2 2
2 4 9
4 5 2
6 1 1
6 2 8
6 4 9
6 5 2
2 5 3
4 2 5
4 3 8
4 4 5
1 3 4
1 5 2
2 1 1
2 3 4
I have three-column data in a file named "sample1.dat" and a code that reads the columns and tries to plot the 3rd column against the 2nd column. I pick up parameter values from the 1st column elements as long as their values remain the same.
"sample1.dat" reads
0 1 1
0 2 4
0 3 9
0 4 16
0 5 25
0 6 36
1 1 1
1 2 8
1 3 27
1 4 64
1 5 125
1 6 216
2 1 1
2 2 16
2 3 81
2 4 256
2 5 625
2 6 1296
And my code:
import matplotlib.pyplot as plt
import numpy as np
data = np.loadtxt('sample1.dat')
x = data[:,0]
y = data[:,1]
z = data[:,2]
L = len(data)
col = ['r','g','b']
x0 = x[0]; j=0; jold=-1
for i in range(L):
print('j, col[j]=',j, col[j])
if x[i] == x0:
print('y[i], z[i]=',y[i],z[i])
if i==0 or j != jold: # j-index decides new or the same paramet
label = 'parameter = {}'.format(x0)
else:
label = ''
print('label =',label)
plt.plot(y[i], z[i], color=col[j], marker='o', label=label)
else:
x0 = x[i] # Update when x-value changes,
# i.e. pick up the next parameter value
i -= 1 # Shift back else we miss the 1st point for new x-value
j += 1; jold = j
plt.legend()
plt.xlabel('2nd column')
plt.ylabel('3rd column')
plt.savefig('sample1.png')
plt.show()
The plot outcome:
One can clearly see that two issues persist:
The legends appear only for the first parameter though I tried to avoid the repitition in my code.
The default linestyle is not appearing though the legends show line plus marker plots.
How could I resolve these or is there a smarter way of coding to fulfill the same purpose.
The first issue is due to some strange logic involving j,jold and x0. The code can be simplified by drawing all y,z for each x-value at once. Numpy allows selecting the y's corresponding to a given x0 as y[x==x0s].
The second issue can be solved by explicitly setting the desired linestyle, i.e. ls=''.
import matplotlib.pyplot as plt
import numpy as np
data = np.loadtxt('sample1.dat')
x = data[:, 0]
y = data[:, 1]
z = data[:, 2]
colors = ['r', 'g', 'b']
for x0, color in zip(np.unique(x), colors):
plt.plot(y[x == x0], z[x == x0], color=color, marker='o', ls='', label=f'parameter = {x0:.0f}')
plt.legend()
plt.xlabel('2nd column')
plt.ylabel('3rd column')
plt.show()
An alternative approach would use the seaborn library, which does the selecting and coloring without a lot of intervention, for example:
import seaborn as sns
sns.scatterplot(x=y, y=z, hue=x, palette=['r', 'g', 'b'])
Seaborn can automatically add labels if the data is organized as a dictionary or a pandas dataframe:
data = {'first column': x.astype(int),
'second column': y,
'third column': z}
sns.scatterplot(data=data, x='second column', y='third column', hue='first column', palette=['r', 'g', 'b'])
You can get the result you want in a few lines by using pandas and seaborn.
If you add column names (for instance A, B, and C) to the data in the sample1.dat file as follow:
A B C
0 1 1
0 2 4
0 3 9
0 4 16
0 5 25
0 6 36
1 1 1
1 2 8
1 3 27
1 4 64
1 5 125
1 6 216
2 1 1
2 2 16
2 3 81
2 4 256
2 5 625
2 6 1296
You can then load your data in a pandas dataframe and plot it with seaborn:
import pandas as pd
import seaborn as sns
df=pd.read_fwf('sample1.dat')
col = ['r','g','b']
sns.scatterplot(data=df,x='B',y='C',hue='A',palette=col)
And the output gives:
I use the following code to produce random data and plot the distribution of probability densities. How can I do the same with my own data as shown below?
Code
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
mean, cov = [0, 1], [(1, .5), (.5, 1)]
data = np.random.multivariate_normal(mean, cov, 200)
df = pd.DataFrame(data, columns=["X", "Y"])
x, y = np.random.multivariate_normal(mean, cov, 1000).T
g = sns.jointplot(x=x, y=y, data=df, kind="kde", n_levels=75, color="m")
g.plot_joint(plt.scatter, c="black", s=30, linewidth=1, marker="+")
g.ax_joint.collections[0].set_alpha(0)
g.set_axis_labels("X", "Y");
My own data sample
X Y
0 1 8
1 7 8
2 7 9
3 5 8
4 7 7
5 9 9
6 1 3
4 6 8
5 9 7
6 9 6
7 8 2
8 1 9
9 0 10
10 22 2
11 4 45
12 9 8
I tried this but I am getting strange values
import numpy as np
mean = np.mean(data1['X'], axis=0)
cov = np.cov(data1['Y'], rowvar=0)
X = multivariate_normal.pdf(data1['X'], mean=2.5, cov=0.5)
mean = np.mean(data1['X'], axis=0)
cov = np.cov(data1['Y'], rowvar=0)
Y = multivariate_normal.pdf(data1['Y'], mean=2.5, cov=0.5)
df = np.concatenate( (X.reshape(-1,1),Y.reshape(-1,1)) , axis=1)
df = pd.DataFrame(df)
df = df.rename({0: 'X', 1: 'Y'}, axis=1)
g = sns.jointplot(x=X, y=Y, data=df, kind="kde", n_levels=75, color="r")
g.plot_joint(plt.scatter, c="black", s=30, linewidth=1, marker="+")
g.ax_joint.collections[0].set_alpha(0)
g.set_axis_labels("X", "Y");
This solution worked.
import numpy as np
mean = np.mean(data1['X'], axis=0)
cov = np.cov(data1['Y'], rowvar=0)
X = multivariate_normal.pdf(data1['X'], mean=2.5, cov=0.5)
mean = np.mean(data1['X'], axis=0)
cov = np.cov(data1['Y'], rowvar=0)
Y = multivariate_normal.pdf(data1['Y'], mean=2.5, cov=0.5)
df = np.concatenate( (X.reshape(-1,1),Y.reshape(-1,1)) , axis=1)
df = pd.DataFrame(df)
df = df.rename({0: 'X', 1: 'Y'}, axis=1)
g = sns.jointplot(x=X, y=Y, data=df, kind="kde", n_levels=75, color="r")
g.plot_joint(plt.scatter, c="black", s=30, linewidth=1, marker="+")
g.ax_joint.collections[0].set_alpha(0)
g.set_axis_labels("X", "Y");
Assuming that I have three Python pandas DataFrames:
df_sale = pd.DataFrame([[20,30,10], [30,20,20], [20,40,40]], columns=list("ABC"))
A B C
0 20 30 10
1 30 20 20
2 20 40 40
df_people = pd.DataFrame([[2,3,1], [3,2,2], [2,4,4]], columns=list("ABC"))
A B C
0 2 3 1
1 3 2 2
2 2 4 4
df_department = pd.DataFrame([[1,2,1], [1,1,2], [2,1,1]], columns=list("ABC"))
A B C
0 1 2 1
1 1 1 2
2 2 1 1
How do I plot a 3D bar chart with all these 3 dataframes in the same place?
I want the X axis to be ['A', 'B', 'C'], Y axis to be the name of dataframes ['df_sale', 'df_people', 'df_department'], and Z axis to show the numbers.
You could use matplotlib's 3D bars.
import pandas as pd
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
columns = ['A', 'B', 'C']
df_names = ['sale', 'people', 'department']
df = [pd.DataFrame([[20,30,10], [30,20,20], [20,40,40]], columns=columns), pd.DataFrame([[2,3,1], [3,2,2], [2,4,4]], columns=columns), pd.DataFrame([[1,2,1], [1,1,2], [2,1,1]], columns=columns)]
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
#make sure x and y axis get the right tick labels
plt.xticks([i for i in range(len(columns))], columns)
plt.yticks([i for i in range(len(df_names))], df_names)
#define a list for x positions
xs = list()
for i in range(len(df)):
for j in range(len(columns)):
xs.append(i + j * 0.1)
for c1, c in enumerate(['r', 'g', 'b']):
ys = list()
for i in range(len(columns)):
ys.extend(df[c1].ix[:,i:i+1].unstack().tolist())
cs = [c] * len(xs)
ax.bar(xs, ys, zs=c1, zdir='y', color=cs, alpha=0.5, width=0.1)
plt.show()
Multicolors and legend
import matplotlib
colors = ['r', 'g', 'b', 'c', 'm', 'y', '#eeefff', '#feefff', '#aeefff']
for c1 in range(3):
ys = list()
for i in range(len(columns)):
ys.extend(df[c1].ix[:,i:i+1].unstack().tolist())
ax.bar(xs, ys, zs=c1, zdir='y', color=colors, alpha=0.5, width=0.1)
legend = list()
for i, c in enumerate(colors):
legend.append(matplotlib.patches.Patch(color=c, label='value {0} of column {1}'.format(i % 3, columns[i // 3])))
plt.legend(handles=legend, loc=4, bbox_to_anchor=(.9, 0), mode="expand")
plt.show()