Scatter Pie Plot Python Pandas - python

"Scatter Pie Plot" ( a scatter plot using pie charts instead of dots). I require this as I have to represent 3 dimensions of data.
1: x axis (0-6)
2: y axis (0-6)
3: Category lets say (A,B,C - H)
If two x and y values are the same I want a pie chart to be in that position representing that Category.
Similar to the graph seen in this link:
https://matplotlib.org/gallery/lines_bars_and_markers/scatter_piecharts.html#sphx-glr-gallery-lines-bars-and-markers-scatter-piecharts-py
or this image from Tableu:
[![enter image description here][1]][1]
As I am limited to only use python I have been struggling to manipulate the code to work for me.
Could anyone help me with this problem? I would very grateful!
Example data:
XVAL YVAL GROUP
1.3 4.5 A
1.3 4.5 B
4 2 E
4 6 A
2 4 A
2 4 B
1 1 G
1 1 C
1 2 B
1 2 D
3.99 4.56 G
The final output should have 6 pie charts on the X & Y with 1 containing 3 groups and 2 containing 3 groups.
My attempt:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
def draw_pie(dist,
xpos,
ypos,
size,
ax=None):
if ax is None:
fig, ax = plt.subplots(figsize=(10,8))
# for incremental pie slices
cumsum = np.cumsum(dist)
cumsum = cumsum/ cumsum[-1]
pie = [0] + cumsum.tolist()
for r1, r2 in zip(pie[:-1], pie[1:]):
angles = np.linspace(2 * np.pi * r1, 2 * np.pi * r2)
x = [0] + np.cos(angles).tolist()
y = [0] + np.sin(angles).tolist()
xy = np.column_stack([x, y])
ax.scatter([xpos], [ypos], marker=xy, s=size)
return ax
fig, ax = plt.subplots(figsize=(40,40))
draw_pie([Group],'xval','yval',10000,ax=ax)
draw_pie([Group], 'xval', 'yval', 20000, ax=ax)
draw_pie([Group], 'xval', 'yval', 30000, ax=ax)
plt.show()

I'm not sure how to get 6 pie charts. If we group on XVAL and YVAL, there are 7 unique pairs. You can do something down this line:
fig, ax = plt.subplots(figsize=(40,40))
for (x,y), d in df.groupby(['XVAL','YVAL']):
dist = d['GROUP'].value_counts()
draw_pie(dist, x, y, 10000*len(d), ax=ax)
plt.show()
Output:

Related

ValueError: cannot reshape array - contour plot python

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
data = np.genfromtxt('file1.txt',delimiter=' ')
lats = data[:,0]
## lon => x
lons = data[:,1]
## values => z
values = data[:,2]
###
lat_uniq = list(set(lats.tolist()))
nlats = len(lat_uniq)
print(nlats)
print(lat_uniq)
lon_uniq = list(set(lons.tolist()))
print(lon_uniq)
nlons = len(lon_uniq)
print(nlons)
print (lats.shape, nlats, nlons)
yre = lats.reshape(nlats,nlons)
xre = lons.reshape(nlats,nlons)
zre = values.reshape(nlats,nlons)
#### later in the defined map
fig,ax=plt.subplots(1,1)
cp = ax.contourf(xre, yre, zre)
fig.colorbar(cp)
plt.savefig('f1.pdf')
file1.txt
1 2 3
4 5 6
7 8 9
10 11 12
..
First column - x values,
Second - y values,
third - z values
I'm using the code to make a contour plot in python, but getting the following error:
Traceback (most recent call last):
File "./yut.py", line 21, in
yre = lats.reshape(nlats,nlons)
ValueError: cannot reshape array of size 4 into shape (4,4)
Could you please help to fix this error? Thanks in advance!
Matplotlib expects a contour plot to receive data in a specific format. Your approach does not provide the data in this format; you have to transform your data like this:
import numpy as np
import matplotlib.pyplot as plt
#from matplotlib.colors import LogNorm
data = np.genfromtxt('test.txt', delimiter=' ')
#print(data)
lats = data[:,0]
## lon => x
lons = data[:,1]
## values => z
values = data[:,2]
###
#get unique lat lon values and their index positions
lat_uniq, lat_idx = np.unique(lats, return_inverse=True)
lon_uniq, lon_idx = np.unique(lons, return_inverse=True)
#create 2D array necessary for the contour plot
xre, yre = np.meshgrid(lon_uniq, lat_uniq)
zre = np.full(xre.shape, np.nan)
#or if you know the standard value of the array, fill it with that
#zre = np.full(xre.shape, 0)
zre[lat_idx, lon_idx] = values
print(zre)
#you can fill in missing data with interpolation
from scipy.interpolate import griddata
zre_interpolated = griddata((lons, lats), values, (xre, yre), method = "linear")
print(zre_interpolated)
#### later in the defined map
fig, (ax1, ax2) = plt.subplots(1,2, figsize = (10, 5))
cp1 = ax1.contourf(xre, yre, zre, levels=4)
plt.colorbar(cp1, ax=ax1)
ax1.set_title("data are not interpolated")
cp2 = ax2.contourf(xre, yre, zre_interpolated, levels=4)
plt.colorbar(cp2, ax=ax2)
ax2.set_title("interpolated data")
plt.show()
Example output:
The example output was generated using the following data in the txt file:
1 1 1
1 2 2
2 4 9
4 5 2
6 1 1
6 2 8
6 4 9
6 5 2
2 5 3
4 2 5
4 3 8
4 4 5
1 3 4
1 5 2
2 1 1
2 3 4

generate a 3d plot from data contained in a three columns file

I have a three columns data file structured in this way (example) :
X Y Z
0 0 0.2
0 1 0.3
0 2 0.1
1 0 0.2
1 1 0.3
1 2 0.9
2 0 0.6
2 1 0.8
2 2 0.99
I don't know how this kind of file is called ... but I did not find any example to plot this using 3d wireframe or 3d surface plot ...
EDIT But there is a way to produce a wireframe or surface plot with data structured in this way?
In order to create a surface plot, you first need to transform your x, y and z data into 2d arrays. Then you can plot it easily:
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
# read out data from txt file
data = np.genfromtxt("data.txt")[1:]
x_data, y_data, z_data = data[:, 0], data[:, 1], data[:, 2]
# initialize a figure for the 3d plot
fig = plt.figure()
ax = Axes3D(fig)
# create matrix for z values
dim = int(np.sqrt(len(x_data)))
z = z_data.reshape((dim, dim))
# create matrix for the x and y points
x, y = np.arange(0, dim, 1), np.arange(0, dim, 1)
x, y = np.meshgrid(x, y)
# plot
ax.plot_surface(x, y, z, alpha=0.75)
plt.show()

Convert X and Y arrays into a frequencies grid

I would like to convert two arrays (x and y) into a frequency n x n matrix (n = 5), indicating each cell the number of point that contains. It consists on resampling both variables into five intervals and count the existing number of points per cell.
I have tried using pandas pivot_table but don't know the way of referencing to each axis coordinate.
X and Y arrays are two dependent variables that contain values between 0 and 100.
I would really appreciate some one's aid.
Thank you very much in advance.
This is an example of the code:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Arrays example. They are always float type and ranging 0-100. (n_size array = 15)
x = 100 * np.random.random(15)
y = 100 * np.random.random(15)
# Df created for trying to pivot and counting values per cell
df = pd.DataFrame({'X':x,'Y':y})
# Plot the example data:
df.plot(x = 'X',y = 'Y', style = 'o')
This is what I have:
This is the objetive matrix, saved as a df:
If you do not explicitly need to use pandas (which you don't, if it's just about a frequency matrix), consider using numpy.histogram2d:
# Sample data
x = 100*np.random.random(15)
y = 100*np.random.random(15)
Construct your bins (since your x and y bins are the same, one set is enough)
bins = np.linspace(0, 100, 5+1)
# bins = array([ 0., 20., 40., 60., 80., 100.])
Now use the histogram function:
binned, binx, biny = np.histogram2d(x, y, bins = [bins, bins])
# To get the result you desire, transpose
objmat = binned.T
Note: x-values are binned along the first dimension(axis 0), which visually means 'vertical'. Hence the transpose.
Plotting:
fig, ax = plt.subplots()
ax.grid()
ax.set_xlim(0, 100)
ax.set_ylim(0, 100)
ax.scatter(x, y)
for i in range(objmat.shape[0]):
for j in range(objmat.shape[1]):
c = int(objmat[::-1][j,i])
ax.text((bins[i]+bins[i+1])/2, (bins[j]+bins[j+1])/2, str(c), fontdict={'fontsize' : 16, 'ha' : 'center', 'va' : 'center'})
Result:
You could use GroupBy.size
matching group axes to the center of each grid.
Then you can use Axes.text to draw them
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
np.random.seed(20)
max_val = 100
n = 5
len_group = max_val // 5
x = max_val * np.random.random(15)
y = max_val * np.random.random(15)
# Df created for trying to pivot and counting values per cell
df = pd.DataFrame({'X':x,'Y':y})
x_groups = df['X'] // len_group * len_group + len_group / 2
y_groups = df['Y'] // len_group * len_group + len_group / 2
fig, ax= plt.subplots(figsize=(13, 6))
ax.set_ylim(0, max_val)
ax.set_xlim(0, max_val)
df.plot(x = 'X',y = 'Y', style = 'o', ax=ax)
for i, val in df.groupby([x_groups, y_groups]).size().items():
ax.text(*i, val,fontdict={'fontsize' : 20, 'ha' : 'center', 'va':'center'})
plt.grid()
You can just create bins with pd.cut and then groupby the bins and unstack along the X variable and you have a matrix of frequency counts.
df['Xc'] = pd.cut(df['X'], range(0, 101, 20))
df['Yc'] = pd.cut(df['Y'], range(0, 101, 20))
mat = df.groupby(['Xc', 'Yc']).size().unstack('Xc')
mat
Xc (0, 20] (20, 40] (40, 60] (60, 80] (80, 100]
Yc
(0, 20] 0 1 1 0 0
(20, 40] 4 0 1 2 0
(40, 60] 0 0 0 0 0
(60, 80] 3 0 1 0 0
(80, 100] 1 0 1 0 0
There is no elegant solution to the plotting part of the problem. But here's what you can do.
# Calculate the counts
counts = df.groupby([df.X.astype(int) // 20,
df.Y.astype(int) // 20]).size().astype(str)
# Restore the original scales
counts.index = pd.MultiIndex.from_tuples([(x * 20 + 10,
y * 20 + 10)
for x,y in counts.index.to_list()],
names=counts.index.names)
fig = plt.figure()
ax = fig.add_subplot(111)
# Plot the text labels
[ax.text(*xy, txt) for (xy, txt) in counts.items()]
# Update the axes extents
ax.axis([0, counts.index.levels[0].max() + 10,
0, counts.index.levels[1].max() + 10])
plt.show()
import pandas as pd
import numpy as np
import seaborn as sns
sns.set_style("whitegrid")
# Arrays example. They are always float type and ranging 0-100. (n_size array = 15)
x = 100 * np.random.random(15)
y = 100 * np.random.random(15)
# Df created for trying to pivot and counting values per cell
df = pd.DataFrame({'X':x,'Y':y})
ir = pd.interval_range(start=0, freq=20, end=100, closed='left')
df['xbin'] = pd.cut(df['X'], bins=ir)
df['ybin'] = pd.cut(df['Y'], bins=ir)
df['xbin'] = df['xbin'].apply(lambda x: x.mid)
df['ybin'] = df['ybin'].apply(lambda x: x.mid)
fig, ax= plt.subplots()
ax.set_ylim(0, 100)
ax.set_xlim(0, 100)
for i, val in df.groupby(['xbin', 'ybin']).size().items():
if val!=0:
ax.text(*i, val,fontdict={'fontsize' : 20, 'ha' : 'center', 'va' : 'center'})
One option is to call np.add.at on ravel of frequency matrix
x = 100 * np.random.random(15)
y = 100 * np.random.random(15)
n = 5
points = (np.array([x, y]) / 20).astype(int)
z = np.zeros((n, n), dtype=int)
np.add.at(z.ravel(),
np.ravel_multi_index(points, z.shape),
np.ones(points.shape[1]))
Sample run:
print(points)
print(z)
[[0 0 0 2 4 1 2 1 1 0 1 1 3 0 0]
[0 0 1 4 0 4 1 0 1 3 3 1 0 0 3]]
[[3 1 0 2 0]
[1 2 0 1 1]
[0 1 0 0 1]
[1 0 0 0 0]
[1 0 0 0 0]]

Python: How to plot counts of values grouped by two columns?

I have some data grouped by two columns, with a count column:
Category Subcategory Count
0 1 1 10
1 1 2 15
2 1 3 16
3 2 1 2
4 2 2 8
5 2 3 12
6 3 1 33
7 3 3 23
8 4 2 3
9 5 1 2
I would like to plot a clustered column chart based on the above data.
Not all categories contain all subcategories, so for these the plot should show 0. I would like to show values as counts of subcategory within a category, as percentage of the category.
Here is an example chart that Has 2 categories and multiple subcategories as separate clusters. I would like to achieve a similar result.
https://imge.to/i/AVUiY
Additional question: is it possible to get a break in the scale at the Y axis, so that the outlier columns (high values) become smaller, and the small values become more visible?
I hard coded a few things just to get right to the plotting, so first you will want to create what I have called "cat1"-"cat5" from your columns of data.
import numpy as np
import matplotlib.pyplot as plt
# data to plot
n_groups = 3 #number of subcategories
cat1 = (10,15,16)
cat2 = (2,8,12)
cat3 = (33,0,23)
cat4 = (0,3,0)
cat5 = (2,0,0)
# create plot
fig, ax = plt.subplots()
index = np.arange(n_groups)
bar_width = 0.1
rects1 = plt.bar(index, cat1, bar_width, label='1')
rects2 = plt.bar(index + bar_width, cat2, bar_width, label='2')
rects3 = plt.bar(index + 2*bar_width, cat3, bar_width, label='3')
rects4 = plt.bar(index + 3*bar_width, cat4, bar_width, label='4')
rects5 = plt.bar(index + 4*bar_width, cat5, bar_width, label='5')
plt.xlabel('Subcategory')
plt.ylabel('Count')
plt.title('Count by Category')
plt.xticks(index + bar_width, ('1', '2', '3'))
plt.legend()
plt.tight_layout()
plt.show()
To answer your second question check out the brokenaxes package: https://github.com/bendichter/brokenaxes

Python 3D plot for multiple dataframes

Assuming that I have three Python pandas DataFrames:
df_sale = pd.DataFrame([[20,30,10], [30,20,20], [20,40,40]], columns=list("ABC"))
A B C
0 20 30 10
1 30 20 20
2 20 40 40
df_people = pd.DataFrame([[2,3,1], [3,2,2], [2,4,4]], columns=list("ABC"))
A B C
0 2 3 1
1 3 2 2
2 2 4 4
df_department = pd.DataFrame([[1,2,1], [1,1,2], [2,1,1]], columns=list("ABC"))
A B C
0 1 2 1
1 1 1 2
2 2 1 1
How do I plot a 3D bar chart with all these 3 dataframes in the same place?
I want the X axis to be ['A', 'B', 'C'], Y axis to be the name of dataframes ['df_sale', 'df_people', 'df_department'], and Z axis to show the numbers.
You could use matplotlib's 3D bars.
import pandas as pd
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
columns = ['A', 'B', 'C']
df_names = ['sale', 'people', 'department']
df = [pd.DataFrame([[20,30,10], [30,20,20], [20,40,40]], columns=columns), pd.DataFrame([[2,3,1], [3,2,2], [2,4,4]], columns=columns), pd.DataFrame([[1,2,1], [1,1,2], [2,1,1]], columns=columns)]
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
#make sure x and y axis get the right tick labels
plt.xticks([i for i in range(len(columns))], columns)
plt.yticks([i for i in range(len(df_names))], df_names)
#define a list for x positions
xs = list()
for i in range(len(df)):
for j in range(len(columns)):
xs.append(i + j * 0.1)
for c1, c in enumerate(['r', 'g', 'b']):
ys = list()
for i in range(len(columns)):
ys.extend(df[c1].ix[:,i:i+1].unstack().tolist())
cs = [c] * len(xs)
ax.bar(xs, ys, zs=c1, zdir='y', color=cs, alpha=0.5, width=0.1)
plt.show()
Multicolors and legend
import matplotlib
colors = ['r', 'g', 'b', 'c', 'm', 'y', '#eeefff', '#feefff', '#aeefff']
for c1 in range(3):
ys = list()
for i in range(len(columns)):
ys.extend(df[c1].ix[:,i:i+1].unstack().tolist())
ax.bar(xs, ys, zs=c1, zdir='y', color=colors, alpha=0.5, width=0.1)
legend = list()
for i, c in enumerate(colors):
legend.append(matplotlib.patches.Patch(color=c, label='value {0} of column {1}'.format(i % 3, columns[i // 3])))
plt.legend(handles=legend, loc=4, bbox_to_anchor=(.9, 0), mode="expand")
plt.show()

Categories