I am trying to make a pie chart that looks like the below -
I am using geopandas for that-
us_states = gpd.read_file("conus_state.shp")
data = gpd.read_file("data_file.shp")
fig, ax = plt.subplots(figsize= (10,10))
us_states.plot(color = "None", ax = ax)
data.plot(column = ["Column1","Column2"], ax= ax, kind = "pie",subplots=True)
This gives me the following error-
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
C:\Users\LSRATH~1.STU\AppData\Local\Temp/ipykernel_17992/1047905594.py in <module>
1 fig, ax = plt.subplots(figsize= (10,10))
2 us_states.plot(color = "None", ax = ax)
----> 3 diff_env.plot(column = ["WS_MON1","WS_MON2"], ax= ax, kind = "pie")
c:\python38\lib\site-packages\geopandas\plotting.py in __call__(self, *args, **kwargs)
951 if kind in self._pandas_kinds:
952 # Access pandas plots
--> 953 return PlotAccessor(data)(kind=kind, **kwargs)
954 else:
955 # raise error
c:\python38\lib\site-packages\pandas\plotting\_core.py in __call__(self, *args, **kwargs)
921 if isinstance(data, ABCDataFrame):
922 if y is None and kwargs.get("subplots") is False:
--> 923 raise ValueError(
924 f"{kind} requires either y column or 'subplots=True'"
925 )
ValueError: pie requires either y column or 'subplots=True'
Even after specifying, subplots = True, it does not work.
How can I make a pie chart using 2 columns of the dataframe?
Below are the first five rows of the relevant columns-
diff_env[["Column1", "Column2", "geometry"]].head().to_dict()
{'Column1': {0: 2, 1: 0, 2: 0, 3: 1, 4: 12},
'Column2': {0: 2, 1: 0, 2: 0, 3: 1, 4: 12},
'geometry': {0: <shapely.geometry.point.Point at 0x2c94e07f190>,
1: <shapely.geometry.point.Point at 0x2c94e07f130>,
2: <shapely.geometry.point.Point at 0x2c94e07f0d0>,
3: <shapely.geometry.point.Point at 0x2c94bb86d30>,
4: <shapely.geometry.point.Point at 0x2c94e07f310>}}
You have not provided any usable sample data, so I have randomly generated some.
This is inspired by "How to plot scatter pie chart using matplotlib".
sample data
|   | value0 | value1 | geometry | size |
|---|--------|--------|----------|------|
| 0 | 5 | 3 | POINT (-105.96116535117056 31.014979334448164) | 312 |
| 1 | 2 | 3 | POINT (-79.70609244147155 36.46222924414716) | 439 |
| 2 | 4 | 7 | POINT (-68.89518006688962 37.84436728093645) | 363 |
| 3 | 7 | 9 | POINT (-118.12344177257525 31.909303946488293) | 303 |
| 4 | 2 | 7 | POINT (-102.1001252173913 28.57591221070234) | 326 |
| 5 | 3 | 3 | POINT (-96.88772103678929 47.76324025083612) | 522 |
| 6 | 5 | 8 | POINT (-112.33188157190635 48.16975143812709) | 487 |
| 7 | 7 | 6 | POINT (-95.15025297658862 44.59245298996656) | 594 |
| 8 | 3 | 1 | POINT (-100.36265715719063 46.787613401337794) | 421 |
| 9 | 2 | 4 | POINT (-81.82966451505015 35.161393444816056) | 401 |
full code
import geopandas as gpd
import numpy as np
import shapely
import matplotlib.pyplot as plt
# Base-map geometry: read the Natural Earth admin-1 (states/provinces) GeoJSON,
# drop the row whose ISO 3166-2 code is "US-AK", and keep only the geometry.
states = (
gpd.read_file(
"https://raw.githubusercontent.com/nvkelso/natural-earth-vector/master/geojson/ne_110m_admin_1_states_provinces.geojson"
)
.loc[lambda d: d["iso_3166_2"].ne("US-AK"), "geometry"]
# NOTE(review): .exterior presumably turns polygons into their outer rings so
# that the later .plot() draws outlines only -- confirm for multi-part shapes.
.exterior
)
# geodataframe of points where pies are to be plotted
# n random points; each coordinate is picked from 300 evenly spaced values
# spanning the corresponding extent of `states` (total_bounds indices 0/2 for
# x and 1/3 for y).
n = 10
pies = gpd.GeoDataFrame(
geometry=[
shapely.geometry.Point(xy)
for xy in zip(
np.random.choice(np.linspace(*states.total_bounds[[0, 2]], 300), n),
np.random.choice(np.linspace(*states.total_bounds[[1, 3]], 300), n),
)
],
# two value columns, value0 and value1, with random integers from 1 to 9
data={f"value{c}": np.random.randint(1, 10, n) for c in range(2)},
crs=states.crs,
# extra "size" column (random integers from 300 to 599) used as marker area
).pipe(lambda d: d.assign(size=np.random.randint(300, 600, n)))
# utility function inspired by https://stackoverflow.com/questions/56337732/how-to-plot-scatter-pie-chart-using-matplotlib
def draw_pie(dist, xpos, ypos, size, ax, colors=("blue", "red", "yellow")):
    """Draw one pie chart at (xpos, ypos) as a stack of wedge-shaped markers.

    Each entry of *dist* becomes one wedge whose angle is proportional to its
    share of ``sum(dist)``. Every wedge is drawn with ``ax.scatter`` using a
    custom polygon marker, so the pie keeps its size in points regardless of
    the data limits of the axes.

    Parameters
    ----------
    dist : sequence of numbers
        Slice values. They are normalised internally.
    xpos, ypos : float
        Data coordinates of the pie centre.
    size : float
        Marker area (the ``s`` argument of ``scatter``) of the full pie.
    ax : object with a matplotlib-compatible ``scatter`` method
        Axes to draw on.
    colors : sequence of colours, optional
        Slice colours; reused cyclically when there are more slices than
        colours.

    Returns
    -------
    The *ax* that was passed in, so calls can be chained.
    """
    totals = np.cumsum(np.asarray(dist, dtype=float))
    # Nothing to draw for an empty distribution or one that sums to zero
    # (or NaN); normalising would otherwise divide by zero.
    if totals.size == 0 or not totals[-1] > 0:
        return ax

    # Cumulative fractions 0 .. 1 marking where each slice starts and ends.
    bounds = [0.0] + (totals / totals[-1]).tolist()
    for i, (start, stop) in enumerate(zip(bounds[:-1], bounds[1:])):
        if stop <= start:
            continue  # zero-width slice: no visible wedge
        angles = np.linspace(2 * np.pi * start, 2 * np.pi * stop)
        # Wedge polygon: the centre followed by points along the arc.
        x = [0] + np.cos(angles).tolist()
        y = [0] + np.sin(angles).tolist()
        xy = np.column_stack([x, y])
        # Matplotlib rescales custom markers to their largest |coordinate|, so
        # a wedge that never reaches +-1 would be drawn too large. Scaling the
        # area by that extent squared keeps all wedges on the same radius.
        extent = np.abs(xy).max()
        ax.scatter(
            [xpos],
            [ypos],
            marker=xy,
            s=size * extent ** 2,
            color=colors[i % len(colors)],
            alpha=1,
        )
    return ax
fig, ax = plt.subplots()
# Base map first, so the pies are drawn on top of the outlines.
ax = states.plot(ax=ax, edgecolor="black", linewidth=0.5)
# One pie per point: the two value columns are the slices, the point's
# coordinates give the centre and the "size" column gives the marker area.
# r["size"] is indexed by key because r.size is the Series' own attribute.
for _, r in pies.iterrows():
    ax = draw_pie([r.value0, r.value1], r.geometry.x, r.geometry.y, r["size"], ax)
output
Related
I wanted to create an accuracy rate of each letter in a bar graph using matplotlib.
Example Dataset
data = {'Actual Letter': ['U', 'A', 'X', 'P', 'C', 'R', 'C', 'U', 'J', 'D'], 'Predicted Letter': ['U', 'A', 'X', 'P', 'C', 'R', 'C', 'U', 'J', 'D']}
df = pd.DataFrame(data, index=[10113, 19164, 12798, 12034, 17719, 17886, 4624, 6047, 15608, 11815])
Actual Letter Predicted Letter
10113 U U
19164 A A
12798 X X
12034 P P
17719 C C
17886 R R
4624 C C
6047 U U
15608 J J
11815 D D
df.plot(kind='bar')
Error
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-14-a5f21be4f14b> in <module>
3 df = pd.DataFrame(data, index=[10113, 19164, 12798, 12034, 17719, 17886, 4624, 6047, 15608, 11815])
4
----> 5 df.plot(kind='bar')
e:\Anaconda3\lib\site-packages\pandas\plotting\_core.py in __call__(self, *args, **kwargs)
970 data.columns = label_name
971
--> 972 return plot_backend.plot(data, kind=kind, **kwargs)
973
974 __call__.__doc__ = __doc__
e:\Anaconda3\lib\site-packages\pandas\plotting\_matplotlib\__init__.py in plot(data, kind, **kwargs)
69 kwargs["ax"] = getattr(ax, "left_ax", ax)
70 plot_obj = PLOT_CLASSES[kind](data, **kwargs)
---> 71 plot_obj.generate()
72 plot_obj.draw()
73 return plot_obj.result
e:\Anaconda3\lib\site-packages\pandas\plotting\_matplotlib\core.py in generate(self)
284 def generate(self):
285 self._args_adjust()
--> 286 self._compute_plot_data()
287 self._setup_subplots()
288 self._make_plot()
e:\Anaconda3\lib\site-packages\pandas\plotting\_matplotlib\core.py in _compute_plot_data(self)
451 # no non-numeric frames or series allowed
452 if is_empty:
--> 453 raise TypeError("no numeric data to plot")
454
455 self.data = numeric_data.apply(self._convert_to_ndarray)
TypeError: no numeric data to plot
I wanted a bar graph that would be like this. However I don't know how to do it.
Imports and Sample DataFrame
import pandas as pd
import numpy as np # for sample data only
import string # for sample data only
# Build a reproducible test frame of random "actual" and "predicted" letters.
np.random.seed(365)  # fixed seed so every run yields the same frame
rows = 1100
letters = list(string.ascii_uppercase)
# Draw the 'Actual' column first, then 'Predicted', each uniformly at random.
actual = np.random.choice(letters, size=rows)
predicted = np.random.choice(letters, size=rows)
data = {'Actual': actual, 'Predicted': predicted}
df = pd.DataFrame(data)
Calculations and Plotting
Updated
The following implementation is more succinct; unnecessary steps have been removed.
Create a Boolean 'Match' column depending on if there is a match between 'Predicted' and 'Actual'
.groupby on 'Actual', aggregate .mean(), multiply by 100, and round, to get the percent.
The group for each letter will sum the Booleans and divide by the count. For 'A', the sum is 1, because there is 1 True, which is divided by the total count of the group, 33. Therefore, 1/33 = 0.030303030303030304
Plot the bar for the selected data with pandas.DataFrame.plot
Note that step (1) and (2) can be reduced and combined to the following:
dfa = df.Predicted.eq(df.Actual).groupby(df.Actual).mean().mul(100).round(2)
# determine where Predicted equals Actual
df['Match'] = df.Predicted.eq(df.Actual)
# display(df.head())
Actual Predicted Match
0 S Z False
1 U J False
2 B L False
3 M V False
4 F C False
# groupby and get percent
dfa = df.groupby('Actual').Match.mean().mul(100).round(2)
# display(dfa.head())
Actual
A 3.03
B 2.63
C 4.44
D 6.82
E 5.77
Name: Match, dtype: float64
# plot
# dfa is a Series indexed by 'Actual': the index supplies the x positions and
# the values supply the bar heights, so no x=/y= column names are needed
# (Series.plot ignores them, and there is no '%' column here anyway).
ax = dfa.plot(kind='bar', rot=0, grid=True, figsize=(8, 5),
              ylabel='Percent %', xlabel='Letter', title='Accuracy Rate % per letter')
Original Code
This works as well
# determine where Predicted equals Actual and convert to an int; True = 1 and False = 0
df['Match'] = df.Predicted.eq(df.Actual).astype(int)
# get the normalized value counts
dfg = df.groupby('Actual').Match.value_counts(normalize=True).mul(100).round(2).reset_index(name='%')
# get the accuracy scores where there is a Match
df_accuracy = dfg[dfg.Match.eq(1)]
# display(df_accuracy.head())
Actual Match %
1 A 1 3.03
3 B 1 2.63
5 C 1 4.44
7 D 1 6.82
9 E 1 5.77
# plot
ax = df_accuracy.plot(kind='bar', x='Actual', y='%', rot=0, legend=False, grid=True, figsize=(8, 5),
ylabel='Percent %', xlabel='Letter', title='Accuracy Rate % per letter')
I have simulated data that matches what you describe.
The graph is exceptionally simple if you calculate the percentages first.
import numpy as np
import pandas as pd
# simulate some data...
# 200 random actual letters; each prediction is then drawn from the actual
# letter (weight 0.95) plus the full alphabet (weight 0.05/26 per letter).
_ALPHABET = list("ABCDEFGHIJKLMNOPQRSTUVWXYZ")
_WEIGHTS = tuple([0.95] + [0.05 / 26] * 26)


def _predict(l):
    """Return a simulated prediction for the actual letter *l*."""
    return np.random.choice([l] + _ALPHABET, 1, p=_WEIGHTS)[0]


df = pd.DataFrame({"Actual Letter": np.random.choice(_ALPHABET, 200)})
df["Predicted Letter"] = df["Actual Letter"].apply(_predict)
# now just calc percentage of where actual and predicted are the same
# The mean of a boolean Series is the fraction of True values, so comparing
# the two columns and averaging per actual letter gives matches / group size.
# Grouping the comparison result (rather than calling DataFrameGroupBy.apply
# with a function that reads the grouping column, which pandas 2.2 deprecates)
# keeps this working on current pandas.
# graph it...
(
    df["Actual Letter"]
    .eq(df["Predicted Letter"])
    .groupby(df["Actual Letter"])
    .mean()
    .plot(kind="bar")
)
I got a 'No numeric types to aggregate' error when I run the following code.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
x_labels = ['Female', 'Male']
genderincomeTM1 = round(TM1.groupby('Gender')['Income'].mean())
genderincomeTM2 = round(TM2.groupby('Gender')['Income'].mean())
genderincomeTM3 = round(TM3.groupby('Gender')['Income'].mean())
genderTM1 = genderincomeTM1.index
genderTM2 = genderincomeTM2.index
genderTM3 = genderincomeTM3.index
x = np.arange(len(x_labels))
plt.figure(figsize=(12,8))
width = 0.35
fig, ax = plt.subplots()
bar1 = ax.bar(x - 0.3, genderincomeTM1, width=0.2, label='TM1')
bar2 = ax.bar(x, genderincomeTM2, width=0.2, label='TM2')
bar3 = ax.bar(x + 0.3, genderincomeTM3, width=0.2, label='TM3')
ax.set_title('Average Income by Product Model', fontsize = 18)
ax.set_ylabel('Sales', fontsize = 12)
ax.set_xticks(x)
ax.set_xticklabels(x_labels)
ax.set_ylim(bottom = 0, top = 90000)
ax.legend(loc=(1.02,0.4), borderaxespad=0, fontsize = 12)
def autolabel(bars):
    """Write each bar's height as a text label centred just above the bar.

    *bars* is an iterable of bar artists exposing ``get_height``, ``get_x``
    and ``get_width`` (for example the container returned by ``ax.bar``).
    The labels are added to the module-level ``ax``.
    """
    for each in bars:
        height = each.get_height()
        ax.annotate(
            '{}'.format(height),
            # anchor at the horizontal centre of the bar, at its top edge
            xy=(each.get_x() + each.get_width() / 2, height),
            xytext=(0, 2),  # 2 points vertical offset
            textcoords="offset points",
            ha='center',
            va='bottom',
        )
autolabel(bar1)
autolabel(bar2)
autolabel(bar3)
DataError Traceback (most recent call last)
<ipython-input-24-fb1aa4ae1242> in <module>
1 x_labels = ['Female', 'Male']
2
----> 3 genderincomeTM1 = round(TM1.groupby('Gender')['Income'].mean())
4 genderincomeTM2 = round(TM2.groupby('Gender')['Income'].mean())
5 genderincomeTM3 = round(TM3.groupby('Gender')['Income'].mean())
~\anaconda3\lib\site-packages\pandas\core\groupby\groupby.py in mean(self, *args, **kwargs)
1223 """
1224 nv.validate_groupby_func("mean", args, kwargs, ["numeric_only"])
-> 1225 return self._cython_agg_general(
1226 "mean", alt=lambda x, axis: Series(x).mean(**kwargs), **kwargs
1227 )
~\anaconda3\lib\site-packages\pandas\core\groupby\groupby.py in _cython_agg_general(self, how, alt, numeric_only, min_count)
906
907 if len(output) == 0:
--> 908 raise DataError("No numeric types to aggregate")
909
910 return self._wrap_aggregated_output(output)
DataError: No numeric types to aggregate
I have removed all empty rows and checked on 'Income' column using is_numeric_dtype. I also converted the column to int.
from pandas.api.types import is_numeric_dtype
is_numeric_dtype(df['Income'])
>True
df['Income'] = df['Income'].astype(int)
df.info()
><class 'pandas.core.frame.DataFrame'>
>Int64Index: 180 entries, 0 to 182
>Data columns (total 10 columns):
> # Column Non-Null Count Dtype
>--- ------ -------------- -----
> 0 Product 180 non-null object
> 1 Branch 180 non-null object
> 2 Age 180 non-null object
> 3 Gender 180 non-null object
> 4 Education 180 non-null object
> 5 MaritalStatus 180 non-null object
> 6 Usage 180 non-null object
> 7 Fitness 180 non-null object
> 8 Income 180 non-null int32
I am confused why there is no numeric type for Income after validation. Could it be referring to 'Gender'? How should I go about resolving the error?
I probably can't answer your question, because you haven't provided a dataframe that reproduces the error. Maybe you could start by running this code:
import numpy as np
import pandas as pd
# Incomes deliberately start out as strings, giving an object-dtype column.
records = {
    'Gender': ['M', 'F', 'M', 'F'],
    'Income': ['10', '20', '30', '40'],
}
TM1 = pd.DataFrame(records)
# Cast to int so the per-gender mean has numeric data to aggregate.
TM1['Income'] = TM1['Income'].astype(int)
TM1.groupby('Gender')['Income'].mean().round()
Because the incomes are initially given as strings, the mean can't be computed until they're converted to integers.
I am trying to combine box plots with a scatter plot for an algorithm scoring visualization. My data is divided as following:
oX - information about the time period (1 year, 2 years, etc.)
oY - information about the score
2 algorithms for each period with different simulation results (plotted as boxplots)
2 heuristics with a single value (plotted as a point)
I'm trying to easily compare method efficiency for each period of time.
Small sample data:
1 year 2 years
A1 A2 H1 H2 A1 A2 H1 H2
124 168 155 167 130 130 150 164
102 155 100 172
103 153 117 145
102 132 145 143
145 170 133 179
136 125 115 153
116 150 136 131
146 192 106 148
124 122 127 158
128 123 149 200
141 158 137 156
I'm trying to get something that looks like this:
So far I've cleared up my data to have the observations for each algorithm (RS, EA) and for each period (52, 104, 156 etc.) separately like so but I can't figure out how to group them per period while drawing 2 different boxplots for the same X tick. I assume once I'd sort out the boxplot dataframe and plot, I can just plot the scatter on top.
Managed to solve this meanwhile, in case it helps anyone else out:
# Box plots: one call per algorithm column of `meta`, all keyed on 'Time'.
ax1 = sns.boxplot(data=meta, x='Time', y='PRS', color='#880BDD', linewidth=0.8)
ax1 = sns.boxplot(data=meta, x='Time', y='EA', color='#0BC9DD', linewidth=0.8)
ax1 = sns.boxplot(data=meta, x='Time', y='ERS', color='#9BD19D', linewidth=0.8)
# Heuristics have a single value per period, so they are drawn as unconnected
# point markers (join=False) from `simple`.
ax1 = sns.pointplot(data=simple, x='Time', y='Greedy Average', color='#FFC48C', markers='s', join=False)
ax1 = sns.pointplot(data=simple, x='Time', y='Greedy Total', color='#FF9F80', markers='o', join=False)
ax1 = sns.pointplot(data=simple, x='Time', y='Greedy Weeks', color='#F56991', markers='*', join=False)
ax1.set(xlabel="Planning Horizon (weeks)")
ax1.set(ylabel="Hypervolume")
# Hand-built legend entries: colour patches for the box plots and marker
# lines for the heuristics, using the same colours and markers as above.
EA = mpatches.Patch(color='#0BC9DD', label='EA')
PRS = mpatches.Patch(color='#880BDD', label='PRS')
ERS = mpatches.Patch(color='#9BD19D', label='ERS')
GA = mlines.Line2D([], [], color='#FFC48C', marker='s', label='Greedy Average')
GT = mlines.Line2D([], [], color='#FF9F80', label='Greedy Total', marker='o')
GW = mlines.Line2D([], [], color='#F56991', label='Greedy Weeks', marker='*')
# 'bottom left' is not a legend location matplotlib recognises; the
# bottom-left corner is spelled 'lower left'.
plt.legend(handles=[EA, ERS, PRS, GA, GT, GW], loc='lower left', title="Algorithm")
ax1.set_title("Algorithm Comparison")
Results in this:
I have data (from a space delimited text file with two columns) which is already binned but only a width of 1. I want to increase this width to about 5. How can I do this using numpy/matplotlib in Python?
Using,
data = loadtxt('file.txt')
x = data[:, 0]
y = data[:, 1]
plt.bar(x,y)
creates too many bars and using,
plt.hist(data)
doesn't plot the histogram appropriately. I guess I don't understand how matplotlib's histogram plotting works.
See some of the data below.
264 1
265 1
266 4
267 2
268 2
269 2
270 2
271 2
272 5
273 3
274 2
275 6
276 7
277 3
278 7
279 5
280 9
281 4
282 8
283 11
284 9
285 15
286 19
287 11
288 12
289 10
290 13
291 18
292 20
293 14
294 15
What if you use numpy.reshape to transform your data before using plt.bar, for example:
In [83]: import numpy as np
In [84]: import matplotlib.pyplot as plt
In [85]: data = np.array([[1,2,3,4,5,6], [4,3,8,9,1,2]]).T
In [86]: data
Out[86]:
array([[1, 4],
[2, 3],
[3, 8],
[4, 9],
[5, 1],
[6, 2]])
In [87]: y = data[:,1].reshape(-1,2).sum(axis=1)
In [89]: y
Out[89]: array([ 7, 17, 3])
In [91]: x = data[:,0].reshape(-1,2).mean(axis=1)
In [92]: x
Out[92]: array([ 1.5, 3.5, 5.5])
In [96]: plt.bar(x, y)
Out[96]: <Container object of 3 artists>
In [97]: plt.show()
I am not an expert at matplotlib but I find hist to be incredibly useful. The examples on the matplotlib site give a great overview of some of the features.
I don't know how to use your provided sample data without transforming it. I altered your example to dequantize those data before creating a histogram.
I calculated the bin size using this question's first answer.
import matplotlib.pyplot as plt
import numpy as np
data = np.loadtxt('file.txt')
dequantized = data[:,0].repeat(data[:,1].astype(int))
dequantized[0:7]
# Each row's first column is repeated the number of times found in the
# second column creating a single array.
# array([ 264., 265., 266., 266., 266., 266., 267.])
def bins(xmin, xmax, binwidth, padding):
    """Return evenly spaced histogram bin edges covering ``[xmin, xmax]``.

    The first edge is *xmin* rounded down to a multiple of *binwidth*, minus
    *padding*. Edges then advance in steps of *binwidth* and stop before
    ``xmax + binwidth + padding``. The result is a NumPy array whose dtype
    follows the inputs (integer inputs give integers, float inputs floats).

    Raises
    ------
    ValueError
        If *binwidth* is not positive.
    """
    if binwidth <= 0:
        raise ValueError("binwidth must be positive")
    return np.arange(
        xmin - (xmin % binwidth) - padding,
        xmax + binwidth + padding,
        binwidth,
    )
histbins = bins(min(dequantized), max(dequantized), 5, 5)
plt.figure(1)
plt.hist(dequantized, histbins)
plt.show()
This histogram displayed looks like this.
I hope this example is useful.
I am trying to plot some data from a big file.
The data has the following form:
0.025876 139 0
0.030881 140 0
0.030982 141 0
0.035602 142 0
0.035521 143 0
0.038479 144 0
0.040668 145 0
0.040121 146 0
0.037953 147 0
0.039027 148 0
0.038338 149 0
0.047557 139 1
0.045105 140 1
0.044943 141 1
0.042370 142 1
0.042025 143 1
0.038946 144 1
0.037953 145 1
0.033373 146 1
0.030070 147 1
0.029118 148 1
0.025552 149 1
In principle, each line corresponds to a three dimensional point and I would "simply" like to plot a 3d surface generated from these points akin to what I could do with the splot function in gnuplot for those of you that know about it.
Searching the net for an answer to my problem, I tried the following with the matplotlib contour function:
#!/usr/bin/python
from numpy import *
import pylab as p
import sys
import mpl_toolkits.mplot3d.axes3d as p3
s = str(sys.argv[1])
f = open(s)
z,y,x = loadtxt(f, unpack = True)
f.close
#x = [1,2,3]
#y = [1,2,3]
#z = [1,8,16]
data = zip(x,y,z)
#map data on the plane
X, Y = meshgrid(arange(0, 89, 1), arange(0, 300, 1))
Z = zeros((len(X),len(Y)),'Float32')
for x_,y_,z_ in data:
Z[x_, y_] = z_ #this should work, but only because x and y are integers
#and arange was done with a step of 1, starting from 0
fig=p.figure()
ax = p3.Axes3D(fig)
ax.contourf(X,Y,Z)
ax.set_xlabel('X')
ax.set_ylabel('Y')
ax.set_zlabel('Z')
p.show()
This piece of code actually worked fine with the vectors x, y and z that are commented out with a hash sign in the above code.
But now that I am trying it with the data given above, I get an "Inputs x and y must be 1D or 2D" error in matplotlib.
I have read that this could be related to the fact that Z does not have the same shape as X or Y...but I am not sure how to deal with this problem.
By the way, as you probably realized, I am a super newbie in Python and I apologize if the code appears very ugly to some of you.
In any case, any help will be very much welcome.
Thanks !
Fabien
Using scipy.interpolate.griddata:
import io
import sys
import numpy as np
import matplotlib.pyplot as plt
import mpl_toolkits.mplot3d.axes3d as axes3d
import scipy.interpolate as interpolate
# Sample data embedded as text: three whitespace-separated columns per row.
content = '''0.025876 139 0
0.030881 140 0
0.030982 141 0
0.035602 142 0
0.035521 143 0
0.038479 144 0
0.040668 145 0
0.040121 146 0
0.037953 147 0
0.039027 148 0
0.038338 149 0
0.047557 139 1
0.045105 140 1
0.044943 141 1
0.042370 142 1
0.042025 143 1
0.038946 144 1
0.037953 145 1
0.033373 146 1
0.030070 147 1
0.029118 148 1
0.025552 149 1'''
# `content` is a str, so it is wrapped in a text stream; io.BytesIO would
# reject a str on Python 3. dtype=None lets genfromtxt infer a type per
# column, and `names` labels the columns in file order.
# NOTE(review): the first column is named 'x' here, whereas the question loads
# the same columns as z, y, x -- confirm which axis assignment is wanted.
data = np.genfromtxt(io.StringIO(content), dtype=None, names='x, y, z')
# Or, to read from a file:
# data = np.genfromtxt(filename, dtype=None, names='x, y, z')
x, y, z = data['x'], data['y'], data['z']
# Resample the scattered (x, y, z) samples onto a regular N x N grid spanning
# the data's x and y ranges, taking the value of the nearest sample.
N = 20
xi = np.linspace(x.min(), x.max(), N)
yi = np.linspace(y.min(), y.max(), N)
grid = np.meshgrid(xi, yi)
X, Y = grid
samples = (x, y)
Z = interpolate.griddata(samples, z, (X, Y), method='nearest')

# Show the raw samples together with the gridded values as a wireframe.
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1, projection='3d')
ax.scatter(x, y, z)
ax.plot_wireframe(X, Y, Z, rstride=1, cstride=1)
# Alternative rendering: ax.plot_surface(X, Y, Z)
plt.show()
yields
Relevant links:
scipy.interpolate.griddata
np.genfromtxt
Axes3D.plot_wireframe
Axes3D.plot_surface