matplotlib plot not showing empty vals at ends - python

I need to show the empty slots at the ends of the plot. Code to show what I mean:
# Five data points keyed by a subset of the letters.
a = pd.DataFrame([ 1,5,3,2,7 ], index=['b','e','g','h','d'])
# Empty frame whose index lists every slot that should appear on the x axis.
i = pd.DataFrame(index=['a','b','c','d','e','f','g','h','i','j','k','l'])
# Column-wise concat aligns on the index, leaving NaN where `a` has no value.
c = pd.concat([i, a], axis=1)
# lw=0 with marker='o' draws markers only, no connecting line.
plt.plot(c, lw=0, marker='o')
plt.show()
The content of c is
0
a NaN
b 1.0
c NaN
d 7.0
e 5.0
f NaN
g 3.0
h 2.0
i NaN
j NaN
k NaN
l NaN
This shows a chart (can't upload, not enough points, sorry) that has X axis labels b, c, d, e, f, g, h; c and f have no associated points, just as I want.
I have tried plt.xticks, ax.set_xlabels
How can I get the labels for a, i, j, k, l to show?

import pandas as pd
import matplotlib.pyplot as plt
# Observed values, keyed by the slot they belong to.
a = pd.DataFrame([1, 5, 3, 2, 7], index=list('beghd'))
# Every slot that must appear on the x axis, whether or not it has data.
i = pd.DataFrame(index=list('abcdefghijkl'))
# Align the data on the full index; slots without data become NaN.
c = pd.concat([i, a], axis=1)

fig, ax = plt.subplots()
# Pass the index explicitly as x, then pin a tick on every label so the
# empty slots at both ends are labelled too.
ax.plot(c.index, c, linewidth=0, marker='o')
ax.set_xticks(c.index)
plt.show()

Related

Split column into multiple columns with unique values in pandas

I have the following dataframe:
Col
0 A,B,C
1 B,A,D
2 C
3 A,D,E,F
4 B,C,F
# Sample input: each row of 'Col' holds a comma-separated list of labels.
df = pd.DataFrame({'Col': ['A,B,C', 'B,A,D', 'C', 'A,D,E,F', 'B,C,F']})
which needs to be turned into:
A B C D E F
0 A B C
1 A B D
2 C
3 A D E F
4 B C F
You could use str.get_dummies to get the dummy variables, then multiply with the columns:
# One 0/1 indicator column per distinct label found after splitting on ','.
tmp = df['Col'].str.get_dummies(sep=',')
# Multiply each indicator by its own column label: set cells end up showing
# the label and unset cells come out blank (see Output below).
out = tmp * tmp.columns
One-liner as suggested by @piRSquared:
# Same idea as a single chain: `[*d]` unpacks the frame's column labels into a list.
out = df.Col.str.get_dummies(',').pipe(lambda d: d*[*d])
Output:
A B C D E F
0 A B C
1 A B D
2 C
3 A D E F
4 B C F
Benchmark:
On data created by duplicating the data in the OP:
@piRSquared's first method, which uses NumPy, is the fastest solution here.
On randomly generated DataFrames of increasing sizes:
Code to reproduce the plot:
import perfplot
import pandas as pd
import numpy as np
def enke(df):
    """Spread the comma-separated labels in ``df['Col']`` into one column per label.

    Builds 0/1 indicator columns with ``str.get_dummies`` and multiplies
    them by the column labels.
    """
    indicators = df['Col'].str.get_dummies(sep=',')
    labels = indicators.columns
    return indicators * labels
def mozway(df):
    """Expand ``df['Col']`` into one column per label using ``pd.concat``.

    Every row becomes a Series whose values and index are both its split
    labels.  Concatenating those side by side aligns them on the labels,
    and the transpose restores one row per input row.  Gaps are filled
    with ''.
    """
    per_row = []
    for text in df['Col']:
        parts = text.split(',')
        per_row.append(pd.Series(parts, index=parts))
    wide = pd.concat(per_row, axis=1)
    return wide.T.fillna('')
def piRSquared(df):
    """Spread the comma-separated labels in ``df.Col`` into one column per label.

    Uses pandas string methods to flatten the labels and NumPy fancy
    indexing to drop each label into its (row, column) cell.

    Returns a DataFrame indexed like ``df`` whose columns are the sorted
    unique labels; a cell holds its column label when the row contains
    that label and '' otherwise.
    """
    n = df.shape[0]
    # Row number of every label once all rows are flattened into one list.
    i = np.repeat(np.arange(n), df.Col.str.count(',') + 1)
    # `c`: sorted unique labels; `j`: column number of every flattened label.
    c, j = np.unique(df.Col.str.cat(sep=',').split(','), return_inverse=True)
    m = c.shape[0]
    # Allocate with the labels' own dtype: a bare '' gives a 1-character
    # string array, which silently truncates longer labels on assignment.
    a = np.full((n, m), '', dtype=c.dtype)
    a[i, j] = c[j]
    return pd.DataFrame(a, df.index, c)
def piRSquared2(df):
    """Spread the comma-separated labels in ``df.Col`` into one column per label.

    NumPy-only variant: the column is converted to a string array first,
    and comma counting and flattening are done without pandas string methods.

    Returns a DataFrame indexed like ``df`` whose columns are the sorted
    unique labels; a cell holds its column label when the row contains
    that label and '' otherwise.
    """
    n = df.shape[0]
    base = df.Col.to_numpy().astype(str)
    commas = np.char.count(base, ',')
    # Join every row into one string and split again: a flat list of labels.
    sepped = ','.join(base).split(',')
    # Row number of every flattened label (a row has commas + 1 labels).
    i = np.repeat(np.arange(n), commas + 1)
    # `c`: sorted unique labels; `j`: column number of every flattened label.
    c, j = np.unique(sepped, return_inverse=True)
    m = c.shape[0]
    # Allocate with the labels' own dtype: a bare '' gives a 1-character
    # string array, which silently truncates longer labels on assignment.
    a = np.full((n, m), '', dtype=c.dtype)
    a[i, j] = c[j]
    return pd.DataFrame(a, df.index, c)
def constructor1(n):
    """Return the question's five-row sample frame stacked ``n`` times.

    The result is renumbered with a fresh 0-based index so the copies do
    not share index labels.
    """
    sample = pd.DataFrame({'Col': ['A,B,C', 'B,A,D', 'C', 'A,D,E,F', 'B,C,F']})
    copies = [sample for _ in range(n)]
    return pd.concat(copies, ignore_index=True)
def constructor2(n):
    """Build a frame of ``n`` random rows of comma-separated uppercase letters.

    Each row joins between 1 and 9 distinct letters (the upper bound of 10
    passed to ``randint`` is exclusive), drawn without replacement.

    Returns a DataFrame with a single column 'Col'.
    """
    # Imported here because this snippet's import block does not provide it.
    from string import ascii_uppercase

    uc = np.array([*ascii_uppercase])
    k = [','.join(np.random.choice(uc, x, replace=False))
         for x in np.random.randint(1, 10, size=n)]
    return pd.DataFrame({'Col': k})
# Implementations to benchmark against each other.
kernels = [enke, piRSquared, piRSquared2, mozway]
df = pd.DataFrame({'Col': ['A,B,C', 'B,A,D', 'C', 'A,D,E,F', 'B,C,F']})

# Input sizes are the powers of two 2**0 .. 2**14; constructor1 turns each
# one into a test frame, and the kernels' results are compared with
# DataFrame.equals.
perfplot.plot(
    setup=constructor1,
    kernels=kernels,
    labels=[kernel.__name__ for kernel in kernels],
    n_range=[1 << exponent for exponent in range(15)],
    xlabel='len(df)',
    logx=True,
    logy=True,
    relative_to=0,
    equality_check=pd.DataFrame.equals,
)
Using pandas.concat:
# Each row becomes a Series whose values and index are both its split labels;
# concatenating along axis=1 and transposing gives one row per input row.
pd.concat([pd.Series((idx:=x.split(',')), index=idx)
for x in df['Col']], axis=1).T
For python < 3.8:
# Walrus-free variant: iterating over the one-element list `[x.split(',')]`
# binds the split result to `val`, so it can serve as both values and index.
pd.concat([pd.Series(val, index=val)
for x in df['Col']
for val in [x.split(',')]], axis=1).T
Output:
A B C D E F
0 A B C NaN NaN NaN
1 A B NaN D NaN NaN
2 NaN NaN C NaN NaN NaN
3 A NaN NaN D E F
4 NaN B C NaN NaN F
NB. add fillna('') to have empty strings for missing values
A B C D E F
0 A B C
1 A B D
2 C
3 A D E F
4 B C F
This comes from my Project Overkill stash of tricks.
I'll use Numpy to identify where the labels are to be dropped in the 2-d array.
n = df.shape[0]                                # Get number of rows
base = df.Col.to_numpy().astype(str)           # Turn `'Col'` to Numpy array
commas = np.char.count(base, ',')              # Count commas in each row
sepped = ','.join(base).split(',')             # Flat array of each element
i = np.repeat(np.arange(n), commas+1)          # Row indices for flat array

# Note that I could've used `pd.factorize` here but I actually wanted
# a sorted array of labels so `np.unique` was the way to go.
# Otherwise I'd have used `j, c = pd.factorize(sepped)`
c, j = np.unique(sepped, return_inverse=True)  # `j` col indices for flat array
                                               # `c` will be the column labels
m = c.shape[0]                                 # Get number of unique labels
# Allocate with the labels' own dtype: a bare '' gives a 1-character string
# array, which silently truncates labels longer than one character.
a = np.full((n, m), '', dtype=c.dtype)         # Default array of empty strings
a[i, j] = c[j]                                 # Use row/col indices to insert
                                               # the column labels in right spots

pd.DataFrame(a, df.index, c)                   # Construct new dataframe
A B C D E F
0 A B C
1 A B D
2 C
3 A D E F
4 B C F
Time Testing
The Functions
import pandas as pd
import numpy as np
from string import ascii_uppercase
def pir(s):
    """Spread a Series of comma-separated labels into one column per label.

    NumPy-based: the Series is converted to a string array, flattened into
    a single list of labels, and each label is dropped into its
    (row, column) cell with fancy indexing.

    Returns a DataFrame indexed like ``s`` whose columns are the sorted
    unique labels; a cell holds its column label when the row contains
    that label and '' otherwise.
    """
    n = s.shape[0]
    base = s.to_numpy().astype(str)
    commas = np.char.count(base, ',')
    # Join every row into one string and split again: a flat list of labels.
    sepped = ','.join(base).split(',')
    # Row number of every flattened label (a row has commas + 1 labels).
    i = np.repeat(np.arange(n), commas + 1)
    # `c`: sorted unique labels; `j`: column number of every flattened label.
    c, j = np.unique(sepped, return_inverse=True)
    m = c.shape[0]
    # Allocate with the labels' own dtype: a bare '' gives a 1-character
    # string array, which silently truncates longer labels on assignment.
    a = np.full((n, m), '', dtype=c.dtype)
    a[i, j] = c[j]
    return pd.DataFrame(a, s.index, c)
def pir2(s):
    """Spread a Series of comma-separated labels into one column per label.

    Same approach as the NumPy version, but the flattening and comma
    counting go through the pandas ``.str`` accessor.

    Returns a DataFrame indexed like ``s`` whose columns are the sorted
    unique labels; a cell holds its column label when the row contains
    that label and '' otherwise.
    """
    n = s.shape[0]
    # One long comma-joined string, split back into a flat list of labels.
    sepped = s.str.cat(sep=',').split(',')
    commas = s.str.count(',')
    # Row number of every flattened label (a row has commas + 1 labels).
    i = np.repeat(np.arange(n), commas + 1)
    # `c`: sorted unique labels; `j`: column number of every flattened label.
    c, j = np.unique(sepped, return_inverse=True)
    m = c.shape[0]
    # Allocate with the labels' own dtype: a bare '' gives a 1-character
    # string array, which silently truncates longer labels on assignment.
    a = np.full((n, m), '', dtype=c.dtype)
    a[i, j] = c[j]
    return pd.DataFrame(a, s.index, c)
def mozway(s):
    """Expand a Series of comma-separated labels into one column per label.

    Each element is turned into a Series indexed by its own labels; the
    pieces are concatenated side by side, transposed back to one row per
    element, and missing cells are replaced with ''.
    """
    pieces = []
    for entry in s:
        labels = entry.split(',')
        pieces.append(pd.Series(labels, index=labels))
    stacked = pd.concat(pieces, axis=1)
    return stacked.T.fillna('')
def enke(s):
    """Expand a Series of comma-separated labels into one column per label.

    Builds 0/1 indicator columns with ``str.get_dummies`` and multiplies
    them by their own column labels.
    """
    indicators = s.str.get_dummies(',')
    return indicators * indicators.columns
The test data constructor
def constructor(n, m):
    """Build a Series of ``n`` random comma-separated uppercase-letter strings.

    ``m`` is capped at 26.  Each string joins between 1 and ``m - 1``
    distinct letters (``randint``'s upper bound is exclusive), drawn
    without replacement.
    """
    letters = np.array(list(ascii_uppercase))
    upper = min(26, m)
    sizes = np.random.randint(1, upper, size=n)
    rows = []
    for size in sizes:
        picked = np.random.choice(letters, size, replace=False)
        rows.append(','.join(picked))
    return pd.Series(rows)
The results dataframe
# Timing table: one row per implementation, one column per input size.
# Built without data, so every cell starts out as NaN.
res = pd.DataFrame(
    dtype=float,
    columns=[base * 10 ** exponent for exponent in range(1, 5) for base in (1, 3)],
    index=['enke', 'mozway', 'pir', 'pir2'],
)
Run the test
from timeit import timeit  # was missing: the loop below calls timeit(...)

from IPython.display import clear_output

# For every input size build one test Series, then time each implementation
# on it and store the result in the matching cell of `res`.
for j in res.columns:
    s = constructor(j, 10)
    for i in res.index:
        # The statement and setup are source strings; timeit runs them in a
        # fresh namespace, so `s` and the function are imported from __main__.
        stmt = f'{i}(s)'
        setp = f'from __main__ import s, {i}'
        res.at[i, j] = timeit(stmt, setp, number=50)
        # Show the table filled in so far, replacing the previous printout.
        print(res)
        clear_output(True)
Show the results
# Transpose so the input sizes become the rows, then plot on log-log axes.
res.T.plot(loglog=True)
# Divide every timing by the fastest one for its input size (1.0 = fastest).
res.div(res.min()).T
enke mozway pir pir2
10 8.634105 19.416376 1.000000 2.300573
30 7.626107 32.741218 1.000000 2.028423
100 5.071308 50.539772 1.000000 1.533791
300 3.475711 66.638151 1.000000 1.184982
1000 2.616885 79.032159 1.012205 1.000000
3000 2.518983 91.521389 1.094863 1.000000
10000 2.536735 98.172680 1.131758 1.000000
30000 2.603489 99.756007 1.149734 1.000000

Scatter Pie Plot Python Pandas

I need to create a "scatter pie plot" (a scatter plot that uses pie charts instead of dots), because I have to represent three dimensions of data:
1: x axis (0-6)
2: y axis (0-6)
3: Category lets say (A,B,C - H)
If two x and y values are the same I want a pie chart to be in that position representing that Category.
Similar to the graph seen in this link:
https://matplotlib.org/gallery/lines_bars_and_markers/scatter_piecharts.html#sphx-glr-gallery-lines-bars-and-markers-scatter-piecharts-py
or this image from Tableau:
[![enter image description here][1]][1]
As I am limited to only use python I have been struggling to manipulate the code to work for me.
Could anyone help me with this problem? I would be very grateful!
Example data:
XVAL YVAL GROUP
1.3 4.5 A
1.3 4.5 B
4 2 E
4 6 A
2 4 A
2 4 B
1 1 G
1 1 C
1 2 B
1 2 D
3.99 4.56 G
The final output should have 6 pie charts on the X & Y with 1 containing 3 groups and 2 containing 3 groups.
My attempt:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
def draw_pie(dist,
             xpos,
             ypos,
             size,
             ax=None):
    """Draw one pie-chart marker at (xpos, ypos) with one wedge per entry of dist.

    Parameters
    ----------
    dist : sequence of numbers or pandas Series
        Weight of each wedge; only the relative proportions matter.
    xpos, ypos : float
        Data coordinates of the pie's centre.
    size : float
        Passed to ``Axes.scatter`` as ``s`` for every wedge.
    ax : matplotlib Axes, optional
        Axes to draw on; a new 10x8 figure is created when omitted.

    Returns
    -------
    The Axes that was drawn on.
    """
    if ax is None:
        fig, ax = plt.subplots(figsize=(10, 8))

    # Work on a plain float array.  A Series (e.g. from value_counts) carries
    # the group labels as its index, so `cumsum[-1]` would be a label lookup
    # instead of "the last element".
    weights = np.asarray(dist, dtype=float)

    # for incremental pie slices
    cumsum = np.cumsum(weights)
    cumsum = cumsum / cumsum[-1]
    pie = [0] + cumsum.tolist()

    for r1, r2 in zip(pie[:-1], pie[1:]):
        # Wedge outline: the centre point followed by an arc from r1 to r2
        # (both expressed as fractions of a full turn).
        angles = np.linspace(2 * np.pi * r1, 2 * np.pi * r2)
        x = [0] + np.cos(angles).tolist()
        y = [0] + np.sin(angles).tolist()
        xy = np.column_stack([x, y])
        # Each wedge is drawn as a custom-shaped scatter marker.
        ax.scatter([xpos], [ypos], marker=xy, s=size)

    return ax
fig, ax = plt.subplots(figsize=(40,40))
# NOTE(review): `Group` is not defined anywhere in this snippet, and the
# position arguments are the literal strings 'xval'/'yval', not numbers.
draw_pie([Group],'xval','yval',10000,ax=ax)
draw_pie([Group], 'xval', 'yval', 20000, ax=ax)
draw_pie([Group], 'xval', 'yval', 30000, ax=ax)
plt.show()
I'm not sure how you get 6 pie charts: if we group on XVAL and YVAL, there are 7 unique pairs. You can do something along these lines:
fig, ax = plt.subplots(figsize=(40, 40))

# One pie per distinct (XVAL, YVAL) position; its wedges are the counts of
# each GROUP at that position, and its size grows with the number of rows.
for position, members in df.groupby(['XVAL', 'YVAL']):
    x, y = position
    shares = members['GROUP'].value_counts()
    marker_size = 10000 * len(members)
    draw_pie(shares, x, y, marker_size, ax=ax)

plt.show()
Output:

Annotate pointplot

How can I annotate a seaborn pointplot with the values of col1 ("A", "B" or "C") as text next to the points where they are drawn?
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Data: 12 rows cycling through three col1 labels and two col2 labels,
# each with a random integer score drawn from -5..4.
a = ["A", "B", "C"] * 4
c = ["D", "E"] * 6
score = np.random.randint(low=-5, high=5, size=12)
df = pd.DataFrame({"col1": a, "col2": c, "score": score})
print(df)
col1 col2 score
0 A D 3
1 B E 1
2 C D -3
3 A E -5
4 B D -4
5 C E -5
6 A D 2
7 B E -4
8 C D 4
9 A E 1
10 B D 3
11 C E -2
# y must name an existing column: the frame built above has "score", not "value".
sns.pointplot(data=df, x="col2", y="score", hue='col1');
Desired outcome is with the labels A, B and C:
plt.text(x, y, 'a text') places a text in a plot. The main problem is to find the exact positions and colors. To get a consistent order, it helps to explicitly make the columns categorical. The categorical numbering (0, 1, 2, ...) is the same as matplotlib uses internally for its categorical axes.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
a = list("ABC") * 4
c = list("DE") * 6
score = np.random.randint(-5, 5, 12)
df = pd.DataFrame({"col1": a, "col2": c, "score": score})

# Make both grouping columns categorical so their category order is explicit.
for column in ("col1", "col2"):
    df[column] = pd.Categorical(df[column])

palette = sns.color_palette("tab10")
ax = sns.pointplot(data=df, x="col2", y="score", hue="col1", palette=palette)

x_categories = df["col2"].cat.categories
hue_categories = df["col1"].cat.categories
last_x_cat = x_categories[-1]
# Categories are numbered 0, 1, 2, ... along the axis; place the labels
# just to the right of the last one.
pos_after_last_x = len(x_categories) - 1 + 0.05

# Label each hue line at its mean score in the last x category, using the
# palette colour at the same position as the hue category.
in_last_x = df["col2"] == last_x_cat
for cat, color in zip(hue_categories, palette):
    selected = df.loc[in_last_x & (df["col1"] == cat), "score"]
    mean_score = selected.mean()
    ax.text(pos_after_last_x, mean_score, cat, color=color)

plt.show()

Python 3D plot for multiple dataframes

Assuming that I have three Python pandas DataFrames:
# First of the three 3x3 frames to plot; columns are labelled A, B, C.
df_sale = pd.DataFrame([[20,30,10], [30,20,20], [20,40,40]], columns=list("ABC"))
A B C
0 20 30 10
1 30 20 20
2 20 40 40
# Second of the three 3x3 frames to plot; columns are labelled A, B, C.
df_people = pd.DataFrame([[2,3,1], [3,2,2], [2,4,4]], columns=list("ABC"))
A B C
0 2 3 1
1 3 2 2
2 2 4 4
# Third of the three 3x3 frames to plot; columns are labelled A, B, C.
df_department = pd.DataFrame([[1,2,1], [1,1,2], [2,1,1]], columns=list("ABC"))
A B C
0 1 2 1
1 1 1 2
2 2 1 1
How do I plot a 3D bar chart with all these 3 dataframes in the same place?
I want the X axis to be ['A', 'B', 'C'], Y axis to be the name of dataframes ['df_sale', 'df_people', 'df_department'], and Z axis to show the numbers.
You could use matplotlib's 3D bars.
import pandas as pd
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
columns = ['A', 'B', 'C']
df_names = ['sale', 'people', 'department']
df = [
    pd.DataFrame([[20, 30, 10], [30, 20, 20], [20, 40, 40]], columns=columns),
    pd.DataFrame([[2, 3, 1], [3, 2, 2], [2, 4, 4]], columns=columns),
    pd.DataFrame([[1, 2, 1], [1, 1, 2], [2, 1, 1]], columns=columns),
]

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

# make sure x and y axis get the right tick labels
plt.xticks(list(range(len(columns))), columns)
plt.yticks(list(range(len(df_names))), df_names)

# x positions: one cluster per column, with that column's rows placed
# side by side in steps of 0.1.  The count must match `ys` below
# (columns x rows), so it is sized by the number of rows per frame.
n_rows = len(df[0])
xs = [i + j * 0.1 for i in range(len(columns)) for j in range(n_rows)]

# One row of bars per frame, placed at y == frame number, one colour each.
for c1, c in enumerate(['r', 'g', 'b']):
    ys = list()
    for i in range(len(columns)):
        # Positional column access; `.ix` no longer exists in pandas.
        ys.extend(df[c1].iloc[:, i].tolist())
    cs = [c] * len(xs)
    ax.bar(xs, ys, zs=c1, zdir='y', color=cs, alpha=0.5, width=0.1)

plt.show()
Multicolors and legend
import matplotlib
# One colour per bar within a frame's row of bars (3 columns x 3 values).
colors = ['r', 'g', 'b', 'c', 'm', 'y', '#eeefff', '#feefff', '#aeefff']

for c1 in range(3):
    ys = list()
    for i in range(len(columns)):
        # Positional column access; `.ix` no longer exists in pandas.
        ys.extend(df[c1].iloc[:, i].tolist())
    ax.bar(xs, ys, zs=c1, zdir='y', color=colors, alpha=0.5, width=0.1)

# Legend entry i describes value (i % 3) of column (i // 3), matching the
# column-by-column order in which `ys` is filled.
legend = [
    matplotlib.patches.Patch(
        color=c,
        label='value {0} of column {1}'.format(i % 3, columns[i // 3]),
    )
    for i, c in enumerate(colors)
]
plt.legend(handles=legend, loc=4, bbox_to_anchor=(.9, 0), mode="expand")
plt.show()

Extrapolation with 'nearest' method in Python

I'm looking to find the Python equivalent of the following Matlab statement:
vq = interp1(x, y, xq, 'nearest', 'extrap')
It looks as if interp(xq, x, y) works perfectly for linear interpolation/extrapolation.
I also looked at
F = scipy.interpolate.interp1d(x, y, kind='nearest')
which works perfectly for the nearest method, but will not perform extrapolation.
Is there anything else I've overlooked? Thanks.
For linear interpolation that will extrapolate using nearest interpolation, use numpy.interp. It does this by default.
For example:
yi = np.interp(xi, x, y)
Otherwise, if you just want nearest interpolation everywhere, as you describe, you can do it in the short, but inefficient way: (you can make this a one-liner, if you want)
def nearest_interp(xi, x, y):
    """Return, for each query point in ``xi``, the ``y`` of the closest ``x``.

    Builds the full table of |x - xi| distances (one row per query point),
    which is short to write but memory-hungry.  ``xi`` is indexed as
    ``xi[:, None]`` and ``y`` is indexed with an integer array, so both are
    expected to be NumPy arrays.
    """
    query_column = xi[:, np.newaxis]
    distances = np.absolute(x - query_column)
    closest = np.argmin(distances, axis=1)
    return y[closest]
Or in a more efficient way using searchsorted:
def fast_nearest_interp(xi, x, y):
    """Nearest-neighbour lookup of ``xi`` in ``(x, y)`` via binary search.

    Assumes that ``x`` is monotonically increasing.
    """
    half_gaps = np.diff(x) / 2
    # Shift every x to the midpoint towards its right neighbour; the last
    # point has no right neighbour, so it reuses the final half-gap.
    boundaries = x + np.append(half_gaps, half_gaps[-1])
    # Repeat the last y so that queries beyond the last boundary still land
    # on a valid element.
    padded_y = np.append(y, y[-1])
    slots = np.searchsorted(boundaries, xi)
    return padded_y[slots]
To illustrate the difference between numpy.interp and the nearest interpolation examples above:
import numpy as np
import matplotlib.pyplot as plt
def main():
    """Plot ``numpy.interp`` against nearest interpolation on three sample points.

    Draws two stacked panels sharing both axes: the sample points with the
    ``np.interp`` curve on top, and the same points with the
    ``nearest_interp`` curve below.
    """
    x = np.array([0.1, 0.3, 1.9])
    y = np.array([4, -9, 1])
    xi = np.linspace(-1, 3, 200)

    fig, (top, bottom) = plt.subplots(nrows=2, sharex=True, sharey=True)

    # (axes, interpolated values, line colour, panel title)
    panels = (
        (top, np.interp(xi, x, y), 'blue', "Numpy's $interp$ function"),
        (bottom, nearest_interp(xi, x, y), 'green', 'Nearest Interpolation'),
    )
    # Titles are placed inside the axes, near the top-right corner.
    title_kwargs = dict(x=0.95, y=0.9, ha='right', va='top')

    for ax, yi, color, title in panels:
        ax.margins(0.05)
        ax.plot(x, y, 'ro')
        ax.plot(xi, yi, color=color)
        ax.set_title(title, **title_kwargs)

    plt.show()
def nearest_interp(xi, x, y):
    """Pick, for every point of ``xi``, the ``y`` value whose ``x`` is nearest.

    Works on the full table of absolute differences between each query
    point (rows) and each sample position (columns).
    """
    gaps = np.absolute(x - xi[:, np.newaxis])
    nearest = gaps.argmin(axis=1)
    return y[nearest]
main()
In later versions of SciPy (at least v0.19.1+), scipy.interpolate.interp1d has the option fill_value="extrapolate".
For example:
import pandas as pd
>>> s = pd.Series([1, 2, 3])
Out[1]:
0 1
1 2
2 3
dtype: int64
>>> t = pd.concat([s, pd.Series(index=s.index + 0.1)]).sort_index()
Out[2]:
0.0 1.0
0.1 NaN
1.0 2.0
1.1 NaN
2.0 3.0
2.1 NaN
dtype: float64
>>> t.interpolate(method='nearest')
Out[3]:
0.0 1.0
0.1 1.0
1.0 2.0
1.1 2.0
2.0 3.0
2.1 NaN
dtype: float64
>>> t.interpolate(method='nearest', fill_value='extrapolate')
Out[4]:
0.0 1.0
0.1 1.0
1.0 2.0
1.1 2.0
2.0 3.0
2.1 3.0
dtype: float64

Categories