how to plot many categories with matplotlib? - python

Consider the example below
dfa = pd.DataFrame({'type' : ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q'],
'value' : [2,3,4,2,5,3,6,5,3,1,3,5,7,5,3,5,4],
'date' : [pd.to_datetime('2021-01-01')]*17})
dfa
Out[337]:
type value date
0 a 2 2021-01-01
1 b 3 2021-01-01
2 c 4 2021-01-01
3 d 2 2021-01-01
4 e 5 2021-01-01
5 f 3 2021-01-01
6 g 6 2021-01-01
7 h 5 2021-01-01
8 i 3 2021-01-01
9 j 1 2021-01-01
10 k 3 2021-01-01
11 l 5 2021-01-01
12 m 7 2021-01-01
13 n 5 2021-01-01
14 o 3 2021-01-01
15 p 5 2021-01-01
16 q 4 2021-01-01
As you can see, I have (too) many categories but I still need to plot all of them at the same time. I tried to use the hatch argument in matplotlib, but it applies the same pattern to every bar instead of shading some categories with one pattern and others with another (so that more categories are visually distinct).
dfa.set_index(['date','type']).unstack().plot.bar(stacked = True, hatch = 'o')
What can I do here?
Thanks!

You could loop through the generated bars, and assign a unique hatch pattern to each individual group. You'll need to generate the legend again, so it gets updated with the changed bars.
Choosing bar.set_hatch(pattern * 2) instead of just bar.set_hatch(pattern) will generate a pattern that is twice as dense. See the hatch demo for more examples.
import matplotlib.pyplot as plt
import pandas as pd
# Example data: 17 categories on each of two dates, so every date gets its
# own stacked bar.
dfa = pd.DataFrame({
    'type': ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q'] * 2,
    'value': [2, 3, 4, 2, 5, 3, 6, 5, 3, 1, 3, 5, 7, 5, 3, 5, 4,
              3, 2, 2, 1, 5, 2, 7, 2, 3, 7, 5, 3, 5, 3, 5, 4, 3],
    'date': [pd.to_datetime('2021-01-01')] * 17 + [pd.to_datetime('2021-01-02')] * 17,
})
ax = dfa.set_index(['date', 'type']).unstack().plot.bar(stacked=True, rot=0)

# One pattern per category; combined strings such as '/o' overlay two hatches.
hatch_patterns = ['/', '\\', '|', '-', '+', 'x', 'o', 'O', '.', '*', '/o', '\\|', '|*', '-\\', '+o', 'x*', 'o-', 'O|']

# Give every bar group its own hatch; zip stops at the shorter of the two
# sequences, so surplus patterns are simply unused.
for bars, pattern in zip(ax.containers, hatch_patterns):
    for bar in bars:
        # Repeating the pattern string makes the hatch twice as dense.
        bar.set_hatch(pattern * 2)

# Re-create the legend so that its handles show the newly assigned hatches.
ax.legend(bbox_to_anchor=(1.01, 1.01), loc='upper left')
plt.tight_layout()
plt.show()

Related

Add a custom value on seaborn boxplots graphs

I am having trouble with a specific demand on my graphs.
For now, I had to do the following instructions:
Read two dataframes
Create boxplots for the first dataframe and color the boxplots depending on the values of the second dataframe (the code is below, and more information is in my previous Stack Overflow question)
The code below works and my problems come after:
df=pd.DataFrame([['A',10, 22], ['A',12, 15], ['A',0, 2], ['A', 20, 25], ['A', 5, 5], ['A',12, 11], ['B', 0 ,0], ['B', 9 ,0], ['B', 8 ,50], ['B', 0 ,0], ['B', 18 ,5], ['B', 7 ,6],['C', 10 ,11], ['C', 9 ,10], ['C', 8 ,2], ['C', 6 ,2], ['C', 8 ,5], ['C', 6 ,8]],
columns=['Name', 'Value_01','Value_02'])
df_agreement=pd.DataFrame([['A', '<66%', '>80'],['B', '>80%', '>66% & <80%'], ['C', '<66%', '<66%']], columns=['Name', 'Agreement_01', 'Agreement_02'])
fig = plt.figure()
# Change seaborn plot size
fig.set_size_inches(60, 40)
plt.xticks(rotation=70)
plt.yticks(fontsize=40)
df_02=pd.melt(df, id_vars=['Name'],value_vars=['Value_01', 'Value_02'])
bp=sns.boxplot(x='Name',y='value',hue="variable",showfliers=True, data=df_02,showmeans=True,meanprops={"marker": "+",
"markeredgecolor": "black",
"markersize": "20"})
bp.set_xlabel("Name", fontsize=45)
bp.set_ylabel('Value', fontsize=45)
handles, labels = bp.get_legend_handles_labels()
new_handles = handles + [plt.Rectangle((0, 0), 0, 0, facecolor="#D1DBE6", edgecolor='black', linewidth=2),
plt.Rectangle((0, 0), 0, 0, facecolor="#EFDBD1", edgecolor='black', linewidth=2)]
bp.legend(handles=new_handles,
labels=['V_01', 'V_02', "V_01 with less\n than 66% agreement", "V_02 with less\n than 66% agreement"])
# Collect the indices of the boxes that must be recoloured.
# NOTE(review): the 2*i / 2*i + 1 indexing presumably relies on the boxes being
# stored name by name with the two hue levels interleaved — confirm against
# the seaborn/matplotlib version in use.
list_color_1 = []
list_color_2 = []
for i in range(0, len(df_agreement)):
    if df_agreement.loc[i, 'Agreement_01'] == "<66%":
        list_color_1.append(i * 2)
    if df_agreement.loc[i, 'Agreement_02'] == "<66%":
        list_color_2.append(i * 2 + 1)

# Restyle the selected boxes: first the Value_01 group, then the Value_02 group.
for box_indices, face_color in ((list_color_1, "#D1DBE6"), (list_color_2, "#EFDBD1")):
    for k in box_indices:
        mybox = bp.artists[k]
        mybox.set_facecolor(face_color)  # facecolor is the inside color of the boxplot
        mybox.set_edgecolor('black')     # edgecolor is the line color of the box
        mybox.set_linewidth(2)
Now I have a new dataFrame, equivalent to the first one (df), but with different values:
df_02=pd.DataFrame([['A',5, 20], ['A',15, 2], ['A',3, 5], ['A', 21, 24], ['A', 6, 6], ['A',10, 10], ['B', 0 ,0], ['B', 9 ,0], ['B', 9 ,5], ['B', -4 ,-2], ['B', 8 ,7], ['B', 8 ,9],['C', 10 ,15], ['C', 9 ,10], ['C', 8 ,2], ['C', 6 ,2], ['C', 8 ,5], ['C', 6 ,8]],
columns=['Name', 'Value_01','Value_02'])
What I would like to do is add, on each boxplot (and only within that boxplot), a bar corresponding to the value from my second dataframe (df_02).
Is there anyone who would have a guess for that one ?

dynamic shift with groupby on dataframe

I need to shift a grouped data frame by a dynamic number. I can do it with apply, but the performance is not very good.
Any way to do that without apply?
Here is a sample of what I would like to do:
df = pd.DataFrame({
'GROUP': ['A', 'A', 'A', 'A', 'A', 'A', 'B','B','B','B','B','B'],
'VALUE': [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2],
'SHIFT': [ 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3]
})
df['SUM'] = df.groupby('GROUP').VALUE.cumsum()
# THIS DOESN'T WORK:
df['VALUE'] = df.groupby('GROUP').SUM.shift(df.SHIFT)
I do it with apply the following way:
df = pd.DataFrame({
    'GROUP': ['A', 'A', 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'B', 'B'],
    'VALUE': [1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2],
    'SHIFT': [2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3],
})


def func(group):
    """Return a copy of *group* whose SUM column is shifted by the group's SHIFT.

    The shift amount is read from the first row of the group's SHIFT column.
    """
    s = int(group.SHIFT.iloc[0])
    # Work on a copy so the frame handed in by groupby is not modified in place.
    group = group.copy()
    group['SUM'] = group.SUM.shift(s)
    return group


df['SUM'] = df.groupby('GROUP').VALUE.cumsum()
# group_keys=False keeps the original row index instead of prepending GROUP to it.
df = df.groupby('GROUP', group_keys=False).apply(func)
Here is a pure numpy version that works if the data frame is sorted by group (like your example):
# Zero-based position of every row inside its own group.
rank_in_group = df.groupby('GROUP').cumcount()
shift_by = df['SHIFT']
# Rows at or beyond the shift distance receive a value; the others stay NaN.
notnull = np.where(rank_in_group >= shift_by)[0]
# Each receiving row reads from the row `shift` positions above it.
source = notnull - shift_by.values[notnull]
group_cumsum = df.groupby('GROUP')['VALUE'].cumsum().values
# Start from an all-NaN column and fill in only the receiving rows.
shifted = np.full(df.shape[0], np.nan)
shifted[notnull] = group_cumsum[source]
df['SUM'] = shifted
It first gets the indices of rows that are to be updated. The shifts can be subtracted to yield the source rows.
A solution that avoids apply could be the following, provided the groups are contiguous:
import numpy as np
import pandas as pd
df = pd.DataFrame({
    'GROUP': ['A', 'A', 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'B', 'B'],
    'VALUE': [1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2],
    'SHIFT': [2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3],
})

# compute values required for the slices
# start[k] is the first row position of the k-th group (groups must be contiguous)
_, start = np.unique(df.GROUP.values, return_index=True)
gp = df.groupby('GROUP')
shifts = gp.SHIFT.first()
sizes = gp.size().values
end = (sizes - shifts.values) + start

# compute slices: `source` rows are read, `target` rows are written
source = [i for s, f in zip(start, end) for i in range(s, f)]
target = [i for j, s, f in zip(start, shifts, sizes) for i in range(j + s, j + f)]

# compute cumulative sum and an array of NaN
s = gp.VALUE.cumsum().values
# float64 (rather than float32) so that large cumulative sums keep full precision
r = np.full(s.shape, np.nan)

# write the shifted cumulative sums into the array of NaN
np.put(r, target, s[source])

# set the sum column
df['SUM'] = r
print(df)
Output
GROUP SHIFT VALUE SUM
0 A 2 1 NaN
1 A 2 2 NaN
2 A 2 3 1.0
3 A 2 4 3.0
4 A 2 5 6.0
5 A 2 6 10.0
6 B 3 7 NaN
7 B 3 8 NaN
8 B 3 9 NaN
9 B 3 0 7.0
10 B 3 1 15.0
11 B 3 2 24.0
With the exception of building the slices (source and target) all computations are done in a pandas/numpy level that should be fast. The idea is to manually simulate what would be done in the apply function.

How to add rows for all missing values of one multi-index's level?

Suppose that I have the following dataframe df, indexed by a 3-level multi-index:
In [52]: df
Out[52]:
C
L0 L1 L2
0 w P 1
y P 2
R 3
1 x Q 4
R 5
z S 6
Code to create the DataFrame:
# `codes` gives, per level, the position in `levels` of each row's value
# (this keyword was formerly called `labels`).
idx = pd.MultiIndex(levels=[[0, 1], ['w', 'x', 'y', 'z'], ['P', 'Q', 'R', 'S']],
                    codes=[[0, 0, 0, 1, 1, 1], [0, 2, 2, 1, 1, 3], [0, 0, 2, 1, 2, 3]],
                    names=['L0', 'L1', 'L2'])
df = pd.DataFrame({'C': [1, 2, 3, 4, 5, 6]}, index=idx)
The possible values for the L2 level are 'P', 'Q', 'R', and 'S', but some of these values are missing for particular combinations of values for the remaining levels. For example, the combination (L0=0, L1='w', L2='Q') is not present in df.
I would like to add enough rows to df so that, for each combination of values for the levels other than L2, there is exactly one row for each of the L2 level's possible values. For the added rows, the value of the C column should be 0.
IOW, I want to expand df so that it looks like this:
C
L0 L1 L2
0 w P 1
Q 0
R 0
S 0
y P 2
Q 0
R 3
S 0
1 x P 0
Q 4
R 5
S 0
z P 0
Q 0
R 0
S 6
REQUIREMENTS:
the operation should leave the types of the columns unchanged;
the operation should add the smallest number of rows needed to complete only the specified level (L2)
Is there a simple way to perform this expansion?
Provided that L2 initially contains all the possible values you need, you can use the unstack/stack trick:
df.unstack('L2', fill_value=0).stack(level=1)

Reshape rows to columns in pandas dataframe

In pandas how to go from a:
a = pd.DataFrame({'foo': ['m', 'm', 'm', 's', 's', 's'],
'bar': [1, 2, 3, 4, 5, 6]})
>>> a
bar foo
0 1 m
1 2 m
2 3 m
3 4 s
4 5 s
5 6 s
to b:
b = pd.DataFrame({'m': [1, 2, 3],
's': [4, 5, 6]})
>>> b
m s
0 1 4
1 2 5
2 3 6
I tried solutions in other answers, e.g. here and here but none seemed to do what I want.
Basically, I want to swap rows with columns and drop the index, but how to do it?
a.set_index(
[a.groupby('foo').cumcount(), 'foo']
).bar.unstack()
This is my solution
a = pd.DataFrame({'foo': ['m', 'm', 'm', 's', 's', 's'],
'bar': [1, 2, 3, 4, 5, 6]})
a.pivot(columns='foo', values='bar').apply(lambda x: pd.Series(x.dropna().values))
foo m s
0 1.0 4.0
1 2.0 5.0
2 3.0 6.0

Python curve fitting on pandas dataframe then add coef to new columns

I have a dataframe that needs to be curve fitted per row (second order polynomial).
There are four columns, each column name denotes the x value.
Each row contains 4 y values corresponding to the x values in the column name.
For example:
Based on the code below, the fitting for the first row will take x = [2, 5, 8, 12] and y = [5.91, 28.06, 67.07, 145.20].
import numpy as np
import pandas as pd

# Long-format measurements: one (id, id2, x, y) record per row.
df = pd.DataFrame({
    'id': [1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5],
    'id2': ['A', 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'B', 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'B', 'A', 'A', 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'B', 'A', 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'B'],
    'x': [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12],
    'y': [5.91, 4.43, 5.22, 1.31, 4.42, 3.65, 4.45, 1.70, 3.94, 3.29, 28.06, 19.51, 23.30, 4.20, 18.61, 17.60, 18.27, 16.18, 16.81, 16.37, 67.07, 46.00, 54.95, 43.66, 42.70, 41.32, 12.69, 36.75, 41.36, 38.66, 145.20, 118.34, 16.74, 94.10, 93.45, 86.60, 26.17, 77.12, 91.42, 83.11],
})
# One row per (id, id2) pair and one y column per distinct x value.
pivot_df = df.pivot_table(index=['id', 'id2'], columns=['x'])
[output]
>>> pivot_df
y
x 2 5 8 12
id id2
1 A 5.91 28.06 67.07 145.20
B 3.65 17.60 41.32 86.60
2 A 4.43 19.51 46.00 118.34
B 4.45 18.27 12.69 26.17
3 A 5.22 23.30 54.95 16.74
B 1.70 16.18 36.75 77.12
4 A 1.31 4.20 43.66 94.10
B 3.94 16.81 41.36 91.42
5 A 4.42 16.37 42.70 93.45
B 3.29 18.61 38.66 83.11
I want to perform the curve fitting without explicitly iterating over the rows in order to make use of the high performance under-the-hood iterating built into pandas' dataframes. I am not sure how to do so.
I wrote the code to loop through the rows anyway to show the desired output. Although the code below does work and provides the desired output, I need help in making it more concise/efficient.
# Dummy first row so that np.vstack has something to stack onto; it is
# removed again after the loop.
my_coef_array = np.zeros(3)
# get the x values from the column names
x = pivot_df.columns.get_level_values(pivot_df.columns.names.index('x')).values
# Fit one second-order polynomial per row and append its three coefficients.
for index in pivot_df.index:
    my_coef_array = np.vstack((my_coef_array, np.polyfit(x, pivot_df.loc[index].values, 2)))
# drop the dummy first row
my_coef_array = my_coef_array[1:, :]
# np.polyfit returns the highest power first: quadratic, linear, constant.
pivot_df['m2'] = my_coef_array[:, 0]
pivot_df['m1'] = my_coef_array[:, 1]
pivot_df['c'] = my_coef_array[:, 2]
[output]
>>> pivot_df
y m2 m1 c
x 2 5 8 12
id id2
1 A 5.91 28.06 67.07 145.20 0.934379 0.848422 0.471170
B 3.65 17.60 41.32 86.60 0.510664 1.156009 -0.767408
2 A 4.43 19.51 46.00 118.34 1.034594 -3.221912 7.518221
B 4.45 18.27 12.69 26.17 -0.015300 2.045216 2.496306
3 A 5.22 23.30 54.95 16.74 -1.356997 20.827407 -35.130416
B 1.70 16.18 36.75 77.12 0.410485 1.772052 -3.345097
4 A 1.31 4.20 43.66 94.10 0.803630 -1.577705 -1.148066
B 3.94 16.81 41.36 91.42 0.631377 -0.085651 1.551586
5 A 4.42 16.37 42.70 93.45 0.659044 -0.278738 2.068114
B 3.29 18.61 38.66 83.11 0.478171 1.218486 -0.638888
I found the following numpy.polynomial.polynomial.polyfit which is an alternative to np.polyfit that takes a 2-D array for y.
Starting your code from x, I get the following:
# polynomial.polyfit accepts a 2-D y and fits each of its columns, so the rows
# of pivot_df are transposed into columns for the call and the result is
# transposed back to give one row of coefficients per original row.
my_coef_array = pd.DataFrame(np.polynomial.polynomial.polyfit(x, pivot_df.T.values, 2)).T
my_coef_array.index = pivot_df.index
# Unlike np.polyfit, this function returns the lowest degree first, hence the
# constant term is named first.
my_coef_array.columns = ['c', 'm1', 'm2']
# NOTE(review): pivot_df has two column levels and my_coef_array has one —
# confirm that join accepts this on the pandas version in use.
pivot_df = pivot_df.join(my_coef_array)

Categories