Categorical area stackplot in pandas grouped by date - python

I found the way to implement the stackplot if my x-axis is just a list of numbers.
import pandas as pd
import matplotlib.pyplot as plt

# Toy data: a single date with four target classes, their counts, and each
# class's fraction of the total.
d = {
    'time_key': {0: '2021-03-01', 1: '2021-03-01', 2: '2021-03-01', 3: '2021-03-01'},
    'target': {0: 2, 1: 1, 2: 0, 3: 3},
    'count': {0: 400, 1: 300, 2: 200, 3: 100},
    'fraction': {0: 0.4, 1: 0.3, 2: 0.2, 3: 0.1},
}
df = pd.DataFrame(d)

# One stacked layer per target class, read from the frame built above.
plt.stackplot(
    range(2),
    df[df.target == 0].fraction,
    df[df.target == 1].fraction,
    df[df.target == 2].fraction,
    df[df.target == 3].fraction,
)
But I want to generalize the plot to a list of many dates.
d = {'time_key': {0: '2021-03-01',
1: '2021-03-01',
2: '2021-03-01',
3: '2021-03-01',
4: '2021-04-01',
5: '2021-04-01',
6: '2021-04-01',
7: '2021-04-01',
8: '2021-05-01',
9: '2021-05-01',
10: '2021-05-01',
11: '2021-05-01'},
'target': {0: 2,
1: 1,
2: 0,
3: 3,
4: 2,
5: 1,
6: 0,
7: 3,
8: 2,
9: 1,
10: 0,
11: 3},
'count': {0: 163,
1: 110,
2: 90,
3: 38,
4: 113,
5: 97,
6: 56,
7: 34,
8: 85,
9: 57,
10: 42,
11: 16},
'fraction': {0: 0.18091009988901222,
1: 0.1220865704772475,
2: 0.09988901220865705,
3: 0.042175360710321866,
4: 0.12541620421753608,
5: 0.1076581576026637,
6: 0.06215316315205328,
7: 0.03773584905660377,
8: 0.09433962264150944,
9: 0.06326304106548279,
10: 0.04661487236403995,
11: 0.017758046614872364}}
And I'd like to put the dates on the x-axis in ascending order to see the dynamics of the proportions.
Is there a proper way to implement this?
The approximate desired output plot (I need time_key x-axis though):

Try:
# Reshape to one row per date and one column per target class, holding counts.
counts = df.set_index(['time_key', 'target'])['count']
dfp = counts.unstack()

# Divide each row by its total so every date sums to 1, then draw stacked bars.
row_totals = dfp.sum(axis=1)
shares = dfp.div(row_totals, axis=0)
shares.plot.bar(stacked=True)
Output:

Another useful solution is:
# Outer keys become columns (one per target class); inner keys become the
# index (one row per date). Each row holds that date's shares.
d = {0: {'2021-03-01': 0.2, '2021-04-01': 0.25, '2021-05-01': 0.3},
     1: {'2021-03-01': 0.3, '2021-04-01': 0.25, '2021-05-01': 0.3},
     2: {'2021-03-01': 0.4, '2021-04-01': 0.25, '2021-05-01': 0.3},
     3: {'2021-03-01': 0.1, '2021-04-01': 0.25, '2021-05-01': 0.1}}
df = pd.DataFrame(d)

# Select the style BEFORE creating the figure: a figure and axes that already
# exist keep the settings that were active when they were created.
plt.style.use('classic')
fig, ax = plt.subplots(figsize=(9, 6))

# Stacked area chart with the dates on the x-axis.
df.plot.area(ax=ax)

Related

Adding extra text (second value) under bars in graph, without changing contents of table

I'm trying to make a bar graph that shows the amount of calories of the food with the least amount of calories for each food category in my csv file. I've succeeded in doing this, but in my bar graph, I'm supposed to show both the food category and the name of the food under the bars,
like so. Currently, I only have the name of the food category under the bars, as shown here
my current code:
# Lowest calorie value within each food category, drawn as a bar chart.
Ams3 = data.groupby('Category')['Calories'].min()
Ams3.plot(kind='bar')
my data.head().to_dict():
{'Unnamed: 0': {0: 0, 1: 1, 2: 2, 3: 3, 4: 4},
'Category': {0: 'Breakfast',
1: 'Breakfast',
2: 'Breakfast',
3: 'Breakfast',
4: 'Breakfast'},
'Item': {0: 'Egg McMuffin',
1: 'Egg White Delight',
2: 'Sausage McMuffin',
3: 'Sausage McMuffin with Egg',
4: 'Sausage McMuffin with Egg Whites'},
'Serving Size': {0: '4.8 oz (136 g)',
1: '4.8 oz (135 g)',
2: '3.9 oz (111 g)',
3: '5.7 oz (161 g)',
4: '5.7 oz (161 g)'},
'Calories': {0: 300.0, 1: 250.0, 2: 370.0, 3: 450.0, 4: 400.0},
'Calories from Fat': {0: 120, 1: 70, 2: 200, 3: 250, 4: 210},
'Total Fat': {0: 13.0, 1: 8.0, 2: 23.0, 3: 28.0, 4: 23.0},
'Total Fat (% Daily Value)': {0: 20, 1: 12, 2: 35, 3: 43, 4: 35},
'Saturated Fat': {0: 5.0, 1: 3.0, 2: 8.0, 3: 10.0, 4: 8.0},
'Saturated Fat (% Daily Value)': {0: 25, 1: 15, 2: 42, 3: 52, 4: 42},
'Trans Fat': {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0, 4: 0.0},
'Cholesterol': {0: 260, 1: 25, 2: 45, 3: 285, 4: 50},
'Cholesterol (% Daily Value)': {0: 87, 1: 8, 2: 15, 3: 95, 4: 16},
'Sodium': {0: 750, 1: 770, 2: 780, 3: 860, 4: 880},
'Sodium (% Daily Value)': {0: 31, 1: 32, 2: 33, 3: 36, 4: 37},
'Carbohydrates': {0: 31, 1: 30, 2: 29, 3: 30, 4: 30},
'Carbohydrates (% Daily Value)': {0: 10, 1: 10, 2: 10, 3: 10, 4: 10},
'Dietary Fiber': {0: 4, 1: 4, 2: 4, 3: 4, 4: 4},
'Dietary Fiber (% Daily Value)': {0: 17, 1: 17, 2: 17, 3: 17, 4: 17},
'Sugars': {0: 3, 1: 3, 2: 2, 3: 2, 4: 2},
'Protein': {0: 17, 1: 18, 2: 14, 3: 21, 4: 21},
'Vitamin A (% Daily Value)': {0: 10, 1: 6, 2: 8, 3: 15, 4: 6},
'Vitamin C (% Daily Value)': {0: 0, 1: 0, 2: 0, 3: 0, 4: 0},
'Calcium (% Daily Value)': {0: 25, 1: 25, 2: 25, 3: 30, 4: 25},
'Iron (% Daily Value)': {0: 15, 1: 8, 2: 10, 3: 15, 4: 10}}

go.Box names editing

Here is my code for some NBA project:
# (column of per_game_player, trace name) for each box plot, left to right.
stat_panels = [
    ('trb', 'Rebounds'),
    ('ast', 'Assists'),
    ('stl', 'Steals'),
    ('blk', 'Blocks'),
    ('pts', 'Points'),
]

# A single row with one subplot column per statistic.
fig = make_subplots(rows=1, cols=5)
for col_idx, (stat, label) in enumerate(stat_panels, start=1):
    fig.add_trace(
        go.Box(x=per_game_player['HOF'], y=per_game_player[stat], name=label),
        row=1, col=col_idx,
    )

# Hover text: the player's name (taken from customdata) followed by the y value.
hovertemp = '<b>%{customdata}: </b> %{y} <br>'
fig.update_traces(
    hovertemplate=hovertemp,
    customdata=per_game_player['player'],
    showlegend=False,
)
fig.update_layout(title_text='top stats')
fig.show()
Here's the result:
Do you have any idea how I can replace the '0' and '1' labels on the horizontal axis?
'0' and '1' also appear on hover once I slide through the chart - but not on the outliers.
My table head:
{'player_id': {0: 2218, 1: 3168, 2: 2560, 3: 3228, 4: 4374},
'player': {0: 'A.C. Green',
1: 'A.J. Bramlett',
2: 'A.J. English',
3: 'A.J. Guyton',
4: 'A.J. Hammons'},
'g': {0: 1278.0, 1: 8.0, 2: 151.0, 3: 80.0, 4: 22.0},
'fg': {0: 3.56, 1: 0.5, 2: 4.09, 3: 2.08, 4: 0.77},
'fga': {0: 7.2, 1: 2.62, 2: 9.39, 3: 5.5, 4: 1.91},
'trb': {0: 7.41, 1: 2.75, 2: 2.09, 3: 1.0, 4: 1.64},
'ast': {0: 1.1, 1: 0.0, 2: 2.12, 3: 1.84, 4: 0.18},
'stl': {0: 0.81, 1: 0.12, 2: 0.38, 3: 0.25, 4: 0.05},
'blk': {0: 0.43, 1: 0.0, 2: 0.16, 3: 0.15, 4: 0.59},
'pts': {0: 9.65, 1: 1.0, 2: 9.95, 3: 5.52, 4: 2.18},
'all_star': {0: 1, 1: 0, 2: 0, 3: 0, 4: 0},
'outliers': {0: 0, 1: 0, 2: 0, 3: 0, 4: 0},
'HOF': {0: 0, 1: 0, 2: 0, 3: 0, 4: 0}}
Thx.
I'd simply just alter the data in your dataframe.
import numpy as np
# Swap the numeric flag for a readable label: 1 -> 'HOF', anything else -> 'Not HOF'.
is_hof = per_game_player['HOF'] == 1
per_game_player['HOF'] = np.where(is_hof, 'HOF', 'Not HOF')
Output:

In Python, pandas, how to ignore invalid values in python when i convert the columns from hexa to decimal?

when I use:
df[["Type 2", "Type 4"]].applymap(lambda n: int(n, 16))
It stops with an error because the Type 2 column contains values that are invalid for hexadecimal conversion (negative values, NaN, arbitrary strings, ...). How can I ignore this error, or mark the invalid values as zero?
{'Type 1': {0: 1, 1: 3, 2: 5, 3: 7, 4: 9, 5: 11, 6: 13, 7: 15, 8: 17},
'Type 2': {0: 'AA',
1: 'BB',
2: 'NaN',
3: '55',
4: '3.14',
5: '-96',
6: 'String',
7: 'FFFFFF',
8: 'FEEE'},
'Type 3': {0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0},
'Type 4': {0: '23',
1: 'fefe',
2: 'abcd',
3: 'dddd',
4: 'dad',
5: 'cfe',
6: 'cf42',
7: '321',
8: '0'},
'Type 5': {0: -120,
1: -120,
2: -120,
3: -120,
4: -120,
5: -120,
6: -120,
7: -120,
8: -120}}
You can create a personalized function that handles this exception to use in your lambda. For example:
def lambda_int(n):
    """Convert a hexadecimal string to an integer, mapping invalid input to 0.

    Args:
        n: The value to convert, normally a string of hexadecimal digits
            such as ``'AA'`` or ``'fefe'``.

    Returns:
        The non-negative integer value of ``n`` read in base 16, or ``0`` when
        ``n`` is not a valid hexadecimal string, is not a string at all
        (for example a real NaN or ``None``), or is negative.
    """
    try:
        value = int(n, 16)
    except (ValueError, TypeError):
        # ValueError: a string that is not hexadecimal ('3.14', 'String', 'NaN').
        # TypeError: a non-string such as float('nan') or None, which int()
        # rejects when an explicit base is given.
        return 0
    # int() accepts a leading minus sign, so '-96' would otherwise come back
    # as -150; negative values are treated as invalid here.
    return value if value >= 0 else 0
# Convert both hexadecimal columns element by element and write them back.
hex_columns = ["Type 2", "Type 4"]
df[hex_columns] = df[hex_columns].applymap(lambda_int)
Please go through this; I reconstructed your question and listed the steps to follow.
1. The first dictionary you provided does not contain a missing value; it contains the string "NaN".
data = {'Type 1': {0: 1, 1: 3, 2: 5, 3: 7, 4: 9, 5: 11, 6: 13, 7: 15, 8: 17},
'Type 2': {0: 'AA',
1: 'BB',
2: 'NaN',
3: '55',
4: '3.14',
5: '-96',
6: 'String',
7: 'FFFFFF',
8: 'FEEE'},
'Type 3': {0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0},
'Type 4': {0: '23',
1: 'fefe',
2: 'abcd',
3: 'dddd',
4: 'dad',
5: 'cfe',
6: 'cf42',
7: '321',
8: '0'},
'Type 5': {0: -120,
1: -120,
2: -120,
3: -120,
4: -120,
5: -120,
6: -120,
7: -120,
8: -120}}
import pandas as pd
df = pd.DataFrame(data)
df.head()
To check nan in your df and remove them
# Count the missing (NaN) values in each column of df.
columns_with_na = df.isna().sum()
# Keep only the columns that have at least one missing value.
columns_with_na = columns_with_na[columns_with_na != 0]
# Number of columns that contain missing values.
print(len(columns_with_na))
print(len(columns_with_na.sort_values(ascending = False))) # length after sorting in descending order (same number as above)
Prints 0 and 0 because there is no nan
Reconstructed your data to include a nan by using numpy.nan
import numpy as np
#recreated a dataset and included a nan value : np.nan at Type 2
data = {'Type 1': {0: 1, 1: 3, 2: 5, 3: 7, 4: 9, 5: 11, 6: 13, 7: 15, 8: 17},
'Type 2': {0: 'AA',
1: 'BB',
2: np.nan,
3: '55',
4: '3.14',
5: '-96',
6: 'String',
7: 'FFFFFF',
8: 'FEEE'},
'Type 3': {0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0},
'Type 4': {0: '23',
1: 'fefe',
2: 'abcd',
3: 'dddd',
4: 'dad',
5: 'cfe',
6: 'cf42',
7: '321',
8: '0'},
'Type 5': {0: -120,
1: -120,
2: -120,
3: -120,
4: -120,
5: -120,
6: -120,
7: -120,
8: -120}}
df2 = pd.DataFrame(data)
df2.head()
# Count the missing (NaN) values in each column of df2.
columns_with_na = df2.isna().sum()
# Keep only the columns that have at least one missing value.
columns_with_na = columns_with_na[columns_with_na != 0]
# Number of columns that contain missing values.
print(len(columns_with_na))
# Sorting does not change the length, so this prints the same number again.
print(len(columns_with_na.sort_values(ascending = False)))
prints 1 and 1 because there is a nan at Type 2 column
# Drop every row that contains at least one NaN.
df2 = df2.dropna(how = 'any')
# Count the missing (NaN) values in each column of df2.
columns_with_na = df2.isna().sum()
# Keep only the columns that have at least one missing value.
columns_with_na = columns_with_na[columns_with_na != 0]
print(len(columns_with_na))
# Prints 0 because every row containing a NaN was dropped above.
df2.head()
To fill nan in df with 0 use:
df2.fillna(0, inplace = True)
Fill in nan with 0 in df2['Type 2'] only:
# Assign the filled column back to df2. Calling fillna(0, inplace=True) on the
# df2['Type 2'] selection operates on an intermediate object and does not
# reliably update df2 (with pandas Copy-on-Write it never does).
df2['Type 2'] = df2['Type 2'].fillna(0)

How to update a seaborn line plot with ipywidgets checkboxes?

I am struggling with the ipywidgets module.
I am trying to make a plot where you can toggle lines off/on with checkboxes based on a province.
fig, ax = plt.subplots(figsize=(10,10))
sns.lineplot(data=df5, x="Date_of_report", y="Total_reported", hue="Province", ax=ax)
provinces = df5["Province"].unique()
chk = [widgets.Checkbox(description=a) for a in provinces]
def updatePlot(**kwargs):
    """Print the keyword arguments as a list of (name, value) pairs.

    Args:
        **kwargs: Arbitrary keyword arguments; the surrounding script builds
            one per checkbox, keyed by the checkbox description.
    """
    print(list(kwargs.items()))
widgets.interact(updatePlot, **{c.description: c.value for c in chk})
As you can see, I can draw the checkboxes and it prints out the status of the boxes.
but I don't know how to update the seaborn line plot. So when you select say: Drenthe it only shows the line from Drenthe.
here is the dataframe as a dict:
{'Date_of_report': {0: Timestamp('2020-03-13 10:00:00'), 1: Timestamp('2020-03-13 10:00:00'), 2: Timestamp('2020-03-13 10:00:00'), 3: Timestamp('2020-03-13 10:00:00'), 4: Timestamp('2020-03-13 10:00:00'), 5: Timestamp('2020-03-13 10:00:00'), 6: Timestamp('2020-03-13 10:00:00'), 7: Timestamp('2020-03-13 10:00:00'), 8: Timestamp('2020-03-13 10:00:00'), 9: Timestamp('2020-03-13 10:00:00')}, 'Province': {0: 'Drenthe', 1: 'Flevoland', 2: 'Friesland', 3: 'Gelderland', 4: 'Groningen', 5: 'Limburg', 6: 'Noord-Brabant', 7: 'Noord-Holland', 8: 'Overijssel', 9: 'Utrecht'}, 'Total_reported': {0: 14, 1: 7, 2: 8, 3: 64, 4: 4, 5: 71, 6: 377, 7: 66, 8: 18, 9: 83}, 'Hospital_admission': {0: 0, 1: 3, 2: 2, 3: 9, 4: 1, 5: 17, 6: 65, 7: 4, 8: 0, 9: 7}, 'Deceased': {0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 3, 6: 5, 7: 0, 8: 0, 9: 0}}

Modify plotly layout y-axis

I am working on two datasets on churn classification. My problem, as you can see in the two graphs below, is that the y-axes are not on the same scale: Bank stops at 0.8 and telco-europa at 1. I would like to force the y-axis to always display 0, 0.2, 0.4, 0.6, 0.8, 1.
I have used the following code:
and my histogram is based on this tutorial: https://www.kaggle.com/pavanraj159/telecom-customer-churn-prediction and the bank dataset is this one https://www.kaggle.com/shrutimechlearn/churn-modelling
import plotly.graph_objs as go#visualization
import plotly.offline as py#visualization
def output_tracer(metric, color, model_performances):
    """Build a vertical bar trace of one metric across the algorithms.

    Args:
        metric: Key of the value to plot in ``model_performances``; also used
            as the trace name.
        color: Fill colour of the bars.
        model_performances: Table indexable by ``"Algorithm"`` (x values) and
            by ``metric`` (y values).

    Returns:
        The ``go.Bar`` trace.
    """
    bar_marker = {"line": {"width": .7}, "color": color}
    return go.Bar(
        x=model_performances["Algorithm"],
        y=model_performances[metric],
        name=metric,
        orientation="v",
        marker=bar_marker,
    )
def output_data(model_performances):
    """Build the five bar traces shown in the performance chart.

    Args:
        model_performances: Table passed through to ``output_tracer``.

    Returns:
        A list of traces, in order: 1-Precision, 1-Recall, 1-F1-score,
        Accuracy, AUC.
    """
    # (metric, bar colour), in the order the traces appear in the figure.
    metric_colors = (
        ("1-Precision", "#6699FF"),
        ('1-Recall', "red"),
        ('1-F1-score', "#33CC99"),
        ('Accuracy', "lightgrey"),
        ('AUC', "#FFCC99"),
    )
    return [
        output_tracer(metric, shade, model_performances)
        for metric, shade in metric_colors
    ]
def output_layout(model):
    """Build the figure layout for one dataset's performance chart.

    Args:
        model: Used as the figure title.

    Returns:
        A ``go.Layout`` with a grey background, white grid lines on both axes,
        a 250 left margin and a height of 400.
    """
    background = "rgb(243,243,243)"
    grid = 'rgb(255, 255, 255)'

    x_axis = dict(gridcolor=grid, title="", zerolinewidth=1, ticklen=5, gridwidth=2)
    y_axis = dict(gridcolor=grid, zerolinewidth=1, ticklen=5, gridwidth=2)

    settings = dict(
        title=model,
        plot_bgcolor=background,
        paper_bgcolor=background,
        xaxis=x_axis,
        yaxis=y_axis,
        margin=dict(l=250),
        height=400,
    )
    return go.Layout(settings)
# Plot the scores of a single dataset.
model = "Bank"

# Keep only the rows that belong to the selected dataset.
is_selected = report_df_scoring["Dataset"] == model
model_performances = report_df_scoring.loc[is_selected]

traces = output_data(model_performances)
layout = output_layout(model)
fig = go.Figure(data=traces, layout=layout)
py.iplot(fig)
And here you can find the dataframe "report_df_scoring" as a dictionary, for only the "Bank" dataset:
{'Dataset': {0: 'Bank',
1: 'Bank',
2: 'Bank',
3: 'Bank',
4: 'Bank',
5: 'Bank',
6: 'Bank'},
'Algorithm': {0: 'LogisticRegressionNoSMOTE',
1: 'Logistic Regression',
2: 'SVM-linear',
3: 'SVM-rbf',
4: 'xgboost',
5: 'GaussianNB',
6: 'RandomForest'},
'W-Precision': {0: 0.8159638339642141,
1: 0.8229500536388679,
2: 0.8243426658647828,
3: 0.7956512785333915,
4: 0.8288351219512194,
5: 0.8302513223140496,
6: 0.8307514249037228},
'W-Recall': {0: 0.8324,
1: 0.7636,
2: 0.7628,
3: 0.8056,
4: 0.836,
5: 0.8176,
6: 0.8408},
'W-F1-score': {0: 0.810103868755423,
1: 0.7811452562742854,
2: 0.7807117770916884,
3: 0.7997335148514852,
4: 0.831622605929424,
5: 0.7598757585104978,
6: 0.8336474053248425},
'0-Precision': {0: 0.8493518104604381,
1: 0.9187236604455148,
2: 0.9206541490006056,
3: 0.8634596695821186,
4: 0.8834146341463415,
5: 0.8152892561983471,
6: 0.8789473684210526},
'0-Recall': {0: 0.958627648839556,
1: 0.7699293642785066,
2: 0.7669021190716448,
3: 0.8965691220988901,
4: 0.9137235116044399,
5: 0.9954591321897074,
6: 0.9268415741675076},
'0-F1-score': {0: 0.9006873666745674,
1: 0.8377710678012626,
2: 0.8367740159647675,
3: 0.8797029702970298,
4: 0.8983134920634921,
5: 0.8964107223989097,
6: 0.9022593320235756},
'1-Precision': {0: 0.6882129277566539,
1: 0.4564958283671037,
2: 0.4558303886925795,
3: 0.5361990950226244,
4: 0.62,
5: 0.8875,
6: 0.6463414634146342},
'1-Recall': {0: 0.34942084942084944,
1: 0.7393822393822393,
2: 0.747104247104247,
3: 0.4575289575289575,
4: 0.5386100386100386,
5: 0.13706563706563707,
6: 0.5115830115830116},
'1-F1-score': {0: 0.4635083226632522,
1: 0.5644804716285925,
2: 0.5662033650329188,
3: 0.49375,
4: 0.5764462809917356,
5: 0.2374581939799331,
6: 0.5711206896551725},
'Accuracy': {0: 0.8324,
1: 0.7636,
2: 0.7628,
3: 0.8056,
4: 0.836,
5: 0.8176,
6: 0.8408},
'AUC': {0: 0.6540242491302027,
1: 0.754655801830373,
2: 0.7570031830879459,
3: 0.6770490398139237,
4: 0.7261667751072393,
5: 0.5662623846276723,
6: 0.7192122928752596},
'SMOTE': {0: 'No',
1: 'Yes',
2: 'Yes',
3: 'Yes',
4: 'Yes',
5: 'Yes',
6: 'Yes'},
'top3var': {0: "['numofproducts_4', 'numofproducts_3', 'geography_germany']",
1: "['numofproducts_4', 'numofproducts_3', 'geography_germany']",
2: "['numofproducts_4', 'numofproducts_3', 'age']",
3: "['empty']",
4: "['numofproducts_2', 'numofproducts_1', 'isactivemember']",
5: "['empty']",
6: "['age', 'numofproducts_2', 'balance']"}}
You can access and edit the range of any axis of your figure using:
fig['layout']['yaxis']['range']
And set the range like:
fig['layout']['yaxis']['range'] = [0, 1]
The same thing goes for your tickvals:
fig['layout']['yaxis']['tickvals'] = [0, 0.2, 0.4, 0.6, 0.8, 1]
You can use:
fig.update_yaxes(tickvals=[0, 0.2, 0.4, 0.6, 0.8, 1])
Your code example does not work for me because "report_df_scoring" is missing.

Categories