go.Box names editing - python

Here is my code for some NBA project:
# Build a one-row grid holding one box plot per per-game statistic; every
# box plot uses the 'HOF' column for its x-axis categories.
stat_panels = [
    ('trb', 'Rebounds'),
    ('ast', 'Assists'),
    ('stl', 'Steals'),
    ('blk', 'Blocks'),
    ('pts', 'Points'),
]

fig = make_subplots(rows=1, cols=len(stat_panels))
for position, (column, label) in enumerate(stat_panels, start=1):
    fig.add_trace(
        go.Box(x=per_game_player['HOF'], y=per_game_player[column], name=label),
        row=1,
        col=position,
    )

# Shared hover text: the customdata entry (player name) in bold, then the y value.
hovertemp = '<b>%{customdata}: </b> %{y} <br>'
fig.update_traces(
    showlegend=False,
    customdata=per_game_player['player'],
    hovertemplate=hovertemp,
)
fig.update_layout(title_text='top stats')
fig.show()
Here's the result:
Do you have any idea how I can replace the '0' and '1' labels on the horizontal axis?
'0' and '1' also appear on hover once I slide through the chart - but not on the outliers.
My table head:
{'player_id': {0: 2218, 1: 3168, 2: 2560, 3: 3228, 4: 4374},
'player': {0: 'A.C. Green',
1: 'A.J. Bramlett',
2: 'A.J. English',
3: 'A.J. Guyton',
4: 'A.J. Hammons'},
'g': {0: 1278.0, 1: 8.0, 2: 151.0, 3: 80.0, 4: 22.0},
'fg': {0: 3.56, 1: 0.5, 2: 4.09, 3: 2.08, 4: 0.77},
'fga': {0: 7.2, 1: 2.62, 2: 9.39, 3: 5.5, 4: 1.91},
'trb': {0: 7.41, 1: 2.75, 2: 2.09, 3: 1.0, 4: 1.64},
'ast': {0: 1.1, 1: 0.0, 2: 2.12, 3: 1.84, 4: 0.18},
'stl': {0: 0.81, 1: 0.12, 2: 0.38, 3: 0.25, 4: 0.05},
'blk': {0: 0.43, 1: 0.0, 2: 0.16, 3: 0.15, 4: 0.59},
'pts': {0: 9.65, 1: 1.0, 2: 9.95, 3: 5.52, 4: 2.18},
'all_star': {0: 1, 1: 0, 2: 0, 3: 0, 4: 0},
'outliers': {0: 0, 1: 0, 2: 0, 3: 0, 4: 0},
'HOF': {0: 0, 1: 0, 2: 0, 3: 0, 4: 0}}
Thx.

I'd simply just alter the data in your dataframe.
import numpy as np
# Relabel the flag column: rows equal to 1 become 'HOF', all others 'Not HOF'.
is_hall_of_famer = per_game_player['HOF'].eq(1)
per_game_player['HOF'] = np.where(is_hall_of_famer, 'HOF', 'Not HOF')
Output:

Related

Custom function to replace missing values in dataframe with median located in pivot table

I am attempting to write a function to replace missing values in the 'total_income' column with the median 'total_income' provided by the pivot table, using the row's 'education' and 'income_type' to index the pivot table. I want to populate using these medians so that the values are as optimal as they can be. Here is what I am testing:
This is the first 5 rows of the dataframe as a dictionary:
{'index': {0: 0, 1: 1, 2: 2, 3: 3, 4: 4},
'children': {0: 1, 1: 1, 2: 0, 3: 3, 4: 0},
'days_employed': {0: 8437.673027760233,
1: 4024.803753850451,
2: 5623.422610230956,
3: 4124.747206540018,
4: 340266.07204682194},
'dob_years': {0: 42, 1: 36, 2: 33, 3: 32, 4: 53},
'education': {0: "bachelor's degree",
1: 'secondary education',
2: 'secondary education',
3: 'secondary education',
4: 'secondary education'},
'education_id': {0: 0, 1: 1, 2: 1, 3: 1, 4: 1},
'family_status': {0: 'married',
1: 'married',
2: 'married',
3: 'married',
4: 'civil partnership'},
'family_status_id': {0: 0, 1: 0, 2: 0, 3: 0, 4: 1},
'gender': {0: 'F', 1: 'F', 2: 'M', 3: 'M', 4: 'F'},
'income_type': {0: 'employee',
1: 'employee',
2: 'employee',
3: 'employee',
4: 'retiree'},
'debt': {0: 0, 1: 0, 2: 0, 3: 0, 4: 0},
'total_income': {0: 40620.102,
1: 17932.802,
2: 23341.752,
3: 42820.568,
4: 25378.572},
'purpose': {0: 'purchase of the house',
1: 'car purchase',
2: 'purchase of the house',
3: 'supplementary education',
4: 'to have a wedding'},
'age_group': {0: 'adult',
1: 'adult',
2: 'adult',
3: 'adult',
4: 'older adult'}}
def fill_income(row, table=None):
    """Return the row's ``total_income``, imputing a missing value from group medians.

    Intended for use as ``df.apply(fill_income, axis=1)``.

    Args:
        row: A row of ``df``; it must provide 'total_income', 'age_group',
            'income_type' and 'education'.
        table: Optional precomputed median pivot table whose rows are indexed
            by ('age_group', 'income_type') and whose columns are the
            'education' values. Build it once and pass it in to avoid
            recomputing it for every row; when omitted it is built from the
            module-level ``df``.

    Returns:
        The existing ``total_income`` when it is present; otherwise the median
        ``total_income`` of the rows sharing this row's age group, income type
        and education. The missing value is returned unchanged when the pivot
        table has no entry for that combination.
    """
    import pandas as pd  # local import keeps the function self-contained

    total_income = row['total_income']
    # A missing value is a float NaN (or None), which never compares equal to
    # the string 'NaN'; pd.isna is the reliable test.
    if not pd.isna(total_income):
        return total_income

    if table is None:
        table = df.pivot_table(
            index=['age_group', 'income_type'],
            columns='education',
            values='total_income',
            aggfunc='median',
        )

    # Rows are keyed by the (age_group, income_type) pair and columns by
    # education, so the lookup has to follow that same layout.
    try:
        return table.loc[(row['age_group'], row['income_type']), row['education']]
    except KeyError:
        return total_income
My desired output is the pivot table value (the median total_income) for the dataframe row's given education and income_type. When I test it, it returns 'None'.
Thanks in advance for your time helping me with this problem!

Categorical area stackplot in pandas grouped by date

I found the way to implement the stackplot if my x-axis is just a list of numbers.
import pandas as pd
import matplotlib.pyplot as plt

# A single date with four target classes and their count / fraction values.
d = {'time_key': {0: '2021-03-01',
                  1: '2021-03-01',
                  2: '2021-03-01',
                  3: '2021-03-01'},
     'target': {0: 2, 1: 1, 2: 0, 3: 3},
     'count': {0: 400, 1: 300, 2: 200, 3: 100},
     'fraction': {0: 0.4, 1: 0.3, 2: 0.2, 3: 0.1}}
df = pd.DataFrame(d)

# Stack one band per target class (0..3) over the x positions range(2).
plt.stackplot(range(2),
              df[df.target == 0].fraction, df[df.target == 1].fraction,
              df[df.target == 2].fraction, df[df.target == 3].fraction)
But I want to generalize the plot to a list of many dates.
d = {'time_key': {0: '2021-03-01',
1: '2021-03-01',
2: '2021-03-01',
3: '2021-03-01',
4: '2021-04-01',
5: '2021-04-01',
6: '2021-04-01',
7: '2021-04-01',
8: '2021-05-01',
9: '2021-05-01',
10: '2021-05-01',
11: '2021-05-01'},
'target': {0: 2,
1: 1,
2: 0,
3: 3,
4: 2,
5: 1,
6: 0,
7: 3,
8: 2,
9: 1,
10: 0,
11: 3},
'count': {0: 163,
1: 110,
2: 90,
3: 38,
4: 113,
5: 97,
6: 56,
7: 34,
8: 85,
9: 57,
10: 42,
11: 16},
'fraction': {0: 0.18091009988901222,
1: 0.1220865704772475,
2: 0.09988901220865705,
3: 0.042175360710321866,
4: 0.12541620421753608,
5: 0.1076581576026637,
6: 0.06215316315205328,
7: 0.03773584905660377,
8: 0.09433962264150944,
9: 0.06326304106548279,
10: 0.04661487236403995,
11: 0.017758046614872364}}
And I'd like to assign dates to x-axis in ascending order to see dynamics of the proportions.
Is there a proper way to implement this?
The approximate desired output plot (I need time_key x-axis though):
Try:
# Reshape to one row per time_key and one column per target, then divide each
# row by its total so the stacked bars show proportions.
dfp = df.set_index(['time_key', 'target'])['count'].unstack()
row_totals = dfp.sum(axis=1)
dfp.div(row_totals, axis=0).plot.bar(stacked=True)
Output:
Another useful solution is:
# Wide-format fractions: the outer keys 0..3 become columns and the date
# strings become the index.
d = {0: {'2021-03-01': 0.2, '2021-04-01': 0.25, '2021-05-01': 0.3},
     1: {'2021-03-01': 0.3, '2021-04-01': 0.25, '2021-05-01': 0.3},
     2: {'2021-03-01': 0.4, '2021-04-01': 0.25, '2021-05-01': 0.3},
     3: {'2021-03-01': 0.1, '2021-04-01': 0.25, '2021-05-01': 0.1}}
df = pd.DataFrame(d)
# Select the style before creating the figure: style settings are read when
# artists are created, so a style applied afterwards does not restyle the
# figure and axes that already exist.
plt.style.use('classic')
fig, ax = plt.subplots(figsize=(9, 6))
df.plot.area(ax=ax)

In Python pandas, how do I ignore invalid values when converting columns from hexadecimal to decimal?

when I use:
# int(n, 16) raises ValueError for a cell that is not a valid hex string
# (e.g. 'NaN', '3.14', 'String'), so this call aborts on the first such cell.
df[["Type 2", "Type 4"]].applymap(lambda n: int(n, 16))
It stops with an error because the Type 2 column contains values that are invalid for hexadecimal conversion (negative values, NaN, strings...). How can I ignore this error or mark the invalid values as zero?
{'Type 1': {0: 1, 1: 3, 2: 5, 3: 7, 4: 9, 5: 11, 6: 13, 7: 15, 8: 17},
'Type 2': {0: 'AA',
1: 'BB',
2: 'NaN',
3: '55',
4: '3.14',
5: '-96',
6: 'String',
7: 'FFFFFF',
8: 'FEEE'},
'Type 3': {0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0},
'Type 4': {0: '23',
1: 'fefe',
2: 'abcd',
3: 'dddd',
4: 'dad',
5: 'cfe',
6: 'cf42',
7: '321',
8: '0'},
'Type 5': {0: -120,
1: -120,
2: -120,
3: -120,
4: -120,
5: -120,
6: -120,
7: -120,
8: -120}}
You can create a personalized function that handles this exception to use in your lambda. For example:
def lambda_int(n):
    """Parse *n* as a hexadecimal integer, returning 0 when it is not valid hex.

    Args:
        n: The cell value to convert, normally a string such as 'AA' or 'fefe'.

    Returns:
        The integer value of *n* read in base 16, or 0 when *n* cannot be
        parsed. A signed string such as '-96' is valid input for ``int`` and
        yields a negative number.
    """
    try:
        return int(n, 16)
    except (TypeError, ValueError):
        # ValueError: a string that is not hex ('NaN', '3.14', 'String').
        # TypeError: a non-string value such as a real NaN (float) or None,
        # which int() rejects when an explicit base is given.
        return 0
# Convert both hex columns cell by cell and write the result back.
hex_columns = ["Type 2", "Type 4"]
df[hex_columns] = df[hex_columns].applymap(lambda_int)
Please go through this; I reconstructed your question and listed the steps to follow.
1. The first dictionary you provided does not contain a missing value; it contains the string "NaN".
data = {'Type 1': {0: 1, 1: 3, 2: 5, 3: 7, 4: 9, 5: 11, 6: 13, 7: 15, 8: 17},
'Type 2': {0: 'AA',
1: 'BB',
2: 'NaN',
3: '55',
4: '3.14',
5: '-96',
6: 'String',
7: 'FFFFFF',
8: 'FEEE'},
'Type 3': {0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0},
'Type 4': {0: '23',
1: 'fefe',
2: 'abcd',
3: 'dddd',
4: 'dad',
5: 'cfe',
6: 'cf42',
7: '321',
8: '0'},
'Type 5': {0: -120,
1: -120,
2: -120,
3: -120,
4: -120,
5: -120,
6: -120,
7: -120,
8: -120}}
import pandas as pd
df = pd.DataFrame(data)
df.head()
To check nan in your df and remove them
# Count the missing (NaN) values in each column of df.
columns_with_na = df.isna().sum()
# Keep only the columns that have at least one missing value.
columns_with_na = columns_with_na[columns_with_na != 0]
print(len(columns_with_na))
# Sorting in descending order does not change the length, so this prints the
# same count again.
print(len(columns_with_na.sort_values(ascending = False)))
Prints 0 and 0 because there is no nan
Reconstructed your data to include a nan by using numpy.nan
import numpy as np
#recreated a dataset and included a nan value : np.nan at Type 2
data = {'Type 1': {0: 1, 1: 3, 2: 5, 3: 7, 4: 9, 5: 11, 6: 13, 7: 15, 8: 17},
'Type 2': {0: 'AA',
1: 'BB',
2: np.nan,
3: '55',
4: '3.14',
5: '-96',
6: 'String',
7: 'FFFFFF',
8: 'FEEE'},
'Type 3': {0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0},
'Type 4': {0: '23',
1: 'fefe',
2: 'abcd',
3: 'dddd',
4: 'dad',
5: 'cfe',
6: 'cf42',
7: '321',
8: '0'},
'Type 5': {0: -120,
1: -120,
2: -120,
3: -120,
4: -120,
5: -120,
6: -120,
7: -120,
8: -120}}
df2 = pd.DataFrame(data)
df2.head()
# Count the missing (NaN) values in each column of df2.
columns_with_na = df2.isna().sum()
# Keep only the columns that have at least one missing value.
columns_with_na = columns_with_na[columns_with_na != 0]
print(len(columns_with_na))
# Sorting in descending order does not change the length, so this prints the
# same count again.
print(len(columns_with_na.sort_values(ascending = False)))
prints 1 and 1 because there is a nan at Type 2 column
# Drop every row of df2 that contains at least one NaN.
df2 = df2.dropna(how = 'any')
# Count the missing values remaining in each column.
columns_with_na = df2.isna().sum()
# Keep only the columns that still have at least one missing value.
columns_with_na = columns_with_na[columns_with_na != 0]
print(len(columns_with_na))
# Prints 0 because every row containing a NaN was dropped.
df2.head()
To fill nan in df with 0 use:
# Replace every NaN in df2 with 0, modifying df2 itself (inplace=True).
df2.fillna(0, inplace = True)
Fill in nan with 0 in df2['Type 2'] only:
# Assign the filled column back to df2. Calling fillna(..., inplace=True) on
# df2['Type 2'] acts on an intermediate object and, under pandas'
# Copy-on-Write behaviour, leaves df2 unchanged.
# To keep df2 untouched, assign the result to a new variable instead.
df2['Type 2'] = df2['Type 2'].fillna(0)

plotnine/ggplot - changing legend positions

I have this dataframe:
df = pd.DataFrame({'ymin': {0: 0.0,
1: 0.0,
2: 0.0,
3: 0.0,
4: 0.511,
5: 0.571,
6: 0.5329999999999999,
7: 0.5389999999999999},
'ymax': {0: 0.511,
1: 0.571,
2: 0.533,
3: 0.539,
4: 1.0,
5: 1.0,
6: 1.0,
7: 1.0},
'xmin': {0: 0.0,
1: 0.14799999999999996,
2: 0.22400000000000003,
3: 0.5239999999999999,
4: 0.0,
5: 0.14799999999999996,
6: 0.22400000000000003,
7: 0.5239999999999999},
'xmax': {0: 0.148,
1: 0.22399999999999998,
2: 0.524,
3: 1.001,
4: 0.148,
5: 0.22399999999999998,
6: 0.524,
7: 1.001},
'variable': {0: 'A', 1: 'A', 2: 'A', 3: 'A', 4: 'B', 5: 'B', 6: 'B', 7: 'B'}})
Where I plot this:
# Draw one grey-outlined, semi-transparent rectangle per row, filled by 'variable'.
rect_mapping = aes(xmin="xmin", xmax="xmax", ymin="ymin", ymax="ymax", fill="variable")
ggplot(df, rect_mapping) + geom_rect(colour="grey", alpha=0.7)
I'm looking to change the order of the legend entries to match the positions in the plot: blue on top and red on the bottom. A should always be red and B should always be blue.
There might be a more standard way to do it, but here is a quick hack to fix your problem:
Change the order of your variable
Assign colors manually (You could also look for exact color codes and replace it with the color names if it matters in your case)
# Put 'B' first in the category order, then supply the fill colours manually.
df = df.assign(variable=pd.Categorical(df['variable'], categories=['B', 'A']))
legend_colours = ["blue", "red"]
(
    ggplot(df, aes(ymin="ymin", ymax="ymax", xmin="xmin", xmax="xmax", fill="variable"))
    + geom_rect(colour="grey", alpha=0.7)
    + scale_fill_manual(values=legend_colours)
)
output looks like this:
In R, you could set the order of the levels with df$variable <- factor(df$variable, levels = c("B", "A"))

Modify plotly layout y-axis

I am working on two churn-classification datasets. My problem is that, as you can see in the two graphs below, the y-axes are not on the same scale: Bank stops at 0.8 and telco-europa at 1. I would like to force the y-axis to always display 0, 0.2, 0.4, 0.6, 0.8, 1.
I have used the following code:
and my histogram is based on this tutorial: https://www.kaggle.com/pavanraj159/telecom-customer-churn-prediction and the bank dataset is this one https://www.kaggle.com/shrutimechlearn/churn-modelling
import plotly.graph_objs as go#visualization
import plotly.offline as py#visualization
def output_tracer(metric, color, model_performances):
    """Build the vertical bar trace for a single metric.

    Args:
        metric: Column of ``model_performances`` plotted on the y-axis; also
            used as the trace name.
        color: Colour applied to the bar markers.
        model_performances: Table providing the "Algorithm" column (x-axis)
            and the ``metric`` column.

    Returns:
        The ``go.Bar`` trace.
    """
    bar_style = dict(line=dict(width=.7), color=color)
    return go.Bar(
        x=model_performances["Algorithm"],
        y=model_performances[metric],
        orientation="v",
        name=metric,
        marker=bar_style,
    )
def output_data(model_performances):
    """Return the list of bar traces, one per reported metric.

    Args:
        model_performances: Table passed through to ``output_tracer`` for
            every metric.

    Returns:
        A list of five traces in the order 1-Precision, 1-Recall,
        1-F1-score, Accuracy, AUC.
    """
    metric_colours = (
        ("1-Precision", "#6699FF"),
        ('1-Recall', "red"),
        ('1-F1-score', "#33CC99"),
        ('Accuracy', "lightgrey"),
        ('AUC', "#FFCC99"),
    )
    return [
        output_tracer(metric, colour, model_performances)
        for metric, colour in metric_colours
    ]
def output_layout(model):
    """Build the figure layout, using ``model`` as the chart title.

    Args:
        model: Text shown as the figure title.

    Returns:
        A ``go.Layout`` with a light grey background, white grid lines on
        both axes, a 250-unit left margin and a height of 400.
    """
    background = "rgb(243,243,243)"
    white_grid = 'rgb(255, 255, 255)'
    x_axis = dict(gridcolor=white_grid, title="", zerolinewidth=1, ticklen=5, gridwidth=2)
    y_axis = dict(gridcolor=white_grid, zerolinewidth=1, ticklen=5, gridwidth=2)
    settings = dict(
        title=model,
        plot_bgcolor=background,
        paper_bgcolor=background,
        xaxis=x_axis,
        yaxis=y_axis,
        margin=dict(l=250),
        height=400,
    )
    return go.Layout(settings)
# Render the bar chart for a single dataset's rows.
model = "Bank"
model_performances = report_df_scoring.loc[report_df_scoring["Dataset"] == model]
fig = go.Figure(
    data=output_data(model_performances),
    layout=output_layout(model),
)
py.iplot(fig)
And here you can find the dataframe "report_df_scoring" as a dictionary, for the "Bank" dataset only:
{'Dataset': {0: 'Bank',
1: 'Bank',
2: 'Bank',
3: 'Bank',
4: 'Bank',
5: 'Bank',
6: 'Bank'},
'Algorithm': {0: 'LogisticRegressionNoSMOTE',
1: 'Logistic Regression',
2: 'SVM-linear',
3: 'SVM-rbf',
4: 'xgboost',
5: 'GaussianNB',
6: 'RandomForest'},
'W-Precision': {0: 0.8159638339642141,
1: 0.8229500536388679,
2: 0.8243426658647828,
3: 0.7956512785333915,
4: 0.8288351219512194,
5: 0.8302513223140496,
6: 0.8307514249037228},
'W-Recall': {0: 0.8324,
1: 0.7636,
2: 0.7628,
3: 0.8056,
4: 0.836,
5: 0.8176,
6: 0.8408},
'W-F1-score': {0: 0.810103868755423,
1: 0.7811452562742854,
2: 0.7807117770916884,
3: 0.7997335148514852,
4: 0.831622605929424,
5: 0.7598757585104978,
6: 0.8336474053248425},
'0-Precision': {0: 0.8493518104604381,
1: 0.9187236604455148,
2: 0.9206541490006056,
3: 0.8634596695821186,
4: 0.8834146341463415,
5: 0.8152892561983471,
6: 0.8789473684210526},
'0-Recall': {0: 0.958627648839556,
1: 0.7699293642785066,
2: 0.7669021190716448,
3: 0.8965691220988901,
4: 0.9137235116044399,
5: 0.9954591321897074,
6: 0.9268415741675076},
'0-F1-score': {0: 0.9006873666745674,
1: 0.8377710678012626,
2: 0.8367740159647675,
3: 0.8797029702970298,
4: 0.8983134920634921,
5: 0.8964107223989097,
6: 0.9022593320235756},
'1-Precision': {0: 0.6882129277566539,
1: 0.4564958283671037,
2: 0.4558303886925795,
3: 0.5361990950226244,
4: 0.62,
5: 0.8875,
6: 0.6463414634146342},
'1-Recall': {0: 0.34942084942084944,
1: 0.7393822393822393,
2: 0.747104247104247,
3: 0.4575289575289575,
4: 0.5386100386100386,
5: 0.13706563706563707,
6: 0.5115830115830116},
'1-F1-score': {0: 0.4635083226632522,
1: 0.5644804716285925,
2: 0.5662033650329188,
3: 0.49375,
4: 0.5764462809917356,
5: 0.2374581939799331,
6: 0.5711206896551725},
'Accuracy': {0: 0.8324,
1: 0.7636,
2: 0.7628,
3: 0.8056,
4: 0.836,
5: 0.8176,
6: 0.8408},
'AUC': {0: 0.6540242491302027,
1: 0.754655801830373,
2: 0.7570031830879459,
3: 0.6770490398139237,
4: 0.7261667751072393,
5: 0.5662623846276723,
6: 0.7192122928752596},
'SMOTE': {0: 'No',
1: 'Yes',
2: 'Yes',
3: 'Yes',
4: 'Yes',
5: 'Yes',
6: 'Yes'},
'top3var': {0: "['numofproducts_4', 'numofproducts_3', 'geography_germany']",
1: "['numofproducts_4', 'numofproducts_3', 'geography_germany']",
2: "['numofproducts_4', 'numofproducts_3', 'age']",
3: "['empty']",
4: "['numofproducts_2', 'numofproducts_1', 'isactivemember']",
5: "['empty']",
6: "['age', 'numofproducts_2', 'balance']"}}
You can access and edit the range of any axis of your figure using:
fig['layout']['yaxis']['range']
And set the range like:
# Fix the y-axis range to 0-1 so the axis no longer adapts to the data.
fig['layout']['yaxis']['range'] = [0, 1]
The same thing goes for your tickvals:
# Place the y-axis ticks at exactly these values.
fig['layout']['yaxis']['tickvals'] = [0, 0.2, 0.4, 0.6, 0.8, 1]
You can use:
# Set the same tick positions through the figure's update_yaxes method.
fig.update_yaxes(tickvals=[0, 0.2, 0.4, 0.6, 0.8, 1])
Your code example does not work for me because "report_df_scoring" is missing.

Categories