Plotting variance scale on y-axis for PCA in Python

I am trying to run a PCA analysis, but I cannot plot the variance on the y-axis properly.
Here is the data, exported as a dictionary:
{1: {0: 242.0, 1: 290.0, 2: 340.0, 3: 363.0, 4: 430.0, 5: 450.0, 6: 500.0, 7: 390.0, 8: 450.0, 9: 500.0, 10: 475.0, 11: 500.0, 12: 500.0, 13: 600.0, 14: 600.0, 15: 700.0, 16: 700.0, 17: 610.0, 18: 650.0, 19: 575.0, 20: 685.0, 21: 620.0, 22: 680.0, 23: 700.0, 24: 725.0, 25: 720.0, 26: 714.0, 27: 850.0, 28: 1000.0, 29: 920.0, 30: 955.0, 31: 925.0, 32: 975.0, 33: 950.0, 34: 40.0, 35: 69.0, 36: 78.0, 37: 87.0, 38: 120.0, 39: 0.0, 40: 110.0, 41: 120.0, 42: 150.0, 43: 145.0, 44: 160.0, 45: 140.0, 46: 160.0, 47: 169.0, 48: 161.0, 49: 200.0, 50: 180.0, 51: 290.0, 52: 272.0, 53: 390.0, 54: 6.7, 55: 7.5, 56: 7.0, 57: 9.7, 58: 9.8, 59: 8.7, 60: 10.0, 61: 9.9, 62: 9.8, 63: 12.2, 64: 13.4, 65: 12.2, 66: 19.7, 67: 19.9, 68: 200.0, 69: 300.0, 70: 300.0, 71: 300.0, 72: 430.0, 73: 345.0, 74: 456.0, 75: 510.0, 76: 540.0, 77: 500.0, 78: 567.0, 79: 770.0, 80: 950.0, 81: 1250.0, 82: 1600.0, 83: 1550.0, 84: 1650.0}, 2: {0: 23.2, 1: 24.0, 2: 23.9, 3: 26.3, 4: 26.5, 5: 26.8, 6: 26.8, 7: 27.6, 8: 27.6, 9: 28.5, 10: 28.4, 11: 28.7, 12: 29.1, 13: 29.4, 14: 29.4, 15: 30.4, 16: 30.4, 17: 30.9, 18: 31.0, 19: 31.3, 20: 31.4, 21: 31.5, 22: 31.8, 23: 31.9, 24: 31.8, 25: 32.0, 26: 32.7, 27: 32.8, 28: 33.5, 29: 35.0, 30: 35.0, 31: 36.2, 32: 37.4, 33: 38.0, 34: 12.9, 35: 16.5, 36: 17.5, 37: 18.2, 38: 18.6, 39: 19.0, 40: 19.1, 41: 19.4, 42: 20.4, 43: 20.5, 44: 20.5, 45: 21.0, 46: 21.1, 47: 22.0, 48: 22.0, 49: 22.1, 50: 23.6, 51: 24.0, 52: 25.0, 53: 29.5, 54: 9.3, 55: 10.0, 56: 10.1, 57: 10.4, 58: 10.7, 59: 10.8, 60: 11.3, 61: 11.3, 62: 11.4, 63: 11.5, 64: 11.7, 65: 12.1, 66: 13.2, 67: 13.8, 68: 30.0, 69: 31.7, 70: 32.7, 71: 34.8, 72: 35.5, 73: 36.0, 74: 40.0, 75: 40.0, 76: 40.1, 77: 42.0, 78: 43.2, 79: 44.8, 80: 48.3, 81: 52.0, 82: 56.0, 83: 56.0, 84: 59.0}, 3: {0: 25.4, 1: 26.3, 2: 26.5, 3: 29.0, 4: 29.0, 5: 29.7, 6: 29.7, 7: 30.0, 8: 30.0, 9: 30.7, 10: 31.0, 11: 31.0, 12: 31.5, 13: 32.0, 14: 32.0, 15: 33.0, 16: 33.0, 17: 33.5, 18: 33.5, 19: 34.0, 20: 34.0, 21: 34.5, 22: 35.0, 23: 35.0, 24: 35.0, 25: 35.0, 26: 36.0, 27: 36.0, 28: 37.0, 29: 38.5, 30: 38.5, 31: 39.5, 32: 41.0, 33: 41.0, 34: 14.1, 35: 18.2, 36: 18.8, 37: 19.8, 38: 20.0, 39: 20.5, 40: 20.8, 41: 21.0, 42: 22.0, 43: 22.0, 44: 22.5, 45: 22.5, 46: 22.5, 47: 24.0, 48: 23.4, 49: 23.5, 50: 25.2, 51: 26.0, 52: 27.0, 53: 31.7, 54: 9.8, 55: 10.5, 56: 10.6, 57: 11.0, 58: 11.2, 59: 11.3, 60: 11.8, 61: 11.8, 62: 12.0, 63: 12.2, 64: 12.4, 65: 13.0, 66: 14.3, 67: 15.0, 68: 32.3, 69: 34.0, 70: 35.0, 71: 37.3, 72: 38.0, 73: 38.5, 74: 42.5, 75: 42.5, 76: 43.0, 77: 45.0, 78: 46.0, 79: 48.0, 80: 51.7, 81: 56.0, 82: 60.0, 83: 60.0, 84: 63.4}, 4: {0: 30.0, 1: 31.2, 2: 31.1, 3: 33.5, 4: 34.0, 5: 34.7, 6: 34.5, 7: 35.0, 8: 35.1, 9: 36.2, 10: 36.2, 11: 36.2, 12: 36.4, 13: 37.2, 14: 37.2, 15: 38.3, 16: 38.5, 17: 38.6, 18: 38.7, 19: 39.5, 20: 39.2, 21: 39.7, 22: 40.6, 23: 40.5, 24: 40.9, 25: 40.6, 26: 41.5, 27: 41.6, 28: 42.6, 29: 44.1, 30: 44.0, 31: 45.3, 32: 45.9, 33: 46.5, 34: 16.2, 35: 20.3, 36: 21.2, 37: 22.2, 38: 22.2, 39: 22.8, 40: 23.1, 41: 23.7, 42: 24.7, 43: 24.3, 44: 25.3, 45: 25.0, 46: 25.0, 47: 27.2, 48: 26.7, 49: 26.8, 50: 27.9, 51: 29.2, 52: 30.6, 53: 35.0, 54: 10.8, 55: 11.6, 56: 11.6, 57: 12.0, 58: 12.4, 59: 12.6, 60: 13.1, 61: 13.1, 62: 13.2, 63: 13.4, 64: 13.5, 65: 13.8, 66: 15.2, 67: 16.2, 68: 34.8, 69: 37.8, 70: 38.8, 71: 39.8, 72: 40.5, 73: 41.0, 74: 45.5, 75: 45.5, 76: 45.8, 77: 48.0, 78: 48.7, 79: 51.2, 80: 55.1, 81: 59.7, 82: 64.0, 83: 64.0, 84: 68.0}, 5: {0: 38.4, 1: 40.0, 2: 39.8, 3: 38.0, 4: 36.6, 5: 39.2, 6: 41.1, 7: 36.2, 8: 39.9, 9: 39.3, 10: 39.4, 11: 39.7, 
12: 37.8, 13: 40.2, 14: 41.5, 15: 38.8, 16: 38.8, 17: 40.5, 18: 37.4, 19: 38.3, 20: 40.8, 21: 39.1, 22: 38.1, 23: 40.1, 24: 40.0, 25: 40.3, 26: 39.8, 27: 40.6, 28: 44.5, 29: 40.9, 30: 41.1, 31: 41.4, 32: 40.6, 33: 37.9, 34: 25.6, 35: 26.1, 36: 26.3, 37: 25.3, 38: 28.0, 39: 28.4, 40: 26.7, 41: 25.8, 42: 23.5, 43: 27.3, 44: 27.8, 45: 26.2, 46: 25.6, 47: 27.7, 48: 25.9, 49: 27.6, 50: 25.4, 51: 30.4, 52: 28.0, 53: 27.1, 54: 16.1, 55: 17.0, 56: 14.9, 57: 18.3, 58: 16.8, 59: 15.7, 60: 16.9, 61: 16.9, 62: 16.7, 63: 15.6, 64: 18.0, 65: 16.5, 66: 18.9, 67: 18.1, 68: 16.0, 69: 15.1, 70: 15.3, 71: 15.8, 72: 18.0, 73: 15.6, 74: 16.0, 75: 15.0, 76: 17.0, 77: 14.5, 78: 16.0, 79: 15.0, 80: 16.2, 81: 17.9, 82: 15.0, 83: 15.0, 84: 15.9}, 6: {0: 13.4, 1: 13.8, 2: 15.1, 3: 13.3, 4: 15.1, 5: 14.2, 6: 15.3, 7: 13.4, 8: 13.8, 9: 13.7, 10: 14.1, 11: 13.3, 12: 12.0, 13: 13.9, 14: 15.0, 15: 13.8, 16: 13.5, 17: 13.3, 18: 14.8, 19: 14.1, 20: 13.7, 21: 13.3, 22: 15.1, 23: 13.8, 24: 14.8, 25: 15.0, 26: 14.1, 27: 14.9, 28: 15.5, 29: 14.3, 30: 14.3, 31: 14.9, 32: 14.7, 33: 13.7, 34: 14.0, 35: 13.9, 36: 13.7, 37: 14.3, 38: 16.1, 39: 14.7, 40: 14.7, 41: 13.9, 42: 15.2, 43: 14.6, 44: 15.1, 45: 13.3, 46: 15.2, 47: 14.1, 48: 13.6, 49: 15.4, 50: 14.0, 51: 15.4, 52: 15.6, 53: 15.3, 54: 9.7, 55: 10.0, 56: 9.9, 57: 11.5, 58: 10.3, 59: 10.2, 60: 9.8, 61: 8.9, 62: 8.7, 63: 10.4, 64: 9.4, 65: 9.1, 66: 13.6, 67: 11.6, 68: 9.7, 69: 11.0, 70: 11.3, 71: 10.1, 72: 11.3, 73: 9.7, 74: 9.5, 75: 9.8, 76: 11.2, 77: 10.2, 78: 10.0, 79: 10.5, 80: 11.2, 81: 11.7, 82: 9.6, 83: 9.6, 84: 11.0}}
Import libraries
import pandas as pd
import sklearn
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
import matplotlib.pyplot as plt
The data is given above; this is the code:
fishes = pd.read_csv("fish.csv", header=None, index_col=False, skiprows=1, usecols=range(1,7))
fishes.head()
Create scaler
scaler = StandardScaler()
Create a PCA instance
pca = PCA()
Create pipeline
pipeline = make_pipeline(scaler, pca)
Fit the pipeline to the data
pipeline.fit(fishes)
Plot the explained variances
features = range(pca.n_components_)
plt.bar(features, pca.explained_variance_)
plt.xlabel('PCA feature')
plt.ylabel('variance')
plt.xticks(features)
plt.show()
My current output is shown below, and it does not make sense to me.
If I understand correctly, the PCA variance on the y-axis should add up to 100%. With this scale, my first three components do not seem to explain much. Even if 1 here meant 10%, the bars still would not add up to 100% in total.
Either I did something wrong (unlikely), or do I need to adjust the y-axis scale manually? Where is my mistake? Thanks.

Instead of plotting pca.explained_variance_, try plotting:
pca.explained_variance_ratio_
These ratios sum to 1, i.e. 100% of the variance. pca.explained_variance_ contains the absolute variance of each component, which depends on the scale of your data, so it only adds up to 100% when expressed as a ratio.
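For reference, here is a minimal sketch of the adjusted plotting step. It reuses the names from the question (pca is assumed to be the already-fitted PCA step of the pipeline, and plt the matplotlib.pyplot import above):
features = range(pca.n_components_)
# explained_variance_ratio_ holds the fraction of the total variance captured by each component
plt.bar(features, pca.explained_variance_ratio_)
plt.xlabel('PCA feature')
plt.ylabel('explained variance ratio')
plt.xticks(features)
plt.show()
# sanity check: the ratios add up to 1 (100%)
print(pca.explained_variance_ratio_.sum())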

Related

Add a Line to Bar Chart Plotly Python

I created a stacked bar chart and need to add a horizontal line, but it doesn't show.
What could be the problem?
Below is the code:
import plotly.express as px
import pandas as pd
import numpy as np
import plotly.graph_objects as go
#creating array for days of the week
arr1=(['Mon. W1']*18)
arr2=(['Tue. W1']*18)
arr3=(['Wed. W1']*18)
arr4=(['Thu. W1']*18)
arr5=(['Fri. W1']*18)
arr6=(['Mon. W2']*18)
arr7=(['Tue. W2']*18)
arr8=(['Wed. W2']*18)
arr9=(['Thu. W2']*18)
arr10=(['Fri. W2']*18)
dates = np.concatenate((arr1,arr2,arr3,arr4,arr5,arr6,arr7,arr8,arr9,arr10))
#creating array for desk and meeting space population
from itertools import chain
from itertools import zip_longest
x = df3['Occupancy x Hour'].to_numpy()
y = df3['Population x Hour'].to_numpy()
#this is to change numpy array to normal array for further coding
values_array = list(filter(lambda x: x != '', chain.from_iterable(zip_longest(x, y, fillvalue = ''))))
#creating dictionary for the figure
df = pd.DataFrame(
dict(
day=dates,
time=['9am','9am','10am','10am','11am','11am','12pm','12pm','1pm','1pm','2pm','2pm','3pm','3pm','4pm','4pm','5pm','5pm']*10,
type=["Desk", "Meeting"]*90,
numbers=values_array,
)
)
#creating the bar chart
fig = go.Figure()
fig.update_layout(
template="simple_white",
xaxis=dict(title_text="Time"),
yaxis=dict(title_text="Population"),
barmode="stack",
)
colors = ["Blue","LimeGreen"]
#here adding time and day of the week on x-axis
for r, c in zip(df.type.unique(), colors):
    plot_df = df[df.type == r]
    fig.add_trace(
        go.Bar(x=[plot_df.day, plot_df.time], y=plot_df.numbers, name=r, marker_color=c),
    )
fig
df3_dictionary = {'Week': {0: 1.0,
1: 1.0,
2: 1.0,
3: 1.0,
4: 1.0,
5: 1.0,
6: 1.0,
7: 1.0,
8: 1.0,
9: 1.0,
10: 1.0,
11: 1.0,
12: 1.0,
13: 1.0,
14: 1.0,
15: 1.0,
16: 1.0,
17: 1.0,
18: 1.0,
19: 1.0,
20: 1.0,
21: 1.0,
22: 1.0,
23: 1.0,
24: 1.0,
25: 1.0,
26: 1.0,
27: 1.0,
28: 1.0,
29: 1.0,
30: 1.0,
31: 1.0,
32: 1.0,
33: 1.0,
34: 1.0,
35: 1.0,
36: 1.0,
37: 1.0,
38: 1.0,
39: 1.0,
40: 1.0,
41: 1.0,
42: 1.0,
43: 1.0,
44: 1.0,
45: 2.0,
46: 2.0,
47: 2.0,
48: 2.0,
49: 2.0,
50: 2.0,
51: 2.0,
52: 2.0,
53: 2.0,
54: 2.0,
55: 2.0,
56: 2.0,
57: 2.0,
58: 2.0,
59: 2.0,
60: 2.0,
61: 2.0,
62: 2.0,
63: 2.0,
64: 2.0,
65: 2.0,
66: 2.0,
67: 2.0,
68: 2.0,
69: 2.0,
70: 2.0,
71: 2.0,
72: 2.0,
73: 2.0,
74: 2.0,
75: 2.0,
76: 2.0,
77: 2.0,
78: 2.0,
79: 2.0,
80: 2.0,
81: 2.0,
82: 2.0,
83: 2.0,
84: 2.0,
85: 2.0,
86: 2.0,
87: 2.0,
88: 2.0,
89: 2.0},
'Day': {0: 'Monday',
1: 'Monday',
2: 'Monday',
3: 'Monday',
4: 'Monday',
5: 'Monday',
6: 'Monday',
7: 'Monday',
8: 'Monday',
9: 'Tuesday',
10: 'Tuesday',
11: 'Tuesday',
12: 'Tuesday',
13: 'Tuesday',
14: 'Tuesday',
15: 'Tuesday',
16: 'Tuesday',
17: 'Tuesday',
18: 'Wednesday',
19: 'Wednesday',
20: 'Wednesday',
21: 'Wednesday',
22: 'Wednesday',
23: 'Wednesday',
24: 'Wednesday',
25: 'Wednesday',
26: 'Wednesday',
27: 'Thursday',
28: 'Thursday',
29: 'Thursday',
30: 'Thursday',
31: 'Thursday',
32: 'Thursday',
33: 'Thursday',
34: 'Thursday',
35: 'Thursday',
36: 'Friday',
37: 'Friday',
38: 'Friday',
39: 'Friday',
40: 'Friday',
41: 'Friday',
42: 'Friday',
43: 'Friday',
44: 'Friday',
45: 'Monday',
46: 'Monday',
47: 'Monday',
48: 'Monday',
49: 'Monday',
50: 'Monday',
51: 'Monday',
52: 'Monday',
53: 'Monday',
54: 'Tuesday',
55: 'Tuesday',
56: 'Tuesday',
57: 'Tuesday',
58: 'Tuesday',
59: 'Tuesday',
60: 'Tuesday',
61: 'Tuesday',
62: 'Tuesday',
63: 'Wednesday',
64: 'Wednesday',
65: 'Wednesday',
66: 'Wednesday',
67: 'Wednesday',
68: 'Wednesday',
69: 'Wednesday',
70: 'Wednesday',
71: 'Wednesday',
72: 'Thursday',
73: 'Thursday',
74: 'Thursday',
75: 'Thursday',
76: 'Thursday',
77: 'Thursday',
78: 'Thursday',
79: 'Thursday',
80: 'Thursday',
81: 'Friday',
82: 'Friday',
83: 'Friday',
84: 'Friday',
85: 'Friday',
86: 'Friday',
87: 'Friday',
88: 'Friday',
89: 'Friday'},
'Time': {0: '9am',
1: '10am',
2: '11am',
3: '12pm',
4: '1pm',
5: '2pm',
6: '3pm',
7: '4pm',
8: '5pm',
9: '9am',
10: '10am',
11: '11am',
12: '12pm',
13: '1pm',
14: '2pm',
15: '3pm',
16: '4pm',
17: '5pm',
18: '9am',
19: '10am',
20: '11am',
21: '12pm',
22: '1pm',
23: '2pm',
24: '3pm',
25: '4pm',
26: '5pm',
27: '9am',
28: '10am',
29: '11am',
30: '12pm',
31: '1pm',
32: '2pm',
33: '3pm',
34: '4pm',
35: '5pm',
36: '9am',
37: '10am',
38: '11am',
39: '12pm',
40: '1pm',
41: '2pm',
42: '3pm',
43: '4pm',
44: '5pm',
45: '9am',
46: '10am',
47: '11am',
48: '12pm',
49: '1pm',
50: '2pm',
51: '3pm',
52: '4pm',
53: '5pm',
54: '9am',
55: '10am',
56: '11am',
57: '12pm',
58: '1pm',
59: '2pm',
60: '3pm',
61: '4pm',
62: '5pm',
63: '9am',
64: '10am',
65: '11am',
66: '12pm',
67: '1pm',
68: '2pm',
69: '3pm',
70: '4pm',
71: '5pm',
72: '9am',
73: '10am',
74: '11am',
75: '12pm',
76: '1pm',
77: '2pm',
78: '3pm',
79: '4pm',
80: '5pm',
81: '9am',
82: '10am',
83: '11am',
84: '12pm',
85: '1pm',
86: '2pm',
87: '3pm',
88: '4pm',
89: '5pm'},
'Occupancy x Hour': {0: 1378.0,
1: 1369.0,
2: 1372.0,
3: 1261.0,
4: 1087.0,
5: 1355.0,
6: 1383.0,
7: 1325.0,
8: 1050.0,
9: 1313.0,
10: 1347.0,
11: 1323.0,
12: 1202.0,
13: 1033.0,
14: 1237.0,
15: 1324.0,
16: 1352.0,
17: 1108.0,
18: 1217.0,
19: 1276.0,
20: 1365.0,
21: 1204.0,
22: 977.0,
23: 1199.0,
24: 1331.0,
25: 1293.0,
26: 1159.0,
27: 1220.0,
28: 1327.0,
29: 1354.0,
30: 1257.0,
31: 982.0,
32: 1199.0,
33: 1218.0,
34: 1271.0,
35: 1101.0,
36: 1139.0,
37: 1207.0,
38: 1259.0,
39: 1189.0,
40: 903.0,
41: 1171.0,
42: 1193.0,
43: 1239.0,
44: 899.0,
45: 1220.0,
46: 1357.0,
47: 1336.0,
48: 1188.0,
49: 1032.0,
50: 1261.0,
51: 1330.0,
52: 1267.0,
53: 1074.0,
54: 1301.0,
55: 1337.0,
56: 1329.0,
57: 1247.0,
58: 970.0,
59: 1233.0,
60: 1271.0,
61: 1246.0,
62: 1063.0,
63: 1210.0,
64: 1288.0,
65: 1331.0,
66: 1220.0,
67: 948.0,
68: 1273.0,
69: 1289.0,
70: 1329.0,
71: 1153.0,
72: 1213.0,
73: 1248.0,
74: 1272.0,
75: 1190.0,
76: 890.0,
77: 1199.0,
78: 1284.0,
79: 1233.0,
80: 1102.0,
81: 1110.0,
82: 1210.0,
83: 1175.0,
84: 1083.0,
85: 807.0,
86: 1101.0,
87: 1188.0,
88: 1181.0,
89: 857.0},
'Population x Hour': {0: 339.0,
1: 516.0,
2: 564.0,
3: 616.0,
4: 637.0,
5: 548.0,
6: 582.0,
7: 527.0,
8: 341.0,
9: 457.0,
10: 711.0,
11: 731.0,
12: 685.0,
13: 747.0,
14: 735.0,
15: 723.0,
16: 657.0,
17: 388.0,
18: 497.0,
19: 703.0,
20: 690.0,
21: 758.0,
22: 759.0,
23: 745.0,
24: 686.0,
25: 633.0,
26: 374.0,
27: 465.0,
28: 588.0,
29: 648.0,
30: 603.0,
31: 663.0,
32: 687.0,
33: 613.0,
34: 597.0,
35: 262.0,
36: 333.0,
37: 477.0,
38: 521.0,
39: 417.0,
40: 443.0,
41: 513.0,
42: 456.0,
43: 389.0,
44: 159.0,
45: 388.0,
46: 584.0,
47: 646.0,
48: 635.0,
49: 640.0,
50: 643.0,
51: 592.0,
52: 496.0,
53: 282.0,
54: 431.0,
55: 677.0,
56: 646.0,
57: 623.0,
58: 717.0,
59: 674.0,
60: 604.0,
61: 553.0,
62: 348.0,
63: 539.0,
64: 724.0,
65: 651.0,
66: 635.0,
67: 729.0,
68: 600.0,
69: 611.0,
70: 596.0,
71: 353.0,
72: 456.0,
73: 673.0,
74: 639.0,
75: 690.0,
76: 655.0,
77: 616.0,
78: 663.0,
79: 576.0,
80: 340.0,
81: 394.0,
82: 518.0,
83: 527.0,
84: 475.0,
85: 466.0,
86: 452.0,
87: 421.0,
88: 288.0,
89: 181.0}}
This code gives this result.
Then, I am trying to add a line, but it doesn't show:
#finding minimum of the population
tot_popul = x+y
min(tot_popul)
#here, I am trying to plot minimum line but it doesn't show
fig.add_hline(y=min(tot_popul))
fig.show()
I need a horizontal line at the minimum of the total population.
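One thing worth trying (a sketch, not a verified fix for this particular figure): draw the line as an explicit shape anchored to paper coordinates on the x-axis, so that add_hline does not have to resolve positions on the multicategory axis. fig and tot_popul are the objects from the code above:
# possible workaround: x0=0 and x1=1 in paper coordinates span the full width of the plotting area
line_y = min(tot_popul)
fig.add_shape(
    type="line",
    xref="paper", x0=0, x1=1,
    yref="y", y0=line_y, y1=line_y,
    line=dict(color="red", width=2, dash="dash"),
)
fig.show()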

Calculating how often values appear in column in percent, multiple columns, python

I am struggling with calculating how often the values 1-10 appear in a column relative to the total number of values, i.e. getting the percentage of the values 1-10 in 4 different columns. I tried the following code, but it only works for columns 1 and 2 and gives NaN values for columns 3 and 4. Does anyone know why? Thank you!
{'job1_category': {0: nan, 1: 4.0, 2: 5.0, 3: 5.0, 4: 4.0, 5: 4.0, 6: 5.0, 7: 4.0, 8: 4.0, 9: 4.0, 10: 4.0, 11: 4.0, 12: 4.0, 13: 4.0, 14: nan, 15: 4.0, 16: 3.0, 17: 7.0, 18: 4.0, 19: 4.0, 20: nan, 21: nan, 22: 4.0, 23: 1.0, 24: 4.0, 25: 1.0, 26: 4.0, 27: 2.0, 28: 5.0, 29: 3.0, 30: 4.0, 31: 5.0, 32: 4.0, 33: 4.0, 34: 4.0, 35: 4.0, 36: 3.0, 37: 4.0, 38: 4.0, 39: 4.0, 40: 9.0, 41: 4.0, 42: 4.0, 43: 3.0, 44: 4.0, 45: 9.0, 46: 10.0, 47: nan, 48: 10.0, 49: 4.0, 50: 8.0, 51: nan, 52: 5.0, 53: 8.0, 54: 4.0, 55: nan, 56: 4.0, 57: 8.0, 58: 4.0, 59: 4.0, 60: 4.0, 61: 4.0, 62: 8.0, 63: 4.0, 64: 8.0, 65: 7.0, 66: 4.0, 67: 7.0, 68: 8.0, 69: 7.0, 70: nan, 71: 7.0, 72: nan, 73: 10.0, 74: 7.0, 75: 6.0, 76: 7.0, 77: 4.0, 78: 7.0, 79: 7.0, 80: 7.0, 81: 7.0, 82: 4.0, 83: nan, 84: 4.0, 85: nan, 86: 4.0, 87: 9.0, 88: 4.0, 89: 4.0, 90: 4.0, 91: 4.0, 92: 4.0, 93: 7.0, 94: 2.0, 95: 4.0, 96: 4.0, 97: nan, 98: 1.0, 99: 9.0}, 'job2_category': {0: 6.0, 1: 5.0, 2: 4.0, 3: 5.0, 4: 6.0, 5: 4.0, 6: 5.0, 7: 5.0, 8: 2.0, 9: 4.0, 10: 4.0, 11: nan, 12: 5.0, 13: 4.0, 14: nan, 15: 1.0, 16: 4.0, 17: 4.0, 18: 4.0, 19: 4.0, 20: 4.0, 21: 3.0, 22: 4.0, 23: 4.0, 24: 4.0, 25: 4.0, 26: 8.0, 27: 6.0, 28: 6.0, 29: 3.0, 30: 4.0, 31: 5.0, 32: 7.0, 33: 4.0, 34: 4.0, 35: 4.0, 36: 4.0, 37: 4.0, 38: 4.0, 39: 4.0, 40: 4.0, 41: 5.0, 42: 5.0, 43: 4.0, 44: 4.0, 45: 1.0, 46: 10.0, 47: 10.0, 48: 1.0, 49: 4.0, 50: 4.0, 51: nan, 52: 7.0, 53: 4.0, 54: 4.0, 55: 9.0, 56: 9.0, 57: 4.0, 58: 3.0, 59: 4.0, 60: 4.0, 61: nan, 62: 7.0, 63: nan, 64: 7.0, 65: 7.0, 66: 7.0, 67: 7.0, 68: 8.0, 69: 7.0, 70: 7.0, 71: 7.0, 72: nan, 73: 10.0, 74: 8.0, 75: 7.0, 76: 8.0, 77: 5.0, 78: nan, 79: 7.0, 80: nan, 81: 4.0, 82: 7.0, 83: nan, 84: 4.0, 85: nan, 86: 4.0, 87: 4.0, 88: 4.0, 89: 4.0, 90: 10.0, 91: 5.0, 92: nan, 93: 7.0, 94: 4.0, 95: 4.0, 96: 4.0, 97: nan, 98: 1.0, 99: 9.0}, 'job3_category': {0: 5.0, 1: 5.0, 2: 4.0, 3: nan, 4: 10.0, 5: nan, 6: nan, 7: nan, 8: nan, 9: nan, 10: 4.0, 11: 4.0, 12: 5.0, 13: nan, 14: 4.0, 15: 1.0, 16: 4.0, 17: 4.0, 18: 4.0, 19: 4.0, 20: nan, 21: nan, 22: nan, 23: nan, 24: nan, 25: 1.0, 26: 4.0, 27: nan, 28: nan, 29: 3.0, 30: 4.0, 31: 3.0, 32: 4.0, 33: nan, 34: nan, 35: nan, 36: 4.0, 37: nan, 38: nan, 39: 4.0, 40: 7.0, 41: 5.0, 42: nan, 43: 4.0, 44: 4.0, 45: 1.0, 46: 7.0, 47: 1.0, 48: 7.0, 49: 7.0, 50: 4.0, 51: 8.0, 52: nan, 53: 4.0, 54: 7.0, 55: nan, 56: 8.0, 57: nan, 58: nan, 59: 3.0, 60: nan, 61: nan, 62: nan, 63: nan, 64: nan, 65: 7.0, 66: 7.0, 67: 7.0, 68: 7.0, 69: 8.0, 70: 7.0, 71: 4.0, 72: 8.0, 73: nan, 74: 3.0, 75: 10.0, 76: 4.0, 77: 8.0, 78: 8.0, 79: nan, 80: nan, 81: 4.0, 82: 7.0, 83: 4.0, 84: nan, 85: 8.0, 86: 4.0, 87: 4.0, 88: 4.0, 89: nan, 90: 5.0, 91: 7.0, 92: 4.0, 93: nan, 94: 1.0, 95: nan, 96: nan, 97: 10.0, 98: 1.0, 99: 1.0}, 'job4_category': {0: nan, 1: nan, 2: nan, 3: nan, 4: nan, 5: nan, 6: nan, 7: nan, 8: nan, 9: nan, 10: 4.0, 11: nan, 12: nan, 13: nan, 14: 4.0, 15: 8.0, 16: nan, 17: 4.0, 18: 3.0, 19: nan, 20: nan, 21: nan, 22: nan, 23: nan, 24: nan, 25: nan, 26: nan, 27: nan, 28: nan, 29: nan, 30: nan, 31: nan, 32: 4.0, 33: nan, 34: nan, 35: nan, 36: 4.0, 37: nan, 38: nan, 39: nan, 40: 4.0, 41: nan, 42: nan, 43: nan, 44: nan, 45: nan, 46: nan, 47: 7.0, 48: nan, 49: nan, 50: 5.0, 51: 8.0, 52: nan, 53: 4.0, 54: nan, 55: nan, 56: 8.0, 57: nan, 58: nan, 59: nan, 60: nan, 61: nan, 62: nan, 63: nan, 64: nan, 65: 8.0, 66: 7.0, 67: nan, 68: nan, 69: nan, 70: 8.0, 71: 5.0, 72: nan, 73: nan, 74: nan, 75: 10.0, 76: nan, 77: 4.0, 78: nan, 79: nan, 80: nan, 81: nan, 82: nan, 83: nan, 84: nan, 85: nan, 86: 10.0, 87: nan, 88: 8.0, 89: 
nan, 90: nan, 91: nan, 92: nan, 93: nan, 94: 4.0, 95: nan, 96: nan, 97: nan, 98: nan, 99: 5.0}}
job1_freq = data_rel1.groupby('job1_category').size()
job2_freq = data_rel1.groupby('job2_category').size()
job3_freq = data_rel1.groupby('job3_category').size()
job4_freq = data_rel1.groupby('job4_category').size()
data_freq = pd.concat([job1_freq, job2_freq, job3_freq, job4_freq], axis=1)
data_freq.columns = [1,2,3,4]
data_freq["prob_1"] = data_freq[1]/sum(data_freq[1])
data_freq["prob_2"] = data_freq[2]/sum(data_freq[2])
data_freq["prob_3"] = data_freq[3]/sum(data_freq[3])
data_freq["prob_4"] = data_freq[4]/sum(data_freq[4])
sum(column) will return nan if one or more of the entries in column are nan. To avoid this problem, use NumPy's np.sum() instead. This will work as expected, provided the missing values in the original data are coded as np.nan:
import numpy as np
import pandas as pd
data = "{'job1_category': {0: nan, 1: 4.0, 2: 5.0, 3: 5.0, 4: 4.0, 5: 4.0, 6: 5.0, 7: 4.0, 8: 4.0, 9: 4.0, 10: 4.0, 11: 4.0, 12: 4.0, 13: 4.0, 14: nan, 15: 4.0, 16: 3.0, 17: 7.0, 18: 4.0, 19: 4.0, 20: nan, 21: nan, 22: 4.0, 23: 1.0, 24: 4.0, 25: 1.0, 26: 4.0, 27: 2.0, 28: 5.0, 29: 3.0, 30: 4.0, 31: 5.0, 32: 4.0, 33: 4.0, 34: 4.0, 35: 4.0, 36: 3.0, 37: 4.0, 38: 4.0, 39: 4.0, 40: 9.0, 41: 4.0, 42: 4.0, 43: 3.0, 44: 4.0, 45: 9.0, 46: 10.0, 47: nan, 48: 10.0, 49: 4.0, 50: 8.0, 51: nan, 52: 5.0, 53: 8.0, 54: 4.0, 55: nan, 56: 4.0, 57: 8.0, 58: 4.0, 59: 4.0, 60: 4.0, 61: 4.0, 62: 8.0, 63: 4.0, 64: 8.0, 65: 7.0, 66: 4.0, 67: 7.0, 68: 8.0, 69: 7.0, 70: nan, 71: 7.0, 72: nan, 73: 10.0, 74: 7.0, 75: 6.0, 76: 7.0, 77: 4.0, 78: 7.0, 79: 7.0, 80: 7.0, 81: 7.0, 82: 4.0, 83: nan, 84: 4.0, 85: nan, 86: 4.0, 87: 9.0, 88: 4.0, 89: 4.0, 90: 4.0, 91: 4.0, 92: 4.0, 93: 7.0, 94: 2.0, 95: 4.0, 96: 4.0, 97: nan, 98: 1.0, 99: 9.0}, 'job2_category': {0: 6.0, 1: 5.0, 2: 4.0, 3: 5.0, 4: 6.0, 5: 4.0, 6: 5.0, 7: 5.0, 8: 2.0, 9: 4.0, 10: 4.0, 11: nan, 12: 5.0, 13: 4.0, 14: nan, 15: 1.0, 16: 4.0, 17: 4.0, 18: 4.0, 19: 4.0, 20: 4.0, 21: 3.0, 22: 4.0, 23: 4.0, 24: 4.0, 25: 4.0, 26: 8.0, 27: 6.0, 28: 6.0, 29: 3.0, 30: 4.0, 31: 5.0, 32: 7.0, 33: 4.0, 34: 4.0, 35: 4.0, 36: 4.0, 37: 4.0, 38: 4.0, 39: 4.0, 40: 4.0, 41: 5.0, 42: 5.0, 43: 4.0, 44: 4.0, 45: 1.0, 46: 10.0, 47: 10.0, 48: 1.0, 49: 4.0, 50: 4.0, 51: nan, 52: 7.0, 53: 4.0, 54: 4.0, 55: 9.0, 56: 9.0, 57: 4.0, 58: 3.0, 59: 4.0, 60: 4.0, 61: nan, 62: 7.0, 63: nan, 64: 7.0, 65: 7.0, 66: 7.0, 67: 7.0, 68: 8.0, 69: 7.0, 70: 7.0, 71: 7.0, 72: nan, 73: 10.0, 74: 8.0, 75: 7.0, 76: 8.0, 77: 5.0, 78: nan, 79: 7.0, 80: nan, 81: 4.0, 82: 7.0, 83: nan, 84: 4.0, 85: nan, 86: 4.0, 87: 4.0, 88: 4.0, 89: 4.0, 90: 10.0, 91: 5.0, 92: nan, 93: 7.0, 94: 4.0, 95: 4.0, 96: 4.0, 97: nan, 98: 1.0, 99: 9.0}, 'job3_category': {0: 5.0, 1: 5.0, 2: 4.0, 3: nan, 4: 10.0, 5: nan, 6: nan, 7: nan, 8: nan, 9: nan, 10: 4.0, 11: 4.0, 12: 5.0, 13: nan, 14: 4.0, 15: 1.0, 16: 4.0, 17: 4.0, 18: 4.0, 19: 4.0, 20: nan, 21: nan, 22: nan, 23: nan, 24: nan, 25: 1.0, 26: 4.0, 27: nan, 28: nan, 29: 3.0, 30: 4.0, 31: 3.0, 32: 4.0, 33: nan, 34: nan, 35: nan, 36: 4.0, 37: nan, 38: nan, 39: 4.0, 40: 7.0, 41: 5.0, 42: nan, 43: 4.0, 44: 4.0, 45: 1.0, 46: 7.0, 47: 1.0, 48: 7.0, 49: 7.0, 50: 4.0, 51: 8.0, 52: nan, 53: 4.0, 54: 7.0, 55: nan, 56: 8.0, 57: nan, 58: nan, 59: 3.0, 60: nan, 61: nan, 62: nan, 63: nan, 64: nan, 65: 7.0, 66: 7.0, 67: 7.0, 68: 7.0, 69: 8.0, 70: 7.0, 71: 4.0, 72: 8.0, 73: nan, 74: 3.0, 75: 10.0, 76: 4.0, 77: 8.0, 78: 8.0, 79: nan, 80: nan, 81: 4.0, 82: 7.0, 83: 4.0, 84: nan, 85: 8.0, 86: 4.0, 87: 4.0, 88: 4.0, 89: nan, 90: 5.0, 91: 7.0, 92: 4.0, 93: nan, 94: 1.0, 95: nan, 96: nan, 97: 10.0, 98: 1.0, 99: 1.0}, 'job4_category': {0: nan, 1: nan, 2: nan, 3: nan, 4: nan, 5: nan, 6: nan, 7: nan, 8: nan, 9: nan, 10: 4.0, 11: nan, 12: nan, 13: nan, 14: 4.0, 15: 8.0, 16: nan, 17: 4.0, 18: 3.0, 19: nan, 20: nan, 21: nan, 22: nan, 23: nan, 24: nan, 25: nan, 26: nan, 27: nan, 28: nan, 29: nan, 30: nan, 31: nan, 32: 4.0, 33: nan, 34: nan, 35: nan, 36: 4.0, 37: nan, 38: nan, 39: nan, 40: 4.0, 41: nan, 42: nan, 43: nan, 44: nan, 45: nan, 46: nan, 47: 7.0, 48: nan, 49: nan, 50: 5.0, 51: 8.0, 52: nan, 53: 4.0, 54: nan, 55: nan, 56: 8.0, 57: nan, 58: nan, 59: nan, 60: nan, 61: nan, 62: nan, 63: nan, 64: nan, 65: 8.0, 66: 7.0, 67: nan, 68: nan, 69: nan, 70: 8.0, 71: 5.0, 72: nan, 73: nan, 74: nan, 75: 10.0, 76: nan, 77: 4.0, 78: nan, 79: nan, 80: nan, 81: nan, 82: nan, 83: nan, 84: nan, 85: nan, 86: 10.0, 87: nan, 88: 
8.0, 89: nan, 90: nan, 91: nan, 92: nan, 93: nan, 94: 4.0, 95: nan, 96: nan, 97: nan, 98: nan, 99: 5.0}}"
data = eval(data.replace('nan', 'np.nan'))
data_rel1 = pd.DataFrame(data)
job1_freq = data_rel1.groupby('job1_category').size()
job2_freq = data_rel1.groupby('job2_category').size()
job3_freq = data_rel1.groupby('job3_category').size()
job4_freq = data_rel1.groupby('job4_category').size()
data_freq = pd.concat([job1_freq, job2_freq, job3_freq, job4_freq], axis=1)
data_freq.columns = [1,2,3,4]
data_freq["prob_1"] = data_freq[1] / np.sum(data_freq[1])
data_freq["prob_2"] = data_freq[2] / np.sum(data_freq[2])
data_freq["prob_3"] = data_freq[3] / np.sum(data_freq[3])
data_freq["prob_4"] = data_freq[4] / np.sum(data_freq[4])
data_freq
1 2 3 4 prob_1 prob_2 prob_3 prob_4
1.0 3 4 7.0 NaN 0.034091 0.045455 0.111111 NaN
2.0 2 1 NaN NaN 0.022727 0.011364 NaN NaN
3.0 4 3 4.0 1.0 0.045455 0.034091 0.063492 0.043478
4.0 47 41 25.0 9.0 0.534091 0.465909 0.396825 0.391304
5.0 6 10 5.0 3.0 0.068182 0.113636 0.079365 0.130435
6.0 1 4 NaN NaN 0.011364 0.045455 NaN NaN
7.0 12 14 12.0 2.0 0.136364 0.159091 0.190476 0.086957
8.0 6 4 7.0 6.0 0.068182 0.045455 0.111111 0.260870
9.0 4 3 NaN NaN 0.045455 0.034091 NaN NaN
10.0 3 4 3.0 2.0 0.034091 0.045455 0.047619 0.086957
Try replacing the null values with 0.0 and then run your code (note that fillna returns a copy, so assign it back):
data_freq = data_freq.fillna(0.0)
It should not affect your computation either, since 0.0 / sum([...]) = 0.0.
You can melt the data, then group it to get each individual count, then group again to also get the total. Note that .transform() groups the data but then returns the result with the same length as the original dataframe.
Sample data:
from numpy import nan
import pandas as pd
d = {'job1_category': {0: nan, 1: 4.0, 2: 5.0, 3: 5.0, 4: 4.0, 5: 4.0, 6: 5.0, 7: 4.0, 8: 4.0, 9: 4.0, 10: 4.0, 11: 4.0, 12: 4.0, 13: 4.0, 14: nan, 15: 4.0, 16: 3.0, 17: 7.0, 18: 4.0, 19: 4.0, 20: nan, 21: nan, 22: 4.0, 23: 1.0, 24: 4.0, 25: 1.0, 26: 4.0, 27: 2.0, 28: 5.0, 29: 3.0, 30: 4.0, 31: 5.0, 32: 4.0, 33: 4.0, 34: 4.0, 35: 4.0, 36: 3.0, 37: 4.0, 38: 4.0, 39: 4.0, 40: 9.0, 41: 4.0, 42: 4.0, 43: 3.0, 44: 4.0, 45: 9.0, 46: 10.0, 47: nan, 48: 10.0, 49: 4.0, 50: 8.0, 51: nan, 52: 5.0, 53: 8.0, 54: 4.0, 55: nan, 56: 4.0, 57: 8.0, 58: 4.0, 59: 4.0, 60: 4.0, 61: 4.0, 62: 8.0, 63: 4.0, 64: 8.0, 65: 7.0, 66: 4.0, 67: 7.0, 68: 8.0, 69: 7.0, 70: nan, 71: 7.0, 72: nan, 73: 10.0, 74: 7.0, 75: 6.0, 76: 7.0, 77: 4.0, 78: 7.0, 79: 7.0, 80: 7.0, 81: 7.0, 82: 4.0, 83: nan, 84: 4.0, 85: nan, 86: 4.0, 87: 9.0, 88: 4.0, 89: 4.0, 90: 4.0, 91: 4.0, 92: 4.0, 93: 7.0, 94: 2.0, 95: 4.0, 96: 4.0, 97: nan, 98: 1.0, 99: 9.0}, 'job2_category': {0: 6.0, 1: 5.0, 2: 4.0, 3: 5.0, 4: 6.0, 5: 4.0, 6: 5.0, 7: 5.0, 8: 2.0, 9: 4.0, 10: 4.0, 11: nan, 12: 5.0, 13: 4.0, 14: nan, 15: 1.0, 16: 4.0, 17: 4.0, 18: 4.0, 19: 4.0, 20: 4.0, 21: 3.0, 22: 4.0, 23: 4.0, 24: 4.0, 25: 4.0, 26: 8.0, 27: 6.0, 28: 6.0, 29: 3.0, 30: 4.0, 31: 5.0, 32: 7.0, 33: 4.0, 34: 4.0, 35: 4.0, 36: 4.0, 37: 4.0, 38: 4.0, 39: 4.0, 40: 4.0, 41: 5.0, 42: 5.0, 43: 4.0, 44: 4.0, 45: 1.0, 46: 10.0, 47: 10.0, 48: 1.0, 49: 4.0, 50: 4.0, 51: nan, 52: 7.0, 53: 4.0, 54: 4.0, 55: 9.0, 56: 9.0, 57: 4.0, 58: 3.0, 59: 4.0, 60: 4.0, 61: nan, 62: 7.0, 63: nan, 64: 7.0, 65: 7.0, 66: 7.0, 67: 7.0, 68: 8.0, 69: 7.0, 70: 7.0, 71: 7.0, 72: nan, 73: 10.0, 74: 8.0, 75: 7.0, 76: 8.0, 77: 5.0, 78: nan, 79: 7.0, 80: nan, 81: 4.0, 82: 7.0, 83: nan, 84: 4.0, 85: nan, 86: 4.0, 87: 4.0, 88: 4.0, 89: 4.0, 90: 10.0, 91: 5.0, 92: nan, 93: 7.0, 94: 4.0, 95: 4.0, 96: 4.0, 97: nan, 98: 1.0, 99: 9.0}, 'job3_category': {0: 5.0, 1: 5.0, 2: 4.0, 3: nan, 4: 10.0, 5: nan, 6: nan, 7: nan, 8: nan, 9: nan, 10: 4.0, 11: 4.0, 12: 5.0, 13: nan, 14: 4.0, 15: 1.0, 16: 4.0, 17: 4.0, 18: 4.0, 19: 4.0, 20: nan, 21: nan, 22: nan, 23: nan, 24: nan, 25: 1.0, 26: 4.0, 27: nan, 28: nan, 29: 3.0, 30: 4.0, 31: 3.0, 32: 4.0, 33: nan, 34: nan, 35: nan, 36: 4.0, 37: nan, 38: nan, 39: 4.0, 40: 7.0, 41: 5.0, 42: nan, 43: 4.0, 44: 4.0, 45: 1.0, 46: 7.0, 47: 1.0, 48: 7.0, 49: 7.0, 50: 4.0, 51: 8.0, 52: nan, 53: 4.0, 54: 7.0, 55: nan, 56: 8.0, 57: nan, 58: nan, 59: 3.0, 60: nan, 61: nan, 62: nan, 63: nan, 64: nan, 65: 7.0, 66: 7.0, 67: 7.0, 68: 7.0, 69: 8.0, 70: 7.0, 71: 4.0, 72: 8.0, 73: nan, 74: 3.0, 75: 10.0, 76: 4.0, 77: 8.0, 78: 8.0, 79: nan, 80: nan, 81: 4.0, 82: 7.0, 83: 4.0, 84: nan, 85: 8.0, 86: 4.0, 87: 4.0, 88: 4.0, 89: nan, 90: 5.0, 91: 7.0, 92: 4.0, 93: nan, 94: 1.0, 95: nan, 96: nan, 97: 10.0, 98: 1.0, 99: 1.0}, 'job4_category': {0: nan, 1: nan, 2: nan, 3: nan, 4: nan, 5: nan, 6: nan, 7: nan, 8: nan, 9: nan, 10: 4.0, 11: nan, 12: nan, 13: nan, 14: 4.0, 15: 8.0, 16: nan, 17: 4.0, 18: 3.0, 19: nan, 20: nan, 21: nan, 22: nan, 23: nan, 24: nan, 25: nan, 26: nan, 27: nan, 28: nan, 29: nan, 30: nan, 31: nan, 32: 4.0, 33: nan, 34: nan, 35: nan, 36: 4.0, 37: nan, 38: nan, 39: nan, 40: 4.0, 41: nan, 42: nan, 43: nan, 44: nan, 45: nan, 46: nan, 47: 7.0, 48: nan, 49: nan, 50: 5.0, 51: 8.0, 52: nan, 53: 4.0, 54: nan, 55: nan, 56: 8.0, 57: nan, 58: nan, 59: nan, 60: nan, 61: nan, 62: nan, 63: nan, 64: nan, 65: 8.0, 66: 7.0, 67: nan, 68: nan, 69: nan, 70: 8.0, 71: 5.0, 72: nan, 73: nan, 74: nan, 75: 10.0, 76: nan, 77: 4.0, 78: nan, 79: nan, 80: nan, 81: nan, 82: nan, 83: nan, 84: nan, 85: nan, 86: 10.0, 87: nan, 88: 8.0, 
89: nan, 90: nan, 91: nan, 92: nan, 93: nan, 94: 4.0, 95: nan, 96: nan, 97: nan, 98: nan, 99: 5.0}}
df = pd.DataFrame(d)
Code:
df_melted = df.melt()
df_grouped = df_melted.groupby(['variable', 'value'])['value'].count().to_frame()
df_grouped['pct'] = df_grouped['value'] / df_grouped.groupby(['variable'])['value'].transform('count')
df_grouped['pct']
Output:
variable value
job1_category 1.0 0.300000
2.0 0.200000
3.0 0.400000
4.0 4.700000
5.0 0.600000
6.0 0.100000
7.0 1.200000
8.0 0.600000
9.0 0.400000
10.0 0.300000
job2_category 1.0 0.400000
2.0 0.100000
3.0 0.300000
4.0 4.100000
5.0 1.000000
6.0 0.400000
7.0 1.400000
8.0 0.400000
9.0 0.300000
10.0 0.400000
job3_category 1.0 1.000000
3.0 0.571429
4.0 3.571429
5.0 0.714286
7.0 1.714286
8.0 1.000000
10.0 0.428571
job4_category 3.0 0.166667
4.0 1.500000
5.0 0.500000
7.0 0.333333
8.0 1.000000
10.0 0.333333
Name: pct, dtype: float64

Pandas' qcut possibly rounding?

I'm trying to use pandas' qcut to bin my values into quantile-based buckets.
However, it's just giving me whole numbers, which does not match what I'm expecting.
I'm expecting something along the following lines - in particular, not whole numbers:
The above was calculated with Excel's QUARTILE.EXC() using the exact same data.
Pandas, however, is just giving me the bins 1, 2, 3, 4.
Any ideas? Here is the code:
import pandas as pd
data = {0: 2.75,
1: 2.875,
2: 3.5,
3: 3.875,
4: 3.125,
5: 2.25,
6: 2.125,
7: 3.375,
8: 3.75,
9: 1.875,
10: 3.125,
11: 2.625,
12: 1.25,
13: 2.625,
14: 2.25,
15: 3.125,
16: 3.375,
17: 2.25,
18: 2.25,
19: 3.125,
20: 3.375,
21: 2.5,
22: 3.375,
23: 3.5,
24: 3.125,
25: 3.0,
26: 2.125,
27: 3.125,
28: 2.375,
29: 2.375,
30: 2.75,
31: 3.0,
32: 2.625,
33: 2.0,
34: 2.75,
35: 3.25,
36: 3.0,
37: 1.5,
38: 3.5,
39: 2.375,
40: 3.375,
41: 2.625,
42: 3.0,
43: 2.5,
44: 2.625,
45: 2.875,
46: 2.25,
47: 2.5,
48: 1.125,
49: 1.625,
50: 1.375,
51: 2.125,
52: 1.625,
53: 2.125,
54: 1.0,
55: 1.5,
56: 1.25,
57: 3.125,
58: 1.125,
59: 1.75}
df = pd.Series(data).to_frame('values')
n_bins = 4
df['qcutbins'] = pd.qcut(df['values'], q=n_bins, labels=range(1,n_bins+1)).astype('float64')
df.groupby(['qcutbins'])['qcutbins'].describe()[['min','max']].sort_values(by='max').reset_index(drop=True)
It looks like you want something like this instead:
df = pd.Series(data).to_frame('values')
n_bins = 4
df['qcutbins'] = pd.qcut(df['values'], q=n_bins)
df.groupby("qcutbins").agg([min, max])
values
min max
qcutbins
(0.999, 2.125] 1.00 2.125
(2.125, 2.625] 2.25 2.625
(2.625, 3.125] 2.75 3.125
(3.125, 3.875] 3.25 3.875
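If the goal is the quartile breakpoints themselves (the non-integer numbers Excel reports), a sketch of how to read them off the data directly; note that pandas' default linear interpolation corresponds to QUARTILE.INC, so the values can differ slightly from QUARTILE.EXC:
s = pd.Series(data)
# quartile breakpoints of the raw values; qcut uses these as its bin edges
print(s.quantile([0.25, 0.5, 0.75]))
# the same edges appear in the qcut intervals
print(pd.qcut(s, q=4).cat.categories)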

Seaborn annotate lineplot of projected world population

"""
I'm trying to reproduce a plot showing the world population growth from 1950 to 2100.
ideally, I'd like to show two different colors under the lineplot, darkgreen from 1950 to 2019 because these are actual data, and lightgreen for the projected data (2019 to 2100)
I'd like to annotate specific points corresponding to 1950, 1987, 2019 and 2050. I tried using markers=True but but failed.
I'm looking for something like the following plot (without the annual growth rate in red)
Thank you in advance for helping me out.
"""
data = {'Year': {0: 1950,
1: 1951,
2: 1952,
3: 1953,
4: 1954,
5: 1955,
6: 1956,
7: 1957,
8: 1958,
9: 1959,
10: 1960,
11: 1961,
12: 1962,
13: 1963,
14: 1964,
15: 1965,
16: 1966,
17: 1967,
18: 1968,
19: 1969,
20: 1970,
21: 1971,
22: 1972,
23: 1973,
24: 1974,
25: 1975,
26: 1976,
27: 1977,
28: 1978,
29: 1979,
30: 1980,
31: 1981,
32: 1982,
33: 1983,
34: 1984,
35: 1985,
36: 1986,
37: 1987,
38: 1988,
39: 1989,
40: 1990,
41: 1991,
42: 1992,
43: 1993,
44: 1994,
45: 1995,
46: 1996,
47: 1997,
48: 1998,
49: 1999,
50: 2000,
51: 2001,
52: 2002,
53: 2003,
54: 2004,
55: 2005,
56: 2006,
57: 2007,
58: 2008,
59: 2009,
60: 2010,
61: 2011,
62: 2012,
63: 2013,
64: 2014,
65: 2015,
66: 2016,
67: 2017,
68: 2018,
69: 2019,
70: 2020,
71: 2091,
72: 2092,
73: 2093,
74: 2094,
75: 2095,
76: 2096,
77: 2097,
78: 2098,
79: 2099,
80: 2100},
'billion': {0: 2.5,
1: 2.6,
2: 2.6,
3: 2.7,
4: 2.7,
5: 2.8,
6: 2.8,
7: 2.9,
8: 2.9,
9: 3.0,
10: 3.0,
11: 3.1,
12: 3.2,
13: 3.2,
14: 3.3,
15: 3.3,
16: 3.4,
17: 3.5,
18: 3.6,
19: 3.6,
20: 3.7,
21: 3.8,
22: 3.9,
23: 3.9,
24: 4.0,
25: 4.1,
26: 4.2,
27: 4.2,
28: 4.3,
29: 4.4,
30: 4.5,
31: 4.5,
32: 4.6,
33: 4.7,
34: 4.8,
35: 4.9,
36: 5.0,
37: 5.1,
38: 5.1,
39: 5.2,
40: 5.3,
41: 5.4,
42: 5.5,
43: 5.6,
44: 5.7,
45: 5.7,
46: 5.8,
47: 5.9,
48: 6.0,
49: 6.1,
50: 6.1,
51: 6.2,
52: 6.3,
53: 6.4,
54: 6.5,
55: 6.5,
56: 6.6,
57: 6.7,
58: 6.8,
59: 6.9,
60: 7.0,
61: 7.0,
62: 7.1,
63: 7.2,
64: 7.3,
65: 7.4,
66: 7.5,
67: 7.5,
68: 7.6,
69: 7.7,
70: 7.8,
71: 10.8,
72: 10.8,
73: 10.8,
74: 10.8,
75: 10.9,
76: 10.9,
77: 10.9,
78: 10.9,
79: 10.9,
80: 10.9}}
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns
df = pd.DataFrame(data)
print(df)
fig,ax = plt.subplots(figsize=(10,8))
sns.lineplot(x='Year',y='billion',data=df,ax=ax,color='b')
ax.set_ylim([2,11])
plt.fill_between(df['Year'].values, df['billion'].values,color='lightgreen')
plt.text(1950,2.5,'2.5 Billion\nin 1950',horizontalalignment='left')
plt.text(1987,5,'5 Billion\nin 1987',horizontalalignment='right')
plt.text(2019,7.7,'7.7 Billion\nin 2019',horizontalalignment='right')
plt.text(2050,9.7,'9.7 Billion\nin 2050',horizontalalignment='right')
ax.spines['top'].set_visible(False)
ax.spines['left'].set_visible(False)#hiding y spine
plt.gca().axes.get_yaxis().set_visible(False) #hiding y axis
ax.spines['right'].set_visible(False)
plt.show()
plt.close()
"""
This is what I got so far
"""
You can fill between years using where=:
ax.fill_between(df['Year'], df['billion'], color='darkgreen', where=df['Year'] <= 2019)
ax.fill_between(df['Year'], df['billion'], color='lightgreen', where=df['Year'] >= 2019)
You can interpolate values for the years with np.interp():
marked_years = [1950, 1987, 2019, 2050]
ax.scatter(marked_years, np.interp(marked_years, df['Year'], df['billion']), marker='o', color='black', s=50)
In a similar way the texts could be placed:
for year, value in zip(marked_years, np.interp(marked_years, df['Year'], df['billion'])):
    ax.text(year, value, f'{value:.1f} Billion\nin {year}\n', ha='left' if year < 1970 else 'right', va='bottom')
Optionally you set tick marks for the x-axis every 10 years, and leave out the padding:
ax.xaxis.set_major_locator(ticker.MultipleLocator(10))
ax.margins(x=0, tight=True) # zero padding for the x-axis
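For completeness, a sketch that assembles those pieces with the df and imports from the question (colors and styling kept as in the original code):
fig, ax = plt.subplots(figsize=(10, 8))
sns.lineplot(x='Year', y='billion', data=df, ax=ax, color='b')
# darker fill for the observed years, lighter fill for the projection
ax.fill_between(df['Year'], df['billion'], color='darkgreen', where=df['Year'] <= 2019)
ax.fill_between(df['Year'], df['billion'], color='lightgreen', where=df['Year'] >= 2019)
# interpolate the population for the highlighted years and mark them
marked_years = [1950, 1987, 2019, 2050]
marked_values = np.interp(marked_years, df['Year'], df['billion'])
ax.scatter(marked_years, marked_values, marker='o', color='black', s=50, zorder=3)
for year, value in zip(marked_years, marked_values):
    ax.text(year, value, f'{value:.1f} Billion\nin {year}\n', ha='left' if year < 1970 else 'right', va='bottom')
# tick marks every 10 years and no horizontal padding
ax.xaxis.set_major_locator(ticker.MultipleLocator(10))
ax.margins(x=0, tight=True)
# hide the y-axis and all spines except the bottom one, as in the question
for spine in ('top', 'left', 'right'):
    ax.spines[spine].set_visible(False)
ax.get_yaxis().set_visible(False)
plt.show()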

Visualization for rolling percentage for accuracy

Is there a way to visualize the results below?
29% of the results have accuracy >= 90%
59% of the results have accuracy >= 80%
88% of the results have accuracy >= 70%
98% of the results have accuracy >= 60%
2 entries have 59% and 57%
Below is the code with the test results (accuracy) and the rolling percentage:
df1 = pd.DataFrame( {'Test Results': {0: 1.0, 1: 1.0, 2: 1.0, 3: 1.0, 4: 1.0, 5: 1.0, 6: 1.0, 7: 1.0, 8: 1.0, 9: 1.0, 10: 1.0, 11: 1.0, 12: 1.0, 13: 1.0, 14: 1.0, 15: 1.0, 16: 1.0, 17: 1.0, 18: 1.0, 19: 1.0, 20: 0.9452757159999999, 21: 0.9450125009999999, 22: 0.941873717, 23: 0.9336287990000001, 24: 0.932937845, 25: 0.928728552, 26: 0.9217334709999999, 27: 0.9212251640000001, 28: 0.903416839, 29: 0.900519655, 30: 0.8946950090000001, 31: 0.894315668, 32: 0.893705918, 33: 0.8911087870000001, 34: 0.88341769, 35: 0.875316697, 36: 0.873369453, 37: 0.8724485759999999, 38: 0.870855835, 39: 0.8682100159999999, 40: 0.866413987, 41: 0.866225758, 42: 0.86607366, 43: 0.861137576, 44: 0.8570423090000001, 45: 0.855989443, 46: 0.8363952240000001, 47: 0.835153771, 48: 0.831573322, 49: 0.8297349890000001, 50: 0.827174152, 51: 0.82533598, 52: 0.8244730259999999, 53: 0.8231749909999999, 54: 0.821542323, 55: 0.81994456, 56: 0.8193151340000001, 57: 0.817544275, 58: 0.8043684740000001, 59: 0.804280654, 60: 0.804033467, 61: 0.8024588559999999, 62: 0.799949648, 63: 0.7998811370000001, 64: 0.7993836879999999, 65: 0.7978083340000001, 66: 0.794987453, 67: 0.7942367459999999, 68: 0.788060922, 69: 0.7833455859999999, 70: 0.783237411, 71: 0.781810415, 72: 0.77947656, 73: 0.775430937, 74: 0.769843183, 75: 0.763548283, 76: 0.76318018, 77: 0.75891367, 78: 0.7588845790000001, 79: 0.758045414, 80: 0.7545263740000001, 81: 0.7514679670000001, 82: 0.74697755, 83: 0.74597623, 84: 0.743042806, 85: 0.740900129, 86: 0.740844543, 87: 0.7246525220000001, 88: 0.7133019070000001, 89: 0.7124335999999999, 90: 0.703856728, 91: 0.694475843, 92: 0.68836286, 93: 0.683978596, 94: 0.679002735, 95: 0.664945699, 96: 0.662761826, 97: 0.649950991, 98: 0.638550239, 99: 0.60593566, 100: 0.603891537, 101: 0.602900777, 102: 0.594660442, 103: 0.565978017}, 'Rolling Percentage': {0: 0.009615385, 1: 0.019230768999999998, 2: 0.028846154, 3: 0.038461537999999997, 4: 0.048076923, 5: 0.057692308, 6: 0.067307692, 7: 0.076923077, 8: 0.086538462, 9: 0.096153846, 10: 0.10576923099999999, 11: 0.115384615, 12: 0.125, 13: 0.134615385, 14: 0.144230769, 15: 0.153846154, 16: 0.16346153800000002, 17: 0.173076923, 18: 0.182692308, 19: 0.192307692, 20: 0.201923077, 21: 0.21153846199999998, 22: 0.22115384600000002, 23: 0.23076923100000002, 24: 0.240384615, 25: 0.25, 26: 0.259615385, 27: 0.269230769, 28: 0.278846154, 29: 0.288461538, 30: 0.298076923, 31: 0.307692308, 32: 0.317307692, 33: 0.326923077, 34: 0.33653846200000004, 35: 0.346153846, 36: 0.355769231, 37: 0.365384615, 38: 0.375, 39: 0.384615385, 40: 0.394230769, 41: 0.403846154, 42: 0.41346153799999996, 43: 0.423076923, 44: 0.43269230799999997, 45: 0.44230769200000003, 46: 0.451923077, 47: 0.46153846200000004, 48: 0.471153846, 49: 0.480769231, 50: 0.490384615, 51: 0.5, 52: 0.509615385, 53: 0.519230769, 54: 0.528846154, 55: 0.538461538, 56: 0.548076923, 57: 0.557692308, 58: 0.567307692, 59: 0.576923077, 60: 0.586538462, 61: 0.596153846, 62: 0.605769231, 63: 0.615384615, 64: 0.625, 65: 0.634615385, 66: 0.644230769, 67: 0.653846154, 68: 0.663461538, 69: 0.673076923, 70: 0.682692308, 71: 0.692307692, 72: 0.701923077, 73: 0.711538462, 74: 0.721153846, 75: 0.7307692309999999, 76: 0.740384615, 77: 0.75, 78: 0.759615385, 79: 0.7692307690000001, 80: 0.778846154, 81: 0.788461538, 82: 0.798076923, 83: 0.807692308, 84: 0.817307692, 85: 0.826923077, 86: 0.836538462, 87: 0.846153846, 88: 0.8557692309999999, 89: 0.865384615, 90: 0.875, 91: 0.884615385, 92: 0.8942307690000001, 93: 0.903846154, 94: 0.913461538, 95: 0.923076923, 96: 
0.932692308, 97: 0.942307692, 98: 0.951923077, 99: 0.961538462, 100: 0.971153846, 101: 0.9807692309999999, 102: 0.990384615, 103: 1.0}} )
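One straightforward way to visualize this (a sketch with matplotlib, using the df1 defined above) is to plot accuracy against the rolling percentage as a survival-style curve, so each point reads as "this share of results reaches at least this accuracy":
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(8, 5))
# accuracies are sorted in descending order, so the rolling percentage in each row
# is the share of results with accuracy >= that row's accuracy
ax.plot(df1['Test Results'], df1['Rolling Percentage'], drawstyle='steps-post')
# mark the thresholds quoted above
for threshold in (0.9, 0.8, 0.7, 0.6):
    share = df1.loc[df1['Test Results'] >= threshold, 'Rolling Percentage'].max()
    ax.axvline(threshold, color='grey', linestyle=':', linewidth=1)
    ax.annotate(f'{share:.0%} >= {threshold:.0%}', xy=(threshold, share), xytext=(5, 5), textcoords='offset points')
ax.set_xlabel('Accuracy')
ax.set_ylabel('Share of results with at least this accuracy')
ax.invert_xaxis()  # read from high accuracy on the left to low on the right
plt.show()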
