Plotting variance scale on y-axis for PCA in Python - python
I am trying to make PCA analysis, but cannot plot properly variance on y-axis.
I have data, which I exported for you
{1: {0: 242.0, 1: 290.0, 2: 340.0, 3: 363.0, 4: 430.0, 5: 450.0, 6: 500.0, 7: 390.0, 8: 450.0, 9: 500.0, 10: 475.0, 11: 500.0, 12: 500.0, 13: 600.0, 14: 600.0, 15: 700.0, 16: 700.0, 17: 610.0, 18: 650.0, 19: 575.0, 20: 685.0, 21: 620.0, 22: 680.0, 23: 700.0, 24: 725.0, 25: 720.0, 26: 714.0, 27: 850.0, 28: 1000.0, 29: 920.0, 30: 955.0, 31: 925.0, 32: 975.0, 33: 950.0, 34: 40.0, 35: 69.0, 36: 78.0, 37: 87.0, 38: 120.0, 39: 0.0, 40: 110.0, 41: 120.0, 42: 150.0, 43: 145.0, 44: 160.0, 45: 140.0, 46: 160.0, 47: 169.0, 48: 161.0, 49: 200.0, 50: 180.0, 51: 290.0, 52: 272.0, 53: 390.0, 54: 6.7, 55: 7.5, 56: 7.0, 57: 9.7, 58: 9.8, 59: 8.7, 60: 10.0, 61: 9.9, 62: 9.8, 63: 12.2, 64: 13.4, 65: 12.2, 66: 19.7, 67: 19.9, 68: 200.0, 69: 300.0, 70: 300.0, 71: 300.0, 72: 430.0, 73: 345.0, 74: 456.0, 75: 510.0, 76: 540.0, 77: 500.0, 78: 567.0, 79: 770.0, 80: 950.0, 81: 1250.0, 82: 1600.0, 83: 1550.0, 84: 1650.0}, 2: {0: 23.2, 1: 24.0, 2: 23.9, 3: 26.3, 4: 26.5, 5: 26.8, 6: 26.8, 7: 27.6, 8: 27.6, 9: 28.5, 10: 28.4, 11: 28.7, 12: 29.1, 13: 29.4, 14: 29.4, 15: 30.4, 16: 30.4, 17: 30.9, 18: 31.0, 19: 31.3, 20: 31.4, 21: 31.5, 22: 31.8, 23: 31.9, 24: 31.8, 25: 32.0, 26: 32.7, 27: 32.8, 28: 33.5, 29: 35.0, 30: 35.0, 31: 36.2, 32: 37.4, 33: 38.0, 34: 12.9, 35: 16.5, 36: 17.5, 37: 18.2, 38: 18.6, 39: 19.0, 40: 19.1, 41: 19.4, 42: 20.4, 43: 20.5, 44: 20.5, 45: 21.0, 46: 21.1, 47: 22.0, 48: 22.0, 49: 22.1, 50: 23.6, 51: 24.0, 52: 25.0, 53: 29.5, 54: 9.3, 55: 10.0, 56: 10.1, 57: 10.4, 58: 10.7, 59: 10.8, 60: 11.3, 61: 11.3, 62: 11.4, 63: 11.5, 64: 11.7, 65: 12.1, 66: 13.2, 67: 13.8, 68: 30.0, 69: 31.7, 70: 32.7, 71: 34.8, 72: 35.5, 73: 36.0, 74: 40.0, 75: 40.0, 76: 40.1, 77: 42.0, 78: 43.2, 79: 44.8, 80: 48.3, 81: 52.0, 82: 56.0, 83: 56.0, 84: 59.0}, 3: {0: 25.4, 1: 26.3, 2: 26.5, 3: 29.0, 4: 29.0, 5: 29.7, 6: 29.7, 7: 30.0, 8: 30.0, 9: 30.7, 10: 31.0, 11: 31.0, 12: 31.5, 13: 32.0, 14: 32.0, 15: 33.0, 16: 33.0, 17: 33.5, 18: 33.5, 19: 34.0, 20: 34.0, 21: 34.5, 22: 35.0, 23: 35.0, 24: 35.0, 25: 35.0, 26: 36.0, 27: 36.0, 28: 37.0, 29: 38.5, 30: 38.5, 31: 39.5, 32: 41.0, 33: 41.0, 34: 14.1, 35: 18.2, 36: 18.8, 37: 19.8, 38: 20.0, 39: 20.5, 40: 20.8, 41: 21.0, 42: 22.0, 43: 22.0, 44: 22.5, 45: 22.5, 46: 22.5, 47: 24.0, 48: 23.4, 49: 23.5, 50: 25.2, 51: 26.0, 52: 27.0, 53: 31.7, 54: 9.8, 55: 10.5, 56: 10.6, 57: 11.0, 58: 11.2, 59: 11.3, 60: 11.8, 61: 11.8, 62: 12.0, 63: 12.2, 64: 12.4, 65: 13.0, 66: 14.3, 67: 15.0, 68: 32.3, 69: 34.0, 70: 35.0, 71: 37.3, 72: 38.0, 73: 38.5, 74: 42.5, 75: 42.5, 76: 43.0, 77: 45.0, 78: 46.0, 79: 48.0, 80: 51.7, 81: 56.0, 82: 60.0, 83: 60.0, 84: 63.4}, 4: {0: 30.0, 1: 31.2, 2: 31.1, 3: 33.5, 4: 34.0, 5: 34.7, 6: 34.5, 7: 35.0, 8: 35.1, 9: 36.2, 10: 36.2, 11: 36.2, 12: 36.4, 13: 37.2, 14: 37.2, 15: 38.3, 16: 38.5, 17: 38.6, 18: 38.7, 19: 39.5, 20: 39.2, 21: 39.7, 22: 40.6, 23: 40.5, 24: 40.9, 25: 40.6, 26: 41.5, 27: 41.6, 28: 42.6, 29: 44.1, 30: 44.0, 31: 45.3, 32: 45.9, 33: 46.5, 34: 16.2, 35: 20.3, 36: 21.2, 37: 22.2, 38: 22.2, 39: 22.8, 40: 23.1, 41: 23.7, 42: 24.7, 43: 24.3, 44: 25.3, 45: 25.0, 46: 25.0, 47: 27.2, 48: 26.7, 49: 26.8, 50: 27.9, 51: 29.2, 52: 30.6, 53: 35.0, 54: 10.8, 55: 11.6, 56: 11.6, 57: 12.0, 58: 12.4, 59: 12.6, 60: 13.1, 61: 13.1, 62: 13.2, 63: 13.4, 64: 13.5, 65: 13.8, 66: 15.2, 67: 16.2, 68: 34.8, 69: 37.8, 70: 38.8, 71: 39.8, 72: 40.5, 73: 41.0, 74: 45.5, 75: 45.5, 76: 45.8, 77: 48.0, 78: 48.7, 79: 51.2, 80: 55.1, 81: 59.7, 82: 64.0, 83: 64.0, 84: 68.0}, 5: {0: 38.4, 1: 40.0, 2: 39.8, 3: 38.0, 4: 36.6, 5: 39.2, 6: 41.1, 7: 36.2, 8: 39.9, 9: 39.3, 10: 39.4, 11: 39.7, 12: 37.8, 13: 40.2, 14: 41.5, 15: 38.8, 16: 38.8, 17: 40.5, 18: 37.4, 19: 38.3, 20: 40.8, 21: 39.1, 22: 38.1, 23: 40.1, 24: 40.0, 25: 40.3, 26: 39.8, 27: 40.6, 28: 44.5, 29: 40.9, 30: 41.1, 31: 41.4, 32: 40.6, 33: 37.9, 34: 25.6, 35: 26.1, 36: 26.3, 37: 25.3, 38: 28.0, 39: 28.4, 40: 26.7, 41: 25.8, 42: 23.5, 43: 27.3, 44: 27.8, 45: 26.2, 46: 25.6, 47: 27.7, 48: 25.9, 49: 27.6, 50: 25.4, 51: 30.4, 52: 28.0, 53: 27.1, 54: 16.1, 55: 17.0, 56: 14.9, 57: 18.3, 58: 16.8, 59: 15.7, 60: 16.9, 61: 16.9, 62: 16.7, 63: 15.6, 64: 18.0, 65: 16.5, 66: 18.9, 67: 18.1, 68: 16.0, 69: 15.1, 70: 15.3, 71: 15.8, 72: 18.0, 73: 15.6, 74: 16.0, 75: 15.0, 76: 17.0, 77: 14.5, 78: 16.0, 79: 15.0, 80: 16.2, 81: 17.9, 82: 15.0, 83: 15.0, 84: 15.9}, 6: {0: 13.4, 1: 13.8, 2: 15.1, 3: 13.3, 4: 15.1, 5: 14.2, 6: 15.3, 7: 13.4, 8: 13.8, 9: 13.7, 10: 14.1, 11: 13.3, 12: 12.0, 13: 13.9, 14: 15.0, 15: 13.8, 16: 13.5, 17: 13.3, 18: 14.8, 19: 14.1, 20: 13.7, 21: 13.3, 22: 15.1, 23: 13.8, 24: 14.8, 25: 15.0, 26: 14.1, 27: 14.9, 28: 15.5, 29: 14.3, 30: 14.3, 31: 14.9, 32: 14.7, 33: 13.7, 34: 14.0, 35: 13.9, 36: 13.7, 37: 14.3, 38: 16.1, 39: 14.7, 40: 14.7, 41: 13.9, 42: 15.2, 43: 14.6, 44: 15.1, 45: 13.3, 46: 15.2, 47: 14.1, 48: 13.6, 49: 15.4, 50: 14.0, 51: 15.4, 52: 15.6, 53: 15.3, 54: 9.7, 55: 10.0, 56: 9.9, 57: 11.5, 58: 10.3, 59: 10.2, 60: 9.8, 61: 8.9, 62: 8.7, 63: 10.4, 64: 9.4, 65: 9.1, 66: 13.6, 67: 11.6, 68: 9.7, 69: 11.0, 70: 11.3, 71: 10.1, 72: 11.3, 73: 9.7, 74: 9.5, 75: 9.8, 76: 11.2, 77: 10.2, 78: 10.0, 79: 10.5, 80: 11.2, 81: 11.7, 82: 9.6, 83: 9.6, 84: 11.0}}
Import libraries
import pandas as pd
import sklearn
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
import matplotlib.pyplot as plt
Data given above, but this is the code
fishes = pd.read_csv("fish.csv", header=None, index_col=False, skiprows=1, usecols=range(1,7))
fishes.head()
Create scaler
scaler = StandardScaler()
Create a PCA instance
pca = PCA()
Create pipeline
pipeline = make_pipeline(scaler, pca)
Fit the pipeline to 'samples'
pipeline.fit(fishes)
Plot the explained variances
features = range(pca.n_components_)
plt.bar(features, pca.explained_variance_)
plt.xlabel('PCA feature')
plt.ylabel('variance')
plt.xticks(features)
plt.show()
My current output is this, which does not make sense.
If I understand correctly, PCA variance on y-axis should be 100%. My three first factors does not explain too much with such scale. Even if 1 here = 10%, it is still not 100% in total.
Either I did something wrong (unlikely) or I need to adjust the scale for y-axis manually? Where is my mistake? Thanks.
Instead of plotting pca.explained_variance_, try plotting:
pca.explained_variance_ratio_
This will sum up to 1. The variance explained sums to 100%, but values depend on your data unless you expressed them as a ratio.
Related
Add a Line to Bar Chart Plotly Python
I created a stacked bar chart and need to add a horizontal line but it doesn't show What can be the problem? Below is the code import plotly.express as px import pandas as pd import numpy as np import plotly.graph_objects as go #creating array for days of the week arr1=(['Mon. W1']*18) arr2=(['Tue. W1']*18) arr3=(['Wed. W1']*18) arr4=(['Thu. W1']*18) arr5=(['Fri. W1']*18) arr6=(['Mon. W2']*18) arr7=(['Tue. W2']*18) arr8=(['Wed. W2']*18) arr9=(['Thu. W2']*18) arr10=(['Fri. W2']*18) dates = np.concatenate((arr1,arr2,arr3,arr4,arr5,arr6,arr7,arr8,arr9,arr10)) #creating array for desk and meeting space population from itertools import chain from itertools import zip_longest x = df3['Occupancy x Hour'].to_numpy() y = df3['Population x Hour'].to_numpy() #this is to change numpy array to normal array for further coding values_array = list(filter(lambda x: x != '', chain.from_iterable(zip_longest(x, y, fillvalue = '')))) #creating dictionary for the figure df = pd.DataFrame( dict( day=dates, time=['9am','9am','10am','10am','11am','11am','12pm','12pm','1pm','1pm','2pm','2pm','3pm','3pm','4pm','4pm','5pm','5pm']*10, type=["Desk", "Meeting"]*90, numbers=values_array, ) ) #creating the bar chart fig = go.Figure() fig.update_layout( template="simple_white", xaxis=dict(title_text="Time"), yaxis=dict(title_text="Population"), barmode="stack", ) colors = ["Blue","LimeGreen"] #here adding time and day of the week on x-axis for r, c in zip(df.type.unique(), colors): plot_df = df[df.type == r] fig.add_trace( go.Bar(x=[plot_df.day, plot_df.time], y=plot_df.numbers, name=r, marker_color=c), ) fig df3_dictionary = {'Week': {0: 1.0, 1: 1.0, 2: 1.0, 3: 1.0, 4: 1.0, 5: 1.0, 6: 1.0, 7: 1.0, 8: 1.0, 9: 1.0, 10: 1.0, 11: 1.0, 12: 1.0, 13: 1.0, 14: 1.0, 15: 1.0, 16: 1.0, 17: 1.0, 18: 1.0, 19: 1.0, 20: 1.0, 21: 1.0, 22: 1.0, 23: 1.0, 24: 1.0, 25: 1.0, 26: 1.0, 27: 1.0, 28: 1.0, 29: 1.0, 30: 1.0, 31: 1.0, 32: 1.0, 33: 1.0, 34: 1.0, 35: 1.0, 36: 1.0, 37: 1.0, 38: 1.0, 39: 1.0, 40: 1.0, 41: 1.0, 42: 1.0, 43: 1.0, 44: 1.0, 45: 2.0, 46: 2.0, 47: 2.0, 48: 2.0, 49: 2.0, 50: 2.0, 51: 2.0, 52: 2.0, 53: 2.0, 54: 2.0, 55: 2.0, 56: 2.0, 57: 2.0, 58: 2.0, 59: 2.0, 60: 2.0, 61: 2.0, 62: 2.0, 63: 2.0, 64: 2.0, 65: 2.0, 66: 2.0, 67: 2.0, 68: 2.0, 69: 2.0, 70: 2.0, 71: 2.0, 72: 2.0, 73: 2.0, 74: 2.0, 75: 2.0, 76: 2.0, 77: 2.0, 78: 2.0, 79: 2.0, 80: 2.0, 81: 2.0, 82: 2.0, 83: 2.0, 84: 2.0, 85: 2.0, 86: 2.0, 87: 2.0, 88: 2.0, 89: 2.0}, 'Day': {0: 'Monday', 1: 'Monday', 2: 'Monday', 3: 'Monday', 4: 'Monday', 5: 'Monday', 6: 'Monday', 7: 'Monday', 8: 'Monday', 9: 'Tuesday', 10: 'Tuesday', 11: 'Tuesday', 12: 'Tuesday', 13: 'Tuesday', 14: 'Tuesday', 15: 'Tuesday', 16: 'Tuesday', 17: 'Tuesday', 18: 'Wednesday', 19: 'Wednesday', 20: 'Wednesday', 21: 'Wednesday', 22: 'Wednesday', 23: 'Wednesday', 24: 'Wednesday', 25: 'Wednesday', 26: 'Wednesday', 27: 'Thursday', 28: 'Thursday', 29: 'Thursday', 30: 'Thursday', 31: 'Thursday', 32: 'Thursday', 33: 'Thursday', 34: 'Thursday', 35: 'Thursday', 36: 'Friday', 37: 'Friday', 38: 'Friday', 39: 'Friday', 40: 'Friday', 41: 'Friday', 42: 'Friday', 43: 'Friday', 44: 'Friday', 45: 'Monday', 46: 'Monday', 47: 'Monday', 48: 'Monday', 49: 'Monday', 50: 'Monday', 51: 'Monday', 52: 'Monday', 53: 'Monday', 54: 'Tuesday', 55: 'Tuesday', 56: 'Tuesday', 57: 'Tuesday', 58: 'Tuesday', 59: 'Tuesday', 60: 'Tuesday', 61: 'Tuesday', 62: 'Tuesday', 63: 'Wednesday', 64: 'Wednesday', 65: 'Wednesday', 66: 'Wednesday', 67: 'Wednesday', 68: 'Wednesday', 69: 'Wednesday', 70: 'Wednesday', 71: 'Wednesday', 72: 'Thursday', 73: 'Thursday', 74: 'Thursday', 75: 'Thursday', 76: 'Thursday', 77: 'Thursday', 78: 'Thursday', 79: 'Thursday', 80: 'Thursday', 81: 'Friday', 82: 'Friday', 83: 'Friday', 84: 'Friday', 85: 'Friday', 86: 'Friday', 87: 'Friday', 88: 'Friday', 89: 'Friday'}, 'Time': {0: '9am', 1: '10am', 2: '11am', 3: '12pm', 4: '1pm', 5: '2pm', 6: '3pm', 7: '4pm', 8: '5pm', 9: '9am', 10: '10am', 11: '11am', 12: '12pm', 13: '1pm', 14: '2pm', 15: '3pm', 16: '4pm', 17: '5pm', 18: '9am', 19: '10am', 20: '11am', 21: '12pm', 22: '1pm', 23: '2pm', 24: '3pm', 25: '4pm', 26: '5pm', 27: '9am', 28: '10am', 29: '11am', 30: '12pm', 31: '1pm', 32: '2pm', 33: '3pm', 34: '4pm', 35: '5pm', 36: '9am', 37: '10am', 38: '11am', 39: '12pm', 40: '1pm', 41: '2pm', 42: '3pm', 43: '4pm', 44: '5pm', 45: '9am', 46: '10am', 47: '11am', 48: '12pm', 49: '1pm', 50: '2pm', 51: '3pm', 52: '4pm', 53: '5pm', 54: '9am', 55: '10am', 56: '11am', 57: '12pm', 58: '1pm', 59: '2pm', 60: '3pm', 61: '4pm', 62: '5pm', 63: '9am', 64: '10am', 65: '11am', 66: '12pm', 67: '1pm', 68: '2pm', 69: '3pm', 70: '4pm', 71: '5pm', 72: '9am', 73: '10am', 74: '11am', 75: '12pm', 76: '1pm', 77: '2pm', 78: '3pm', 79: '4pm', 80: '5pm', 81: '9am', 82: '10am', 83: '11am', 84: '12pm', 85: '1pm', 86: '2pm', 87: '3pm', 88: '4pm', 89: '5pm'}, 'Occupancy x Hour': {0: 1378.0, 1: 1369.0, 2: 1372.0, 3: 1261.0, 4: 1087.0, 5: 1355.0, 6: 1383.0, 7: 1325.0, 8: 1050.0, 9: 1313.0, 10: 1347.0, 11: 1323.0, 12: 1202.0, 13: 1033.0, 14: 1237.0, 15: 1324.0, 16: 1352.0, 17: 1108.0, 18: 1217.0, 19: 1276.0, 20: 1365.0, 21: 1204.0, 22: 977.0, 23: 1199.0, 24: 1331.0, 25: 1293.0, 26: 1159.0, 27: 1220.0, 28: 1327.0, 29: 1354.0, 30: 1257.0, 31: 982.0, 32: 1199.0, 33: 1218.0, 34: 1271.0, 35: 1101.0, 36: 1139.0, 37: 1207.0, 38: 1259.0, 39: 1189.0, 40: 903.0, 41: 1171.0, 42: 1193.0, 43: 1239.0, 44: 899.0, 45: 1220.0, 46: 1357.0, 47: 1336.0, 48: 1188.0, 49: 1032.0, 50: 1261.0, 51: 1330.0, 52: 1267.0, 53: 1074.0, 54: 1301.0, 55: 1337.0, 56: 1329.0, 57: 1247.0, 58: 970.0, 59: 1233.0, 60: 1271.0, 61: 1246.0, 62: 1063.0, 63: 1210.0, 64: 1288.0, 65: 1331.0, 66: 1220.0, 67: 948.0, 68: 1273.0, 69: 1289.0, 70: 1329.0, 71: 1153.0, 72: 1213.0, 73: 1248.0, 74: 1272.0, 75: 1190.0, 76: 890.0, 77: 1199.0, 78: 1284.0, 79: 1233.0, 80: 1102.0, 81: 1110.0, 82: 1210.0, 83: 1175.0, 84: 1083.0, 85: 807.0, 86: 1101.0, 87: 1188.0, 88: 1181.0, 89: 857.0}, 'Population x Hour': {0: 339.0, 1: 516.0, 2: 564.0, 3: 616.0, 4: 637.0, 5: 548.0, 6: 582.0, 7: 527.0, 8: 341.0, 9: 457.0, 10: 711.0, 11: 731.0, 12: 685.0, 13: 747.0, 14: 735.0, 15: 723.0, 16: 657.0, 17: 388.0, 18: 497.0, 19: 703.0, 20: 690.0, 21: 758.0, 22: 759.0, 23: 745.0, 24: 686.0, 25: 633.0, 26: 374.0, 27: 465.0, 28: 588.0, 29: 648.0, 30: 603.0, 31: 663.0, 32: 687.0, 33: 613.0, 34: 597.0, 35: 262.0, 36: 333.0, 37: 477.0, 38: 521.0, 39: 417.0, 40: 443.0, 41: 513.0, 42: 456.0, 43: 389.0, 44: 159.0, 45: 388.0, 46: 584.0, 47: 646.0, 48: 635.0, 49: 640.0, 50: 643.0, 51: 592.0, 52: 496.0, 53: 282.0, 54: 431.0, 55: 677.0, 56: 646.0, 57: 623.0, 58: 717.0, 59: 674.0, 60: 604.0, 61: 553.0, 62: 348.0, 63: 539.0, 64: 724.0, 65: 651.0, 66: 635.0, 67: 729.0, 68: 600.0, 69: 611.0, 70: 596.0, 71: 353.0, 72: 456.0, 73: 673.0, 74: 639.0, 75: 690.0, 76: 655.0, 77: 616.0, 78: 663.0, 79: 576.0, 80: 340.0, 81: 394.0, 82: 518.0, 83: 527.0, 84: 475.0, 85: 466.0, 86: 452.0, 87: 421.0, 88: 288.0, 89: 181.0}} This code gives this result Then, i am trying to add a line but it doesn't show #finding minimum of the population tot_popul = x+y min(tot_popul) #here, I am trying to plot minimum line but it doesn't show fig.add_hline(y=min(tot_popul)) fig.show() I need a horizontal line with minimum values
Calculating how often values appear in column in percent, multiple columns, python
I am struggling with calculating how often the values 1-10 appear in a column relative to the total amount of values (percentage).getting the percentage of the values 1-10 in 4 different columns. I tried with the following code but this only works for columns 1 and 2 and gives NAN values for columns 3 and 4. Does anyone know why? Thank you! {'job1_category': {0: nan, 1: 4.0, 2: 5.0, 3: 5.0, 4: 4.0, 5: 4.0, 6: 5.0, 7: 4.0, 8: 4.0, 9: 4.0, 10: 4.0, 11: 4.0, 12: 4.0, 13: 4.0, 14: nan, 15: 4.0, 16: 3.0, 17: 7.0, 18: 4.0, 19: 4.0, 20: nan, 21: nan, 22: 4.0, 23: 1.0, 24: 4.0, 25: 1.0, 26: 4.0, 27: 2.0, 28: 5.0, 29: 3.0, 30: 4.0, 31: 5.0, 32: 4.0, 33: 4.0, 34: 4.0, 35: 4.0, 36: 3.0, 37: 4.0, 38: 4.0, 39: 4.0, 40: 9.0, 41: 4.0, 42: 4.0, 43: 3.0, 44: 4.0, 45: 9.0, 46: 10.0, 47: nan, 48: 10.0, 49: 4.0, 50: 8.0, 51: nan, 52: 5.0, 53: 8.0, 54: 4.0, 55: nan, 56: 4.0, 57: 8.0, 58: 4.0, 59: 4.0, 60: 4.0, 61: 4.0, 62: 8.0, 63: 4.0, 64: 8.0, 65: 7.0, 66: 4.0, 67: 7.0, 68: 8.0, 69: 7.0, 70: nan, 71: 7.0, 72: nan, 73: 10.0, 74: 7.0, 75: 6.0, 76: 7.0, 77: 4.0, 78: 7.0, 79: 7.0, 80: 7.0, 81: 7.0, 82: 4.0, 83: nan, 84: 4.0, 85: nan, 86: 4.0, 87: 9.0, 88: 4.0, 89: 4.0, 90: 4.0, 91: 4.0, 92: 4.0, 93: 7.0, 94: 2.0, 95: 4.0, 96: 4.0, 97: nan, 98: 1.0, 99: 9.0}, 'job2_category': {0: 6.0, 1: 5.0, 2: 4.0, 3: 5.0, 4: 6.0, 5: 4.0, 6: 5.0, 7: 5.0, 8: 2.0, 9: 4.0, 10: 4.0, 11: nan, 12: 5.0, 13: 4.0, 14: nan, 15: 1.0, 16: 4.0, 17: 4.0, 18: 4.0, 19: 4.0, 20: 4.0, 21: 3.0, 22: 4.0, 23: 4.0, 24: 4.0, 25: 4.0, 26: 8.0, 27: 6.0, 28: 6.0, 29: 3.0, 30: 4.0, 31: 5.0, 32: 7.0, 33: 4.0, 34: 4.0, 35: 4.0, 36: 4.0, 37: 4.0, 38: 4.0, 39: 4.0, 40: 4.0, 41: 5.0, 42: 5.0, 43: 4.0, 44: 4.0, 45: 1.0, 46: 10.0, 47: 10.0, 48: 1.0, 49: 4.0, 50: 4.0, 51: nan, 52: 7.0, 53: 4.0, 54: 4.0, 55: 9.0, 56: 9.0, 57: 4.0, 58: 3.0, 59: 4.0, 60: 4.0, 61: nan, 62: 7.0, 63: nan, 64: 7.0, 65: 7.0, 66: 7.0, 67: 7.0, 68: 8.0, 69: 7.0, 70: 7.0, 71: 7.0, 72: nan, 73: 10.0, 74: 8.0, 75: 7.0, 76: 8.0, 77: 5.0, 78: nan, 79: 7.0, 80: nan, 81: 4.0, 82: 7.0, 83: nan, 84: 4.0, 85: nan, 86: 4.0, 87: 4.0, 88: 4.0, 89: 4.0, 90: 10.0, 91: 5.0, 92: nan, 93: 7.0, 94: 4.0, 95: 4.0, 96: 4.0, 97: nan, 98: 1.0, 99: 9.0}, 'job3_category': {0: 5.0, 1: 5.0, 2: 4.0, 3: nan, 4: 10.0, 5: nan, 6: nan, 7: nan, 8: nan, 9: nan, 10: 4.0, 11: 4.0, 12: 5.0, 13: nan, 14: 4.0, 15: 1.0, 16: 4.0, 17: 4.0, 18: 4.0, 19: 4.0, 20: nan, 21: nan, 22: nan, 23: nan, 24: nan, 25: 1.0, 26: 4.0, 27: nan, 28: nan, 29: 3.0, 30: 4.0, 31: 3.0, 32: 4.0, 33: nan, 34: nan, 35: nan, 36: 4.0, 37: nan, 38: nan, 39: 4.0, 40: 7.0, 41: 5.0, 42: nan, 43: 4.0, 44: 4.0, 45: 1.0, 46: 7.0, 47: 1.0, 48: 7.0, 49: 7.0, 50: 4.0, 51: 8.0, 52: nan, 53: 4.0, 54: 7.0, 55: nan, 56: 8.0, 57: nan, 58: nan, 59: 3.0, 60: nan, 61: nan, 62: nan, 63: nan, 64: nan, 65: 7.0, 66: 7.0, 67: 7.0, 68: 7.0, 69: 8.0, 70: 7.0, 71: 4.0, 72: 8.0, 73: nan, 74: 3.0, 75: 10.0, 76: 4.0, 77: 8.0, 78: 8.0, 79: nan, 80: nan, 81: 4.0, 82: 7.0, 83: 4.0, 84: nan, 85: 8.0, 86: 4.0, 87: 4.0, 88: 4.0, 89: nan, 90: 5.0, 91: 7.0, 92: 4.0, 93: nan, 94: 1.0, 95: nan, 96: nan, 97: 10.0, 98: 1.0, 99: 1.0}, 'job4_category': {0: nan, 1: nan, 2: nan, 3: nan, 4: nan, 5: nan, 6: nan, 7: nan, 8: nan, 9: nan, 10: 4.0, 11: nan, 12: nan, 13: nan, 14: 4.0, 15: 8.0, 16: nan, 17: 4.0, 18: 3.0, 19: nan, 20: nan, 21: nan, 22: nan, 23: nan, 24: nan, 25: nan, 26: nan, 27: nan, 28: nan, 29: nan, 30: nan, 31: nan, 32: 4.0, 33: nan, 34: nan, 35: nan, 36: 4.0, 37: nan, 38: nan, 39: nan, 40: 4.0, 41: nan, 42: nan, 43: nan, 44: nan, 45: nan, 46: nan, 47: 7.0, 48: nan, 49: nan, 50: 5.0, 51: 8.0, 52: nan, 53: 4.0, 54: nan, 55: nan, 56: 8.0, 57: nan, 58: nan, 59: nan, 60: nan, 61: nan, 62: nan, 63: nan, 64: nan, 65: 8.0, 66: 7.0, 67: nan, 68: nan, 69: nan, 70: 8.0, 71: 5.0, 72: nan, 73: nan, 74: nan, 75: 10.0, 76: nan, 77: 4.0, 78: nan, 79: nan, 80: nan, 81: nan, 82: nan, 83: nan, 84: nan, 85: nan, 86: 10.0, 87: nan, 88: 8.0, 89: nan, 90: nan, 91: nan, 92: nan, 93: nan, 94: 4.0, 95: nan, 96: nan, 97: nan, 98: nan, 99: 5.0}} job1_freq = data_rel1.groupby('job1_category').size() job2_freq = data_rel1.groupby('job2_category').size() job3_freq = data_rel1.groupby('job3_category').size() job4_freq = data_rel1.groupby('job4_category').size() data_freq = pd.concat([job1_freq, job2_freq, job3_freq, job4_freq], axis=1) data_freq.columns = [1,2,3,4] data_freq["prob_1"] = data_freq[1]/sum(data_freq[1]) data_freq["prob_2"] = data_freq[2]/sum(data_freq[2]) data_freq["prob_3"] = data_freq[3]/sum(data_freq[3]) data_freq["prob_4"] = data_freq[4]/sum(data_freq[4])
sum(column) will return nan if one or more of the entries in column are nan. To avoid this problem, use NumPy's np.sum() instead. This will work as expected, provided the missing values in the original data are coded as np.nan: import numpy as np import pandas as pd data = "{'job1_category': {0: nan, 1: 4.0, 2: 5.0, 3: 5.0, 4: 4.0, 5: 4.0, 6: 5.0, 7: 4.0, 8: 4.0, 9: 4.0, 10: 4.0, 11: 4.0, 12: 4.0, 13: 4.0, 14: nan, 15: 4.0, 16: 3.0, 17: 7.0, 18: 4.0, 19: 4.0, 20: nan, 21: nan, 22: 4.0, 23: 1.0, 24: 4.0, 25: 1.0, 26: 4.0, 27: 2.0, 28: 5.0, 29: 3.0, 30: 4.0, 31: 5.0, 32: 4.0, 33: 4.0, 34: 4.0, 35: 4.0, 36: 3.0, 37: 4.0, 38: 4.0, 39: 4.0, 40: 9.0, 41: 4.0, 42: 4.0, 43: 3.0, 44: 4.0, 45: 9.0, 46: 10.0, 47: nan, 48: 10.0, 49: 4.0, 50: 8.0, 51: nan, 52: 5.0, 53: 8.0, 54: 4.0, 55: nan, 56: 4.0, 57: 8.0, 58: 4.0, 59: 4.0, 60: 4.0, 61: 4.0, 62: 8.0, 63: 4.0, 64: 8.0, 65: 7.0, 66: 4.0, 67: 7.0, 68: 8.0, 69: 7.0, 70: nan, 71: 7.0, 72: nan, 73: 10.0, 74: 7.0, 75: 6.0, 76: 7.0, 77: 4.0, 78: 7.0, 79: 7.0, 80: 7.0, 81: 7.0, 82: 4.0, 83: nan, 84: 4.0, 85: nan, 86: 4.0, 87: 9.0, 88: 4.0, 89: 4.0, 90: 4.0, 91: 4.0, 92: 4.0, 93: 7.0, 94: 2.0, 95: 4.0, 96: 4.0, 97: nan, 98: 1.0, 99: 9.0}, 'job2_category': {0: 6.0, 1: 5.0, 2: 4.0, 3: 5.0, 4: 6.0, 5: 4.0, 6: 5.0, 7: 5.0, 8: 2.0, 9: 4.0, 10: 4.0, 11: nan, 12: 5.0, 13: 4.0, 14: nan, 15: 1.0, 16: 4.0, 17: 4.0, 18: 4.0, 19: 4.0, 20: 4.0, 21: 3.0, 22: 4.0, 23: 4.0, 24: 4.0, 25: 4.0, 26: 8.0, 27: 6.0, 28: 6.0, 29: 3.0, 30: 4.0, 31: 5.0, 32: 7.0, 33: 4.0, 34: 4.0, 35: 4.0, 36: 4.0, 37: 4.0, 38: 4.0, 39: 4.0, 40: 4.0, 41: 5.0, 42: 5.0, 43: 4.0, 44: 4.0, 45: 1.0, 46: 10.0, 47: 10.0, 48: 1.0, 49: 4.0, 50: 4.0, 51: nan, 52: 7.0, 53: 4.0, 54: 4.0, 55: 9.0, 56: 9.0, 57: 4.0, 58: 3.0, 59: 4.0, 60: 4.0, 61: nan, 62: 7.0, 63: nan, 64: 7.0, 65: 7.0, 66: 7.0, 67: 7.0, 68: 8.0, 69: 7.0, 70: 7.0, 71: 7.0, 72: nan, 73: 10.0, 74: 8.0, 75: 7.0, 76: 8.0, 77: 5.0, 78: nan, 79: 7.0, 80: nan, 81: 4.0, 82: 7.0, 83: nan, 84: 4.0, 85: nan, 86: 4.0, 87: 4.0, 88: 4.0, 89: 4.0, 90: 10.0, 91: 5.0, 92: nan, 93: 7.0, 94: 4.0, 95: 4.0, 96: 4.0, 97: nan, 98: 1.0, 99: 9.0}, 'job3_category': {0: 5.0, 1: 5.0, 2: 4.0, 3: nan, 4: 10.0, 5: nan, 6: nan, 7: nan, 8: nan, 9: nan, 10: 4.0, 11: 4.0, 12: 5.0, 13: nan, 14: 4.0, 15: 1.0, 16: 4.0, 17: 4.0, 18: 4.0, 19: 4.0, 20: nan, 21: nan, 22: nan, 23: nan, 24: nan, 25: 1.0, 26: 4.0, 27: nan, 28: nan, 29: 3.0, 30: 4.0, 31: 3.0, 32: 4.0, 33: nan, 34: nan, 35: nan, 36: 4.0, 37: nan, 38: nan, 39: 4.0, 40: 7.0, 41: 5.0, 42: nan, 43: 4.0, 44: 4.0, 45: 1.0, 46: 7.0, 47: 1.0, 48: 7.0, 49: 7.0, 50: 4.0, 51: 8.0, 52: nan, 53: 4.0, 54: 7.0, 55: nan, 56: 8.0, 57: nan, 58: nan, 59: 3.0, 60: nan, 61: nan, 62: nan, 63: nan, 64: nan, 65: 7.0, 66: 7.0, 67: 7.0, 68: 7.0, 69: 8.0, 70: 7.0, 71: 4.0, 72: 8.0, 73: nan, 74: 3.0, 75: 10.0, 76: 4.0, 77: 8.0, 78: 8.0, 79: nan, 80: nan, 81: 4.0, 82: 7.0, 83: 4.0, 84: nan, 85: 8.0, 86: 4.0, 87: 4.0, 88: 4.0, 89: nan, 90: 5.0, 91: 7.0, 92: 4.0, 93: nan, 94: 1.0, 95: nan, 96: nan, 97: 10.0, 98: 1.0, 99: 1.0}, 'job4_category': {0: nan, 1: nan, 2: nan, 3: nan, 4: nan, 5: nan, 6: nan, 7: nan, 8: nan, 9: nan, 10: 4.0, 11: nan, 12: nan, 13: nan, 14: 4.0, 15: 8.0, 16: nan, 17: 4.0, 18: 3.0, 19: nan, 20: nan, 21: nan, 22: nan, 23: nan, 24: nan, 25: nan, 26: nan, 27: nan, 28: nan, 29: nan, 30: nan, 31: nan, 32: 4.0, 33: nan, 34: nan, 35: nan, 36: 4.0, 37: nan, 38: nan, 39: nan, 40: 4.0, 41: nan, 42: nan, 43: nan, 44: nan, 45: nan, 46: nan, 47: 7.0, 48: nan, 49: nan, 50: 5.0, 51: 8.0, 52: nan, 53: 4.0, 54: nan, 55: nan, 56: 8.0, 57: nan, 58: nan, 59: nan, 60: nan, 61: nan, 62: nan, 63: nan, 64: nan, 65: 8.0, 66: 7.0, 67: nan, 68: nan, 69: nan, 70: 8.0, 71: 5.0, 72: nan, 73: nan, 74: nan, 75: 10.0, 76: nan, 77: 4.0, 78: nan, 79: nan, 80: nan, 81: nan, 82: nan, 83: nan, 84: nan, 85: nan, 86: 10.0, 87: nan, 88: 8.0, 89: nan, 90: nan, 91: nan, 92: nan, 93: nan, 94: 4.0, 95: nan, 96: nan, 97: nan, 98: nan, 99: 5.0}}" data = eval(data.replace('nan', 'np.nan')) data_rel1 = pd.DataFrame(data) job1_freq = data_rel1.groupby('job1_category').size() job2_freq = data_rel1.groupby('job2_category').size() job3_freq = data_rel1.groupby('job3_category').size() job4_freq = data_rel1.groupby('job4_category').size() data_freq = pd.concat([job1_freq, job2_freq, job3_freq, job4_freq], axis=1) data_freq.columns = [1,2,3,4] data_freq["prob_1"] = data_freq[1] / np.sum(data_freq[1]) data_freq["prob_2"] = data_freq[2] / np.sum(data_freq[2]) data_freq["prob_3"] = data_freq[3] / np.sum(data_freq[3]) data_freq["prob_4"] = data_freq[4] / np.sum(data_freq[4]) data_freq 1 2 3 4 prob_1 prob_2 prob_3 prob_4 1.0 3 4 7.0 NaN 0.034091 0.045455 0.111111 NaN 2.0 2 1 NaN NaN 0.022727 0.011364 NaN NaN 3.0 4 3 4.0 1.0 0.045455 0.034091 0.063492 0.043478 4.0 47 41 25.0 9.0 0.534091 0.465909 0.396825 0.391304 5.0 6 10 5.0 3.0 0.068182 0.113636 0.079365 0.130435 6.0 1 4 NaN NaN 0.011364 0.045455 NaN NaN 7.0 12 14 12.0 2.0 0.136364 0.159091 0.190476 0.086957 8.0 6 4 7.0 6.0 0.068182 0.045455 0.111111 0.260870 9.0 4 3 NaN NaN 0.045455 0.034091 NaN NaN 10.0 3 4 3.0 2.0 0.034091 0.045455 0.047619 0.086957
Try to replace the null value with 0.0 and try your code data_freq.fillna(0.0) I feel like it's not going to affect your computation as well. As (0.0)/sum([.....])=0.0
You can melt the data, then group it to get each invidual count, then group again to also get the total sum. Note that .transform() groups the data but then return the result in the same length as the original dataframe. Sample data: from numpy import nan import pandas as pd d = {'job1_category': {0: nan, 1: 4.0, 2: 5.0, 3: 5.0, 4: 4.0, 5: 4.0, 6: 5.0, 7: 4.0, 8: 4.0, 9: 4.0, 10: 4.0, 11: 4.0, 12: 4.0, 13: 4.0, 14: nan, 15: 4.0, 16: 3.0, 17: 7.0, 18: 4.0, 19: 4.0, 20: nan, 21: nan, 22: 4.0, 23: 1.0, 24: 4.0, 25: 1.0, 26: 4.0, 27: 2.0, 28: 5.0, 29: 3.0, 30: 4.0, 31: 5.0, 32: 4.0, 33: 4.0, 34: 4.0, 35: 4.0, 36: 3.0, 37: 4.0, 38: 4.0, 39: 4.0, 40: 9.0, 41: 4.0, 42: 4.0, 43: 3.0, 44: 4.0, 45: 9.0, 46: 10.0, 47: nan, 48: 10.0, 49: 4.0, 50: 8.0, 51: nan, 52: 5.0, 53: 8.0, 54: 4.0, 55: nan, 56: 4.0, 57: 8.0, 58: 4.0, 59: 4.0, 60: 4.0, 61: 4.0, 62: 8.0, 63: 4.0, 64: 8.0, 65: 7.0, 66: 4.0, 67: 7.0, 68: 8.0, 69: 7.0, 70: nan, 71: 7.0, 72: nan, 73: 10.0, 74: 7.0, 75: 6.0, 76: 7.0, 77: 4.0, 78: 7.0, 79: 7.0, 80: 7.0, 81: 7.0, 82: 4.0, 83: nan, 84: 4.0, 85: nan, 86: 4.0, 87: 9.0, 88: 4.0, 89: 4.0, 90: 4.0, 91: 4.0, 92: 4.0, 93: 7.0, 94: 2.0, 95: 4.0, 96: 4.0, 97: nan, 98: 1.0, 99: 9.0}, 'job2_category': {0: 6.0, 1: 5.0, 2: 4.0, 3: 5.0, 4: 6.0, 5: 4.0, 6: 5.0, 7: 5.0, 8: 2.0, 9: 4.0, 10: 4.0, 11: nan, 12: 5.0, 13: 4.0, 14: nan, 15: 1.0, 16: 4.0, 17: 4.0, 18: 4.0, 19: 4.0, 20: 4.0, 21: 3.0, 22: 4.0, 23: 4.0, 24: 4.0, 25: 4.0, 26: 8.0, 27: 6.0, 28: 6.0, 29: 3.0, 30: 4.0, 31: 5.0, 32: 7.0, 33: 4.0, 34: 4.0, 35: 4.0, 36: 4.0, 37: 4.0, 38: 4.0, 39: 4.0, 40: 4.0, 41: 5.0, 42: 5.0, 43: 4.0, 44: 4.0, 45: 1.0, 46: 10.0, 47: 10.0, 48: 1.0, 49: 4.0, 50: 4.0, 51: nan, 52: 7.0, 53: 4.0, 54: 4.0, 55: 9.0, 56: 9.0, 57: 4.0, 58: 3.0, 59: 4.0, 60: 4.0, 61: nan, 62: 7.0, 63: nan, 64: 7.0, 65: 7.0, 66: 7.0, 67: 7.0, 68: 8.0, 69: 7.0, 70: 7.0, 71: 7.0, 72: nan, 73: 10.0, 74: 8.0, 75: 7.0, 76: 8.0, 77: 5.0, 78: nan, 79: 7.0, 80: nan, 81: 4.0, 82: 7.0, 83: nan, 84: 4.0, 85: nan, 86: 4.0, 87: 4.0, 88: 4.0, 89: 4.0, 90: 10.0, 91: 5.0, 92: nan, 93: 7.0, 94: 4.0, 95: 4.0, 96: 4.0, 97: nan, 98: 1.0, 99: 9.0}, 'job3_category': {0: 5.0, 1: 5.0, 2: 4.0, 3: nan, 4: 10.0, 5: nan, 6: nan, 7: nan, 8: nan, 9: nan, 10: 4.0, 11: 4.0, 12: 5.0, 13: nan, 14: 4.0, 15: 1.0, 16: 4.0, 17: 4.0, 18: 4.0, 19: 4.0, 20: nan, 21: nan, 22: nan, 23: nan, 24: nan, 25: 1.0, 26: 4.0, 27: nan, 28: nan, 29: 3.0, 30: 4.0, 31: 3.0, 32: 4.0, 33: nan, 34: nan, 35: nan, 36: 4.0, 37: nan, 38: nan, 39: 4.0, 40: 7.0, 41: 5.0, 42: nan, 43: 4.0, 44: 4.0, 45: 1.0, 46: 7.0, 47: 1.0, 48: 7.0, 49: 7.0, 50: 4.0, 51: 8.0, 52: nan, 53: 4.0, 54: 7.0, 55: nan, 56: 8.0, 57: nan, 58: nan, 59: 3.0, 60: nan, 61: nan, 62: nan, 63: nan, 64: nan, 65: 7.0, 66: 7.0, 67: 7.0, 68: 7.0, 69: 8.0, 70: 7.0, 71: 4.0, 72: 8.0, 73: nan, 74: 3.0, 75: 10.0, 76: 4.0, 77: 8.0, 78: 8.0, 79: nan, 80: nan, 81: 4.0, 82: 7.0, 83: 4.0, 84: nan, 85: 8.0, 86: 4.0, 87: 4.0, 88: 4.0, 89: nan, 90: 5.0, 91: 7.0, 92: 4.0, 93: nan, 94: 1.0, 95: nan, 96: nan, 97: 10.0, 98: 1.0, 99: 1.0}, 'job4_category': {0: nan, 1: nan, 2: nan, 3: nan, 4: nan, 5: nan, 6: nan, 7: nan, 8: nan, 9: nan, 10: 4.0, 11: nan, 12: nan, 13: nan, 14: 4.0, 15: 8.0, 16: nan, 17: 4.0, 18: 3.0, 19: nan, 20: nan, 21: nan, 22: nan, 23: nan, 24: nan, 25: nan, 26: nan, 27: nan, 28: nan, 29: nan, 30: nan, 31: nan, 32: 4.0, 33: nan, 34: nan, 35: nan, 36: 4.0, 37: nan, 38: nan, 39: nan, 40: 4.0, 41: nan, 42: nan, 43: nan, 44: nan, 45: nan, 46: nan, 47: 7.0, 48: nan, 49: nan, 50: 5.0, 51: 8.0, 52: nan, 53: 4.0, 54: nan, 55: nan, 56: 8.0, 57: nan, 58: nan, 59: nan, 60: nan, 61: nan, 62: nan, 63: nan, 64: nan, 65: 8.0, 66: 7.0, 67: nan, 68: nan, 69: nan, 70: 8.0, 71: 5.0, 72: nan, 73: nan, 74: nan, 75: 10.0, 76: nan, 77: 4.0, 78: nan, 79: nan, 80: nan, 81: nan, 82: nan, 83: nan, 84: nan, 85: nan, 86: 10.0, 87: nan, 88: 8.0, 89: nan, 90: nan, 91: nan, 92: nan, 93: nan, 94: 4.0, 95: nan, 96: nan, 97: nan, 98: nan, 99: 5.0}} df = pd.DataFrame(d) Code: df_melted = df.melt() df_grouped = df_melted.groupby(['variable', 'value'])['value'].count().to_frame() df_grouped['pct'] = df_grouped['value'] / df_grouped.groupby(['variable'])['value'].transform('count') df_grouped['pct'] Output: variable value job1_category 1.0 0.300000 2.0 0.200000 3.0 0.400000 4.0 4.700000 5.0 0.600000 6.0 0.100000 7.0 1.200000 8.0 0.600000 9.0 0.400000 10.0 0.300000 job2_category 1.0 0.400000 2.0 0.100000 3.0 0.300000 4.0 4.100000 5.0 1.000000 6.0 0.400000 7.0 1.400000 8.0 0.400000 9.0 0.300000 10.0 0.400000 job3_category 1.0 1.000000 3.0 0.571429 4.0 3.571429 5.0 0.714286 7.0 1.714286 8.0 1.000000 10.0 0.428571 job4_category 3.0 0.166667 4.0 1.500000 5.0 0.500000 7.0 0.333333 8.0 1.000000 10.0 0.333333 Name: pct, dtype: float64
Panda's qcut possibly rounding?
I'm trying to use Panda's qcut to bin my values in quantile-based buckets. However, when doing so, it's just giving me whole numbers and does not match what I'm expecting. I'm expecting something along the following - in particular not whole numbers: Above was calculated with Excel's QUARTILE.EXC() using the exact same data. Pandas however is just giving me the bins 1,2,3,4. Any ideas? Here is the code: import pandas as pd data = {0: 2.75, 1: 2.875, 2: 3.5, 3: 3.875, 4: 3.125, 5: 2.25, 6: 2.125, 7: 3.375, 8: 3.75, 9: 1.875, 10: 3.125, 11: 2.625, 12: 1.25, 13: 2.625, 14: 2.25, 15: 3.125, 16: 3.375, 17: 2.25, 18: 2.25, 19: 3.125, 20: 3.375, 21: 2.5, 22: 3.375, 23: 3.5, 24: 3.125, 25: 3.0, 26: 2.125, 27: 3.125, 28: 2.375, 29: 2.375, 30: 2.75, 31: 3.0, 32: 2.625, 33: 2.0, 34: 2.75, 35: 3.25, 36: 3.0, 37: 1.5, 38: 3.5, 39: 2.375, 40: 3.375, 41: 2.625, 42: 3.0, 43: 2.5, 44: 2.625, 45: 2.875, 46: 2.25, 47: 2.5, 48: 1.125, 49: 1.625, 50: 1.375, 51: 2.125, 52: 1.625, 53: 2.125, 54: 1.0, 55: 1.5, 56: 1.25, 57: 3.125, 58: 1.125, 59: 1.75} df = pd.Series(data).to_frame('values') n_bins = 4 df['qcutbins'] = pd.qcut(df['values'], q=n_bins, labels=range(1,n_bins+1)).astype('float64') df.groupby(['qcutbins'])['qcutbins'].describe()[['min','max']].sort_values(by='max').reset_index(drop=True)```
it looks like you want something like this instead: df = pd.Series(data).to_frame('values') n_bins = 4 df['qcutbins'] = pd.qcut(df['values'], q=n_bins) df.groupby("qcutbins").agg([min, max]) values min max qcutbins (0.999, 2.125] 1.00 2.125 (2.125, 2.625] 2.25 2.625 (2.625, 3.125] 2.75 3.125 (3.125, 3.875] 3.25 3.875
Seaborn annotate lineplot of projected world population
""" I'm trying to reproduce a plot showing the world population growth from 1950 to 2100. ideally, I'd like to show two different colors under the lineplot, darkgreen from 1950 to 2019 because these are actual data, and lightgreen for the projected data (2019 to 2100) I'd like to annotate specific points corresponding to 1950, 1987, 2019 and 2050. I tried using markers=True but but failed. I'm looking for something like the following plot (without the annual growth rate in red) Thank you in advance for helping me out. """ data = {'Year': {0: 1950, 1: 1951, 2: 1952, 3: 1953, 4: 1954, 5: 1955, 6: 1956, 7: 1957, 8: 1958, 9: 1959, 10: 1960, 11: 1961, 12: 1962, 13: 1963, 14: 1964, 15: 1965, 16: 1966, 17: 1967, 18: 1968, 19: 1969, 20: 1970, 21: 1971, 22: 1972, 23: 1973, 24: 1974, 25: 1975, 26: 1976, 27: 1977, 28: 1978, 29: 1979, 30: 1980, 31: 1981, 32: 1982, 33: 1983, 34: 1984, 35: 1985, 36: 1986, 37: 1987, 38: 1988, 39: 1989, 40: 1990, 41: 1991, 42: 1992, 43: 1993, 44: 1994, 45: 1995, 46: 1996, 47: 1997, 48: 1998, 49: 1999, 50: 2000, 51: 2001, 52: 2002, 53: 2003, 54: 2004, 55: 2005, 56: 2006, 57: 2007, 58: 2008, 59: 2009, 60: 2010, 61: 2011, 62: 2012, 63: 2013, 64: 2014, 65: 2015, 66: 2016, 67: 2017, 68: 2018, 69: 2019, 70: 2020, 71: 2091, 72: 2092, 73: 2093, 74: 2094, 75: 2095, 76: 2096, 77: 2097, 78: 2098, 79: 2099, 80: 2100}, 'billion': {0: 2.5, 1: 2.6, 2: 2.6, 3: 2.7, 4: 2.7, 5: 2.8, 6: 2.8, 7: 2.9, 8: 2.9, 9: 3.0, 10: 3.0, 11: 3.1, 12: 3.2, 13: 3.2, 14: 3.3, 15: 3.3, 16: 3.4, 17: 3.5, 18: 3.6, 19: 3.6, 20: 3.7, 21: 3.8, 22: 3.9, 23: 3.9, 24: 4.0, 25: 4.1, 26: 4.2, 27: 4.2, 28: 4.3, 29: 4.4, 30: 4.5, 31: 4.5, 32: 4.6, 33: 4.7, 34: 4.8, 35: 4.9, 36: 5.0, 37: 5.1, 38: 5.1, 39: 5.2, 40: 5.3, 41: 5.4, 42: 5.5, 43: 5.6, 44: 5.7, 45: 5.7, 46: 5.8, 47: 5.9, 48: 6.0, 49: 6.1, 50: 6.1, 51: 6.2, 52: 6.3, 53: 6.4, 54: 6.5, 55: 6.5, 56: 6.6, 57: 6.7, 58: 6.8, 59: 6.9, 60: 7.0, 61: 7.0, 62: 7.1, 63: 7.2, 64: 7.3, 65: 7.4, 66: 7.5, 67: 7.5, 68: 7.6, 69: 7.7, 70: 7.8, 71: 10.8, 72: 10.8, 73: 10.8, 74: 10.8, 75: 10.9, 76: 10.9, 77: 10.9, 78: 10.9, 79: 10.9, 80: 10.9}} df = pd.DataFrame(data) print(df) import pandas as pd import numpy as np import matplotlib.pyplot as plt import matplotlib.ticker as ticker import seaborn as sns fig,ax = plt.subplots(figsize=(10,8)) sns.lineplot(x='Year',y='billion',data=df,ax=ax,color='b') ax.set_ylim([2,11]) plt.fill_between(df['Year'].values, df['billion'].values,color='lightgreen') plt.text(1950,2.5,'2.5 Billion\nin 1950',horizontalalignment='left') plt.text(1987,5,'5 Billion\nin 1987',horizontalalignment='right') plt.text(2019,7.7,'7.7 Billion\nin 2019',horizontalalignment='right') plt.text(2050,9.7,'9.7 Billion\nin 2050',horizontalalignment='right') ax.spines['top'].set_visible(False) ax.spines['left'].set_visible(False)#hiding y spine plt.gca().axes.get_yaxis().set_visible(False) #hiding y axis ax.spines['right'].set_visible(False) plt.show() plt.close() """ This is what I got so far """
You can fill between years using where=: ax.fill_between(df['Year'], df['billion'], color='darkgreen', where=df['Year'] <= 2019) ax.fill_between(df['Year'], df['billion'], color='lightgreen', where=df['Year'] >= 2019) You can interpolate values for the years with np.interp(): marked_years = [1950, 1987, 2019, 2050] ax.scatter(marked_years, np.interp(marked_years, df['Year'], df['billion']), marker='o', color='black', s=50) In a similar way the texts could be placed: for year, value in zip(marked_years, np.interp(marked_years, df['Year'], df['billion'])): ax.text(year, value, f'{value:.1f} Billion\nin {year}\n', ha='left' if year < 1970 else 'right', va='bottom') Optionally you set tick marks for the x-axis every 10 years, and leave out the padding: ax.xaxis.set_major_locator(ticker.MultipleLocator(10)) ax.margins(x=0, tight=True) # zero padding for the x-axis```
Visualization for rolling percentage for accuracy
Is there a way to visualize the results below? 29% of the results has accuracy >= 90% 59% of the results has accuracy >= 80% 88% of the results has accuracy >= 70% 98% of the results has accuracy >= 60% 2 entries with 59% and 57% Below is the code with test result (accuracy) and rolling percentage df1 = pd.DataFrame( {'Test Results': {0: 1.0, 1: 1.0, 2: 1.0, 3: 1.0, 4: 1.0, 5: 1.0, 6: 1.0, 7: 1.0, 8: 1.0, 9: 1.0, 10: 1.0, 11: 1.0, 12: 1.0, 13: 1.0, 14: 1.0, 15: 1.0, 16: 1.0, 17: 1.0, 18: 1.0, 19: 1.0, 20: 0.9452757159999999, 21: 0.9450125009999999, 22: 0.941873717, 23: 0.9336287990000001, 24: 0.932937845, 25: 0.928728552, 26: 0.9217334709999999, 27: 0.9212251640000001, 28: 0.903416839, 29: 0.900519655, 30: 0.8946950090000001, 31: 0.894315668, 32: 0.893705918, 33: 0.8911087870000001, 34: 0.88341769, 35: 0.875316697, 36: 0.873369453, 37: 0.8724485759999999, 38: 0.870855835, 39: 0.8682100159999999, 40: 0.866413987, 41: 0.866225758, 42: 0.86607366, 43: 0.861137576, 44: 0.8570423090000001, 45: 0.855989443, 46: 0.8363952240000001, 47: 0.835153771, 48: 0.831573322, 49: 0.8297349890000001, 50: 0.827174152, 51: 0.82533598, 52: 0.8244730259999999, 53: 0.8231749909999999, 54: 0.821542323, 55: 0.81994456, 56: 0.8193151340000001, 57: 0.817544275, 58: 0.8043684740000001, 59: 0.804280654, 60: 0.804033467, 61: 0.8024588559999999, 62: 0.799949648, 63: 0.7998811370000001, 64: 0.7993836879999999, 65: 0.7978083340000001, 66: 0.794987453, 67: 0.7942367459999999, 68: 0.788060922, 69: 0.7833455859999999, 70: 0.783237411, 71: 0.781810415, 72: 0.77947656, 73: 0.775430937, 74: 0.769843183, 75: 0.763548283, 76: 0.76318018, 77: 0.75891367, 78: 0.7588845790000001, 79: 0.758045414, 80: 0.7545263740000001, 81: 0.7514679670000001, 82: 0.74697755, 83: 0.74597623, 84: 0.743042806, 85: 0.740900129, 86: 0.740844543, 87: 0.7246525220000001, 88: 0.7133019070000001, 89: 0.7124335999999999, 90: 0.703856728, 91: 0.694475843, 92: 0.68836286, 93: 0.683978596, 94: 0.679002735, 95: 0.664945699, 96: 0.662761826, 97: 0.649950991, 98: 0.638550239, 99: 0.60593566, 100: 0.603891537, 101: 0.602900777, 102: 0.594660442, 103: 0.565978017}, 'Rolling Percentage': {0: 0.009615385, 1: 0.019230768999999998, 2: 0.028846154, 3: 0.038461537999999997, 4: 0.048076923, 5: 0.057692308, 6: 0.067307692, 7: 0.076923077, 8: 0.086538462, 9: 0.096153846, 10: 0.10576923099999999, 11: 0.115384615, 12: 0.125, 13: 0.134615385, 14: 0.144230769, 15: 0.153846154, 16: 0.16346153800000002, 17: 0.173076923, 18: 0.182692308, 19: 0.192307692, 20: 0.201923077, 21: 0.21153846199999998, 22: 0.22115384600000002, 23: 0.23076923100000002, 24: 0.240384615, 25: 0.25, 26: 0.259615385, 27: 0.269230769, 28: 0.278846154, 29: 0.288461538, 30: 0.298076923, 31: 0.307692308, 32: 0.317307692, 33: 0.326923077, 34: 0.33653846200000004, 35: 0.346153846, 36: 0.355769231, 37: 0.365384615, 38: 0.375, 39: 0.384615385, 40: 0.394230769, 41: 0.403846154, 42: 0.41346153799999996, 43: 0.423076923, 44: 0.43269230799999997, 45: 0.44230769200000003, 46: 0.451923077, 47: 0.46153846200000004, 48: 0.471153846, 49: 0.480769231, 50: 0.490384615, 51: 0.5, 52: 0.509615385, 53: 0.519230769, 54: 0.528846154, 55: 0.538461538, 56: 0.548076923, 57: 0.557692308, 58: 0.567307692, 59: 0.576923077, 60: 0.586538462, 61: 0.596153846, 62: 0.605769231, 63: 0.615384615, 64: 0.625, 65: 0.634615385, 66: 0.644230769, 67: 0.653846154, 68: 0.663461538, 69: 0.673076923, 70: 0.682692308, 71: 0.692307692, 72: 0.701923077, 73: 0.711538462, 74: 0.721153846, 75: 0.7307692309999999, 76: 0.740384615, 77: 0.75, 78: 0.759615385, 79: 0.7692307690000001, 80: 0.778846154, 81: 0.788461538, 82: 0.798076923, 83: 0.807692308, 84: 0.817307692, 85: 0.826923077, 86: 0.836538462, 87: 0.846153846, 88: 0.8557692309999999, 89: 0.865384615, 90: 0.875, 91: 0.884615385, 92: 0.8942307690000001, 93: 0.903846154, 94: 0.913461538, 95: 0.923076923, 96: 0.932692308, 97: 0.942307692, 98: 0.951923077, 99: 0.961538462, 100: 0.971153846, 101: 0.9807692309999999, 102: 0.990384615, 103: 1.0}} )