Related
Say I have the following DataFrame() where I have repeated observations per individual (column id_ind). Hence, the first two rows belong to the first individual, the third and fourth rows belong to the second individual, and so forth...
import pandas as pd
# Example inputs, all sharing the row index 0..9:
#   X   -> columns x1_1 .. x3_3
#   Y   -> column CHOICE
#   Z   -> columns z1, z2, z3
#   ids -> columns id_choice and id_ind (two consecutive rows per id_ind)
X = pd.DataFrame.from_dict({
    'x1_1': {0: -0.1766214634108258, 1: 1.645852185286492, 2: -0.13348860101031038, 3: 1.9681043689968933, 4: -1.7004428240831382, 5: 1.4580091413853749, 6: 0.06504113741068565, 7: -1.2168493676768384, 8: -0.3071304478616376, 9: 0.07121332925591593},
    'x1_2': {0: -2.4207773498298844, 1: -1.0828751040719462, 2: 2.73533787008624, 3: 1.5979611987152071, 4: 0.08835542172064115, 5: 1.2209786277076156, 6: -0.44205979195950784, 7: -0.692872860268244, 8: 0.0375521181289943, 9: 0.4656030062266639},
    'x1_3': {0: -1.548320898226322, 1: 0.8457342014424675, 2: -0.21250514722879738, 3: 0.5292389938329516, 4: -2.593946520223666, 5: -0.6188958526077123, 6: 1.6949245117526974, 7: -1.0271341091035742, 8: 0.637561891142571, 9: -0.7717170035055559},
    'x2_1': {0: 0.3797245517345564, 1: -2.2364391598508835, 2: 0.6205947900678905, 3: 0.6623865847688559, 4: 1.562036259999875, 5: -0.13081282910947759, 6: 0.03914373833251773, 7: -0.995761652421108, 8: 1.0649494418154162, 9: 1.3744782478849122},
    'x2_2': {0: -0.5052556836786106, 1: 1.1464291788297152, 2: -0.5662380273138174, 3: 0.6875729143723538, 4: 0.04653136473130827, 5: -0.012885303852347407, 6: 1.5893672346098884, 7: 0.5464286050059511, 8: -0.10430829457707284, 9: -0.5441755265313813},
    'x2_3': {0: -0.9762973303149007, 1: -0.983731467806563, 2: 1.465827578266328, 3: 0.5325950414202745, 4: -1.4452121324204903, 5: 0.8148816373643869, 6: 0.470791989780882, 7: -0.17951636294180473, 8: 0.7351814781280054, 9: -0.28776723200679066},
    'x3_1': {0: 0.12751822396637064, 1: -0.21926633684030983, 2: 0.15758799357206943, 3: 0.5885412224632464, 4: 0.11916562911189271, 5: -1.6436210334529249, 6: -0.12444368631987467, 7: 1.4618564171802453, 8: 0.6847234328916137, 9: -0.23177118858569187},
    'x3_2': {0: -0.6452955690715819, 1: 1.052094761527654, 2: 0.20190339195326157, 3: 0.6839430295237913, 4: -0.2607691613858866, 5: 0.3315513026670213, 6: 0.015901139336566113, 7: 0.15243420084881903, 8: -0.7604225072161022, 9: -0.4387652927008854},
    'x3_3': {0: -1.067058994377549, 1: 0.8026914180717286, 2: -1.9868531745912268, 3: -0.5057770735303253, 4: -1.6589569342151713, 5: 0.358172252880764, 6: 1.9238983803281329, 7: 2.2518318810978246, 8: -1.2781475121874357, 9: -0.7103081175166167},
})
Y = pd.DataFrame.from_dict({
    'CHOICE': {0: 1.0, 1: 1.0, 2: 2.0, 3: 2.0, 4: 3.0, 5: 2.0, 6: 1.0, 7: 1.0, 8: 2.0, 9: 2.0},
})
Z = pd.DataFrame.from_dict({
    'z1': {0: 2.4196730570917233, 1: 2.4196730570917233, 2: 2.822802255159467, 3: 2.822802255159467, 4: 2.073171091633643, 5: 2.073171091633643, 6: 2.044165101485163, 7: 2.044165101485163, 8: 2.4001241292606275, 9: 2.4001241292606275},
    'z2': {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0, 4: 1.0, 5: 1.0, 6: 1.0, 7: 1.0, 8: 0.0, 9: 0.0},
    'z3': {0: 1.0, 1: 1.0, 2: 1.0, 3: 1.0, 4: 2.0, 5: 2.0, 6: 2.0, 7: 2.0, 8: 3.0, 9: 3.0},
})
# Named `ids` rather than `id` so the builtin id() stays usable in this module.
ids = pd.DataFrame.from_dict({
    'id_choice': {0: 1.0, 1: 2.0, 2: 3.0, 3: 4.0, 4: 5.0, 5: 6.0, 6: 7.0, 7: 8.0, 8: 9.0, 9: 10.0},
    'id_ind': {0: 1.0, 1: 1.0, 2: 2.0, 3: 2.0, 4: 3.0, 5: 3.0, 6: 4.0, 7: 4.0, 8: 5.0, 9: 5.0},
})
# Create a dataframe with all the data (identifiers first, then X, Z and Y,
# placed side by side and aligned on the shared row index)
data = pd.concat([ids, X, Z, Y], axis=1)
print(data.head(4))
# id_choice id_ind x1_1 x1_2 x1_3 x2_1 x2_2 \
# 0 1.0 1.0 -0.176621 -2.420777 -1.548321 0.379725 -0.505256
# 1 2.0 1.0 1.645852 -1.082875 0.845734 -2.236439 1.146429
# 2 3.0 2.0 -0.133489 2.735338 -0.212505 0.620595 -0.566238
# 3 4.0 2.0 1.968104 1.597961 0.529239 0.662387 0.687573
#
# x2_3 x3_1 x3_2 x3_3 z1 z2 z3 CHOICE
# 0 -0.976297 0.127518 -0.645296 -1.067059 2.419673 0.0 1.0 1.0
# 1 -0.983731 -0.219266 1.052095 0.802691 2.419673 0.0 1.0 1.0
# 2 1.465828 0.157588 0.201903 -1.986853 2.822802 0.0 1.0 2.0
# 3 0.532595 0.588541 0.683943 -0.505777 2.822802 0.0 1.0 2.0
I want to perform two operations.
First, I want to convert the DataFrame data into a dictionary of DataFrame()s where the keys are the number of individuals (in this particular case, numbers ranging from 1.0 to 5.0.). I've done this below as suggested here. Unfortunately, I am getting a dictionary of numpy values and not a dictionary of DataFrame()s.
# Create a dictionary with the data for each individual.
# Step 1: move id_ind into the index so it is not part of the row values.
indexed_by_individual = data.set_index('id_ind')
# Step 2: turn every individual's rows into a plain list of lists.
rows_per_individual = indexed_by_individual.groupby('id_ind').apply(
    lambda group: group.to_numpy().tolist()
)
# Step 3: map each id_ind value to its list of rows.
data_dict = rows_per_individual.to_dict()
print(data_dict.keys())
# dict_keys([1.0, 2.0, 3.0, 4.0, 5.0])
print(data_dict[1.0])
#[[1.0, -0.1766214634108258, -2.4207773498298844, -1.548320898226322, 0.3797245517345564, -0.5052556836786106, -0.9762973303149007, 0.12751822396637064, -0.6452955690715819, -1.067058994377549, 2.4196730570917233, 0.0, 1.0, 1.0], [2.0, 1.645852185286492, -1.0828751040719462, 0.8457342014424675, -2.2364391598508835, 1.1464291788297152, -0.983731467806563, -0.21926633684030983, 1.052094761527654, 0.8026914180717286, 2.4196730570917233, 0.0, 1.0, 1.0]]
Second, I want to recover the original DataFrame data reversing the previous operation. The naive approach is as follows. However, it is, of course, not producing the expected result.
# Naive approach: one output row per dictionary key. As the printed result
# below shows, each cell then holds an entire row list instead of a scalar.
res = pd.DataFrame.from_dict(
    data=data_dict,
    orient='index',
)
print(res)
# 0 1
#1.0 [1.0, -0.1766214634108258, -2.4207773498298844... [2.0, 1.645852185286492, -1.0828751040719462, ...
#2.0 [3.0, -0.13348860101031038, 2.73533787008624, ... [4.0, 1.9681043689968933, 1.5979611987152071, ...
#3.0 [5.0, -1.7004428240831382, 0.08835542172064115... [6.0, 1.4580091413853749, 1.2209786277076156, ...
#4.0 [7.0, 0.06504113741068565, -0.4420597919595078... [8.0, -1.2168493676768384, -0.692872860268244,...
#5.0 [9.0, -0.3071304478616376, 0.0375521181289943,... [10.0, 0.07121332925591593, 0.4656030062266639...
This solution was inspired by @mozway's comments.
# Create a dictionary with the data for each individual.
# Iterating a groupby yields (key, sub-DataFrame) pairs; every sub-DataFrame
# keeps the index labels its rows had in `data`.
data_dict = {key: group for key, group in data.groupby('id_ind')}
# Convert the dictionary back into a dataframe.
# Sorting on the preserved index labels (instead of discarding them with
# reset_index) restores the original row order even when the rows of one
# individual were not contiguous, provided the index of `data` was sorted to
# begin with (e.g. the default RangeIndex). A stable sort keeps the relative
# order of rows that share an index label.
res = pd.concat(list(data_dict.values()), axis=0).sort_index(kind='mergesort')
print(res.head(4))
# id_choice id_ind x1_1 x1_2 x1_3 x2_1 x2_2 \
#0 1.0 1.0 -0.176621 -2.420777 -1.548321 0.379725 -0.505256
#1 2.0 1.0 1.645852 -1.082875 0.845734 -2.236439 1.146429
#2 3.0 2.0 -0.133489 2.735338 -0.212505 0.620595 -0.566238
#3 4.0 2.0 1.968104 1.597961 0.529239 0.662387 0.687573
#
# x2_3 x3_1 x3_2 x3_3 z1 z2 z3 CHOICE
#0 -0.976297 0.127518 -0.645296 -1.067059 2.419673 0.0 1.0 1.0
#1 -0.983731 -0.219266 1.052095 0.802691 2.419673 0.0 1.0 1.0
#2 1.465828 0.157588 0.201903 -1.986853 2.822802 0.0 1.0 2.0
#3 0.532595 0.588541 0.683943 -0.505777 2.822802 0.0 1.0 2.0
Could someone explain to me how the method average_neighbor_degree from NetworkX works?
For normal (undirected) graphs the results match my intuition and my hand calculation, but I have a problem with directed graphs - I don't understand how it's calculated, and my hand calculation gives different results than this method.
My graph:
[Example directed graph]
I've run average_neighbor_degree with every possible combination with params source and target which implies basic attributes for directed graph. Here's my code and results:
# Directed example graph, given as (tail, head) edge pairs.
edges = [(0, 3), (1, 3), (2, 4), (3, 5), (3, 6), (4, 6), (6, 5), (4, 3)]
G = nx.DiGraph(edges)
# Print the average neighbor degree for every source/target direction pair,
# in the order in-in, in-out, out-in, out-out.
for source_kind in ("in", "out"):
    for target_kind in ("in", "out"):
        degrees = nx.average_neighbor_degree(G, source=source_kind, target=target_kind)
        print(f'{source_kind}-{target_kind}: {degrees}')
in-in: {0: 3.0, 3: 1.3333333333333333, 1: 3.0, 2: 1.0, 4: 5.0, 5: 0.0, 6: 1.0}
in-out: {0: 2.0, 3: 0.3333333333333333, 1: 2.0, 2: 2.0, 4: 3.0, 5: 0.0, 6: 0.0}
out-in: {0: 3.0, 3: 2.0, 1: 3.0, 2: 1.0, 4: 2.5, 5: 0.0, 6: 2.0}
out-out: {0: 2.0, 3: 0.5, 1: 2.0, 2: 2.0, 4: 1.5, 5: 0.0, 6: 0.0}
Looking at this graph I don't understand how this is calculated and from where those results come from. Could someone explain it please?
Environment:
Python 3.8.8
NetworkX 2.5
I think you ran into this issue. The old code contained some weird normalisation (setting to 1 if respective degree is 0). This was changed in 2.6. The latest version, see code here should work as intended.
Input1:
d = {1: 0.2, 2: 0.4, 3: 0.5, 4: 0.5, 5: 0.5, 6: 0.7, 7: 0.7, 8: 0.8, 9: 0.95, 10:1}
L = [0.48, 0.72]
Expected result1: [3, 6]
Input2:
d = {1: 0.9, 2: 0.88, 3: 0.88, 4: 0.76, 5: 0.76, 6: 0.7, 7: 0.5, 8: 0.44, 9: 0.2, 10:0}
L = [0.48, 0.77]
Expected result2: [7, 4]
You can assume the values in d are sorted (always increasing or always decreasing as the key gets bigger).
If there are values that are equal, return the smallest key.
I'm able to do this in O(n), is there a better way to use binary search to do this?
One possible solution - without thorough testing - but it's not better than O(n):
def find_closest(source: dict, targets: list) -> list:
    """Return, for each target, the key of ``source`` whose value is closest.

    ``source`` is scanned in iteration order and the scan stops as soon as
    the distance to the target starts growing, so the values are expected to
    be monotonic (never decreasing or never increasing along the iteration
    order). When several entries are equally close, the first one
    encountered wins.

    Args:
        source: Mapping of key -> numeric value.
        targets: Numeric values to look up.

    Returns:
        One key per target, in the order of ``targets``. An empty list when
        ``source`` is empty, because there is nothing to match against.
    """
    if not source:
        return []

    result = []
    for target in targets:
        best_key = None
        best_distance = None
        for key, val in source.items():
            distance = abs(target - val)
            # Compare against None explicitly: an exact match has distance
            # 0.0, which is falsy, and must not be mistaken for "nothing
            # found yet" (that would let the next entry overwrite it).
            if best_distance is None or distance < best_distance:
                best_distance = distance
                best_key = key
            elif distance > best_distance:
                # Moving away from the target; later entries are no closer.
                break
        result.append(best_key)
    return result
if __name__ == '__main__':
    # Each case pairs a mapping with monotonic values and the targets to
    # look up in it; results are printed one list per case, in this order.
    cases = [
        ({1: 0.2, 2: 0.4, 3: 0.5, 4: 0.5, 5: 0.5, 6: 0.7, 7: 0.7, 8: 0.8, 9: 0.95, 10: 1},
         [0.48, 0.72]),
        ({1: 0.9, 2: 0.88, 3: 0.88, 4: 0.76, 5: 0.76, 6: 0.7, 7: 0.5, 8: 0.44, 9: 0.2, 10: 0},
         [0.48, 0.77]),
        ({1: 0.1, 2: 0.2, 3: 0.3, 4: 0.4, 5: 0.5, 6: 0.6, 7: 0.7, 8: 0.8, 9: 0.9, 10: 1.0},
         [0.48, 0.77]),
        ({1: 0.9, 2: 0.8, 3: 0.7, 4: 0.6, 5: 0.5, 6: 0.4, 7: 0.3, 8: 0.2, 9: 0.1, 10: 0.0},
         [0.48, 0.77]),
    ]
    for mapping, wanted in cases:
        print(find_closest(mapping, wanted))
[3, 6]
[7, 4]
[5, 8]
[5, 2]
I have constructed a Bokeh stacked bar chart with the code below. The chart shows the different tree types for the districts of Copenhagen. At the moment I have a HoverTool which shows the exact number of trees (corresponding to the columns with the tree names) for each tree type, but I also want it to show the percentage (the columns with _p at the end). How can I do this with the stacked bar chart?
A reduced part of the data frame:
temp=pd.DataFrame( {'bydelsnavn': {0: 'Amager Vest', 1: 'Amager Øst', 2: 'Bispebjerg', 3: 'Brønshøj-Husum', 4: 'Indre By', 5: 'Nørrebro', 6: 'Valby', 7: 'Vanløse', 8: 'Vesterbro', 9: 'Østerbro'}, 'Alder': {0: 53.0, 1: 21.0, 2: 1.0, 3: 9.0, 4: 4.0, 5: 2.0, 6: 3.0, 7: 44.0, 8: 46.0, 9: 59.0}, 'Alderm': {0: 63.0, 1: 32.0, 2: 49.0, 3: 13.0, 4: 45.0, 5: 55.0, 6: 104.0, 7: 0.0, 8: 50.0, 9: 4.0}, 'Apple': {0: 94.0, 1: 109.0, 2: 115.0, 3: 12.0, 4: 22.0, 5: 81.0, 6: 41.0, 7: 3.0, 8: 132.0, 9: 51.0}, 'Alder_p': {0: 21.9, 1: 8.68, 2: 0.41, 3: 3.72, 4: 1.65, 5: 0.83, 6: 1.24, 7: 18.18, 8: 19.01, 9: 24.38}, 'Alderm_p': {0: 15.18, 1: 7.71, 2: 11.81, 3: 3.13, 4: 10.84, 5: 13.25, 6: 25.06, 7: 0.0, 8: 12.05, 9: 0.96}, 'Apple_p': {0: 14.24, 1: 16.52, 2: 17.42, 3: 1.82, 4: 3.33, 5: 12.27, 6: 6.21, 7: 0.45, 8: 20.0, 9: 7.73}})
My code:
# Columns holding the tree counts that get stacked, and the matching
# percentage columns (suffix _p) from the same data frame.
treeName = ['Alder','Alderm','Apple']
treeName_p = ['Alder_p','Alderm_p','Apple_p']
# One named colour per stacked column
colornames = named.__all__
colornames = colornames[:len(treeName)]
# Create an empty figure
p = figure(x_range = temp['bydelsnavn'].values,plot_width = 700, plot_height=400,
           title='Tree pr. district', toolbar_sticky = False,
           tools = 'pan,wheel_zoom,reset')
# Stacked bar chart (returns one renderer per stacked column)
renderers = p.vbar_stack(stackers=treeName,x='bydelsnavn',source=temp,
                         width=0.8, color = colornames)
# Add the hover tool: one HoverTool per renderer so that each bar segment
# only reports its own column.
for r in renderers:
    tree = r.name
    # Bokeh tooltips reference data-source columns with "@"; "@{...}" is the
    # braced form of a column reference. A leading "#" would just be shown
    # as literal text.
    hover = HoverTool(tooltips=[
        ("%s" % tree, "@{%s}" % tree)
    ], renderers = [r])
    p.add_tools(hover)
# remove the grid
p.xgrid.grid_line_color=None
p.ygrid.grid_line_color=None
# Make sure bars start at 0
p.y_range.start = 0
# remove - y-axis
p.yaxis.visible = False
# Remove the grey box around the plot
p.outline_line_color = None
# Turn the x-labels
p.xaxis.major_label_orientation = 0.5
# Remove tool bar logo
p.toolbar.logo = None
# Move the border of the left side to show "Amager"
p.min_border_left = 30
show(p)
My current chart looks like this:
Assuming that the values of the _p columns are actually in the data source, you can just add another tooltip to the HoverTool:
# One HoverTool per stacked renderer, each showing the count and the
# percentage of its own tree type.
for r in renderers:
    tree = r.name
    # Tooltip fields reference data-source columns with "@":
    #   "@$name"      -> the column whose name equals the hovered renderer's name
    #   f"@{tree}_p"  -> the matching percentage column, e.g. "@Alder_p"
    p.add_tools(HoverTool(tooltips=[(tree, "@$name"),
                                    (f"{tree} %", f"@{tree}_p")],
                          renderers=[r]))
Notice how @$name is used in there - it is not strictly necessary in this particular case, but it sometimes comes in handy.
This question is related to this one from SO (matplotlib-change-colormap-tab20-to-have-three-colors)
I would like to tweak the tab10 colormap so that I can change the alpha level of each color in as many steps as I like. Below is an example (for 9 colors with 3 alpha levels) which does not yield the expected output. Furthermore, it is not generic enough (because of the if/elif statements).
Any ideas how I could do that ?
In this example, I do have 3 groups with 3 subgroups:
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
n_feature = 3
sub_feature = 3
# Alpha values assigned to consecutive colours, repeating every three entries.
alpha_cycle = (0.9, 0.7, 0.5)
# Take the first n_feature*sub_feature tab10 entries as mutable RGBA lists and
# overwrite the alpha channel (last entry) following alpha_cycle.
# NOTE(review): every entry comes from a different tab10 index, so each line
# gets its own hue rather than sharing one hue per group.
col = []
for position in range(n_feature * sub_feature):
    rgba = list(plt.cm.tab10(position))
    rgba[-1] = alpha_cycle[position % len(alpha_cycle)]
    col.append(rgba)
gr = df.groupby(['a', 'a1'])
# One line per (a, a1) group, drawn from the first row of the group using
# every column from the third one onwards.
for index, (name, val) in enumerate(gr):
    y = val.iloc[0, 2:].values
    x = np.arange(len(y))
    plt.plot(x, y, '.-', color=col[index])
plt.show()
This is the data:
{'a': {0: 'A', 1: 'A', 2: 'A', 3: 'B', 4: 'B', 5: 'B', 6: 'C', 7: 'C', 8: 'C'},
'a1': {0: 1, 1: 2, 2: 3, 3: 1, 4: 2, 5: 3, 6: 1, 7: 2, 8: 3},
'b': {0: 1.0,
1: 5.0,
2: 9.0,
3: 1.5,
4: 5.5,
5: 9.5,
6: 1.75,
7: 5.75,
8: 9.75},
'c': {0: 2.0,
1: 6.0,
2: 10.0,
3: 2.5,
4: 6.5,
5: 10.5,
6: 2.75,
7: 6.75,
8: 10.75},
'd': {0: 3.0,
1: 7.0,
2: 11.0,
3: 3.5,
4: 7.5,
5: 11.5,
6: 3.75,
7: 7.75,
8: 11.75},
'e': {0: 4.0,
1: 8.0,
2: 12.0,
3: 4.5,
4: 8.5,
5: 12.5,
6: 4.75,
7: 8.75,
8: 12.75}}
You may use the HSV system to obtain differently saturated and luminated colors for the same hue. Suppose you have at most 10 categories, then the tab10 map can be used to get a certain number of base colors. From those you can choose a couple of lighter shades for the subcategories.
The following would be a function categorical_cmap, which takes as input the number of categories (nc) and the number of subcategories (nsc) and returns a colormap with nc*nsc different colors, where for each category there are nsc colors of same hue.
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors
def categorical_cmap(nc, nsc, cmap="tab10", continuous=False):
    """Build a colormap with ``nsc`` shades for each of ``nc`` base colours.

    Args:
        nc: Number of categories (base colours taken from ``cmap``).
        nsc: Number of subcategories (shades generated per base colour).
        cmap: Colormap passed to ``plt.get_cmap`` to obtain the base colours.
        continuous: If true, sample the base colormap at ``nc`` evenly spaced
            positions in [0, 1]; otherwise use the integer indices
            ``0 .. nc-1``.

    Returns:
        A ``ListedColormap`` with ``nc * nsc`` RGB colours, grouped by
        category.

    Raises:
        ValueError: If ``nc`` exceeds the number of colours in ``cmap``.
    """
    base = plt.get_cmap(cmap)
    if nc > base.N:
        raise ValueError("Too many categories for colormap.")

    positions = np.linspace(0, 1, nc) if continuous else np.arange(nc, dtype=int)
    base_colors = base(positions)

    palette = np.zeros((nc * nsc, 3))
    for category, rgba in enumerate(base_colors):
        # Work in HSV so the hue stays fixed while the shade changes.
        hue, saturation, value = matplotlib.colors.rgb_to_hsv(rgba[:3])
        shades_hsv = np.column_stack((
            np.full(nsc, hue),
            # Saturation runs from the base colour's towards 0.25 ...
            np.linspace(saturation, 0.25, nsc),
            # ... while the value (brightness) runs towards 1.
            np.linspace(value, 1, nsc),
        ))
        start = category * nsc
        palette[start:start + nsc, :] = matplotlib.colors.hsv_to_rgb(shades_hsv)

    return matplotlib.colors.ListedColormap(palette)
# Three demo colormaps: (categories, subcategories) = (4, 3), (2, 5), (5, 4).
c1 = categorical_cmap(4, 3, cmap="tab10")
c2 = categorical_cmap(2, 5, cmap="tab10")
c3 = categorical_cmap(5, 4, cmap="tab10")
# Draw one row of markers per colormap: c1 at y=2, c2 at y=1, c3 at y=0,
# with one marker for every colour in the map.
for colormap, n_colors, row in ((c1, 4*3, 2), (c2, 10, 1), (c3, 20, 0)):
    positions = np.arange(n_colors)
    heights = np.full(n_colors, float(row))
    plt.scatter(positions, heights, c=positions, s=180, cmap=colormap)
plt.margins(y=0.3)
plt.xticks([])
# Label each row with the (categories, subcategories) pair it shows.
plt.yticks([0, 1, 2], ["(5, 4)", "(2, 5)", "(4, 3)"])
plt.show()