I'm trying to make a Sankey-plot using Plotly, which follows the filtering of certain documents into either being in scope or out of scope, i.e. 1 source, 2 targets, however some documents are filtered during step 1, some during step 2 etc. This leads to the following Sankey-plot:
Current output
Now what I would ideally like is for it to look something like this:
Ideal output
I've already looked through the documentation at https://plot.ly/python/reference/#sankey, but I can't find what I'm looking for. Ideally I would like a way to prevent nodes and links from overlapping in the plot.
This is the code I'm using to generate the plot object:
def genSankeyPlotObject(df, cat_cols=None, value_cols='', visible=False):
    """Build a Plotly ``go.Sankey`` trace from successive categorical columns.

    Every adjacent pair of columns in ``cat_cols`` contributes
    source -> target links; the weight of a link is the sum of ``value_cols``
    over all rows sharing that (source, target) pair.

    Args:
        df: pandas DataFrame holding the category columns and the value column.
        cat_cols: Ordered list of category column names (at least two).
            ``None`` (the default) is treated as an empty list.
        value_cols: Name of the column summed per link; it is also used as
            the hover value suffix.
        visible: Passed through unchanged to ``go.Sankey(visible=...)``.

    Returns:
        The ``go.Sankey`` trace.

    Raises:
        ValueError: If fewer than two category columns are given.
    """
    cat_cols = [] if cat_cols is None else list(cat_cols)
    if len(cat_cols) < 2:
        raise ValueError("cat_cols must name at least two columns")

    ### COLOR PALETTE TO USE
    # Written with a leading '#' so every entry is a standard hex colour string.
    colorPalette = ['#472d3c', '#5e3643', '#7a444a', '#a05b53', '#bf7958', '#eea160', '#f4cca1', '#b6d53c',
                    '#71aa34', '#397b44', '#3c5956', '#302c2e', '#5a5353', '#7d7071', '#a0938e', '#cfc6b8',
                    '#dff6f5', '#8aebf1', '#28ccdf', '#3978a8', '#394778', '#39314b', '#564064', '#8e478c',
                    '#cd6093', '#ffaeb6', '#f4b41b', '#f47e1b', '#e6482e', '#a93b3b', '#827094', '#4f546b']

    ### CREATES LABEL LIST FROM DEFINED COLUMNS
    # First-seen order per column, then de-duplicated across columns, so the
    # node order is the same on every run.
    labelList = []
    for catCol in cat_cols:
        labelList.extend(list(df[catCol].unique()))
    labelList = list(dict.fromkeys(labelList))

    ### ONE COLOR PER NODE
    # Repeat the palette often enough to cover every node, whatever the number
    # of category columns.
    colorNum = len(labelList)
    repeats = max(1, math.ceil(colorNum / len(colorPalette)))
    TempcolorPallet = colorPalette * repeats
    shuffle(TempcolorPallet)
    colorList = TempcolorPallet[:colorNum]

    ### TRANSFORMS DF INTO SOURCE -> TARGET PAIRS
    linkFrames = []
    for srcCol, dstCol in zip(cat_cols, cat_cols[1:]):
        # Copy so that renaming the columns never touches the caller's frame.
        pairDf = df[[srcCol, dstCol, value_cols]].copy()
        pairDf.columns = ['source', 'target', 'count']
        linkFrames.append(pairDf)
    sourceTargetDf = pd.concat(linkFrames)
    sourceTargetDf = sourceTargetDf.groupby(['source', 'target']).agg({'count': 'sum'}).reset_index()

    ### ADDING INDEX TO SOURCE -> TARGET PAIRS
    # Dictionary lookup instead of a linear list.index() scan per row.
    labelIndex = {label: position for position, label in enumerate(labelList)}
    sourceTargetDf['sourceID'] = [labelIndex[label] for label in sourceTargetDf['source']]
    sourceTargetDf['targetID'] = [labelIndex[label] for label in sourceTargetDf['target']]

    ### CREATES THE SANKEY PLOT OBJECT
    data = go.Sankey(
        node=dict(
            pad=15,
            thickness=20,
            line=dict(color="black", width=0.5),
            label=labelList,
            color=colorList,
        ),
        link=dict(
            source=sourceTargetDf['sourceID'],
            target=sourceTargetDf['targetID'],
            value=sourceTargetDf['count'],
        ),
        valuesuffix=' ' + value_cols,
        visible=visible,
    )
    return data
Related
I would like to update a network graph using a datetime range slider with Bokeh. So appearing/disappearing nodes depending on the datetime range, and also width of edges is proportional to the number of connections between sources and target within datetime range.
So far here is my code:
# Number of rows per (src, dst) pair, as a Series named "nb_conn".
nb_conn = df.groupby(['src', 'dst'])['src'].count()
nb_conn = nb_conn.rename("nb_conn")
nb_conn_tot = nb_conn.sum()
# Share of all rows carried by each pair, in percent.  The Series is still
# named "nb_conn", so the column merged in below holds this percentage.
ratio_nb_conn = (nb_conn / nb_conn_tot) * 100
netflow_feat = df.merge(ratio_nb_conn, on=["src", "dst"])

# Graph with one edge per src/dst pair, carrying the "nb_conn" attribute.
G = nx.from_pandas_edgelist(netflow_feat, source='src', target='dst', edge_attr='nb_conn')

# Store each node's degree as a node attribute.
degrees = dict(nx.degree(G))
nx.set_node_attributes(G, name='degree', values=degrees)

# Offset added to the degree before it is stored as 'adjusted_node_size'.
# (The original computed and stored this attribute twice; once is enough.)
number_to_adjust_by = 5
adjusted_node_size = {node: degree + number_to_adjust_by for node, degree in degrees.items()}
nx.set_node_attributes(G, name='adjusted_node_size', values=adjusted_node_size)
# Node attributes used to size and colour the node glyphs.  A constant size
# (e.g. 10) or colour (e.g. 'skyblue') may be used instead.
size_by_this_attribute = 'adjusted_node_size'
color_by_this_attribute = 'adjusted_node_size'
# Colour palette: Blues8, Reds8, Purples8, Oranges8 or Viridis8.
color_palette = Blues8
title = 'Cibles Network'

# Fields shown when hovering over a node.  Bokeh tooltips reference data
# columns with an "@" prefix; "#index" would be displayed as literal text.
HOVER_TOOLTIPS = [
    ("IP", "@index"),
]

# Create the plot: dimensions, toolbar and title.
plot = figure(tooltips=HOVER_TOOLTIPS,
              tools="pan,wheel_zoom,save,reset", active_scroll='wheel_zoom',
              x_range=Range1d(-20.1, 20.1), y_range=Range1d(-20.1, 20.1), title=title)

# Create the network graph renderer using a spring layout.
# https://networkx.github.io/documentation/networkx-1.9/reference/generated/networkx.drawing.layout.spring_layout.html
network_graph = from_networkx(G, nx.spring_layout, scale=20, center=(0, 0))

# Size and colour the nodes by the chosen attributes; the colour is mapped
# linearly onto the palette between the attribute's minimum and maximum.
node_data = network_graph.node_renderer.data_source.data
minimum_value_color = min(node_data[color_by_this_attribute])
maximum_value_color = max(node_data[color_by_this_attribute])
network_graph.node_renderer.glyph = Circle(
    # Previously size_by_this_attribute was defined but never applied.
    size=size_by_this_attribute,
    fill_color=linear_cmap(color_by_this_attribute, color_palette, minimum_value_color, maximum_value_color),
)

# Edge opacity and width: one width per edge, taken from its "nb_conn" attribute.
network_graph.edge_renderer.data_source.data["line_width"] = [
    G.get_edge_data(a, b)['nb_conn'] for a, b in G.edges()
]
network_graph.edge_renderer.glyph = MultiLine(line_alpha=0.5)
network_graph.edge_renderer.glyph.line_width = {'field': 'line_width'}
plot.renderers.append(network_graph)

# Independent copy of the edge data taken before any callback modifies it.
backup_edge_data = copy.deepcopy(network_graph.edge_renderer.data_source.data)
# Source text handed to CustomJS as its `code` argument.
# NOTE(review): as written this string is not runnable JavaScript. It uses
# `#` as a comment marker and pandas-style expressions (`df.loc[...]`,
# boolean-mask indexing), contains a free-text placeholder line, and assigns
# `new_data_edge`, which is never defined inside the snippet. The filtering
# still has to be written in plain JavaScript against the objects exposed
# through `args` -- TODO implement.
code = """
# print out array of date from, date to
console.log(cb_obj.value);
# dates returned from slider are not at round intervals and include time;
const date_from = Date.parse(new Date(cb_obj.value[0]).toDateString());
const date_to = Date.parse(new Date(cb_obj.value[1]).toDateString());
const old_Weight = df["nb_conn"];
const old_start = df.loc[start];
const old_end = df.loc[end];
const df_filtered = df[(df['timestamp'] >= date_from) & (df['timestamp'] <= date_to)]
What should I do here???
graph_setup.edge_renderer.data_source.data = new_data_edge;
graph_setup.edge_renderer.data_source.change.emit();
"""
# Objects made available by name inside the JavaScript callback.
# `start` / `end` are the actual earliest and latest timestamps: the methods
# are now called, where before the bound methods themselves were passed.
# NOTE(review): `df` is a pandas DataFrame; presumably it has to be wrapped
# in a Bokeh data source before the browser can read it -- confirm.
callback = CustomJS(
    args=dict(
        graph_setup=network_graph,
        df=netflow_feat,
        start=netflow_feat['timestamp'].min(),
        end=netflow_feat['timestamp'].max(),
    ),
    code=code,
)

# Initial selection lies inside the slider range.  The upper bound was dated
# 2022, i.e. before the lower bound and outside start/end.
datetime_range_slider = DatetimeRangeSlider(
    value=(datetime(2023, 1, 5, 12), datetime(2023, 1, 6, 18)),
    start=datetime(2023, 1, 5),
    end=datetime(2023, 1, 7),
)
datetime_range_slider.js_on_change("value", callback)

layout = Column(plot, datetime_range_slider)
show(layout)
In the callback function, is it supposed to be only javascript? I guess my callback function is not correct but I don't know how to do what I'd like to do, or is it even possible?
So, I'm working with a dataset of stores, each store with its lat, lng, name and category.
Since we are talking about several hundred or even thousands of stores, I'm using marker clusters, and they are working fine.
Now, I need to also set these stores in different layers based on their category, so that when I click on say "electronics stores", I only get those stores in the map (and they should be removed from the marker cluster as well)
Consider this sample data:
# Sample records, one tuple per store: (latitude, longitude, name, category).
stores = [
    (-23.5578906, -46.6665546, 'store1', 'electronics'),
    (-23.562711, -46.674363, 'store2', 'home goods'),
    (-23.5642399, -46.6681833, 'store3', 'beauty'),
    (-23.584167, -46.678497, 'store4', 'electronics'),
    (-23.5956238, -46.6865377, 'store5', 'electronics'),
    (-23.5868682, -46.6773554, 'store6', 'home goods'),
    (-23.6011096, -46.6739275, 'store7', 'beauty'),
    (-23.6087354, -46.6973713, 'store8', 'home goods'),
    (-23.5943515, -46.6846959, 'store9', 'beauty'),
]
My code works ok for putting the markers in clusters, but when I try to also add them to layers based on their categories it doesn't work. I get no errors, and the map "loads", but the markers and clusters don't get displayed, and I get no layers on the map.
This is my code:
# Base map; the tile layer is added separately so that it stays out of the
# layer control (control=False).
mymap = folium.Map(location=[y_map, x_map], zoom_start=11, tiles=None)
folium.TileLayer(name="Mapbox Bright", control=False).add_to(mymap)

markers_list = []
all_gp = []
# One MarkerCluster per category, created the first time the category is seen.
# Each cluster is its own named layer, so the layer control can show or hide
# a category while its markers remain clustered.
category_clusters = {}

for lat, lng, name, category in zip(df_stores['LAT'],
                                    df_stores['LNG'],
                                    df_stores['NAME'],
                                    df_stores['CATEGORY']):
    html = '''NAME: ''' + name + '''<br>CATEGORY: ''' + category
    iframe = folium.IFrame(html, width=300, height=130)
    popup = folium.Popup(iframe, max_width=300)
    lead_marker = folium.Marker(
        [lat, lng],
        popup=popup,
        icon=folium.Icon(color='purple', icon='glyphicon-cutlery', prefix='glyphicon'),
    )
    markers_list.append(lead_marker)
    all_gp.append(category)

    cluster = category_clusters.get(category)
    if cluster is None:
        cluster = MarkerCluster(name=category).add_to(mymap)
        category_clusters[category] = cluster
    # Every marker gets exactly one parent layer.  Previously each marker was
    # added both to a shared cluster and to a feature group, and the feature
    # group was re-added to the map once per marker.
    lead_marker.add_to(cluster)

folium.LayerControl().add_to(mymap)
mymap.add_child(MeasureControl())
mymap.render()
mymap.save('stores.html')
I took the code between the lines of ############ from another post here (How to add categorical layered data to LayerControl() in python Folium map?) and adapted it to my code, but it seems I'm missing something. If I remove the last for loop from the code, the map loads correctly and its clusters work fine. Any suggestions?
I am answering on the understanding that the goal is to create one layer per category, add each store's marker to the layer for its category, and show or hide those layers with a layer control. First, create one layer per category in advance. Then, for each row of the data frame, build the marker and its pop-up from that row's columns and add the marker to the layer that matches the row's category.
import pandas as pd
import numpy as np
import folium
from folium.plugins import MarkerCluster
# Sample records, one tuple per store: (latitude, longitude, name, category).
stores = [
    (-23.5578906, -46.6665546, 'store1', 'electronics'),
    (-23.562711, -46.674363, 'store2', 'home goods'),
    (-23.5642399, -46.6681833, 'store3', 'beauty'),
    (-23.584167, -46.678497, 'store4', 'electronics'),
    (-23.5956238, -46.6865377, 'store5', 'electronics'),
    (-23.5868682, -46.6773554, 'store6', 'home goods'),
    (-23.6011096, -46.6739275, 'store7', 'beauty'),
    (-23.6087354, -46.6973713, 'store8', 'home goods'),
    (-23.5943515, -46.6846959, 'store9', 'beauty'),
]
df = pd.DataFrame(stores, columns=['LAT', 'LNG', 'NAME', 'CATEGORY'])

# Map centred on the mean store position.
map_centre = [df['LAT'].mean(), df['LNG'].mean()]
mymap = folium.Map(location=map_centre, zoom_start=12)

# One marker cluster per category; each is a separate entry in the layer control.
mCluster_hg = MarkerCluster(name="home goods").add_to(mymap)
mCluster_ele = MarkerCluster(name="electronics").add_to(mymap)
mCluster_bea = MarkerCluster(name="beauty").add_to(mymap)

# Category value -> cluster that receives markers of that category.
cluster_by_category = {
    'electronics': mCluster_ele,
    'home goods': mCluster_hg,
    'beauty': mCluster_bea,
}

for _index, lat, lng, store_name, category in df.itertuples():
    pin = folium.Icon(color='purple', icon='glyphicon-cutlery', prefix='glyphicon')
    popup_frame = folium.IFrame(
        'NAME: ' + store_name + '<br>CATEGORY: ' + category,
        width=300,
        height=130,
    )
    store_marker = folium.Marker(
        location=(lat, lng),
        popup=folium.Popup(popup_frame, max_width=300),
        icon=pin,
    )
    # Rows whose category has no cluster are built but not placed on the map.
    target_cluster = cluster_by_category.get(category)
    if target_cluster is not None:
        target_cluster.add_child(store_marker)

folium.LayerControl().add_to(mymap)
mymap
I want to keep the labels when you hover, but hide the labels from just appearing over the Sankey as text.
Here is my code:
labels = df_mapping['Name'].to_numpy().tolist() + labels
count_dict = {}
source = []
target = []
value = df_subset['Stuff'].to_numpy().tolist()
index = 0
for x in unique_broad:
count_dict[x] = len(df_mapping.loc[df_mapping['Stuff'] == x])
for key in count_dict:
for i in range(count_dict[key]):
source.append(index)
index += 1
for key in count_dict:
for i in range(count_dict[key]):
target.append(index)
index += 1
number_of_colors = len(source)
color_link = ["#"+''.join([random.choice('0123456789ABCDEF') for j in range(6)])
for i in range(number_of_colors)]
link = dict(source=source, target=target, value=value, color=color_link)
node = dict(label=labels, pad=35, thickness=10)
data = go.Sankey(link=link, node=node)
fig = go.Figure(data)
fig.update_layout(
hovermode = 'x',
title="Sankey for Stuff",
font=dict(size=8, color='white'),
paper_bgcolor='#51504f'
)
return fig
You can make the labels invisible by setting the color of the labels to rgba(0,0,0,0). This ensures that the label will remain in the hovertemplate, but not show up on the nodes.
To do this you can pass textfont=dict(color="rgba(0,0,0,0)", size=1) to go.Sankey such as in the example you used from the Plotly sankey diagram documentation:
import plotly.graph_objects as go
import urllib.request, json
url = 'https://raw.githubusercontent.com/plotly/plotly.js/master/test/image/mocks/sankey_energy.json'
# Download the mock figure; the context manager closes the HTTP response
# even if reading or parsing fails.
with urllib.request.urlopen(url) as response:
    data = json.loads(response.read())

# The first (and only used) trace of the mock, with its node and link specs.
trace = data['data'][0]
node_spec = trace['node']
link_spec = trace['link']

# override gray link colors with 'source' colors
opacity = 0.4
# change 'magenta' to its 'rgba' value to add opacity
node_spec['color'] = ['rgba(255,0,255, 0.8)' if color == "magenta" else color
                      for color in node_spec['color']]
# Each link takes its source node's colour with the alpha "0.8" replaced by `opacity`.
link_spec['color'] = [node_spec['color'][src].replace("0.8", str(opacity))
                      for src in link_spec['source']]

fig = go.Figure(data=[go.Sankey(
    # Fully transparent 1px label text: labels disappear from the diagram
    # but are still part of the hover information.
    textfont=dict(color="rgba(0,0,0,0)", size=1),
    valueformat=".0f",
    valuesuffix="TWh",
    # Define nodes
    node=dict(
        pad=15,
        thickness=15,
        line=dict(color="black", width=0.5),
        label=node_spec['label'],
        color=node_spec['color'],
    ),
    # Add links
    link=dict(
        source=link_spec['source'],
        target=link_spec['target'],
        value=link_spec['value'],
        label=link_spec['label'],
        color=link_spec['color'],
    ))])
fig.update_layout(title_text="Energy forecast for 2050<br>Source: Department of Energy & Climate Change, Tom Counsell via <a href='https://bost.ocks.org/mike/sankey/'>Mike Bostock</a>",
                  font_size=10)
fig.show()
You get the following:
I need to modify the following code, which prints a bar graph of the cluster populations. Briefly, it stores all values in NumPy arrays and then prints the bar histogram (the number of conformations in each cluster on Y, and an inherent value of the cluster, its energy, on X) using the Tkinter module, which does not seem to be a very practical solution.
# Builds per-cluster (energy, size) data, draws it as an interactive
# histogram inside a Tk top-level window and exports the canvas to a
# PostScript file.  `Tkinter` (capitalised) is the Python 2 module name.
# The snippet relies on names defined elsewhere: d, mol, cut_off, elist,
# HistogramRI, InteractiveHistogramGraph, outputfilename.

# Tk root window.  NOTE(review): `r` is rebound to a (min, max) tuple further
# down, so this reference to the root is lost.
r = Tkinter.Tk()
dataList = []
reverseList = []
rLctr = 0
# NOTE(review): confL and e are not used again in this snippet.
confL = d.ch.conformations
e = d.clusterer.energy_used
#for l in mol.cluSEQ:
# One [energy of the first member, member count] entry per cluster at this cut-off.
for l in d.clusterer.clustering_dict[cut_off]:
    dataList.append([l[0].energy, len(l)])
    # NOTE(review): rLctr is never advanced in this snippet, so every range
    # starts at 0 -- confirm whether an increment by len(l) was dropped.
    reverseList.append(range(rLctr, rLctr+len(l)))
# NOTE(review): `elist` is not defined in this snippet.
mol.elist = numpy.array(elist)
# [minimum, maximum] of the energy array.
mol.r = [numpy.minimum.reduce(mol.elist),
         numpy.maximum.reduce(mol.elist)]
# Tk variables holding the bin count (10) and the range limits as strings.
mol.nbins = Tkinter.IntVar()
mol.nbins.set(10)
mol.min = Tkinter.StringVar()
mol.min.set(str(mol.r[0]))
mol.max = Tkinter.StringVar()
mol.max.set(str(mol.r[1]))
# Histogram range read back from the Tk variables as floats.
r = (float(mol.min.get()), float(mol.max.get()))
mol.ehist = HistogramRI(mol.elist,mol.nbins.get(),range=r)
mol.ehist.createReverseIndex()
# NOTE(review): nodeList is not used again in this snippet.
nodeList = mol.ehist.array
tstr = mol.name + ' histogram'
top = Tkinter.Toplevel()
top.title(tstr)
# Bare expression: evaluates the attribute and discards it.
mol.ehist
#top = Tkinter.Toplevel()
# NOTE(review): there is no space between 'ENERGY' and 'clusterized', so the
# label reads "ENERGYclusterized with ...".
xlabel = 'ENERGY'+ 'clusterized with ' + str(cut_off) + 'A'
# The y label is spelled one letter per line via embedded newlines.
mol.clustNB = InteractiveHistogramGraph(mol.name,
    master=top, nodeList = dataList, reverseIndex=reverseList,
    xlabel_text=xlabel,
    ylabel_text='#\nC\nO\nN\nF\nO\nR\nM\nA\nT\nI\nO\nN\nS')
# Update the drawing canvas, then write it out as a colour PostScript file.
mol.clustNB.draw.update()
mol.clustNB.draw.postscript({'file':outputfilename, 'colormode':'color'})
top.update_idletasks()
Could you suggest a simple way to convert it to matplotlib so that I can control all of the printing options?
I have a dictionary with multiple key defined as (arbitrary inputs):
# Two colour tables, keyed by colour name and then by cluster name; the
# cluster's entry in each table starts out as an empty array.
colors = {
    'red': {clustname: np.array([])},
    'blue': {clustname: np.array([])},
}
Basically, I want to plot a red vs. blue graph for each 'cluster'. I have 13 'clusters' in total, with differing color values for each. The names in my code are different from the arbitrary ones above, but I figured it would be easier to understand with basic values than to look at the overall code:
colpath = '/home/jacob/PHOTOMETRY/RESTFRAME_COLOURS/'  # This is the path to the restframe colors

goodcolindx = {}
colfiledat = {}
# Per-quantity tables, each keyed by cluster name.
colors = {'UMINV': {}, 'VMINJ': {}, 'NUVMINV': {}, 'id': {}}

for iclust, cname in enumerate(clustname):
    # Read the three catalogues for this cluster: photometry, redshift
    # compilation and rest-frame colours.
    filepath = catpath + cname + "_totalall_" + extname[iclust] + ".cat"
    photdat[cname] = ascii.read(filepath)
    filepath = zpath + "compilation_" + cname + ".dat"
    zdat[cname] = ascii.read(filepath)
    colfilepath = colpath + 'RESTFRAME_MASTER_' + cname + '_indivredshifts.cat'
    colfiledat[cname] = ascii.read(colfilepath)

    phot = photdat[cname]
    zcat = zdat[cname]
    rest = colfiledat[cname]

    # Rows with 0.9 < REDSHIFTUSED < 1.5, totmask == 0, K_flag == 0 and a
    # redshift quality of 3 or 4.  The three tables are combined row by row,
    # as in the original selection.
    selected = ((rest['REDSHIFTUSED'] > 0.9) &
                (rest['REDSHIFTUSED'] < 1.5) &
                (phot['totmask'] == 0) &
                (phot['K_flag'] == 0) &
                ((zcat['quality'] == 3) | (zcat['quality'] == 4)))
    good = np.where(selected)[0]
    goodcolindx[cname] = good

    # Select all good rows at once instead of growing the arrays with
    # np.append one element at a time.  dtype=float matches what appending
    # to an empty np.array([]) produced.
    colors['NUVMINV'][cname] = np.array(
        -2.5 * np.log10(rest['NUV'][good] / rest['V'][good]), dtype=float)
    colors['UMINV'][cname] = np.array(rest['UMINV'][good], dtype=float)
    colors['id'][cname] = np.array(phot['id'][good], dtype=float)
    colors['VMINJ'][cname] = np.array(rest['VMINJ'][good], dtype=float)

# Loop over the cluster names.  Iterating over `colors` yields its string
# keys ('UMINV', 'VMINJ', ...), which cannot be used to index `clustname`.
for cname in clustname:
    plt.plot(colors['VMINJ'][cname], colors['UMINV'][cname], 'ko')
# NOTE(review): a single show() after the loop draws every cluster on the
# same axes; use a separate figure per cluster if separate plots are wanted.
plt.show()
So in this case, my 'red' is the VMINJ and my 'blue' is the UMINV. I am trying to use a for loop to cycle through all the cluster names that I have, but I keep getting the error back 'String indices must be integers'. I understand the basics of that, but don't know how to fix my code to make plots for each 'red' v 'blue' for each cluster. Any help would be awesome, let me know if you have questions
I figured it out. I changed the for loop to:
for iclust in range(len(clustname)):
plt.plot(colors['UMINV'][clustname[iclust]]....
and that worked