Create plotly scattermapbox from pandas dataframe - python

I would like to create a scattermapbox for Indonesia showing various statistics (population, GDP, etc.) on a regional basis.
I am working with a geopandas file from github.
The example on the plotly website creates multiple files for each layer and then uses the github link as source.
#republican counties
source = 'https://raw.githubusercontent.com/plotly/datasets/master/florida-red-data.json'
#democrat counties
source = 'https://raw.githubusercontent.com/plotly/datasets/master/florida-blue-data.json'
My question therefore is: how can I use the pandas dataframe to create a layer dict for every region and use that as a source (also colouring each region by specific values in other dataframes)?
Should that not be possible at all, and it is necessary to create a separate file for each region, how would I do that? My attempt (lines 16-20) doesn't seem to work.
import pandas as pd
import json
import string
import plotly
from plotly.graph_objs import Scattermapbox, Layout
# Load the Indonesia GeoJSON from GitHub into a pandas object.
ID_regions = pd.read_json('https://raw.githubusercontent.com/N1x0/indonesia-geojson/master/indonesia-edit.geojson')
# Collect the name of every region. Each entry of ID_regions['features'] is
# indexed as a feature dict carrying a 'properties' -> 'name' value.
# The loop variable is `region`; the earlier version looked it up under the
# undefined name `state`, which raised a NameError.
region_names = [region['properties']['name'] for region in ID_regions['features']]
print(region_names)
# Writes one JSON file per row of ID_regions (reported by the author as not
# producing a usable result).
def create_region_files(output_dir='C:\\Users\\nicho\\Desktop\\Waste Management\\Map_Maker\\ID_regions'):
    """Dump every row of ``ID_regions`` to its own JSON file.

    Arguments:
        output_dir: Directory the files are written to. Defaults to the
                    path that was previously hard-coded, so existing calls
                    without arguments behave as before.

    The file for row ``i`` is named after ``region_names[i]``, so
    ``region_names`` must hold at least as many entries as ``ID_regions``
    has rows.
    """
    # The loop itself advances `i`; the manual `i += 1` that used to sit at
    # the end of the body had no effect and has been dropped.
    for i in range(len(ID_regions)):
        region_data = ID_regions.iloc[i, :]
        region_data.to_json(f'{output_dir}\\{region_names[i]}.json')
def create_Chloropleth():
    """Build a Mapbox figure with two GeoJSON fill layers and write it to HTML.

    A single marker trace is combined with a layout whose ``mapbox.layers``
    holds two GeoJSON sources fetched by URL: one filled green, one filled
    red. The figure is written to 'Chloropleth_Province_Population.html'
    via ``plotly.offline.plot``.
    """
    mapbox_access_token = 'My Access Key'
    data = [
        Scattermapbox(
            lat=['45.5017'],
            lon=['-73.5673'],
            mode='markers',
        )
    ]
    layout = Layout(
        height=900,
        autosize=True,
        showlegend=False,
        hovermode='closest',
        mapbox=dict(
            layers=[
                dict(
                    sourcetype='geojson',
                    source='https://raw.githubusercontent.com/N1x0/indonesia-geojson/master/indonesia-edit.geojson',
                    type='fill',
                    color='green'
                ),
                dict(
                    sourcetype='geojson',
                    source='https://raw.githubusercontent.com/N1x0/indonesia-geojson/master/west-sulawesi.json',
                    # Was ' fill' with a leading space, which is not one of
                    # the accepted layer types.
                    type='fill',
                    color='red',
                )
            ],
            accesstoken=mapbox_access_token,
            bearing=0,
            center=dict(
                lat=0.7893,
                lon=113.9213
            ),
            pitch=0,
            zoom=4.5,
            style='light'
        ),
    )
    fig = dict(data=data, layout=layout)
    plotly.offline.plot(fig, filename='Chloropleth_Province_Population.html')
create_Chloropleth()
Thank you for the help!

OK, it took me a while, but I figured it all out. Big thanks to Emma Grimaldi over at Medium and Vince Pota; their posts were what helped me through most of it.
So here are the answers to my own question, in order:
It is not necessary to create an individual file for each region, i.e. you can use a pandas dataframe to match the names of the regions in the JSON and that will work just fine.
with open('indonesia-en.geojson') as f:
geojson = json.load(f)
def make_sources(downsample = 0):
    """Wrap every feature of the loaded ``geojson`` in its own FeatureCollection.

    Works on a deep copy of ``geojson['features']`` so the loaded data is
    left untouched. When ``downsample`` is greater than zero, each feature's
    geometry is replaced by every ``downsample``-th point of
    ``coordinates[0][0]``.

    Returns a list with one ``dict(type='FeatureCollection', features=[...])``
    per feature.
    """
    def _thin(feature):
        # Only touch the geometry when thinning was requested.
        if downsample > 0:
            ring = np.array(feature['geometry']['coordinates'][0][0])
            ring = ring[::downsample]
            feature['geometry']['coordinates'] = [[ring]]
        return feature

    features = copy.deepcopy(geojson['features'])  # do not overwrite the original data
    return [dict(type = 'FeatureCollection', features = [_thin(feature)])
            for feature in features]
So you just extract the coordinates from the geojson and append them to a list of dicts ([{}]).
How to use this list to dynamically create layers:
# Placeholder for the Mapbox access token.
MAPBOX_APIKEY = "Your API Key"
# Marker trace: one size-1 marker per (lat, lon) pair, colored by
# scatter_colors. It also carries the color bar, whose range is
# minpop..maxpop divided by 1,000,000 and whose title is
# 'Population in Millions'. Hover shows only the supplied text.
data = dict(type='scattermapbox',
lat=lats,
lon=lons,
mode='markers',
text=hover_text,
marker=dict(size=1,
color=scatter_colors,
showscale = True,
cmin = minpop/1000000,
cmax = maxpop/1000000,
colorscale = colorscale,
colorbar = dict(
title='Population in Millions'
)
),
showlegend=False,
hoverinfo='text'
)
# Two layers per province, both reading sources[k]: first all black
# 1-wide 'line' layers (placed below "water"), then all 'fill' layers
# colored with scatter_colors[k] at opacity 0.8.
layers=([dict(sourcetype = 'geojson',
source =sources[k],
below="water",
type = 'line', # the borders
line = dict(width = 1),
color = 'black',
) for k in range(n_provinces) # where n_provinces = len(geojson['features'])
] +
[dict(sourcetype = 'geojson',
source =sources[k],
type = 'fill', # the area inside the borders
color = scatter_colors[k],
opacity=0.8
) for k in range(n_provinces) # where n_provinces = len(geojson['features'])
]
)
So the solution here is to set source = sources[k], i.e. the k-th entry of the list of dicts created in make_sources().
How to color the layers accordingly: color=scatter_colors[k]
Using the linked example I used 3 functions
3.1 scalarmappable
# Builds the value -> color mapper from the minimum and maximum values
def scalarmappable(cmap, cmin, cmax):
    """Return a ``cm.ScalarMappable`` for colormap ``cmap`` normalized over ``cmin``..``cmax``.

    The upper bound handed to ``Normalize`` is ``cmax`` plus 10 percent of
    ``cmax``; the author notes that without this padding the most populous
    region doesn't get colored.
    """
    chosen_map = cm.get_cmap(cmap)
    padded_max = cmax+(cmax*0.10)
    scaling = Normalize(vmin=cmin, vmax=padded_max)
    return cm.ScalarMappable(norm=scaling, cmap=chosen_map)
3.2 scatter_colors
# Turns each value into an 'rgba(...)' color string; NaN values become grey
def get_scatter_colors(sm, df):
    """Return one color string per value in ``df``.

    Arguments:
        sm: Object with a ``to_rgba(value, bytes=..., alpha=...)`` method.
        df: Iterable of numeric values.

    Values for which ``np.isnan`` is true map to a fixed grey; every other
    value maps to ``'rgba' + str(sm.to_rgba(value, bytes=True, alpha=1))``.
    """
    grey = 'rgba(128,128,128,1)'
    colors = []
    for value in df:
        if np.isnan(value):
            colors.append(grey)
        else:
            colors.append('rgba' + str(sm.to_rgba(value, bytes = True, alpha = 1)))
    return colors
3.3 colorscale
# Pairs evenly spaced positions in [0, 1] with the colors of evenly spaced values
def get_colorscale(sm, df, cmin, cmax):
    """Return a list of ``[position, color]`` pairs.

    Arguments:
        sm: Object with a ``to_rgba(value, bytes=...)`` method.
        df: Only its length is used; it sets the number of stops.
        cmin: Value mapped to the first stop.
        cmax: Value mapped to the last stop.

    Positions are ``len(df)`` evenly spaced numbers from 0 to 1; each color
    is ``'rgba' + str(sm.to_rgba(value, bytes=True))`` for the matching
    evenly spaced value between ``cmin`` and ``cmax``.
    """
    n_stops = len(df)
    positions = np.linspace(0, 1, n_stops)
    stop_values = np.linspace(cmin, cmax, n_stops)
    scale = []
    for position, value in zip(positions, stop_values):
        scale.append([position, 'rgba' + str(sm.to_rgba(value, bytes = True))])
    return scale
Then variables using the functions are set
#assigning values
# Name of the matplotlib colormap handed to scalarmappable().
colormap = 'nipy_spectral'
# Smallest and largest value of the 'population' column of stats.
minpop = stats['population'].min()
maxpop = stats['population'].max()
# One FeatureCollection per feature, without downsampling.
sources = make_sources(downsample=0)
# get_centers() is not shown here; presumably it returns the marker
# longitudes and latitudes per region - TODO confirm.
lons, lats = get_centers()
sm = scalarmappable(colormap, minpop, maxpop)
scatter_colors = get_scatter_colors(sm, stats['population'])
# Only len(stats) is used by get_colorscale(), as the number of stops.
colorscale = get_colorscale(sm, stats, minpop, maxpop)
# get_hover_text() is not shown here.
hover_text = get_hover_text(stats['population'])
So if anyone has had similar problems, I hope this answer can help you progress :)

Related

editing sizes in plotly scatter

Here's a sample of my simple plotly chart:
# Animated scatter of testdf: marker size comes from 'some_size', color from
# 'sth_colorful', one animation frame per 'year_of_course'; both axis ranges
# are fixed.
px.scatter(data_frame=testdf,
x = 'some_x',
y = 'some_y',
size = 'some_size',
color = 'sth_colorful',
title = 'stackoverflow',
range_x = [-10, 1100],
range_y = [-0.08, 1],
hover_name = 'sth_i_want_to_check',
animation_frame = 'year_of_course'
)
It is almost good. One thing that bothers me is that the 'some_size' min value is about 20 and the max is about 35, so it is hard to notice the size difference between the circles.
Is there a way I can manage the diameter of these? I'd like to keep the original values to appear in a hover.
When you pass the argument size = 'some_size' to px.scatter, this directly determines the size of the markers as they render on the plot.
However, what you can do to get around this is create a new column called "display_size" with your intended sizes for the markers, and set customdata to df['some_size'] so you can access these values in the hovertemplate.
After that, you can loop through all of the traces in your px.scatter and modify the hovertemplate to display some_size instead of display_size.
For example:
## derive a separate column that drives the rendered marker size, so the
## original 'some_size' values stay available for the hover text
## (the frame is called testdf; the earlier `test_df` was an undefined name)
## NOTE(review): a constant factor may not change the relative marker sizes
## once plotly rescales them - consider a transform that widens the spread,
## e.g. subtracting an offset; confirm against the px.scatter size docs
testdf["display_size"] = 2*testdf["some_size"]
fig = px.scatter(data_frame=testdf,
                 x = 'some_x',
                 y = 'some_y',
                 size = 'display_size',
                 color = 'sth_colorful',
                 title = 'stackoverflow',
                 range_x = [-10, 1100],
                 range_y = [-0.08, 1],
                 hover_name = 'sth_i_want_to_check',
                 animation_frame = 'year_of_course'
                 )
## attach the real size values to the traces so the hover can read them
fig.update_traces(customdata=testdf["some_size"])
## replace marker.size in the hovertemplate with your customdata (the actual size values)
for trace in fig.data:
    trace['hovertemplate'] = trace['hovertemplate'].replace('marker.size','customdata')

How to change the function parameters (visualize_topics_over_time) in BERTopic?

I am using BERTopic to perform topic modelling, and everything works perfectly fine. However, I am forcing the algorithm to give me 10 topics as output using nr_topics=10, and when I visualize the topics over time using
topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=10, width=1250, height=450), some colors are repeated across topics, as there are only 7 colors defined in the function visualize_topics_over_time. I tried executing the same function in my Python notebook with additional color values, but it gives me the following error:
Can someone please help me update the function with four additional colors?
To add colors to the function, you will indeed have to copy the function and change it to include more colors:
import pandas as pd
from typing import List
import plotly.graph_objects as go
from sklearn.preprocessing import normalize
def visualize_topics_over_time(topic_model,
                               topics_over_time: pd.DataFrame,
                               top_n_topics: int = None,
                               topics: List[int] = None,
                               normalize_frequency: bool = False,
                               width: int = 1250,
                               height: int = 450,
                               colors: List[str] = None) -> go.Figure:
    """ Visualize topics over time

    Arguments:
        topic_model: A fitted BERTopic instance.
        topics_over_time: The topics you would like to be visualized with the
                          corresponding topic representation. A "Name" column
                          is added to this DataFrame in place.
        top_n_topics: To visualize the most frequent topics instead of all
        topics: Select which topics you would like to be visualized
        normalize_frequency: Whether to normalize each topic's frequency individually
        width: The width of the figure.
        height: The height of the figure.
        colors: Color strings cycled through for the traces. When omitted
                (or empty) the built-in 7-color palette is used. Pass a
                longer list to avoid repeated colors with more topics.

    Returns:
        A plotly.graph_objects.Figure including all traces

    Usage:

    To visualize the topics over time, simply run:

    ```python
    topics_over_time = topic_model.topics_over_time(docs, topics, timestamps)
    topic_model.visualize_topics_over_time(topics_over_time)
    ```

    Or if you want to save the resulting figure:

    ```python
    fig = topic_model.visualize_topics_over_time(topics_over_time)
    fig.write_html("path/to/file.html")
    ```
    <iframe src="../../getting_started/visualization/trump.html"
    style="width:1000px; height: 680px; border: 0px;"></iframe>
    """
    if not colors:
        colors = ["#E69F00", "#56B4E9", "#009E73", "#F0E442", "#D55E00", "#0072B2", "#CC79A7"]

    # Select topics
    if topics:
        selected_topics = topics
    elif top_n_topics:
        # Take the first top_n_topics + 1 rows and skip the very first one.
        selected_topics = topic_model.get_topic_freq().head(top_n_topics + 1)[1:].Topic.values
    else:
        selected_topics = topic_model.get_topic_freq().Topic.values

    # Prepare data: names longer than 40 characters are cut and suffixed with "..."
    topic_names = {key: value[:40] + "..." if len(value) > 40 else value
                   for key, value in topic_model.topic_names.items()}
    topics_over_time["Name"] = topics_over_time.Topic.map(topic_names)
    data = topics_over_time.loc[topics_over_time.Topic.isin(selected_topics), :]

    # Add traces
    fig = go.Figure()
    for index, topic in enumerate(data.Topic.unique()):
        trace_data = data.loc[data.Topic == topic, :]
        topic_name = trace_data.Name.values[0]
        words = trace_data.Words.values
        if normalize_frequency:
            y = normalize(trace_data.Frequency.values.reshape(1, -1))[0]
        else:
            y = trace_data.Frequency
        fig.add_trace(go.Scatter(x=trace_data.Timestamp, y=y,
                                 mode='lines',
                                 # Cycle over the actual palette length; the
                                 # former fixed `% 7` ignored any added colors.
                                 marker_color=colors[index % len(colors)],
                                 hoverinfo="text",
                                 name=topic_name,
                                 hovertext=[f'<b>Topic {topic}</b><br>Words: {word}' for word in words]))

    # Styling of the visualization
    fig.update_xaxes(showgrid=True)
    fig.update_yaxes(showgrid=True)
    fig.update_layout(
        yaxis_title="Normalized Frequency" if normalize_frequency else "Frequency",
        title={
            'text': "<b>Topics over Time",
            'y': .95,
            'x': 0.40,
            'xanchor': 'center',
            'yanchor': 'top',
            'font': dict(
                size=22,
                color="Black")
        },
        template="simple_white",
        width=width,
        height=height,
        hoverlabel=dict(
            bgcolor="white",
            font_size=16,
            font_family="Rockwell"
        ),
        legend=dict(
            title="<b>Global Topic Representation",
        )
    )
    return fig
Then, you can use the function as follows:
import re
import pandas as pd
from bertopic import BERTopic
# Prepare data
trump = pd.read_csv('https://drive.google.com/uc?export=download&id=1xRKHaP-QwACMydlDnyFPEaFdtskJuBa6')
# Strip anything starting with "http" up to the next whitespace, then lowercase.
trump.text = trump.apply(lambda row: re.sub(r"http\S+", "", row.text).lower(), 1)
# Drop every whitespace-separated word that starts with "#".
trump.text = trump.apply(lambda row: " ".join(filter(lambda x:x[0]!="#", row.text.split())), 1)
# Replace every run of non-letters with a single space and collapse whitespace.
trump.text = trump.apply(lambda row: " ".join(re.sub("[^a-zA-Z]+", " ", row.text).split()), 1)
# Keep rows whose isRetweet is "f" and whose cleaned text is not empty.
trump = trump.loc[(trump.isRetweet == "f") & (trump.text != ""), :]
timestamps = trump.date.to_list()
tweets = trump.text.to_list()
# Create topics over time
model = BERTopic(verbose=True)
topics, probs = model.fit_transform(tweets)
topics_over_time = model.topics_over_time(tweets, topics, timestamps)
# Visualize topics over time with the updated colors
visualize_topics_over_time(model, topics_over_time)

How to make an interactive time serie plot using plotly?

I am trying to make an interactive time series visualization using plotly and a Jupyter notebook.
I want to have a simple plot where I can filter the index of a dataframe using plotly and ipywidgets, and store the new index I have. But I have no idea how to do so. I have been going through the documentation without any success. What I am doing so far:
import pandas as pd
import numpy as np
import plotly.graph_objs as go
from ipywidgets import interactive
# Fifteen daily timestamps and one random 'sensor' column indexed by them.
index = pd.date_range(start='2020-01-01', end='2020-01-15', freq='D')
timeserie = pd.DataFrame(np.random.normal(0,1,size=index.size), index=index, columns=['sensor'])
# x is taken from the raw `.index.values` array here.
fig = go.FigureWidget([
    go.Scatter(
        x=timeserie.index.values,
        y=timeserie.values,
        mode='markers'
    )
])


def update_training_dataset(index_min, index_max, sensor):
    """Restrict the plotted x values to ``index_min``..``index_max`` (inclusive).

    The y values are looked up with the trace's current x values for the
    column named ``sensor``, which also becomes the y-axis title.
    """
    scatter = fig.data[0]
    in_window = (timeserie.index >= index_min) & (timeserie.index <= index_max)
    index = timeserie.loc[in_window].index
    sensor_value = timeserie.loc[scatter.x, sensor].values
    with fig.batch_update():
        fig.layout.yaxis.title = sensor
        scatter.x = index
        scatter.y = sensor_value


interactive(update_training_dataset, index_min=index, index_max=index, sensor=timeserie.columns)
But it leads to a strange error:
KeyError : "None of [Int64Index([15778368000000000000, ... are in the [index]"
This is weird, as the index of my time series is a DatetimeIndex.
This code is meant to update the plotted data according to the values of sensor, index_min and index_max that the user sets. Also, I note that the dates are offered in a select widget; I would love to have a date picker here instead. Can someone help me, or provide any code that I can get some insights from? Thank you :)
EDIT
The solution is provided below thanks to Serge :)
# x is the DatetimeIndex itself, not its raw `.values` array.
fig = go.FigureWidget([
    go.Scatter(
        x=timeserie.index,
        y=timeserie.values,
        mode='markers'
    )
])


def update_training_dataset(index_min, index_max, Sensor):
    """Show only the points of column ``Sensor`` between ``index_min`` and ``index_max``.

    Arguments:
        index_min: Lower bound (inclusive) compared against ``timeserie.index``.
        index_max: Upper bound (inclusive) compared against ``timeserie.index``.
        Sensor: Column of ``timeserie`` to plot; also used as y-axis title.
    """
    scatter = fig.data[0]
    in_window = (timeserie.index >= index_min) & (timeserie.index <= index_max)
    selected_index = timeserie.loc[in_window].index
    # Look the values up with the newly selected index. Using the trace's
    # previous x values here left x and y with different lengths after
    # every change of the window.
    sensor_value = timeserie.loc[selected_index, Sensor].values
    with fig.batch_update():
        fig.layout.yaxis.title = Sensor
        scatter.x = selected_index
        scatter.y = sensor_value


# Pickers start at the last and first timestamp of the global `index`.
date_picker_max = DatePicker(
    description='End date',
    disabled=False,
    value = index.max()
)
date_picker_min = DatePicker(
    description='Start date',
    disabled=False,
    value = index.min()
)
interact(
    update_training_dataset,
    index_min=date_picker_min,
    index_max=date_picker_max,
    Sensor=timeserie.columns
)
I am still working on a way to have hours:minutes:seconds in the date picker.
EDIT 2
By the way, there is no need to use interact instead of interactive: both seem to support widgets as parameters. Also, you need to import DatetimePicker from ipydatetime, as below, to get a datetime picker.
# usual imports
from ipydatetime import DatetimePicker
# x is the DatetimeIndex itself, not its raw `.values` array.
fig = go.FigureWidget([
    go.Scatter(
        x=timeserie.index,
        y=timeserie.values,
        mode='markers'
    )
])


def update_training_dataset(index_min, index_max, Sensor):
    """Show only the points of column ``Sensor`` between ``index_min`` and ``index_max``.

    Arguments:
        index_min: Lower bound (inclusive) compared against ``timeserie.index``.
        index_max: Upper bound (inclusive) compared against ``timeserie.index``.
        Sensor: Column of ``timeserie`` to plot; also used as y-axis title.
    """
    scatter = fig.data[0]
    in_window = (timeserie.index >= index_min) & (timeserie.index <= index_max)
    selected_index = timeserie.loc[in_window].index
    # Look the values up with the newly selected index. Using the trace's
    # previous x values here left x and y with different lengths after
    # every change of the window.
    sensor_value = timeserie.loc[selected_index, Sensor].values
    with fig.batch_update():
        fig.layout.yaxis.title = Sensor
        scatter.x = selected_index
        scatter.y = sensor_value


# Datetime pickers start at the last and first timestamp of the global `index`.
date_picker_max = DatetimePicker(
    description='End date',
    disabled=False,
    value = index.max()
)
date_picker_min = DatetimePicker(
    description='Start date',
    disabled=False,
    value = index.min()
)
interact(
    update_training_dataset,
    index_min=date_picker_min,
    index_max=date_picker_max,
    Sensor=timeserie.columns
)
Actually, your code is all good. You did a simple mistake in the definition of fig. Try the following
# x is the DatetimeIndex itself, not its raw `.values` array.
fig = go.FigureWidget([
    go.Scatter(
        x=timeserie.index,
        y=timeserie.values,
        mode='markers'
    )
])


def update_training_dataset(index_min, index_max, sensor):
    """Show only the points of column ``sensor`` between ``index_min`` and ``index_max``.

    Arguments:
        index_min: Lower bound (inclusive) compared against ``timeserie.index``.
        index_max: Upper bound (inclusive) compared against ``timeserie.index``.
        sensor: Column of ``timeserie`` to plot; also used as y-axis title.
    """
    scatter = fig.data[0]
    in_window = (timeserie.index >= index_min) & (timeserie.index <= index_max)
    selected_index = timeserie.loc[in_window].index
    # Look the values up with the newly selected index. Using the trace's
    # previous x values here left x and y with different lengths after
    # every change of the window.
    sensor_value = timeserie.loc[selected_index, sensor].values
    with fig.batch_update():
        fig.layout.yaxis.title = sensor
        scatter.x = selected_index
        scatter.y = sensor_value


interactive(update_training_dataset, index_min=index, index_max=index, sensor=timeserie.columns)
You simply made the error of defining x=timeserie.index.values when it actually should be x=timeserie.index.
The result is fine when this is changed.

Linking plots (box select, lasso etc) in Bokeh generated in a loop

I am plotting some data with bokeh, using a for loop to iterate over the columns in my dataframe. For some reason the box select and lasso tools, which I have managed to link between plots that were created explicitly (i.e. not generated with a for loop), do not seem to work now.
Do I need to increment some bokeh function within the for loop?
#example dataframe
array = {
    'variable': ['var1', 'var2', 'var3', 'var4'],
    'var1': [np.random.rand(10)],
    'var2': [np.random.rand(10)],
    'var3': [np.random.rand(10)],
    'var4': [np.random.rand(10)],
}
cols = ['var1', 'var2', 'var3', 'var4']
df = pd.DataFrame(array, columns = cols)
w = 500
h = 400
#collect plots in a list (start with an empty)
plots = []
# the tool list is the same for every figure
TOOLS = "pan,wheel_zoom,box_zoom,reset,save,box_select,lasso_select"
#one figure per dataframe column, each with its own freshly built data source
for c in df[cols]:
    source = ColumnDataSource(data = dict(x = df.index, y = df[c]))
    f = figure(
        tools = TOOLS,
        width = w,
        plot_height = h,
        title = c + ' Run Chart',
        x_axis_label = 'Run ID',
        y_axis_label = c,
    )
    f.line('x', 'y', source = source, name = 'data')
    f.triangle('x', 'y', source = source)
    #data mean line
    f.line(df.index, df[c].mean(), color = 'orange')
    #tolerance lines: upper (USL) first, then lower (LSL)
    for limit in ('USL', 'LSL'):
        f.line(df.index, df[c + limit][0], color = 'red', line_dash = 'dashed', line_width = 2)
    #append the new plot in this loop to the existing list of plots
    plots.append(f)
#link all the x_ranges to the one of the first plot
for i in plots:
    i.x_range = plots[0].x_range
#plot
p = gridplot(plots, ncols = 2)
output_notebook()
show(p)
I expect to produce plots which are linked and allow me to box or lasso select some points on one chart and for them to be highlighted on the others. However, the plots only let me select on one plot with no linked behaviour.
SOLUTION
This may seem a bit of a noob problem, but I am sure someone else will come across this, so here is the answer!
Bokeh works by referring to a data source object (the ColumnDataSource object). You can pass your dataframe into it completely and then refer to explicit x and y columns when creating the glyphs (e.g. my f.line, f.triangle etc.).
So I moved the 'source' outside of the loop to prevent it being reset each iteration, and just passed my df to it. Then, within the loop, I use the column name of the current iteration plus a descriptor string (USL, LSL, mean) for the y values and 'index' for my x values.
I add a box select tool explicitly with a 'name' defined, so that when the box selects, it only selects those glyphs that I want it to select (i.e. I don't want it to select my constant-value mean and spec limit lines).
Also, be careful: if you want to output to an HTML file or similar, you will probably need to suppress your in-notebook output, as bokeh does not like having duplicate plots open. I have not included my HTML output solution here.
In terms of adding linked lasso selection to plots generated in a for loop, I could only find an explicit box select tool generator, so I am not sure this is possible.
So here it is:
#keep the source out of the loop to stop it resetting every time
Source = ColumnDataSource(df)
# the tool list is the same for every figure; box select is added per figure below
TOOLS = "pan,wheel_zoom,box_zoom,reset,save"
for c in cols:
    f = figure(
        tools = TOOLS,
        width = w,
        plot_height = h,
        title = c + ' Run Chart',
        x_axis_label = 'Run ID',
        y_axis_label = c,
    )
    # both data glyphs share the name 'data' so the select tool can target them
    f.line(x = 'index', y = c , source = Source, name = 'data')
    f.triangle(x = 'index', y = c, source = Source, name = 'data')
    #data mean line
    f.line(x = 'index', y = c + '_mean', source = Source, color = 'orange')
    #tolerance lines: upper (USL) first, then lower (LSL)
    for limit in ('USL', 'LSL'):
        f.line(x = 'index', y = c + limit, color = 'red', line_dash = 'dashed', line_width = 2, source = Source)
    # Add BoxSelect tool - this allows points on one plot to be highlighted on all linked plots.
    # It is restricted to the renderers named 'data', so only those glyphs are highlighted.
    bxselect1 = BoxSelectTool(renderers=f.select(name='data'))
    f.add_tools(bxselect1)
    plots.append(f)
#tie the x_ranges together so that panning is linked between plots
for i in plots:
    i.x_range = plots[0].x_range
forp = gridplot(plots, ncols = 2)
show(forp)

Plotly for python, only first data point is being graphed

I am new to plotly and working on a script to generate a graph based on some results pulled from a database. However when I send the data over to plotly, only the first data point for each of the three traces is being graphed. I've verified that the lists contain the right data, I've even simply pasted the lists in instead of dynamically creating the variables. Unfortunately each time only the first data point is being graphed. Does anyone know what I am missing here? I am also open to another library if needed.
Is it also possible to have the x axis show as a string?
import plotly.plotly as py
import plotly.graph_objs as go
# Custom database class, works fine.
from classes.database import DatabaseConnection
# Database Connections and instances
db_instance = DatabaseConnection()
db_conn = db_instance.conn
db_cur = db_instance.cur
def main():
    """Query per-version response statistics and send them to plotly as three line traces.

    Runs one aggregate query through ``db_cur``, builds the x list (version
    numbers) and three y lists (average, minimum, maximum, each cast to
    ``int``), and plots them under the file name 'response-times' without
    opening a browser.
    """
    # Get a list of versions and their stats.
    query = """
select row_to_json(x) from
(SELECT
versions.version_number,
cast(AVG(results.average) as double precision) as average,
cast(AVG(results.minimum) as double precision) as minimum,
cast(AVG(results.maximum) as double precision) as maximum
FROM versions,results
WHERE
versions.version_number = results.version_number
GROUP BY
versions.version_number) x;
"""
    db_cur.execute(query)
    unclean = db_cur.fetchall()

    # Create lists for x and y coordinates; the first element of every
    # fetched row is indexed by column name.
    versions = []
    average = []
    minimum = []
    maximum = []
    for row in unclean:
        record = row[0]
        versions.append(record['version_number'])
        average.append(int(record['average']))
        minimum.append(int(record['minimum']))
        maximum.append(int(record['maximum']))

    def _line_trace(label, values):
        # One line trace over the shared version axis.
        return go.Scatter(
            x=versions,
            y=values,
            name = label,
            mode='lines',
        )

    grph_average = _line_trace('Average', average)
    grph_minimum = _line_trace('Minimum', minimum)
    grph_maximum = _line_trace('Maximum', maximum)
    data = go.Data([grph_average, grph_minimum, grph_maximum])

    # Edit the layout
    layout = dict(
        title = 'Responses',
        xaxis = dict(title = 'Versions'),
        yaxis = dict(title = 'Ms'),
    )
    fig = dict(data=data, layout=layout)
    py.plot(fig, filename='response-times', auto_open=False)
if __name__ == '__main__':
main()
The data that query returns is as follows, if you want to plug in the values :
versions = ['6.1', '5.0', '5.2']
average = [11232, 29391, 10429]
minimum = [3641, 7729, 3483]
maximum = [57440, 62535, 45201]
Here is some matplotlib that might get you started on this:
import matplotlib.pyplot as plt
# Sample values as returned by the query.
versions = ['6.1', '5.0', '5.2']
average = [11232, 29391, 10429]
minimum = [3641, 7729, 3483]
maximum = [57440, 62535, 45201]
# Draw the three series in the same order as before: minimum, average, maximum.
for series in (minimum, average, maximum):
    plt.plot(series)
# Label tick positions 0..n-1 with the version strings.
tick_positions = range(len(versions))
plt.xticks(tick_positions, versions)
It looks like it was an issue with my x axis. By adding some text before the version number and specifically type casting to a string I was able to get the graphs to generate properly.
# Create lists for x and y coordinates.
# Every x value is built as text: a "Version: " prefix plus the version
# number explicitly cast to str.
for row in unclean:
    record = row[0]
    versions.append("Version: " + str(record['version_number']))
    average.append(int(record['average']))
    minimum.append(int(record['minimum']))
    maximum.append(int(record['maximum']))

Categories