I've created a waterfall graph using Jupyter and Atom (as I'm looking for a decent substitute for Jupyter, especially when it comes to dataframe visualisation).
The thing is that I used the exact same code in both editors, but the resulting graph is different.
Does someone have an explanation?
Here is the code used:
import pandas as pd
import numpy as np
import plotly
import plotly.graph_objs as go
# Read the semicolon-separated CSV file.
df=pd.read_csv('C:/Users/Usuario/Desktop/python/HP/waterfall.csv',sep=';')
# Lower-case the 'Measure' column before it is passed to go.Waterfall as `measure`.
df['Measure']=df['Measure'].str.lower()
# NOTE(review): `display` is not imported in this snippet; presumably it is the
# IPython/Jupyter builtin -- confirm it is available when run outside a notebook.
display(df)
# Store the columns used by the trace in separate variables.
x=df['Deal ID']
y=df['deal value (USD)']
measure = df['Measure']
# The bar labels reuse the same column as the y values.
text=df['deal value (USD)']
# Create the figure: a single waterfall trace with custom colours for
# decreasing, increasing and total bars, and no legend entry.
fig = go.Figure(go.Waterfall(
measure=measure,
x=x,
y=y,
text=text,
textposition="outside",
decreasing = {"marker":{"color":"Maroon", "line":{"color":"red", "width":2}}},
increasing = {"marker":{"color":"Teal"}},
totals = {"marker":{"color":"deep sky blue", "line":{"color":"blue", "width":3}}},
showlegend=False
))
# Hide the grid on both axes; the y axis itself is hidden as well.
fig.update_xaxes(showgrid=False)
fig.update_yaxes(showgrid=False, visible=False)
# Set the hover template of every trace to None.
fig.update_traces(hovertemplate=None)
# Title, height, margins, unified hover, axis titles, transparent plot
# background and font settings.
# NOTE(review): nothing in this snippet calls fig.show(); the figure presumably
# renders only where the value of the last expression is displayed
# automatically -- confirm for the Atom setup.
fig.update_layout(title='Total deal value per customer X', height=470,
margin=dict(t=90, b=20, l=70, r=70),
hovermode="x unified",
xaxis_title='QvsQ ', yaxis_title="deal value in USD",
plot_bgcolor='rgba(0,0,0,0)',
#paper_bgcolor='#333',
title_font=dict(size=25, color='#8a8d93', family="Lato, sans-serif"),
font=dict(color='#8a8d93'))
Atom output:
Jupyter output: [![Jupyter output][2]][2]
Thanks
[2]: https://i.stack.imgur.com/wYPEG.png
Related
I'm trying to create faceted maps by the column rank in my df. Each map will display the product for each state. I want the color of the product to be consistent across maps.
With the solution below I can achieve that, but the legend will show multiple entries for the same product, one for each state. How can I have the legend show only one entry per distinct product?
import pandas as pd
import plotly.express as px
from random import randint
# Sample data: two ranks, four states per rank, one product per state.
df = pd.DataFrame({'rank': [1,1,1,1,2,2,2,2],'product':['A','B','C','D','C','D','Z','X'],'state':['WA','OR','CA','ID','WA','OR','CA','ID']})
# Distinct product names.
unique_hi = df['product'].unique()
# One random hex colour per distinct product, so that the same product gets
# the same colour in every facet.
color_discrete_map = {unique_hi[k]: '#%06X' % randint(0, 0xFFFFFF) for k in range(len(unique_hi))}
# One facet per `rank` value, wrapped after two columns; states are coloured
# by product using the colour map built above.
fig = px.choropleth(df, color='product', facet_col="rank",facet_col_wrap=2,
locations="state", #featureidkey="properties.district",
locationmode="USA-states",
projection="mercator",height=600,
color_discrete_map=color_discrete_map,
title='Regional products'
)
# Fit each map to the plotted locations and hide the base geography.
fig.update_geos(fitbounds="locations", visible=False)
# Remove the outer margins except for room at the top for the title.
fig.update_layout(margin={"r":0,"t":30,"l":0,"b":0})
fig.show()
If you inspect the traces of the created map in fig.data, you will find that each trace carries its original legend name. Collect those names as you go and keep the legend entry only for the first trace with each name, hiding the duplicates.
import pandas as pd
import plotly.express as px
from random import randint
# Sample frame: two ranks, four states each, one product per state.
df = pd.DataFrame({
    'rank': [1, 1, 1, 1, 2, 2, 2, 2],
    'product': ['A', 'B', 'C', 'D', 'C', 'D', 'Z', 'X'],
    'state': ['WA', 'OR', 'CA', 'ID', 'WA', 'OR', 'CA', 'ID'],
})

# One random hex colour per distinct product, shared by every facet.
unique_hi = df['product'].unique()
color_discrete_map = {
    product: '#%06X' % randint(0, 0xFFFFFF) for product in unique_hi
}

# One map per rank, two facets per row.
fig = px.choropleth(
    df,
    color='product',
    facet_col="rank",
    facet_col_wrap=2,
    locations="state",
    locationmode="USA-states",
    projection="mercator",
    height=600,
    color_discrete_map=color_discrete_map,
    title='Regional products',
)
fig.update_geos(fitbounds="locations", visible=False)
fig.update_layout(margin={"r": 0, "t": 30, "l": 0, "b": 0})

# Keep the legend entry of the first trace seen for each name and hide the
# legend entry of every later trace that repeats a name.
names = set()
for trace in fig.data:
    if trace.name in names:
        trace.update(showlegend=False)
    else:
        names.add(trace.name)

fig.show()
It is not possible to position a product-name annotation using map coordinates (I referred to this for the rationale). The following code adds the annotation in paper coordinates instead, but every product will need to be positioned manually. Upon further investigation, it seems that a combination of go.Choroplethmapbox() and go.Scattergeo() would do it. In this case, you will need to rewrite the code from scratch.
fig.add_annotation(
x=0.2,
xref='paper',
y=0.85,
yref='paper',
text='A',
showarrow=False,
font=dict(
color='yellow',
size=14
)
)
I am very new to using Python and especially new to using the Bokeh library. I am trying to plot a Choropleth map of the United States with the fill color of each state corresponding to their bee population of a year.
It shows the value when you hover over it, but only the states with a value of zero have color.
Link to an image of the output plot is here.
I know there is a big difference in the range (minimum:0, maximum: 310,000) which I believe is causing the problem. How can I change the range of the color map to not fill all of the higher values with grey?
Code for reference below:
from bokeh.models import LogColorMapper
from bokeh.palettes import YlGnBu9 as YlGnBu
from bokeh.sampledata.us_states import data as us_states
import pandas as pd
import numpy as np
# `figure` and `show` are used below but are not part of the snippet's import
# header, so they are imported here.
from bokeh.plotting import figure, show

bee_pop = pd.read_csv('./BeePopulation.csv')

# One row per state; Alaska, Hawaii and the District of Columbia are dropped.
us_states_df = pd.DataFrame(us_states).T
us_states_df = us_states_df[
    ~us_states_df["name"].isin(['Alaska', "Hawaii", "District of Columbia"])
]
us_states_df["lons"] = us_states_df.lons.values.tolist()
us_states_df["lats"] = us_states_df.lats.values.tolist()
us_states_df = us_states_df.reset_index()

# Attach the 2016 population to each state.  The join is a left join on the
# former index (now the "index" column) against the "State" column.
bee_2016 = bee_pop[bee_pop['Year'] == 2016]
us_states_df = us_states_df.merge(
    bee_2016[["State", "Pop"]], how="left", left_on="index", right_on="State"
)
us_states_df.head()

# Plain dict of equally long lists that is handed to the glyph as its source.
us_states_datasource = {}
us_states_datasource["lons"] = us_states_df.lons.values.tolist()
us_states_datasource["lats"] = us_states_df.lats.values.tolist()
us_states_datasource["name"] = us_states_df.name.values.tolist()
us_states_datasource["BeePop"] = us_states_df.Pop.values.tolist()

# Tooltip fields reference data-source columns with the "@" prefix.
fig = figure(
    plot_width=900, plot_height=600,
    title="United Bee Population Per State Choropleth Map",
    x_axis_location=None, y_axis_location=None,
    tooltips=[("Name", "@name"), ("Bee Population", "@BeePop")],
)
fig.grid.grid_line_color = None

# The colour mapper is created with the palette only; its low/high bounds are
# not set here.
fig.patches(
    "lons", "lats", source=us_states_datasource,
    fill_color={'field': 'BeePop', 'transform': LogColorMapper(palette=YlGnBu[::-1])},
    fill_alpha=0.7, line_color="white", line_width=0.5,
)
show(fig)
Thank you in advance!
The LogColorMapper has configurable high and low properties. Another option, of course, is to use a different color mapper, e.g. LinearColorMapper or CategoricalColorMapper in conjunction with some categorical binning.
Here is my dataset after cleaning csv file
Here is output what I want
What I want is to display the years on the x axis and the column values on the y axis, with bubbles of different colors and sizes and a play button for the animation.
I am new to data science; can someone help me with how I can achieve this?
Judging by your dataset and attached image, what you're asking for is something like this:
But I'm not sure that is what you actually want. You see, with your particular dataset there aren't enough dimensions to justify an animation. Or even a bubble plot. This is because you're only looking at one value. So you end up showing the same value through the bubble sizes and on the y axis. And there's really no need to change your dataset given that your provided screenshot is in fact your desired plot. But we can talk more about that if you'd like.
Since you haven't provided a sample dataset, I've used a dataset that's available through plotly express and reshaped it so that it matches your dataset:
Complete code:
# imports
import plotly.graph_objects as go
import plotly.express as px
import pandas as pd
import math
import numpy as np
# color cycle (the qualitative palette repeated ten times)
colors = px.colors.qualitative.Alphabet*10

# sample data with similar structure as OP:
# after reshaping, rows are years (as strings) and columns are countries
df = px.data.gapminder().query("continent=='Americas'")
dfp = df.pivot(index='year', columns='country', values='pop')
dfp = dfp[['United States', 'Mexico', 'Argentina', 'Brazil', 'Colombia']]
dfp = dfp.sort_values(by='United States', ascending=False)
dfp = dfp.T
dfp.columns = [str(yr) for yr in dfp.columns]
dfp = dfp[dfp.columns[::-1]].T

# One size reference shared by all traces, computed once from the overall
# maximum.  (Use 2.*max(vals)/(40.**2) inside the loop instead to scale each
# country on its own.)
size_ref = 2.*max(dfp.max())/(40.**2)

# build figure and add one marker trace per country; each country sits on its
# own horizontal line (y = column position) and the value sets the bubble area
fig = go.Figure()
for col, country in enumerate(dfp):
    vals = dfp[country].values
    yVals = [col]*len(vals)
    fig.add_traces(go.Scatter(
        y=yVals,
        x=dfp.index,
        mode='markers',
        marker=dict(color=colors[col],
                    size=vals,
                    sizemode='area',
                    sizeref=size_ref,
                    sizemin=4),
        name=country
    ))

# edit y tick layout: one tick per plotted country.  The tick positions must
# be derived from the reshaped frame `dfp`, whose columns are the countries,
# so that they line up with the tick labels below.
tickVals = np.arange(0, len(dfp.columns))
fig.update_layout(
    yaxis=dict(tickmode='array',
               tickvals=tickVals,
               ticktext=dfp.columns.tolist()))
fig.show()
I have been exploring various COVID-19 datasets and doing analysis. Below is a 'cleaned' up version of my code. I've been running in google's Colab, but should work on any machine with the modules available (tested).
Questions upfront:
How do I create a function of my technique to extract the data as I do below for each country or US state? My technique is denoted below with My technique
How do I plot (bubble plot) the data on the geopandas maps? I would like to create a new map for each date. Then create a movie, but I haven't got that far.
Import modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()
#%matplotlib inline
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from IPython import display
from ipywidgets import interact, widgets
from datetime import datetime, timedelta
#from google.colab import files
Some setup parameters
chartcol='red'
plt.rcParams['figure.figsize'] = [15, 5]
I am using the Johns Hopkins University Center for Systems Science and Engineering (JHU CSSE) dataset that is updated nightly (daily?).
# Get the data
#Read Data for Cases, Deaths and Recoveries
ConfirmedCases_raw=pd.read_csv('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Confirmed.csv')
Deaths_raw=pd.read_csv('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Deaths.csv')
Recoveries_raw=pd.read_csv('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Recovered.csv')
plt.rcParams['figure.figsize'] = [15, 5]
Here I am creating a list of date strings that match the timeseries data column names.
The list ends 'yesterday' because the dataset isn't updated for 'today'.
# Build the list of date strings that match the column names in the COVID-19
# time series (for example "1/22/20"), from the first day of the data set up
# to and including yesterday.
# It is used later to extract data to build country and state data.
today = datetime.now()
# Derived from `datetime`, because only `datetime` and `timedelta` are
# imported from the datetime module in this script.
today2 = today.date()
yesterday = today2 - timedelta(days=1)
covid_epoch = datetime(2020, 1, 22).date()
delta = yesterday - covid_epoch
# Month and day are formatted by hand: the "%-m" / "%-d" strftime codes
# (no zero padding) are platform-specific and not available everywhere.
timeline = [
    f"{day.month}/{day.day}/{day:%y}"
    for day in (covid_epoch + timedelta(days=i) for i in range(delta.days + 1))
]
Now I use my 'technique' to extract the timeseries data in a way I can use it. I would like to be able to do this with a function.
#Extracts data from csv file into time column and value column
#Creates a list of values from each time column
#Consider using melt to do this. How?
#Create a function to do this. How?
My technique: The block below is what I want to turn into a function
time = [];value = [];country=[];province= []
col_value = list(ConfirmedCases_raw.columns)
for i in timeline:
time.append(i)
value.append(ConfirmedCases_raw[i].sum())
Create a dataframe for all cases around the world. I fill the dataframe with sum values for all countries for the 'world' dataframe, 'time' and 'value'.
world = pd.DataFrame({'Timeline':time,'Covid-19 impact':value})
#Plot the world data
plt.plot(world['Timeline'],world['Covid-19 impact'])
plt.xticks(rotation=45);
plt.suptitle('World COVID-19 Confirmed Cases', fontsize=20)
plt.xlabel('Date', fontsize=18);
plt.ylabel('Count', fontsize=16);
plt.grid(color='b', ls = '-.', lw = 0.25)
Again, do this to create a US dataframe. Filtered Confirmed cases for the United States
us_confirmed_raw=ConfirmedCases_raw[ConfirmedCases_raw['Country/Region']=='US']
Here is my data extraction block again:
# Per-date totals of confirmed cases over all US rows.
# The sums are taken from the filtered raw frame `us_confirmed_raw`; the
# `us_confirmed` frame is only created from these lists afterwards.
time = list(timeline)
value = [us_confirmed_raw[day].sum() for day in time]
# Kept as module-level names; they are not filled in this block.
country = []
province = []
col_value = list(us_confirmed_raw.columns)
And create dataframe and plot US data
us_confirmed = pd.DataFrame({'Timeline':time,'Covid-19 impact':value})
plt.plot(us_confirmed['Timeline'],us_confirmed['Covid-19 impact'])
plt.xticks(rotation=45);
plt.suptitle('USA COVID-19 Confirmed Cases', fontsize=20)
plt.xlabel('Date', fontsize=18);
plt.ylabel('Count', fontsize=16);
plt.grid(color='r', ls = '-.', lw = 0.55)
And again for New Mexico.
nm_confirmed=us_confirmed_raw[us_confirmed_raw['Province/State']=='New Mexico']
time = [];value = [];country=[];province= []
col_value = list(nm_confirmed.columns)
for i in timeline:
time.append(i)
value.append(nm_confirmed[i].sum())
nm = pd.DataFrame({'Timeline':time,'Covid-19 impact':value})
plt.plot(nm['Timeline'],nm['Covid-19 impact'],linestyle='--', marker='o')
plt.xticks(rotation=45);
plt.suptitle('New Mexico COVID-19 Confirmed Cases', fontsize=20)
plt.xlabel('Date', fontsize=18);
plt.ylabel('Count', fontsize=16);
plt.grid(color='r', ls = '-.', lw = 0.25)
You're right, maybe too many examples.
My next issue is creating maps. Below I am using geopandas to create a map and plot data.
!pip install geopandas;
import geopandas
put the New Mexico data into a geopandas dataframe:
gdf = geopandas.GeoDataFrame(
nm_confirmed, geometry=geopandas.points_from_xy(nm_confirmed.Long, nm_confirmed.Lat))
and plot the location of New Mexico. I cannot figure out how to plot the COVID-19 data on this map.
# Creates a map of the world
# Show the location of new mexico
world = geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres'))
ax = world[world.continent == 'North America'].plot(
color='white', edgecolor='black')
gdf.plot(ax=ax, color='red')
Here is my answer for question 1:
This function, 'get_state' returns the state data in the datafiles (US filtered):
def get_state(state, *, data=None, dates=None):
    """Return the confirmed-case time series for one state.

    Args:
        state: Value matched against the 'Province/State' column.
        data: Wide frame with a 'Province/State' column and one column per
            date label.  Defaults to the module-level ``us_confirmed_raw``.
        dates: Date column labels to extract, in order.  Defaults to the
            module-level ``timeline``.

    Returns:
        A DataFrame with the columns 'Timeline' (the date labels) and
        'Covid-19 impact' (the sum over the matching rows for each date).
    """
    if data is None:
        data = us_confirmed_raw
    if dates is None:
        dates = timeline
    state_confirmed = data[data['Province/State'] == state]
    time = list(dates)
    # Sum per date because a state may be spread over several rows.
    value = [state_confirmed[day].sum() for day in time]
    return pd.DataFrame({'Timeline': time, 'Covid-19 impact': value})
and it can be used directly,
state = 'Texas'
# Build the state frame once and reuse it for both axes instead of calling
# get_state() twice.
state_cases = get_state(state)
plt.plot(state_cases.Timeline, state_cases['Covid-19 impact'], linestyle='--', marker='o')
plt.xticks(rotation=45)
plt.suptitle(state+' COVID-19 Confirmed Cases', fontsize=20)
plt.xlabel('Date', fontsize=18)
plt.ylabel('Count', fontsize=16)
plt.grid(color='r', ls='-.', lw=0.25)
which produces this plot,
I am wondering if Python has something to plot the data availability of time series with multiple variables. An example is shown below taken from Visavail.js - A Time Data Availability Chart.
Here's a suggestion using plotly in a Jupyter Notebook:
Code:
import random
import pandas as pd
import plotly.express as px
from random import choices
# random data with a somewhat higher
# probability of 1 than 0 to mimic OPs data
random.seed(1)
vals = [0, 1]
prob = [0.4, 0.6]
# This draw is discarded; it is kept so that the seeded sequence, and with it
# the generated sample, stays the same as before.
choices(vals, prob)
# five series of ten availability flags each
data = [choices(vals, prob, k=10) for _ in range(5)]

# organize data in a pandas dataframe: one column per series, one row per day
df = pd.DataFrame(data).T
df.columns = ['Balance Sheet', 'Closing Price', 'Weekly Report', 'Analyst Data', 'Annual Report']
# The start date is passed as a string; the `pd.datetime` alias is no longer
# available in current pandas releases.
drng = pd.date_range('2080-01-01', periods=df.shape[0]).tolist()
df['date'] = [d.strftime('%Y-%m-%d') for d in drng]
# long format: one row per (date, variable) pair; 'date' is the last column
# and is therefore excluded from the value columns
dfm = pd.melt(df, id_vars=['date'], value_vars=list(df.columns[:-1]))
# plotly express: horizontal bars with the date on x and one row per variable
# on y, coloured by `value` on a two-colour scale running from 'firebrick'
# to '#2ca02c'.
fig = px.bar(dfm, x="date", y="variable", color='value', orientation='h',
hover_data=["date"],
height=600,
color_continuous_scale=['firebrick', '#2ca02c'],
title='Data Availabiltiy Plot',
template='plotly_white',
)
# Blank both axis titles, switch off the x grid and pass an empty list of
# x tick values.
fig.update_layout(yaxis=dict(title=''), xaxis=dict(title='', showgrid=False, gridcolor='grey',
tickvals=[],
)
)
fig.show()
Try Plotly Gantt Charts
https://plotly.com/python/gantt/
The benefit of using this is that you can use actual times as values and also separate resources and tasks.