Related
So i am traying to make a cycle that gives different sankey diagram the thing is due to the plotly optimization the node are in different positions. I will like to set the standard order to be [Formal, Informal, Unemployed, Inactive]
import matplotlib.pyplot as plt
import pandas as pd
import plotly.graph_objects as go
df = pd.read_csv(path, delimiter=",")
Lista_Paises = df["code"].unique().tolist()
Lista_DF = []
for x in Lista_Paises:
DF_x = df[df["code"] == x]
Lista_DF.append(DF_x)
def grafico(df):
df = df.astype({"Source": "category", "Value": "float", "Target": "category"})
def category(i):
if i == "Formal":
return 0
if i == "Informal":
return 1
if i == "Unemployed":
return 2
if i == "Inactive":
return 3
def color(i):
if i == "Formal":
return "#9FB5D5"
if i == "Informal":
return "#E3EEF9"
if i == "Unemployed":
return "#E298AE"
if i == "Inactive":
return "#FCEFBC"
df['Source_cat'] = df["Source"].apply(category).astype("int")
df['Target_cat'] = df["Target"].apply(category).astype("int")
# df['Source_cat'] = LabelEncoder().fit_transform(df.Source)
# df['Target_cat'] = LabelEncoder().fit_transform(df.Target)
df["Color"] = df["Source"].apply(color).astype("str")
df = df.sort_values(by=["Source_cat", "Target_cat"])
Lista_Para_Sumar = df["Source_cat"].nunique()
Lista_Para_Tags = df["Source"].unique().tolist()
Suma = Lista_Para_Sumar
df["out"] = df["Target_cat"] + Suma
TAGS = Lista_Para_Tags + Lista_Para_Tags
Origen = df['Source_cat'].tolist()
Destino = df["out"].tolist()
Valor = df["Value"].tolist()
Color = df["Color"].tolist()
return (TAGS, Origen, Destino, Valor, Color)
def Sankey(TAGS: object, Origen: object, Destino: object, Valor: object, Color: object, titulo: str) -> object:
label = TAGS
source = Origen
target = Destino
value = Valor
link = dict(source=source, target=target, value=value,
color=Color)
node = dict(x=[0, 0, 0, 0, 1, 1, 1, 1], y=[1, 0.75, 0.5, 0.25, 0, 1, 0.75, 0.5, 0.25, 0], label=label, pad=35,
thickness=10,
color=["#305CA3", "#C1DAF1", "#C9304E", "#F7DC70", "#305CA3", "#C1DAF1", "#C9304E", "#F7DC70"])
data = go.Sankey(link=link, node=node, arrangement='snap')
fig = go.Figure(data)
fig.update_layout(title_text=titulo + "-" + "Mujeres", font_size=10, )
plt.plot(alpha=0.01)
titulo_guardar = (str(titulo) + ".png")
fig.write_image("/Users/agudelo/Desktop/GRAFICOS PNUD/Graficas/MUJERES/" + titulo_guardar, engine="kaleido")
for y in Lista_DF:
TAGS, Origen, Destino, Valor, Color = grafico(y)
titulo = str(y["code"].unique())
titulo = titulo.replace("[", "")
titulo = titulo.replace("]", "")
titulo = titulo.replace("'", "")
Sankey(TAGS, Origen, Destino, Valor, Color, titulo)
The expected result should be.
The expected result due to the correct order:
The real result i am getting is:
I had a similar problem earlier. I hope this will work for you. As I did not have your data, I created some dummy data. Sorry about the looooong explanation. Here are the steps that should help you reach your goal...
This is what I did:
Order the data and sort it - used pd.Categorical to set the order and then df.sort to sort the data so that the input is sorted by source and then destination.
For the sankey node, you need to set the x and y positions. x=0, y=0 starts at top left. This is important as you are telling plotly the order you want the nodes. One weird thing is that it sometimes errors if x or y is at 0 or 1. Keep it very close, but not the same number... wish I knew why
For the other x and y entries, I used ratios as my total adds up to 285. For eg. Source-Informal starts at x = 0.001 and y = 75/285 as Source-Formal = 75 and this will start right after that
Based on step 1, the link -> source and destination should also be sorted. But, pls do check.
Note: I didn't color the links, but think you already have achieved that...
Hope this helps resolve your issue...
My data - sankey.csv
source,destination,value
Formal,Formal,20
Formal,Informal, 10
Formal,Unemployed,30
Formal,Inactive,15
Informal,Formal,20
Informal,Informal,15
Informal,Unemployed,25
Informal,Inactive,25
Unemployed,Formal,5
Unemployed,Informal,10
Unemployed,Unemployed,10
Unemployed,Inactive,5
Inactive,Formal,30
Inactive,Informal,20
Inactive,Unemployed,20
Inactive,Inactive,25
The code
import plotly.graph_objects as go
import pandas as pd
df = pd.read_csv('sankey.csv') #Read above CSV
#Sort by Source and then Destination
df['source'] = pd.Categorical(df['source'], ['Formal','Informal', 'Unemployed', 'Inactive'])
df['destination'] = pd.Categorical(df['destination'], ['Formal','Informal', 'Unemployed', 'Inactive'])
df.sort_values(['source', 'destination'], inplace = True)
df.reset_index(drop=True)
mynode = dict(
pad = 15,
thickness = 20,
line = dict(color = "black", width = 0.5),
label = ['Formal', 'Informal', 'Unemployed', 'Inactive', 'Formal', 'Informal', 'Unemployed', 'Inactive'],
x = [0.001, 0.001, 0.001, 0.001, 0.999, 0.999, 0.999, 0.999],
y = [0.001, 75/285, 160/285, 190/285, 0.001, 75/285, 130/285, 215/285],
color = ["#305CA3", "#C1DAF1", "#C9304E", "#F7DC70", "#305CA3", "#C1DAF1", "#C9304E", "#F7DC70"])
mylink = dict(
source = [ 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3 ],
target = [ 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7 ],
value = df.value.to_list())
fig = go.Figure(data=[go.Sankey(
arrangement='snap',
node = mynode,
link = mylink
)])
fig.update_layout(title_text="Basic Sankey Diagram", font_size=20)
fig.show()
The output
I am using plotnine to create a dual bar + line chart (see below). I would like the two legends to appear in a single line like the R example below. Can this be done with plotnine? Sample code below:
plotnine code (what I have):
import numpy as np
import pandas as pd
from plotnine import *
from mizani.formatters import date_format
qrtly = pd.DataFrame({
'date':pd.date_range(start='1/1/2015', periods=21, freq='Q'),
'qrtly': (0.6,0.9,0.7,0.1,1.0,0.3,0.7,1.0,0.5,0.9,0.9,0.4,0.2,0.5,0.7,0.6,0.4,-0.3,-7.0,3.4,3.1)
})
qrtly = pd.melt(qrtly, id_vars=['date'], value_vars=['qrtly'])
tty = pd.DataFrame({
'date':pd.date_range(start='1/1/2015', periods=21, freq='Q'),
'tty': (2.7,2.7,3.2,2.3,2.7,2.1,2.1,3.0,2.5,3.1,3.3,2.7,2.4,1.9,1.7,1.9,2.2,1.4,-6.3,-3.7,-1.1)
})
tty = pd.melt(tty, id_vars=['date'], value_vars=['tty'])
p = (ggplot()
+ theme_light()
+ geom_bar(qrtly, aes(x='date',y='value', fill='variable'), stat='identity', position='dodge')
+ geom_line(tty, aes(x='date',y='value',color='variable'))
+ labs(x=None,y=None)
+ scale_x_datetime(breaks='1 year', labels=date_format('%Y'), expand=(0,0))
+ scale_fill_manual('#002147')
+ scale_color_manual('#800000')
+ guides(color = guide_legend(nrow = 1))
+ guides(fill = guide_legend(nrow = 1))
+ theme(
legend_direction = 'horizontal',
legend_position = 'bottom',
legend_title = element_blank(),
)
)
p
result:
R code (what I want):
library(ggplot2)
df = data.frame(
date = seq(as.Date('2015-12-1'), as.Date('2020-12-1'), by='quarter'),
qrtly = c(0.6,0.9,0.7,0.1,1.0,0.3,0.7,1.0,0.5,0.9,0.9,0.4,0.2,0.5,0.7,0.6,0.4,-0.3,-7.0,3.4,3.1),
tty = c(2.7,2.7,3.2,2.3,2.7,2.1,2.1,3.0,2.5,3.1,3.3,2.7,2.4,1.9,1.7,1.9,2.2,1.4,-6.3,-3.7,-1.1)
)
ggplot(df) +
theme_light() +
geom_bar(aes(x=date, y=qrtly, fill='quarterly'), stat='identity', position='dodge') +
geom_line(aes(x=date, y=tty, group=1, color='tty'), size=1) +
labs(x=NULL, y=NULL) +
scale_fill_manual(values=c('#002147')) +
scale_color_manual(values=c('#800000')) +
guides(color = guide_legend(nrow = 1)) +
guides(fill = guide_legend(nrow = 1)) +
theme(
legend.direction = 'horizontal',
legend.position = 'bottom',
legend.title = element_blank(),
)
result:
I just figured out this out by going into the documentation, but the setting you want is
+ theme(legend_box = 'horizontal')
You can find more information here:
https://plotnine.readthedocs.io/en/stable/generated/plotnine.themes.theme.html
I've made a function to graph economic performance, but the output is often lopsided on the y-axis.
The below graph shows the problem. The range of y values makes the chart default to the max/min as the range of the y axis.
Is there any way to force the chart to center itself on 0, or do I need derive the max and min y values within the function?
The function is below. If you'd like me to replace the variables with values to repro the chart lmk- it's a bit of a task.
def recession_comparison(key, variable, dimension):
'''
Creates the "scary chart"- proportional growth for a single area/industry. All recessions included in chart.
Parameters:
key (str or int): area-fips or industry_code
variable (str): determines what economic indicator will be used in the timeline. Must be one of ['month3_emplvl' (employment), 'avg_wkly_wage' (wages), 'qtrly_estabs_count'(firms)]
dimension (str): dimension of data to chart.
Returns:
fig (matplotlib plot)
'''
fig, ax = plt.subplots(figsize =(15, 10))
if dimension == 'area':
index = 'area_fips'
title = 'Recession Comparison, ' + area_titles[key] + " (" + str(key) + ")"
elif dimension == 'industry':
index = 'industry_code'
title = 'Recession Comparison: ' + industry_titles[key] + " (" + str(key) + ")"
for recession in recessions_int.keys():
if recession == 'full':
break
loadpath = filepath(variable = variable, dimension = dimension, charttype = 'proportional', recession = recession, filetype = 'json')
df = pd.read_json(loadpath)
df.set_index(index, inplace = True)
ax.plot(df.loc[key][1:-1]*100, label = str(recession), linewidth = 1.5, alpha = 0.8)
ax.axvline(x = 6, color = 'black', linewidth = 0.8, alpha = 0.5, ls = ':', label = 'Event Quarter')
ax.axhline(y = 0, color = 'black', linewidth = 0.8, alpha = 0.5, ls = '--', label = 'Pre-Recession baseline')
ax.set_xlabel('Quarters since start of recession')
ax.set_ylabel('Growth: ' + var_display[variable])
ax.set_title(title)
ax.yaxis.set_major_formatter(mtick.PercentFormatter())
plt.legend()
plt.show()
return fig
edit: full code solution from DapperDuck:
def recession_comparison(key, variable, dimension):
fig, ax = plt.subplots(figsize =(15, 10))
if dimension == 'area':
index = 'area_fips'
title = 'Recession Comparison, ' + area_titles[key] + " (" + str(key) + ")"
elif dimension == 'industry':
index = 'industry_code'
title = 'Recession Comparison: ' + industry_titles[key] + " (" + str(key) + ")"
for recession in recessions_int.keys():
if recession == 'full':
break
loadpath = filepath(variable = variable, dimension = dimension, charttype = 'proportional', recession = recession, filetype = 'json')
df = pd.read_json(loadpath)
df.set_index(index, inplace = True)
ax.plot(df.loc[key][1:-1]*100, label = str(recession), linewidth = 1.5, alpha = 0.8)
ax.axvline(x = 6, color = 'black', linewidth = 0.8, alpha = 0.5, ls = ':', label = 'Event Quarter')
ax.axhline(y = 0, color = 'black', linewidth = 0.8, alpha = 0.5, ls = '--', label = 'Pre-Recession baseline')
yabs_max = abs(max(ax.get_ylim(), key=abs))
ax.set_ylim(ymin=-yabs_max, ymax=yabs_max)
ax.set_xlabel('Quarters since start of recession')
ax.set_ylabel('Growth: ' + var_display[variable])
ax.set_title(title)
ax.yaxis.set_major_formatter(mtick.PercentFormatter())
plt.legend()
plt.show()
return fig
Corrected image:
Add the following code right after ax.axhline(y = 0, color = 'black', linewidth = 0.8, alpha = 0.5, ls = '--', label = 'Pre-Recession baseline'):
yabs_max = abs(max(ax.get_ylim(), key=abs))
ax.set_ylim(ymin=-yabs_max, ymax=yabs_max)
I would like to add a label to a line in plotnine. I get the following error when using geom_text:
'NoneType' object has no attribute 'copy'
Sample code below:
df = pd.DataFrame({
'date':pd.date_range(start='1/1/1996', periods=4*25, freq='Q'),
'small': pd.Series([0.035]).repeat(4*25) ,
'large': pd.Series([0.09]).repeat(4*25),
})
fig1 = (ggplot()
+ geom_step(df, aes(x='date', y='small'))
+ geom_step(df, aes(x='date', y='large'))
+ scale_x_datetime(labels=date_format('%Y'))
+ scale_y_continuous(labels=lambda l: ["%d%%" % (v * 100) for v in l])
+ labs(x=None, y=None)
+ geom_text(aes(x=pd.Timestamp('2000-01-01'), y = 0.0275, label = 'small'))
)
print(fig1)
Edit:
has2k1's answer below solves the error, but I get:
I want this: (from R)
R code:
ggplot() +
geom_step(data=df, aes(x=date, y=small), color='#117DCF', size=0.75) +
geom_step(data=df, aes(x=date, y=large), color='#FF7605', size=0.75) +
scale_y_continuous(labels = scales::percent, expand = expand_scale(), limits = c(0,0.125)) +
labs(x=NULL, y=NULL) +
geom_text(aes(x = as.Date('1996-01-07'), y = 0.0275, label = 'small'), color = '#117DCF', size=5)
Any documentation beyond https://plotnine.readthedocs.io/en/stable/index.html? I have read the geom_text there and still can't produce what I need...
geom_text has no dataframe. If you want to print the text put it in quotes i.e. '"small"' or put the label mapping outside aes(), but it makes more sense to use annotate.
(ggplot(df)
...
# + geom_text(aes(x=pd.Timestamp('2000-01-01'), y = 0.0275, label = '"small"'))
# + geom_text(aes(x=pd.Timestamp('2000-01-01'), y = 0.0275), label = 'small')
+ annotate('text', x=pd.Timestamp('2000-01-01'), y = 0.0275, label='small')
)
I am getting a very strange error using basemap. No error appears, yet my 3rd plot has no data plotted when data does indeed exist. Below is my code. When run, you will see that both modis and seawifs data is plotted, but viirs is not. I can't figure out why.
import numpy as np
import urllib
import urllib2
import netCDF4
import matplotlib.pyplot as plt
from mpl_toolkits.basemap import Basemap
from datetime import datetime, date, time, timedelta
import json
import math
def indexsearch(datebroken,year, month, day):
for i in range(0,len(datebroken)):
if (datebroken[i,0] == year and datebroken[i,1] == month and datebroken[i,2] == day):
return i
url = 'http://coastwatch.pfeg.noaa.gov/erddap/griddap/erdMWchlamday.nc?chlorophyll' +\
'[(2002-07-16T12:00:00Z):1:(2015-04-16T00:00:00Z)][(0.0):1:(0.0)][(36):1:(39)][(235):1:(240)]'
file = 'erdMWchlamday.nc'
urllib.urlretrieve(url, file)
ncfilemod = netCDF4.Dataset(file)
ncv1 = ncfilemod.variables
print ncv1.keys()
time1=ncv1['time'][:]
inceptiondate = datetime(1970, 1, 1, 0, 0, 0)
timenew1=[]
for i in time1[:]:
newdate = inceptiondate + timedelta(seconds=i)
timenew1.append(newdate.strftime('%Y%m%d%H'))
datebroken1 = np.zeros((len(timenew1),4),dtype=int)
for i in range(0,len(timenew1)):
datebroken1[i,0] = int(timenew1[i][0:4])
datebroken1[i,1] = int(timenew1[i][4:6])
datebroken1[i,2] = int(timenew1[i][6:8])
datebroken1[i,3] = int(timenew1[i][8:10])
lon1= ncv1['longitude'][:]
lat1 = ncv1['latitude'][:]
lons1, lats1 = np.meshgrid(lon1,lat1)
chla1 = ncv1['chlorophyll'][:,0,:,:]
url = 'http://coastwatch.pfeg.noaa.gov/erddap/griddap/erdSWchlamday.nc?chlorophyll' +\
'[(1997-09-16):1:(2010-12-16T12:00:00Z)][(0.0):1:(0.0)][(36):1:(39)][(235):1:(240)]'
file = 'erdSWchlamday.nc'
urllib.urlretrieve(url, file)
#Ncfile 2
ncfilewif = netCDF4.Dataset(file)
ncv2 = ncfilewif.variables
print ncv2.keys()
time2=ncv2['time'][:]
inceptiondate = datetime(1970, 1, 1, 0, 0, 0)
timenew2=[]
for i in time2[:]:
newdate = inceptiondate + timedelta(seconds=i)
timenew2.append(newdate.strftime('%Y%m%d%H'))
datebroken2 = np.zeros((len(timenew2),4),dtype=int)
for i in range(0,len(timenew2)):
datebroken2[i,0] = int(timenew2[i][0:4])
datebroken2[i,1] = int(timenew2[i][4:6])
datebroken2[i,2] = int(timenew2[i][6:8])
datebroken2[i,3] = int(timenew2[i][8:10])
lon2= ncv2['longitude'][:]
lat2 = ncv2['latitude'][:]
lons2, lats2 = np.meshgrid(lon2,lat2)
chla2 = ncv2['chlorophyll'][:,0,:,:]
url = 'http://coastwatch.pfeg.noaa.gov/erddap/griddap/erdVH2chlamday.nc?chla' +\
'[(2012-01-15):1:(2015-05-15T00:00:00Z)][(39):1:(36)][(-125):1:(-120)]'
file = 'erdVH2chlamday.nc'
urllib.urlretrieve(url, file)
ncfileviir = netCDF4.Dataset(file)
ncv3 = ncfileviir.variables
print ncv3.keys()
time3=ncv3['time'][:]
inceptiondate = datetime(1970, 1, 1, 0, 0, 0)
timenew3=[]
for i in time3[:]:
newdate = inceptiondate + timedelta(seconds=i)
timenew3.append(newdate.strftime('%Y%m%d%H'))
datebroken3 = np.zeros((len(timenew3),4),dtype=int)
for i in range(0,len(timenew3)):
datebroken3[i,0] = int(timenew3[i][0:4])
datebroken3[i,1] = int(timenew3[i][4:6])
datebroken3[i,2] = int(timenew3[i][6:8])
datebroken3[i,3] = int(timenew3[i][8:10])
lon3= ncv3['longitude'][:]
lat3 = ncv3['latitude'][:]
lons3, lats3 = np.meshgrid(lon3,lat3)
chla3 = ncv3['chla'][:,:,:]
i1=indexsearch(datebroken1,2012,6,16)
print i1
i2=indexsearch(datebroken2,2010,6,16)
print i2
i3=indexsearch(datebroken3,2012,6,15)
print i3
chla1plot = chla1[i1,:,:]
chla2plot = chla2[i2,:,:]
chla3plot = chla3[i3,:,:]
ncfileviir.close()
ncfilemod.close()
ncfilewif.close()
Important code is below here. All code above is just pulling the data into python to plot.
minlat = 36
maxlat = 39
minlon = 235
maxlon = 240
# Create map
fig = plt.figure()
#####################################################################################################################
#plot figure 1
ax1 = fig.add_subplot(221)
m = Basemap(projection='merc', llcrnrlat=minlat,urcrnrlat=maxlat,llcrnrlon=minlon, urcrnrlon=maxlon,resolution='h')
cs1 = m.pcolormesh(lons1,lats1,chla1plot,cmap=plt.cm.jet,latlon=True)
m.drawcoastlines()
m.drawmapboundary()
m.fillcontinents()
m.drawcountries()
m.drawstates()
m.drawrivers()
#Sets up parallels and meridians.
parallels = np.arange(36.,39,1.)
# labels = [left,right,top,bottom]
m.drawparallels(parallels,labels=[False,True,True,False])
meridians = np.arange(235.,240.,1.)
m.drawmeridians(meridians,labels=[True,False,False,True])
ax1.set_title('Modis')
#####################################################################################################################
#plot figure 2
ax2 = fig.add_subplot(222)
cs2 = m.pcolormesh(lons2,lats2,chla2plot,cmap=plt.cm.jet,latlon=True)
m.drawcoastlines()
m.drawmapboundary()
m.fillcontinents()
m.drawcountries()
m.drawstates()
m.drawrivers()
#Sets up parallels and meridians.
parallels = np.arange(36.,39,1.)
# labels = [left,right,top,bottom]
m.drawparallels(parallels,labels=[False,True,True,False])
meridians = np.arange(235.,240.,1.)
m.drawmeridians(meridians,labels=[True,False,False,True])
ax2.set_title('SeaWIFS')
#####################################################################################################################
#plot figure 3
ax3 = fig.add_subplot(223)
cs3 = m.pcolormesh(lons3,np.flipud(lats3),np.flipud(chla3plot),cmap=plt.cm.jet,latlon=True)
m.drawcoastlines()
m.drawmapboundary()
m.fillcontinents()
m.drawcountries()
m.drawstates()
m.drawrivers()
#Sets up parallels and meridians.
parallels = np.arange(36.,39,1.)
# labels = [left,right,top,bottom]
m.drawparallels(parallels,labels=[False,True,True,False])
meridians = np.arange(235.,240.,1.)
m.drawmeridians(meridians,labels=[True,False,False,True])
ax3.set_title('VIIRS')
# Save figure (without 'white' borders)
#plt.savefig('SSTtest.png', bbox_inches='tight')
plt.show()
My results are shown here!
![results]: http://i.stack.imgur.com/dRjkU.png
The issue that I found was that I had
minlat = 36
maxlat = 39
minlon = 235
maxlon = 240
m = Basemap(projection='merc', llcrnrlat=minlat,urcrnrlat=maxlat,llcrnrlon=minlon, urcrnrlon=maxlon,resolution='h')
The final plot was -125 to -120 which basemap did not automatically handle, but instead placed the plot at an area where I did not have data. I added a new m = basemap statement and changed the meridian numbers for the third graph using -125 to -120 as my longitude and the graph plotted just fine.