Plotnine: How to create two legends on a single line - python

I am using plotnine to create a dual bar + line chart (see below). I would like the two legends to appear on a single line, as in the R example below. Can this be done with plotnine? Sample code below:
plotnine code (what I have):
import numpy as np
import pandas as pd
from plotnine import *
from mizani.formatters import date_format
qrtly = pd.DataFrame({
    'date': pd.date_range(start='1/1/2015', periods=21, freq='Q'),
    'qrtly': (0.6, 0.9, 0.7, 0.1, 1.0, 0.3, 0.7, 1.0, 0.5, 0.9, 0.9, 0.4, 0.2, 0.5, 0.7, 0.6, 0.4, -0.3, -7.0, 3.4, 3.1)
})
qrtly = pd.melt(qrtly, id_vars=['date'], value_vars=['qrtly'])

tty = pd.DataFrame({
    'date': pd.date_range(start='1/1/2015', periods=21, freq='Q'),
    'tty': (2.7, 2.7, 3.2, 2.3, 2.7, 2.1, 2.1, 3.0, 2.5, 3.1, 3.3, 2.7, 2.4, 1.9, 1.7, 1.9, 2.2, 1.4, -6.3, -3.7, -1.1)
})
tty = pd.melt(tty, id_vars=['date'], value_vars=['tty'])

p = (ggplot()
     + theme_light()
     + geom_bar(qrtly, aes(x='date', y='value', fill='variable'), stat='identity', position='dodge')
     + geom_line(tty, aes(x='date', y='value', color='variable'))
     + labs(x=None, y=None)
     + scale_x_datetime(breaks='1 year', labels=date_format('%Y'), expand=(0, 0))
     + scale_fill_manual(values=['#002147'])
     + scale_color_manual(values=['#800000'])
     + guides(color=guide_legend(nrow=1))
     + guides(fill=guide_legend(nrow=1))
     + theme(
         legend_direction='horizontal',
         legend_position='bottom',
         legend_title=element_blank(),
     )
)
p
result: [plot: the two legends appear stacked on separate lines]
R code (what I want):
library(ggplot2)
df = data.frame(
  date = seq(as.Date('2015-12-1'), as.Date('2020-12-1'), by='quarter'),
  qrtly = c(0.6,0.9,0.7,0.1,1.0,0.3,0.7,1.0,0.5,0.9,0.9,0.4,0.2,0.5,0.7,0.6,0.4,-0.3,-7.0,3.4,3.1),
  tty = c(2.7,2.7,3.2,2.3,2.7,2.1,2.1,3.0,2.5,3.1,3.3,2.7,2.4,1.9,1.7,1.9,2.2,1.4,-6.3,-3.7,-1.1)
)

ggplot(df) +
  theme_light() +
  geom_bar(aes(x=date, y=qrtly, fill='quarterly'), stat='identity', position='dodge') +
  geom_line(aes(x=date, y=tty, group=1, color='tty'), size=1) +
  labs(x=NULL, y=NULL) +
  scale_fill_manual(values=c('#002147')) +
  scale_color_manual(values=c('#800000')) +
  guides(color = guide_legend(nrow = 1)) +
  guides(fill = guide_legend(nrow = 1)) +
  theme(
    legend.direction = 'horizontal',
    legend.position = 'bottom',
    legend.title = element_blank(),
  )
result: [plot: both legends appear side by side on a single line]

I just figured this out by going through the documentation; the setting you want is
+ theme(legend_box = 'horizontal')
You can find more information here:
https://plotnine.readthedocs.io/en/stable/generated/plotnine.themes.theme.html
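For reference, a minimal sketch of that fix applied to the plot above (reusing the qrtly and tty frames from the question; legend_box controls how multiple legends are arranged relative to each other):

p = (ggplot()
     + geom_bar(qrtly, aes(x='date', y='value', fill='variable'), stat='identity', position='dodge')
     + geom_line(tty, aes(x='date', y='value', color='variable'))
     + theme(
         legend_direction='horizontal',
         legend_position='bottom',
         legend_box='horizontal',  # lay the fill and color legends out side by side
         legend_title=element_blank(),
     ))
p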

Related

Labeling confidence interval and coefficient using ggplot in pandas

I tried to label the coefficient and confidence interval using the following code:
pp = (p.ggplot(leadslags_plot,
               p.aes(x='label', y='mean', ymin='lb', ymax='ub'))
      + p.geom_line(p.aes(group=1), color="b")
      + p.geom_pointrange(color="b", size=0.5)
      + p.geom_errorbar(color="r", width=0.2)
      + p.scale_color_manual(name="label:", values=['b', 'r'], labels=["coeff", "95 percent CI"])
      + p.theme("bottom")
      + p.xlab("Years before and after")
      + p.ylab("value")
      + p.geom_hline(yintercept=0, linetype="dashed")
      + p.geom_vline(xintercept=0, linetype="dashed"))
The code generates the plot but does not label 'coeff' and 'CI'. How can I label them?
The issue is that to get a legend you have to map on aesthetics. In ggplot2 (the R one) this could easily be achieved by moving color="b" inside aes(), which however does not work in plotnine. Maybe there is a more Pythonic way around this issue, but one option is to add two helper columns to your dataset, which can then be mapped on the color aesthetic:
import pandas as pd
import plotnine as p
leadslags_plot = [[-2, 1, 0, 2], [0, 2, 1, 3], [2, 3, 2, 4]]
leadslags_plot = pd.DataFrame(leadslags_plot, columns=['label', 'mean', 'lb', 'ub'])
leadslags_plot["b"] = "b"
leadslags_plot["r"] = "r"
(p.ggplot(leadslags_plot,
          p.aes(x='label', y='mean', ymin='lb', ymax='ub'))
 + p.geom_line(p.aes(group=1), color="b")
 + p.geom_pointrange(p.aes(color="b"), size=0.5)
 + p.geom_errorbar(p.aes(color="r"), width=0.2)
 + p.scale_color_manual(name="label:", values=['b', 'r'], labels=["coeff", "95 percent CI"])
 + p.theme(subplots_adjust={'right': 0.8})  # make room for the legend
 + p.xlab("Years before and after")
 + p.ylab("value")
 + p.geom_hline(yintercept=0, linetype="dashed")
 + p.geom_vline(xintercept=0, linetype="dashed"))
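As an aside (an alternative, not from the answer above): plotnine evaluates aesthetic strings as expressions, so a quoted string literal inside aes() also gives a constant mapping without helper columns. A sketch, assuming the same leadslags_plot frame; the literals become the legend keys, so the manual scale can map them by name:

(p.ggplot(leadslags_plot, p.aes(x='label', y='mean', ymin='lb', ymax='ub'))
 + p.geom_line(p.aes(group=1), color="b")
 + p.geom_pointrange(p.aes(color='"coeff"'), size=0.5)
 + p.geom_errorbar(p.aes(color='"95 percent CI"'), width=0.2)
 + p.scale_color_manual(name="label:", values={'coeff': 'b', '95 percent CI': 'r'}))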

VPN Indicator ThinkScript to Python

Taking a stab at converting a ThinkScript to Python for the first time, and I think my logic is right, but I am missing something as the two plots for the indicator don't match.
Trying to convert the ThinkScript for the VPNIndicator to a Python implementation. Looking for someone knowledgeable in both languages to contribute here.
To start, the indicator plot in ThinkorSwim looks like this (bottom panel): [ThinkorSwim chart not shown]
So I'm trying to replicate that plot using mplfinance, but first I need to translate the ThinkScript to Python, which I've attempted here:
import mplfinance as mpf
import pandas as pd
import numpy as np
import talib
def VPN_Indicator(df, params):
    # def atr = WildersAverage(TrueRange(high, close, low), length);
    df['H-L'] = df['High'] - df['Low']
    df['H-C1'] = df['High'] - df['Close'].shift()
    df['C1-L'] = df['Close'].shift() - df['Low']
    df['TrueRange'] = df[['H-L', 'H-C1', 'C1-L']].max(axis=1)
    df['WildersATR'] = df['TrueRange'].ewm(alpha=1.0 / params['length'], adjust=False).mean()

    # def diff = hlc3 - hlc3[1];
    df['Diff'] = ((df['High'] + df['Low'] + df['Close']) / 3) \
        - ((df['High'].shift() + df['Low'].shift() + df['Close'].shift()) / 3)  # Forward peak here?

    # def vp = Sum(if diff > factor * atr then volume else 0, length);
    df['VP_Helper'] = np.where(df['Diff'] > params['factor'] * df['WildersATR'], df['Volume'], 0)
    df['VP'] = df['VP_Helper'].rolling(params['length']).sum()

    # def vn = Sum(if diff < -factor * atr then volume else 0, length);
    df['VN_Helper'] = np.where(df['Diff'] < -params['factor'] * df['WildersATR'], df['Volume'], 0)
    df['VN'] = df['VN_Helper'].rolling(params['length']).sum()

    # plot VPN = ExpAverage(100 * (vp - vn) / Sum(volume, length), emaLength);
    df['RollingVol'] = df['Volume'].rolling(params['length']).sum()
    df['VPN'] = talib.EMA(100 * (df['VP'] - df['VN']) / df['RollingVol'], timeperiod=params['emaLength'])

    # plot VPNAvg = MovingAverage(averageType, VPN, averageLength);
    if params['averageType'] in ['simple', 'sma', 'SMA', 'SIMPLE']:  # only 'simple' is handled here
        df['VPNAvg'] = talib.SMA(df['VPN'], timeperiod=params['averageLength'])

    # plot CriticalLevel = criticalValue;
    df['CriticalLevel'] = params['criticalValue']

    # VPN.DefineColor("Above", Color.UPTICK);
    # VPN.DefineColor("Below", Color.DOWNTICK);
    # VPN.AssignValueColor(if VPN > CriticalLevel then VPN.Color("Above") else VPN.Color("Below"));
    # VPNAvg.SetDefaultColor(GetColor(7));
    # CriticalLevel.SetDefaultColor(GetColor(1));
    # Cosmetic color settings; not needed for the calculation for now.
    return df
params = {
    "length": 30,
    "emaLength": 3,
    "averageLength": 30,
    "factor": 0.1,
    "criticalValue": 10,
    "averageType": "simple"
}

# Import a 1min dataset and rename columns as necessary
df = pd.read_csv("SPY.csv").iloc[-2000:, :]
df['time'] = pd.to_datetime(df['time'])
df = df.set_index('time')
df = df.rename(columns={'open': 'Open', 'high': 'High', 'low': 'Low', 'close': 'Close', 'volume': 'Volume'})
df = VPN_Indicator(df, params)

# Plot the results
apds = [
    mpf.make_addplot(df['CriticalLevel'], panel=2, color='g'),
    mpf.make_addplot(df['VPN'], panel=2, color='g'),
    mpf.make_addplot(df['VPNAvg'], panel=2, color='g'),
]
mpf.plot(df[['Open', 'High', 'Low', 'Close', 'Volume']], addplot=apds, figscale=1.2, volume=True)
... which results in a plot that looks like this: [mplfinance chart not shown]
... which is close, but the peaks don't line up with the ThinkorSwim plot. Can someone who knows both languages tell me where I might be off? Thanks!
Try using this to calculate ATR. This gives the same output as TOS.
import numpy as np

def ema(arr, periods=14, weight=1, init=None):
    leading_na = np.where(~np.isnan(arr))[0][0]
    arr = arr[leading_na:]
    alpha = weight / (periods + (weight - 1))
    alpha_rev = 1 - alpha
    n = arr.shape[0]
    pows = alpha_rev ** (np.arange(n + 1))
    out1 = np.array([])
    if 0 in pows:
        # Split the series in two to avoid numerical underflow in the powers.
        out1 = ema(arr[:int(len(arr) / 2)], periods)
        arr = arr[int(len(arr) / 2) - 1:]
        init = out1[-1]
        n = arr.shape[0]
        pows = alpha_rev ** (np.arange(n + 1))
    scale_arr = 1 / pows[:-1]
    if init:
        offset = init * pows[1:]
    else:
        offset = arr[0] * pows[1:]
    pw0 = alpha * alpha_rev ** (n - 1)
    mult = arr * pw0 * scale_arr
    cumsums = mult.cumsum()
    out = offset + cumsums * scale_arr[::-1]
    out = out[1:] if len(out1) > 0 else out
    out = np.concatenate([out1, out])
    out[:periods] = np.nan
    out = np.concatenate(([np.nan] * leading_na, out))
    return out

def atr(highs, lows, closes, periods=14, ema_weight=1):
    hi = np.array(highs)
    lo = np.array(lows)
    c = np.array(closes)
    tr = np.vstack([np.abs(hi[1:] - c[:-1]),
                    np.abs(lo[1:] - c[:-1]),
                    (hi - lo)[1:]]).max(axis=0)
    atr = ema(tr, periods=periods, weight=ema_weight)
    atr = np.concatenate([[np.nan], atr])  # re-align with the input length
    return atr
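A sketch of how this might plug into the question's VPN_Indicator (an assumption on my part, not stated in the answer): with weight=1 the ema above reproduces Wilders smoothing (alpha = 1/length), so it can stand in for the ewm()-based ATR:

# Assumes df and params as defined in the question.
df['WildersATR'] = atr(df['High'].values, df['Low'].values, df['Close'].values,
                       periods=params['length'], ema_weight=1)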

Add text to figure using python's plotnine

I would like to add a label to a line in plotnine. I get the following error when using geom_text:
'NoneType' object has no attribute 'copy'
Sample code below:
import pandas as pd
from plotnine import *
from mizani.formatters import date_format

df = pd.DataFrame({
    'date': pd.date_range(start='1/1/1996', periods=4 * 25, freq='Q'),
    'small': pd.Series([0.035]).repeat(4 * 25),
    'large': pd.Series([0.09]).repeat(4 * 25),
})
fig1 = (ggplot()
        + geom_step(df, aes(x='date', y='small'))
        + geom_step(df, aes(x='date', y='large'))
        + scale_x_datetime(labels=date_format('%Y'))
        + scale_y_continuous(labels=lambda l: ["%d%%" % (v * 100) for v in l])
        + labs(x=None, y=None)
        + geom_text(aes(x=pd.Timestamp('2000-01-01'), y=0.0275, label='small'))
)
print(fig1)
Edit:
has2k1's answer below solves the error, but the label still doesn't look like what I'm after. I want the output of this R code (a colored label sitting next to the line):
ggplot() +
  geom_step(data=df, aes(x=date, y=small), color='#117DCF', size=0.75) +
  geom_step(data=df, aes(x=date, y=large), color='#FF7605', size=0.75) +
  scale_y_continuous(labels = scales::percent, expand = expand_scale(), limits = c(0, 0.125)) +
  labs(x=NULL, y=NULL) +
  geom_text(aes(x = as.Date('1996-01-07'), y = 0.0275, label = 'small'), color = '#117DCF', size = 5)
Is there any documentation beyond https://plotnine.readthedocs.io/en/stable/index.html? I have read the geom_text documentation there and still can't produce what I need...
Your geom_text has no dataframe, so the mapping label='small' has nothing to be evaluated against (hence the 'NoneType' error). If you want to print literal text, put it in quotes, i.e. '"small"', or put the label mapping outside aes(), but it makes more sense to use annotate:
(ggplot(df)
 ...
 # + geom_text(aes(x=pd.Timestamp('2000-01-01'), y=0.0275, label='"small"'))
 # + geom_text(aes(x=pd.Timestamp('2000-01-01'), y=0.0275), label='small')
 + annotate('text', x=pd.Timestamp('2000-01-01'), y=0.0275, label='small')
)
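To approximate the R styling from the edit, note that annotate passes extra parameters through to the underlying geom. A sketch under that assumption (colors taken from the R code; plotnine sizes are in points rather than R's mm, so the exact size value is a guess):

fig1 = (ggplot()
        + geom_step(df, aes(x='date', y='small'), color='#117DCF', size=0.75)
        + geom_step(df, aes(x='date', y='large'), color='#FF7605', size=0.75)
        + labs(x=None, y=None)
        + annotate('text', x=pd.Timestamp('1996-01-07'), y=0.0275,
                   label='small', color='#117DCF', size=11))  # size tuned by eye
print(fig1)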

Is there a function to add WOE, calculated on Training data, to the whole data set? (python)

I am working on some Python code to predict the default rate of loans handed out by a bank.
I have calculated the WOE and information value (IV) on the training set
(using the code from https://github.com/Sundar0989/WOE-and-IV/blob/master/WOE_IV.ipynb).
We also have 2 high-cardinality variables. We don't know, however, how to add these WOE scores to the whole data set. How do we tackle this problem, and how can we go further and use WOE to predict the target variable?
code:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy, pylab
Reading the data received from the bank and doing feature selection, part 1: the whole set (Training) is split 70/30 into training (indices_train) and test (indices_test) indices, and the training indices are split 70/30 again into the actual training set (indices_traintrain) and a validation set (indices_val).
Training = pd.read_excel('/Users/enjo/Documents/Master/DM/Data_DSC2019_STUDENTS/DSC2019_Training.xlsx',
                         na_values=np.nan)

Status = Training.iloc[:, -1]
Data = Training.iloc[:, 0:45]

Data_missing = Data.isna()
Data_missing = Data_missing.sum()
print(Data_missing / len(Data))

# drop variables with more than 80% missing
Drop = ['FREE_CASH_FLOW_AMT',
        'A2_MTHS_FIRST_PCX_COREPROF_CNT', 'A2_MONTHS_IN_BELGIUM_CNT',
        'A2_MTHS_SNC_FIRST_COREPROF_CNT', 'MONTHS_SINCE_LAST_REFUSAL_CNT']
DroppedTraining = Training.copy()
for element in Drop:
    DroppedTraining.drop(element, axis=1, inplace=True)

import numpy as np
from sklearn import datasets
from sklearn import svm
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

Data_preprocessed = []  # contains preprocessed data
from Preprocessing_continuous import Preprocessing_continuous  # import function for preprocessing
from Preprocessing_discrete import Preprocessing_discrete  # import function for preprocessing

indices = np.arange(26962)
indices_train, indices_test = train_test_split(indices, test_size=0.3, random_state=0)
indices_traintrain, indices_val = train_test_split(indices_train, test_size=0.3, random_state=0)

Training['target'] = Training['Label_Default'].apply(lambda x: 1 if x == 'Y' else 0)

Highcardinalityset = Training[['Type',
                               'INDUSTRY_CD_3',
                               'INDUSTRY_CD_4',
                               'Managing_Sales_Office_Nbr',
                               'Postal_Code_L',
                               'Product_Desc',
                               'CREDIT_TYPE_CD',
                               'ACCOUNT_PURPOSE_CD',
                               'A2_MARITAL_STATUS_CD',
                               'FINANCIAL_PRODUCT_TYPE_CD',
                               'A2_EMPLOYMENT_STATUS_CD',
                               'A2_RESIDENT_STATUS_CD',
                               'target']]
Highcardinalityset = Highcardinalityset.iloc[indices_traintrain]
Function found on GitHub:
import pandas as pd
import numpy as np
import pandas.core.algorithms as algos
from pandas import Series
import scipy.stats.stats as stats
import re
import traceback
import string

max_bin = 20
force_bin = 3

# define a binning function
def mono_bin(Y, X, n=max_bin):
    df1 = pd.DataFrame({"X": X, "Y": Y})
    justmiss = df1[['X', 'Y']][df1.X.isnull()]
    notmiss = df1[['X', 'Y']][df1.X.notnull()]
    r = 0
    while np.abs(r) < 1:
        try:
            d1 = pd.DataFrame({"X": notmiss.X, "Y": notmiss.Y, "Bucket": pd.qcut(notmiss.X, n)})
            d2 = d1.groupby('Bucket', as_index=True)
            r, p = stats.spearmanr(d2.mean().X, d2.mean().Y)
            n = n - 1
        except Exception as e:
            n = n - 1

    if len(d2) == 1:
        n = force_bin
        bins = algos.quantile(notmiss.X, np.linspace(0, 1, n))
        if len(np.unique(bins)) == 2:
            bins = np.insert(bins, 0, 1)
            bins[1] = bins[1] - (bins[1] / 2)
        d1 = pd.DataFrame({"X": notmiss.X, "Y": notmiss.Y,
                           "Bucket": pd.cut(notmiss.X, np.unique(bins), include_lowest=True)})
        d2 = d1.groupby('Bucket', as_index=True)

    d3 = pd.DataFrame({}, index=[])
    d3["MIN_VALUE"] = d2.min().X
    d3["MAX_VALUE"] = d2.max().X
    d3["COUNT"] = d2.count().Y
    d3["EVENT"] = d2.sum().Y
    d3["NONEVENT"] = d2.count().Y - d2.sum().Y
    d3 = d3.reset_index(drop=True)

    if len(justmiss.index) > 0:
        d4 = pd.DataFrame({'MIN_VALUE': np.nan}, index=[0])
        d4["MAX_VALUE"] = np.nan
        d4["COUNT"] = justmiss.count().Y
        d4["EVENT"] = justmiss.sum().Y
        d4["NONEVENT"] = justmiss.count().Y - justmiss.sum().Y
        d3 = d3.append(d4, ignore_index=True)

    d3["EVENT_RATE"] = d3.EVENT / d3.COUNT
    d3["NON_EVENT_RATE"] = d3.NONEVENT / d3.COUNT
    d3["DIST_EVENT"] = d3.EVENT / d3.sum().EVENT
    d3["DIST_NON_EVENT"] = d3.NONEVENT / d3.sum().NONEVENT
    d3["WOE"] = np.log(d3.DIST_EVENT / d3.DIST_NON_EVENT)
    d3["IV"] = (d3.DIST_EVENT - d3.DIST_NON_EVENT) * np.log(d3.DIST_EVENT / d3.DIST_NON_EVENT)
    d3["VAR_NAME"] = "VAR"
    d3 = d3[['VAR_NAME', 'MIN_VALUE', 'MAX_VALUE', 'COUNT', 'EVENT', 'EVENT_RATE',
             'NONEVENT', 'NON_EVENT_RATE', 'DIST_EVENT', 'DIST_NON_EVENT', 'WOE', 'IV']]
    d3 = d3.replace([np.inf, -np.inf], 0)
    d3.IV = d3.IV.sum()
    return d3
def char_bin(Y, X):
    df1 = pd.DataFrame({"X": X, "Y": Y})
    justmiss = df1[['X', 'Y']][df1.X.isnull()]
    notmiss = df1[['X', 'Y']][df1.X.notnull()]
    df2 = notmiss.groupby('X', as_index=True)

    d3 = pd.DataFrame({}, index=[])
    d3["COUNT"] = df2.count().Y
    d3["MIN_VALUE"] = df2.sum().Y.index
    d3["MAX_VALUE"] = d3["MIN_VALUE"]
    d3["EVENT"] = df2.sum().Y
    d3["NONEVENT"] = df2.count().Y - df2.sum().Y

    if len(justmiss.index) > 0:
        d4 = pd.DataFrame({'MIN_VALUE': np.nan}, index=[0])
        d4["MAX_VALUE"] = np.nan
        d4["COUNT"] = justmiss.count().Y
        d4["EVENT"] = justmiss.sum().Y
        d4["NONEVENT"] = justmiss.count().Y - justmiss.sum().Y
        d3 = d3.append(d4, ignore_index=True)

    d3["EVENT_RATE"] = d3.EVENT / d3.COUNT
    d3["NON_EVENT_RATE"] = d3.NONEVENT / d3.COUNT
    d3["DIST_EVENT"] = d3.EVENT / d3.sum().EVENT
    d3["DIST_NON_EVENT"] = d3.NONEVENT / d3.sum().NONEVENT
    d3["WOE"] = np.log(d3.DIST_EVENT / d3.DIST_NON_EVENT)
    d3["IV"] = (d3.DIST_EVENT - d3.DIST_NON_EVENT) * np.log(d3.DIST_EVENT / d3.DIST_NON_EVENT)
    d3["VAR_NAME"] = "VAR"
    d3 = d3[['VAR_NAME', 'MIN_VALUE', 'MAX_VALUE', 'COUNT', 'EVENT', 'EVENT_RATE',
             'NONEVENT', 'NON_EVENT_RATE', 'DIST_EVENT', 'DIST_NON_EVENT', 'WOE', 'IV']]
    d3 = d3.replace([np.inf, -np.inf], 0)
    d3.IV = d3.IV.sum()
    d3 = d3.reset_index(drop=True)
    return d3
def data_vars(df1, target):
    stack = traceback.extract_stack()
    filename, lineno, function_name, code = stack[-2]
    vars_name = re.compile(r'\((.*?)\).*$').search(code).groups()[0]
    final = (re.findall(r"[\w']+", vars_name))[-1]

    x = df1.dtypes.index
    count = -1
    for i in x:
        if i.upper() not in (final.upper()):
            if np.issubdtype(df1[i], np.number) and len(Series.unique(df1[i])) > 2:
                conv = mono_bin(target, df1[i])
                conv["VAR_NAME"] = i
                count = count + 1
            else:
                conv = char_bin(target, df1[i])
                conv["VAR_NAME"] = i
                count = count + 1
            if count == 0:
                iv_df = conv
            else:
                iv_df = iv_df.append(conv, ignore_index=True)

    iv = pd.DataFrame({'IV': iv_df.groupby('VAR_NAME').IV.max()})
    iv = iv.reset_index()
    return iv_df, iv
final_iv, IV = data_vars(Highcardinalityset, Highcardinalityset.target)
final_iv
IV.sort_values('IV')
IV.to_csv('test.csv')

transform_vars_list = Highcardinalityset.columns.difference(['target'])
transform_prefix = 'new_'  # leave this value blank if you need to replace the original column values
transform_vars_list

for var in transform_vars_list:
    small_df = final_iv[final_iv['VAR_NAME'] == var]
    transform_dict = dict(zip(small_df.MAX_VALUE.astype(str), small_df.WOE.astype(str)))
    replace_cmd = ''
    replace_cmd1 = ''
    for i in sorted(transform_dict.items()):
        replace_cmd = replace_cmd + str(i[1]) + str(' if x <= ') + str(i[0]) + ' else '
        replace_cmd1 = replace_cmd1 + str(i[1]) + str(' if x == "') + str(i[0]) + '" else '
    replace_cmd = replace_cmd + '0'
    replace_cmd1 = replace_cmd1 + '0'
    if replace_cmd != '0':
        try:
            Highcardinalityset[transform_prefix + var] = Highcardinalityset[var].apply(lambda x: eval(replace_cmd))
        except:
            Highcardinalityset[transform_prefix + var] = Highcardinalityset[var].apply(lambda x: eval(replace_cmd1))

Highcardinalityset['Postal_Code_L'].value_counts()
Highcardinalityset['new_Postal_Code_L'].value_counts()
Highcardinalityset['Managing_Sales_Office_Nbr'].value_counts()
Highcardinalityset['new_Managing_Sales_Office_Nbr'].value_counts()
Nice to see when the WOE is high: a high WOE for a postal code signals a high risk of default!
Highcardinalityset.to_excel("Highcardinalitysettraintrain.xlsx")

TrainingWOE = DroppedTraining[['Managing_Sales_Office_Nbr', "Postal_Code_L"]]
TrainingWOE["Postal_Code_L_WOE"] = Highcardinalityset[["new_Postal_Code_L"]]
TrainingWOE["Managing_Sales_Office_Nbr_WOE"] = Highcardinalityset[["new_Managing_Sales_Office_Nbr"]]
Drop variables that are not relevant because of their low IV value:
Drop = ["ACCOUNT_PURPOSE_CD", "A2_MARITAL_STATUS_CD", "A2_EMPLOYMENT_STATUS_CD", "A2_RESIDENT_STATUS_CD",
        "INDUSTRY_CD_3", "INDUSTRY_CD_4", "Type"]
DroppedTrainingAfterIVcalc = DroppedTraining.copy()
for element in Drop:
    DroppedTrainingAfterIVcalc.drop(element, axis=1, inplace=True)
Then preprocess the remaining variables: 44 - 5 (dropped for too many missing values) - 7 (dropped for low IV) + 1 (the added target variable).
Thanks for asking this question. Here is the code to do the required transformation, which is also shown in the notebook.
transform_vars_list = df.columns.difference(['target'])
transform_prefix = 'new_'  # leave this value blank to replace the original column

# apply transformations
for var in transform_vars_list:
    small_df = final_iv[final_iv['VAR_NAME'] == var]
    transform_dict = dict(zip(small_df.MAX_VALUE, small_df.WOE))
    replace_cmd = ''
    replace_cmd1 = ''
    for i in sorted(transform_dict.items()):
        replace_cmd = replace_cmd + str(i[1]) + str(' if x <= ') + str(i[0]) + ' else '
        replace_cmd1 = replace_cmd1 + str(i[1]) + str(' if x == "') + str(i[0]) + '" else '
    replace_cmd = replace_cmd + '0'
    replace_cmd1 = replace_cmd1 + '0'
    if replace_cmd != '0':
        try:
            df[transform_prefix + var] = df[var].apply(lambda x: eval(replace_cmd))
        except:
            df[transform_prefix + var] = df[var].apply(lambda x: eval(replace_cmd1))
In addition, there is a package Xverse which does the same. Please refer to it here - https://github.com/Sundar0989/XuniVerse
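Not part of the original answer, but to address the "whole set" part of the question: fit final_iv on the training rows only, then reuse it on whichever frame you want to score. For the high-cardinality categorical variables a direct lookup works, since each category equals its bin's MAX_VALUE; a minimal sketch (val_df is a hypothetical validation frame):

def apply_categorical_woe(frame, final_iv, var, prefix='new_'):
    # Reuse a training-derived WOE table on another frame (validation/test).
    # Unseen categories fall back to 0, i.e. no evidence either way.
    table = final_iv[final_iv['VAR_NAME'] == var]
    mapping = dict(zip(table.MAX_VALUE, table.WOE))
    frame[prefix + var] = frame[var].map(mapping).fillna(0)
    return frame

# val_df = apply_categorical_woe(val_df, final_iv, 'Postal_Code_L')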

Bokeh how to get GlyphRenderer for Annotation

With Bokeh, how do I get a handle to the Renderer (or GlyphRenderer) for an Annotation? Is this possible?
I would like to be able to toggle a Band (which is an Annotation) on and off with an interactive legend, so I need to be able to pass a list of Renderers to the LegendItem constructor.
This code:
maxline = fig.line(x='Date', y=stn_max, line_width=0.5, legend=stn_max,
                   name="{}_line".format(stn_max), color=stn_color, alpha=0.75, source=source)
minline = fig.line(x='Date', y=stn_min, line_width=0.5, legend=stn_min,
                   name="{}_line".format(stn_min), color=stn_color, alpha=0.75, source=source)
band = bkm.Band(base='Date', lower=stn_min, upper=stn_max, fill_alpha=0.50,
                line_width=0.5, fill_color=stn_color, source=source)
bkm.LegendItem(label=stn, renderers=[maxline, minline, band])
Produces this error
...
ValueError: expected an element of List(Instance(GlyphRenderer)), got seq with invalid items [Band(id='1091', ...)]
LegendItem accepts only GlyphRenderer instances in its renderers attribute, and Band is not based on GlyphRenderer, hence the error. In the code below, the Band's visibility is instead toggled by means of a CustomJS callback:
from bokeh.plotting import figure, show
from bokeh.models import Band, ColumnDataSource, Legend, LegendItem, CustomJS
import pandas as pd
import numpy as np

x = np.random.random(2500) * 140 - 20
y = np.random.normal(size=2500) * 2 + 5
df = pd.DataFrame(data=dict(x=x, y=y)).sort_values(by="x")

sem = lambda x: x.std() / np.sqrt(x.size)
df2 = df.y.rolling(window=100).agg({"y_mean": np.mean, "y_std": np.std, "y_sem": sem})
df2 = df2.fillna(method='bfill')
df = pd.concat([df, df2], axis=1)
df['lower'] = df.y_mean - df.y_std
df['upper'] = df.y_mean + df.y_std

source = ColumnDataSource(df.reset_index())

p = figure(tools="pan,wheel_zoom,box_zoom,reset,save")
scatter = p.scatter(x='x', y='y', line_color=None, fill_alpha=0.3, size=5, source=source)
band = Band(base='x', lower='lower', upper='upper', source=source)
p.add_layout(band)

p.title.text = "Rolling Standard Deviation"
p.xaxis.axis_label = 'X'
p.yaxis.axis_label = 'Y'

# Toggle the band whenever the scatter's visibility changes.
callback = CustomJS(args=dict(band=band), code="""
    band.visible = !band.visible;
""")

# Only GlyphRenderers may appear in the legend; the band is handled by the callback.
legend = Legend(items=[LegendItem(label="x", renderers=[scatter])])
legend.click_policy = 'hide'
scatter.js_on_change('visible', callback)
p.add_layout(legend)

show(p)
Result: clicking the legend entry hides and shows both the scatter and the band.
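As a side note (not part of the original answer): on Bokeh 1.1+ the CustomJS can be replaced by linking the two visible properties directly:

# Assumes the same scatter and band objects as above; hiding the
# scatter through the legend then hides the band as well.
scatter.js_link('visible', band, 'visible')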
