Nlopt from R to Python - python

This is my first time using Python.
I am having trouble passing from R to Python in the Nlopt package
So I am using a Maximum Likelihood Estimator to estimate 4 parameters. In R I programmed two functions: loglikelihood and gradient from the log-likelihood
In R both of my functions are like this:
loglik <- function(par){
g_h <- par[1]; g_c <- par[2]; a_bar <- par[3]; sigma_e <- par[4]
d <- mutate(obs_data,
num = h + ((I-g_c)/w),
den = g_h + ((I-g_c)/w),
eps = (num/den)-a_bar,
arg_1 = 0.5*(eps/sigma_e)^2,
arg_2 = log(abs(den)),
opt = log(sigma_e * sqrt(2*pi)) + arg_1 + arg_2)
LL <- - sum(d$opt)
return(-LL)
}
grad_loglik <- function(par){
g_h <- par[1]; g_c <- par[2]; a_bar <- par[3]; sigma_e <- par[4]
d <- obs_data %>%
mutate(num = h + ((I- g_c)/w),
den = g_h + ((I-g_c)/w),
eps = num/den - a_bar,
eps_gh = -num/(den^2),
eps_gc = (h-g_h)/(w*(den^2)),
ll_gh = -(eps/(sigma_e^2))*eps_gh - 1/den,
ll_gc = -(eps/(sigma_e^2))*eps_gc + 1/(w*den),
ll_abar = eps/(sigma_e^2),
ll_se = -1/sigma_e + (eps^2)/(sigma_e^3))
return(c(-sum(d$ll_gh),
-sum(d$ll_gc),
-sum(d$ll_abar),
-sum(d$ll_se)))
}
So I am trying to minimize the loglik function
The code for using the Nlopt function
opt_parr <- nloptr(x0 = val_i,
eval_f = loglik,
eval_grad_f = grad_loglik,
lb = c(0,0,0,0),
ub = c(24, 100, 1, 1),
opts = list("algorithm" = "NLOPT_LD_LBFGS",
"xtol_rel" = xtol,
"maxeval"= maxev,
"print_level" = 0))
So I translated both of my functions in python:
def loglik(par):
g_h = par[0]; g_c = par[1]; a_bar = par[2]; sigma_e = par[3]
d = obs_data
d = d.assign(num = lambda x: d.h + ((d.I - g_c)/d.w),
den = lambda x: g_h + ((d.I - g_c)/d.w),
eps = lambda x: (d.num/d.den) - a_bar,
arg1 = lambda x: 0.5*(d.eps/sigma_e)**2,
arg2 = lambda x: np.log(np.absolute(d.den)),
opt = lambda x: np.log(sigma_e * np.sqrt(2*np.pi)) + d.arg1 + d.arg2)
LL = -sum(d.opt)
return(-LL)
def grad_loglik(par):
g_h = par[0]; g_c = par[1]; a_bar = par[2]; sigma_e = par[3]
d = obs_data
d = d.assign(num = lambda x: d.h + ((d.I - g_c)/d.w),
den = lambda x: g_h + ((d.I - g_c)/d.w),
eps = lambda x: (d.num/d.den) - a_bar,
eps_gh = lambda x: -d.num/(d.den**2),
eps_gc = lambda x: (d.h-g_h)/(d.w*(d.den**2)),
ll_gh = lambda x: -(d.eps/(sigma_e**2))*d.eps_gh - 1/d.den,
ll_gc = lambda x: -(d.eps/(sigma_e**2))*d.eps_gc + 1/(d.w*d.den),
ll_abar = lambda x: d.eps/(sigma_e**2),
ll_se = lambda x: -1/sigma_e + (d.eps**2)/(sigma_e**3))
G1 = -sum(d.ll_gh)
G2 = -sum(d.ll_gc)
G3 = -sum(d.ll_abar)
G4 = -sum(d.ll_se)
G = [G1, G2, G3, G4]
return(G)
But I don't understand how to program the optimizer. So far this is my best try:
#%% Find optimal parameters
opt = nlopt.opt(nlopt.LD_LBFGS, 4)
opt.set_lower_bounds([0]*4)
opt.set_upper_bounds([24, 100, 1, 1])
opt.set_min_objective(loglik)
opt.set_xtol_rel(1e-64)
x = opt.optimize([1e-4]*4)
minf = opt.last_optimum_value()
print("optimum at ", x[0], x[1], x[2], x[3])
print("minimum value = ", minf)
print("result code = ", opt.last_optimize_result())
I don't know where to put the gradient function in order to make it work, in R was kinda clear.
But this page tell me that:
But since I am new in Python this doesn't tell me much info. Am I wrongly programming the gradient function? Where does it has to be?
Thanks in advance!
Data
Just use obs_data
import numpy as np
import pandas as pd
import nlopt
N = 100_000
np.random.seed(1)
wage = np.exp(np.random.normal(loc = 4, scale = 0.1, size = N))
nlincome = np.exp(np.random.normal(loc = 3, scale = 0.5, size = N))
eps_ = np.random.normal(loc = 0, scale = 0.01, size = N)
data = pd.DataFrame({'wages':wage, 'non_labor_income': nlincome,
'epsilon': eps_})
data = data.assign(alpha_bar = lambda x: α_bar + data['epsilon'])
check = data.assign(h = lambda x: (data['alpha_bar']+ data['epsilon'])*γ_h
- (((1-data['alpha_bar']-data['epsilon'])*
(data['non_labor_income']-γ_c))/(data['wages'])))
check = check.assign(l = lambda x: time - check.h,
c = lambda x: (check.wages*check.h)+check.non_labor_income,
total_income = lambda x: check.wages*check.h)
obs_data = check[['wages', 'h', 'non_labor_income']]
obs_data = obs_data.rename(columns = {"wages":"w", "non_labor_income":"i"})

Related

Where is the value in the lambda function coming from?

below is the code part from this github repo that I am confused of:
full_promp.py:
.....
......
class ProbInvKinematics:
#params:
#fwd_k: A forward kinematics object
def __laplace_cost_and_grad(self, theta, mu_theta, inv_sigma_theta, mu_x, inv_sigma_x):
print ("theta ",theta)
f_th, jac_th, ori = self.fwd_k.position_and_jac(theta)
jac_th = jac_th[0:3,:]
diff1 = theta - mu_theta
tmp1 = np.dot(inv_sigma_theta, diff1)
diff2 = f_th - mu_x
tmp2 = np.dot(inv_sigma_x, diff2)
nll = 0.5*(np.dot(diff1,tmp1) + np.dot(diff2,tmp2))
grad_nll = tmp1 + np.dot(jac_th.T,tmp2)
return nll, grad_nll
def __init__(self, fwd_kinematics):
self.fwd_k = fwd_kinematics
def inv_kin(self, mu_theta, sig_theta, mu_x, sig_x):
inv_sig_theta = np.linalg.inv(sig_theta)
inv_sig_x = np.linalg.inv(sig_x)
cost_grad = lambda theta: self.__laplace_cost_and_grad(theta, mu_theta, inv_sig_theta, mu_x, inv_sig_x)
cost = lambda theta: cost_grad(theta)[0]
grad = lambda theta: cost_grad(theta)[1]
res = opt.minimize(cost, mu_theta, method='BFGS', jac=grad)
post_mean = res.x
post_cov = res.hess_inv
return post_mean, post_cov
Usage of the class ProbInvKinematics as follow:
import robpy.full_promp as promp
prob_inv_kin = promp.ProbInvKinematics(fwd_kin)
mu_cartesian = np.array([-0.62, -0.44, -0.34])
Sigma_cartesian = 0.02**2*np.eye(3)
mu_q, Sigma_q = prob_inv_kin.inv_kin(mu_theta=prior_mu_q, sig_theta=prior_Sigma_q,
mu_x = mu_cartesian, sig_x = Sigma_cartesian)
I see that the parameter value theta is defined nowhere. But somehow when I try to print out theta in def __laplace_cost_and_grad(), the value is there... What is the logic of using this theta?

Python function calling with variable vs raw numbers

I am trying to implement a pso algorithm from Wikipedia https://en.wikipedia.org/wiki/Particle_swarm_optimization.
My problem is that when I am calling the cost function with a variable (Gbest), and then manually calling the cost function (with the Gbest data) I get a different output (cost) like the image bellow:
Code fault
I am new to python so thank you for any suggestions.
Here is the complete code:
import matplotlib.pyplot as plt
import numpy as np
from control.matlab import *
A = np.array([[0,0,1],[0,1,0],[1,2,-2]])
B = np.array( [[0],[1],[0]])
C = np.array([[0, 1,0]])
D = np.zeros([C.shape[0],B.shape[1]])
sys = ss(A,B,C,D)
sys_tf = tf(sys)
s = tf('s')
def cost(kp,ki):
global sys_tf, G, y, t, r
G = kp + ki/s
C = feedback(sys_tf*G, 1)
y, t = step(C, linspace(0,100))
r = np.ones(len(t))
return np.sum(y-r)**2
part = 100
ite = 10000
dim = 2
w = 0.001
wdamp = 0.99
phip = 0.9
phig = 0.1
blo, bup = -10,10
x = np.zeros([dim, part])
v = np.zeros([dim, part])
pbest = np.zeros([dim, part])
gbest = np.array([1000000,1000000])
for i in range(part):
for k in range(dim):
x[k][i] = pbest[k][i] = np.random.uniform(blo, bup)
v[k][i] = np.random.uniform(-np.abs(bup - blo), np.abs(bup - blo))
if cost(pbest[0][i], pbest[1][i]) < cost(gbest[0], gbest[1]):
gbest = np.array([pbest[0][i], pbest[1][i]])
for it in range(ite):
for i in range(part):
for k in range(dim):
rp = np.random.uniform(0,1)
rg = np.random.uniform(0,1)
v[k,:] = w*v[k,:] + phip*rp*(pbest[k,:] - x[k,:]) + phig*rg*(gbest[k] - x[k,:])
x[k,:] = x[k,:] + v[k,:]
w = w*wdamp
if cost(x[0][i], x[1][i]) < cost(pbest[0][i], pbest[1][i]):
pbest[:,i] = x[:,i]
if cost(pbest[0][i], pbest[1][i]) < cost(gbest[0], gbest[1]):
gbest = np.array([pbest[0][i], pbest[1][i]])
plt.plot(t, y, 'ro')
plt.plot(t, r, 'x')
plt.pause(0.005)
plt.title(gbest)
print([gbest, cost(gbest[0], gbest[1])])

MPC with python and Error ValueError: `f0` passed has more than 1 dimension

I wrote a MPC with Python and it worked before. After a long time I want to use it again but I got this Error
f0 passed has more than 1 dimension.
But I didn't change anything on my code. It is some kind of strange.
Here is my code:
import numpy as np
import numpy.linalg as npl
import matplotlib.pyplot as plt
from scipy.optimize import minimize
def mpcAugment(Am, Bm, Cm ):
"Function for Augmented Model"
nx, nu = Bm.shape
ny = Cm.shape[0]
A = np.zeros((nx+ny,nx+ny))
A[0:nx,0:nx] = Am
A[nx:nx+ny,0:nx] = Cm#Am
A[nx:nx+ny,nx:nx+ny] = np.eye(ny)
B = np.zeros((nx+ny,nu))
B[0:nx,:nu] = Bm
B[nx:nx+ny,:nu] = Cm#Bm
C = np.zeros((ny,nx+ny))
C[:ny,nx:nx+ny] = np.eye(ny)
return A, B, C
'Define Parameters'
k = 0.4
AICB = 153.8
mcp = 8.8e4
vamb1 = 30
vamb2 = 45
a = -k*AICB/mcp
b = -1/mcp
Ts = 20
VICBref = -5.0
Am = np.array([[1+Ts*a]])
Bm = np.array([[Ts*b]])
Gm = np.array([[-Ts*a]])
Cm = np.array([[1]])
A, B, C = mpcAugment(Am,Bm,Cm)
A, G, C = mpcAugment(Am,Gm,Cm)
nx, nu = B.shape
ny = C.shape[0]
nd = G.shape[1]
Np = 20
Nu = 5
F = np.zeros((Np*ny,nx))
PHI = np.zeros((Np*ny,Nu*nu))
PHIw = np.zeros((Np*ny,Np*nd))
for i in range(0,Np):
Ai = npl.matrix_power(A, i+1)
F[i*ny:(i+1)*ny,:] = C#Ai
for j in range(0, Nu):
if j <= i:
Aij = np.linalg.matrix_power(A, i-j)
PHI[i*ny:(i+1)*ny, j*nu:(j+1)*nu] = C#Aij#B
for j in range(0, Np):
if j <= i:
Aij = np.linalg.matrix_power(A, i-j)
PHIw[i*ny:(i+1)*ny, j*nd:(j+1)*nd] = C#Aij#G
umax = 3100
umin = 0
Q = np.eye(Np*ny)
R = 1e-2*np.eye(Nu*nu)
Rs = VICBref*np.ones((Np*ny,1))
Ainq = np.zeros((2*Nu*nu,Nu*nu))
binq = np.zeros((2*Nu*nu,1))
cinq = np.zeros((2*Nu*nu,1))
for i in range(0,Nu):
binq[i*nu:(i+1)*nu] = umax
binq[(i+Nu)*nu:(Nu+i+1)*nu] = 1
cinq[i*nu:(i+1)*nu] = 1
cinq[(i+Nu)*nu:(Nu+i+1)*nu] = -1
for j in range(0,i+1):
Ainq[i*nu:(i+1)*nu,j*nu:(j+1)*nu] = np.eye(nu)
Ainq[(i+Nu)*nu:(Nu+i+1)*nu,j*nu:(j+1)*nu] = np.eye(nu)
u0 = 0
def objective(du):
dU = np.array(du).reshape((len(du),1))
Y = F#x + PHI#dU + PHIw#w
return np.transpose((Rs-Y))#(Rs-Y)+np.transpose(dU)#R#(dU)
def constraint1(du):
dU = np.array(du).reshape((len(du),1))
return (binq - Ainq#dU - cinq*u0)[0]
#print(objective([1,1,1]))
ulim = (umin, umax)
bnds = np.kron(np.ones((Nu,1)),ulim)
#print(bnds)
Um = np.ones((nu*Nu,1))
Tsim = 5e4
time = np.arange(0,Tsim,Ts)
Nt = len(time)
xm = np.zeros((Nt,1))
um = np.zeros((Nt,nu))
ym = np.zeros((Nt,ny))
xm[0] = 0
ym[0] = Cm.dot(xm[0])
w = np.zeros((Np*nd,1))
print('Am = ',Am)
print('Bm = ',Bm)
print('Cm = ',Cm)
x = np.zeros((nx,1))
x[1] = xm[0]
vamb = vamb1
Vamb = np.zeros((Nt,1))
Ns = int(np.floor(Nt/2))
Vamb[0:Ns] = vamb1*np.ones((Ns,1))
Vamb[Ns:Nt] = vamb2*np.ones((Nt-Ns,1))
Vref = VICBref*np.ones((Nt,1))
con = {'type':'ineq','fun':constraint1}
for i in range(0,Nt-1):
sol = minimize(objective, Um, method = 'SLSQP',constraints = con)
if sol.success == False:
print('Error Cant solve problem')
exit()
Um = sol.x
um[i+1] = um[i] + Um[0]
u0 = um[i+1]
xm[i+1] = Am.dot(xm[i])+Bm.dot(um[i+1])+Gm.dot(Vamb[i])
ym[i+1] = Cm.dot(xm[i+1])
for j in range(0,Np):
if i+j < Nt:
Rs[j] = Vref[i+j]
w[j] = Vamb[i+j]-Vamb[i+j-1]
else:
Rs[j] = Vref[Nt-1]
w[j] = 0
x[0] = xm[i+1] - xm[i]
x[1] = xm[i+1]
print('Q = ',um[i+1],' , VICB = ',xm[i+1], ' vamb = ', Vamb[i])
hour = 60*60
plt.figure()
plt.subplot(2,1,1)
plt.plot(time/hour,ym)
plt.plot(time/hour,Vref,'--')
plt.xlabel('time(hours)')
plt.xlim([0, Tsim/hour])
plt.subplot(2,1,2)
plt.plot(time/hour,um)
plt.xlim([0, Tsim/hour])
plt.show()
It about a controller, which control the temperature of a cool box.
Is that possible that anything changed in main simply code?
I think the problem is now in minimizations part.
I reinstalled all of my libraries and it worked

Is there a function to add WOE, calculated on Training data, to the whole data set? (python)

I am working on some python code to predict Default rate of loans handed out by a bank.
I have calculated the WOE and information value (IV) on the training set
(using the following code: https://github.com/Sundar0989/WOE-and-IV/blob/master/WOE_IV.ipynb?fbclid=IwAR1MvEfyGsdyTre0uPJC5WRl91dfue_t0vH5qJezwm2mAg6sjHZJg9MyDYo).
We have also concluded 2 high cardinality variables. We don't know however how to add these WOE scores to the whole set. How do we tackle this problem? How can we go further to use WOE to predict the target variable?
code:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy, pylab
Reading the data received from bank, feature selection part 1, splitting up whole set (Training) into training set: indices_traintrain, validation set: indices_val and test set: indices_test (70/30 split training and validation set - test set and 70/30 split training - validation)
Training =
pd.read_excel('/Users/enjo/Documents/Master/DM/Data_DSC2019_STUDENTS/DSC2019_Training.xlsx', na_values=np.nan)
Status = Training.iloc[:,-1]
Data = Training.iloc[:,0:45]
Data_missing = Data.isna()
Data_missing = Data_missing.sum()
print(Data_missing/len(Data))
"""
drop variables with more than 80% missing
"""
Drop = ['FREE_CASH_FLOW_AMT',
'A2_MTHS_FIRST_PCX_COREPROF_CNT', 'A2_MONTHS_IN_BELGIUM_CNT', 'A2_MTHS_SNC_FIRST_COREPROF_CNT', 'MONTHS_SINCE_LAST_REFUSAL_CNT']
DroppedTraining = Training.copy()
for element in Drop:
DroppedTraining.drop(element, axis=1,inplace=True)
import numpy as np
from sklearn import datasets
from sklearn import svm
from sklearn import preprocessing
Data_preprocessed=[] #contains preprocessed data
from Preprocessing_continuous import Preprocessing_continuous #import function for preprocessing
from Preprocessing_discrete import Preprocessing_discrete #import function for preprocessing
from sklearn.model_selection import train_test_split
indices=np.arange(26962)
indices_train, indices_test = train_test_split(indices, test_size=0.3, random_state=0)
indices_traintrain, indices_val = train_test_split(indices_train, test_size=0.3, random_state=0)
Training['target']= Training['Label_Default'].apply(lambda x:1 if x=='Y' else 0)
Highcardinalityset=[]
Highcardinalityset = Training[['Type',
'INDUSTRY_CD_3',
'INDUSTRY_CD_4',
'Managing_Sales_Office_Nbr',
'Postal_Code_L',
'Product_Desc',
'CREDIT_TYPE_CD',
'ACCOUNT_PURPOSE_CD',
'A2_MARITAL_STATUS_CD',
'FINANCIAL_PRODUCT_TYPE_CD',
'A2_EMPLOYMENT_STATUS_CD',
'A2_RESIDENT_STATUS_CD',
'target']]
Highcardinalityset = Highcardinalityset.iloc[indices_traintrain]
function found on github
import pandas as pd
import numpy as np
import pandas.core.algorithms as algos
from pandas import Series
import scipy.stats.stats as stats
import re
import traceback
import string
max_bin = 20
force_bin = 3
# define a binning function
def mono_bin(Y, X, n = max_bin):
df1 = pd.DataFrame({"X": X, "Y": Y})
justmiss = df1[['X','Y']][df1.X.isnull()]
notmiss = df1[['X','Y']][df1.X.notnull()]
r = 0
while np.abs(r) < 1:
try:
d1 = pd.DataFrame({"X": notmiss.X, "Y": notmiss.Y, "Bucket": pd.qcut(notmiss.X, n)})
d2 = d1.groupby('Bucket', as_index=True)
r, p = stats.spearmanr(d2.mean().X, d2.mean().Y)
n = n - 1
except Exception as e:
n = n - 1
if len(d2) == 1:
n = force_bin
bins = algos.quantile(notmiss.X, np.linspace(0, 1, n))
if len(np.unique(bins)) == 2:
bins = np.insert(bins, 0, 1)
bins[1] = bins[1]-(bins[1]/2)
d1 = pd.DataFrame({"X": notmiss.X, "Y": notmiss.Y, "Bucket": pd.cut(notmiss.X, np.unique(bins),include_lowest=True)})
d2 = d1.groupby('Bucket', as_index=True)
d3 = pd.DataFrame({},index=[])
d3["MIN_VALUE"] = d2.min().X
d3["MAX_VALUE"] = d2.max().X
d3["COUNT"] = d2.count().Y
d3["EVENT"] = d2.sum().Y
d3["NONEVENT"] = d2.count().Y - d2.sum().Y
d3=d3.reset_index(drop=True)
if len(justmiss.index) > 0:
d4 = pd.DataFrame({'MIN_VALUE':np.nan},index=[0])
d4["MAX_VALUE"] = np.nan
d4["COUNT"] = justmiss.count().Y
d4["EVENT"] = justmiss.sum().Y
d4["NONEVENT"] = justmiss.count().Y - justmiss.sum().Y
d3 = d3.append(d4,ignore_index=True)
d3["EVENT_RATE"] = d3.EVENT/d3.COUNT
d3["NON_EVENT_RATE"] = d3.NONEVENT/d3.COUNT
d3["DIST_EVENT"] = d3.EVENT/d3.sum().EVENT
d3["DIST_NON_EVENT"] = d3.NONEVENT/d3.sum().NONEVENT
d3["WOE"] = np.log(d3.DIST_EVENT/d3.DIST_NON_EVENT)
d3["IV"] = (d3.DIST_EVENT-d3.DIST_NON_EVENT)*np.log(d3.DIST_EVENT/d3.DIST_NON_EVENT)
d3["VAR_NAME"] = "VAR"
d3 = d3[['VAR_NAME','MIN_VALUE', 'MAX_VALUE', 'COUNT', 'EVENT', 'EVENT_RATE', 'NONEVENT', 'NON_EVENT_RATE', 'DIST_EVENT','DIST_NON_EVENT','WOE', 'IV']]
d3 = d3.replace([np.inf, -np.inf], 0)
d3.IV = d3.IV.sum()
return(d3)
def char_bin(Y, X):
df1 = pd.DataFrame({"X": X, "Y": Y})
justmiss = df1[['X','Y']][df1.X.isnull()]
notmiss = df1[['X','Y']][df1.X.notnull()]
df2 = notmiss.groupby('X',as_index=True)
d3 = pd.DataFrame({},index=[])
d3["COUNT"] = df2.count().Y
d3["MIN_VALUE"] = df2.sum().Y.index
d3["MAX_VALUE"] = d3["MIN_VALUE"]
d3["EVENT"] = df2.sum().Y
d3["NONEVENT"] = df2.count().Y - df2.sum().Y
if len(justmiss.index) > 0:
d4 = pd.DataFrame({'MIN_VALUE':np.nan},index=[0])
d4["MAX_VALUE"] = np.nan
d4["COUNT"] = justmiss.count().Y
d4["EVENT"] = justmiss.sum().Y
d4["NONEVENT"] = justmiss.count().Y - justmiss.sum().Y
d3 = d3.append(d4,ignore_index=True)
d3["EVENT_RATE"] = d3.EVENT/d3.COUNT
d3["NON_EVENT_RATE"] = d3.NONEVENT/d3.COUNT
d3["DIST_EVENT"] = d3.EVENT/d3.sum().EVENT
d3["DIST_NON_EVENT"] = d3.NONEVENT/d3.sum().NONEVENT
d3["WOE"] = np.log(d3.DIST_EVENT/d3.DIST_NON_EVENT)
d3["IV"] = (d3.DIST_EVENT-d3.DIST_NON_EVENT)*np.log(d3.DIST_EVENT/d3.DIST_NON_EVENT)
d3["VAR_NAME"] = "VAR"
d3 = d3[['VAR_NAME','MIN_VALUE', 'MAX_VALUE', 'COUNT', 'EVENT', 'EVENT_RATE', 'NONEVENT', 'NON_EVENT_RATE', 'DIST_EVENT','DIST_NON_EVENT','WOE', 'IV']]
d3 = d3.replace([np.inf, -np.inf], 0)
d3.IV = d3.IV.sum()
d3 = d3.reset_index(drop=True)
return(d3)
def data_vars(df1, target):
stack = traceback.extract_stack()
filename, lineno, function_name, code = stack[-2]
vars_name = re.compile(r'\((.*?)\).*$').search(code).groups()[0]
final = (re.findall(r"[\w']+", vars_name))[-1]
x = df1.dtypes.index
count = -1
for i in x:
if i.upper() not in (final.upper()):
if np.issubdtype(df1[i], np.number) and len(Series.unique(df1[i])) > 2:
conv = mono_bin(target, df1[i])
conv["VAR_NAME"] = i
count = count + 1
else:
conv = char_bin(target, df1[i])
conv["VAR_NAME"] = i
count = count + 1
if count == 0:
iv_df = conv
else:
iv_df = iv_df.append(conv,ignore_index=True)
iv = pd.DataFrame({'IV':iv_df.groupby('VAR_NAME').IV.max()})
iv = iv.reset_index()
return(iv_df,iv)
final_iv, IV = data_vars(Highcardinalityset,Highcardinalityset.target)
final_iv
IV.sort_values('IV')
IV.to_csv('test.csv')
transform_vars_list = Highcardinalityset.columns.difference(['target'])
transform_prefix = 'new_' # leave this value blank if you need replace the original column values
transform_vars_list
for var in transform_vars_list:
small_df = final_iv[final_iv['VAR_NAME'] == var]
transform_dict = dict(zip(small_df.MAX_VALUE.astype(str),small_df.WOE.astype(str)))
replace_cmd = ''
replace_cmd1 = ''
for i in sorted(transform_dict.items()):
replace_cmd = replace_cmd + str(i[1]) + str(' if x <= ') + str(i[0]) + ' else '
replace_cmd1 = replace_cmd1 + str(i[1]) + str(' if x == "') + str(i[0]) + '" else '
replace_cmd = replace_cmd + '0'
replace_cmd1 = replace_cmd1 + '0'
if replace_cmd != '0':
try:
Highcardinalityset[transform_prefix + var] = Highcardinalityset[var].apply(lambda x: eval(replace_cmd))
except:
Highcardinalityset[transform_prefix + var] = Highcardinalityset[var].apply(lambda x: eval(replace_cmd1))
Highcardinalityset['Postal_Code_L'].value_counts()
Highcardinalityset['new_Postal_Code_L'].value_counts()
Highcardinalityset['Managing_Sales_Office_Nbr'].value_counts()
Highcardinalityset['new_Managing_Sales_Office_Nbr'].value_counts()
Nice to see when high WOE: interesting for that postal code: high risk for default!
Highcardinalityset.to_excel("Highcardinalitysettraintrain.xlsx")
TrainingWOE = DroppedTraining[['Managing_Sales_Office_Nbr', "Postal_Code_L"]]
TrainingWOE["Postal_Code_L_WOE"]=Highcardinalityset[["new_Postal_Code_L"]]
TrainingWOE["Managing_Sales_Office_Nbr_WOE"]=Highcardinalityset[["new_Managing_Sales_Office_Nbr"]]
drop variables that are not relevant because of low IV value
Drop = ["ACCOUNT_PURPOSE_CD", "A2_MARITAL_STATUS_CD", "A2_EMPLOYMENT_STATUS_CD", "A2_RESIDENT_STATUS_CD",
"INDUSTRY_CD_3", "INDUSTRY_CD_4","Type"]
DroppedTrainingAfterIVcalc = DroppedTraining.copy()
for element in Drop:
DroppedTrainingAfterIVcalc.drop(element, axis=1,inplace=True)
preprocess remaining (44-5 (because of too many missing) - 7 (because of low iv) + 1 (target variable added))
Thanks for asking this question. Here is the code to do the required transformation which is shown in the notebook as well.
transform_vars_list = df.columns.difference(['target'])
transform_prefix = 'new_' # leave this value blank to replace the original column
#apply transformations
for var in transform_vars_list:
small_df = final_iv[final_iv['VAR_NAME'] == var]
transform_dict = dict(zip(small_df.MAX_VALUE,small_df.WOE))
replace_cmd = ''
replace_cmd1 = ''
for i in sorted(transform_dict.items()):
replace_cmd = replace_cmd + str(i[1]) + str(' if x <= ') + str(i[0]) + ' else '
replace_cmd1 = replace_cmd1 + str(i[1]) + str(' if x == "') + str(i[0]) + '" else '
replace_cmd = replace_cmd + '0'
replace_cmd1 = replace_cmd1 + '0'
if replace_cmd != '0':
try:
df[transform_prefix + var] = df[var].apply(lambda x: eval(replace_cmd))
except:
df[transform_prefix + var] = df[var].apply(lambda x: eval(replace_cmd1))
In addition, there is a package Xverse which does the same. Please refer to it here - https://github.com/Sundar0989/XuniVerse

Numerical issue in scipy.ode.integrate solver

I am using ode solver to solve stiff problem (since odeint function could not able to solve it). But by this way also I have some warnings and my plot get saturate at some point. Here is image What should I do? Here is the list of warnings:
DVODE-- Warning..internal T (=R1) and H (=R2) are
such that in the machine, T + H = T on the next step
(H = step size). solver will continue anyway
In above, R1 = 0.3667661010318D+00 R2 = 0.1426374862242D-16
DVODE-- Warning..internal T (=R1) and H (=R2) are
such that in the machine, T + H = T on the next step
(H = step size). solver will continue anyway
In above, R1 = 0.3667661010318D+00 R2 = 0.1426374862242D-16
DVODE-- Above warning has been issued I1 times.
it will not be issued again for this problem
In above message, I1 = 2
DVODE-- At current T (=R1), MXSTEP (=I1) steps
taken on this call before reaching TOUT
In above message, I1 = 500
In above message, R1 = 0.3667661010318D+00
My code:
import numpy as np
import matplotlib.pyplot as plt
import scipy.integrate as si
def func():
#arguments:::
w = 1./3.
xi = 2.86
phi1 = 1.645
phi2 = 2.* 1.202
gt = 10.**(-60)
Lt = (1.202*gt)/np.pi
Lin = 10.**-5
Lf = 0.49
dt = 0.0001
gin = gt*Lt/Lin
xin = (-np.log((3. - (xi**2)*Lin)/(3. - (xi**2)*Lt)) + np.log(Lin/Lt))/4.0
uin = -(np.log(Lin/Lt))/2.
state0 = [gin,xin,uin]
print state0
def eq(L, state):
g = state[0]
x = state[1]
u = state[2]
N = (-2.*g/(6.*np.pi + 5.*g))*(18./(1. - 2.*L) + 5.*np.log(1.- 2.*L) - phi1 + 6. )
B = (-(2. - N)*L) - ((g/np.pi)* (5.*np.log(1.-2.*L) - phi2 + (5.*N/40.)))
Eqs = np.zeros((3))
gdl = Eqs[0] = ((2.+N)*g)/B
xdl = Eqs[1] = -(2./(3.*(1.+w)))* (1./(1.-(xi**2)*L/3.))*(1./B)
udl = Eqs[2]= 1./B
return Eqs
ode = si.ode(eq)
# BDF method suited to stiff systems of ODEs
ode.set_integrator('vode',nsteps=500,method='bdf')
ode.set_initial_value(state0,Lin)
L = []
G = []
while ode.successful() and ode.t < Lf:
ode.integrate(ode.t + dt)
L.append(ode.t)
G.append(ode.y)
lam = np.vstack(L)
g,x,u = np.vstack(G).T
return g,x,u,lam
r= func()
L = r[3]
g = r[0]
lng = np.log10(g)
x = r[1]
u = r[2]
w = 1./3.
xi = 2.86
O_A = np.zeros(len(L))
q = np.zeros(len(L))
for i in np.arange(len(L)):
O_A[i] = xi**2*L[i]/3.
alpha = 2./ ((3.+3.*w) * (1.- (L[i]*xi**2)/3.) )
q[i] = 1./alpha - 1.
n = np.zeros(len(L)) #eta(n)
b = np.zeros(len(L))
for j in np.arange(len(L)):
n[j] =(-2.*g[j]/(6.*np.pi + 5.*g[j]))*(18./(1. - 2.*L[j]) + 5.*np.log(1.- 2.*L[j]) - 1.645 + 6. )
b[j]= (-(2. - n[j])*L[j]) - ((g[j]/np.pi)* (5.*np.log(1.-2.*L[j]) - 2.* 1.202 + ((5.*n[j])/4.)))
P = np.zeros(len(x))
for k in np.arange(len(x)):
C = (((3. - (xi**2)*L[k])/g[k])**(3./4.)) * (((2.*L[k] + (u[k]*b[k]))*xi**2) + (n[k] * (3.- L[k]*xi**2)) )
P[k] = (np.exp(3.*x[k])) * (np.exp(4.*u[k])) * C
plt.figure()
plt.plot(L,P)
plt.xlabel('Lambda ---->')
plt.ylabel('P ----->')
plt.title('lambda Vs P')
plt.show()

Categories