How can I gridsearch Light GBM on time series data? - python

I want to do a grid search on time series data. Is there any function to do that to search through what I have listed on "lgb_params" for example?
# Search space and fixed settings in one dict: list-valued entries are the
# candidate values to try, scalar entries (num_boost_round,
# early_stopping_rounds, verbose) are single fixed settings.
lgb_params = {
"learning_rate": [0.001, 0.01, 0.1, 0.2],
"max_depth": [3, 5, 7, 9],
"num_leaves": [5, 10, 15],
"num_boost_round": 10000,
"early_stopping_rounds": 300,
"feature_fraction": [0.2, 0.3, 0.5, 0.7, 0.8],
"verbose": 0
}
# Training set, and a validation set that uses the training set as its reference.
lgbtrain = lgb.Dataset(data=X_train, label=y_train, feature_name=cols)
lgbval = lgb.Dataset(data=X_val, label=y_val, reference=lgbtrain, feature_name=cols)
# NOTE(review): lgb_params is passed unchanged, so learning_rate, max_depth,
# num_leaves and feature_fraction reach lgb.train as lists instead of single
# values -- this is the part that does not work as written.
model = lgb.train(lgb_params, lgbtrain,
valid_sets=[lgbtrain, lgbval],
num_boost_round=lgb_params['num_boost_round'],
early_stopping_rounds=lgb_params['early_stopping_rounds'],
feval=lgbm_smape,
verbose_eval=100)
The code above of course does not work, since lgb_params contains keys with more than one value (e.g., learning_rate, max_depth, etc.). Those are exactly the ones I want to search over, and that's where the problem is...

I think I came up with a solution. It is currently running and hasn't finished yet, since it searches through a lot of values, but here's the function that I wrote in case anyone needs it:
def param_search(lgb_param_dict):
    """Grid-search LightGBM hyper-parameters on a fixed train/validation split.

    Every combination of ``learning_rate``, ``max_depth``, ``num_leaves`` and
    ``feature_fraction`` found in *lgb_param_dict* is trained once; the
    combination with the lowest validation SMAPE wins.

    Relies on the module-level names ``lgb``, ``lgbtrain``, ``lgbval``,
    ``lgbm_smape`` and ``lgb_full_params`` (the last one supplies
    ``num_boost_round`` and ``early_stopping_rounds``).

    Args:
        lgb_param_dict: Mapping in which each of the four searched keys holds
            a list of candidate values.

    Returns:
        Tuple ``(min_error, best_params, best_iter)``: the lowest ``SMAPE``
        read from ``best_score["valid_1"]``, the ``params`` of the model that
        produced it and that model's ``best_iteration``. If no combination is
        evaluated (an empty candidate list) the initial values
        ``(inf, {}, inf)`` are returned.
    """
    from itertools import product

    searched_keys = ("learning_rate", "max_depth", "num_leaves", "feature_fraction")

    def report(params):
        # Printed both before and after training so the settings stay visible
        # around LightGBM's own evaluation log.
        print("Learning_rate = " + str(params["learning_rate"]))
        print("max_depth = " + str(params["max_depth"]))
        print("num_leaves = " + str(params["num_leaves"]))
        print("feature_fraction = " + str(params["feature_fraction"]))

    min_error = float("inf")
    best_params = dict()
    best_iter = float("inf")

    # product() varies the last key fastest, i.e. the same visiting order as
    # four nested loops with learning_rate outermost.
    candidate_lists = [lgb_param_dict[key] for key in searched_keys]
    for combination in product(*candidate_lists):
        # A fresh dict per run, so no state leaks between combinations.
        lgb_params = dict(zip(searched_keys, combination))

        print(" ")
        print("##########")
        report(lgb_params)
        model = lgb.train(lgb_params, lgbtrain,
                          valid_sets=[lgbtrain, lgbval],
                          num_boost_round=lgb_full_params["num_boost_round"],
                          early_stopping_rounds=lgb_full_params["early_stopping_rounds"],
                          feval=lgbm_smape,
                          verbose_eval=500)
        report(lgb_params)

        # "valid_1" is the second entry of valid_sets (lgbval) under
        # LightGBM's default naming of validation sets.
        error = dict(model.best_score["valid_1"])["SMAPE"]
        if error < min_error:
            min_error = error
            best_params = model.params
            best_iter = model.best_iteration

    return min_error, best_params, best_iter
The print statements are for readability. There is probably a better way to write this function, but I'll accept it as the answer if it finishes without any problems.
Edit: It worked!

Related

How to set order of the nodes in Sankey Diagram Plotly

I am trying to write a loop that produces several Sankey diagrams. The problem is that, because of Plotly's layout optimization, the nodes end up in different positions from one diagram to the next. I would like to fix the node order to [Formal, Informal, Unemployed, Inactive].
import matplotlib.pyplot as plt
import pandas as pd
import plotly.graph_objects as go
# Load the full table, then split it into one sub-frame per distinct country
# code, keeping the order in which the codes first appear.
df = pd.read_csv(path, delimiter=",")
Lista_Paises = df["code"].unique().tolist()
Lista_DF = [df[df["code"] == codigo] for codigo in Lista_Paises]
def grafico(df):
    """Convert one transition table into the lists needed for a Sankey trace.

    The four labour-market states always occupy fixed node slots: indices
    0-3 are the source nodes and 4-7 the target nodes, in the order Formal,
    Informal, Unemployed, Inactive. Using fixed slots (instead of counting
    the categories present in *df*) keeps link indices and labels aligned
    even when a state is missing from the data.

    Args:
        df: DataFrame with ``Source``, ``Target`` and ``Value`` columns.
            It is not modified; the function works on a copy.

    Returns:
        Tuple ``(TAGS, Origen, Destino, Valor, Color)``: the 8 node labels,
        then per-link lists of source node index, target node index, flow
        value (float) and link colour (taken from the source state), sorted
        by source and then target.

    Raises:
        ValueError: if ``Source`` or ``Target`` holds a label outside the
            four known states.
    """
    states = ["Formal", "Informal", "Unemployed", "Inactive"]
    node_index = {state: position for position, state in enumerate(states)}
    link_color = {
        "Formal": "#9FB5D5",
        "Informal": "#E3EEF9",
        "Unemployed": "#E298AE",
        "Inactive": "#FCEFBC",
    }

    # astype returns a new frame, so the caller's DataFrame stays untouched.
    df = df.astype({"Value": "float"})

    unknown = (set(df["Source"]) | set(df["Target"])) - set(node_index)
    if unknown:
        raise ValueError(
            "Unknown Source/Target labels: " + ", ".join(sorted(map(str, unknown)))
        )

    df["Source_cat"] = df["Source"].map(node_index).astype("int")
    df["Target_cat"] = df["Target"].map(node_index).astype("int")
    df["Color"] = df["Source"].map(link_color).astype("str")
    df = df.sort_values(by=["Source_cat", "Target_cat"])

    # Target nodes sit after the block of source nodes.
    df["out"] = df["Target_cat"] + len(states)

    TAGS = states + states
    Origen = df["Source_cat"].tolist()
    Destino = df["out"].tolist()
    Valor = df["Value"].tolist()
    Color = df["Color"].tolist()
    return (TAGS, Origen, Destino, Valor, Color)
def _column_y_positions(totals):
    """Return the vertical start of each node of one column as a 0-1 fraction."""
    column_total = sum(totals)
    positions = []
    running = 0.0
    for total in totals:
        fraction = running / column_total if column_total else 0.0
        # NOTE(review): positions of exactly 0 or 1 are reported to be
        # unreliable in plotly, so stay strictly inside the interval.
        positions.append(min(max(fraction, 0.001), 0.999))
        running += total
    return positions


def Sankey(TAGS: object, Origen: object, Destino: object, Valor: object, Color: object, titulo: str) -> object:
    """Draw one Sankey diagram with a fixed node order and save it as a PNG.

    The first half of *TAGS* is drawn as the left (source) column and the
    second half as the right (target) column. Each node gets an explicit
    x/y position so the layout engine keeps the order of *TAGS*; y grows
    downward from the top and is derived from the cumulative flow of the
    nodes above it in the same column.

    Args:
        TAGS: Node labels, source nodes first, then target nodes.
        Origen: Source node index of every link.
        Destino: Target node index of every link.
        Valor: Flow value of every link.
        Color: Colour of every link.
        titulo: Diagram title prefix; also the output file name stem.
    """
    node_count = len(TAGS)
    half = node_count // 2

    # Outgoing flow for source nodes, incoming flow for target nodes.
    flow = [0.0] * node_count
    for origin, destination, amount in zip(Origen, Destino, Valor):
        flow[origin] += amount
        flow[destination] += amount

    node_x = [0.001] * half + [0.999] * (node_count - half)
    node_y = _column_y_positions(flow[:half]) + _column_y_positions(flow[half:])

    link = dict(source=Origen, target=Destino, value=Valor, color=Color)
    node = dict(x=node_x, y=node_y, label=TAGS, pad=35,
                thickness=10,
                color=["#305CA3", "#C1DAF1", "#C9304E", "#F7DC70", "#305CA3", "#C1DAF1", "#C9304E", "#F7DC70"])
    data = go.Sankey(link=link, node=node, arrangement='snap')
    fig = go.Figure(data)
    fig.update_layout(title_text=titulo + "-" + "Mujeres", font_size=10, )

    titulo_guardar = (str(titulo) + ".png")
    fig.write_image("/Users/agudelo/Desktop/GRAFICOS PNUD/Graficas/MUJERES/" + titulo_guardar, engine="kaleido")
# Render and save one diagram per country frame.
for y in Lista_DF:
    if y.empty:
        # Nothing to draw and no country code to use as a title.
        continue
    TAGS, Origen, Destino, Valor, Color = grafico(y)
    # Read the country code straight from the data instead of stripping
    # brackets and quotes from the printed form of y["code"].unique().
    titulo = str(y["code"].iloc[0])
    Sankey(TAGS, Origen, Destino, Valor, Color, titulo)
The expected result should be.
The expected result due to the correct order:
The real result i am getting is:
I had a similar problem earlier. I hope this will work for you. As I did not have your data, I created some dummy data. Sorry about the looooong explanation. Here are the steps that should help you reach your goal...
This is what I did:
Order the data and sort it - used pd.Categorical to set the order and then df.sort to sort the data so that the input is sorted by source and then destination.
For the sankey node, you need to set the x and y positions. x=0, y=0 starts at top left. This is important as you are telling plotly the order you want the nodes. One weird thing is that it sometimes errors if x or y is at 0 or 1. Keep it very close, but not the same number... wish I knew why
For the other x and y entries, I used ratios as my total adds up to 285. For eg. Source-Informal starts at x = 0.001 and y = 75/285 as Source-Formal = 75 and this will start right after that
Based on step 1, the link -> source and destination should also be sorted. But, pls do check.
Note: I didn't color the links, but think you already have achieved that...
Hope this helps resolve your issue...
My data - sankey.csv
source,destination,value
Formal,Formal,20
Formal,Informal, 10
Formal,Unemployed,30
Formal,Inactive,15
Informal,Formal,20
Informal,Informal,15
Informal,Unemployed,25
Informal,Inactive,25
Unemployed,Formal,5
Unemployed,Informal,10
Unemployed,Unemployed,10
Unemployed,Inactive,5
Inactive,Formal,30
Inactive,Informal,20
Inactive,Unemployed,20
Inactive,Inactive,25
The code
import plotly.graph_objects as go
import pandas as pd
df = pd.read_csv('sankey.csv') #Read above CSV

# Sort by source and then destination, using the wanted display order
# rather than alphabetical order.
category_order = ['Formal', 'Informal', 'Unemployed', 'Inactive']
df['source'] = pd.Categorical(df['source'], category_order)
df['destination'] = pd.Categorical(df['destination'], category_order)
df.sort_values(['source', 'destination'], inplace=True)
# reset_index returns a new frame unless inplace=True is given.
df.reset_index(drop=True, inplace=True)

mynode = dict(
    pad = 15,
    thickness = 20,
    line = dict(color = "black", width = 0.5),
    label = category_order + category_order,
    # Left column = sources, right column = destinations. Values are kept
    # just inside (0, 1) rather than exactly on the border.
    x = [0.001, 0.001, 0.001, 0.001, 0.999, 0.999, 0.999, 0.999],
    # Cumulative share of flow above each node; these figures are specific
    # to sankey.csv, whose values add up to 285.
    y = [0.001, 75/285, 160/285, 190/285, 0.001, 75/285, 130/285, 215/285],
    color = ["#305CA3", "#C1DAF1", "#C9304E", "#F7DC70", "#305CA3", "#C1DAF1", "#C9304E", "#F7DC70"])

# Node indices come from the categorical codes: sources are 0-3 and
# destinations are shifted behind them to 4-7. This stays aligned with
# df.value even if some source/destination pair is absent from the CSV.
mylink = dict(
    source = df['source'].cat.codes.tolist(),
    target = (df['destination'].cat.codes + len(category_order)).tolist(),
    value = df.value.to_list())

fig = go.Figure(data=[go.Sankey(
    arrangement='snap',
    node = mynode,
    link = mylink
)])
fig.update_layout(title_text="Basic Sankey Diagram", font_size=20)
fig.show()
The output

Python int too large to convert to C long - when running Randomized Search CV method

I am trying to do hyperparameter tuning for XGBClassifer and I'm getting this error while fitting the Randomized Search CV method. I have no idea why this error occurred. I didn't use any number greater than the limit. X_train_mms just has values between 0 and 1 and y_train value is binary 0 and 1. Does anyone know how to solve this issue?
Here is the code:
# Preprocessing
def _scaled_frame(frame):
    """Return *frame* min-max scaled by ``scaler``, keeping its index and columns."""
    return pd.DataFrame(scaler.transform(frame), index=frame.index, columns=frame.columns)


# Fit once on X and reuse the fitted scaler for every split.
# NOTE(review): if X also contains the test/validation rows, their ranges
# leak into the scaling of the training data -- confirm this is intended.
scaler = MinMaxScaler()
scaler.fit(X)
X_mms = _scaled_frame(X)
X_train_mms = _scaled_frame(X_train)
X_test_mms = _scaled_frame(X_test)
X_valid_mms = _scaled_frame(X_valid)
# Hyperparameter tuning function
def hypertune(model_parameter):
    """Run a randomized hyper-parameter search and tabulate its results.

    Fits on the module-level ``X_train_mms`` / ``y_train`` with 100 sampled
    candidates, 10-fold stratified cross-validation and the Matthews
    correlation coefficient as score.

    Args:
        model_parameter: Mapping with the estimator under ``'function'`` and
            its parameter distributions under ``'params'``.

    Returns:
        DataFrame with one row per sampled candidate and the columns
        ``rank``, ``test_means``, ``test_stds``, ``train_means``,
        ``train_stds`` and ``params``.
    """
    search = RandomizedSearchCV(
        model_parameter['function'],
        model_parameter['params'],
        n_iter=100,
        scoring=make_scorer(matthews_corrcoef),
        cv=StratifiedKFold(n_splits=10),
        random_state=42,
        return_train_score=True,
    )
    search.fit(X_train_mms, y_train)

    # Output column name -> key in cv_results_, in output order.
    column_sources = {
        'rank': 'rank_test_score',
        'test_means': 'mean_test_score',
        'test_stds': 'std_test_score',
        'train_means': 'mean_train_score',
        'train_stds': 'std_train_score',
        'params': 'params',
    }
    results = search.cv_results_
    return pd.DataFrame({column: results[key] for column, key in column_sources.items()})
# Candidate values sampled by the randomized search.
# gamma: five ranges flattened into one array (axis=None) -- steps of 0.0001,
# 0.001, 0.01 and 0.1 below 1, followed by the integers 1..10.
parameter = {'gamma': np.concatenate((np.arange(0.0001, 0.001, 0.0001), np.arange(0.001, 0.01, 0.001), np.arange(0.01, 0.1, 0.01), np.arange(0.1, 1, 0.1), list(range(1,11))), axis = None),
# learning_rate: 0.01 up to (excluding) 10 in steps of 0.01.
# NOTE(review): values well above 1 are unusual for a learning rate -- confirm the upper bound.
'learning_rate': np.arange(0.01,10,0.01),
'max_depth': list(range(1,21)),
# 20 evenly spaced tree counts between 50 and 1000, truncated to int.
'n_estimators': [int(x) for x in np.linspace(start = 50, stop = 1000, num = 20)],
'reg_alpha': np.arange(0,10,0.1),
'reg_lambda': np.arange(0,10,0.1)}
# Bundle in the shape hypertune() expects: estimator under 'function', search space under 'params'.
xgbooster = {'name': 'XGBooster', 'function': XGBClassifier(), 'params': parameter}
optimized_xgb = hypertune(xgbooster)

concurrent.futures.ProcessPoolExecutor(): json file is not created

I'm new to this package. I am studying the code from https://github.com/diningphil/graph-mixture-density-networks (with some minor modifications). In the notebook file SIR Simulation with DGL_ERDOS-RENYI.ipynb, I ran into something strange during the simulation process:
If I set debug = True, which means I'm not using the pool = concurrent.futures.ProcessPoolExecutor(max_workers=processes) but just run it one by one, both the .json files and the .bin files will be created according to the json_filepath variable.
However, when I delete the output and run it with debug = False, so that (if my understanding is correct) all the tasks run simultaneously, the .json files are not created, and the program seems to stop at the step graph.to(torch.device(device)), because none of my print statements after it are executed. Only the .bin files are created.
Could anyone help me by telling me the possible reason, or what I should do about it? Thanks a lot!
'''
run simulation and store
1) state of all nodes at each time step
into a single pandas dataframe for all beta, gamma and repetitions
2) R_0
3) number of total people infected (total - susceptible at the end of the iteration)
'''
# Fixed seed so the sampled beta/gamma values are reproducible.
seed = 38
torch.manual_seed(seed)
device = 'cuda'

# Sampling ranges [low, high] for the SIR rates.
beta_range = [0, 1]
gamma_range = [0.1, 1]

iterations = 5
no_graph_samples = 20
no_realizations = 100

# All output of this run goes below a folder named after the graph family.
family_name = 'erdos_renyi'
folder = Path(f'{family_name}')
# exist_ok avoids the check-then-create race of testing os.path.exists first.
os.makedirs(folder, exist_ok=True)
# Run the SIR simulations for one graph sample and store them as
# data_{graph_sample}.json inside graphs_folder; the graph itself is cached
# as sample{graph_sample}.bin in the same folder.
# Reads the module-level settings family_name, no_graph_samples,
# no_realizations, beta_range, gamma_range, iterations and device.
# Returns nothing; the results are only written to disk.
# NOTE(review): when this runs through pool.submit, an exception raised in
# here is stored on the returned Future and is not printed unless the caller
# asks that Future for its result.
def simulate(p, graph_size, graph_sample, graphs_folder):
json_filepath = str(Path(graphs_folder, f'data_{graph_sample}.json'))
graph_filename = graphs_folder / Path(f'sample{graph_sample}.bin')
# Top-level record written to the JSON file.
json_data = {'family': family_name,
'p': p,
'graph_size': graph_size,
'no_graph_samples': no_graph_samples,
'graph_samples': []
}
# Record for this graph sample; every simulation result is appended to it.
sample = {'graph_filename': str(graph_filename),
'simulations': []}
# Create the graph once and reuse the saved .bin on later runs.
if not os.path.exists(graph_filename):
graph = create_erdos_renyi_graph(graph_size, p)
save_graphs(str(graph_filename), graph)
else:
graph = load_graphs(str(graph_filename))[0][0]
#print('test')
# NOTE(review): the return value of graph.to(...) is discarded; if this DGL
# version returns a moved copy instead of moving in place, the code below
# keeps using the original graph -- confirm.
graph.to(torch.device(device))
## every code above this line will run, at least print() will work
# Simulate only when no result file exists yet for this sample.
if not os.path.exists(json_filepath):
print('test: json_does not exit')
for realizations in range(no_realizations):
# Draw one (beta, gamma) pair per realization, uniformly from the configured ranges.
beta = float(torch.FloatTensor(1).uniform_(beta_range[0], beta_range[1]))
gamma = float(torch.FloatTensor(1).uniform_(gamma_range[0], gamma_range[1]))
R0 = beta/gamma
graph.register_message_func(lambda x: SIR_message_func(beta, x))
graph.register_reduce_func(lambda x: SIR_reduce_func(gamma, x))
# The same beta/gamma pair is simulated for three initial infection probabilities.
for initial_probability_of_infection in [0.01, 0.05, 0.1]:
simulation = {'beta': beta, 'gamma': gamma, 'R0': R0, 'init_infection_prob': initial_probability_of_infection}
S, I, R, first_infected = simulate_SIR(graph, initial_probability_of_infection, iterations)
simulation['S'] = S
simulation['I'] = I
simulation['R'] = R
simulation['first_infected'] = first_infected
# Total infected = graph size minus the last entry of S.
simulation['total_infected'] = graph_size - S[-1]
sample['simulations'].append(deepcopy(simulation))
#print("Realization ", realizations, "produced ", graph_size - S[-1], "infected")
json_data['graph_samples'].append(sample)
# The whole record is written as a single JSON line.
with open(json_filepath, 'w') as f:
line = json.dumps(json_data)
f.write(line + '\n')
#json.dump(json_data, f)
print('dumped')
else:
print('test: there is json')
print(sample)
# with open(json_filepath, 'r') as f:
# json.load(f)
# print('loaded but why')
debug = False
processes = 100

import concurrent.futures

pool = concurrent.futures.ProcessPoolExecutor(max_workers=processes)
pending = []

# Reduced sweep. The full one is graph_size in [10, 50, 100, 200, 500, 1000]
# and p in [0.01, 0.05, 0.1, 0.2, 0.3, 0.5].
for graph_size in [10]:
    for p in [0.01, 0.05]:
        # Each (graph size, edge probability) pair gets its own folder;
        # the graph samples are stored inside it.
        graphs_folder = folder / Path(f'graphs_size{graph_size}_p{float(p)}')
        os.makedirs(graphs_folder, exist_ok=True)

        for graph_sample in range(no_graph_samples):
            if not debug:
                pending.append(
                    pool.submit(simulate, p, graph_size, graph_sample, graphs_folder)
                )
            else:  # DEBUG: run sequentially in this process
                simulate(p, graph_size, graph_sample, graphs_folder)

try:
    # An exception raised inside a worker is only stored on its Future;
    # result() re-raises it here so a failing task is no longer silent.
    # NOTE(review): simulate() moves the graph to 'cuda'; CUDA usually cannot
    # be initialised in a forked worker process, which would explain tasks
    # stopping at that line -- confirm from the traceback this now surfaces.
    for future in concurrent.futures.as_completed(pending):
        future.result()
finally:
    pool.shutdown()  # wait the batch of configs to terminate

How to formulate linear programming problem in Cplex using Python

I am trying to solve a linear programming problem using IBM's Cplex, while calling it from Python.
The problem is to minimize a+c,
subject to the constraint that Ax'=m',
where x=[a,b,c]
A = [[20,0,0],[0,20,30]]
m = [20,30]
with a,b,c between 0 and 1.
One correct solution to the problem is a=1, b=0, and c=1. But Cplex gives solutions, a=1, b=1, and c=0. There is an error in formulating the problem but I cannot figure out where. Code below
import cplex
from cplex.exceptions import CplexError
import sys
# Problem data: minimise a + c subject to A x = m with 0 <= a, b, c <= 1,
# where x = [a, b, c], A = [[20, 0, 0], [0, 20, 30]] and m = [20, 30].
my_obj = [1.0, 0.0, 1.0]
my_ub = [1.0] * len(my_obj)
my_lb = [0.0] * len(my_obj)
my_colnames = ["a", "b", "c"]
my_rhs = [20.0, 30.0]
my_rownames = ["c1", "c2"]
my_sense = "E" * len(my_rownames)  # every constraint is an equality


def populatebynonzero(prob):
    """Load the LP into *prob*, giving the constraint matrix by its non-zeros.

    Sets the objective sense to minimise, adds the two equality rows and the
    three bounded variables from the module-level problem data, then fills
    in the non-zero coefficients of A.

    Args:
        prob: The CPLEX problem object to populate (modified in place).
    """
    prob.objective.set_sense(prob.objective.sense.minimize)
    prob.linear_constraints.add(rhs=my_rhs, senses=my_sense, names=my_rownames)
    prob.variables.add(obj=my_obj, ub=my_ub, lb=my_lb, names=my_colnames)

    # One (row, column, value) triple per non-zero of A:
    # row 0: 20*a; row 1: 20*b + 30*c.
    rows = [0, 1, 1]
    cols = [0, 1, 2]
    vals = [20.0, 20.0, 30.0]
    prob.linear_constraints.set_coefficients(zip(rows, cols, vals))
def lpex1():
    """Build the LP, solve it and print a report of the solution.

    Prints the solution status and objective value, the slack and dual value
    of every row, the value and reduced cost of every column, and finally
    the solution vector. The model is also written to ``lpex1.lp``. If CPLEX
    raises while building or solving, the error is printed and the function
    returns without a report.

    Every print call takes a single pre-formatted string, so the code runs
    unchanged on Python 2 and Python 3.
    """
    try:
        my_prob = cplex.Cplex()
        populatebynonzero(my_prob)
        my_prob.solve()
    except CplexError as exc:
        print(exc)
        return

    numrows = my_prob.linear_constraints.get_num()
    numcols = my_prob.variables.get_num()
    solution = my_prob.solution

    print("")
    # get_status() returns an integer code; solution.status[...] maps it to
    # the corresponding string.
    status = solution.get_status()
    print("Solution status =  %s : %s" % (status, solution.status[status]))
    print("Solution value =  %s" % solution.get_objective_value())

    slack = solution.get_linear_slacks()
    pi = solution.get_dual_values()
    x = solution.get_values()
    dj = solution.get_reduced_costs()
    for i in range(numrows):
        print("Row %d: Slack = %10f Pi = %10f" % (i, slack[i], pi[i]))
    for j in range(numcols):
        print("Column %d: Value = %10f Reduced cost = %10f" % (j, x[j], dj[j]))

    my_prob.write("lpex1.lp")
    print("%s SOLUTIONS" % (x,))


lpex1()
There was an error in definition of rows and columns of the constraint, corrections below, works now
# Non-zero constraint coefficients as parallel (row, column, value) lists:
# row 0 has 20 in column 0; row 1 has 20 in column 1 and 30 in column 2.
rows = [0,1,1]
cols = [0,1,2]
vals = [20.0,20.0,30.0]

Fuzzy rules with more than two input variables in python

I am trying to build a fuzzy inference system in python. I have 4 variables depending on which output class is decided.
def fuzzInferenceself():
    """Define the fuzzy sets of the hotel rating system and its input classifiers.

    Builds the membership functions of three inputs (hotel facility, visited
    count, room facility) and of the output class score over the
    module-level universes ``hotel_facility``, ``visited_count``,
    ``room_facility`` and ``class_score``. The nested classifiers evaluate
    every fuzzy set of one input at a crisp value.

    NOTE(review): nothing is returned, so neither the membership functions
    nor the classifiers can be reached by callers; the rule evaluation that
    would combine them is not part of this function.
    """
    ### Input ###
    hf_very_poor = fuzz.trimf(hotel_facility, [0, 0.15, 0.3])
    # Triangle vertices must be ordered a <= b <= c. The first vertex was
    # 2.5; NOTE(review): 0.25 is assumed from the overlap pattern of the
    # neighbouring sets -- confirm.
    hf_poor = fuzz.trimf(hotel_facility, [0.25, 0.3, 0.45])
    hf_average = fuzz.trimf(hotel_facility, [0.4, 0.5, 0.75])
    hf_good = fuzz.trimf(hotel_facility, [0.7, 0.85, 0.9])
    hf_very_good = fuzz.trimf(hotel_facility, [0.875, 0.92, 1.0])

    vc_less = fuzz.trimf(visited_count, [0, 0.05, 0.1])
    vc_average = fuzz.trimf(visited_count, [0.05, 0.2, 0.35])
    vc_many = fuzz.trapmf(visited_count, [0.3, 0.45, 0.55, 0.7])
    vc_a_lot = fuzz.trapmf(visited_count, [0.65, 0.8, 0.9, 1.0])

    # Room-facility sets live on the room_facility universe, the same one
    # roomFacilityClassification interpolates on.
    rm_very_poor = fuzz.trimf(room_facility, [0, 0.15, 0.3])
    rm_poor = fuzz.trimf(room_facility, [0.25, 0.3, 0.45])  # same 2.5 -> 0.25 assumption as hf_poor
    rm_average = fuzz.trimf(room_facility, [0.4, 0.5, 0.75])
    rm_good = fuzz.trimf(room_facility, [0.7, 0.8, 0.9])
    rm_very_good = fuzz.trimf(room_facility, [0.85, 0.9, 1.0])

    ## output ####
    class_very_poor = fuzz.gaussmf(class_score, 1, 0.5)
    class_poor = fuzz.gaussmf(class_score, 1.75, 0.65)
    class_average = fuzz.gaussmf(class_score, 2.25, 0.75)
    class_good = fuzz.gaussmf(class_score, 3, 0.25)
    class_very_good = fuzz.gaussmf(class_score, 3.5, 0.5)

    def hotelFaclilityClassification(self, A):
        """Return the membership degree of hotel-facility score A in each set."""
        hf_vp = fuzz.interp_membership(hotel_facility, hf_very_poor, A)
        hf_p = fuzz.interp_membership(hotel_facility, hf_poor, A)
        hf_av = fuzz.interp_membership(hotel_facility, hf_average, A)
        hf_gd = fuzz.interp_membership(hotel_facility, hf_good, A)
        hf_vg = fuzz.interp_membership(hotel_facility, hf_very_good, A)
        return dict(hfVP=hf_vp, hfP=hf_p, hfAV=hf_av, hGD=hf_gd, hVG=hf_vg)

    def visitCountClassification(B):
        """Return the membership degree of visited count B in each set."""
        # interp_membership needs the crisp value as its third argument.
        vc_l = fuzz.interp_membership(visited_count, vc_less, B)
        vc_av = fuzz.interp_membership(visited_count, vc_average, B)
        vc_mn = fuzz.interp_membership(visited_count, vc_many, B)
        vc_al = fuzz.interp_membership(visited_count, vc_a_lot, B)
        return dict(vcL=vc_l, vcAV=vc_av, vcMN=vc_mn, vcAL=vc_al)

    def roomFacilityClassification(C):
        """Return the membership degree of room-facility score C in each set."""
        rm_vp = fuzz.interp_membership(room_facility, rm_very_poor, C)
        rm_p = fuzz.interp_membership(room_facility, rm_poor, C)
        rm_av = fuzz.interp_membership(room_facility, rm_average, C)
        rm_gd = fuzz.interp_membership(room_facility, rm_good, C)
        rm_vg = fuzz.interp_membership(room_facility, rm_very_good, C)
        return dict(rmVP=rm_vp, rmP=rm_p, rmAV=rm_av, rmGD=rm_gd, rmVG=rm_vg)
A similar function for price is: def priceClassification(D).
The rules are as follows:
"If the hotel facility score is very good, the visited count is a lot, the room facility is very good, and the price is low, then the class is very good."
I do not understand how to code the rules. All sources I have seen takes one input and one output variable. But this is not the case in my code.
Can anyone give me a good resource or idea about how to code this rule?

Categories