I need to produce 5000 kg of steel by mixing 7 alloys.
I want to minimize the cost, so I need to pick the best combination of alloys.
The result must respect the main steel characteristics; for example, the carbon level must be between 2% and 3%, no more, no less.
The Excel linear solver program already exists ,and is originated from a professional book.
I'm trying to translate it to a PULP code, now .
My problem is: how do I create the copper, carbon, and manganese constraints? There are 2 arrays, so I don't know how to do it.
Everything is in percent, and I don't know how to handle that. My result is currently wrong; I left the bad constraints I wrote for information. It seems that I need to divide by 5000 at some point, but how should I do it?
Let me try to explain to you what I can not understand :
I need 5000 kgs of steel to have 0.60 % of copper in it, but my Copper alloy parts contains 90 % and 96% of copper.
Do you see what I mean, and why it is so difficult to describe my constraints?
"""
Mining and metals
We make steel with raw materials, we want to reduce the cost of producing this steel
to make more money, but still respecting the minimum characteristics of quality steel
"""
# Minimize the cost of metal alloys.
# Characteristics of the steel to be made
"""Element %Minimum %Max %Real ( it is a var)
Carbon 2 3 2.26
Copper 0.4 0.6 0.60
Manganese 1.2 1.65 1.20
"""
# Characteristics, stocks and purchase price of alloys
"""
Alloy C% Cu% Mn% Stocks kg Price € / kg
Iron alloy 2.50 0.00 1.30 4000 1.20
Iron alloy 3.00 0.00 0.80 3000 1.50
Iron alloy 0.00 0.30 0.00 6000 0.90
Copper alloy 0.00 90.00 0.00 5000 1.30
Copper alloy 0.00 96.00 4.00 2000 1.45
Aluminum alloy 0.00 0.40 1.20 3000 1.20
Aluminum alloy 0.00 0.60 0.00 2500 1.00
"""
# Import the PuLP lib
from pulp import *
# Create the problem variable
prob = LpProblem("MinimiserLpAlliage", LpMinimize)
# The 7 decision variables (kg of each alloy) have a lower bound of zero
x1 = LpVariable("Iron alloy 1", 0)
x2 = LpVariable("Iron alloy 2", 0)
x3 = LpVariable("Iron alloy 3", 0)
x4 = LpVariable("Copper alloy 1", 0)
x5 = LpVariable("Copper alloy 2", 0)
x6 = LpVariable("Aluminum alloy 1", 0)
x7 = LpVariable("Aluminum alloy 2", 0)
# The objective function is to minimize the total cost of the alloys in EUROS for a given quantity in KGS
prob += 1.20 * x1 + 1.50 * x2 + 0.90 * x3 + 1.30 * x4 + 1.45 * x5 + 1.20 * x6 + 1.00 * x7, "AlliageCost"
# Quantity constraint in KGS.
prob += x1 + x2 + x3 + x4 + x5 + x6 + x7 == 5000, "RequestedQuantity"
# NOTE(review): every "requirement" below bounds the *kilograms* of one alloy
# by that alloy's percentage content. None of them constrains the element
# share of the 5000 kg blend; they are kept as posted because they are the
# constraints the author reports as wrong.
# MIN constraints of% carbon, by alloy // ITS NOT WHAT I NEED
prob += x1 >= 2.5, "MinCarboneRequirement1"
prob += x2 >= 3, "MinCarboneRequirement2"
prob += x3 >= 0, "MinCarboneRequirement3"
prob += x4 >= 0, "MinCarboneRequirement4"
prob += x5 >= 0, "MinCarboneRequirement5"
prob += x6 >= 0, "MinCarboneRequirement6"
prob += x7 >= 0, "MinCarboneRequirement7"
# MIN constraints of% copper, by alloy // ITS WRONG ITS NOT WHAT I NEED
prob += x1 >= 0, "MinCuivreRequirement1"
prob += x2 >= 0, "MinCuivreRequirement2"
prob += x3 >= 0.3, "MinCuivreRequirement3"
prob += x4 >= 90, "MinCuivreRequirement4"
prob += x5 >= 96, "MinCuivreRequirement5"
prob += x6 >= 0.4, "MinCuivreRequirement6"
prob += x7 >= 0.6, "MinCuivreRequirement7"
# MIN constraints of% of Manganese, by alloy // ITS WRONG ITS NOT WHAT I NEED
prob += x1 >= 1.3, "MinManganeseRequirement1"
prob += x2 >= 0.8, "MinManganeseRequirement2"
prob += x3 >= 0, "MinManganeseRequirement3"
prob += x4 >= 0, "MinManganeseRequirement4"
prob += x5 >= 4, "MinManganeseRequirement5"
prob += x6 >= 1.2, "MinManganeseRequirement6"
prob += x7 >= 0, "MinManganeseRequirement7"
# MAX constraints of% of Manganese, by alloy // ITS WRONG ITS NOT WHAT I NEED
prob += x1 <= 1.3, "MaxManganeseRequirement1"
prob += x2 <= 0.8, "MaxManganeseRequirement2"
prob += x3 <= 0, "MaxManganeseRequirement3"
prob += x4 <= 0, "MaxManganeseRequirement4"
prob += x5 <= 4, "MaxManganeseRequirement5"
prob += x6 <= 1.2, "MaxManganeseRequirement6"
prob += x7 <= 0, "MaxManganeseRequirement7"
# 5. MAX constraints from available stock, by alloy // I THINK IT IS OK
prob += x1 <= 4000, "MaxStock"
prob += x2 <= 3000, "MaxStock1"
prob += x3 <= 6000, "MaxStock2"
prob += x4 <= 5000, "MaxStock3"
prob += x5 <= 2000, "MaxStock4"
prob += x6 <= 3000, "MaxStock5"
prob += x7 <= 2500, "MaxStock6"
# The problem data is written to an .lp file
prob.writeLP("WhiskasModel.lp")
# We use the solver
prob.solve()
# The status of the solution
print("Status:", LpStatus[prob.status])
# Display the optimum of each variable
for v in prob.variables():
    print(v.name, "=", v.varValue)
# The result of the objective function is here
print("Total", value(prob.objective))
This is the output, but of course it is wrong, because I don't know how to write the constraints:
Status: Optimal
Aluminum_alloy_1 = 1.2
Aluminum_alloy_2 = 0.6
Copper_alloy_1 = 90.0
Alloy_of_copper_2 = 96.0
Alloy_of_fer_1 = 2.5
Alloy_of_fer_2 = 3.0
Iron_alloy_3 = 4806.7
Total 4,591.76,999,999,999,995
EDIT Hello !
This is the improved version 2 of my code. Sorry, it is in French, but I bet you can see what I mean. It still doesn't work, though, but it is closer to what I need:
Mining and metals
We make steel from raw materials, and we want to reduce the cost of producing this steel
to earn more money but still respecting the important characteristics of quality steel
# Characteristics of the steel to be made
""" Elément % minimal % Max
Carbone 2 3
Cuivre 0.4 0.6
Manganèse 1.2 1.65
"""
# Characteristics, stocks and purchase price of alloys at KILO
"""
Alliage C % Cu % Mn % Stocks kg Prix €/kg
Alliage de fer 1 2,50 0,00 1,30 4000 1,20
Alliage de fer 2 3,00 0,00 0,80 3000 1,50
Alliage de fer 3 0,00 0,30 0,00 6000 0,90
Alliage de cuivre 1 0,00 90,00 0,00 5000 1,30
Alliage de cuivre 2 0,00 96,00 4,00 2000 1,45
Alliage d'alu 1 0,00 0,40 1,20 3000 1,20
Alliage d'alu 2 0,00 0,60 0,00 2500 1,00
"""
# Importer la lib PuLP
from pulp import *
# Create the problem variable
prob = LpProblem("MinimiserLpAlliage", LpMinimize)
# The 7 vars have a zero limit, these decision variables are expressed in KILOS
x1 = LpVariable("Alliage de fer 1", 0)
x2 = LpVariable("Alliage de fer 2", 0)
x3 = LpVariable("Alliage de fer 3", 0)
x4 = LpVariable("Alliage de cuivre 1", 0)
x5 = LpVariable("Alliage de cuivre 2", 0)
x6 = LpVariable("Alliage d'alu 1", 0)
x7 = LpVariable("Alliage d'alu 2", 0)
# The objective function is to minimize the total cost of the alloys in EUROS
prob += 1.20 * x1 + 1.50 * x2 + 0.90 * x3 + 1.30 * x4 + 1.45 * x5 + 1.20 * x6 + 1.00 * x7, "CoutAlliages"
# Quantity constraint in KGS.
prob += x1 + x2 + x3 + x4 + x5 + x6 + x7 == 5000, "QuantitéDemandée"
# NOTE(review): in the three constraint groups below, alloys whose content of
# the element is 0 % appear as a bare variable (implicit coefficient 1) rather
# than being dropped. That counts them as containing 1 % of the element, which
# is likely why the solver reports the model as infeasible. The constraints
# are kept exactly as posted.
# Carbon constraint.
prob += (2.50 * x1 + 3.00 * x2 + x3 + x4 + x5 + x6 + x7) / 5000 <= 3, "carBmax"
prob += (2.50 * x1 + 3.00 * x2 + x3 + x4 + x5 + x6 + x7) / 5000 >= 2, "carBmin"
# Copper constraint.
prob += (x1 + x2 + 0.30 * x3 + 90 * x4 + 96 * x5 + 0.40 * x6 + 0.60 * x7) / 5000 <= 0.6, "cuBmax"
prob += (x1 + x2 + 0.30 * x3 + 90 * x4 + 96 * x5 + 0.40 * x6 + 0.60 * x7) / 5000 >= 0.4, "cuBmin"
# Manganese constraint.
prob += (1.30 * x1 + 0.80 * x2 + x3 + x4 + 4 * x5 + 1.20 * x6 + x7) / 5000 <= 1.65, "mgBmax"
prob += (1.30 * x1 + 0.80 * x2 + x3 + x4 + 4 * x5 + 1.20 * x6 + x7) / 5000 >= 1.2, "mgBmin"
# 5. MAX constraints from available stock, by alloy
prob += x1 <= 4000, "MaxStock"
prob += x2 <= 3000, "MaxStock1"
prob += x3 <= 6000, "MaxStock2"
prob += x4 <= 5000, "MaxStock3"
prob += x5 <= 2000, "MaxStock4"
prob += x6 <= 3000, "MaxStock5"
prob += x7 <= 2500, "MaxStock6"
# The problem data is written to an .lp file
prob.writeLP("acier.lp")
# Run the solver
prob.solve()
# The status of the solution
print("Status:", LpStatus[prob.status])
# Display the optimum of each variable
for v in prob.variables():
    print(v.name, "=", v.varValue)
# The result of the objective function is here
print("Total payable in euros", value(prob.objective))
""" Status: Infeasible
Alliage_d'alu_1 = 0.0
Alliage_d'alu_2 = 0.0
Alliage_de_cuivre_1 = 0.0
Alliage_de_cuivre_2 = 0.0
Alliage_de_fer_1 = 0.0
Alliage_de_fer_2 = 0.0
Alliage_de_fer_3 = 10000.0
Total à payer en euros 9000.0 """
The book says the result with the excel solver is :
iron_1 : 4000 kgs
iron_2 : 0 kgs
iron_3 : 397.76kgs
cu_1 : 0 kgs
cu_2 : 27.61kgs
al_1 : 574.62kgs
al_2 : 0kgs
Cost in euros 5887.57
Steel contains 2% carb, 0.6 % cu, 1.2 %
manganese
Excel tab :
Solver pic :
Part of your problem is how you are understanding/applying percentages. My recommendation would be to convert percentages [0-100] to fractional numbers [0-1.0] as early as possible.
In excel when a cell says 50% the numeric value of the cell is actually 0.5. Working with percentages in this way means you don't have to keep dividing out by 100, and can multiply one percentage with another and it all just works.
The code below does what you want:
"""
Mining and metals
We make steel with raw materials, we want to reduce the cost of producing this steel
to make more money, but still respecting the minimum characteristics of quality steel
"""
# Minimize the cost of metal alloys.
# Characteristics of the steel to be made
"""Element %Minimum %Max %Real (it is a var)
Carbon 2 3 2.26
Copper 0.4 0.6 0.60
Manganese 1.2 1.65 1.20
"""
# Characteristics, stocks and purchase price of alloys
"""
Alloy C% Cu% Mn% Stocks kg Price € / kg
Iron alloy 2.50 0.00 1.30 4000 1.20
Iron alloy 3.00 0.00 0.80 3000 1.50
Iron alloy 0.00 0.30 0.00 6000 0.90
Copper alloy 0.00 90.00 0.00 5000 1.30
Copper alloy 0.00 96.00 4.00 2000 1.45
Aluminum alloy 0.00 0.40 1.20 3000 1.20
Aluminum alloy 0.00 0.60 0.00 2500 1.00
"""
# Import the PuLP lib
from pulp import *
# Create the problem variable
prob = LpProblem("MinimiserLpAlliage", LpMinimize)

# Problem Data
input_mats = ["iron_1", "iron_2", "iron_3",
              "cu_1", "cu_2",
              "al_1", "al_2"]

# Purchase price of each input, in euros per kg
input_costs = {"iron_1": 1.20, "iron_2": 1.50, "iron_3": 0.90,
               "cu_1": 1.30, "cu_2": 1.45,
               "al_1": 1.20, "al_2": 1.00}

# Composition of each input as fractions in [0, 1] (not percentages),
# in the order:            C      Cu     Mn
input_composition = {"iron_1": [0.025, 0.000, 0.013],
                     "iron_2": [0.030, 0.000, 0.008],
                     "iron_3": [0.000, 0.003, 0.000],
                     "cu_1": [0.000, 0.900, 0.000],
                     "cu_2": [0.000, 0.960, 0.040],
                     "al_1": [0.000, 0.004, 0.012],
                     "al_2": [0.000, 0.006, 0.000]}

# Available stock of each input, in kg
input_stock = {"iron_1": 4000, "iron_2": 3000, "iron_3": 6000,
               "cu_1": 5000, "cu_2": 2000,
               "al_1": 3000, "al_2": 2500}

# Quantity of steel to produce, in kg
request_quantity = 5000

# Allowed element content of the finished steel, as fractions
Carbon_min = 0.02
Carbon_max = 0.03
Cu_min = 0.004
Cu_max = 0.006
Mn_min = 0.012
Mn_max = 0.0165

# Problem variables - amount in kg of each input
x = LpVariable.dicts("input_mat", input_mats, 0)

# The objective function is to minimize the total cost of the alloys in EUROS for a given quantity in KGS
prob += lpSum([input_costs[i]*x[i] for i in input_mats]), "AlliageCost"

# Quantity constraint in KGS.
prob += lpSum([x[i] for i in input_mats]) == request_quantity, "RequestedQuantity"

# Each element constraint compares kg of the element in the blend (left) with
# the allowed fraction times the total kg (right), which keeps the model linear.
# MIN/MAX constraint of carbon in resultant steel
prob += lpSum([x[i]*input_composition[i][0] for i in input_mats]) >= Carbon_min*request_quantity, "MinCarbon"
prob += lpSum([x[i]*input_composition[i][0] for i in input_mats]) <= Carbon_max*request_quantity, "MaxCarbon"

# MIN/MAX constraints of copper in resultant steel
prob += lpSum([x[i]*input_composition[i][1] for i in input_mats]) >= Cu_min*request_quantity, "MinCu"
prob += lpSum([x[i]*input_composition[i][1] for i in input_mats]) <= Cu_max*request_quantity, "MaxCu"

# MIN/MAX constraints of manganese in resultant steel
prob += lpSum([x[i]*input_composition[i][2] for i in input_mats]) >= Mn_min*request_quantity, "MinMn"
prob += lpSum([x[i]*input_composition[i][2] for i in input_mats]) <= Mn_max*request_quantity, "MaxMn"

# MAX constraints of available stock
for i in input_mats:
    prob += x[i] <= input_stock[i], ("MaxStock_" + i)

# Solve the problem
prob.solve()

# The status of the solution
print("Status:", LpStatus[prob.status])

# Display the optimum of each variable
for v in prob.variables():
    print(v.name, "=", v.varValue)

# Display the resulting material composition (fraction of the finished steel)
Carbon_value = sum([x[i].varValue*input_composition[i][0] for i in input_mats])/request_quantity
Cu_value = sum([x[i].varValue*input_composition[i][1] for i in input_mats])/request_quantity
Mn_value = sum([x[i].varValue*input_composition[i][2] for i in input_mats])/request_quantity
print("Carbon content: " + str(Carbon_value))
print("Copper content: " + str(Cu_value))
print("Manganese content: " + str(Mn_value))

# The result of the objective function is here
print("Total", value(prob.objective))
From which I get:
Status: Optimal
input_mat_al_1 = 574.62426
input_mat_al_2 = 0.0
input_mat_cu_1 = 0.0
input_mat_cu_2 = 27.612723
input_mat_iron_1 = 4000.0
input_mat_iron_2 = 0.0
input_mat_iron_3 = 397.76302
Carbon content: 0.02
Copper content: 0.006000000036
Manganese content: 0.012000000008
Total 5887.57427835
Related
I am trying to solve a blending problem with a system of 3 equations and I have 3 objectives to reach for, or try to get the values as close as posible for the three of them:
The equations are:
def sat(c, s, a, f):
    """Return 100*c / (2.8*s + 1.18*a + 0.65*f).

    The blend target for this ratio is 98.5.
    Raises ZeroDivisionError when the weighted denominator is zero.
    """
    return (100 * c) / (2.8 * s + 1.18 * a + 0.65 * f)
def ms(s, a, f):
    """Return s / (a + f).

    The blend target for this ratio is 2.5.
    Raises ZeroDivisionError when a + f is zero.
    """
    return s / (a + f)
def ma(a, f):
    """Return a / f.

    The blend target for this ratio is ma = 1.3.
    Raises ZeroDivisionError when f is zero.
    """
    return a / f
#The total mix ratio:
r1+r2+r3+r4+r5+r6 = 1
material_1:
c = 51.29
s = 4.16
a = 0.97
f = 0.38
material_2:
c = 51.42
s = 4.16
a = 0.95
f = 0.37
material_3:
c = 6.88
s = 63.36
a = 13.58
f = 3.06
material_4:
c = 32.05
s = 1.94
a = 0.0
f = 0.0
material_5:
c = 4.56
s = 21.43
a = 3.82
f = 52.28
material_6:
c = 0.19
s = 7.45
a = 4.58
f = 0.42
#The aproximate values I am trying to find are around:
0.300 <= r1 <= 0.370
0.300 <= r2 <= 0.370
0.070 <= r3 <= 0.130
0.005 <= r4 <= 0.015
0.010 <= r5 <= 0.030
0.110 <= r6 <= 0.130
So how can I calculate the value for every ratio "r" in order to get the closets values to the objectives for the 3 equations?
I looked on some optimizers but as I am new with them I still can not understand how to set up the problem, the equations and constraints into them.
I guess I made it, of course the code is awful but I will try to make it look better later.
I added the cost of the components so I can give a function to "minimize", of course this is becouse I know the aproximated material ratio so it guide the solver to it.
I will post the code for it:
# Component contents of each material; the numeric suffixes 1, 2, 3, 5, 6, 7
# identify the material.
# NOTE(review): c1 is 51.42 here while the question lists c = 51.29 for
# material_1 -- confirm which value is intended.
c1, c2, c3, c5, c6, c7 = 51.42, 51.42, 6.88, 32.05, 4.56, 0.19
s1, s2, s3, s5, s6, s7 = 4.16, 4.16, 63.36, 1.94, 21.43, 7.45
a1, a2, a3, a5, a6, a7 = 0.97, 0.95, 13.58, 0.0, 3.82, 4.58
f1, f2, f3, f5, f6, f7 = 0.38, 0.37, 3.06, 0.0, 52.28, 0.42

# Material 7 has a fixed share; the other shares are decision variables.
r7 = 0.125
r1 = cp.Variable()
r2 = cp.Variable()
r3 = cp.Variable()
r5 = cp.Variable()
r6 = cp.Variable()

# Costs (limestone, clay, iron, gypsum)
caliza = 10
arcilla = 20
hierro = 170
yeso = 80

# Content of each component in the blend, as a function of the shares.
c_mix = r1*c1+r2*c2+r3*c3+r5*c5+r6*c6+r7*c7
s_mix = r1*s1+r2*s2+r3*s3+r5*s5+r6*s6+r7*s7
a_mix = r1*a1+r2*a2+r3*a3+r5*a5+r6*a6+r7*a7
f_mix = f1*r1+f2*r2+f3*r3+f5*r5+f6*r6+f7*r7

# Cheapest blend: the cost term steers the solver towards the expected shares.
objective = cp.Minimize(r1*caliza+r2*caliza+r3*arcilla+r5*yeso+r6*hierro)

constraints = [
    # materials 1 and 2 are used in equal shares
    r1-r2 == 0,
    # bounds on the individual shares
    r1 >= 0.20,
    r1 <= 0.40,
    r3 <= 0.14,
    r3 >= 0.06,
    r5 >= 0.001,
    r5 <= 0.008,
    r6 >= 0.01,
    r6 <= 0.03,
    # s / (a + f) == 2.5, multiplied out so it stays linear
    2.5*((a_mix)+(f_mix))-(s_mix) == 0,
    # 100*c / (2.8*s + 1.18*a + 0.65*f) == 98.5, multiplied out
    (98.5*(2.8*(s_mix)+1.18*(a_mix)+0.65*(f_mix))-100*(c_mix)) == 0,
    # a / f == 1.3 is left out: the three targets cannot all be met at once
    #1.3*(f_mix)-(a_mix) == 0,
    # shares add up to the whole mix
    r1+r2+r3+r5+r6+r7 == 1,
]

problem = cp.Problem(objective, constraints)
problem.solve()
print(r1.value,r2.value,r3.value,r5.value,r6.value)
print(problem.status)
This gives me the result:
0.3644382497863931 0.3644382497863931 0.12287226775076901 0.0009999999955268117 0.022251232680917873
optimal
Anyway, the only way to get a feasible result is to consider only 2 of the 3 constraint functions, because the components can't reach all 3 of them. This indicates that I need to check the material components before I try to reach the 3 constraints (which were sat, ma and ms).
Now I will try to improve the code using pandas so I can get the material components with some kind of for loop, and also use it for the ratios.
Thank you so much for your help👍.
So this is a simple/trivial example to show the intent that was mentioned in comment to minimize the square of errors... Instead of using a constraint to pin a value to an exact outcome, we let the solver find the best outcome that minimizes the square of the error where error = value - target. I think what I've written below is fairly clear. CVXPY likes to work in the linear algebra realm, and I'm sure this could be converted into vector / matrix format, but the concept is to remove constraints and let the solver figure out the best combo. Obviously, if there are hard constraints, those need to be added, but note that I've just made an example with 2 of your 3 targets (with some trivial math) and moved it into the objective.
Your problem with 3 constraints that aren't simultaneously satisfiable is probably a candidate for a conversion like this...
import cvxpy as cp
# Two unconstrained scalar decision variables.
r1 = cp.Variable()
r2 = cp.Variable()

# Target values the two linear expressions should get as close to as possible.
ma = 2.5
ms = 3.4

# Squared error of each expression against its target.
delta_1 = (r1 + r2 - ma)**2  # error of r1 + r2 against ma
delta_2 = (3*r1 + 2*r2 - ms)**2  # error of 3*r1 + 2*r2 against ms

# No constraints: the targets live in the objective as a sum of squared errors.
objective = cp.Minimize(delta_1 + delta_2)
prob = cp.Problem(objective)
prob.solve()

print(prob.value)
print(r1.value, r2.value)
Output
9.860761315262648e-31
-1.6000000000000014 4.100000000000002
Ok this is what i have done and works fine:
# Read the component values from the pandas DataFrame `df`.
# The rows are laid out as c, s, a, f for each material in turn, so
# component j of material k (0-based) sits at row label 4*k + j.
_N_MATERIALS = 6
_N_COMPONENTS = 4
c_vals, s_vals, a_vals, f_vals = (
    [df.at[_N_COMPONENTS * k + j, 'MAX'] for k in range(_N_MATERIALS)]
    for j in range(_N_COMPONENTS)
)
# Keep the individual names available (materials are numbered 1, 2, 3, 5, 6, 7).
c1, c2, c3, c5, c6, c7 = c_vals
s1, s2, s3, s5, s6, s7 = s_vals
a1, a2, a3, a5, a6, a7 = a_vals
f1, f2, f3, f5, f6, f7 = f_vals

r1 = cp.Variable()
r2 = cp.Variable()
r3 = cp.Variable()
r5 = cp.Variable()
r6 = cp.Variable()
# Fixed share of material 7, expressed as a fraction. The shares must sum to 1
# (last constraint) and the others are non-negative, so the previous value of
# 12.5 made the problem infeasible; 12.5 % is 0.125.
r7 = 0.125
ratios = [r1, r2, r3, r5, r6, r7]


def _blend(values):
    """Return the share-weighted sum of one component over all materials."""
    return sum(r * v for r, v in zip(ratios, values))


c_mix = _blend(c_vals)
s_mix = _blend(s_vals)
a_mix = _blend(a_vals)
f_mix = _blend(f_vals)

# Objectives
ma = 1.3
ms = 2.50
lsf = 98.5

# Each delta is the squared residual of one target, written in its
# multiplied-out (linear) form so the objective stays convex.
delta1 = (ms*(a_mix + f_mix) - s_mix)**2  # s / (a + f) == ms
delta2 = (ma*f_mix - a_mix)**2  # a / f == ma
delta3 = (lsf*(2.8*s_mix + 1.18*a_mix + 0.65*f_mix) - 100*c_mix)**2  # 100c / (...) == lsf
objective = cp.Minimize(delta1+delta2+delta3)

constraints = [r1-r2 == 0,  # forces r1 = r2
               r1 >= 0.20,
               r3 >= 0,  # non-negativity
               r5 >= 0,
               r5 <= 0.008,
               r6 >= 0,
               r1+r2+r3+r5+r6+r7 == 1]
problem = cp.Problem(objective, constraints)
problem.solve()
print(r1.value,r2.value,r3.value,r5.value,r6.value)
print(problem.status)
Once again i want to thank you for your help guys.
Maybe you know how I can improve the code for get the variable values, maybe there is and example of using a for loop to get the values instead of put it directly from the DF for each one, the DF looks like this:
DATO MAX
0 c1 51.95000
1 s1 3.07000
2 a1 0.83000
3 f1 0.31000
4 c2 52.26000
5 s2 2.82000
6 a2 0.75000
...
I would like to know what is the distance of the nearest place in dataframe two to each of the rows in dataframe one.(What is the nearest place in distance for each coordinates in my dataframe one)
LOOK BELOW ALL MY CODE
I have two Dataframes: (In the original DataFrame I have thousands of rows)
The DataFrame 1 called "place_locations" :*
|CLUSTER| |CLIENT| |LATITUDE| |LENGHT|
0 X1 19.45685402 -70.68645898
1 X1 19.39320504 -70.52567322
2 X1 18.614736 -68.71711383
3 X2 18.47977644 -69.93177289
4 X2 19.76546997 -70.51085451
5 X3 18.55835346 -68.38226906
6 X3 19.79037017 -70.68748243
7 X4 19.2232559 -70.52629188
8 X4 18.42865751 -68.9703434
9 X5 19.37935119 -70.51440314
10 X5 18.68743273 -68.45068029
11 X6 19.44126162 -70.73159162
12 X6 19.6678557 -70.36758867
13 X7 18.7816069 -70.2598325
14 X8 19.48708304 -70.74375908
15 X8 18.93720371 -70.40746487
16 X9 19.299298 -69.5559162
17 X10 18.60044506 -68.41991221
18 X10 19.30702896 -69.54500792
19 X11 19.3783253 -70.618205
The DataFrame 2 called "Coordinates_coords" :
| PLACE| | LATITUDE| | LENGHT|
supermarket 18.63609095 -68.39650565
school 19.44512055 -70.66851055
restarant 18.48377033 -69.93910793
spa 18.46608496 -69.92713481
supermarket 18.45646778 -69.9395694
restaurant 18.4845644 -69.9300583
school 18.47284417 -69.9345797
def haversine_np(lon1, lat1, lon2, lat2):
    """Return the great-circle distance in kilometres between two points.

    Arguments are in decimal degrees and are given longitude first, then
    latitude. Scalars and NumPy arrays are both accepted (the arithmetic is
    element-wise). A sphere of radius 6371 km is used.
    """
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = np.sin(dlat / 2.0) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2.0) ** 2
    # Rounding can push `a` marginally outside [0, 1] for coincident or
    # antipodal points, which would make sqrt/arcsin return NaN.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arcsin(np.sqrt(a))
    km = 6371 * c
    return km
def Top_nearest(distancia, distancias, todos = False, limit = 1.0):
    """Return the indexes of, and distances to, the candidate points.

    distancia  -- reference point as a (latitude, longitude) pair in degrees.
    distancias -- iterable of candidate (latitude, longitude) pairs.
    todos      -- when true, keep every candidate; otherwise keep only those
                  closer than `limit`.
    limit      -- distance threshold in km, used only when `todos` is false.

    Returns a tuple (indexes, distances): the positions of the kept candidates
    in `distancias`, in input order, and their distances in km as an array.
    """
    # haversine_np takes longitude first, while the points are stored
    # latitude first, so the components are swapped at the call.
    results = np.array([
        haversine_np(distancia[1], distancia[0], d[1], d[0])
        for d in distancias
    ])
    if not todos:
        indexes = np.where(results < limit)
    else:
        indexes = np.where(results >= 0)
    return list(indexes[0]), results[indexes]
# Candidate places as an array of (LATITUDE, LENGHT) rows; built once because
# it does not depend on the client row.
place_coords = Coordinates_coords[['LATITUDE', 'LENGHT']].reset_index(drop=True).values

nearest_coordinates = list()
for index, row in place_locations.iterrows():
    indexes, distances = Top_nearest(row[['LATITUDE', 'LENGHT']].values,
                                     place_coords,
                                     todos=True)
    # `distances` is in the order of the places, not sorted, so the nearest
    # place is the minimum -- not the first element.
    nearest_coordinates.append(distances.min())
nearest_coordinates [:5]
place_locations['Distance_locations'] = nearest_coordinates
place_locations
The results that I'm getting are not correct; there is something in the calculation that I can't identify. The Distance_locations column that I'm getting doesn't give me the distance to the nearest location.
I've previously posted on this. location, distance, nearest
It's simplest to use a library to calculate distances. geopy has worked well for me
import geopy.distance
import pandas as pd
import io
# Client locations. The first header name (CLUSTER) labels the row number.
df1 = pd.read_csv(io.StringIO("""CLUSTER CLIENT LATITUDE LENGHT
0 X1 19.45685402 -70.68645898
1 X1 19.39320504 -70.52567322
2 X1 18.614736 -68.71711383
3 X2 18.47977644 -69.93177289
4 X2 19.76546997 -70.51085451
5 X3 18.55835346 -68.38226906
6 X3 19.79037017 -70.68748243
7 X4 19.2232559 -70.52629188
8 X4 18.42865751 -68.9703434
9 X5 19.37935119 -70.51440314
10 X5 18.68743273 -68.45068029
11 X6 19.44126162 -70.73159162
12 X6 19.6678557 -70.36758867
13 X7 18.7816069 -70.2598325
14 X8 19.48708304 -70.74375908
15 X8 18.93720371 -70.40746487
16 X9 19.299298 -69.5559162
17 X10 18.60044506 -68.41991221
18 X10 19.30702896 -69.54500792
19 X11 19.3783253 -70.618205"""), sep=r"\s+")
# Candidate places.
df2 = pd.read_csv(io.StringIO(""" PLACE LATITUDE LENGHT
supermarket 18.63609095 -68.39650565
school 19.44512055 -70.66851055
restarant 18.48377033 -69.93910793
spa 18.46608496 -69.92713481
supermarket 18.45646778 -69.9395694
restaurant 18.4845644 -69.9300583
school 18.47284417 -69.9345797"""), sep=r"\s+")

# Cross join (every client row against every place) via a constant key, then
# compute the geodesic distance for each pair. After the merge the `_x`
# columns come from df1 (client) and the `_y` columns from df2 (place).
# NOTE: the client point previously used LENGHT_y (the place's longitude), so
# distances ignored the client's longitude; results produced before this fix
# can differ.
# no need to calc distance in miles and kms, there for reference
df3 = (df1
       .assign(foo=1)
       .merge(df2.assign(foo=1), on="foo")
       .assign(distance_km=lambda dfa: dfa.apply(lambda r:
               geopy.distance.geodesic(
                   (r["LATITUDE_x"], r["LENGHT_x"]),
                   (r["LATITUDE_y"], r["LENGHT_y"])).km, axis=1))
       .assign(distance_miles=lambda dfa: dfa.apply(lambda r:
               geopy.distance.geodesic(
                   (r["LATITUDE_x"], r["LENGHT_x"]),
                   (r["LATITUDE_y"], r["LENGHT_y"])).miles, axis=1))
       )

# now find nearest PLACE to a CLIENT and count
(df3.sort_values(["CLIENT", "distance_km"])
 .groupby(["CLIENT"]).agg({"PLACE": "first", "distance_km": "first"})
 .reset_index()
 .groupby("PLACE")["CLIENT"].count()
 .to_frame().reset_index().sort_values("CLIENT", ascending=False)
 )
output
PLACE CLIENT
2 school 5
3 supermarket 4
0 restarant 1
1 restaurant 1
I am currently trying to create a program to calculate the total of customer's purchases for a bookstore, however I am currently stuck on the part where I have to create a function for calculating all the input values. So far, I can only do the menu part (the function that returns the input values) and even then I'm not entirely sure I'm doing it right. I also need to save the customers' purchase record to a text file, but that's a story for another day. Any help would be appreciated. Thanks!
I tried using global to pass down the values from the RainbowStationary() function to the Total() function but it didn't work.
def RainbowStationary():
    """Prompt for the quantity of each item and return the eight quantities.

    Prints the shop banner, then asks for one integer per item. Returns a
    tuple of eight ints in the order the items are prompted for.
    Raises ValueError if an entry is not an integer.
    """
    print("~~ Rainbow Stationary ~~")
    a = int(input("A4 Paper (Canon): "))
    b = int(input("A4 Paper (rainbow): "))
    c = int(input("Long Ruler: "))
    d = int(input("Short Ruler: "))
    e = int(input("Blue Pen: "))
    f = int(input("Red Pen: "))
    g = int(input("Black Pen: "))
    h = int(input("2B Pencil: "))
    return a, b, c, d, e, f, g, h
def Total():
I'm not sure what I'm supposed to write here. But this is the part where it is supposed to calculate the price for all those items in def RainbowStationary().
Oh, and this is the price list that I want to use:
A4 paper (canon): 8.9
A4 paper (rainbow): 7.5
Long ruler: 0.85
Short ruler: 0.55
Blue Pen: 0.65
Red Pen: 0.65
Black Pen: 0.65
2B Pencil: 2.4
I have no error messages, but here are the Expected results:
"~~ Rainbow Stationary ~~
A4 Paper (Canon): 1
A4 Paper (rainbow): 1
Long Ruler: 1
Short Ruler: 1
Blue Pen: 1
Red Pen:1
Black Pen: 1
2B Pencil: 1
Total: 22.15
use dictionary and iterate over its items, you can just do it like:
# Unit price of each item; the keys double as the input prompts.
prices = {"A4 paper (canon)": 8.9,
          "A4 paper (rainbow)": 7.5,
          "Long ruler": 0.85,
          "Short ruler": 0.55,
          "Blue Pen": 0.65,
          "Red Pen": 0.65,
          "Black Pen": 0.65,
          "2B Pencil": 2.4
          }


def RainbowStationary():
    """Ask for the quantity of every item in `prices` and print the total.

    Raises ValueError if an entry is not an integer.
    """
    total = 0
    for key, val in prices.items():
        stack = int(input(key + ": "))
        total = total + (stack*val)
    # Printed once, after all quantities have been read.
    print("Total: %.2f" % total)


RainbowStationary()
output:
A4 paper (canon): 3
A4 paper (rainbow): 4
Long ruler: 5
Short ruler: 3
Blue Pen: 2
Red Pen: 1
Black Pen: 2
2B Pencil: 3
Total: 73.05
You could pass the quantities for the different goods as parameters to the function and return the sum of their respective prices:
def RainbowStationary():
    """Prompt for the quantity of each item and return the eight quantities.

    Prints the shop banner, then asks for one integer per item. Returns a
    tuple of eight ints in prompt order, ready to be unpacked into
    Total(a, b, c, d, e, f, g, h).
    Raises ValueError if an entry is not an integer.
    """
    print("~~ Rainbow Stationary ~~")
    prompts = (
        "A4 Paper (Canon): ",
        "A4 Paper (rainbow): ",
        "Long Ruler: ",
        "Short Ruler: ",
        "Blue Pen: ",
        "Red Pen: ",
        "Black Pen: ",
        "2B Pencil: ",
    )
    return tuple(int(input(prompt)) for prompt in prompts)
def Total(a, b, c, d, e, f, g, h):
    """Return the total price for the given quantities.

    The arguments are the quantities of, in order: A4 paper (Canon),
    A4 paper (rainbow), long ruler, short ruler, blue pen, red pen,
    black pen and 2B pencil.
    """
    unit_prices = (8.9, 7.5, 0.85, 0.55, 0.65, 0.65, 0.65, 2.4)
    quantities = (a, b, c, d, e, f, g, h)
    return sum(qty * price for qty, price in zip(quantities, unit_prices))
# Read the eight quantities, price them, and print the total with two decimals.
goods = RainbowStationary()
# The tuple is unpacked into Total's eight positional parameters.
price = Total(*goods)
print("Total: %.2f" % price)
However, this is very repetitive, and very hard to extend, or even just to change the prices. Instead, you could use a dictionary, mapping wares to prices, and just loop the items in that dictionary:
# Unit price of each item; the keys double as the input prompts.
prices = {"A4 paper (canon)": 8.9,
          "A4 paper (rainbow)": 7.5,
          "Long ruler": 0.85,
          "Short ruler": 0.55,
          "Blue Pen": 0.65,
          "Red Pen": 0.65,
          "Black Pen": 0.65,
          "2B Pencil": 2.4}

print("~~ Rainbow Stationary ~~")
total = 0
# Ask for the quantity of each item and accumulate quantity * unit price.
for good in prices:
    n = int(input("%s: " % good))
    total += prices[good] * n
print("Total: %.2f" % total)
Of course, you can still split it into two function by storing the quantities in another dictionary and returning that instead of a dozen individual variables:
def RainbowStationary():
    """Prompt for the quantity of every item in `prices`.

    Returns a dict mapping each item name to the integer quantity entered.
    Raises ValueError if an entry is not an integer.
    """
    print("~~ Rainbow Stationary ~~")
    quantities = {}
    for good in prices:
        n = int(input("%s: " % good))
        quantities[good] = n
    return quantities
def Total(quantities):
    """Return the total price of `quantities`, a mapping of item name to count.

    Unit prices are looked up in `prices`; an item missing from `prices`
    raises KeyError.
    """
    return sum(n * prices[g] for g, n in quantities.items())
# Read the quantities, price them, and print the total with two decimals.
goods = RainbowStationary()
price = Total(goods)
print("Total: %.2f" % price)
My use-case is as follows:
I want to make an electric bill generator with different units having different prices. In below program, if the user enters units between 1-150 then for making price value it will do multiplication with 2.50 and make a price. If the user Units are between the 151-300 then it will multiply the first 150 units with 2.50 and left units will multiply with the 3.60 means to say If user entered 155 then for first 150 150 * 2.50 and for left units 5 it will do multiplication 5 * 3.60. like the above I said I'm making like this four variations given Below:-
if units lies between 1-150 then it will multiplies with the 2.50 and calculates the price.
if units lies between 151-300 then it will multiplies first 150 units with the price 2.50 and left units below the 300 will multiply with the 3.60 and calculates the price.
if units lies between 301-450 then it will multiplies first 150 units with the price 2.50 and other 150 units will multiplies with the 3.60 and left units are multiplies with the 4.75 and calculates the price.
if units lies between 451-600 then it will multiplies first 150 units with the price 2.50 and other 150 units will multiplies with the 3.60 and other 150 units are multiplies with the 4.75 and left units are multiplies with the 5 and calculates the price.
if units lies above 600 then it will multiplies first 150 units with the price 2.50 and other 150 units will multiplies with the 3.60 and other 150 units are multiplies with the 4.75 and next 150 units are multiplies with the 5 and left units are multiplies with the 6 and calculates the price.
Here is my code which is seek to optimize, i.e. shorten:
# Tiered electricity bill as posted in the question. The tiers are 150 units
# wide and charged at 2.50, 3.60, 4.75, 5 and 6 per unit.
# The arithmetic is kept exactly as posted because the answers discuss it;
# see the NOTE(review) remarks on the last two cases.
units = int(input("Enter the units"))
if 1 <= units <= 150:
    firstSum = units * 2.50
    print("First sum:-", firstSum)
if 151 <= units <= 300:
    firstSum = 150 * 2.50
    subUnits = units - 150
    secondSum = firstSum + (subUnits * 3.60)
    print(secondSum)
if 301 <= units <= 450:
    firstSum = 150 * 2.50
    subUnits1 = units - 150
    firstSum += 150 * 3.60
    subUnits = subUnits1 - 150
    secondSum = firstSum + subUnits * 4.75
    print(secondSum)
if 451 <= units <= 600:
    firstSum = 150 * 2.50
    subUnits1 = units - 150
    firstSum += 150 * 3.60
    subUnits1 -= 150
    firstSum += subUnits1 * 4.75  # NOTE(review): charges units - 300 at 4.75 instead of one 150-unit tier
    subUnits = subUnits1 - 150
    secondSum = firstSum + subUnits * 5
    print(secondSum)
if 601 <= units:
    firstSum = 150 * 2.50
    subUnits1 = units - 150
    firstSum += 150 * 3.60
    subUnits1 -= 150
    firstSum += subUnits1 * 4.75  # NOTE(review): same issue as in the 451-600 case
    subUnits2 = subUnits1 - 150
    firstSum += 150 * 5
    subUnits = subUnits2 - 150
    secondSum = firstSum + subUnits * 6
    print(secondSum)
Can any one help me to make my program in short way.
Thank you for your precious time
If I understand well your problem, I don't think your code is doing the right thing for the two last cases. It looks like a wrong copy/paste of the previous cases ;)
I think that for the two last cases you should have :
# Corrected versions of the last two cases: the third tier is a full
# 150 units at 4.75, and only the remainder is charged at the next rate.
if 451 <= units <= 600:
    firstSum = 150 * 2.50
    subUnits1 = units - 150
    firstSum += 150 * 3.60
    subUnits1 -= 150
    firstSum += 150 * 4.75
    subUnits = subUnits1 - 150  # units above 450
    secondSum = firstSum + subUnits * 5
    print(secondSum)
if 601 <= units:
    firstSum = 150 * 2.50
    subUnits1 = units - 150
    firstSum += 150 * 3.60
    subUnits1 -= 150
    firstSum += 150 * 4.75
    subUnits2 = subUnits1 - 150
    firstSum += 150 * 5
    subUnits = subUnits2 - 150  # units above 600
    secondSum = firstSum + subUnits * 6
    print(secondSum)
To answer your question, if I understood well, you can do something like :
units = int(input("Enter the units : "))
# Rate of each 150-unit tier up to `upper_bound`, then the rate above it.
factor_list = [2.5, 3.6, 4.75, 5]
last_factor = 6
upper_bound = 600
step = 150
SUM = 0
# Everything above the upper bound is charged at the last rate and then
# removed, so the rest of the code only handles the regular tiers.
if units > upper_bound:
    SUM += (units - upper_bound) * last_factor
    units = upper_bound
# Number of completely filled tiers.
full_tiers = int(units / step)
for i in range(full_tiers):
    SUM += step * factor_list[i]
# Remaining units of a partially filled tier, charged at that tier's rate.
if full_tiers < len(factor_list):
    SUM += (units - full_tiers * step) * factor_list[full_tiers]
print(SUM)
This solution simply avoid the multiple if statements by computing the euclidean division of units. That way you can easily change the coefficients or add others without needing to write other cases.
The first if statement takes care of all the units that are greater than the upper bound. It basically multiplies all the units above 600 with 6 and remove them from the units to be handled.
By the line nb150 = units/step and taking the integer part, I obtain the number of groups of 150 units. Then I can multiply them by their corresponding coefficient in the for loop.
Finally, if the number of units is lower than 600 but not a multiple of 150, the code needs to take care of the rest. So it removes the groups of 150 : (units-int(nb150)*step), then multiplies the rest with the corresponding factor factor_list[int(nb150)].
If you need further explanation, feel free to ask !
Talking about optimization, there is not much you can do to optimize the code...
You can, however, improve it by using if-elif-else conditional statements instead of only if:
if(1<=units and 150>=units):
...
elif(151<=units and 300>=units):
...
:
:
else:
...
When you are doing so, you are making sure that the condition checking doesn't happen after the right condition is reached. Thereby, reducing the number of comparisons done and optimizing the program.
Special reason why you need it shortened? Anyway you might start by refactoring repeating code blocks into methods. For example
firstSum = 150 * 2.50
subUnits1 = units - 150
firstSum += 150 * 3.60
subUnits = subUnits1 - 150
Happens three times.
Also is there a special reasons for all the if statements instead of elif? Not that it would make the code shorter.
If you don't mind the unreadability here is your one-liner:
# Marginal-rate form: every unit costs 2.5, and each max(0, units - t) term adds
# the rate increase that applies above threshold t (1.1, 1.15, 0.25 and 1).
print(units * 2.5 + max(0, units - 150) * 1.1 + max(0, units - 300) * 1.15 + max(0, units - 450) * 0.25 + max(0, units - 600))
Also your example code is buggy on line 23 (firstSum += subUnits1 * 4.75), it should multiply with another 150 there.
Given your code (regardless your description); you can shorten your code by doing the math, e.g.:
def bill_generator(units):
    """Print the bill figures for a consumption of ``units``.

    Always prints ``FirstSum:-<value>``; when ``units`` exceeds 150 the
    second figure is printed on the following line.  Nothing is returned.

    Args:
        units: number of units consumed.
    """
    # Charge for (at most) the first 150 units.
    firstSum = min(units, 150) * 2.5
    if units <= 300:
        secondSum = firstSum + units * 3.60 - 540.0
    elif units <= 450:
        firstSum += 540
        secondSum = firstSum + units * 4.75 - 1425.0
    elif units <= 600:
        # Accumulate instead of assigning: a plain "=" here dropped the
        # 375.0 already charged for the first 150 units.
        firstSum += 540 + units * 4.75 - 1425.0
        secondSum = firstSum + units * 5.0 - 2250.0
    else:
        firstSum = 150.0 * 11.1 + units * 4.75 - 1425.0
        secondSum = firstSum + units * 6.0 - 3600.0
    print("FirstSum:-{}".format(firstSum))
    if units > 150:
        print(secondSum)
if __name__ == '__main__':
    # Keep asking until a positive unit count is entered.
    inp_units = int(input("Enter the units: "))
    while inp_units < 1:
        print("invalid input, units must be greater than zero")
        inp_units = int(input("Enter the units"))
    # The validated reading used to be discarded; actually produce the bill.
    bill_generator(inp_units)
Tested the border cases:
def bill_generator(units):
    """Print the bill figures for ``units`` consumed units.

    The first line printed is ``FirstSum:-<value>``.  A second figure is
    printed on its own line only when more than 150 units were consumed.
    Nothing is returned.
    """
    # Charge for (at most) the first 150 units.
    running = min(units, 150) * 2.5
    if units <= 300:
        total = running + units * 3.60 - 540.0
    elif units <= 450:
        running += 540.0
        total = running + units * 4.75 - 1425.0
    elif units <= 600:
        running += 540.0 + units * 4.75 - 1425.0
        total = running + units * 5.0 - 2250.0
    else:
        running = 1665.0 + units * 4.75 - 1425.0
        total = running + units * 6.0 - 3600.0

    print("FirstSum:-{}".format(running))
    # Up to 150 units there is only the first figure to show.
    if units > 150:
        print(total)
if __name__ == '__main__':
    # Exercise both sides of every slab boundary plus one large value.
    border_cases = (1, 150, 151, 300, 301, 450, 451, 600, 601, 1200)
    for units in border_cases:
        print('Testing for unit input "{}"'.format(units))
        bill_generator(units)
'''
Testing for unit input "1"
FirstSum:-2.5
Testing for unit input "150"
FirstSum:-375.0
Testing for unit input "151"
FirstSum:-375.0
378.6
Testing for unit input "300"
FirstSum:-375.0
915.0
Testing for unit input "301"
FirstSum:-915.0
919.75
Testing for unit input "450"
FirstSum:-915.0
1627.5
Testing for unit input "451"
FirstSum:-1632.25
1637.25
Testing for unit input "600"
FirstSum:-2340.0
3090.0
Testing for unit input "601"
FirstSum:-3094.75
3100.75
Testing for unit input "1200"
FirstSum:-5940.0
9540.0
'''
You are adding your values to variables again and again on new lines rather than combining them in a single expression. Your shortened code can be:
# Read the consumption and print the bill for the slab it falls in.
units = int(input("Enter the units: "))

# Sub-totals shared by several slabs (same evaluation order as the
# fully spelled-out expressions).
first_slab = 150 * 2.50
first_two_slabs = first_slab + 150 * 3.60
over_300 = units - 300

if 1 <= units <= 150:
    print("First sum:-", units * 2.50)
elif units <= 300:
    print(first_slab + (units - 150) * 3.60)
elif units <= 450:
    print(first_two_slabs + over_300 * 4.75)
elif units <= 600:
    print((first_two_slabs + over_300 * 4.75) + (over_300 - 150) * 5)
else:
    print(((first_two_slabs + over_300 * 4.75) + 150 * 5)
          + ((over_300 - 150) - 150) * 6)
I am trying to generate a matrix of numbers with 7 rows and 4 columns. Each row must sum to 100 and each column must have an even spread (if permitted) between a min and max range (specified below).
Goal:
C1 C2 C3 C4 sum range
1 low 100 ^
2 .. |
3 .. |
4 .. |
5 .. |
6 .. |
7 high _
c1_high = 98
c1_low = 75
c2_high = 15
c2_low = 6
c3_high = 8
c3_low = 2
c4_low = 0.05
c4_high =0.5
In addition to this, i need the spread of each row to be as linear as possible, though a line fitted to the data with a second order polynomial would suffice (with an r^2 value of >0.98).
I am currently trying to do this using the following pseudocode:
generate random number between ranges for c1,c2,c3 and c4.
repeat this 7 times
check correlation between each generated c1 value and a range of numbers from 1-7. For example:
repeat step 3 for c2,c3 and c4.
Break loop when step 3 and 4 are successful
This has proven to be too burdensome in terms of the number of iterations required and as a result, the solution is never reached.
Is there a more efficient way of achieving this solution?
So far:
import pandas as pd
import numpy as np
from sklearn.utils import shuffle
# Allowed range (low/high) of each of the four quantities c1..c4.
c1_high = 98
c1_low = 75
c2_high = 15
c2_low = 6
c3_high = 8
c3_low = 2
c4_low = 0.05
c4_high =0.5


def matrix_gen(): #generates matrix within min and max values
    """Build the starting table of evenly spaced values plus a totals row.

    Seven equally spaced values are generated for each of c1..c4 between
    the module-level low/high bounds.  The frame has one column per step
    (``row0`` .. ``row6``).  The c1 values are stored in reverse order under
    label 4 and their original row (label 0) is dropped; the last row,
    labelled ``'sum'``, holds the per-column totals.

    Returns:
        pandas.DataFrame indexed ``[1, 2, 3, 4, 'sum']``.
    """
    c1 = np.linspace(c1_low, c1_high, 7)
    c2 = np.linspace(c2_low, c2_high, 7)
    c3 = np.linspace(c3_low, c3_high, 7)
    c4 = np.linspace(c4_low, c4_high, 7)
    series = (c1, c2, c3, c4)
    # One frame column per step, holding that step's value of c1..c4.
    d = {"row{0}".format(i): [item[i] for item in series] for i in range(7)}
    df = pd.DataFrame(d)
    # Append the c1 values reversed as a new row, then drop the original.
    df.loc[4, :] = df.iloc[0, :][::-1].values
    df1 = df.drop(0)
    # Append the column totals and relabel that last row as 'sum'.
    df1.loc[5, :] = df1.sum(axis=0)
    new_name = df1.index[-1]
    df1 = df1.rename(index={new_name: 'sum'})
    return df1


m = matrix_gen()
print(m)
out:
row0 row1 row2 row3 row4 row5 row6
1 6.00 7.500000 9.000000 10.500 12.000000 13.500000 15.0
2 2.00 3.000000 4.000000 5.000 6.000000 7.000000 8.0
3 0.05 0.125000 0.200000 0.275 0.350000 0.425000 0.5
4 98.00 94.166667 90.333333 86.500 82.666667 78.833333 75.0
sum 106.05 104.791667 103.533333 102.275 101.016667 99.758333 98.5
next function:
def shuf(): # attempts at shuffling the values around such that the 'sum' row is as close to 100 as possible.
    """Shuffle part of the ``matrix_gen()`` table until a total lands in [98, 100].

    Returns:
        The shuffled frame (including the totals stored under label 5) as
        soon as one of those totals is between 98 and 100 inclusive.
    """
    df = matrix_gen()
    # NOTE(review): presumably meant to pick the value rows and leave out
    # the 'sum' row -- verify which rows this slice actually selects.
    df1 = df[1:4]
    # Number of totals inspected so far that fell outside [98, 100].
    count =0
    while True:
        df1 = shuffle(df1)
        # Store the per-column totals of the current frame under label 5.
        # NOTE(review): that row is still part of df1 on the next pass, so it
        # looks like it gets shuffled and summed again -- confirm intent.
        df1.loc[5,:] = df1.sum(axis=0)
        for i in df1.loc[5].values:
            if 98<= i <=100:
                print('solution')
                return df1
            else:
                count+=1
                print(count)
                continue
opt = shuf()
print(opt)
next function will need to apply a deviation to each number to provide a sum of each row equal to 100. Optimization should include minimizing deviations.
I think an interesting approach would be to use an optimization model.
Ordered values
Let x(i,j) be the matrix you want to fill. Then we have:
sum(j, x(i,j)) = 100 ∀i
L(j) ≤ x(i,j) ≤ U(j) ∀i,j
x(i,j) = x(i-1,j) + step(j) + deviation(i,j)
special cases:
x(1,j) = L(j) + deviation(1,j)
and x(m,j) = U(j) + deviation(m,j)
step(j) ≥ 0
minimize sum((i,j), deviation(i,j)^2 )
This is a quadratic programming problem. It is possible to use absolute deviations instead of squared ones. In that case you have an LP.
The model can be refined to minimize squared relative errors.
This is a little bit related to what is called matrix balancing (a statistical technique often used in economic modeling).
Unordered values
In the above I assumed the values had to be ordered. Now I understand this is not the case. I adapted the model to handle this as follows. First an overview of the results.
The input data is:
---- 17 PARAMETER LO
c1 80.000, c2 5.000, c3 0.500, c4 0.050
---- 17 PARAMETER UP
c1 94.000, c2 14.000, c3 5.000, c4 0.500
Warning: Note that this data has been changed by the poster. My answer is using the original LO and UP values before they were changed.
The model operates in three steps:
(1) populate a perfectly organized matrix without obeying the row sum constraints. This can be done outside the model. I generated simply:
---- 53 PARAMETER init initial matrix
c1 c2 c3 c4 rowsum
r1 80.000 5.000 0.500 0.050 85.550
r2 82.333 6.500 1.250 0.125 90.208
r3 84.667 8.000 2.000 0.200 94.867
r4 87.000 9.500 2.750 0.275 99.525
r5 89.333 11.000 3.500 0.350 104.183
r6 91.667 12.500 4.250 0.425 108.842
r7 94.000 14.000 5.000 0.500 113.500
I.e. from lo(j) to up(j) with equal steps.
(2) The second step is to permute the values within a column to achieve a solution that has a close match to the row sums. This gives:
---- 53 VARIABLE y.L after permutation
c1 c2 c3 c4 rowsum
r1 94.000 5.000 0.500 0.125 99.625
r2 82.333 12.500 4.250 0.500 99.583
r3 89.333 8.000 2.000 0.200 99.533
r4 87.000 9.500 2.750 0.275 99.525
r5 84.667 11.000 3.500 0.350 99.517
r6 91.667 6.500 1.250 0.050 99.467
r7 80.000 14.000 5.000 0.425 99.425
This is already very close and maintains "perfect" spread.
(3) Change the values a little bit by adding a deviation such that the row sums are exactly 100. Minimize the sum of the squared relative deviations. This gives:
---- 53 VARIABLE x.L final values
c1 c2 c3 c4 rowsum
r1 94.374 5.001 0.500 0.125 100.000
r2 82.747 12.503 4.250 0.500 100.000
r3 89.796 8.004 2.000 0.200 100.000
r4 87.469 9.506 2.750 0.275 100.000
r5 85.142 11.007 3.501 0.350 100.000
r6 92.189 6.510 1.251 0.050 100.000
r7 80.561 14.012 5.002 0.425 100.000
---- 53 VARIABLE d.L deviations
c1 c2 c3 c4
r1 0.374 0.001 1.459087E-5 1.459087E-7
r2 0.414 0.003 9.542419E-5 9.542419E-7
r3 0.462 0.004 2.579521E-4 2.579521E-6
r4 0.469 0.006 4.685327E-4 4.685327E-6
r5 0.475 0.007 7.297223E-4 7.297223E-6
r6 0.522 0.010 0.001 1.123123E-5
r7 0.561 0.012 0.002 1.587126E-5
Steps (2) and (3) have to be inside the optimization model: they have to be executed simultaneously to achieve proven optimal solutions.
The mathematical model can look like:
The model solves within a few seconds to proven global optimality using a solver like Cplex or Gurobi.
I think this is a pretty cute model (ok, that is really nerdy, I know). The permutation is modeled with a permutation matrix P (binary values). This makes the model a MIQP (Mixed Integer Quadratic Programming) model. It can be linearized fairly easily: use absolute values instead of squares in the objective. After proper reformulation, we end up with a linear MIP model. There is lots of software available to handle this. This includes libraries and packages callable from Python.
Note: I probably should not divide by init(i,j) in the objective, but rather by the column means in the init matrix. Dividing by y(i,j) would be the best, but that leads to another non-linearity.
Your numbers are small enough for a smart brute force approach.
I use two methods to quantify and minimize deviations from the "clean" equidistant values (linspace(low, high, 7)). "abserr" for squared difference and "relerr" for squared error divided by squared clean value. I also check corrcoefs in the very end but I've never seen anything below 99.8%
The following code first finds the shuffle of the clean values with the smallest error. This takes just a few seconds, because we use the following tricks:
split the 4 columns into two pairs
each pair has 7! relative arrangements, a manageable number even when squared (one factor for each pair)
compute these (7!)^2 shuffles and sum over pairs
to avoid iterating over all relative shuffles between the pairs, we observe that the total error is minimized if the two sets of pair sums are arranged in opposite order; this is true for both "abserr" and "relerr"
In the end the values are corrected to make rows sum to 100. Here again we use the fact that the summed error is minimized when evenly spread.
The code below contains two variants: a legacy one, solve, which contains a small inaccuracy when minimizing relerr, and a corrected version, improved_solve. They frequently find different solutions, but in more than 100 random problems only one led to a very slightly smaller error with improved_solve.
Answers to a few examples:
OP's example:
((75, 98), (6, 15), (2, 8), (0.05, 0.5))
solve relerr improved_solve relerr
table: table:
76.14213 15.22843 8.12183 0.50761 76.14213 15.22843 8.12183 0.50761
79.02431 13.53270 7.01696 0.42603 79.02431 13.53270 7.01696 0.42603
81.83468 11.87923 5.93961 0.34648 81.83468 11.87923 5.93961 0.34648
84.57590 10.26644 4.88878 0.26888 84.57590 10.26644 4.88878 0.26888
87.25048 8.69285 3.86349 0.19317 87.25048 8.69285 3.86349 0.19317
89.86083 7.15706 2.86282 0.11928 89.86083 7.15706 2.86282 0.11928
92.40924 5.65771 1.88590 0.04715 92.40924 5.65771 1.88590 0.04715
avgerr: avgerr:
0.03239 0.03239
corrcoefs: corrcoefs:
0.99977 0.99977 0.99977 0.99977 0.99977 0.99977 0.99977 0.99977
An example where sorting some columns ascending and some descending is not optimal:
((11, 41), (4, 34), (37, 49), (0.01, 23.99))
Note that the solvers find different solutions, but the error is the same.
solve relerr improved_solve relerr
table: table:
10.89217 18.81374 46.53926 23.75483 11.00037 24.00080 49.00163 15.99720
26.00087 9.00030 49.00163 15.99720 16.00107 19.00127 45.00300 19.99467
31.00207 4.00027 45.00300 19.99467 25.74512 13.86276 36.63729 23.75483
16.00000 29.00000 43.00000 12.00000 35.99880 8.99970 46.99843 8.00307
20.99860 33.99773 40.99727 4.00640 41.00000 4.00000 43.00000 12.00000
40.99863 13.99953 36.99877 8.00307 20.99860 33.99773 40.99727 4.00640
36.35996 24.23998 39.38996 0.01010 31.30997 29.28997 39.38996 0.01010
avgerr: avgerr:
0.00529 0.00529
corrcoefs: corrcoefs:
0.99993 0.99994 0.99876 0.99997 0.99989 0.99994 0.99877 0.99997
This is the problem where improved_solve actually beats legacy solve:
((36.787862883725872, 43.967159949544317),
(40.522239654303483, 47.625869880574164),
(19.760537036548321, 49.183056694462799),
(45.701873101046154, 48.051424087501672))
solve relerr improved_solve relerr
table: table:
21.36407 23.53276 28.56241 26.54076 20.25226 26.21874 27.07599 26.45301
22.33545 24.52391 26.03695 27.10370 21.53733 26.33278 25.10656 27.02333
23.33149 25.54022 23.44736 27.68093 22.90176 26.45386 23.01550 27.62888
24.35314 26.58266 20.79119 28.27301 24.35314 26.58266 20.79119 28.27301
25.40141 27.65226 18.06583 28.88050 25.90005 26.71994 18.42047 28.95953
26.47734 28.75009 15.26854 29.50403 27.55225 26.86656 15.88840 29.69279
27.58205 29.87728 12.39644 30.14424 29.32086 27.02351 13.17793 30.47771
avgerr: avgerr:
0.39677 0.39630
corrcoefs: corrcoefs:
0.99975 0.99975 0.99975 0.99975 0.99847 0.99847 0.99847 0.99847
Code:
import numpy as np
import itertools
import math
# Number of slices the relerr error table is filled in, to bound the size
# of the temporary deviation array.
N_CHUNKS = 3


def improved_solve(LH, errtype='relerr', n_rows=7):
    """Arrange four equidistant columns so that every row sums to 100.

    Each ``(low, high)`` pair in ``LH`` is expanded to ``n_rows`` equally
    spaced values.  The columns are split into two pairs; every relative
    permutation inside each pair is enumerated, the pair sums of the left
    pair are sorted ascending and those of the right pair descending, and
    the combination with the smallest total error is kept.  Finally the rows
    are corrected so that each sums to 100.

    Args:
        LH: four ``(low, high)`` pairs, anything convertible to a 4x2 array.
        errtype: ``'relerr'`` (rows are rescaled, error is relative) or
            ``'abserr'`` (rows are shifted, error is absolute).
        n_rows: number of rows to build.  The work tables have
            ``factorial(n_rows) ** 2`` entries, so keep this small.

    Returns:
        dict with ``table`` (``n_rows`` x 4 array), ``avgerr`` (root mean
        squared correction) and ``corrcoefs`` (per column, correlation of
        the sorted values with ``range(n_rows)``).

    Raises:
        ValueError: if ``errtype`` is not one of the two supported values.
    """
    # Fail fast: reject a bad errtype before building factorial-sized tables.
    if errtype not in ('relerr', 'abserr'):
        raise ValueError(
            f"errtype must be 'relerr' or 'abserr', got {errtype!r}")
    N = math.factorial(n_rows)
    # accept anything that looks like a 2d array
    LH = np.asanyarray(LH)
    # build equidistant columns
    C = np.array([np.linspace(l, h, n_rows) for l, h in LH])
    # subtract offset (100 / 4 columns); it's cheaper now than later
    c0, c1, c2, c3 = C - 25
    # list all permutations of a single column
    p = np.array(list(itertools.permutations(range(n_rows))))
    # split into left and right halves, compute all relative permutations
    # and sort them by their sums of corresponding elements.
    # Left pairs in ascending, right pairs in descending order.
    L = np.sort(c0 + c1[p], axis=1)
    R = np.sort(c2 + c3[p], axis=1)[:, ::-1]
    # For each pair of permutations l in L, r in R compute the smallest
    # possible error (sum of squared deviations.)
    if errtype == 'relerr':
        err = np.empty((N, N))
        split = np.linspace(0, N, N_CHUNKS+1, dtype=int)[1:-1]
        # ECH is a view into err, so the chunk sums are written in place.
        for LCH, ECH in zip(np.split(L, split, axis=0),
                            np.split(err, split, axis=0)):
            dev = LCH[:, None] + R[None, :]
            ((dev / (100+dev))**2).sum(axis=-1, out=ECH)
            del dev
    else:
        # |l + r|^2 expanded as |l|^2 + |r|^2 + 2 l.r
        err = (np.add.outer(np.einsum('ij,ij->i', L, L),
                            np.einsum('ij,ij->i', R, R))
               + np.einsum('ik, jk->ij', 2*L, R))
    # find pair of pairs with smallest error
    i = np.argmin(err.ravel())
    i1, i3 = np.unravel_index(i, (N, N))
    # recreate shuffled table
    c0, c1, c2, c3 = C
    lidx = np.argsort(c0 + c1[p[i1]])
    ridx = np.argsort(c2 + c3[p[i3]])[::-1]
    C = np.array([c0[lidx], c1[p[i1]][lidx], c2[ridx], c3[p[i3]][ridx]])
    # correct rowsums, calculate error and corrcoef and return
    if errtype == 'relerr':
        result = C * (100.0 / C.sum(axis=0, keepdims=True))
        err = math.sqrt((((result-C)/C)**2).mean())
    else:
        result = C + (25 - C.mean(axis=0, keepdims=True))
        err = math.sqrt(((result-C)**2).mean())
    rs = np.sort(result, axis=1)
    cc = tuple(np.corrcoef(ri, range(n_rows))[0, 1] for ri in rs)
    return dict(table=result.T, avgerr=err, corrcoefs=cc)
def solve(LH, errtype='relerr', n_rows=7):
    """Legacy solver: arrange four equidistant columns so rows sum to 100.

    The ``(low, high)`` ranges are first rescaled (``'relerr'``) or shifted
    (``'abserr'``) as a whole, then centred equidistant columns are paired
    and permuted; the combination whose pair sums cancel best (least sum of
    squares) is kept and each row is finally corrected to sum to 100.

    Args:
        LH: four ``(low, high)`` pairs, anything convertible to a 4x2 array.
        errtype: ``'relerr'`` or ``'abserr'``.
        n_rows: number of rows to build.  The work table has
            ``factorial(n_rows) ** 2`` entries, so keep this small.

    Returns:
        dict with ``table`` (``n_rows`` x 4 array), ``avgerr`` (root mean
        squared deviation of the sorted columns from the plain equidistant
        columns) and ``corrcoefs`` (per column, correlation of the sorted
        values with ``range(n_rows)``).

    Raises:
        ValueError: if ``errtype`` is not one of the two supported values.
    """
    LH = np.asanyarray(LH)
    if errtype == 'relerr':
        # scale factor that brings the average row sum to 100
        err1 = 200 / LH.sum()
        diff = np.diff(LH * err1, axis=1).ravel()
    elif errtype == 'abserr':
        # shift that brings the average entry to 25 (= 100 / 4 columns)
        err1 = 25 - LH.mean()
        diff = np.diff(LH, axis=1).ravel()
    else:
        raise ValueError(
            f"errtype must be 'relerr' or 'abserr', got {errtype!r}")
    N = math.factorial(n_rows)
    # centred equidistant columns: only the spread matters for the pairing
    C = np.array([np.linspace(-d/2, d/2, n_rows) for d in diff])
    c0, c1, c2, c3 = C
    p = np.array(list(itertools.permutations(range(n_rows))))
    # left pair sums ascending, right pair sums descending
    L = np.sort(c0 + c1[p], axis=1)
    R = np.sort(c2 + c3[p], axis=1)[:, ::-1]
    # |l + r|^2 expanded as |l|^2 + |r|^2 + 2 l.r, for every (l, r)
    err = (np.add.outer(np.einsum('ij,ij->i', L, L),
                        np.einsum('ij,ij->i', R, R))
           + np.einsum('ik, jk->ij', 2*L, R)).ravel()
    i1, i3 = np.unravel_index(np.argmin(err), (N, N))
    # row orders that reproduce the winning sorted pair sums
    lidx = np.argsort(c0 + c1[p[i1]])
    ridx = np.argsort(c2 + c3[p[i3]])[::-1]
    ref = [np.linspace(l, h, n_rows) for l, h in LH]
    if errtype == 'relerr':
        c0, c1, c2, c3 = [np.linspace(l, h, n_rows) for l, h in LH * err1]
        C = np.array([c0[lidx], c1[p[i1]][lidx], c2[ridx], c3[p[i3]][ridx]])
        # rescale every row of the final table to sum to 100
        C *= 100 / np.sum(C, axis=0)
        cs = list(map(sorted, C))
        err = math.sqrt(sum((c/r-1)**2 for ci, ri in zip(cs, ref)
                            for c, r in zip(ci, ri)) / C.size)
    else:
        c0, c1, c2, c3 = [np.linspace(l, h, n_rows) for l, h in LH + err1]
        C = np.array([c0[lidx], c1[p[i1]][lidx], c2[ridx], c3[p[i3]][ridx]])
        # shift every row of the final table to sum to 100
        C += 25 - np.mean(C, axis=0)
        cs = list(map(sorted, C))
        err = math.sqrt(sum((c-r)**2 for ci, ri in zip(cs, ref)
                            for c, r in zip(ci, ri)) / C.size)
    cc = tuple(np.corrcoef(ci, range(n_rows))[0, 1] for ci in cs)
    return dict(table=C.T, avgerr=err, corrcoefs=cc)
# Run both solvers on a few reference problems and print their reports
# side by side (legacy solver on the left, improved one on the right).
for problem in [((75, 98), (6, 15), (2, 8), (0.05, 0.5)),
                ((11, 41), (4, 34), (37, 49), (0.01, 23.99)),
                ((80, 94), (5, 14), (0.5, 5), (0.05, 0.5)),
                ((36.787862883725872, 43.967159949544317),
                 (40.522239654303483, 47.625869880574164),
                 (19.760537036548321, 49.183056694462799),
                 (45.701873101046154, 48.051424087501672))]:
    for errtype in ('relerr', 'abserr'):
        print()
        columns = []
        for solver in (solve, improved_solve):
            sol = solver(problem, errtype)
            # Header line, then for every result entry its name followed
            # by one formatted line per row of its value.
            column = [' '.join((solver.__name__, errtype))]
            for k, v in sol.items():
                column.append(k + ':')
                for r in np.atleast_2d(v):
                    column.append(' '.join([f'{e:8.5f}' for e in r]))
            columns.append(column)
        for l, r in zip(*columns):
            print(f"{l:39s} {r:39s}")
# Randomised comparison of the two solvers.  As written, range(0) yields no
# iterations, so nothing below runs until the count is raised.
problems = []
for i in range(0):
    # four random (low, high) pairs, each sorted so that low <= high
    problem = np.sort(np.random.random((4, 2)), axis=1) * 50
    for errtype in ('relerr', 'abserr'):
        sol0 = solve(problem, errtype)
        sol1 = improved_solve(problem, errtype)
        # report the index of every problem where the two tables differ
        if not np.allclose(sol0['table'], sol1['table']):
            print(i, end= " ")
            # keep and display the problem when the average errors differ
            # by more than a relative 1e-6
            if np.abs((sol0['avgerr']-sol1['avgerr'])
                      /(sol0['avgerr']+sol1['avgerr']))>1e-6:
                print(problem)
                problems.append(problem)
                columns = []
                for sol, name in [(sol0, 'old '), (sol1, 'improved ')]:
                    # header line, then each result entry: its name followed
                    # by one formatted line per row of its value
                    column = [[name + errtype]] + \
                             [[k + ':'] + [' '.join([f'{e:8.5f}' for e in r])
                                           for r in np.atleast_2d(v)]
                              for k, v in sol.items()]
                    # flatten the per-entry blocks into a stream of lines
                    column = (line for block in column for line in block)
                    columns.append(column)
                # print the two reports side by side
                for l, r in zip(*columns):
                    print(f"{l:39s} {r:39s}")