I would like to simulate individual changes in growth and mortality for a variable number of days. My dataframe is formatted as follows...
import pandas as pd
data = {'unique_id': ['2', '4', '5', '13'],
'length': ['27.7', '30.2', '25.4', '29.1'],
'no_fish': ['3195', '1894', '8', '2774'],
'days_left': ['253', '253', '254', '256'],
'growth': ['0.3898', '0.3414', '0.4080', '0.3839']
}
df = pd.DataFrame(data)
print(df)
unique_id length no_fish days_left growth
0 2 27.7 3195 253 0.3898
1 4 30.2 1894 253 0.3414
2 5 25.4 8 254 0.4080
3 13 29.1 2774 256 0.3839
Ideally, I would like the initial length (i.e., length) to increase by the daily growth rate (i.e., growth) for each of the days remaining in the year (i.e., days_left).
df['final'] = df['length'] + (df['days_left'] * df['growth']
However, I would also like to update the number of fish that each individual represents (i.e., no_fish) on a daily basis using a size-specific equation. I'm fairly new to python so I initially thought to use a for-loop (I'm not sure if there is another, more efficient way). My code is as follows:
# keep track of run time - START
start_time = time.perf_counter()
df['z'] = 0.0
for indx in range(len(df)):
count = 1
while count <= int(df.days_to_forecast[indx]):
# (1) update individual length
df.lgth[indx] = df.lgth[indx] + df.linearGR[indx]
# (2) estimate daily size-specific mortality
if df.lgth[indx] > 50.0:
df.z[indx] = 0.01
else:
if df.lgth[indx] <= 50.0:
df.z[indx] = 0.052857-((0.03/35)*df.lgth[indx])
elif df.lgth[indx] < 15.0:
df.z[indx] = 0.728*math.exp(-0.1892*df.lgth[indx])
df['no_fish'].round(decimals = 0)
if df.no_fish[indx] < 1.0:
df.no_fish[indx] = 0.0
elif df.no_fish[indx] >= 1.0:
df.no_fish[indx] = df.no_fish[indx]*math.exp(-(df.z[indx]))
# (3) reduce no. of days left in forecast by 1
count = count + 1
# keep track of run time - END
total_elapsed_time = round(time.perf_counter() - start_time, 2)
print("Forecast iteration completed in {} seconds".format(total_elapsed_time))
The above code now works correctly, but it is still far to inefficient to run for 40,000 individuals each for 200+ days.
I would really appreciate any advice on how to modify the following code to make it pythonic.
Thanks
Another option that was suggested to me is to use the pd.dataframe.apply function. This dramatically reduced the overall the run time and could be useful to someone else in the future.
### === RUN SIMULATION === ###
start_time = time.perf_counter() # keep track of run time -- START
#-------------------------------------------------------------------------#
def function_to_apply( df ):
df['z_instantMort'] = ''
for indx in range(int(df['days_left'])):
# (1) update individual length
df['length'] = df['length'] + df['growth']
# (2) estimate daily size-specific mortality
if df['length'] > 50.0:
df['z_instantMort'] = 0.01
else:
if df['length'] <= 50.0:
df['z_instantMort'] = 0.052857-((0.03/35)*df['length'])
elif df['length'] < 15.0:
df['z_instantMort'] = 0.728*np.exp(-0.1892*df['length'])
whole_fish = round(df['no_fish'], 0)
if whole_fish < 1.0:
df['no_fish'] = 0.0
elif whole_fish >= 1.0:
df['no_fish'] = df['no_fish']*np.exp(-(df['z_instantMort']))
return df
#-------------------------------------------------------------------------#
sim_results = df.apply(function_to_apply, axis=1)
total_elapsed_time = round(time.perf_counter() - start_time, 2) # END
print("Forecast iteration completed in {} seconds".format(total_elapsed_time))
print(sim_results)
### ====================== ###
output being...
Forecast iteration completed in 0.05 seconds
unique_id length no_fish days_left growth z_instantMort
0 2.0 126.3194 148.729190 253.0 0.3898 0.01
1 4.0 116.5742 93.018465 253.0 0.3414 0.01
2 5.0 129.0320 0.000000 254.0 0.4080 0.01
3 13.0 127.3784 132.864757 256.0 0.3839 0.01
As I said in my comment, a preferable alternative to for loops in this setting is using vector operations. For instance, running your code:
import pandas as pd
import time
import math
import numpy as np
data = {'unique_id': [2, 4, 5, 13],
'length': [27.7, 30.2, 25.4, 29.1],
'no_fish': [3195, 1894, 8, 2774],
'days_left': [253, 253, 254, 256],
'growth': [0.3898, 0.3414, 0.4080, 0.3839]
}
df = pd.DataFrame(data)
print(df)
# keep track of run time - START
start_time = time.perf_counter()
df['z'] = 0.0
for indx in range(len(df)):
count = 1
while count <= int(df.days_left[indx]):
# (1) update individual length
df.length[indx] = df.length[indx] + df.growth[indx]
# (2) estimate daily size-specific mortality
if df.length[indx] > 50.0:
df.z[indx] = 0.01
else:
if df.length[indx] <= 50.0:
df.z[indx] = 0.052857-((0.03/35)*df.length[indx])
elif df.length[indx] < 15.0:
df.z[indx] = 0.728*math.exp(-0.1892*df.length[indx])
df['no_fish'].round(decimals = 0)
if df.no_fish[indx] < 1.0:
df.no_fish[indx] = 0.0
elif df.no_fish[indx] >= 1.0:
df.no_fish[indx] = df.no_fish[indx]*math.exp(-(df.z[indx]))
# (3) reduce no. of days left in forecast by 1
count = count + 1
# keep track of run time - END
total_elapsed_time = round(time.perf_counter() - start_time, 2)
print("Forecast iteration completed in {} seconds".format(total_elapsed_time))
print(df)
with output:
unique_id length no_fish days_left growth
0 2 27.7 3195 253 0.3898
1 4 30.2 1894 253 0.3414
2 5 25.4 8 254 0.4080
3 13 29.1 2774 256 0.3839
Forecast iteration completed in 31.75 seconds
unique_id length no_fish days_left growth z
0 2 126.3194 148.729190 253 0.3898 0.01
1 4 116.5742 93.018465 253 0.3414 0.01
2 5 129.0320 0.000000 254 0.4080 0.01
3 13 127.3784 132.864757 256 0.3839 0.01
Now with vector operations, you could do something like:
# keep track of run time - START
start_time = time.perf_counter()
df['z'] = 0.0
for day in range(1, df.days_left.max() + 1):
update = day <= df['days_left']
# (1) update individual length
df[update]['length'] = df[update]['length'] + df[update]['growth']
# (2) estimate daily size-specific mortality
df[update]['z'] = np.where( df[update]['length'] > 50.0, 0.01, 0.052857-( ( 0.03 / 35)*df[update]['length'] ) )
df[update]['z'] = np.where( df[update]['length'] < 15.0, 0.728 * np.exp(-0.1892*df[update]['length'] ), df[update]['z'] )
df[update]['no_fish'].round(decimals = 0)
df[update]['no_fish'] = np.where(df[update]['no_fish'] < 1.0, 0.0, df[update]['no_fish'] * np.exp(-(df[update]['z'])))
# keep track of run time - END
total_elapsed_time = round(time.perf_counter() - start_time, 2)
print("Forecast iteration completed in {} seconds".format(total_elapsed_time))
print(df)
with output
Forecast iteration completed in 1.32 seconds
unique_id length no_fish days_left growth z
0 2 126.3194 148.729190 253 0.3898 0.0
1 4 116.5742 93.018465 253 0.3414 0.0
2 5 129.0320 0.000000 254 0.4080 0.0
3 13 127.3784 132.864757 256 0.3839 0.0
I need to produce 5000 kgs of steel by mixing 7 alloys parts .
I need to reduce the cost, so i need to pick up the best parts.
The result must respect the main steel caracteristics, for example, the carbon level must be between 2% and 3 %, no more, no less .
The Excel linear solver program already exists ,and is originated from a professional book.
I'm trying to translate it to a PULP code, now .
My problem is : How to create the copper, carbone, and manganèse constraints ? There are 2 arrays, so I don't know how to do.
It is all in percents, and I don't know how to do . My result is actually wrong, I left the bad constraints I've done for information . It seems that I need to divide by 5000 at one moment, but how should I do ?
Let me try to explain to you what I can not understand :
I need 5000 kgs of steel to have 0.60 % of copper in it, but my Copper alloy parts contains 90 % and 96% of copper.
Do you see what I mean, and why it is so difficult to describe my constraints ?
"" "
Mining and metals
We make steel with raw materials, we want to reduce the cost of producing this steel
to make more money, but still respecting the minimum characteristics of quality steel
"" "
# Minimize the cost of metal alloys.
# Characteristics of the steel to be made
"" "Element %Minimum %Max %Real ( it is a var)
Carbon 2 3 2.26
Copper 0.4 0.6 0.60
Manganese 1.2 1.65 1.20
"" "
# Characteristics, stocks and purchase price of alloys
"" "
Alloy C% Cu% Mn% Stocks kg Price € / kg
Iron alloy 2.50 0.00 1.30 4000 1.20
Iron alloy 3.00 0.00 0.80 3000 1.50
Iron alloy 0.00 0.30 0.00 6000 0.90
Copper alloy 0.00 90.00 0.00 5000 1.30
Copper alloy 0.00 96.00 4.00 2000 1.45
Aluminum alloy 0.00 0.40 1.20 3000 1.20
Aluminum alloy 0.00 0.60 0.00 2,500 1.00
"" "
# Import the PuLP lib
from pulp import *
# Create the problem variable
prob = LpProblem ("MinimiserLpAlliage", LpMinimize)
# The 7 vars have a zero limit
x1 = LpVariable ("Iron alloy 1", 0)
x2 = LpVariable ("Iron alloy 2", 0)
x3 = LpVariable ("Iron alloy 3", 0)
x4 = LpVariable ("Copper alloy 1", 0)
x5 = LpVariable ("Copper alloy 2", 0)
x6 = LpVariable ("Aluminum alloy 1", 0)
x7 = LpVariable ("Aluminum alloy 2", 0)
# The objective function is to minimize the total cost of the alloys in EUROS for a given quantity in KGS
prob + = 1.20 * x1 + 1.50 * x2 + 0.90 * x3 + 1.30 * x4 + 1.45 * x5 + 1.20 * x6 + 1.00 * x7, "AlliageCost"
# Quantity constraint in KGS.
prob + = x1 + x2 + x3 + x4 + x5 + x6 + x7 == 5000, "RequestedQuantity"
# MIN constraints of% carbon, by alloy // ITS NOT WHAT I NEED
prob + = x1> = 2.5, "MinCarboneRequirement1"
prob + = x2> = 3, "MinCarboneRequirement2"
prob + = x3> = 0, "MinCarboneRequirement3"
prob + = x4> = 0, "MinCarboneRequirement4"
prob + = x5> = 0, "MinCarboneRequirement5"
prob + = x6> = 0, "MinCarboneRequirement6"
prob + = x7> = 0, "MinCarboneRequirement7"
# MIN constraints of% copper, by alloy // ITS WRONG ITS NOT WHAT I NEED
prob + = x1> = 0, "MinCuivreRequirement1"
prob + = x2> = 0, "MinCuivreRequirement2"
prob + = x3> = 0.3, "MinCuivreRequirement3"
prob + = x4> = 90, "MinCuivreRequirement4"
prob + = x5> = 96, "MinCuivreRequirement5"
prob + = x6> = 0.4, "MinCuivreRequirement6"
prob + = x7> = 0.6, "MinCuivreRequirement7"
# MIN constraints of% of Manganese, by alloy // ITS WRONG ITS NOT WHAT I NEED
prob + = x1> = 1.3, "MinManganeseRequirement1"
prob + = x2> = 0.8, "MinManganeseRequirement2"
prob + = x3> = 0, "MinManganeseRequirement3"
prob + = x4> = 0, "MinManganeseRequirement4"
prob + = x5> = 4, "MinManganeseRequirement5"
prob + = x6> = 1.2, "MinManganeseRequirement6"
prob + = x7> = 0, "MinManganeseRequirement7"
# MAX constraints of% of Manganese, by alloy // ITS WRONG ITS NOT WHAT I NEED
prob + = x1 <= 1.3, "MaxManganeseRequirement1"
prob + = x2 <= 0.8, "MaxManganeseRequirement2"
prob + = x3 <= 0, "MaxManganeseRequirement3"
prob + = x4 <= 0, "MaxManganeseRequirement4"
prob + = x5 <= 4, "MaxManganeseRequirement5"
prob + = x6 <= 1.2, "MaxManganeseRequirement6"
prob + = x7 <= 0, "MaxManganeseRequirement7"
# 5. MAX constraints from available stock, by alloy // I THINK IT IS OK
prob + = x1 <= 4000, "MaxStock"
prob + = x2 <= 3000, "MaxStock1"
prob + = x3 <= 6000, "MaxStock2"
prob + = x4 <= 5000, "MaxStock3"
prob + = x5 <= 2000, "MaxStock4"
prob + = x6 <= 3000, "MaxStock5"
prob + = x7 <= 2500, "MaxStock6"
# The problem data is written to an .lp file
prob.writeLP ( "WhiskasModel.lp")
# We use the solver
prob.solve ()
# The status of the solution
print ("Status:", LpStatus [prob.status])
# We magnify and display the optimums of each var
for v in prob.variables ():
print (v.name, "=", v.varValue)
# The result of the objective function is here
print ("Total", value (prob.objective))
This is the answer, but of course, it is wrong, cause I dont know how to do the constraints :
Status: Optimal
Aluminum_alloy_1 = 1.2
Aluminum_alloy_2 = 0.6
Copper_alloy_1 = 90.0
Alloy_of_copper_2 = 96.0
Alloy_of_fer_1 = 2.5
Alloy_of_fer_2 = 3.0
Iron_alloy_3 = 4806.7
Total 4,591.76,999,999,999,995
EDIT Hello !
This is the improved version 2 of my code, sorry, it is in french, but i bet you can see what i mean , it still doesn't work , thought... but closer to what I need :
Mining and metals
In the manufacture of steel with permeable materials, sur wants to reduce the cost of producing this steel
to earn more money but still respecting the important characteristics of quality steel
# Characteristics of the steel to be made
""" Elément % minimal % Max
Carbone 2 3
Cuivre 0.4 0.6
Manganèse 1.2 1.65
"""
# Characteristics, stocks and purchase price of alloys at KILO
"""
Alliage C % Cu % Mn % Stocks kg Prix €/kg
Alliage de fer 1 2,50 0,00 1,30 4000 1,20
Alliage de fer 2 3,00 0,00 0,80 3000 1,50
Alliage de fer 3 0,00 0,30 0,00 6000 0,90
Alliage de cuivre 1 0,00 90,00 0,00 5000 1,30
Alliage de cuivre 2 0,00 96,00 4,00 2000 1,45
Alliage d'alu 1 0,00 0,40 1,20 3000 1,20
Alliage d'alu 2 0,00 0,60 0,00 2500 1,00
"""
# Importer la lib PuLP
from pulp import *
#Créer la variable du problème
prob = LpProblem("MinimiserLpAlliage",LpMinimize)
# The 7 vars have a zero limit, these decision variables are expressed in KILOS
x1 = LpVariable("Alliage de fer 1",0)
x2 = LpVariable("Alliage de fer 2",0)
x3 = LpVariable("Alliage de fer 3",0)
x4 = LpVariable("Alliage de cuivre 1",0)
x5 = LpVariable("Alliage de cuivre 2",0)
x6 = LpVariable("Alliage d'alu 1",0)
x7 = LpVariable("Alliage d'alu 2",0)
# The objective function is to minimize the total cost of the alloys in EUROS
prob += 1.20 * x1 + 1.50 * x2 + 0.90 * x3 + 1.30 * x4 + 1.45 * x5 + 1.20 * x6 + 1.00 * x7, "CoutAlliages"
# Quantity constraint in KGS.
prob += x1 + x2 + x3 + x4 + x5 + x6 + x7 == 5000, "QuantitéDemandée"
# Carbon stress.
prob += (2.50 * x1 + 3.00 * x2 + x3 + x4 + x5 + x6 + x7 ) / 5000 <= 3,"carBmax"
prob += (2.50 * x1 + 3.00 * x2 + x3 + x4 + x5 + x6 + x7 ) / 5000 >= 2,"carBmin"
# Constraint cu .
prob += (x1 + x2 + 0.30 * x3 + 90 * x4 + 96 * x5 + 0.40 * x6 + 0.60 * x7) / 5000 <= 0.6,"cuBmax"
prob += (x1 + x2 + 0.30 * x3 + 90 * x4 + 96 * x5 + 0.40 * x6 + 0.60 * x7) / 5000 >= 0.4,"cuBmin"
# Constraint Manganèse.
prob += (1.30 * x1 + 0.80 * x2 + x3 + x4 + 4 * x5 + 1.20 * x6 + x7 ) / 5000 <= 1.65,"mgBmax"
prob += (1.30 * x1 + 0.80 * x2 + x3 + x4 + 4 * x5 + 1.20 * x6 + x7 ) / 5000 >= 1.2,"mgBmin"
# 5. MAX constraints from available stock, by alloy
prob += x1 <= 4000 , "MaxStock"
prob += x2 <= 3000 , "MaxStock1"
prob += x3 <= 6000 , "MaxStock2"
prob += x4 <= 5000 , "MaxStock3"
prob += x5 <= 2000 , "MaxStock4"
prob += x6 <= 3000 , "MaxStock5"
prob += x7 <= 2500 , "MaxStock6"
# The problem data is written to an .lp file
prob.writeLP("acier.lp")
# On utilise le solveur
prob.solve()
# The status of the solution
print ("Status:", LpStatus[prob.status])
# We magnify and display the optimums of each var
for v in prob.variables():
print (v.name, "=", v.varValue)
# The result of the objective function is here
print ("Total payable in euros", value(prob.objective))
""" Status: Infeasible
Alliage_d'alu_1 = 0.0
Alliage_d'alu_2 = 0.0
Alliage_de_cuivre_1 = 0.0
Alliage_de_cuivre_2 = 0.0
Alliage_de_fer_1 = 0.0
Alliage_de_fer_2 = 0.0
Alliage_de_fer_3 = 10000.0
Total à payer en euros 9000.0 """
The book says the result with the excel solver is :
iron_1 : 4000 kgs
iron_2 : 0 kgs
iron_3 : 397.76kgs
cu_1 : 0 kgs
cu_2 : 27.61kgs
al_1 : 574.62kgs
al_2 : 0kgs
Cost in euros 5887.57
Steel contains 2% carb, 0.6 % cu, 1.2 %
manganese
Excel tab :
Solver pic :
Part of your problem is how you are understanding/applying percentages. My recommendation would be to convert percentages [0-100] to fractional numbers [0-1.0] as early as possible.
In excel when a cell says 50% the numeric value of the cell is actually 0.5. Working with percentages in this way means you don't have to keep dividing out by 100, and can multiply one percentage with another and it all just works.
The code below does what you want:
"""
Mining and metals
We make steel with raw materials, we want to reduce the cost of producing this steel
to make more money, but still respecting the minimum characteristics of quality steel
"""
# Minimize the cost of metal alloys.
# Characteristics of the steel to be made
"""Element %Minimum %Max %Real (it is a var)
Carbon 2 3 2.26
Copper 0.4 0.6 0.60
Manganese 1.2 1.65 1.20
"""
# Characteristics, stocks and purchase price of alloys
"""
Alloy C% Cu% Mn% Stocks kg Price € / kg
Iron alloy 2.50 0.00 1.30 4000 1.20
Iron alloy 3.00 0.00 0.80 3000 1.50
Iron alloy 0.00 0.30 0.00 6000 0.90
Copper alloy 0.00 90.00 0.00 5000 1.30
Copper alloy 0.00 96.00 4.00 2000 1.45
Aluminum alloy 0.00 0.40 1.20 3000 1.20
Aluminum alloy 0.00 0.60 0.00 2500 1.00
"""
# Import the PuLP lib
from pulp import *
# Create the problem variable
prob = LpProblem ("MinimiserLpAlliage", LpMinimize)
# Problem Data
input_mats = ["iron_1", "iron_2", "iron_3",
"cu_1", "cu_2",
"al_1", "al_2"]
input_costs = {"iron_1": 1.20, "iron_2": 1.50, "iron_3": 0.90,
"cu_1": 1.30, "cu_2": 1.45,
"al_1": 1.20, "al_2": 1.00}
# C% Cu% Mn%
input_composition = {"iron_1": [0.025, 0.000, 0.013],
"iron_2": [0.030, 0.000, 0.008],
"iron_3": [0.000, 0.003, 0.000],
"cu_1": [0.000, 0.900, 0.000],
"cu_2": [0.000, 0.960, 0.040],
"al_1": [0.000, 0.004, 0.012],
"al_2": [0.000, 0.006, 0.000]}
input_stock = {"iron_1": 4000, "iron_2": 3000, "iron_3": 6000,
"cu_1": 5000, "cu_2": 2000,
"al_1": 3000, "al_2": 2500}
request_quantity = 5000
Carbon_min = 0.02
Carbon_max = 0.03
Cu_min = 0.004
Cu_max = 0.006
Mn_min = 0.012
Mn_max = 0.0165
# Problem variables - amount in kg of each input
x = LpVariable.dicts("input_mat", input_mats, 0)
# The objective function is to minimize the total cost of the alloys in EUROS for a given quantity in KGS
prob += lpSum([input_costs[i]*x[i] for i in input_mats]), "AlliageCost"
# Quantity constraint in KGS.
prob += lpSum([x[i] for i in input_mats]) == request_quantity, "RequestedQuantity"
# MIN/MAX constraint of carbon in resultant steel
prob += lpSum([x[i]*input_composition[i][0] for i in input_mats]) >= Carbon_min*request_quantity, "MinCarbon"
prob += lpSum([x[i]*input_composition[i][0] for i in input_mats]) <= Carbon_max*request_quantity, "MaxCarbon"
# MIN/MAX constraints of copper in resultant steel
prob += lpSum([x[i]*input_composition[i][1] for i in input_mats]) >= Cu_min*request_quantity, "MinCu"
prob += lpSum([x[i]*input_composition[i][1] for i in input_mats]) <= Cu_max*request_quantity, "MaxCu"
# MIN/MAX constraints of manganese in resultant steel
prob += lpSum([x[i]*input_composition[i][2] for i in input_mats]) >= Mn_min*request_quantity, "MinMn"
prob += lpSum([x[i]*input_composition[i][2] for i in input_mats]) <= Mn_max*request_quantity, "MaxMn"
# MAX constraints of available stock
for i in input_mats:
prob += x[i] <= input_stock[i], ("MaxStock_" + i)
# Solve the problem
prob.solve()
# The status of the solution
print ("Status:", LpStatus [prob.status])
# Dislay the optimums of each var
for v in prob.variables ():
print (v.name, "=", v.varValue)
# Display mat'l compositions
Carbon_value = sum([x[i].varValue*input_composition[i][0] for i in input_mats])/request_quantity
Cu_value = sum([x[i].varValue*input_composition[i][1] for i in input_mats])/request_quantity
Mn_value = sum([x[i].varValue*input_composition[i][2] for i in input_mats])/request_quantity
print ("Carbon content: " + str(Carbon_value))
print ("Copper content: " + str(Cu_value))
print ("Manganese content: " + str(Mn_value))
# The result of the objective function is here
print ("Total", value (prob.objective))
From which I get:
Status: Optimal
input_mat_al_1 = 574.62426
input_mat_al_2 = 0.0
input_mat_cu_1 = 0.0
input_mat_cu_2 = 27.612723
input_mat_iron_1 = 4000.0
input_mat_iron_2 = 0.0
input_mat_iron_3 = 397.76302
Carbon content: 0.02
Copper content: 0.006000000036
Manganese content: 0.012000000008
Total 5887.57427835
Recently I was trying out this problem and my code got 60% of the marks, with the remaining cases returning TLEs.
Bazza and Shazza do not like bugs. They wish to clear out all the bugs
on their garden fence. They come up with a brilliant idea: they buy
some sugar frogs and release them near the fence, letting them eat up
all the bugs.
The plan is a great success and the bug infestation is gone. But
strangely, they now have a sugar frog infestation. Instead of getting
rid of the frogs, Bazza and Shazza decide to set up an obstacle course
and watch the frogs jump along it for their enjoyment.
The fence is a series of \$N\$ fence posts of varying heights. Bazza and
Shazza will select three fence posts to create the obstacle course,
where the middle post is strictly higher than the other two. The frogs
are to jump up from the left post to the middle post, then jump down
from the middle post to the right post. The three posts do not have to
be next to each other as frogs can jump over other fence posts,
regardless of the height of those other posts.
The difficulty of an obstacle course is the height of the first jump
plus the height of the second jump. The height of a jump is equal to
the difference in height between it's two fence posts. Your task is to
help Bazza and Shazza find the most difficult obstacle course for the
frogs to jump.
Input
Your program should read from the file. The file will describe
a single fence.
The first line of input will contain one integer \$N\$: the number of
fence posts. The next \$N\$ lines will each contain one integer \$h_i\$: the
height of the ith fence post. You are guaranteed that there will be at
least one valid obstacle course: that is, there will be at least one
combination of three fence posts where the middle post is strictly
higher than the other two.
Output
Your program should write to the file. Your output file should
contain one line with one integer: the greatest difficulty of any
possible obstacle course.
Constraints
To evaluate your solution, the judges will run your
program against several different input files. All of these files will
adhere to the following bounds:
\$3 \leq N \leq 100,000\$ (the number of fence posts)
\$1 \leq h_i \leq 100,000\$ (the height of each post)
As some of the test cases will be quite large,
you may need to think about how well your solution scales for larger
input values. However, not all the cases will be large. In particular:
For 30% of the marks, \$N \leq 300\$. For an additional 30% of the
marks, \$N \leq 3,000\$. For the remaining 40% of the marks, no special > constraints apply.
Hence, I was wondering if anyone could think of a way to optimize my code (below), or perhaps provide a more elegant, efficient algorithm than the one I am currently using.
Here is my code:
infile = open('frogin.txt', 'r')
outfile = open('frogout.txt', 'w')
N = int(infile.readline())
l = []
for i in range(N):
l.append(int(infile.readline()))
m = 0
#find maximum z-x+z-y such that the middle number z is the largest of x, y, z
for j in range(1, N - 1):
x = min(l[0: j])
y = min(l[j + 1:])
z = l[j]
if x < z and y < z:
n = z - x + z - y
m = n if n > m else m
outfile.write(str(m))
infile.close()
outfile.close()
exit()
If you require additional information regarding my solution or the problem, please do comment below.
Ok, first let's evaluate your program. I created a test file like
from random import randint
n = 100000
max_ = 100000
with open("frogin.txt", "w") as outf:
outf.write(str(n) + "\n")
outf.write("\n".join(str(randint(1, max_)) for _ in range(n)))
then ran your code in IPython like
%load_ext line_profiler
def test():
infile = open('frogin.txt', 'r')
outfile = open('frogout.txt', 'w')
N = int(infile.readline())
l = []
for i in range(N):
l.append(int(infile.readline()))
m = 0
for j in range(1, N - 1):
pre_l = l[0: j] # I split these lines
x = min(pre_l) # for a bit more detail
post_l = l[j + 1:] # on exactly which operations
y = min(post_l) # are taking the most time
z = l[j]
if x < z and y < z:
n = z - x + z - y
m = n if n > m else m
outfile.write(str(m))
infile.close()
outfile.close()
%lprun -f test test() # instrument the `test` function, then run `test()`
which gave
Total time: 197.565 s
File: <ipython-input-37-afa35ce6607a>
Function: test at line 1
Line # Hits Time Per Hit % Time Line Contents
==============================================================
1 def test():
2 1 479 479.0 0.0 infile = open('frogin.txt', 'r')
3 1 984 984.0 0.0 outfile = open('frogout.txt', 'w')
4 1 195 195.0 0.0 N = int(infile.readline())
5 1 2 2.0 0.0 l = []
6 100001 117005 1.2 0.0 for i in range(N):
7 100000 269917 2.7 0.0 l.append(int(infile.readline()))
8 1 2 2.0 0.0 m = 0
9 99999 226984 2.3 0.0 for j in range(1, N - 1):
10 99998 94137525 941.4 12.2 pre_l = l[0: j]
11 99998 300309109 3003.2 38.8 x = min(pre_l)
12 99998 85915575 859.2 11.1 post_l = l[j + 1:]
13 99998 291183808 2911.9 37.7 y = min(post_l)
14 99998 441185 4.4 0.1 z = l[j]
15 99998 212870 2.1 0.0 if x < z and y < z:
16 99978 284920 2.8 0.0 n = z - x + z - y
17 99978 181296 1.8 0.0 m = n if n > m else m
18 1 114 114.0 0.0 outfile.write(str(m))
19 1 170 170.0 0.0 infile.close()
20 1 511 511.0 0.0 outfile.close()
which shows that 23.3% of your time (46 s) is spent repeatedly slicing your array, and 76.5% (151 s) is spent running min() on the slices 200k times.
So - how can we speed this up? Consider
a = min(l[0:50001]) # 50000 comparisons
b = min(l[0:50002]) # 50001 comparisons
c = min(a, l[50001]) # 1 comparison
Here's the magic: b and c are exactly equivalent but b takes something like 10k times longer to run. You have to have a calculated first - but you can repeat the same trick, shifted back by 1, to get a cheaply, and the same for the a's predecessor, and so on.
In one pass from start to end you can keep a running tally of 'minimum value seen previous to this index'. You can then do the same thing from end to start, keeping a running tally of 'minimum value seen after this index'. You can then zip all three arrays together and find the maximum achievable values.
I wrote a quick version,
def test():
ERROR_VAL = 1000000 # too big to be part of any valid solution
# read input file
with open("frogin.txt") as inf:
nums = [int(i) for i in inf.read().split()]
# check contents
n = nums.pop(0)
if len(nums) < n:
raise ValueError("Input file is too short!")
elif len(nums) > n:
raise ValueError("Input file is too long!")
# min_pre[i] == min(nums[:i])
min_pre = [0] * n
min_pre[0] = ERROR_VAL
for i in range(1, n):
min_pre[i] = min(nums[i - 1], min_pre[i - 1])
# min_post[i] == min(nums[i+1:])
min_post = [0] * n
min_post[n - 1] = ERROR_VAL
for i in range(n - 2, -1, -1):
min_post[i] = min(nums[i + 1], min_post[i + 1])
return max((nums[i] - min_pre[i]) + (nums[i] - min_post[i]) for i in range(1, n - 1) if min_pre[i] < nums[i] > min_post[i])
and profiled it,
Total time: 0.300842 s
File: <ipython-input-99-2097216e4420>
Function: test at line 1
Line # Hits Time Per Hit % Time Line Contents
==============================================================
1 def test():
2 1 5 5.0 0.0 ERROR_VAL = 1000000 # too big to be part of any valid solution
3 # read input file
4 1 503 503.0 0.0 with open("frogin.txt") as inf:
5 1 99903 99903.0 8.5 nums = [int(i) for i in inf.read().split()]
6 # check contents
7 1 212 212.0 0.0 n = nums.pop(0)
8 1 7 7.0 0.0 if len(nums) < n:
9 raise ValueError("Input file is too short!")
10 1 2 2.0 0.0 elif len(nums) > n:
11 raise ValueError("Input file is too long!")
12 # min_pre[i] == min(nums[:i])
13 1 994 994.0 0.1 min_pre = [0] * n
14 1 3 3.0 0.0 min_pre[0] = ERROR_VAL
15 100000 162915 1.6 13.8 for i in range(1, n):
16 99999 267593 2.7 22.7 min_pre[i] = min(nums[i - 1], min_pre[i - 1])
17 # min_post[i] == min(nums[i+1:])
18 1 1050 1050.0 0.1 min_post = [0] * n
19 1 3 3.0 0.0 min_post[n - 1] = ERROR_VAL
20 100000 167021 1.7 14.2 for i in range(n - 2, -1, -1):
21 99999 272080 2.7 23.1 min_post[i] = min(nums[i + 1], min_post[i + 1])
22 1 205222 205222.0 17.4 return max((nums[i] - min_pre[i]) + (nums[i] - min_post[i]) for i in range(1, n - 1) if min_pre[i] < nums[i] > min_post[i])
and you can see the run-time for processing 100k values has dropped from 197 s to 0.3 s.