k-means Cluster function won't converge - python

Disclaimer: It is homework.
I try to implement a k mean cluster function. My problem is the cluster don't converge, it jump between two states. And I don't understand why or can't find the bug in the function.
My data comes from a csv and I push it into a pandas dataframe. I use the pearson correlation coefficient as metric. To calculate the coeffiction the function takes in two list
My only idea what the problem could be is, that the pearson correlation coefficient has negative numbers, so technically it is not a metric.
My code:
def distPearson(para1,para2):
n = len(para1)
p1_med = 0.0
p2_med = 0.0
for i in range(n):
p1_med += para1[i]
p2_med += para2[i]
p1_med = p1_med/n
p2_med = p2_med/n
sum_p1_den = 0.0
sum_p2_den = 0.0
sum_p1p2_num = 0.0
for i in range(n):
delta_p1 = para1[i]-p1_med
delta_p2 = para2[i]-p2_med
sum_p1p2_num += delta_p1*delta_p2
sum_p1_den += delta_p1**2
sum_p2_den += delta_p2**2
den = sqrt(sum_p1_den*sum_p2_den)
if (den == 0):
return 0
return (sum_p1p2_num/den)
def kMeanCluster(data,distance=distPearson,k = 5):
#get range for cluster placement
clusterRanges= [(data.iloc[i].min(),data.iloc[i].max())for i in range(len(data))]
# plase k cluster random
kCluster =[]
for i in range(k):
cluster = [rnd.randint(e[0],e[1]) for e in clusterRanges]
kCluster.append(cluster)
# match data to cluster
cacheMatches = []
for t in range(100):
print("Iter: " + str(t))
bestMatches = []
for e in data.items():
bestmatch = (-1,2)
for i in range(k):
dist = distance(data[e[0]],kCluster[i])
if dist < bestmatch[1]:
bestmatch = (i,dist)
bestMatches.append(bestmatch)
if cacheMatches == bestMatches:
return bestMatches
memberList = [[] for i in range(k)]
for i in range(len(bestMatches)):
memberList[bestMatches[i][0]].append(i)
print("1|" +str(len(memberList[0]))
+ " 2|" +str(len(memberList[1]))
+ " 3|" +str(len(memberList[2]))
+ " 4|" +str(len(memberList[3]))
+ " 5|" +str(len(memberList[4])))
kCluster = []
for i in range(k):
cluster = data[data.columns[memberList[i]]].mean(axis=1).tolist()
kCluster.append(cluster)
cacheMatches = bestMatches
I get the output:
Iter: 0
1|28 2|21 3|17 4|20 5|13
Iter: 1
1|10 2|67 3|8 4|4 5|10
Iter: 2
1|2 2|2 3|11 4|59 5|25
Iter: 3
1|33 2|46 3|12 4|4 5|4
Iter: 4
1|5 2|3 3|9 4|55 5|27
Iter: 5
1|43 2|41 3|10 4|3 5|2
Iter: 6
1|5 2|3 3|5 4|47 5|39
Iter: 7
1|51 2|34 3|10 4|3 5|1
Iter: 8
1|5 2|6 3|2 4|46 5|40
Iter: 9
1|50 2|34 3|12 4|2 5|1
Iter: 10
1|5 2|6 3|2 4|46 5|40
Iter: 11
1|50 2|34 3|12 4|2 5|1
and so on

Related

Minimize AbsEquality rather than enforce in OrTools

I'm trying to solve the following using OR tools:
Given the following bags containing different colors of balls:
bag
red
blue
green
black
A
10
5
85
0
B
25
50
25
0
C
0
100
0
0
D
90
5
5
0
E
2
0
98
0
F
0
0
0
100
How many of each type of bag would I need to have an equal number of each color of ball?
For cases like this where there is an exact answer, the following code:
bags= [
[10,5,85,0],
[25,50,25,0],
[0,100,0,0],
[90,5,5,0],
[2,0,98,0],
[0,0,0,100]
]
bags_n = len(bags)
color_n = len(bags[0])
print(f'Bags: {bags_n}')
print(f'Colors: {color_n}')
color_count= [0] * color_n
for c in range(color_n):
for b in bags:
color_count[c]+= b[c]
print(color_count)
print(f'Inital total: {sum(color_count)}')
print(f'Inital equal share: {sum(color_count)//color_n}')
model = cp_model.CpModel()
weights = []
for r in range(bags_n):
weights.append(model.NewIntVar(1,1000,f'Weight of Bag: {r}'))
total = model.NewIntVar(0, 100000, 'total')
model.Add(
sum(flatten(
[[bags[r][c] * weights[r] for r in range(bags_n)] for c in range(color_n)]
)) == total
)
equal = model.NewIntVar(0, 10000, 'equal share')
model.AddDivisionEquality(equal, total, color_n)
for c in range(color_n):
diff_c = model.NewIntVar(0, 1000, 'diff_'+str(c))
model.Add(diff_c == sum([bags[r][c] * weights[r] for r in range(bags_n)]) - equal)
model.AddAbsEquality(0, diff_c)
solver = cp_model.CpSolver()
status = solver.Solve(model)
if status == cp_model.OPTIMAL or status == cp_model.FEASIBLE:
print(f'Maximum of objective function: {solver.ObjectiveValue()}\n')
for v in weights:
print(f'{solver.Value(v)}')
print(f'total = {solver.Value(total)}')
print(f'equal share = {solver.Value(equal)}')
else:
print(status)
gives back valid weights:
82
2
70
78
5
79
If I change the setup to something like
bags= [
[50,40,10],
[30,20,50],
[30,30,40],
[30,25,45],
]
The model becomes infeasible, I assume due to the fact that there are no weights that satisfy the AbsEquality for every color.
How can I change this to get me the solution closest to an even distribution even if a perfect solution is infeasable?
Christopher Hamkins' suggestion worked great:
bags= [
[50,40,10],
[30,20,50],
[30,30,40],
[30,25,45],
]
bags_n = len(bags)
color_n = len(bags[0])
print(f'Bags: {bags_n}')
print(f'Colors: {color_n}')
color_count= [0] * color_n
for c in range(color_n):
for b in bags:
color_count[c]+= b[c]
print(color_count)
print(["{0:.0%}".format(c/sum(color_count)) for c in color_count])
model = cp_model.CpModel()
weights = []
for r in range(bags_n):
weights.append(model.NewIntVar(1,500,f'Weight of Bag: {r}'))
max = model.NewIntVar(0,100000000,f'Max')
model.AddMaxEquality(max,
[sum([bags[r][c] * weights[r] for r in range(bags_n)]) for c in range(color_n)]
)
min = model.NewIntVar(0,100000000,f'Min')
model.AddMinEquality(min,
[sum([bags[r][c] * weights[r] for r in range(bags_n)]) for c in range(color_n)]
)
diff = model.NewIntVar(0,100000000,f'Diff')
model.Add(max - min == diff)
model.Minimize(diff)
solver = cp_model.CpSolver()
status = solver.Solve(model)
if status == cp_model.OPTIMAL or status == cp_model.FEASIBLE:
print(f'max = {solver.Value(max)}')
print(f'min = {solver.Value(min)}')
print(f'diff = {solver.Value(diff)}')
bag_weights = [0] * bags_n
for i,v in enumerate(weights):
bag_weights[i] = solver.Value(v)
print(f'{solver.Value(v)}')
color_count = [0] * color_n
for c in range(color_n):
for i,b in enumerate(bags):
color_count[c]+= (b[c] * bag_weights[i])
print(color_count)
print(["{0:.0%}".format(c/sum(color_count)) for c in color_count])
else:
print(status)

How do I make z score algorithms work? Where am I going wrong?

I have been having some trouble with my code, about how to use the Kaggle database to find the popularity of data, and analyze it using Z-score algorithms. I have tried a lot, but can never seem to get my code to work. Here is the link to the data: https://docs.google.com/spreadsheets/d/1HIAzQta-dSfoovkdPeqKcBxANhaO9VR_TiNFwnUKFcU/edit?usp=sharing
And here is my code
for header in data.columns:
# average
sum = 0
n = 0
crab = 0
for i in data[header]:
sum += ele
n += 1
average = sum / n
# standard Deviation
for j in data[header]:
crab = np.std(crab)
for k in data[header]:
k = (k - average) / crab
print("k = ",k)
break
print(data.shape)
data.info()
data.hist(figsize=(14, 14))
%config InlineBackend.print_figure_kwargs={'facecolor' : "w"}
k is apparently = to - inf? and im really confused
You have written:
crab = 0
# ...
# ...
crab = np.std(crab)
# ...
# ...
k = (k - average) / crab # You are dividing by zero
Hence the infinty in the output

How to structure python programs? Tried making it more structured, now runs 13 times slower

Im very new to programming, I wrote a simple program for a school project and wanted to make the code "prettier" by not just having the program be one giant function but instead be made up of multiple smaller functions with a singe purpose. I seemed to have messed up royally since the program now runs 13 times slower. How should I structured the program to make it run faster and just in general make programs easier to write, read and edit?
Here are the two programs:
First program (for reference values runs in ≈0:20):
import numpy as np
import matplotlib.pyplot as plt
def graf(a,b,H,p):
GM = 39.5216489684
x_0 = a + np.sqrt(a**2 - b**2)
v_0 = np.sqrt(GM*(2/x_0 - 1/a))
konstant_period = np.sqrt(a**3)*H
h = 1/H
'''starting position given by an elliptic orbit '''
stor_x_lista = [x_0]
stor_y_lista = [0]
hastighet_x = [0]
hastighet_y = [v_0]
liten_x_lista = []
liten_y_lista = []
''' a loop that approximates the points of the orbit'''
t = 0
tid_lista = []
n = 0
while n < konstant_period:
hastighet_x.append(hastighet_x[n] - h*GM* stor_x_lista[n]/(np.sqrt(stor_x_lista[n]**2 + stor_y_lista[n]**2))**3)
stor_x_lista.append(stor_x_lista[n] + h*hastighet_x[n])
hastighet_y.append(hastighet_y[n] - h*GM*stor_y_lista[n]/(np.sqrt(stor_x_lista[n]**2 + stor_y_lista[n]**2))**3)
stor_y_lista.append(stor_y_lista[n] + h*hastighet_y[n])
'''smaller list of points to run faster'''
if n % p == 0:
liten_x_lista.append(stor_x_lista[n])
liten_y_lista.append(stor_y_lista[n])
tid_lista.append(t)
n += 1
t += h
''' function that finds the angle'''
vinkel = []
siffra = 0
while siffra < len(liten_x_lista):
if liten_y_lista[siffra ] >= 0:
vinkel.append( np.arccos( liten_x_lista[siffra]/np.sqrt( liten_x_lista[siffra]**2 + liten_y_lista[siffra]**2)))
siffra += 1
elif liten_y_lista[siffra] < 0 :
vinkel.append( np.pi + np.arccos( -liten_x_lista[siffra]/np.sqrt( liten_x_lista[siffra]**2 + liten_y_lista[siffra]**2) ))
siffra += 1
'''get rid of line to find periodic function'''
mod_lista = []
modn = 0
while modn < len(vinkel):
mod_lista.append(vinkel[modn] - (2*np.pi*tid_lista[modn])/np.sqrt(a**3))
modn += 1
'''make all inputs have period 1'''
squeeze_tid = []
squeezen = 0
while squeezen < len(tid_lista):
squeeze_tid.append(tid_lista[squeezen]/np.sqrt(a**3))
squeezen += 1
del mod_lista[-1:]
del tid_lista[-1:]
del squeeze_tid[-1:]
plt.plot(squeeze_tid,mod_lista)
plt.title('p(t) där a = ' + str(a) + ' och b = ' + str(b))
plt.show
Second more split up program (for reference values runs in ≈4:20):
import numpy as np
import matplotlib.pyplot as plt
'''function that generates the points of the orbit'''
def punkt(a,b,H,p):
GM = 39.5216489684
x_0 = a + np.sqrt(a**2 - b**2)
v_0 = np.sqrt(GM*(2/x_0 - 1/a))
konstant_period = np.sqrt(a**3)*H
h = 1/H
'''starting position given by an elliptic orbit '''
stor_x_lista = [x_0]
stor_y_lista = [0]
hastighet_x = [0]
hastighet_y = [v_0]
liten_x_lista = []
liten_y_lista = []
''' a loop that approximates the points of the orbit'''
t = 0
tid_lista = []
n = 0
while n < konstant_period:
hastighet_x.append(hastighet_x[n] - h*GM* stor_x_lista[n]/(np.sqrt(stor_x_lista[n]**2 + stor_y_lista[n]**2))**3)
stor_x_lista.append(stor_x_lista[n] + h*hastighet_x[n])
hastighet_y.append(hastighet_y[n] - h*GM*stor_y_lista[n]/(np.sqrt(stor_x_lista[n]**2 + stor_y_lista[n]**2))**3)
stor_y_lista.append(stor_y_lista[n] + h*hastighet_y[n])
'''smaller list of points to run faster'''
if n % p == 0:
liten_x_lista.append(stor_x_lista[n])
liten_y_lista.append(stor_y_lista[n])
tid_lista.append(t)
n += 1
t += h
return (liten_x_lista,liten_y_lista,tid_lista)
''' function that finds the angle'''
def vinkel(a,b,H,p):
'''import lists'''
liten_x_lista = punkt(a,b,H,p)[0]
liten_y_lista = punkt(a,b,H,p)[1]
tid_lista = punkt(a,b,H,p)[2]
'''find the angle'''
vinkel_lista = []
siffra = 0
while siffra < len(liten_x_lista):
if liten_y_lista[siffra ] >= 0:
vinkel_lista.append( np.arccos( liten_x_lista[siffra]/np.sqrt( liten_x_lista[siffra]**2 + liten_y_lista[siffra]**2)))
siffra += 1
elif liten_y_lista[siffra] < 0 :
vinkel_lista.append( np.pi + np.arccos( -liten_x_lista[siffra]/np.sqrt( liten_x_lista[siffra]**2 + liten_y_lista[siffra]**2) ))
siffra += 1
return (vinkel_lista, tid_lista)
def periodisk(a,b,H,p):
'''import lists'''
tid_lista = vinkel(a,b,H,p)[1]
vinkel_lista = vinkel(a,b,H,p)[0]
'''get rid of linear line to find p(t)'''
mod_lista = []
modn = 0
while modn < len(vinkel_lista):
mod_lista.append((vinkel_lista[modn] - (2*np.pi*tid_lista[modn])/np.sqrt(a**3)))
modn += 1
'''make all inputs have period 1'''
squeeze_tid = []
squeezen = 0
while squeezen < len(tid_lista):
squeeze_tid.append(tid_lista[squeezen]/np.sqrt(a**3))
squeezen += 1
del mod_lista[-1:]
del tid_lista[-1:]
del squeeze_tid[-1:]
return (squeeze_tid,mod_lista)
'''fixa 3d-punkt av p(a,b) a är konstant b varierar??? '''
def hitta_amp(a):
x_b = []
y_b = []
n_b = 0.1
while n_b <= a:
x_b.append(n_b)
y_b.append(punkt(a,n_b,10**5,10**3))
return 0
def graf(a,b,H,p):
plt.plot(periodisk(a,b,H,p)[0],periodisk(a,b,H,p)[1])
plt.show
I would assume the thing that is going wrong is that the program is running the same, slow code multiple times instead of just running it once and then accessing the data. Is the problem that everything is done locally and nothing is stored globally or is it something else?
Just as a heads up, the only thing I know about programming is basic syntax, I have no clue how to actually write and run programs. I ran all the code in spyder if that affects anything.
plt.plot(periodisk(a,b,H,p)[0],periodisk(a,b,H,p)[1])
This code runs periodisk twice with the same arguments, thus at this point we know we run things at least 2 times slower.
You should do some_var = periodisk(a,b,H,p) and then some_var[0], some_var[1]. Or just use unpacking:
plt.plot(*periodisk(a,b,H,p))
tid_lista = vinkel(a,b,H,p)[1]
vinkel_lista = vinkel(a,b,H,p)[0]
Again doing the same thing twice (total: 4*time of (current) vinkel function). Again, smart assignment to fix this:
vinkel_lista, tid_lista = vinkel(a,b,H,p)
liten_x_lista = punkt(a,b,H,p)[0]
liten_y_lista = punkt(a,b,H,p)[1]
tid_lista = punkt(a,b,H,p)[2]
And now you repeat yourself thrice. (total: 12 * time of current punkt function)
liten_x_lista, liten_y_lista, tid_lista = punkt(a,b,H,p)
punkt function is like in original, so we arrived as total being 12 times slower - which quite matches your time estimations. :)
You are calling the functions once per returned list, you should only call them once.
When a method returns multiple variables, (e.g. punkt):
def punkt(a,b,H,p):
# Here is all your code
return (liten_x_lista,liten_y_lista,tid_lista)
You must be careful to only call the function once:
result = punkt(a,b,H,p)
liten_x_lista = result[0]
liten_y_lista = result[1]
tid_lista = result[2]
# As opposed to:
liten_x_lista = punkt(a,b,H,p)[0] # 1st call, ignoring results 2 and 3
liten_y_lista = punkt(a,b,H,p)[1] # 2nd call, ignoring results 1 and 3
tid_lista = punkt(a,b,H,p)[2] # 3rd call, ignoring results 1 and 2
Note: I would personally not return a list, but use python's unpacking:
def punkt(a,b,H,p):
# Here is all your code
return liten_x_lista, liten_y_lista, tid_lista
And you'd access it:
liten_x_lista, liten_y_lista, tid_lista = punkt(a,b,H,p)

For loop returning odd values

I have two lists I want to iterate through:
efw = [13.882352941176457, 10.854092526690406, 94.25675675675676, 17.851739788199694, 14.63844797178131, 8.166189111747846, 5.278592375366564, 3.4129692832764347, -6.413612565445015, 11.678832116788328, 23.859649122807003, 4.545454545454564, 10.105580693815996, -3.562340966921118, -0.6684491978609763, 2.285714285714291, 8.505747126436791]
and
gini = [3.9215686274509887, 6.190476190476191, -7.733812949640296, -16.608391608391603, -13.458262350936979, 7.505518763796926, -12.884615384615394, -20.21276595744681, -19.839679358717433, -10.885608856088568, -12.891986062717764, -15.56420233463035, -12.66540642722116, -12.802768166089962, -11.336032388663975, -13.507625272331147, -1.882845188284521]
I want to create two new lists with + and - depending on the values in gini and efw. If the value in gini is positive, then a + should be added to the g list. Same with the e and efw lists. I have tried:
g = []
e = []
for n in gini:
if n > 0:
g.append("+")
g.append("-")
for f in efw:
if f > 0:
e.append("+")
e.append("-")
But for some positive values in gini there is a - sign... Why is the for loop appending the wrong symbols to the new lists?
Adding else solved it.
g = []
e = []
for n in gini:
if n > 0:
g.append("+")
else:
g.append("-")
for f in efw:
if f > 0:
e.append("+")
else:
e.append("-")

how to optimize this code and algorithm?

def compute_qty(self):
prduct = self.env["product.product"]
attribute = self.env["product.attribute.value"]
for line in self.qty_line_id:
stake_meter = line.pipe_size
line_qty = line.pipe_qty
pipe_list = []
qty_list = []
pipe_size_qty_list = []
pipe_size_qty_list_of_list = []
for obj in prduct.search([('product_tmpl_id','=',line.product_id.product_tmpl_id.id),('qty_available','>', 0)]):
for attr in obj.attribute_value_ids.ids:
for name in attribute.search([('id','=', str(attr))]):
pipe_product_size_qty_list = []
if float(str(name.name)) >= line.pipe_size:
pipe_size_qty_list.append(float(str(name.name)))
pipe_size_qty_list.append(obj.qty_available)
pipe_size_qty_list_of_list.append(pipe_size_qty_list)
pipe_size_qty_list = []
pipe_size_qty_list_of_list = sorted(pipe_size_qty_list_of_list)
i = 0
n = line_qty
t = 0
while n !=0 and i < len(pipe_size_qty_list_of_list):
pipe_qty_need = math.floor(float(pipe_size_qty_list_of_list[i][0] / pipe_meter))
if pipe_qty_need == 1 and pipe_size_qty_list_of_list[i][0] > pipe_meter:
if pipe_size_qty_list_of_list[i][1] <= n:
pipe_size = pipe_size_qty_list_of_list[i][0]
pipe_qty = pipe_size_qty_list_of_list[i][1]
pipe_list.append(pipe_size)
qty_list.append(pipe_qty)
if pipe_size_qty_list_of_list[i][1] > n:
pipe_size= pipe_size_qty_list_of_list[i][0]
pipe_qty= n
pipe_list.append(pipe_size)
qty_list.append(pipe_qty)
break
n= n - pipe_qty
t = t + pipe_qty
i += 1
if pipe_qty_need !=1 and pipe_size_qty_list_of_list[i][0] > pipe_meter:
if pipe_qty_need * pipe_size_qty_list_of_list[i][1] <= n:
pipe_size= pipe_size_qty_list_of_list[i][0]
pipe_qty= pipe_size_qty_list_of_list[i][1]
pipe_list.append(pipe_size)
qty_list.append(pipe_qty)
if pipe_qty_need * pipe_size_qty_list_of_list[i][1] > n:
pipe_size= pipe_size_qty_list_of_list[i][0]
pipe_qty=math.ceil(float(n/pipe_qty_need))
pipe_list.append(pipe_size)
qty_list.append(pipe_qty)
break
n= n - pipe_qty
t = t + pipe_qty
i += 1
raise UserError(_("pipe Test list %s")%(qty_list)) #here is the chosen quantity from the suitable size
im going to compute the quantity of possible length of pipe i can use to build a
new object, i had to pick pipes from the stock depends
on the size of the new object, for example:
- suppose i need 5 pipe of size "4.3" meter to build a new object:
- what i realy have in the stock:
- 1 pipe of size 5
- 4 pipe of size 4.1
- 2 pipe of size 4.4
- 10 pipe of size 9
cutting is possible while assembeling is not.
so here i should pick from the available quantity of size >= 4.3 sequentially from the smallest to the largest
until the quatity i need is equal to 5 "the number of needed pipe"
from the exampe above i have to chose the following pipe:
2 of "4.4"
1 of "5"
1 of "9" (here becaue it's sufficeint to produce 2 of (4.3) pipe)
what i actually did is appending pipe's size and quantity in a list of list in the form [[size,qty]] ,sorting and searching on that list.
here is the list:
[[4.4,2],[5,1],[9,10]]
what i should get from that list is the possible size and quantity
[[4.4,2],[5,1][9,1]]
it's work fine but im looking for optimizing my code.
thanks in advance

Categories