Inefficient Regularized Logistic Regression with Numpy - python
I am a machine learning noob attemping to implement regularized logistic regression via Newton's method.
The data have two features which are supposed to be expanded to 28 through finding all monomial terms of (u,v) up to degree 6
My code converges to the correct solution of norm(theta)=0.9384 after around 500 or so iterations when it should only take around 15 for lambda = 10, though the exercise is based on Matlab instead of Python. Each cycle of the parameter update is also very slow with my code and I am not sure exactly why. If anyone could explain why my code takes so many iterations to converge and why each iteration is painfully slow I would be very grateful!
The data are taken from Andrew Ng's open course exercise 5. The problem information and data can be found here http://openclassroom.stanford.edu/MainFolder/DocumentPage.php?course=MachineLearning&doc=exercises/ex5/ex5.html
although I posted the data and my code below.
X data with two features
0.051267,0.69956
-0.092742,0.68494
-0.21371,0.69225
-0.375,0.50219
-0.51325,0.46564
-0.52477,0.2098
-0.39804,0.034357
-0.30588,-0.19225
0.016705,-0.40424
0.13191,-0.51389
0.38537,-0.56506
0.52938,-0.5212
0.63882,-0.24342
0.73675,-0.18494
0.54666,0.48757
0.322,0.5826
0.16647,0.53874
-0.046659,0.81652
-0.17339,0.69956
-0.47869,0.63377
-0.60541,0.59722
-0.62846,0.33406
-0.59389,0.005117
-0.42108,-0.27266
-0.11578,-0.39693
0.20104,-0.60161
0.46601,-0.53582
0.67339,-0.53582
-0.13882,0.54605
-0.29435,0.77997
-0.26555,0.96272
-0.16187,0.8019
-0.17339,0.64839
-0.28283,0.47295
-0.36348,0.31213
-0.30012,0.027047
-0.23675,-0.21418
-0.06394,-0.18494
0.062788,-0.16301
0.22984,-0.41155
0.2932,-0.2288
0.48329,-0.18494
0.64459,-0.14108
0.46025,0.012427
0.6273,0.15863
0.57546,0.26827
0.72523,0.44371
0.22408,0.52412
0.44297,0.67032
0.322,0.69225
0.13767,0.57529
-0.0063364,0.39985
-0.092742,0.55336
-0.20795,0.35599
-0.20795,0.17325
-0.43836,0.21711
-0.21947,-0.016813
-0.13882,-0.27266
0.18376,0.93348
0.22408,0.77997
0.29896,0.61915
0.50634,0.75804
0.61578,0.7288
0.60426,0.59722
0.76555,0.50219
0.92684,0.3633
0.82316,0.27558
0.96141,0.085526
0.93836,0.012427
0.86348,-0.082602
0.89804,-0.20687
0.85196,-0.36769
0.82892,-0.5212
0.79435,-0.55775
0.59274,-0.7405
0.51786,-0.5943
0.46601,-0.41886
0.35081,-0.57968
0.28744,-0.76974
0.085829,-0.75512
0.14919,-0.57968
-0.13306,-0.4481
-0.40956,-0.41155
-0.39228,-0.25804
-0.74366,-0.25804
-0.69758,0.041667
-0.75518,0.2902
-0.69758,0.68494
-0.4038,0.70687
-0.38076,0.91886
-0.50749,0.90424
-0.54781,0.70687
0.10311,0.77997
0.057028,0.91886
-0.10426,0.99196
-0.081221,1.1089
0.28744,1.087
0.39689,0.82383
0.63882,0.88962
0.82316,0.66301
0.67339,0.64108
1.0709,0.10015
-0.046659,-0.57968
-0.23675,-0.63816
-0.15035,-0.36769
-0.49021,-0.3019
-0.46717,-0.13377
-0.28859,-0.060673
-0.61118,-0.067982
-0.66302,-0.21418
-0.59965,-0.41886
-0.72638,-0.082602
-0.83007,0.31213
-0.72062,0.53874
-0.59389,0.49488
-0.48445,0.99927
-0.0063364,0.99927
Y data
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
My code below:
import pandas as pd
import numpy as np
import math
def sigmoid(theta, x):
return 1/(1 + math.exp(-1*theta.T.dot(x)))
def cost_function(X, y, theta):
s = 0
for i in range(m):
loss = -y[i]*np.log(sigmoid(theta, X[i])) - (1-y[i])*np.log(1-sigmoid(theta, X[i]))
s += loss
s /= m
s += (lamb/(2*m))*sum(theta[j]**2 for j in range(1, 28))
return s
def gradient(theta, X, y):
# add regularization terms
add_column = theta * (lamb/m)
add_column[0] = 0
a = sum((sigmoid(theta, X[i]) - y[i])*X[i] + add_column for i in range(m))/m
return a
def hessian(theta, X, reg_matrix):
matrix = []
for i in range(28):
row = []
for j in range(28):
cell = sum(sigmoid(theta, X[k])*(1-sigmoid(theta, X[k]))*X[k][i]*X[k][j] for k in range(m))
row.append(cell)
matrix.append(row)
H = np.array(matrix)
H = np.add(H, reg_matrix)
return H
def newtons_method(theta, iterations):
for i in range(iterations):
g = gradient(theta, X, y)
H = hessian(theta, X, reg_matrix)
theta = theta - np.linalg.inv(H).dot(g)
cost = cost_function(X,y,theta)
print(cost)
return theta
def map_feature(u, v): # expand features according to problem instructions
new_row = []
new_row.append(1)
new_row.append(u)
new_row.append(v)
new_row.append(u**2)
new_row.append(u*v)
new_row.append(v**2)
new_row.append(u**3)
new_row.append(u**2*v)
new_row.append(u*v**2)
new_row.append(v**3)
new_row.append(u**4)
new_row.append(u**3*v)
new_row.append(u*v**3)
new_row.append(v**4)
new_row.append(u**2*v**2)
new_row.append(u**5)
new_row.append(u**4*v)
new_row.append(u*v**4)
new_row.append(v**5)
new_row.append(u**2*v**3)
new_row.append(u**3*v**2)
new_row.append(u**6)
new_row.append(u**5*v)
new_row.append(u*v**5)
new_row.append(v**6)
new_row.append(u**4*v**2)
new_row.append(u**2*v**4)
new_row.append(u**3*v**3)
return np.array(new_row)
with open('ex5Logx.dat', 'r') as f:
array = []
for line in f.readlines():
array.append(line.strip().split(','))
for a in array:
a[0], a[1] = float(a[0]), float(a[1].strip())
xdata= np.array(array)
with open('ex5Logy.dat', 'r') as f:
array = []
for line in f.readlines():
array.append(line.strip())
for i in range(len(array)):
array[i] = float(array[i])
ydata= np.array(array)
X_df = pd.DataFrame(xdata, columns=['score1', 'score2'])
y_df = pd.DataFrame(ydata, columns=['acceptence'])
m = len(y_df)
iterations = 15
ones = np.ones((m,1)) # intercept term in first column
X = np.array(X_df)
X = np.append(ones, X, axis=1)
y = np.array(y_df).flatten()
new_X = [] # prepare new array for expanded features
for i in range(m):
new_row = map_feature(X[i][1], X[i][2])
new_X.append(new_row)
X = np.array(new_X)
theta = np.array([0 for i in range(28)]) # initialize parameters to 0
lamb = 10 # lambda constant for regularization
reg_matrix = np.zeros((28,28),dtype=int) # n+1*n+1 regularization matrix
np.fill_diagonal(reg_matrix, 1)
reg_matrix[0] = 0
reg_matrix = (lamb/m)*reg_matrix
theta = newtons_method(theta, iterations)
print(np.linalg.norm(theta))
I am not 100% sure but i went through one tutorial on Logistic Regression using Newton's method(http://thelaziestprogrammer.com/sharrington/math-of-machine-learning/solving-logreg-newtons-method) and it's implementation of Newton's method is little different from yours.Actually there is one major difference. In Newton's method it's adding product of inv of hessian and gradient to theta whereas you are subtracting. I know about logistic regression normal way not using newton's method. Apart from that it seems that you are using loops in Cost function and Hessian which i think can be done with one statement in numpy than looping.
I would suggest refer to attached link which i gave as it has done all implementation in python numpy and there are no loops. Loops which you have created are impacting performance.
Related
Creating matrix with for loop in python
I have a list with 4 elements. Each element is a correct score that I am pulling from a form. For example: scoreFixed_1 = 1 scoreFixed_2 = 2 scoreFixed_3 = 3 scoreFixed_4 = 4 scoreFixed = [scoreFixed_1, scoreFixed_2, scoreFixed_3, scoreFixed_4] Then, I need to add: scoreFixed_1 to fixture[0][0] scoreFixed_2 to fixture[0][1] scoreFixed_3 to fixture[1][0] scoreFixed_4 to fixture[1][1] Hence, I need to create a triple for loop that outputs the following sequence so I can index to achieve the result above: 0 0 0 1 0 1 2 1 0 3 1 1 I have tried to use this to create this matrix, however I am only able to get the first column correct. Can anyone help? for x in range(1): for y in range(1): for z in range(4): print(z, x, y) which outputs: 0 0 0 1 0 0 2 0 0 3 0 0
Your logic does not generate the table, you want something like: rownum = 0 for x in range(2): for y in range(2): print (rownum, x, y) rownum += 1 (Edit: The question has been changed, to accomplish the new desire, you want something like this:) scoreIndex = 0 for x in range(2): for y in range(2): fixture[x][y] += scoreFixed[scoreIndex] scoreIndex += 1
After your edit, it seems like we can split the 'sequence' into: First column, regular ascending variable ( n += 1) Second and third column, binary counter (00, 01, 10, 11) 0 0 0 1 0 1 2 1 0 3 1 1 ^ ^------- These seem like a binary counter (00, 01, 10, 11) ^------ A regular ascending variable ( n += 1 ) Using that 'logic' we can create a code that looks like import itertools scoreFixed = 0 for i in itertools.product([0,1],repeat=2): print(scoreFixed, ' '.join(map(str,i))) scoreFixed += 1 And wil output: 0 0 0 1 0 1 2 1 0 3 1 1 As you can test in this online demo
for x in range(4): z = int(bin(x)[-1]) y = bin(x)[-2] y = int(y) if y.isdigit() else 0 print(x, y, z)
Python: Rearrange indices from np.where()
I would like to rearrange the indices in a tuple which was created with np.where. The reason for this is, that I would like to apply values to a number of special position (a pipe) in a mesh, which were pre-selected. The values shall be applied in the direction of flow. The direction of flow is defined from top left to bottom left = from (3,0) to (3,6) to (7,6) to (7,0). Currently, the order of the index tuple ind is according to the automatic sorting of the indices. This leads to the figure, below, where the values 1:10 are correctly applied, but 11:17 are obviously in reverse order. Is there a better way to grab the indices or how can I rearrange the tuple so that the values are applied in the direction of flow? import numpy as np import matplotlib.pyplot as plt # mesh size nx, ny = 10, 10 # special positions sx1, sx2, sy = .3, .7, .7 T = 1 # create mesh u0 = np.zeros((nx, ny)) # assign values to mesh u0[int(nx*sx1), 0:int(ny*sy)] = T u0[int(nx*sx2), 0:int(ny*sy)] = T u0[int(nx*sx1+1):int(nx*sx2), int(ny*sy-1)] = T # get indices of special positions ind = np.where(u0 == T) # EDIT: hand code sequence length = len(u0[int(nx*sx2), 0:int(ny*sy)]) ind[0][-length:] = np.flip(ind[0][-length:]) ind[1][-length:] = np.flip(ind[1][-length:]) # apply new values on special positions u0[ind] = np.arange(1, len(ind[1])+1,1) fig, ax = plt.subplots() fig = ax.imshow(u0, cmap=plt.get_cmap('RdBu_r')) ax.figure.colorbar(fig) plt.show() Old image (without edit) New image (after edit)
I think it's a fallacy to think that you can algorithmically deduce the correct "flow-sequence" of the grid points, by examining the contents of the tuple ind. Here's an example that illustrates why: 0 0 0 0 0 0 0 0 0 0 A B C D E 0 0 0 0 0 0 0 0 0 F 0 0 0 0 0 0 0 0 0 G 0 0 0 0 0 0 0 0 I H 0 0 0 0 0 0 0 0 J K 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 This is a schematic representation of your grid matrix, where, if you follow the letters A, B, C, etc, you will get the sequence of the flow through the grid-elements. However, note that, no matter how smart an algorithm is, it will be unable to choose between the two possible flows: A, B, C, D, E, F, G, H, I, J, K and A, B, C, D, E, F, G, H, K, J, I So, I think you will have to record the sequence explicitly yourself, rather than deduce it from the positions of the T values in the grid. Any algorithm will stop at the ambiguity at the grid location H in the above example
Generate binary strings that are at least in d hamming distance using ECC
I want to generate binary strings of length n=128 with the property that any pair of such strings are at least in d=10 hamming distance. For this I am trying to use an Error Correcting Code (ECC) with minimum distance d=10. However, I cannot find any ecc that has code words of 128 bit length. If the code word length (n) and d are a little bit smaller/greater than 128 and 10, that still works for me. Is there any ecc with this (similar) properties? Is there any python implementation of this?
Reed-Muller codes RM(3,7) have: a block size of 128 bits a minimum distance of 16 a message size of 64 bits First construct a basis like this: def popcnt(x): return bin(x).count("1") basis = [] by_ones = list(range(128)) by_ones.sort(key=popcnt) for i in by_ones: count = popcnt(i) if count > 3: break if count <= 1: basis.append(((1 << 128) - 1) // ((1 << i) | 1)) else: p = ((1 << 128) - 1) for b in [basis[k + 1] for k in range(7) if ((i >> k) & 1) != 0]: p = p & b basis.append(p) Then you can use any linear combination of them, which are created by XORing subsets of rows of the basis, for example: def encode(x, basis): # requires x < (1 << 64) r = 0 for i in range(len(basis)): if ((x >> i) & 1) != 0: r = r ^ basis[i] return r In some other implementation I found this was done by taking dot products with columns of the basis matrix and then reducing modulo 2. I don't know why they do that, it seems much easier to do it more directly by summing a subset of rows.
I needed the exact same thing. For me the naive approach worked very well! Simply generate random bit strings and check hamming distance between them, gradually building a list of strings that fulfills the requirement: def random_binary_array(width): """Generate random binary array of specific width""" # You can enforce additional array level constraints here return np.random.randint(2, size=width) def hamming2(s1, s2): """Calculate the Hamming distance between two bit arrays""" assert len(s1) == len(s2) # return sum(c1 != c2 for c1, c2 in zip(s1, s2)) # Wikipedia solution return np.count_nonzero(s1 != s2) # a faster solution def generate_hamm_arrays(n_values, size, min_hamming_dist=5): """ Generate a list of binary arrays ensuring minimal hamming distance between the arrays. """ hamm_list = [] while len(hamm_list) < size: test_candidate = random_binary_array(n_values) valid = True for word in hamm_list: if (word == test_candidate).all() or hamming2(word, test_candidate) <= min_hamming_dist: valid = False break if valid: hamm_list.append(test_candidate) return np.array(hamm_list) print(generate_hamm_arrays(16, 10)) Output: [[0 0 1 1 0 1 1 1 0 1 0 1 1 1 1 1] [1 0 1 0 0 1 0 0 0 1 0 0 1 0 1 1] [1 1 0 0 0 0 1 0 0 0 1 1 1 1 0 0] [1 0 0 1 1 0 0 1 1 0 0 1 1 1 0 1] [0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 1] [1 1 0 0 0 0 0 1 0 1 1 1 0 1 1 1] [1 1 0 1 0 1 0 1 1 1 1 0 0 1 0 0] [0 1 1 1 1 1 1 0 0 0 1 1 0 0 0 0] [1 1 0 0 0 0 1 1 1 0 0 1 0 0 0 1] [0 1 0 1 1 0 1 1 1 1 1 1 1 1 1 0]] And it's not too slow as long as you don't want a very dense list of strings (a small number of bits in a string + large hamming distance). From your specifications (128 bit strings with hamming distance 10 it is no problem) we can generate a 1000 bit strings in under 0.2 seconds on a really weak cpu: import timeit timeit.timeit(lambda: generate_hamm_arrays(n_values=128, size=100, min_hamming_dist=10), number=10) >> 0.19202665984630585 Hope this solution is sufficient for you too.
My O(n*n!) solution (works in a reasonable time for N<14) def hammingDistance(n1, n2): return bin(np.bitwise_xor(n1, n2)).count("1") N = 10 # binary code of length N D = 6 # with minimum distance D M = 2**N # number of unique codes in general # construct hamming distance matrix A = np.zeros((M, M), dtype=int) for i in range(M): for j in range(i+1, M): A[i, j] = hammingDistance(i, j) A += A.T def recursivly_find_legit_numbers(nums, codes=set()): codes_to_probe = nums for num1 in nums: codes.add(num1) codes_to_probe = codes_to_probe - {num1} for num2 in nums - {num1}: if A[num1, num2] < D: "Distance isn't sufficient, remove this number from set" codes_to_probe = codes_to_probe - {num2} if len(codes_to_probe): recursivly_find_legit_numbers(codes_to_probe, codes) return codes group_of_codes = {} for i in tqdm(range(M)): satisfying_numbers = np.where(A[i] >= D)[0] satisfying_numbers = satisfying_numbers[satisfying_numbers > i] nums = set(satisfying_numbers) if len(nums) == 0: continue group_of_codes[i] = recursivly_find_legit_numbers(nums, set()) group_of_codes[i].add(i) largest_group = 0 for i, nums in group_of_codes.items(): if len(nums) > largest_group: largest_group = len(nums) ind = i print(f"largest group for N={N} and D={D}: {largest_group}") print("Number of unique groups:", len(group_of_codes)) largest group for N=10 and D=6: 6 Number of unique groups: 992 # generate largest group of codes [format(num, f"0{N}b") for num in group_of_codes[ind]] ['0110100001', '0001000010', '1100001100', '1010010111', '1111111010', '0001111101']
Python numpy zeros array being assigned 1 for every value when only one index is updated
The following is my code: amount_features = X.shape[1] best_features = np.zeros((amount_features,), dtype=int) best_accuracy = 0 best_accuracy_index = 0 def find_best_features(best_features, best_accuracy): for i in range(amount_features): trial_features = best_features trial_features[i] = 1 svc = SVC(C = 10, gamma = .1) svc.fit(X_train[:,trial_features==1],y_train) y_pred = svc.predict(X_test[:,trial_features==1]) accuracy = metrics.accuracy_score(y_test,y_pred) if (accuracy > best_accuracy): best_accuracy = accuracy best_accuracy_index = i print(best_accuracy_index) best_features[best_accuracy_index] = 1 return best_features, best_accuracy bf, ba = find_best_features(best_features, best_accuracy) print(bf, ba) And this is my output: 25 [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1] 0.865853658537 And my expected output: 25 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0] 0.865853658537 I am trying to update the zeros array with the index that gives the highest accuracy. As you see it should be index 25, and I follow that by assigning the 25 index for my array equal to 1. However, when I print the array it shows every index has been updated to 1. Not sure what is the mishap. Thanks for spending your limited time on Earth to help me.
Change trial_features = best_features to trial_features = numpy.copy(best_features). Reasoning behind the change is already given by #Michael Butscher.
Python Ignoring What is in a list?
Working on a project for CS1 that prints out a grid made of 0s and adds shapes of certain numbered sizes to it. Before it adds a shape it needs to check if A) it will fit on the grid and B) if something else is already there. The issue I am having is that when run, the function that checks to make sure placement for the shapes is valid will always do the first and second shapes correctly, but any shape added after that will only "see" the first shape added when looking for a collision. I checked to see if it wasnt taking in the right list after the first time but that doesnt seem to be it. Example of the issue.... Shape Sizes = 4, 3, 2, 1 Python Outputs: 4 4 4 4 1 2 3 0 4 4 4 4 2 2 3 0 4 4 4 4 3 3 3 0 4 4 4 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 It Should Output: 4 4 4 4 3 3 3 1 4 4 4 4 3 3 3 0 4 4 4 4 3 3 3 0 4 4 4 4 2 2 0 0 0 0 0 0 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 What's going on here? Full Code is below... def binCreate(size): binlist = [[0 for col in range(size)] for row in range(size)] return binlist def binPrint(lst): for row in range(len(lst)): for col in range(len(lst[row])): print(lst[row][col], end = " ") print() def itemCreate(fileName): lst = [] for i in open(fileName): i = i.split() lst = i lst = [int(i) for i in lst] return lst def main(): size = int(input("Bin Size: ")) fileName = str(input("Item Size File: ")) binList = binCreate(size) blockList = itemCreate(fileName) blockList.sort(reverse = True) binList = checker(binList, len(binList), blockList) binPrint(binList) def isSpaceFree(binList, r, c, size): if r + size > len(binList[0]): return False elif c + size > len(binList[0]): return False for row in range(r, r + size): for col in range(c, c + size): if binList[r][c] != 0: return False elif binList[r][c] == size: return False return True def checker(binList, gSize, blockList): for i in blockList: r = 0 c = 0 comp = False while comp != True: check = isSpaceFree(binList, r, c, i) if check == True: for x in range(c, c+ i): for y in range(r, r+ i): binList[x][y] = i comp = True else: print(c) print(r) r += 1 if r > gSize: r = 0 c += 1 if c > gSize: print("Imcompadible") comp = True print(i) binPrint(binList) input() return binList
Your code to test for open spaces looks in binList[r][c] (where r is a row value and c is a column value). However, the code that sets the values once an open space has been found sets binList[x][y] (where x is a column value and y is a row value). The latter is wrong. You want to set binList[y][x] instead (indexing by row, then column). That will get you a working solution, but it will still not be exactly what you say you expect (you'll get a reflection across the diagonal). This is because your code updates r first, then c only when r has exceeded the bin size. If you want to place items to the right first, then below, you need to swap them. I'd suggest using two for loops for r and c, rather than a while too, but to make it work in an elegant way you'd probably need to factor out the "find one item's place" code so you could return from the inner loop (rather than needing some complicated code to let you break out of both of the nested loops).