Implementing a simple Gaussian Naive Bayes algorithm in Python
I'm a real amateur trying to implement a sort of 'simplified' version of the Naive Bayes algorithm in Python, and I'm having a lot of trouble [perhaps because I'm not sure I completely understand how the algorithm works]. I would appreciate any help or suggestions. This is the code I have:
    class GaussianNB(object):

        def __init__(self):
            '''
            Constructor
            '''
            # This variable will hold the gaussian distribution over your data
            # In fact, you need a distribution per class for each feature variable.
            # This can be done as a list of lists.
            self.classmodels_count = {}
            self.classmodels = {}
            self.classmodelsMeanAndVariance = {}
            self.featureTokenCount = 0
            self.featureTypeCount = 0

        def train(self, trainingdata):
            for i in trainingdata:
                current_class = i[0]
                features = i[1]
                if self.classmodels.has_key(current_class):
                    current_class_model = self.classmodels[current_class]
                    self.classmodels_count[current_class] = self.classmodels_count[current_class] + 1
                else:
                    current_class_model = {}
                    self.classmodels_count[current_class] = 1
                for f in features:
                    feature = f[0]
                    value = f[1]
                    if current_class_model.has_key(feature):
                        list_of_values = current_class_model[feature]
                        list_of_values.append(value)
                        current_class_model[feature] = list_of_values
                    else:
                        list_of_values = []
                        list_of_values.append(value)
                        current_class_model[feature] = list_of_values
                self.classmodels[current_class] = current_class_model
            for a_class in self.classmodels.keys():
                a_class_model = self.classmodels[a_class]
                a_class_model_mean_and_variance = {}
                for feature in a_class_model.keys():
                    a_class_model_mean_and_variance[feature] = findMeanSD(np.array(a_class_model[feature]))
                self.classmodelsMeanAndVariance[a_class] = a_class_model_mean_and_variance

        def classify(self, testing_vecs):
            outputs = []
            for vec in testing_vecs:
                features = vec[1]
                class_model_output_prob = {}
                for a_class in self.classmodelsMeanAndVariance.keys():
                    a_class_output_prob = 0.0
                    a_class_model_mean_and_variance = self.classmodelsMeanAndVariance[a_class]
                    for feature_value in features:
                        feature = feature_value[0]
                        value = feature_value[1]
                        # simply ignore a feature if it's not seen in training
                        if a_class_model_mean_and_variance.has_key(feature):
                            feature_mean = a_class_model_mean_and_variance[feature][0]
                            feature_std = a_class_model_mean_and_variance[feature][1]
                            a_class_output_prob = a_class_output_prob + math.log10(norm(value, feature_mean, feature_std))
                    # ignoring P(class) prior, i.e. assuming equal priors
                    class_model_output_prob[a_class_output_prob] = a_class
                probs = class_model_output_prob.keys()
                print probs
                probs.sort()
                max_prob = probs[len(probs) - 1]
                max_class = class_model_output_prob[max_prob]
                outputs.append(max_class)
            return outputs
When I run it on some data, the error I get is:
    Traceback (most recent call last):
      File "C:\Users\Toshiba\workspace\Assignment6\src\gnb_test.py", line 34, in <module>
        gaussian = Model.train(testData)
      File "C:\Users\Toshiba\workspace\Assignment6\src\gnb.py", line 91, in train
        for f in features:
    TypeError: 'numpy.float64' object is not iterable
And I don't really understand what it means at all.
Your traceback suggests that the problem is that you are trying to iterate through features, but features is a float rather than a list or tuple: it can't be broken into individual elements. I think it is a float because the lines
    for i in trainingdata:
        current_class = i[0]
        features = i[1]
suggest that features keeps getting overwritten with one number after another, when what you seem to want is to collect the numbers into an iterable type. Try:
    features = []
    for i in trainingdata:
        current_class = i[0]
        features.append(i[1])
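For reference, here is a hypothetical toy dataset in the shape that train() as written can consume: each element is (class_label, [(feature, value), ...]), so i[1] is itself iterable. The names and numbers below are made up for illustration, and this assumes the findMeanSD helper called inside train() is defined elsewhere in the assignment:

    # Hypothetical toy data: i[0] is the class label, i[1] is a list of
    # (feature_name, numeric_value) pairs, so `for f in features` works.
    training_data = [
        ('spam', [('length', 120.0), ('caps_ratio', 0.35)]),
        ('ham',  [('length',  80.0), ('caps_ratio', 0.05)]),
        ('spam', [('length', 150.0), ('caps_ratio', 0.50)]),
    ]

    model = GaussianNB()
    model.train(training_data)  # no 'numpy.float64' is not iterable error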
Related
Getting specific values from ASCII table
I'm currently creating a genetic algorithm and am trying to only get certain values from the ASCII table so the runtime of the algorithm can be a bit faster. In the code below I get the values between 9 and 127, but I only need the values 9-10 and 32-127 from the ASCII table, and I'm not sure how to get only those specific values. The code below is done in Python.

    import numpy as np

    TARGET_PHRASE = """The smartest and fastest Pixel yet. Google Tensor: Our first custom-built processor. The first processor designed by Google and made for Pixel, Tensor makes the new Pixel phones our most powerful yet. The most advanced Pixel Camera ever. Capture brilliant color and vivid detail with Pixels best-in-class computational photography and new pro-level lenses."""  # target DNA
    POP_SIZE = 4000           # population size
    CROSS_RATE = 0.8          # mating probability (DNA crossover)
    MUTATION_RATE = 0.00001   # mutation probability
    N_GENERATIONS = 100000

    DNA_SIZE = len(TARGET_PHRASE)
    TARGET_ASCII = np.fromstring(TARGET_PHRASE, dtype=np.uint8)  # convert string to number
    ASCII_BOUND = [9, 127]


    class GA(object):
        def __init__(self, DNA_size, DNA_bound, cross_rate, mutation_rate, pop_size):
            self.DNA_size = DNA_size
            DNA_bound[1] += 1
            self.DNA_bound = DNA_bound
            self.cross_rate = cross_rate
            self.mutate_rate = mutation_rate
            self.pop_size = pop_size

            self.pop = np.random.randint(*DNA_bound, size=(pop_size, DNA_size)).astype(np.int8)  # int8 for convert to ASCII

        def translateDNA(self, DNA):  # convert to readable string
            return DNA.tostring().decode('ascii')

        def get_fitness(self):  # count how many character matches
            match_count = (self.pop == TARGET_ASCII).sum(axis=1)
            return match_count

        def select(self):
            fitness = self.get_fitness()  # add a small amount to avoid all zero fitness
            idx = np.random.choice(np.arange(self.pop_size), size=self.pop_size, replace=True,
                                   p=fitness/fitness.sum())
            return self.pop[idx]

        def crossover(self, parent, pop):
            if np.random.rand() < self.cross_rate:
                i_ = np.random.randint(0, self.pop_size, size=1)  # select another individual from pop
                cross_points = np.random.randint(0, 2, self.DNA_size).astype(np.bool)  # choose crossover points
                parent[cross_points] = pop[i_, cross_points]  # mating and produce one child
            return parent

        def mutate(self, child):
            for point in range(self.DNA_size):
                if np.random.rand() < self.mutate_rate:
                    child[point] = np.random.randint(*self.DNA_bound)  # choose a random ASCII index
            return child

        def evolve(self):
            pop = self.select()
            pop_copy = pop.copy()
            for parent in pop:  # for every parent
                child = self.crossover(parent, pop_copy)
                child = self.mutate(child)
                parent[:] = child
            self.pop = pop


    if __name__ == '__main__':
        ga = GA(DNA_size=DNA_SIZE, DNA_bound=ASCII_BOUND, cross_rate=CROSS_RATE,
                mutation_rate=MUTATION_RATE, pop_size=POP_SIZE)

        for generation in range(N_GENERATIONS):
            fitness = ga.get_fitness()
            best_DNA = ga.pop[np.argmax(fitness)]
            best_phrase = ga.translateDNA(best_DNA)
            print('Gen', generation, ': ', best_phrase)
            if best_phrase == TARGET_PHRASE:
                break
            ga.evolve()
You need a custom method to generate random samples in the ranges 9-10 and 32-127, like:

    def my_rand(pop_size, DNA_size):
        bold1 = [9, 10]
        bold2 = list(range(32, 128))  # 32-127 inclusive
        bold = bold1 + bold2
        pop = np.random.choice(bold, (pop_size, DNA_size)).astype(np.int8)
        return pop

Then call this method in place of the population initialization in __init__. Delete:

    self.pop = np.random.randint(*DNA_bound, size=(pop_size, DNA_size)).astype(np.int8)  # int8 for convert to ASCII

and call instead:

    self.pop = my_rand(pop_size, DNA_size)
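A quick sanity check of this sketch (assuming numpy is imported as np, as in the question):

    # Every sampled gene should be 9, 10, or in 32..127:
    pop = my_rand(pop_size=4, DNA_size=10)
    allowed = set([9, 10]) | set(range(32, 128))
    assert pop.shape == (4, 10)
    assert all(int(g) in allowed for g in pop.ravel())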
How to get a prediction value using a trained model in Keras?
I want to use the learned model to get a new prediction, so I made a function for what I want to get. But it doesn't work: when I use this function, it always returns the same values. I think there's a problem with the encoding procedure, but I don't know how to solve it. I changed a lot of things I was suspicious of, but it still doesn't work. Please look at this code and tell me what the problem is. Thank you.

    def determineRank(t, n, bid_t, w, h, k):
        # t = str(input())      # time
        # n = int(input())      # now ranking
        # bid_t = int(input())  # bid amount
        # w = int(input())      # weekday
        # h = int(input())      # holiday
        # k = str(input())      # keyword
        encode = LabelEncoder()
        # x = np.concatenate((t,n,bid_t,w,h,k), axis=1).reshape(1,6,1)
        t = categorize_time(t)
        k = encode.fit_transform([k])
        new_list = []
        new_list = [t, n, bid_t, w, h, k]
        """
        new_list = new_list.append(t)
        new_list = new_list.append(n)
        new_list = new_list.append(bid_t)
        new_list = new_list.append(w)
        new_list = new_list.append(h)
        new_list = new_list.append(k)
        k = encode.fit_transform(k)
        """
        new_list = np.array(new_list)
        new_list = new_list.reshape(1, 6, 1)
        model = load_model('03-0.728448.hdf5')
        rank = model.predict(new_list)
        return rank[0]
You are right, there is a problem with the encoding procedure. Every time you call LabelEncoder(), you create a new encoder, so any value passed to the new encoder gets the same encoded result. To predict from encoder-based values, you should use the same encoder that was used for model fitting.

Fit:

    ...
    encode = LabelEncoder()                             # create new encoder
    k = encode.fit_transform([k])                       # fit encoder and transform value
    save_encoder(encode, 'some_name_for_encoder.hdf5')  # save fitted encoder for further usage
    ...

Predict:

    ...
    encode = load_encoder('some_name_for_encoder.hdf5')  # load fitted encoder
    k = encode.transform([k])                            # transform value
    ...
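Note that save_encoder and load_encoder above are placeholder helpers, not library functions. A minimal way to implement them, assuming sklearn's LabelEncoder and the joblib library are available (the keyword values are hypothetical):

    import joblib
    from sklearn.preprocessing import LabelEncoder

    def save_encoder(encoder, path):
        # Persist the fitted encoder to disk (any pickling mechanism works)
        joblib.dump(encoder, path)

    def load_encoder(path):
        return joblib.load(path)

    # Fit once on all training keywords, save, then reuse at prediction time:
    encode = LabelEncoder()
    encode.fit(['keyword_a', 'keyword_b', 'keyword_c'])
    save_encoder(encode, 'keyword_encoder.joblib')

    encode = load_encoder('keyword_encoder.joblib')
    k = encode.transform(['keyword_b'])  # same encoding as at training time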
Numpy Array Manipulation Based on Internal Values
I am trying to accomplish a weird task. I need to complete the following without the use of sklearn, and preferably with numpy:

Given a dataset, split the data into 5 equal "folds", or partitions. Within each partition, split the data into a "training" and "testing" set, with an 80/20 split.

Here is the catch: the dataset is labeled with classes. So take for example a dataset with 100 instances, with class A having 33 samples and class B having 67 samples. I should create 5 folds of 20 data instances each, where in each fold class A has something like 6 or 7 (one third) of the values and class B has the rest.

My issues are that: I do not know how to properly return a test and training set for each fold, despite being able to split the data appropriately, and, more importantly, I do not know how to incorporate the proper division of the number of elements per class.

My current code is here; it is commented where I am stuck:

    import numpy

    def csv_to_array(file):
        # Open the file, and load it delimiting on the ',' for a comma separated value file
        data = open(file, 'r')
        data = numpy.loadtxt(data, delimiter=',')
        # Loop through the data in the array
        for index in range(len(data)):
            # Utilize a try catch to try and convert to float;
            # if it can't convert to float, convert to 0
            try:
                data[index] = [float(x) for x in data[index]]
            except Exception:
                data[index] = 0
            except ValueError:
                data[index] = 0
        # Return the now type-formatted data
        return data

    def five_cross_fold_validation(dataset):
        # print("DATASET", dataset)
        numpy.random.shuffle(dataset)
        num_rows = dataset.shape[0]
        split_mark = int(num_rows / 5)
        folds = []
        temp1 = dataset[:split_mark]
        temp2 = dataset[split_mark:split_mark*2]
        temp3 = dataset[split_mark*2:split_mark*3]
        temp4 = dataset[split_mark*3:split_mark*4]
        temp5 = dataset[split_mark*4:]
        folds.append(temp1)
        folds.append(temp2)
        folds.append(temp3)
        folds.append(temp4)
        folds.append(temp5)
        # folds = numpy.asarray(folds)

        for fold in folds:
            # fold = numpy.asarray(fold)
            num_rows = fold.shape[0]
            split_mark = int(num_rows * .8)
            fold_training = fold[split_mark:]
            fold_testing = fold[:split_mark]
            print(type(fold))
            # fold.tolist()
            list(fold)
            print(type(fold))
            del fold[0:len(fold)]
            fold.append(fold_training)
            fold.append(fold_testing)
            fold = numpy.asarray(fold)
            # Somehow, return a testing and training set within each fold

        # print(folds)
        return folds

    def confirm_size(folds):
        total = 0
        for fold in folds:
            curr = len(fold)
            total = total + curr
        return total

    def main():
        print("BEGINNING CFV")
        ecoli = csv_to_array('Classification/ecoli.csv')
        print(len(ecoli))
        folds = five_cross_fold_validation(ecoli)
        size = confirm_size(folds)
        print(size)

    main()

Additionally, for reference, I have attached the csv I am working with (it is a modification of the UCI Ecoli Dataset). The classes here are the values in the last column: 0, 1, 2, 3, 4. It is important to note that there are not equal amounts of each class.
    0.61,0.45,0.48,0.5,0.48,0.35,0.41,0
    0.17,0.38,0.48,0.5,0.45,0.42,0.5,0
    0.44,0.35,0.48,0.5,0.55,0.55,0.61,0
    0.43,0.4,0.48,0.5,0.39,0.28,0.39,0
    0.42,0.35,0.48,0.5,0.58,0.15,0.27,0
    0.23,0.33,0.48,0.5,0.43,0.33,0.43,0
    0.37,0.52,0.48,0.5,0.42,0.42,0.36,0
    0.29,0.3,0.48,0.5,0.45,0.03,0.17,0
    0.22,0.36,0.48,0.5,0.35,0.39,0.47,0
    0.23,0.58,0.48,0.5,0.37,0.53,0.59,0
    0.47,0.47,0.48,0.5,0.22,0.16,0.26,0
    0.54,0.47,0.48,0.5,0.28,0.33,0.42,0
    0.51,0.37,0.48,0.5,0.35,0.36,0.45,0
    0.4,0.35,0.48,0.5,0.45,0.33,0.42,0
    0.44,0.34,0.48,0.5,0.3,0.33,0.43,0
    0.44,0.49,0.48,0.5,0.39,0.38,0.4,0
    0.43,0.32,0.48,0.5,0.33,0.45,0.52,0
    0.49,0.43,0.48,0.5,0.49,0.3,0.4,0
    0.47,0.28,0.48,0.5,0.56,0.2,0.25,0
    0.32,0.33,0.48,0.5,0.6,0.06,0.2,0
    0.34,0.35,0.48,0.5,0.51,0.49,0.56,0
    0.35,0.34,0.48,0.5,0.46,0.3,0.27,0
    0.38,0.3,0.48,0.5,0.43,0.29,0.39,0
    0.38,0.44,0.48,0.5,0.43,0.2,0.31,0
    0.41,0.51,0.48,0.5,0.58,0.2,0.31,0
    0.34,0.42,0.48,0.5,0.41,0.34,0.43,0
    0.51,0.49,0.48,0.5,0.53,0.14,0.26,0
    0.25,0.51,0.48,0.5,0.37,0.42,0.5,0
    0.29,0.28,0.48,0.5,0.5,0.42,0.5,0
    0.25,0.26,0.48,0.5,0.39,0.32,0.42,0
    0.24,0.41,0.48,0.5,0.49,0.23,0.34,0
    0.17,0.39,0.48,0.5,0.53,0.3,0.39,0
    0.04,0.31,0.48,0.5,0.41,0.29,0.39,0
    0.61,0.36,0.48,0.5,0.49,0.35,0.44,0
    0.34,0.51,0.48,0.5,0.44,0.37,0.46,0
    0.28,0.33,0.48,0.5,0.45,0.22,0.33,0
    0.4,0.46,0.48,0.5,0.42,0.35,0.44,0
    0.23,0.34,0.48,0.5,0.43,0.26,0.37,0
    0.37,0.44,0.48,0.5,0.42,0.39,0.47,0
    0,0.38,0.48,0.5,0.42,0.48,0.55,0
    0.39,0.31,0.48,0.5,0.38,0.34,0.43,0
    0.3,0.44,0.48,0.5,0.49,0.22,0.33,0
    0.27,0.3,0.48,0.5,0.71,0.28,0.39,0
    0.17,0.52,0.48,0.5,0.49,0.37,0.46,0
    0.36,0.42,0.48,0.5,0.53,0.32,0.41,0
    0.3,0.37,0.48,0.5,0.43,0.18,0.3,0
    0.26,0.4,0.48,0.5,0.36,0.26,0.37,0
    0.4,0.41,0.48,0.5,0.55,0.22,0.33,0
    0.22,0.34,0.48,0.5,0.42,0.29,0.39,0
    0.44,0.35,0.48,0.5,0.44,0.52,0.59,0
    0.27,0.42,0.48,0.5,0.37,0.38,0.43,0
    0.16,0.43,0.48,0.5,0.54,0.27,0.37,0
    0.06,0.61,0.48,0.5,0.49,0.92,0.37,1
    0.44,0.52,0.48,0.5,0.43,0.47,0.54,1
    0.63,0.47,0.48,0.5,0.51,0.82,0.84,1
    0.23,0.48,0.48,0.5,0.59,0.88,0.89,1
    0.34,0.49,0.48,0.5,0.58,0.85,0.8,1
    0.43,0.4,0.48,0.5,0.58,0.75,0.78,1
    0.46,0.61,0.48,0.5,0.48,0.86,0.87,1
    0.27,0.35,0.48,0.5,0.51,0.77,0.79,1
Edit: I replaced np.random.shuffle(A) with A = np.random.permutation(A); the only difference is that it doesn't mutate the input array. This doesn't make any difference in this code, but it is safer in general.

The idea is to randomly sample the input using numpy.random.permutation. Once the rows are shuffled, we just need to iterate over all the possible test sets (a sliding window of the desired size, here 20% of the input size). The corresponding training sets are just composed of all the remaining elements. Because we shuffled the input, this will approximately preserve the original class distribution on all subsets even though we pick them in order.

The following code iterates over the test/train set combinations:

    import numpy as np

    def csv_to_array(file):
        with open(file, 'r') as f:
            data = np.loadtxt(f, delimiter=',')
        return data

    def classes_distribution(A):
        """Print the class distributions of array A."""
        nb_classes = np.unique(A[:,-1]).shape[0]
        total_size = A.shape[0]
        for i in range(nb_classes):
            class_size = sum(row[-1] == i for row in A)
            class_p = class_size/total_size
            print(f"\t P(class_{i}) = {class_p:.3f}")

    def random_samples(A, test_set_p=0.2):
        """Split the input array A in two uniformly chosen random sets:
        test/training. Repeat this until all rows have been yielded at
        least once as part of a test set."""
        A = np.random.permutation(A)
        sample_size = int(test_set_p*A.shape[0])
        for start in range(0, A.shape[0], sample_size):
            end = start + sample_size
            yield {
                "test": A[start:end,],
                "train": np.append(A[:start,], A[end:,], 0)
            }

    def main():
        ecoli = csv_to_array('ecoli.csv')
        print("Input set shape: ", ecoli.shape)
        print("Input set class distribution:")
        classes_distribution(ecoli)
        print("Training sets class distributions:")
        for iteration in random_samples(ecoli):
            test_set = iteration["test"]
            training_set = iteration["train"]
            classes_distribution(training_set)
            print("---")
            # ... do whatever with these two sets

    main()

It produces output of the form:

    Input set shape:  (169, 8)
    Input set class distribution:
         P(class_0) = 0.308
         P(class_1) = 0.213
         P(class_2) = 0.207
         P(class_3) = 0.118
         P(class_4) = 0.154
    Training sets class distributions:
         P(class_0) = 0.316
         P(class_1) = 0.206
         P(class_2) = 0.199
         P(class_3) = 0.118
         P(class_4) = 0.162
    ...
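Shuffling only preserves the class proportions in expectation. If each fold must match the class proportions more exactly (the one-third/two-thirds split described in the question), one possible sketch is to permute and split each class separately, then stack the pieces. stratified_folds here is a name I am introducing for illustration, not a library function:

    import numpy as np

    def stratified_folds(A, n_folds=5):
        """Split the rows of A (class label in the last column) into n_folds
        arrays, each approximately preserving the class proportions of A."""
        folds = [[] for _ in range(n_folds)]
        for c in np.unique(A[:, -1]):
            # shuffle the rows of this class, then deal them out across folds
            rows = np.random.permutation(A[A[:, -1] == c])
            for i, chunk in enumerate(np.array_split(rows, n_folds)):
                folds[i].append(chunk)
        return [np.vstack(f) for f in folds]

Each returned fold can then be split 80/20 into training and testing sets as in the answer above.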
TypeError for predict_proba(np.array(test))
    model = LogisticRegression()
    model = model.fit(X, y)
    test_data = [1,2,3,4,5,6,7,8,9,10,11,12,13]
    test_prediction = model.predict_proba(np.array(test_data))

    max = -1.0
    res = 0
    for i in range(test_prediction):
        if test_prediction[i] > max:
            max = test_prediction[i]
            res = i
    if res == 0:
        print('A')
    elif res == 1:
        print('B')
    else:
        print('C')

Using the above Python code I have to predict the probabilities of the 3 possible results (A, B, C). The probabilities are saved in test_prediction and can be printed as:

Output:

    [[ 0.82882588  0.08641236  0.08476175]]

But the remaining part gives an error:

    for i in range(test_prediction):
    TypeError: only integer scalar arrays can be converted to a scalar index

I want to find the max probability and then display the event that is most likely to occur (A/B/C). How do I go about this?
You can also use numpy.argmax, which will directly give you the index of the largest value.

    import numpy as np

    # test_prediction is most probably an np array already
    pred = np.array(test_prediction)
    classes_val = np.argmax(pred, axis=1)
    for res in classes_val:
        if res == 0:
            print('A')
        elif res == 1:
            print('B')
        else:
            print('C')
The problem is in passing an array to range. In this case you should use the length of the array:

    range(len(test_prediction))

You can also simplify your code:

    import operator
    # ...
    enum_predict = enumerate(test_prediction)
    res = max(enum_predict, key=operator.itemgetter(1))[0]

enumerate converts the array to a sequence of tuples (index, item), and key=operator.itemgetter(1) makes max compare the tuples by their second value.
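For example, applied to the row of probabilities from the question (note that predict_proba returns a 2-D array, so we take row 0 first):

    import operator

    test_prediction = [[0.82882588, 0.08641236, 0.08476175]]
    res = max(enumerate(test_prediction[0]), key=operator.itemgetter(1))[0]
    print(res)  # 0, i.e. 'A'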
You can do something like this:

    predict_prob_df = pd.DataFrame(model.predict_proba(test_data))
    max_prob = predict_prob_df.apply(max, axis=1)
    predicted_output = pd.DataFrame(model.predict(test_data))

Then you can concat them:

    final_frame = pd.concat([max_prob, predicted_output], axis=1)

This way you do not need to use the for loop, which was causing the error.
I came up with another solution:

    for i in range(3):
        if np.take(test_prediction, i) > max:
            max = np.take(test_prediction, i)
            res = i
    if res == 0:
        .....

This works by accessing the values in test_prediction by index using np.take. But the solution specified by @Vivek_Kumar seems more correct and efficient.
k-nearest neighbours algorithm in Python
This is my code for the k-nearest neighbour algorithm:

    import numpy as np
    from EuclideanDistance import EuclideanDistance

    dataset = np.loadtxt('C:\Users\Toshiba\Documents\machine learning\RealEstate.csv', delimiter=',', usecols=(2,3,4,5))
    p1 = ()

    def normalizeToZscores(data):
        '''Normalizes the variables to z-scores'''
        zScores = list()
        for s in data:
            zScore = (s - np.mean(data))/np.std(data)
            zScores.append(zScore)
        return np.asarray(zScores)

    def InOutBudget(data):
        '''Decides whether a particular house is within or outside
        the budget of $153000 and assigns values of 1 and 0 respectively'''
        data2 = list()
        for i in data:
            if (i > 153000):
                data2.append(0)
            else:
                data2.append(1)
        return np.array(data2)

    classes = dataset[:,0]
    classes = classes.reshape((dataset.shape[0],1))
    classes = InOutBudget(classes)

    data = dataset[:20,:]
    data = normalizeToZscores(data)
    p1s = dataset[20:400,:]

    def measureDis(data, p1):
        listD = []
        for x in data:
            D = EuclideanDistance(x, p1)
            listD.append(D)
        return listD

    def most_common(lst):
        '''Finds the most frequently occuring element of a list.
        It will be used to predict a class based on the classification
        of the k-nearest neighbours'''
        return max(set(lst), key=lst.count)

    def findKnn(k):
        '''K nearest neighbours algorithm'''
        knns = list()
        errors = list()
        #for i in k:
        for p1 in p1s:
            # Create a list of tuples containing distance and class,
            # then sort them by shortest distance
            tuples = zip(measureDis(data, p1), classes[20:400])
            tuples = sorted(tuples)
            knn = tuples[:k]
            print knn
            knn = [x[1] for x in knn]
            knn = most_common(knn)
            knns = knns.append(knn)
            print knn
            error = np.abs(knn - p1)
            errors = errors.append(error)
        errorsNum = np.sum(errors)
        return knns

But I keep getting:

    Traceback (most recent call last):
      File "C:\Users\Toshiba\workspace\assignment5\src\knn2.py", line 76, in <module>
        knn = findKnn(k)
      File "C:\Users\Toshiba\workspace\assignment5\src\knn2.py", line 64, in findKnn
        knns = knns.append(knn)
    AttributeError: 'NoneType' object has no attribute 'append'

I know the code is really amateur, but could someone please help me solve the issue?
list.append doesn't return the list. Simply do:

    knns.append(knn)

instead of:

    knns = knns.append(knn)
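A minimal illustration of why the assignment breaks on the second loop iteration:

    knns = []
    result = knns.append(1)
    print(result)  # None -- append mutates the list in place and returns None
    print(knns)    # [1]
    # After `knns = knns.append(...)`, knns is None, so the next call to
    # knns.append(...) raises the AttributeError seen in the traceback.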
append does not return the list; it returns None, so you are clobbering the list after the first loop iteration.