Implementing a simple Gaussian Naive Bayes algorithm in Python
I'm a real amateur trying to implement a sort of 'simplified' version of the Naive Bayes algorithm in Python, and I'm having a lot of trouble [perhaps because I'm not sure I completely understand how the algorithm works]. I would appreciate any help or suggestions. This is the code I have:
    class GaussianNB(object):

        def __init__(self):
            '''
            Constructor
            '''
            # This variable will hold the gaussian distribution over your data
            # In fact, you need a distribution per class for each feature variable.
            # This can be done as a list of lists.
            self.classmodels_count = {}
            self.classmodels = {}
            self.classmodelsMeanAndVariance = {}
            self.featureTokenCount = 0
            self.featureTypeCount = 0

        def train(self, trainingdata):
            for i in trainingdata:
                current_class = i[0]
                features = i[1]
                if self.classmodels.has_key(current_class):
                    current_class_model = self.classmodels[current_class]
                    self.classmodels_count[current_class] = self.classmodels_count[current_class] + 1
                else:
                    current_class_model = {}
                    self.classmodels_count[current_class] = 1
                for f in features:
                    feature = f[0]
                    value = f[1]
                    if current_class_model.has_key(feature):
                        list_of_values = current_class_model[feature]
                        list_of_values.append(value)
                        current_class_model[feature] = list_of_values
                    else:
                        list_of_values = []
                        list_of_values.append(value)
                        current_class_model[feature] = list_of_values
                self.classmodels[current_class] = current_class_model
            for a_class in self.classmodels.keys():
                a_class_model = self.classmodels[a_class]
                a_class_model_mean_and_variance = {}
                for feature in a_class_model.keys():
                    a_class_model_mean_and_variance[feature] = findMeanSD(np.array(a_class_model[feature]))
                self.classmodelsMeanAndVariance[a_class] = a_class_model_mean_and_variance

        def classify(self, testing_vecs):
            outputs = []
            for vec in testing_vecs:
                features = vec[1]
                class_model_output_prob = {}
                for a_class in self.classmodelsMeanAndVariance.keys():
                    a_class_output_prob = 0.0
                    a_class_model_mean_and_variance = self.classmodelsMeanAndVariance[a_class]
                    for feature_value in features:
                        feature = feature_value[0]
                        value = feature_value[1]
                        # simply ignore a feature if it's not seen in training
                        if a_class_model_mean_and_variance.has_key(feature):
                            feature_mean = a_class_model_mean_and_variance[feature][0]
                            feature_std = a_class_model_mean_and_variance[feature][1]
                            a_class_output_prob = a_class_output_prob + math.log10(norm(value, feature_mean, feature_std))
                    # ignoring P(class) prior, i.e. assuming equal priors
                    class_model_output_prob[a_class_output_prob] = a_class
                probs = class_model_output_prob.keys()
                print probs
                probs.sort()
                max_prob = probs[len(probs) - 1]
                max_class = class_model_output_prob[max_prob]
                outputs.append(max_class)
            return outputs
When I run it on some data, the error I get is:
    Traceback (most recent call last):
      File "C:\Users\Toshiba\workspace\Assignment6\src\gnb_test.py", line 34, in <module>
        gaussian = Model.train(testData)
      File "C:\Users\Toshiba\workspace\Assignment6\src\gnb.py", line 91, in train
        for f in features:
    TypeError: 'numpy.float64' object is not iterable
And I don't really understand what it means at all.
Your traceback suggests that the problem is that you are trying to iterate through features, but features is a float rather than a list or tuple: it can't be broken into individual elements. I think it is a float because the lines
    for i in trainingdata:
        current_class = i[0]
        features = i[1]
suggest that features keeps getting overwritten with one number after another, when what you seem to want is to collect the numbers into an iterable type. Try:
    features = []
    for i in trainingdata:
        current_class = i[0]
        features.append(i[1])
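For reference, here is a hypothetical toy dataset in the shape that train() as written can consume: each element is (class_label, [(feature, value), ...]), so i[1] is itself iterable. The names and numbers below are made up for illustration, and this assumes the findMeanSD helper called inside train() is defined elsewhere in the assignment:

    # Hypothetical toy data: i[0] is the class label, i[1] is a list of
    # (feature_name, numeric_value) pairs, so `for f in features` works.
    training_data = [
        ('spam', [('length', 120.0), ('caps_ratio', 0.35)]),
        ('ham',  [('length',  80.0), ('caps_ratio', 0.05)]),
        ('spam', [('length', 150.0), ('caps_ratio', 0.50)]),
    ]

    model = GaussianNB()
    model.train(training_data)  # no 'numpy.float64' is not iterable error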
Related
Getting specific values from ASCII table
I'm currently creating a genetic algorithm and am trying to only get certain values from the ASCII table so the runtime of the algorithm can be a bit faster. In the code below I get the values between 9 and 127, but I only need the values 9-10 and 32-127 from the ASCII table, and I'm not sure how to get only those specific values. The code below is done in Python.

    import numpy as np

    TARGET_PHRASE = """The smartest and fastest Pixel yet. Google Tensor: Our first custom-built processor. The first processor designed by Google and made for Pixel, Tensor makes the new Pixel phones our most powerful yet. The most advanced Pixel Camera ever. Capture brilliant color and vivid detail with Pixels best-in-class computational photography and new pro-level lenses."""  # target DNA
    POP_SIZE = 4000           # population size
    CROSS_RATE = 0.8          # mating probability (DNA crossover)
    MUTATION_RATE = 0.00001   # mutation probability
    N_GENERATIONS = 100000

    DNA_SIZE = len(TARGET_PHRASE)
    TARGET_ASCII = np.fromstring(TARGET_PHRASE, dtype=np.uint8)  # convert string to number
    ASCII_BOUND = [9, 127]


    class GA(object):
        def __init__(self, DNA_size, DNA_bound, cross_rate, mutation_rate, pop_size):
            self.DNA_size = DNA_size
            DNA_bound[1] += 1
            self.DNA_bound = DNA_bound
            self.cross_rate = cross_rate
            self.mutate_rate = mutation_rate
            self.pop_size = pop_size

            self.pop = np.random.randint(*DNA_bound, size=(pop_size, DNA_size)).astype(np.int8)  # int8 for convert to ASCII

        def translateDNA(self, DNA):  # convert to readable string
            return DNA.tostring().decode('ascii')

        def get_fitness(self):  # count how many character matches
            match_count = (self.pop == TARGET_ASCII).sum(axis=1)
            return match_count

        def select(self):
            fitness = self.get_fitness()  # add a small amount to avoid all zero fitness
            idx = np.random.choice(np.arange(self.pop_size), size=self.pop_size, replace=True,
                                   p=fitness/fitness.sum())
            return self.pop[idx]

        def crossover(self, parent, pop):
            if np.random.rand() < self.cross_rate:
                i_ = np.random.randint(0, self.pop_size, size=1)  # select another individual from pop
                cross_points = np.random.randint(0, 2, self.DNA_size).astype(np.bool)  # choose crossover points
                parent[cross_points] = pop[i_, cross_points]  # mating and produce one child
            return parent

        def mutate(self, child):
            for point in range(self.DNA_size):
                if np.random.rand() < self.mutate_rate:
                    child[point] = np.random.randint(*self.DNA_bound)  # choose a random ASCII index
            return child

        def evolve(self):
            pop = self.select()
            pop_copy = pop.copy()
            for parent in pop:  # for every parent
                child = self.crossover(parent, pop_copy)
                child = self.mutate(child)
                parent[:] = child
            self.pop = pop


    if __name__ == '__main__':
        ga = GA(DNA_size=DNA_SIZE, DNA_bound=ASCII_BOUND, cross_rate=CROSS_RATE,
                mutation_rate=MUTATION_RATE, pop_size=POP_SIZE)

        for generation in range(N_GENERATIONS):
            fitness = ga.get_fitness()
            best_DNA = ga.pop[np.argmax(fitness)]
            best_phrase = ga.translateDNA(best_DNA)
            print('Gen', generation, ': ', best_phrase)
            if best_phrase == TARGET_PHRASE:
                break
            ga.evolve()
You need a custom method to generate random samples in the ranges 9-10 and 32-127, like:

    def my_rand(pop_size, DNA_size):
        bold1 = [9, 10]
        bold2 = list(range(32, 128))  # 32-127 inclusive
        bold = bold1 + bold2
        pop = np.random.choice(bold, (pop_size, DNA_size)).astype(np.int8)
        return pop

Then call this method in place of the population initialization in __init__. Delete:

    self.pop = np.random.randint(*DNA_bound, size=(pop_size, DNA_size)).astype(np.int8)  # int8 for convert to ASCII

and call instead:

    self.pop = my_rand(pop_size, DNA_size)
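A quick sanity check of this sketch (assuming numpy is imported as np, as in the question):

    # Every sampled gene should be 9, 10, or in 32..127:
    pop = my_rand(pop_size=4, DNA_size=10)
    allowed = set([9, 10]) | set(range(32, 128))
    assert pop.shape == (4, 10)
    assert all(int(g) in allowed for g in pop.ravel())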
How to get a prediction value using a trained model in Keras?
I want to use the learned model to get a new prediction, so I made a function for what I want to get. But it doesn't work: when I use this function, it always returns the same values. I think there's a problem with the encoding procedure, but I don't know how to solve it. I changed a lot of things I was suspicious of, but it still doesn't work. Please look at this code and tell me what the problem is. Thank you.

    def determineRank(t, n, bid_t, w, h, k):
        # t = str(input())      # time
        # n = int(input())      # now ranking
        # bid_t = int(input())  # bid amount
        # w = int(input())      # weekday
        # h = int(input())      # holiday
        # k = str(input())      # keyword
        encode = LabelEncoder()
        # x = np.concatenate((t,n,bid_t,w,h,k), axis=1).reshape(1,6,1)
        t = categorize_time(t)
        k = encode.fit_transform([k])
        new_list = []
        new_list = [t, n, bid_t, w, h, k]
        """
        new_list = new_list.append(t)
        new_list = new_list.append(n)
        new_list = new_list.append(bid_t)
        new_list = new_list.append(w)
        new_list = new_list.append(h)
        new_list = new_list.append(k)
        k = encode.fit_transform(k)
        """
        new_list = np.array(new_list)
        new_list = new_list.reshape(1, 6, 1)
        model = load_model('03-0.728448.hdf5')
        rank = model.predict(new_list)
        return rank[0]
You are right, there is a problem with the encoding procedure. Every time you call LabelEncoder(), you create a new encoder, so any value passed to the new encoder gets the same encoded result. To predict from encoder-based values, you should use the same encoder that was used for model fitting.

Fit:

    ...
    encode = LabelEncoder()                             # create new encoder
    k = encode.fit_transform([k])                       # fit encoder and transform value
    save_encoder(encode, 'some_name_for_encoder.hdf5')  # save fitted encoder for further usage
    ...

Predict:

    ...
    encode = load_encoder('some_name_for_encoder.hdf5')  # load fitted encoder
    k = encode.transform([k])                            # transform value
    ...
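Note that save_encoder and load_encoder above are placeholder helpers, not library functions. A minimal way to implement them, assuming sklearn's LabelEncoder and the joblib library are available (the keyword values are hypothetical):

    import joblib
    from sklearn.preprocessing import LabelEncoder

    def save_encoder(encoder, path):
        # Persist the fitted encoder to disk (any pickling mechanism works)
        joblib.dump(encoder, path)

    def load_encoder(path):
        return joblib.load(path)

    # Fit once on all training keywords, save, then reuse at prediction time:
    encode = LabelEncoder()
    encode.fit(['keyword_a', 'keyword_b', 'keyword_c'])
    save_encoder(encode, 'keyword_encoder.joblib')

    encode = load_encoder('keyword_encoder.joblib')
    k = encode.transform(['keyword_b'])  # same encoding as at training time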
Numpy Array Manipulation Based on Internal Values
I am trying to accomplish a weird task. I need to complete the following without the use of sklearn, and preferably with numpy:

Given a dataset, split the data into 5 equal "folds", or partitions. Within each partition, split the data into a "training" and "testing" set, with an 80/20 split.

Here is the catch: the dataset is labeled with classes. So take for example a dataset with 100 instances, with class A having 33 samples and class B having 67 samples. I should create 5 folds of 20 data instances each, where in each fold class A has something like 6 or 7 (one third) of the values and class B has the rest.

My issues are that: I do not know how to properly return a test and training set for each fold, despite being able to split the data appropriately, and, more importantly, I do not know how to incorporate the proper division of the number of elements per class.

My current code is here; it is commented where I am stuck:

    import numpy

    def csv_to_array(file):
        # Open the file, and load it delimiting on the ',' for a comma separated value file
        data = open(file, 'r')
        data = numpy.loadtxt(data, delimiter=',')
        # Loop through the data in the array
        for index in range(len(data)):
            # Utilize a try catch to try and convert to float;
            # if it can't convert to float, convert to 0
            try:
                data[index] = [float(x) for x in data[index]]
            except Exception:
                data[index] = 0
            except ValueError:
                data[index] = 0
        # Return the now type-formatted data
        return data

    def five_cross_fold_validation(dataset):
        # print("DATASET", dataset)
        numpy.random.shuffle(dataset)
        num_rows = dataset.shape[0]
        split_mark = int(num_rows / 5)
        folds = []
        temp1 = dataset[:split_mark]
        temp2 = dataset[split_mark:split_mark*2]
        temp3 = dataset[split_mark*2:split_mark*3]
        temp4 = dataset[split_mark*3:split_mark*4]
        temp5 = dataset[split_mark*4:]
        folds.append(temp1)
        folds.append(temp2)
        folds.append(temp3)
        folds.append(temp4)
        folds.append(temp5)
        # folds = numpy.asarray(folds)

        for fold in folds:
            # fold = numpy.asarray(fold)
            num_rows = fold.shape[0]
            split_mark = int(num_rows * .8)
            fold_training = fold[split_mark:]
            fold_testing = fold[:split_mark]
            print(type(fold))
            # fold.tolist()
            list(fold)
            print(type(fold))
            del fold[0:len(fold)]
            fold.append(fold_training)
            fold.append(fold_testing)
            fold = numpy.asarray(fold)
            # Somehow, return a testing and training set within each fold

        # print(folds)
        return folds

    def confirm_size(folds):
        total = 0
        for fold in folds:
            curr = len(fold)
            total = total + curr
        return total

    def main():
        print("BEGINNING CFV")
        ecoli = csv_to_array('Classification/ecoli.csv')
        print(len(ecoli))
        folds = five_cross_fold_validation(ecoli)
        size = confirm_size(folds)
        print(size)

    main()

Additionally, for reference, I have attached the csv I am working with (it is a modification of the UCI Ecoli Dataset). The classes here are the values in the last column: 0, 1, 2, 3, 4. It is important to note that there are not equal amounts of each class.
    0.61,0.45,0.48,0.5,0.48,0.35,0.41,0
    0.17,0.38,0.48,0.5,0.45,0.42,0.5,0
    0.44,0.35,0.48,0.5,0.55,0.55,0.61,0
    0.43,0.4,0.48,0.5,0.39,0.28,0.39,0
    0.42,0.35,0.48,0.5,0.58,0.15,0.27,0
    0.23,0.33,0.48,0.5,0.43,0.33,0.43,0
    0.37,0.52,0.48,0.5,0.42,0.42,0.36,0
    0.29,0.3,0.48,0.5,0.45,0.03,0.17,0
    0.22,0.36,0.48,0.5,0.35,0.39,0.47,0
    0.23,0.58,0.48,0.5,0.37,0.53,0.59,0
    0.47,0.47,0.48,0.5,0.22,0.16,0.26,0
    0.54,0.47,0.48,0.5,0.28,0.33,0.42,0
    0.51,0.37,0.48,0.5,0.35,0.36,0.45,0
    0.4,0.35,0.48,0.5,0.45,0.33,0.42,0
    0.44,0.34,0.48,0.5,0.3,0.33,0.43,0
    0.44,0.49,0.48,0.5,0.39,0.38,0.4,0
    0.43,0.32,0.48,0.5,0.33,0.45,0.52,0
    0.49,0.43,0.48,0.5,0.49,0.3,0.4,0
    0.47,0.28,0.48,0.5,0.56,0.2,0.25,0
    0.32,0.33,0.48,0.5,0.6,0.06,0.2,0
    0.34,0.35,0.48,0.5,0.51,0.49,0.56,0
    0.35,0.34,0.48,0.5,0.46,0.3,0.27,0
    0.38,0.3,0.48,0.5,0.43,0.29,0.39,0
    0.38,0.44,0.48,0.5,0.43,0.2,0.31,0
    0.41,0.51,0.48,0.5,0.58,0.2,0.31,0
    0.34,0.42,0.48,0.5,0.41,0.34,0.43,0
    0.51,0.49,0.48,0.5,0.53,0.14,0.26,0
    0.25,0.51,0.48,0.5,0.37,0.42,0.5,0
    0.29,0.28,0.48,0.5,0.5,0.42,0.5,0
    0.25,0.26,0.48,0.5,0.39,0.32,0.42,0
    0.24,0.41,0.48,0.5,0.49,0.23,0.34,0
    0.17,0.39,0.48,0.5,0.53,0.3,0.39,0
    0.04,0.31,0.48,0.5,0.41,0.29,0.39,0
    0.61,0.36,0.48,0.5,0.49,0.35,0.44,0
    0.34,0.51,0.48,0.5,0.44,0.37,0.46,0
    0.28,0.33,0.48,0.5,0.45,0.22,0.33,0
    0.4,0.46,0.48,0.5,0.42,0.35,0.44,0
    0.23,0.34,0.48,0.5,0.43,0.26,0.37,0
    0.37,0.44,0.48,0.5,0.42,0.39,0.47,0
    0,0.38,0.48,0.5,0.42,0.48,0.55,0
    0.39,0.31,0.48,0.5,0.38,0.34,0.43,0
    0.3,0.44,0.48,0.5,0.49,0.22,0.33,0
    0.27,0.3,0.48,0.5,0.71,0.28,0.39,0
    0.17,0.52,0.48,0.5,0.49,0.37,0.46,0
    0.36,0.42,0.48,0.5,0.53,0.32,0.41,0
    0.3,0.37,0.48,0.5,0.43,0.18,0.3,0
    0.26,0.4,0.48,0.5,0.36,0.26,0.37,0
    0.4,0.41,0.48,0.5,0.55,0.22,0.33,0
    0.22,0.34,0.48,0.5,0.42,0.29,0.39,0
    0.44,0.35,0.48,0.5,0.44,0.52,0.59,0
    0.27,0.42,0.48,0.5,0.37,0.38,0.43,0
    0.16,0.43,0.48,0.5,0.54,0.27,0.37,0
    0.06,0.61,0.48,0.5,0.49,0.92,0.37,1
    0.44,0.52,0.48,0.5,0.43,0.47,0.54,1
    0.63,0.47,0.48,0.5,0.51,0.82,0.84,1
    0.23,0.48,0.48,0.5,0.59,0.88,0.89,1
    0.34,0.49,0.48,0.5,0.58,0.85,0.8,1
    0.43,0.4,0.48,0.5,0.58,0.75,0.78,1
    0.46,0.61,0.48,0.5,0.48,0.86,0.87,1
    0.27,0.35,0.48,0.5,0.51,0.77,0.79,1
Edit: I replaced np.random.shuffle(A) with A = np.random.permutation(A); the only difference is that it doesn't mutate the input array. This doesn't make any difference in this code, but it is safer in general.

The idea is to randomly sample the input using numpy.random.permutation. Once the rows are shuffled, we just need to iterate over all the possible test sets (a sliding window of the desired size, here 20% of the input size). The corresponding training sets are just composed of all the remaining elements. Because we shuffled the input, this will approximately preserve the original class distribution on all subsets even though we pick them in order.

The following code iterates over the test/train set combinations:

    import numpy as np

    def csv_to_array(file):
        with open(file, 'r') as f:
            data = np.loadtxt(f, delimiter=',')
        return data

    def classes_distribution(A):
        """Print the class distributions of array A."""
        nb_classes = np.unique(A[:,-1]).shape[0]
        total_size = A.shape[0]
        for i in range(nb_classes):
            class_size = sum(row[-1] == i for row in A)
            class_p = class_size/total_size
            print(f"\t P(class_{i}) = {class_p:.3f}")

    def random_samples(A, test_set_p=0.2):
        """Split the input array A in two uniformly chosen random sets:
        test/training. Repeat this until all rows have been yielded at
        least once as part of a test set."""
        A = np.random.permutation(A)
        sample_size = int(test_set_p*A.shape[0])
        for start in range(0, A.shape[0], sample_size):
            end = start + sample_size
            yield {
                "test": A[start:end,],
                "train": np.append(A[:start,], A[end:,], 0)
            }

    def main():
        ecoli = csv_to_array('ecoli.csv')
        print("Input set shape: ", ecoli.shape)
        print("Input set class distribution:")
        classes_distribution(ecoli)
        print("Training sets class distributions:")
        for iteration in random_samples(ecoli):
            test_set = iteration["test"]
            training_set = iteration["train"]
            classes_distribution(training_set)
            print("---")
            # ... do whatever with these two sets

    main()

It produces output of the form:

    Input set shape:  (169, 8)
    Input set class distribution:
         P(class_0) = 0.308
         P(class_1) = 0.213
         P(class_2) = 0.207
         P(class_3) = 0.118
         P(class_4) = 0.154
    Training sets class distributions:
         P(class_0) = 0.316
         P(class_1) = 0.206
         P(class_2) = 0.199
         P(class_3) = 0.118
         P(class_4) = 0.162
    ...
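Shuffling only preserves the class proportions in expectation. If each fold must match the class proportions more exactly (the one-third/two-thirds split described in the question), one possible sketch is to permute and split each class separately, then stack the pieces. stratified_folds here is a name I am introducing for illustration, not a library function:

    import numpy as np

    def stratified_folds(A, n_folds=5):
        """Split the rows of A (class label in the last column) into n_folds
        arrays, each approximately preserving the class proportions of A."""
        folds = [[] for _ in range(n_folds)]
        for c in np.unique(A[:, -1]):
            # shuffle the rows of this class, then deal them out across folds
            rows = np.random.permutation(A[A[:, -1] == c])
            for i, chunk in enumerate(np.array_split(rows, n_folds)):
                folds[i].append(chunk)
        return [np.vstack(f) for f in folds]

Each returned fold can then be split 80/20 into training and testing sets as in the answer above.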
TypeError for predict_proba(np.array(test))
    model = LogisticRegression()
    model = model.fit(X, y)
    test_data = [1,2,3,4,5,6,7,8,9,10,11,12,13]
    test_prediction = model.predict_proba(np.array(test_data))

    max = -1.0
    res = 0
    for i in range(test_prediction):
        if test_prediction[i] > max:
            max = test_prediction[i]
            res = i
    if res == 0:
        print('A')
    elif res == 1:
        print('B')
    else:
        print('C')

Using the above Python code I have to predict the probabilities of the 3 possible results (A, B, C). The probabilities are saved in test_prediction and can be printed as:

Output:

    [[ 0.82882588  0.08641236  0.08476175]]

But the remaining part gives an error:

    for i in range(test_prediction):
    TypeError: only integer scalar arrays can be converted to a scalar index

I want to find the max probability and then display the event that is most likely to occur (A/B/C). How do I go about this?
You can also use numpy.argmax, which will directly give you the index of the largest value.

    import numpy as np

    # test_prediction is most probably an np array already
    pred = np.array(test_prediction)
    classes_val = np.argmax(pred, axis=1)
    for res in classes_val:
        if res == 0:
            print('A')
        elif res == 1:
            print('B')
        else:
            print('C')
The problem is in passing an array to range. In this case you should use the length of the array:

    range(len(test_prediction))

You can also simplify your code:

    import operator
    # ...
    enum_predict = enumerate(test_prediction)
    res = max(enum_predict, key=operator.itemgetter(1))[0]

enumerate converts the array to a sequence of tuples (index, item), and key=operator.itemgetter(1) makes max compare the tuples by their second value.
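For example, applied to the row of probabilities from the question (note that predict_proba returns a 2-D array, so we take row 0 first):

    import operator

    test_prediction = [[0.82882588, 0.08641236, 0.08476175]]
    res = max(enumerate(test_prediction[0]), key=operator.itemgetter(1))[0]
    print(res)  # 0, i.e. 'A'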
You can do something like this:

    predict_prob_df = pd.DataFrame(model.predict_proba(test_data))
    max_prob = predict_prob_df.apply(max, axis=1)
    predicted_output = pd.DataFrame(model.predict(test_data))

Then you can concat them:

    final_frame = pd.concat([max_prob, predicted_output], axis=1)

This way you do not need to use the for loop, which was causing the error.
I came up with another solution:

    for i in range(3):
        if np.take(test_prediction, i) > max:
            max = np.take(test_prediction, i)
            res = i
    if res == 0:
        .....

This works by accessing the values in test_prediction by index using np.take. But the solution specified by @Vivek_Kumar seems more correct and efficient.
k-nearest neighbours algorithm in Python
This is my code for the k-nearest neighbour algorithm:

    import numpy as np
    from EuclideanDistance import EuclideanDistance

    dataset = np.loadtxt('C:\Users\Toshiba\Documents\machine learning\RealEstate.csv', delimiter=',', usecols=(2,3,4,5))
    p1 = ()

    def normalizeToZscores(data):
        '''Normalizes the variables to z-scores'''
        zScores = list()
        for s in data:
            zScore = (s - np.mean(data))/np.std(data)
            zScores.append(zScore)
        return np.asarray(zScores)

    def InOutBudget(data):
        '''Decides whether a particular house is within or outside
        the budget of $153000 and assigns values of 1 and 0 respectively'''
        data2 = list()
        for i in data:
            if (i > 153000):
                data2.append(0)
            else:
                data2.append(1)
        return np.array(data2)

    classes = dataset[:,0]
    classes = classes.reshape((dataset.shape[0],1))
    classes = InOutBudget(classes)

    data = dataset[:20,:]
    data = normalizeToZscores(data)
    p1s = dataset[20:400,:]

    def measureDis(data, p1):
        listD = []
        for x in data:
            D = EuclideanDistance(x, p1)
            listD.append(D)
        return listD

    def most_common(lst):
        '''Finds the most frequently occuring element of a list.
        It will be used to predict a class based on the classification
        of the k-nearest neighbours'''
        return max(set(lst), key=lst.count)

    def findKnn(k):
        '''K nearest neighbours algorithm'''
        knns = list()
        errors = list()
        #for i in k:
        for p1 in p1s:
            # Create a list of tuples containing distance and class,
            # then sort them by shortest distance
            tuples = zip(measureDis(data, p1), classes[20:400])
            tuples = sorted(tuples)
            knn = tuples[:k]
            print knn
            knn = [x[1] for x in knn]
            knn = most_common(knn)
            knns = knns.append(knn)
            print knn
            error = np.abs(knn - p1)
            errors = errors.append(error)
        errorsNum = np.sum(errors)
        return knns

But I keep getting:

    Traceback (most recent call last):
      File "C:\Users\Toshiba\workspace\assignment5\src\knn2.py", line 76, in <module>
        knn = findKnn(k)
      File "C:\Users\Toshiba\workspace\assignment5\src\knn2.py", line 64, in findKnn
        knns = knns.append(knn)
    AttributeError: 'NoneType' object has no attribute 'append'

I know the code is really amateur, but could someone please help me solve the issue?
list.append doesn't return the list. Simply do:

    knns.append(knn)

instead of:

    knns = knns.append(knn)
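A minimal illustration of why the assignment breaks on the second loop iteration:

    knns = []
    result = knns.append(1)
    print(result)  # None -- append mutates the list in place and returns None
    print(knns)    # [1]
    # After `knns = knns.append(...)`, knns is None, so the next call to
    # knns.append(...) raises the AttributeError seen in the traceback.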
append does not return the list; it returns None, so you are clobbering the list after the first loop iteration.