NumPy Array Manipulation Based on Internal Values - Python

I am trying to accomplish a weird task.
I need to complete the following without the use of sklearn, and preferably with numpy:
Given a dataset, split the data into 5 equal "folds", or partitions.
Within each partition, split the data into a "training" and a "testing" set, with an 80/20 split.
Here is the catch: the dataset is labeled with classes. Take, for example, a dataset with 100 instances, where class A has 33 samples and class B has 67. I should create 5 folds of 20 data instances, where in each fold class A has 6 or 7 (roughly 1/3) of the values and class B has the rest.
My issue is twofold: I do not know how to properly return a test and a training set for each fold, despite being able to split the data appropriately; and, more importantly, I do not know how to incorporate the proper division of the number of elements per class.
My current code is here. It is commented where I am stuck:
import numpy

def csv_to_array(file):
    # Open the file and load it, delimiting on ',' for a comma-separated-value file
    data = open(file, 'r')
    data = numpy.loadtxt(data, delimiter=',')
    # Loop through the data in the array
    for index in range(len(data)):
        # Try to convert each row to float; if a value can't be converted, zero the row
        try:
            data[index] = [float(x) for x in data[index]]
        except ValueError:
            data[index] = 0
    # Return the now type-formatted data
    return data
def five_cross_fold_validation(dataset):
    # print("DATASET", dataset)
    numpy.random.shuffle(dataset)
    num_rows = dataset.shape[0]
    split_mark = int(num_rows / 5)
    folds = []
    temp1 = dataset[:split_mark]
    # print("TEMP1", temp1)
    temp2 = dataset[split_mark:split_mark * 2]
    # print("TEMP2", temp2)
    temp3 = dataset[split_mark * 2:split_mark * 3]
    # print("TEMP3", temp3)
    temp4 = dataset[split_mark * 3:split_mark * 4]
    # print("TEMP4", temp4)
    temp5 = dataset[split_mark * 4:]
    # print("TEMP5", temp5)
    folds.append(temp1)
    folds.append(temp2)
    folds.append(temp3)
    folds.append(temp4)
    folds.append(temp5)
    # folds = numpy.asarray(folds)
    for fold in folds:
        # fold = numpy.asarray(fold)
        num_rows = fold.shape[0]
        split_mark = int(num_rows * .8)
        fold_training = fold[split_mark:]
        fold_testing = fold[:split_mark]
        print(type(fold))
        # fold.tolist()
        list(fold)
        print(type(fold))
        del fold[0:len(fold)]
        fold.append(fold_training)
        fold.append(fold_testing)
        fold = numpy.asarray(fold)
        # Somehow, return a testing and training set within each fold
    # print(folds)
    return folds
def confirm_size(folds):
    total = 0
    for fold in folds:
        curr = len(fold)
        total = total + curr
    return total

def main():
    print("BEGINNING CFV")
    ecoli = csv_to_array('Classification/ecoli.csv')
    print(len(ecoli))
    folds = five_cross_fold_validation(ecoli)
    size = confirm_size(folds)
    print(size)

main()
Additionally, for reference, I have attached the CSV I am working with (it is a modification of the UCI Ecoli dataset). The classes are the values in the last column: 0, 1, 2, 3, 4. It is important to note that there are not equal amounts of each class.
0.61,0.45,0.48,0.5,0.48,0.35,0.41,0
0.17,0.38,0.48,0.5,0.45,0.42,0.5,0
0.44,0.35,0.48,0.5,0.55,0.55,0.61,0
0.43,0.4,0.48,0.5,0.39,0.28,0.39,0
0.42,0.35,0.48,0.5,0.58,0.15,0.27,0
0.23,0.33,0.48,0.5,0.43,0.33,0.43,0
0.37,0.52,0.48,0.5,0.42,0.42,0.36,0
0.29,0.3,0.48,0.5,0.45,0.03,0.17,0
0.22,0.36,0.48,0.5,0.35,0.39,0.47,0
0.23,0.58,0.48,0.5,0.37,0.53,0.59,0
0.47,0.47,0.48,0.5,0.22,0.16,0.26,0
0.54,0.47,0.48,0.5,0.28,0.33,0.42,0
0.51,0.37,0.48,0.5,0.35,0.36,0.45,0
0.4,0.35,0.48,0.5,0.45,0.33,0.42,0
0.44,0.34,0.48,0.5,0.3,0.33,0.43,0
0.44,0.49,0.48,0.5,0.39,0.38,0.4,0
0.43,0.32,0.48,0.5,0.33,0.45,0.52,0
0.49,0.43,0.48,0.5,0.49,0.3,0.4,0
0.47,0.28,0.48,0.5,0.56,0.2,0.25,0
0.32,0.33,0.48,0.5,0.6,0.06,0.2,0
0.34,0.35,0.48,0.5,0.51,0.49,0.56,0
0.35,0.34,0.48,0.5,0.46,0.3,0.27,0
0.38,0.3,0.48,0.5,0.43,0.29,0.39,0
0.38,0.44,0.48,0.5,0.43,0.2,0.31,0
0.41,0.51,0.48,0.5,0.58,0.2,0.31,0
0.34,0.42,0.48,0.5,0.41,0.34,0.43,0
0.51,0.49,0.48,0.5,0.53,0.14,0.26,0
0.25,0.51,0.48,0.5,0.37,0.42,0.5,0
0.29,0.28,0.48,0.5,0.5,0.42,0.5,0
0.25,0.26,0.48,0.5,0.39,0.32,0.42,0
0.24,0.41,0.48,0.5,0.49,0.23,0.34,0
0.17,0.39,0.48,0.5,0.53,0.3,0.39,0
0.04,0.31,0.48,0.5,0.41,0.29,0.39,0
0.61,0.36,0.48,0.5,0.49,0.35,0.44,0
0.34,0.51,0.48,0.5,0.44,0.37,0.46,0
0.28,0.33,0.48,0.5,0.45,0.22,0.33,0
0.4,0.46,0.48,0.5,0.42,0.35,0.44,0
0.23,0.34,0.48,0.5,0.43,0.26,0.37,0
0.37,0.44,0.48,0.5,0.42,0.39,0.47,0
0,0.38,0.48,0.5,0.42,0.48,0.55,0
0.39,0.31,0.48,0.5,0.38,0.34,0.43,0
0.3,0.44,0.48,0.5,0.49,0.22,0.33,0
0.27,0.3,0.48,0.5,0.71,0.28,0.39,0
0.17,0.52,0.48,0.5,0.49,0.37,0.46,0
0.36,0.42,0.48,0.5,0.53,0.32,0.41,0
0.3,0.37,0.48,0.5,0.43,0.18,0.3,0
0.26,0.4,0.48,0.5,0.36,0.26,0.37,0
0.4,0.41,0.48,0.5,0.55,0.22,0.33,0
0.22,0.34,0.48,0.5,0.42,0.29,0.39,0
0.44,0.35,0.48,0.5,0.44,0.52,0.59,0
0.27,0.42,0.48,0.5,0.37,0.38,0.43,0
0.16,0.43,0.48,0.5,0.54,0.27,0.37,0
0.06,0.61,0.48,0.5,0.49,0.92,0.37,1
0.44,0.52,0.48,0.5,0.43,0.47,0.54,1
0.63,0.47,0.48,0.5,0.51,0.82,0.84,1
0.23,0.48,0.48,0.5,0.59,0.88,0.89,1
0.34,0.49,0.48,0.5,0.58,0.85,0.8,1
0.43,0.4,0.48,0.5,0.58,0.75,0.78,1
0.46,0.61,0.48,0.5,0.48,0.86,0.87,1
0.27,0.35,0.48,0.5,0.51,0.77,0.79,1

Edit: I replaced np.random.shuffle(A) with A = np.random.permutation(A); the only difference is that permutation does not mutate the input array. That makes no difference in this code, but it is safer in general.
The idea is to randomly sample the input by using numpy.random.permutation. Once the rows are shuffled, we just need to iterate over all the possible test sets (a sliding window of the desired size, here 20% of the input size). The corresponding training set is simply composed of all the remaining elements.
Because the input was shuffled, this approximately preserves the original class distribution on all subsets even though we pick them in order.
The following code iterates over the test/train set combinations:
import numpy as np

def csv_to_array(file):
    with open(file, 'r') as f:
        data = np.loadtxt(f, delimiter=',')
    return data

def classes_distribution(A):
    """Print the class distribution of array A."""
    nb_classes = np.unique(A[:,-1]).shape[0]
    total_size = A.shape[0]
    for i in range(nb_classes):
        class_size = sum(row[-1] == i for row in A)
        class_p = class_size/total_size
        print(f"\t P(class_{i}) = {class_p:.3f}")

def random_samples(A, test_set_p=0.2):
    """Split the input array A in two uniformly chosen
    random sets: test/training.
    Repeat this until every row has been yielded at least
    once as part of a test set."""
    A = np.random.permutation(A)
    sample_size = int(test_set_p*A.shape[0])
    for start in range(0, A.shape[0], sample_size):
        end = start + sample_size
        yield {
            "test": A[start:end,],
            "train": np.append(A[:start,], A[end:,], 0)
        }

def main():
    ecoli = csv_to_array('ecoli.csv')
    print("Input set shape: ", ecoli.shape)
    print("Input set class distribution:")
    classes_distribution(ecoli)
    print("Training sets class distributions:")
    for iteration in random_samples(ecoli):
        test_set = iteration["test"]
        training_set = iteration["train"]
        classes_distribution(training_set)
        print("---")
        # ... Do whatever with these two sets

main()
It produces an output of the form:
Input set shape: (169, 8)
Input set class distribution:
    P(class_0) = 0.308
    P(class_1) = 0.213
    P(class_2) = 0.207
    P(class_3) = 0.118
    P(class_4) = 0.154
Training sets class distributions:
    P(class_0) = 0.316
    P(class_1) = 0.206
    P(class_2) = 0.199
    P(class_3) = 0.118
    P(class_4) = 0.162
...
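If the folds must reproduce the class proportions exactly rather than relying on the shuffle, a minimal stratified sketch (my addition, not part of the answer above, assuming the class label sits in the last column as in the data here) is to shuffle each class separately and deal its rows round-robin into the folds:

import numpy as np

def stratified_folds(A, k=5):
    """Split A (label in last column) into k folds with matching class proportions."""
    folds = [[] for _ in range(k)]
    for label in np.unique(A[:, -1]):
        # Shuffle this class's rows, then deal them out one fold at a time
        rows = np.random.permutation(A[A[:, -1] == label])
        for i, row in enumerate(rows):
            folds[i % k].append(row)
    return [np.array(f) for f in folds]

Each fold can then act once as the test set, with the remaining folds stacked via np.vstack as the training set.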

Related

Getting specific values from ASCII table

I'm currently creating a genetic algorithm and am trying to restrict it to certain values from the ASCII table so the runtime of the algorithm can be a bit faster. In the code below I get the values between 9 and 127, but I only need the values 9-10 and 32-127 from the ASCII table, and I'm not sure how to get exactly those specific values. The code below is written in Python.
import numpy as np

TARGET_PHRASE = """The smartest and fastest Pixel yet.
Google Tensor: Our first custom-built processor.
The first processor designed by Google and made for Pixel, Tensor makes the new Pixel phones our most powerful yet.
The most advanced Pixel Camera ever.
Capture brilliant color and vivid detail with Pixels best-in-class computational photography and new pro-level lenses."""  # target DNA
POP_SIZE = 4000          # population size
CROSS_RATE = 0.8         # mating probability (DNA crossover)
MUTATION_RATE = 0.00001  # mutation probability
N_GENERATIONS = 100000
DNA_SIZE = len(TARGET_PHRASE)
TARGET_ASCII = np.frombuffer(TARGET_PHRASE.encode('ascii'), dtype=np.uint8)  # convert string to numbers
ASCII_BOUND = [9, 127]

class GA(object):
    def __init__(self, DNA_size, DNA_bound, cross_rate, mutation_rate, pop_size):
        self.DNA_size = DNA_size
        DNA_bound[1] += 1
        self.DNA_bound = DNA_bound
        self.cross_rate = cross_rate
        self.mutate_rate = mutation_rate
        self.pop_size = pop_size
        self.pop = np.random.randint(*DNA_bound, size=(pop_size, DNA_size)).astype(np.int8)  # int8 for conversion to ASCII

    def translateDNA(self, DNA):  # convert to readable string
        return DNA.tobytes().decode('ascii')

    def get_fitness(self):  # count how many characters match
        match_count = (self.pop == TARGET_ASCII).sum(axis=1)
        return match_count

    def select(self):
        fitness = self.get_fitness()  # add a small amount to avoid all-zero fitness
        idx = np.random.choice(np.arange(self.pop_size), size=self.pop_size, replace=True, p=fitness/fitness.sum())
        return self.pop[idx]

    def crossover(self, parent, pop):
        if np.random.rand() < self.cross_rate:
            i_ = np.random.randint(0, self.pop_size, size=1)  # select another individual from pop
            cross_points = np.random.randint(0, 2, self.DNA_size).astype(bool)  # choose crossover points
            parent[cross_points] = pop[i_, cross_points]  # mating, producing one child
        return parent

    def mutate(self, child):
        for point in range(self.DNA_size):
            if np.random.rand() < self.mutate_rate:
                child[point] = np.random.randint(*self.DNA_bound)  # choose a random ASCII index
        return child

    def evolve(self):
        pop = self.select()
        pop_copy = pop.copy()
        for parent in pop:  # for every parent
            child = self.crossover(parent, pop_copy)
            child = self.mutate(child)
            parent[:] = child
        self.pop = pop

if __name__ == '__main__':
    ga = GA(DNA_size=DNA_SIZE, DNA_bound=ASCII_BOUND, cross_rate=CROSS_RATE,
            mutation_rate=MUTATION_RATE, pop_size=POP_SIZE)
    for generation in range(N_GENERATIONS):
        fitness = ga.get_fitness()
        best_DNA = ga.pop[np.argmax(fitness)]
        best_phrase = ga.translateDNA(best_DNA)
        print('Gen', generation, ': ', best_phrase)
        if best_phrase == TARGET_PHRASE:
            break
        ga.evolve()
You need a custom method to generate random samples in the ranges 9-10 and 32-127, like:

def my_rand(pop_size, DNA_size):
    bold1 = [9, 10]
    bold2 = list(range(32, 127))
    bold = bold1 + bold2
    pop = np.random.choice(bold, (pop_size, DNA_size)).astype(np.int8)
    return pop

Then call this method in place of the population-initialization line in __init__:

delete -- self.pop = np.random.randint(*DNA_bound, size=(pop_size, DNA_size)).astype(np.int8)
call   -- self.pop = my_rand(pop_size, DNA_size)
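As a side note (my variation, not from the answer above), numpy's np.r_ can build the same allowed-value array without the list concatenation:

def my_rand(pop_size, DNA_size):
    # np.r_ concatenates the ranges 9..10 and 32..126 into one array
    allowed = np.r_[9:11, 32:127]
    return np.random.choice(allowed, (pop_size, DNA_size)).astype(np.int8)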

How to split pandas dataframe by unique value

I am working on implementing an ID3 algorithm in Python. In order to get past the first step I need to calculate the information gain per column. The comments are self-explanatory.
The issue is in this block:
# ii) split the given data source based on the
# unique values in the attribute
print(f'split the given data source based on the')
print(f'unique values in the attribute')
df1 = training_set[training_set[columnName] >= k]
df2 = training_set[training_set[columnName] < k]
print("**********")
print("splitting ")
print(f'df1 {df1}')
print(f'df2 {df2}')
print("**********")
The dataframe is imported like so:

       0      1      2         3         4       5        6         7       8
0  Venue  color  Model  Category  Location  weight  Veriety  Material  Volume
1      2      6      4         4         4       2        2         1       1
The column names are coming back as numbers; they should be the string values of the headers.
The full program is shown below.
from numpy.core.defchararray import count
import pandas as pd
import numpy as np
from math import ceil, floor, log2
from sklearn.decomposition import PCA
from numpy import linalg as LA
from sklearn.tree import DecisionTreeClassifier

def calculate_metrics(tp, tn, fn, p, n, fp):
    # calculate the accuracy, error rate, sensitivity, specificity, and precision
    # for the selected classifier in reference to the corresponding test set.
    accuracy = (tp + tn) / (p + n)
    error_rate = (fp + fn) / (p + n)
    sensitivity = tp / p
    precision = tp / (tp + fp)
    specificity = tn / n
    display_metrics(accuracy, error_rate, sensitivity, precision, specificity)

def display_metrics(accuracy, error_rate, sensitivity, precision, specificity):
    print(f'Accuracy: {accuracy}, Error_rate:{error_rate}, Sensitivity:{sensitivity}, Precision:{precision}, specificity:{specificity}')

def ID3(threshold, g):
    # use the training set to predict the test set.
    # use the Assignment 2--Training set to extract rules and test the quality
    # of the extracted rules against the Assignment 2--Test set for ID3.
    test_set = pd.read_csv("Test set for ID3.csv", header=None)
    training_set = pd.read_csv("Training set for ID3.csv", header=None)
    print(f'test_set: {test_set}')
    print(f'training_set: {training_set}')
    # Step 1 - Calculate MC (Message Conveyed) for the given data set in reference to the class attribute
    print(f'Step 1- Calculate MC (Message Conveyed) for the given data set in reference to the class attribute')
    # MC = -p1*log2(p1) - p2*log2(p2)
    # For n classes MC = -p1*log2(p1) - p2*log2(p2) - ... - pn*log2(pn)
    # For each column calculate the gain.
    numberOfColumns = 0
    mcDictionary = {}
    print('***********************************')
    print('For each column calculate the gain.')
    for (columnName, columnData) in training_set.items():
        print(f'Column Name :{columnName}')
        print(f'Column Contents: {training_set[columnName]}')
        column = training_set[columnName]
        probs = column.value_counts(normalize=True)
        print(f'Probability {probs}')
        entropy = -1 * np.sum(np.log2(probs) * probs)
        print(f'Entropy {entropy}')
        mcDictionary.update({columnName: round(entropy)})
        numberOfColumns += 1
    print('***********************************')
    print(f'numberOfColumns {numberOfColumns}')
    print(f'mcDictionary {mcDictionary}')
    # The column with the highest gain is the root.
    print(f'The column with the highest gain is the root.')
    values = mcDictionary.values()
    max_value = max(values)
    print(f'The max value is {max_value}')
    columnNames = list(mcDictionary.keys())
    columnWithMaximumInformationGain = columnNames.index(max_value)
    print(f'The max value, {max_value}, is associated with column {columnWithMaximumInformationGain}')
    root = training_set[columnWithMaximumInformationGain]
    print(f'root {root}')
    # Loop
    # Step 2 - Repeat for every attribute
    print(f'Step 2 - Repeat for every attribute')
    for (columnName, columnData) in training_set.items():
        # i) use the attribute as a node from which k
        #    branches are emanating, where k is
        #    the number of unique values in the attribute
        attribute = columnName
        k = training_set[columnName].nunique()
        print(f'use the attribute {columnName} as a node from which {k}')
        print(f'{k} branches are emanating, where {k} is')
        print(f'the number of unique values in the attribute')
        # ii) split the given data source based on the
        #     unique values in the attribute
        print(f'split the given data source based on the')
        print(f'unique values in the attribute')
        df1 = training_set[training_set[columnName] >= k]
        df2 = training_set[training_set[columnName] < k]
        print("**********")
        print("splitting ")
        print(f'df1 {df1}')
        print(f'df2 {df2}')
        print("**********")
        # iii) calculate MC for new splits
        #      calculate MC for each attribute of Venue
        # iv) calculate the weight for each split
        #     start with venue
        # v) calculate the weighted MC (WMC) for the attribute
        #    WMC(venue) = W(1)*MC(1) + W(2)*MC(2)
        # vi) calculate Gain for the attribute [MC-WMC(venue)]
        #     Gain(venue) = MC-WMC(venue)
    # Step 3 - Repeat for each split produced by the root
    #          if all records have the same class then break.
    # Step 4 - If every split is free of a mixture of class values, then stop
    #          expansion of the tree
    # Step 5 - Extract rules in form of if-then-else from the tree
    # select the max value from the gain array
    # this is the new root
    # # leaf generated from the decision tree.
    # F1 = 0
    # # define c1 count of records w/ dominant class in F1
    # # How do I determine the number of records w/ dominant class in F1?
    # c1 = 0
    # # alpha = c1/ |F1|
    # # F1 is one of the unique values of a given attribute.
    # alpha = c1/ abs(F1)
    # # the number of records in the test set that are correctly classified by the rules extracted from the tree before removal.
    # # How do I determine the number of records in test set that are correctly classified by rules extracted from the tree before removal?
    # N = 0
    # # the number of records in the test set that are correctly classified by the rules extracted from the tree.
    # # How do I determine the number of records in the test set that are correctly classified by the rules extracted from the tree?
    # M = 0
    # # the parameter and 0 <= g <= 0.15
    # g = 0
    # if g < 0 or g > 0.15:
    #     exit()
    # # k is the total number of branches in the subtree
    # # How do I determine the total number of branches in the subtree?
    # k = 0
    # if alpha > threshold:
    #     # stop splitting tree
    # # How do we apply prepruning to the data?
    # # For post-pruning use the criteria below
    # if (N-M)/Q < g*k:
    #     # remove subtree
    # # true positive
    # tp = 0
    # # true negative
    # tn = 0
    # # positive
    # p = 0
    # # negative
    # n = 0
    # # false positive
    # fp = 0
    # calculate_metrics(tp, tn, p, n, fp)

def BayesClassifier():
    # use the assignment 2--training set for Bayes as the training set
    # to classify the records of the assignment 2 test set for Bayes
    test_set = pd.read_csv("Assignment 2--Test set for Bayes.csv")
    training_set = pd.read_csv("Assignment 2--Training set for Bayes.csv")

# prompt user to select either ID3 or Bayes classifier.
selection = "ID3"  # = input("Please enter your selection for either ID3 or Bayes classification: ")
threshold = 0.9    # = input("Please enter a threshold: ")
g = 0.5            # = input("Please enter a value for g: ")
if selection == "ID3":
    ID3(threshold, g)
if selection == "Bayes":
    BayesClassifier()
Expected:
**********
splitting
df1 {df1}
df2 {df2}
**********
Actual:
unique values in the attribute
Traceback (most recent call last):
File ".\assignment2.py", line 183, in <module>
ID3(threshold,g)
File ".\assignment2.py", line 86, in ID3
df1 = training_set[training_set[columnName] >= k]
File "C:\Users\physe\AppData\Roaming\Python\Python36\site-packages\pandas\core\ops\common.py", line 65, in new_method
return method(self, other)
File "C:\Users\physe\AppData\Roaming\Python\Python36\site-packages\pandas\core\ops\__init__.py", line 370, in wrapper
res_values = comparison_op(lvalues, rvalues, op)
File "C:\Users\physe\AppData\Roaming\Python\Python36\site-packages\pandas\core\ops\array_ops.py", line 244, in comparison_op
res_values = comp_method_OBJECT_ARRAY(op, lvalues, rvalues)
File "C:\Users\physe\AppData\Roaming\Python\Python36\site-packages\pandas\core\ops\array_ops.py", line 56, in comp_method_OBJECT_ARRAY
result = libops.scalar_compare(x.ravel(), y, op)
File "pandas\_libs\ops.pyx", line 103, in pandas._libs.ops.scalar_compare
TypeError: '>=' not supported between instances of 'str' and 'int'
How can I split the dataframe by the unique value?
The Test set for ID3.csv
Venue,color,Model,Category,Location,weight,Veriety,Material,Volume
1,6,4,4,4,1,1,1,6
2,5,4,4,4,2,6,1,1
1,6,2,1,4,1,4,2,4
1,6,2,1,4,1,2,1,2
2,6,5,5,5,2,2,1,2
1,5,4,4,4,1,6,2,2
1,3,3,3,3,1,6,2,2
1,5,2,1,1,1,2,1,2
1,4,4,4,1,1,5,3,6
1,4,4,4,4,1,6,4,6
2,5,4,4,4,2,4,4,1
2,4,3,3,3,2,1,1,1
2,6,5,5,5,1,4,2,1
The Training set for ID3.csv
Venue,color,Model,Category,Location,weight,Veriety,Material,Volume
1,6,4,4,4,1,1,1,6
2,5,4,4,4,2,6,1,1
1,6,2,1,4,1,4,2,4
1,6,2,1,4,1,2,1,2
2,6,5,5,5,2,2,1,2
1,5,4,4,4,1,6,2,2
1,3,3,3,3,1,6,2,2
1,5,2,1,1,1,2,1,2
1,4,4,4,1,1,5,3,6
Don't use header=None. Your files already contain a header row, so passing header=None turns the header strings into the first data row and numbers the columns; let pandas read the names instead:

test_set = pd.read_csv("Test set for ID3.csv")
training_set = pd.read_csv("Training set for ID3.csv")
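To answer the title question directly, a short sketch (assuming the files are read with their header row, as above): groupby splits the frame into one sub-DataFrame per unique value of a column.

import pandas as pd

training_set = pd.read_csv("Training set for ID3.csv")
# One sub-DataFrame per unique value of the chosen column
splits = {value: group for value, group in training_set.groupby("Venue")}
print(list(splits))  # e.g. [1, 2]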

Trying to Implement Naive Bayes algorithm on dataset

I have a dataset upon which I would like to implement the Naïve Bayes algorithm, but it is triggering an error on line 107, str_column_to_float(dataset, i), as follows: "could not convert string to float: ''".
I thought it was because of the headers for the various columns, but even after I removed them and ran the code, it still gives me the same error. Any help will be very much appreciated. The link to the dataset is as follows:
[Dataset][1]
The code is below.
# Make Predictions with Naive Bayes On The Accident Project Dataset
from csv import reader
from math import sqrt
from math import exp
from math import pi

# Load a CSV file
def load_csv(filename):
    dataset = list()
    with open(filename, 'r') as file:
        csv_reader = reader(file)
        for row in csv_reader:
            if not row:
                continue
            dataset.append(row)
    return dataset

# Convert string column to float
def str_column_to_float(dataset, column):
    for row in dataset:
        row[column] = float(row[column].strip())

# Convert string column to integer
def str_column_to_int(dataset, column):
    class_values = [row[column] for row in dataset]
    unique = set(class_values)
    lookup = dict()
    for i, value in enumerate(unique):
        lookup[value] = i
        print('[%s] => %d' % (value, i))
    for row in dataset:
        row[column] = lookup[row[column]]
    return lookup

# Split the dataset by class values, returns a dictionary
def separate_by_class(dataset):
    separated = dict()
    for i in range(len(dataset)):
        vector = dataset[i]
        class_value = vector[-1]
        if class_value not in separated:
            separated[class_value] = list()
        separated[class_value].append(vector)
    return separated

# Calculate the mean of a list of numbers
def mean(numbers):
    return sum(numbers) / float(len(numbers))

# Calculate the standard deviation of a list of numbers
def stdev(numbers):
    avg = mean(numbers)
    variance = sum([(x - avg)**2 for x in numbers]) / float(len(numbers) - 1)
    return sqrt(variance)

# Calculate the mean, stdev and count for each column in a dataset
def summarize_dataset(dataset):
    summaries = [(mean(column), stdev(column), len(column)) for column in zip(*dataset)]
    del(summaries[-1])
    return summaries

# Split dataset by class then calculate statistics for each row
def summarize_by_class(dataset):
    separated = separate_by_class(dataset)
    summaries = dict()
    for class_value, rows in separated.items():
        summaries[class_value] = summarize_dataset(rows)
    return summaries

# Calculate the Gaussian probability distribution function for x
def calculate_probability(x, mean, stdev):
    exponent = exp(-((x - mean)**2 / (2 * stdev**2)))
    return (1 / (sqrt(2 * pi) * stdev)) * exponent

# Calculate the probabilities of predicting each class for a given row
def calculate_class_probabilities(summaries, row):
    total_rows = sum([summaries[label][0][2] for label in summaries])
    probabilities = dict()
    for class_value, class_summaries in summaries.items():
        probabilities[class_value] = summaries[class_value][0][2] / float(total_rows)
        for i in range(len(class_summaries)):
            mean, stdev, _ = class_summaries[i]
            probabilities[class_value] *= calculate_probability(row[i], mean, stdev)
    return probabilities

# Predict the class for a given row
def predict(summaries, row):
    probabilities = calculate_class_probabilities(summaries, row)
    best_label, best_prob = None, -1
    for class_value, probability in probabilities.items():
        if best_label is None or probability > best_prob:
            best_prob = probability
            best_label = class_value
    return best_label

# Make a prediction with Naive Bayes on the Accident Dataset
filename = 'C:/Users/Vince/Desktop/University of Wyoming PHD/Year 2/Machine Learning/Term Project/Accident Project dataset.csv'
dataset = load_csv(filename)
for i in range(len(dataset[1]) - 1):
    str_column_to_float(dataset, i)
# convert class column to integers
str_column_to_int(dataset, len(dataset[0]) - 1)
# fit model
model = summarize_by_class(dataset)
# define a new record
row = [1,0,1,0,1,0,1,0,1,0,1,0,1]
# predict the label
label = predict(model, row)
print('Data=%s, Predicted: %s' % (row, label))
[1]: https://docs.google.com/spreadsheets/d/1aFJLSYqo59QUYJ6es09ZHY0UBqwH6cbgV4JjxY1HXZo/edit?usp=sharing
The ValueError is being raised because float() is trying to cast a non-numeric string to a float.
# Raises the ValueError
float("one")
# Does not raise the ValueError
float("1")
You need to find the strings that are non-numerical and then manually convert them. You can change your code to help you find them, like this:

def str_column_to_float(dataset, column):
    for i, row in enumerate(dataset):
        try:
            row[column] = float(row[column].strip())
        except ValueError:
            print(f'Change value: {row[column]} on row {i} column {column} to numeric.')
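Note that the message could not convert string to float: '' specifically means the offending cell is an empty string, i.e. a blank field in the CSV. If treating blanks as zero is acceptable for this dataset (an assumption on my part), the converter can handle them directly:

def str_column_to_float(dataset, column):
    for row in dataset:
        value = row[column].strip()
        # float('') raises ValueError, so map blank cells to 0.0 here
        row[column] = float(value) if value else 0.0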

NumPy random sampling using larger sample results in less unique elements than smaller sample

I'm trying to build a dataset for training a deep learning model that requires positive and negative sampling. For each input list, I randomly choose 3 elements to be the positive samples and k elements to be the negative samples from the rest of the vocabulary. For some reason, at the end, when I use k=16 negative samples for each positive, I get fewer unique elements than if I had used k=4, and I'm not sure why that's the case, since a larger sample should obviously provide more coverage. Here's the code that does the sampling (change the value of num_neg to change the number of negatives sampled). I feel like I might be missing something obvious but haven't figured it out...
pos_map = {}
neg_map = {}
num_pos = 3
num_neg = 16
# vocab maps from id => integer index, reverse_map maps from integer index => id.
# vocab size is ~28k and stores all possible values of id
np.random.seed(2)
for ids in ids_list:
    encoded = [vocab[id_] for id_ in ids]
    target_positive_indices = np.random.choice(range(len(encoded)), size=num_pos, replace=False)
    for target_positive_index in target_positive_indices:
        pos = encoded[target_positive_index]
        if pos in pos_map:
            pos_map[pos] += 1
        else:
            pos_map[pos] = 1
    # perform negative sampling
    all_indices = np.arange(vocab_size)
    possible_negs = np.random.choice(range(len(all_indices)), size=num_neg * 3, replace=False)
    # some negatives chosen could be the same as positives or in the context, filter those out
    filtered_negs = np.setdiff1d(possible_negs, store_indexes)[:num_neg]
    for n in filtered_negs:
        neg = reverse_map[n]
        if neg in neg_map:
            neg_map[neg] += 1
        else:
            neg_map[neg] = 1
print(len(neg_map))
Result for num_neg=4: 15842
Result for num_neg=16: 13968
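One detail in the code above that can produce exactly this effect: np.setdiff1d returns its result sorted in ascending order, so filtered_negs[:num_neg] always keeps the smallest surviving indices. The larger the oversample (num_neg * 3), the more tightly those smallest-of-the-sample indices concentrate near the low end of the vocabulary, so the same small indices are drawn over and over and the number of unique negatives shrinks. A sketch of a fix (reusing vocab_size and store_indexes from the code above): shuffle before slicing.

possible_negs = np.random.choice(vocab_size, size=num_neg * 3, replace=False)
filtered_negs = np.setdiff1d(possible_negs, store_indexes)  # NB: returned sorted ascending
np.random.shuffle(filtered_negs)                            # remove the ordering bias
filtered_negs = filtered_negs[:num_neg]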

Get percentile points from a huge list

I have a huge list (45M+ data points) of numerical values:
[78,0,5,150,9000,5,......,25,9,78422...]
I can easily get the maximum and minimum values, the number of these values, and the sum of them:
file_handle = open('huge_data_file.txt', 'r')
sum_values = 0
min_value = None
max_value = None
for i, line in enumerate(file_handle):
    value = int(line[:-1])
    if min_value is None or value < min_value:
        min_value = value
    if max_value is None or value > max_value:
        max_value = value
    sum_values += value
average_value = float(sum_values) / i
However, this is not what I need. I need a list of 10 numbers, where the number of data points between each two consecutive points is equal, for example:
percentile points: [0, 30, 120, 325, 912, 1570, 2522, 5002, 7025, 78422]
where the number of data points between 0 and 30, or between 30 and 120, is almost 4.5 million.
How can we do this?
=============================
EDIT:
I am well aware that we will need to sort the data. The problem is that I cannot fit all of this data in memory in one variable, and I need to read it sequentially from a generator (file_handle).
If you are happy with an approximation, here is a great (and fairly easy to implement) algorithm for computing quantiles from stream data: "Space-Efficient Online Computation of Quantile Summaries" by Greenwald and Khanna.
The silly numpy approach:
import numpy as np
# example data (produced by numpy but converted to a simple list)
datalist = list(np.random.randint(0, 10000000, 45000000))
# converted back to numpy array (start here with your data)
arr = np.array(datalist)
np.percentile(arr, 10), np.percentile(arr, 20), np.percentile(arr, 30)
# ref:
# http://docs.scipy.org/doc/numpy-dev/reference/generated/numpy.percentile.html
You can also hack something together where you just do like:
arr.sort()
# And then select the 10%, 20%, etc. value; add some check for equal amounts of
# numbers within a bin and then calculate the average, exercise for the reader :-)
The thing is that calling this function several times will slow it down, so really, just sort the array and then select the elements yourself.
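A small refinement of the numpy approach above (my addition): np.percentile accepts a sequence of percentiles, so all the cut points come from one call instead of one call per decile.

import numpy as np

arr = np.array(datalist)  # datalist as in the snippet above
deciles = np.percentile(arr, [10, 20, 30, 40, 50, 60, 70, 80, 90])
print(deciles)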
Since you said in the comments that you want a solution that can scale to datasets larger than can be stored in RAM, feed the data into an SQLite3 database. Even if your data set is 10 GB and you only have 8 GB of RAM, an SQLite3 database should still be able to sort the data and give it back to you in order.
The SQLite3 database gives you a generator over your sorted data.
You might also want to look into going beyond Python and using some other database solution.
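A minimal sketch of that SQLite route (my own illustration; the table and database names are made up):

import sqlite3

conn = sqlite3.connect('values.db')  # on-disk database, so RAM is not the limit
conn.execute('CREATE TABLE IF NOT EXISTS vals (v INTEGER)')
with open('huge_data_file.txt') as fh:
    conn.executemany('INSERT INTO vals (v) VALUES (?)',
                     ((int(line),) for line in fh))
conn.commit()
# The cursor streams rows back in sorted order, like a generator
for (value,) in conn.execute('SELECT v FROM vals ORDER BY v'):
    pass  # consume the values in order, e.g. counting rows to find decile points
conn.close()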
Here's a pure-python implementation of the partitioned-on-disk sort. It's slow, ugly code, but it works and hopefully each stage is relatively clear (the merge stage is really ugly!).
#!/usr/bin/env python
import os

def get_next_int_from_file(f):
    l = f.readline()
    if not l:
        return None
    return int(l.strip())

MAX_SAMPLES_PER_PARTITION = 1000000
PARTITION_FILENAME = "_{}.txt"

# Partition data set
part_id = 0
eof = False
with open("data.txt", "r") as fin:
    while not eof:
        print("Creating partition {}".format(part_id))
        with open(PARTITION_FILENAME.format(part_id), "w") as fout:
            for _ in range(MAX_SAMPLES_PER_PARTITION):
                line = fin.readline()
                if not line:
                    eof = True
                    break
                fout.write(line)
        part_id += 1
num_partitions = part_id

# Sort each partition
for part_id in range(num_partitions):
    print("Reading unsorted partition {}".format(part_id))
    with open(PARTITION_FILENAME.format(part_id), "r") as fin:
        samples = [int(line.strip()) for line in fin.readlines()]
    print("Disk-Deleting unsorted {}".format(part_id))
    os.remove(PARTITION_FILENAME.format(part_id))
    print("In-memory sorting partition {}".format(part_id))
    samples.sort()
    print("Writing sorted partition {}".format(part_id))
    with open(PARTITION_FILENAME.format(part_id), "w") as fout:
        fout.writelines(["{}\n".format(sample) for sample in samples])

# Merge-sort the partitions
# NB This is a very inefficient implementation!
print("Merging sorted partitions")
part_files = []
part_next_int = []
num_lines_out = 0

# Setup data structures for the merge
for part_id in range(num_partitions):
    fin = open(PARTITION_FILENAME.format(part_id), "r")
    next_int = get_next_int_from_file(fin)
    if next_int is None:
        continue
    part_files.append(fin)
    part_next_int.append(next_int)

with open("data_sorted.txt", "w") as fout:
    while part_files:
        # Find the smallest number across all files
        min_number = None
        min_idx = None
        for idx in range(len(part_files)):
            if min_number is None or part_next_int[idx] < min_number:
                min_number = part_next_int[idx]
                min_idx = idx
        # Now add that number, and move the relevant file along
        fout.write("{}\n".format(min_number))
        num_lines_out += 1
        if num_lines_out % MAX_SAMPLES_PER_PARTITION == 0:
            print("Merged samples: {}".format(num_lines_out))
        next_int = get_next_int_from_file(part_files[min_idx])
        if next_int is None:
            # Remove this partition, it's now finished
            del part_files[min_idx:min_idx + 1]
            del part_next_int[min_idx:min_idx + 1]
        else:
            part_next_int[min_idx] = next_int

# Cleanup partition files
for part_id in range(num_partitions):
    os.remove(PARTITION_FILENAME.format(part_id))
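As an aside (my note, not part of the answer above), the hand-rolled merge stage could be replaced by the standard library's heapq.merge, which lazily merges already-sorted iterables; a sketch reusing PARTITION_FILENAME and num_partitions from above:

import heapq

def ints_in(path):
    with open(path) as f:
        for line in f:
            yield int(line)

parts = [ints_in(PARTITION_FILENAME.format(i)) for i in range(num_partitions)]
with open("data_sorted.txt", "w") as fout:
    for value in heapq.merge(*parts):
        fout.write("{}\n".format(value))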
My code is a proposal for finding the result without needing much space. In testing, it found a quantile value in 7 minutes 51 seconds for a dataset of size 45,000,000.
import random
from bisect import bisect_left

class data():
    def __init__(self, values):
        random.shuffle(values)
        self.values = values

    def __iter__(self):
        for i in self.values:
            yield i

    def __len__(self):
        return len(self.values)

    def sortedValue(self, percentile):
        val = list(self)
        val.sort()
        num = int(len(self) * percentile)
        return val[num]

def init():
    numbers = data([x for x in range(1, 1000000)])
    print(seekPercentile(numbers, 0.1))
    print(numbers.sortedValue(0.1))

def seekPercentile(numbers, percentile):
    lower, upper = minmax(numbers)
    maximum = upper
    approx = _approxPercentile(numbers, lower, upper, percentile)
    return neighbor(approx, numbers, maximum)

def minmax(list):
    minimum = float("inf")
    maximum = float("-inf")
    for num in list:
        if num > maximum:
            maximum = num
        if num < minimum:
            minimum = num
    return minimum, maximum

def neighbor(approx, numbers, maximum):
    dif = maximum
    for num in numbers:
        if abs(approx - num) < dif:
            result = num
            dif = abs(approx - num)
    return result

def _approxPercentile(numbers, lower, upper, percentile):
    middles = []
    less = []
    magicNumber = 10000
    step = (upper - lower) / magicNumber
    less = []
    for i in range(1, magicNumber - 1):
        middles.append(lower + i * step)
        less.append(0)
    for num in numbers:
        index = bisect_left(middles, num)
        if index < len(less):
            less[index] += 1
    summing = 0
    for index, testVal in enumerate(middles):
        summing += less[index]
        if summing / len(numbers) < percentile:
            print(" Change lower from " + str(lower) + " to " + str(testVal))
            lower = testVal
        if summing / len(numbers) > percentile:
            print(" Change upper from " + str(upper) + " to " + str(testVal))
            upper = testVal
            break
    precision = 0.01
    if (lower + precision) > upper:
        return lower
    else:
        return _approxPercentile(numbers, lower, upper, percentile)

init()
I edited my code a bit, and I now think that this approach works at least decently, even if it's not optimal.
