Trying to Implement Naive Bayes algorithm on dataset - python

I have a dataset upon which I would like to implement the Naïve Bayes algorithm but it is triggering an error on line 107; str_column_to_float(dataset, i) as follows; "could not convert string to float:''"
I thought it was because of the headers for the various columns but even after I removed them and run the code, it is still giving me the same error. Any help will very much be appreciated. The link to the dataset is as follows;
[Dataset][1]
The code is below
# Make Predictions with Naive Bayes On The Accident Project Dataset
from csv import reader
from math import sqrt
from math import exp
from math import pi
# Load a CSV file
def load_csv(filename):
dataset = list()
with open(filename, 'r') as file:
csv_reader = reader(file)
for row in csv_reader:
if not row:
continue
dataset.append(row)
return dataset
# Convert string column to float
def str_column_to_float(dataset, column):
for row in dataset:
row[column] = float(row[column].strip())
# Convert string column to integer
def str_column_to_int(dataset, column):
class_values = [row[column] for row in dataset]
unique = set(class_values)
lookup = dict()
for i, value in enumerate(unique):
lookup[value] = i
print('[%s] => %d' % (value, i))
for row in dataset:
row[column] = lookup[row[column]]
return lookup
# Split the dataset by class values, returns a dictionary
def separate_by_class(dataset):
separated = dict()
for i in range(len(dataset)):
vector = dataset[i]
class_value = vector[-1]
if (class_value not in separated):
separated[class_value] = list()
separated[class_value].append(vector)
return separated
# Calculate the mean of a list of numbers
def mean(numbers):
return sum(numbers)/float(len(numbers))
# Calculate the standard deviation of a list of numbers
def stdev(numbers):
avg = mean(numbers)
variance = sum([(x-avg)**2 for x in numbers]) / float(len(numbers)-1)
return sqrt(variance)
# Calculate the mean, stdev and count for each column in a dataset
def summarize_dataset(dataset):
summaries = [(mean(column), stdev(column), len(column)) for column in zip(*dataset)]
del(summaries[-1])
return summaries
# Split dataset by class then calculate statistics for each row
def summarize_by_class(dataset):
separated = separate_by_class(dataset)
summaries = dict()
for class_value, rows in separated.items():
summaries[class_value] = summarize_dataset(rows)
return summaries
# Calculate the Gaussian probability distribution function for x
def calculate_probability(x, mean, stdev):
exponent = exp(-((x-mean)**2 / (2 * stdev**2 )))
return (1 / (sqrt(2 * pi) * stdev)) * exponent
# Calculate the probabilities of predicting each class for a given row
def calculate_class_probabilities(summaries, row):
total_rows = sum([summaries[label][0][2] for label in summaries])
probabilities = dict()
for class_value, class_summaries in summaries.items():
probabilities[class_value] = summaries[class_value][0][2]/float(total_rows)
for i in range(len(class_summaries)):
mean, stdev, _ = class_summaries[i]
probabilities[class_value] *= calculate_probability(row[i], mean, stdev)
return probabilities
# Predict the class for a given row
def predict(summaries, row):
probabilities = calculate_class_probabilities(summaries, row)
best_label, best_prob = None, -1
for class_value, probability in probabilities.items():
if best_label is None or probability > best_prob:
best_prob = probability
best_label = class_value
return best_label
# Make a prediction with Naive Bayes on Accident Dataset
filename = 'C:/Users/Vince/Desktop/University of Wyoming PHD/Year 2/Machine Learning/Term
Project/Accident Project dataset.csv'
dataset = load_csv(filename)
for i in range(len(dataset[1])-1):
str_column_to_float(dataset, i)
# convert class column to integers
str_column_to_int(dataset, len(dataset[0])-1)
# fit model
model = summarize_by_class(dataset)
# define a new record
row = [1,0,1,0,1,0,1,0,1,0,1,0,1]
# predict the label
label = predict(model, row)
print('Data=%s, Predicted: %s' % (row, label))
[1]: https://docs.google.com/spreadsheets/d/1aFJLSYqo59QUYJ6es09ZHY0UBqwH6cbgV4JjxY1HXZo/edit?
usp=sharing

The ValueError is being raised because float() is trying to cast a word to a string.
# Raises the ValueError
float("one")
# Does not raise the ValueError
float("1")
You need to find the string that is non-numerical and then manually convert it. You can change your code to help you find it, like this:
def str_column_to_float(dataset, column):
i =0
try:
for row in dataset:
row[column] = float(row[column].strip())
except ValueError:
print(f'Change value: {row[column]} on row {i} column {column} to numeric.')
finally:
i+=1

Related

IndexError: too many indices for array: array is 1-dimensional, but 2 were indexed. Works for first two loops

Let me start by saying that I know this error message has posts about it, but I'm not sure what's wrong with my code. The block of code works just fine for the first two loops, but then fails. I've even tried removing the first two loops from the data to rule out issues in the 3rd loop, but no luck. I did have it set to print out the unsorted temporary list, and it just prints an empty array for the 3rd loop.
Sorry for the wall of comments in my code, but I'd rather have each line commented than cause confusion over what I'm trying to accomplish.
TL;DR: I'm trying to find and remove outliers from a list of data, but only for groups of entries that have the same number in column 0.
Pastebin with data
import numpy as np, csv, multiprocessing as mp, mysql.connector as msc, pandas as pd
import datetime
#Declare unsorted data array
d_us = []
#Declare temporary array for use in loop
tmp = []
#Declare sorted data array
d = []
#Declare Sum variable
tot = 0
#Declare Mean variable
m = 0
#declare sorted final array
sort = []
#Declare number of STDs
t = 1
#Declare Standard Deviation variable
std = 0
#Declare z-score variable
z_score
#Timestamp for output files
nts = datetime.datetime.now().timestamp()
#Create output file
with open(f"calib_temp-{nts}.csv", 'w') as ctw:
pass
#Read data from CSV
with open("test.csv", 'r', newline='') as drh:
fr_rh = csv.reader(drh, delimiter=',')
for row in fr_rh:
#append data to unsorted array
d_us.append([float(row[0]),float(row[1])])
#Sort array by first column
d = np.sort(d_us)
#Calculate the range of the data
l = round((d[-1][0] - d[0][0]) * 10)
#Declare the starting value
s = d[0][0]
#Declare the ending value
e = d[-1][0]
#Set the while loop counter
n = d[0][0]
#Iterate through data
while n <= e:
#Create array with difference column
for row in d:
if row[0] == n:
diff = round(row[0] - row[1], 1)
tmp.append([row[0],row[1],diff])
#Convert to numpy array
tmp = np.array(tmp)
#Sort numpy array
sort = tmp[np.argsort(tmp[:,2])]
#Calculate sum of differences
for row in tmp:
tot = tot + row[2]
#Calculate mean
m = np.mean(tot)
#Calculate Standard Deviation
std = np.std(tmp[:,2])
#Calculate outliers and write to output file
for y in tmp:
z_score = (y[2] - m)/std
if np.abs(z_score) > t:
with open(f"calib_temp-{nts}.csv", 'a', newline='') as ct:
c = csv.writer(ct, delimiter = ',')
c.writerow([y[0],y[1]])
#Reset Variables
tot = 0
m = 0
n = n + 0.1
tmp = []
std = 0
z_score = 0
Do this before the loop:
#Create output file
ct = open(f"calib_temp-{nts}.csv", 'w')
c = csv.writer(ct, delimiter = ',')
Then change the loop to this. Note that I have moved your initializations to the top of the loop, so you don't need to initialize them twice. Note the if tmp: line, which solves the numpy exception.
#Iterate through data
while n <= e:
tot = 0
m = 0
tmp = []
std = 0
z_score = 0
#Create array with difference column
for row in d:
if row[0] == n:
diff = round(row[0] - row[1], 1)
tmp.append([row[0],row[1],diff])
#Sort numpy array
if tmp:
#Convert to numpy array
tmp = np.array(tmp)
sort = tmp[np.argsort(tmp[:,2])]
#Calculate sum of differences
for row in tmp:
tot = tot + row[2]
#Calculate mean
m = np.mean(tot)
#Calculate Standard Deviation
std = np.std(tmp[:,2])
#Calculate outliers and write to output file
for y in tmp:
z_score = (y[2] - m)/std
if np.abs(z_score) > t:
c.writerow([y[0],y[1]])
#Reset Variables
n = n + 0.1

How to split pandas dataframe by unique value

I am working on implementing an ID3 algorithm in Python. In order to get past the first step I need to calculate the information gain per column. The comments are self-explanatory.
The issue is on the line
# ii) split the given data source based on the
# unique values in the attribute
print(f'split the given data source based on the')
print(f'unique values in the attribute')
df1 = training_set[training_set[columnName] >= k]
df2 = training_set[training_set[columnName] < k]
print("**********")
print("splitting ")
print(f'df1 {df1}')
print(f'df2 {df2}')
print("**********")
The dataframe is imported like so
0 1 2 3 4 5 6 7 8
0 Venue color Model Category Location weight Veriety Material Volume
1 2 6 4 4 4 2 2 1 1
The column names are coming back as numbers. They should be the string value of the headers.
The full program is shown below.
from numpy.core.defchararray import count
import pandas as pd
import numpy as np
import numpy as np
from math import ceil, floor, log2
from sklearn.decomposition import PCA
from numpy import linalg as LA
from sklearn.tree import DecisionTreeClassifier
def calculate_metrics(tp, tn, fn, p, n, fp):
# calculate the accuracy, error rate, sensitivity, specificity, and precision for the selected classifier in reference to the corresponding test set.
accuracy = tp + tn /(p+n)
error_rate = fp + fn /(p + n)
sensitivity = tp/ p
precision = tp/ (tp+fp)
specificity = tn/n
display_metrics(accuracy, error_rate, sensitivity, precision, specificity)
def display_metrics(accuracy, error_rate, sensitivity, precision, specificity):
print(f'Accuracy: {accuracy}, Error_rate:{error_rate}, Sensitivity:{sensitivity}, Precision:{precision}, specificity:{specificity}')
def ID3(threshold,g):
# use the training set to predict the test set.
# use the Assignment 2--Training set to extract rules and test the quality of the extracted rules against the Assignment 2-- Test set for ID3.
test_set = pd.read_csv("Test set for ID3.csv", header=None)
training_set = pd.read_csv("Training set for ID3.csv", header=None)
print(f'test_set: {test_set}')
print(f'training_set: {training_set}')
# Step 1- Calculate MC (Message Conveyed) for the given data set in reference to the class attribute
print(f'Step 1- Calculate MC (Message Conveyed) for the given data set in reference to the class attribute')
# MC = -p1*log2(p1) - p2*log2(p2)
# For n classes MC = -p1log2(p1) - p2*log2(p2)-...-pn*log2(pn)
# For each column calculate the gain.
numberOfColumns = 0
mcDictionary = {}
print('***********************************')
print('For each column calculate the gain.')
for (columnName, columnData) in training_set.iteritems():
print(f'Column Name :{columnName}')
print(f'Column Contents: {training_set[columnName]}')
column = training_set[columnName]
probs = column.value_counts(normalize=True)
print(f'Probability {probs}')
entropy = -1*np.sum(np.log2(probs)*probs)
print(f'Entropy {entropy}')
mcDictionary.update({columnName:round(entropy)})
numberOfColumns+=1
print('***********************************')
print(f'numberOfColumns {numberOfColumns}')
print(f'mcDictionary {mcDictionary}')
# The column with the highest gain is the root.
print(f'The column with the highest gain is the root.')
values = mcDictionary.values()
max_value = max(values)
print(f'The max value is {max_value}')
columnNames = list(mcDictionary.keys())
columnWithMaximumInformationGain = columnNames.index(max_value)
print(f'The max value, {max_value}, is associated with column {columnWithMaximumInformationGain}')
root = training_set[columnWithMaximumInformationGain]
print(f'root {root}')
# Loop
# Step 2 - Repeat for every attribute
print(f'Step 2 - Repeat for every attribute')
for (columnName, columnData) in training_set.iteritems():
# i) use the atttribute as a node from which k
# k branches are emanating, where k is
# the number of unique values in the attribute
attribute = columnName
k = training_set[columnName].nunique()
print(f'use the atttribute {columnName} as a node from which {k}')
print(f'{k} branches are emanating, where {k} is')
print(f'the number of unique values in the attribute')
# ii) split the given data source based on the
# unique values in the attribute
print(f'split the given data source based on the')
print(f'unique values in the attribute')
df1 = training_set[training_set[columnName] >= k]
df2 = training_set[training_set[columnName] < k]
print("**********")
print("splitting ")
print(f'df1 {df1}')
print(f'df2 {df2}')
print("**********")
# iii) calculate MC for new splits
# calculate MC for each attribute of Venue
# iv calculculate the weight for each split
# start with venue
# v) calculate the weighted MC (WMC) for the attribute
# WMC(venue) = W(1)*MC(1) + W(2)*MC(2)
# vi) Calculate Gain for the attribute [MC-WMC(venue)]
# Gain(venue) = MC-WMC(venue)
# Step 3- Repeat for each split produced by the root
# if all records have the same class then break.
# Step 4- If every split is free of a mixture of class values, then stop
# expansion of the tree
# Step 5- Extract rules in form of if-then-else from the tree
# select the max value from the gain array
# this is the new root
# # leaf generated from the decision tree.
# F1 = 0
# # define c1 count of records w/ dominant class in F1
# # How do I determine the number of records w/ dominant class in F1?
# c1 = 0
# # alpha = c1/ |F1|
# # F1 is one of the unique values of a given attribute.
# alpha = c1/ abs(F1)
# # the number of records in the test set that are correctly classified by the rules extracted from the tree before removal.
# # How do I determine the number of records in test set that are correctly classified by rules extracted from the tree before removal?
# N = 0
# # the number of records in the test set that are correctly classified by the rules extracted from the tree.
# # How do I determine the number of records in the test set that are correctly classified by the rules extracted from the tree?
# M = 0
# # the parameter and 0 <= g <= 0.15
# g = 0
# if g < 0 or g > 0.15:
# exit()
# # k is the total number of branches in the subtree
# # How do I determine the total number of branches in the subtree?
# k = 0
# if alpha > threshold:
# # stop splitting tree
# # How do we apply prepruning to the data?
# # For post-pruning use the criteria below
# if (N-M)/Q < g*k:
# # remove subtree
# # true positive
# tp = 0
# # true negative
# tn = 0
# # postive
# p = 0
# # negative
# n = 0
# # false positive
# fp = 0
# calculate_metrics(tp, tn, p, n, fp)
def BayesClassifier():
# use the assignment 2-- training set for Bayes as the training set to classify the records of the assignment 2 test set for bayes
test_set = pd.read_csv("Assignment 2--Test set for Bayes.csv")
training_set = pd.read_csv("Assignment 2--Training set for Bayes.csv")
# prompt user to select either ID3 or Bayes classifier.
selection = "ID3" #= input("Please enter your selection for either ID3 or Bayes classification: ")
threshold = 0.9 #= input("Please enter a threshold: ")
g = 0.5 #= input("Please enter a value for g: ")
if(selection == "ID3"):
ID3(threshold,g)
if(selection == "Bayes"):
BayesClassifier()
Expected:
**********
splitting
df1 {df1}
df2 {df2}
**********
Actual:
unique values in the attribute
Traceback (most recent call last):
File ".\assignment2.py", line 183, in <module>
ID3(threshold,g)
File ".\assignment2.py", line 86, in ID3
df1 = training_set[training_set[columnName] >= k]
File "C:\Users\physe\AppData\Roaming\Python\Python36\site-packages\pandas\core\ops\common.py", line 65, in new_method
return method(self, other)
File "C:\Users\physe\AppData\Roaming\Python\Python36\site-packages\pandas\core\ops\__init__.py", line 370, in wrapper
res_values = comparison_op(lvalues, rvalues, op)
File "C:\Users\physe\AppData\Roaming\Python\Python36\site-packages\pandas\core\ops\array_ops.py", line 244, in comparison_op
res_values = comp_method_OBJECT_ARRAY(op, lvalues, rvalues)
File "C:\Users\physe\AppData\Roaming\Python\Python36\site-packages\pandas\core\ops\array_ops.py", line 56, in comp_method_OBJECT_ARRAY
result = libops.scalar_compare(x.ravel(), y, op)
File "pandas\_libs\ops.pyx", line 103, in pandas._libs.ops.scalar_compare
TypeError: '>=' not supported between instances of 'str' and 'int'
How can I split the dataframe by the unique value?
The Test set for ID3.csv
Venue,color,Model,Category,Location,weight,Veriety,Material,Volume
1,6,4,4,4,1,1,1,6
2,5,4,4,4,2,6,1,1
1,6,2,1,4,1,4,2,4
1,6,2,1,4,1,2,1,2
2,6,5,5,5,2,2,1,2
1,5,4,4,4,1,6,2,2
1,3,3,3,3,1,6,2,2
1,5,2,1,1,1,2,1,2
1,4,4,4,1,1,5,3,6
1,4,4,4,4,1,6,4,6
2,5,4,4,4,2,4,4,1
2,4,3,3,3,2,1,1,1
2,6,5,5,5,1,4,2,1
The Training set for ID3.csv
Venue,color,Model,Category,Location,weight,Veriety,Material,Volume
1,6,4,4,4,1,1,1,6
2,5,4,4,4,2,6,1,1
1,6,2,1,4,1,4,2,4
1,6,2,1,4,1,2,1,2
2,6,5,5,5,2,2,1,2
1,5,4,4,4,1,6,2,2
1,3,3,3,3,1,6,2,2
1,5,2,1,1,1,2,1,2
1,4,4,4,1,1,5,3,6
Don't use header=none
test_set = pd.read_csv("Test set for ID3.csv")
training_set = pd.read_csv("Training set for ID3.csv")

How to detect outliers "among neighbouring values" of univariate data using standard python

I have a simple data-set that has 2 columns date/temperature.
I need some method to find outliers:
1. In the temperature column
2. Only w.r.t to some neighbouring values
What will be the best way to detect outliers w.r.t neighbouring values only?
I have tried coding up IQR, Mean/Median deviation etc but they prune out more values. I tried applying those methods to 10 values at a time but that ends up finding even more outliers than I expect.
Also I need to finally code it in standard python, so any hints on that will be useful.
Below are the functions from my class DataChecker that calculate IQR and use it to check outlier.
class DataChecker:
class DateTemperature:
def __init__(self, input_date, temperature):
try:
day, month, year = input_date.split('/')
self._temperature_date = date(int(year), int(month), int(day))
except ValueError:
# Don't tolerate invalid date
raise #TODO change to custom error
try:
self._temperature = float(temperature)
except (TypeError, ValueError):
self._temperature = 0
#property
def date(self):
return self._temperature_date.strftime('%d/%m/%Y')
#property
def temperature(self):
return self._temperature
def __init__(self, input_date_temperature_values):
self._date_temperature_values = []
for date, temperature in input_date_temperature_values:
try:
self._date_temperature_values.append(self.DateTemperature(date, temperature))
except ValueError:
pass
self._date_temperature_values.sort(key=lambda x:x.date)
self._outlier_low, self._outlier_high = self._calculate_outlier_thresholds(self._date_temperature_values)
def _is_value_outlier(self, temperature):
if temperature < self._outlier_low or temperature > self._outlier_high:
return True
return False
def _calculate_outlier_thresholds(self, data_temperature_values):
temperature_values = sorted([dataTemperature.temperature for dataTemperature in data_temperature_values])
median_index = len(temperature_values) // 2
first_quartile = median(temperature_values[:median_index])
third_quartile = median(temperature_values[median_index+1:])
iqr = (third_quartile - first_quartile)
# Tried with 1.5, 1.2, 2 etc
low_iqr = first_quartile - 1.2*iqr
high_iqr = third_quartile + 1.2*iqr
# Trying mean/median deviation
#mean_value = statistics.median(temperature_values)
#std_dev = statistics.pstdev(temperature_values)
#print(f'{mean_value} : {std_dev}')
#low_iqr = mean_value - 2*std_dev
#high_iqr = mean_value + 2*std_dev
#print(low_iqr, ':', high_iqr)
return low_iqr, high_iqr
Thanks!
Standard Deviation: The standard deviation is a measure that is used to quantify the amount of variation or dispersion of a set of data values.
Mean: Mean is the sum of the sampled values divided by the number of items.
import numpy as np
def removeOutlier(data):
data = np.array(data)
mean = np.mean(data, axis=0)
std = np.std(data, axis=0)
final_list = [x for x in data if (x > mean - 2 * std)]
final_list = [x for x in final_list if (x < mean + 2 * std)]
return final_list
you can select the neighboring point and put it in a list then if you pass it through this function you'll get outlier free list.

Numpy Array Manipulation Based off of Internal Values

I am trying to accomplish a weird task.
I need to complete the following without the use of sklearn, and preferably with numpy:
Given a dataset, split the data into 5 equal "folds", or partitions
Within each partition, split the data into a "training" and "testing" set, with an 80/20 split
Here is the catch: Your dataset is labeled for classes. So take for example a dataset with 100 instances, and class A with 33 samples and class B with 67 samples. I should create 5 folds of 20 data instances, where in each fold, class A has something like 6 or 7 (1/3) values and class B has the rest
My issue that:
I do not know how to properly return a test and training set for each fold, despite being able to split it appropriately, and, more important, I do not know how to incorporate the proper division of # of elements per class.
My current code is here. It is commented where I am stuck:
import numpy
def csv_to_array(file):
# Open the file, and load it in delimiting on the ',' for a comma separated value file
data = open(file, 'r')
data = numpy.loadtxt(data, delimiter=',')
# Loop through the data in the array
for index in range(len(data)):
# Utilize a try catch to try and convert to float, if it can't convert to float, converts to 0
try:
data[index] = [float(x) for x in data[index]]
except Exception:
data[index] = 0
except ValueError:
data[index] = 0
# Return the now type-formatted data
return data
def five_cross_fold_validation(dataset):
# print("DATASET", dataset)
numpy.random.shuffle(dataset)
num_rows = dataset.shape[0]
split_mark = int(num_rows / 5)
folds = []
temp1 = dataset[:split_mark]
# print("TEMP1", temp1)
temp2 = dataset[split_mark:split_mark*2]
# print("TEMP2", temp2)
temp3 = dataset[split_mark*2:split_mark*3]
# print("TEMP3", temp3)
temp4 = dataset[split_mark*3:split_mark*4]
# print("TEMP4", temp4)
temp5 = dataset[split_mark*4:]
# print("TEMP5", temp5)
folds.append(temp1)
folds.append(temp2)
folds.append(temp3)
folds.append(temp4)
folds.append(temp5)
# folds = numpy.asarray(folds)
for fold in folds:
# fold = numpy.asarray(fold)
num_rows = fold.shape[0]
split_mark = int(num_rows * .8)
fold_training = fold[split_mark:]
fold_testing = fold[:split_mark]
print(type(fold))
# fold.tolist()
list(fold)
print(type(fold))
del fold[0:len(fold)]
fold.append(fold_training)
fold.append(fold_testing)
fold = numpy.asarray(fold)
# Somehow, return a testing and training set within each fold
# print(folds)
return folds
def confirm_size(folds):
total = 0
for fold in folds:
curr = len(fold)
total = total + curr
return total
def main():
print("BEGINNING CFV")
ecoli = csv_to_array('Classification/ecoli.csv')
print(len(ecoli))
folds = five_cross_fold_validation(ecoli)
size = confirm_size(folds)
print(size)
main()
Additionally, for reference, I have attached my csv I am working with (it is a modification of the UCI Ecoli Dataset.) The classes here are the values in the last column. So 0, 1, 2, 3, 4. It is important to note that there are not equal amounts of each class.
0.61,0.45,0.48,0.5,0.48,0.35,0.41,0
0.17,0.38,0.48,0.5,0.45,0.42,0.5,0
0.44,0.35,0.48,0.5,0.55,0.55,0.61,0
0.43,0.4,0.48,0.5,0.39,0.28,0.39,0
0.42,0.35,0.48,0.5,0.58,0.15,0.27,0
0.23,0.33,0.48,0.5,0.43,0.33,0.43,0
0.37,0.52,0.48,0.5,0.42,0.42,0.36,0
0.29,0.3,0.48,0.5,0.45,0.03,0.17,0
0.22,0.36,0.48,0.5,0.35,0.39,0.47,0
0.23,0.58,0.48,0.5,0.37,0.53,0.59,0
0.47,0.47,0.48,0.5,0.22,0.16,0.26,0
0.54,0.47,0.48,0.5,0.28,0.33,0.42,0
0.51,0.37,0.48,0.5,0.35,0.36,0.45,0
0.4,0.35,0.48,0.5,0.45,0.33,0.42,0
0.44,0.34,0.48,0.5,0.3,0.33,0.43,0
0.44,0.49,0.48,0.5,0.39,0.38,0.4,0
0.43,0.32,0.48,0.5,0.33,0.45,0.52,0
0.49,0.43,0.48,0.5,0.49,0.3,0.4,0
0.47,0.28,0.48,0.5,0.56,0.2,0.25,0
0.32,0.33,0.48,0.5,0.6,0.06,0.2,0
0.34,0.35,0.48,0.5,0.51,0.49,0.56,0
0.35,0.34,0.48,0.5,0.46,0.3,0.27,0
0.38,0.3,0.48,0.5,0.43,0.29,0.39,0
0.38,0.44,0.48,0.5,0.43,0.2,0.31,0
0.41,0.51,0.48,0.5,0.58,0.2,0.31,0
0.34,0.42,0.48,0.5,0.41,0.34,0.43,0
0.51,0.49,0.48,0.5,0.53,0.14,0.26,0
0.25,0.51,0.48,0.5,0.37,0.42,0.5,0
0.29,0.28,0.48,0.5,0.5,0.42,0.5,0
0.25,0.26,0.48,0.5,0.39,0.32,0.42,0
0.24,0.41,0.48,0.5,0.49,0.23,0.34,0
0.17,0.39,0.48,0.5,0.53,0.3,0.39,0
0.04,0.31,0.48,0.5,0.41,0.29,0.39,0
0.61,0.36,0.48,0.5,0.49,0.35,0.44,0
0.34,0.51,0.48,0.5,0.44,0.37,0.46,0
0.28,0.33,0.48,0.5,0.45,0.22,0.33,0
0.4,0.46,0.48,0.5,0.42,0.35,0.44,0
0.23,0.34,0.48,0.5,0.43,0.26,0.37,0
0.37,0.44,0.48,0.5,0.42,0.39,0.47,0
0,0.38,0.48,0.5,0.42,0.48,0.55,0
0.39,0.31,0.48,0.5,0.38,0.34,0.43,0
0.3,0.44,0.48,0.5,0.49,0.22,0.33,0
0.27,0.3,0.48,0.5,0.71,0.28,0.39,0
0.17,0.52,0.48,0.5,0.49,0.37,0.46,0
0.36,0.42,0.48,0.5,0.53,0.32,0.41,0
0.3,0.37,0.48,0.5,0.43,0.18,0.3,0
0.26,0.4,0.48,0.5,0.36,0.26,0.37,0
0.4,0.41,0.48,0.5,0.55,0.22,0.33,0
0.22,0.34,0.48,0.5,0.42,0.29,0.39,0
0.44,0.35,0.48,0.5,0.44,0.52,0.59,0
0.27,0.42,0.48,0.5,0.37,0.38,0.43,0
0.16,0.43,0.48,0.5,0.54,0.27,0.37,0
0.06,0.61,0.48,0.5,0.49,0.92,0.37,1
0.44,0.52,0.48,0.5,0.43,0.47,0.54,1
0.63,0.47,0.48,0.5,0.51,0.82,0.84,1
0.23,0.48,0.48,0.5,0.59,0.88,0.89,1
0.34,0.49,0.48,0.5,0.58,0.85,0.8,1
0.43,0.4,0.48,0.5,0.58,0.75,0.78,1
0.46,0.61,0.48,0.5,0.48,0.86,0.87,1
0.27,0.35,0.48,0.5,0.51,0.77,0.79,1
Edit I replaced np.random.shuffle(A) by A = np.random.permutation(A), the only difference is that it doesn't mutate the input array. This doesn't make any difference in this code, but it is safer in general.
The idea is to randomly sample the input by using numpy.random.permutation. Once the rows are shuffled we just need to iterate over all the possible tests sets (sliding window of the desired size, here 20% of the input size). The corresponding training sets are just composed of all remaining elements.
This will preserve the original classes distribution on all subsets even though we pick them in order because we shuffled the input.
The following code iterate over the test/train sets combinations:
import numpy as np
def csv_to_array(file):
with open(file, 'r') as f:
data = np.loadtxt(f, delimiter=',')
return data
def classes_distribution(A):
"""Print the class distributions of array A."""
nb_classes = np.unique(A[:,-1]).shape[0]
total_size = A.shape[0]
for i in range(nb_classes):
class_size = sum(row[-1] == i for row in A)
class_p = class_size/total_size
print(f"\t P(class_{i}) = {class_p:.3f}")
def random_samples(A, test_set_p=0.2):
"""Split the input array A in two uniformly chosen
random sets: test/training.
Repeat this until all rows have been yielded once at least
once as a test set."""
A = np.random.permutation(A)
sample_size = int(test_set_p*A.shape[0])
for start in range(0, A.shape[0], sample_size):
end = start + sample_size
yield {
"test": A[start:end,],
"train": np.append(A[:start,], A[end:,], 0)
}
def main():
ecoli = csv_to_array('ecoli.csv')
print("Input set shape: ", ecoli.shape)
print("Input set class distribution:")
classes_distribution(ecoli)
print("Training sets class distributions:")
for iteration in random_samples(ecoli):
test_set = iteration["test"]
training_set = iteration["train"]
classes_distribution(training_set)
print("---")
# ... Do what ever with these two sets
main()
It produces an output of the form:
Input set shape: (169, 8)
Input set class distribution:
P(class_0) = 0.308
P(class_1) = 0.213
P(class_2) = 0.207
P(class_3) = 0.118
P(class_4) = 0.154
Training sets class distributions:
P(class_0) = 0.316
P(class_1) = 0.206
P(class_2) = 0.199
P(class_3) = 0.118
P(class_4) = 0.162
...

Looping back to the next column in a csv

I'm trying to get a script to run on each individual column of a csv file. I've figured out how to tell python which column I would like to run the script on but I want it to analyze column one, output the results, the move to column two and continue on and on through the file. What I want is a "if etc goto etc" command. I've found how to do this with simple oneliners but I have a larger script. Any help would be great as I'm sure I'm just missing something. Like if I could loop back to where I define my data (h=data) but tell it to choose the next column. Here is my script.
import numpy as np
import matplotlib.pyplot as plt
from pylab import *
import pylab
from scipy import linalg
import sys
import scipy.interpolate as interpolate
import scipy.optimize as optimize
a=raw_input("Data file name? ") #Name of the data file including the directory, must be .csv
datafile = open(a, 'r')
data = []
for row in datafile:
data.append(row.strip().split(',')) #opening and organizing the csv file
print('Data points= ', len(data))
print data
c=raw_input("Is there a header row? y/n?") #Remove header line if present
if c is ('y'):
del data[0]
data2=data
print('Raw data= ', data2)
else:
print('Raw data= ', data)
'''
#if I wanted to select a column
b=input("What column to analyze?") #Asks what column depth data is in
if b is 1:
h=[[rowa[i] for rowa in data] for i in range(1)] #first row
'''
h=data # all columns
g=reduce(lambda x,y: x+y,h) #prepares data for calculations
a=map(float, g)
a.sort()
print ('Organized data= ',a)
def GRLC(values):
'''
Calculate Gini index, Gini coefficient, Robin Hood index, and points of
Lorenz curve based on the instructions given in
www.peterrosenmai.com/lorenz-curve-graphing-tool-and-gini-coefficient-calculator
Lorenz curve values as given as lists of x & y points [[x1, x2], [y1, y2]]
#param values: List of values
#return: [Gini index, Gini coefficient, Robin Hood index, [Lorenz curve]]
'''
n = len(values)
assert(n > 0), 'Empty list of values'
sortedValues = sorted(values) #Sort smallest to largest
#Find cumulative totals
cumm = [0]
for i in range(n):
cumm.append(sum(sortedValues[0:(i + 1)]))
#Calculate Lorenz points
LorenzPoints = [[], []]
sumYs = 0 #Some of all y values
robinHoodIdx = -1 #Robin Hood index max(x_i, y_i)
for i in range(1, n + 2):
x = 100.0 * (i - 1)/n
y = 100.0 * (cumm[i - 1]/float(cumm[n]))
LorenzPoints[0].append(x)
LorenzPoints[1].append(y)
sumYs += y
maxX_Y = x - y
if maxX_Y > robinHoodIdx: robinHoodIdx = maxX_Y
giniIdx = 100 + (100 - 2 * sumYs)/n #Gini index
return [giniIdx, giniIdx/100, robinHoodIdx, LorenzPoints]
result = GRLC(a)
print 'Gini Index', result[0]
print 'Gini Coefficient', result[1]
print 'Robin Hood Index', result[2]
I'm ignoring all of that GRLC function and just solving the looping question. Give this a try. It uses while True: to loop forever (you can just break out by ending the program; Ctrl+C in Windows, depends on OS). Just load the data from the csv once then each time it loops, you can re-build some variables. If you have questions please ask. Also, I didn't test it as I don't have all the NumPy packages installed :)
import numpy as np
import matplotlib.pyplot as plt
from pylab import *
import pylab
from scipy import linalg
import sys
import scipy.interpolate as interpolate
import scipy.optimize as optimize
def GRLC(values):
'''
Calculate Gini index, Gini coefficient, Robin Hood index, and points of
Lorenz curve based on the instructions given in
www.peterrosenmai.com/lorenz-curve-graphing-tool-and-gini-coefficient-calculator
Lorenz curve values as given as lists of x & y points [[x1, x2], [y1, y2]]
#param values: List of values
#return: [Gini index, Gini coefficient, Robin Hood index, [Lorenz curve]]
'''
n = len(values)
assert(n > 0), 'Empty list of values'
sortedValues = sorted(values) #Sort smallest to largest
#Find cumulative totals
cumm = [0]
for i in range(n):
cumm.append(sum(sortedValues[0:(i + 1)]))
#Calculate Lorenz points
LorenzPoints = [[], []]
sumYs = 0 #Some of all y values
robinHoodIdx = -1 #Robin Hood index max(x_i, y_i)
for i in range(1, n + 2):
x = 100.0 * (i - 1)/n
y = 100.0 * (cumm[i - 1]/float(cumm[n]))
LorenzPoints[0].append(x)
LorenzPoints[1].append(y)
sumYs += y
maxX_Y = x - y
if maxX_Y > robinHoodIdx: robinHoodIdx = maxX_Y
giniIdx = 100 + (100 - 2 * sumYs)/n #Gini index
return [giniIdx, giniIdx/100, robinHoodIdx, LorenzPoints]
#Name of the data file including the directory, must be .csv
a=raw_input("Data file name? ")
datafile = open(a.strip(), 'r')
data = []
#opening and organizing the csv file
for row in datafile:
data.append(row.strip().split(','))
#Remove header line if present
c=raw_input("Is there a header row? y/n?")
if c.strip().lower() == ('y'):
del data[0]
while True :
#if I want the first column, that's index 0.
b=raw_input("What column to analyze?")
# Validate that the column input data is correct here. Otherwise it might be out of range, etc.
# Maybe try this. You might want more smarts in there, depending on your intent:
b = int(b.strip())
# If you expect the user to inpt "2" to mean the second column, you're going to use index 1 (list indexes are 0 based)
h=[[rowa[b-1] for rowa in data] for i in range(1)]
# prepares data for calculations
g=reduce(lambda x,y: x+y,h)
a=map(float, g)
a.sort()
print ('Organized data= ',a)
result = GRLC(a)
print 'Gini Index', result[0]
print 'Gini Coefficient', result[1]
print 'Robin Hood Index', result[2]

Categories