How to process pandas dataframe by row - python

I am working on an ID3 algorithm implementation. The issue I am running into is processing the branches from the new root attribute.
As the print output shows:
gain: 1.263221025628615 for Material
processing attribute Volume
processing branch 1 for Volume
processing branch 6 for Volume
processing branch 4 for Volume
processing branch 2 for Volume
processing branch 5 for Volume
processing branch 3 for Volume
gain: 0.6036978279454468 for Volume
attribute Venue has the max gain of 0.6036978279454468
removing Venue
new root Venue has branches [2 1]
The last part (Step 3) should filter the dataframe by the unique values of the selected attribute:
from numpy.core.defchararray import count
import pandas as pd
import numpy as np
from math import ceil, floor, log2
from sklearn.decomposition import PCA
from numpy import linalg as LA
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
def calculate_metrics(tp, tn, fn, p, n, fp):
    # calculate the accuracy, error rate, sensitivity, specificity, and precision
    # for the selected classifier in reference to the corresponding test set.
    accuracy = (tp + tn) / (p + n)
    error_rate = (fp + fn) / (p + n)
    sensitivity = tp / p
    precision = tp / (tp + fp)
    specificity = tn / n
    display_metrics(accuracy, error_rate, sensitivity, precision, specificity)

def display_metrics(accuracy, error_rate, sensitivity, precision, specificity):
    print(f'Accuracy: {accuracy}, Error_rate: {error_rate}, Sensitivity: {sensitivity}, Precision: {precision}, Specificity: {specificity}')

def mc(columnName, training_set):
    column = training_set[columnName]
    probs = column.value_counts(normalize=True)
    messageConveyed = -1 * np.sum(np.log2(probs) * probs)
    # print(f'mc {messageConveyed}')
    return messageConveyed

def isUnique(s):
    a = s.to_numpy()  # s.values (pandas < 0.24)
    return (a[0] == a).all()
def ID3(threshold, g):
    # use the training set to predict the test set.
    # use the Assignment 2--Training set to extract rules and test the quality of the
    # extracted rules against the Assignment 2--Test set for ID3.
    test_set = pd.read_csv("Assignment 2--Test set for ID3.csv")
    training_set = pd.read_csv("Assignment 2--Training set for ID3.csv")
    print('***********************************')
    print('TRAINING SET')
    print(training_set)
    print('***********************************')
    print('***********************************')
    print('TEST SET')
    print(test_set)
    print('***********************************')
    print(f'test_set: {test_set}')
    print(f'training_set: {training_set}')

    # Step 1 - Calculate MC (Message Conveyed) for the given data set in reference to the class attribute
    print(f'Step 1- Calculate MC (Message Conveyed) for the given data set in reference to the class attribute')
    # MC = -p1*log2(p1) - p2*log2(p2)
    # For n classes: MC = -p1*log2(p1) - p2*log2(p2) - ... - pn*log2(pn)

    # For each column calculate the gain.
    numberOfColumns = 0
    mcDictionary = {}
    print('***********************************')
    print('For each column calculate the gain.')
    for (columnName, columnData) in training_set.iteritems():
        messageConveyed = mc(columnName, training_set)
        mcDictionary.update({columnName: round(messageConveyed)})
        numberOfColumns += 1
    print('***********************************')
    print(f'numberOfColumns {numberOfColumns}')
    print(f'mcDictionary {mcDictionary}')

    # The column with the highest gain is the root.
    print(f'The column with the highest gain is the root.')
    values = mcDictionary.values()
    max_value = max(values)
    print(f'The max value is {max_value}')
    # print(f'The max value, {max_value}, is associated with column {columnWithMaximumInformationGain}')
    val_list = list(values)
    columnWithMaximumInformationGain = list(mcDictionary.keys())[list(mcDictionary.values()).index(max_value)]
    print(f'The max value, {max_value}, is associated with column {columnWithMaximumInformationGain}')

    # select the max value from the gain array
    # this is the new root
    root = columnWithMaximumInformationGain
    print(f'root is {root}')
    print("******************************************")
    print("************** ROOT ******************")
    print(f"TF is {root}**********************")
    print("******************************************")
    print(f'isUnique = {isUnique(training_set[root])}')
    if isUnique(training_set[root]):
        return

    # Step 2 - Repeat for every attribute
    print(f'Step 2 - Repeat for every attribute')
    # Loop 1
    attribute = ""
    maximum = 0
    for (F, columnData) in training_set.iteritems():
        print(f'processing attribute {F}')
        # Loop 2
        Total = 0
        uniques = training_set[F].unique()
        for k in uniques:
            print(f'processing branch {k} for {F}')
        # Calculate MC for column
        messageConveyed = mc(F, training_set)
        # Calculate the weight for F
        F_D = training_set[F].count()
        TF_D = training_set[root].count()
        weight = F_D / TF_D
        total = weight * messageConveyed
        gain = mcDictionary[root] - total
        if gain > maximum:
            attribute = F
            maximum = gain
        print(f"gain: {gain} for {F}")

    print(f'attribute {attribute} has the max gain of {gain}')
    print(f'removing {attribute}')
    root = attribute
    print(f'new root {root} has branches {training_set[root].unique()}')
    del training_set[attribute]

    # Step 3 - Examine dataset of each leaf
    print(f'')
def BayesClassifier(training_set, test_set):
    # use the assignment 2--training set for Bayes as the training set to classify
    # the records of the assignment 2--test set for Bayes
    X = test_set.values
    Y = training_set.values
    clf = GaussianNB()
    clf.fit(X, Y)

# prompt user to select either ID3 or Bayes classifier.
selection = "ID3"   # = input("Please enter your selection for either ID3 or Bayes classification: ")
threshold = 0.9     # = input("Please enter a threshold: ")
g = 0.05            # = input("Please enter a value for g: ")

if selection == "ID3":
    ID3(threshold, g)
if selection == "Bayes":
    BayesClassifier()
Given the training set:
Venue,color,Model,Category,Location,weight,Veriety,Material,Volume
2,6,4,4,4,2,2,1,1
1,2,4,4,4,1,6,2,6
1,5,4,4,4,1,2,1,6
2,4,4,4,4,2,6,1,4
1,4,4,4,4,1,2,2,2
2,4,3,3,3,2,1,1,1
1,5,2,1,4,1,6,2,6
1,2,3,3,3,1,2,1,6
2,6,4,4,4,2,3,1,1
1,4,4,4,4,1,2,1,6
1,5,4,4,4,1,2,1,4
The dataframe should be split into two frames by the unique values of Venue (1 and 2), i.e.
Venue, color, Model....
1
1
1
1
1
1
1
1
Venue, color, Model....
2
2
2
2
2
2
2
2
2
Can someone explain how this can be done? Thanks.

This seems to do it.
unique_values = training_set[root].unique()
datasets = []
for unique_value in unique_values:
    print(f'processing for file: {unique_value}')
    # keep only the rows whose root-attribute value equals this branch value
    subset = training_set[training_set[root] == unique_value]
    datasets.append(subset)
del training_set[root]
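For reference, pandas can do the same split in a single pass with groupby (run before the column is deleted); a minimal sketch, assuming training_set and root as defined above:
# one sub-frame per unique value of the root column (here Venue == 1 and Venue == 2)
branches = {value: frame for value, frame in training_set.groupby(root)}
df_venue_1 = branches[1]
df_venue_2 = branches[2]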

Related

return value to df after several operations

I run IPR outlier control on a relatively big dataframe df.
I perform IPR within subsets of the data, so I use a for loop.
How can I write the values back to the original df (>1,000,000 rows)?
months product brick units is_outlier
0 202104 abc 3 1.00 False
1 202104 abc 6 3.00 False
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

for product in df['product'].unique():
    for brick in df['brick'].unique():
        try:
            # Extract the units for the current product and brick
            data = df.loc[(df['product'] == product) & (df['brick'] == brick)]['units'].values
            # Scale the data
            scaler = StandardScaler()
            data_scaled = scaler.fit_transform(data.reshape(-1, 1))
            # Fit a linear regression model to the data
            reg = LinearRegression()
            reg.fit(np.arange(len(data_scaled)).reshape(-1, 1), data_scaled)
            # Calculate the residuals of the regression
            residuals = data_scaled - reg.predict(np.arange(len(data_scaled)).reshape(-1, 1))
            # Identify any observations with a residual larger than 2 standard deviations from the mean
            threshold = 2 * residuals.std()
            outliers = np.where(np.abs(residuals) > threshold)
            # Set the "is_outlier" column to True for the outliers in the current product/brick
            df.loc[(df['product'] == product) & (df['brick'] == brick) & (df.index.isin(outliers[0])), 'is_outlier'] = True
        except:
            pass
As @QuangHoang suggested, use groupby and apply your custom function:
def outlier(df):
    data = df.to_numpy().reshape((-1, 1))
    # Scale the data
    scaler = StandardScaler()
    data_scaled = scaler.fit_transform(data)
    # Fit a linear regression model to the data
    reg = LinearRegression()
    reg.fit(np.arange(len(data_scaled)).reshape(-1, 1), data_scaled)
    # Calculate the residuals of the regression
    residuals = data_scaled - reg.predict(np.arange(len(data_scaled)).reshape(-1, 1))
    # Identify any observations with a residual
    # larger than 2 standard deviations from the mean
    threshold = 2 * residuals.std()
    return np.ravel(np.abs(residuals) > threshold)

df['is_outlier'] = df.groupby(['product', 'brick'])['units'].transform(outlier)
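Note that transform (rather than apply) is the right tool here: for each (product, brick) group the function returns output of the same length as the group, and pandas aligns that output back to the original index, so the boolean flags can be assigned directly to the is_outlier column without any manual index bookkeeping.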

I'm having trouble with content based recommendation system prediction (NOT TF-IDF)

I keep getting the following error --> Exception: Dim. mismatch: Test data contains 3 items, while Content contains 1526 items. Please make sure the columns of test and content match.
Can someone help me? I've been working on this code for a few days. My entire body of code is below.
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from IESEGRecSys.Functions import *
from sklearn.model_selection import train_test_split
from surprise import KNNBasic
from surprise import Dataset, Reader
user_artists = pd.read_table("user_artists.dat")
user_artists['ratings'] = 0
user_artists.loc[user_artists['weight'] <= user_artists['weight'].quantile(1), 'ratings'] = 5
user_artists.loc[user_artists['weight'] < user_artists['weight'].quantile(0.8), 'ratings'] = 4
user_artists.loc[user_artists['weight'] < user_artists['weight'].quantile(0.6), 'ratings'] = 3
user_artists.loc[user_artists['weight'] < user_artists['weight'].quantile(0.4), 'ratings'] = 2
user_artists.loc[user_artists['weight'] < user_artists['weight'].quantile(0.2), 'ratings'] = 1
data = user_artists[['userID','artistID','ratings']]
data.head()
data.shape
# train-test split
train, test = train_test_split(data, test_size=0.3, random_state=42)
# reset index
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)
print(data.shape)
print(train.shape)
print(test.shape)
tags = pd.read_table("tags.dat", encoding = 'unicode_escape')
user_taggedartists = pd.read_table("user_taggedartists.dat")
user_tag_merged = pd.merge(user_taggedartists, tags, on="tagID", how="inner")
user_tag_merged_updated = pd.merge(user_tag_merged, data, on=(["userID","artistID"]),how="inner")
movie=user_tag_merged_updated
movie
data2 = data[['userID','artistID','ratings']]
# train-test split
train, test2 = train_test_split(data2, test_size=0.3, random_state=42)
# reset index
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)
print(data2.shape)
print(train.shape)
print(test.shape)
data_pivot2 = data2.pivot_table(index='artistID', values='ratings', columns='userID').fillna(0)
data_pivot2.head()
movie2 = [['tagID','artistID','year']]
movie2 = user_tag_merged_updated.pivot_table(index='tagID', values='year', columns='userID').fillna(0)
movie2.head()
# Content based as a function
from numpy.linalg import norm

def simil_cosine(a, b):
    return np.dot(a, b) / (norm(a) * norm(b))

def ContentBased(content_data, test_data, NN):
    cdata = content_data.reset_index(drop=True).copy()
    # store user and item dimensions
    dim = cdata.shape[0]
    nr_user = cdata.shape[0]
    if test_data.shape[1] != dim:
        raise Exception('Dim. mismatch: Test data contains {} items, while Content contains {} items. '
                        'Please make sure the columns of test and content match.'.format(test_data.shape[1], dim))
    # similarity matrices
    matrix = np.zeros(shape=(dim, dim), dtype=np.float)
    matrixNN = np.zeros(shape=(dim, dim), dtype=np.float)
    # compute similarity
    for i, row in cdata.iterrows():
        for j, col in cdata.iterrows():
            if i <= j: continue
            else: matrix[i][j] = simil_cosine(np.array(row), np.array(col))
    # copy values to other diagonal
    matrix = matrix + matrix.T - np.diag(np.diag(matrix))
    print('Similarity calculation done...')
    # mask all values that are not nearest neighbors
    cutoff = lambda x, cv: x if x >= cv else 0.0
    v_cutoff = np.vectorize(cutoff)
    for i in range(dim):
        crit_val = -np.sort(-matrix[i])[NN-1]
        matrixNN[i] = v_cutoff(matrix[i], crit_val)
    print('Nearest neighbor selection done...')
    # predict user-item ratings in test_data
    prediction = np.zeros(shape=(nr_user, dim), dtype=np.float)
    for i in range(nr_user):
        num = np.matmul(np.array(test_data.iloc[i, :]), matrixNN)
        denom = matrixNN.sum(axis=0)  # column sums
        prediction[i] = num / denom
    print('Prediction done...')
    # return DataFrame
    return pd.DataFrame(prediction, index=test_data.index, columns=test_data.columns)

cb_pred = ContentBased(movie2, data_pivot2, 10)
# Content Based as a Class
from numpy.linalg import norm

class ContentBased:
    def simil_cosine(self, a, b):
        return np.dot(a, b) / (norm(a) * norm(b))

    def __init__(self, NN):
        self.NN = NN

    def fit(self, content_data):
        cdata = content_data.reset_index(drop=True).copy()
        self.item_dim = cdata.shape[0]
        self.matrix = np.zeros(shape=(self.item_dim, self.item_dim), dtype=np.float)
        self.matrixNN = np.zeros(shape=(self.item_dim, self.item_dim), dtype=np.float)
        # compute similarity
        for i, row in cdata.iterrows():
            for j, col in cdata.iterrows():
                if i <= j: continue
                else: self.matrix[i][j] = self.simil_cosine(np.array(row), np.array(col))
        # copy values to other diagonal
        self.matrix = self.matrix + self.matrix.T - np.diag(np.diag(self.matrix))
        cutoff = lambda x, cv: x if x >= cv else 0.0
        v_cutoff = np.vectorize(cutoff)
        for i in range(self.item_dim):
            crit_val = -np.sort(-self.matrix[i])[self.NN-1]
            self.matrixNN[i] = v_cutoff(self.matrix[i], crit_val)

    def predict(self, test_data):
        if test_data.shape[1] != self.item_dim:
            raise Exception('Dim. mismatch: Test data contains {} items, while Content contains {} items. '
                            'Please make sure the columns of test and content match.'.format(test_data.shape[1], self.item_dim))
I keep getting the following error --> Exception: Dim. mismatch: Test data contains 3 items, while Content contains 1526 items. Please make sure the columns of test and content match.
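The shape check in ContentBased shows what it expects: test_data must have one column per row of the content data, i.e. both pivots have to be built over the same item set (artistID here). A minimal sketch of one way to line the two frames up, assuming the frames from the question (the frame names and feature columns below are illustrative choices, not the only valid ones):
# build both matrices over the same artist set so the dimensions agree
content_by_artist = user_tag_merged_updated.pivot_table(index='artistID', columns='tagID', values='year').fillna(0)
ratings_by_user = data2.pivot_table(index='userID', columns='artistID', values='ratings').fillna(0)
# keep only artists that appear in both frames, in the same order
common_artists = content_by_artist.index.intersection(ratings_by_user.columns)
content_by_artist = content_by_artist.loc[common_artists]
ratings_by_user = ratings_by_user[common_artists]
# now ratings_by_user.shape[1] == content_by_artist.shape[0], as the dim check requires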

Drawing equal samples from each class in stratified sampling

I have 1000 class-1 samples and 2500 class-2 samples. Naturally, when using sklearn's train_test_split(test_size=200, stratify=y), I get an imbalanced test set, since it preserves the class distribution of the original data set. However, I would like the test set to contain 100 class-1 and 100 class-2 samples.
How would I do it? Any suggestions would be appreciated.
Split Manually
A manual solution isn't that scary. Main steps explained:
Isolate the index of class-1 and class-2 rows.
Use np.random.permutation() to select random n1 and n2 test samples for class 1 and 2 respectively.
Use df.index.difference() to perform inverse selection for the train samples.
The code can easily be generalized to an arbitrary number of classes and arbitrary test-sample counts (just put n1/n2, idx1/idx2, etc. into lists and process them in loops), but that's out of the scope of the question itself.
Code
import numpy as np
from sklearn.model_selection import train_test_split
import pandas as pd
# data
df = pd.DataFrame(
    data={
        "label": np.array([1]*1000 + [2]*2500),
        # label 1 has value > 0, label 2 has value < 0
        "value": np.hstack([np.random.uniform(0, 1, 1000),
                            np.random.uniform(-1, 0, 2500)])
    }
)
df = df.sample(frac=1).reset_index(drop=True)
# sampling number for each class
n1 = 100
n2 = 100
# 1. get indexes and lengths for the classes respectively
idx1 = df.index.values[df["label"] == 1]
idx2 = df.index.values[df["label"] == 2]
len1 = len(idx1) # 1000
len2 = len(idx2) # 2500
# 2. draw index for test dataset
draw1 = np.random.permutation(len1)[:n1] # keep the first n1 entries to be selected
idx1_test = idx1[draw1]
draw2 = np.random.permutation(len2)[:n2]
idx2_test = idx2[draw2]
# combine the drawn indexes
idx_test = np.hstack([idx1_test, idx2_test])
# 3. derive index for train dataset
idx_train = df.index.difference(idx_test)
# split
df_train = df.loc[idx_train, :] # optional: .reset_index(drop=True)
df_test = df.loc[idx_test, :]
# len(df_train) = 3300
# len(df_test) = 200
# verify that no row was missing
idx_merged = np.hstack([df_train.index.values, df_test.index.values])
assert len(np.unique(idx_merged)) == 3500
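On newer pandas (1.1+), the same equal-per-class draw can also be written with groupby().sample(); a minimal sketch using the df defined above:
# draw 100 test rows from each class, then take the remaining rows as training data
df_test = df.groupby('label', group_keys=False).sample(n=100, random_state=42)
df_train = df.loc[df.index.difference(df_test.index)]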

Arima grid search function

I am writing a function that does a grid search for an ARIMA model based on the BIC criterion.
I am able to do the ARIMA grid search, but I am trying to update it to a SARIMA grid search.
The code looks like below; I just can't make it work.
import itertools
import numpy as np
import pandas as pd
import statsmodels.api as sm

p_min = 0
d_min = 0
q_min = 0
p_max = 4
d_max = 0
q_max = 4

# Initialize a DataFrame to store the results
results_bic = pd.DataFrame(index=['AR{}'.format(i) for i in range(p_min, p_max+1)],
                           columns=['MA{}'.format(i) for i in range(q_min, q_max+1)])

for p, d, q in itertools.product(range(p_min, p_max+1),
                                 range(d_min, d_max+1),
                                 range(q_min, q_max+1)):
    if p == 0 and d == 0 and q == 0:
        results_bic.loc['AR{}'.format(p), 'MA{}'.format(q)] = np.nan
        continue
    try:
        model = sm.tsa.SARIMAX(data, order=(p, d, q), trend="c",
                               seasonal_order=(p, d, q, 12))
        results = model.fit()
        results_bic.loc['AR{}'.format(p), 'MA{}'.format(q)] = results.bic
    except:
        continue

results_bic = results_bic[results_bic.columns].astype(float)
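One thing that stands out is that the seasonal order reuses the same (p, d, q) as the non-seasonal order, so the seasonal part is never searched independently. Below is a rough sketch, not a drop-in fix, of a SARIMA grid search over separate seasonal orders that keeps the lowest-BIC model; it assumes a series named data as in the question:
import itertools
import numpy as np
import statsmodels.api as sm

best_bic, best_order, best_seasonal = np.inf, None, None
p = d = q = range(0, 3)            # non-seasonal orders to try
P = D = Q = range(0, 2)            # seasonal orders to try
for order in itertools.product(p, d, q):
    for seasonal in itertools.product(P, D, Q):
        try:
            res = sm.tsa.SARIMAX(data, order=order, trend='c',
                                 seasonal_order=(*seasonal, 12)).fit(disp=False)
            if res.bic < best_bic:
                best_bic, best_order, best_seasonal = res.bic, order, seasonal
        except Exception:
            continue
print(best_order, best_seasonal, best_bic)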

Odd Results on Entropy Calculation

I am trying to write a function that properly calculates the entropy of a given dataset. However, I am getting very weird entropy values.
I am working under the understanding that all entropy values must fall between 0 and 1, yet I am consistently getting values above 2.
Note: I must use log base 2 for this.
Can someone explain why I am getting incorrect entropy results?
The dataset I am testing is the ecoli dataset from the UCI Machine Learning Repository
import numpy
import math

#################### DATA HANDLING LIBRARY ####################
def csv_to_array(file):
    # Open the file, and load it in, delimiting on the ',' for a comma separated value file
    data = open(file, 'r')
    data = numpy.loadtxt(data, delimiter=',')
    # Loop through the data in the array
    for index in range(len(data)):
        # Utilize a try catch to try and convert to float; if it can't convert to float, convert to 0
        try:
            data[index] = [float(x) for x in data[index]]
        except Exception:
            data[index] = 0
        except ValueError:
            data[index] = 0
    # Return the now type-formatted data
    return data
# Function that utilizes the numpy library to randomize the dataset.
def randomize_data(csv):
    csv = numpy.random.shuffle(csv)
    return csv

# Function to split the data into test, training set, and validation sets
def split_data(csv):
    # Call the randomize data function
    randomize_data(csv)
    # Grab the number of rows and calculate where to split
    num_rows = csv.shape[0]
    validation_split = int(num_rows * 0.10)
    training_split = int(num_rows * 0.72)
    testing_split = int(num_rows * 0.18)
    # Validation set as the first 10% of the data
    validation_set = csv[:validation_split]
    # Training set as the next 72%
    training_set = csv[validation_split:training_split + validation_split]
    # Testing set as the last 18%
    testing_set = csv[training_split + validation_split:]
    # Split the data into classes vs actual data
    training_cols = training_set.shape[1]
    testing_cols = testing_set.shape[1]
    validation_cols = validation_set.shape[1]
    training_classes = training_set[:, training_cols - 1]
    testing_classes = testing_set[:, testing_cols - 1]
    validation_classes = validation_set[:, validation_cols - 1]
    # Take the sets and remove the last (classification) column
    training_set = training_set[:-1]
    testing_set = testing_set[:-1]
    validation_set = validation_set[:-1]
    # Return the datasets
    return testing_set, testing_classes, training_set, training_classes, validation_set, validation_classes
#################### DATA HANDLING LIBRARY ####################
# This function returns the list of classes, and their associated weights (i.e. distributions)
# for a given dataset
def class_distribution(dataset):
    # Ensure the dataset is a numpy array
    dataset = numpy.asarray(dataset)
    # Collect # of total rows and columns, using numpy
    num_total_rows = dataset.shape[0]
    num_columns = dataset.shape[1]
    # Create a numpy array of just the classes
    classes = dataset[:, num_columns - 1]
    # Use numpy.unique to remove duplicates
    classes = numpy.unique(classes)
    # Create an empty array for the class weights
    class_weights = []
    # Loop through the classes one by one
    for aclass in classes:
        # Create storage variables
        total = 0
        weight = 0
        # Now loop through the dataset
        for row in dataset:
            # If the class of the row is equal to the current class you are evaluating, increase the total
            if numpy.array_equal(aclass, row[-1]):
                total = total + 1
            # If not, continue
            else:
                continue
        # Divide the # of occurrences by total rows
        weight = float(total / num_total_rows)
        # Add that weight to the list of class weights
        class_weights.append(weight)
    # Turn the weights into a numpy array
    class_weights = numpy.asarray(class_weights)
    # Return the array
    return classes, class_weights
# This function returns the entropy for a given dataset
# Can be used across an entire csv, or just for a column of data (feature)
def get_entropy(dataset):
    # Set initial entropy
    entropy = 0.0
    # Determine the classes and their frequencies (weights) of the dataset
    classes, class_freq = class_distribution(dataset)
    # Utilize numpy's quicksort to test the most occurring class first
    numpy.sort(class_freq)
    # Determine the max entropy for the dataset
    max_entropy = math.log(len(classes), 2)
    print("MAX ENTROPY FOR THIS DATASET: ", max_entropy)
    # Loop through the frequencies and use the given formula to calculate entropy
    # For...Each simulates the sequence operator
    for freq in class_freq:
        entropy += float(-freq * math.log(freq, 2))
    # Return the entropy value
    return entropy

def main():
    ecol = csv_to_array('ecoli.csv')
    testing_set, testing_classes, training_set, training_classes, validation_set, validation_classes = split_data(ecol)
    entropy = get_entropy(ecol)
    print(entropy)

main()
The following function was used to calculate Entropy:
# Function to return Shannon's Entropy
def entropy(attributes, dataset, targetAttr):
    freq = {}
    entropy = 0.0
    index = 0
    # Find the index of the target attribute
    for item in attributes:
        if targetAttr == item:
            break
        else:
            index = index + 1
    index = index - 1
    # Count the occurrences of each value of the target attribute
    for item in dataset:
        if (item[index]) in freq:
            # Increase the count for this value
            freq[item[index]] += 1.0
        else:
            # Initialize the count for this value
            freq[item[index]] = 1.0
    # Sum -p*log2(p) over the value frequencies
    for freq_value in freq.values():
        entropy = entropy + (-freq_value / len(dataset)) * math.log(freq_value / len(dataset), 2)
    return entropy
As @MattTimmermans had indicated, the entropy value is actually contingent on the number of classes. For strictly 2 classes it is contained in the 0 to 1 (inclusive) range; for more than 2 classes (which is what was being tested here), the general formula -Σ p_i*log2(p_i) applies (converted to Pythonic code above) and its maximum is log2(number of classes). This post explains the mathematics and calculations in a bit more detail.
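As a quick numerical check of that point (an illustration, assuming nothing beyond the formula above): with k equally likely classes the entropy is exactly log2(k), so anything above 1 simply means more than two classes are present.
import math

def shannon_entropy(probs):
    # probs are class probabilities that sum to 1
    return -sum(p * math.log(p, 2) for p in probs if p > 0)

print(shannon_entropy([0.5, 0.5]))     # 1.0 -> two classes
print(shannon_entropy([0.125] * 8))    # 3.0 -> eight classes, log2(8)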
