How to process pandas dataframe by row - python
I am working on an ID3 algorithm implementation. The issue I am running into is processing the branches from the new root attribute. As the print output shows:
gain: 1.263221025628615 for Material
processing attribute Volume
processing branch 1 for Volume
processing branch 6 for Volume
processing branch 4 for Volume
processing branch 2 for Volume
processing branch 5 for Volume
processing branch 3 for Volume
gain: 0.6036978279454468 for Volume
attribute Venue has the max gain of 0.6036978279454468
removing Venue
new root Venue has branches [2 1]
The last part of Step 3 should filter the dataframe by the unique values of the selected attribute. Here is the full code:
import pandas as pd
import numpy as np
from math import ceil, floor, log2
from sklearn.decomposition import PCA
from numpy import linalg as LA
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
def calculate_metrics(tp, tn, fn, p, n, fp):
    # Calculate the accuracy, error rate, sensitivity, specificity, and precision
    # for the selected classifier in reference to the corresponding test set.
    accuracy = (tp + tn) / (p + n)
    error_rate = (fp + fn) / (p + n)
    sensitivity = tp / p
    precision = tp / (tp + fp)
    specificity = tn / n
    display_metrics(accuracy, error_rate, sensitivity, precision, specificity)
def display_metrics(accuracy, error_rate, sensitivity, precision, specificity):
    print(f'Accuracy: {accuracy}, Error_rate: {error_rate}, Sensitivity: {sensitivity}, Precision: {precision}, Specificity: {specificity}')
def mc(columnName, training_set):
    # Message conveyed (Shannon entropy) of a column: -sum(p * log2(p)) over the value probabilities.
    column = training_set[columnName]
    probs = column.value_counts(normalize=True)
    messageConveyed = -1 * np.sum(np.log2(probs) * probs)
    # print(f'mc {messageConveyed}')
    return messageConveyed
def isUnique(s):
    # True when every value in the Series equals the first value.
    a = s.to_numpy()  # s.values (pandas < 0.24)
    return (a[0] == a).all()
def ID3(threshold, g):
    # Use the training set to predict the test set.
    # Use the "Assignment 2--Training set for ID3" to extract rules and test the quality
    # of the extracted rules against the "Assignment 2--Test set for ID3".
    test_set = pd.read_csv("Assignment 2--Test set for ID3.csv")
    training_set = pd.read_csv("Assignment 2--Training set for ID3.csv")
    print('***********************************')
    print('TRAINING SET')
    print(training_set)
    print('***********************************')
    print('***********************************')
    print('TEST SET')
    print(test_set)
    print('***********************************')
    print(f'test_set: {test_set}')
    print(f'training_set: {training_set}')
    # Step 1 - Calculate MC (Message Conveyed) for the given data set in reference to the class attribute
    print('Step 1 - Calculate MC (Message Conveyed) for the given data set in reference to the class attribute')
    # MC = -p1*log2(p1) - p2*log2(p2)
    # For n classes: MC = -p1*log2(p1) - p2*log2(p2) - ... - pn*log2(pn)
    # For each column calculate the gain.
    numberOfColumns = 0
    mcDictionary = {}
    print('***********************************')
    print('For each column calculate the gain.')
    for (columnName, columnData) in training_set.items():  # .iteritems() was removed in pandas 2.0
        messageConveyed = mc(columnName, training_set)
        mcDictionary.update({columnName: round(messageConveyed)})
        numberOfColumns += 1
    print('***********************************')
    print(f'numberOfColumns {numberOfColumns}')
    print(f'mcDictionary {mcDictionary}')
    # The column with the highest gain is the root.
    print('The column with the highest gain is the root.')
    values = mcDictionary.values()
    max_value = max(values)
    print(f'The max value is {max_value}')
    columnWithMaximumInformationGain = list(mcDictionary.keys())[list(mcDictionary.values()).index(max_value)]
    print(f'The max value, {max_value}, is associated with column {columnWithMaximumInformationGain}')
    # Select the max value from the gain array; this is the new root.
    root = columnWithMaximumInformationGain
    print(f'root is {root}')
    print("******************************************")
    print("************** ROOT ******************")
    print(f"TF is {root}**********************")
    print("******************************************")
    print(f'isUnique = {isUnique(training_set[root])}')
    if isUnique(training_set[root]):
        return
    # Step 2 - Repeat for every attribute
    print('Step 2 - Repeat for every attribute')
    # Loop 1
    attribute = ""
    maximum = 0
    for (F, columnData) in training_set.items():
        print(f'processing attribute {F}')
        # Loop 2
        total = 0
        uniques = training_set[F].unique()
        for k in uniques:
            print(f'processing branch {k} for {F}')
            # Calculate MC for the column
            messageConveyed = mc(F, training_set)
            # Calculate the weight for F
            F_D = training_set[F].count()
            TF_D = training_set[root].count()
            weight = F_D / TF_D
            total = weight * messageConveyed
        gain = mcDictionary[root] - total
        if gain > maximum:
            attribute = F
            maximum = gain
        print(f"gain: {gain} for {F}")
    print(f'attribute {attribute} has the max gain of {maximum}')
    print(f'removing {attribute}')
    root = attribute
    print(f'new root {root} has branches {training_set[root].unique()}')
    del training_set[attribute]
    # Step 3 - Examine the dataset of each leaf
    print('')
def BayesClassifier(training_set, test_set):
    # Use the Assignment 2 training set for Bayes to classify the records of the
    # Assignment 2 test set for Bayes.
    # Assumption: the class label is the last column of each set.
    X = training_set.iloc[:, :-1].values
    Y = training_set.iloc[:, -1].values
    clf = GaussianNB()
    clf.fit(X, Y)
    return clf.predict(test_set.iloc[:, :-1].values)
# Prompt the user to select either the ID3 or the Bayes classifier.
selection = "ID3"  # = input("Please enter your selection for either ID3 or Bayes classification: ")
threshold = 0.9    # = input("Please enter a threshold: ")
g = 0.05           # = input("Please enter a value for g: ")
if selection == "ID3":
    ID3(threshold, g)
if selection == "Bayes":
    # The Bayes training and test sets still need to be read in (pd.read_csv) and passed here.
    BayesClassifier(training_set, test_set)
Given the training set
Venue,color,Model,Category,Location,weight,Veriety,Material,Volume
2,6,4,4,4,2,2,1,1
1,2,4,4,4,1,6,2,6
1,5,4,4,4,1,2,1,6
2,4,4,4,4,2,6,1,4
1,4,4,4,4,1,2,2,2
2,4,3,3,3,2,1,1,1
1,5,2,1,4,1,6,2,6
1,2,3,3,3,1,2,1,6
2,6,4,4,4,2,3,1,1
1,4,4,4,4,1,2,1,6
1,5,4,4,4,1,2,1,4
The data frame should be split into two frames by the Venue values 1 and 2.
i.e.
Venue, color, Model....
1
1
1
1
1
1
1
1
Venue, color, Model....
2
2
2
2
2
2
2
2
2
Can someone explain how this can be done? Thanks.
This seems to do it.
unique_values = training_set[root].unique()
datasets = []
for unique_value in unique_values:
    print(f'processing for value: {unique_value}')
    # keep only the rows whose value for the selected attribute equals this branch value
    datasets.append(training_set[training_set[root] == unique_value])
del training_set[root]
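For what it's worth, pandas can produce the same per-value split in a single pass with groupby. A minimal sketch of that alternative, using a toy frame built from the Venue and color columns of the sample data above (the list of sub-frames mirrors the datasets list in the snippet):

import pandas as pd

# Toy frame with the same Venue/color values as the sample training set above.
training_set = pd.DataFrame({
    'Venue': [2, 1, 1, 2, 1, 2, 1, 1, 2, 1, 1],
    'color': [6, 2, 5, 4, 4, 4, 5, 2, 6, 4, 5],
})

# One sub-frame per unique Venue value (here: one for the 1s, one for the 2s).
datasets = [group for _, group in training_set.groupby('Venue')]

for frame in datasets:
    print(frame, end='\n\n')

Each element of datasets is then a DataFrame holding only the rows for one branch value.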
Related
return value to df after several operations
I run IPR outlier control for a relatively big dataframe df (more than 1,000,000 rows). I perform IPR within subsets of the data, so I use a for loop. How can I return the value to the original df?

   months product  brick  units  is_outlier
0  202104     abc      3   1.00       False
1  202104     abc      6   3.00       False

for product in df['product'].unique():
    for brick in df['brick'].unique():
        try:
            # Extract the units for the current product and brick
            data = df.loc[(df['product'] == product) & (df['brick'] == brick)]['units'].values
            # Scale the data
            scaler = StandardScaler()
            data_scaled = scaler.fit_transform(data.reshape(-1, 1))
            # Fit a linear regression model to the data
            reg = LinearRegression()
            reg.fit(np.arange(len(data_scaled)).reshape(-1, 1), data_scaled)
            # Calculate the residuals of the regression
            residuals = data_scaled - reg.predict(np.arange(len(data_scaled)).reshape(-1, 1))
            # Identify any observations with a residual larger than 2 standard deviations from the mean
            threshold = 2 * residuals.std()
            outliers = np.where(np.abs(residuals) > threshold)
            # Set the "is_outlier" column to True for the outliers in the current product
            df.loc[(df['product'] == product) & (df['brick'] == brick) & (df.index.isin(outliers[0])), 'is_outlier'] = True
        except:
            pass
As @QuangHoang suggested, use groupby and apply your custom function:

def outlier(df):
    data = df.to_numpy().reshape((-1, 1))
    # Scale the data
    scaler = StandardScaler()
    data_scaled = scaler.fit_transform(data)
    # Fit a linear regression model to the data
    reg = LinearRegression()
    reg.fit(np.arange(len(data_scaled)).reshape(-1, 1), data_scaled)
    # Calculate the residuals of the regression
    residuals = data_scaled - reg.predict(np.arange(len(data_scaled)).reshape(-1, 1))
    # Identify any observations with a residual
    # larger than 2 standard deviations from the mean
    threshold = 2 * residuals.std()
    return np.ravel(np.abs(residuals) > threshold)

df['is_outlier'] = df.groupby(['product', 'brick'])['units'].transform(outlier)
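For completeness, here is a minimal self-contained run of that grouped version, with the imports the snippet assumes and a small made-up frame (the 'xyz' product and the unit values are invented purely for illustration):

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

def outlier(units):
    # Same logic as above: flag points whose residual from a linear trend fitted
    # to the scaled series of one group exceeds 2 standard deviations of the residuals.
    data = units.to_numpy().reshape(-1, 1)
    data_scaled = StandardScaler().fit_transform(data)
    x = np.arange(len(data_scaled)).reshape(-1, 1)
    reg = LinearRegression().fit(x, data_scaled)
    residuals = data_scaled - reg.predict(x)
    return np.ravel(np.abs(residuals) > 2 * residuals.std())

df = pd.DataFrame({
    'months': [202104] * 6,
    'product': ['abc'] * 3 + ['xyz'] * 3,   # 'xyz' is made up for the example
    'brick': [3, 3, 3, 6, 6, 6],
    'units': [1.0, 3.0, 50.0, 2.0, 2.5, 2.2],
})
df['is_outlier'] = df.groupby(['product', 'brick'])['units'].transform(outlier)
print(df)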
I'm having trouble with content-based recommendation system prediction (not TF-IDF)
I keep getting the following error:

Exception: Dim. mismatch: Test data contains 3 items, while Content contains 1526 items. Please make sure the columns of test and content match.

Can someone help me? I've been working on this code for a few days. My entire body of code is below.

import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
from IESEGRecSys.Functions import *
from sklearn.model_selection import train_test_split
from surprise import KNNBasic
from surprise import Dataset, Reader

user_artists = pd.read_table("user_artists.dat")
user_artists['ratings'] = 0
user_artists.loc[user_artists['weight'] <= user_artists['weight'].quantile(1), 'ratings'] = 5
user_artists.loc[user_artists['weight'] < user_artists['weight'].quantile(0.8), 'ratings'] = 4
user_artists.loc[user_artists['weight'] < user_artists['weight'].quantile(0.6), 'ratings'] = 3
user_artists.loc[user_artists['weight'] < user_artists['weight'].quantile(0.4), 'ratings'] = 2
user_artists.loc[user_artists['weight'] < user_artists['weight'].quantile(0.2), 'ratings'] = 1

data = user_artists[['userID', 'artistID', 'ratings']]
data.head()
data.shape

# train-test split
train, test = train_test_split(data, test_size=0.3, random_state=42)
# reset index
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)
print(data.shape)
print(train.shape)
print(test.shape)

tags = pd.read_table("tags.dat", encoding='unicode_escape')
user_taggedartists = pd.read_table("user_taggedartists.dat")
user_tag_merged = pd.merge(user_taggedartists, tags, on="tagID", how="inner")
user_tag_merged_updated = pd.merge(user_tag_merged, data, on=["userID", "artistID"], how="inner")
movie = user_tag_merged_updated
movie

data2 = data[['userID', 'artistID', 'ratings']]
# train-test split
train, test2 = train_test_split(data2, test_size=0.3, random_state=42)
# reset index
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)
print(data2.shape)
print(train.shape)
print(test.shape)

data_pivot2 = data2.pivot_table(index='artistID', values='ratings', columns='userID').fillna(0)
data_pivot2.head()

movie2 = [['tagID', 'artistID', 'year']]
movie2 = user_tag_merged_updated.pivot_table(index='tagID', values='year', columns='userID').fillna(0)
movie2.head()

# Content based as a function
from numpy.linalg import norm

def simil_cosine(a, b):
    return np.dot(a, b) / (norm(a) * norm(b))

def ContentBased(content_data, test_data, NN):
    cdata = content_data.reset_index(drop=True).copy()
    # store user and item dimensions
    dim = cdata.shape[0]
    nr_user = cdata.shape[0]
    if test_data.shape[1] != dim:
        raise Exception('Dim. mismatch: Test data contains {} items, while Content contains {} items. Please make sure the columns of test and content match.'.format(test_data.shape[1], dim))
    # similarity matrices
    matrix = np.zeros(shape=(dim, dim), dtype=np.float)
    matrixNN = np.zeros(shape=(dim, dim), dtype=np.float)
    # compute similarity
    for i, row in cdata.iterrows():
        for j, col in cdata.iterrows():
            if i <= j:
                continue
            else:
                matrix[i][j] = simil_cosine(np.array(row), np.array(col))
    # copy values to other diagonal
    matrix = matrix + matrix.T - np.diag(np.diag(matrix))
    print('Similarity calculation done...')
    # mask all values that are not nearest neighbors
    cutoff = lambda x, cv: x if x >= cv else 0.0
    v_cutoff = np.vectorize(cutoff)
    for i in range(dim):
        crit_val = -np.sort(-matrix[i])[NN - 1]
        matrixNN[i] = v_cutoff(matrix[i], crit_val)
    print('Nearest neighbor selection done...')
    # predict user-item ratings in test_data
    prediction = np.zeros(shape=(nr_user, dim), dtype=np.float)
    for i in range(nr_user):
        num = np.matmul(np.array(test_data.iloc[i, :]), matrixNN)
        denom = matrixNN.sum(axis=0)  # column sums
        prediction[i] = num / denom
    print('Prediction done...')
    # return DataFrame
    return pd.DataFrame(prediction, index=test_data.index, columns=test_data.columns)

cb_pred = ContentBased(movie2, data_pivot2, 10)

# Content Based as a Class
from numpy.linalg import norm

class ContentBased:
    def simil_cosine(self, a, b):
        return np.dot(a, b) / (norm(a) * norm(b))

    def __init__(self, NN):
        self.NN = NN

    def fit(self, content_data):
        cdata = content_data.reset_index(drop=True).copy()
        self.item_dim = cdata.shape[0]
        self.matrix = np.zeros(shape=(self.item_dim, self.item_dim), dtype=np.float)
        self.matrixNN = np.zeros(shape=(self.item_dim, self.item_dim), dtype=np.float)
        # compute similarity
        for i, row in cdata.iterrows():
            for j, col in cdata.iterrows():
                if i <= j:
                    continue
                else:
                    self.matrix[i][j] = self.simil_cosine(np.array(row), np.array(col))
        # copy values to other diagonal
        self.matrix = self.matrix + self.matrix.T - np.diag(np.diag(self.matrix))
        cutoff = lambda x, cv: x if x >= cv else 0.0
        v_cutoff = np.vectorize(cutoff)
        for i in range(self.item_dim):
            crit_val = -np.sort(-self.matrix[i])[self.NN - 1]
            self.matrixNN[i] = v_cutoff(self.matrix[i], crit_val)

    def predict(self, test_data):
        if test_data.shape[1] != self.item_dim:
            raise Exception('Dim. mismatch: Test data contains {} items, while Content contains {} items. Please make sure the columns of test and content match.'.format(test_data.shape[1], self.item_dim))

I keep getting the following error:

Exception: Dim. mismatch: Test data contains 3 items, while Content contains 1526 items. Please make sure the columns of test and content match.
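No accepted answer is recorded here, but the check inside ContentBased requires test_data.shape[1] (columns of the ratings matrix) to equal content_data.shape[0] (rows of the content matrix), and in the code above movie2 is indexed by tagID while data_pivot2 is indexed by artistID, so the two dimensions can never line up. A hedged sketch of one way to build matrices that do satisfy the check, reusing the question's variables (the tag-count feature choice is an assumption, not part of the original assignment):

# Content features per artist: artists as rows, tags as columns.
# Assumption: counting how often a tag was applied to an artist is an acceptable content representation.
artist_content = (
    user_taggedartists
    .pivot_table(index='artistID', columns='tagID', values='year', aggfunc='count')
    .fillna(0)
)

# Ratings matrix: users as rows, artists as columns, so its column count can match
# the row count of artist_content once both are restricted to the shared artists.
ratings_matrix = data2.pivot_table(index='userID', columns='artistID', values='ratings').fillna(0)

shared = artist_content.index.intersection(ratings_matrix.columns)
artist_content = artist_content.loc[shared]
ratings_matrix = ratings_matrix[shared]

print(artist_content.shape[0], ratings_matrix.shape[1])  # these two numbers must be equal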
Drawing equal samples from each class in stratified sampling
So I have 1000 class 1 samples and 2500 class 2 samples. Naturally, when using sklearn's train_test_split(test_size=200, stratify=y), I get an imbalanced test set, since it preserves the class distribution of the original data set. However, I would like the test set to contain 100 class 1 and 100 class 2 samples. How would I do it? Any suggestions would be appreciated.
Split Manually

A manual solution isn't that scary. Main steps explained:

1. Isolate the index of class-1 and class-2 rows.
2. Use np.random.permutation() to select random n1 and n2 test samples for class 1 and 2 respectively.
3. Use df.index.difference() to perform inverse selection for the train samples.

The code can easily be generalized to an arbitrary number of classes and arbitrary numbers to be selected as test data (just put n1/n2, idx1/idx2, etc. into lists and process them in loops). But that's out of the scope of the question itself.

Code

import numpy as np
from sklearn.model_selection import train_test_split
import pandas as pd

# data
df = pd.DataFrame(
    data={
        "label": np.array([1] * 1000 + [2] * 2500),
        # label 1 has value > 0, label 2 has value < 0
        "value": np.hstack([np.random.uniform(0, 1, 1000), np.random.uniform(-1, 0, 2500)])
    }
)
df = df.sample(frac=1).reset_index(drop=True)

# sampling number for each class
n1 = 100
n2 = 100

# 1. get indexes and lengths for the classes respectively
idx1 = df.index.values[df["label"] == 1]
idx2 = df.index.values[df["label"] == 2]
len1 = len(idx1)  # 1000
len2 = len(idx2)  # 2500

# 2. draw index for test dataset
draw1 = np.random.permutation(len1)[:n1]  # keep the first n1 entries to be selected
idx1_test = idx1[draw1]
draw2 = np.random.permutation(len2)[:n2]
idx2_test = idx2[draw2]

# combine the drawn indexes
idx_test = np.hstack([idx1_test, idx2_test])

# 3. derive index for train dataset
idx_train = df.index.difference(idx_test)

# split
df_train = df.loc[idx_train, :]  # optional: .reset_index(drop=True)
df_test = df.loc[idx_test, :]
# len(df_train) = 3300
# len(df_test) = 200

# verify that no row was missing
idx_merged = np.hstack([df_train.index.values, df_test.index.values])
assert len(np.unique(idx_merged)) == 3500
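As a side note, newer pandas versions (1.1+) can draw the balanced test set directly with groupby(...).sample(); a minimal sketch of that alternative on the same toy data:

import numpy as np
import pandas as pd

# Same toy data as in the manual solution above.
df = pd.DataFrame({
    "label": np.array([1] * 1000 + [2] * 2500),
    "value": np.hstack([np.random.uniform(0, 1, 1000), np.random.uniform(-1, 0, 2500)]),
})

# Draw 100 rows per class for the test set (groupby.sample requires pandas >= 1.1).
df_test = df.groupby("label", group_keys=False).sample(n=100, random_state=42)

# Everything not drawn becomes the training set.
df_train = df.drop(df_test.index)

print(df_test["label"].value_counts())  # 100 of each class
print(len(df_train), len(df_test))      # 3300, 200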
ARIMA grid search function
I am writing a function that does a grid search for an ARIMA model based on the BIC criterion. I am able to do an ARIMA grid search, but I am trying to update it to a SARIMA grid search. The code looks like below; I just can't make it work.

import itertools

p_min = 0
d_min = 0
q_min = 0
p_max = 4
d_max = 0
q_max = 4

# Initialize a DataFrame to store the results
results_bic = pd.DataFrame(index=['AR{}'.format(i) for i in range(p_min, p_max + 1)],
                           columns=['MA{}'.format(i) for i in range(q_min, q_max + 1)])

for p, d, q in itertools.product(range(p_min, p_max + 1),
                                 range(d_min, d_max + 1),
                                 range(q_min, q_max + 1)):
    if p == 0 and d == 0 and q == 0:
        results_bic.loc['AR{}'.format(p), 'MA{}'.format(q)] = np.nan
        continue
    try:
        model = sm.tsa.SARIMAX(data, order=(p, d, q), trend="c",
                               seasonal_order=(p, d, q, 12))
        results = model.fit()
        results_bic.loc['AR{}'.format(p), 'MA{}'.format(q)] = results.bic
    except:
        continue

results_bic = results_bic[results_bic.columns].astype(float)
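No answer is recorded here, but the usual approach is to search the seasonal (P, D, Q) order separately from the non-seasonal (p, d, q) order instead of reusing the same triple for both. A hedged sketch under that assumption, with sm and data taken to be defined as in the question and the search ranges chosen arbitrarily:

import itertools
import numpy as np
import pandas as pd
import statsmodels.api as sm

# Assumed search ranges; adjust to taste.
p = d = q = range(0, 3)   # non-seasonal orders
P = D = Q = range(0, 2)   # seasonal orders
m = 12                    # seasonal period

rows = []
for order in itertools.product(p, d, q):
    for seasonal in itertools.product(P, D, Q):
        try:
            # data: the time series from the question (assumed to be defined).
            model = sm.tsa.SARIMAX(data, order=order,
                                   seasonal_order=seasonal + (m,),
                                   trend="c")
            res = model.fit(disp=False)
            rows.append({"order": order, "seasonal_order": seasonal + (m,), "bic": res.bic})
        except Exception:
            continue

results = pd.DataFrame(rows).sort_values("bic")
print(results.head())  # lowest-BIC configurations first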
Odd Results on Entropy Calculation
I am trying to write a function that properly calculates the entropy of a given dataset. However, I am getting very weird entropy values. I am following the understanding that all entropy calculations must fall between 0 and 1, yet I am consistently getting values above 2.

Note: I must use log base 2 for this.

Can someone explain why I am yielding incorrect entropy results? The dataset I am testing is the ecoli dataset from the UCI Machine Learning Repository.

import numpy
import math

#################### DATA HANDLING LIBRARY ####################
def csv_to_array(file):
    # Open the file, and load it in delimiting on the ',' for a comma separated value file
    data = open(file, 'r')
    data = numpy.loadtxt(data, delimiter=',')
    # Loop through the data in the array
    for index in range(len(data)):
        # Utilize a try catch to try and convert to float; if it can't convert to float, convert to 0
        try:
            data[index] = [float(x) for x in data[index]]
        except Exception:
            data[index] = 0
        except ValueError:
            data[index] = 0
    # Return the now type-formatted data
    return data

# Function that utilizes the numpy library to randomize the dataset.
def randomize_data(csv):
    csv = numpy.random.shuffle(csv)
    return csv

# Function to split the data into test, training set, and validation sets
def split_data(csv):
    # Call the randomize data function
    randomize_data(csv)
    # Grab the number of rows and calculate where to split
    num_rows = csv.shape[0]
    validation_split = int(num_rows * 0.10)
    training_split = int(num_rows * 0.72)
    testing_split = int(num_rows * 0.18)
    # Validation set as the first 10% of the data
    validation_set = csv[:validation_split]
    # Training set as the next 72%
    training_set = csv[validation_split:training_split + validation_split]
    # Testing set as the last 18%
    testing_set = csv[training_split + validation_split:]
    # Split the data into classes vs actual data
    training_cols = training_set.shape[1]
    testing_cols = testing_set.shape[1]
    validation_cols = validation_set.shape[1]
    training_classes = training_set[:, training_cols - 1]
    testing_classes = testing_set[:, testing_cols - 1]
    validation_classes = validation_set[:, validation_cols - 1]
    # Take the sets and remove the last (classification) column
    training_set = training_set[:-1]
    testing_set = testing_set[:-1]
    validation_set = validation_set[:-1]
    # Return the datasets
    return testing_set, testing_classes, training_set, training_classes, validation_set, validation_classes
#################### DATA HANDLING LIBRARY ####################

# This function returns the list of classes, and their associated weights (i.e. distributions)
# for a given dataset
def class_distribution(dataset):
    # Ensure the dataset is a numpy array
    dataset = numpy.asarray(dataset)
    # Collect # of total rows and columns, using numpy
    num_total_rows = dataset.shape[0]
    num_columns = dataset.shape[1]
    # Create a numpy array of just the classes
    classes = dataset[:, num_columns - 1]
    # Use numpy.unique to remove duplicates
    classes = numpy.unique(classes)
    # Create an empty array for the class weights
    class_weights = []
    # Loop through the classes one by one
    for aclass in classes:
        # Create storage variables
        total = 0
        weight = 0
        # Now loop through the dataset
        for row in dataset:
            # If the class of the row is equal to the current class you are evaluating, increase the total
            if numpy.array_equal(aclass, row[-1]):
                total = total + 1
            # If not, continue
            else:
                continue
        # Divide the # of occurrences by total rows
        weight = float((total / num_total_rows))
        # Add that weight to the list of class weights
        class_weights.append(weight)
    # Turn the weights into a numpy array
    class_weights = numpy.asarray(class_weights)
    # Return the array
    return classes, class_weights

# This function returns the entropy for a given dataset
# Can be used across an entire csv, or just for a column of data (feature)
def get_entropy(dataset):
    # Set initial entropy
    entropy = 0.0
    # Determine the classes and their frequencies (weights) of the dataset
    classes, class_freq = class_distribution(dataset)
    # Utilize numpy's quicksort to test the most occurring class first
    numpy.sort(class_freq)
    # Determine the max entropy for the dataset
    max_entropy = math.log(len(classes), 2)
    print("MAX ENTROPY FOR THIS DATASET: ", max_entropy)
    # Loop through the frequencies and use the given formula to calculate entropy
    # For...Each simulates the sequence operator
    for freq in class_freq:
        entropy += float(-freq * math.log(freq, 2))
    # Return the entropy value
    return entropy

def main():
    ecol = csv_to_array('ecoli.csv')
    testing_set, testing_classes, training_set, training_classes, validation_set, validation_classes = split_data(ecol)
    entropy = get_entropy(ecol)
    print(entropy)

main()
The following function was used to calculate entropy:

# Function to return Shannon's Entropy
def entropy(attributes, dataset, targetAttr):
    freq = {}
    entropy = 0.0
    index = 0
    for item in attributes:
        if (targetAttr == item):
            break
        else:
            index = index + 1
    index = index - 1
    for item in dataset:
        if ((item[index]) in freq):
            # Increase the count for this value
            freq[item[index]] += 1.0
        else:
            # Initialize the count for this value to 1
            freq[item[index]] = 1.0
    for freq in freq.values():
        entropy = entropy + (-freq / len(dataset)) * math.log(freq / len(dataset), 2)
    return entropy

As @MattTimmermans had indicated, entropy's value is actually contingent on the number of classes. For strictly 2 classes, it is contained in the 0 to 1 (inclusive) range. However, for more than 2 classes (which is what was being tested), entropy is calculated with a different formula (converted to Pythonic code above). This post here explains those mathematics and calculations in a bit more detail.
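To make the point concrete, here is a small sketch showing that a uniform distribution over k classes has entropy log2(k), which exceeds 1 as soon as k > 2 (toy probabilities, not the ecoli data):

import numpy as np

def shannon_entropy(probs):
    # H = -sum(p * log2(p)), ignoring zero-probability classes.
    probs = np.asarray(probs, dtype=float)
    probs = probs[probs > 0]
    return float(-np.sum(probs * np.log2(probs)))

print(shannon_entropy([0.5, 0.5]))                # 1.0  (2 classes: bounded by 1)
print(shannon_entropy([0.25, 0.25, 0.25, 0.25]))  # 2.0  (4 classes: bounded by log2(4))
print(shannon_entropy([1/8] * 8))                 # 3.0  (8 classes: bounded by log2(8))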