I keep getting the following error --> Exception: Dim. mismatch: Test data contains 3 items, while Content contains 1526 items. Please make sure the columns of test and content match.
Can someone help me? I've been working on this code for a few days. My entire body of code is below.
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from IESEGRecSys.Functions import *
from sklearn.model_selection import train_test_split
from surprise import KNNBasic
from surprise import Dataset, Reader
user_artists = pd.read_table("user_artists.dat")
user_artists['ratings'] = 0
user_artists.loc[user_artists['weight'] <= user_artists['weight'].quantile(1), 'ratings'] = 5
user_artists.loc[user_artists['weight'] < user_artists['weight'].quantile(0.8), 'ratings'] = 4
user_artists.loc[user_artists['weight'] < user_artists['weight'].quantile(0.6), 'ratings'] = 3
user_artists.loc[user_artists['weight'] < user_artists['weight'].quantile(0.4), 'ratings'] = 2
user_artists.loc[user_artists['weight'] < user_artists['weight'].quantile(0.2), 'ratings'] = 1
data = user_artists[['userID','artistID','ratings']]
data.head()
data.shape
# train-test split
train, test = train_test_split(data, test_size=0.3, random_state=42)
# reset index
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)
print(data.shape)
print(train.shape)
print(test.shape)
tags = pd.read_table("tags.dat", encoding = 'unicode_escape')
user_taggedartists = pd.read_table("user_taggedartists.dat")
user_tag_merged = pd.merge(user_taggedartists, tags, on="tagID", how="inner")
user_tag_merged_updated = pd.merge(user_tag_merged, data, on=(["userID","artistID"]),how="inner")
movie=user_tag_merged_updated
movie
data2 = data[['userID','artistID','ratings']]
# train-test split
train, test2 = train_test_split(data2, test_size=0.3, random_state=42)
# reset index
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)
print(data2.shape)
print(train.shape)
print(test.shape)
data_pivot2 = data2.pivot_table(index='artistID', values='ratings', columns='userID').fillna(0)
data_pivot2.head()
movie2 = [['tagID','artistID','year']]
movie2 = user_tag_merged_updated.pivot_table(index='tagID', values='year', columns='userID').fillna(0)
movie2.head()
# Content based as a function
from numpy.linalg import norm
def simil_cosine(a,b):
return np.dot(a, b)/(norm(a)*norm(b))
def ContentBased(content_data, test_data, NN):
cdata = content_data.reset_index(drop=True).copy()
# store user and item dimensions
dim = cdata.shape[0]
nr_user = cdata.shape[0]
if test_data.shape[1] != dim:
raise Exception('Dim. mismatch: Test data contains {} items, while Content contains {} items. Please make sure the columns of test and content match.'\
.format(test_data.shape[1], dim))
# similarity matrices
matrix = np.zeros(shape=(dim, dim), dtype=np.float)
matrixNN = np.zeros(shape=(dim, dim), dtype=np.float)
# compute similarity
for i, row in cdata.iterrows():
for j, col in cdata.iterrows():
if i <= j: continue
else: matrix[i][j] = simil_cosine(np.array(row),np.array(col))
# copy values to other diagonal
matrix = matrix + matrix.T - np.diag(np.diag(matrix))
print('Similarity calculation done...')
# mask all values that are not nearest neighbors
cutoff = lambda x,cv: x if x >= cv else 0.0
v_cutoff = np.vectorize(cutoff)
for i in range(dim):
crit_val = -np.sort(-matrix[i])[NN-1]
matrixNN[i] = v_cutoff(matrix[i], crit_val)
print('Nearest neighbor selection done...')
# predict user-item ratings in test_data
prediction = np.zeros(shape=(nr_user, dim), dtype=np.float)
for i in range(nr_user):
num = np.matmul(np.array(test_data.iloc[i,:]), matrixNN)
denom = matrixNN.sum(axis=0) # column sums
prediction[i] = num/denom
print('Prediction done...')
# return DataFrame
return pd.DataFrame(prediction, index=test_data.index, columns=test_data.columns)
cb_pred = ContentBased(movie2,data_pivot2, 10)
# Content Based as a Class
from numpy.linalg import norm
class ContentBased:
def simil_cosine(self, a,b):
return np.dot(a, b)/(norm(a)*norm(b))
def __init__(self, NN):
self.NN = NN
def fit(self, content_data):
cdata = content_data.reset_index(drop=True).copy()
self.item_dim = cdata.shape[0]
self.matrix = np.zeros(shape=(self.item_dim, self.item_dim), dtype=np.float)
self.matrixNN = np.zeros(shape=(self.item_dim, self.item_dim), dtype=np.float)
# compute similarity
for i, row in cdata.iterrows():
for j, col in cdata.iterrows():
if i <= j: continue
else: self.matrix[i][j] = self.simil_cosine(np.array(row),np.array(col))
# copy values to other diagonal
self.matrix = self.matrix + self.matrix.T - np.diag(np.diag(self.matrix))
cutoff = lambda x,cv: x if x >= cv else 0.0
v_cutoff = np.vectorize(cutoff)
for i in range(self.item_dim):
crit_val = -np.sort(-self.matrix[i])[self.NN-1]
self.matrixNN[i] = v_cutoff(self.matrix[i], crit_val)
def predict(self, test_data):
if test_data.shape[1] != self.item_dim:
raise Exception('Dim. mismatch: Test data contains {} items, while Content contains {} items. Please make sure the columns of test and content match.'\
.format(test_data.shape[1], self.item_dim))
I keep getting the following error --> Exception: Dim. mismatch: Test data contains 3 items, while Content contains 1526 items. Please make sure the columns of test and content match.
Related
(Edited to include dataset and model code)
I'm training a Keras CNN 2d matrix. I'm creating my own training dataset, in which each matrix cell has the shape of [[list], int]. The cell's first list item is the product of a string class that I converts to list (using tf.keras.utils.to_categorical):
cell[0] = to_categorical(
rnd_type-1, num_classes=num_types)
the second is a simple int:
cell[1] = random.randint(0, max_val)
The dataset creation function:
def make_data(num_of_samples, num_types, max_height, grid_x, grid_y):
grids_list = []
target_list = []
target = 0
for _ in range(num_of_samples):
# create empty grid
grid = [[[[],0] for i in range(grid_y)] for j in range(grid_x)]
for i in range(grid_x):
for j in range(grid_y):
rnd_type = random.randint(
0, num_types)
# get random class
# and convert to cat list
cat = to_categorical(
rnd_type-1, num_classes=num_types)
# get random type
rnd_height = random.randint(0, max_height)
# inject the two values into the cell
grid[i][j] = [cat, rnd_height]
# get some target value
target += rnd_type * 5 + random.random()*5
target_list.append(target)
grids_list.append(grid)
# make np arrs out of the lists
t = np.array(target_list)
g = np.array(grids_list)
return t, g
my model is created using model = models.create_cnn(grid_size, grid_size, 2, regress=True) in which (I assumed) the Input depth is 2.
The model creation code:
num_types = 20
max_height = 50
num_of_samples = 10
grid_size = 10
epochs = 5000
# get n results of X x Y grid with target
targets_list, grids_list = datasets.make_data(
num_of_samples, num_types, max_height, grid_size, grid_size)
split = train_test_split(targets_list, grids_list,
test_size=0.25, random_state=42)
(train_attr_X, test_attr_X, train_grids_X, test_grids_X) = split
# find the largest value in the training set and use it to
# scale values to the range [0, 1]
max_target = train_attr_X.max()
train_attr_Y = train_attr_X / max_target
test_attr_Y = test_attr_X / max_target
model = models.create_cnn(grid_size, grid_size, 2, regress=True)
I however cannot train it given this error: ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type list).
Answer my own question:
model can only accept int as depth. Therefore, the depth of my matrix must by a list of int len, not a 2D matrix. For that reason, the way to merge class data with continuous field rnd_height is:
class => cat = to_categorical
cell = np.append(cat, [rnd_height])
This way, cat list is added with the rnd_height value.
The whole dataset function now look like this:
def make_data(num_of_samples, num_types, max_height, grid_x, grid_y):
grids_list = []
target_list = []
target = 0
for _ in range(num_of_samples):
grid = [[[False, False] for i in range(grid_y)] for j in range(grid_x)]
for i in range(grid_x):
for j in range(grid_y):
rnd_type = random.randint(
0, num_types)
cat = to_categorical(
rnd_type-1, num_classes=num_types)
rnd_height = random.randint(0, max_height)
cell = np.append(cat, [rnd_height])
grid[i][j] = cell
# simulate simple objective function
if rnd_type < num_types/5:
target += rnd_height * 5
target_list.append(target)
grids_list.append(grid)
t = np.array(target_list)
g = np.array(grids_list)
# return grids and targets
return g, t
I am trying to run a fama-macbeth regression in a python. As afirst step I am running the time series for every asset in my portfolio but I am unable to run it because I am getting an error:
'ValueError: Must pass DataFrame with boolean values only'
I am relatively new to python and have heavily relied on this forum to help me out. I hope it you can help me with this issue.
Please let me know how I can resolve this. I will be very grateful to you!
I assume this line is producing the error. Cause when I run the function without the for loop, it works perfectly.
for i in range(cols):
df_beta = RegressionRoll(df=data_set, subset = 0, dependent = data_set.iloc[:,i], independent = data_set.iloc[:,30:], const = True, parameters = 'beta',
win = 12)
The dimension of my matrix is 108x35, 30 stocks and 5 factors over 108 points. Hence I want to run a regression for every stock against the 4 factors and store the result of the coeffs in a dataframe. Sample dataframe:
Date BAS GY AI FP SGL GY LNA GY AKZA NA Market Factor
1/29/2010 -5.28% -7.55% -1.23% -5.82% -7.09% -5.82%
2/26/2010 0.04% 13.04% -1.84% 4.06% -14.62% -14.62%
3/31/2010 10.75% 1.32% 7.33% 6.61% 12.21% 12.21%
The following is the entire code:
import pandas as pd
import statsmodels.api as sm
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.stats.outliers_influence import variance_inflation_factor
data_set = pd.read_excel(r'C:\XXX\Research Project\Data\Regression.xlsx', sheet_name = 'Fama Macbeth')
data_set.set_index(data_set['Date'], inplace=True)
data_set.drop('Date', axis=1, inplace=True)
X = data_set.iloc[:,30:]
y = data_set.iloc[:,:30]
def RegressionRoll(df, subset, dependent, independent, const, win, parameters):
# Data subset
if subset != 0:
df = df.tail(subset)
else:
df = df
# Loopinfo
end = df.shape[0]
win = win
rng = np.arange(start = win, stop = end, step = 1)
# Subset and store dataframes
frames = {}
n = 1
for i in rng:
df_temp = df.iloc[:i].tail(win)
newname = 'df' + str(n)
frames.update({newname: df_temp})
n += 1
# Analysis on subsets
df_results = pd.DataFrame()
for frame in frames:
#print(frames[frame])
# Rolling data frames
dfr = frames[frame]
y = dependent
x = independent
if const == True:
x = sm.add_constant(dfr[x])
model = sm.OLS(dfr[y], x).fit()
else:
model = sm.OLS(dfr[y], dfr[x]).fit()
if parameters == 'beta':
theParams = model.params[0:]
coefs = theParams.to_frame()
df_temp = pd.DataFrame(coefs.T)
indx = dfr.tail(1).index[-1]
df_temp['Date'] = indx
df_temp = df_temp.set_index(['Date'])
df_results = pd.concat([df_results, df_temp], axis = 0)
if parameters == 'R2':
theParams = model.rsquared
df_temp = pd.DataFrame([theParams])
indx = dfr.tail(1).index[-1]
df_temp['Date'] = indx
df_temp = df_temp.set_index(['Date'])
df_temp.columns = [', '.join(independent)]
df_results = pd.concat([df_results, df_temp], axis = 0)
return(df_results)
cols = len(y.columns)
for i in range(cols):
df_beta = RegressionRoll(df=data_set, subset = 0, dependent = data_set.iloc[:,i], independent = data_set.iloc[:,30:], const = True, parameters = 'beta',
win = 12)
ValueError: Must pass DataFrame with boolean values only
I get a ValueError: Found input variables with inconsistent numbers of samples: [20000, 1] when I run the following even though the row values of x and y are correct. I load in the RCV1 dataset, get indices of the categories with the top x documents, create list of tuples with equal number of randomly-selected positives and negatives for each category, and then finally attempt to run a logistic regression on one of the categories.
import sklearn.datasets
from sklearn import model_selection, preprocessing
from sklearn.linear_model import LogisticRegression
from matplotlib import pyplot as plt
from scipy import sparse
rcv1 = sklearn.datasets.fetch_rcv1()
def get_top_cat_indices(target_matrix, num_cats):
cat_counts = target_matrix.sum(axis=0)
#cat_counts = cat_counts.reshape((1,103)).tolist()[0]
cat_counts = cat_counts.reshape((103,))
#b = sorted(cat_counts, reverse=True)
ind_temp = np.argsort(cat_counts)[::-1].tolist()[0]
ind = [ind_temp[i] for i in range(5)]
return ind
def prepare_data(x, y, top_cat_indices, sample_size):
res_lst = []
for i in top_cat_indices:
# get column of indices with relevant cat
temp = y.tocsc()[:, i]
# all docs with labeled category
cat_present = x.tocsr()[np.where(temp.sum(axis=1)>0)[0],:]
# all docs other than labelled category
cat_notpresent = x.tocsr()[np.where(temp.sum(axis=1)==0)[0],:]
# get indices equal to 1/2 of sample size
idx_cat = np.random.randint(cat_present.shape[0], size=int(sample_size/2))
idx_nocat = np.random.randint(cat_notpresent.shape[0], size=int(sample_size/2))
# concatenate the ids
sampled_x_pos = cat_present.tocsr()[idx_cat,:]
sampled_x_neg = cat_notpresent.tocsr()[idx_nocat,:]
sampled_x = sparse.vstack((sampled_x_pos, sampled_x_neg))
sampled_y_pos = temp.tocsr()[idx_cat,:]
sampled_y_neg = temp.tocsr()[idx_nocat,:]
sampled_y = sparse.vstack((sampled_y_pos, sampled_y_neg))
res_lst.append((sampled_x, sampled_y))
return res_lst
ind = get_top_cat_indices(rcv1.target, 5)
test_res = prepare_data(train_x, train_y, ind, 20000)
x, y = test_res[0]
print(x.shape)
print(y.shape)
LogisticRegression().fit(x, y)
Could it be an issue with the sparse matrices, or problem with dimensionality (there are 20K samples and 47K features)
When I run your code, I get following error:
AttributeError: 'bool' object has no attribute 'any'
That's because y for LogisticRegression needs to numpy array. So, I changed last line to:
LogisticRegression().fit(x, y.A.flatten())
Then I get following error:
ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 0
This is because your sampling code has a bug. You need to subset y array with rows having that category before using sampling indices. See code below:
def prepare_data(x, y, top_cat_indices, sample_size):
res_lst = []
for i in top_cat_indices:
# get column of indices with relevant cat
temp = y.tocsc()[:, i]
# all docs with labeled category
c1 = np.where(temp.sum(axis=1)>0)[0]
c2 = np.where(temp.sum(axis=1)==0)[0]
cat_present = x.tocsr()[c1,:]
# all docs other than labelled category
cat_notpresent = x.tocsr()[c2,:]
# get indices equal to 1/2 of sample size
idx_cat = np.random.randint(cat_present.shape[0], size=int(sample_size/2))
idx_nocat = np.random.randint(cat_notpresent.shape[0], size=int(sample_size/2))
# concatenate the ids
sampled_x_pos = cat_present.tocsr()[idx_cat,:]
sampled_x_neg = cat_notpresent.tocsr()[idx_nocat,:]
sampled_x = sparse.vstack((sampled_x_pos, sampled_x_neg))
sampled_y_pos = temp.tocsr()[c1][idx_cat,:]
print(sampled_y_pos.nnz)
sampled_y_neg = temp.tocsr()[c2][idx_nocat,:]
print(sampled_y_neg.nnz)
sampled_y = sparse.vstack((sampled_y_pos, sampled_y_neg))
res_lst.append((sampled_x, sampled_y))
return res_lst
Now, Everything works like a charm
I would like to know how I can store different data into a numpy array, in order to feed it to a machine Learning SVC algorithm.
My goal, is to get a dataframe of size (sample * features) like this:
With:
Feature 1 in gray containing list of size n
Feature 2 in red, containing 2D numpy array of shape (i,k)
Feature ... Something else (array for pwelch spectrum, integers, float, ...)
Feature n in blue, containing integer.
How can I do that in Python ? Is this going to be ok for sklearn ?
Here is the current error from the code bellow:
ValueError: setting an array element with a sequence.
Code:
# -*- coding: utf-8 -*-
"""----------------------------------------------------------------------------
-------------------------------- Imports --------------------------------------
----------------------------------------------------------------------------"""
import os
import pandas as pd
import numpy as np
from scipy import io as sio
from scipy import signal
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
"""----------------------------------------------------------------------------
------------------------------ Parameters -------------------------------------
----------------------------------------------------------------------------"""
# Path to the clean EEG .mat files
EEG_path = "data"
# Listing of the .mat files
EEG = list()
for elt in os.listdir(EEG_path):
if os.path.isfile(os.path.join(EEG_path, elt)):
if '.mat' in elt[len(elt)-4:]:
EEG.append(elt)
# Spectrum used
spectrum = ['all', (1,45), (8,12)]
nb_features = 3
"""----------------------------------------------------------------------------
------------------------------ Functions --------------------------------------
----------------------------------------------------------------------------"""
# Function on 1 channel
# Input: All points from one channel, for one epoch
def filter(x, n, fs, fc1, fc2):
b, a = signal.butter(n, [fc1/(fs/2), fc2/(fs/2)], 'bandpass')
y = signal.filtfilt(b, a, x)
return y
def haming(x, L):
# Symetric L-points hamming window
window = signal.hamming(L)
y = x * window.T # Element wise multiplication
return y
# Function on one epoch
# Input is a matrix of size (channel * length)
def amp_mean(x):
size = x.shape
y = list()
for i in range(size[0]):
y.append(np.mean(x[i,:]))
return y
def amp_max(x):
size = x.shape
y = list()
for i in range(size[0]):
y.append(np.max(abs(x[i,:])))
return y
"""----------------------------------------------------------------------------
-------------------------------- Script ---------------------------------------
----------------------------------------------------------------------------"""
# Load data
s_EEG = "{}/{}".format(EEG_path, EEG[4])
data = sio.loadmat(s_EEG)['s_EEG']['data'][0][0].astype(float) # data[i, j ,k]
labels = sio.loadmat(s_EEG)['s_EEG']['labels'][0][0][0] # labels[k]
fs = sio.loadmat(s_EEG)['s_EEG']['sampling_rate'][0][0][0][0] # 500 Hz
size = data.shape
# Creates an empty data frame of size (epoch * features)
df = np.empty(shape = (size[2], nb_features * len(spectrum)))
# Filling the dataframe with features
# for every epoch
for k in range(size[2]):
for freq in spectrum:
data_to_compute = np.empty(shape = size, dtype = float)
# Apply hamming
if freq == 'all':
for i in range(size[0]):
data_to_compute[i,:,k] = haming(data[i,:,k], size[1])
# Apply hamming after filtering
else:
for i in range(size[0]):
data_to_compute[i,:,k] = haming(filter(data[i,:,k],
15, fs, freq[0], freq[1]), size[1])
# data_to_compute is ready to have feature extracted
for n in range(0, df.shape[1], nb_features):
df[k, n] = data_to_compute[:,:,k]
df[k, n+1] = amp_mean(data_to_compute[:,:,k])
df[k, n+2] = amp_max(data_to_compute[:,:,k])
# X signal / Y label
X_train, X_test, Y_train, Y_test = train_test_split(data,
list(labels),
test_size=0.15,
random_state=42)
clf = SVC()
clf.fit(X_train, Y_train)
Variable type:
Thanks !
Remove rows from numpy array when its repeated less than n times
Cause:
I have a certain dataset that is 1 gb in size.
It has 29.118.021 samples and 108.390 classes.
However, some classes has just 1 sample. Or 3 samples, and so on...
Problem:
I want to remove the rows/classes from the numpy array that are presented/repeated less than N times.
Reference
XgBoost : The least populated class in y has only 1 members, which is too few
Attempt that failed
train_x, train_y, test_x, test_id = loader.load()
n_samples = train_y.shape[0]
unique_labels, y_inversed = np.unique(train_y, return_inverse=True)
label_counts = bincount(y_inversed)
min_labels = np.min(label_counts)
print "Total Rows ", n_samples
print "unique_labels ", unique_labels.shape[0]
print "label_counts ", label_counts[:]
print "min labels ", min_labels
unique_labels = unique_labels.astype(np.uint8)
unique_amounts = np.empty(shape=unique_labels.shape, dtype=np.uint8)
for u in xrange(0, unique_labels.shape[0]):
if u % 100 == 0:
print "Processed ", str(u)
for index in xrange(0, train_y.shape[0]):
if train_y[index] == unique_labels[u]:
unique_amounts[u] = unique_amounts[u] + 1
for k in xrange(0, unique_amounts.shape[0]):
if unique_amounts[k] == 1:
print "\n"
print "value :", unique_amounts[k]
print "at ", k
The code above is taking too long.Even after i left it running at the server for 1 whole night, it didnt even reach half processment.
Load method
This is my load method.
I could load it and keep it as a dataframe.
def load():
train = pd.read_csv('input/train.csv', index_col=False, header='infer')
test = pd.read_csv('input/test.csv', index_col=False, header='infer')
# drop useless columns
train.drop('row_id', axis=1, inplace=True)
acc = train["accuracy"].iloc[:].as_matrix()
x = train["x"].iloc[:].as_matrix()
y = train["y"].iloc[:].as_matrix()
time = train["time"].iloc[:].as_matrix()
train_y = train["place_id"].iloc[:].as_matrix()
####################################################################################
acc = acc.reshape(-1, 1)
x = x.reshape(-1, 1)
y = y.reshape(-1, 1)
time = time.reshape(-1, 1)
train_y = train_y.reshape(-1, 1)
####################################################################################
train_x = np.hstack((acc, x, y, time))
####################################################################################
acc = test["accuracy"].iloc[:].as_matrix()
x = test["x"].iloc[:].as_matrix()
y = test["y"].iloc[:].as_matrix()
time = test["time"].iloc[:].as_matrix()
test_id = test['row_id'].iloc[:].as_matrix()
#######################
acc = acc.reshape(-1, 1)
x = x.reshape(-1, 1)
y = y.reshape(-1, 1)
time = time.reshape(-1, 1)
#######################
test_x = np.hstack((acc, x, y, time))
return train_x, train_y, test_x, test_id
The numpy_indexed package (disclaimer: I am its author) contains a multiplicity function, which leads to a very readable way of performing such manipulations:
import numpy_indexed as npi
samples_mask = npi.multiplicity(train_y) >= n_min
filtered_train_y = train_y[samples_mask]
I would keep your data in a dataframe format.
That way, you can use some useful methods from the pandas module, and that should be quicker than looping.
First, get the different labels associated with df with df['labels'].value_counts().
(I assume that the labels column name is 'labels').
Then, get only the labels that have less than n_min rows in the dataframe.
vc = df['labels'].value_counts()
labels = vc[vc < n_min].index
df.drop(labels, inplace=True)
Hope that helps !