(Edited to include dataset and model code)
I'm training a Keras 2D CNN. I'm creating my own training dataset, in which each matrix cell has the shape [[list], int]. The cell's first item is a string class that I convert to a one-hot list (using tf.keras.utils.to_categorical):
cell[0] = to_categorical(rnd_type - 1, num_classes=num_types)
the second is a simple int:
cell[1] = random.randint(0, max_val)
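To see why this layout is a problem, here is a minimal standalone sketch (with hypothetical values): each cell mixes a length-num_types list with a scalar, so the grid is ragged and NumPy raises a ValueError much like the one Keras reports.

import numpy as np

cell = [[0.0, 1.0, 0.0], 7]          # [one-hot list, int]
grid = [[cell, cell], [cell, cell]]
try:
    np.array(grid, dtype=float)
except ValueError as e:
    print(e)  # inhomogeneous/ragged shape cannot be converted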
The dataset creation function:
def make_data(num_of_samples, num_types, max_height, grid_x, grid_y):
    grids_list = []
    target_list = []
    target = 0
    for _ in range(num_of_samples):
        # create empty grid
        grid = [[[[], 0] for i in range(grid_y)] for j in range(grid_x)]
        for i in range(grid_x):
            for j in range(grid_y):
                # get a random class and convert it to a one-hot list
                rnd_type = random.randint(0, num_types)
                cat = to_categorical(rnd_type - 1, num_classes=num_types)
                # get a random height
                rnd_height = random.randint(0, max_height)
                # inject the two values into the cell
                grid[i][j] = [cat, rnd_height]
                # accumulate some target value
                target += rnd_type * 5 + random.random() * 5
        target_list.append(target)
        grids_list.append(grid)
    # make np arrays out of the lists
    t = np.array(target_list)
    g = np.array(grids_list)
    return t, g
my model is created using model = models.create_cnn(grid_size, grid_size, 2, regress=True), in which (I assumed) the input depth is 2.
The model creation code:
num_types = 20
max_height = 50
num_of_samples = 10
grid_size = 10
epochs = 5000

# get n results of an X x Y grid with targets
targets_list, grids_list = datasets.make_data(
    num_of_samples, num_types, max_height, grid_size, grid_size)
split = train_test_split(targets_list, grids_list,
                         test_size=0.25, random_state=42)
(train_attr_X, test_attr_X, train_grids_X, test_grids_X) = split

# find the largest value in the training set and use it to
# scale values to the range [0, 1]
max_target = train_attr_X.max()
train_attr_Y = train_attr_X / max_target
test_attr_Y = test_attr_X / max_target

model = models.create_cnn(grid_size, grid_size, 2, regress=True)
However, I cannot train it; I get this error: ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type list).
Answering my own question:
The model only accepts an int as its depth. Therefore, each cell must be a flat list of numbers (one vector per cell), not a nested [list, int] pair. For that reason, the way to merge the class data with the continuous field rnd_height is:
class => cat = to_categorical(...)
cell = np.append(cat, [rnd_height])
This way, the one-hot cat list is extended with the rnd_height value.
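A minimal sketch of the merged cell (with hypothetical values num_types=4, rnd_type=3, rnd_height=7):

import numpy as np
from tensorflow.keras.utils import to_categorical

cat = to_categorical(3 - 1, num_classes=4)   # array([0., 0., 1., 0.])
cell = np.append(cat, [7])                   # array([0., 0., 1., 0., 7.])
print(cell.shape)                            # (5,) -> a flat vector of length num_types + 1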
The whole dataset function now looks like this:
def make_data(num_of_samples, num_types, max_height, grid_x, grid_y):
    grids_list = []
    target_list = []
    target = 0
    for _ in range(num_of_samples):
        grid = [[[False, False] for i in range(grid_y)] for j in range(grid_x)]
        for i in range(grid_x):
            for j in range(grid_y):
                rnd_type = random.randint(0, num_types)
                cat = to_categorical(rnd_type - 1, num_classes=num_types)
                rnd_height = random.randint(0, max_height)
                cell = np.append(cat, [rnd_height])
                grid[i][j] = cell
                # simulate a simple objective function
                if rnd_type < num_types / 5:
                    target += rnd_height * 5
        target_list.append(target)
        grids_list.append(grid)
    g = np.array(grids_list)
    t = np.array(target_list)
    # return grids and targets
    return g, t
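Note that with this layout each cell is a flat vector of length num_types + 1, so (a sketch, assuming models.create_cnn takes width, height, and depth) the model's input depth should match that rather than 2:

g, t = make_data(num_of_samples, num_types, max_height, grid_size, grid_size)
print(g.shape)  # (num_of_samples, grid_size, grid_size, num_types + 1)
model = models.create_cnn(grid_size, grid_size, num_types + 1, regress=True)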
I am trying to adjust the following code to produce bootstrap samples of equal size with replacement:
rng = np.random.RandomState(1)

def cluster_stability(X, est, n_iter=20, random_state=None):
    labels = []
    indices = []
    for i in range(n_iter):
        # draw bootstrap samples, store indices
        sample_indices = rng.randint(0, X.shape[0], X.shape[0])
        indices.append(sample_indices)
        est = clone(est)
        if hasattr(est, "random_state"):
            # randomize estimator if possible
            est.random_state = rng.randint(1e5)
        X_bootstrap = X.iloc[sample_indices]  # ADD .iloc IF NOT 2D
        est.fit(X_bootstrap)
        # store clustering outcome using original indices
        relabel = -np.ones(X.shape[0], dtype=int)
        relabel[sample_indices] = est.labels_
        labels.append(relabel)
    scores = []
    for l, i in zip(labels, indices):
        for k, j in zip(labels, indices):
            # we also compute the diagonal, which is a bit silly
            in_both = np.intersect1d(i, j)
            scores.append(metrics.adjusted_rand_score(l[in_both], k[in_both]))
    return np.mean(scores)
Hence, sample_indices should be of size len(X). However, this adjustment crashes the code.
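(For context, a minimal standalone sketch of an equal-size bootstrap draw with replacement, using hypothetical data; note that a plain ndarray is indexed directly while a DataFrame needs .iloc:)

import numpy as np
import pandas as pd

rng = np.random.RandomState(1)
X_arr = np.arange(12).reshape(6, 2)                    # ndarray
X_df = pd.DataFrame(X_arr)                             # DataFrame

idx = rng.randint(0, X_arr.shape[0], X_arr.shape[0])   # len(X) draws with replacement
print(X_arr[idx].shape)                                # (6, 2)
print(X_df.iloc[idx].shape)                            # (6, 2)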
Thanks in advance!
I want to turn a nested array into a normal array. This accidentally happened while I was doing a linear regression whose output was already in a 1x1 array; let me know if you would like to see more of my code. It looks like my betas variable is the cause of the nesting.
Generally speaking, I am just trying to get the output from
[[ array([x]), array([x]), array([x]), array([x]), array([x])]]
to
[[x, x, x, x, x ]]
def si_model():
    dj_data = pd.read_csv("/data.tsv", sep="\t")
    dj_data = dj_data.pct_change().dropna()
    ann_dj_data = dj_data * 252
    dj_index = ann_dj_data['^DJI']
    ann_dj_data = ann_dj_data.drop('^DJI', axis='columns')

    # Function to linearly regress each stock onto the DJ index
    def model_regress(stock):
        # Fit DJ to index data
        DJ = np.array(dj_index).reshape(len(stock), 1)
        # Regression of each stock onto DJ
        lm = LinearRegression().fit(DJ, y=stock.to_numpy())
        resids = stock.to_numpy() - lm.predict(DJ)
        return lm.coef_, lm.intercept_, resids.std()

    # Run the model regression on each stock
    lm_all = ann_dj_data.apply(lambda stock: model_regress(stock)).T
    # Table of the coefficients
    lm_all = lm_all.rename(columns={0: 'Beta ', 1: 'Intercept', 2: 'Rsd Std'})
    # Variance of the index's returns
    dj_index_var = dj_index.std() ** 2
    betas = lm_all['Beta '].to_numpy()
    resid_vars = lm_all['Rsd Std'].to_numpy() ** 2
    # Single-index approximation of the covariance matrix using the identity matrix (np.eye)
    Qsi = dj_index_var * betas * betas.reshape(-1, 1) + np.eye(len(betas)) * resid_vars
    return Qsi

# Printing the first five rows of the approximation
Qsi = si_model()
print("Covariance Matrix")
print(Qsi[:5, :5])
You can use squeeze().
Here is a small example similar to yours:
import numpy as np
a = np.array([17.1500691])
b = np.array([5.47690856])
c = np.array([5.47690856])
d = np.array([11.7700696])
e = [[a, b], [c, d]]
print(e)
f = np.squeeze(np.array(e), axis=2)
print(f)
Output:
[[array([17.1500691]), array([5.47690856])], [array([5.47690856]), array([11.7700696])]]
[[17.1500691 5.47690856]
[ 5.47690856 11.7700696 ]]
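Applied to the betas from the question (an object array of 1x1 arrays), one possible approach (a sketch; np.stack first assembles the per-stock coefficient arrays into a 2-D array) would be:

betas = np.squeeze(np.stack(lm_all['Beta '].to_numpy()), axis=1)  # shape (n,) instead of nested 1x1 arrays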
I keep getting the following error --> Exception: Dim. mismatch: Test data contains 3 items, while Content contains 1526 items. Please make sure the columns of test and content match.
Can someone help me? I've been working on this code for a few days. My entire body of code is below.
import numpy as np   # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
from IESEGRecSys.Functions import *
from sklearn.model_selection import train_test_split
from surprise import KNNBasic
from surprise import Dataset, Reader

user_artists = pd.read_table("user_artists.dat")
user_artists['ratings'] = 0
user_artists.loc[user_artists['weight'] <= user_artists['weight'].quantile(1), 'ratings'] = 5
user_artists.loc[user_artists['weight'] < user_artists['weight'].quantile(0.8), 'ratings'] = 4
user_artists.loc[user_artists['weight'] < user_artists['weight'].quantile(0.6), 'ratings'] = 3
user_artists.loc[user_artists['weight'] < user_artists['weight'].quantile(0.4), 'ratings'] = 2
user_artists.loc[user_artists['weight'] < user_artists['weight'].quantile(0.2), 'ratings'] = 1

data = user_artists[['userID', 'artistID', 'ratings']]
data.head()
data.shape
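(As an aside, the quantile cascade above effectively bins weight into quintiles; a compact equivalent, sketched here under the assumption that the quantile edges are distinct, is pd.qcut:)

# ratings 1-5 from the weight quintiles, lowest quintile -> 1
user_artists['ratings'] = pd.qcut(user_artists['weight'], q=5, labels=[1, 2, 3, 4, 5]).astype(int)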
# train-test split
train, test = train_test_split(data, test_size=0.3, random_state=42)
# reset index
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)
print(data.shape)
print(train.shape)
print(test.shape)

tags = pd.read_table("tags.dat", encoding='unicode_escape')
user_taggedartists = pd.read_table("user_taggedartists.dat")
user_tag_merged = pd.merge(user_taggedartists, tags, on="tagID", how="inner")
user_tag_merged_updated = pd.merge(user_tag_merged, data, on=["userID", "artistID"], how="inner")
movie = user_tag_merged_updated
movie

data2 = data[['userID', 'artistID', 'ratings']]
# train-test split
train, test2 = train_test_split(data2, test_size=0.3, random_state=42)
# reset index
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)
print(data2.shape)
print(train.shape)
print(test.shape)

data_pivot2 = data2.pivot_table(index='artistID', values='ratings', columns='userID').fillna(0)
data_pivot2.head()

movie2 = [['tagID', 'artistID', 'year']]  # (this list is immediately overwritten below)
movie2 = user_tag_merged_updated.pivot_table(index='tagID', values='year', columns='userID').fillna(0)
movie2.head()
# Content-based filtering as a function
from numpy.linalg import norm

def simil_cosine(a, b):
    return np.dot(a, b) / (norm(a) * norm(b))

def ContentBased(content_data, test_data, NN):
    cdata = content_data.reset_index(drop=True).copy()
    # store user and item dimensions
    dim = cdata.shape[0]
    nr_user = cdata.shape[0]
    if test_data.shape[1] != dim:
        raise Exception('Dim. mismatch: Test data contains {} items, while Content contains {} items. '
                        'Please make sure the columns of test and content match.'
                        .format(test_data.shape[1], dim))
    # similarity matrices (np.float is deprecated; use the builtin float)
    matrix = np.zeros(shape=(dim, dim), dtype=float)
    matrixNN = np.zeros(shape=(dim, dim), dtype=float)
    # compute similarity
    for i, row in cdata.iterrows():
        for j, col in cdata.iterrows():
            if i <= j:
                continue
            else:
                matrix[i][j] = simil_cosine(np.array(row), np.array(col))
    # copy values to the other triangle
    matrix = matrix + matrix.T - np.diag(np.diag(matrix))
    print('Similarity calculation done...')
    # mask all values that are not nearest neighbors
    cutoff = lambda x, cv: x if x >= cv else 0.0
    v_cutoff = np.vectorize(cutoff)
    for i in range(dim):
        crit_val = -np.sort(-matrix[i])[NN - 1]
        matrixNN[i] = v_cutoff(matrix[i], crit_val)
    print('Nearest neighbor selection done...')
    # predict user-item ratings in test_data
    prediction = np.zeros(shape=(nr_user, dim), dtype=float)
    for i in range(nr_user):
        num = np.matmul(np.array(test_data.iloc[i, :]), matrixNN)
        denom = matrixNN.sum(axis=0)  # column sums
        prediction[i] = num / denom
    print('Prediction done...')
    # return a DataFrame
    return pd.DataFrame(prediction, index=test_data.index, columns=test_data.columns)

cb_pred = ContentBased(movie2, data_pivot2, 10)
# Content-based filtering as a class
from numpy.linalg import norm

class ContentBased:
    def __init__(self, NN):
        self.NN = NN

    def simil_cosine(self, a, b):
        return np.dot(a, b) / (norm(a) * norm(b))

    def fit(self, content_data):
        cdata = content_data.reset_index(drop=True).copy()
        self.item_dim = cdata.shape[0]
        self.matrix = np.zeros(shape=(self.item_dim, self.item_dim), dtype=float)
        self.matrixNN = np.zeros(shape=(self.item_dim, self.item_dim), dtype=float)
        # compute similarity
        for i, row in cdata.iterrows():
            for j, col in cdata.iterrows():
                if i <= j:
                    continue
                else:
                    self.matrix[i][j] = self.simil_cosine(np.array(row), np.array(col))
        # copy values to the other triangle
        self.matrix = self.matrix + self.matrix.T - np.diag(np.diag(self.matrix))
        cutoff = lambda x, cv: x if x >= cv else 0.0
        v_cutoff = np.vectorize(cutoff)
        for i in range(self.item_dim):
            crit_val = -np.sort(-self.matrix[i])[self.NN - 1]
            self.matrixNN[i] = v_cutoff(self.matrix[i], crit_val)

    def predict(self, test_data):
        if test_data.shape[1] != self.item_dim:
            raise Exception('Dim. mismatch: Test data contains {} items, while Content contains {} items. '
                            'Please make sure the columns of test and content match.'
                            .format(test_data.shape[1], self.item_dim))
I get a ValueError: Found input variables with inconsistent numbers of samples: [20000, 1] when I run the following, even though the row counts of x and y are correct. I load the RCV1 dataset, get the indices of the categories with the top x documents, create a list of tuples with an equal number of randomly selected positives and negatives for each category, and then attempt to run a logistic regression on one of the categories.
import numpy as np
import sklearn.datasets
from sklearn import model_selection, preprocessing
from sklearn.linear_model import LogisticRegression
from matplotlib import pyplot as plt
from scipy import sparse

rcv1 = sklearn.datasets.fetch_rcv1()

def get_top_cat_indices(target_matrix, num_cats):
    cat_counts = target_matrix.sum(axis=0)
    #cat_counts = cat_counts.reshape((1,103)).tolist()[0]
    cat_counts = cat_counts.reshape((103,))
    #b = sorted(cat_counts, reverse=True)
    ind_temp = np.argsort(cat_counts)[::-1].tolist()[0]
    ind = [ind_temp[i] for i in range(5)]
    return ind

def prepare_data(x, y, top_cat_indices, sample_size):
    res_lst = []
    for i in top_cat_indices:
        # get the column with the relevant cat
        temp = y.tocsc()[:, i]
        # all docs with the labeled category
        cat_present = x.tocsr()[np.where(temp.sum(axis=1) > 0)[0], :]
        # all docs other than the labeled category
        cat_notpresent = x.tocsr()[np.where(temp.sum(axis=1) == 0)[0], :]
        # get indices equal to 1/2 of the sample size
        idx_cat = np.random.randint(cat_present.shape[0], size=int(sample_size / 2))
        idx_nocat = np.random.randint(cat_notpresent.shape[0], size=int(sample_size / 2))
        # concatenate the samples
        sampled_x_pos = cat_present.tocsr()[idx_cat, :]
        sampled_x_neg = cat_notpresent.tocsr()[idx_nocat, :]
        sampled_x = sparse.vstack((sampled_x_pos, sampled_x_neg))
        sampled_y_pos = temp.tocsr()[idx_cat, :]
        sampled_y_neg = temp.tocsr()[idx_nocat, :]
        sampled_y = sparse.vstack((sampled_y_pos, sampled_y_neg))
        res_lst.append((sampled_x, sampled_y))
    return res_lst

ind = get_top_cat_indices(rcv1.target, 5)
# train_x and train_y are used here but never defined in the snippet as posted
test_res = prepare_data(train_x, train_y, ind, 20000)
x, y = test_res[0]
print(x.shape)
print(y.shape)
LogisticRegression().fit(x, y)
Could it be an issue with the sparse matrices, or a problem with dimensionality (there are 20K samples and 47K features)?
When I run your code, I get the following error:
AttributeError: 'bool' object has no attribute 'any'
That's because y for LogisticRegression needs to be a numpy array. So I changed the last line to:
LogisticRegression().fit(x, y.A.flatten())
Then I get the following error:
ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 0
This is because your sampling code has a bug: you need to subset the y array to the rows having that category before using the sampling indices. See the code below:
def prepare_data(x, y, top_cat_indices, sample_size):
    res_lst = []
    for i in top_cat_indices:
        # get the column with the relevant cat
        temp = y.tocsc()[:, i]
        # all docs with the labeled category
        c1 = np.where(temp.sum(axis=1) > 0)[0]
        # all docs other than the labeled category
        c2 = np.where(temp.sum(axis=1) == 0)[0]
        cat_present = x.tocsr()[c1, :]
        cat_notpresent = x.tocsr()[c2, :]
        # get indices equal to 1/2 of the sample size
        idx_cat = np.random.randint(cat_present.shape[0], size=int(sample_size / 2))
        idx_nocat = np.random.randint(cat_notpresent.shape[0], size=int(sample_size / 2))
        # concatenate the samples
        sampled_x_pos = cat_present.tocsr()[idx_cat, :]
        sampled_x_neg = cat_notpresent.tocsr()[idx_nocat, :]
        sampled_x = sparse.vstack((sampled_x_pos, sampled_x_neg))
        # subset y to the matching rows *before* applying the sampling indices
        sampled_y_pos = temp.tocsr()[c1][idx_cat, :]
        print(sampled_y_pos.nnz)
        sampled_y_neg = temp.tocsr()[c2][idx_nocat, :]
        print(sampled_y_neg.nnz)
        sampled_y = sparse.vstack((sampled_y_pos, sampled_y_neg))
        res_lst.append((sampled_x, sampled_y))
    return res_lst
Now everything works like a charm.
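Putting it together (a sketch; train_x and train_y are assumed to be rcv1.data and rcv1.target, which the question did not show):

res = prepare_data(rcv1.data, rcv1.target, ind, 20000)
x, y = res[0]
# convert the sparse label column to a dense 1-D array for the fit
LogisticRegression(max_iter=1000).fit(x, y.A.flatten())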
I'm trying to vectorize some code with numpy so that I can run it using multiprocessing, but I can't understand how numpy.apply_along_axis works. This is an example of the code, vectorized using map:
import numpy
from scipy import sparse
import multiprocessing
from matplotlib import pyplot

# first I build a matrix of some x positions vs. time data in a sparse format
matrix = numpy.random.randint(2, size=100).astype(float).reshape(10, 10)
x = numpy.nonzero(matrix)[0]
times = numpy.nonzero(matrix)[1]
weights = numpy.random.rand(x.size)

# then I define an array of y positions
nStepsY = 5
y = numpy.arange(1, nStepsY + 1)

# now I build an image using x-y-times coordinates and x-times weights
def mapIt(ithStep):
    ncolumns = 80
    image = numpy.zeros(ncolumns)
    yTimed = y[ithStep] * times
    positions = (numpy.round(x - yTimed) + 50).astype(int)
    values = numpy.bincount(positions, weights)
    values = values[numpy.nonzero(values)]
    positions = numpy.unique(positions)
    image[positions] = values
    return image

image = list(map(mapIt, range(nStepsY)))
image = numpy.array(image)
a = pyplot.imshow(image, aspect=10)
(The output plot is omitted here.)
I tried to use numpy.apply_along_axis, but this function only lets me iterate along the rows of image, while I also need to iterate over the ithStep index. E.g.:
# now I build an image using x-y-times coordinates and x-times weights
nrows = nStepsY
ncolumns = 80
matrix = numpy.zeros(nrows * ncolumns).reshape(nrows, ncolumns)

def applyIt(image):
    image = numpy.zeros(ncolumns)
    yTimed = y[ithStep] * times  # ithStep is never updated here, so every row gets the same result
    positions = (numpy.round(x - yTimed) + 50).astype(int)
    values = numpy.bincount(positions, weights)
    values = values[numpy.nonzero(values)]
    positions = numpy.unique(positions)
    image[positions] = values
    return image

imageApplied = numpy.apply_along_axis(applyIt, 1, matrix)
a = pyplot.imshow(imageApplied, aspect=10)
It obviously returns only the first row nrows times, since nothing iterates over ithStep:
(The wrong plot is omitted here.)
Is there a way to iterate over an index, or to use an index while numpy.apply_along_axis iterates?
Here is the code using only matrix operations: it's quite a bit faster than map or apply_along_axis, but it uses a lot of memory.
(In this function I use a trick with scipy.sparse, which works more intuitively than numpy arrays when you try to sum numbers onto the same element.)
def fullmatrix(nRows, nColumns):
    y = numpy.arange(1, nStepsY + 1)
    image = numpy.zeros((nRows, nColumns))
    yTimed = numpy.outer(y, times)
    x3d = numpy.outer(numpy.ones(nStepsY), x)
    weights3d = numpy.outer(numpy.ones(nStepsY), weights)
    y3d = numpy.outer(y, numpy.ones(x.size))
    positions = (numpy.round(x3d - yTimed) + 50).astype(int)
    matrix = sparse.coo_matrix((numpy.ravel(weights3d), (numpy.ravel(y3d), numpy.ravel(positions)))).todense()
    return matrix

image = fullmatrix(nStepsY, 80)
a = pyplot.imshow(image, aspect=10)
This way is simpler and very fast! Thank you so much.
nStepsY = 5
nRows = nStepsY
nColumns = 80
y = numpy.arange(1, nStepsY + 1)
image = numpy.zeros((nRows, nColumns))

def itermatrix(ithStep):
    yTimed = y[ithStep] * times
    positions = (numpy.round(x - yTimed) + 50).astype(int)
    # fakeRow collapses everything onto a single sparse row
    # (moved inside the function, since it depends on positions)
    fakeRow = numpy.zeros(positions.size)
    matrix = sparse.coo_matrix((weights, (fakeRow, positions))).todense()
    matrix = numpy.ravel(matrix)
    missColumns = (nColumns - matrix.size)
    zeros = numpy.zeros(missColumns)
    matrix = numpy.concatenate((matrix, zeros))
    return matrix

for i in numpy.arange(nStepsY):
    image[i] = itermatrix(i)

# or, without initialization of image:
imageMapped = list(map(itermatrix, range(nStepsY)))
imageMapped = numpy.array(imageMapped)
It feels like attempting to use map or apply_along_axis is obscuring the essential iteration of the problem.
I rewrote your code as an explicit loop on y:
nStepsY = 5
y = numpy.arange(1, nStepsY + 1)
image = numpy.zeros((nStepsY, 80))
for i, yi in enumerate(y):
    yTimed = yi * times
    positions = (numpy.round(x - yTimed) + 50).astype(int)
    values = numpy.bincount(positions, weights)
    values = values[numpy.nonzero(values)]
    positions = numpy.unique(positions)
    image[i, positions] = values

a = pyplot.imshow(image, aspect=10)
pyplot.show()
Looking at the code, I think I could calculate positions for all y values, making a (y.shape[0], times.shape[0]) array. But the rest, the bincount and unique, still has to work row by row.
apply_along_axis, when working with a 2d array and axis=1, essentially does:

res = np.zeros_like(arr)
for i in range(arr.shape[0]):
    res[i, :] = func1d(arr[i, :])

If the input array has more dimensions, it constructs a more elaborate indexing object [i, j, k, :]. And it can handle cases where func1d returns an array of a different size than the input. But in any case it is just a generalized iteration tool.
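A runnable demo of that equivalence (with a toy func1d that centers each row):

import numpy as np

arr = np.arange(12).reshape(3, 4)
out = np.apply_along_axis(lambda row: row - row.mean(), 1, arr)

# the equivalent explicit loop
res = np.zeros_like(out)
for i in range(arr.shape[0]):
    res[i, :] = arr[i, :] - arr[i, :].mean()

assert np.array_equal(out, res)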
Moving the initial positions creation outside the loop:

yTimed = y[:, None] * times
positions = (numpy.round(x - yTimed) + 50).astype(int)
image = numpy.zeros((positions.shape[0], 80))
for i, pos in enumerate(positions):
    values = numpy.bincount(pos, weights)
    values = values[numpy.nonzero(values)]
    pos = numpy.unique(pos)
    image[i, pos] = values
Now I can cast this as an apply_along_axis problem, with an applyIt that takes a positions vector (with all the yTimed information) rather than a blank image vector.
def applyIt(pos, size, weights):
    acolumn = numpy.zeros(size)
    values = numpy.bincount(pos, weights)
    values = values[numpy.nonzero(values)]
    pos = numpy.unique(pos)
    acolumn[pos] = values
    return acolumn

image = numpy.apply_along_axis(applyIt, 1, positions, 80, weights)
Timing-wise I expect it to be a bit slower than my explicit iteration. It has to do more setup work, including a test call applyIt(positions[0,:], ...) to determine the size of its return array (i.e. image has a different shape than positions).
def csrmatrix(y, times, x, weights):
    yTimed = numpy.outer(y, times)
    n = y.shape[0]
    x3d = numpy.outer(numpy.ones(n), x)
    weights3d = numpy.outer(numpy.ones(n), weights)
    y3d = numpy.outer(y, numpy.ones(x.size))
    positions = (numpy.round(x3d - yTimed) + 50).astype(int)
    # print(y.shape, weights3d.shape, y3d.shape, positions.shape)
    matrix = sparse.csr_matrix((numpy.ravel(weights3d), (numpy.ravel(y3d), numpy.ravel(positions))))
    # print(repr(matrix))
    return matrix

# one call
image = csrmatrix(y, times, x, weights)

# iterative call
alist = []
for yi in numpy.arange(1, nStepsY + 1):
    alist.append(csrmatrix(numpy.array([yi]), times, x, weights))
def mystack(alist):
    # concatenate the COO coordinates without offsetting the rows
    row, col, data = [], [], []
    for A in alist:
        A = A.tocoo()
        row.extend(A.row)
        col.extend(A.col)
        data.extend(A.data)
    print(len(row), len(col), len(data))
    return sparse.csr_matrix((data, (row, col)))

vimage = mystack(alist)
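As a quick sanity check (a sketch, assuming the inferred shapes of the two matrices match), the stacked result should agree with the one-call version, since both are built from the same (row, col, weight) triples:

print(abs(image - vimage).max())  # 0.0 if the two sparse matrices agree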