LDA python library not taking sparse matrix as input - python

I am trying to use the lda 1.0.2 package for python.
The documentation says that sparse matrices are acceptable, but when I pass a sparse matrix to the transform() function, it throws the error:
The truth value of an array with more than one element is ambiguous.
Use a.any() or a.all().
The transform() function works fine with normal matrix.
Has anybody else faced a similar problem?
Any help would be great! Thanks in advance :)

I just got the same error. To reproduce:
from scipy.sparse import csr_matrix
import lda
X = csr_matrix([[1,0],[0,1]])
lda_test = lda.LDA(n_topics=2, n_iter=10)
lda_test.fit(X)
X_trans = lda_test.transform(X)
Which produces the error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-192-a1a0875bac02> in <module>()
5 lda_test = lda.LDA(n_topics=2, n_iter=10)
6 lda_test.fit(X)
----> 7 X_trans = lda_test.transform(X)
C:\Users\lidw6lw\PortablePython\App\lib\site-packages\lda\lda.pyc in transform(self, X, max_iter, tol)
173 n_topics = len(self.components_)
174 doc_topic = np.empty((len(X), n_topics))
--> 175 WS, DS = lda.utils.matrix_to_lists(X)
176 # TODO: this loop is parallelizable
177 for d in range(len(X)):
C:\Users\lidw6lw\PortablePython\App\lib\site-packages\lda\utils.pyc in matrix_to_lists(doc_word)
44 if np.count_nonzero(doc_word.sum(axis=1)) != doc_word.shape[0]:
45 logger.warning("all zero row in document-term matrix found")
---> 46 if np.count_nonzero(doc_word.sum(axis=0)) != doc_word.shape[1]:
47 logger.warning("all zero column in document-term matrix found")
48 sparse = True
C:\Users\lidw6lw\PortablePython\App\lib\site-packages\numpy\core\_methods.pyc in _sum(a, axis, dtype, out, keepdims)
23 def _sum(a, axis=None, dtype=None, out=None, keepdims=False):
24 return um.add.reduce(a, axis=axis, dtype=dtype,
---> 25 out=out, keepdims=keepdims)
26
27 def _prod(a, axis=None, dtype=None, out=None, keepdims=False):
C:\Users\lidw6lw\PortablePython\App\lib\site-packages\scipy\sparse\base.pyc in __bool__(self)
181 return True if self.nnz == 1 else False
182 else:
--> 183 raise ValueError("The truth value of an array with more than one "
184 "element is ambiguous. Use a.any() or a.all().")
185 __nonzero__ = __bool__
ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all().
Looks like it's due to lda.utils.matrix_to_lists
Both of the below work just fine:
X_trans = lda_test.fit(X.toarray())
X_trans2 = lda_test.fit_transform(X)
EDIT: It's actually the transform function that doesn't account for sparse matrices properly. Make a copy of the package, and in the code for transform just replace len(X) with X.shape[0] and comment out the np.atleast_2d(X) line. So the section right below the docstring in transform looks like this:
# X = np.atleast_2d(X)
phi = self.components_
alpha = self.alpha
# for debugging, let's not worry about the documents
n_topics = len(self.components_)
doc_topic = np.empty((X.shape[0], n_topics))
WS, DS = lda.utils.matrix_to_lists(X)
# TODO: this loop is parallelizable
for d in range(X.shape[0]):

I got a similar error recently:
ValueError: expected sparse matrix with integer values, found float values
This fixed the issue:
model.fit(X.toarray().astype(int))

Related

"ValueError: array is not broadcastable to correct shape" when using nested arrays in Autograd

I am using Autograd to compute the gradient of a float valued function. The function involves an array of arrays as arguments, and returns a float, and is quite complicated. A minimal example which produces this error is the function in the following code:
import autograd.numpy as np
from autograd import grad
def mod(param):
'''
param: Is an array of the form e.g. [0.1, [0.1,0.2]], where the second term in the list is an
array,
and the first term is a float.
'''
return param[0]+np.sum(np.array(param[1]))
I read the Autograd documentation and it seems I am doing things correctly since I am casting 'param[1]' explicitly as an array. When running the following:
dmod = grad(mod)
x = np.array([0.1,np.array([0.1,0.1])])
dmod(x)
I get the error message:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-289-5fc18f1d6a09> in <module>
3 x = np.array([0.1,np.array([0.1,0.1])])
4
----> 5 dmod(x)
~\Anaconda3\lib\site-packages\autograd\wrap_util.py in nary_f(*args, **kwargs)
18 else:
19 x = tuple(args[i] for i in argnum)
---> 20 return unary_operator(unary_f, x, *nary_op_args, **nary_op_kwargs)
21 return nary_f
22 return nary_operator
~\Anaconda3\lib\site-packages\autograd\differential_operators.py in grad(fun, x)
27 raise TypeError("Grad only applies to real scalar-output functions. "
28 "Try jacobian, elementwise_grad or holomorphic_grad.")
---> 29 return vjp(vspace(ans).ones())
30
31 #unary_to_nary
~\Anaconda3\lib\site-packages\autograd\core.py in vjp(g)
12 def vjp(g): return vspace(x).zeros()
13 else:
---> 14 def vjp(g): return backward_pass(g, end_node)
15 return vjp, end_value
16
~\Anaconda3\lib\site-packages\autograd\core.py in backward_pass(g, end_node)
21 ingrads = node.vjp(outgrad[0])
22 for parent, ingrad in zip(node.parents, ingrads):
---> 23 outgrads[parent] = add_outgrads(outgrads.get(parent), ingrad)
24 return outgrad[0]
25
~\Anaconda3\lib\site-packages\autograd\core.py in add_outgrads(prev_g_flagged, g)
174 else:
175 if sparse:
--> 176 return sparse_add(vspace(g), None, g), True
177 else:
178 return g, False
~\Anaconda3\lib\site-packages\autograd\tracer.py in f_wrapped(*args, **kwargs)
46 return new_box(ans, trace, node)
47 else:
---> 48 return f_raw(*args, **kwargs)
49 f_wrapped.fun = f_raw
50 f_wrapped._is_autograd_primitive = True
~\Anaconda3\lib\site-packages\autograd\core.py in sparse_add(vs, x_prev, x_new)
184 def sparse_add(vs, x_prev, x_new):
185 x_prev = x_prev if x_prev is not None else vs.zeros()
--> 186 return x_new.mut_add(x_prev)
187
188 class VSpace(object):
~\Anaconda3\lib\site-packages\autograd\numpy\numpy_vjps.py in mut_add(A)
696 idx = onp.array(idx, dtype='int64')
697 def mut_add(A):
--> 698 onp.add.at(A, idx, x)
699 return A
700 return SparseObject(vs, mut_add)
ValueError: array is not broadcastable to correct shape
---------------------------------------------------------------------
I am using an IPython notebook and my version of Autograd is 1.3.
Any help is much appreciated!
I think the problem was that the input which produced an error is a numpy array. If the list x = [0.1,[0.1,0.1]] is passed into the gradient 'dmod' instead, then the output looks correct.

weird error i cannot get my head around in pearsonr function

my code is like below:
predict_n=model.predict(x_test)
predict_n=predict_n.astype(np.float64)
corr_value, p_value = pearsonr(predict_n, y_test)
print(corr_value,round(p_value,4))
print(esc('31;1;4') +"correlation:"+corr_value+" p_value:"+p_value)
fig = plt.figure(figsize=(30, 30))
plot_corr(val_d[:,i,j], predict_n[:,i,j],corrs[i,j])
and when it hits the 3rd row it outputs this:
TypeError Traceback (most recent call last)
<ipython-input-18-b71eb959e83c> in <module>
66 predict_n=model.predict(x_test)
67 predict_n=predict_n.astype(np.float64)
---> 68 corr_value, p_value = pearsonr(predict_n, y_test)
69 print(corr_value,round(p_value,4))
70 print(esc('31;1;4') +"correlation:"+corr_value+" p_value:"+p_value)
~\Anaconda3\envs\deeplearning\lib\site-packages\scipy\stats\stats.py in pearsonr(x, y)
3517 return dtype(np.sign(x[1] - x[0])*np.sign(y[1] - y[0])), 1.0
3518
-> 3519 xmean = x.mean(dtype=dtype)
3520 ymean = y.mean(dtype=dtype)
3521
~\Anaconda3\envs\deeplearning\lib\site-packages\numpy\core\_methods.py in _mean(a, axis, dtype, out, keepdims)
149 is_float16_result = True
150
--> 151 ret = umr_sum(arr, axis, dtype, out, keepdims)
152 if isinstance(ret, mu.ndarray):
153 ret = um.true_divide(
TypeError: No loop matching the specified signature and casting was found for ufunc add
I found on this site that this error has to do with the variable type. That's why I added the second line above, so that both arrays are float64. When I, for example, run
print(np.shape(predict_n))
print(np.shape(y_test))
print(predict_n.dtype)
print(y_test.dtype)
I get the output
(367, 100, 1)
(367, 100, 1)
float64
float64
Can anyone please help me figure this out?
The shapes of your inputs are (367, 100, 1). pearsonr requires the inputs to be 1-d arrays¹. Unfortunately, that cryptic error message provides no help for figuring out what is wrong!
If your intent is to treat each input as a 1-d sequence of 36700 values, you can use pearsonr(predict_n.ravel(), y_test.ravel()).
If you expected pearsonr to implicitly loop over one of the dimensions, you'll have to write your own loop to do that.
¹ Eventually pearsonr will be enhanced with an axis argument, but for now, its inputs must be 1-d.

How to use cov function to a dataset iris python

I want to get the covariance from the iris data set, https://www.kaggle.com/jchen2186/machine-learning-with-iris-dataset/data
I am using numpy, and the function -> np.cov(iris)
with open("Iris.csv") as iris:
reader = csv.reader(iris)
data = []
next(reader)
for row in reader:
data.append(row)
for i in data:
i.pop(0)
i.pop(4)
iris = np.array(data)
np.cov(iris)
And I get this error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-4-bfb836354075> in <module>
----> 1 np.cov(iris)
D:\Anaconda\lib\site-packages\numpy\lib\function_base.py in cov(m, y, rowvar, bias, ddof, fweights, aweights)
2300 w *= aweights
2301
-> 2302 avg, w_sum = average(X, axis=1, weights=w, returned=True)
2303 w_sum = w_sum[0]
2304
D:\Anaconda\lib\site-packages\numpy\lib\function_base.py in average(a, axis, weights, returned)
354
355 if weights is None:
--> 356 avg = a.mean(axis)
357 scl = avg.dtype.type(a.size/avg.size)
358 else:
D:\Anaconda\lib\site-packages\numpy\core\_methods.py in _mean(a, axis, dtype, out, keepdims)
73 is_float16_result = True
74
---> 75 ret = umr_sum(arr, axis, dtype, out, keepdims)
76 if isinstance(ret, mu.ndarray):
77 ret = um.true_divide(
TypeError: cannot perform reduce with flexible type
I don't understand what it means..
So, if you want to modify your code, you could try reading Iris.csv with the pandas.read_csv function and then selecting the appropriate columns of your choice.
BUT, here is a small set of commands to ease this task. They use scikit-learn and numpy to load the iris dataset, obtain X and y, and compute the covariance matrix:
from sklearn.datasets import load_iris
import numpy as np
data = load_iris()
X = data['data']
y = data['target']
np.cov(X)
Hope this has helped.

Is silhouette coefficient subsampling stratified in sklearn ?

I'm again having trouble using the scikit-learn silhouette coefficient. (first question was here : silhouette coefficient in python with sklearn).
I make a clustering that can be very unbalanced but with a lot of individuals so I want to use the sampling parameter of the silhouette coefficient. I was wondering if the subsampling was stratified, meaning sampling with respect to clusters. I take the iris dataset as an example but my dataset is far bigger (and that's why I need sampling).
My code is :
from sklearn import datasets
from sklearn.metrics import *
iris = datasets.load_iris()
col = iris.feature_names
name = iris.target_names
X = pd.DataFrame(iris.data, columns = col)
y = iris.target
s = silhouette_score(X.values, y, metric='euclidean',sample_size=50)
which works. But now if I bias the labels with:
y[0:148] =0
y[148] = 1
y[149] = 2
print y
s = silhouette_score(X.values, y, metric='euclidean',sample_size=50)
I get :
ValueError Traceback (most recent call last)
<ipython-input-12-68a7fba49c54> in <module>()
4 y[149] =2
5 print y
----> 6 s = silhouette_score(X.values, y, metric='euclidean',sample_size=50)
/usr/local/lib/python2.7/dist-packages/sklearn/metrics/cluster/unsupervised.pyc in silhouette_score(X, labels, metric, sample_size, random_state, **kwds)
82 else:
83 X, labels = X[indices], labels[indices]
---> 84 return np.mean(silhouette_samples(X, labels, metric=metric, **kwds))
85
86
/usr/local/lib/python2.7/dist-packages/sklearn/metrics/cluster/unsupervised.pyc in silhouette_samples(X, labels, metric, **kwds)
146 for i in range(n)])
147 B = np.array([_nearest_cluster_distance(distances[i], labels, i)
--> 148 for i in range(n)])
149 sil_samples = (B - A) / np.maximum(A, B)
150 # nan values are for clusters of size 1, and should be 0
/usr/local/lib/python2.7/dist-packages/sklearn/metrics/cluster/unsupervised.pyc in _nearest_cluster_distance(distances_row, labels, i)
200 label = labels[i]
201 b = np.min([np.mean(distances_row[labels == cur_label])
--> 202 for cur_label in set(labels) if not cur_label == label])
203 return b
/usr/lib/python2.7/dist-packages/numpy/core/fromnumeric.pyc in amin(a, axis, out, keepdims)
1980 except AttributeError:
1981 return _methods._amin(a, axis=axis,
-> 1982 out=out, keepdims=keepdims)
1983 # NOTE: Dropping the keepdims parameter
1984 return amin(axis=axis, out=out)
/usr/lib/python2.7/dist-packages/numpy/core/_methods.pyc in _amin(a, axis, out, keepdims)
12 def _amin(a, axis=None, out=None, keepdims=False):
13 return um.minimum.reduce(a, axis=axis,
---> 14 out=out, keepdims=keepdims)
15
16 def _sum(a, axis=None, dtype=None, out=None, keepdims=False):
ValueError: zero-size array to reduction operation minimum which has no identity
an error which I think is due to the fact that the sampling is random, not stratified, so it has not taken the two small clusters into account.
Am I correct ?
Yes you are correct. The sampling is not stratified since it doesn't take the labels into consideration when doing the sampling.
This is how the sample is taken (version 0.14.1)
indices = random_state.permutation(X.shape[0])[:sample_size]
Where X is the input array of size [n_samples_a, n_samples_a] or [n_samples_a, n_features].
I think you are right, the current implementation does not support balanced resampling.
Just an update for year 2020:
As of scikit-learn 0.22.1, the sampling remains random (i.e. not stratified).
The source code is still:
indices = random_state.permutation(X.shape[0])[:sample_size]

silhouette coefficient in python with sklearn

I'm having trouble computing the silhouette coefficient in python with sklearn.
Here is my code :
from sklearn import datasets
from sklearn.metrics import *
iris = datasets.load_iris()
X = pd.DataFrame(iris.data, columns = col)
y = pd.DataFrame(iris.target,columns = ['cluster'])
s = silhouette_score(X, y, metric='euclidean',sample_size=int(50))
I get the error :
IndexError: indices are out-of-bounds
I want to use the sample_size parameter because when working with very large datasets, the silhouette takes too long to compute. Does anyone know how this parameter works?
Complete traceback :
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
<ipython-input-72-70ff40842503> in <module>()
4 X = pd.DataFrame(iris.data, columns = col)
5 y = pd.DataFrame(iris.target,columns = ['cluster'])
----> 6 s = silhouette_score(X, y, metric='euclidean',sample_size=50)
/usr/local/lib/python2.7/dist-packages/sklearn/metrics/cluster/unsupervised.pyc in silhouette_score(X, labels, metric, sample_size, random_state, **kwds)
81 X, labels = X[indices].T[indices].T, labels[indices]
82 else:
---> 83 X, labels = X[indices], labels[indices]
84 return np.mean(silhouette_samples(X, labels, metric=metric, **kwds))
85
/usr/local/lib/python2.7/dist-packages/pandas/core/frame.pyc in __getitem__(self, key)
1993 if isinstance(key, (np.ndarray, list)):
1994 # either boolean or fancy integer index
-> 1995 return self._getitem_array(key)
1996 elif isinstance(key, DataFrame):
1997 return self._getitem_frame(key)
/usr/local/lib/python2.7/dist-packages/pandas/core/frame.pyc in _getitem_array(self, key)
2030 else:
2031 indexer = self.ix._convert_to_indexer(key, axis=1)
-> 2032 return self.take(indexer, axis=1, convert=True)
2033
2034 def _getitem_multilevel(self, key):
/usr/local/lib/python2.7/dist-packages/pandas/core/frame.pyc in take(self, indices, axis, convert)
2981 if convert:
2982 axis = self._get_axis_number(axis)
-> 2983 indices = _maybe_convert_indices(indices, len(self._get_axis(axis)))
2984
2985 if self._is_mixed_type:
/usr/local/lib/python2.7/dist-packages/pandas/core/indexing.pyc in _maybe_convert_indices(indices, n)
1038 mask = (indices>=n) | (indices<0)
1039 if mask.any():
-> 1040 raise IndexError("indices are out-of-bounds")
1041 return indices
1042
IndexError: indices are out-of-bounds
silhouette_score expects regular numpy arrays as input. Why wrap your arrays in data frames?
>>> silhouette_score(iris.data, iris.target, sample_size=50)
0.52999903616584543
From the traceback, you can observe that the code is doing fancy indexing (subsampling) on the first axis. By default indexing a dataframe will index the columns and not the rows hence the issue you observe.

Categories