I'm trying to use pymc3 with a likelihood function derived from some observed data. This observed data doesn't fit any nice, standard distribution, so I want to define my own, based on these observations.
One approach is to use kernel density estimation over the observations. This was possible in pymc2, but doesn't play nicely with the Theano variables in pymc3.
In my code below I'm just generating some dummy data that is normally distributed. As my prior, I'm essentially assuming a uniform distribution for my observations.
Here's my code:
from scipy import stats
import numpy as np
import pymc3 as pm
from sklearn.neighbors.kde import KernelDensity
data = np.sort(stats.norm.rvs(loc=0, scale=1, size=1000))
kde = KernelDensity(kernel='gaussian', bandwidth=0.1).fit(data.reshape(-1, 1))
def get_log_likelihood(x):
    return kde.score_samples(x)

with pm.Model() as test_model:
    x = pm.Uniform('prior rv', lower=-10, upper=10)
    obs = pm.DensityDist('observed likelihood', get_log_likelihood, observed={'x': x})

    step = pm.Metropolis()
    trace = pm.sample(200, step=step)
The error I receive seems to be the kde score_samples function blowing up as it expects an array, but x is a Theano variable.
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-49-4efbbe7376dc> in <module>()
1 with pm.Model() as test_model:
2 x = pm.Uniform('prior rv', lower=0.0, upper=1e6)
----> 3 obs = pm.DensityDist('observed likelihood', get_log_likelihood, observed={'x': x})
4
5 step = pm.Metropolis()
~/research_notebooks/venv/lib/python3.6/site-packages/pymc3/distributions/distribution.py in __new__(cls, name, *args, **kwargs)
40 total_size = kwargs.pop('total_size', None)
41 dist = cls.dist(*args, **kwargs)
---> 42 return model.Var(name, dist, data, total_size)
43 else:
44 raise TypeError("Name needs to be a string but got: {}".format(name))
~/research_notebooks/venv/lib/python3.6/site-packages/pymc3/model.py in Var(self, name, dist, data, total_size)
825 with self:
826 var = MultiObservedRV(name=name, data=data, distribution=dist,
--> 827 total_size=total_size, model=self)
828 self.observed_RVs.append(var)
829 if var.missing_values:
~/research_notebooks/venv/lib/python3.6/site-packages/pymc3/model.py in __init__(self, name, data, distribution, total_size, model)
1372 self.missing_values = [datum.missing_values for datum in self.data.values()
1373 if datum.missing_values is not None]
-> 1374 self.logp_elemwiset = distribution.logp(**self.data)
1375 # The logp might need scaling in minibatches.
1376 # This is done in `Factor`.
<ipython-input-48-535f58ce543b> in get_log_likelihood(x)
1 def get_log_likelihood(x):
----> 2 return kde.score_samples(x)
~/research_notebooks/venv/lib/python3.6/site-packages/sklearn/neighbors/kde.py in score_samples(self, X)
150 # For it to be a probability, we must scale it. For this reason
151 # we'll also scale atol.
--> 152 X = check_array(X, order='C', dtype=DTYPE)
153 N = self.tree_.data.shape[0]
154 atol_N = self.atol * N
~/research_notebooks/venv/lib/python3.6/site-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
431 force_all_finite)
432 else:
--> 433 array = np.array(array, dtype=dtype, order=order, copy=copy)
434
435 if ensure_2d:
ValueError: setting an array element with a sequence.
Any help would be greatly appreciated. Thanks!
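Update: one workaround I've been considering (just a sketch, untested) is to wrap the KDE call in a Theano Op via theano.compile.ops.as_op, so that score_samples only ever sees a concrete NumPy value. Since as_op provides no gradient, this would limit me to gradient-free samplers such as Metropolis, which is what I'm using anyway:
import theano.tensor as tt
from theano.compile.ops import as_op

# Untested sketch, building on the code above (np, pm and kde already defined).
# itypes/otypes assume the random variable is a scalar.
@as_op(itypes=[tt.dscalar], otypes=[tt.dscalar])
def kde_logp(value):
    # score_samples expects a 2-D array of shape (n_samples, n_features)
    return np.asarray(kde.score_samples(np.asarray(value).reshape(-1, 1))[0])

with pm.Model() as test_model:
    x = pm.Uniform('prior rv', lower=-10, upper=10)
    obs = pm.DensityDist('observed likelihood', kde_logp, observed={'value': x})
    step = pm.Metropolis()
    trace = pm.sample(200, step=step)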
This error keeps coming up when I try to compute mutual information (MI) scores. My code is as follows:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_selection import mutual_info_regression

# X and data come from earlier in the notebook (see the Kaggle link below)
X_new = X.copy()
X_new = X_new.fillna(0)
y = data.SalePrice

def make_mi_scores(X, y):
    X = X.copy()
    for colname in X.select_dtypes(["object", "category"]):
        X[colname], _ = X[colname].factorize()
    discrete_features = [pd.api.types.is_integer_dtype(t) for t in X.dtypes]
    mi_scores = mutual_info_regression(X, y, discrete_features=discrete_features, random_state=0)
    mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
    mi_scores = mi_scores.sort_values(ascending=False)
    return mi_scores

def plot_mi_scores(scores):
    scores = scores.sort_values(ascending=True)
    width = np.arange(len(scores))
    ticks = list(scores.index)
    plt.barh(width, scores)
    plt.yticks(width, ticks)
    plt.title("Mutual Information Scores")

plt.figure(dpi=100, figsize=(8, 5))
plot_mi_scores(make_mi_scores(X_new, y))
If you want the full notebook, here is the link: https://www.kaggle.com/code/snigdhkarki/house-price-competition
The error is as follows
ValueError Traceback (most recent call last)
/tmp/ipykernel_19/1575243112.py in <module>
42
43 plt.figure(dpi=100, figsize=(8, 5))
---> 44 plot_mi_scores(make_mi_scores(X_new,y))
/tmp/ipykernel_19/1575243112.py in make_mi_scores(X, y)
28 print(X.isnull().any().any())
29 print(y.isnull().any().any())
---> 30 mi_scores = mutual_info_regression(X, y, discrete_features=discrete_features, random_state=0)
31 mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
32 mi_scores = mi_scores.sort_values(ascending=False)
/opt/conda/lib/python3.7/site-packages/sklearn/feature_selection/_mutual_info.py in mutual_info_regression(X, y, discrete_features, n_neighbors, copy, random_state)
382 of a Random Vector", Probl. Peredachi Inf., 23:2 (1987), 9-16
383 """
--> 384 return _estimate_mi(X, y, discrete_features, False, n_neighbors, copy, random_state)
385
386
/opt/conda/lib/python3.7/site-packages/sklearn/feature_selection/_mutual_info.py in _estimate_mi(X, y, discrete_features, discrete_target, n_neighbors, copy, random_state)
300 mi = [
301 _compute_mi(x, y, discrete_feature, discrete_target, n_neighbors)
--> 302 for x, discrete_feature in zip(_iterate_columns(X), discrete_mask)
303 ]
304
/opt/conda/lib/python3.7/site-packages/sklearn/feature_selection/_mutual_info.py in <listcomp>(.0)
300 mi = [
301 _compute_mi(x, y, discrete_feature, discrete_target, n_neighbors)
--> 302 for x, discrete_feature in zip(_iterate_columns(X), discrete_mask)
303 ]
304
/opt/conda/lib/python3.7/site-packages/sklearn/feature_selection/_mutual_info.py in _compute_mi(x, y, x_discrete, y_discrete, n_neighbors)
160 return mutual_info_score(x, y)
161 elif x_discrete and not y_discrete:
--> 162 return _compute_mi_cd(y, x, n_neighbors)
163 elif not x_discrete and y_discrete:
164 return _compute_mi_cd(x, y, n_neighbors)
/opt/conda/lib/python3.7/site-packages/sklearn/feature_selection/_mutual_info.py in _compute_mi_cd(c, d, n_neighbors)
137 radius = radius[mask]
138
--> 139 kd = KDTree(c)
140 m_all = kd.query_radius(c, radius, count_only=True, return_distance=False)
141 m_all = np.array(m_all) - 1.0
sklearn/neighbors/_binary_tree.pxi in sklearn.neighbors._kd_tree.BinaryTree.__init__()
/opt/conda/lib/python3.7/site-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator)
806 "Found array with %d sample(s) (shape=%s) while a"
807 " minimum of %d is required%s."
--> 808 % (n_samples, array.shape, ensure_min_samples, context)
809 )
810
ValueError: Found array with 0 sample(s) (shape=(0, 1)) while a minimum of 1 is required.
This question has only been asked in a few places, and even there I was unable to find an answer.
The issue arises where you call mutual_info_regression here:
mi_scores = mutual_info_regression(X, y, discrete_features=discrete_features, random_state=0)
As per sklearn's documentation, the parameter discrete_features should be a boolean mask that has True for discrete variables and False otherwise.
I checked your Kaggle code and it seems like your technique for identifying discrete and continuous features in your data frame is wrong.
A simple hack to get the code running would be to mark all features as continuous using the following code:
discrete_features = [False]*73
# 73 is the number of columns X has
However, the results may be inaccurate, since mutual_info_regression relies on discrete and continuous features being identified correctly.
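If you'd rather not hardcode the column count, a minimal variant of the same hack (treating every feature as continuous, where X is the DataFrame passed to make_mi_scores) is:
import numpy as np

# All-False mask: every feature is treated as continuous; length follows the DataFrame
discrete_features = np.zeros(X.shape[1], dtype=bool)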
I'm trying to replicate the results described in How to Determine the Best Fitting Data Distribution Using Python. I then used the following code:
import numpy as np
from distfit import distfit
# Generate 10000 normal distribution samples with mean 0, std dev of 3
X = np.random.normal(0, 3, 10000)
# Initialize distfit
dist = distfit()
# Determine best-fitting probability distribution for data
dist.fit_transform(X)
Anyway, I obtained the following error:
[distfit] >fit..
[distfit] >transform..
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-8-02f73e7f157d> in <module>
9
10 # Determine best-fitting probability distribution for data
---> 11 dist.fit_transform(X)
~\Anaconda3\lib\site-packages\distfit\distfit.py in fit_transform(self, X, verbose)
275 self.fit(verbose=verbose)
276 # Transform X based on functions
--> 277 self.transform(X, verbose=verbose)
278 # Store
279 results = _store(self.alpha,
~\Anaconda3\lib\site-packages\distfit\distfit.py in transform(self, X, verbose)
214 if self.method=='parametric':
215 # Compute best distribution fit on the empirical X
--> 216 out_summary, model = _compute_score_distribution(X, X_bins, y_obs, self.distributions, self.stats, verbose=verbose)
217 # Determine confidence intervals on the best fitting distribution
218 model = _compute_cii(self, model, verbose=verbose)
~\Anaconda3\lib\site-packages\distfit\distfit.py in _compute_score_distribution(data, X, y_obs, DISTRIBUTIONS, stats, verbose)
906 model['params'] = (0.0, 1.0)
907 best_score = np.inf
--> 908 df = pd.DataFrame(index=range(0, len(DISTRIBUTIONS)), columns=['distr', 'score', 'LLE', 'loc', 'scale', 'arg'])
909 max_name_len = np.max(list(map(lambda x: len(x.name), DISTRIBUTIONS)))
910
~\Anaconda3\lib\site-packages\pandas\core\frame.py in __init__(self, data, index, columns, dtype, copy)
346 dtype=dtype, copy=copy)
347 elif isinstance(data, dict):
--> 348 mgr = self._init_dict(data, index, columns, dtype=dtype)
349 elif isinstance(data, ma.MaskedArray):
350 import numpy.ma.mrecords as mrecords
~\Anaconda3\lib\site-packages\pandas\core\frame.py in _init_dict(self, data, index, columns, dtype)
449 nan_dtype = dtype
450 v = construct_1d_arraylike_from_scalar(np.nan, len(index),
--> 451 nan_dtype)
452 arrays.loc[missing] = [v] * missing.sum()
453
~\Anaconda3\lib\site-packages\pandas\core\dtypes\cast.py in construct_1d_arraylike_from_scalar(value, length, dtype)
1194 else:
1195 if not isinstance(dtype, (np.dtype, type(np.dtype))):
-> 1196 dtype = dtype.dtype
1197
1198 # coerce if we have nan for an integer dtype
AttributeError: type object 'object' has no attribute 'dtype'
(I'm using Jupyter.)
How can I fix this problem?
The solution to the above error, as can be seen in the comments of the question, was to upgrade pandas. This issue appears in versions 1.0.4 and lower.
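As a quick sanity check (a minimal sketch; the version cut-off is the one stated above), you can verify which pandas version is installed and upgrade if needed:
import pandas as pd
print(pd.__version__)  # if this prints 1.0.4 or lower, upgrade
# then, from a shell: pip install --upgrade pandas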
I would like to cluster sets of spatial data using my own metric. The data comes as pairs of (x,y) values in a dataframe, where each set of pairs has an id. Like in the following example where I have three sets of points:
import pandas as pd
import numpy as np
df = pd.DataFrame({'id': [1] * 4 + [2] * 5 + [3] * 3,
                   'x': np.random.random(12),
                   'y': np.random.random(12)})
df['xy'] = df[['x', 'y']].apply(lambda row: [row['x'], row['y']], axis=1)
Here is the distance function I would like to use:
from scipy.spatial.distance import directed_hausdorff
def some_distance(u, v):
    return max(directed_hausdorff(u, v)[0], directed_hausdorff(v, u)[0])
This function computes the Hausdorff distance, i.e. the distance between two subsets u and v of n-dimensional space. In my case, I would like to use this distance function to cluster subsets of the real plane. In the data above there are three such subsets (ids from 1 to 3) so the resulting distance matrix should be 3x3.
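To make the intent concrete, here is a tiny standalone check of the symmetric Hausdorff distance on two made-up point sets (the values are arbitrary, purely for illustration):
import numpy as np
from scipy.spatial.distance import directed_hausdorff

u = np.array([[0.0, 0.0], [1.0, 0.0]])               # first point set
v = np.array([[0.0, 1.0], [1.0, 1.0], [2.0, 1.0]])   # second point set

# symmetric Hausdorff distance: max of the two directed distances
d = max(directed_hausdorff(u, v)[0], directed_hausdorff(v, u)[0])
print(d)  # ~1.414 here: (2, 1) is sqrt(2) away from its nearest neighbour (1, 0) in u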
My idea for the clustering step was to use sklearn.cluster.AgglomerativeClustering with a precomputed metric, which in turn I want to compute with sklearn.metrics.pairwise.pairwise_distances.
from sklearn.metrics.pairwise import pairwise_distances

def to_np_array(col):
    return np.array(list(col.values))

X = df.groupby('id')['xy'].apply(to_np_array).as_matrix()
m = pairwise_distances(X, X, metric=some_distance)
However, the last line is giving me an error:
ValueError: setting an array element with a sequence.
What does work fine, however, is calling some_distance(X[1], X[2]).
My hunch is that X needs to be in a different format for pairwise_distances to work. Any ideas on how to make this work, or how to compute the matrix myself so I can stick it into sklearn.cluster.AgglomerativeClustering?
The error stack is
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-3-e34155622595> in <module>
12 def some_distance(u, v):
13 return max(directed_hausdorff(u, v)[0], directed_hausdorff(v, u)[0])
---> 14 m = pairwise_distances(X, X, metric=some_distance)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\metrics\pairwise.py in pairwise_distances(X, Y, metric, n_jobs, **kwds)
1430 func = partial(distance.cdist, metric=metric, **kwds)
1431
-> 1432 return _parallel_pairwise(X, Y, func, n_jobs, **kwds)
1433
1434
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\metrics\pairwise.py in _parallel_pairwise(X, Y, func, n_jobs, **kwds)
1065
1066 if effective_n_jobs(n_jobs) == 1:
-> 1067 return func(X, Y, **kwds)
1068
1069 # TODO: in some cases, backend='threading' may be appropriate
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\metrics\pairwise.py in _pairwise_callable(X, Y, metric, **kwds)
1079 """Handle the callable case for pairwise_{distances,kernels}
1080 """
-> 1081 X, Y = check_pairwise_arrays(X, Y)
1082
1083 if X is Y:
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\metrics\pairwise.py in check_pairwise_arrays(X, Y, precomputed, dtype)
106 if Y is X or Y is None:
107 X = Y = check_array(X, accept_sparse='csr', dtype=dtype,
--> 108 warn_on_dtype=warn_on_dtype, estimator=estimator)
109 else:
110 X = check_array(X, accept_sparse='csr', dtype=dtype,
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\utils\validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
525 try:
526 warnings.simplefilter('error', ComplexWarning)
--> 527 array = np.asarray(array, dtype=dtype, order=order)
528 except ComplexWarning:
529 raise ValueError("Complex data not supported\n"
C:\ProgramData\Anaconda3\lib\site-packages\numpy\core\numeric.py in asarray(a, dtype, order)
536
537 """
--> 538 return array(a, dtype, copy=False, order=order)
539
540
ValueError: setting an array element with a sequence.
Try this:
import numpy as np
import pandas as pd
from scipy.spatial.distance import directed_hausdorff
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.cluster import AgglomerativeClustering

df = pd.DataFrame({'id': [1] * 4 + [2] * 5 + [3] * 3,
                   'x': np.random.random(12),
                   'y': np.random.random(12)})
df['xy'] = df[['x', 'y']].apply(lambda row: [row['x'], row['y']], axis=1)

def some_distance(u, v):
    return max(directed_hausdorff(u, v)[0], directed_hausdorff(v, u)[0])

def to_np_array(col):
    return np.array(list(col.values))

X = df.groupby('id')['xy'].apply(to_np_array)

# build the symmetric distance matrix by hand instead of using pairwise_distances
d = np.zeros((len(X), len(X)))
for i, u in enumerate(X):
    for j, v in list(enumerate(X))[i:]:
        d[i, j] = some_distance(u, v)
        d[j, i] = d[i, j]
And now when you print d you get this:
array([[0. , 0.58928274, 0.40767213],
[0.58928274, 0. , 0.510095 ],
[0.40767213, 0.510095 , 0. ]])
And for clustering:
cluster = AgglomerativeClustering(n_clusters=2, affinity='precomputed', linkage = 'average')
cluster.fit(d)
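To see which id ended up in which cluster, you can read the labels back off the fitted model; the row order of d follows the groupby order of X (a small sketch, and the example output will vary with the random data):
print(dict(zip(X.index, cluster.labels_)))  # e.g. {1: 0, 2: 1, 3: 0}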
It would help if you showed some of the variables. Fortunately you gave enough code to run it. For example the dataframe:
In [9]: df
Out[9]:
id x y xy
0 1 0.428437 0.267264 [0.42843730501201727, 0.2672637429997736]
1 1 0.944687 0.023323 [0.9446872371859233, 0.023322969159167317]
2 1 0.091055 0.683154 [0.09105472832178496, 0.6831542985617349]
3 1 0.474522 0.313541 [0.4745222021519122, 0.3135405569298565]
4 2 0.835237 0.491541 [0.8352366339973815, 0.4915408434083248]
5 2 0.905918 0.854030 [0.9059178939221513, 0.8540297797160584]
6 2 0.182154 0.909656 [0.18215390836391654, 0.9096555360282939]
7 2 0.225270 0.522193 [0.22527013482912195, 0.5221926076838651]
8 2 0.924208 0.858627 [0.9242076604008371, 0.8586274362498842]
9 3 0.419813 0.634741 [0.41981292371175905, 0.6347409684931891]
10 3 0.954141 0.795452 [0.9541413559045294, 0.7954524369652217]
11 3 0.896593 0.271187 [0.8965932351250882, 0.2711872631673109]
And your X:
In [10]: X
Out[10]:
array([array([[0.42843731, 0.26726374],
[0.94468724, 0.02332297],
[0.09105473, 0.6831543 ],
[0.4745222 , 0.31354056]]),
array([[0.83523663, 0.49154084],
[0.90591789, 0.85402978],
[0.18215391, 0.90965554],
[0.22527013, 0.52219261],
[0.92420766, 0.85862744]]),
array([[0.41981292, 0.63474097],
[0.95414136, 0.79545244],
[0.89659324, 0.27118726]])], dtype=object)
That is a (3,) object array - in effect a list of 3 2d arrays with different sizes ((4,2), (5,2), (3,2)). That's one array for each group.
How is pairwise_distances supposed to feed that to your distance code? The pairwise docs say X should be an (n, m) array - n samples, m features. Your X doesn't fit that description!
The error is probably produced when trying to make a float array from X:
In [12]: np.asarray(X,dtype=float)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-12-a6e08bb1590c> in <module>
----> 1 np.asarray(X,dtype=float)
/usr/local/lib/python3.6/dist-packages/numpy/core/numeric.py in asarray(a, dtype, order)
536
537 """
--> 538 return array(a, dtype, copy=False, order=order)
539
540
ValueError: setting an array element with a sequence.
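If you really want to keep using pairwise_distances with these ragged per-group arrays, one workaround sometimes used (a sketch, not something from the docs) is to pass plain integer indices and resolve them inside the callable, assuming X is the (3,) object array shown above:
from sklearn.metrics.pairwise import pairwise_distances

# indices form a regular (n, 1) numeric array, so check_array is happy;
# the callable then looks up the actual point sets
idx = np.arange(len(X)).reshape(-1, 1)
m = pairwise_distances(idx, idx,
                       metric=lambda a, b: some_distance(X[int(a[0])], X[int(b[0])]))
For a 3x3 matrix, though, the explicit double loop in the other answer is just as easy.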
I'm new to Agglomerative Clustering and doc2vec, so I hope somebody can help me with the following issue.
This is my code:
model = AgglomerativeClustering(linkage='average',
                                connectivity=None, n_clusters=2)
X = model_dm.docvecs.doctag_syn0
model.fit(X, y=None)
model.fit_predict(X, y=None)
What I want is to cluster the observations using the average of the distances between them, but I got the following error:
MemoryErrorTraceback (most recent call last)
<ipython-input-22-d8b93bc6abe1> in <module>()
2 model = AgglomerativeClustering(linkage='average',connectivity=None,n_clusters=2)
3 X = model_dm.docvecs.doctag_syn0
----> 4 model.fit(X, y=None)
5
/usr/local/lib64/python2.7/site-packages/sklearn/cluster/hierarchical.pyc in fit(self, X, y)
763 n_components=self.n_components,
764 n_clusters=n_clusters,
--> 765 **kwargs)
766 # Cut the tree
767 if compute_full_tree:
/usr/local/lib64/python2.7/site-packages/sklearn/externals/joblib/memory.pyc in __call__(self, *args, **kwargs)
281
282 def __call__(self, *args, **kwargs):
--> 283 return self.func(*args, **kwargs)
284
285 def call_and_shelve(self, *args, **kwargs):
/usr/local/lib64/python2.7/site-packages/sklearn/cluster/hierarchical.pyc in _average_linkage(*args, **kwargs)
547 def _average_linkage(*args, **kwargs):
548 kwargs['linkage'] = 'average'
--> 549 return linkage_tree(*args, **kwargs)
550
551
/usr/local/lib64/python2.7/site-packages/sklearn/cluster/hierarchical.pyc in linkage_tree(X, connectivity, n_components, n_clusters, linkage, affinity, return_distance)
428 i, j = np.triu_indices(X.shape[0], k=1)
429 X = X[i, j]
--> 430 out = hierarchy.linkage(X, method=linkage, metric=affinity)
431 children_ = out[:, :2].astype(np.int)
432
/usr/local/lib64/python2.7/site-packages/scipy/cluster/hierarchy.pyc in linkage(y, method, metric)
669 'matrix looks suspiciously like an uncondensed '
670 'distance matrix')
--> 671 y = distance.pdist(y, metric)
672 else:
673 raise ValueError("`y` must be 1 or 2 dimensional.")
/usr/local/lib64/python2.7/site-packages/scipy/spatial/distance.pyc in pdist(X, metric, p, w, V, VI)
1375
1376 m, n = s
-> 1377 dm = np.zeros((m * (m - 1)) // 2, dtype=np.double)
1378
1379 # validate input for multi-args metrics
MemoryError:
You are getting a MemoryError. This is a reliable indicator that you are running out of memory, on the line indicated.
The line indicates an attempt to allocate an np.zeros() array of (m * (m - 1)) // 2 values of type double (8 bytes). Looking at the scipy source, m, here, is the number of vectors in X, aka model_dm.docvecs.doctag_syn0.shape[0].
So, how many docvecs are you working with? If it's 200,000, you will need...
((200000 * 199999) // 2) * 8 bytes
...or about 160GB of RAM for that np.zeros() allocation to succeed. (If you have more docvecs, even more RAM.)
(Agglomerative clustering needs to know all the pairwise distances, which the scipy implementation tries to calculate and store up front, and that is very space-consuming.)
You may need to have more RAM, or use fewer docvecs, or use a different clustering algorithm, or use an implementation which is lazier about calculating distances (but is then much, much slower because it will often be recalculating, rather than reusing, distances it needs repeatedly).
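As a quick back-of-the-envelope check (a sketch using the same formula as above), the allocation grows quadratically with the number of vectors:
def condensed_distance_bytes(m, itemsize=8):
    # size of scipy's condensed pairwise-distance array for m float64 vectors
    return (m * (m - 1) // 2) * itemsize

for m in (10000, 50000, 200000):
    print(m, round(condensed_distance_bytes(m) / 1e9, 1), "GB")
# 10000 0.4 GB
# 50000 10.0 GB
# 200000 160.0 GB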
I'm trying to compute the cosine similarity on the result of a K-Means algorithm:
tf_vectorizer = CountVectorizer()
tf = tf_vectorizer.fit_transform(tokens_list)
svd = TruncatedSVD(100)
lsa = make_pipeline(svd, Normalizer(copy=False))
lsa_tf = lsa.fit_transform(tf)
Above I'm building my two feature matrices (tf and lsa_tf), and I want to compute the cosine_similarity for both of them.
nb_cluster = 5
km = KMeans(n_clusters=nb_cluster, init='k-means++', max_iter=300, n_init=3, random_state=0)
km.fit(lsa_tf)
Above I'm applying K-Means.
cluster_matrix = []
for cluster_number in range(0, nb_cluster):
    index = 0
    for label in km.labels_:
        if label == cluster_number:
            cluster_matrix.append(lsa_tf[index])
        index += 1
Here I'm creating a matrix with all my vectors grouped by label from the clustering result.
downsample_matrix = []
downsample_coefficient = 0
for vector in cluster_matrix:
    downsample_coefficient += 1
    if downsample_coefficient == 5:
        downsample_matrix.append(vector)
        downsample_coefficient = 0
Above I'm simply downsampling my matrix, otherwise it's too big to be displayed.
similarity_matrix = cosine_similarity(downsample_matrix)
plt.matshow(similarity_matrix)
plt.show()
And finally here I'm using cosine_similarity and displaying the resulting matrix.
This code works well for lsa_tf, but it raises the following error for tf when trying to compute the cosine_similarity:
ValueError Traceback (most recent call last)
<ipython-input-27-5997ca6abb2d> in <module>()
19 downsample_matrix.append(vector)
20 downsample_coefficient = 0
---> 21 similarity_matrix = cosine_similarity(downsample_matrix)
22 plt.matshow(similarity_matrix)
23 plt.show()
/home/venv/lib/python3.5/site-packages/sklearn/metrics/pairwise.py in cosine_similarity(X, Y, dense_output)
908 # to avoid recursive import
909
--> 910 X, Y = check_pairwise_arrays(X, Y)
911
912 X_normalized = normalize(X, copy=True)
/home/venv/lib/python3.5/site-packages/sklearn/metrics/pairwise.py in check_pairwise_arrays(X, Y, precomputed, dtype)
104 if Y is X or Y is None:
105 X = Y = check_array(X, accept_sparse='csr', dtype=dtype,
--> 106 warn_on_dtype=warn_on_dtype, estimator=estimator)
107 else:
108 X = check_array(X, accept_sparse='csr', dtype=dtype,
/home/venv/lib/python3.5/site-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
380 force_all_finite)
381 else:
--> 382 array = np.array(array, dtype=dtype, order=order, copy=copy)
383
384 if ensure_2d:
ValueError: setting an array element with a sequence.
What is the difference between my tf and lsa_tf? How can I apply cosine_similarity to both of them?
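My current suspicion (just a guess I'd like confirmed) is that tf is a scipy sparse matrix coming out of CountVectorizer, while lsa_tf is a dense NumPy array coming out of the LSA pipeline, so appending tf[index] builds a list of 1-row sparse matrices that cosine_similarity cannot turn into a regular array. A small check, plus the fix I've been considering (untested):
print(type(tf))      # scipy.sparse CSR matrix from CountVectorizer
print(type(lsa_tf))  # dense numpy.ndarray from the LSA pipeline

# possible fix: densify each row before appending it
cluster_matrix.append(tf[index].toarray().ravel())
# (cosine_similarity also accepts a sparse matrix directly, without the list-building step)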