setting an array element with a sequence Scikit learn cosine_similarity

setting an array element with a sequence Scikit learn cosine_similarity - python

I'm trying to compute the cosine similarity on the result of a K-Means algorithm
tf_vectorizer = CountVectorizer()
tf = tf_vectorizer.fit_transform(tokens_list)
svd = TruncatedSVD(100)
lsa = make_pipeline(svd, Normalizer(copy=False))
lsa_tf = lsa.fit_transform(tf)
Above I'm building my two matrix of features (lsa and lsa_tf) and I want to build the cosine_similarity for both of them.
nb_cluster = 5
km = KMeans(n_clusters=nb_cluster, init='k-means++', max_iter=300, n_init=3, random_state=0)
km.fit(lsa_tf)
Above I'm applying my K-mean.
cluster_matrix = []
for cluster_number in range(0,nb_cluster):
index = 0
for label in km.labels_:
if label == cluster_number:
cluster_matrix.append(lsa_tf[index])
index += 1
Here I'm creating a matrix with all my vector grouped by label from the clustering result.
downsample_matrix = []
downsample_coefficient = 0
for vector in cluster_matrix:
downsample_coefficient += 1
if downsample_coefficient == 5:
downsample_matrix.append(vector)
downsample_coefficient = 0
Above I'm simply downsampling my matrix otherway it's to big to be displayed.
similarity_matrix = cosine_similarity(downsample_matrix)
plt.matshow(similarity_matrix)
plt.show()
And finally here I'm using cosine_similarity and display the matrix resulting.
This code works well for lsa_tf.
but it raised the following error for tf when trying to compute the cosine_similarity:
ValueError Traceback (most recent call last)
<ipython-input-27-5997ca6abb2d> in <module>()
19 downsample_matrix.append(vector)
20 downsample_coefficient = 0
---> 21 similarity_matrix = cosine_similarity(downsample_matrix)
22 plt.matshow(similarity_matrix)
23 plt.show()
/home/venv/lib/python3.5/site-packages/sklearn/metrics/pairwise.py in cosine_similarity(X, Y, dense_output)
908 # to avoid recursive import
909
--> 910 X, Y = check_pairwise_arrays(X, Y)
911
912 X_normalized = normalize(X, copy=True)
/home/venv/lib/python3.5/site-packages/sklearn/metrics/pairwise.py in check_pairwise_arrays(X, Y, precomputed, dtype)
104 if Y is X or Y is None:
105 X = Y = check_array(X, accept_sparse='csr', dtype=dtype,
--> 106 warn_on_dtype=warn_on_dtype, estimator=estimator)
107 else:
108 X = check_array(X, accept_sparse='csr', dtype=dtype,
/home/venv/lib/python3.5/site-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
380 force_all_finite)
381 else:
--> 382 array = np.array(array, dtype=dtype, order=order, copy=copy)
383
384 if ensure_2d:
ValueError: setting an array element with a sequence.
What is the difference between my tf and lsa_tf? How can I apply cosine_similarity to both of them?

Related

Found array with 0 sample(s) (shape=(0, 1)) while a minimum of 1 is required

This error keeps coming when I try to find MI values. My code is as follows
X_new = X.copy()
X_new = X_new.fillna(0)
y = data.SalePrice
def make_mi_scores(X, y):
X = X.copy()
for colname in X.select_dtypes(["object", "category"]):
X[colname], _ = X[colname].factorize()
discrete_features = [pd.api.types.is_integer_dtype(t) for t in X.dtypes]
mi_scores = mutual_info_regression(X, y, discrete_features=discrete_features, random_state=0)
mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
mi_scores = mi_scores.sort_values(ascending=False)
return mi_scores
def plot_mi_scores(scores):
scores = scores.sort_values(ascending=True)
width = np.arange(len(scores))
ticks = list(scores.index)
plt.barh(width, scores)
plt.yticks(width, ticks)
plt.title("Mutual Information Scores")
plt.figure(dpi=100, figsize=(8, 5))
plot_mi_scores(make_mi_scores(X_new,y))
if you want the full notebook here is a link https://www.kaggle.com/code/snigdhkarki/house-price-competition
The error is as follows
ValueError Traceback (most recent call last)
/tmp/ipykernel_19/1575243112.py in <module>
42
43 plt.figure(dpi=100, figsize=(8, 5))
---> 44 plot_mi_scores(make_mi_scores(X_new,y))
/tmp/ipykernel_19/1575243112.py in make_mi_scores(X, y)
28 print(X.isnull().any().any())
29 print(y.isnull().any().any())
---> 30 mi_scores = mutual_info_regression(X, y, discrete_features=discrete_features, random_state=0)
31 mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
32 mi_scores = mi_scores.sort_values(ascending=False)
/opt/conda/lib/python3.7/site-packages/sklearn/feature_selection/_mutual_info.py in mutual_info_regression(X, y, discrete_features, n_neighbors, copy, random_state)
382 of a Random Vector", Probl. Peredachi Inf., 23:2 (1987), 9-16
383 """
--> 384 return _estimate_mi(X, y, discrete_features, False, n_neighbors, copy, random_state)
385
386
/opt/conda/lib/python3.7/site-packages/sklearn/feature_selection/_mutual_info.py in _estimate_mi(X, y, discrete_features, discrete_target, n_neighbors, copy, random_state)
300 mi = [
301 _compute_mi(x, y, discrete_feature, discrete_target, n_neighbors)
--> 302 for x, discrete_feature in zip(_iterate_columns(X), discrete_mask)
303 ]
304
/opt/conda/lib/python3.7/site-packages/sklearn/feature_selection/_mutual_info.py in <listcomp>(.0)
300 mi = [
301 _compute_mi(x, y, discrete_feature, discrete_target, n_neighbors)
--> 302 for x, discrete_feature in zip(_iterate_columns(X), discrete_mask)
303 ]
304
/opt/conda/lib/python3.7/site-packages/sklearn/feature_selection/_mutual_info.py in _compute_mi(x, y, x_discrete, y_discrete, n_neighbors)
160 return mutual_info_score(x, y)
161 elif x_discrete and not y_discrete:
--> 162 return _compute_mi_cd(y, x, n_neighbors)
163 elif not x_discrete and y_discrete:
164 return _compute_mi_cd(x, y, n_neighbors)
/opt/conda/lib/python3.7/site-packages/sklearn/feature_selection/_mutual_info.py in _compute_mi_cd(c, d, n_neighbors)
137 radius = radius[mask]
138
--> 139 kd = KDTree(c)
140 m_all = kd.query_radius(c, radius, count_only=True, return_distance=False)
141 m_all = np.array(m_all) - 1.0
sklearn/neighbors/_binary_tree.pxi in sklearn.neighbors._kd_tree.BinaryTree.__init__()
/opt/conda/lib/python3.7/site-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator)
806 "Found array with %d sample(s) (shape=%s) while a"
807 " minimum of %d is required%s."
--> 808 % (n_samples, array.shape, ensure_min_samples, context)
809 )
810
ValueError: Found array with 0 sample(s) (shape=(0, 1)) while a minimum of 1 is required.
There were only a few places where this question was asked and even in those places I was unable to find any answers to my question

The issue is arising where you are calling mutual_info_regression over here -
mi_scores = mutual_info_regression(X, y, discrete_features=discrete_features, random_state=0)
As per sklearn's documentation, the parameter discrete_features should be a boolean mask that has True for discrete variables and False otherwise.
I checked your Kaggle code and it seems like your technique for identifying discrete and continuous features in your data frame is wrong.
A simple hack to get the code running would be to identify all features as continuous using the following code -
discrete_features = [False]*73
# 73 is the number of columns X has
However, the result might be wrong if the mutual_info_regression algorithm requires you to accurately identify discrete and continuous features.

ValueError: setting an array element with a sequence while running NearestNeighbor

I have pyspark dataframe like this:
+------+---------------------------------------------------------------------+
|id |features |
+------+---------------------------------------------------------------------+
|2484 |[0.016910851, 0.025989642, 0.0025321299, -0.022232508, -0.00701562] |
|2504 |[0.015019539, 0.024844216, 0.0029279909, -0.020771071, -0.0061111804]|
|2904 |[0.014104126, 0.02474243, 0.0011707658, -0.021675153, -0.0050868453] |
|3084 |[0.110674664, 0.17139696, 0.059836507, -0.1926481, -0.060425207] |
|3164 |[0.17688861, 0.2159168, 0.10567094, -0.17365277, -0.016458606] |
|377784|[0.18425785, 0.34397766, 0.022859085, -0.35151178, -0.07897296] |
|425114|[0.14556459, 0.25762737, 0.09011796, -0.27128243, 0.011280057] |
|455074|[0.13579306, 0.3266111, 0.016416805, -0.31139722, -0.054227617] |
|532624|[0.22281846, 0.1575731, 0.14126688, -0.29887098, -0.09433056] |
|781654|[0.1381407, 0.14674455, 0.06877328, -0.13415968, -0.06589967] |
+------+---------------------------------------------------------------------+
Now I have to find nearest neighbor for this features so here are my step:
df_collect = df.toPandas()
#converting list column to array
df_collect['features'] = df_collect['features'].apply(lambda x: np.array(x))
features = df_collect['features'].to_numpy()
knnobj = NearestNeighbors(n_neighbors=5, algorithm='auto').fit(features)
Now here I'm getting error:
TypeError Traceback (most recent call last)
TypeError: only size-1 arrays can be converted to Python scalars
The above exception was the direct cause of the following exception:
ValueError Traceback (most recent call last)
/tmp/ipykernel_6511/1498389666.py in <module>
----> 1 knnobj = NearestNeighbors(n_neighbors=5, algorithm='auto').fit(features)
~/miniconda3/envs/dev_env_37/lib/python3.7/site-packages/sklearn/neighbors/_unsupervised.py in fit(self, X, y)
164 The fitted nearest neighbors estimator.
165 """
--> 166 return self._fit(X)
~/miniconda3/envs/dev_env_37/lib/python3.7/site-packages/sklearn/neighbors/_base.py in _fit(self, X, y)
433 else:
434 if not isinstance(X, (KDTree, BallTree, NeighborsBase)):
--> 435 X = self._validate_data(X, accept_sparse="csr")
436
437 self._check_algorithm_metric()
~/miniconda3/envs/dev_env_37/lib/python3.7/site-packages/sklearn/base.py in _validate_data(self, X, y, reset, validate_separately, **check_params)
564 raise ValueError("Validation should be done on X, y or both.")
565 elif not no_val_X and no_val_y:
--> 566 X = check_array(X, **check_params)
567 out = X
568 elif no_val_X and not no_val_y:
~/miniconda3/envs/dev_env_37/lib/python3.7/site-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator)
744 array = array.astype(dtype, casting="unsafe", copy=False)
745 else:
--> 746 array = np.asarray(array, order=order, dtype=dtype)
747 except ComplexWarning as complex_warning:
748 raise ValueError(
ValueError: setting an array element with a sequence.
I have checked all the size of subarray and everything's same and also data type. Can someone please point out what can be wrong here.
Output of features:
array([array([ 0.01691085, 0.02598964, 0.00253213, -0.02223251, -0.00701562]),
array([ 0.01501954, 0.02484422, 0.00292799, -0.02077107, -0.00611118]),
array([ 0.01410413, 0.02474243, 0.00117077, -0.02167515, -0.00508685]),
...,
array([ 0.01896316, 0.03188267, 0.00258667, -0.02800867, -0.00646481]),
array([ 0.03538242, 0.07453772, 0.00816828, -0.02914227, -0.0942148 ]),
array([ 0.02470775, 0.02561068, 0.00401011, -0.02863882, -0.00419102])],
dtype=object)

df.toPandas() returns a column of lists. You need to convert this column of lists to a 2D array. When you do df_collect['features'].apply(lambda x: np.array(x)).to_numpy() you get an array of arrays which is not the same as a 2D array. So you need
df_collect = df.toPandas()
features = np.array(df_collect.features.to_list())
knnobj = NearestNeighbors(n_neighbors=5, algorithm='auto').fit(features)
As an alternative, you can directly pass the nested list to NearestNeighbors:
knnobj = NearestNeighbors(n_neighbors=5, algorithm='auto').fit(df_collect.features.to_list())

I had to modify the table string to be able to convert it into a Pandas dataframe. The, this code works fine
from sklearn.neighbors import NearestNeighbors
from io import StringIO
import numpy as np
df_str = """2484, 0.016910851, 0.025989642, 0.0025321299, -0.022232508, -0.00701562
2504, 0.015019539, 0.024844216, 0.0029279909, -0.020771071, -0.0061111804
2904, 0.014104126, 0.02474243, 0.0011707658, -0.021675153, -0.0050868453
3084, 0.110674664, 0.17139696, 0.059836507, -0.1926481, -0.060425207
3164, 0.17688861, 0.2159168, 0.10567094, -0.17365277, -0.016458606
377784, 0.18425785, 0.34397766, 0.022859085, -0.35151178, -0.07897296
425114, 0.14556459, 0.25762737, 0.09011796, -0.27128243, 0.011280057
455074, 0.13579306, 0.3266111, 0.016416805, -0.31139722, -0.054227617
532624, 0.22281846, 0.1575731, 0.14126688, -0.29887098, -0.09433056
781654, 0.1381407, 0.14674455, 0.06877328, -0.13415968, -0.06589967"""
# convert to pandas frame
data = StringIO(df_str)
df = pd.read_csv(data, sep=",", names=['id'] + ['feat_{}'.format(i) for i in range(1,6)])
#converting list column to array
features = df.drop(columns=['id']).to_numpy()
# fit kNN
knnobj = NearestNeighbors(n_neighbors=5, algorithm='auto').fit(features)
# output
knnobj.get_params()
> {'algorithm': 'auto',
'leaf_size': 30,
'metric': 'minkowski',
'metric_params': None,
'n_jobs': None,
'n_neighbors': 5,
'p': 2,
'radius': 1.0}
Given the cryptic error message, my guess is that the conversion of df_collect introduces an erroneous data format that throws of the kNN.

Pandas/Sklearn: Indexing column and ValueError: could not convert string to float

I have written following script to find out the best features to be used in my sklearn algorithm.
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plots
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.ensemble import ExtraTreesClassifier
data = pd.read_csv("movies.csv")
X = data.iloc[6:] #columns with words
y = genre_to_binary(data.iloc[1]) #target column i.e genre
#apply SelectKBest class to extract top 10 best features
bestfeatures = SelectKBest(score_func=chi2, k=10)
fit = bestfeatures.fit(X,y)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)
#concat two dataframes for better visualization
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Features','Genre'] #naming the dataframe columns
print(featureScores.nlargest(10,'Genre')) #print 10 best features
data = pd.read_csv("movies.csv")
X = data.iloc[:,0:20] #independent columns
y = data.iloc[:,-1] #target column i.e price range
model = ExtraTreesClassifier()
model.fit(X,y)
print(model.feature_importances_) #use inbuilt class feature_importances of tree based classifiers
#plot graph of feature importances for better visualization
feat_importances = pd.Series(model.feature_importances_, index=X.columns)
feat_importances.nlargest(10).plot(kind='barh')
plt.show()
def genre_to_binary(series):
genres_list = series.values.tolist()
converted_genre=[]
for i in np.arange(len(genres_list)):
if genres_list[i]=="action":
converted_genre=np.append(converted_genre,0)
else:
converted_genre=np.append(converted_genre,1)
return converted_genre
However, this throws following error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-78-b772929399c8> in <module>
1 #apply SelectKBest class to extract top 10 best features
2 bestfeatures = SelectKBest(score_func=chi2, k=10)
----> 3 fit = bestfeatures.fit(X,y)
4 dfscores = pd.DataFrame(fit.scores_)
5 dfcolumns = pd.DataFrame(X.columns)
/srv/app/venv/lib/python3.6/site-packages/sklearn/feature_selection/univariate_selection.py in fit(self, X, y)
339 self : object
340 """
--> 341 X, y = check_X_y(X, y, ['csr', 'csc'], multi_output=True)
342
343 if not callable(self.score_func):
/srv/app/venv/lib/python3.6/site-packages/sklearn/utils/validation.py in check_X_y(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, warn_on_dtype, estimator)
717 ensure_min_features=ensure_min_features,
718 warn_on_dtype=warn_on_dtype,
--> 719 estimator=estimator)
720 if multi_output:
721 y = check_array(y, 'csr', force_all_finite=True, ensure_2d=False,
/srv/app/venv/lib/python3.6/site-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
534 # make sure we actually converted to numeric:
535 if dtype_numeric and array.dtype.kind == "O":
--> 536 array = array.astype(np.float64)
537 if not allow_nd and array.ndim >= 3:
538 raise ValueError("Found array with dim %d. %s expected <= 2."
ValueError: could not convert string to float: 'natural born killers'
I do not understand why natural born killers is even contained in x or y since it only occurs in the column at index 0 which I do not access. So where is the problem?
You can find the file here.

sklearn clustering with custom metric: pairwise_distances throwing error

I would like to cluster sets of spatial data using my own metric. The data comes as pairs of (x,y) values in a dataframe, where each set of pairs has an id. Like in the following example where I have three sets of points:
import pandas as pd
import numpy as np
df = pd.DataFrame({'id': [1] * 4 + [2] * 5 + [3] * 3,
'x': np.random.random(12),
'y': np.random.random(12)})
df['xy'] = df[['x','y']].apply(lambda row: [row['x'],row['y']], axis = 1)
Here is the distance function I would like to use:
from scipy.spatial.distance import directed_hausdorff
def some_distance(u, v):
return max(directed_hausdorff(u, v)[0], directed_hausdorff(v, u)[0])
This function computes the Hausdorff distance, i.e. the distance between two subsets u and v of n-dimensional space. In my case, I would like to use this distance function to cluster subsets of the real plane. In the data above there are three such subsets (ids from 1 to 3) so the resulting distance matrix should be 3x3.
My idea for the clustering step was to use sklearn.cluster.AgglomerativeClustering with a precomputed metric, which in turn I want to compute with sklearn.metrics.pairwise import pairwise_distances.
from sklearn.metrics.pairwise import pairwise_distances
def to_np_array(col):
return np.array(list(col.values))
X = df.groupby('id')['xy'].apply(to_np_array).as_matrix()
m = pairwise_distances(X, X, metric=some_distance)
However, the last line is giving me an error:
ValueError: setting an array element with a sequence.
What does work fine, however, is calling some_distance(X[1], X[2]).
My hunch is that X needs to be a different format for pairwise_distances to work. Any ideas on how to make this work, or how to compute the matrix myself so I can stick it into sklearn.cluster.AgglomerativeClustering?
The error stack is
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-3-e34155622595> in <module>
12 def some_distance(u, v):
13 return max(directed_hausdorff(u, v)[0], directed_hausdorff(v, u)[0])
---> 14 m = pairwise_distances(X, X, metric=some_distance)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\metrics\pairwise.py in pairwise_distances(X, Y, metric, n_jobs, **kwds)
1430 func = partial(distance.cdist, metric=metric, **kwds)
1431
-> 1432 return _parallel_pairwise(X, Y, func, n_jobs, **kwds)
1433
1434
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\metrics\pairwise.py in _parallel_pairwise(X, Y, func, n_jobs, **kwds)
1065
1066 if effective_n_jobs(n_jobs) == 1:
-> 1067 return func(X, Y, **kwds)
1068
1069 # TODO: in some cases, backend='threading' may be appropriate
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\metrics\pairwise.py in _pairwise_callable(X, Y, metric, **kwds)
1079 """Handle the callable case for pairwise_{distances,kernels}
1080 """
-> 1081 X, Y = check_pairwise_arrays(X, Y)
1082
1083 if X is Y:
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\metrics\pairwise.py in check_pairwise_arrays(X, Y, precomputed, dtype)
106 if Y is X or Y is None:
107 X = Y = check_array(X, accept_sparse='csr', dtype=dtype,
--> 108 warn_on_dtype=warn_on_dtype, estimator=estimator)
109 else:
110 X = check_array(X, accept_sparse='csr', dtype=dtype,
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\utils\validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
525 try:
526 warnings.simplefilter('error', ComplexWarning)
--> 527 array = np.asarray(array, dtype=dtype, order=order)
528 except ComplexWarning:
529 raise ValueError("Complex data not supported\n"
C:\ProgramData\Anaconda3\lib\site-packages\numpy\core\numeric.py in asarray(a, dtype, order)
536
537 """
--> 538 return array(a, dtype, copy=False, order=order)
539
540
ValueError: setting an array element with a sequence.

Try this:
import numpy as np
import pandas as pd
from scipy.spatial.distance import directed_hausdorff
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.cluster import AgglomerativeClustering
df = pd.DataFrame({'id': [1] * 4 + [2] * 5 + [3] * 3, 'x':
np.random.random(12), 'y': np.random.random(12)})
df['xy'] = df[['x','y']].apply(lambda row: [row['x'],row['y']], axis = 1)
df.groupby('id')['xy'].apply(to_np_array)
def some_distance(u, v):
return max(directed_hausdorff(u, v)[0], directed_hausdorff(v, u)[0])
def to_np_array(col):
return np.array(list(col.values))
X = df.groupby('id')['xy'].apply(to_np_array)
d = np.zeros((len(X),len(X)))
for i, u in enumerate(X):
for j, v in list(enumerate(X))[i:]:
d[i,j] = some_distance(u,v)
d[j,i] = d[i,j]
And now when you print d you get this:
array([[0. , 0.58928274, 0.40767213],
[0.58928274, 0. , 0.510095 ],
[0.40767213, 0.510095 , 0. ]])
And for clustering:
cluster = AgglomerativeClustering(n_clusters=2, affinity='precomputed', linkage = 'average')
cluster.fit(d)

It would help if you showed some of the variables. Fortunately you gave enough code to run it. For example the dataframe:
In [9]: df
Out[9]:
id x y xy
0 1 0.428437 0.267264 [0.42843730501201727, 0.2672637429997736]
1 1 0.944687 0.023323 [0.9446872371859233, 0.023322969159167317]
2 1 0.091055 0.683154 [0.09105472832178496, 0.6831542985617349]
3 1 0.474522 0.313541 [0.4745222021519122, 0.3135405569298565]
4 2 0.835237 0.491541 [0.8352366339973815, 0.4915408434083248]
5 2 0.905918 0.854030 [0.9059178939221513, 0.8540297797160584]
6 2 0.182154 0.909656 [0.18215390836391654, 0.9096555360282939]
7 2 0.225270 0.522193 [0.22527013482912195, 0.5221926076838651]
8 2 0.924208 0.858627 [0.9242076604008371, 0.8586274362498842]
9 3 0.419813 0.634741 [0.41981292371175905, 0.6347409684931891]
10 3 0.954141 0.795452 [0.9541413559045294, 0.7954524369652217]
11 3 0.896593 0.271187 [0.8965932351250882, 0.2711872631673109]
And your X:
In [10]: X
Out[10]:
array([array([[0.42843731, 0.26726374],
[0.94468724, 0.02332297],
[0.09105473, 0.6831543 ],
[0.4745222 , 0.31354056]]),
array([[0.83523663, 0.49154084],
[0.90591789, 0.85402978],
[0.18215391, 0.90965554],
[0.22527013, 0.52219261],
[0.92420766, 0.85862744]]),
array([[0.41981292, 0.63474097],
[0.95414136, 0.79545244],
[0.89659324, 0.27118726]])], dtype=object)
That is a (3,) object array - in effect a list of 3 2d arrays, with different sizes ((3,2),(5,2),(4,2)). That's one array for each group.
How is pairwise supposed to feed that to your distance code? pairwise docs says X should be a (n,m) array - n samples, m features. Your X doesn't fit that description!
The error is probably produced by when trying to make a float array from X:
In [12]: np.asarray(X,dtype=float)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-12-a6e08bb1590c> in <module>
----> 1 np.asarray(X,dtype=float)
/usr/local/lib/python3.6/dist-packages/numpy/core/numeric.py in asarray(a, dtype, order)
536
537 """
--> 538 return array(a, dtype, copy=False, order=order)
539
540
ValueError: setting an array element with a sequence.

pymc3 with custom likelihood function from kernel density estimation

I'm trying to use pymc3 with a likelihood function derived from some observed data. This observed data doesn't fit any nice, standard distribution, so I want to define my own, based on these observations.
One approach is to use kernel density estimation over the observations. This was possible in pymc2, but doesn't play nicely with the Theano variables in pymc3.
In my code below I'm just generating some dummy data that is normally distributed. As my prior, I'm essentially assuming a uniform distribution for my observations.
Here's my code:
from scipy import stats
import numpy as np
import pymc3 as pm
from sklearn.neighbors.kde import KernelDensity
data = np.sort(stats.norm.rvs(loc=0, scale=1, size=1000))
kde = KernelDensity(kernel='gaussian', bandwidth=0.1).fit(data.reshape(-1, 1))
def get_log_likelihood(x):
return kde.score_samples(x)
with pm.Model() as test_model:
x = pm.Uniform('prior rv', lower=-10, upper=10)
obs = pm.DensityDist('observed likelihood', get_log_likelihood, observed={'x': x})
step = pm.Metropolis()
trace = pm.sample(200, step=step)
The error I receive seems to be the kde score_samples function blowing up as it expects an array, but x is a Theano variable.
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-49-4efbbe7376dc> in <module>()
1 with pm.Model() as test_model:
2 x = pm.Uniform('prior rv', lower=0.0, upper=1e6)
----> 3 obs = pm.DensityDist('observed likelihood', get_log_likelihood, observed={'x': x})
4
5 step = pm.Metropolis()
~/research_notebooks/venv/lib/python3.6/site-packages/pymc3/distributions/distribution.py in __new__(cls, name, *args, **kwargs)
40 total_size = kwargs.pop('total_size', None)
41 dist = cls.dist(*args, **kwargs)
---> 42 return model.Var(name, dist, data, total_size)
43 else:
44 raise TypeError("Name needs to be a string but got: {}".format(name))
~/research_notebooks/venv/lib/python3.6/site-packages/pymc3/model.py in Var(self, name, dist, data, total_size)
825 with self:
826 var = MultiObservedRV(name=name, data=data, distribution=dist,
--> 827 total_size=total_size, model=self)
828 self.observed_RVs.append(var)
829 if var.missing_values:
~/research_notebooks/venv/lib/python3.6/site-packages/pymc3/model.py in __init__(self, name, data, distribution, total_size, model)
1372 self.missing_values = [datum.missing_values for datum in self.data.values()
1373 if datum.missing_values is not None]
-> 1374 self.logp_elemwiset = distribution.logp(**self.data)
1375 # The logp might need scaling in minibatches.
1376 # This is done in `Factor`.
<ipython-input-48-535f58ce543b> in get_log_likelihood(x)
1 def get_log_likelihood(x):
----> 2 return kde.score_samples(x)
~/research_notebooks/venv/lib/python3.6/site-packages/sklearn/neighbors/kde.py in score_samples(self, X)
150 # For it to be a probability, we must scale it. For this reason
151 # we'll also scale atol.
--> 152 X = check_array(X, order='C', dtype=DTYPE)
153 N = self.tree_.data.shape[0]
154 atol_N = self.atol * N
~/research_notebooks/venv/lib/python3.6/site-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
431 force_all_finite)
432 else:
--> 433 array = np.array(array, dtype=dtype, order=order, copy=copy)
434
435 if ensure_2d:
ValueError: setting an array element with a sequence.
Any help would be greatly appreciated. Thanks!

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

setting an array element with a sequence Scikit learn cosine_similarity - python

Related

Found array with 0 sample(s) (shape=(0, 1)) while a minimum of 1 is required

ValueError: setting an array element with a sequence while running NearestNeighbor

Pandas/Sklearn: Indexing column and ValueError: could not convert string to float

sklearn clustering with custom metric: pairwise_distances throwing error

pymc3 with custom likelihood function from kernel density estimation

Categories

Resources