I keep getting the error IndexError: tuple index out of range and I am not sure what is happening. My code was working just fine; however, when I restarted the Jupyter notebook I started receiving this error.
This is my code:
X = df.Tweet
y = df.target

import numpy as np
from sklearn import linear_model
import pyswarms as ps

# Create an instance of the classifier
classifier = linear_model.LogisticRegression()

# Define objective function
def f_per_particle(m, alpha):
    total_features = X.shape[1]
    # Get the subset of the features from the binary mask
    if np.count_nonzero(m) == 0:
        X_subset = X
    else:
        X_subset = X[:, m == 1]
    # Perform classification and store performance in P
    classifier.fit(X_subset, y)
    P = (classifier.predict(X_subset) == y).mean()
    # Compute for the objective function
    j = (alpha * (1.0 - P)
         + (1.0 - alpha) * (1 - (X_subset.shape[1] / total_features)))
    return j

[some more code]

options = {'c1': 0.5, 'c2': 0.5, 'w': 0.9, 'k': 30, 'p': 2}

# Call instance of PSO
dimensions = X.shape[1]  # dimensions should be the number of features
optimizer = ps.discrete.BinaryPSO(n_particles=30, dimensions=dimensions, options=options)

# Perform optimization
cost, pos = optimizer.optimize(f, iters=1000)
I received the following traceback:
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
<ipython-input-76-bea8cf064cd2> in <module>
2
3 # Call instance of PSO
----> 4 dimensions = X.shape[1] # dimensions should be the number of features
5 optimizer = ps.discrete.BinaryPSO(n_particles=30, dimensions=dimensions, options=options)
6
IndexError: tuple index out of range
It is not absolutely clear, but it seems that your df variable is a Pandas DataFrame, and df.Tweet is therefore a Pandas Series.
A Series has only one dimension, so X.shape is a one-element tuple: X.shape[0] exists, but X.shape[1] does not, which is the reason for the index-out-of-range exception in your code. Only a DataFrame (or a 2-D array) has a two-element shape.
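You can see the difference with a minimal sketch (the Series below just stands in for df.Tweet; note that for the PSO feature selection you would also need to turn the raw tweets into a 2-D numeric matrix first, e.g. with a TF-IDF vectorizer, which is an assumption about your pipeline, not something shown in your post):

import pandas as pd

s = pd.Series(['a', 'b', 'c'])   # a Series, like df.Tweet
print(s.shape)                   # (3,)   -- s.shape[1] raises IndexError
print(s.to_frame().shape)        # (3, 1) -- a DataFrame has two dimensions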
More information: https://www.geeksforgeeks.org/python-pandas-series-shape/
I am trying to load columns 1 to 15 of the data.txt file into array X and column 16 into array y, normalize all 15 columns of X in the for loop, and normalize y in a single statement. Loading works properly, but when I try to print the results of the normalization I get this error:
TypeError: 'tuple' object cannot be interpreted as an integer
Please help; the code is written in Python in a Jupyter notebook.
import numpy as np
import matplotlib.pyplot as plt
data = np.loadtxt('data.txt')
X = np.array(data[:, 1:16])
y = np.array(data[:, 16], ndmin=2).T
n = X.shape
for i in range(n):
    X[:, i] = (X[:, i]-np.min(X[:, i])) / (np.max(X[:, i])-np.min(X[:, i]))
y = ( y-np.min(y) ) / ( np.max(y)-np.min(y) )
print(X)
print(y)
The problem is in the line for i in range(n):.
n is X.shape, which is a tuple; range() needs an integer argument.
Code example of your case:
n = (2,4)
for i in range(n):
    print(i)
Error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-277-3834a67eeb55> in <module>
1 n = (2,4)
----> 2 for i in range(n):
3 print(i)
TypeError: 'tuple' object cannot be interpreted as an integer
In your case, I think you want to iterate over columns, so n = X.shape[1] will fix it.
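With that change the loop normalizes each of the 15 feature columns; a minimal sketch of the corrected section:

n = X.shape[1]  # number of columns, an integer
for i in range(n):
    X[:, i] = (X[:, i] - np.min(X[:, i])) / (np.max(X[:, i]) - np.min(X[:, i]))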
I am reducing the number of features in a large sparse dataset by removing features with high p-values, but I get the error below. I have seen similar posts, and this code works with non-sparse input. Can you help, please? (I can upload the input file if needed.)
import numpy as np
import statsmodels.formula.api as sm

def backwardElimination(x, Y, sl, columns):
    numVars = len(x[0])
    pvalue_removal_counter = 0
    for i in range(0, numVars):
        print(i, 'of', numVars)
        regressor_OLS = sm.OLS(Y, x).fit()
        maxVar = max(regressor_OLS.pvalues).astype(float)
        if maxVar > sl:
            for j in range(0, numVars - i):
                if (regressor_OLS.pvalues[j].astype(float) == maxVar):
                    x = np.delete(x, j, 1)
                    pvalue_removal_counter += 1
                    columns = np.delete(columns, j)
    regressor_OLS.summary()
    return x, columns
Output:
0 of 1970
1 of 1970
2 of 1970
Traceback (most recent call last):
File "main.py", line 142, in <module>
selected_columns)
File "main.py", line 101, in backwardElimination
if (regressor_OLS.pvalues[j].astype(float) == maxVar):
IndexError: index 1967 is out of bounds for axis 0 with size 1967
Here is a fixed version.
I made a number of changes:
Import the correct OLS from statsmodels.api
Generate columns in the function
Use np.argmax to find the location of the maximum value
Use a boolean index to select columns. In pseudo-code it is like x[:, [True, False, True]] which keeps columns 0 and 2.
Stop if there is nothing to drop.
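For instance, the boolean-selection idea from the list above works like this (a standalone illustration, not part of the fix itself):

import numpy as np

a = np.arange(6).reshape(2, 3)
print(a[:, [True, False, True]])  # keeps columns 0 and 2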
import numpy as np
# Wrong import in the question. Not using the formula interface, so use statsmodels.api
import statsmodels.api as sm

def backwardElimination(x, Y, sl):
    numVars = x.shape[1]  # variables are in columns
    columns = np.arange(numVars)
    for i in range(0, numVars):
        print(i, 'of', numVars)
        regressor_OLS = sm.OLS(Y, x).fit()
        maxVar = regressor_OLS.pvalues.max()
        if maxVar > sl:
            # Use boolean selection
            retain = np.ones(x.shape[1], bool)
            drop = np.argmax(regressor_OLS.pvalues)
            # Drop the highest pvalue
            retain[drop] = False
            # Keep the x we wish to retain
            x = x[:, retain]
            # Also keep their column indices
            columns = columns[retain]
        else:
            # Exit early once every pvalue is at or below sl
            break
    # Show the final summary
    print(regressor_OLS.summary())
    return x, columns
You can test it with
x = np.random.standard_normal((1000,100))
y = np.random.standard_normal(1000)
backwardElimination(x,y,0.1)
I'm using the following code to find the top-k matches using PyTorch:
def find_top(self, x, y, n_neighbors, unit_vectors=False, cuda=False):
    if not unit_vectors:
        x = __to_unit_torch__(x, cuda=cuda)
        y = __to_unit_torch__(y, cuda=cuda)
    with torch.no_grad():
        d = 1. - torch.matmul(x, y.transpose(0, 1))
        values, indices = torch.topk(d, n_neighbors, dim=1, largest=False, sorted=True)
        return indices.cpu().numpy()
Unfortunately, it is throwing the following error:
values, indices = torch.topk(d, n_neighbors, dim=1, largest=False, sorted=True)
RuntimeError: invalid argument 5: k not in range for dimension at /pytorch/aten/src/THC/generic/THCTensorTopK.cu:23
The size of d is (1793, 1). What am I missing?
This error occurs when you call torch.topk with a k larger than the size of the dimension you are selecting along. Here d has shape (1793, 1), so along dim=1 there is only a single entry, and any n_neighbors greater than 1 is out of range. Reduce k (or take the top-k along the dimension that actually holds the candidates) and it will run fine.
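A minimal sketch of both options (the shape is taken from your question; k=5 is just an illustrative value for n_neighbors):

import torch

d = torch.randn(1793, 1)
values, indices = torch.topk(d, k=1, dim=1, largest=False)      # k must not exceed d.size(1) == 1
values, indices = torch.topk(d.squeeze(1), k=5, largest=False)  # 5 nearest of the 1793 rows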
I am trying to write code that computes the covariance matrix of a dataset using for loops instead of NumPy built-ins. The code I have so far produces an error:
import numpy as np

def cov_naive(X):
    """Compute the covariance for a dataset of size (D,N)
    where D is the dimension and N is the number of data points"""
    D, N = X.shape
    ### Edit the code below to compute the covariance matrix by iterating over the dataset.
    covariance = np.zeros((D, D))
    mean = np.mean(X, axis=1)
    for i in range(D):
        for j in range(D):
            covariance[i, j] += (X[:, i] - mean[i]) @ (X[:, j] - mean[j])
    return covariance / N
I am trying to perform the below test to validate that it works:
# Let's first test the functions on some hand-crafted dataset.
X_test = np.arange(6).reshape(2,3)
expected_test_mean = np.array([1., 4.]).reshape(-1, 1)
expected_test_cov = np.array([[2/3., 2/3.], [2/3.,2/3.]])
print('X:\n', X_test)
print('Expected mean:\n', expected_test_mean)
print('Expected covariance:\n', expected_test_cov)
np.testing.assert_almost_equal(mean(X_test), expected_test_mean)
np.testing.assert_almost_equal(mean_naive(X_test), expected_test_mean)
np.testing.assert_almost_equal(cov(X_test), expected_test_cov)
np.testing.assert_almost_equal(cov_naive(X_test), expected_test_cov)
and get the following error:
AssertionError:
Arrays are not almost equal to 7 decimals
AssertionError Traceback (most recent call last)
<ipython-input-21-6a6498089109> in <module>()
12
13 np.testing.assert_almost_equal(cov(X_test), expected_test_cov)
---> 14 np.testing.assert_almost_equal(cov_naive(X_test), expected_test_cov)
Any help would be greatly appreciated!
The mistake lies in how the dataset is sliced. With the (D, N) layout from your docstring, each row of X is one dimension and each column is one data point, so mean = np.mean(X, axis=1) is correct, but the loop must compare rows, not columns:
covariance[i, j] = (X[i, :] - mean[i]) @ (X[j, :] - mean[j])
Your version slices columns with X[:, i], i.e. it treats individual data points as dimensions, which is why the values come out wrong.
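For reference, a corrected version of the whole function under that (D, N) assumption (same structure as yours, only the slicing changed):

import numpy as np

def cov_naive(X):
    """Compute the covariance for a dataset of size (D,N)
    where D is the dimension and N is the number of data points"""
    D, N = X.shape
    covariance = np.zeros((D, D))
    mean = np.mean(X, axis=1)  # one mean per dimension (row)
    for i in range(D):
        for j in range(D):
            # dot product over the N data points
            covariance[i, j] = (X[i, :] - mean[i]) @ (X[j, :] - mean[j])
    return covariance / N

With X_test = np.arange(6).reshape(2, 3) this returns [[2/3, 2/3], [2/3, 2/3]], matching expected_test_cov.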
I get ValueError: Found input variables with inconsistent numbers of samples: [20000, 1] when I run the following, even though the row counts of x and y are correct. I load the RCV1 dataset, get the indices of the categories with the most documents, create a list of tuples with an equal number of randomly selected positives and negatives for each category, and then try to fit a logistic regression on one of the categories.
import numpy as np
import sklearn.datasets
from sklearn import model_selection, preprocessing
from sklearn.linear_model import LogisticRegression
from matplotlib import pyplot as plt
from scipy import sparse

rcv1 = sklearn.datasets.fetch_rcv1()

def get_top_cat_indices(target_matrix, num_cats):
    cat_counts = target_matrix.sum(axis=0)
    #cat_counts = cat_counts.reshape((1,103)).tolist()[0]
    cat_counts = cat_counts.reshape((103,))
    #b = sorted(cat_counts, reverse=True)
    ind_temp = np.argsort(cat_counts)[::-1].tolist()[0]
    ind = [ind_temp[i] for i in range(5)]
    return ind

def prepare_data(x, y, top_cat_indices, sample_size):
    res_lst = []
    for i in top_cat_indices:
        # get column of indices with relevant cat
        temp = y.tocsc()[:, i]
        # all docs with labeled category
        cat_present = x.tocsr()[np.where(temp.sum(axis=1)>0)[0],:]
        # all docs other than labelled category
        cat_notpresent = x.tocsr()[np.where(temp.sum(axis=1)==0)[0],:]
        # get indices equal to 1/2 of sample size
        idx_cat = np.random.randint(cat_present.shape[0], size=int(sample_size/2))
        idx_nocat = np.random.randint(cat_notpresent.shape[0], size=int(sample_size/2))
        # concatenate the ids
        sampled_x_pos = cat_present.tocsr()[idx_cat,:]
        sampled_x_neg = cat_notpresent.tocsr()[idx_nocat,:]
        sampled_x = sparse.vstack((sampled_x_pos, sampled_x_neg))
        sampled_y_pos = temp.tocsr()[idx_cat,:]
        sampled_y_neg = temp.tocsr()[idx_nocat,:]
        sampled_y = sparse.vstack((sampled_y_pos, sampled_y_neg))
        res_lst.append((sampled_x, sampled_y))
    return res_lst

ind = get_top_cat_indices(rcv1.target, 5)
test_res = prepare_data(train_x, train_y, ind, 20000)
x, y = test_res[0]
print(x.shape)
print(y.shape)
LogisticRegression().fit(x, y)
Could it be an issue with the sparse matrices, or a problem with dimensionality (there are 20K samples and 47K features)?
When I run your code, I get the following error:
AttributeError: 'bool' object has no attribute 'any'
That's because y for LogisticRegression needs to be a 1-D numpy array, so I changed the last line to:
LogisticRegression().fit(x, y.A.flatten())
Then I get the following error:
ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 0
This is because your sampling code has a bug: you need to subset the y array to the rows that actually have the category before applying the sampling indices. See the code below:
def prepare_data(x, y, top_cat_indices, sample_size):
    res_lst = []
    for i in top_cat_indices:
        # get column of indices with relevant cat
        temp = y.tocsc()[:, i]
        # all docs with labeled category
        c1 = np.where(temp.sum(axis=1)>0)[0]
        c2 = np.where(temp.sum(axis=1)==0)[0]
        cat_present = x.tocsr()[c1,:]
        # all docs other than labelled category
        cat_notpresent = x.tocsr()[c2,:]
        # get indices equal to 1/2 of sample size
        idx_cat = np.random.randint(cat_present.shape[0], size=int(sample_size/2))
        idx_nocat = np.random.randint(cat_notpresent.shape[0], size=int(sample_size/2))
        # concatenate the ids
        sampled_x_pos = cat_present.tocsr()[idx_cat,:]
        sampled_x_neg = cat_notpresent.tocsr()[idx_nocat,:]
        sampled_x = sparse.vstack((sampled_x_pos, sampled_x_neg))
        sampled_y_pos = temp.tocsr()[c1][idx_cat,:]
        print(sampled_y_pos.nnz)
        sampled_y_neg = temp.tocsr()[c2][idx_nocat,:]
        print(sampled_y_neg.nnz)
        sampled_y = sparse.vstack((sampled_y_pos, sampled_y_neg))
        res_lst.append((sampled_x, sampled_y))
    return res_lst
Now everything works like a charm.
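For completeness, the end-to-end call then looks like this (assuming train_x and train_y in your snippet are meant to be rcv1.data and rcv1.target; y.A.flatten() densifies the sparse (20000, 1) label column into the 1-D array scikit-learn expects):

ind = get_top_cat_indices(rcv1.target, 5)
test_res = prepare_data(rcv1.data, rcv1.target, ind, 20000)
x, y = test_res[0]
LogisticRegression().fit(x, y.A.flatten())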