Error "'numpy.ndarray' object has no attribute 'values'" - python

I want to shift my time series data, but I am getting the following error:
AttributeError: 'numpy.ndarray' object has no attribute 'values'
This is my code:
def create_dataset(datasets):
    # series = dataset
    temps = DataFrame(datasets.values)
    dataframes = concat(
        [temps, temps.shift(-1), temps.shift(-2), temps.shift(-3)], axis=1)
    lala = numpy.array(dataframes)
    return lala
# Load
dataframe = pandas.read_csv('zahlenreihe.csv', index_col=False,
                            engine='python', header=None)
dataset = dataframe.values
dataset = dataset.astype('float32')
# Split
train_size = int(len(dataset) * 0.70)
test_size = len(dataset) - train_size
train, test = dataset[0:train_size,:], dataset[train_size:len(dataset),:]
# Create
trainX = create_dataset(train)
I think the following line is wrong:
temps = DataFrame(datasets.values)
My zahlenreihe.csv file (number sequence) just has integers ordered like:
1
2
3
4
5
n
How should I handle it?

The solution:
The dataset I passed in was already a NumPy array, so I didn't need to call .values on it; plain ndarrays have no .values attribute, only pandas DataFrames and Series do.
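A minimal sketch of a create_dataset that works whether it is handed a DataFrame or an ndarray (an illustration based on the question's imports, not the poster's final code):

import numpy
from pandas import DataFrame, concat

def create_dataset(datasets):
    # DataFrame() accepts both an ndarray and an existing DataFrame,
    # so there is no need to touch .values here
    temps = DataFrame(datasets)
    dataframes = concat(
        [temps, temps.shift(-1), temps.shift(-2), temps.shift(-3)], axis=1)
    return numpy.array(dataframes)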

A closely related case: the problem lies in the following line:
df = StandardScaler().fit_transform(df)
fit_transform returns a NumPy array (see the scikit-learn documentation), which has no .drop method. You would have to convert it back into a pd.DataFrame first:
new_df = pd.DataFrame(StandardScaler().fit_transform(df), columns=df.columns, index=df.index)
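A short usage check (the column name here is hypothetical, just to show that pandas methods are available again on the converted result):

new_df = pd.DataFrame(StandardScaler().fit_transform(df), columns=df.columns, index=df.index)
trimmed = new_df.drop(columns=['some_column'])  # .drop works because new_df is a DataFrame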

Related

Encoding multiple columns

Suppose a dataframe has two or more columns with numerical and text values, plus one Label/Target column. If I want to apply a model such as an SVM, how can I use only the columns I am most interested in?
Ex.
Data Num Label/Target No_Sense
What happens here? group1 1 Migrate
Customer Management group2 0 Change Stage
Life Cycle Stages group1 1 Restructure
Drop-down allows to select status type group3 1 Restructure Status
and so on.
The approach I have taken is:
1. Encode the "Num" column:
one_hot = pd.get_dummies(df['Num'])
df = df.drop('Num',axis = 1)
df = df.join(one_hot)
2.encode "Data" column:
def bag_words(df):
    df = basic_preprocessing(df)
    count_vectorizer = CountVectorizer()
    count_vectorizer.fit(df['Data'])
    list_corpus = df["Data"].tolist()
    list_labels = df["Label/Target"].tolist()
    X = count_vectorizer.transform(list_corpus)
    return X, list_labels
Then apply bag_words to the dataset
X, y = bag_words(df)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=40)
Is there anything that I missed in these steps? How can I select only "Data" and "Num" features in my training dataset? (as I think "No_Sense" is not so relevant for my purposes)
EDIT: I have tried with
def bag_words(df):
    df = basic_preprocessing(df)
    count_vectorizer = CountVectorizer()
    count_vectorizer.fit(df['Data'])
    list_corpus = df["Data"].tolist() + df["group1"].tolist() + df["group2"].tolist() + df["group3"].tolist()  # <----
    list_labels = df["Label/Target"].tolist()
    X = count_vectorizer.transform(list_corpus)
    return X, list_labels
but I have found the error:
TypeError: 'int' object is not iterable
I hope this helps you:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import CountVectorizer
#this part so I can recreate you df from the string you posted
#remove this part !!!!
data="""
Data Num Label/Target No_Sense
What happens here? group1 1 Migrate
Customer Management group2 0 Change Stage
Life Cycle Stages group1 1 Restructure
Drop-down allows to select status type group3 1 Restructure Status
"""
lines = [line for line in data.split('\n') if line.strip()]  # split the pasted block into non-empty lines
df = pd.DataFrame(np.array([re.split(r'\s{2,}', line) for line in lines[1:]]),
                  columns=lines[0].split())
#what you want starts from here!!!!:
one_hot = pd.get_dummies(df['Num'])
df = df.drop('Num',axis = 1)
df = df.join(one_hot)
#at this point you have 3 new fetures for 'Num' variable
def bag_words(df):
    count_vectorizer = CountVectorizer()
    count_vectorizer.fit(df['Data'])
    matrix = count_vectorizer.transform(df['Data'])
    # this dataframe `encoded_df` has 15 new features, these are the result of fitting
    # the CountVectorizer to the 'Data' variable
    encoded_df = pd.DataFrame(data=matrix.toarray(),
                              columns=["Data" + str(i) for i in range(matrix.shape[1])])
    # adding them to the dataframe
    df = df.join(encoded_df)
    # getting the numpy arrays that you can use in training
    X = df.loc[:, ["Data" + str(i) for i in range(matrix.shape[1])] + ["group1", "group2", "group3"]].to_numpy()
    y = df.loc[:, ["Label/Target"]].to_numpy()
    return X, y
X, y = bag_words(df)
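A possible continuation, feeding these arrays into the same split the question uses (sketch only; the test size and random_state are arbitrary):

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=40)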

How do I standardize my data so that the Mean is 0?

I'm trying to standardize a dataset in Python as part of Principal Component Analysis. I've managed to do the following so far:
cancer_data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data', header=None)
cancer_data.columns = ['Sample code', 'Clump Thickness', 'Uniformity of Cell Size', 'Uniformity of Cell Shape',
                       'Marginal Adhesion', 'Single Epithelial Cell Size', 'Bare Nuclei', 'Bland Chromatin',
                       'Normal Nucleoli', 'Mitoses', 'Class']
cancer_data = cancer_data.replace('?', np.NaN)
cancer_data = cancer_data.fillna(cancer_data.median())
classDF = cancer_data['Class']
cancer_data = cancer_data.drop(['Class' ,'Sample code'], axis = 1)
# Standardization of data
standardized = StandardScaler().fit_transform(cancer_data)
x = pd.DataFrame(standardized, columns = cancer_data.columns)
However when I check the Mean values, I get the following output:
array([-5.08256606e-17, -9.14861892e-17, -3.04953964e-17, 5.08256606e-17,
5.08256606e-17, -8.13210570e-17, 3.04953964e-17, -1.32146718e-16,
-8.13210570e-17])
I'm not sure what I'm doing wrong for these values to come out like this, so any help is much appreciated (I'm new to data mining).
Use the standardization formula, where column is the column to standardize:
df_std[column] = (df_std[column] - df_std[column].mean()) / df_std[column].std()
or:
from sklearn.preprocessing import StandardScaler
# create a scaler object
std_scaler = StandardScaler()
# fit and transform the data
df_std = pd.DataFrame(std_scaler.fit_transform(df_cars), columns=df_cars.columns)
For more information, read:
https://towardsdatascience.com/data-normalization-with-pandas-and-scikit-learn-7c1cc6ed6475
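A minimal sketch applying that formula to the question's cancer_data (illustration only; pandas .std() uses the sample standard deviation, so the result differs very slightly from StandardScaler). Column means on the order of 1e-16 or 1e-17, like the ones printed in the question, are just floating-point rounding, i.e. effectively zero:

df_std = cancer_data.copy()
for column in df_std.columns:
    # subtract the column mean and divide by the column standard deviation
    df_std[column] = (df_std[column] - df_std[column].mean()) / df_std[column].std()

print(df_std.mean())  # values around 1e-16/1e-17 are zero up to floating-point error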

cannot concatenate object of type "<class 'numpy.ndarray'>"; only pd.Series, pd.DataFrame, and pd.Panel (deprecated) objs are valid

My input data is of the form:
gold,Program,MethodType,CallersT,CallersN,CallersU,CallersCallersT,CallersCallersN,CallersCallersU,CalleesT,CalleesN,CalleesU,CalleesCalleesT,CalleesCalleesN,CalleesCalleesU,CompleteCallersCallees,classGold
T,chess,Inner,Low,-1,-1,Low,-1,-1,High,-1,-1,-1,-1,Low,1,Trace,
N,chess,Inner,-1,Low,-1,-1,Low,-1,-1,High,-1,-1,-1,Low,1,NoTrace,
N,chess,Inner,-1,Low,-1,-1,Low,-1,-1,High,-1,-1,-1,Low,1,NoTrace,
N,chess,Inner,-1,Low,-1,-1,Low,-1,-1,High,-1,-1,-1,Low,1,Trace,
N,chess,Inner,-1,Low,-1,-1,Low,-1,-1,High,-1,-1,-1,Low,1,NoTrace,
N,chess,Inner,-1,Low,-1,-1,Low,-1,-1,High,-1,-1,-1,Low,1,Trace,
N,chess,Inner,-1,Low,-1,-1,Low,-1,-1,High,-1,-1,-1,Low,1,Trace,
N,chess,Inner,-1,Low,-1,-1,Low,-1,-1,High,-1,-1,-1,Low,1,NoTrace,
N,chess,Inner,-1,Low,-1,-1,Low,-1,-1,Medium,Medium,-1,High,High,0,Trace,
N,chess,Inner,-1,Low,-1,-1,Low,-1,-1,Medium,Medium,-1,High,High,0,NoTrace,
N,chess,Inner,-1,Low,-1,-1,Low,-1,-1,Medium,Medium,-1,High,High,0,NoTrace,
N,chess,Inner,-1,Low,-1,-1,Low,-1,-1,Medium,Medium,-1,High,High,0,Trace,
N,chess,Inner,-1,Low,-1,-1,Low,-1,-1,Medium,Medium,-1,High,High,0,NoTrace,
T,chess,Inner,Low,-1,-1,Low,-1,-1,Medium,-1,Medium,High,-1,High,0,Trace,
T,chess,Inner,Low,-1,-1,Low,-1,-1,Medium,-1,Medium,High,-1,High,0,Trace,
N,chess,Inner,-1,Low,-1,-1,Low,-1,-1,Medium,Medium,-1,High,High,0,NoTrace,
N,chess,Inner,-1,Low,-1,-1,-1,-1,Low,Low,High,Medium,-1,Medium,0,Trace,
N,chess,Inner,-1,Low,-1,-1,-1,-1,-1,Medium,High,Low,Low,Medium,0,NoTrace,
N,chess,Inner,-1,Low,-1,-1,-1,-1,-1,Medium,High,-1,Medium,Medium,0,NoTrace,
T,chess,Inner,-1,Low,-1,-1,-1,-1,-1,Medium,High,Low,Low,Medium,0,Trace,
N,chess,Inner,-1,Low,-1,-1,-1,-1,-1,Medium,High,-1,Medium,Medium,0,NoTrace,
N,chess,Inner,-1,Low,-1,-1,-1,-1,Low,Low,High,Low,Low,Medium,0,Trace,
N,chess,Inner,Low,-1,-1,-1,-1,-1,Low,Low,High,Low,Low,Medium,0,Trace,
N,chess,Inner,-1,Low,-1,-1,-1,-1,-1,Medium,High,-1,Medium,Medium,0,NoTrace,
....
N,chess,Inner,-1,Low,-1,-1,Medium,-1,-1,Low,Low,-1,-1,-1,0,Trace,
N,chess,Inner,-1,Low,-1,-1,Medium,-1,-1,Low,Low,-1,-1,-1,0,NoTrace,
T,chess,Inner,Low,-1,-1,Low,Low,-1,Low,-1,Low,-1,-1,-1,0,Trace,
T,chess,Inner,Low,-1,-1,Medium,-1,-1,Low,-1,Low,-1,-1,-1,0,Trace,
N,chess,Inner,-1,Low,-1,-1,Medium,-1,-1,Low,Low,-1,-1,-1,0,NoTrace,
I am reading my data and I am trying to concatenate two data sets that are subsets of the original data set, here is the code I am using:
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
# Feature Scaling
from sklearn.preprocessing import StandardScaler
SeparateProjectLearning=False
CompleteCallersCallees=False
PartialTrainingSetCompleteCallersCallees=True
def main():
    X_train={}
    X_test={}
    y_train={}
    y_test={}
    dataset = pd.read_csv('InputData.txt', sep=',', index_col=False)
    # convert T into 1 and N into 0
    dataset['gold'] = dataset['gold'].astype('category').cat.codes
    dataset['Program'] = dataset['Program'].astype('category').cat.codes
    dataset['classGold'] = dataset['classGold'].astype('category').cat.codes
    dataset['MethodType'] = dataset['MethodType'].astype('category').cat.codes
    dataset['CallersT'] = dataset['CallersT'].astype('category').cat.codes
    dataset['CallersN'] = dataset['CallersN'].astype('category').cat.codes
    dataset['CallersU'] = dataset['CallersU'].astype('category').cat.codes
    dataset['CallersCallersT'] = dataset['CallersCallersT'].astype('category').cat.codes
    dataset['CallersCallersN'] = dataset['CallersCallersN'].astype('category').cat.codes
    dataset['CallersCallersU'] = dataset['CallersCallersU'].astype('category').cat.codes
    dataset['CalleesT'] = dataset['CalleesT'].astype('category').cat.codes
    dataset['CalleesN'] = dataset['CalleesN'].astype('category').cat.codes
    dataset['CalleesU'] = dataset['CalleesU'].astype('category').cat.codes
    dataset['CalleesCalleesT'] = dataset['CalleesCalleesT'].astype('category').cat.codes
    dataset['CalleesCalleesN'] = dataset['CalleesCalleesN'].astype('category').cat.codes
    dataset['CalleesCalleesU'] = dataset['CalleesCalleesU'].astype('category').cat.codes
    pd.set_option('display.max_columns', None)
    row_count, column_count = dataset.shape
    Xcol = dataset.iloc[:, 1:column_count]
    CompleteSet = dataset.loc[dataset['CompleteCallersCallees'] == 1]
    CompleteSet_X = CompleteSet.iloc[:, 1:column_count].values
    CompleteSet_Y = CompleteSet.iloc[:, 0].values
    X_train, X_test, y_train, y_test = train_test_split(CompleteSet_X, CompleteSet_Y, test_size=0.2, random_state=0)
    TestSet = dataset.loc[dataset['CompleteCallersCallees'] == 0]
    X_test1 = TestSet.iloc[:, 1:column_count].values
    X_test=pd.concat(X_test1,X_test)
I want to build my own test set and training set by concatenation, so I am trying to concatenate X_test1 and X_test in the code above. However, the last line, X_test=pd.concat(X_test1,X_test), raises the error TypeError: cannot concatenate object of type "<class 'numpy.ndarray'>"; only pd.Series, pd.DataFrame, and pd.Panel (deprecated) objs are valid. How can I fix this?
By adding .values to the end of your filters in the following lines:
CompleteSet_X = CompleteSet.iloc[:, 1:column_count].values
CompleteSet_Y = CompleteSet.iloc[:, 0].values
X_test1=TestSet.iloc[:, 1:column_count].values
You are extracting the underlying NumPy ndarray from the pandas Series/DataFrame that the preceding code produces. Just remove the .values at the end, and then you can use pd.concat directly with the Series or DataFrame.
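A minimal sketch of the corrected lines (names follow the question's code; note that pd.concat also expects a list of objects, not separate positional arguments):

CompleteSet = dataset.loc[dataset['CompleteCallersCallees'] == 1]
CompleteSet_X = CompleteSet.iloc[:, 1:column_count]   # keep as DataFrame: no .values
CompleteSet_Y = CompleteSet.iloc[:, 0]                # keep as Series
X_train, X_test, y_train, y_test = train_test_split(
    CompleteSet_X, CompleteSet_Y, test_size=0.2, random_state=0)
TestSet = dataset.loc[dataset['CompleteCallersCallees'] == 0]
X_test1 = TestSet.iloc[:, 1:column_count]
X_test = pd.concat([X_test1, X_test])                 # pass the frames as a list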

IndexError: too many indices for array in python

I have an input training dataset of dimension (1500, 5) and an output training dataset of dimension (1499, 1). Please suggest what I can do to eliminate the error.
Error: Y = dataframe1[:,0]
IndexError: too many indices for array
Code:
dataframe = np.genfromtxt('DataInput.csv', delimiter=",")
#pandas.read_csv("DataInput.csv", delim_whitespace=True, header=None)
#dataset = dataframe.values
dataframe1 = np.genfromtxt("OptimizedSpeed.csv", delimiter=",")
#dataset1 = dataframe1.values
# split into input (X) and output (Y) variables
X = dataframe[:,0:4]
Y = dataframe1[:,0]  # <-- this line raises the error
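A likely cause: if OptimizedSpeed.csv holds a single column, np.genfromtxt returns a one-dimensional array of shape (1499,), and indexing it with two indices ([:, 0]) raises exactly this IndexError. A minimal sketch of a check and workaround (illustration only, assuming the files from the question):

import numpy as np

dataframe1 = np.genfromtxt("OptimizedSpeed.csv", delimiter=",")
print(dataframe1.shape)      # (1499,) for a single-column file, not (1499, 1)

if dataframe1.ndim == 1:
    Y = dataframe1           # already a flat vector, no column index needed
else:
    Y = dataframe1[:, 0]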

Do you have to create dummy variables for ordinal variables? Also getting error with conversion

For the dataset that I am working with, the categorical variables are ordinal, ranging from 1 to 5 for three columns. I am going to be feeding this into XGBoost.
Would I be okay to just run this command and skip creating dummy variables:
ser = pd.Series([1, 2, 3], dtype='category')
ser = ser.to_frame()
ser = ser.T
Conceptually, since the categorical data is ordinal, would simply converting it to type category be adequate for the model? I tried creating dummy variables, but all the values become 1.
As for the code now, it runs, but this command returns 'numpy.int64':
type(ser[0][0])
Am I going about this correctly? Any help would be great!
Edit: updated code
Edit2: Normalizing the numerical data values. Is this logic correct?:
r = [1, 2, 3, 100 ,200]
scaler = preprocessing.StandardScaler()
r = preprocessing.scale(r)
r = pd.Series(r)
r = r.to_frame()
r = r.T
Edit3: This is the dataset.
Just setting categorical variables as dtype="category" is not sufficient and won't work.
You need to convert the categorical values to numerical labels, for example with pd.factorize(), which assigns each category an integer code.
Let's say df is your pandas dataframe. Then in general you could use this boilerplate code:
df_numeric = df.select_dtypes(exclude=['object'])
df_obj = df.select_dtypes(include=['object']).copy()
# factorize categoricals columnwise
for c in df_obj:
    df_obj[c] = pd.factorize(df_obj[c])[0]
# if you want to one hot encode then add this line:
df_obj = pd.get_dummies(df_obj, prefix_sep='_', drop_first = True)
# merge dataframes back to one dataframe
df_final = pd.concat([df_numeric, df_obj], axis=1)
Since your categorical variables already are factorized (as far as I understand), you can skip the factorization and just try one hot encoding.
See also this post on stats.stackexchange.
If you want to standardize/normalize your numerical data (not the categorical) use this function:
from sklearn import preprocessing
def scale_data(data, scale="robust"):
    x = data.values
    if scale == "minmax":
        scaler = preprocessing.MinMaxScaler()
        x_scaled = scaler.fit_transform(x)
    elif scale == "standard":
        scaler = preprocessing.StandardScaler()
        x_scaled = scaler.fit_transform(x)
    elif scale == "quantile":
        scaler = preprocessing.QuantileTransformer()
        x_scaled = scaler.fit_transform(x)
    elif scale == "robust":
        scaler = preprocessing.RobustScaler()
        x_scaled = scaler.fit_transform(x)
    data = pd.DataFrame(x_scaled, columns=data.columns)
    return data
scaled_df = scale_data(df_numeric, "robust")
Putting it all together for your dataset:
from sklearn import preprocessing
df = pd.read_excel("default of credit card clients.xls", skiprows=1)
y = df['default payment next month'] #target variable
del df['default payment next month']
c = [2,3,4] # index of categorical data columns
r = list(range(0,24))
r = [x for x in r if x not in c] # get list of all other columns
df_cat = df.iloc[:, [2,3,4]].copy()
df_con = df.iloc[:, r].copy()
# factorize categorical data
for c in df_cat:
    df_cat[c] = pd.factorize(df_cat[c])[0]
# scale continuous data
scaler = preprocessing.MinMaxScaler()
df_scaled = scaler.fit_transform(df_con)
df_scaled = pd.DataFrame(df_scaled, columns=df_con.columns)
df_final = pd.concat([df_cat, df_scaled], axis=1)
#reorder columns back to original order
cols = df.columns
df_final = df_final[cols]
To further improve the code, do the train/test split before normalization: call fit_transform() on the training data and only transform() on the test data. Otherwise you will leak information from the test set into the scaler.
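A minimal sketch of that split-then-scale order (variable names follow the answer's code; for brevity it scales every column, whereas above only the continuous columns are scaled):

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.2, random_state=0)
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)  # learn the min/max from the training data only
X_test_scaled = scaler.transform(X_test)        # reuse the training statistics on the test data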
