ValueError: Found input variables with inconsistent numbers of samples: [645471, 78] - python

How can I fix this error it throws? ValueError: Found input variables with inconsistent numbers of samples: [645471, 78]
full code attached
#Importing the numpy to perform Linear Algebraic operations on the data
import numpy as np
#Import pandas library to perform the data preprocessing
import pandas
#importing the Keras deep learning framework of Python
import keras
#Importing the Sequential model from keras
from keras.models import Sequential
#Importing the types of layers in the Neural Network that we are going to have
from keras.layers import Dense
#Importing the train_test_split function which is useful in dividing the dataset into the training and testing data
from sklearn.model_selection import train_test_split
#Importing the StandardScaler function to perform the standardisation/scaling of the data
from sklearn.preprocessing import StandardScaler, LabelEncoder
#Importing the metries for the performance evaluation of our deep learning model
from sklearn import metrics
from keras.utils import np_utils, normalize, to_categorical
data = pandas.read_csv("C:/Users/bam/train.csv", header=0, dtype=object)
X = data.iloc[:, 0:78]
y = data.iloc[:78]
#I have splitted the dataset into a ratio of 80:20 between the train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 23)
#Creating an object of StandardScaler
sc = StandardScaler()
#Scaling the data using the StandardScaler() object
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
error

Related

Tensorflow better model loading

I am traing to load my model without my database, but I have to use Standard Scale split. This is my code:
import tensorflow as tf
import pandas as pd
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
dataset = pd.read_csv('database.csv') #reading database
x = dataset.drop(columns=['good/bad']).values
y = dataset['good/bad'].values
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
scaler = StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)
model = load_model("model.h5")
#Now I want to predict my data
out = scaler.transform([my_data])
prediction = model.predict(out)
pred = prediction[0][0]
Can I predict my data without loading my dataset?
In your code, firstly you need to create a model. For the research models and techniques, most machine learning or deep learning practitioners use these steps:
Data preprocess
Create Model
Save model
Load model
Make Prediction
More details have been already cleared in Tensorflow official Documentation

The first argument to `Layer.call` must always be passed

import os
from pylab import rcParams
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns; sns.set()
from numpy import *
from scipy import stats
from pandas.plotting import scatter_matrix
import sklearn
import warnings
from imblearn.over_sampling import SMOTE
import tensorflow as tf
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import GridSearchCV
from imblearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
data = pd.read_excel(r'Attrition Data Exercise.xlsx')
X = data.iloc[:, 3:-1].values
y = data.iloc[:, -1].values
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
ct = ColumnTransformer(transformers=
[('one_encoder', OneHotEncoder(), [2, 5, 11, 13, 28]),
('ord_encoder', OrdinalEncoder(), [0])],
remainder='passthrough')
X = np.array(ct.fit_transform(X))
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
ann = tf.keras.models.Sequential()
ann.add(tf.keras.layers.Dropout(rate=0.3))
ann.add(tf.keras.layers.Dense(units=6, activation='relu', kernel_regularizer='l1', bias_regularizer='l2'))
ann.add(tf.keras.layers.Dropout(rate=0.3))
ann.add(tf.keras.layers.Dense(units=3, activation='relu', kernel_regularizer='l1', bias_regularizer='l2'))
ann.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))
opt = tf.keras.optimizers.Adam(
learning_rate=0.001,
beta_1=0.9,
beta_2=0.999,
epsilon=1e-08)
ann.compile(optimizer = opt, loss = 'binary_crossentropy', metrics = ['accuracy', tf.keras.metrics.Recall()])
The above code runs successfully. It's when I run the below code in a cell that it causes an error.
pipe = Pipeline([('smt', SMOTE()), ('model', KerasClassifier(build_fn = ann, verbose = 0, epochs=170))])
weights = np.linspace(0.5, 0.5, 1)
gsc = GridSearchCV(
estimator = pipe,
param_grid = {
'smt__sampling_strategy' : weights
},
scoring = 'f1',
cv = 4)
grid_result = gsc.fit(X_train, y_train)
The code above results in the following error:
ValueError: The first argument to `Layer.call` must always be passed
Any idea what I might be doing wrong or what can be improved?
I tried replacing KerasClassifier with KerasRegressor too just to see if something changes but nothing did. What essentially is going wrong?
I'm trying to use the Pipeline class from imblearn and GridSearchCV to get the best parameters for classifying the imbalanced dataset, I want to leave out resampling of the validation set and only resample the training set, which imblearn's Pipeline seems to be doing. However, I'm getting an error while implementing the accepted solution
Also link to the screenshot to the error trace is attached.Error Trace Complete
#danr got it correct. Many thanks to him. I was getting the same error when using KerasClassifier with sklearn's cross_val_score. Adding the lambda after build_fn solved the problem. I had a function create_model that created a keras Sequential model. Corrected code that runs smoothly (tensorflow 2.4.1):
from sklearn.model_selection import cross_val_score
# Create a KerasClassifier using best params determined using RandomizedSearchCV above
model = KerasClassifier(build_fn = lambda: create_model(learning_rate = 0.01, activation = 'tanh'), epochs = 50, batch_size = 32, verbose = 0)
# Calculate the accuracy score for each fold
kfolds = cross_val_score(model, X, y, cv = 3)

Fitting linear model using PolynominalFeatures

I want to create some random data and try to improve my model with PolynominalFeatures, however I'm facing little troubles with doing so.
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
import random
import pandas as pd
import numpy as np
import statsmodels.api as sm
#create some artificial data
x=np.linspace(-1,1,1000)
x=pd.array(random.choices(x,k=1000))
y=x**2+np.random.randn(1000)
#divide sample
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.5)
#define data frame to futher use for PolynomialFeatures
df=pd.DataFrame([x_train,x_test])
df=df.transpose()
data = df
# perform a polynomial features transform of the dataset
trans = PolynomialFeatures(degree=2)
data = trans.fit_transform(data)
model = sm.OLS(y_train,data).fit()
And then I get error : ValueError: unrecognized data structures: <class 'pandas.core.arrays.numpy_.PandasArray'> / <class 'numpy.ndarray'>
Do you have any ideas what should be done to make my regression work properly ?
use to_numpy() function to convert pandas array to numpy array
model = sm.OLS(y_train.to_numpy(),data).fit()

Trouble importing Keras

Here is the complete code
top part runs fine till i import keras.
I have tried installing and uninstalling keras, however the error is still there
Classification template
# Importing the libraries
import numpy as my
import matplotlib.pyplot as plt
import pandas as pd
# Importing the dataset
dataset = pd.read_csv('Churn_Modelling.csv')
X = dataset.iloc[:, 3:13].values
y = dataset.iloc[:, 13].values
# Encoding categorical data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder_X_1 = LabelEncoder()
X[:, 1] = labelencoder_X_1.fit_transform(X[:, 1])
labelencoder_X_2 = LabelEncoder()
X[:, 2] = labelencoder_X_2.fit_transform(X[:, 2])
onehotencoder = OneHotEncoder(categorical_features = [1])
X = onehotencoder.fit_transform(X).toarray()
#Removing 1 Dummy Variable to avoid Dummy Variable Trap
X = X[:, 1:]
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)
# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
# Part 2: Let's make the ANN
#Importing the keras library
import keras.backend
import keras
from keras.models import Sequential
from keras.layers import Dense
# Initialising the ANN
classifier = Sequential()
AttributeError: module 'tensorflow.python.keras.backend' has no attribute 'get_graph'
Solution (as found in comments) was to install keras version 2.2.4
e.g:
pip install 'keras==2.2.4'
if you are above that version, you may try using this function instead:
keras.backend.image_data_format()

k-fold cross validation using tensorflow

I have created an artificial neural network. I am trying to calculate the accuracy of the model using k-fold cross validation technique but after compiling the last line its not progressing any further, its stuck there for more than 20 mins. I am not able to figure out where I am going wrong. Can anyone please help me with this thing? Below is the code I have used.
Thanks in advance.
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
dataset = pd.read_csv('Churn_Modelling.csv')
X = dataset.iloc[:, 3:13].values
y = dataset.iloc[:, 13].values
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder_X_1 = LabelEncoder()
X[:, 1] = labelencoder_X_1.fit_transform(X[:, 1])
labelencoder_X_2 = LabelEncoder()
X[:, 2] = labelencoder_X_2.fit_transform(X[:, 2])
onehotencoder = OneHotEncoder(categorical_features = [1])
X = onehotencoder.fit_transform(X).toarray()
X=X[:,1:]
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score
from keras.models import Sequential #required to initialize ann
from keras.layers import Dense #required to build the layers of ann
def build_classifier():
classifier=Sequential()
classifier.add(Dense(kernel_initializer="uniform", activation="relu", input_dim=11, units=6))
classifier.add(Dense(kernel_initializer="uniform", activation="relu", units=6))
classifier.add(Dense(kernel_initializer="uniform", activation="sigmoid",units=1))
classifier.compile(optimizer='adam', loss='binary_crossentropy',metrics=['accuracy'])
return classifier
classifier=KerasClassifier(build_fn=build_classifier, batch_size=10, nb_epoch=100)
accuracies=cross_val_score(estimator=classifier,X=X_train,y=y_train,cv=10,n_jobs=-1)
I had the same issue with the exact same code. It seems Windows has an issue with "n_jobs", if you remove it by "accuracies = .." , it will start working. It's just that it could take long but it will work and show each epoch being updated.

Categories