Trying to fit a linear kernel ridge regression model on a dataset with 8 features.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import urllib.request

url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/concrete/compressive/Concrete_Data.xls'
urllib.request.urlretrieve(url, './Concrete_Data.xls')
data = pd.read_excel('./Concrete_Data.xls')
new_col_names = ["Cement", "BlastFurnaceSlag", "FlyAsh", "Water", "Superplasticizer","CoarseAggregate", "FineAggregate", "Age", "CC_Strength"]
curr_col_names = list(data.columns)
mapper = {}
for i,name in enumerate(curr_col_names):
mapper[name] = new_col_names[i]
data = data.rename(columns=mapper)
X = data.iloc[:,:-1]
y = data.iloc[:,-1]
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)
from sklearn.kernel_ridge import KernelRidge
kr = KernelRidge(alpha=1.0)
kr.fit(x_train, y_train)
y_pred_kr = kr.predict(y_test)
When I run the code, I get an error saying the expected array should be 2D but a 1D array was passed. Could someone let me know what I am doing wrong?
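In scikit-learn, predict expects the feature matrix, not the targets: kr.predict(y_test) hands the model the 1D array of test labels, which is exactly what triggers the "expected 2D array, got 1D array" message. A minimal sketch of the corrected last step, reusing the names above (the r2_score line is just an optional sanity check):
from sklearn.metrics import r2_score
y_pred_kr = kr.predict(x_test)  # predict from the scaled test features, not the targets
print(r2_score(y_test, y_pred_kr))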
I am using Python and random forests to predict the first column of my input file. My input file looks like this:
T,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
N,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
N,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
N,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
N,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
N,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
N,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Here is the link to my full data: https://drive.google.com/file/d/1gjKoSi4rmMYZVm31LZ2Li92HM9USlu6A/view?usp=sharing
I am trying to predict the first column, either T or N, from the values of the remaining columns, and I am using random forests. I am getting an error; how can I fix it? Here is the code:
import pandas as pd
import numpy as np
dataset = pd.read_csv('data1extended.txt', sep=',')
dataset.head()
row_count, column_count = dataset.shape
X = dataset.iloc[:, 1:column_count].values
y = dataset.iloc[:, 0].values
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor(n_estimators=20, random_state=0)
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))
Try changing your target variable to numeric first. Assuming the 'gold' column is your target, run this immediately after loading the data into a dataframe.
dataset['gold'] = dataset['gold'].astype('category').cat.codes
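A minimal sketch of that idea, assuming the label column really is named 'gold'. Note also that confusion_matrix and classification_report need discrete predictions, so a RandomForestClassifier is substituted for the regressor above; the scaler is dropped because tree ensembles do not need feature scaling:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

dataset = pd.read_csv('data1extended.txt', sep=',')
dataset['gold'] = dataset['gold'].astype('category').cat.codes  # T/N -> integer codes (assumes 'gold' is the label)
X = dataset.drop(columns=['gold']).values
y = dataset['gold'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
clf = RandomForestClassifier(n_estimators=20, random_state=0)  # classifier, since T/N is a class label
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(accuracy_score(y_test, y_pred))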
I have a problem with this code; the error occurs on the line ppn.fit(X_train, y_train).
I am using Python 3.7.
import pandas as pd
import numpy as np
import sklearn as sk
from sklearn.metrics import accuracy_score
from sklearn.linear_model import Perceptron
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
df = pd.read_csv("file.csv", sep=',', error_bad_lines=False, low_memory=False)
X = df.iloc[:, 1:44].values
y = df.iloc[:, 48].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
X_train = np.isnan(X_train)
y_train = np.isnan(y_train)
X_test = np.isnan(X_test)
ppn = Perceptron(max_iter=40, tol=0.001, eta0=0.1, random_state=0)
ppn.fit(X_train, y_train)
y_pred = ppn.predict(X_test)
y_pred = np.isnan(y_pred)
print(accuracy_score(y_test, y_pred))
How can I fix it? Thanks.
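A likely culprit is the three np.isnan calls: np.isnan does not remove missing values, it returns a boolean mask of the same shape, so X_train and y_train are overwritten with True/False arrays before fitting (and if y contains no NaNs at all, y_train collapses to a single all-False class, which makes Perceptron.fit complain). If the intent was to handle missing values, an imputer is the usual route; a minimal sketch under that assumption, replacing the np.isnan lines in the script above:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='mean')  # fill NaNs in the features with the column mean
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)
ppn = Perceptron(max_iter=40, tol=0.001, eta0=0.1, random_state=0)
ppn.fit(X_train, y_train)                 # y_train stays as the raw labels
y_pred = ppn.predict(X_test)
print(accuracy_score(y_test, y_pred))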
The problem is that I get two totally different results when I run the DTC algorithm. I just want to make sure I am writing the K-fold cross-validation correctly, and to understand why the K-fold result is so much lower than the normal one.
I ran the code below to get both the plain train/test accuracy and the K-fold accuracy:
from scipy.signal import butter, lfilter
from scipy.fftpack import fft
import pickle
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
xx = pd.read_csv("data1.dat", delimiter=",")
y = pd.read_csv("label.dat", delim_whitespace=True)
x = xx.to_numpy()                      # as_matrix() was removed in recent pandas
y = np.asarray(y).astype(int).ravel()  # flatten to 1D integer labels
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
clf2 = DecisionTreeClassifier(random_state=42)
clf2.fit(X_train, y_train)
y_predict_2 = clf2.predict(X_test)
print("DTC Accuracy : ")
print(accuracy_score(y_test, y_predict_2)*100)
DTC Accuracy :
97.6302083333333
from sklearn.model_selection import cross_val_score
DTC = DecisionTreeClassifier(random_state=42)
scores = cross_val_score(DTC, x, y, cv=10, scoring='accuracy')
print(scores.mean()*100)
35.331452470904985
from sklearn.model_selection import cross_val_score
DTC = DecisionTreeClassifier(random_state=42)
scores = cross_val_score(DTC, X_train, y_train, cv=10, scoring='accuracy')
print(scores.mean()*100)
97.34356
However, in the cross-validation part, when I use X_train instead of x and y_train instead of y, the accuracy rises back to 97. I am wondering which one is the correct and sensible way to cross-validate: (x, y) or (X_train, y_train)?
Try shuffling your data and reducing the number of cross-validation folds. If the rows of data1.dat are ordered (for example grouped by class), plain K-fold builds each fold from a contiguous block, so some folds end up testing on classes the model barely saw during training; that would explain why the score drops so sharply on the raw x and y but not on X_train and y_train, which train_test_split shuffles by default.
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.utils import shuffle
from sklearn.tree import DecisionTreeClassifier
xx = pd.read_csv("data1.dat", delimiter=",")   # pd, matching the alias imported above
y = pd.read_csv("label.dat", delim_whitespace=True)
x = xx.to_numpy()                              # as_matrix() was removed in recent pandas
y = y.to_numpy().astype(np.int32).ravel()      # 1D labels avoid a DataConversionWarning
x, y = shuffle(x, y, random_state=42)
DTC = DecisionTreeClassifier(random_state=42)
scores = cross_val_score(DTC, x, y, cv=3, scoring='accuracy')
print(scores.mean()*100)
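An equivalent route that keeps all 10 folds is to pass a shuffling splitter to cross_val_score instead of shuffling the arrays by hand; a sketch reusing the names above (StratifiedKFold would additionally preserve class proportions in each fold):
from sklearn.model_selection import KFold
cv = KFold(n_splits=10, shuffle=True, random_state=42)  # shuffle rows before carving out folds
scores = cross_val_score(DTC, x, y, cv=cv, scoring='accuracy')
print(scores.mean() * 100)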