Trying to fit a linear kernel ridge regression model on a dataset with 8 features.
import urllib.request

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.kernel_ridge import KernelRidge
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Download the concrete compressive-strength spreadsheet and load it.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/concrete/compressive/Concrete_Data.xls'
urllib.request.urlretrieve(url, './Concrete_Data.xls')
data = pd.read_excel('./Concrete_Data.xls')

# Replace the spreadsheet's column headers with short names, matched by
# position.
new_col_names = ["Cement", "BlastFurnaceSlag", "FlyAsh", "Water", "Superplasticizer","CoarseAggregate", "FineAggregate", "Age", "CC_Strength"]
curr_col_names = list(data.columns)
mapper = dict(zip(curr_col_names, new_col_names))
data = data.rename(columns=mapper)

# Every column except the last is a feature; the last column is the target.
X = data.iloc[:,:-1]
y = data.iloc[:,-1]

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)

# Fit the scaler on the training split only, then reuse it for the test split.
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

kr = KernelRidge(alpha=1.0)
kr.fit(x_train, y_train)
# predict() takes the 2-D feature matrix. Passing the 1-D target (y_test)
# here is what raised "Expected 2D array, got 1D array instead".
y_pred_kr = kr.predict(x_test)
When I try to run the code, there is an error that says the expected array is meant to be 2D but is a 1D array. Could someone let me know what I am possibly doing wrong?
Related
How can I use this dataset "MC1" to plot a KNN decision boundary figure?
Here is my code. I have tried to use iloc and loc, but neither worked.
from sklearn.model_selection import train_test_split as tts
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from yellowbrick.contrib.classifier import DecisionViz
from yellowbrick.features import RadViz
from yellowbrick.style import set_palette

set_palette('flatui')

data_set = pd.read_csv('MC1.csv')
# Unpacking a DataFrame directly (``X, y = data_set``) iterates over its
# column labels, not its data, so the features and target have to be sliced
# out explicitly.
# NOTE(review): assumes the class label is the last column of MC1.csv and all
# other columns are numeric features -- confirm against the file.
# NOTE(review): a decision-boundary plot is two-dimensional; if MC1.csv has
# more than two feature columns, select the two to plot here -- confirm.
X = data_set.iloc[:, :-1].values
y = data_set.iloc[:, -1].values

X = StandardScaler().fit_transform(X)
X_train, X_test, y_train, y_test = tts(X, y, test_size=.4, random_state=42)

visualizer = RadViz(size=(500, 400))
viz = DecisionViz(
    KNeighborsClassifier(5), title="Nearest Neighbors", classes=['Y', 'N']
)
viz.fit(X_train, y_train)
viz.draw(X_test, y_test)
viz.show()
I was reading this tutorial on LIME which showed how to visualise the result of a prediction by executing this code
import pandas as pd
import numpy as np
import lime
import matplotlib.pyplot as plt
import random
import warnings

warnings.filterwarnings("ignore")

# NOTE(review): load_boston was removed in scikit-learn 1.2; this script needs
# an older scikit-learn or a different dataset -- confirm the target version.
from sklearn.datasets import load_boston
boston = load_boston()
boston_df = pd.DataFrame(data=boston.data, columns = boston.feature_names)
boston_df["Price"] = boston.target

from sklearn.model_selection import train_test_split
X, Y = boston.data, boston.target
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=0.90, test_size=0.1, random_state=123, shuffle=True)

from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X_train, Y_train)

from lime import lime_tabular
explainer = lime_tabular.LimeTabularExplainer(X_train, mode="regression", feature_names= boston.feature_names)

# random.randint is inclusive at both ends, so randint(1, len(X_test)) could
# return len(X_test) (IndexError) and could never select row 0. randrange
# yields a valid index in [0, len(X_test)).
idx = random.randrange(len(X_test))
explanation = explainer.explain_instance(X_test[idx], lr.predict, num_features=len(boston.feature_names))
explanation.show_in_notebook()
The code produced this graph
My question is: how can I change the text size for the words negative and positive in that graph? I searched the docs but couldn't find any info on that matter.
I'm trying to run this python script,
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

dataset= pd.read_csv('Desktop/Yahoo_Finance.csv')
dataset.head()

# NOTE(review): if one of the first two columns holds date strings,
# StandardScaler cannot convert it to float; convert it to a numeric
# representation (e.g. via pd.to_datetime) before scaling -- confirm the
# CSV schema.
X = dataset.iloc[:, 0:2].values
y = dataset['Close']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

# Fit the scaler on the training split only, then reuse it for the test split.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

classifier = LinearRegression()
classifier.fit(X_train, y_train)
# The plot below reads y_pred, which was never computed in the original.
y_pred = classifier.predict(X_test)

plt.xlabel("Date")
plt.ylabel("Yahoo Finance Stock Prices")
plt.plot(y_pred[:50],linewidth=3.0)
but I'm getting the following error:
this is the error showing up on google colab
You probably need to convert the date column into datetime type before scaling.
The problem is that I am getting two totally different results when I run the DTC algorithm. I just want to make sure that I am writing the K-fold cross-validation correctly, or to understand why the K-fold result is so much lower than the normal one.
I ran the code below to get both the normal accuracy and the K-fold accuracy:
from scipy.signal import butter, lfilter
import numpy as np
import pandas as pd
import pandas
from sklearn import preprocessing
from scipy.fftpack import fft
import pickle
import numpy
from pandas import Series
from numpy.random import randn
import pandas as pd
import numpy as np
from pandas import DataFrame
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
# Load the feature table and the whitespace-delimited label file.
xx = pandas.read_csv("data1.dat", delimiter=",")
# sep=r"\s+" is the documented equivalent of the deprecated
# delim_whitespace=True.
y = pandas.read_csv("label.dat", sep=r"\s+")

# DataFrame.as_matrix() was removed from pandas; .values returns the same
# ndarray view of the data.
x = xx.values
# The numpy.int alias was removed from NumPy; the builtin int is what it
# pointed to.
y = numpy.array(y).astype(int)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Fit the scaler on the training split only, then reuse it for the test split.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Single hold-out evaluation of a decision tree.
clf2 = DecisionTreeClassifier(random_state=42)
clf2.fit(X_train, y_train)
y_predict_2 = clf2.predict(X_test)
print("DTC Accuracy : ")
print(accuracy_score(y_test, y_predict_2)*100)
DTC Accuracy :
97.6302083333333
from sklearn.model_selection import cross_val_score

# 10-fold cross-validated accuracy of a fresh decision tree on x and y,
# printed as a percentage.
DTC = DecisionTreeClassifier(random_state=42)
scores = cross_val_score(
    DTC,
    x,
    y,
    scoring='accuracy',
    cv=10,
)
mean_accuracy = scores.mean()
print(mean_accuracy * 100)
35.331452470904985
from sklearn.model_selection import cross_val_score

# 10-fold cross-validated accuracy of a fresh decision tree on X_train and
# y_train, printed as a percentage.
DTC = DecisionTreeClassifier(random_state=42)
scores = cross_val_score(
    DTC,
    X_train,
    y_train,
    scoring='accuracy',
    cv=10,
)
mean_accuracy = scores.mean()
print(mean_accuracy * 100)
97.34356
However, in the cross validation part, when I put X_train instead of x and y_train instead of y, the accuracy again rises to 97.
I am wondering which one I should use, (x and y) or (X_train and y_train), for a correct, common-sense cross-validation.
Try to shuffle your data and reduce the cross validation folds.
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.utils import shuffle
from sklearn.tree import DecisionTreeClassifier

# Only the ``pd`` alias is imported here, so the reads must go through it
# (the original called ``pandas.read_csv``, a NameError in this snippet).
xx = pd.read_csv("data1.dat", delimiter=",")
# sep=r"\s+" is the documented equivalent of the deprecated
# delim_whitespace=True.
y = pd.read_csv("label.dat", sep=r"\s+")

# DataFrame.as_matrix() was removed from pandas; .values returns the same
# ndarray.
x = xx.values
# Flatten the labels to 1-D rather than reshaping them into a column vector.
# NOTE(review): assumes label.dat holds a single label column -- confirm.
y = y.values.astype(np.int32).ravel()

# Shuffle rows before cross-validating so that folds are not determined by
# the order of the rows in the files.
x, y = shuffle(x, y, random_state=42)

DTC = DecisionTreeClassifier(random_state=42)
scores = cross_val_score(DTC, x, y, cv=3, scoring='accuracy')
print(scores.mean()*100)
This is the custom code
# Custom model for multiple linear regression (normal equations).
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

dataset = pd.read_csv("50s.csv")
x = dataset.iloc[:,:-1].values
y = dataset.iloc[:,4:5].values

# Integer-encode the categorical column at index 3.
lb = LabelEncoder()
x[:,3] = lb.fit_transform(x[:,3])

# OneHotEncoder's ``categorical_features`` argument was removed from
# scikit-learn; ColumnTransformer is the replacement. The encoded columns are
# emitted first and the untouched columns follow. sparse_threshold=0 forces a
# dense result, and the cast to float is needed because the pass-through
# columns come from an object-dtype array.
on = ColumnTransformer(
    [("onehot", OneHotEncoder(), [3])],
    remainder="passthrough",
    sparse_threshold=0,
)
x = np.asarray(on.fit_transform(x), dtype=float)
# Drop the first dummy column so the dummies are not collinear with the
# intercept added below.
x = x[:,1:]

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=1/5, random_state=0)

# The original design matrix had no constant column, so the fitted plane was
# forced through the origin. LinearRegression fits an intercept by default,
# which is why the two sets of predictions disagreed. Prepend a column of
# ones so the first coefficient plays the role of the intercept.
con = np.column_stack([np.ones(len(X_train)), X_train])
z = np.asarray(y_train, dtype=float)

#training model: f = (con^T con)^-1 con^T z
result1 = con.transpose() @ con
result1 = np.linalg.inv(result1)
p = con.transpose() @ z
f = result1 @ p

# Predict every test row in one matrix product, using the same intercept
# column, instead of hard-coding five coefficient terms.
l = np.column_stack([np.ones(len(X_test)), X_test]) @ f

plt.scatter(y_test,l)
plt.show()
Then I created a model with scikit-learn
and compared the results with y_test and l(predicted values of above code)
comparisons are as follows
# Print, row by row: the true value, the scikit-learn prediction, and the
# custom-model prediction.
for row, sk_value in enumerate(prediction):
    print(y_test[row], sk_value, l[row], sep=' ')
103282.38 103015.20159795816 [[116862.44205399]]
144259.4 132582.27760816005 [[118661.40080974]]
146121.95 132447.73845175043 [[124952.97891882]]
77798.83 71976.09851258533 [[60680.01036438]]
These were the comparisons between y_test, the scikit-learn model predictions, and the custom code predictions.
Please help with the accuracy of the model.
blue :Custom model predictions
yellow : scikit-learn model predictions