Python- linear regression - python

I need to check these criteria in multiple linear regression in order to make a loop:
p- value < 0.05
F- statistics < 0.05
R^2 >= 0.8
I use this code:
X = df.iloc[:,2:].values
Y = df.iloc[:,1].values
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size = 0.2, random_state= 0)
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, Y_train)
Y_pred = regressor.predict(X_test)
df1 = pd.DataFrame({'Actual': Y_test.flatten(), 'Predicted': Y_pred.flatten()})
df1.plot(kind='bar')
plt.grid(which='major', linestyle='-', linewidth='0.5', color='green')
plt.grid(which='minor', linestyle=':', linewidth='0.5', color='black')
plt.show()
import statsmodels.formula.api as sm
X= np.append (arr = np.ones((141,1)).astype(int), values = X, axis = 1)
X_opt = X[:,[0,1,2,3,4,5]]
regressor_OLS = sm.OLS(endog = Y, exog = X_opt).fit()
regressor_OLS.summary()
The output is:
Here
How could I define the criteria so they become comparable? They need to receive different values in every loop.
Thanks :)

Related

How to visualize cluster boundaries

I generated several datasets, and using classifiers, I predicted the distribution of clusters. I need to draw boundaries between clusters on the chart. In the form of lines or in the form of filled areas - it does not matter. Please let me know if there is any way to do this.
My code:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import make_moons, make_circles
from sklearn.model_selection import train_test_split
n_sample = 2000
def make_square(n_sample):
data=np.array([0,[]])
data[0] = np.random.sample((n_sample,2))
for i in range(n_sample):
if data[0][i][0] > 0.5 and data[0][i][1] > 0.5 or data[0][i][0] < 0.5 and data[0][i][1] < 0.5:
data[1].append(1)
else:
data[1].append(0)
return data
datasets = [
make_circles(n_samples=n_sample, noise=0.09, factor=0.5),
make_square(n_sample),
make_moons(n_samples=n_sample, noise=0.12),
]
ks=[]
for data in datasets:
X,y = data[0],data[1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=33)
classifier = KNeighborsClassifier(n_neighbors=1)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
acc = classifier.score(X_test, y_test)
accs = []
for i in range(1, 8):
knn = KNeighborsClassifier(n_neighbors=i)
knn.fit(X_train, y_train)
pred_i = knn.predict(X_test)
acc0 = knn.score(X_test, y_test)
accs.append(acc0)
plt.figure(figsize=(12, 6))
plt.plot(range(1, 8), accs, color='red', linestyle='dashed', marker='o',
markerfacecolor='blue', markersize=10)
plt.title('accs Score K Value')
plt.xlabel('K Value')
plt.ylabel('accs Score')
print("Max Score:", max(accs), "k=",accs.index(max(accs))+1)
ks.append(accs.index(max(accs))+1)
for i in range(3):
data = datasets[i]
k = ks[i]
X,y = data[0],data[1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=33)
classifier = KNeighborsClassifier(n_neighbors=k)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
plt.figure(figsize=(9,9))
plt.title("Test")
plt.scatter(X_test[:,0], X_test[:,1], c=y_test)
plt.figure(figsize=(9,9))
plt.title("Predict")
plt.scatter(X_test[:,0], X_test[:,1], c=y_pred)
Example output:
enter image description here
enter image description here
scikit-learn 1.1 introduced the DecisionBoundaryDisplay to assist with this sort of task.
Following the use of make_moons and the KNeighborsClassifier in the question, we can fit the classifier on the dataset, invoke the DecisionBoundaryDisplay.from_estimator() method, then scatter the X data on the returned axis:
import matplotlib.pyplot as plt
from sklearn.datasets import make_moons
from sklearn.neighbors import KNeighborsClassifier
from sklearn.inspection import DecisionBoundaryDisplay
X, y = make_moons(noise=0.2)
clf = KNeighborsClassifier().fit(X, y)
disp = DecisionBoundaryDisplay.from_estimator(clf, X, response_method="predict", alpha=0.3)
disp.ax_.scatter(X[:, 0], X[:, 1], c=y)
plt.show()
Resulting in something like this:

ValueError: X has 2 features, but SVC is expecting 32 features as input

I encountered a problem as the title showed and didn't know how to fix it. I want to use cross-validation to analyze chronic disease. Could someone help me? thanks
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size =
0.2, random_state = 0)
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
from sklearn.svm import SVC
classifier = SVC(kernel='rbf', random_state=0)
classifier.fit(X_train,y_train)
# Predicting the test set results
y_pred = classifier.predict(X_test)
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
ckd = confusion_matrix(y_test, y_pred)
ckd
from matplotlib.colors import ListedColormap
X_set, y_set = X_train, y_train
X1, X2 =np.meshgrid(np.arange(start =X_set[:,0].min()-1, stop= X_set[:,
0].max()+1,step =
0.01), np.arange(start =X_set[:,1].min()-1, stop=
X_set[:,1].max()+1,step = 0.01))
plt.contourf(X1, X2, classifier.predict(np.array([X1.ravel(),
X2.ravel()]).T).reshape(X1.shape), #this line error
alpha = 0.15, cmap = ListedColormap(('red', 'green')))
plt.xlim(X1.min(),X1.max())
plt.xlim(X1.min(),X1.max())
for i, j in enumerate(np.unique(y_set)):
plt.scatter(X_set[y_set == j, 0], X_set[y_set ==j, 1],
c= ListedColormap(('red','green'))(i),label = j)
plt.title('Kernel SVM (Training set)')
plt.xlabel('Age')
plt.ylabel('Estimated Salary')
plt.legend()

Accessing corresponding timeSeries data after Support Vector Regression

I am trying to train my data for forecasting by support vector regression. I exclude time series before doing regression because time is not an input. But I need time data in plotting and should be in order. When it comes to plotting, I am having a problem with getting actual data time index values. I need actual corresponding time series for y_test and y_pred. When I tried to get original datetime index, plotting is not correct and not in the date order corresponding with the y series.
The output should be time(in order such as from 01/01/2021 to 31/12/2021) vs y_pred and y_test.
Here is my dataset: https://github.com/ozgurylc/Dataset
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
dataset = pd.read_csv('Combined_MET_PV_data.csv')
# takes necessary columns
df = dataset[['referenceTime', 'dew_point_temp', 'air_temp', 'relative_humidity',
'irradiance', 'wind_speed', 'wind_category',
'hour_harmonic', 'AC_Power_IV2']]
print(df)
X = df.iloc[:, :-1].values # does not take Power
y = df.iloc[:, -1].values # only takes Power
print(X)
print(y)
print(X.shape, y.shape)
y = np.reshape(y, (-1,1))
print(y.shape)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
print("Train Shape: {} {} \nTest Shape: {} {}".format(X_train.shape, y_train.shape,
X_test.shape, y_test.shape))
X_train = X_train[:, 1:] # excludes referenceTime from X_train
X_test1 = X_test[:, 1:] #excludes referenceTime fron X_test
print(X_test[:, 0].tolist()) # this keeps referenceTime
print(X_test)
Here is where regression is done:
# scaling
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test1 = sc_X.transform(X_test1)
sc_y = StandardScaler()
y_train = sc_y.fit_transform(y_train)
y_test = sc_y.transform(y_test)
y_train = y_train.reshape((-1,))
svr_linear = svm.SVR(kernel='rbf')
svr_linear.fit(X_train, y_train)
print(svr_linear.score(X_test1, y_test))
y_pred = svr_linear.predict(X_test1)
print(y_pred)
# in the following code X_test[:,0] where time index is kept.
plot_1 = plt.plot(X_test[:, 0], y_test, color='red', linewidth=2)
plot_2 = plt.plot(X_test[:, 0], y_pred, color='blue', linewidth=2, linestyle='--')
plt.show()

Sklearn - price prediction using cross validation

This is my code:
from sklearn.datasets import load_boston
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import train_test_split
%matplotlib inline
boston_properties = load_boston()
l_distance = boston_properties['data'][:, np.newaxis, 7]
linreg = LinearRegression()
X_train, X_test, y_train, y_test = train_test_split(l_distance, boston_properties['target'], test_size = 0.3)
y_pred = cross_val_predict(linreg, l_distance, boston_properties.target, cv=5)
plt.scatter(X_test, y_test, color='black')
plt.plot(X_test, y_pred, color='blue', linewidth=2)
plt.show()
print(y_pred.shape)
The error which I'm receiving is the following:
ValueError: x and y must have same first dimension, but have shapes (152, 1) and (506,)
How can I make this work?
You made a train_test_split, but you're not using it to train the model. Then you predict on your entire training data, and compare it with y_test. This makes no sense. Use these lines instead:
l_distance = boston_properties['data'][:, np.newaxis, 7]
linreg = LinearRegression()
X_train, X_test, y_train, y_test = train_test_split(l_distance,
boston_properties['target'], test_size = 0.3) # now you have a train/test set
y_pred = cross_val_predict(linreg, X_train, y_train, cv=5)
plt.scatter(X_train, y_train, color='black')
plt.plot(X_train, y_pred, color='blue', linewidth=2)
plt.show()
Edit: You can also use this line to make a straight line through your points:
plt.scatter(X_train, y_train, color='black')
plt.plot([X_train[np.argmin(X_train)], X_train[np.argmax(X_train)]],
[y_pred[np.argmin(X_train)], y_pred[np.argmax(X_train)]],
color='blue')
plt.show()

how to solve ? x and y must have same first dimension

from sklearn.neighbors import KNeighborsClassifier
import pandas as pd
from sklearn import metrics
from sklearn.cross_validation import train_test_split
import matplotlib.pyplot as plt
r = pd.read_csv("vitalsign_test.csv")
clm_list = []
for column in r.columns:
clm_list.append(column)
X = r[clm_list[1:len(clm_list)-1]].values
y = r[clm_list[len(clm_list)-1]].values
X_train, X_test, y_train, y_test = train_test_split (X,y, test_size = 0.3, random_state=4)
k_range = range(1,25)
scores = []
for k in k_range:
clf = KNeighborsClassifier(n_neighbors = k)
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
scores.append(metrics.accuracy_score(y_test,y_pred))
plt.plot(k_range,scores)
plt.xlabel('value of k for clf')
plt.ylabel('testing accuracy')
reponse that I am getting is
ValueError: x and y must have same first dimension
my feature and response shape is:
y.shape
Out[60]: (500,)
X.shape
Out[61]: (500, 6)
It has nothing to do with your X and y, it is about x and y arguments to plot, since your scores has one element, and k_range has 25. The error is incorrect indentation:
for k in k_range:
clf = KNeighborsClassifier(n_neighbors = k)
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
scores.append(metrics.accuracy_score(y_test,y_pred))
should be
for k in k_range:
clf = KNeighborsClassifier(n_neighbors = k)
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
scores.append(metrics.accuracy_score(y_test,y_pred))

Categories