My Python linear regression has incomplete line - python

I am trying to plot a linear regression, but the line is incomplete. Below is my Python code:
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
# Training data: one feature value per sample (the x-axis is labelled as a room count)
# and the matching price labels.
features = [[2],[4],[8],[5]]
labels = [320, 610, 1190, 726]
# Show the raw observations as black dots.
plt.scatter(features,labels,color="black")
plt.xlabel("Number of room")
plt.ylabel('price')
# Fit a linear regression on the four samples.
clf = linear_model.LinearRegression()
clf=clf.fit(features,labels)
# Predictions are requested only for x = 11 and x = 9, and the line below is drawn
# through exactly those two points. The blue segment therefore spans x in [9, 11] only
# and does not reach the scattered training points at x = 2..8.
result = clf.predict([[11],[9]])
plt.plot([[11],[9]], result, color='blue', linewidth=3)
print(result)
plt.show()
And the picture of the plot is given below:
From the picture, you can see that the line is incomplete. Please help me to solve this problem. Make the line complete with all the other values.

Related

How to find the 'peak' of a polynomial regression line in Matplotlib

Is it possible to find the peak (vertex?) values (x,y) of a polynomial regression line that was computed using Matplotlib?
I've included my basic setup below (with fuller data sets in practice), as well as a screenshot of the actual regression line in question.
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
# Degree of the polynomial fitted below.
degree=6
# Toy data: three (x, y) pairs, each held as a single-column DataFrame.
setX={'Fixation Duration': {0:1,1:2,2:3}}
setY={'Fixation Occurrences': {0:1,1:2,2:3}}
X_gall=pd.DataFrame.from_dict(setX)
Y_gall=pd.DataFrame.from_dict(setY)
# 300 evenly spaced x values from the smallest to the largest observation,
# reshaped into a single column for predict().
X_seqGall = np.linspace(X_gall.min(),X_gall.max(),300).reshape(-1,1)
# Pipeline: expand x into polynomial features of the chosen degree, then fit a
# LinearRegression on those features.
polyregGall=make_pipeline(PolynomialFeatures(degree),LinearRegression())
polyregGall.fit(X_gall,Y_gall)
# Observations as translucent dots; the fitted curve evaluated on the dense grid as a line.
plt.scatter(X_gall,Y_gall, c="#1E4174", s=100.0, alpha=0.4)
plt.plot(X_seqGall,polyregGall.predict(X_seqGall),color="#1E4174", linewidth=4)
plt.show()
would like to find x,y values along red arrows
You can find the maximum from the underlying plot data.
First, let's change your plotting commands to explicitly define the axes:
# Create the figure and axes explicitly so the drawn artists can be reached afterwards.
fig, ax = plt.subplots(figsize=(6,4))
# The scatter's return value is not needed, hence the throwaway name.
_ = ax.scatter(X_gall,Y_gall, c="#1E4174", s=100.0, alpha=0.4)
# Keep what ax.plot returns: poly[0] is used later to read the curve's data back.
poly = ax.plot(X_seqGall,polyregGall.predict(X_seqGall),color="#1E4174", linewidth=4)
plt.show()
Now you can access the line data:
# All lines drawn on the axes that holds the regression curve.
lines = poly[0].axes.lines
for line in lines:
    # Read each coordinate array once and reuse it for both the maximum and its position.
    x_data = line.get_xdata()
    y_data = line.get_ydata()
    max_y = np.max(y_data)
    print(f"Maximum y is: {max_y}")
    # The x value sitting at the same index as the largest y value.
    x_of_max_y = x_data[np.argmax(y_data)]
    print(f"x value of maximum y is: {x_of_max_y}")
Output:
Maximum y is: 3.1515605364361114
x value of maximum y is: 2.8127090301003346

ConfusionMatrixDisplay (Scikit-Learn) plot labels out of range

The following code plots a confusion matrix:
from sklearn.metrics import ConfusionMatrixDisplay
# NOTE(review): the result is bound to the same name as the function being called, so
# confusion_matrix() cannot be called again afterwards. y_true, y_pred and plt are not
# defined in this snippet and are presumably set up earlier - confirm.
confusion_matrix = confusion_matrix(y_true, y_pred)
# One display label per class.
target_names = ["aaaaa", "bbbbbb", "ccccccc", "dddddddd", "eeeeeeeeee", "ffffffff", "ggggggggg"]
disp = ConfusionMatrixDisplay(confusion_matrix=confusion_matrix, display_labels=target_names)
# Blue colour map, x tick labels rotated by 45 degrees.
disp.plot(cmap=plt.cm.Blues, xticks_rotation=45)
plt.savefig("conf.png")
There are two problems with this plot.
The y-axis label is cut off (True Label). The x label is cut off too.
The names are too long for the x-axis.
To solve the first problem I tried to use poof(bbox_inches='tight'), which is unfortunately not available for sklearn.
For the second problem I tried the following solution, which led to a completely distorted plot.
All in all, I'm struggling with both problems.
I think the easiest way would be to switch into tight_layout and add pad_inches= something.
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
import matplotlib.pyplot as plt
from numpy.random import default_rng
# Random labels for 7 classes (0..6) stand in for real ground truth and predictions.
rand = default_rng()
y_true = rand.integers(low=0, high=7, size=500)
y_pred = rand.integers(low=0, high=7, size=500)
# Store the matrix under its own name so the imported confusion_matrix() function
# is not overwritten and stays callable.
cm = confusion_matrix(y_true, y_pred)
target_names = ["aaaaa", "bbbbbb", "ccccccc", "dddddddd", "eeeeeeeeee", "ffffffff", "ggggggggg"]
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=target_names)
disp.plot(cmap=plt.cm.Blues, xticks_rotation=45)
# Re-flow the figure so the rotated tick labels and axis labels fit inside it.
plt.tight_layout()
# pad_inches only takes effect together with bbox_inches="tight"; without it the
# padding argument is ignored. Half an inch leaves a visible margin around the plot.
plt.savefig("conf.png", bbox_inches="tight", pad_inches=0.5)
Result:

Polynomial Regression plot not showing correctly

I ran this code for polynomial regression using sklearn, but my plot is not what I was expecting. As you can see here, I'm not getting a smooth line; instead it jumps from one point to another. From my understanding I have to sort X, but when I do that all I get is an empty plot with a straight line.
import operator
import numpy as np
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score
import statsmodels.formula.api as smf
# Load only the three columns used for clustering and regression.
# Raw string: the backslash in the Windows path must not be read as an escape sequence.
df = pd.read_csv(r'D:\Mall_Customers.csv', usecols=['Age', 'Annual Income (k$)', 'Spending Score (1-100)'])

# Standardise every column, then split the customers into three k-means clusters.
x = StandardScaler().fit_transform(df)
kmeans = KMeans(n_clusters=3, max_iter=100)
y_kmeans = kmeans.fit_predict(x)

# Map each cluster id to the row positions assigned to it, and keep the same
# information as a list of [cluster_id, positions] pairs.
mydict = {i: np.where(kmeans.labels_ == i)[0] for i in range(kmeans.n_clusters)}
dictlist = [[key, value] for key, value in mydict.items()]

# Restrict the regression to the rows of cluster 0: two input features, one target.
df0 = df[df.index.isin(mydict[0].tolist())]
X = df0[['Age', 'Annual Income (k$)']]
Y = df0['Spending Score (1-100)']

# Degree-2 polynomial regression fitted on explicitly expanded features; report its R^2.
poly_reg = PolynomialFeatures(degree=2)
X_poly = poly_reg.fit_transform(X)
model = LinearRegression()
model.fit(X_poly, Y)
y_poly_pred = model.predict(X_poly)
r2 = r2_score(Y, y_poly_pred)
print(r2)

# The same kind of model built as a pipeline (without an intercept term); this
# rebinding replaces the model fitted above.
model = make_pipeline(PolynomialFeatures(degree=2), LinearRegression(fit_intercept=False))
model.fit(X, Y)

plt.scatter(X.iloc[:, 1], Y, color='red')
# NOTE: this joins the raw observations in row order (both feature columns against Y);
# it does not draw the fitted model, which is why the line jumps from point to point.
plt.plot(X, Y, color='blue')
plt.xlabel('Age. Annual income')
plt.ylabel('Spending Score')
plt.show()
TL;DR: the data is not linearly dependent.
The graph is so messy for two reasons: you plotted X (the training data) against Y (the actual target values) as a line,
and the data itself is noisy and not really linearly dependent.
Together these produce the messy graph.
I suggest that you:
split the data into train and test sets, then, after training the model, check the error on the test set, and perhaps create two plots: one with the model's predictions for the test data and one with the actual results for the test data.
and change plot code to this:
.
# Observed data points.
plt.scatter(X, Y)
# NOTE(review): Y_pred is not defined in the question's code, which stores its
# predictions in y_poly_pred - presumably that is what is meant here; confirm.
plt.plot(X, Y_pred, color='red')
plt.show()

Linear regression plot is really bad

observations that are different from each other, so I ran the regression again but for only one cluster. But it also came out wrong. What exactly is wrong here? I should also point out that I am still new to this (linear regression etc.), so my understanding of all this is still poor. How can I fix this plot? And, if possible, please try to explain why it's wrong.
Code :
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import numpy as np
# Seed NumPy's global random number generator.
np.random.seed(0)
# NOTE(review): `kmeans` is never created in this snippet - the code that builds and
# fits it appears to be missing from the post.
kmeans.cluster_centers_
kmeans.labels_
# NOTE(review): `train_test_split`, `X` and `Y` are not imported or defined here either,
# and the two-name target `n,y_test` looks truncated; confirm against the original post.
n,y_test = train_test_split(X, Y, test_size = 0.4, random_state = 0)
# Scatter the second column of X against Y.
plt.scatter(X.iloc[:, 1], Y)
plt.show()
You're performing multiple linear regression, since you have 2 input features ('Age', 'Annual Income (k$)') that try to predict the output feature ('Spending Score (1-100)'). You need to plot this data in 3D, in order to properly visualize the regression.
Even though I can't test your code without the data, something like this should work (after training the model):
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
# One figure holding a single 3D axes.
fig, ax = plt.subplots(subplot_kw={'projection': '3d'})

# The two input features, taken by column position.
first_feature = X.iloc[:, 0]
second_feature = X.iloc[:, 1]

# Observed targets as points, model predictions as a red line over the same inputs.
ax.scatter(first_feature, second_feature, Y)
ax.plot(first_feature, second_feature, y_pred, color='red')

ax.set_xlabel('Age')
ax.set_ylabel('Annual Income')
ax.set_zlabel('Spending Score')

Translate cross_validation algorithm to model_selection

In 2016, I ran a lasso regression model using the code below:
#Import required packages
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pylab as plt
import matplotlib.pyplot as plp
import seaborn as sns
import statsmodels.formula.api as smf
from scipy import stats
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LassoLarsCV
# split data into train and test sets
# NOTE(review): `predictors` and `target` are not defined in this snippet; they are
# presumably prepared earlier in the original script.
pred_train, pred_test, tar_train, tar_test = train_test_split(predictors, target, test_size=.4, random_state=123)
#%
# specify the lasso regression model
model=LassoLarsCV(cv=10, precompute=False).fit(pred_train,tar_train)
#%
# print variable names and regression coefficients
# (the dict built here is neither stored nor passed to print)
dict(zip(predictors.columns, model.coef_))
#regcoef.to_csv('variable+regresscoef.csv')
#%%
# plot coefficient progression
m_log_alphas = -np.log10(model.alphas_)
# `ax` is not used again in this snippet
ax = plt.gca()
plt.plot(m_log_alphas, model.coef_path_.T)
# dashed vertical line at the alpha selected by cross-validation
plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k',
label='alpha CV')
plt.ylabel('Regression Coefficients')
plt.xlabel('-log(alpha)')
plt.title('Regression Coefficients Progression for Lasso Paths')
#%
# plot mean square error for each fold
m_log_alphascv = -np.log10(model.cv_alphas_)
plt.figure()
# NOTE(review): newer scikit-learn releases appear to expose this attribute as
# `mse_path_` instead of `cv_mse_path_` - confirm against the installed version.
plt.plot(m_log_alphascv, model.cv_mse_path_, ':')
# solid black line: mean over the last axis, labelled as the average across the folds
plt.plot(m_log_alphascv, model.cv_mse_path_.mean(axis=-1), 'k',
label='Average across the folds', linewidth=2)
plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k',
label='alpha CV')
plt.legend()
plt.xlabel('-log(alpha)')
plt.ylabel('Mean squared error')
plt.title('Mean squared error on each fold')
#%
# MSE from training and test data
from sklearn.metrics import mean_squared_error
train_error = mean_squared_error(tar_train, model.predict(pred_train))
test_error = mean_squared_error(tar_test, model.predict(pred_test))
print ('training data MSE')
print(train_error)
print ('test data MSE')
print(test_error)
#%
# R-square from training and test data
rsquared_train=model.score(pred_train,tar_train)
rsquared_test=model.score(pred_test,tar_test)
print ('training data R-square')
print(rsquared_train)
print ('test data R-square')
print(rsquared_test)
Now I want to run it again and got the following warning:
DeprecationWarning: This module was deprecated in version 0.18 in
favor of the model_selection module into which all the refactored
classes and functions are moved.
How can I rewrite this code using model_selection ?
The only thing I can see here that used the cross_validation module is train_test_split.
So just change your import from:
from sklearn.cross_validation import train_test_split
to:
from sklearn.model_selection import train_test_split
and you are good to go.

Categories