In Python 2.7.6, with matplotlib and scikit-learn 0.17.0: when I draw polynomial regression lines on a scatter plot, the polynomial curves come out really messy, like this:
The script is below: it reads two columns of floating-point data, makes a scatter plot, and fits the regressions.
import pandas as pd
import scipy.stats as stats
import pylab
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import pylab as pl
import sklearn
from sklearn import preprocessing
from sklearn.cross_validation import train_test_split
from sklearn import datasets, linear_model
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import Ridge
df=pd.read_csv("boston_real_estate_market_clean.csv")
LSTAT = df['LSTAT'].as_matrix()
LSTAT=LSTAT.reshape(LSTAT.shape[0], 1)
MEDV=df['MEDV'].as_matrix()
MEDV=MEDV.reshape(MEDV.shape[0], 1)
# Train test set split
X_train1, X_test1, y_train1, y_test1 = train_test_split(LSTAT,MEDV,test_size=0.3,random_state=1)
# Polynomial regression, n-th order
plt.scatter(X_test1, y_test1, s=10, alpha=0.3)
for degree in [1, 2, 3, 4, 5]:
    model = make_pipeline(PolynomialFeatures(degree), Ridge())
    model.fit(X_train1, y_train1)
    y_plot = model.predict(X_test1)
    plt.plot(X_test1, y_plot, label="degree %d" % degree
             + '; $q^2$: %.2f' % model.score(X_train1, y_train1)
             + '; $R^2$: %.2f' % model.score(X_test1, y_test1))
plt.legend(loc='upper right')
plt.show()
I guess the reason is that X_test1 and y_plot are not sorted properly?
X_test1 is a numpy array like this:
[[ 5.49]
[ 16.65]
[ 17.09]
....
[ 25.68]
[ 24.39]]
y_plot is a numpy array like this:
[[ 29.78517812]
[ 17.16759833]
[ 16.86462359]
[ 23.18680265]
...[ 37.7631725 ]]
I tried to sort them with this:
[X_test1, y_plot] = zip(*sorted(zip(X_test1, y_plot), key=lambda pair: pair[0]))
plt.plot(X_test1, y_plot, label="degree %d" % degree
         + '; $q^2$: %.2f' % model.score(X_train1, y_train1)
         + '; $R^2$: %.2f' % model.score(X_test1, y_test1))
The curve looks normal now, but the result is weird: the R^2 is negative.
Could anyone show me what the real issue is, or how to sort properly here? Thank you!
While the plot is now correct, you messed up the pairing of X_test1 to y_test1 while sorting, because you forgot to also sort y_test1 the same way; model.score then compares mismatched pairs, which is where the negative R^2 comes from.
The best solution is to sort right after the split. Then y_plot, which is computed later, will automatically come out in the right order (untested example, using numpy as np):
X_train1, X_test1, y_train1, y_test1 = train_test_split(LSTAT,MEDV,test_size=0.3,random_state=1)
sorted_index = np.argsort(X_test1.ravel())  # X_test1 has shape (n, 1); flatten to get 1-D indices
X_test1 = X_test1[sorted_index]
y_test1 = y_test1[sorted_index]
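For completeness, here is a minimal sketch of how the plotting loop looks once the test set is sorted up front (untested; it reuses the variable names and pipeline from the question):

import numpy as np
import matplotlib.pyplot as plt
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge

# sort the test set once, right after train_test_split
order = np.argsort(X_test1.ravel())
X_test1, y_test1 = X_test1[order], y_test1[order]

plt.scatter(X_test1, y_test1, s=10, alpha=0.3)
for degree in [1, 2, 3, 4, 5]:
    model = make_pipeline(PolynomialFeatures(degree), Ridge())
    model.fit(X_train1, y_train1)
    y_plot = model.predict(X_test1)  # already in x-order, so the line is smooth
    plt.plot(X_test1, y_plot,
             label='degree %d; $R^2$: %.2f' % (degree, model.score(X_test1, y_test1)))
plt.legend(loc='upper right')
plt.show()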
Related
I'm running a linear regression simulation, fitting one model per value of the "label" variable. I can print metrics for each model, but I'm not able to produce a separate scatterplot for each model; all the graphs end up in a single scatterplot. I would like a metric and a separate scatterplot for each model.
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from scipy.stats import binom, norm

# generate random numbers from N(0, 1)
x = norm.rvs(size=10000, loc=0, scale=1)
y = norm.rvs(size=10000, loc=0, scale=1)
z = binom.rvs(n=10, p=0.8, size=10000)
df = pd.DataFrame(data={'v1': x.flatten(), 'target': y.flatten(), 'label': z.flatten()})
classes = df.label.unique().tolist()
results = []
for name in classes:
    df_subset = df.loc[df['label'] == name]
    reg = LinearRegression()
    reg.fit(df_subset['v1'].values.reshape(-1, 1), df_subset["target"].values.reshape(-1, 1))
    predictions = reg.predict(df_subset['v1'].values.reshape(-1, 1))
    res = np.mean((predictions - df_subset["target"].values.reshape(-1, 1)) ** 2)
    results.append(res)
    msg = "Metric model %s: %f" % (name, res)
    print(msg)
    df_subset['pred'] = predictions
    sns.scatterplot(data=df_subset, x='pred', y="target")
Just create a new figure with plt.figure() before the sns plot, and call plt.show() after the sns plot, so that the print statement (the model metric) appears before each plot:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from scipy.stats import binom, norm

# generate random numbers from N(0, 1)
x = norm.rvs(size=10000, loc=0, scale=1)
y = norm.rvs(size=10000, loc=0, scale=1)
z = binom.rvs(n=10, p=0.8, size=10000)
df = pd.DataFrame(data={'v1': x.flatten(), 'target': y.flatten(), 'label': z.flatten()})
classes = df.label.unique().tolist()
results = []
for name in classes:
    df_subset = df.loc[df['label'] == name]
    reg = LinearRegression()
    reg.fit(df_subset['v1'].values.reshape(-1, 1), df_subset["target"].values.reshape(-1, 1))
    predictions = reg.predict(df_subset['v1'].values.reshape(-1, 1))
    res = np.mean((predictions - df_subset["target"].values.reshape(-1, 1)) ** 2)
    results.append(res)
    msg = "Metric model %s: %f" % (name, res)
    print(msg)
    plt.figure()  # <----------- here
    df_subset['pred'] = predictions
    sns.scatterplot(data=df_subset, x='pred', y="target")
    plt.show()  # <------------ here
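This works because each call to sns.scatterplot draws into the current matplotlib axes; without a fresh plt.figure() per iteration, every model's points land on the same axes, which is why all the graphs ended up in a single scatterplot.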
I would recommend installing the matplotlib library and then saving each figure to its own file:

import matplotlib.pyplot as plt

i = 0  # figure counter
# ...
# inside your for loop
plot = sns.scatterplot(data=df_subset, x='pred', y="target")
plt.savefig('plot_' + str(i))  # one file per model
plt.clf()  # clear the figure before the next model
i += 1
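Since the loop already iterates over classes, enumerate keeps the counter and the loop variable together; a sketch reusing the loop from the question:

for i, name in enumerate(classes):
    df_subset = df.loc[df['label'] == name]
    # ... fit the model and compute predictions as in the question ...
    df_subset['pred'] = predictions
    sns.scatterplot(data=df_subset, x='pred', y="target")
    plt.savefig('plot_' + str(i))
    plt.clf()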
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.preprocessing import PolynomialFeatures
df = pd.read_csv("C:\\Users\\MONSTER\\Desktop\\dosyalar\\datasets\\Auto.csv")
x = df["horsepower"].to_numpy()
y = df["mpg"].to_numpy()
x = x.reshape(-1,1)
poly = PolynomialFeatures(degree = 5)
X_poly = poly.fit_transform(x)
poly.fit(X_poly,y)
lr = LinearRegression()
lr.fit(X_poly, y)
y_pred = lr.predict(X_poly)
plt.scatter(x,y,color="blue",marker=".")
plt.plot(x,y_pred,color="red")
I have tried to draw a polynomial regression curve, but I couldn't manage it. Someone told me to sort the values before plotting via numpy.argsort, but nothing changed. How can I fix it?
Probably scatter is better for you:
plt.scatter(x, y_pred, color="red")
Or, with argsort as mentioned (note that it is the predictions y_pred, not the raw y, that belong on the curve):
orders = np.argsort(x.ravel())
plt.plot(x[orders], y_pred[orders], color='red')
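Alternatively, sorting x once before fitting avoids the index bookkeeping entirely. A minimal runnable sketch with synthetic data (the Auto.csv horsepower/mpg columns aren't available here, so the numbers below are made up):

import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

rng = np.random.RandomState(0)
x = np.sort(rng.uniform(0, 10, size=200)).reshape(-1, 1)  # sorted up front
y = 5.0 - 2.0 * x.ravel() + 0.3 * x.ravel() ** 2 + rng.normal(0, 1, size=200)

X_poly = PolynomialFeatures(degree=5).fit_transform(x)
lr = LinearRegression().fit(X_poly, y)
y_pred = lr.predict(X_poly)

plt.scatter(x, y, color="blue", marker=".")
plt.plot(x, y_pred, color="red")  # x is sorted, so the curve is smooth
plt.show()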
I ran this code for polynomial regression using sklearn, but my plot is not what I was expecting. As you can see, I'm not getting a smooth line; it jumps from one point to another. From my understanding I have to sort X, but when I do that, all I get is an empty plot with a linear line.
import operator
import numpy as np
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score
import statsmodels.formula.api as smf
df = pd.read_csv(r'D:\Mall_Customers.csv', usecols=['Age', 'Annual Income (k$)', 'Spending Score (1-100)'])
x = StandardScaler().fit_transform(df)
kmeans = KMeans(n_clusters=3, max_iter=100)
y_kmeans= kmeans.fit_predict(x)
mydict = {i: np.where(kmeans.labels_ == i)[0] for i in range(kmeans.n_clusters)}
dictlist = []
for key, value in mydict.items():
    temp = [key, value]
    dictlist.append(temp)
df0 = df[df.index.isin(mydict[0].tolist())]
X = df0[['Age', 'Annual Income (k$)']]
Y = df0['Spending Score (1-100)']
poly_reg = PolynomialFeatures(degree=2)
X_poly = poly_reg.fit_transform(X)
model = LinearRegression()
model.fit(X_poly, Y)
y_poly_pred = model.predict(X_poly)
r2 = r2_score(Y,y_poly_pred)
print(r2)
model = make_pipeline(PolynomialFeatures(degree=2), LinearRegression(fit_intercept = False))
model.fit(X,Y)
plt.scatter(X.iloc[:, 1], Y, color='red')
plt.plot(X, Y, color='blue')
plt.xlabel('Age. Annual income')
plt.ylabel('Spending Score')
plt.show()
TL;DR: the data is not linearly dependent.
The reason the graph got so messy is that you plotted X (the training data) against Y (the actual target values, not the model's predictions); since the data itself is noisy and not really linearly dependent, the result is this messy graph.
I suggest you split the data into train and test sets; after you train the model, check the error on the test set, and maybe create two plots: one with the model's predictions on the test data and one with the actual targets for the test data.
Also change the plotting code to this:
plt.scatter(X, Y)
plt.plot(X, Y_pred, color='red')
plt.show()
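A rough sketch of that suggestion, reusing X and Y from the question (untested; since X has two features, the predictions are plotted against one of them, Age, purely for visualization):

from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0)
model = make_pipeline(PolynomialFeatures(degree=2), LinearRegression())
model.fit(X_train, Y_train)
Y_pred = model.predict(X_test)

plt.scatter(X_test['Age'], Y_test, label='actual')
plt.scatter(X_test['Age'], Y_pred, color='red', label='predicted')
plt.xlabel('Age')
plt.ylabel('Spending Score')
plt.legend()
plt.show()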
I am trying to plot multiple ROC curves on one plot by varying a variable in a cell of a pandas dataframe.
In a particular row, if the total is above a certain threshold, the row is classified as an invoice. I want to plot a different curve for each threshold of total.
This is the code I have so far; it measures basic metrics and attempts to plot the ROC curve, but I have been unsuccessful so far.
import os
import pandas as pd
from sklearn import datasets, metrics, model_selection, svm
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import scikitplot as skplt
import matplotlib.pyplot as plt
import numpy as np
df = pd.read_csv("test_results.csv", header = 0)
true_array = list(df["actual"].to_numpy())
predicted_array = list(df["predicted"].to_numpy())
accuracy = accuracy_score(true_array, predicted_array)
precision, recall, fscore, support = score(true_array, predicted_array, average = None, labels = ['invoice', 'non-invoice'])
print("Labels: \t invoice", "non-invoice")
print('Accuracy: \t {}'.format(accuracy))
print('Precision: \t {}'.format(precision))
print('Recall: \t {}'.format(recall))
print('Fscore: \t {}'.format(fscore))
skplt.metrics.plot_roc_curve(true_array, predicted_array)
plt.show()
The error I am getting is:
fpr[i], tpr[i], _ = roc_curve(y_true, probas[:, i],
IndexError: too many indices for array
Any help would be appreciated.
The documentation linked below mentions that skplt.metrics.plot_roc_curve takes the ground-truth (correct) target values and the prediction probabilities for each class returned by a classifier. So you should change the second input, predicted_array, from predicted labels to per-class probabilities.
https://scikit-plot.readthedocs.io/en/stable/metrics.html?highlight=roc#scikitplot.metrics.plot_roc
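For instance, if the trained classifier is still available, something along these lines should work (a sketch; clf and X_test are assumed names for your fitted classifier and its input features, not variables from the question):

import scikitplot as skplt
import matplotlib.pyplot as plt

# per-class probabilities, shape (n_samples, n_classes),
# instead of the hard 'invoice' / 'non-invoice' label predictions
probas = clf.predict_proba(X_test)

skplt.metrics.plot_roc_curve(true_array, probas)
plt.show()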
In 2016, I ran a lasso regression model using the code below:
#Import required packages
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pylab as plt
import matplotlib.pyplot as plp
import seaborn as sns
import statsmodels.formula.api as smf
from scipy import stats
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LassoLarsCV
# split data into train and test sets
pred_train, pred_test, tar_train, tar_test = train_test_split(predictors, target, test_size=.4, random_state=123)
#%
# specify the lasso regression model
model=LassoLarsCV(cv=10, precompute=False).fit(pred_train,tar_train)
#%
# print variable names and regression coefficients
dict(zip(predictors.columns, model.coef_))
#regcoef.to_csv('variable+regresscoef.csv')
#%%
# plot coefficient progression
m_log_alphas = -np.log10(model.alphas_)
ax = plt.gca()
plt.plot(m_log_alphas, model.coef_path_.T)
plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k',
            label='alpha CV')
plt.ylabel('Regression Coefficients')
plt.xlabel('-log(alpha)')
plt.title('Regression Coefficients Progression for Lasso Paths')
#%
# plot mean square error for each fold
m_log_alphascv = -np.log10(model.cv_alphas_)
plt.figure()
plt.plot(m_log_alphascv, model.cv_mse_path_, ':')
plt.plot(m_log_alphascv, model.cv_mse_path_.mean(axis=-1), 'k',
         label='Average across the folds', linewidth=2)
plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k',
            label='alpha CV')
plt.legend()
plt.xlabel('-log(alpha)')
plt.ylabel('Mean squared error')
plt.title('Mean squared error on each fold')
#%
# MSE from training and test data
from sklearn.metrics import mean_squared_error
train_error = mean_squared_error(tar_train, model.predict(pred_train))
test_error = mean_squared_error(tar_test, model.predict(pred_test))
print ('training data MSE')
print(train_error)
print ('test data MSE')
print(test_error)
#%
# R-square from training and test data
rsquared_train=model.score(pred_train,tar_train)
rsquared_test=model.score(pred_test,tar_test)
print ('training data R-square')
print(rsquared_train)
print ('test data R-square')
print(rsquared_test)
Now I want to run it again, and I get the following warning:
DeprecationWarning: This module was deprecated in version 0.18 in
favor of the model_selection module into which all the refactored
classes and functions are moved.
How can I rewrite this code using model_selection ?
The only thing I can see here that used the cross_validation module is train_test_split.
So just change your import from:
from sklearn.cross_validation import train_test_split
to:
from sklearn.model_selection import train_test_split
and you are good to go.
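The top of the script would then look like this (nothing else changes, since LassoLarsCV still lives in sklearn.linear_model):

import pandas as pd
import numpy as np
import matplotlib.pylab as plt
from sklearn.model_selection import train_test_split  # moved here from sklearn.cross_validation
from sklearn.linear_model import LassoLarsCV
from sklearn.metrics import mean_squared_error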