linear regression loop for residuals scatterplot - python

I'm running a linear regression simulation, fitting one model for each value of the "label" variable. I can print metrics for each model, but I'm not able to draw a separate scatterplot for each model: all the graphs end up in a single scatterplot. I would like to print a metric and draw a different scatterplot for each model.
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from scipy.stats import binom
from scipy.stats import norm
import numpy as np
import seaborn as sns

# Simulated data: feature and target are independent N(0, 1) draws and the
# grouping label is a Binomial(n=10, p=0.8) draw.
x = norm.rvs(size=10000, loc=0, scale=1)
y = norm.rvs(size=10000, loc=0, scale=1)
z = binom.rvs(n=10, p=0.8, size=10000)
df = pd.DataFrame(data={'v1': x.flatten(), 'target': y.flatten(), 'label': z.flatten()})

classes = df.label.unique().tolist()
results = []
for name in classes:
    # Copy the slice so that adding the 'pred' column below does not write
    # into a selection of df (avoids pandas' SettingWithCopyWarning).
    df_subset = df.loc[df['label'] == name].copy()
    features = df_subset['v1'].values.reshape(-1, 1)
    target = df_subset["target"].values.reshape(-1, 1)

    # One model per label value, fitted on that label's rows only.
    reg = LinearRegression()
    reg.fit(features, target)
    predictions = reg.predict(features)

    # Mean squared error of this model on the rows it was fitted on.
    res = np.mean((predictions - target) ** 2)
    results.append(res)
    msg = "Metric model %s: %f " % (name, res)
    print(msg)

    df_subset['pred'] = predictions.ravel()
    # NOTE: no new figure is created per iteration, so every model is drawn
    # onto the same axes -- this is the behaviour the question asks about.
    sns.scatterplot(data=df_subset, x='pred', y="target")

Just create a new figure before sns plot.
plt.figure() <---
After the sns plot, call plt.show() so that each model's printed metric appears before its own plot.
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from scipy.stats import binom
from scipy.stats import norm
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Simulated data: feature and target are independent N(0, 1) draws and the
# grouping label is a Binomial(n=10, p=0.8) draw.
x = norm.rvs(size=10000, loc=0, scale=1)
y = norm.rvs(size=10000, loc=0, scale=1)
z = binom.rvs(n=10, p=0.8, size=10000)
df = pd.DataFrame(data={'v1': x.flatten(), 'target': y.flatten(), 'label': z.flatten()})

classes = df.label.unique().tolist()
results = []
for name in classes:
    # Copy the slice so that adding the 'pred' column below does not write
    # into a selection of df (avoids pandas' SettingWithCopyWarning).
    df_subset = df.loc[df['label'] == name].copy()
    features = df_subset['v1'].values.reshape(-1, 1)
    target = df_subset["target"].values.reshape(-1, 1)

    # One model per label value, fitted on that label's rows only.
    reg = LinearRegression()
    reg.fit(features, target)
    predictions = reg.predict(features)

    # Mean squared error of this model on the rows it was fitted on.
    res = np.mean((predictions - target) ** 2)
    results.append(res)
    msg = "Metric model %s: %f " % (name, res)
    print(msg)

    df_subset['pred'] = predictions.ravel()
    plt.figure()  # <-- here: start a fresh figure for this model
    sns.scatterplot(data=df_subset, x='pred', y="target")
    plt.show()  # <-- here: render it before the next metric is printed

I would recommend installing matplotlib library and then
import matplotlib.pyplot as plt

# ... build df, classes and results exactly as in the question ...

for name in classes:
    # ... fit the model, print the metric and add the 'pred' column ...
    sns.scatterplot(data=df_subset, x='pred', y="target")
    # Use the label value in the file name so each model gets its own file;
    # a counter that is never incremented would overwrite the same file.
    plt.savefig('plot_' + str(name))
    # Clear the current figure so the next model starts from empty axes.
    plt.clf()

Related

Getting error while making scatter plot with pyplot

I am trying to make scatter plots with pyplot, but the axes are covered in black bars and the points fall along a line rather than being scattered. I'm running k-means on this data set: https://archive.ics.uci.edu/ml/datasets/wholesale+customers#. I want to show scatter plots of every pair of attributes, coloured by cluster.
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.impute import SimpleImputer
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
# Let pandas consume the header row so every column is parsed as a number.
# Reading with header=None turns the header into a data row, which makes all
# values strings; matplotlib then treats them as categories, producing the
# crowded tick labels and the points falling along a line.
raw = pd.read_csv('wholesale_customers.csv', sep=',')

# Keep columns 2..7, i.e. drop the first two columns as the original script
# did, and give the six remaining attributes short names.
# NOTE(review): presumably the two dropped columns are Channel and Region --
# confirm against the CSV header.
columns = ['fresh', 'milk', 'grocery', 'frozen', 'detergents', 'delicatessen']
ds = raw.iloc[:, 2:8].copy()
ds.columns = columns

# Per-attribute summary; the mean uses the actual row count instead of a
# hard-coded 440.
for name in columns:
    values = ds[name]
    print(name, " min: ", values.min(), ", max: ", values.max(), ", mean: ", values.mean())

# Cluster on all six attributes at once.
kmeans = KMeans(n_clusters=3, random_state=0)
label = kmeans.fit_predict(ds.values)

# One scatter plot per unordered pair of attributes, coloured by cluster.
for i, x_name in enumerate(columns):
    for y_name in columns[i + 1:]:
        plt.scatter(ds[x_name], ds[y_name], c=label)
        plt.xlabel(x_name)
        plt.ylabel(y_name)
        plt.show()

Too many lines and curves on the polynomial graph

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.preprocessing import PolynomialFeatures
df = pd.read_csv("C:\\Users\\MONSTER\\Desktop\\dosyalar\\datasets\\Auto.csv")
x = df["horsepower"].to_numpy().reshape(-1, 1)
y = df["mpg"].to_numpy()

# Expand horsepower into polynomial terms up to degree 5. fit_transform
# already fits the transformer, so it must not be fitted again on its own
# (already expanded) output.
poly = PolynomialFeatures(degree=5)
X_poly = poly.fit_transform(x)

# Ordinary least squares on the expanded features.
lr = LinearRegression()
lr.fit(X_poly, y)
y_pred = lr.predict(X_poly)

plt.scatter(x, y, color="blue", marker=".")
# NOTE: x is still in file order here, so consecutive line segments jump back
# and forth across the plot -- this is the tangled curve the question is about.
plt.plot(x, y_pred, color="red")
I have tried to draw a polynomial regression curve, but I couldn't manage it. Someone told me to sort the values with "numpy.argsort" before plotting, but nothing changed. How can I fix it?
probably scatter is better for you:
plt.scatter(x,y_pred,color="red")
Or with argsort as mentioned:
# Sort by x so the curve is drawn from left to right, and plot the model's
# predictions (not the observed y) to get the fitted regression curve.
orders = np.argsort(x.ravel())
plt.plot(x[orders], y_pred[orders], color='red')

Polynomial Regression plot not showing correctly

I ran this code for polynomial regression using sklearn, but my plot is not what I was expecting. As you can see here, I'm not getting a smooth line; it jumps from one point to another. From my understanding I have to sort X, but when I do that, all I get is an empty plot with a straight line.
import operator
import numpy as np
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score
import statsmodels.formula.api as smf
# Raw string: '\M' is not a valid escape sequence, so the backslash must be
# kept literally without relying on Python's fallback for unknown escapes.
df = pd.read_csv(r'D:\Mall_Customers.csv', usecols=['Age', 'Annual Income (k$)', 'Spending Score (1-100)'])

# Standardise the three columns and split the rows into three clusters.
x = StandardScaler().fit_transform(df)
kmeans = KMeans(n_clusters=3, max_iter=100)
y_kmeans = kmeans.fit_predict(x)

# Row positions of each cluster, as a dict and as [cluster, positions] pairs.
mydict = {i: np.where(kmeans.labels_ == i)[0] for i in range(kmeans.n_clusters)}
dictlist = [[key, value] for key, value in mydict.items()]

# Regress spending score on age and income for the rows of cluster 0 only.
df0 = df[df.index.isin(mydict[0].tolist())]
X = df0[['Age', 'Annual Income (k$)']]
Y = df0['Spending Score (1-100)']

# Degree-2 polynomial fit, scored on the data it was fitted on.
poly_reg = PolynomialFeatures(degree=2)
X_poly = poly_reg.fit_transform(X)
model = LinearRegression()
model.fit(X_poly, Y)
y_poly_pred = model.predict(X_poly)
r2 = r2_score(Y, y_poly_pred)
print(r2)

# The same model expressed as a pipeline; the polynomial expansion already
# contains the constant term, hence fit_intercept=False.
model = make_pipeline(PolynomialFeatures(degree=2), LinearRegression(fit_intercept=False))
model.fit(X, Y)

plt.scatter(X.iloc[:, 1], Y, color='red')
# NOTE: this draws the observed Y against both feature columns in row order,
# not the model's predictions -- this is the jumpy line the question is about.
plt.plot(X, Y, color='blue')
plt.xlabel('Age. Annual income')
plt.ylabel('Spending Score')
plt.show()
TL;DR: the data is not linearly dependent.
The graph got so messy because you plotted X (the training features) against Y (the observed target values) rather than against the model's predictions, and because:
the data itself is messy and not really linearly dependent.
Together, these are what produced this messy graph.
I suggest you to:
Split the data into train and test sets; after training the model, check the error on the test set, and maybe create two plots: one with the model's predictions for the test data and one with the actual values for the test data.
and change plot code to this:
.
# Predict with the fitted model first; Y_pred is not defined otherwise.
Y_pred = model.predict(X)
# X has two columns, so plot against a single feature (the second column,
# annual income, which the question's scatter plot already uses).
income = X.iloc[:, 1].to_numpy()
order = np.argsort(income)
plt.scatter(income, Y)
# Sorted by income so the line runs left to right. The predictions also
# depend on Age, so this line is not expected to be perfectly smooth.
plt.plot(income[order], Y_pred[order], color='red')
plt.show()

NameError: name 'predictions' is not defined

i am running the below code and getting this error. Please help:
Error: NameError: name 'predictions' is not defined
Code:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn import linear_model
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns; sns.set(color_codes=True)
import seaborn
from datetime import date
from datetime import datetime
today = date.today()
sns.set_color_codes("dark")
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score
d3 = today.strftime("%Y%m%d")
# Path of the drop file for today; d3 is today's date formatted as YYYYMMDD.
d5 = "S:\\Investment Process\\LCRV_Strategy\\1.VOLS Pack\\Drop\\Main_HY.CDX."+d3+".csv"
# Load the CSV, skipping its first three rows; `data` and `df` are two names
# for the same DataFrame.
data = df=pd.read_csv(d5, skiprows=3)
#df.head()
plt.figure(figsize=(11.5, 8.5))
# All observations in black.
plt.scatter(
df['1M 10-50 HY'],
df['Spread'],
c='black'
)
# The last row of the file, highlighted in orange.
plt.scatter(x='1M 10-50 HY', y='Spread', data=data.iloc[-1], c='orange')
plt.xlabel("1M 10-50 HY")
plt.ylabel("Spread")
# NOTE(review): `predictions` is used here but is not assigned anywhere in
# this script, and the model is only fitted further down -- this call is what
# raises "NameError: name 'predictions' is not defined".
plt.plot(
df['1M 10-50 HY'],
predictions,
c='blue',
linewidth=2
)
# Reshape both columns to (n, 1) arrays and fit a one-variable linear model.
X = df['1M 10-50 HY'].values.reshape(-1,1)
y = df['Spread'].values.reshape(-1,1)
reg = LinearRegression()
reg.fit(X, y)
print("The linear model is: Y = {:.5} + {:.5}X".format(reg.intercept_[0], reg.coef_[0][0]))
# X and y are rebound to the original pandas Series from here on.
X = df['1M 10-50 HY']
y = df['Spread']
# Disabled statsmodels OLS summary.
#X2 = sm.add_constant(X)
#est = sm.OLS(y, X2)
#est2 = est.fit()
#print(est2.summary())
plt.show()
That is because Python does not know what "predictions" is: the name is used in plt.plot() before anything has been assigned to it.
if you want to predict you must call
# X (upper case) is the 2-D feature array that was passed to reg.fit().
predictions = reg.predict(X)
after the reg.fit() line.
then you can plot.

Classifications after better cluster found - Sklearn

I am using k-means to classify my data.
I found the best number of clusters k with the elbow method and used the silhouette score to validate the decision.
How can I now classify my data and plot a distribution chart?
Could you please help me with this?
This is my code.
import pandas as pd
import seaborn as sns
from sklearn.datasets import load_iris
from sklearn.cluster import KMeans
from sklearn import preprocessing
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_score
%matplotlib inline
df_diabetes = pd.read_csv('diabetes.csv')

# Drop the "Classe" column so that only the feature columns remain.
df_noclass = df_diabetes.drop('Classe', axis=1)
df_noclass.head()

# Rescale every feature with MinMaxScaler and rebuild a DataFrame that keeps
# the original column names.
nomes = df_noclass.columns
valores = df_noclass.values
escala_min_max = preprocessing.MinMaxScaler()
valores_normalizados = escala_min_max.fit_transform(valores)
df_diabetes_normalizado = pd.DataFrame(valores_normalizados)
df_diabetes_normalizado.columns = nomes
df_diabetes_normalizado.head(5)

# Elbow method: within-cluster sum of squares (inertia) for k = 1..9.
# The labels are deliberately not written back into the frame here: a
# "clusters" column would become an extra feature in every later fit,
# including the silhouette loop below.
sse = {}
for k in range(1, 10):
    kmeans = KMeans(n_clusters=k, max_iter=1000).fit(df_diabetes_normalizado)
    sse[k] = kmeans.inertia_

plt.figure(figsize=(14, 9))
plt.plot(list(sse.keys()), list(sse.values()))
plt.xlabel("Numero de Clusters")
plt.ylabel("SSE")
plt.show()

# Silhouette coefficient for k = 2..10 on the same normalised features.
X = df_diabetes_normalizado
y = df_diabetes_normalizado
for n_cluster in range(2, 11):
    kmeans = KMeans(n_clusters=n_cluster).fit(X)
    label = kmeans.labels_
    sil_coeff = silhouette_score(X, label, metric='euclidean')
    print("Para n_clusters={}, O Coeficiente de silueta é {}".format(n_cluster, sil_coeff))
I now need to classify my data and create a plot like the image below.
If you want to predict which cluster your new data belongs to, you need to use the predict method:
kmeans.predict(newData)
Here is the documentation link for the predict method:
https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html#sklearn.cluster.KMeans.predict

Categories