NameError: name 'predictions' is not defined

I am running the code below and getting this error. Please help:
Error: NameError: name 'predictions' is not defined
Code:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set(color_codes=True)
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, accuracy_score
from datetime import date, datetime
today = date.today()
sns.set_color_codes("dark")
d3 = today.strftime("%Y%m%d")
d5 = "S:\\Investment Process\\LCRV_Strategy\\1.VOLS Pack\\Drop\\Main_HY.CDX."+d3+".csv"
data = df = pd.read_csv(d5, skiprows=3)  # 'data' and 'df' name the same DataFrame
#df.head()
plt.figure(figsize=(11.5, 8.5))
plt.scatter(
    df['1M 10-50 HY'],
    df['Spread'],
    c='black'
)
plt.scatter(x='1M 10-50 HY', y='Spread', data=data.iloc[-1], c='orange')
plt.xlabel("1M 10-50 HY")
plt.ylabel("Spread")
plt.plot(
    df['1M 10-50 HY'],
    predictions,  # <-- never assigned anywhere above: this raises the NameError
    c='blue',
    linewidth=2
)
X = df['1M 10-50 HY'].values.reshape(-1,1)
y = df['Spread'].values.reshape(-1,1)
reg = LinearRegression()
reg.fit(X, y)
print("The linear model is: Y = {:.5} + {:.5}X".format(reg.intercept_[0], reg.coef_[0][0]))
X = df['1M 10-50 HY']
y = df['Spread']
#X2 = sm.add_constant(X)
#est = sm.OLS(y, X2)
#est2 = est.fit()
#print(est2.summary())
plt.show()

Of course, that is because the Python interpreter does not know what "predictions" is: it is used in the plt.plot() call before it is ever assigned.
If you want to predict, you must call
predictions = reg.predict(X)
after the reg.fit() line (note: X, the reshaped array the model was fit on), and move the plt.plot() call that uses predictions below that.
Then you can plot.
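A minimal reordering of the relevant lines from the question (same imports, DataFrame, and column names as above):

X = df['1M 10-50 HY'].values.reshape(-1, 1)
y = df['Spread'].values.reshape(-1, 1)
reg = LinearRegression()
reg.fit(X, y)                  # fit first
predictions = reg.predict(X)   # now 'predictions' exists
plt.scatter(df['1M 10-50 HY'], df['Spread'], c='black')
plt.plot(df['1M 10-50 HY'], predictions, c='blue', linewidth=2)
plt.xlabel("1M 10-50 HY")
plt.ylabel("Spread")
plt.show()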

Related

linear regression loop for residuals scatterplot

I'm running a linear regression simulation, fitting one model for each value of the "label" variable. I can print metrics for each model, but I can't produce a separate scatterplot for each model: all the graphs end up in a single scatterplot. I would like a metric and a separate scatterplot for each model.
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from scipy.stats import binom, norm
# generate random numbers from N(0,1)
x = norm.rvs(size=10000, loc=0, scale=1)
y = norm.rvs(size=10000, loc=0, scale=1)
z = binom.rvs(n=10, p=0.8, size=10000)
df = pd.DataFrame(data={'v1': x.flatten(), 'target': y.flatten(), 'label': z.flatten()})
classes = df.label.unique().tolist()
results = []
for name in classes:
    df_subset = df.loc[df['label'] == name]
    reg = LinearRegression()
    reg.fit(df_subset['v1'].values.reshape(-1, 1), df_subset["target"].values.reshape(-1, 1))
    predictions = reg.predict(df_subset['v1'].values.reshape(-1, 1))
    res = np.mean((predictions - df_subset["target"].values.reshape(-1, 1)) ** 2)
    results.append(res)
    msg = "Metric model %s: %f " % (name, res)
    print(msg)
    df_subset['pred'] = predictions
    sns.scatterplot(data=df_subset, x='pred', y="target")
Just create a new figure with plt.figure() before each sns plot, and call plt.show() after each sns plot, so the print statement (the model metric) appears before the corresponding plot instead of everything landing on one set of axes.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from scipy.stats import binom, norm
# generate random numbers from N(0,1)
x = norm.rvs(size=10000, loc=0, scale=1)
y = norm.rvs(size=10000, loc=0, scale=1)
z = binom.rvs(n=10, p=0.8, size=10000)
df = pd.DataFrame(data={'v1': x.flatten(), 'target': y.flatten(), 'label': z.flatten()})
classes = df.label.unique().tolist()
results = []
for name in classes:
    df_subset = df.loc[df['label'] == name]
    reg = LinearRegression()
    reg.fit(df_subset['v1'].values.reshape(-1, 1), df_subset["target"].values.reshape(-1, 1))
    predictions = reg.predict(df_subset['v1'].values.reshape(-1, 1))
    res = np.mean((predictions - df_subset["target"].values.reshape(-1, 1)) ** 2)
    results.append(res)
    msg = "Metric model %s: %f " % (name, res)
    print(msg)
    plt.figure()  # <----------- here: start a new figure for this model
    df_subset['pred'] = predictions
    sns.scatterplot(data=df_subset, x='pred', y="target")
    plt.show()  # <------------ here: render it before the next iteration's print
I would recommend installing the matplotlib library and then:
import matplotlib.pyplot as plt

y = 0
...
# inside your for loop
plot = sns.scatterplot(data=df_subset, x='pred', y="target")
plt.savefig('plot_' + str(y))
plt.clf()
y += 1  # increment the counter so each model is saved to a new file
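As a side note (a sketch, not from the original answer), enumerate avoids the manual counter entirely:

for i, name in enumerate(classes):
    df_subset = df.loc[df['label'] == name]
    # ... fit the model and add the 'pred' column as above ...
    sns.scatterplot(data=df_subset, x='pred', y="target")
    plt.savefig('plot_' + str(i))  # writes plot_0.png, plot_1.png, ...
    plt.clf()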

Too many lines and curves on the polynomial graph

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.preprocessing import PolynomialFeatures
df = pd.read_csv("C:\\Users\\MONSTER\\Desktop\\dosyalar\\datasets\\Auto.csv")
x = df["horsepower"].to_numpy()
y = df["mpg"].to_numpy()
x = x.reshape(-1,1)
poly = PolynomialFeatures(degree = 5)
X_poly = poly.fit_transform(x)
poly.fit(X_poly,y)
lr = LinearRegression()
lr.fit(X_poly, y)
y_pred = lr.predict(X_poly)
plt.scatter(x,y,color="blue",marker=".")
plt.plot(x,y_pred,color="red")
I have tried to draw a polynomial regression curve, but I couldn't manage it. Someone told me to sort the values before plotting via numpy.argsort, but nothing changed. How can I fix it?
Probably scatter is better for you:
plt.scatter(x, y_pred, color="red")
Or with argsort as mentioned (note it is y_pred, not y, that belongs on the curve):
orders = np.argsort(x.ravel())
plt.plot(x[orders], y_pred[orders], color='red')
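Putting it together, a minimal end-to-end sketch of the argsort fix (assuming the question's CSV loads its numeric columns cleanly; the path is yours to fill in):

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

df = pd.read_csv("Auto.csv")  # path assumed; same columns as in the question
x = df["horsepower"].to_numpy().reshape(-1, 1)
y = df["mpg"].to_numpy()

poly = PolynomialFeatures(degree=5)
X_poly = poly.fit_transform(x)
lr = LinearRegression().fit(X_poly, y)
y_pred = lr.predict(X_poly)

orders = np.argsort(x.ravel())                            # indices that sort the points by x
plt.scatter(x, y, color="blue", marker=".")               # raw data
plt.plot(x.ravel()[orders], y_pred[orders], color="red")  # one smooth curve
plt.show()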

Using TSNE to dimensionality reduction. Why 3 D graph is not working?

I have used the Digits dataset from Sklearn and I have tried to reduce the dimension from 64 to 3 using TSNE( t-Distributed Stochastic Neighbor Embedding):
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
#%matplotlib inline
from sklearn.manifold import TSNE
from sklearn.datasets import load_digits
from mpl_toolkits.mplot3d import Axes3D
digits = load_digits()
digits_df = pd.DataFrame(digits.data,)
digits_df["target"] = pd.Series(digits.target)
tsne = TSNE(n_components=3)
digits_tsne = tsne.fit_transform(digits_df.iloc[:,:64])
digits_df_tsne = pd.DataFrame(digits_tsne,
                              columns=["Component1","Component2","Component3"])
finalDf = pd.concat([digits_df_tsne, digits_df["target"]], axis = 1)
#Visualizing 3D
figure = plt.figure(figsize=(9,9))
axes = figure.add_subplot(111,projection = "3d")
dots = axes.scatter(xs=finalDf[:,0], ys=finalDf[:,1], zs=finalDf[:,2],
                    c=digits.target, cmap=plt.cm.get_cmap("nipy_spectral_r",10))
(finalDf here has the four columns Component1, Component2, Component3, and target.)
The error:
TypeError: '(slice(None, None, None), 0)' is an invalid key
What is wrong? Could someone help me?
You're using NumPy-style positional slicing (finalDf[:, 0]) on a pandas DataFrame, which is not valid, so first convert the DataFrame to a NumPy array.
Here's the updated code:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
#%matplotlib inline
from sklearn.manifold import TSNE
from sklearn.datasets import load_digits
from mpl_toolkits.mplot3d import Axes3D
digits = load_digits()
digits_df = pd.DataFrame(digits.data,)
digits_df["target"] = pd.Series(digits.target)
tsne = TSNE(n_components=3)
digits_tsne = tsne.fit_transform(digits_df.iloc[:,:64])
digits_df_tsne = pd.DataFrame(digits_tsne,
                              columns=["Component1","Component2","Component3"])
finalDf = pd.concat([digits_df_tsne, digits_df["target"]], axis = 1)
#Visualizing 3D
figure = plt.figure(figsize=(9,9))
axes = figure.add_subplot(111,projection = "3d")
dots = axes.scatter(xs=finalDf.to_numpy()[:,0], ys=finalDf.to_numpy()[:,1], zs=finalDf.to_numpy()[:,2],
                    c=digits.target, cmap=plt.cm.get_cmap("nipy_spectral_r",10))
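As a side note (a sketch, not part of the original answer), you can also keep the DataFrame and index by column name, which avoids converting to NumPy at all:

dots = axes.scatter(xs=finalDf["Component1"], ys=finalDf["Component2"], zs=finalDf["Component3"],
                    c=digits.target, cmap=plt.cm.get_cmap("nipy_spectral_r", 10))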

How can I apply this clustering algorithm to my own data?

I'd like to replace the iris data with my own data. Please tell me what steps to follow to do that.
Thanks.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn.cluster import KMeans
from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import scale
import sklearn.metrics as sm
from sklearn import datasets
from sklearn.metrics import confusion_matrix, classification_report
plt.rc('figure', figsize=(7,4))
iris = datasets.load_iris()
X = scale(iris.data)
Y = pd.DataFrame(iris.target)
variable_name = iris.feature_names
X[0:10,]
clustering = KMeans(n_clusters=3,random_state=5)
clustering.fit(X)
iris_df = pd.DataFrame(iris.data)
iris_df.columns = ['Sepal_Length','Sepal_Width','Petal_Length','Petal_Width']
Y.columns = ['Targets']
The import section will stay the same.
Let's assume you have a DataFrame:
#read your dataframe(several types possible)
df = pd.read_csv('test.csv')
#you need to define a target variable (named target in my case) and the features X
Y = df['target']
X = df.drop(['target'], axis=1)
# here is where your k-means algorithm starts
clustering = KMeans(n_clusters=3,random_state=5)
clustering.fit(X)
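One detail worth keeping from the original snippet: it scaled the features before clustering. Since k-means is distance-based, you may want to do the same with your own data (scale is already imported above):

X = scale(X)  # standardize features before k-means so no column dominates the distances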
Let me add one more thing: what are you using k-means for? It is an unsupervised learning method, so you do not have any target variable.
Normally it should be:
df = pd.read_csv('test.csv')
#columns header you want to use
relevant_columns = ['A', 'B']
X = df[relevant_columns]
clustering = KMeans(n_clusters=3,random_state=5)
clustering.fit(X)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn.cluster import KMeans
from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import scale
import sklearn.metrics as sm
from sklearn import datasets
from sklearn.metrics import confusion_matrix,classification_report
# CHANGED CODE START
df = pd.read_excel('tmp.xlsx')
Y = df['target']
X = df.drop(['target'], axis=1)
# CHANGED CODE END
variable_name = X.columns
clustering = KMeans(n_clusters=3,random_state=5)
clustering.fit(X)
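As a follow-up (not part of the original answer), once the model is fitted you can inspect what k-means actually produced:

print(clustering.labels_)           # cluster index (0, 1, or 2) assigned to each row of X
print(clustering.cluster_centers_)  # coordinates of the three cluster centers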

AttributeError: 'OLSResults' object has no attribute 'norm_resid'

When I run this I have the following error :
AttributeError: 'OLSResults' object has no attribute 'norm_resid'
I have the latest version of statsmodels, so the attribute norm_resid should be there.
Any ideas ?
from scipy import stats
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
from sklearn import datasets, linear_model
from statsmodels.formula.api import ols
"""
Data Management
"""
data = pd.read_csv("TestExer1-sales-round1.csv")
X_train = data["Advertising"]
Y_train = data["Sales"]
# fit the regression with ols
model = ols("Y_train ~ X_train", data).fit()
print(model.summary())
plt.plot(X_train,Y_train , 'ro')
plt.plot(X_train, model.fittedvalues, 'b')
plt.legend(['Sales', 'Advertising'])
plt.ylim(0, 70)
plt.xlim(5, 18)
plt.hist(model.norm_resid())
plt.ylabel('Count')
plt.xlabel('Normalized residuals')
plt.xlabel('Temperature')
plt.ylabel('Gas')
plt.title('Before Insulation')
I had the same issue, but the following worked:
plt.hist(model.resid_pearson)
Thus your solution should look like:
from scipy import stats
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
from sklearn import datasets, linear_model
from statsmodels.formula.api import ols
"""
Data Management
"""
data = pd.read_csv("TestExer1-sales-round1.csv")
X_train = data["Advertising"]
Y_train = data["Sales"]
# fit the regression with ols
model = ols("Y_train ~ X_train", data).fit()
print(model.summary())
plt.plot(X_train,Y_train , 'ro')
plt.plot(X_train, model.fittedvalues, 'b')
plt.legend(['Sales', 'Advertising'])
plt.ylim(0, 70)
plt.xlim(5, 18)
plt.hist(model.resid_pearson)
plt.ylabel('Count')
plt.xlabel('Normalized residuals')
plt.xlabel('Temperature')
plt.ylabel('Gas')
plt.title('Before Insulation')
when using statsmodels version 0.8.0 or greater.
Note: the Pearson residuals only divide each residual by the standard error of the residuals, while normalisation also divides each residual by the sum of all residuals; see the statsmodels documentation for more.
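If you want to see what the replacement is doing, a minimal sketch (an assumption on my part, not from the original answer) standardizes the raw residuals by hand and plots those instead:

standardized = model.resid / np.std(model.resid)  # roughly what resid_pearson computes
plt.hist(standardized)
plt.ylabel('Count')
plt.xlabel('Normalized residuals')
plt.show()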
