Overlay Linear Regression Line on Scatter Plot (iPython Notebook) - python

# Read the photometry table from the GHOSTS M81 data URL and pull out the
# columns used below (the names follow the axis labels further down).
gh_data = ascii.read('http://dept.astro.lsa.umich.edu/~ericbell/data/GHOSTS/M81/ngc3031-field15.newphoto_radec')
ra = gh_data['col5'][:]
dec = gh_data['col6'][:]
f606 = gh_data['col3'][:]
f814 = gh_data['col4'][:]
# Plot F606W-F814W against F814W. The variables are f606/f814; the original
# referenced the undefined names f6062/f8142.
plot(f606 - f814, f814, 'bo', alpha=0.15)
# The y limits are passed as 27 -> 23, i.e. larger value first.
axis([-1, 2.5, 27, 23])
xlabel('F606W-F814W')
ylabel('F814W')
# NOTE(review): the file name says field15 but the title says 'Field 14' - confirm.
title('Field 14')
The data set is imported and organized into different columns. I am trying to overlay a line of best fit (a linear regression) on the scatter plot created above, but I cannot figure out how. Thanks in advance.

As @rayryeng pointed out, your code just plots the data, but doesn't actually compute any regression results to plot. Here's one way of doing it:
import statsmodels.api as sm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Toy data set: the integers 1..10 plus uniform noise in [0, 1) for each column.
# "y" is drawn first, then "x", so the random stream is consumed in that order.
data = pd.DataFrame(
    {
        "y": np.arange(1, 11) + np.random.rand(10),
        "x": np.arange(1, 11) + np.random.rand(10),
    }
)
Use statsmodels OLS method to fit a regression line, and params to extract the coefficient on the single regressor:
# Fit y on x with OLS. Only data.x is passed as the regressor (no constant
# column is added here), and beta_1 holds the fitted result's `params`.
beta_1 = sm.OLS(data.y, data.x).fit().params
Produce a scatterplot and add a regression line:
# Scatter the observations, then draw the fitted line over x = 1..10.
fig, ax = plt.subplots()
ax.scatter(data.x, data.y)
x_grid = range(1, 11)
fitted = [x * beta_1 for x in x_grid]
ax.plot(x_grid, fitted, label="best fit")
ax.legend(loc="best")

Related

How to find the 'peak' of a polynomial regression line in Matplotlib

Is it possible to find the peak (vertex?) values (x,y) of a polynomial regression line that was computed using Matplotlib?
I've included my basic setup below (of course with fuller data sets), as well as a screenshot of the actual regression line question.
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
# Degree of the polynomial features fed to the linear regression.
degree=6
# Minimal stand-in data: one column each for X and Y, indexed 0..2.
setX={'Fixation Duration': {0:1,1:2,2:3}}
setY={'Fixation Occurrences': {0:1,1:2,2:3}}
X_gall=pd.DataFrame.from_dict(setX)
Y_gall=pd.DataFrame.from_dict(setY)
# 300 evenly spaced x values between the min and max of X, as a column vector.
X_seqGall = np.linspace(X_gall.min(),X_gall.max(),300).reshape(-1,1)
# Pipeline: polynomial feature expansion followed by a linear regression.
polyregGall=make_pipeline(PolynomialFeatures(degree),LinearRegression())
polyregGall.fit(X_gall,Y_gall)
# Raw points, then the fitted curve evaluated on the dense x grid.
plt.scatter(X_gall,Y_gall, c="#1E4174", s=100.0, alpha=0.4)
plt.plot(X_seqGall,polyregGall.predict(X_seqGall),color="#1E4174", linewidth=4)
plt.show()
I would like to find the x, y values indicated by the red arrows.
You can find the maximum from the underlying plot data.
First, let's change your plotting commands to explicitly define the axes:
# Create the figure and axes explicitly so the drawn artists can be inspected later.
fig, ax = plt.subplots(figsize=(6,4))
# The scatter's return value is not needed; it is bound to `_`.
_ = ax.scatter(X_gall,Y_gall, c="#1E4174", s=100.0, alpha=0.4)
# Keep what ax.plot returns; the follow-up code reads the line data via poly[0].
poly = ax.plot(X_seqGall,polyregGall.predict(X_seqGall),color="#1E4174", linewidth=4)
plt.show()
Now you can access the line data:
# All lines drawn on the axes that holds the fitted curve.
lines = poly[0].axes.lines
for line in lines:
    # Fetch the y data once and locate its maximum; the same index then
    # selects the matching x value.
    y_data = line.get_ydata()
    peak_idx = np.argmax(y_data)
    max_y = y_data[peak_idx]
    print(f"Maximum y is: {max_y}")
    x_of_max_y = line.get_xdata()[peak_idx]
    print(f"x value of maximum y is: {x_of_max_y}")
Output:
Maximum y is: 3.1515605364361114
x value of maximum y is: 2.8127090301003346

Trying to plot outliers using DBSCAN

I have never been great with Python plotting concepts, and now I'm still apparently missing something new.
Here is my code.
import pandas as pd
import matplotlib.pyplot as plt
import sys
from numpy import genfromtxt
from sklearn.cluster import DBSCAN
# Load the wine data set from a local CSV file.
data = pd.read_csv('C:\\Users\\path_here\\wine.csv')
data
# Fit DBSCAN on the whole frame (every column of `data` is passed in).
model = DBSCAN(eps=0.9, min_samples=10).fit(data)
# Only two columns are plotted: positional columns 2 and 3.
array_flavanoids = data.iloc[:, 2]
# Slicing array
array_colorintensity = data.iloc[:, 3]
# Colour each point by the label DBSCAN assigned to it.
colors = model.labels_
plt.scatter(array_flavanoids, array_colorintensity, c=colors, marker='o')
plt.xlabel('Concentration of flavanoids', fontsize=16)
plt.ylabel('Color intensity', fontsize=16)
plt.title('Concentration of flavanoids vs Color intensity', fontsize=20)
plt.show()
Here is my result.
I am expecting the outliers to be in a different color than the non-outliers. So, something like this.
Maybe one color for outliers and another for non-outliers. I am just trying to learn the concept in this exercise. I am trying to follow the example from this link.
https://towardsdatascience.com/outlier-detection-python-cd22e6a12098
I am using this data source.
https://www.kaggle.com/uciml/red-wine-quality-cortez-et-al-2009
I am testing different data sets.
I got this to work.
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler


def dbscan(X, eps, min_samples):
    """Standardise X, cluster it with DBSCAN and scatter-plot the result.

    Args:
        X: Feature table; it is passed through StandardScaler before
            clustering, and the first two scaled columns are plotted.
        eps: Forwarded to ``DBSCAN(eps=...)``.
        min_samples: Forwarded to ``DBSCAN(min_samples=...)``.

    Returns:
        None. The scatter plot is drawn on the current matplotlib figure.
    """
    X = StandardScaler().fit_transform(X)
    db = DBSCAN(eps=eps, min_samples=min_samples)
    # fit_predict both fits and returns the labels, so a separate fit() call
    # beforehand would only repeat the clustering.
    y_pred = db.fit_predict(X)
    # Colour each point by its cluster label.
    plt.scatter(X[:, 0], X[:, 1], c=y_pred, cmap='Paired')
    plt.title("DBSCAN")


dbscan(data, eps=.5, min_samples=5)
I found this to be a great resource.
https://medium.com/@plog397/functions-to-plot-kmeans-hierarchical-and-dbscan-clustering-c4146ed69744

How to create a scatter plot with regression line based on statsmodels OLS?

I am having difficulty adding a regression line (the one which statsmodels OLS is based on) to a scatter plot. Note that with seaborn's lmplot, I can get a line (see example), but I would like to use the exact one coming from statsmodels OLS for total consistency.
How can I adjust code below to add in the regression line into the first scatter plot?
import statsmodels.regression.linear_model as sm
import seaborn as sns
import pandas as pd
import numpy as np
# Fix the seed so the random Yvalue column is reproducible.
np.random.seed(0)
# Ten rows: Xvalue = 20..29, Yvalue = random integers drawn from [10, 100).
data = {'Xvalue': range(20, 30), 'Yvalue': np.random.randint(low=10, high=100, size=10)}
data = pd.DataFrame(data)
X = data[['Xvalue']]
Y = data['Yvalue']
# OLS of Y on X, with a constant column added to the regressors.
model2 = sm.OLS(Y,sm.add_constant(X), data=data)
model_fit = model2.fit()
print(model_fit.summary())
# Plain pandas scatter plot; no regression line is drawn here.
data.plot(kind='scatter', x='Xvalue', y='Yvalue')
# seaborn's lmplot of the same two columns, for comparison.
sns.lmplot(x='Xvalue', y='Yvalue', data=data)
Scatter plot (trying to work out how to add in the statsmodels OLS regression line)
seaborn lmplot with its regression line (trying to mimic this)
Thanks to the link from @busybear, it now works!
import statsmodels.regression.linear_model as sm
import seaborn as sns
import pandas as pd
import numpy as np
# Fix the seed so the random Yvalue column is reproducible.
np.random.seed(0)
# Ten rows: Xvalue = 20..29, Yvalue = random integers drawn from [10, 100).
data = {'Xvalue': range(20, 30), 'Yvalue': np.random.randint(low=10, high=100, size=10)}
data = pd.DataFrame(data)
X = data[['Xvalue']]
Y = data['Yvalue']
# OLS of Yvalue on Xvalue, with a constant column added to the regressor.
model = sm.OLS(data['Yvalue'], sm.add_constant(data['Xvalue']))
model_fit = model.fit()
# Fitted parameters; read below as p.const and p.Xvalue.
p = model_fit.params
print(model_fit.summary())
#Plot
p
# x grid on which the fitted line is evaluated.
x = np.arange(0,40)
ax = data.plot(kind='scatter', x='Xvalue', y='Yvalue')
# Draw the OLS line from the fitted parameters on the same axes as the scatter.
ax.plot(x, p.const + p.Xvalue * x)
ax.set_xlim([0,30])
# seaborn's lmplot of the same two columns, for comparison.
sns.lmplot(x='Xvalue', y='Yvalue', data=data)

R abline() equivalent in Python

I am trying to plot a Linear Regression onto a scatterplot in Python.
In R I would simply do the following:
Run OLS Linear Regression
# Fit a linear model of medv on lstat.
fit_1 <- lm(medv ~ lstat)
# Plot medv against lstat.
plot(medv ~ lstat)
# Overlay the fitted model as a red line.
abline(fit_1, col = "red")
I have been looking at different solutions in Python, but I can't seem to be able to actually get it to work.
My script is:
# Plot Data
# lstat goes on the x axis and medv on the y axis, matching the model
# formula 'medv ~ lstat' used below.
Boston.plot(kind='scatter', x='lstat', y='medv', color="black")
plt.show()
# Run Linear Regression
# NOTE(review): the formula interface `ols(formula=...)` is provided by
# statsmodels.formula.api - confirm that `sm` is bound to that module.
fit_1 = sm.ols(formula='medv ~ lstat', data=Boston).fit()
# Show Summary
fit_1.summary()
# Plot Regression Line
# Insert code here
It can be done quite simply. In the below code, I use sklearn to fit the model and predict the values.
import pandas as pd
from sklearn.linear_model import LinearRegression
from matplotlib import pyplot as plt
# Fit a linear regression; X and y are expected to be defined by the caller.
model = LinearRegression()
model.fit(X,y)
# Predictions for the same X that was used for fitting.
predictions = model.predict(X)
# Observed data as markers.
plt.plot(X,y,'o')
# Fitted values drawn as a line over the markers.
plt.plot(X, predictions, '-')
plt.show()
Try this:
# Draw the model's fitted values against lstat as a red line.
plt.plot(Boston.lstat, fit_1.fittedvalues, 'r')
Saw this on Statology that helped me a lot:
def abline(slope, intercept):
    """Draw the line ``y = intercept + slope * x`` on the current axes.

    The line is evaluated at the two ends of the axes' current x-limits and
    drawn dashed.

    Args:
        slope: Slope of the line.
        intercept: Value of y at x = 0.
    """
    axes = plt.gca()
    # Only the two x-limit values are needed to draw a straight line.
    x_vals = np.array(axes.get_xlim())
    y_vals = intercept + slope * x_vals
    plt.plot(x_vals, y_vals, '--')

Seaborn: annotate the linear regression equation

I tried fitting an OLS for Boston data set. My graph looks like below.
How to annotate the linear regression equation just above the line or somewhere in the graph? How do I print the equation in Python?
I am fairly new to this area. Exploring python as of now. If somebody can help me, it would speed up my learning curve.
Many thanks!
I tried this as well.
My problem is - how to annotate the above in the graph in equation format?
You can use coefficients of linear fit to make a legend like in this example:
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
tips = sns.load_dataset("tips")
# get coeffs of linear fit
slope, intercept, r_value, p_value, std_err = stats.linregress(tips['total_bill'], tips['tip'])
# use line_kws to set line label for legend; the "+" format flag prints the
# intercept with its own sign, so a negative value reads "x-0.9", not "x+-0.9"
ax = sns.regplot(x="total_bill", y="tip", data=tips, color='b',
                 line_kws={'label': "y={0:.1f}x{1:+.1f}".format(slope, intercept)})
# plot legend
ax.legend()
plt.show()
If you use a more complex fitting function, you can use LaTeX notation: https://matplotlib.org/users/usetex.html
To annotate multiple linear regression lines in the case of using seaborn lmplot you can do the following.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats  # needed for stats.linregress below

df = pd.read_excel('data.xlsx')
# assume some random columns called EAV and PAV in your DataFrame
# assume a third variable used for grouping called "Mammal" which will be used for color coding
# Column names are passed as strings; bare EAV / PAV would be undefined names.
p = sns.lmplot(x='EAV', y='PAV',
               data=df, hue='Mammal',
               line_kws={'label': "Linear Reg"}, legend=True)
ax = p.axes[0, 0]
ax.legend()
leg = ax.get_legend()
L_labels = leg.get_texts()
# assuming you computed r_squared which is the coefficient of determination somewhere else
slope, intercept, r_value, p_value, std_err = stats.linregress(df['EAV'], df['PAV'])
# Both labels are complete $...$ math strings; "+" prints the intercept's own sign.
label_line_1 = r'$y={0:.1f}x{1:+.1f}$'.format(slope, intercept)
label_line_2 = r'$R^2:{0:.2f}$'.format(0.21)  # as an example, or whatever you want
L_labels[0].set_text(label_line_1)
L_labels[1].set_text(label_line_2)
Result:
Simpler syntax.. same result.
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
# Fit outside seaborn so the coefficients can be written into the legend label.
slope, intercept, r_value, pv, se = stats.linregress(df['alcohol'], df['magnesium'])
# "+" prints the intercept with its own sign ("x-1.2" rather than "x+-1.2").
sns.regplot(x="alcohol", y="magnesium", data=df,
            ci=None, label="y={0:.1f}x{1:+.1f}".format(slope, intercept)).legend(loc="best")
I extended the solution by @RMS to work for a multi-panel lmplot example (using data from a sleep-deprivation study (Belenky et al., J Sleep Res 2003) available in pydataset). This allows one to have axis-specific legends/labels without having to use, e.g., regplot and plt.subplots.
Edit: Added second method using the map_dataframe() method from FacetGrid(), as suggested in the answer by Marcos here.
import numpy as np
import scipy as sp
import pandas as pd
import seaborn as sns
import pydataset as pds
import matplotlib.pyplot as plt
# Import the submodule explicitly rather than relying on `sp.stats` having
# been loaded as a side effect of another import.
from scipy import stats

# use seaborn theme
sns.set_theme(color_codes=True)

# Load data from sleep deprivation study (Belenky et al, J Sleep Res 2003)
# ['Reaction', 'Days', 'Subject'] = [reaction time (ms), deprivation time, Subj. No.]
df = pds.data("sleepstudy")

# convert integer label to string
df['Subject'] = df['Subject'].apply(str)

# perform linear regressions outside of seaborn to get parameters
subjects = np.unique(df['Subject'].to_numpy())
fit_str = []
for s in subjects:
    ddf = df[df['Subject'] == s]
    m, b, r_value, p_value, std_err = stats.linregress(ddf['Days'], ddf['Reaction'])
    fit_str.append(f"y = {m:.2f} x + {b:.1f}")

# Select which of the two annotation approaches to run.
method_one = False
method_two = True

if method_one:
    # Access legend on each axis to write equation
    #
    # Create 18 panel lmplot with seaborn
    g = sns.lmplot(x="Days", y="Reaction", col="Subject",
                   col_wrap=6, height=2.5, data=df,
                   line_kws={'label': "Linear Reg"}, legend=True)

    # write string with fit result into legend string of each axis
    axes = g.axes  # 18 element list of axes objects
    # NOTE(review): this pairs panels with fit_str by position, so it assumes
    # the panel order matches the sorted `subjects` order - confirm.
    for ax, label in zip(axes, fit_str):
        ax.legend()  # create legend on axis
        ax.get_legend().get_texts()[0].set_text(label)

elif method_two:
    # use the .map_dataframe() method from FacetGrid() to annotate plot
    # https://stackoverflow.com/questions/25579227 (answer by @Marcos)
    #
    # Create 18 panel lmplot with seaborn
    g = sns.lmplot(x="Days", y="Reaction", col="Subject",
                   col_wrap=6, height=2.5, data=df)

    def annotate(data, **kws):
        """Fit Reaction on Days for one facet's data and write the equation on its axes."""
        m, b, r_value, p_value, std_err = stats.linregress(data['Days'], data['Reaction'])
        ax = plt.gca()
        # transAxes: (0.5, 0.9) is given in axes coordinates, not data units.
        ax.text(0.5, 0.9, f"y = {m:.2f} x + {b:.1f}",
                horizontalalignment='center',
                verticalalignment='center',
                transform=ax.transAxes)

    g.map_dataframe(annotate)

# write figure to pdf
plt.savefig("sleepstudy_data_w-fits.pdf")
Output (Method 1):
Output (Method 2):
Update 2022-05-11: Unrelated to the plotting techniques, it turns out that this interpretation of the data (and that provided, e.g., in the original R repository) is incorrect. See the reported issue here. Fits should be done to days 2-9, corresponding to zero to seven days of sleep deprivation (3h sleep per night). The first three data points correspond to training and baseline days (all with 8h sleep per night).

Categories