Linear Regression: Extending line past data and adding a legend - python

I have a code:
import math
import numpy as np
import pylab as plt1
from matplotlib import pyplot as plt
uH2 = 1.90866638
uHe = 3.60187307
eH2 = 213.38
eHe = 31.96
R = float(uH2*eH2)/(uHe*eHe)
C_Values = []
Delta = []
kHeST = []
J_f21 = []
data = np.genfromtxt("Lamda_HeHCL.txt", unpack=True);
J_i1=data[1];
J_f1=data[2];
kHe=data[7]
data = np.genfromtxt("Basecol_Basic_New_1.txt", unpack=True);
J_i2=data[0];
J_f2=data[1];
kH2=data[5]
print kHe
print kH2
kHe = map(float, kHe)
kH2 = map(float, kH2)
kHe = np.array(kHe)
kH2= np.array(kH2)
g = len(kH2)
for n in range(0,g):
if J_f2[n] == 1:
Jf21 = J_f2[n]
J_f21.append(Jf21)
ratio = kHe[n]/kH2[n]
C = (((math.log(float(kH2[n]),10)))-(math.log(float(kHe[n]),10)))/math.log(R,10)
C_Values.append(C)
St = abs(J_f1[n] - J_i1[n])
Delta.append(St)
print C_Values
print Delta
print J_f21
fig, ax = plt.subplots()
ax.scatter(Delta,C_Values)
for i, txt in enumerate(J_f21):
ax.annotate(txt, (Delta[i],C_Values[i]))
plt.plot(np.unique(Delta), np.poly1d(np.polyfit(Delta, C_Values, 1))(np.unique(Delta)))
plt.plot(Delta, C_Values)
fit = np.polyfit(Delta,C_Values,1)
fit_fn = np.poly1d(fit)
# fit_fn is now a function which takes in x and returns an estimate for y
plt.scatter(Delta,C_Values, Delta, fit_fn(Delta))
plt.xlim(0, 12)
plt.ylim(-3, 3)
In this code, I am trying to plot a linear regression that extends past the data and touches the x-axis. I am also trying to add a legend to the plot that shows the slope of the plot. Using the code, I was able to plot this graph.
Here is some trash data I have been using to try and extend the line and add a legend to my code.
x =[5,7,9,15,20]
y =[10,9,8,7,6]
I would also like it to be a scatter except for the linear regression line.

Given that you don't provide the data you're loading from files I was unable to test this, but off the top of my head:
To extend the line past the plot, you could turn this line
plt.plot(np.unique(Delta), np.poly1d(np.polyfit(Delta, C_Values, 1))(np.unique(Delta)))
Into something like
x = np.linspace(0, 12, 50) # both 0 and 12 are from visually inspecting the plot
plt.plot(x, np.poly1d(np.polyfit(Delta, C_Values, 1))(x))
But if you want the line extended to the x-axis,
polynomial = np.polyfit(Delta, C_Values, 1)
x = np.linspace(0, *np.roots(polynomial))
plt.plot(x, np.poly1d(polynomial)(x))
As for the scatter plot thing, it seems to me you could just remove this line:
plt.plot(Delta, C_Values)
Oh right, as for the legend, add a label to the plots you make, like this:
plt.plot(x, np.poly1d(polynomial)(x), label='Linear regression')
and add a call to plt.legend() just before plt.show().

Related

How to set the same y-axis on a graph with matplotlib.pyplot

I am trying to plot a set of data points on the same axis showing the minor differences in their data. When I go to plot the lines, the y-axis is stacked with almost repeating data points instead of mixing the data points. I am limited to using numpy, math, and matplotlib.pyplot.
I apologize in advance if there is a better way to upload graphs.
I am able to get one graph with two lines as desired, but it resets the y-axis for each plot (stacks it). I have tried setting the plt.ylim(0,1000), putting both (x,y) couples in the same plot line amongst other things.
My wrong graph
Correct y-axis graph
I used these imports:
import numpy as np
import matplotlib.pyplot as plt
Here is the code I have been using:
getKey = [m for m in data]
#get reversed list for ideal values
ideal2 = (data[getKey[0]])
ideal2r = []
for k in ideal2:
ideal2r.insert(0,k)
#get reversed list for actual values
actual2 = (data[getKey[1]])
actual2r = []
for k in actual2:
actual2r.insert(0,k)
#get reversed list for measured values
measured2 = (data[getKey[2]])
measured2r = []
for k in measured2:
measured2r.insert(0,k)
#plot the first graph comparing ideal to actual values over increments 0-30(incs)
plt.plot(incs,ideal2r,'b', label = 'ideal')
plt.plot(incs,actual2r,'r', label = 'actual')
#plt.axis('equal')
plt.legend()
plt.show()
#plot the second graph comparing actual to measured values over increments 0-30(incs)
plt.plot(incs,actual2r,'b', label = 'actual')
plt.plot(incs,measured2r,'r', label = 'measured')
#plt.axis('equal')
plt.legend()
plt.show()
When I use this method in a different function it prints on the same y-axis for each line (0,1000)
Other function graph
other method:
k = np.linspace(0,9,10)
x = [1000, 750, 563, 422, 316, 237, 178, 133, 100, 75]
z = [927,870,567,271,132,47,92,79,99,123]
# Complete the rest of the function below this line
r = 200
a = 0.75
x_est = [z[0]]
p_est = [1]
g_est = [0]
x_hat = z[0]
p = 1 #cant be 0
for i in range(len(k)-1):
#initialization
x_hat = a*x_hat
p = a*p*a
#predict
g = p/(p+r) # adding noise prediction
x_hat = x_hat + g*(z[i] - x_hat)
p = (1-g)*p
#update
p_est.append(p)
x_est.append(x_hat)
g_est.append(g)
plt.plot(k,x,'b', label = 'true state')
plt.plot(k,z,'r', label = 'observation')
plt.plot(k, x_est, 'g', label = 'estimate')
plt.legend()
plt.show()
The data i have been using is contained here

How to remove marker from plot and make it smooth

I have been trying to plot a smooth graph, and here is my code
import matplotlib.pyplot as plt
#fig,axes= plt.subplots(nrows=6, ncols=1, squeeze=False)
x = df["DOY"]
y = df["By"]
z = df["Bz"]
a = df["Vsw"]
b = df["Nsw"]
c = df["magnetopause_distance"]
d = df["reconnection_rate"]
And after that, I used the following logic to plot the same
#create a figure
fig=plt.figure()
#define subplots and define their position
plt1=fig.add_subplot(611)
plt2=fig.add_subplot(612)
plt3=fig.add_subplot(613)
plt4=fig.add_subplot(614)
plt5=fig.add_subplot(615)
plt6=fig.add_subplot(616)
plt1.plot(x,y,'black',linewidth=0.5,marker=None)
plt1.set_ylabel("By")
plt1.set_title("3-6 July 2003")
plt2.plot(x,z,'black',linewidth=0.5)
plt2.set_ylabel("Bz")
plt3.plot(x,a,'black',linewidth=0.5)
plt3.set_ylabel("Vsw")
plt4.plot(x,b,'black',linewidth=0.5)
plt4.set_ylabel("Nsw")
plt5.plot(x,c,'black',linewidth=0.5)
plt5.set_ylabel("MD")
plt6.plot(x,d,'black',linewidth=0.5)
plt6.set_ylabel("MRR")
plt.subplots_adjust(hspace = 2,wspace = 2)
#saving plot in .jpg format
plt.savefig('myplot01.jpg', format='jpeg',dpi=500, bbox_inches='tight')
Finally, I am getting a plot like this:
What I want is something like this:
Sorry for the typos. Thanks for your time :)
Use:
from scipy.interpolate import UnivariateSpline
import numpy as np
list_x_new = np.linspace(min(x), max(x), 1000)
list_y_smooth = UnivariateSpline(x, y, list_x_new)
plt.plot(list_x_new, list_y_smooth)
plt.show()
This is for one of the graphs, you can substitute the values in list_y_smooth in place of y according to the values you want to plot.

Plotting bars hist and PDF line (via kdeplot)

I'm trying to plot bar hist of interest rates and attach to it a PDF line. I have looked for solutions and found a way with kdeplot.
The result is pretty strange the kdeplot line is much higher than the bars hist and I don't know how to fix it.
After applying kdeplot:
Before applying kdeplot:
Here is the code that I'm using:
df=pd.read_excel('interestrate.xlsx')
k=0.0005
bin_steps = np.arange(start = df['Interest rate Real'].min(), stop = df['Interest rate Real'].max(), step = k)
ax = df['Interest rate Real'].hist(bins = bin_steps, figsize=[10,5])
ax1 = df['Interest rate Real']
vals = ax.get_xticks()
ax.set_xticklabels(['{:,.2%}'.format(x) for x in vals])
ax.set_yticklabels(['{:,.2%}'.format(x) for x in vals])
ax.set_title("PDF for Real Interest Rate")
#sns.kdeplot(ax1)
The following code snippet should set you in the right direction (just insert your data):
import scipy.stats as st
y = np.random.randn(1000) # your data goes here
plt.hist(y,50, density=True)
mn, mx = plt.xlim()
plt.xlim(mn, mx)
x = np.linspace(mn, mx, 301)
kde = st.gaussian_kde(y)
plt.plot(x, kde.pdf(x));
Alternatively with seaborn:
import seaborn as sns
plt.hist(y,50, density=True)
sns.kdeplot(y);
or as simple as:
sns.distplot(y)

Removing Data Below A Line In A Scatterplot (Python)

So I had code that graphed a 2dhistogram of my dataset. I plotted it like so:
histogram = plt.hist2d(fehsc, ofesc, bins=nbins, range=[[-1,.5],[0.225,0.4]])
I wanted to only look at data above a certain line though, so I added the following and it worked just fine:
counts = histogram[0]
xpos = histogram[1]
ypos = histogram[2]
image = histogram[3]
newcounts = counts #we're going to iterate over this
for i in range (nbins):
xin = xpos[i]
yin = ypos
yline = m*xin + b
reset = np.where(yin < yline) #anything less than yline we want to be 0
#index = index[0:len(index)-1]
countout = counts[i]
countout[reset] = 0
newcounts[i] = countout
However, I now need to draw a regression line through that cut region. Doing so is not possible (AFAIK) in plt.2dhist, so I'm using plt.scatter. Problem is I don't know how to make that cut anymore - I can't index the scatterplot.
I have this now:
plt.xlim(-1,.5)
plt.ylim(.225, .4)
scatter = plt.scatter(fehsc,ofesc, marker = ".")
and I only want to retain the data above some line:
xarr = np.arange(-1,0.5, 0.015)
yarr = m*xarr + b
plt.plot(xarr, yarr, color='r')
I've tried running the loop with some variations of the variables but I don't actually understand or know how to get it to work.
You could define a mask for your data before you plot and then just plot the data points that actually meet your criteria. Below an example, where all data points above a certain line are plotted in green and all data points below the line are plotted in black.
from matplotlib import pyplot as plt
import numpy as np
#the scatterplot data
xvals = np.random.rand(100)
yvals = np.random.rand(100)
#the line
b = 0.1
m = 1
x = np.linspace(0,1,num=100)
y = m*x+b
mask = yvals > m*xvals+b
plt.scatter(xvals[mask],yvals[mask],color='g')
plt.scatter(xvals[~mask],yvals[~mask],color='k')
plt.plot(x,y,'r')
plt.show()
The result looks like this
Hope this helps.
EDIT:
If you want to create a 2D histogram, where the portion below the line is set to zero, you can do that by first generating the histogram using numpy (as an array) and then setting the values inside that array to zero, if the bins fall below the line. After that, you can plot the matrix using plt.pcolormesh:
from matplotlib import pyplot as plt
import numpy as np
#the scatterplot data
xvals = np.random.rand(1000)
yvals = np.random.rand(1000)
histogram,xbins,ybins = np.histogram2d(xvals,yvals,bins=50)
#computing the bin centers from the bin edges:
xcenters = 0.5*(xbins[:-1]+xbins[1:])
ycenters = 0.5*(ybins[:-1]+ybins[1:])
#the line
b = 0.1
m = 1
x = np.linspace(0,1,num=100)
y = m*x+b
#hiding the part of the histogram below the line
xmesh,ymesh = np.meshgrid(xcenters,ycenters)
mask = m*xmesh+b > ymesh
histogram[mask] = 0
#making the plot
mat = plt.pcolormesh(xcenters,ycenters,histogram)
line = plt.plot(x,y,'r')
plt.xlim([0,1])
plt.ylim([0,1])
plt.show()
The result would be something like this:

Get data array from object in Python

I'm using a library which produces 3 plots given an object k.
I need to figure the data points (x,y,z) that produced these plot, but the problem is that the plots comes from a function from k.
The library I'm using is pyKriging and this is their github repository.
A simplified version of their example code is:
import pyKriging
from pyKriging.krige import kriging
from pyKriging.samplingplan import samplingplan
sp = samplingplan(2)
X = sp.optimallhc(20)
testfun = pyKriging.testfunctions().branin
y = testfun(X)
k = kriging(X, y, testfunction=testfun, name='simple')
k.train()
k.plot()
The full code, comments and output can be found here.
In summary, I'm trying to get the numpy array that produced these plots so I can create plots that follows my formatting styles.
I'm not knowledgeable about going into library codes in Python and I appreciate any help!
There is no single data array that produces the plot. Instead many arrays used for plotting are generated inside the kriging plot function.
Changing the filled contours to line contours is of course not a style option. One therefore needs to use the code from the original plotting function.
An option is to subclass kriging and implement a custom plot function (let's call it myplot). In this function, one can use contour instead of contourf. Naturally, it's also possible to change it completely to one's needs.
import pyKriging
from pyKriging.krige import kriging
from pyKriging.samplingplan import samplingplan
import numpy as np
import matplotlib.pyplot as plt
class MyKriging(kriging):
def __init__(self,*args,**kwargs):
kriging.__init__(self,*args,**kwargs)
def myplot(self,labels=False, show=True, **kwargs):
fig = plt.figure(figsize=(8,6))
# Create a set of data to plot
plotgrid = 61
x = np.linspace(self.normRange[0][0], self.normRange[0][1], num=plotgrid)
y = np.linspace(self.normRange[1][0], self.normRange[1][1], num=plotgrid)
X, Y = np.meshgrid(x, y)
# Predict based on the optimized results
zs = np.array([self.predict([xi,yi]) for xi,yi in zip(np.ravel(X), np.ravel(Y))])
Z = zs.reshape(X.shape)
#Calculate errors
zse = np.array([self.predict_var([xi,yi]) for xi,yi in zip(np.ravel(X), np.ravel(Y))])
Ze = zse.reshape(X.shape)
spx = (self.X[:,0] * (self.normRange[0][1] - self.normRange[0][0])) + self.normRange[0][0]
spy = (self.X[:,1] * (self.normRange[1][1] - self.normRange[1][0])) + self.normRange[1][0]
contour_levels = kwargs.get("levels", 25)
ax = fig.add_subplot(222)
CS = plt.contour(X,Y,Ze, contour_levels)
plt.colorbar()
plt.plot(spx, spy,'or')
ax = fig.add_subplot(221)
if self.testfunction:
# Setup the truth function
zt = self.testfunction( np.array(zip(np.ravel(X), np.ravel(Y))) )
ZT = zt.reshape(X.shape)
CS = plt.contour(X,Y,ZT,contour_levels ,colors='k',zorder=2, alpha=0)
if self.testfunction:
contour_levels = CS.levels
delta = np.abs(contour_levels[0]-contour_levels[1])
contour_levels = np.insert(contour_levels, 0, contour_levels[0]-delta)
contour_levels = np.append(contour_levels, contour_levels[-1]+delta)
CS = plt.contour(X,Y,Z,contour_levels,zorder=1)
plt.plot(spx, spy,'or', zorder=3)
plt.colorbar()
ax = fig.add_subplot(212, projection='3d')
ax.plot_surface(X, Y, Z, rstride=3, cstride=3, alpha=0.4)
if self.testfunction:
ax.plot_wireframe(X, Y, ZT, rstride=3, cstride=3)
if show:
plt.show()
sp = samplingplan(2)
X = sp.optimallhc(20)
testfun = pyKriging.testfunctions().branin
y = testfun(X)
k = MyKriging(X, y, testfunction=testfun, name='simple')
k.train()
k.myplot()

Categories