As part of my research project, I was working on performing linear regression with some data using matplotlib. Unfortunately, I am unable to get my line to touch the origin; matplotlib seems to cut it off at the minimum value of my dataset. How can I fix this and get my line to touch the origin? As reference, here is my code:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from statsmodels import api as sm
def file_analysis(csv_file, state):
"""
This method takes in a file object and the name of a state.
:param csv_file: Pass in a csv file object.
:param state: Name of the state as a string.
:return: None.
"""
data = pd.read_csv(csv_file)
data = data[["Total Cases", "Total Deaths"]]
y = data["Total Deaths"]
x = data["Total Cases"]
results = sm.OLS(y, x).fit()
plt.scatter(x, y)
yhat = results.params[0] * x
print(results.params)
plt.ylim(ymin=0)
plt.xlim(xmin=0)
plt.margins(0)
fig = plt.plot(x, yhat, lw=4, c="orange", label="regressionline")
plt.xlabel("Total Cases", fontsize=20)
plt.ylabel('Total Deaths', fontsize=20)
plt.title(state)
plt.savefig(state + "_scatterplot" + ".png")
plt.show()
with open(state + "_analysis.txt", "w") as file:
file.write(results.summary().as_text())
And here is the resulting scatter-plot after passing in the name of the state and the csv file for the state:
You should just change the x-values that you want in your regression to include 0.
yhat = results.params[0] * range(0, x.max())
fig = plt.plot(range(0, x.max()), yhat, lw=4, c="orange", label="regressionline")
I think the reason your line does not touch the origin is that your are only plotting it at the extent of your data. By calculating the predicted deaths like this yhat = results.params[0] * x you are restricting the line to points in your dataset. You can easily fix this if you supply a wider range of x parameters:
newX = range(0, 80)
yhat = results.params[0] * newX
fig = plt.plot(newX, yhat, lw=4, c="orange", label="regressionline")
By the way, are you fitting the model without intercept on purpose?
I don't have the data, to try the solution I am proposing, but If I was you, I would add a 0 value to the yhat values and also a 0 to the x values in the same position, so you will see the line in the [0,0] position.
Let me know if this works :)
Related
As a tradition, I want to say that I am pretty new to python.
I have set of x and y values as csv file, and my y values are pretty noisy. So far, I managed to use a filter(scipy.signal.savgol_filter) to filter the noise, plot my graph, and get a linear regression of my data, where it is showing a linear trend. This part is important, because my question is related to linear fitting of some part of the data. Here is the code:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.signal
import os
from scipy.signal import savgol_coeffs
from sklearn.metrics import r2_score
from scipy.linalg import lstsq
plt.rc('lines',linewidth=1)
plt.rc('axes', labelsize=16)
plt.rc('xtick', labelsize=14)
plt.rc('ytick', labelsize=14)
plt.rc('legend', fontsize=10)
# define material parameters
deprate = 5.90E-07
#deposition rate unit: cm/s
Ms = 180000 # Si substrate modulus, unit: MPa
hs = 0.03 # Si substrate thickness, unit: cm
stressfac = Ms*hs**2/6 # stress prefactor, unit: MPa
def fit_slope(hfilm, curvature, h0, h1):
# use least square fitting to find the slope of the film_thickness vs curvature curve btw
# thickness = h0 and h1.
# return fitting parameter p. Least square fitting line is y = p[1]*x + p[0]
xdata = hfilm[ (hfilm>h0) & (hfilm<h1)]
ydata = curvature[(hfilm>h0) & (hfilm<h1)]
A = xdata[:, np.newaxis] ** [0,1]
p, *_ = lstsq(A, ydata)
return p
def read_MOSSdata(filename, deprate):
data = pd.read_csv(filename, sep='\s*,\s*', engine='python')
time = data['time [s]'][~data['time [s]'].isna()].to_numpy()
curvature = data['Curvature'][~data['time [s]'].isna()] # curvature unit: 1/cm
hfilm = time * deprate # file thickness unit: cm
return time, hfilm, curvature
filename = (r'C:\Users\yavuz\01-0722-2.csv')
time, hfilm, curvature = read_MOSSdata(filename, deprate)
h0 = 0.00005
h1 = 0.00008
xdata = np.linspace(h0, h1, 500)
yhat = scipy.signal.savgol_filter(curvature, 21,1)
p = fit_slope(hfilm, yhat, h0, h1)
plt.plot(hfilm, curvature)
plt.plot(hfilm, yhat, color='red', label = 'filtered data')
plt.plot(xdata, p[1]*xdata + p[0], color='green', linewidth=4, label = 'linear fitting')
plt.xlabel("Film thickness (cm)")
plt.ylabel("Curvature(1/cm)")
print(f'fitted stress = {-p[1]*stressfac} MPa')
plt.legend(loc=0)
My question is how do I calculate R-square value of this slope on my graph? I tried using r-square value calculators like sklearn.metrics but the problem is that I am limiting my x values to get a slope of a window, and all of the codes I tried, showing the problem of ''expected x and y to have same length''. I would add the csv file but it seems like there is not such an option. Thanks a lot for the help!
I'm trying to plot bar hist of interest rates and attach to it a PDF line. I have looked for solutions and found a way with kdeplot.
The result is pretty strange the kdeplot line is much higher than the bars hist and I don't know how to fix it.
After applying kdeplot:
Before applying kdeplot:
Here is the code that I'm using:
df=pd.read_excel('interestrate.xlsx')
k=0.0005
bin_steps = np.arange(start = df['Interest rate Real'].min(), stop = df['Interest rate Real'].max(), step = k)
ax = df['Interest rate Real'].hist(bins = bin_steps, figsize=[10,5])
ax1 = df['Interest rate Real']
vals = ax.get_xticks()
ax.set_xticklabels(['{:,.2%}'.format(x) for x in vals])
ax.set_yticklabels(['{:,.2%}'.format(x) for x in vals])
ax.set_title("PDF for Real Interest Rate")
#sns.kdeplot(ax1)
The following code snippet should set you in the right direction (just insert your data):
import scipy.stats as st
y = np.random.randn(1000) # your data goes here
plt.hist(y,50, density=True)
mn, mx = plt.xlim()
plt.xlim(mn, mx)
x = np.linspace(mn, mx, 301)
kde = st.gaussian_kde(y)
plt.plot(x, kde.pdf(x));
Alternatively with seaborn:
import seaborn as sns
plt.hist(y,50, density=True)
sns.kdeplot(y);
or as simple as:
sns.distplot(y)
I`m interested in the features of bijectors in tensorflow_probability, so I tried to sampling from a random variable function which is constructed by tfp.bijectors.
I just provide my test code blow, and here I privide some detials: the case I used to tested is the Chi_square distribution. I got the samples out Chi(2) distribution in two different ways: (1)directly use the Chi(2) api in tensorflow; (2)using the tfp.bijectors by the relationship between Chi(2) and standerd normal distribution( N(0, 1) ): if X, Y iid~N(0,1), Z = g(X, Y) = X^2 + Y^2, then Z ~ Chi(2). My result showed blow, the means of tow groups samples is approximately equal, but the tow standard deviation is much more different, anyone could tell me where i`m wrong and how to use the tf_probability correctly?
import os
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import tensorflow_probability as tfp
from scipy.stats import chi2
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
tf.reset_default_graph() # Clear computational graph before calc again!!!
tfd = tfp.distributions
tfb = tfp.bijectors
n_samples = 2000
chi2_origin = tfd.Chi2(2)
s_chi2_origin = chi2_origin.sample([n_samples])
base_normal = tfd.Normal(loc=0., scale=1.)
n_to_chi1_bij = tfb.Square()
n_to_chi2_bij = tfb.Chain([tfb.AffineScalar(shift=0., scale=2.), tfb.Square()])
target_Chi = tfd.TransformedDistribution(
distribution=base_normal,
bijector=n_to_chi2_bij,
name="Chi_x_constructed"
)
s_chi1_constru = target_Chi.sample([n_samples])
with tf.Session() as sess:
init_op = tf.local_variables_initializer()
sess.run(init_op)
s_chi2_origin_ = sess.run(s_chi2_origin)
# print("Samples by Chi2_ORIGIN", s_chi2_origin_)
print("Origin : mean={:.4f}, std={:.4f}".
format(s_chi2_origin_.mean(), s_chi2_origin_.std()))
s_chi2_constru_ = sess.run(s_chi1_constru)
# print("Samples by Chi1_CONSTRU:", s_chi1_constru_[-5:-1])
print("Constru: mean={:.4f}, std={:.4f}".
format(s_chi2_constru_.mean(), s_chi2_constru_.std()))
x = np.arange(0, 15, .5)
y = chi2(2).pdf(x)
fig, (ax0, ax1) = plt.subplots(1, 2, sharey=True, figsize=(6,4))
ax0.hist(s_chi2_origin_, bins='auto', density=True)
ax0.plot(x, y, 'r-')
ax1.hist(s_chi2_constru_, bins=200, density=True)
ax1.plot(x, y, 'r-')
plt.show()
And here is my result, the Origin line is calculated by directly Chi(2) api in tf, the left image shows the origin result; the constru line and the right picture is gotten by tf_probability.bijectors.
I think you have the entries in your chain reversed. They are called right to left, as in the mathematical notation of function composition.
I have a code:
import math
import numpy as np
import pylab as plt1
from matplotlib import pyplot as plt
uH2 = 1.90866638
uHe = 3.60187307
eH2 = 213.38
eHe = 31.96
R = float(uH2*eH2)/(uHe*eHe)
C_Values = []
Delta = []
kHeST = []
J_f21 = []
data = np.genfromtxt("Lamda_HeHCL.txt", unpack=True);
J_i1=data[1];
J_f1=data[2];
kHe=data[7]
data = np.genfromtxt("Basecol_Basic_New_1.txt", unpack=True);
J_i2=data[0];
J_f2=data[1];
kH2=data[5]
print kHe
print kH2
kHe = map(float, kHe)
kH2 = map(float, kH2)
kHe = np.array(kHe)
kH2= np.array(kH2)
g = len(kH2)
for n in range(0,g):
if J_f2[n] == 1:
Jf21 = J_f2[n]
J_f21.append(Jf21)
ratio = kHe[n]/kH2[n]
C = (((math.log(float(kH2[n]),10)))-(math.log(float(kHe[n]),10)))/math.log(R,10)
C_Values.append(C)
St = abs(J_f1[n] - J_i1[n])
Delta.append(St)
print C_Values
print Delta
print J_f21
fig, ax = plt.subplots()
ax.scatter(Delta,C_Values)
for i, txt in enumerate(J_f21):
ax.annotate(txt, (Delta[i],C_Values[i]))
plt.plot(np.unique(Delta), np.poly1d(np.polyfit(Delta, C_Values, 1))(np.unique(Delta)))
plt.plot(Delta, C_Values)
fit = np.polyfit(Delta,C_Values,1)
fit_fn = np.poly1d(fit)
# fit_fn is now a function which takes in x and returns an estimate for y
plt.scatter(Delta,C_Values, Delta, fit_fn(Delta))
plt.xlim(0, 12)
plt.ylim(-3, 3)
In this code, I am trying to plot a linear regression that extends past the data and touches the x-axis. I am also trying to add a legend to the plot that shows the slope of the plot. Using the code, I was able to plot this graph.
Here is some trash data I have been using to try and extend the line and add a legend to my code.
x =[5,7,9,15,20]
y =[10,9,8,7,6]
I would also like it to be a scatter except for the linear regression line.
Given that you don't provide the data you're loading from files I was unable to test this, but off the top of my head:
To extend the line past the plot, you could turn this line
plt.plot(np.unique(Delta), np.poly1d(np.polyfit(Delta, C_Values, 1))(np.unique(Delta)))
Into something like
x = np.linspace(0, 12, 50) # both 0 and 12 are from visually inspecting the plot
plt.plot(x, np.poly1d(np.polyfit(Delta, C_Values, 1))(x))
But if you want the line extended to the x-axis,
polynomial = np.polyfit(Delta, C_Values, 1)
x = np.linspace(0, *np.roots(polynomial))
plt.plot(x, np.poly1d(polynomial)(x))
As for the scatter plot thing, it seems to me you could just remove this line:
plt.plot(Delta, C_Values)
Oh right, as for the legend, add a label to the plots you make, like this:
plt.plot(x, np.poly1d(polynomial)(x), label='Linear regression')
and add a call to plt.legend() just before plt.show().
I'm using a library which produces 3 plots given an object k.
I need to figure the data points (x,y,z) that produced these plot, but the problem is that the plots comes from a function from k.
The library I'm using is pyKriging and this is their github repository.
A simplified version of their example code is:
import pyKriging
from pyKriging.krige import kriging
from pyKriging.samplingplan import samplingplan
sp = samplingplan(2)
X = sp.optimallhc(20)
testfun = pyKriging.testfunctions().branin
y = testfun(X)
k = kriging(X, y, testfunction=testfun, name='simple')
k.train()
k.plot()
The full code, comments and output can be found here.
In summary, I'm trying to get the numpy array that produced these plots so I can create plots that follows my formatting styles.
I'm not knowledgeable about going into library codes in Python and I appreciate any help!
There is no single data array that produces the plot. Instead many arrays used for plotting are generated inside the kriging plot function.
Changing the filled contours to line contours is of course not a style option. One therefore needs to use the code from the original plotting function.
An option is to subclass kriging and implement a custom plot function (let's call it myplot). In this function, one can use contour instead of contourf. Naturally, it's also possible to change it completely to one's needs.
import pyKriging
from pyKriging.krige import kriging
from pyKriging.samplingplan import samplingplan
import numpy as np
import matplotlib.pyplot as plt
class MyKriging(kriging):
def __init__(self,*args,**kwargs):
kriging.__init__(self,*args,**kwargs)
def myplot(self,labels=False, show=True, **kwargs):
fig = plt.figure(figsize=(8,6))
# Create a set of data to plot
plotgrid = 61
x = np.linspace(self.normRange[0][0], self.normRange[0][1], num=plotgrid)
y = np.linspace(self.normRange[1][0], self.normRange[1][1], num=plotgrid)
X, Y = np.meshgrid(x, y)
# Predict based on the optimized results
zs = np.array([self.predict([xi,yi]) for xi,yi in zip(np.ravel(X), np.ravel(Y))])
Z = zs.reshape(X.shape)
#Calculate errors
zse = np.array([self.predict_var([xi,yi]) for xi,yi in zip(np.ravel(X), np.ravel(Y))])
Ze = zse.reshape(X.shape)
spx = (self.X[:,0] * (self.normRange[0][1] - self.normRange[0][0])) + self.normRange[0][0]
spy = (self.X[:,1] * (self.normRange[1][1] - self.normRange[1][0])) + self.normRange[1][0]
contour_levels = kwargs.get("levels", 25)
ax = fig.add_subplot(222)
CS = plt.contour(X,Y,Ze, contour_levels)
plt.colorbar()
plt.plot(spx, spy,'or')
ax = fig.add_subplot(221)
if self.testfunction:
# Setup the truth function
zt = self.testfunction( np.array(zip(np.ravel(X), np.ravel(Y))) )
ZT = zt.reshape(X.shape)
CS = plt.contour(X,Y,ZT,contour_levels ,colors='k',zorder=2, alpha=0)
if self.testfunction:
contour_levels = CS.levels
delta = np.abs(contour_levels[0]-contour_levels[1])
contour_levels = np.insert(contour_levels, 0, contour_levels[0]-delta)
contour_levels = np.append(contour_levels, contour_levels[-1]+delta)
CS = plt.contour(X,Y,Z,contour_levels,zorder=1)
plt.plot(spx, spy,'or', zorder=3)
plt.colorbar()
ax = fig.add_subplot(212, projection='3d')
ax.plot_surface(X, Y, Z, rstride=3, cstride=3, alpha=0.4)
if self.testfunction:
ax.plot_wireframe(X, Y, ZT, rstride=3, cstride=3)
if show:
plt.show()
sp = samplingplan(2)
X = sp.optimallhc(20)
testfun = pyKriging.testfunctions().branin
y = testfun(X)
k = MyKriging(X, y, testfunction=testfun, name='simple')
k.train()
k.myplot()