Creating a Matrix of Floats to do Polynomial Regression - python

I'm trying to do a polynomial regression on a CSV file I have (or any other CSV file). I am not sure how to build a matrix that contains the data set. Here is my current code.
from matplotlib.pyplot import *
import numpy as np
import csv
from math import *

f = open("data_setshort.csv", "r")
csv_f = csv.reader(f)

xval = []
yval = []
polyreg = []
for row in csv_f:
    xval.append(row[0])
    yval.append(row[1])
f.close()

x = np.array(xval)
y = np.array(yval)
xlist = [float(i) for i in x]
ylist = [float(i) for i in y]
print(xlist)
print(ylist)

def poly_fit(x, y):
    for i in range(1, len(x)):
        M = np.matrix(x[i], y[i])
    return M

Matrix = poly_fit(xlist, ylist)
print(Matrix)
poly_fit(x, y) is the function I am trying to build to do the polynomial regression.

Maybe I misunderstood exactly what you're trying to do, but if it's fitting a polynomial from continuous x and y values, then this will do it:
import numpy as np

xi = np.random.uniform(-3, 3, 30)
ni = np.random.uniform(0, .4, 30)
coefficients = np.polyfit(xi, ni, 3)
print(coefficients)
Then, to use it to generate y values given new x values:
new_x = 2.5
polynomial = np.poly1d(coefficients)
new_y = polynomial(new_x)
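To tie this back to the CSV in the question, a minimal sketch (assuming the file has exactly two numeric columns and no header row; adjust for your file otherwise) would be:
import numpy as np

# load both columns straight into float arrays
xlist, ylist = np.loadtxt("data_setshort.csv", delimiter=",", unpack=True)

# fit a cubic (pick whatever degree you need) and evaluate it at a new x
coefficients = np.polyfit(xlist, ylist, 3)
polynomial = np.poly1d(coefficients)
print(polynomial(2.5))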

Related

plot decision boundary in python

I did a logistic regression on my data, and now I have the best theta array for classifying new data points.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

def h_theta(x, theta):
    return np.dot(x, np.transpose(theta))

def g_z(x, theta):
    # sigmoid of the linear combination
    return 1/(1 + pow(np.e, -h_theta(x, theta)))

def cost_function(x, y, theta):
    cost = 0
    for i in range(len(y)):
        l = np.log(g_z(x[i], theta))
        cost += -y[i]*l - (1 - y[i])*np.log((1 - (g_z(x[i], theta))))
    return cost/(2*len(y))

def updata_theta(x, y, theta, alpha):
    # one gradient-descent step over all six parameters
    for i in range(6):
        u = 0
        for j in range(len(y)):
            u += (h_theta(x[j], theta) - y[j])*x[j, i]
        theta[0, i] -= alpha*u/(len(y))

data = pd.read_csv("D:\REZA\programming\machine learning-andrew ng\coding\machine-learning-ex2\ex2\ex2data2.csv")
y = np.array(data["1"])
s = np.array(data.drop("1", axis=1))

# build the feature matrix: [1, x1, x2, x1^2, x2^2, x1*x2]
x1T2 = np.zeros((117, 1))
x2T2 = np.zeros((117, 1))
x1x2 = np.zeros((117, 1))
one = np.ones((117, 1))
m = len(y)
for i in range(m):
    x1T2[i] = s[i, 0]*s[i, 0]
    x2T2[i] = s[i, 1]*s[i, 1]
    x1x2[i] = s[i, 0]*s[i, 1]
x = np.append(one, s, axis=1)
f = np.append(x1T2, x2T2, axis=1)
f = np.append(f, x1x2, axis=1)
x = np.append(x, f, axis=1)
x = np.array(x, dtype=np.float)
theta = np.zeros((1, 6), dtype=float)

n = 0
alpha = 0.003
while n < 100 and cost_function(x, y, theta) > 0.01:
    updata_theta(x, y, theta, alpha)
    n += 1
I can plot my data with plt.scatter
plt.scatter(x[:,1],x[:,2],c=y)
plt.show()
scatter plot output
Now I want to plot the decision boundary using this theta array, but I don't know how to do it.
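One way to draw the boundary (a sketch, assuming the feature order [1, x1, x2, x1^2, x2^2, x1*x2] built above) is to evaluate g_z on a grid of (x1, x2) points and draw the contour where the predicted probability is 0.5:
import numpy as np
import matplotlib.pyplot as plt

# assumes x, y, theta and g_z from the code above are already defined
u = np.linspace(x[:, 1].min(), x[:, 1].max(), 100)
v = np.linspace(x[:, 2].min(), x[:, 2].max(), 100)
U, V = np.meshgrid(u, v)

# rebuild the same polynomial features for every grid point
grid = np.column_stack([np.ones(U.size), U.ravel(), V.ravel(),
                        U.ravel()**2, V.ravel()**2, U.ravel()*V.ravel()])
probs = np.array([g_z(row, theta) for row in grid]).reshape(U.shape)

plt.scatter(x[:, 1], x[:, 2], c=y)
plt.contour(U, V, probs, levels=[0.5])  # decision boundary at p = 0.5
plt.show()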

Saving big data in csv file

I am trying to save a large matrix, 1000x1000, whose entries follow a log-normal distribution. But the saved file turns out to be empty. What am I doing incorrectly here?
import numpy as np
import csv

with open('Radius.csv', 'w') as f:
    shape = 1000, 1000
    zmin, zmax = 0.2, 0.8
    n = np.prod(shape)
    zc = np.array([])
    while True:
        z = np.random.lognormal(mean=0.2, sigma=0.5, size=n * 100)
        z = z[(zmin <= z) & (z < zmax)]
        z = np.r_[zc, z]
        if len(z) >= n:
            break
    inv_r = z[:n].reshape(shape)
    print("1/r =", [inv_r])
    writer = csv.writer(f)
    writer.writerows(zip(1, [inv_r]))
It has to do with the way you are writing the rows: zip takes iterables, but you passed it an int and a list. The while loop also only ever goes through once as it stands. If you run this:
import numpy as np
import csv

with open('Radius.csv', 'w+') as f:
    shape = 1000, 1000
    zmin, zmax = 0.2, 0.8
    n = np.prod(shape)
    zc = np.array([])
    z = np.random.lognormal(mean=0.2, sigma=0.5, size=n * 100)
    z = z[(zmin <= z) & (z < zmax)]
    z = np.r_[zc, z]
    inv_r = z[:n].reshape(shape)
    print("1/r =", [inv_r])
    writer = csv.writer(f)
    writer.writerows(inv_r)
it will at least write to the CSV; definitely check your zip call to make sure it does what you want it to!
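As a side note (my suggestion, not part of the original answer), for a plain numeric matrix NumPy's own writer is usually simpler than the csv module:
import numpy as np

# writes one CSV row per row of the 1000x1000 array built above
np.savetxt('Radius.csv', inv_r, delimiter=',')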

Why are all the values in my appended list the same?

When I plot y as a function of t, the values for t do not change. When I print the appended list I see they are all 0.0. Please help! I'm confused because y as a function of x plots fine. I can't include the actual code, but here is a minimal working example.
import numpy as np
from math import *
from astropy.table import Table
import matplotlib.pyplot as plt
from random import random

x = 0
y = 0
t = 0
h = 0.0100
tf = 40
N = ceil(tf/h)
tnew = t

x_list = [x]
y_list = [y]
t_list = [t]

for i in range(N):
    #while y >= 0:
    tnew = t + h*i
    t = tnew
    print(t)
    # First and second derivatives
    # stuff happens (can't share the code)
    x_new = random()
    y_new = random()
    x = x_new
    y = y_new
    """ appends selected data for ability to plot"""
    x_list.append(x)
    y_list.append(y)
    t_list.append(t)
    #break

""" Plot 1 """
plt.plot(t_list, y_list)
plt.show()

""" Plot 2 """
plt.plot(x_list, y_list)
plt.show()
For the first plot I just get a vertical line; the second plot looks the way it should.
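One common cause of this symptom, offered only as a guess since the real loop body isn't shown, is appending a reference to a mutable object (such as a NumPy array) that is later modified in place; the plain floats in the minimal example can't suffer from this, but the real code might. A tiny illustration:
import numpy as np

state = np.zeros(2)
history = []
for i in range(3):
    state += 1                   # modified in place
    history.append(state)        # appends a reference, not a copy
print(history)                   # every entry shows the final values

history_fixed = []
state = np.zeros(2)
for i in range(3):
    state += 1
    history_fixed.append(state.copy())  # a copy keeps each step's values
print(history_fixed)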

More pythonic way of creating within-class scatter matrix

I am looking for a better way of calculating the following
import numpy as np
np.random.seed(123)

# test code
t = np.random.randint(3, size=100)
X = np.random.random((100, 3))
m = np.random.random((3, 3))

# current method
res = 0
for k in np.unique(t):
    for row in X[t == k] - m[k]:
        res += np.outer(row, row)
res
"""
Output:
array([[12.45661335, -3.51124346,  3.75900294],
       [-3.51124346, 14.85327689, -3.02281263],
       [ 3.75900294, -3.02281263, 18.30868772]])
"""
I would prefer to get rid of the for loops using NumPy.
This is the within-class scatter matrix for Fisher's linear discriminant.
You can write it as follows:
Y = X - m[t]
np.matmul(Y.T, Y)
This is because sum_i x_i x_i' = X' X, where X is the (N, 3) matrix, x_i = X[i, :] is the i-th row of X, and ' indicates the transpose.
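A quick sanity check, using the test data from the question, is to compare the vectorised result against the loop:
Y = X - m[t]                          # subtract each class mean from its rows
vectorized = np.matmul(Y.T, Y)        # equals the sum of the outer products
print(np.allclose(res, vectorized))   # True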

Asymmetric error bars in Scipy's odrpack

I am using Scipy's odrpack to fit a linear function to some data that has uncertainties in both the x and y dimensions. Each data point has its own uncertainty, which is asymmetric.
I can fit a function using symmetric uncertainties, but this is not a true representation of my data.
How can I perform the fit with this in mind?
This is my code so far. It receives the input data as command-line arguments, and the uncertainties I'm using are just random numbers at the moment. (Also, two fits are happening, one for the positive data points and another for the negative; the reasons are unrelated to this question.)
import sys
import numpy as np
import scipy.odr.odrpack as odrpack

def f(B, x):
    return B[0]*x + B[1]

xdata = sys.argv[1].split(',')
xdata = [float(i) for i in xdata]
xdata = np.array(xdata)

# find indices of +/- data
zero_ind = np.where(xdata >= 0)[0][0]
x_p = xdata[zero_ind:]
x_m = xdata[:zero_ind+1]

ydata = sys.argv[2].split(',')
ydata = [float(i) for i in ydata]
ydata = np.array(ydata)
y_p = ydata[zero_ind:]
y_m = ydata[:zero_ind+1]

sx_m = np.random.random(len(x_m))
sx_p = np.random.random(len(x_p))
sy_m = np.random.random(len(y_m))
sy_p = np.random.random(len(y_p))

linear = odrpack.Model(f)

data_p = odrpack.RealData(x_p, y_p, sx=sx_p, sy=sy_p)
odr_p = odrpack.ODR(data_p, linear, beta0=[1., 2.])
out_p = odr_p.run()

data_m = odrpack.RealData(x_m, y_m, sx=sx_m, sy=sy_m)
odr_m = odrpack.ODR(data_m, linear, beta0=[1., 2.])
out_m = odr_m.run()
Thanks!
I will just give you a solution with random data, as I could not be bothered to import your data.
import numpy as np
import scipy.odr.odrpack as odrpack
np.random.seed(1)

N = 10
x = np.linspace(0, 5, N)*(-1)
y = 2*x - 1 + np.random.random(N)
sx = np.random.random(N)
sy = np.random.random(N)

def f(B, x):
    return B[0]*x + B[1]

linear = odrpack.Model(f)
# mydata = odrpack.Data(x, y, wd=1./np.power(sx,2), we=1./np.power(sy,2))
mydata = odrpack.RealData(x, y, sx=sx, sy=sy)
myodr = odrpack.ODR(mydata, linear, beta0=[1., 2.])
myoutput = myodr.run()
myoutput.pprint()
Then we get:
Beta: [ 1.92743947 -0.94409236]
Beta Std Error: [ 0.03117086  0.11273067]
Beta Covariance: [[ 0.02047196  0.06690713]
                  [ 0.06690713  0.26776027]]
Residual Variance: 0.04746112419196648
Inverse Condition #: 0.10277763521624257
Reason(s) for Halting:
  Sum of squares convergence
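As far as I know, odrpack only accepts a single (symmetric) sx/sy per point, so one pragmatic workaround for asymmetric error bars, offered as my own suggestion rather than part of the answer above, is to symmetrise each pair before the fit, for example by averaging the lower and upper uncertainties (sx_lower, sx_upper, etc. are hypothetical arrays you would build from your data):
sx_sym = 0.5*(sx_lower + sx_upper)    # hypothetical lower/upper x uncertainties
sy_sym = 0.5*(sy_lower + sy_upper)    # hypothetical lower/upper y uncertainties
mydata = odrpack.RealData(x, y, sx=sx_sym, sy=sy_sym)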
