How can I create a column in a pandas DataFrame which is the gradient of another column?
I want the gradient to be run over a rolling window, so only 4 data points are assessed at one time.
I am assuming it is something like:
df['Gradient'] = np.gradient(df['Yvalues'].rolling(center=False,window=4))
However this gives error:
raise ValueError('Length of values does not match length of ' 'index')
ValueError: Length of values does not match length of index
Any ideas?
Thank you!!
I think I found the solution. Though it's probably not the most efficient..
class lines(object):
    """Rolling least-squares slope helpers for a date-indexed column.

    The main entry point is :meth:`gradient_column`, which fits a straight
    line ``y = m*x + c`` to every trailing window of the data and returns
    the slope ``m`` for each window.
    """

    def __init__(self):
        pass

    def date_index_to_integer_axis(self, dateindex):
        """Return the whole-day offset of every entry from the first entry.

        Args:
            dateindex: Iterable of objects exposing ``.date()`` (for example
                a pandas ``DatetimeIndex``).

        Returns:
            A list of integers, one per entry; the first element is 0.
        """
        dates = [stamp.date() for stamp in dateindex]
        return [(day - dates[0]).days for day in dates]

    def roll(self, Xvalues, Yvalues, w):
        """Yield aligned ``(x_window, y_window)`` slices of length ``w``.

        Windows advance one element at a time; nothing is yielded when the
        inputs are shorter than ``w``.
        """
        for start in range(len(Xvalues) + 1 - w):
            stop = start + w
            yield Xvalues[start:stop], Yvalues[start:stop]

    def gradient(self, Xvalues, Yvalues):
        """Fit ``y = m*x + c`` by least squares and return ``(m, c)``."""
        # Design matrix with one column for x and one constant column for c.
        A = np.vstack([Xvalues, np.ones(len(Xvalues))]).T
        # rcond=None selects the current default cutoff and avoids the
        # FutureWarning emitted by older NumPy releases.
        m, c = np.linalg.lstsq(A, Yvalues, rcond=None)[0]
        return m, c

    def gradient_column(self, data, window):
        """Return the rolling least-squares slope of a date-indexed column.

        Args:
            data: A single column extracted from a DataFrame, with a date
                index (anything whose ``.index`` entries expose ``.date()``).
            window: Number of consecutive points fitted at a time.

        Returns:
            A float array with the same length as ``data``. The first
            ``window - 1`` entries are NaN because no full window ends
            there; if ``data`` is shorter than ``window`` every entry is NaN.

        Raises:
            ValueError: If ``window`` is smaller than 1.
        """
        window = int(window)
        if window < 1:
            raise ValueError("window must be a positive integer")

        # "X" values: days elapsed since the first index entry.
        Xvalues = np.asarray(self.date_index_to_integer_axis(data.index), dtype=float)
        # "Y" values: the column itself (builtin float replaces the removed np.float alias).
        Yvalues = np.asarray(data, dtype=float)

        # Start from all-NaN so the result always matches the input length.
        Gradient_Col = np.full(len(Yvalues), np.nan)
        slopes = [self.gradient(sx, sy)[0] for sx, sy in self.roll(Xvalues, Yvalues, window)]
        # Each slope belongs to the row on which its window ends.
        Gradient_Col[window - 1:] = slopes
        return Gradient_Col
# gradient_column is an instance method, so it must be called on an instance
# of `lines`; calling it on the class leaves `self` unbound.
df['Gradient'] = lines().gradient_column(df['Operating Revenue'], window=4)
From the given information, it can be seen that you haven't provided an aggregation function to your rolling window.
# Reduce every trailing 4-point window to its mean, then differentiate the
# resulting series.
rolling_mean = df['Yvalues'].rolling(center=False, window=4).mean()
df['Gradient'] = np.gradient(rolling_mean)
or
# Reduce every trailing 4-point window to its sum, then differentiate the
# resulting series.
rolling_sum = df['Yvalues'].rolling(center=False, window=4).sum()
df['Gradient'] = np.gradient(rolling_sum)
You can read more about rolling functions at this website.
Related
As shown in this picture, my predicted points are following the GPS track, which has noisy points and that is not desired. Instead I want my filter to predict points that follow the road instead of the green area.
I tried to implement Kalman filter on noisy GPS data to remove the jumping points or predicting missing data if GPS signal is lost. Data contains latitude and longitude. After adjusting the parameters I can see that my predicted values are very much the same as the measurements I have, which is not fulfilling the actual problem I am trying to solve. I am still at the learning
stage, so I am not sure if the parameter selection is not right or the problem lies within my Python code. I'm using QGIS for visualization of Actual and Prediction values to compare them with my real GPS data.
Here is my code:
....python...
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Load the recorded GPS track; the `latitude` and `longitude` columns are read below.
# NOTE(review): indentation was lost in this listing, so the extent of each
# loop body below is presumably just the line(s) directly after the `for` - confirm.
df = pd.read_csv('C:/Users/mun/Desktop/Research/Ny mappe/GPS_track.csv')
# NOTE(review): the result of head() is not assigned or printed here.
df.head(1000)
# Each column is wrapped in a list before conversion, which is why the values
# are accessed through `[0]` everywhere below.
lat = np.array([df.latitude])
print(lat)
long = np.array([df.longitude])
print(long)
print(len(long[0]))
# Debug output of the raw values.
for i in range(len(long)):
print(long[i][0])
for i in range(len(lat[0])):
print(lat[0][i])
print(len(lat[0]))
print(len(long[0]))
# Number of samples; lat and long are expected to have the same length.
lng=len(lat[0])
print(lng)
for index in range(lng):
print(lat[0][index])
print(long[0][index])
# NOTE(review): the array built in this loop is never stored, so the loop has no visible effect.
for index in range (lng):
np.array((lat[0][index], long[0][index]))
# Pair the values up as [latitude, longitude] lists; this is the input of the filtering loop.
coord1 = [list(i) for i in zip (lat[0],long[0])]
print(coord1)
from pylab import *
from numpy import *
import matplotlib.pyplot as plt
class Kalman:
    """Linear Kalman filter with identity transition and observation models.

    Attributes set by ``__init__``:
        ndim: Dimension of the state and of each observation.
        Sigma_x: Process noise covariance (Q).
        A: Transition matrix that predicts the state for the next time step.
        H: Observation matrix.
        mu_hat: Current state estimate (X).
        cov: Current estimate covariance (P).
        R: Sensor noise / measurement error covariance.

    Attributes set by ``update``: ``mu_hat_est``, ``cov_est`` (prediction),
    ``error_mu``, ``error_cov`` (innovation and its covariance) and ``K``
    (gain).
    """

    def __init__(self, ndim):
        self.ndim = ndim
        self.Sigma_x = np.eye(ndim) * 1e-4   # Process noise (Q)
        self.A = np.eye(ndim)                # Transition matrix (A)
        self.H = np.eye(ndim)                # Observation matrix (H)
        # The state deliberately starts as the scalar 0, as before: np.dot
        # with a scalar makes the first prediction (ndim, ndim)-shaped, and
        # the script in this file reads the estimate through mu_hat_est[0].
        self.mu_hat = 0                      # State vector (X)
        self.cov = np.eye(ndim) * 0.01       # Process covariance (P)
        # A full matrix keeps the measurement noise on the diagonal only; a
        # bare scalar would be added to every entry of the innovation
        # covariance, off-diagonal ones included.
        self.R = np.eye(ndim) * .001         # Measurement error (R)

    def update(self, obs):
        """Run one predict/update cycle with the observation ``obs``.

        Args:
            obs: Observation for this time step (length ``ndim``).

        The new estimate is stored in ``mu_hat`` / ``cov``; the prediction
        made before seeing ``obs`` stays available as ``mu_hat_est`` /
        ``cov_est``.
        """
        # Make prediction
        self.mu_hat_est = np.dot(self.A, self.mu_hat)
        self.cov_est = (
            np.dot(self.A, np.dot(self.cov, np.transpose(self.A))) + self.Sigma_x
        )

        # Update estimate
        self.error_mu = obs - np.dot(self.H, self.mu_hat_est)
        # The innovation covariance is built from the *predicted* covariance.
        self.error_cov = (
            np.dot(self.H, np.dot(self.cov_est, np.transpose(self.H))) + self.R
        )
        self.K = np.dot(
            np.dot(self.cov_est, np.transpose(self.H)),
            np.linalg.inv(self.error_cov),
        )
        self.mu_hat = self.mu_hat_est + np.dot(self.K, self.error_mu)
        # (I - K H) P covers the 1-D case too, so no branch on a global
        # ``ndim`` is needed.
        self.cov = np.dot(np.eye(self.ndim) - np.dot(self.K, self.H), self.cov_est)
# Driver script: a small 1-D check of the filter, then one filter run per GPS
# coordinate, then export and plotting of the results.
# NOTE(review): indentation was lost in this listing, so which statements sit
# inside which loop (and inside the __main__ guard) cannot be determined from
# the text alone - confirm against the original script.
if __name__ == "__main__":
#print "***** 1d ***********"
ndim = 1
nsteps = 3
k = Kalman(ndim)
mu_init=array([54.907134])
cov_init=0.001*ones((ndim))
# Observations here are random draws around mu_init, not recorded data
# (`random` is presumably numpy.random, brought in by the star imports).
obs = random.normal(mu_init,cov_init,(ndim, nsteps))
# NOTE(review): the loop starts at t = ndim, so earlier columns of obs are never passed to update().
for t in range(ndim,nsteps):
k.update(obs[:,t])
print ("Actual: ", obs[:, t], "Prediction: ", k.mu_hat_est)
coord_output=[]
# NOTE(review): a new Kalman object is constructed for every coordinate, so no
# state appears to be carried from one GPS point to the next - confirm intent.
for coordinate in coord1:
temp_list=[]
ndim = 2
nsteps = 100
k = Kalman(ndim)
mu_init=np.array(coordinate)
cov_init=0.0001*ones((ndim))
obs = zeros((ndim, nsteps))
# Fill obs with nsteps random draws centred on this single coordinate.
for t in range(nsteps):
obs[:, t] = random.normal(mu_init,cov_init)
for t in range(ndim,nsteps):
k.update(obs[:,t])
print ("Actual: ", obs[:, t], "Prediction: ", k.mu_hat_est[0])
# temp_list receives the observation followed by the matching prediction.
temp_list.append(obs[:, t])
temp_list.append(k.mu_hat_est[0])
print("temp list")
print(temp_list)
coord_output.append(temp_list)
for coord_pair in coord_output:
print(coord_pair[0])
print(coord_pair[1])
print("--------")
# NOTE(review): `line_actual` is not assigned anywhere in the visible listing.
print(line_actual)
print(coord_output)
# Column 0 of df2 holds the first item of each temp_list (an observation),
# column 1 the second item (a prediction).
df2= pd.DataFrame(coord_output)
print(df2)
Actual = df2[0]
Prediction = df2[1]
print (Actual)
print(Prediction)
Actual_df = pd.DataFrame(Actual)
Prediction_df = pd.DataFrame(Prediction)
print(Actual_df)
print(Prediction_df)
# Expand the stored pairs into latitude/longitude columns and export them.
# NOTE(review): several statements below (including the file-path string
# literals) are split across two lines in this listing.
Actual_coord = pd.DataFrame(Actual_df[0].to_list(), columns = ['latitude',
'longitude'])
Actual_coord.to_csv('C:/Users/mun/Desktop/Research/Ny
mappe/Actual_noise.csv')
Prediction_coord = pd.DataFrame(Prediction_df[1].to_list(), columns =
['latitude', 'longitude'])
Prediction_coord.to_csv('C:/Users/mun/Desktop/Research/Ny
mappe/Prediction_noise.csv')
print (Actual_coord)
print (Prediction_coord)
# Scatter plots: observations in red, predictions in green.
Actual_coord.plot(kind='scatter',x='longitude',y='latitude',color='red')
plt.show()
Prediction_coord.plot(kind='scatter',x='longitude',y='latitude',
color='green')
plt.show()
I'm trying to use logistic regression on the popularity of hit songs on Spotify from 2010-2019 based on their durations and durability, using data collected from a .csv file. Since the popularity value of each song is numerical, I have converted each of them to a binary value, 0 or 1: if the popularity value of a hit song is 70 or less, I replace it with 0, and if it is more than 70, I replace it with 1.
The current sigmoid curve is being "log" right now, hence it is showing a straight line. However, in the context of this code, I am still not sure how to add in a proper sigmoid curve, instead of just the straight line. Is there anything i need to add to my code in order to show both a solid sigmoid curve and the log of the curve in the same graph? It would be deeply appreciated if someone can help me with the final step.
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
df = pd.read_csv('top10s [SubtitleTools.com] (2).csv')

# Pull each audio-feature column out of the frame as a NumPy array.
BPM = np.array(df.bpm)
Energy = np.array(df.nrgy)
Dance = np.array(df.dnce)
dB = np.array(df.dB)
Live = np.array(df.live)
Valence = np.array(df.val)
Acous = np.array(df.acous)
Speech = np.array(df.spch)

# Binarise popularity in place: 70 or below becomes 0, above 70 becomes 1.
# Both masks are taken from the untouched column; rows set to 0 can never
# satisfy the "> 70" test, so the outcome matches evaluating them in sequence.
at_most_70 = df['popu'] <= 70
above_70 = df['popu'] > 70
df.loc[at_most_70, 'popu'] = 0
df.loc[above_70, 'popu'] = 1
def Logistic_Regression(X, y, iterations, alpha):
    """Fit logistic-regression coefficients by gradient ascent.

    Args:
        X: Feature values; a row of ones is stacked on top of it and the
            result is transposed, so the intercept is the first coefficient.
        y: Array of 0/1 labels.
        iterations: Number of gradient steps to take.
        alpha: Step size.

    Returns:
        The coefficient vector ``b``, starting with the intercept.

    The log-likelihood is printed on every 1000th iteration, starting with
    iteration 0.
    """
    intercept = np.ones((X.shape[0], ))
    design = np.vstack((intercept, X)).T
    b = np.zeros(design.shape[1])
    for step in range(iterations):
        residual = y - sigmoid(np.dot(design, b))
        gradient = np.dot(design.T, residual)/y.size
        b = b + alpha * gradient
        if step % 1000 == 0:
            print('LL, i ', log_likelihood(design, y, b), step)
    return b
def sigmoid(z):
    """Return the logistic function ``1 / (1 + exp(-z))`` of ``z``."""
    denominator = 1 + np.exp(-z)
    return 1 / denominator
def log_likelihood(X, y, b):
    """Return the logistic-regression log-likelihood of coefficients ``b``.

    Args:
        X: Design matrix (one row per sample).
        y: Array of 0/1 labels.
        b: Coefficient vector.

    Returns:
        ``sum(y * z - log(1 + exp(z)))`` with ``z = X . b``.
    """
    z = np.dot(X, b)
    # logaddexp(0, z) equals log(1 + exp(z)) but does not overflow to inf
    # when z is large, so the result stays finite.
    LL = np.sum(y*z - np.logaddexp(0, z))
    return LL
def LR1():
    """Fit popularity against song duration and plot the fitted curve.

    Reads the ``dur`` and ``popu`` columns of the module-level ``df``, draws
    the labelled points, fits the model with ``Logistic_Regression`` and
    plots the predicted probability in order of increasing duration.
    """
    Dur = np.array(df.dur)
    Pop = np.array([int(label) for label in df.popu])

    plt.figure(figsize=(10,8))
    # Class 0 is drawn in red, class 1 in blue.
    colormap = np.array(['r', 'b'])
    plt.scatter(Dur, Pop, c = colormap[Pop], alpha = .4)

    b = Logistic_Regression(Dur, Pop, iterations = 8000, alpha = 0.00005)
    print('Done')

    linear_part = np.dot(Dur, b[1]) + b[0]
    p_hat = sigmoid(linear_part)
    # Sort by duration so the curve is drawn left to right.
    idxDur = np.argsort(Dur)
    plt.plot(Dur[idxDur], p_hat[idxDur])
    plt.show()
LR1()
My dataset:
CSV File
My Current Graph
What i want to have:
Shape of sigmoid i want
At first glance, your Logistic_Regression initialization seems very wrong.
I think you packed X as [X, 1] and then try to learn W = [weight, bias], which should start at [1, 0].
Note that the 1 is the vector [1, 1, 1, ...] whose length equals the feature-vector length.
try something like this:
# Evaluate the fitted model on an evenly spaced grid spanning the observed
# durations, so the curve is drawn smoothly from the minimum to the maximum.
x_range = np.linspace(Dur.min(), Dur.max(), 100)
# sigmoid takes a single argument: the intercept b[0] is added to the linear
# term rather than passed as a second argument.
p_hat = sigmoid(np.dot(x_range, b[1]) + b[0])
plt.plot(x_range, p_hat)
plt.show()
I have a set of data in a numpy array - x-values, lets say between 0-100, and y-values. I need to get the gradient to a specific x-value ex. x=20 but I can only get the np.gradient function to give me the gradient at a certain index-value. right now I have:
g=np.gradient(y)
print(g[20])
but this of course gives me the gradient at i=20 and not x=20
I have both the x and y values in one 2D array and 2 x 1D arrays defined in my script
EDIT:
I actually came to solve it like this:
def grad(x, value, y=None):
    """Return the gradient dy/dx at the sample of ``x`` closest to ``value``.

    Args:
        x: 1-D sequence of x coordinates.
        value: x position at which the gradient is wanted.
        y: y values matching ``x``. When omitted, the module-level ``yp`` is
            used, which is what the two-argument form of this function
            always did.

    Returns:
        The entry of ``np.gradient(y, x)`` at the index whose x value is
        nearest to ``value`` (the first such index on ties).
    """
    if y is None:
        y = yp  # fallback for existing two-argument callers
    # Search the x that was passed in, not an unrelated global array.
    x = np.asarray(x)
    g = np.gradient(y, x)
    nearest = np.abs(x - value).argmin()
    return g[nearest]
If the value 20 is in x you could just do g[x == 20]. However, if that is not the case, you would need to approximate the gradient value. You can use, for example, linear interpolation.
import numpy as np
# 80 evenly spaced sample points on [0, 100].
x = np.linspace(0, 100, num=80)
target = 20
is_sampled = target in x
print(is_sampled)  # 20 is not in x
# False
# Quadratic test signal.
y = x * x + 3 * x + 2
# Pass x as second argument for value spacing
g = np.gradient(y, x)
# Interpolate the sampled gradient linearly at the target position.
gradient_at_target = np.interp(target, x, g)
print(gradient_at_target)  # Should be 43
# 43.00000000000001
Basically I have 2 arrays obtained from a set of data points one array for the x values and one for the y values. I need to numerically integrate the y values with respect to the x values - i.e. an element from the y integrated with respect to the corresponding element in x. This should then generate a new array of elements. I have tried simpson's rule but I get one value back instead of an array. A general idea or approach is all I'm looking for. Any help, however, will be much appreciated.
Thanks.
# check out this:
def integration_by_simpsons_3_8_th_rule(i, X, Y, Fd):
    """Integrate the four samples ending at index ``i`` with Simpson's 3/8 rule.

    Args:
        i: Index of the last sample of the window; must be at least 3.
        X: Sequence of x values. Only the step ``X[i] - X[i-1]`` is used, so
            the samples are treated as evenly spaced.
        Y: Sequence of y values matching ``X``.
        Fd: Not used here; kept so the signature matches
            ``rolling_integration``.

    Returns:
        ``(X[i-1], area)`` where ``area`` is
        ``(3/8) * h * (Y[i-3] + 3*(Y[i-2] + Y[i-1]) + Y[i])``.

    Raises:
        IndexError: If ``i`` is smaller than 3, because negative indices
            would silently wrap around to the end of ``Y``.
    """
    if i < 3:
        raise IndexError("Simpson's 3/8 rule needs three samples before index i")
    h = X[i] - X[i - 1]
    # All four ordinates come from Y (the argument), not from a global.
    y_n, y_n_1, y_n_2, y_n_3 = Y[i], Y[i - 1], Y[i - 2], Y[i - 3]
    Area = (3 / 8) * h * (y_n_3 + 3 * (y_n_2 + y_n_1) + y_n)
    return (X[i - 1], Area)
def rolling_integration(X, Y, Fd):
    """Apply Simpson's 3/8 rule to every 4-sample window of ``Y``.

    Args:
        X: Sequence of x values.
        Y: Sequence of y values matching ``X``.
        Fd: Used for phase correction: ``1 / (4 * Fd)`` is added to every
            returned x value. Must be non-zero.

    Returns:
        ``(x_values, areas)`` as two NumPy arrays with one entry per window,
        i.e. ``len(Y) - 3`` entries (none if ``Y`` has fewer than 4 samples).
    """
    corres_X = []
    Y_int = []
    # The window ending at i needs samples i-3..i, so start at 3 and iterate
    # over the length of Y itself.
    for i in range(3, len(Y)):
        x, area = integration_by_simpsons_3_8_th_rule(i, X, Y, Fd)
        corres_X.append(float(x))
        Y_int.append(float(area))
    return (np.array(corres_X) + np.array(1 / (4 * float(Fd))), np.array(Y_int))
I have created a script in Python which takes an image as input and produces a new image where each pixel corresponds to a feature calculated from the windowed group of pixels in the input image. The following picture will highlight this idea:
In the border cases we can either insert NaN into the output image or just use the pixels we have available inside the window. What would be an optimized way to do achieve this functionality in Python or some other programming language? At the moment, my script is simply using a bunch of for-loops to get the job done. Here you can see the code:
def get_stat_feats(data, winSize):
    """Return windowed mean and standard-deviation images of ``data``.

    Args:
        data: 2-D NumPy array from which the features are calculated.
        winSize: Window size; must be odd. The window is centred on each
            pixel.

    Returns:
        ``(meanData, stdData)``: two float arrays shaped like ``data``.
        Pixels whose window would reach outside the image are NaN.

    Raises:
        ValueError: If ``winSize`` is not a positive odd number.
    """
    size = int(winSize)
    if size < 1 or size % 2 == 0:
        raise ValueError("winSize must be a positive odd integer")
    dist = size // 2
    rows, cols = data.shape[0], data.shape[1]

    # Start from NaN everywhere; only pixels with a complete window are filled.
    meanData = np.full(data.shape, np.nan)
    stdData = np.full(data.shape, np.nan)

    for row in range(dist, rows - dist):
        for col in range(dist, cols - dist):
            window = data[row - dist:row + dist + 1, col - dist:col + dist + 1]
            meanData[row, col] = np.mean(window)
            stdData[row, col] = np.std(window)
    return meanData, stdData
Thnx for any help! =) If there any more information needed, please ask =)
generic_filter from scipy.ndimage should be a decent solution for this. There are probably faster solutions, but this is the simplest, I think.
It can take a mode parameter to define how to handle the edges. For example you could set it to treat elements outside the border to constant and equal NaN like this:
generic_filter(a, f, size=winSize, mode='constant', cval=np.nan)
def get_stat_feats(data, winSize):
    """Return windowed mean and standard deviation using ``generic_filter``.

    Args:
        data: Input image (array-like).
        winSize: Window size passed to ``generic_filter`` as ``size``.

    Returns:
        ``(meanData, stdData)`` as float arrays shaped like ``data``.

    Borders are handled by ``generic_filter``'s default ``mode``; pass
    ``mode='constant', cval=np.nan`` there if NaN borders are wanted.
    """
    from scipy.ndimage import generic_filter
    import numpy as np

    # Request a float result explicitly: by default the output takes the
    # dtype of the input, which would truncate the statistics of an integer
    # image.
    meanData = generic_filter(data, np.mean, size=winSize, output=np.float64)
    stdData = generic_filter(data, np.std, size=winSize, output=np.float64)
    return meanData, stdData
force float and round return value:
import numpy as np
def get_stat_feats(data, winSize):
    """Return windowed mean and standard deviation, rounded to 2 decimals.

    ``data`` is converted to float first so the filter output is not
    truncated to an integer dtype; both results are then rounded.
    """
    from scipy.ndimage import generic_filter
    import numpy as np

    as_float = data.astype(float)

    def window_mean(values):
        return values.mean()

    def window_std(values):
        return values.std()

    mean_map = generic_filter(as_float, window_mean, size=winSize)
    std_map = generic_filter(as_float, window_std, size=winSize)
    return np.round(mean_map, 2), np.round(std_map, 2)