As shown in this picture, my predicted points follow the GPS track, which contains noisy points, and that is not what I want. Instead, I want my filter to predict points that follow the road rather than the green area.
I tried to implement a Kalman filter on noisy GPS data to remove the jumping points and to predict missing positions when the GPS signal is lost. The data contain latitude and longitude. After adjusting the parameters, my predicted values are almost identical to my measurements, which does not solve the actual problem. I am still at the learning
stage, so I am not sure whether the parameter selection is wrong or the problem lies in my Python code. I'm using QGIS to visualize the Actual and Prediction values and compare them with my real GPS data.
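From what I have read, how closely a Kalman filter tracks the raw measurements depends mostly on the ratio between the measurement noise R and the process noise Q, so I suspect my parameter selection. Here is a minimal 1-D sketch (with made-up numbers, not my real data) of the effect I mean:
import numpy as np
# Illustration only: a 1-D Kalman filter with a static state model.
# R >> Q  -> the filter trusts the model and smooths the track;
# R << Q  -> the estimate follows the noisy measurements almost exactly.
def kalman_1d(measurements, q, r):
    x = measurements[0]   # initial state
    p = 1.0               # initial covariance
    estimates = []
    for z in measurements:
        p = p + q                  # predict (state assumed constant)
        k = p / (p + r)            # Kalman gain
        x = x + k * (z - x)        # correct with the measurement
        p = (1.0 - k) * p
        estimates.append(x)
    return np.array(estimates)
noisy_lat = 54.907134 + np.random.normal(0, 0.0001, 200)
smoothed = kalman_1d(noisy_lat, q=1e-10, r=1e-6)    # heavy smoothing
follower = kalman_1d(noisy_lat, q=1e-6, r=1e-10)    # sticks to the measurements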
Here is my code:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
df = pd.read_csv('C:/Users/mun/Desktop/Research/Ny mappe/GPS_track.csv')
df.head(1000)
lat = np.array([df.latitude])
print(lat)
long = np.array([df.longitude])
print(long)
print(len(long[0]))
for i in range(len(long)):
    print(long[i][0])
for i in range(len(lat[0])):
    print(lat[0][i])
print(len(lat[0]))
print(len(long[0]))
# length of the arrays; they should always have the same length
lng = len(lat[0])
print(lng)
for index in range(lng):
    print(lat[0][index])
    print(long[0][index])
for index in range(lng):
    np.array((lat[0][index], long[0][index]))
coord1 = [list(i) for i in zip(lat[0], long[0])]
print(coord1)
from pylab import *
from numpy import *
import matplotlib.pyplot as plt
class Kalman:
    def __init__(self, ndim):
        self.ndim = ndim
        self.Sigma_x = eye(ndim)*1e-4  # Process noise (Q)
        self.A = eye(ndim)             # Transition matrix, predicts the state for the next time step (A)
        self.H = eye(ndim)             # Observation matrix (H)
        self.mu_hat = 0                # State vector (X)
        self.cov = eye(ndim)*0.01      # Process covariance (P)
        self.R = .001                  # Sensor noise covariance / measurement error (R)

    def update(self, obs):
        # Make prediction
        self.mu_hat_est = dot(self.A, self.mu_hat)
        self.cov_est = dot(self.A, dot(self.cov, transpose(self.A))) + self.Sigma_x
        # Update estimate
        self.error_mu = obs - dot(self.H, self.mu_hat_est)
        # innovation covariance, built from the predicted covariance
        self.error_cov = dot(self.H, dot(self.cov_est, transpose(self.H))) + self.R
        self.K = dot(dot(self.cov_est, transpose(self.H)), linalg.inv(self.error_cov))
        self.mu_hat = self.mu_hat_est + dot(self.K, self.error_mu)
        if self.ndim > 1:
            self.cov = dot((eye(self.ndim) - dot(self.K, self.H)), self.cov_est)
        else:
            self.cov = (1 - self.K)*self.cov_est
if __name__ == "__main__":
#print "***** 1d ***********"
ndim = 1
nsteps = 3
k = Kalman(ndim)
mu_init=array([54.907134])
cov_init=0.001*ones((ndim))
obs = random.normal(mu_init,cov_init,(ndim, nsteps))
for t in range(ndim,nsteps):
k.update(obs[:,t])
print ("Actual: ", obs[:, t], "Prediction: ", k.mu_hat_est)
coord_output = []
for coordinate in coord1:
    temp_list = []
    ndim = 2
    nsteps = 100
    k = Kalman(ndim)
    mu_init = np.array(coordinate)
    cov_init = 0.0001*ones((ndim))
    obs = zeros((ndim, nsteps))
    for t in range(nsteps):
        obs[:, t] = random.normal(mu_init, cov_init)
    for t in range(ndim, nsteps):
        k.update(obs[:, t])
        print("Actual: ", obs[:, t], "Prediction: ", k.mu_hat_est[0])
    temp_list.append(obs[:, t])
    temp_list.append(k.mu_hat_est[0])
    print("temp list")
    print(temp_list)
    coord_output.append(temp_list)
for coord_pair in coord_output:
    print(coord_pair[0])
    print(coord_pair[1])
    print("--------")
# print(line_actual)  # removed: 'line_actual' is never defined above and raised a NameError
print(coord_output)
df2= pd.DataFrame(coord_output)
print(df2)
Actual = df2[0]
Prediction = df2[1]
print (Actual)
print(Prediction)
Actual_df = pd.DataFrame(Actual)
Prediction_df = pd.DataFrame(Prediction)
print(Actual_df)
print(Prediction_df)
Actual_coord = pd.DataFrame(Actual_df[0].to_list(), columns=['latitude', 'longitude'])
Actual_coord.to_csv('C:/Users/mun/Desktop/Research/Ny mappe/Actual_noise.csv')
Prediction_coord = pd.DataFrame(Prediction_df[1].to_list(), columns=['latitude', 'longitude'])
Prediction_coord.to_csv('C:/Users/mun/Desktop/Research/Ny mappe/Prediction_noise.csv')
print (Actual_coord)
print (Prediction_coord)
Actual_coord.plot(kind='scatter',x='longitude',y='latitude',color='red')
plt.show()
Prediction_coord.plot(kind='scatter', x='longitude', y='latitude', color='green')
plt.show()
I have code that represents the diffusion equation (concentration as a function of time and space):
∂²C/∂x² - ∂C/∂t = 0
I discretized it to the following form:
C[n+1,j] = C[n,j] + (dt/dx²)·(C[n,j+1] - 2·C[n,j] + C[n,j-1])
I am trying to generate the following graph, but I haven't had much success. Is there anyone who could help me with this? Many thanks!
The graph that I obtain:
The code that I have to reproduce the diffusion equation:
import numpy as np
import matplotlib.pyplot as plt
dt = 0.001 # grid size for time (s)
dx = 0.05 # grid size for space (m)
x_max = 1 # in m
t_max = 1 # total time in s
C0 = 1 # concentration
# function to calculate concentration profiles based on a
# finite difference approximation to the 1D diffusion
# equation:
def diffusion(dt,dx,t_max,x_max,C0):
    # diffusion number:
    s = dt/dx**2
    x = np.arange(0,x_max+dx,dx)
    t = np.arange(0,t_max+dt,dt)
    r = len(t)
    a = len(x)
    C = np.zeros([r,a]) # initial condition
    C[:,0] = C0         # boundary condition on left side
    C[:,-1] = 0         # boundary condition on right side
    for n in range(0,r-1):      # time
        for j in range(1,a-1):  # space
            C[n+1,j] = C[n,j] + s*(C[n,j-1] - 2*C[n,j] + C[n,j+1])
    return x,C,r,a
# note that this can be written without the for-loop
# in space, but it is easier to read it this way
x,C,r,a = diffusion(dt,dx,t_max,x_max,C0)
# plotting:
plt.figure()
plt.xlim([0,1])
plt.ylim([0,1])
plot_times = np.arange(0,1,0.02)
for t in plot_times:
    plt.plot(x, C[int(t/dt), :], 'Gray', label='numerical')
plt.xlabel('Membrane position x',fontsize=12)
plt.ylabel('Concentration',fontsize=12)
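For what it's worth, the explicit scheme above should be stable with these parameters, since the diffusion number s = dt/dx² = 0.001/0.0025 = 0.4 is below the usual limit of 0.5. A small hypothetical addition at the end of the script, to check this and to actually render the figure when the code is run outside a notebook:
# Sanity check on stability of the explicit FTCS scheme (requires s <= 0.5):
s = dt/dx**2
print('diffusion number s =', s)   # 0.4 -> stable
# Without plt.show() the figure never appears when running as a plain script.
plt.show()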
Right now I'm working on a custom implementation of a PDP interaction for 3 features at the same time, and I have a problem with visualizing the data. Some time ago I thought it would be nice to represent a 3D heatmap of the features' interaction, where the surface shape represents the PDP interaction for 2 of the 3 features (x&y and x&z), and the heatmap texture layer represents the interaction of the remaining pair (z&y).
Right now I have a working (barebones, though) implementation of the PDP heatmap for 2 features.
def pdp_custom_3D(estimated_model, X, y, n_splits, target_name_1, target_name_2, prefit = True, X_train = None, y_train = None):
    import logging
    import numpy as np               # needed for np.zeros below
    import matplotlib.pyplot as plt
    import seaborn as sb
    if prefit == False:
        try:
            estimated_model.fit(X_train, y_train)
        except:
            logging.warning("Estimated model must have a fit method.")
    PDP_list_1 = list()
    PDP_list_2 = list()
    feature_list_1 = list()
    feature_list_2 = list()
    x_max_1 = X[target_name_1].max()
    x_min_1 = X[target_name_1].min()
    x_max_2 = X[target_name_2].max()
    x_min_2 = X[target_name_2].min()
    print('x_max_1', x_max_1)
    print('x_min_1', x_min_1)
    print('x_max_2', x_max_2)
    print('x_min_2', x_min_2)
    X_copy = X.copy()
    step_1 = abs(x_max_1 - x_min_1)/n_splits
    step_2 = abs(x_max_2 - x_min_2)/n_splits
    x_axis = int(n_splits+1)
    y_axis = int(n_splits+1)
    s = (x_axis, y_axis)
    PDP_result = np.zeros(s)
    counter_1 = x_min_1
    counter_2 = x_min_2
    for i in range(n_splits+1):
        feature_list_1.append(counter_1 + (i*step_1))
        X_copy[target_name_1] = counter_1 + i * step_1
        for j in range(n_splits+1):
            feature_list_2.append(counter_2 + j * step_2)
            X_copy[target_name_2] = counter_2 + j * step_2
            temp = estimated_model.predict(X_copy)
            PDP_result[i][j] = temp.mean()
    sb.heatmap(PDP_result)
    return
And it gets me a somewhat nice heatmap.
Is there any way I can get a 3D heatmap with the same data forms, where the three axes are the selected features and the fourth dimension is the mean prediction the model gives back? It is represented as a heat signature here, and I want a surface as a second representation of the prediction.
Somewhat like this
Thank you all and have a nice day!
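For illustration, here is a minimal matplotlib sketch (with synthetic grids, not my PDP data) of the kind of plot I mean: a 3D surface whose height comes from one 2D array and whose colouring comes from a second 2D array, via the facecolors argument of plot_surface:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401 (registers the 3d projection on older matplotlib)
# Synthetic grids: Z is the surface height (e.g. a PDP over features x and y),
# C is a second 2D array used only for the colour layer (e.g. a PDP over z and y).
x = np.linspace(0, 1, 21)
y = np.linspace(0, 1, 21)
X, Y = np.meshgrid(x, y)
Z = np.sin(np.pi * X) * np.cos(np.pi * Y)
C = X * Y
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
# Map C to RGBA colours and paint them onto the surface defined by Z.
colors = cm.viridis((C - C.min()) / (C.max() - C.min()))
ax.plot_surface(X, Y, Z, facecolors=colors, rstride=1, cstride=1)
mappable = cm.ScalarMappable(cmap=cm.viridis)
mappable.set_array(C)
fig.colorbar(mappable, ax=ax, label='colour layer (second PDP)')
plt.show()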
I'm using the k-prototypes library for mixed numerical and nominal data types. Following https://github.com/nicodv/kmodes/issues/46,
to calculate the silhouette score in k-prototypes I calculate the silhouette score of the categorical data (based on Hamming distance) and the silhouette score of the numerical data (based on Euclidean distance), but the code I developed is pretty slow: it takes 10 hours to calculate the silhouette for 60,000 records. My laptop has 12 GB of RAM and a Core i7.
Any help to improve the speed of this code, please?
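To be concrete, the per-pair dissimilarity my code below uses is d(s1, s2) = ‖num(s1) − num(s2)‖ + γ·[cat(s1) ≠ cat(s2)], where γ is the gamma estimated by the fitted KPrototypes model and the bracket is 1 when the categories differ and 0 otherwise.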
import numpy as np
import pandas as pd
from kmodes.kprototypes import KPrototypes
# -------- import data
df = pd.read_csv(r'C:\Users\data.csv')
# ------------- Normalize the data ---------------
# print(df.columns) # To get columns name
x_df = df[['R', 'F']]
x_df_norm = x_df.apply(lambda x: (x - x.min(axis=0)) / (x.max(axis=0) - x.min(axis=0)))
x_df_norm['COType'] = df[['COType']]
def calc_euclian_dis(_s1, _s2):
    # s1 = np.array((3, 5))
    _eucl_dist = np.linalg.norm(_s2 - _s1)  # calculate Euclidean distance, accepts an array input like [2 6]
    return _eucl_dist

def calc_simpleMatching_dis(_s1, _s2):
    _cat_dist = 0
    if (_s1 != _s2):
        _cat_dist = 1
    return _cat_dist
k = 3
# calculate silhouette for one cluster number
kproto = KPrototypes(n_clusters=k, init='Cao', verbose=2)
clusters_label = kproto.fit_predict(x_df_norm, categorical=[2])
_identical_cluster_labels = list(dict.fromkeys(clusters_label))
# Assign cluster labels to the dataset
x_df_norm['Cluster_label'] = clusters_label
# ------------- calculate _silhouette_Index -------------
# 1. Calculate ai
_silhouette_Index_arr = []
for i in x_df_norm.itertuples():
    _ai_cluster_label = i[-1]
    # return samples of the same cluster
    _samples_cluster = x_df_norm[x_df_norm['Cluster_label'] == _ai_cluster_label]
    _dist_array_ai = []
    _s1_nume_att = np.array((i[1], i[2]))
    _s1_cat_att = i[3]
    for j in _samples_cluster.itertuples():
        _s2_nume_att = np.array((j[1], j[2]))
        _s2_cat_att = j[3]
        _euclian_dis = calc_euclian_dis(_s1_nume_att, _s2_nume_att)
        _cat_dis = calc_simpleMatching_dis(_s1_cat_att, _s2_cat_att)
        _dist_array_ai.append(_euclian_dis + (kproto.gamma * _cat_dis))
    ai = np.average(_dist_array_ai)
    # 2. Calculate bi
    # 2.1. determine the samples of other clusters
    _identical_cluster_labels.remove(_ai_cluster_label)
    _dic_cluseter = {}
    _bi_arr = []
    for ii in _identical_cluster_labels:
        _samples = x_df_norm[x_df_norm['Cluster_label'] == ii]
        # 2.2. calculate bi
        _dist_array_bi = []
        for j in _samples.itertuples():
            _s2_nume_att = np.array((j[1], j[2]))
            _s2_cat_att = j[3]
            _euclian_dis = calc_euclian_dis(_s1_nume_att, _s2_nume_att)
            _cat_dis = calc_simpleMatching_dis(_s1_cat_att, _s2_cat_att)
            _dist_array_bi.append(_euclian_dis + (kproto.gamma * _cat_dis))
        _bi_arr.append(np.average(_dist_array_bi))
    _identical_cluster_labels.append(_ai_cluster_label)
    # min bi is determined as final bi variable
    bi = min(_bi_arr)
    # 3. calculate silhouette Index
    if ai == bi:
        _silhouette_i = 0
    elif ai < bi:
        _silhouette_i = 1 - (ai / bi)
    elif ai > bi:
        _silhouette_i = 1 - (bi / ai)
    _silhouette_Index_arr.append(_silhouette_i)
silhouette_score = np.average(_silhouette_Index_arr)
print('_silhouette_Index = ' + str(silhouette_score))
Hi! I reimplemented your function using linear-algebra operations to compute the dissimilarities instead of lots of for loops.
It is way faster :-)
def euclidean_dissim(a, b, **_):
    """Euclidean distance dissimilarity function.
    b is the single point, a is the matrix of vectors."""
    if np.isnan(a).any() or np.isnan(b).any():
        raise ValueError("Missing values detected in numerical columns.")
    return np.linalg.norm(a - b, axis=1)

def matching_dissim(a, b, **_):
    """Simple matching dissimilarity function.
    b is the single point, a is the matrix of all other vectors;
    count how many values match (difference = 0)."""
    # Subtract from the dimension because this is a dissimilarity, not a similarity.
    dimension = len(b)
    return dimension - np.sum((b - a) == 0, axis=1)
def calc_silhouette_proto(dataset, numerical_pos, cat_pos, kproto_model):
    '''------------- calculate _silhouette_Index -------------'''
    # 1. Compute a(i)
    silhouette_Index_arr = []
    for i in dataset.itertuples():
        # convert tuple to np array
        i = np.array(i)
        unique_cluster_labels = list(np.unique(dataset['cluster_labels']))
        # Each time, remove the considered tuple from the dataset,
        # since we don't compute distances from a point to itself
        data = dataset.copy()
        ai_cluster = i[-1]  # The cluster is in the last position of the tuple
        # Removing the tuple from the dataset
        tuple_index = dataset.index.isin([i[0]])
        data = data[~tuple_index]
        # Get samples of the same cluster
        samples_of_cluster = data[data['cluster_labels'] == ai_cluster].loc[:, data.columns != 'cluster_labels'].to_numpy()
        # Compute the 2 distances between the single point and all the others
        euclidian_distances = euclidean_dissim(samples_of_cluster[:, numerical_pos], i[np.array(numerical_pos) + 1])
        categ_distances = matching_dissim(samples_of_cluster[:, cat_pos], i[np.array(cat_pos) + 1])
        # Weighted average of the 2 distances
        ai = np.average(euclidian_distances) + (kproto_model.gamma * np.average(categ_distances))
        # 2. Calculate bi
        unique_cluster_labels.remove(ai_cluster)
        bi_arr = []
        for ii in unique_cluster_labels:
            # Get all the samples of cluster ii
            samples = data[data['cluster_labels'] == ii].loc[:, data.columns != 'cluster_labels'].to_numpy()
            # Compute the 2 distances between the single point and all the others
            euclidian_distances = np.linalg.norm(samples[:, numerical_pos] - i[np.array(numerical_pos) + 1], axis=1)
            categ_distances = matching_dissim(samples[:, cat_pos], i[np.array(cat_pos) + 1])
            distance_bi = np.average(euclidian_distances) + (kproto_model.gamma * np.average(categ_distances))
            bi_arr.append(np.average(distance_bi))
        # min bi is determined as final bi variable
        if (len(bi_arr) == 0):
            bi = 0
        else:
            bi = min(bi_arr)
        # 3. calculate silhouette Index
        if ai == bi:
            silhouette_i = 0
        elif ai < bi:
            silhouette_i = 1 - (ai / bi)
        elif ai > bi:
            silhouette_i = 1 - (bi / ai)
        silhouette_Index_arr.append(silhouette_i)
    silhouette_score = np.average(silhouette_Index_arr)
    return silhouette_score
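A hypothetical usage sketch wired to the question's variables (note that this function expects the cluster column to be named 'cluster_labels', whereas the question used 'Cluster_label', and that matching_dissim subtracts the arrays, so the categorical column is assumed to be numerically encoded):
# Hypothetical call using the question's data frame and fitted model.
x_df_norm = x_df_norm.rename(columns={'Cluster_label': 'cluster_labels'})
score = calc_silhouette_proto(
    dataset=x_df_norm,      # DataFrame whose last column is 'cluster_labels'
    numerical_pos=[0, 1],   # positions of 'R' and 'F'
    cat_pos=[2],            # position of 'COType'
    kproto_model=kproto,    # fitted KPrototypes model, used for its gamma
)
print('vectorised silhouette =', score)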
I'm trying to use logistic regression on the popularity of hit songs on Spotify from 2010-2019, based on their duration and durability, with data collected from a .csv file. Since the popularity value of each song is numerical, I converted each one to a binary value, 0 or 1: if a song's popularity value is 70 or below, I replace it with 0, and with 1 if it is above 70.
The current "sigmoid" curve is showing up as a straight line (as if only the log of the curve were plotted). In the context of this code, I am still not sure how to add a proper sigmoid curve instead of just the straight line. Is there anything I need to add to my code in order to show a solid sigmoid curve together with my data in the same graph? It would be deeply appreciated if someone could help me with the final step.
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
df = pd.read_csv('top10s [SubtitleTools.com] (2).csv')
BPM = df.bpm
BPM = np.array(BPM)
Energy = df.nrgy
Energy = np.array(Energy)
Dance = df.dnce
Dance = np.array(Dance)
dB = df.dB
dB = np.array(dB)
Live = df.live
Live = np.array(Live)
Valence = df.val
Valence = np.array(Valence)
Acous = df.acous
Acous = np.array(Acous)
Speech = df.spch
Speech = np.array(Speech)
df.loc[df['popu'] <= 70, 'popu'] = 0
df.loc[df['popu'] > 70, 'popu'] = 1
def Logistic_Regression(X, y, iterations, alpha):
    ones = np.ones((X.shape[0], ))
    X = np.vstack((ones, X))
    X = X.T
    b = np.zeros(X.shape[1])
    for i in range(iterations):
        z = np.dot(X, b)
        p_hat = sigmoid(z)
        gradient = np.dot(X.T, (y - p_hat))/y.size
        b = b + alpha * gradient
        if (i % 1000 == 0):
            print('LL, i ', log_likelihood(X, y, b), i)
    return b

def sigmoid(z):
    return 1 / (1 + np.exp(-z))

def log_likelihood(X, y, b):
    z = np.dot(X, b)
    LL = np.sum(y*z - np.log(1 + np.exp(z)))
    return LL
def LR1():
    Dur = df.dur
    Dur = np.array(Dur)
    Pop = df.popu
    Pop = [int(i) for i in Pop]; Pop = np.array(Pop)
    plt.figure(figsize=(10,8))
    colormap = np.array(['r', 'b'])
    plt.scatter(Dur, Pop, c = colormap[Pop], alpha = .4)
    b = Logistic_Regression(Dur, Pop, iterations = 8000, alpha = 0.00005)
    print('Done')
    p_hat = sigmoid(np.dot(Dur, b[1]) + b[0])
    idxDur = np.argsort(Dur)
    plt.plot(Dur[idxDur], p_hat[idxDur])
    plt.show()
LR1()
My dataset:
CSV File
My Current Graph
What i want to have:
Shape of sigmoid i want
At first glance, your Logistic_Regression initialization seems wrong.
I think you packed X as [X, 1] and then try to learn W = [weight, bias], which should start out as [1, 0].
Note that the 1 here is a vector [1, 1, 1, ...] with length equal to the feature vector length.
try something like this:
x_range = np.linspace(Dur.min(), Dur.max(), 100)
p_hat = sigmoid(np.dot(x_range, b[1]) + b[0])
plt.plot(x_range, p_hat)
plt.show()
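Putting the pieces together (a sketch reusing the names from LR1 above, assuming Dur, Pop and b are available at the top level; I have not run it on your CSV): overlaying the dense-grid curve on the 0/1 scatter is what gives a smooth S-shape rather than a segment through the data points.
# Sketch: scatter of the 0/1 popularity values plus the fitted sigmoid on a dense grid.
x_range = np.linspace(Dur.min(), Dur.max(), 100)
plt.scatter(Dur, Pop, alpha=0.4)
plt.plot(x_range, sigmoid(np.dot(x_range, b[1]) + b[0]), 'k')
plt.xlabel('duration')
plt.ylabel('P(popu > 70)')
plt.show()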
I want to calculate the dominant frequency and the secondary dominant frequency from X, Y, Z accelerometer data stored in flat CSV files (a million rows or more), e.g.
data
I'm trying to use scipy, though I'm aware of numpy - either would do. I've converted my X, Y, Z to SMV format (a single magnitude vector) and want to apply the Fourier transform to it, then get the frequencies using fftfreq - the bit that defeats me is the n and timestep. I have my sample rate in hertz and the size of the rolling window I want to look at (10 rows of data), but I'm not quite sure how to apply this to the script below:
#The three-dimension data collected (X,Y,Z) were transformed into a
#single-dimensional Signal Magnitude Vector SMV (aka The Resultant)
#SMV = sqrt(X^2 + Y^2 + Z^2)
X2 = X['X']*X['X']
Y2 = X['Y']*X['Y']
Z2 = X['Z']*X['Z']
#print X['X'].head(2) #Confirmed worked
#print X2.head(2) #Confirmed worked
combine = [X2,Y2,Z2, Y]
parent = pd.concat(combine, axis=1)
parent['ADD'] = parent.sum(axis=1) #Sum X2,Y2,Z2
sqr = np.sqrt(parent['ADD']) #Square Root of Sum Above
sqr.name = 'SMV'
combine2 = [sqr, Y] #Reduce Dataset to SMV and Class
parent2 = pd.concat(combine2, axis=1)
print(parent2.head(4))
"************************* Begin Fourier ****************************"
from scipy import fftpack
X = fftpack.fft(sqr)
f_s = 80 #80 Hertz
samp = 1024 #samples per segment divided by 12.8 secs signal length
n = X.size
timestep = 10
freqs = fftpack.fftfreq(n, d=timestep)
First you need to load your data into a numpy array (sorry, I didn't quite follow your approach):
# imports used by the snippets below
import csv
import datetime
import numpy
from scipy import fftpack

def load_data():
    csvlist = []
    times = []
    with open('freq.csv') as f:
        csvfile = csv.reader(f, delimiter=',')
        for i, row in enumerate(csvfile):
            timestamp = datetime.datetime.strptime(row[0], "%Y-%m-%d %H:%M:%S.%f")
            times.append(timestamp)
            csvlist.append(row[1:])
    timestep = (times[1] - times[0]).total_seconds()  # sample spacing in seconds, so fftfreq gets a plain number
    csvarr = numpy.array(csvlist, dtype=numpy.float32)
    return timestep, csvarr
There may well be a better way to do this.
Then you need to calculate the magnitudes:
rms = numpy.sqrt(numpy.sum(data**2, axis=1))
And then the fourier analysis:
def fourier(timestep, data):
    N = len(data)//2
    freq = fftpack.fftfreq(len(data), d=timestep)[:N]
    fft = fftpack.fft(data)[:N]
    amp = numpy.abs(fft)/N
    order = numpy.argsort(amp)[::-1]
    return freq[order]
The return value from this is the list of frequencies in decreasing order of importance (spectral amplitude).
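A hypothetical way to wire this up for the 80 Hz data in the question (dropping the 0 Hz DC term, which otherwise tends to come out as the most "dominant" frequency):
timestep, data = load_data()                    # or simply timestep = 1.0 / 80 for an 80 Hz sample rate
smv = numpy.sqrt(numpy.sum(data**2, axis=1))    # signal magnitude vector, as above
ranked = fourier(timestep, smv)                 # frequencies sorted by descending amplitude
ranked = ranked[ranked > 0]                     # drop the 0 Hz (DC) component
dominant, secondary = ranked[0], ranked[1]
print("dominant:", dominant, "Hz  secondary:", secondary, "Hz")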