I am trying to detect drift in a given stream of one-dimensional data. If there is no trend in the data, I expect a confidence score between 0 and 0.20; if drift is detected, I expect a confidence score between 0.90 and 1.
I am attaching the Python 3.x code snippet I am using, along with my calculations by hand (the picture at the end).
import numpy as np
from scipy import stats
class UnivariateDriftAnalysis:
''' This technique looks for a trend in recent data using linear
regression as a statistical test that the trend is non-zero
Currently, this uses a fixed window length, but future versions might
incorporate a search over a range of window lengths
'''
def __init__(self, n_window, p=0.01):
'''
n_window - (int) length of data history to look for a trend
p - (float) desired confidence or false positive rate.
p=.05 means that alarms will be raised when there is <5% chance
that there is no trend
'''
self.n_window = n_window
self.p = p
def drift_detected(self, data) -> list:
''' Returns an array, x, of probabilities that the slope of the data is
not zero. i.e., the confidence that there is a slope.
x[i] corresponds to the slope of data[i-n_window:i]
The first n_window values of x are np.NaN
'''
n = len(data)
y = []
x0 = np.arange(n)
result: list = [np.NaN] * self.n_window
i = 0
for d in data:
y.append(d)
if len(y) < self.n_window:
# not enough history yet to fill a window
continue
y = y[-self.n_window:]
x = x0[i:i + self.n_window]
p_value = stats.linregress(x, y).pvalue
# slope, intercept, r_value, p_value, std_err = rez
result.append(1-p_value)
i += 1
return result
def update(self, data) -> None:
''' this function is designed to handle a live stream of data '''
scores = self.drift_detected(data)
# scores are confidences (1 - p_value), so alarm when the p-value is below self.p
alarms = [1 - s < self.p for s in scores]
# some other stuff
# Test
np.random.seed(100)
n_window = 10
lr = UnivariateDriftAnalysis(n_window=n_window, p=.01)
data = np.concatenate([np.random.randint(24, 47, 1500), np.random.randint(1000, 4000, 2000), np.random.randint(1, 5, 500)])
score = lr.drift_detected(data)
print(score[n_window:]) # lowest: 0 highest: 0.9953301824956942
Question:
What am I missing? Why is the confidence score as high as 0.9953?
My end objective is to compute a p-value for a given data array so I can quantify the confidence that drift exists.
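For context, here is a minimal sketch (separate from the class above) of how the stats.linregress p-value behaves on short windows of pure noise. Under no trend the p-value is roughly uniform on [0, 1], so 1 - p_value can come close to 1 by chance:

import numpy as np
from scipy import stats

np.random.seed(0)
n_window = 10
# p-values of the slope test on many independent windows of pure noise
p_values = [stats.linregress(np.arange(n_window), np.random.randn(n_window)).pvalue
            for _ in range(2000)]
# Under no trend the p-values are roughly uniform, so 1 - p exceeds 0.99 about 1% of the time
print(max(1 - p for p in p_values))
print(np.mean(np.array(p_values) < 0.01))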
I am trying to minimize the multivariate function f(x) = Σᵢ αᵢ xᵢ², where the αᵢ are constants (which can be either positive or negative) and n is fixed, using the scipy.optimize.fmin_bfgs function.
Conditions:
Test the code for a random natural number n between 5 and 10
A random starting point (all of the form m.dddd)
Iterate until successive iterates differ by less than 2% in absolute value, in the l∞ norm.
The coefficients αi (of the form m.dddd) should be chosen randomly so
that at least 40% of them are negative and at least 25% of them are
positive.
This is what I have tried (for the custom callback I referred to https://stackoverflow.com/a/30365576/7906671):
import numpy as np
from scipy.optimize import minimize
from scipy.optimize import fmin_bfgs
#Generate a list of random positive and negative floats
random_list = np.random.uniform(-1, 1, size=(1, 10))[0].tolist()
pos, neg = [], []   # keep the positive and negative draws separately
npr = []
for r in range(len(random_list)):
    if random_list[r] < 0:
        neg.append(random_list[r])
        npr.append((str(random_list[r]), 0.4))
    else:
        pos.append(random_list[r])
        npr.append((str(random_list[r]), 0.25))
#Function to pick negative number with 40% probability and positive numbers with 25% probability
def w_choice(seq):
total_prob = sum(item[1] for item in seq)
chosen = np.random.uniform(0, total_prob)
cumulative = 0
for item, probability in seq:
    cumulative += probability
if cumulative > chosen:
return item
#Random start value of the form m.dddd; the size of the input array is between 5 and 10
n = np.random.randint(5, 11)
x0 = np.round(np.random.randn(n), 4)
alpha = []
for i in range(n):
alpha.append(np.round(float(w_choice(npr)), 4))
print("alpha: ", alpha)
def func(x):
    # objective: f(x) = sum_i alpha_i * x_i**2
    return np.sum(np.asarray(alpha) * np.asarray(x) ** 2)
class StopOptimizingException(Exception):
pass
class CallbackCollector:
def __init__(self, f, thresh):
self._f = f
self._thresh = thresh
def __call__(self, xk):
if self._f(xk) < self._thresh:
self.x_opt = xk
cb = CallbackCollector(func, thresh=0.02)
x_opt = fmin_bfgs(func, x0, callback=cb)
But this does not converge and gives the following :
Warning: Desired error not necessarily achieved due to precision loss.
I am not able to figure out why this fails. Any help is appreciated!
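For what it's worth, here is a minimal sketch of the stopping rule from the conditions above (stop once successive iterates differ by less than 2% in the l∞ norm), wired into fmin_bfgs through the callback and the StopOptimizingException pattern already defined in the code. The 2% is treated as an absolute tolerance, and all-positive example coefficients are used so this toy problem has a finite minimum; both are assumptions on my part:

import numpy as np
from scipy.optimize import fmin_bfgs

class StopOptimizingException(Exception):
    pass

class SuccessiveIterateCallback:
    """Raise once ||x_k - x_{k-1}||_inf drops below tol (illustrative helper)."""
    def __init__(self, tol=0.02):
        self.tol = tol
        self.prev = None
        self.x_opt = None
    def __call__(self, xk):
        if self.prev is not None and np.max(np.abs(xk - self.prev)) < self.tol:
            self.x_opt = np.copy(xk)
            raise StopOptimizingException
        self.prev = np.copy(xk)

# all-positive toy coefficients so the quadratic has a finite minimum
alpha_demo = [1.2345, 0.5432, 2.1111, 1.0001, 0.3333]

def func_demo(x):
    return np.sum(np.asarray(alpha_demo) * np.asarray(x) ** 2)

cb_demo = SuccessiveIterateCallback(tol=0.02)
x0_demo = np.round(np.random.randn(len(alpha_demo)), 4)
try:
    x_opt_demo = fmin_bfgs(func_demo, x0_demo, callback=cb_demo)
except StopOptimizingException:
    x_opt_demo = cb_demo.x_opt
print("stopped at:", x_opt_demo)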
I'm using the k-prototypes library for mixed numerical and nominal data types. Following https://github.com/nicodv/kmodes/issues/46,
to calculate the silhouette score in k-prototypes I calculate the silhouette score of the categorical data (based on Hamming distance) and the silhouette score of the numerical data (based on Euclidean distance). However, the code I developed is pretty slow: it takes about 10 hours to calculate the silhouette for 60,000 records. My laptop has 12 GB RAM and a Core i7.
Any help to improve the speed of this code would be appreciated.
import numpy as np
import pandas as pd
from kmodes.kprototypes import KPrototypes
# -------- import data
df = pd.read_csv(r'C:\Users\data.csv')
# ------------- Normalize the data ---------------
# print(df.columns) # To get columns name
x_df = df[['R', 'F']]
x_df_norm = x_df.apply(lambda x: (x - x.min(axis=0)) / (x.max(axis=0) - x.min(axis=0)))
x_df_norm['COType'] = df[['COType']]
def calc_euclian_dis(_s1, _s2):
    # Euclidean distance between two numerical attribute vectors, e.g. np.array([2, 6])
    _eucl_dist = np.linalg.norm(_s2 - _s1)
    return _eucl_dist
def calc_simpleMatching_dis(_s1, _s2):
_cat_dist = 0
if (_s1 != _s2):
_cat_dist = 1
return _cat_dist
k = 3
# calculate silhouette for one cluster number
kproto = KPrototypes(n_clusters=k, init='Cao', verbose=2)
clusters_label = kproto.fit_predict(x_df_norm, categorical=[2])
_identical_cluster_labels = list(dict.fromkeys(clusters_label))
# Assign cluster labels to the dataset
x_df_norm['Cluster_label'] = clusters_label
# ------------- calculate _silhouette_Index -------------
# 1. Calculate ai
_silhouette_Index_arr = []
for i in x_df_norm.itertuples():
_ai_cluster_label = i[-1]
# return samples of the same cluster
_samples_cluster = x_df_norm[x_df_norm['Cluster_label'] == _ai_cluster_label]
_dist_array_ai = []
_s1_nume_att = np.array((i[1], i[2]))
_s1_cat_att = i[3]
for j in _samples_cluster.itertuples():
_s2_nume_att = np.array((j[1], j[2]))
_s2_cat_att = j[3]
_euclian_dis = calc_euclian_dis(_s1_nume_att, _s2_nume_att)
_cat_dis = calc_simpleMatching_dis(_s1_cat_att, _s2_cat_att)
_dist_array_ai.append(_euclian_dis + (kproto.gamma * _cat_dis))
ai = np.average(_dist_array_ai)
# 2. Calculate bi
# 2.1. determine the samples of other clusters
_identical_cluster_labels.remove(_ai_cluster_label)
_bi_arr = []
for ii in _identical_cluster_labels:
_samples = x_df_norm[x_df_norm['Cluster_label'] == ii]
# 2.2. calculate bi
_dist_array_bi = []
for j in _samples.itertuples():
_s2_nume_att = np.array((j[1], j[2]))
_s2_cat_att = j[3]
_euclian_dis = calc_euclian_dis(_s1_nume_att, _s2_nume_att)
_cat_dis = calc_simpleMatching_dis(_s1_cat_att, _s2_cat_att)
_dist_array_bi.append(_euclian_dis + (kproto.gamma * _cat_dis))
_bi_arr.append(np.average(_dist_array_bi))
_identical_cluster_labels.append(_ai_cluster_label)
# min bi is determined as final bi variable
bi = min(_bi_arr)
# 3. calculate silhouette Index
if ai == bi:
_silhouette_i = 0
elif ai < bi:
_silhouette_i = 1 - (ai / bi)
elif ai > bi:
_silhouette_i = 1 - (bi / ai)
_silhouette_Index_arr.append(_silhouette_i)
silhouette_score = np.average(_silhouette_Index_arr)
print('_silhouette_Index = ' + str(silhouette_score))
Hi! I reimplemented your function using vectorized linear-algebra operations to compute the dissimilarities instead of nested for loops.
It is way faster :-)
import numpy as np

def euclidean_dissim(a, b, **_):
    """Euclidean distance dissimilarity function.
    b is the single point, a is the matrix of vectors."""
    if np.isnan(a).any() or np.isnan(b).any():
        raise ValueError("Missing values detected in numerical columns.")
    return np.linalg.norm(a - b, axis=1)

def matching_dissim(a, b, **_):
    """Simple matching dissimilarity function.
    b is the single point, a is the matrix of all other vectors;
    matching values contribute 0 to the dissimilarity."""
    # Subtract the number of matches from the dimension, since this is a
    # dissimilarity rather than a similarity
    dimension = len(b)
    return dimension - np.sum((b - a) == 0, axis=1)
def calc_silhouette_proto(dataset,numerical_pos, cat_pos,kproto_model):
'''------------- calculate _silhouette_Index -------------'''
# 1. Compute a(i)
silhouette_Index_arr = []
for i in dataset.itertuples():
# convert tuple to np array
i = np.array(i)
unique_cluster_labels = list(np.unique(dataset['cluster_labels']))
# Each time, remove the considered tuple from the dataset, since we don't compute the distance from a point to itself
data = dataset.copy()
ai_cluster = i[-1] # The cluster is in the last position of the tuple
# Removing the tuple from the dataset
tuple_index = dataset.index.isin([i[0]])
data = data[~tuple_index]
# Get samples of the same cluster
samples_of_cluster = data[data['cluster_labels'] == ai_cluster].loc[:,data.columns!='cluster_labels'].to_numpy()
# Compute the two distances between the single point and all the others
euclidian_distances = euclidean_dissim(samples_of_cluster[:,numerical_pos],i[np.array(numerical_pos)+1])
categ_distances = matching_dissim(samples_of_cluster[:,cat_pos],i[np.array(cat_pos)+1])
# Weighted average of the 2 distances
ai = np.average(euclidian_distances) + (kproto_model.gamma * np.average(categ_distances))
# 2. Calculate bi
unique_cluster_labels.remove(ai_cluster)
bi_arr = []
for ii in unique_cluster_labels:
# Get all the samples of cluster ii
samples = data[data['cluster_labels'] == ii].loc[:,data.columns!='cluster_labels'].to_numpy()
# Compute the two distances between the single point and all the others
euclidian_distances = np.linalg.norm(samples[:,numerical_pos] - i[np.array(numerical_pos)+1], axis=1)
categ_distances = matching_dissim(samples[:,cat_pos],i[np.array(cat_pos)+1])
distance_bi = np.average(euclidian_distances) + (kproto_model.gamma * np.average(categ_distances))
bi_arr.append(np.average(distance_bi))
# min bi is determined as final bi variable
if len(bi_arr) == 0:
bi = 0
else:
bi = min(bi_arr)
# 3. calculate silhouette Index
if ai == bi:
silhouette_i = 0
elif ai < bi:
silhouette_i = 1 - (ai / bi)
elif ai > bi:
silhouette_i = 1 - (bi / ai)
silhouette_Index_arr.append(silhouette_i)
silhouette_score = np.average(silhouette_Index_arr)
return silhouette_score
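If it helps, here is a rough usage sketch under the same assumptions as the code above (numerical columns first, then the categorical column, with a 'cluster_labels' column appended after fitting). Reusing x_df_norm from the question and integer-coding the categorical column are my choices, not part of the original:

import pandas as pd
from kmodes.kprototypes import KPrototypes

# Illustrative usage, reusing the normalized frame from the question.
# The categorical column is integer-coded so the whole frame stays numeric,
# which keeps the np.isnan check in euclidean_dissim happy.
df_demo = x_df_norm[['R', 'F', 'COType']].copy()
df_demo['COType'] = df_demo['COType'].astype('category').cat.codes
kproto_demo = KPrototypes(n_clusters=3, init='Cao', verbose=0)
df_demo['cluster_labels'] = kproto_demo.fit_predict(df_demo, categorical=[2])
score = calc_silhouette_proto(df_demo, numerical_pos=[0, 1], cat_pos=[2],
                              kproto_model=kproto_demo)
print('silhouette =', score)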
As shown in the picture, my predicted points follow the GPS track, which has noisy points, and that is not desired. Instead, I want my filter to predict points that follow the road rather than the green area.
I tried to implement a Kalman filter on noisy GPS data to remove the jumping points and to predict missing data if the GPS signal is lost. The data contain latitude and longitude. After adjusting the parameters, I can see that my predicted values are very much the same as the measurements I have, which does not solve the actual problem I am trying to address. I am still at the learning stage, so I am not sure whether the parameter selection is wrong or the problem lies in my Python code. I'm using QGIS to visualize the Actual and Prediction values and compare them with my real GPS data.
Here is my code:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
df = pd.read_csv('C:/Users/mun/Desktop/Research/Ny mappe/GPS_track.csv')
df.head(1000)
lat = np.array([df.latitude])
print(lat)
long = np.array([df.longitude])
print(long)
print(len(long[0]))
for i in range(len(long)):
print(long[i][0])
for i in range(len(lat[0])):
print(lat[0][i])
print(len(lat[0]))
print(len(long[0]))
#length of the arrays. the arrays should always have the same length
lng=len(lat[0])
print(lng)
for index in range(lng):
print(lat[0][index])
print(long[0][index])
for index in range (lng):
np.array((lat[0][index], long[0][index]))
coord1 = [list(i) for i in zip (lat[0],long[0])]
print(coord1)
from pylab import *
from numpy import *
import matplotlib.pyplot as plt
class Kalman:
    def __init__(self, ndim):
        self.ndim = ndim
        self.Sigma_x = eye(ndim)*1e-4   # Process noise covariance (Q)
        self.A = eye(ndim)              # Transition matrix that predicts the state for the next time step (A)
        self.H = eye(ndim)              # Observation matrix (H)
        self.mu_hat = 0                 # State vector (X)
        self.cov = eye(ndim)*0.01       # State estimate covariance (P)
        self.R = .001                   # Sensor noise covariance / measurement error (R)

    def update(self, obs):
        # Make prediction
        self.mu_hat_est = dot(self.A, self.mu_hat)
        self.cov_est = dot(self.A, dot(self.cov, transpose(self.A))) + self.Sigma_x
        # Update estimate
        self.error_mu = obs - dot(self.H, self.mu_hat_est)
        self.error_cov = dot(self.H, dot(self.cov, transpose(self.H))) + self.R
        self.K = dot(dot(self.cov_est, transpose(self.H)), linalg.inv(self.error_cov))
        self.mu_hat = self.mu_hat_est + dot(self.K, self.error_mu)
        if self.ndim > 1:
            self.cov = dot((eye(self.ndim) - dot(self.K, self.H)), self.cov_est)
        else:
            self.cov = (1 - self.K) * self.cov_est
if __name__ == "__main__":
#print "***** 1d ***********"
ndim = 1
nsteps = 3
k = Kalman(ndim)
mu_init=array([54.907134])
cov_init=0.001*ones((ndim))
obs = random.normal(mu_init,cov_init,(ndim, nsteps))
for t in range(ndim,nsteps):
k.update(obs[:,t])
print ("Actual: ", obs[:, t], "Prediction: ", k.mu_hat_est)
coord_output=[]
for coordinate in coord1:
temp_list=[]
ndim = 2
nsteps = 100
k = Kalman(ndim)
mu_init=np.array(coordinate)
cov_init=0.0001*ones((ndim))
obs = zeros((ndim, nsteps))
for t in range(nsteps):
obs[:, t] = random.normal(mu_init,cov_init)
for t in range(ndim,nsteps):
k.update(obs[:,t])
print ("Actual: ", obs[:, t], "Prediction: ", k.mu_hat_est[0])
temp_list.append(obs[:, t])
temp_list.append(k.mu_hat_est[0])
print("temp list")
print(temp_list)
coord_output.append(temp_list)
for coord_pair in coord_output:
print(coord_pair[0])
print(coord_pair[1])
print("--------")
print(coord_output)
df2= pd.DataFrame(coord_output)
print(df2)
Actual = df2[0]
Prediction = df2[1]
print (Actual)
print(Prediction)
Actual_df = pd.DataFrame(Actual)
Prediction_df = pd.DataFrame(Prediction)
print(Actual_df)
print(Prediction_df)
Actual_coord = pd.DataFrame(Actual_df[0].to_list(), columns=['latitude', 'longitude'])
Actual_coord.to_csv('C:/Users/mun/Desktop/Research/Ny mappe/Actual_noise.csv')
Prediction_coord = pd.DataFrame(Prediction_df[1].to_list(), columns=['latitude', 'longitude'])
Prediction_coord.to_csv('C:/Users/mun/Desktop/Research/Ny mappe/Prediction_noise.csv')
print (Actual_coord)
print (Prediction_coord)
Actual_coord.plot(kind='scatter',x='longitude',y='latitude',color='red')
plt.show()
Prediction_coord.plot(kind='scatter', x='longitude', y='latitude', color='green')
plt.show()
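For reference, here is a minimal sketch of running the Kalman class above over the recorded coordinates in coord1 with a single 2-D filter instance; seeding the state with the first fix is an assumption on my part:

# Sketch: filter the actual GPS track with one 2-D Kalman instance.
kf = Kalman(2)
kf.mu_hat = np.array(coord1[0])      # assumed: seed the state with the first fix
filtered_track = []
for obs_point in coord1[1:]:
    kf.update(np.array(obs_point))
    filtered_track.append(kf.mu_hat.copy())

filtered_df = pd.DataFrame(filtered_track, columns=['latitude', 'longitude'])
filtered_df.plot(kind='scatter', x='longitude', y='latitude', color='blue')
plt.show()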
I'm trying to implement emcee MCMC sampling in Python with a predefined likelihood function to find the best boundary between two populations of data.
For emcee see: http://dfm.io/emcee/current/user/line/
The likelihood function calculates the true positive and true negative classifications, given some linear boundary line, and is used to minimise the difference between the two values whilst maximising their sum.
This way, TP and TN rates of 1 give a likelihood value of 1, while TP and TN rates of 0 give a likelihood value of 0.
But when I attempt to sample the parameter space for m and b (the gradient and offset, or bias, of the boundary line), I get some wildly large and/or small values in the walks.
I have put an example code below which generates some nicely divided populations and then MCMCs around the initial guesses of the parameter values. I'm unsure as to why the MCMC chains don't converge nicely to an appropriate value here so any help would be greatly appreciated.
The following code should run out-of-the-box.
import emcee
import numpy as np
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
#generate some test x and y data
folded_xy_train = np.random.uniform(0,1,10000) #test x data
folded_z_train = np.random.uniform(0,1,10000) #test y data
#define the true gradient and offset for the boundary line
m_true, b_true = 5,-2.5
#generate labels for the test data
rounded_labels_train = np.ones(len(folded_z_train))
model = (m_true*folded_xy_train) + b_true
difference = model - folded_z_train
rounded_labels_train[difference<0] = 0
#show the test data
plt.figure()
plt.scatter(folded_xy_train,folded_z_train,c=rounded_labels_train,s=1.0)
#define a likelihood function for the boundary line
def lnlike(theta, x, y, labels):
m, b = theta
model = (m*x) + b
difference = model - y
classifications = np.ones(len(y))
classifications[difference<0]=0
cfm = confusion_matrix(labels,classifications)
cm = cfm.astype('float') / cfm.sum(axis=1)[:, np.newaxis]
tn, fp, fn, tp = cm.ravel()
likelihood_val = (0.5*(tp+tn))/(1+np.abs(tp-tn))
ln_like = -np.log(likelihood_val)
return ln_like
#define a wide flat prior
def lnprior(theta):
m, b, = theta
if 0 < m < 10 and -20 < b < 5:
return 0.0
return -np.inf
#define the posterior
def lnprob(p, x, y, labels):
lp = lnprior(p)
if not np.isfinite(lp):
return 0
return lp + lnlike(p, x, y, labels)
#setup the MCMC sampling
nwalkers = 4
ndim = 2
p0 = np.array([4.2,-2]) + [np.random.rand(ndim) for i in range(nwalkers)]
sampler = emcee.EnsembleSampler(nwalkers, ndim, lnprob, args=(folded_xy_train, folded_z_train, rounded_labels_train))
sampler.run_mcmc(p0, 500)
#extract the MCMC parameter value chains
samples = sampler.chain[:, 50:, :].reshape((-1, ndim))
#view the parameter chains
plt.figure()
plt.subplot(211)
plt.plot(samples[:,0])
plt.subplot(212)
plt.plot(samples[:,1])
The initial test data, showing an obvious boundary line for given x y data (coloured by binary class label):
The sample walks, showing strange sampling for the gradient parameter (top) and offset parameter (bottom). The x-axis denotes the MCMC walk step number and the y-axis denotes the MCMC parameter values at a given step:
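As a quick sanity check of the likelihood described above, the expression used in lnlike can be evaluated by hand for a few TP/TN rates (this is just the formula from the code, nothing more):

# likelihood_val = 0.5*(tp + tn) / (1 + |tp - tn|), as in lnlike above
for tp, tn in [(1.0, 1.0), (1.0, 0.0), (0.5, 0.5), (0.0, 0.0)]:
    likelihood_val = (0.5 * (tp + tn)) / (1 + abs(tp - tn))
    print(tp, tn, likelihood_val)   # 1.0, 0.25, 0.5, 0.0 respectively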
For the given data, I want to set the outlier values (defined by the 95% confidence level, the 95% quantile function, or whatever is appropriate) to NaN. Below are my data and the code I am using right now. I would be glad if someone could explain this to me further.
import numpy as np, matplotlib.pyplot as plt
data = np.random.rand(1000)+5.0
plt.plot(data)
plt.xlabel('observation number')
plt.ylabel('recorded value')
plt.show()
The problem with using a percentile is that the points identified as outliers are a function of your sample size.
There are a huge number of ways to test for outliers, and you should give some thought to how you classify them. Ideally, you should use a-priori information (e.g. "anything above/below this value is unrealistic because...")
However, a common, not-too-unreasonable outlier test is to remove points based on their "median absolute deviation".
Here's an implementation for the N-dimensional case (from some code for a paper here: https://github.com/joferkington/oost_paper_code/blob/master/utilities.py):
def is_outlier(points, thresh=3.5):
"""
Returns a boolean array with True if points are outliers and False
otherwise.
Parameters:
-----------
points : An numobservations by numdimensions array of observations
thresh : The modified z-score to use as a threshold. Observations with
a modified z-score (based on the median absolute deviation) greater
than this value will be classified as outliers.
Returns:
--------
mask : A numobservations-length boolean array.
References:
----------
Boris Iglewicz and David Hoaglin (1993), "Volume 16: How to Detect and
Handle Outliers", The ASQC Basic References in Quality Control:
Statistical Techniques, Edward F. Mykytka, Ph.D., Editor.
"""
if len(points.shape) == 1:
points = points[:,None]
median = np.median(points, axis=0)
diff = np.sum((points - median)**2, axis=-1)
diff = np.sqrt(diff)
med_abs_deviation = np.median(diff)
modified_z_score = 0.6745 * diff / med_abs_deviation
return modified_z_score > thresh
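To connect this back to the original goal of setting outliers to NaN, here is a short usage sketch on the question's synthetic data; the injected outlier values and the default threshold of 3.5 are just for illustration:

import numpy as np

data = np.random.rand(1000) + 5.0          # same synthetic data as in the question
data[[10, 500, 900]] = [2.0, 9.0, 11.0]    # inject a few obvious outliers for illustration
mask = is_outlier(data, thresh=3.5)
cleaned = data.astype(float).copy()
cleaned[mask] = np.nan                     # set detected outliers to NaN
print(np.sum(mask), "points set to NaN")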
This is very similar to one of my previous answers, but I wanted to illustrate the sample size effect in detail.
Let's compare a percentile-based outlier test (similar to @CTZhu's answer) with a median-absolute-deviation (MAD) test for a variety of different sample sizes:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
def main():
for num in [10, 50, 100, 1000]:
# Generate some data
x = np.random.normal(0, 0.5, num-3)
# Add three outliers...
x = np.r_[x, -3, -10, 12]
plot(x)
plt.show()
def mad_based_outlier(points, thresh=3.5):
if len(points.shape) == 1:
points = points[:,None]
median = np.median(points, axis=0)
diff = np.sum((points - median)**2, axis=-1)
diff = np.sqrt(diff)
med_abs_deviation = np.median(diff)
modified_z_score = 0.6745 * diff / med_abs_deviation
return modified_z_score > thresh
def percentile_based_outlier(data, threshold=95):
diff = (100 - threshold) / 2.0
minval, maxval = np.percentile(data, [diff, 100 - diff])
return (data < minval) | (data > maxval)
def plot(x):
fig, axes = plt.subplots(nrows=2)
for ax, func in zip(axes, [percentile_based_outlier, mad_based_outlier]):
sns.distplot(x, ax=ax, rug=True, hist=False)
outliers = x[func(x)]
ax.plot(outliers, np.zeros_like(outliers), 'ro', clip_on=False)
kwargs = dict(y=0.95, x=0.05, ha='left', va='top')
axes[0].set_title('Percentile-based Outliers', **kwargs)
axes[1].set_title('MAD-based Outliers', **kwargs)
fig.suptitle('Comparing Outlier Tests with n={}'.format(len(x)), size=14)
main()
Notice that the MAD-based classifier works correctly regardless of sample size, while the percentile-based classifier flags more points as the sample size grows, regardless of whether or not they are actually outliers.
Detection of outliers in one-dimensional data depends on the distribution of the data.
1 - Normal distribution:
Data values are almost equally distributed over the expected range. In this case you can easily use any method that involves the mean, such as the confidence interval of 2 or 3 standard deviations (95% or 99.7%, respectively) for normally distributed data (central limit theorem and the sampling distribution of the sample mean). It is a highly effective method, explained in the Khan Academy Statistics and Probability sampling-distribution material. Another option is a prediction interval, if you want an interval for the data points rather than for the mean. (See the short sketch after this list.)
If the data values are instead randomly distributed over a range, the mean may not be a fair representation of the data, because the average is easily influenced by outliers (very small or very large values that are not typical). The median is another way to measure the center of a numerical data set, and the median absolute deviation (MAD) measures the distance of all points from the median in terms of the median distance. http://www.itl.nist.gov/div898/handbook/eda/section3/eda35h.htm has a good explanation, as covered in Joe Kington's answer above.
2 - Symmetric distribution: Again, the median absolute deviation is a good method, provided the z-score calculation and threshold are changed accordingly.
Explanation: http://eurekastatistics.com/using-the-median-absolute-deviation-to-find-outliers/
3 - Asymmetric distribution: Double MAD (double median absolute deviation), explained in the link above.
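For case 1 above (when mean-based methods are appropriate), here is a minimal sketch of the k-standard-deviation interval, setting anything outside mean ± 3·std to NaN; the helper name and the choice of 3 are just illustrative:

import numpy as np

def sigma_clip_to_nan(values, k=3.0):
    # Flag anything more than k standard deviations from the mean and set it to NaN.
    values = np.asarray(values, dtype=float).copy()
    mean, std = values.mean(), values.std()
    values[np.abs(values - mean) > k * std] = np.nan
    return values

# e.g. cleaned = sigma_clip_to_nan(np.random.rand(1000) + 5.0)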
Attaching my Python code for reference:
def is_outlier_doubleMAD(self,points):
"""
FOR ASYMMETRIC DISTRIBUTIONS
Returns : filtered array excluding the outliers
Parameters : the actual data Points array
Calculates median to divide data into 2 halves.(skew conditions handled)
Then those two halves are treated as separate data with calculation same as for symmetric distribution.(first answer)
Only difference being , the thresholds are now the median distance of the right and left median with the actual data median
"""
if len(points.shape) == 1:
points = points[:,None]
median = np.median(points, axis=0)
medianIndex = points.size // 2  # integer index, needed for slicing in Python 3
leftData = np.copy(points[0:medianIndex])
rightData = np.copy(points[medianIndex:points.size])
median1 = np.median(leftData, axis=0)
diff1 = np.sum((leftData - median1)**2, axis=-1)
diff1 = np.sqrt(diff1)
median2 = np.median(rightData, axis=0)
diff2 = np.sum((rightData - median2)**2, axis=-1)
diff2 = np.sqrt(diff2)
med_abs_deviation1 = max(np.median(diff1),0.000001)
med_abs_deviation2 = max(np.median(diff2),0.000001)
threshold1 = ((median-median1)/med_abs_deviation1)*3
threshold2 = ((median2-median)/med_abs_deviation2)*3
#if either threshold is 0 -> treat that side as having no outliers
if threshold1 == 0:
    threshold1 = float('inf')
if threshold2 == 0:
    threshold2 = float('inf')
#multiplied by a factor so that only the outermost points are removed
modified_z_score1 = 0.6745 * diff1 / med_abs_deviation1
modified_z_score2 = 0.6745 * diff2 / med_abs_deviation2
filtered1 = []
i = 0
for data in modified_z_score1:
if data < threshold1:
filtered1.append(leftData[i])
i += 1
i = 0
filtered2 = []
for data in modified_z_score2:
if data < threshold2:
filtered2.append(rightData[i])
i += 1
filtered = filtered1 + filtered2
return filtered
I've adapted the code from http://eurekastatistics.com/using-the-median-absolute-deviation-to-find-outliers and it gives the same results as Joe Kington's, but uses L1 distance instead of L2 distance, and has support for asymmetric distributions. The original R code did not have Joe's 0.6745 multiplier, so I also added that in for consistency within this thread. Not 100% sure if it's necessary, but makes the comparison apples-to-apples.
def doubleMADsfromMedian(y,thresh=3.5):
# warning: this function does not check for NAs
# nor does it address issues when
# more than 50% of your data have identical values
m = np.median(y)
abs_dev = np.abs(y - m)
left_mad = np.median(abs_dev[y <= m])
right_mad = np.median(abs_dev[y >= m])
y_mad = left_mad * np.ones(len(y))
y_mad[y > m] = right_mad
modified_z_score = 0.6745 * abs_dev / y_mad
modified_z_score[y == m] = 0
return modified_z_score > thresh
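And, to tie it back to the original question, a short usage sketch that applies this function to the question's data and sets the flagged points to NaN; the injected outliers are just for illustration:

import numpy as np

data = np.random.rand(1000) + 5.0   # synthetic data from the question
data[[100, 200]] = [1.0, 12.0]      # inject two obvious outliers
data[doubleMADsfromMedian(data, thresh=3.5)] = np.nan
print(np.isnan(data).sum(), "values set to NaN")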
A simple solution can also be to remove anything outside 2 standard deviations (or 1.96):
import random
def outliers(tmp):
"""tmp is a list of numbers"""
outs = []
mean = sum(tmp)/(1.0*len(tmp))
var = sum((tmp[i] - mean)**2 for i in range(0, len(tmp)))/(1.0*len(tmp))
std = var**0.5
outs = [tmp[i] for i in range(0, len(tmp)) if abs(tmp[i]-mean) > 1.96*std]
return outs
lst = [random.randrange(-10, 55) for _ in range(40)]
print(lst)
print(outliers(lst))
Use np.percentile as @Martin suggested:
percentiles = np.percentile(data, [2.5, 97.5])
# or >=, <= to keep points within the central 95%
data[(percentiles[0]<data) & (percentiles[1]>data)]
# set the outliners to np.nan
data[(percentiles[0]>data) | (percentiles[1]<data)] = np.nan