I am trying to create a multiple Voigt/Gaussian/Lorentzian peak fit function with lmfit.
Therefore, I wrote the following function:
def apply_fit_mix_multy(data,modelPeak,peakPos,amplitud,**kwargs):
    peakPos=np.array(peakPos)
    Start=kwargs.get('Start',data[0,0])
    length_data=len(data)-1
    End=kwargs.get('End',data[length_data,0])
    StartPeak=kwargs.get('StartPeak',data[0,0])
    EndPeak=kwargs.get('EndPeak',data[length_data,0])
    BackFunc=kwargs.get('BackFunc',False)
    BackCut=kwargs.get('BackCut',False)

    dataN=data_intervall(data,Start,End)
    y=dataN[:, 1]
    x=dataN[:, 0]

    amplitud=amplitud
    center=peakPos

    mod = None
    for i in range(len(peakPos)):
        this_mod = make_model(i,amplitud,center,modelPeak)
        if mod is None:
            mod = this_mod
        else:
            mod = mod + this_mod

    bgy=[list() for f in range(len(x))]
    if(BackFunc==True):
        bg,bgx=BackFunc
        for i in range(len(x)):
            bgy[i]=bg.best_values.get('c')
    elif(BackCut!=False):
        slope,intercept=back_ground_cut(data,BackCut[0],BackCut[1])
        for i in range(len(x)):
            bgy[i]=slope*x[i]+intercept

    if(BackCut!=False):
        print('Background subtraction model is used! (Sign=Sign-backgr(linear between two points))')
        y=y-bgy
        out = mod.fit(y, x=x)
    else:
        print('Combination model is used! (offset+Gauss/Lor/Voigt)')
        offset=ConstantModel()
        mod=mod+offset
        out = mod.fit(y, x=x)  # out is the fitted function

    area=[list() for f in range(len(peakPos))]
    comps=out.eval_components(x=x)
    if(BackCut!=False):
        for i in range(len(peakPos)):
            area[i]=simps(comps['peak'+str(i)+'_'],x=x,even='avg')-simps(bgy,x=x,even='avg')
        fit_dict={'signal':y, 'convol':out.best_fit,'x':x,'peak_area':area,'backgr':bgy,'comps':comps}
    else:
        for i in range(len(peakPos)):
            area[i]=simps(comps['peak'+str(i)+'_'],x=x,even='avg')
        fit_dict={'convol':out.best_fit,'x':x,'peak_area':area,'comps':comps}  # comps holds the info of the separate peaks
    return fit_dict
The function reads in a data set, the modelPeak (e.g. GaussianModel), and an initial guess of the peak positions and amplitudes (peakPos, amplitud).
In the first part I initialize the model of the peaks (how many peaks, ...):
for i in range(len(peakPos)):
    this_mod = make_model(i,amplitud,center,modelPeak)
    if mod is None:
        mod = this_mod
    else:
        mod = mod + this_mod
with the make_model function:
def make_model(num,amplitud,center,mod):
    pref = "peak{0}_".format(num)
    model = mod(prefix = pref)
    model.set_param_hint(pref+'amplitud', value=amplitud[num], min=0, max=5*amplitud[num])
    model.set_param_hint(pref+'center', value=center[num], min=center[num]-0.5, max=center[num]+0.5)
    if(num==0):
        model.set_param_hint(pref+'sigma', value=0.3, min=0.01, max=1)
    else:
        model.set_param_hint(pref+'sigma', value=0.3, min=0.01, max=1)
    #print('Jetzt',center[num],amplitud[num])
    return model
Here is now my problem: if I want to fit e.g. 3 peaks, I want the sigma of the first peak to vary during the fit, while the sigmas of the other peaks depend on the sigma of the first peak!
Any idea?
Thanks,
maths
FYI, this is what a fit looks like:
[image: example of a multi-peak fit]
If I understand your long question (it would be helpful to remove the extraneous stuff - and there is quite a lot of it), you want to create a Model with multiple peaks, allowing sigma from the 1st peak to vary freely, and constraining sigma for the other peaks to depend on this.
To do that, you can either use parameter hints (as you do in your make_model() function) or set expressions for the parameters after the Parameters object is created. For the first approach, something like this should work:
def make_model(num,amplitud,center,mod):
    pref = "peak{0}_".format(num)
    model = mod(prefix = pref)
    model.set_param_hint(pref+'amplitud', value=amplitud[num], min=0, max=5*amplitud[num])
    model.set_param_hint(pref+'center', value=center[num], min=center[num]-0.5, max=center[num]+0.5)
    if(num==0):
        model.set_param_hint(pref+'sigma', value=0.3, min=0.01, max=1)
    else:
        ## instead of
        # model.set_param_hint(pref+'sigma', value=0.3, min=0.01, max=1)
        ## set peakN_sigma == peak0_sigma
        model.set_param_hint(pref+'sigma', expr='peak0_sigma')
        ## or maybe set peakN_sigma == N * peak0_sigma
        model.set_param_hint(pref+'sigma', expr='%d*peak0_sigma' % num)
    return model
You could also make the full model (simplified somewhat from your code, but the same idea):
model = (VoigtModel(prefix='peak0_') + VoigtModel(prefix='peak1_') +
         VoigtModel(prefix='peak2_') + LinearModel(prefix='const_'))
# create parameters with default values
params = model.make_params(peak0_amplitude=10, peak0_sigma=2, ....)
# set constraints for `sigma` params:
params['peak1_sigma'].expr = 'peak0_sigma'
params['peak2_sigma'].expr = 'peak0_sigma'
# similarly, set bounds as needed:
params['peak1_sigma'].min = 0
params['peak1_amplitude'].min = 0
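With the params set up this way, the fit itself runs as usual; a minimal sketch, assuming x and y already hold your data arrays:
result = model.fit(y, params, x=x)
print(result.fit_report())
# peak1_sigma and peak2_sigma are reported as constrained expressions,
# while peak0_sigma varies freely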
Hope that helps...
I'm going off of https://github.com/cortexlabs/cortex/blob/master/examples/pytorch/text-generator/predictor.py
But if I pass num_samples=5, I get:
generated = torch.cat((generated, next_token.unsqueeze(0)), dim=1)
RuntimeError: Sizes of tensors must match except in dimension 1. Got 5 and 1 in dimension 0
The code is:
def sample_sequence(
    model,
    length,
    context,
    num_samples=1,
    temperature=1,
    top_k=0,
    top_p=0.9,
    repetition_penalty=1.0,
    device="cpu",
):
    context = torch.tensor(context, dtype=torch.long, device=device)
    context = context.unsqueeze(0).repeat(num_samples, 1)
    print('context_size', context.shape)
    generated = context
    print('context', context)
    with torch.no_grad():
        for _ in trange(length):
            inputs = {"input_ids": generated}
            outputs = model(
                **inputs
            )  # Note: we could also use 'past' with GPT-2/Transfo-XL/XLNet/CTRL (cached hidden-states)
            next_token_logits = outputs[0][0, -1, :] / (temperature if temperature > 0 else 1.0)
            # repetition penalty from CTRL (https://arxiv.org/abs/1909.05858)
            for _ in set(generated.view(-1).tolist()):
                next_token_logits[_] /= repetition_penalty
            filtered_logits = top_k_top_p_filtering(next_token_logits, top_k=top_k, top_p=top_p)
            if temperature == 0:  # greedy sampling:
                next_token = torch.argmax(filtered_logits).unsqueeze(0)
            else:
                next_token = torch.multinomial(F.softmax(filtered_logits, dim=-1), num_samples=1)
            generated = torch.cat((generated, next_token.unsqueeze(0)), dim=1)
    return generated
As far as I can see, this code doesn't provide multiple samples, but you can adjust it with a few changes.
This line already uses multinomial, but it returns only 1 sample:
next_token = torch.multinomial(F.softmax(filtered_logits, dim=-1), num_samples=1)
change it to:
next_token = torch.multinomial(F.softmax(filtered_logits, dim=-1), num_samples=num_samples)
Now you also need to change how the result is constructed. This line concatenates next_token with the sentence. You now get num_samples next_tokens, and you need to unsqueeze all of them:
generated = torch.cat((generated, next_token.unsqueeze(0)), dim=1)
change it to:
generated = torch.cat((generated, next_token.unsqueeze(1)), dim=1)
The whole function should look like this now:
def sample_sequence(
    model,
    length,
    context,
    num_samples=1,
    temperature=1,
    top_k=0,
    top_p=0.9,
    repetition_penalty=1.0,
    device="cpu",
):
    context = torch.tensor(context, dtype=torch.long, device=device)
    context = context.unsqueeze(0).repeat(num_samples, 1)
    generated = context
    with torch.no_grad():
        for _ in trange(length):
            inputs = {"input_ids": generated}
            outputs = model(
                **inputs
            )  # Note: we could also use 'past' with GPT-2/Transfo-XL/XLNet/CTRL (cached hidden-states)
            next_token_logits = outputs[0][0, -1, :] / (temperature if temperature > 0 else 1.0)
            # repetition penalty from CTRL (https://arxiv.org/abs/1909.05858)
            for _ in set(generated.view(-1).tolist()):
                next_token_logits[_] /= repetition_penalty
            filtered_logits = top_k_top_p_filtering(next_token_logits, top_k=top_k, top_p=top_p)
            if temperature == 0:  # greedy sampling:
                next_token = torch.argmax(filtered_logits).unsqueeze(0)
            else:
                next_token = torch.multinomial(F.softmax(filtered_logits, dim=-1), num_samples=num_samples)
            generated = torch.cat((generated, next_token.unsqueeze(1)), dim=1)
    return generated
Last but not least, you have to change your tokenizer.decode call to tokenizer.batch_decode, as the return value now contains multiple samples:
tokenizer.batch_decode(output.tolist(), clean_up_tokenization_spaces=True, skip_special_tokens=True)
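Putting it together, the call site could look roughly like this (just a sketch: model, tokenizer, and context are assumed to exist already, and stripping the prompt tokens before decoding is optional):
output = sample_sequence(model, length=50, context=context, num_samples=5)
# decode only the generated continuations, one string per sample
texts = tokenizer.batch_decode(output[:, len(context):].tolist(),
                               clean_up_tokenization_spaces=True,
                               skip_special_tokens=True)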
Something you have to think about yourself is what you want to do when there is no valid next_token. Currently you will receive an error message like:
RuntimeError: invalid multinomial distribution (with replacement=False, not enough non-negative category to sample)
Another thing you have to consider is whether their code is even correct. During the few tests I conducted, it felt like the quality of the generated sentences decreased with an increasing num_samples (i.e. maybe the quality is better when you use a simple loop to call sample_sequence multiple times?). I haven't worked with GPT-2 much yet and can't help you further here.
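If you want to try that loop-based alternative, a rough sketch (again assuming model, tokenizer, and context exist) could be:
samples = []
for _ in range(5):
    out = sample_sequence(model, length=50, context=context, num_samples=1)
    # decode the single generated continuation of this run
    samples.append(tokenizer.decode(out[0, len(context):].tolist(),
                                    clean_up_tokenization_spaces=True,
                                    skip_special_tokens=True))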
I am trying to predict the outcome of soccer games based on the number of goals scored, and I use the following model:
with pm.Model() as model:
    # global model parameters
    h = pm.Normal('h', mu=mu, tau=tau)
    sd_a = pm.Gamma('sd_a', .1, .1)
    sd_d = pm.Gamma('sd_d', .1, .1)
    alpha = pm.Normal('alpha', mu=mu, tau=tau)

    # team-specific model parameters
    a_s = pm.Normal("a_s", mu=0, sd=sd_a, shape=n)
    d_s = pm.Normal("d_s", mu=0, sd=sd_d, shape=n)
    atts = pm.Deterministic('atts', a_s - tt.mean(a_s))
    defs = pm.Deterministic('defs', d_s - tt.mean(d_s))
    h_theta = tt.exp(alpha + h + atts[h_t] + defs[a_t])
    a_theta = tt.exp(alpha + atts[a_t] + defs[h_t])

    # likelihood of observed data
    h_goals = pm.Poisson('h_goals', mu=h_theta, observed=observed_h_goals)
    a_goals = pm.Poisson('a_goals', mu=a_theta, observed=observed_a_goals)
When I sample the model, the trace plots look fine.
Afterwards, when I want to calculate the WAIC:
waic = pm.waic(trace, model)
I get the following error:
----> 1 waic = pm.waic(trace, model)

~\Anaconda3\envs\env\lib\site-packages\pymc3\stats\__init__.py in wrapped(*args, **kwargs)
     22             )
     23             kwargs[new] = kwargs.pop(old)
---> 24         return func(*args, **kwargs)
     25
     26     return wrapped

~\Anaconda3\envs\env\lib\site-packages\arviz\stats\stats.py in waic(data, pointwise, scale)
   1176     """
   1177     inference_data = convert_to_inference_data(data)
-> 1178     log_likelihood = _get_log_likelihood(inference_data)
   1179     scale = rcParams["stats.ic_scale"] if scale is None else scale.lower()
   1180

~\Anaconda3\envs\env\lib\site-packages\arviz\stats\stats_utils.py in get_log_likelihood(idata, var_name)
    403         var_names.remove("lp")
    404     if len(var_names) > 1:
--> 405         raise TypeError(
    406             "Found several log likelihood arrays {}, var_name cannot be None".format(var_names)
    407         )

TypeError: Found several log likelihood arrays ['h_goals', 'a_goals'], var_name cannot be None
Is there any way to calculate the WAIC and compare models when I have two likelihood functions in pymc3? (1: the goals scored by the home team, 2: the goals scored by the away team)
It is possible, but it requires defining what you are interested in predicting: it can be the result of the match, or it could be the number of goals scored by either team (not the aggregate; each match would then provide two results to predict).
A complete and detailed answer is available at PyMC discourse.
Here I transcribe, as a summary, the case where the quantity of interest is the result of the match. ArviZ will automatically retrieve 2 pointwise log likelihood arrays, which we have to combine somehow (e.g. add, concatenate, groupby...) to get a single array. The tricky part is knowing which operation corresponds to each quantity, which has to be assessed on a per-model basis. In this particular example, the predictive accuracy of a match result can be calculated in the following way:
dims = {
    "home_points": ["match"],
    "away_points": ["match"],
}
idata = az.from_pymc3(trace, dims=dims, model=model)
Setting the match dim is important to tell xarray how to align the pointwise log likelihood arrays, otherwise they would not be broadcasted and aligned in the desired way.
idata.sample_stats["log_likelihood"] = (
    idata.log_likelihood.home_points + idata.log_likelihood.away_points
)
az.waic(idata)
# Output
# Computed from 3000 by 60 log-likelihood matrix
#
#              Estimate       SE
# elpd_waic     -551.28    37.96
# p_waic          46.16        -
#
# There has been a warning during the calculation. Please check the results.
Note that ArviZ>=0.7.0 is required.
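For reference, if the quantity of interest were instead each team's goal count (the other case discussed in the linked answer), the two pointwise arrays would be combined by concatenation rather than addition. A rough sketch, reusing the same idata:
import xarray as xr

# each match now contributes two pointwise log-likelihood values
idata.sample_stats["log_likelihood"] = xr.concat(
    (idata.log_likelihood.home_points, idata.log_likelihood.away_points),
    dim="match",
)
az.waic(idata)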
I am trying to deconvolve complex gas chromatogram signals into individual gaussian signals. Here is an example, where the dotted line represents the signal I am trying to deconvolve.
I was able to write the code to do this using scipy.optimize.curve_fit; however, once applied to real data the results were unreliable. I believe being able to set bounds to my parameters will improve my results, so I am attempting to use lmfit, which allows this. I am having a problem getting lmfit to work with a variable number of parameters. The signals I am working with may have an arbitrary number of underlying gaussian components, so the number of parameters I need will vary. I found some hints here, but still can't figure it out...
Creating a python lmfit Model with arbitrary number of parameters
Here is the code I am currently working with. The code will run, but the parameter estimates do not change when the model is fit. Does anyone know how I can get my model to work?
import numpy as np
from collections import OrderedDict
from scipy.stats import norm
from lmfit import Parameters, Model

def add_peaks(x_range, *pars):
    y = np.zeros(len(x_range))
    for i in np.arange(0, len(pars), 3):
        curve = norm.pdf(x_range, pars[i], pars[i+1]) * pars[i+2]
        y = y + curve
    return(y)

# generate some fake data
x_range = np.linspace(0, 100, 1000)
peaks = [50., 40., 60.]
a = norm.pdf(x_range, peaks[0], 5) * 2
b = norm.pdf(x_range, peaks[1], 1) * 0.1
c = norm.pdf(x_range, peaks[2], 1) * 0.1
fake = a + b + c

param_dict = OrderedDict()
for i in range(0, len(peaks)):
    param_dict['pk' + str(i)] = peaks[i]
    param_dict['wid' + str(i)] = 1.
    param_dict['mult' + str(i)] = 1.

# In case you'd like to see the plot of fake data
#y = add_peaks(x_range, *param_dict.values())
#plt.plot(x_range, y)
#plt.show()

# Initialize the model and fit
pmodel = Model(add_peaks)
params = pmodel.make_params()
for i in param_dict.keys():
    params.add(i, value=param_dict[i])

result = pmodel.fit(fake, params=params, x_range=x_range)
print(result.fit_report())
I think you would be better off using lmfit's ability to build a composite model.
That is, with a single peak defined with
from scipy.stats import norm
def peak(x, amp, center, sigma):
    return amp * norm.pdf(x, center, sigma)
(see also lmfit.models.GaussianModel), you can build a model with many peaks:
npeaks = 3
model = Model(peak, prefix='p1_')
for i in range(1, npeaks):
    model = model + Model(peak, prefix='p%d_' % (i+1))
params = model.make_params()
Now model will be a sum of 3 Gaussian functions, and the params created for that model will have names like p1_amp, p1_center, p2_amp, ..., to which you can add sensible initial values and/or bounds and/or constraints.
Given your example data, you could pass in initial values to make_params like
params = model.make_params(p1_amp=2.0, p1_center=50., p1_sigma=2,
                           p2_amp=0.2, p2_center=40., p2_sigma=2,
                           p3_amp=0.2, p3_center=60., p3_sigma=2)
result = model.fit(fake, params, x=x_range)
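Before that fit call you could also set bounds on the generated parameters in the usual lmfit way (illustrative values; the parameter names follow the peak() signature above):
for prefix in ('p1_', 'p2_', 'p3_'):
    params[prefix + 'amp'].set(min=0)              # keep amplitudes positive
    params[prefix + 'sigma'].set(min=0.1, max=10)  # keep widths in a sensible range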
I was able to find a solution here:
https://lmfit.github.io/lmfit-py/builtin_models.html#example-3-fitting-multiple-peaks-and-using-prefixes
Building on the code above, the following accomplishes what I was trying to do...
from lmfit.models import GaussianModel
import matplotlib.pyplot as plt

gauss1 = GaussianModel(prefix='g1_')
gauss2 = GaussianModel(prefix='g2_')
gauss3 = GaussianModel(prefix='g3_')
gauss4 = GaussianModel(prefix='g4_')
gauss5 = GaussianModel(prefix='g5_')

gauss = [gauss1, gauss2, gauss3, gauss4, gauss5]
prefixes = ['g1_', 'g2_', 'g3_', 'g4_', 'g5_']

mod = np.sum(gauss[0:len(peaks)])
pars = mod.make_params()

for i, prefix in zip(range(0, len(peaks)), prefixes[0:len(peaks)]):
    pars[prefix + 'center'].set(peaks[i])

init = mod.eval(pars, x=x_range)
out = mod.fit(fake, pars, x=x_range)
print(out.fit_report(min_correl=0.5))

out.plot_fit()
plt.show()
I am currently using scikit-learn for text classification on the 20ng dataset. I want to calculate the information gain for a vectorized dataset. It has been suggested to me that this can be accomplished using mutual_info_classif from sklearn. However, this method is really slow, so I was trying to implement information gain myself based on this post.
I came up with the following solution:
from scipy.stats import entropy
import numpy as np

def information_gain(X, y):

    def _entropy(labels):
        counts = np.bincount(labels)
        return entropy(counts, base=None)

    def _ig(x, y):
        # indices where x is set/not set
        x_set = np.nonzero(x)[1]
        x_not_set = np.delete(np.arange(x.shape[1]), x_set)
        h_x_set = _entropy(y[x_set])
        h_x_not_set = _entropy(y[x_not_set])
        return entropy_full - (((len(x_set) / f_size) * h_x_set)
                             + ((len(x_not_set) / f_size) * h_x_not_set))

    entropy_full = _entropy(y)
    f_size = float(X.shape[0])
    scores = np.array([_ig(x, y) for x in X.T])
    return scores
Using a very small dataset, most scores from sklearn and my implementation are equal. However, sklearn seems to take frequencies into account, which my algorithm clearly doesn't. For example
from time import time
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import mutual_info_classif

categories = ['talk.religion.misc', 'comp.graphics', 'sci.space']
newsgroups_train = fetch_20newsgroups(subset='train',
                                      categories=categories)
X, y = newsgroups_train.data, newsgroups_train.target

cv = CountVectorizer(max_df=0.95, min_df=2,
                     max_features=100,
                     stop_words='english')
X_vec = cv.fit_transform(X)

t0 = time()
res_sk = mutual_info_classif(X_vec, y, discrete_features=True)
print("Time passed for sklearn method: %3f" % (time()-t0))
t0 = time()
res_ig = information_gain(X_vec, y)
print("Time passed for ig: %3f" % (time()-t0))

for name, res_mi, res_ig in zip(cv.get_feature_names(), res_sk, res_ig):
    print("%s: mi=%f, ig=%f" % (name, res_mi, res_ig))
sample output:
center: mi=0.011824, ig=0.003548
christian: mi=0.128629, ig=0.127122
color: mi=0.028413, ig=0.026397
com: mi=0.041184, ig=0.030458
computer: mi=0.020590, ig=0.012327
cs: mi=0.007291, ig=0.001574
data: mi=0.020734, ig=0.008986
did: mi=0.035613, ig=0.024604
different: mi=0.011432, ig=0.005492
distribution: mi=0.007175, ig=0.004675
does: mi=0.019564, ig=0.006162
don: mi=0.024000, ig=0.017605
earth: mi=0.039409, ig=0.032981
edu: mi=0.023659, ig=0.008442
file: mi=0.048056, ig=0.045746
files: mi=0.041367, ig=0.037860
ftp: mi=0.031302, ig=0.026949
gif: mi=0.028128, ig=0.023744
god: mi=0.122525, ig=0.113637
good: mi=0.016181, ig=0.008511
gov: mi=0.053547, ig=0.048207
So I was wondering whether my implementation is wrong, or whether it is correct but scikit-learn uses a different variation of the mutual information algorithm.
A little late with my answer, but you should look at Orange's implementation. Within their app it is used as a behind-the-scenes processor to help inform the dynamic model parameter building process.
The implementation itself looks fairly straightforward and could most likely be ported out. The entropy calculation comes first.
The sections starting at https://github.com/biolab/orange3/blob/master/Orange/preprocess/score.py#L233
def _entropy(dist):
    """Entropy of class-distribution matrix"""
    p = dist / np.sum(dist, axis=0)
    pc = np.clip(p, 1e-15, 1)
    return np.sum(np.sum(- p * np.log2(pc), axis=0) * np.sum(dist, axis=0) / np.sum(dist))
Then the second portion.
https://github.com/biolab/orange3/blob/master/Orange/preprocess/score.py#L305
class GainRatio(ClassificationScorer):
    """
    Information gain ratio is the ratio between information gain and
    the entropy of the feature's value distribution. The score was
    introduced in [Quinlan1986]_ to alleviate overestimation for
    multi-valued features. See `Wikipedia entry on gain ratio
    <http://en.wikipedia.org/wiki/Information_gain_ratio>`_.

    .. [Quinlan1986] J R Quinlan: Induction of Decision Trees, Machine Learning, 1986.
    """
    def from_contingency(self, cont, nan_adjustment):
        h_class = _entropy(np.sum(cont, axis=1))
        h_residual = _entropy(np.compress(np.sum(cont, axis=0), cont, axis=1))
        h_attribute = _entropy(np.sum(cont, axis=0))
        if h_attribute == 0:
            h_attribute = 1
        return nan_adjustment * (h_class - h_residual) / h_attribute
The actual scoring process happens at https://github.com/biolab/orange3/blob/master/Orange/preprocess/score.py#L218
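As a rough, self-contained sketch of what such a port might look like (my paraphrase of the code above, not Orange's public API; the nan_adjustment factor is dropped), applied to a plain class-by-feature-value contingency matrix:
import numpy as np

def _entropy(dist):
    """Entropy of a class-distribution matrix, as in Orange's score.py."""
    p = dist / np.sum(dist, axis=0)
    pc = np.clip(p, 1e-15, 1)
    return np.sum(np.sum(-p * np.log2(pc), axis=0) * np.sum(dist, axis=0) / np.sum(dist))

def gain_ratio(cont):
    """cont: contingency matrix of shape (n_classes, n_feature_values)."""
    h_class = _entropy(np.sum(cont, axis=1))                                # H(class)
    h_residual = _entropy(np.compress(np.sum(cont, axis=0), cont, axis=1))  # H(class | feature)
    h_attribute = _entropy(np.sum(cont, axis=0))                            # H(feature)
    if h_attribute == 0:
        h_attribute = 1
    return (h_class - h_residual) / h_attribute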
Recently I've checked the RANSAC implementation from the Cookbook: http://wiki.scipy.org/Cookbook/RANSAC , but it doesn't seem to be consistent with the RANSAC algorithm itself.
Looking at the plot there, how can it be that some of the data points, which are quite far from the best model (see the points at the bottom), are considered "RANSAC data", while some other points that are closer to the model are not?
From my point of view, it contradicts the main idea of the RANSAC algorithm, where all points inside the pre-defined threshold area are considered inliers.
Why is it not so in this implementation and are there any other RANSAC implementations in Python?
Thanks for your help!
Cheers,
Alexey
No, it does not contradict the idea of RANSAC. The plot is a little misleading, though.
What is plotted as blue crosses are the sample points (best_inlier_idxs = maybeinliers + alsoinliers), of which some (exactly the points in alsoinliers, i.e. the consensus set) support the model (maybemodel) that was fitted to a random sample of the data (maybeinliers). This means all points given by alsoinliers should indeed be closer to maybemodel than the closest point not in support of it.
However, the maybemodel fit is not shown in the plot. What is shown is the bettermodel (blue line, "RANSAC fit") which is obtained by fitting model parameters to all points in best_inlier_idxs (not just the ones in maybeinliers).
Furthermore, best_inlier_idxs contains both alsoinliers and maybeinliers. There may well be points in the randomly chosen sample maybeinliers that do in fact not support the maybemodel fit (i.e. they are not within a threshold away). These points are also shown as blue crosses even though they are farther away than other points not in the supporting set.
I modified the plotting a little to also indicate the best proposed model (maybemodel) and the random sample (maybeinliers) within the "RANSAC data". The important thing is the circles around some of the crosses, which highlight the fact that the random sample is contained in the RANSAC data.
Here's the code for the modified plotting:
iterations = 0
bestfit = None
besterr = numpy.inf
best_inlier_idxs = None
while iterations < k:
    maybe_idxs, test_idxs = random_partition(n, data.shape[0])
    maybeinliers = data[maybe_idxs,:]
    test_points = data[test_idxs]
    maybemodel = model.fit(maybeinliers)
    test_err = model.get_error( test_points, maybemodel)
    also_idxs = test_idxs[test_err < t] # select indices of rows with accepted points
    alsoinliers = data[also_idxs,:]
    if debug:
        print 'test_err.min()',test_err.min()
        print 'test_err.max()',test_err.max()
        print 'numpy.mean(test_err)',numpy.mean(test_err)
        print 'iteration %d:len(alsoinliers) = %d'%(
            iterations,len(alsoinliers))
    if len(alsoinliers) > d:
        betterdata = numpy.concatenate( (maybeinliers, alsoinliers) )
        bettermodel = model.fit(betterdata)
        better_errs = model.get_error( betterdata, bettermodel)
        thiserr = numpy.mean( better_errs )
        if thiserr < besterr:
            bestfit = bettermodel
            besterr = thiserr
            best_inlier_idxs = numpy.concatenate( (maybe_idxs, also_idxs) )
            best_maybe_model = maybemodel
            best_random_set = maybe_idxs
    iterations += 1
if bestfit is None:
    raise ValueError("did not meet fit acceptance criteria")
if return_all:
    return bestfit, {'inliers':best_inlier_idxs, 'best_random_set':best_random_set, 'best_maybe_model':best_maybe_model}
else:
    return bestfit
def test():
    # generate perfect input data
    n_samples = 500
    n_inputs = 1
    n_outputs = 1
    A_exact = 20*numpy.random.random((n_samples,n_inputs) )
    perfect_fit = 60*numpy.random.normal(size=(n_inputs,n_outputs) ) # the model
    B_exact = scipy.dot(A_exact,perfect_fit)
    assert B_exact.shape == (n_samples,n_outputs)

    # add a little gaussian noise (linear least squares alone should handle this well)
    A_noisy = A_exact + numpy.random.normal(size=A_exact.shape )
    B_noisy = B_exact + numpy.random.normal(size=B_exact.shape )

    if 1:
        # add some outliers
        n_outliers = 100
        all_idxs = numpy.arange( A_noisy.shape[0] )
        numpy.random.shuffle(all_idxs)
        outlier_idxs = all_idxs[:n_outliers]
        non_outlier_idxs = all_idxs[n_outliers:]
        A_noisy[outlier_idxs] = 20*numpy.random.random((n_outliers,n_inputs) )
        B_noisy[outlier_idxs] = 50*numpy.random.normal(size=(n_outliers,n_outputs) )

    # setup model
    all_data = numpy.hstack( (A_noisy,B_noisy) )
    input_columns = range(n_inputs) # the first columns of the array
    output_columns = [n_inputs+i for i in range(n_outputs)] # the last columns of the array
    debug = True
    model = LinearLeastSquaresModel(input_columns,output_columns,debug=debug)

    linear_fit,resids,rank,s = scipy.linalg.lstsq(all_data[:,input_columns],
                                                  all_data[:,output_columns])

    # run RANSAC algorithm
    ransac_fit, ransac_data = ransac(all_data,model,
                                     50, 1000, 7e3, 300, # misc. parameters
                                     debug=debug,return_all=True)
    if 1:
        import pylab

        sort_idxs = numpy.argsort(A_exact[:,0])
        A_col0_sorted = A_exact[sort_idxs] # maintain as rank-2 array

        if 1:
            pylab.plot( A_noisy[:,0], B_noisy[:,0], 'k.', label='data' )
            pylab.plot( A_noisy[ransac_data['inliers'],0], B_noisy[ransac_data['inliers'],0], 'bx', label='RANSAC data' )
            pylab.plot( A_noisy[ransac_data['best_random_set'],0], B_noisy[ransac_data['best_random_set'],0], 'ro', mfc='none', label='best random set (maybeinliers)' )
        else:
            pylab.plot( A_noisy[non_outlier_idxs,0], B_noisy[non_outlier_idxs,0], 'k.', label='noisy data' )
            pylab.plot( A_noisy[outlier_idxs,0], B_noisy[outlier_idxs,0], 'r.', label='outlier data' )
        pylab.plot( A_col0_sorted[:,0],
                    numpy.dot(A_col0_sorted,ransac_fit)[:,0],
                    label='RANSAC fit' )
        pylab.plot( A_col0_sorted[:,0],
                    numpy.dot(A_col0_sorted,perfect_fit)[:,0],
                    label='exact system' )
        pylab.plot( A_col0_sorted[:,0],
                    numpy.dot(A_col0_sorted,linear_fit)[:,0],
                    label='linear fit' )
        pylab.plot( A_col0_sorted[:,0],
                    numpy.dot(A_col0_sorted,ransac_data['best_maybe_model'])[:,0],
                    label='best proposed model (maybemodel)' )
        pylab.legend()
        pylab.show()

if __name__=='__main__':
    test()