I was doing some regression experiments using pandas and numpy. The package (PySAL's GWR, going by the traceback below) only supports numpy arrays.
I have two kinds of data in the dataset: continuous and categorical.
For the independent variables, taken from a pandas DataFrame named re:
What I do to the continuous ones:
built_year=re.built_year.values.reshape((-1,1))
>>> built_year.shape
(4508, 1)
What I do to the categorical ones:
condition=pd.factorize(re.condition.values)[0].reshape((-1,1))
>>> condition.shape
(4508, 1)
For the dependent variable:
Y = re.price.values.reshape((-1,1))
Then I generate the 'equation':
X = np.hstack([condition, built_year])
and put Y and X into the package's method as parameters.
When I include only continuous variables, everything is fine. As soon as I include a categorical variable, it raises this error:
~/miniconda3/lib/python3.6/site-packages/PySAL-1.14.3-py3.6.egg/pysal/contrib/gwr/sel_bw.py in search(self, search, criterion, bw_min, bw_max, interval, tol, max_iter)
202 self.int_score = int_score
203
--> 204 self._bw()
205
206 return self.bw[0]
~/miniconda3/lib/python3.6/site-packages/PySAL-1.14.3-py3.6.egg/pysal/contrib/gwr/sel_bw.py in _bw(self)
215 delta = 0.38197 #1 - (np.sqrt(5.0)-1.0)/2.0
216 self.bw = golden_section(a, c, delta, gwr_func, self.tol,
--> 217 self.max_iter, self.int_score)
218 elif self.search == 'interval':
219 self.bw = equal_interval(self.bw_min, self.bw_max, self.interval,
~/miniconda3/lib/python3.6/site-packages/PySAL-1.14.3-py3.6.egg/pysal/contrib/gwr/search.py in golden_section(a, c, delta, function, tol, max_iter, int_score)
51 d = np.round(d)
52
---> 53 score_a = function(a)
54 score_b = function(b)
55 score_c = function(c)
~/miniconda3/lib/python3.6/site-packages/PySAL-1.14.3-py3.6.egg/pysal/contrib/gwr/sel_bw.py in <lambda>(bw)
209 gwr_func = lambda bw: getDiag[self.criterion](
210 GWR(self.coords, self.y, self.X_loc, bw, family=self.family,
--> 211 kernel=self.kernel, fixed=self.fixed, constant=self.constant).fit())
212 if self.search == 'golden_section':
213 a,c = self._init_section(self.X_glob, self.X_loc, self.coords,
~/miniconda3/lib/python3.6/site-packages/PySAL-1.14.3-py3.6.egg/pysal/contrib/gwr/gwr.py in fit(self, ini_params, tol, max_iter, solve)
259 wi = self.W[i].reshape((-1,1))
260 rslt = iwls(self.y, self.X, self.family, self.offset, None,
--> 261 ini_params, tol, max_iter, wi=wi)
262 params[i,:] = rslt[0].T
263 predy[i] = rslt[1][i]
~/miniconda3/lib/python3.6/site-packages/PySAL-1.14.3-py3.6.egg/pysal/contrib/glm/iwls.py in iwls(y, x, family, offset, y_fix, ini_betas, tol, max_iter, wi)
74 n_betas = _compute_betas(wz, wx)
75 else:
---> 76 n_betas, xtx_inv_xt = _compute_betas_gwr(wz, wx, wi)
77 v = spdot(x, n_betas)
78 mu = family.fitted(v)
~/miniconda3/lib/python3.6/site-packages/PySAL-1.14.3-py3.6.egg/pysal/contrib/glm/iwls.py in _compute_betas_gwr(y, x, wi)
32 xT = (x * wi).T
33 xtx = np.dot(xT, x)
---> 34 xtx_inv = la.inv(xtx)
35 xtx_inv_xt = np.dot(xtx_inv, xT)
36 betas = np.dot(xtx_inv_xt, y)
~/miniconda3/lib/python3.6/site-packages/numpy/linalg/linalg.py in inv(a)
511 signature = 'D->D' if isComplexType(t) else 'd->d'
512 extobj = get_linalg_error_extobj(_raise_linalgerror_singular)
--> 513 ainv = _umath_linalg.inv(a, signature=signature, extobj=extobj)
514 return wrap(ainv.astype(result_t, copy=False))
515
~/miniconda3/lib/python3.6/site-packages/numpy/linalg/linalg.py in _raise_linalgerror_singular(err, flag)
88
89 def _raise_linalgerror_singular(err, flag):
---> 90 raise LinAlgError("Singular matrix")
91
92 def _raise_linalgerror_nonposdef(err, flag):
LinAlgError: Singular matrix
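For anyone hitting the same thing: LinAlgError: Singular matrix means the weighted design matrix xtx in the last frame is not invertible, typically because columns are collinear or a column is (locally) constant. Below is a minimal diagnostic sketch reusing the condition and built_year arrays from the question; the get_dummies alternative is my assumption, not something the asker tried.
import numpy as np
import pandas as pd

# Sketch: check the global design matrix for rank deficiency, which is what
# np.linalg.inv chokes on inside the GWR fit.
X = np.hstack([condition, built_year])
Xc = np.hstack([np.ones((X.shape[0], 1)), X])  # the fit adds a constant column
print(np.linalg.matrix_rank(Xc.T @ Xc), Xc.shape[1])  # rank < columns => singular

# Hypothetical alternative encoding: one-hot dummies with a dropped reference
# level, which avoids perfect collinearity with the constant term.
condition_dummies = pd.get_dummies(re.condition, drop_first=True).values
Even if the global matrix is full rank, a narrow kernel bandwidth can make a categorical column constant within a local window, which produces the same error during the bandwidth search.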
I have trouble using OneVsRestClassifier and cross-validation from sklearn.
train, test = train_test_split(tickets, random_state=42, test_size=0.30, shuffle=True)
X_train = train[['TK_Poids_brut', 'TK_Poids_tare']]
y_train = train['TK_Qualite']
X_test = test[['TK_Poids_brut', 'TK_Poids_tare']]
y_test = test['TK_Qualite']
le = preprocessing.LabelEncoder()
y_train_tra = le.fit_transform(y_train)
printDataInfo(X_train,y_train_tra)
#The printDataInfo function is there just to display information about X and y
clf_OvR_SVC = OneVsRestClassifier(LinearSVC(random_state=0))
cross_v = cross_validate(clf_OvR_SVC, X_train, y_train_tra, error_score="raise",scoring=dict(ac=make_scorer(accuracy_score), roc=make_scorer(roc_auc_score, multi_class="ovr")), cv=5)
cross_v
When I do this I get the following error:
---------------------------------------------------------------------------
AxisError Traceback (most recent call last)
C:\TEMP/ipykernel_20332/2926737612.py in <module>
23
24 clf_OvR_SVC = OneVsRestClassifier(LinearSVC(random_state=0))
---> 25 cross_v = cross_validate(clf_OvR_SVC, X_train, y_train_tra ,error_score="raise",scoring=dict(ac=make_scorer(accuracy_score), roc=make_scorer(roc_auc_score, multi_class="ovr")), cv=5)
26 cross_v
~\Anaconda3\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
61 extra_args = len(args) - len(all_args)
62 if extra_args <= 0:
---> 63 return f(*args, **kwargs)
64
65 # extra_args > 0
~\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in cross_validate(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch, return_train_score, return_estimator, error_score)
248 parallel = Parallel(n_jobs=n_jobs, verbose=verbose,
249 pre_dispatch=pre_dispatch)
--> 250 results = parallel(
251 delayed(_fit_and_score)(
252 clone(estimator), X, y, scorers, train, test, verbose, None,
~\Anaconda3\lib\site-packages\joblib\parallel.py in __call__(self, iterable)
1041 # remaining jobs.
1042 self._iterating = False
-> 1043 if self.dispatch_one_batch(iterator):
1044 self._iterating = self._original_iterator is not None
1045
~\Anaconda3\lib\site-packages\joblib\parallel.py in dispatch_one_batch(self, iterator)
859 return False
860 else:
--> 861 self._dispatch(tasks)
862 return True
863
~\Anaconda3\lib\site-packages\joblib\parallel.py in _dispatch(self, batch)
777 with self._lock:
778 job_idx = len(self._jobs)
--> 779 job = self._backend.apply_async(batch, callback=cb)
780 # A job can complete so quickly than its callback is
781 # called before we get here, causing self._jobs to
~\Anaconda3\lib\site-packages\joblib\_parallel_backends.py in apply_async(self, func, callback)
206 def apply_async(self, func, callback=None):
207 """Schedule a func to be run"""
--> 208 result = ImmediateResult(func)
209 if callback:
210 callback(result)
~\Anaconda3\lib\site-packages\joblib\_parallel_backends.py in __init__(self, batch)
570 # Don't delay the application, to avoid keeping the input
571 # arguments in memory
--> 572 self.results = batch()
573
574 def get(self):
~\Anaconda3\lib\site-packages\joblib\parallel.py in __call__(self)
260 # change the default number of processes to -1
261 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 262 return [func(*args, **kwargs)
263 for func, args, kwargs in self.items]
264
~\Anaconda3\lib\site-packages\joblib\parallel.py in <listcomp>(.0)
260 # change the default number of processes to -1
261 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 262 return [func(*args, **kwargs)
263 for func, args, kwargs in self.items]
264
~\Anaconda3\lib\site-packages\sklearn\utils\fixes.py in __call__(self, *args, **kwargs)
220 def __call__(self, *args, **kwargs):
221 with config_context(**self.config):
--> 222 return self.function(*args, **kwargs)
~\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, split_progress, candidate_progress, error_score)
623
624 fit_time = time.time() - start_time
--> 625 test_scores = _score(estimator, X_test, y_test, scorer, error_score)
626 score_time = time.time() - start_time - fit_time
627 if return_train_score:
~\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _score(estimator, X_test, y_test, scorer, error_score)
685 scores = scorer(estimator, X_test)
686 else:
--> 687 scores = scorer(estimator, X_test, y_test)
688 except Exception:
689 if error_score == 'raise':
~\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py in __call__(self, estimator, *args, **kwargs)
85 for name, scorer in self._scorers.items():
86 if isinstance(scorer, _BaseScorer):
---> 87 score = scorer._score(cached_call, estimator,
88 *args, **kwargs)
89 else:
~\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py in _score(self, method_caller, estimator, X, y_true, sample_weight)
240 **self._kwargs)
241 else:
--> 242 return self._sign * self._score_func(y_true, y_pred,
243 **self._kwargs)
244
~\Anaconda3\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
61 extra_args = len(args) - len(all_args)
62 if extra_args <= 0:
---> 63 return f(*args, **kwargs)
64
65 # extra_args > 0
~\Anaconda3\lib\site-packages\sklearn\metrics\_ranking.py in roc_auc_score(y_true, y_score, average, sample_weight, max_fpr, multi_class, labels)
535 if multi_class == 'raise':
536 raise ValueError("multi_class must be in ('ovo', 'ovr')")
--> 537 return _multiclass_roc_auc_score(y_true, y_score, labels,
538 multi_class, average, sample_weight)
539 elif y_type == "binary":
~\Anaconda3\lib\site-packages\sklearn\metrics\_ranking.py in _multiclass_roc_auc_score(y_true, y_score, labels, multi_class, average, sample_weight)
593 """
594 # validation of the input y_score
--> 595 if not np.allclose(1, y_score.sum(axis=1)):
596 raise ValueError(
597 "Target scores need to be probabilities for multiclass "
~\Anaconda3\lib\site-packages\numpy\core\_methods.py in _sum(a, axis, dtype, out, keepdims, initial, where)
45 def _sum(a, axis=None, dtype=None, out=None, keepdims=False,
46 initial=_NoValue, where=True):
---> 47 return umr_sum(a, axis, dtype, out, keepdims, initial, where)
48
49 def _prod(a, axis=None, dtype=None, out=None, keepdims=False,
AxisError: axis 1 is out of bounds for array of dimension 1
Here is the input data format:
I already tried converting both to numpy arrays, and I tried reshaping y to (6108, 1), but I always get the same error.
type :
x: <class 'pandas.core.frame.DataFrame'>
y: <class 'numpy.ndarray'>
shape :
X: (6108, 2)
y: (6108,)
data :
x: TK_Poids_brut TK_Poids_tare
8436 14420 14160
7014 17160 12320
3931 28060 15040
6749 16680 14360
2984 10060 9100
... ... ...
5734 19700 15420
5191 25380 14620
5390 19460 14760
860 16160 14100
7270 15520 14500
[6108 rows x 2 columns]
y: [132 85 160 118 118 40 88 126 12 40 41 138 5 125 125 147 111 118
153 40 118 126 118 125 123 62 177 45 118 105 3 1 105 142 116 100
118 125 118 78 124 3 126 53 138 118 40 118 53 124 126 98 118 155
118 131 5 135 130 3 118 105 118 126 105 87 118 118 24 124 130 130
...
118 124 118 180 118 58 124 126 153 126 124 118 125 153 86 94 126 118
130 105 42 62 124 78]
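For what it's worth, the AxisError comes from the last frame of the traceback: roc_auc_score with multi_class="ovr" expects an (n_samples, n_classes) array of class probabilities, but make_scorer by default hands it the 1-D label output of predict(), so y_score.sum(axis=1) fails. A sketch of one possible fix, reusing the variables from the question; wrapping LinearSVC in CalibratedClassifierCV (to obtain predict_proba, which LinearSVC lacks) is my assumption, not part of the original code.
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import accuracy_score, make_scorer, roc_auc_score
from sklearn.model_selection import cross_validate
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC

# LinearSVC has no predict_proba, so calibrate it to get probabilities.
clf_OvR_SVC = OneVsRestClassifier(CalibratedClassifierCV(LinearSVC(random_state=0)))
scoring = dict(
    ac=make_scorer(accuracy_score),
    # needs_proba=True makes the scorer pass roc_auc_score the 2-D
    # predict_proba output instead of 1-D predicted labels.
    roc=make_scorer(roc_auc_score, multi_class="ovr", needs_proba=True),
)
cross_v = cross_validate(clf_OvR_SVC, X_train, y_train_tra, scoring=scoring,
                         cv=5, error_score="raise")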
The hue feature is not working when I use pairplot.
Here is my data frame:
Here is the code that doesn't work:
sns.pairplot(activities, hue="Day")
If I remove the hue option it works. Also if I change the hue to a numerical column (such as Distance) it works, but it is not working with the Day column for some reason. Here's the error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
/tmp/ipykernel_121/1783531066.py in <module>
----> 1 sns.pairplot(activities, hue="Day")
/opt/conda/lib/python3.7/site-packages/seaborn/_decorators.py in inner_f(*args, **kwargs)
44 )
45 kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
---> 46 return f(**kwargs)
47 return inner_f
48
/opt/conda/lib/python3.7/site-packages/seaborn/axisgrid.py in pairplot(data, hue, hue_order, palette, vars, x_vars, y_vars, kind, diag_kind, markers, height, aspect, corner, dropna, plot_kws, diag_kws, grid_kws, size)
2020 elif diag_kind == "kde":
2021 diag_kws.setdefault("fill", True)
-> 2022 grid.map_diag(kdeplot, **diag_kws)
2023
2024 # Maybe plot on the off-diagonals
/opt/conda/lib/python3.7/site-packages/seaborn/axisgrid.py in map_diag(self, func, **kwargs)
1400 plot_kwargs.setdefault("hue_order", self._hue_order)
1401 plot_kwargs.setdefault("palette", self._orig_palette)
-> 1402 func(x=vector, **plot_kwargs)
1403 self._clean_axis(ax)
1404
/opt/conda/lib/python3.7/site-packages/seaborn/_decorators.py in inner_f(*args, **kwargs)
44 )
45 kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
---> 46 return f(**kwargs)
47 return inner_f
48
/opt/conda/lib/python3.7/site-packages/seaborn/distributions.py in kdeplot(x, y, shade, vertical, kernel, bw, gridsize, cut, clip, legend, cumulative, shade_lowest, cbar, cbar_ax, cbar_kws, ax, weights, hue, palette, hue_order, hue_norm, multiple, common_norm, common_grid, levels, thresh, bw_method, bw_adjust, log_scale, color, fill, data, data2, **kwargs)
1733 legend=legend,
1734 estimate_kws=estimate_kws,
-> 1735 **plot_kws,
1736 )
1737
/opt/conda/lib/python3.7/site-packages/seaborn/distributions.py in plot_univariate_density(self, multiple, common_norm, common_grid, fill, legend, estimate_kws, **plot_kws)
914 common_grid,
915 estimate_kws,
--> 916 log_scale,
917 )
918
/opt/conda/lib/python3.7/site-packages/seaborn/distributions.py in _compute_univariate_density(self, data_variable, common_norm, common_grid, estimate_kws, log_scale)
314
315 # Estimate the density of observations at this level
--> 316 density, support = estimator(observations, weights=weights)
317
318 if log_scale:
/opt/conda/lib/python3.7/site-packages/seaborn/_statistics.py in __call__(self, x1, x2, weights)
185 """Fit and evaluate on univariate or bivariate data."""
186 if x2 is None:
--> 187 return self._eval_univariate(x1, weights)
188 else:
189 return self._eval_bivariate(x1, x2, weights)
/opt/conda/lib/python3.7/site-packages/seaborn/_statistics.py in _eval_univariate(self, x, weights)
144 support = self.support
145 if support is None:
--> 146 support = self.define_support(x, cache=False)
147
148 kde = self._fit(x, weights)
/opt/conda/lib/python3.7/site-packages/seaborn/_statistics.py in define_support(self, x1, x2, weights, cache)
117 """Create the evaluation grid for a given data set."""
118 if x2 is None:
--> 119 support = self._define_support_univariate(x1, weights)
120 else:
121 support = self._define_support_bivariate(x1, x2, weights)
/opt/conda/lib/python3.7/site-packages/seaborn/_statistics.py in _define_support_univariate(self, x, weights)
89 def _define_support_univariate(self, x, weights):
90 """Create a 1D grid of evaluation points."""
---> 91 kde = self._fit(x, weights)
92 bw = np.sqrt(kde.covariance.squeeze())
93 grid = self._define_support_grid(
/opt/conda/lib/python3.7/site-packages/seaborn/_statistics.py in _fit(self, fit_data, weights)
135 fit_kws["weights"] = weights
136
--> 137 kde = stats.gaussian_kde(fit_data, **fit_kws)
138 kde.set_bandwidth(kde.factor * self.bw_adjust)
139
/opt/conda/lib/python3.7/site-packages/scipy/stats/kde.py in __init__(self, dataset, bw_method, weights)
204 self._neff = 1/sum(self._weights**2)
205
--> 206 self.set_bandwidth(bw_method=bw_method)
207
208 def evaluate(self, points):
/opt/conda/lib/python3.7/site-packages/scipy/stats/kde.py in set_bandwidth(self, bw_method)
552 raise ValueError(msg)
553
--> 554 self._compute_covariance()
555
556 def _compute_covariance(self):
/opt/conda/lib/python3.7/site-packages/scipy/stats/kde.py in _compute_covariance(self)
564 bias=False,
565 aweights=self.weights))
--> 566 self._data_inv_cov = linalg.inv(self._data_covariance)
567
568 self.covariance = self._data_covariance * self.factor**2
/opt/conda/lib/python3.7/site-packages/scipy/linalg/basic.py in inv(a, overwrite_a, check_finite)
937
938 """
--> 939 a1 = _asarray_validated(a, check_finite=check_finite)
940 if len(a1.shape) != 2 or a1.shape[0] != a1.shape[1]:
941 raise ValueError('expected square matrix')
/opt/conda/lib/python3.7/site-packages/scipy/_lib/_util.py in _asarray_validated(a, check_finite, sparse_ok, objects_ok, mask_ok, as_inexact)
294 if not objects_ok:
295 if a.dtype is np.dtype('O'):
--> 296 raise ValueError('object arrays are not supported')
297 if as_inexact:
298 if not np.issubdtype(a.dtype, np.inexact):
ValueError: object arrays are not supported
Any ideas why hue isn't working?
You can see from the error:
ValueError: object arrays are not supported
that the variable being plotted needs to be numeric.
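A quick way to confirm this, as a sketch assuming activities is the DataFrame from the question: inspect the dtypes and coerce any object-typed columns that should be numeric before calling pairplot.
import pandas as pd
import seaborn as sns

print(activities.dtypes)  # a numeric-looking column with dtype 'object' will break the KDE

# Hypothetical fix: coerce everything except the hue column to numeric
# (unparseable values become NaN, which we then drop).
numeric_cols = activities.columns.drop("Day")
activities[numeric_cols] = activities[numeric_cols].apply(pd.to_numeric, errors="coerce")
sns.pairplot(activities.dropna(), hue="Day")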
I have a program which iterates over a set of images with a black background, each containing 7 objects of different colours.
I need to iterate over each pixel to find the middle x and y point of each object, using the lower and upper boundaries of each coordinate, and then use those values to compute some distances.
The images are 640x480 and each one takes about 2 seconds, which seems like a lot. The code showing how I iterate through the pixels is below.
def isBlack(r, g, b): return r == 0 and g == 0 and b == 0
def isRed(r, g, b): return r > 0 and g == 0 and b == 0
def isYellow(r, g, b): return r > 0 and g > 0 and b == 0 and r == g
def isOrange(r, g, b): return r > 0 and g > 0 and b == 0 and r != g
def isBlue(r, g, b): return r == 0 and g == 0 and b > 0
def isCyan(r, g, b): return r == 0 and g > 0 and b > 0
def isGreen(r, g, b): return r == 0 and g > 0 and b == 0
def isWhite(r, g, b): return r == g == b and r != 0
def getAbsoluteValues(im, side, frame):
    ny, nx, nc = im.shape
    # One (lo, hi) pair per colour; lo starts past the right edge, hi at -1.
    array_of_maxes_x = np.empty((14,))
    array_of_maxes_x[::2] = nx + 1
    array_of_maxes_x[1::2] = -1
    array_of_maxes_x = array_of_maxes_x.reshape(7, 2)
    array_of_maxes_y = array_of_maxes_x.copy()
    for idx_y, y in enumerate(im):
        for idx_x, x in enumerate(y):
            b, g, r = x
            if isBlack(r, g, b):
                continue
            elif isRed(r, g, b):
                array_of_maxes_x[0] = compareLoAndHi(idx_x, array_of_maxes_x[0])
                array_of_maxes_y[0] = compareLoAndHi(idx_y, array_of_maxes_y[0])
            elif isYellow(r, g, b):
                array_of_maxes_x[1] = compareLoAndHi(idx_x, array_of_maxes_x[1])
                array_of_maxes_y[1] = compareLoAndHi(idx_y, array_of_maxes_y[1])
            elif isOrange(r, g, b):
                array_of_maxes_x[2] = compareLoAndHi(idx_x, array_of_maxes_x[2])
                array_of_maxes_y[2] = compareLoAndHi(idx_y, array_of_maxes_y[2])
            elif isBlue(r, g, b):
                array_of_maxes_x[3] = compareLoAndHi(idx_x, array_of_maxes_x[3])
                array_of_maxes_y[3] = compareLoAndHi(idx_y, array_of_maxes_y[3])
            elif isCyan(r, g, b):
                array_of_maxes_x[4] = compareLoAndHi(idx_x, array_of_maxes_x[4])
                array_of_maxes_y[4] = compareLoAndHi(idx_y, array_of_maxes_y[4])
            elif isGreen(r, g, b):
                array_of_maxes_x[5] = compareLoAndHi(idx_x, array_of_maxes_x[5])
                array_of_maxes_y[5] = compareLoAndHi(idx_y, array_of_maxes_y[5])
            elif isWhite(r, g, b) and not isStray(im, idx_x, idx_y):
                array_of_maxes_x[6] = compareLoAndHi(idx_x, array_of_maxes_x[6])
                array_of_maxes_y[6] = compareLoAndHi(idx_y, array_of_maxes_y[6])
    return array_of_maxes_x, array_of_maxes_y
def compareLoAndHi(coord, current):
    if coord < current[0]: current[0] = coord
    if coord > current[1]: current[1] = coord
    return current
def isStray(im, x, y):
    values = np.array([[x, y - 1], [x, y + 1], [x - 1, y], [x + 1, y]])
    for i in range(0, 4):
        b, g, r = im[values[i][1], values[i][0]]
        if not isBlack(r, g, b) and not isWhite(r, g, b):
            return True
    return False
I am not sure how to make this faster. I've been looking at matrix routines and the like, but I can't find an answer that fits my problem.
An example image is below.
You can check the colour of only the detected contours:
import numpy as np
import cv2
image = cv2.imread('image.png')
cv2.imshow("image", image)
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
_, threshold = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY)
cv2.imshow('threshold', threshold)
contours, hierarchy = cv2.findContours(threshold, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
blank_image = np.zeros_like(image)
for cnt in contours:
    M = cv2.moments(cnt)
    cX = int(M["m10"] / M["m00"])
    cY = int(M["m01"] / M["m00"])
    colour = (int(image[cY, cX, 0]), int(image[cY, cX, 1]), int(image[cY, cX, 2]))
    print(f'point: ({cX},{cY}), color (BGR): {colour}')
    cv2.circle(blank_image, (cX, cY), 2, colour, 2)
cv2.imshow('contour_image', blank_image)
cv2.waitKey(0)
cv2.destroyAllWindows()
Output:
point: (464,219), color (BGR): (0, 156, 213)
point: (368,220), color (BGR): (0, 215, 0)
point: (388,197), color (BGR): (217, 217, 217)
point: (384,176), color (BGR): (211, 0, 0)
point: (338,176), color (BGR): (111, 238, 238)
point: (333,171), color (BGR): (215, 215, 0)
point: (366,143), color (BGR): (2, 2, 216)
Also, you can iterate over only the non-zero points:
import numpy as np
import cv2
image = cv2.imread('image.png')
cv2.imshow("image", image)
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
Y_Coordinates, X_Coordinates = np.nonzero(gray)  # np.nonzero returns (row, col) indices, i.e. (y, x)
print("Y_Coordinates : \n", Y_Coordinates)
print("\nX_Coordinates : \n", X_Coordinates)
Output:
Y_Coordinates :
[105 105 106 106 106 106 107 107 107 107 108 108 108 108 109 109 109 109
110 110 126 126 126 127 127 127 127 128 128 128 128 128 129 129 129 129
130 130 130 130 130 130 130 131 131 131 131 131 131 131 131 132 132 132
132 132 132 132 132 133 133 133 133 133 133 133 133 134 134 134 134 134
146 147 147 147 147 147 148 148 148 148 148 149 149 149 149 149 163 163
163 163 163 163 163 163 164 164 164 164 164 164 164 164 164 165 165 165
165 165 165 165 165 165 165 166 166 166 166 166 166 166 166 166 167 167
167 167 167 167 167 167]
X_Coordinates :
[274 275 273 274 275 276 273 274 275 276 273 274 275 276 273 274 275 276
274 275 249 250 251 249 250 251 252 248 249 250 251 252 249 250 251 252
249 250 251 253 254 288 289 252 253 254 255 287 288 289 290 252 253 254
255 287 288 289 290 252 253 254 255 287 288 289 290 252 253 254 288 289
291 289 290 291 292 293 289 290 291 292 293 289 290 291 292 293 275 276
277 278 347 348 349 350 275 276 277 278 346 347 348 349 350 274 275 276
277 278 346 347 348 349 350 275 276 277 278 346 347 348 349 350 275 276
277 278 347 348 349 350]
Try using a smaller resolution for the same images, like 320x240, and measure the iteration time. Since that is a quarter of the pixels, it should cut the time substantially.
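If you want to keep the pure-numpy route from the question, the per-pixel Python loop can also be replaced with boolean masks over whole channels; the min/max of the non-zero indices give the same lower/upper boundaries. A minimal sketch, assuming a BGR image im as in the question (the colour tests mirror isRed, isGreen, etc.):
import numpy as np

def bounds(mask):
    """Return (x_lo, x_hi, y_lo, y_hi) of the True pixels, or None if the mask is empty."""
    ys, xs = np.nonzero(mask)
    if xs.size == 0:
        return None
    return xs.min(), xs.max(), ys.min(), ys.max()

# Split the channels once; OpenCV images are BGR.
b = im[:, :, 0].astype(int)
g = im[:, :, 1].astype(int)
r = im[:, :, 2].astype(int)

red_box    = bounds((r > 0) & (g == 0) & (b == 0))
yellow_box = bounds((r > 0) & (g > 0) & (b == 0) & (r == g))
green_box  = bounds((r == 0) & (g > 0) & (b == 0))
# ...same pattern for the other colours; the middle point of each object is
# ((x_lo + x_hi) / 2, (y_lo + y_hi) / 2).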
So I have a set of X and Y data that I read in from an output file and then truncate. I'll give a sample of it below in case anybody wants it as a reference to test my problem (apologies that it is so long).
0 16442
4 15222
8 14222
12 12934
16 11837
20 10706
24 9689
28 8844
32 7999
36 7128
40 6547
44 5890
48 5378
52 4838
56 4308
60 4005
64 3587
68 3228
72 2933
76 2610
80 2434
84 2184
88 1951
92 1755
96 1632
100 1441
104 1362
108 1150
112 1095
116 1051
120 991
124 859
128 775
132 727
136 678
140 635
144 610
148 535
152 560
156 510
160 460
164 431
168 407
172 387
176 391
180 362
184 368
188 317
192 317
196 302
200 289
204 259
208 307
212 263
216 262
220 264
224 218
228 220
232 242
236 224
240 198
244 207
248 192
252 207
256 194
260 172
264 167
268 192
272 148
276 187
280 166
284 159
288 143
292 150
296 155
300 160
304 159
308 144
312 128
316 133
320 105
324 120
328 134
332 129
336 117
340 132
344 118
348 137
352 134
356 119
360 121
364 99
368 111
372 95
376 106
380 89
384 104
388 113
392 117
396 114
400 88
404 82
408 78
412 77
416 79
420 84
424 85
428 75
432 76
436 74
440 96
444 65
448 90
452 72
456 74
460 68
464 66
468 76
472 66
476 69
480 63
484 61
488 51
492 60
496 67
500 71
504 54
508 55
512 61
516 49
520 47
524 42
528 48
532 44
536 47
540 43
544 54
548 42
552 39
556 40
560 44
564 41
568 53
572 50
576 43
580 36
584 49
588 35
592 40
596 34
This data shows time and tallies, and represents an exponential-decay type of trend. All of the data recorded is similar, but there is a single coefficient that changes for each record taken, so I'm trying to develop code to find out what that coefficient is. The equation I'm using as a fit is:
Y*((exp(-TMA*(log(2.)/HL110))) + (X*exp(-TMA*(log(2.)/HL108)))) + b
The variable that changes here is Y; everything else is known, and Y is the variable I want to fit for. I've done some work in Excel and can say that it is in the high 9000s for this record (going off memory); other cases are in the 4000s and 7000s. So it ranges, and that's why I need code to do it; otherwise I have to do it manually every time, and we have thousands of records to analyze. I wrote some code, but it flatlines and doesn't really provide a fit. I'll supply it below. It also contains all the constants mentioned above, which aren't subject to change.
### Section 1 ###
from scipy import *
from matplotlib import pyplot
from scipy.optimize import minimize_scalar
from scipy.optimize import curve_fit
import numpy as np
### Section 2 ###
data = np.loadtxt('Ag - Near_7_2026.txt') ### LOAD FILE DATA HERE ###
data_trunc = data[25:len(data)] ### TRUNCATED DATA UP TO 104 SEC ###
TM = data_trunc[:,0] ### TIME MARK ###
TMA = TM + 4 ### CORRECTED TIME ARRAY, ELAPSED TIME ###
Counts = data_trunc[:,1]
Sigma = sqrt([Counts])
### DEFINE PROBLEM CONSTANTS ###
HL110 = 24.6 ### ENDF ACCEPTED HL ###
HL108 = 142.92 ### ENDF ACCEPTED HL ###
b = 1.3333
X = 0.02955 ### FROM MCNP MODEL ###
### Function Handel ###
def func(TMA, Y, X, HL110, HL108, b):
    return Y*((exp(-TMA*(log(2.)/HL110))) + (X*exp(-TMA*(log(2.)/HL108)))) + b ### MODEL FUNCTION ###
f = func(TMA, 5000, X, HL110, HL108, b) ### CALLABLE NEEDED FOR CURVE_FIT ###
# Data plotting ###
pyplot.plot(TMA, f, '.b', label = 'data')
pyplot.legend(fontsize = 'large')
### Curve Fitting and plotting ###
popt, pcov = curve_fit(func, TMA, f)
pyplot.plot(TMA, func(TMA, *popt), 'r-', label = 'fit')
pyplot.tick_params(labelsize='large')
pyplot.legend(fontsize='large')
pyplot.xlabel('Adjusted Time')
pyplot.ylabel('Counts')
pyplot.show()
I've done my best to comment the code to help anyone assisting me understand what is what. When writing it I used https://docs.scipy.org/doc/scipy/reference/generated/scipy.optimize.curve_fit.html as my reference. (I'm not associated with them or anything; I'm just showing my thought process in case it lends any extra information on where I'm going wrong.)
Any help is really appreciated, and I'm available to provide any clarifying information requested!
This seems to work with a few changes. I made a text file from your data, and in the code itself I do not pass the constants to the function.
### Section 1 ###
from scipy import *
from matplotlib import pyplot
from scipy.optimize import minimize_scalar
from scipy.optimize import curve_fit
import numpy as np
### Section 2 ###
#data = np.loadtxt('Ag - Near_7_2026.txt') ### LOAD FILE DATA HERE ###
data = np.loadtxt('temp.dat')
data_trunc = data[25:len(data)] ### TRUNCATED DATA UP TO 104 SEC ###
TM = data_trunc[:,0] ### TIME MARK ###
TMA = TM + 4 ### CORRECTED TIME ARRAY, ELAPSED TIME ###
Counts = data_trunc[:,1]
Sigma = sqrt([Counts])
### DEFINE PROBLEM CONSTANTS ###
HL110 = 24.6 ### ENDF ACCEPTED HL ###
HL108 = 142.92 ### ENDF ACCEPTED HL ###
b = 1.3333
X = 0.02955 ### FROM MCNP MODEL ###
### Function Handel ###
def func(TMA, Y): # no need to pass constants
    return Y*((exp(-TMA*(log(2.)/HL110))) + (X*exp(-TMA*(log(2.)/HL108)))) + b ### MODEL FUNCTION ###
# no need to pass constants
f = func(TMA, 5000) ### CALLABLE NEEDED FOR CURVE_FIT ###
# Data plotting ###
pyplot.plot(TMA, f, '.b', label = 'data')
pyplot.legend(fontsize = 'large')
### Curve Fitting and plotting ###
popt, pcov = curve_fit(func, TMA, f)
print('Fitted parameters:', popt)
pyplot.plot(TMA, func(TMA, *popt), 'r-', label = 'fit')
pyplot.tick_params(labelsize='large')
pyplot.legend(fontsize='large')
pyplot.xlabel('Adjusted Time')
pyplot.ylabel('Counts')
pyplot.show()
EDIT -- code to solve for Y
### Section 1 ###
from scipy import *
from matplotlib import pyplot
from scipy.optimize import minimize_scalar
from scipy.optimize import curve_fit
import numpy as np
### Section 2 ###
#data = np.loadtxt('Ag - Near_7_2026.txt') ### LOAD FILE DATA HERE ###
data = np.loadtxt('temp.dat')
data_trunc = data[25:len(data)] ### TRUNCATED DATA UP TO 104 SEC ###
TM = data_trunc[:,0] ### TIME MARK ###
TMA = TM + 4 ### CORRECTED TIME ARRAY, ELAPSED TIME ###
Counts = data_trunc[:,1]
Sigma = sqrt(Counts)
### DEFINE PROBLEM CONSTANTS ###
HL110 = 24.6 ### ENDF ACCEPTED HL ###
HL108 = 142.92 ### ENDF ACCEPTED HL ###
b = 1.3333
X = 0.02955 ### FROM MCNP MODEL ###
### Function Handel ###
def func(TMA, Y): # no need to pass constants
    return Y*((exp(-TMA*(log(2.)/HL110))) + (X*exp(-TMA*(log(2.)/HL108)))) + b ### MODEL FUNCTION ###
# no need to pass constants
#f = func(TMA, 5000) ### CALLABLE NEEDED FOR CURVE_FIT ###
# Data plotting ###
pyplot.plot(TMA, Counts, '.b', label = 'data')
pyplot.legend(fontsize = 'large')
### Curve Fitting and plotting ###
popt, pcov = curve_fit(func, TMA, Counts)
print('Fitted parameters:', popt)
pyplot.plot(TMA, func(TMA, *popt), 'r-', label = 'fit')
pyplot.tick_params(labelsize='large')
pyplot.legend(fontsize='large')
pyplot.xlabel('Adjusted Time')
pyplot.ylabel('Counts')
pyplot.show()
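One optional tweak, since Sigma is computed but never used: curve_fit accepts per-point uncertainties through its sigma argument, which weights the least-squares fit by the counting errors. A sketch replacing only the fitting line above:
popt, pcov = curve_fit(func, TMA, Counts, sigma=Sigma, absolute_sigma=True)
Y_fit = popt[0]
Y_err = np.sqrt(pcov[0, 0])  # 1-sigma uncertainty on the fitted Y
print('Fitted Y:', Y_fit, '+/-', Y_err)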
I'm trying to use scikit-learn's GridSearchCV to find the best alpha for a Lasso, and one of the parameters I want it to iterate over is the cross-validation split. So I'm doing:
# X_train := Pandas Dataframe with no index (auto numbered index) and 62064 rows
# y_train := Pandas 1-column Dataframe with no index (auto numbered index) and 62064 rows
from sklearn import linear_model as lm
from sklearn import cross_validation as cv
from sklearn import grid_search
model = lm.LassoCV(eps=0.001, n_alphas=1000)
params = {"cv": [cv.ShuffleSplit(n=len(X_train), test_size=0.2),
cv.ShuffleSplit(n=len(X_train), test_size=0.1)]}
m_model = grid_search.GridSearchCV(model, params)
m_model.fit(X_train, y_train)
But it raises the exception
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
<ipython-input-113-f791cb0644c1> in <module>()
10 m_model = grid_search.GridSearchCV(model, params)
11
---> 12 m_model.fit(X_train.as_matrix(), y_train.as_matrix())
/home/user/Programs/repos/pyenv/versions/3.5.2/envs/work/lib/python3.5/site-packages/sklearn/grid_search.py in fit(self, X, y)
802
803 """
--> 804 return self._fit(X, y, ParameterGrid(self.param_grid))
805
806
/home/user/Programs/repos/pyenv/versions/3.5.2/envs/work/lib/python3.5/site-packages/sklearn/grid_search.py in _fit(self, X, y, parameter_iterable)
551 self.fit_params, return_parameters=True,
552 error_score=self.error_score)
--> 553 for parameters in parameter_iterable
554 for train, test in cv)
555
/home/user/Programs/repos/pyenv/versions/3.5.2/envs/work/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py in __call__(self, iterable)
798 # was dispatched. In particular this covers the edge
799 # case of Parallel used with an exhausted iterator.
--> 800 while self.dispatch_one_batch(iterator):
801 self._iterating = True
802 else:
/home/user/Programs/repos/pyenv/versions/3.5.2/envs/work/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py in dispatch_one_batch(self, iterator)
656 return False
657 else:
--> 658 self._dispatch(tasks)
659 return True
660
/home/user/Programs/repos/pyenv/versions/3.5.2/envs/work/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py in _dispatch(self, batch)
564
565 if self._pool is None:
--> 566 job = ImmediateComputeBatch(batch)
567 self._jobs.append(job)
568 self.n_dispatched_batches += 1
/home/user/Programs/repos/pyenv/versions/3.5.2/envs/work/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py in __init__(self, batch)
178 # Don't delay the application, to avoid keeping the input
179 # arguments in memory
--> 180 self.results = batch()
181
182 def get(self):
/home/user/Programs/repos/pyenv/versions/3.5.2/envs/work/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py in __call__(self)
70
71 def __call__(self):
---> 72 return [func(*args, **kwargs) for func, args, kwargs in self.items]
73
74 def __len__(self):
/home/user/Programs/repos/pyenv/versions/3.5.2/envs/work/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py in <listcomp>(.0)
70
71 def __call__(self):
---> 72 return [func(*args, **kwargs) for func, args, kwargs in self.items]
73
74 def __len__(self):
/home/user/Programs/repos/pyenv/versions/3.5.2/envs/work/lib/python3.5/site-packages/sklearn/cross_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, error_score)
1529 estimator.fit(X_train, **fit_params)
1530 else:
-> 1531 estimator.fit(X_train, y_train, **fit_params)
1532
1533 except Exception as e:
/home/user/Programs/repos/pyenv/versions/3.5.2/envs/work/lib/python3.5/site-packages/sklearn/linear_model/coordinate_descent.py in fit(self, X, y)
1146 for train, test in folds)
1147 mse_paths = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
-> 1148 backend="threading")(jobs)
1149 mse_paths = np.reshape(mse_paths, (n_l1_ratio, len(folds), -1))
1150 mean_mse = np.mean(mse_paths, axis=1)
/home/user/Programs/repos/pyenv/versions/3.5.2/envs/work/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py in __call__(self, iterable)
798 # was dispatched. In particular this covers the edge
799 # case of Parallel used with an exhausted iterator.
--> 800 while self.dispatch_one_batch(iterator):
801 self._iterating = True
802 else:
/home/user/Programs/repos/pyenv/versions/3.5.2/envs/work/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py in dispatch_one_batch(self, iterator)
656 return False
657 else:
--> 658 self._dispatch(tasks)
659 return True
660
/home/user/Programs/repos/pyenv/versions/3.5.2/envs/work/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py in _dispatch(self, batch)
564
565 if self._pool is None:
--> 566 job = ImmediateComputeBatch(batch)
567 self._jobs.append(job)
568 self.n_dispatched_batches += 1
/home/user/Programs/repos/pyenv/versions/3.5.2/envs/work/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py in __init__(self, batch)
178 # Don't delay the application, to avoid keeping the input
179 # arguments in memory
--> 180 self.results = batch()
181
182 def get(self):
/home/user/Programs/repos/pyenv/versions/3.5.2/envs/work/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py in __call__(self)
70
71 def __call__(self):
---> 72 return [func(*args, **kwargs) for func, args, kwargs in self.items]
73
74 def __len__(self):
/home/user/Programs/repos/pyenv/versions/3.5.2/envs/work/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py in <listcomp>(.0)
70
71 def __call__(self):
---> 72 return [func(*args, **kwargs) for func, args, kwargs in self.items]
73
74 def __len__(self):
/home/user/Programs/repos/pyenv/versions/3.5.2/envs/work/lib/python3.5/site-packages/sklearn/linear_model/coordinate_descent.py in _path_residuals(X, y, train, test, path, path_params, alphas, l1_ratio, X_order, dtype)
931 avoid memory copies
932 """
--> 933 X_train = X[train]
934 y_train = y[train]
935 X_test = X[test]
IndexError: index 60527 is out of bounds for axis 0 with size 41376
I tried using X_train.as_matrix(), but that didn't work either; it gives the same error.
Strange that I can use it manually:
cv_split = cv.ShuffleSplit(n=len(X_train), test_size=0.2)
for tr, te in cv_split:
    print(X_train.as_matrix()[tr], y_train.as_matrix()[tr])
[[0 0 0 ..., 0 0 1]
[0 0 0 ..., 0 0 1]
[0 0 0 ..., 0 0 1]
...,
[0 0 0 ..., 0 0 1]
[0 0 0 ..., 0 0 1]
[0 0 0 ..., 0 0 1]] [2 1 1 ..., 1 4 1]
[[ 0 0 0 ..., 0 0 1]
[1720 0 0 ..., 0 0 1]
[ 0 0 0 ..., 0 0 1]
...,
[ 773 0 0 ..., 0 0 1]
[ 0 0 0 ..., 0 0 1]
[ 501 1 0 ..., 0 0 1]] [1 1 1 ..., 1 2 1]
What am I not seeing here? Am I doing something wrong, or is this a scikit-learn bug?
Update 1
Just found out that the cv parameter is not supposed to be a cv.ShuffleSplit object. This is counterintuitive to me, since the docs say cross_validation classes are "objects to be used as a cross-validation generator". Aren't they?
Thanks!
You shouldn't be varying cv inside the parameter grid. GridSearchCV itself splits the data before each fit, so a ShuffleSplit built with n=len(X_train) generates indices for the full 62064 rows, which then get applied to the smaller inner training folds (size 41376 in your traceback), hence the IndexError. The idea is that you have a fixed cross-validation and use it to grid search over the other parameters, something like this:
m_model = grid_search.GridSearchCV(model,
                                   {'n_alphas': [100, 500, 1000]},
                                   cv=cv.ShuffleSplit(n=len(X_train), test_size=0.2))
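Alternatively, since LassoCV already searches over the alphas internally, you may not need GridSearchCV at all. A sketch using the same (old) API as the question, just passing the split strategy straight to LassoCV:
model = lm.LassoCV(eps=0.001, n_alphas=1000,
                   cv=cv.ShuffleSplit(n=len(X_train), test_size=0.2))
model.fit(X_train, y_train.values.ravel())  # ravel: LassoCV expects a 1-D target
print(model.alpha_)  # best alpha found by the internal cross-validation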