(A variation of this post, without the detailed traceback, was posted on SO about two hours ago. This version contains the whole traceback.)
I am running StatsModels to get parameter estimates from ordinary least squares (OLS). Data-processing and model-specific commands are shown below. When I use import statsmodels.formula.api as sm as the operative api, the OLS works as desired (after I drop some 15 rows programmatically), giving intuitive results. But when I switch to import statsmodels.api as sm as the binding api, with almost no change to the code, things fall apart, and the Python interpreter raises an error saying that 'inc_2 is not in the index'. Mind you, inc_2 was computed after the dataframe was read into StatsModels in both model runs: and yet the run was successful in the first, but not in the second. (BTW, p_c_inc_18 is per-capita income, and inc_2 is the former squared. inc_2 is the offending element in the second run.)
import pandas as pd
import numpy as np
import statsmodels.api as sm
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="whitegrid")
eg = pd.read_csv(r'C:/../../../une_edu_pipc_06.csv')
pd.options.display.precision = 3
plt.rc("figure", figsize=(16,8))
plt.rc("font", size=14)
sm_col = eg["lt_hsd_17"] + eg["hsd_17"]
eg["ut_hsd_17"] = sm_col
sm_col2 = eg["sm_col_17"] + eg["col_17"]
eg["bnd_hsd_17"] = sm_col2
eg["d_09"]= eg["Rate_09"]-eg["Rate_06"]
eg["d_10"]= eg["Rate_10"]-eg["Rate_06"] inc_2=eg["p_c_inc_18"]*eg["p_c_inc_18"]
X = eg[["p_c_inc_18","ut_hsd_17","d_10","inc_2"]]
y = eg["Rate_18"]
X = sm.add_constant(X)
mod = sm.OLS(y, X)
res = mod.fit()
print(res.summary())
Here is the traceback in full.
KeyError Traceback (most recent call last)
<ipython-input-21-e2f4d325145e> in <module>
17 eg["d_10"]= eg["Rate_10"]-eg["Rate_06"]
18 inc_2=eg["p_c_inc_18"]*eg["p_c_inc_18"]
---> 19 X = eg[["p_c_inc_18","ut_hsd_17","d_10","inc_2"]]
20 y = eg["Rate_18"]
21 X = sm.add_constant(X)
~\Anaconda3\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
2804 if is_iterator(key):
2805 key = list(key)
-> 2806 indexer = self.loc._get_listlike_indexer(key, axis=1, raise_missing=True)[1]
2807
2808 # take() does not accept boolean indexers
~\Anaconda3\lib\site-packages\pandas\core\indexing.py in _get_listlike_indexer(self, key, axis, raise_missing)
1550 keyarr, indexer, new_indexer = ax._reindex_non_unique(keyarr)
1551
-> 1552 self._validate_read_indexer(
1553 keyarr, indexer, o._get_axis_number(axis), raise_missing=raise_missing
1554 )
~\Anaconda3\lib\site-packages\pandas\core\indexing.py in _validate_read_indexer(self, key, indexer, axis, raise_missing)
1644 if not (self.name == "loc" and not raise_missing):
1645 not_found = list(set(key) - set(ax))
-> 1646 raise KeyError(f"{not_found} not in index")
1647
1648 # we skip the warning on Categorical/Interval
KeyError: "['inc_2'] not in index"
What am I doing wrong?
The syntax you used insists that a list of strings is a legal index into eg. If you print(eg), you'll see that it has no such column: inc_2 was computed as a standalone variable and never assigned as a column of eg. One fix is to build X from the individual Series, using the inc_2 variable directly:
X = [
    eg["p_c_inc_18"],
    eg["ut_hsd_17"],
    eg["d_10"],
    inc_2,  # the standalone Series computed above, not a column of eg
]
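Alternatively, assign the squared term to the frame before selecting; then your original list-of-strings indexing works unchanged (a minimal sketch using the same column names):
eg["inc_2"] = eg["p_c_inc_18"] ** 2  # now "inc_2" really is a column of eg
X = eg[["p_c_inc_18", "ut_hsd_17", "d_10", "inc_2"]]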
I've been practicing with the MetPy Monday interpolate_to_grid example for METAR data, and I successfully got the MSLP grid to work.
Moving on to potential temperature, the result has been all NaN when it "works". When it doesn't work, I get a set of errors that don't appear to help...
import numpy as np
import cartopy.crs as ccrs
import cartopy.feature as cfeature
import matplotlib.pyplot as plt
from siphon.catalog import TDSCatalog
from metpy.io import parse_metar_file
from metpy.interpolate import interpolate_to_grid, remove_nan_observations
from metpy.plots import add_metpy_logo, current_weather, sky_cover, StationPlot
from metpy.calc import wind_components, wet_bulb_temperature, altimeter_to_station_pressure, potential_temperature, gradient
from metpy.units import units
from datetime import datetime,timedelta
import pandas as pd
mapcrs = ccrs.LambertConformal(central_longitude=-100.,central_latitude=35.,standard_parallels=(30.,60.))
datacrs = ccrs.PlateCarree()
cat = TDSCatalog('https://thredds-test.unidata.ucar.edu/thredds/catalog/noaaport/text/metar/catalog.xml')
ds = cat.datasets[-4]
dattim = ds.name[6:14]+' '+ds.name[15:19]
ds.download()
df = parse_metar_file(ds.name)
#pandas dataframe
#df.head()
df.columns.values
extent = [-120,-72,24,50]
df = df.dropna(subset=['latitude','longitude','elevation','altimeter','air_temperature','eastward_wind','northward_wind','air_pressure_at_sea_level','dew_point_temperature'])
lon = df['longitude'].values
lat = df['latitude'].values
stn_ids = df['station_id'].values
elev = df['elevation'].values
altimeter = df['altimeter'].values
t2 = df['air_temperature'].values
mslp = df['air_pressure_at_sea_level'].values
#projected coords
xp, yp, _ = mapcrs.transform_points(datacrs,lon,lat).T # x,y returned
#mslp WORKS
x_masked, y_masked, mslp = remove_nan_observations(xp,yp,mslp)
#altgridx,altgridy,alt = interpolate_to_grid(x_masked,y_masked,alt, interp_type='cressman')
altgridx,altgridy,mslp = interpolate_to_grid(x_masked,y_masked,mslp, interp_type='barnes',gamma=.5,kappa_star=10, hres=25000)
# Potential temperature doesn't work
pres = altimeter_to_station_pressure(altimeter * units('mbar'), elev * units('m'))*33.8639
print(pres)
# theta
x_masked, y_masked, temp = remove_nan_observations(xp,yp,t2*units('degC'))
x_masked, y_masked, pres = remove_nan_observations(xp,yp,pres)
print(np.size(temp))
potemp = potential_temperature(pres, temp)
print(np.size(potemp))
print(np.unique(np.array(potemp)))
grdx = 75000.
thgridx,thgridy,theta = interpolate_to_grid(x_masked,y_masked, potemp, interp_type='barnes',kappa_star=6, gamma=0.5,hres=grdx)
print(np.shape(thgridx))
print(np.unique(theta))
Here is what is returned from the last section:
[949.361081708803 993.4468013877739 987.2845093729651 ... 1029.0930108008558 1016.002484792407 930.3708063382303] millibar
5837
5837
[236.32885315 237.21299941 239.04372591 ... 368.37047837 369.20079652
370.76269267]
---------------------------------------------------------------------------
DimensionalityError Traceback (most recent call last)
~/miniconda3/lib/python3.7/site-packages/pint/quantity.py in __float__(self)
896 return float(self._convert_magnitude_not_inplace(UnitsContainer()))
--> 897 raise DimensionalityError(self._units, "dimensionless")
898
DimensionalityError: Cannot convert from 'kelvin' to 'dimensionless'
The above exception was the direct cause of the following exception:
ValueError Traceback (most recent call last)
/var/folders/5n/sg5k98bx6gg4flb4fskykh4m0000gn/T/ipykernel_41626/379842406.py in <module>
11
12 grdx = 75000.
---> 13 thgridx,thgridy,theta = interpolate_to_grid(x_masked,y_masked, potemp, interp_type='barnes',kappa_star=6, gamma=0.5,hres=grdx)
14 print(np.shape(thgridx))
15 print(np.unique(theta))
~/miniconda3/lib/python3.7/site-packages/metpy/pandas.py in wrapper(*args, **kwargs)
19 kwargs = {name: (v.values if isinstance(v, pd.Series) else v)
20 for name, v in kwargs.items()}
---> 21 return func(*args, **kwargs)
22 return wrapper
~/miniconda3/lib/python3.7/site-packages/metpy/interpolate/grid.py in interpolate_to_grid(x, y, z, interp_type, hres, minimum_neighbors, gamma, kappa_star, search_radius, rbf_func, rbf_smooth, boundary_coords)
301 minimum_neighbors=minimum_neighbors, gamma=gamma,
302 kappa_star=kappa_star, search_radius=search_radius,
--> 303 rbf_func=rbf_func, rbf_smooth=rbf_smooth)
304
305 return grid_x, grid_y, img.reshape(grid_x.shape)
~/miniconda3/lib/python3.7/site-packages/metpy/interpolate/points.py in interpolate_to_points(points, values, xi, interp_type, minimum_neighbors, gamma, kappa_star, search_radius, rbf_func, rbf_smooth)
365 return inverse_distance_to_points(points, values, xi, search_radius, gamma, kappa,
366 min_neighbors=minimum_neighbors,
--> 367 kind=interp_type)
368
369 # If this is radial basis function, make the interpolator and apply it
~/miniconda3/lib/python3.7/site-packages/metpy/interpolate/points.py in inverse_distance_to_points(points, values, xi, r, gamma, kappa, min_neighbors, kind)
268 img[idx] = cressman_point(dists, values_subset, r)
269 elif kind == 'barnes':
--> 270 img[idx] = barnes_point(dists, values_subset, kappa, gamma)
271
272 else:
ValueError: setting an array element with a sequence.
I struggled with units, but I think the units are correct now. What could be causing this?
I tried Cressman, I tried a larger Barnes grid, and I tried making sure search_radius was large. Still NaN, when it worked at all.
The problem is caused by interpolate_to_grid choking on units when using Cressman or Barnes, which we definitely need to fix. For now the solution is either to use a different interpolation method (like interp_type='linear', the default) or to strip units before calling:
thgridx, thgridy, theta = interpolate_to_grid(x_masked, y_masked, potemp.magnitude,
interp_type='barnes', kappa_star=6, gamma=0.5, hres=grdx)
theta = units.Quantity(theta, 'K')
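For reference, the first option would look something like this (a sketch reusing your variables from above):
thgridx, thgridy, theta = interpolate_to_grid(x_masked, y_masked, potemp,
                                              interp_type='linear', hres=grdx)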
As far as your problems with NaNs are concerned, you may want to look at the search_radius parameter, which controls the maximum distance from a target grid point within which observations are considered. In some data-sparse areas, this could cause you to have some drop-outs. By default, it uses a guess of 5 times the average distance from one observation point to its nearest neighbor.
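For example, to widen the neighborhood explicitly (the 500 km value here is purely illustrative, expressed in the meters of your projected coordinates):
thgridx, thgridy, theta = interpolate_to_grid(x_masked, y_masked, potemp.magnitude,
                                              interp_type='barnes', kappa_star=6,
                                              gamma=0.5, hres=grdx,
                                              search_radius=500000)
theta = units.Quantity(theta, 'K')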
I'm trying to apply some machine learning based regression on data from a CSV file. My columns are:
Index(['date', 'customer_id', 'product_category', 'payment_method',
'value [USD]', 'time_on_site', 'clicks_in_site', 'USD/[Minutes]',
'USD/clicks_in_site'],
dtype='object')
When I run:
from pycaret.regression import *
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
exp_reg = setup(data = df, target='value [USD]', session_id=123,
high_cardinality_features = ['product_category'],
normalize = True,
ignore_features = ['customer_id', 'date', 'time_on_site']
)
I get the following error message:
KeyError Traceback (most recent call last)
<ipython-input-43-20eab85de0cc> in <module>()
2 high_cardinality_features = ['product_category'],
3 normalize = True,
----> 4 ignore_features = ['customer_id', 'date', 'time_on_site']
5 )
6
5 frames
/usr/local/lib/python3.7/dist-packages/pandas/core/indexes/base.py in drop(self, labels, errors)
5285 if mask.any():
5286 if errors != "ignore":
-> 5287 raise KeyError(f"{labels[mask]} not found in axis")
5288 indexer = indexer[~mask]
5289 return self.delete(indexer)
KeyError: "['value [USD]'] not found in axis"
I found the solution. The column name 'value [USD]' was the problem. After renaming it, the code works as intended. It probably has something to do with the brackets inside the column name, which may be getting interpreted as a list or indexing expression somewhere, but I'm not sure.
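For anyone hitting the same error, a minimal sketch of the workaround (the new column names are arbitrary):
df = df.rename(columns={'value [USD]': 'value_usd',
                        'USD/[Minutes]': 'usd_per_minute',
                        'USD/clicks_in_site': 'usd_per_click'})
exp_reg = setup(data=df, target='value_usd', session_id=123,
                high_cardinality_features=['product_category'],
                normalize=True,
                ignore_features=['customer_id', 'date', 'time_on_site'])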
I am trying to run an automated stepwise search procedure in Python with linear regression, with my code shown below, using code from https://datascience.stackexchange.com/a/24447. I did not change any of the code given by the contributor, but am still encountering errors:
from sklearn.datasets import load_boston
import pandas as pd
import numpy as np
import statsmodels.api as sm
data = load_boston()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = data.target
def stepwise_selection(X, y,
initial_list=[],
threshold_in=0.01,
threshold_out = 0.05,
verbose=True):
""" Perform a forward-backward feature selection
based on p-value from statsmodels.api.OLS
Arguments:
X - pandas.DataFrame with candidate features
y - list-like with the target
initial_list - list of features to start with (column names of X)
threshold_in - include a feature if its p-value < threshold_in
threshold_out - exclude a feature if its p-value > threshold_out
verbose - whether to print the sequence of inclusions and exclusions
Returns: list of selected features
Always set threshold_in < threshold_out to avoid infinite looping.
See https://en.wikipedia.org/wiki/Stepwise_regression for the details
"""
included = list(initial_list)
while True:
changed=False
# forward step
excluded = list(set(X.columns)-set(included))
new_pval = pd.Series(index=excluded)
for new_column in excluded:
model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included+[new_column]]))).fit()
new_pval[new_column] = model.pvalues[new_column]
best_pval = new_pval.min()
if best_pval < threshold_in:
best_feature = new_pval.argmin()
included.append(best_feature)
changed=True
if verbose:
print('Add {:30} with p-value {:.6}'.format(best_feature, best_pval))
# backward step
model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included]))).fit()
# use all coefs except intercept
pvalues = model.pvalues.iloc[1:]
worst_pval = pvalues.max() # null if pvalues is empty
if worst_pval > threshold_out:
changed=True
worst_feature = pvalues.argmax()
included.remove(worst_feature)
if verbose:
print('Drop {:30} with p-value {:.6}'.format(worst_feature, worst_pval))
if not changed:
break
return included
result = stepwise_selection(X, y)
print('resulting features:')
print(result)
However, I have run into the following error:
--------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-21-782c721f1ba0> in <module>
59 return included
60
---> 61 result = stepwise_selection(X, y)
62
63 print('resulting features:')
<ipython-input-21-782c721f1ba0> in stepwise_selection(X, y, initial_list, threshold_in, threshold_out, verbose)
45
46 # backward step
---> 47 model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included]))).fit()
48 # use all coefs except intercept
49 pvalues = model.pvalues.iloc[1:]
~\Anaconda3\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
2804 if is_iterator(key):
2805 key = list(key)
-> 2806 indexer = self.loc._get_listlike_indexer(key, axis=1, raise_missing=True)[1]
2807
2808 # take() does not accept boolean indexers
~\Anaconda3\lib\site-packages\pandas\core\indexing.py in _get_listlike_indexer(self, key, axis, raise_missing)
1551
1552 self._validate_read_indexer(
-> 1553 keyarr, indexer, o._get_axis_number(axis), raise_missing=raise_missing
1554 )
1555 return keyarr, indexer
~\Anaconda3\lib\site-packages\pandas\core\indexing.py in _validate_read_indexer(self, key, indexer, axis, raise_missing)
1638 if missing == len(indexer):
1639 axis_name = self.obj._get_axis_name(axis)
-> 1640 raise KeyError(f"None of [{key}] are in the [{axis_name}]")
1641
1642 # We (temporarily) allow for some missing keys with .loc, except in
KeyError: "None of [Int64Index([8], dtype='int64')] are in the [columns]"
Expected output should be this:
Add LSTAT with p-value 5.0811e-88
Add RM with p-value 3.47226e-27
Add PTRATIO with p-value 1.64466e-14
Add DIS with p-value 1.66847e-05
Add NOX with p-value 5.48815e-08
Add CHAS with p-value 0.000265473
Add B with p-value 0.000771946
Add ZN with p-value 0.00465162
resulting features:
['LSTAT', 'RM', 'PTRATIO', 'DIS', 'NOX', 'CHAS', 'B', 'ZN']
Appreciate any help given, thank you!
I am not sure how the code actually worked in the first place; maybe argmax worked differently (in older pandas, Series.argmin/argmax returned index labels, like idxmin/idxmax do now). You get the error because of this line:
best_feature = new_pval.argmin()
You need the actual name of the feature, so if you change it to:
best_feature = new_pval.index[new_pval.argmin()]
And likewise this line:
worst_feature = pvalues.argmax()
To:
worst_feature = pvalues.index[pvalues.argmax()]
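Equivalently, and a little more idiomatically, pandas Series provide idxmin/idxmax, which return the index label directly:
best_feature = new_pval.idxmin()
worst_feature = pvalues.idxmax()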
I get this:
Add LSTAT with p-value 5.0811e-88
Add RM with p-value 3.47226e-27
Add PTRATIO with p-value 1.64466e-14
Add DIS with p-value 1.66847e-05
Add NOX with p-value 5.48815e-08
Add CHAS with p-value 0.000265473
Add B with p-value 0.000771946
Add ZN with p-value 0.00465162
resulting features:
['LSTAT', 'RM', 'PTRATIO', 'DIS', 'NOX', 'CHAS', 'B', 'ZN']
Although here, from a statistical point of view, I have some doubts about the implementation. I suggest you post that on Cross Validated or as another question.
I have a pandas data set where I group the data by day. I would like to take this data and plot histograms for each day on the same plot, offset to the day on which the data occurred. I researched this and someone stated that you need to use pcolor, which is a nice alternative.
Here is a link to some example data..
http://pastebin.com/rKzj5Qzf
I attempted to use the lambda function in the post below, which creates a Series. pcolor does not like this Series and says it needs more than 1 value to unpack.
stackoverflow.com/questions/17050202/plot-timeseries-of-histograms-in-python
Does anyone know what I am doing wrong?
EDIT:
The Series 'df' referred to below comes from running the following code snippet:
daily = x1.groupby(x1.date).price
f = lambda x: pd.Series(np.histogram(x, bins=bins)[0], index=bins[:-1])
df = daily.apply(f)
Once I do this, I attempt to pass it to matplotlib:
import matplotlib.pyplot as plt
plt.pcolor(df.T)
This is where I get the problem. I clearly have three items: date, price, and count.
EDIT: Traceback
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-2-460b943e4ead> in <module>()
----> 1 plt.pcolor(df.T)
/usr/lib/pymodules/python2.7/matplotlib/pyplot.pyc in pcolor(*args, **kwargs)
2926 ax.hold(hold)
2927 try:
-> 2928 ret = ax.pcolor(*args, **kwargs)
2929 draw_if_interactive()
2930 finally:
/usr/lib/pymodules/python2.7/matplotlib/axes.pyc in pcolor(self, *args, **kwargs)
7543 shading = kwargs.pop('shading', 'flat')
7544
-> 7545 X, Y, C = self._pcolorargs('pcolor', *args, allmatch=False)
7546 Ny, Nx = X.shape
7547
/usr/lib/pymodules/python2.7/matplotlib/axes.pyc in _pcolorargs(funcname, *args, **kw)
7339 if len(args) == 1:
7340 C = args[0]
-> 7341 numRows, numCols = C.shape
7342 if allmatch:
7343 X, Y = np.meshgrid(np.arange(numCols), np.arange(numRows))
ValueError: need more than 1 value to unpack
I'm having trouble computing the silhouette coefficient in Python with sklearn.
Here is my code:
import pandas as pd
from sklearn import datasets
from sklearn.metrics import *

iris = datasets.load_iris()
col = iris.feature_names  # assumed: `col` was not defined in the original snippet
X = pd.DataFrame(iris.data, columns=col)
y = pd.DataFrame(iris.target, columns=['cluster'])
s = silhouette_score(X, y, metric='euclidean', sample_size=50)
I get the error:
IndexError: indices are out-of-bounds
I want to use the sample_size parameter because, when working with very large datasets, silhouette takes too long to compute. Does anyone know how this parameter works?
Complete traceback :
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
<ipython-input-72-70ff40842503> in <module>()
4 X = pd.DataFrame(iris.data, columns = col)
5 y = pd.DataFrame(iris.target,columns = ['cluster'])
----> 6 s = silhouette_score(X, y, metric='euclidean',sample_size=50)
/usr/local/lib/python2.7/dist-packages/sklearn/metrics/cluster/unsupervised.pyc in silhouette_score(X, labels, metric, sample_size, random_state, **kwds)
81 X, labels = X[indices].T[indices].T, labels[indices]
82 else:
---> 83 X, labels = X[indices], labels[indices]
84 return np.mean(silhouette_samples(X, labels, metric=metric, **kwds))
85
/usr/local/lib/python2.7/dist-packages/pandas/core/frame.pyc in __getitem__(self, key)
1993 if isinstance(key, (np.ndarray, list)):
1994 # either boolean or fancy integer index
-> 1995 return self._getitem_array(key)
1996 elif isinstance(key, DataFrame):
1997 return self._getitem_frame(key)
/usr/local/lib/python2.7/dist-packages/pandas/core/frame.pyc in _getitem_array(self, key)
2030 else:
2031 indexer = self.ix._convert_to_indexer(key, axis=1)
-> 2032 return self.take(indexer, axis=1, convert=True)
2033
2034 def _getitem_multilevel(self, key):
/usr/local/lib/python2.7/dist-packages/pandas/core/frame.pyc in take(self, indices, axis, convert)
2981 if convert:
2982 axis = self._get_axis_number(axis)
-> 2983 indices = _maybe_convert_indices(indices, len(self._get_axis(axis)))
2984
2985 if self._is_mixed_type:
/usr/local/lib/python2.7/dist-packages/pandas/core/indexing.pyc in _maybe_convert_indices(indices, n)
1038 mask = (indices>=n) | (indices<0)
1039 if mask.any():
-> 1040 raise IndexError("indices are out-of-bounds")
1041 return indices
1042
IndexError: indices are out-of-bounds
silhouette_score expects regular numpy arrays as input. Why wrap your arrays in data frames?
>>> silhouette_score(iris.data, iris.target, sample_size=50)
0.52999903616584543
From the traceback, you can observe that the code is doing fancy indexing (subsampling) on the first axis. By default, indexing a DataFrame indexes the columns, not the rows, hence the issue you observe.
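If you do want to keep the DataFrames around, a sketch of the equivalent call is to pass the underlying numpy arrays, with the labels flattened to 1-D:
s = silhouette_score(X.values, y.values.ravel(), metric='euclidean', sample_size=50)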