TypeError: unhashable type: 'slice' - python

I am trying to run a regression using the following dataframe, dfMyRoll. The head of the dataframe looks like:
SCORE SCORE_LAG
date
2007-10-29 -0.031551 NaN
2007-10-30 0.000100 -0.031551
2007-10-31 0.000100 0.000100
2007-11-01 0.000100 0.000100
2007-11-02 0.000100 0.000100
The code that I am using is :
import glob
import pandas as pd
import os.path
import scipy
from scipy.stats import linregress
def main():
dataPath = "C:/Users/Stacey/Documents/data/Roll"
roll = 4
1ID = "BBG.XNGS.AAPL.S"
2ID = "BBG.XNGS.AMAT.S"
print(1ID,1ID)
cointergration = getCointergration(dataPath,1ID,2ID,roll)
return
def getCointergration(dataPath,1ID,2ID,roll):
for myRoll in range((roll-4),roll,1):
path = dataPath+str(myRoll)+'/'
filename='PairData_'+1ID+'_'+2ID+'.csv'
for fname in glob.iglob(path+filename):
dfMyRoll = pd.read_csv(fname, header=0, usecols=[0,31],parse_dates=[0], dayfirst=True,index_col=[0], names=['date', 'SCORE'])
dfMyRoll['SCORE_LAG'] = dfMyRoll['SCORE'].shift(1)
print('cointergration',dfMyRoll.head())
X = dfMyRoll[1:,'SCORE']
Y = dfMyRoll[1:,'SCORE_LAG']
slope,intercept,_,_,stderr=linregress(dfMyRoll[1:,'SCORE'],dfMyRoll[1:,'SCORE_LAG'])
if __name__ == "__main__":
print ("CointergrationTest...19/05/17")
try:
main()
except KeyboardInterrupt:
print ("Ctrl+C pressed. Stopping...")
I get the error: TypeError: unhashable type: 'slice'. I have looked at previous posts on this subject and tried adding iloc to the X and Y time series in the following way:
X = dfMyRoll.iloc[1:,'SCORE']
Y = dfMyRoll.iloc[1:,'SCORE_LAG']
but unfortunately I can't seem to find a solution. Please see below for a stack trace:
Traceback (most recent call last):
File "<ipython-input-3-431422978139>", line 1, in <module>
runfile('C:/Users/Stacey/Documents/scripts/cointergrationTest.py', wdir='C:/Users/Stacey/Documents/scripts')
File "C:\Anaconda\lib\site-packages\spyder\utils\site\sitecustomize.py", line 866, in runfile
execfile(filename, namespace)
File "C:\Anaconda\lib\site-packages\spyder\utils\site\sitecustomize.py", line 102, in execfile
exec(compile(f.read(), filename, 'exec'), namespace)
File "C:/Users/Stacey/Documents/scripts/cointergrationTest.py", line 64, in <module>
main()
File "C:/Users/Stacey/Documents/scripts/cointergrationTest.py", line 23, in main
cointergration = getCointergration(dataPath,1ID,2ID,roll)
File "C:/Users/Stacey/Documents/scripts/cointergrationTest.py", line 42, in getCointergration
X = dfMyRoll[1:,'SCORE']
File "C:\Anaconda\lib\site-packages\pandas\core\frame.py", line 2059, in __getitem__
return self._getitem_column(key)
File "C:\Anaconda\lib\site-packages\pandas\core\frame.py", line 2066, in _getitem_column
return self._get_item_cache(key)
File "C:\Anaconda\lib\site-packages\pandas\core\generic.py", line 1384, in _get_item_cache
res = cache.get(item)
TypeError: unhashable type: 'slice'

You need to use loc rather than iloc:
X = dfMyRoll.loc[1:,'SCORE']
Y = dfMyRoll.loc[1:,'SCORE_LAG']
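iloc is read as "integer location" and accepts only integer positions, so it rejects the column label 'SCORE'. loc is label-based: the column label works, but the row slice 1: is then interpreted in terms of index labels rather than positions (ix, which mixed both behaviours, was deprecated in pandas 0.20 and removed in 1.0). With a date index like the one here, a purely positional row slice is safer, e.g. dfMyRoll['SCORE'].iloc[1:].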

Related

Why does Python say that a value does not exist when it specifically does?

SHORT DESCRIPTION:
The main issue is that whenever I run the following code, I get the error shown below it:
import statsmodels.api as sm
from statsmodels.formula.api import ols
def onewayanaova (csv, vars, x="x-axis", y="y-axis"):
df = pd.read_csv(csv, delimiter=",")
df_melt = pd.melt(df.reset_index(), id_vars=['index'], value_vars=vars)
df_melt.columns = ['index', {x}, {y}]
model = ols(f'{y} ~ C({x})', data=df_melt).fit()
anova_table = sm.stats.anova_lm(model, typ=2)
print("The One-Way Anova Test Values are:\n")
print(anova_table)
onewayanaova("Book1.csv", ["a","b","c"])
The error is:
Traceback (most recent call last):
File "pandas\_libs\hashtable_class_helper.pxi", line 5231, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'set'
Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
File "pandas\_libs\hashtable_class_helper.pxi", line 5231, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'set'
Traceback (most recent call last):
File "C:\Users\mghaf\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\patsy\compat.py", line 36, in call_and_wrap_exc
return f(*args, **kwargs)
File "C:\Users\mghaf\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\patsy\eval.py", line 165, in eval
return eval(code, {}, VarLookupDict([inner_namespace]
File "<string>", line 1, in <module>
NameError: name 'axis' is not defined
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "c:\Users\mghaf\Desktop\Python Codes\ReMan Edu\test.py", line 3, in <module>
mn.onewayanaova("Book1.csv", ["a","b","c"])
File "c:\Users\mghaf\Desktop\Python Codes\ReMan Edu\maincode.py", line 154, in onewayanaova
model = ols(f'{y} ~ C({x})', data=df_melt).fit()
File "C:\Users\mghaf\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\statsmodels\base\model.py", line 200, in from_formula
tmp = handle_formula_data(data, None, formula, depth=eval_env,
File "C:\Users\mghaf\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\statsmodels\formula\formulatools.py", line 63, in handle_formula_data
result = dmatrices(formula, Y, depth, return_type='dataframe',
File "C:\Users\mghaf\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\patsy\highlevel.py", line 309, in dmatrices
(lhs, rhs) = _do_highlevel_design(formula_like, data, eval_env,
File "C:\Users\mghaf\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\patsy\highlevel.py", line 164, in _do_highlevel_design
design_infos = _try_incr_builders(formula_like, data_iter_maker, eval_env,
File "C:\Users\mghaf\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\patsy\highlevel.py", line 66, in _try_incr_builders
return design_matrix_builders([formula_like.lhs_termlist,
File "C:\Users\mghaf\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\patsy\build.py", line 693, in design_matrix_builders
cat_levels_contrasts) = _examine_factor_types(all_factors,
File "C:\Users\mghaf\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\patsy\build.py", line 443, in _examine_factor_types
value = factor.eval(factor_states[factor], data)
File "C:\Users\mghaf\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\patsy\eval.py", line 564, in eval
return self._eval(memorize_state["eval_code"],
File "C:\Users\mghaf\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\patsy\eval.py", line 547, in _eval
return call_and_wrap_exc("Error evaluating factor",
File "C:\Users\mghaf\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\patsy\compat.py", line 43, in call_and_wrap_exc
exec("raise new_exc from e")
File "<string>", line 1, in <module>
patsy.PatsyError: Error evaluating factor: NameError: name 'axis' is not defined
y-axis ~ C(x-axis)
^^^^^^^^^
I think it is the X and Y variables I set in def onewayanaova (csv, vars, x="x-axis", y="y-axis"):. Maybe I need to change that so I don't get the error?
If you want a more detailed description, read below.
LONG DESCRIPTION:
I am trying to do a one-way ANOVA test. However, the main issue is that Python keeps raising a NameError, saying that one of my values is not defined.
I am running the following code:
import statsmodels.api as sm
from statsmodels.formula.api import ols
def onewayanaova (csv, vars, x="x-axis", y="y-axis"):
df = pd.read_csv(csv, delimiter=",")
df_melt = pd.melt(df.reset_index(), id_vars=['index'], value_vars=vars)
df_melt.columns = ['index', {x}, {y}]
model = ols(f'{y} ~ C({x})', data=df_melt).fit()
anova_table = sm.stats.anova_lm(model, typ=2)
print("The One-Way Anova Test Values are:\n")
print(anova_table)
And:
import maincode as mn
mn.onewayanaova("Book1.csv", ["a","b","c"])
I get the following error (the first piece of code is saved to a file named maincode.py, and the second is saved to a file named test.py; "Book1.csv" is in the same folder as them). The error is:
Traceback (most recent call last):
File "pandas\_libs\hashtable_class_helper.pxi", line 5231, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'set'
Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
File "pandas\_libs\hashtable_class_helper.pxi", line 5231, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'set'
Traceback (most recent call last):
File "C:\Users\mghaf\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\patsy\compat.py", line 36, in call_and_wrap_exc
return f(*args, **kwargs)
File "C:\Users\mghaf\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\patsy\eval.py", line 165, in eval
return eval(code, {}, VarLookupDict([inner_namespace]
File "<string>", line 1, in <module>
NameError: name 'axis' is not defined
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "c:\Users\mghaf\Desktop\Python Codes\ReMan Edu\test.py", line 3, in <module>
mn.onewayanaova("Book1.csv", ["a","b","c"])
File "c:\Users\mghaf\Desktop\Python Codes\ReMan Edu\maincode.py", line 154, in onewayanaova
model = ols(f'{y} ~ C({x})', data=df_melt).fit()
File "C:\Users\mghaf\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\statsmodels\base\model.py", line 200, in from_formula
tmp = handle_formula_data(data, None, formula, depth=eval_env,
File "C:\Users\mghaf\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\statsmodels\formula\formulatools.py", line 63, in handle_formula_data
result = dmatrices(formula, Y, depth, return_type='dataframe',
File "C:\Users\mghaf\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\patsy\highlevel.py", line 309, in dmatrices
(lhs, rhs) = _do_highlevel_design(formula_like, data, eval_env,
File "C:\Users\mghaf\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\patsy\highlevel.py", line 164, in _do_highlevel_design
design_infos = _try_incr_builders(formula_like, data_iter_maker, eval_env,
File "C:\Users\mghaf\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\patsy\highlevel.py", line 66, in _try_incr_builders
return design_matrix_builders([formula_like.lhs_termlist,
File "C:\Users\mghaf\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\patsy\build.py", line 693, in design_matrix_builders
cat_levels_contrasts) = _examine_factor_types(all_factors,
File "C:\Users\mghaf\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\patsy\build.py", line 443, in _examine_factor_types
value = factor.eval(factor_states[factor], data)
File "C:\Users\mghaf\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\patsy\eval.py", line 564, in eval
return self._eval(memorize_state["eval_code"],
File "C:\Users\mghaf\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\patsy\eval.py", line 547, in _eval
return call_and_wrap_exc("Error evaluating factor",
File "C:\Users\mghaf\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\patsy\compat.py", line 43, in call_and_wrap_exc
exec("raise new_exc from e")
File "<string>", line 1, in <module>
patsy.PatsyError: Error evaluating factor: NameError: name 'axis' is not defined
y-axis ~ C(x-axis)
^^^^^^^^^
The main problem that I see is that I named the X and Y variables as x="x-axis", y="y-axis". But I do not get why that gives me an error, as I made a very neat-looking boxplot with the same names (although I know that there X and Y are used as the axis titles):
def boxplot (csv, vars, x="x-axis", y="y-axis"):
#https://www.reneshbedre.com/blog/anova.html
df = pd.read_csv(csv, delimiter=",")
df_melt = pd.melt(df.reset_index(), id_vars=['index'], value_vars=vars)
df_melt.columns = ['index', x, y]
ax = sns.boxplot(x=x, y=y, data=df_melt, color='#99c2a2')
ax = sns.swarmplot(x=x, y=y, data=df_melt, color='#7d0013')
plt.show()
BUT, whenever I run this code from someone else, it gives the output I want:
import statsmodels.api as sm
from statsmodels.formula.api import ols
import pandas as pd
df = pd.read_csv("https://reneshbedre.github.io/assets/posts/anova/onewayanova.txt", sep="\t")
df_melt = pd.melt(df.reset_index(), id_vars=['index'], value_vars=['A', 'B', 'C', 'D'])
df_melt.columns = ['index', 'treatments', 'value']
model = ols('value ~ C(treatments)', data=df_melt).fit()
anova_table = sm.stats.anova_lm(model, typ=2)
print(anova_table)
The output that i get with the above code:
sum_sq df F PR(>F)
C(treatments) 3010.95 3.0 17.49281 0.000026
Residual 918.00 16.0 NaN NaN
The main issue is that I need to change the values in model = ols('value ~ C(treatments)', data=df_melt).fit() and df_melt.columns = ['index', 'treatments', 'value'], because most datasets do not use 'treatments' and 'value' as their column names. If you're wondering what my .csv file contains, it is this:
Column headers of a, b and c
A list of equal amount of numbers in each of them
My main issue is:
Please try and help me understand why I cannot replace 'value ~ C(treatments)' with X and Y!
Source of the code: https://www.reneshbedre.com/blog/anova.html
In statsmodels formulae, you need to quote your variables (i.e. columns in your dataframe) when they contain special characters such as -. Have a look at the documentation: your term "x-axis" is interpreted as "x" - "axis", i.e. one variable minus another, which is why the NameError mentions axis. Quoting a variable can be done with the Q() transformation. Make sure to quote the variable name inside Q() with a different kind of quote (single/double) from the one that delimits the formula string:
model = ols(f'Q("{y}") ~ C(Q("{x}"))', data=df_melt).fit()
It seems that model = ols('value ~ C(treatments)', data=df_melt).fit() cannot have a variable substituted into it (as I had in model = ols(f'{y} ~ C({x})', data=df_melt).fit()). This is also the case if I use model = ols(f'Q("{y}") ~ C(Q("{x}"))', data=df_melt).fit(), as mentioned by @Rob.
Therefore, to make it work and have my own names, I just have to rename the columns in df_melt.columns = ['index', 'treatments', 'value'] consistently with model = ols('value ~ C(treatments)', data=df_melt).fit() (where 'treatments' and 'value' are the same names in the two lines of code).

Why can dask.dataframe.apply only process a column called 'name'?

I am attempting to port some Pandas (Python) code to Dask instead. I am using Pandas 1.1.3 and Dask 2.30.0. I keep ramming my head against a wall I can't see. That is, I cannot understand what is going on here. I have boiled it down to the following minimal working example:
My data is the file 'test.csv' containing the following:
age,name
28,Alice
The following Python script (using Pandas) works fine:
import pandas as pd
df = pd.read_csv("test.csv", dtype={'name': str})
result = df['name'].apply(lambda text: text.upper())
#result = df['age'].apply(lambda num: num + 1)
print(result)
and prints:
0 ALICE
Name: name, dtype: object
The commented-out line operating on the 'age' column also works and prints:
0 29
Name: age, dtype: int64
Now, with Dask instead, my example becomes:
import dask.dataframe as dd
df = dd.read_csv("test.csv", dtype={'name': str})
result = df['name'].apply(lambda text: text.upper(), meta={'name': str})
#result = df['age'].apply(lambda num: num + 1, meta={'age': int})
print(result.compute())
which works fine just like the Pandas example. However, if I try the commented-out line operating on the 'age' column instead, Python complains with the following error message:
Traceback (most recent call last):
File "test_dask.py", line 7, in <module>
print(result.compute())
File "/some/path/miniconda3/envs/testdask/lib/python3.8/site-packages/dask/base.py", line 167, in compute
(result,) = compute(self, traverse=False, **kwargs)
File "/some/path/miniconda3/envs/testdask/lib/python3.8/site-packages/dask/base.py", line 452, in compute
results = schedule(dsk, keys, **kwargs)
File "/some/path/miniconda3/envs/testdask/lib/python3.8/site-packages/dask/threaded.py", line 76, in get
results = get_async(
File "/some/path/miniconda3/envs/testdask/lib/python3.8/site-packages/dask/local.py", line 486, in get_async
raise_exception(exc, tb)
File "/some/path/miniconda3/envs/testdask/lib/python3.8/site-packages/dask/local.py", line 316, in reraise
raise exc
File "/some/path/miniconda3/envs/testdask/lib/python3.8/site-packages/dask/local.py", line 222, in execute_task
result = _execute_task(task, data)
File "/some/path/miniconda3/envs/testdask/lib/python3.8/site-packages/dask/core.py", line 121, in _execute_task
return func(*(_execute_task(a, cache) for a in args))
File "/some/path/miniconda3/envs/testdask/lib/python3.8/site-packages/dask/optimization.py", line 961, in __call__
return core.get(self.dsk, self.outkey, dict(zip(self.inkeys, args)))
File "/some/path/miniconda3/envs/testdask/lib/python3.8/site-packages/dask/core.py", line 151, in get
result = _execute_task(task, cache)
File "/some/path/miniconda3/envs/testdask/lib/python3.8/site-packages/dask/core.py", line 121, in _execute_task
return func(*(_execute_task(a, cache) for a in args))
File "/some/path/miniconda3/envs/testdask/lib/python3.8/site-packages/dask/utils.py", line 29, in apply
return func(*args, **kwargs)
File "/some/path/miniconda3/envs/testdask/lib/python3.8/site-packages/dask/dataframe/core.py", line 5306, in apply_and_enforce
c = meta.name
File "/some/path/miniconda3/envs/testdask/lib/python3.8/site-packages/pandas/core/generic.py", line 5139, in __getattr__
return object.__getattribute__(self, name)
AttributeError: 'DataFrame' object has no attribute 'name'
Even if I just call the 'name' column something else, it also fails like this. It is as if Dask is only able to work on columns of a DataFrame that are called 'name'. This seems extraordinarily weird to me, and I must be misunderstanding something. What is really going on here?
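The docs seem to suggest that the dict should work, so that's weird. What appears to happen is that a dict is treated as the metadata of a DataFrame, and the traceback shows Dask then evaluating meta.name, an attribute lookup that on a DataFrame only succeeds when a column happens to be called 'name', which is why only that column seemed to work. If you replace the meta argument with a (name, dtype) tuple instead, which describes a Series, your code runs as expected: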
df = dd.read_csv("test.csv")
result = df['age'].apply(lambda num: num + 1, meta=('age', 'int64'))
print(result.compute())
which prints:
0 29
Name: age, dtype: int64

How to fix "Length of value doesnt match index" in python?

I've been trying to get the list of addresses of a franchise in Brazil, but when I run the code, it starts, processes two cities, and then stops with a "ValueError".
The code I've been trying to run is this:
import requests
import json
import pandas as pd
dMun = pd.read_json('https://servicodados.ibge.gov.br/api/v1/localidades/municipios')
dEndTotal = pd.DataFrame()
for iMun in range(len(dMun)):
sCidade = dMun.loc[iMun,'nome']
print(str(iMun) + ' - '+ dMun.loc[iMun,'nome'])
sSigla = dMun.loc[iMun,'microrregiao']['mesorregiao']['UF']['sigla']
r = requests.post('https://www.5asec.com.br/busca-lojas-endereco', data = {'endereco':'A, 1 {}/{}'.format(sCidade,sSigla)})
jEnd = json.loads(r.text)
dEnd = pd.DataFrame.from_records(jEnd['lojas'])
print(dEnd)
if len(dEnd) > 0:
for sChave in jEnd['lojas'][0]['Endereco'].keys():
dEnd[sChave] = []
for i in range(len(dEnd)):
for sChave in jEnd['lojas'][i]['Endereco'].keys():
dEnd[sChave][i] = jEnd['lojas'][i]['Endereco'][sChave]
dEndTotal = pd.concat([dEndTotal,dEnd],ignore_index=False).drop_duplicates().reset_index(drop=True)
But it results in this error:
0 - Alta Floresta D'Oeste
Empty DataFrame
Columns: []
Index: []
1 - Ariquemes
CEP Codigo CodigoExterno ... Telefone TemEcommerce Url
0 76870512 675 69004P ... 35366864 False ariquemes
[1 rows x 16 columns]
Traceback (most recent call last):
File "<ipython-input-1-cd9a35514f7e>", line 1, in <module>
runfile('C:/Users/vinis/OneDrive/Área de Trabalho/5aSec.py', wdir='C:/Users/vinis/OneDrive/Área de Trabalho')
File "C:\Users\vinis\Anaconda2\lib\site-packages\spyder_kernels\customize\spydercustomize.py", line 786, in runfile
execfile(filename, namespace)
File "C:\Users\vinis\Anaconda2\lib\site-packages\spyder_kernels\customize\spydercustomize.py", line 95, in execfile
exec(compile(scripttext, filename, 'exec'), glob, loc)
File "C:/Users/vinis/OneDrive/Área de Trabalho/5aSec.py", line 38, in <module>
File "C:\Users\vinis\Anaconda2\lib\site-packages\pandas\core\frame.py", line 3370, in __setitem__
self._set_item(key, value)
File "C:\Users\vinis\Anaconda2\lib\site-packages\pandas\core\frame.py", line 3445, in _set_item
value = self._sanitize_column(key, value)
File "C:\Users\vinis\Anaconda2\lib\site-packages\pandas\core\frame.py", line 3630, in _sanitize_column
value = sanitize_index(value, self.index, copy=False)
File "C:\Users\vinis\Anaconda2\lib\site-packages\pandas\core\internals\construction.py", line 519, in sanitize_index
raise ValueError('Length of values does not match length of index')
ValueError: Length of values does not match length of index
How can I fix this one?
Thanks for the help, guys
And I'm sorry if the post isn't all correct

Python gives KeyError while the key is passed

I am trying to create a separate Python file containing the code given below. When calling the method, I pass mydata as a data frame with these columns:
['wage', 'educ', 'exper', 'tenure'].
import pandas as pd
import numpy as np
from prettytable import PrettyTable as pt
def LinearRegressionOLS(mydata,target_column):
if(not isinstance(mydata,pd.DataFrame)):
raise TypeError("Data must be of type Data Frame")
if(not isinstance(target_column,str)):
raise TypeError("target_column must be String")
if(target_column not in mydata.columns):
raise KeyError("target_column doesn't exist in Data Frame")
data=mydata.copy()
data["one"]=np.ones(data.count()[target_column])
column_list=["one"]
for i in data.columns:
column_list.append(i)
Y=data[target_column].as_matrix()
data.drop(target_column,inplace=True,axis=1)
X=data[column_list].as_matrix()
del data
beta = np.matmul(np.matmul(np.linalg.inv(np.matmul(X.T,X)),X.T),Y)
predY = np.matmul(X,beta)
total = np.matmul((Y-np.mean(Y)).T,(Y-np.mean(Y)))
residual = np.matmul((Y-predY).T,(Y-predY))
sigma = np.matmul((Y-predY).T,(Y-predY))/(X.shape[0]-X.shape[1])
omega = np.square(sigma)*np.linalg.inv(np.matmul(X.T,X))
SE = np.sqrt(np.diag(omega))
tstat = beta/SE
Rsq = 1-(residual/total)
final = pt()
final.add_column(" ",column_list)
final.add_column("Coefficients",beta)
final.add_column("Standard Error",SE)
final.add_column("t-stat",tstat)
print(final)
print("Residual: ",residual)
print("Total: ",total)
print("Standard Error: ",sigma)
print("R Square: ",Rsq)
After running the above code, by calling the function given below,
>>> c
['wage', 'educ', 'exper', 'tenure']
>>> import LR_OLS as inf
>>> inf.LinearRegressionOLS(file[c],"wage")
I get an error like this:
Traceback (most recent call last):
File "<pyshell#182>", line 1, in <module>
inf.LinearRegressionOLS(file[c],"wage")
File "E:\python\LR_OLS.py", line 29, in LinearRegressionOLS
File "C:\Program Files\Python35\lib\site-packages\pandas\core\frame.py", line 2133, in __getitem__
return self._getitem_array(key)
File "C:\Program Files\Python35\lib\site-packages\pandas\core\frame.py", line 2177, in _getitem_array
indexer = self.loc._convert_to_indexer(key, axis=1)
File "C:\Program Files\Python35\lib\site-packages\pandas\core\indexing.py", line 1269, in _convert_to_indexer
.format(mask=objarr[mask]))
KeyError: "['wage'] not in index"
Can anyone help me understand why I am getting this error? How can I resolve it?
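The problem is that you still have 'wage' in column_list: data.drop(target_column, ...) removes that column from data before data[column_list] is evaluated, so the selection asks for a column that no longer exists. To never let it get in there, make the following adaptation: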
for i in data.columns:
if i != 'wage': # add this line to your code
column_list.append(i)

"Already tz-aware" error when reading h5 file using pandas, python 3 (but not 2)

I have an h5 store named weather.h5. My default Python environment is 3.5.2. When I try to read this store I get TypeError: Already tz-aware, use tz_convert to convert.
I've tried both pd.read_hdf('weather.h5','weather_history') and pd.io.pytables.HDFStore('weather.h5')['weather_history'], but I get the error no matter what.
I can open the h5 in a Python 2.7 environment. Is this a bug in Python 3 / pandas?
I have the same issue. I'm using Anaconda Python: 3.4.5 and 2.7.3. Both are using pandas 0.18.1.
Here is a reproducible example:
generate.py (to be executed with Python2):
import pandas as pd
from pandas import HDFStore
index = pd.DatetimeIndex(['2017-06-20 06:00:06.984630-05:00', '2017-06-20 06:03:01.042616-05:00'], dtype='datetime64[ns, CST6CDT]', freq=None)
p1 = [0, 1]
p2 = [0, 2]
# Saving any of these dataframes cause issues
df1 = pd.DataFrame({"p1":p1, "p2":p2}, index=index)
df2 = pd.DataFrame({"p1":p1, "p2":p2, "i":index})
store = HDFStore("./test_issue.h5")
store['df'] = df1
#store['df'] = df2
store.close()
read_issue.py:
import pandas as pd
from pandas import HDFStore
store = HDFStore("./test_issue.h5", mode="r")
df = store['/df']
store.close()
print(df)
Running read_issue.py in Python2 has no issues and produces this output:
p1 p2
2017-06-20 11:00:06.984630-05:00 0 0
2017-06-20 11:03:01.042616-05:00 1 2
But running it in Python3 produces Error with this traceback:
Traceback (most recent call last):
File "read_issue.py", line 5, in
df = store['df']
File "/home/denper/anaconda3/envs/py34/lib/python3.4/site-packages/pandas/io/pytables.py", line 417, in getitem
return self.get(key)
File "/home/denper/anaconda3/envs/py34/lib/python3.4/site-packages/pandas/io/pytables.py", line 634, in get
return self._read_group(group)
File "/home/denper/anaconda3/envs/py34/lib/python3.4/site-packages/pandas/io/pytables.py", line 1272, in _read_group
return s.read(**kwargs)
File "/home/denper/anaconda3/envs/py34/lib/python3.4/site-packages/pandas/io/pytables.py", line 2779, in read
ax = self.read_index('axis%d' % i)
File "/home/denper/anaconda3/envs/py34/lib/python3.4/site-packages/pandas/io/pytables.py", line 2367, in read_index
_, index = self.read_index_node(getattr(self.group, key))
File "/home/denper/anaconda3/envs/py34/lib/python3.4/site-packages/pandas/io/pytables.py", line 2492, in read_index_node
_unconvert_index(data, kind, encoding=self.encoding), **kwargs)
File "/home/denper/anaconda3/envs/py34/lib/python3.4/site-packages/pandas/indexes/base.py", line 153, in new
result = DatetimeIndex(data, copy=copy, name=name, **kwargs)
File "/home/denper/anaconda3/envs/py34/lib/python3.4/site-packages/pandas/util/decorators.py", line 91, in wrapper
return func(*args, **kwargs)
File "/home/denper/anaconda3/envs/py34/lib/python3.4/site-packages/pandas/tseries/index.py", line 321, in new
raise TypeError("Already tz-aware, use tz_convert "
TypeError: Already tz-aware, use tz_convert to convert.
Closing remaining open files:./test_issue.h5...done
So, there is an issue with indices. However, if you save df2 in generate.py (datetime as a column, not as an index), then Python3 in read_issue.py produces a different error:
Traceback (most recent call last):
File "read_issue.py", line 5, in
df = store['/df']
File "/home/denper/anaconda3/envs/py34/lib/python3.4/site-packages/pandas/io/pytables.py", line 417, in getitem
return self.get(key)
File "/home/denper/anaconda3/envs/py34/lib/python3.4/site-packages/pandas/io/pytables.py", line 634, in get
return self._read_group(group)
File "/home/denper/anaconda3/envs/py34/lib/python3.4/site-packages/pandas/io/pytables.py", line 1272, in _read_group
return s.read(**kwargs)
File "/home/denper/anaconda3/envs/py34/lib/python3.4/site-packages/pandas/io/pytables.py", line 2788, in read
placement=items.get_indexer(blk_items))
File "/home/denper/anaconda3/envs/py34/lib/python3.4/site-packages/pandas/core/internals.py", line 2518, in make_block
return klass(values, ndim=ndim, fastpath=fastpath, placement=placement)
File "/home/denper/anaconda3/envs/py34/lib/python3.4/site-packages/pandas/core/internals.py", line 90, in init
len(self.mgr_locs)))
ValueError: Wrong number of items passed 2, placement implies 1
Closing remaining open files:./test_issue.h5...done
Also, if you execute generate.py in Python3 (saving either df1 or df2), then there is no problem executing read_issue.py in either Python3 or Python2.

Categories