SHORT DESCRIPTION:
The main issue is that whenever I run the following code, I get the error shown below it:
import statsmodels.api as sm
from statsmodels.formula.api import ols
# One-way ANOVA helper exactly as posted in the question (indentation was lost in the paste).
# Reads `csv`, melts the columns listed in `vars` into long format, fits an OLS
# model from a formula built out of `x` and `y`, and prints the type-2 ANOVA table.
# NOTE(review): `pd` is used below, but no `import pandas as pd` appears in this snippet.
def onewayanaova (csv, vars, x="x-axis", y="y-axis"):
df = pd.read_csv(csv, delimiter=",")
df_melt = pd.melt(df.reset_index(), id_vars=['index'], value_vars=vars)
# NOTE(review): {x} and {y} are set literals, not strings, so two of the column
# labels become sets -- consistent with "unhashable type: 'set'" in the traceback.
df_melt.columns = ['index', {x}, {y}]
# With the default arguments the formula text is "y-axis ~ C(x-axis)"; the
# traceback shows it being evaluated as an expression and failing on the name `axis`.
model = ols(f'{y} ~ C({x})', data=df_melt).fit()
anova_table = sm.stats.anova_lm(model, typ=2)
print("The One-Way Anova Test Values are:\n")
print(anova_table)
onewayanaova("Book1.csv", ["a","b","c"])
The error is:
Traceback (most recent call last):
File "pandas\_libs\hashtable_class_helper.pxi", line 5231, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'set'
Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
File "pandas\_libs\hashtable_class_helper.pxi", line 5231, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'set'
Traceback (most recent call last):
File "C:\Users\mghaf\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\patsy\compat.py", line 36, in call_and_wrap_exc
return f(*args, **kwargs)
File "C:\Users\mghaf\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\patsy\eval.py", line 165, in eval
return eval(code, {}, VarLookupDict([inner_namespace]
File "<string>", line 1, in <module>
NameError: name 'axis' is not defined
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "c:\Users\mghaf\Desktop\Python Codes\ReMan Edu\test.py", line 3, in <module>
mn.onewayanaova("Book1.csv", ["a","b","c"])
File "c:\Users\mghaf\Desktop\Python Codes\ReMan Edu\maincode.py", line 154, in onewayanaova
model = ols(f'{y} ~ C({x})', data=df_melt).fit()
File "C:\Users\mghaf\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\statsmodels\base\model.py", line 200, in from_formula
tmp = handle_formula_data(data, None, formula, depth=eval_env,
File "C:\Users\mghaf\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\statsmodels\formula\formulatools.py", line 63, in handle_formula_data
result = dmatrices(formula, Y, depth, return_type='dataframe',
File "C:\Users\mghaf\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\patsy\highlevel.py", line 309, in dmatrices
(lhs, rhs) = _do_highlevel_design(formula_like, data, eval_env,
File "C:\Users\mghaf\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\patsy\highlevel.py", line 164, in _do_highlevel_design
design_infos = _try_incr_builders(formula_like, data_iter_maker, eval_env,
File "C:\Users\mghaf\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\patsy\highlevel.py", line 66, in _try_incr_builders
return design_matrix_builders([formula_like.lhs_termlist,
File "C:\Users\mghaf\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\patsy\build.py", line 693, in design_matrix_builders
cat_levels_contrasts) = _examine_factor_types(all_factors,
File "C:\Users\mghaf\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\patsy\build.py", line 443, in _examine_factor_types
value = factor.eval(factor_states[factor], data)
File "C:\Users\mghaf\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\patsy\eval.py", line 564, in eval
return self._eval(memorize_state["eval_code"],
File "C:\Users\mghaf\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\patsy\eval.py", line 547, in _eval
return call_and_wrap_exc("Error evaluating factor",
File "C:\Users\mghaf\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\patsy\compat.py", line 43, in call_and_wrap_exc
exec("raise new_exc from e")
File "<string>", line 1, in <module>
patsy.PatsyError: Error evaluating factor: NameError: name 'axis' is not defined
y-axis ~ C(x-axis)
^^^^^^^^^
I think it is the X and Y variables I set in def onewayanaova (csv, vars, x="x-axis", y="y-axis"):. Maybe I need to change that so I don't get the error?
If you want a more detailed description, read below.
LONG DESCRIPTION:
I am trying to do a one-way ANOVA test. However, the main issue is that Python keeps raising a NameError, saying that one of my names is not defined.
I am running the following code:
import statsmodels.api as sm
from statsmodels.formula.api import ols
# One-way ANOVA helper from maincode.py, as posted (indentation was lost in the paste).
# Reads `csv`, melts the columns named in `vars`, fits an OLS model whose formula
# is assembled from `x` and `y`, and prints the resulting type-2 ANOVA table.
def onewayanaova (csv, vars, x="x-axis", y="y-axis"):
df = pd.read_csv(csv, delimiter=",")
df_melt = pd.melt(df.reset_index(), id_vars=['index'], value_vars=vars)
# NOTE(review): the braces make {x} and {y} one-element sets rather than the
# strings themselves; the first TypeError in the traceback complains about a set.
df_melt.columns = ['index', {x}, {y}]
# The f-string expands to "y-axis ~ C(x-axis)" with the defaults; the PatsyError
# in the traceback points at exactly this text and reports `axis` as undefined.
model = ols(f'{y} ~ C({x})', data=df_melt).fit()
anova_table = sm.stats.anova_lm(model, typ=2)
print("The One-Way Anova Test Values are:\n")
print(anova_table)
And:
import maincode as mn
mn.onewayanaova("Book1.csv", ["a","b","c"])
I get the following error (the first snippet is saved in a file named maincode.py, and the second in a file named test.py; "Book1.csv" is in the same folder as both). The error is:
Traceback (most recent call last):
File "pandas\_libs\hashtable_class_helper.pxi", line 5231, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'set'
Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
File "pandas\_libs\hashtable_class_helper.pxi", line 5231, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'set'
Traceback (most recent call last):
File "C:\Users\mghaf\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\patsy\compat.py", line 36, in call_and_wrap_exc
return f(*args, **kwargs)
File "C:\Users\mghaf\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\patsy\eval.py", line 165, in eval
return eval(code, {}, VarLookupDict([inner_namespace]
File "<string>", line 1, in <module>
NameError: name 'axis' is not defined
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "c:\Users\mghaf\Desktop\Python Codes\ReMan Edu\test.py", line 3, in <module>
mn.onewayanaova("Book1.csv", ["a","b","c"])
File "c:\Users\mghaf\Desktop\Python Codes\ReMan Edu\maincode.py", line 154, in onewayanaova
model = ols(f'{y} ~ C({x})', data=df_melt).fit()
File "C:\Users\mghaf\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\statsmodels\base\model.py", line 200, in from_formula
tmp = handle_formula_data(data, None, formula, depth=eval_env,
File "C:\Users\mghaf\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\statsmodels\formula\formulatools.py", line 63, in handle_formula_data
result = dmatrices(formula, Y, depth, return_type='dataframe',
File "C:\Users\mghaf\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\patsy\highlevel.py", line 309, in dmatrices
(lhs, rhs) = _do_highlevel_design(formula_like, data, eval_env,
File "C:\Users\mghaf\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\patsy\highlevel.py", line 164, in _do_highlevel_design
design_infos = _try_incr_builders(formula_like, data_iter_maker, eval_env,
File "C:\Users\mghaf\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\patsy\highlevel.py", line 66, in _try_incr_builders
return design_matrix_builders([formula_like.lhs_termlist,
File "C:\Users\mghaf\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\patsy\build.py", line 693, in design_matrix_builders
cat_levels_contrasts) = _examine_factor_types(all_factors,
File "C:\Users\mghaf\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\patsy\build.py", line 443, in _examine_factor_types
value = factor.eval(factor_states[factor], data)
File "C:\Users\mghaf\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\patsy\eval.py", line 564, in eval
return self._eval(memorize_state["eval_code"],
File "C:\Users\mghaf\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\patsy\eval.py", line 547, in _eval
return call_and_wrap_exc("Error evaluating factor",
File "C:\Users\mghaf\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\patsy\compat.py", line 43, in call_and_wrap_exc
exec("raise new_exc from e")
File "<string>", line 1, in <module>
patsy.PatsyError: Error evaluating factor: NameError: name 'axis' is not defined
y-axis ~ C(x-axis)
^^^^^^^^^
The main problem that I see is that I named the X and Y variables x="x-axis", y="y-axis". But I do not understand why that gives me an error, as I made a very neat-looking boxplot with the same defaults (though I know that there X and Y are used as the axis titles):
def boxplot(csv, vars, x="x-axis", y="y-axis"):
    """Show a box plot with an overlaid swarm plot for selected CSV columns.

    The columns named in ``vars`` are melted into long format; ``x`` and ``y``
    become the names of the melted group and value columns and are passed to
    seaborn as the plot axes.

    Adapted from https://www.reneshbedre.com/blog/anova.html
    """
    wide = pd.read_csv(csv, delimiter=",")
    long_form = pd.melt(
        wide.reset_index(),
        id_vars=['index'],
        value_vars=vars,
    )
    long_form.columns = ['index', x, y]

    # Both layers share the same data and axis mapping; only the colour differs.
    shared = {"x": x, "y": y, "data": long_form}
    ax = sns.boxplot(color='#99c2a2', **shared)
    ax = sns.swarmplot(color='#7d0013', **shared)
    plt.show()
BUT, whenever I write this code from someone else, it gives the output I want:
import statsmodels.api as sm
from statsmodels.formula.api import ols
import pandas as pd
# Load the tab-separated example dataset used by the tutorial.
df = pd.read_csv("https://reneshbedre.github.io/assets/posts/anova/onewayanova.txt", sep="\t")
# Reshape wide -> long: the columns A-D are stacked into one name column and one value column.
df_melt = pd.melt(df.reset_index(), id_vars=['index'], value_vars=['A', 'B', 'C', 'D'])
# Give the melted columns plain-identifier names; the formula below refers to them by these names.
df_melt.columns = ['index', 'treatments', 'value']
# Fit OLS of `value` on `treatments` (wrapped in C(...)), then build the type-2 ANOVA table.
model = ols('value ~ C(treatments)', data=df_melt).fit()
anova_table = sm.stats.anova_lm(model, typ=2)
print(anova_table)
The output that i get with the above code:
sum_sq df F PR(>F)
C(treatments) 3010.95 3.0 17.49281 0.000026
Residual 918.00 16.0 NaN NaN
The main issue is that I need to change the values in model = ols('value ~ C(treatments)', data=df_melt).fit() and df_melt.columns = ['index', 'treatments', 'value'], because most datasets do not use 'treatments' and 'value' as their column names. In case you are wondering, my .csv file contains the following:
Column headers of a, b and c
A list of equal amount of numbers in each of them
My main issue is:
Please try and help me understand why I cannot replace 'value ~ C(treatments)' with X and Y!
Source of the code: https://www.reneshbedre.com/blog/anova.html
In statsmodels formulas, you need to quote your variables (i.e. the columns of your dataframe) when their names contain special characters such as -. Have a look at the documentation: your term "x-axis" is interpreted as "x" minus "axis", which is why the error complains that the name axis is not defined. Quoting a variable is done with the Q() transformation. Make sure the quotes around the variable name inside Q() are of a different kind (single/double) from the ones that delimit the formula string:
model = ols(f'Q("{y}") ~ C(Q("{x}"))', data=df_melt).fit()
It seems that model = ols('value ~ C(treatments)', data=df_melt).fit() cannot take a variable substitute (as I had in model = ols(f'{y} ~ C({x})', data=df_melt).fit()). This is also the case if I use model = ols(f'Q("{y}") ~ C(Q("{x}"))', data=df_melt).fit(), as suggested by @Rob.
Therefore, to make it work with my own names, I just have to rename the columns in df_melt.columns = ['index', 'treatments', 'value'] to match model = ols('value ~ C(treatments)', data=df_melt).fit() (the names 'treatments' and 'value' must be the same in the two lines of code).
I am getting an error when creating a series in pandas.
Whenever I try to print the series I have created, I get an error.
The code I am running:
import pandas as pd
# Values for the Series and the labels passed as its index (second positional argument).
data2 = [1,2,3,4]
index = ['a','b','c','d']
s = pd.Series(data2, index)
print(s.shape)
# Bare expression: in the interactive shell this asks for repr(s), which is
# where the traceback below starts.
s
The error:
Traceback (most recent call last):
File "<pyshell#6>", line 1, in <module>
s
File "C:\Python34\lib\idlelib\rpc.py", line 611, in displayhook
text = repr(value)
File "C:\Python34\lib\site-packages\pandas\core\base.py", line 80, in __repr__
return str(self)
File "C:\Python34\lib\site-packages\pandas\core\base.py", line 59, in __str__
return self.__unicode__()
File "C:\Python34\lib\site-packages\pandas\core\series.py", line 1060, in __unicode__
width, height = get_terminal_size()
File "C:\Python34\lib\site-packages\pandas\io\formats\terminal.py", line 33, in get_terminal_size
return shutil.get_terminal_size()
File "C:\Python34\lib\shutil.py", line 1071, in get_terminal_size
size = os.get_terminal_size(sys.__stdout__.fileno())
AttributeError: 'NoneType' object has no attribute 'fileno'
Your error is related to pyshell (the IDLE shell), not to pandas.
Try running it directly with python or in a Jupyter console, because the code you provided is correct.
I am trying to create a separate Python file containing the code given below. When calling the function, I pass mydata as a data frame with these columns:
['wage', 'educ', 'exper', 'tenure'].
import pandas as pd
import numpy as np
from prettytable import PrettyTable as pt
# Ordinary-least-squares regression of `target_column` on the other columns of
# `mydata`, solved with the normal equations. Prints a table of coefficients,
# standard errors and t-statistics, then the residual and total sums of squares,
# the variance estimate and R squared. (Indentation was lost in the paste.)
# Raises TypeError if `mydata` is not a DataFrame or `target_column` is not a str,
# and KeyError if `target_column` is not one of the columns of `mydata`.
def LinearRegressionOLS(mydata,target_column):
if(not isinstance(mydata,pd.DataFrame)):
raise TypeError("Data must be of type Data Frame")
if(not isinstance(target_column,str)):
raise TypeError("target_column must be String")
if(target_column not in mydata.columns):
raise KeyError("target_column doesn't exist in Data Frame")
# Work on a copy so the caller's frame is not modified by the column add/drop below.
data=mydata.copy()
# Constant column of ones for the intercept term.
# NOTE(review): data.count() counts non-null values, so this presumably assumes
# the target column has no missing values -- confirm.
data["one"]=np.ones(data.count()[target_column])
column_list=["one"]
# NOTE(review): this appends every column of `data`, i.e. also `target_column`
# (dropped from `data` below) and "one" a second time.
for i in data.columns:
column_list.append(i)
# NOTE(review): DataFrame.as_matrix() no longer exists in pandas 1.0+;
# .to_numpy() / .values is the replacement there.
Y=data[target_column].as_matrix()
data.drop(target_column,inplace=True,axis=1)
# Presumably the line that raises the reported KeyError: column_list still
# names the target column, which was just dropped from `data`.
X=data[column_list].as_matrix()
del data
# Normal equations: beta = (X'X)^-1 X'Y.
beta = np.matmul(np.matmul(np.linalg.inv(np.matmul(X.T,X)),X.T),Y)
predY = np.matmul(X,beta)
# Total sum of squares of Y around its mean.
total = np.matmul((Y-np.mean(Y)).T,(Y-np.mean(Y)))
# Residual sum of squares.
residual = np.matmul((Y-predY).T,(Y-predY))
# Residual sum of squares divided by (rows of X - columns of X).
sigma = np.matmul((Y-predY).T,(Y-predY))/(X.shape[0]-X.shape[1])
# NOTE(review): `sigma` is already a sum of squares over degrees of freedom;
# squaring it again before scaling (X'X)^-1 looks unintended -- confirm.
omega = np.square(sigma)*np.linalg.inv(np.matmul(X.T,X))
# Standard errors are the square roots of the diagonal of `omega`.
SE = np.sqrt(np.diag(omega))
tstat = beta/SE
Rsq = 1-(residual/total)
# Assemble the printed table: one row per entry of column_list.
final = pt()
final.add_column(" ",column_list)
final.add_column("Coefficients",beta)
final.add_column("Standard Error",SE)
final.add_column("t-stat",tstat)
print(final)
print("Residual: ",residual)
print("Total: ",total)
print("Standard Error: ",sigma)
print("R Square: ",Rsq)
After running the above code, by calling the function given below,
>>> c
['wage', 'educ', 'exper', 'tenure']
>>> import LR_OLS as inf
>>> inf.LinearRegressionOLS(file[c],"wage")
, I get an error like this:
Traceback (most recent call last):
File "<pyshell#182>", line 1, in <module>
inf.LinearRegressionOLS(file[c],"wage")
File "E:\python\LR_OLS.py", line 29, in LinearRegressionOLS
File "C:\Program Files\Python35\lib\site-packages\pandas\core\frame.py", line 2133, in __getitem__
return self._getitem_array(key)
File "C:\Program Files\Python35\lib\site-packages\pandas\core\frame.py", line 2177, in _getitem_array
indexer = self.loc._convert_to_indexer(key, axis=1)
File "C:\Program Files\Python35\lib\site-packages\pandas\core\indexing.py", line 1269, in _convert_to_indexer
.format(mask=objarr[mask]))
KeyError: "['wage'] not in index"
Can anyone help me understand why I am getting this error? How can I resolve it?
The problem is that you still have 'wage' in column_list. To keep it from ever getting in there, make the following change to the loop:
for i in data.columns:
    # Skip the regression target (it is dropped from `data` before X is built)
    # and the intercept column "one", which column_list already starts with;
    # listing "one" twice would put two identical columns into X.
    if i != target_column and i != "one":  # add this line to your code
        column_list.append(i)
I have a dictionary of pandas DataFrames called df. I want to split each DataFrame wherever the gap between consecutive values of the time_epoch column exceeds a threshold of 4.5, and then merge all the results into a single collection.
Based on this question and this question, I came up with the following code, but I get an error:
keys= df.keys()
# NOTE(review): the name `all` shadows the built-in all().
all = Counter()
for key in keys:
# Comparison is True where a row's time_epoch exceeds the previous row's by more
# than 4.5; the cumulative sum turns those breaks into increasing group ids.
ids = (df[key]['time_epoch'] > (df[key]['time_epoch'].shift() + 4.5)).cumsum()
gp= df[key].groupby(ids)
# NOTE(review): Counter.update() adds incoming values to the stored counts, and
# the values here are DataFrames; presumably that addition is what produces the
# "must be str, not int" TypeError, since the traceback ends on this line.
all.update(Counter(dict(list(gp))))
I get the following error:
Traceback (most recent call last):
File "C:\Users\...\Miniconda3\lib\site-packages\pandas\core\ops.py", line 1176, in na_op
raise_on_error=True, **eval_kwargs)
File "C:\Users\...\Miniconda3\lib\site-packages\pandas\core\computation\expressions.py", line 211, in evaluate
**eval_kwargs)
File "C:\Users\...\Miniconda3\lib\site-packages\pandas\core\computation\expressions.py", line 64, in _evaluate_standard
return op(a, b)
TypeError: must be str, not int
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Users\...\Miniconda3\lib\site-packages\pandas\core\internals.py", line 1184, in eval
result = get_result(other)
File "C:\Users\...\Miniconda3\lib\site-packages\pandas\core\internals.py", line 1153, in get_result
result = func(values, other)
File "C:\Users\...\Miniconda3\lib\site-packages\pandas\core\ops.py", line 1202, in na_op
result[mask] = op(xrav, y)
TypeError: must be str, not int
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "D:/code.py", line 53, in <module>
function()
File "D:/code.py", line 41, in function
all.update(Counter(dict(list(flow_key))))
Edit1
My df is created as follow:
# First read of traffic.csv; the timestamp column is requested as 'time_epoch'.
dftemp = pd.read_csv(
"traffic.csv",
skipinitialspace=True,
usecols=[
'time_epoch', 'ip.src', 'ip.dst', 'tcp.srcport', 'tcp.dstport',
'frame.len', 'tcp.flags', 'Protocol',
],
na_filter=False,
encoding="utf-8")
# Second read of the same file with the same options.
# NOTE(review): here the timestamp column is requested as 'frame.time_epoch',
# while the first read and the splitting code use 'time_epoch' -- confirm which
# name the CSV actually has.
complete = pd.read_csv(
"traffic.csv",
skipinitialspace=True,
usecols=[
'frame.time_epoch', 'ip.src', 'ip.dst', 'tcp.srcport',
'tcp.dstport', 'frame.len', 'tcp.flags', 'Protocol',
],
na_filter=False,
encoding="utf-8")
# Negate frame.len on rows whose destination equals hostip[i].
# NOTE(review): `hostip` and `i` are not defined anywhere in this snippet.
complete.loc[(complete['ip.dst'] == hostip[i]), 'frame.len'] = complete['frame.len'] * -1
# On the rows now carrying a negative frame.len, swap source and destination
# address/port, taking the original values from the unmodified copy `dftemp`.
complete.loc[(complete['frame.len'] < 0), 'ip.src'] = dftemp['ip.dst']
complete.loc[(complete['frame.len'] < 0), 'ip.dst'] = dftemp['ip.src']
complete.loc[(complete['frame.len'] < 0), 'tcp.srcport'] = dftemp['tcp.dstport']
complete.loc[(complete['frame.len'] < 0), 'tcp.dstport'] = dftemp['tcp.srcport']
# Group rows by (src, dst, src port, dst port, protocol) and materialise the
# groups as a dict mapping each group key to its DataFrame.
complete_flow = complete.groupby(
['ip.src','ip.dst','tcp.srcport','tcp.dstport','Protocol'])
df = dict(list(complete_flow))
df contains network traffic flows; I want to split each flow using a threshold on the gap between packet timestamps.
Edit2
I found that Counter only keeps a count for each key, so I iterate over the new dictionary and create a unique key for each entry. Is there a more Pythonic way of doing this?
# Split every flow in `df` into sub-flows wherever the gap between consecutive
# time_epoch values exceeds 4.5, and collect all sub-flows in one dict whose
# keys are "<flow key>, <sub-flow id>" strings.
flows = {}
for key in keys:
    frame = df[key]
    # True where a row's time_epoch is more than 4.5 after the previous row's;
    # the cumulative sum turns each such break into a new sub-flow id.
    flow_ids = (frame['time_epoch'] > (frame['time_epoch'].shift() + 4.5)).cumsum()
    # Group by the ids computed for *this* flow (not a stale `ids` left over
    # from an earlier snippet) and store each sub-flow under a unique key.
    for sub_id, sub_flow in frame.groupby(flow_ids):
        flows["%s, %s" % (key, sub_id)] = sub_flow
I am reading the book Machine Learning in Action.
One example in Chapter 2 converts strings to ints for classification use. For example, 'student' = 1, 'teacher' = 2, 'engineer' = 3.
See line 12 of the code below. An error comes up when I execute it:
invalid literal for int() with base 10: 'largeDoses'
Where is my problem?
def file2matrix(filename):
    """Parse a tab-separated data file into a feature matrix and a label list.

    Every non-blank line must hold at least three numeric feature columns,
    with an integer class label in the last column.

    Args:
        filename: Path of the tab-separated text file to read.

    Returns:
        A tuple ``(returnMat, classLabelVector)``: ``returnMat`` is an array
        with one row per data line holding the first three columns, and
        ``classLabelVector`` is the list of integer labels.

    Raises:
        ValueError: If a feature is not numeric, or the last column is not an
            integer literal (for example a text label such as 'largeDoses').
    """
    # Read the file once and close it deterministically; the matrix is sized
    # from the lines that actually carry data, so blank lines are ignored.
    with open(filename) as fr:
        stripped = [line.strip() for line in fr]
    lines = [line for line in stripped if line]

    returnMat = zeros((len(lines), 3))  # prepare matrix to return
    classLabelVector = []  # prepare labels return
    for index, line in enumerate(lines):
        listFromLine = line.split('\t')
        returnMat[index, :] = listFromLine[0:3]
        classLabelVector.append(int(listFromLine[-1]))
    return returnMat, classLabelVector
caller code:
from numpy import *
import kNN
datingDataMat,datingLabels = kNN.file2matrix('datingTestSet.txt')
import matplotlib
import matplotlib.pyplot as plt
# Single-axes figure for a scatter plot of columns 1 and 2 of the data matrix.
fig = plt.figure()
ax = fig.add_subplot(111)
#ax.scatter(datingDataMat[:,1], datingDataMat[:,2])
# The label array is passed twice, as the third and fourth positional
# arguments of scatter (marker size and colour in matplotlib's signature).
ax.scatter(datingDataMat[:,1], datingDataMat[:,2], array(datingLabels), array(datingLabels))
plt.show()
Traceback and error:
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "C:\Anaconda2\lib\site-packages\spyderlib\widgets\externalshell\sitecustomize.py", line 714, in runfile
execfile(filename, namespace)
File "C:\Anaconda2\lib\site-packages\spyderlib\widgets\externalshell\sitecustomize.py", line 74, in execfile
exec(compile(scripttext, filename, 'exec'), glob, loc)
File "C:/Users/Zhiming Zhang/Documents/Machine Learning/kNN/execute.py", line 10, in <module>
datingDataMat,datingLabels = kNN.file2matrix('datingTestSet.txt')
File "kNN.py", line 48, in file2matrix
classLabelVector.append(int(listFromLine[-1]))
ValueError: invalid literal for int() with base 10: 'largeDoses'
You are trying to convert a string such as "largeDoses" to an int using the conversion function int(), but that is not how it works. The function int() only converts strings that look like integer numbers (e.g. "123") to integers.
In your case you can use either an if-elif-else cascade or a dictionary.
Cascade:
if listFromLine[-1] == 'largeDose':
result = 1
elif listFromLine[-1] == 'teacher':
result = 2
elif …
…
else:
result = 42 # or raise an exception or whatever
Dictionary:
conversion = {
'largeDose': 1,
'teacher': 2,
… }
# ...
# later, in the loop:
classLabelVector.append(conversion[listFromLine[-1]])
# The above will raise a KeyError if an unexpected value is given.
# Or, in case you want to use a default value:
classLabelVector.append(conversion.get(listFromLine[-1], 42))