Adding labels to ScatterChart in openpyxl - python

I have wrote this simple code to test things out for a more complicated one:
import sys
from openpyxl import load_workbook
from openpyxl.chart import Reference, ScatterChart, Series
from openpyxl.chart.label import DataLabel, DataLabelList
from openpyxl.chart.marker import Marker
from openpyxl.chart.shapes import GraphicalProperties
from openpyxl.drawing.colors import ColorChoice
wb = load_workbook(sys.argv[1])
for ws in wb:
chart = ScatterChart(scatterStyle='smoothMarker')
for x in range(2, 192):
labels = Reference(ws, min_col=1, max_col=1, min_row=x, max_row=x)
xaxis = Reference(ws, min_col=2, max_col=2, min_row=x, max_row=x)
data = Reference(ws, min_col=3, max_col=3, min_row=x, max_row=x)
s = Series(xaxis, data, title=ws['A' + str(x)].value)
sp_color = ColorChoice(prstClr=(str(ws['A' + str(x)].value)))
sp_sppr = GraphicalProperties(solidFill=sp_color)
s.marker = Marker(symbol=('circle'), size=20, spPr=sp_sppr)
s.marker.spPr.ln.noFill = True
s.spPr.ln.noFill = True
slabel = list() #EDITED
for x in labels.cells: #EDITED
slabel.append(ws[x].value) #EDITED
s.labels = DataLabelList(dLbl=slabel, dLblPos='bestFit')
chart.series.append(s)
chart.dataLabels = (DataLabelList(showLegendKey=True))
ws.add_chart(chart, 'D1')
wb.save(sys.argv[1])
the file use as sys.argv[1] is like that (technically it's just the color list for prstClr of ColorChoice with x += 10 and y = x):
Color x y
beige 0 0
forestGreen 10 10
dkGoldenrod 20 20
lightPink 30 30
slateGrey 40 40
the line s.labels = DataLabelList(dLbl=slabel, dLblPos='bestFit') outputs the error:
Traceback (most recent call last):
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/openpyxl/descriptors/base.py", line 57, in _convert
value = expected_type(value)
ValueError: invalid literal for int() with base 10: 'beige'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/openpyxl/descriptors/base.py", line 57, in _convert
value = expected_type(value)
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/openpyxl/chart/label.py", line 100, in __init__
self.idx = idx
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/openpyxl/descriptors/nested.py", line 36, in __set__
super(Nested, self).__set__(instance, value)
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/openpyxl/descriptors/base.py", line 69, in __set__
value = _convert(self.expected_type, value)
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/openpyxl/descriptors/base.py", line 59, in _convert
raise TypeError('expected ' + str(expected_type))
TypeError: expected <class 'int'>
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "csv/test.py", line 37, in <module>
s.labels = DataLabelList(dLbl=slabel, dLblPos='bestFit')
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/openpyxl/chart/label.py", line 128, in __init__
self.dLbl = dLbl
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/openpyxl/descriptors/sequence.py", line 27, in __set__
seq = [_convert(self.expected_type, value) for value in seq]
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/openpyxl/descriptors/sequence.py", line 27, in <listcomp>
seq = [_convert(self.expected_type, value) for value in seq]
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/openpyxl/descriptors/base.py", line 59, in _convert
raise TypeError('expected ' + str(expected_type))
TypeError: expected <class 'openpyxl.chart.label.DataLabel'>
EDIT: I used a list as Charlie pointed it out without more success, this list should contains the same amount of items with the label names of each.
Either slabel.append(ws[x].value) or slabel.append(x) output the same error, what should that list contains?
Versions:
- openpyxl 2.4.8
- python 3.6
- macOS Siera 10.12.6
Regards

Sequence is a descriptor used when writing classes you're using it incorrectly. You need to provide a Python sequence (list, tuple, etc.)

Related

Why does Python say that a value does not exist when it specifically does?

SHORT DESCRIPTION:
The Main issue is that whenever i run the following code, i get the error below that:
import statsmodels.api as sm
from statsmodels.formula.api import ols
def onewayanaova (csv, vars, x="x-axis", y="y-axis"):
df = pd.read_csv(csv, delimiter=",")
df_melt = pd.melt(df.reset_index(), id_vars=['index'], value_vars=vars)
df_melt.columns = ['index', {x}, {y}]
model = ols(f'{y} ~ C({x})', data=df_melt).fit()
anova_table = sm.stats.anova_lm(model, typ=2)
print("The One-Way Anova Test Values are:\n")
print(anova_table)
onewayanaova("Book1.csv", ["a","b","c"])
The error is:
Traceback (most recent call last):
File "pandas\_libs\hashtable_class_helper.pxi", line 5231, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'set'
Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
File "pandas\_libs\hashtable_class_helper.pxi", line 5231, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'set'
Traceback (most recent call last):
File "C:\Users\mghaf\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\patsy\compat.py", line 36, in call_and_wrap_exc
return f(*args, **kwargs)
File "C:\Users\mghaf\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\patsy\eval.py", line 165, in eval
return eval(code, {}, VarLookupDict([inner_namespace]
File "<string>", line 1, in <module>
NameError: name 'axis' is not defined
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "c:\Users\mghaf\Desktop\Python Codes\ReMan Edu\test.py", line 3, in <module>
mn.onewayanaova("Book1.csv", ["a","b","c"])
File "c:\Users\mghaf\Desktop\Python Codes\ReMan Edu\maincode.py", line 154, in onewayanaova
model = ols(f'{y} ~ C({x})', data=df_melt).fit()
File "C:\Users\mghaf\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\statsmodels\base\model.py", line 200, in from_formula
tmp = handle_formula_data(data, None, formula, depth=eval_env,
File "C:\Users\mghaf\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\statsmodels\formula\formulatools.py", line 63, in handle_formula_data
result = dmatrices(formula, Y, depth, return_type='dataframe',
File "C:\Users\mghaf\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\patsy\highlevel.py", line 309, in dmatrices
(lhs, rhs) = _do_highlevel_design(formula_like, data, eval_env,
File "C:\Users\mghaf\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\patsy\highlevel.py", line 164, in _do_highlevel_design
design_infos = _try_incr_builders(formula_like, data_iter_maker, eval_env,
File "C:\Users\mghaf\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\patsy\highlevel.py", line 66, in _try_incr_builders
return design_matrix_builders([formula_like.lhs_termlist,
File "C:\Users\mghaf\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\patsy\build.py", line 693, in design_matrix_builders
cat_levels_contrasts) = _examine_factor_types(all_factors,
File "C:\Users\mghaf\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\patsy\build.py", line 443, in _examine_factor_types
value = factor.eval(factor_states[factor], data)
File "C:\Users\mghaf\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\patsy\eval.py", line 564, in eval
return self._eval(memorize_state["eval_code"],
File "C:\Users\mghaf\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\patsy\eval.py", line 547, in _eval
return call_and_wrap_exc("Error evaluating factor",
File "C:\Users\mghaf\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\patsy\compat.py", line 43, in call_and_wrap_exc
exec("raise new_exc from e")
File "<string>", line 1, in <module>
patsy.PatsyError: Error evaluating factor: NameError: name 'axis' is not defined
y-axis ~ C(x-axis)
^^^^^^^^^
I think it is the X and Y variables I set in def onewayanaova (csv, vars, x="x-axis", y="y-axis"):. Maybe I need to change that so I don't get the error?
If you want a more detailed description, read below.
LONG DESCRIPTION:
I am trying to do a One Way Anova test. However, the main issue is that python keeps saying that there is a NameError, and that one of my values are not defined.
I am running the following code:
import statsmodels.api as sm
from statsmodels.formula.api import ols
def onewayanaova (csv, vars, x="x-axis", y="y-axis"):
df = pd.read_csv(csv, delimiter=",")
df_melt = pd.melt(df.reset_index(), id_vars=['index'], value_vars=vars)
df_melt.columns = ['index', {x}, {y}]
model = ols(f'{y} ~ C({x})', data=df_melt).fit()
anova_table = sm.stats.anova_lm(model, typ=2)
print("The One-Way Anova Test Values are:\n")
print(anova_table)
And:
import maincode as mn
mn.onewayanaova("Book1.csv", ["a","b","c"])
I get the following error (The first code is saved to a file named manicode.py, and the second code is saved to a file named test.py. "Book1.csv" is in the same folder as them). The error is:
Traceback (most recent call last):
File "pandas\_libs\hashtable_class_helper.pxi", line 5231, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'set'
Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
File "pandas\_libs\hashtable_class_helper.pxi", line 5231, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'set'
Traceback (most recent call last):
File "C:\Users\mghaf\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\patsy\compat.py", line 36, in call_and_wrap_exc
return f(*args, **kwargs)
File "C:\Users\mghaf\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\patsy\eval.py", line 165, in eval
return eval(code, {}, VarLookupDict([inner_namespace]
File "<string>", line 1, in <module>
NameError: name 'axis' is not defined
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "c:\Users\mghaf\Desktop\Python Codes\ReMan Edu\test.py", line 3, in <module>
mn.onewayanaova("Book1.csv", ["a","b","c"])
File "c:\Users\mghaf\Desktop\Python Codes\ReMan Edu\maincode.py", line 154, in onewayanaova
model = ols(f'{y} ~ C({x})', data=df_melt).fit()
File "C:\Users\mghaf\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\statsmodels\base\model.py", line 200, in from_formula
tmp = handle_formula_data(data, None, formula, depth=eval_env,
File "C:\Users\mghaf\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\statsmodels\formula\formulatools.py", line 63, in handle_formula_data
result = dmatrices(formula, Y, depth, return_type='dataframe',
File "C:\Users\mghaf\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\patsy\highlevel.py", line 309, in dmatrices
(lhs, rhs) = _do_highlevel_design(formula_like, data, eval_env,
File "C:\Users\mghaf\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\patsy\highlevel.py", line 164, in _do_highlevel_design
design_infos = _try_incr_builders(formula_like, data_iter_maker, eval_env,
File "C:\Users\mghaf\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\patsy\highlevel.py", line 66, in _try_incr_builders
return design_matrix_builders([formula_like.lhs_termlist,
File "C:\Users\mghaf\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\patsy\build.py", line 693, in design_matrix_builders
cat_levels_contrasts) = _examine_factor_types(all_factors,
File "C:\Users\mghaf\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\patsy\build.py", line 443, in _examine_factor_types
value = factor.eval(factor_states[factor], data)
File "C:\Users\mghaf\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\patsy\eval.py", line 564, in eval
return self._eval(memorize_state["eval_code"],
File "C:\Users\mghaf\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\patsy\eval.py", line 547, in _eval
return call_and_wrap_exc("Error evaluating factor",
File "C:\Users\mghaf\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\patsy\compat.py", line 43, in call_and_wrap_exc
exec("raise new_exc from e")
File "<string>", line 1, in <module>
patsy.PatsyError: Error evaluating factor: NameError: name 'axis' is not defined
y-axis ~ C(x-axis)
^^^^^^^^^
The main error that I see is that I named the X and Y variables as: x="x-axis", y="y-axis". But i do not get why that gives me an error, as I made a very neat looking boxplot from it (but I know that X and Y are used as the axis titles):
def boxplot (csv, vars, x="x-axis", y="y-axis"):
#https://www.reneshbedre.com/blog/anova.html
df = pd.read_csv(csv, delimiter=",")
df_melt = pd.melt(df.reset_index(), id_vars=['index'], value_vars=vars)
df_melt.columns = ['index', x, y]
ax = sns.boxplot(x=x, y=y, data=df_melt, color='#99c2a2')
ax = sns.swarmplot(x=x, y=y, data=df_melt, color='#7d0013')
plt.show()
BUT, whenever I write this code from someone else, it gives the output I want:
import statsmodels.api as sm
from statsmodels.formula.api import ols
import pandas as pd
df = pd.read_csv("https://reneshbedre.github.io/assets/posts/anova/onewayanova.txt", sep="\t")
df_melt = pd.melt(df.reset_index(), id_vars=['index'], value_vars=['A', 'B', 'C', 'D'])
df_melt.columns = ['index', 'treatments', 'value']
model = ols('value ~ C(treatments)', data=df_melt).fit()
anova_table = sm.stats.anova_lm(model, typ=2)
print(anova_table)
The output that i get with the above code:
sum_sq df F PR(>F)
C(treatments) 3010.95 3.0 17.49281 0.000026
Residual 918.00 16.0 NaN NaN
The main issue is that i need to change values of model = ols('value ~ C(treatments)', data=df_melt).fit() and df_melt.columns = ['index', 'treatments', 'value'] because most datasets do not have 'treatments', 'value' as their database. If your wondering what my .csv file has is this:
Column headers of a, b and c
A list of equal amount of numbers in each of them
My main issue is:
Please try and help me understand why I cannot replace 'value ~ C(treatments)' with X and Y!
Source of the code: https://www.reneshbedre.com/blog/anova.html
In statsmodels formulae, you need to quote your variables (i.e. columns in your dataframe) when they contain special characters such as -. Have a look at the documentation, your term "x-axis" is interpreted as "x" - "axis". Quoting variable can be done with the Q() transformation. Make sure to quote the variable name inside with different (single/double) quotes that you use for the string:
model = ols(f'Q("{y}") ~ C(Q("{x}"))', data=df_melt).fit()
It seems that model = ols('value ~ C(treatments)', data=df_melt).fit() cannot have a variable subsitute (as i had in model = ols(f'{y} ~ C({x})', data=df_melt).fit()). This is also the case if i use model = ols(f'Q("{y}") ~ C(Q("{x}"))', data=df_melt).fit(), as mentioned by #Rob.
Therefore, to make it work and have my own names, i just have to rename df_melt.columns = ['index', 'treatments', 'value'] in relation to model = ols('value ~ C(treatments)', data=df_melt).fit() (where 'treatments', 'value' are the same thing in teh two lines of code).

error in creating series in pandas

I am getting an error when creating a series in pandas.
Whenever I try to print the series I have created, I get an error.
The code I am running:
import pandas as pd
data2 = [1,2,3,4]
index = ['a','b','c','d']
s = pd.Series(data2, index)
print(s.shape)
s
The error:
Traceback (most recent call last):
File "<pyshell#6>", line 1, in <module>
s
File "C:\Python34\lib\idlelib\rpc.py", line 611, in displayhook
text = repr(value)
File "C:\Python34\lib\site-packages\pandas\core\base.py", line 80, in __repr__
return str(self)
File "C:\Python34\lib\site-packages\pandas\core\base.py", line 59, in __str__
return self.__unicode__()
File "C:\Python34\lib\site-packages\pandas\core\series.py", line 1060, in __unicode__
width, height = get_terminal_size()
File "C:\Python34\lib\site-packages\pandas\io\formats\terminal.py", line 33, in get_terminal_size
return shutil.get_terminal_size()
File "C:\Python34\lib\shutil.py", line 1071, in get_terminal_size
size = os.get_terminal_size(sys.__stdout__.fileno())
AttributeError: 'NoneType' object has no attribute 'fileno'
Your error is related to pyshell, not to pandas.
Try to run it through python directly or jupyter console, because the code you provided is correct.

Python gives KeyError while the key is passed

I am trying to create different python file where the code is given below. While calling the method, I pass the mydata as data frame with these columns
['wage', 'educ', 'exper', 'tenure'].
import pandas as pd
import numpy as np
from prettytable import PrettyTable as pt
def LinearRegressionOLS(mydata,target_column):
if(not isinstance(mydata,pd.DataFrame)):
raise TypeError("Data must be of type Data Frame")
if(not isinstance(target_column,str)):
raise TypeError("target_column must be String")
if(target_column not in mydata.columns):
raise KeyError("target_column doesn't exist in Data Frame")
data=mydata.copy()
data["one"]=np.ones(data.count()[target_column])
column_list=["one"]
for i in data.columns:
column_list.append(i)
Y=data[target_column].as_matrix()
data.drop(target_column,inplace=True,axis=1)
X=data[column_list].as_matrix()
del data
beta = np.matmul(np.matmul(np.linalg.inv(np.matmul(X.T,X)),X.T),Y)
predY = np.matmul(X,beta)
total = np.matmul((Y-np.mean(Y)).T,(Y-np.mean(Y)))
residual = np.matmul((Y-predY).T,(Y-predY))
sigma = np.matmul((Y-predY).T,(Y-predY))/(X.shape[0]-X.shape[1])
omega = np.square(sigma)*np.linalg.inv(np.matmul(X.T,X))
SE = np.sqrt(np.diag(omega))
tstat = beta/SE
Rsq = 1-(residual/total)
final = pt()
final.add_column(" ",column_list)
final.add_column("Coefficients",beta)
final.add_column("Standard Error",SE)
final.add_column("t-stat",tstat)
print(final)
print("Residual: ",residual)
print("Total: ",total)
print("Standard Error: ",sigma)
print("R Square: ",Rsq)
After running the above code, by calling the function given below,
>>> c
['wage', 'educ', 'exper', 'tenure']
>>> import LR_OLS as inf
>>> inf.LinearRegressionOLS(file[c],"wage")
, i get some error like this
Traceback (most recent call last):
File "<pyshell#182>", line 1, in <module>
inf.LinearRegressionOLS(file[c],"wage")
File "E:\python\LR_OLS.py", line 29, in LinearRegressionOLS
File "C:\Program Files\Python35\lib\site-packages\pandas\core\frame.py", line 2133, in __getitem__
return self._getitem_array(key)
File "C:\Program Files\Python35\lib\site-packages\pandas\core\frame.py", line 2177, in _getitem_array
indexer = self.loc._convert_to_indexer(key, axis=1)
File "C:\Program Files\Python35\lib\site-packages\pandas\core\indexing.py", line 1269, in _convert_to_indexer
.format(mask=objarr[mask]))
KeyError: "['wage'] not in index"
Can anyone help me as to why i am getting this error. How can i resolve it?
The problem is that you still have 'wage' in 'column_list. So in order to never let it get in there do the following adaptation:
for i in data.columns:
if i != 'wage': # add this line to your code
column_list.append(i)

How to merge python dictionary and create new key for the intersections

I have a dictionary of python dataframe called df. I want to split each dataframe based on gap threshold of 4.5 on the time_epoch column and then merge all the result as a single collection.
From the this question and this question, I came up with following code but I get an error:
keys= df.keys()
all = Counter()
for key in keys:
ids = (df[key]['time_epoch'] > (df[key]['time_epoch'].shift() + 4.5)).cumsum()
gp= df[key].groupby(ids)
all.update(Counter(dict(list(gp))))
I get the following error:
Traceback (most recent call last):
File "C:\Users\...\Miniconda3\lib\site-packages\pandas\core\ops.py", line 1176, in na_op
raise_on_error=True, **eval_kwargs)
File "C:\Users\...\Miniconda3\lib\site-packages\pandas\core\computation\expressions.py", line 211, in evaluate
**eval_kwargs)
File "C:\Users\...\Miniconda3\lib\site-packages\pandas\core\computation\expressions.py", line 64, in _evaluate_standard
return op(a, b)
TypeError: must be str, not int
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Users\...\Miniconda3\lib\site-packages\pandas\core\internals.py", line 1184, in eval
result = get_result(other)
File "C:\Users\...\Miniconda3\lib\site-packages\pandas\core\internals.py", line 1153, in get_result
result = func(values, other)
File "C:\Users\...\Miniconda3\lib\site-packages\pandas\core\ops.py", line 1202, in na_op
result[mask] = op(xrav, y)
TypeError: must be str, not int
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "D:/code.py", line 53, in <module>
function()
File "D:/code.py", line 41, in function
all.update(Counter(dict(list(flow_key))))
Edit1
My df is created as follow:
dftemp = pd.read_csv(
"traffic.csv",
skipinitialspace=True,
usecols=[
'time_epoch', 'ip.src', 'ip.dst', 'tcp.srcport', 'tcp.dstport',
'frame.len', 'tcp.flags', 'Protocol',
],
na_filter=False,
encoding="utf-8")
complete = pd.read_csv(
"traffic.csv",
skipinitialspace=True,
usecols=[
'frame.time_epoch', 'ip.src', 'ip.dst', 'tcp.srcport',
'tcp.dstport', 'frame.len', 'tcp.flags', 'Protocol',
],
na_filter=False,
encoding="utf-8")
complete.loc[(complete['ip.dst'] == hostip[i]), 'frame.len'] = complete['frame.len'] * -1
complete.loc[(complete['frame.len'] < 0), 'ip.src'] = dftemp['ip.dst']
complete.loc[(complete['frame.len'] < 0), 'ip.dst'] = dftemp['ip.src']
complete.loc[(complete['frame.len'] < 0), 'tcp.srcport'] = dftemp['tcp.dstport']
complete.loc[(complete['frame.len'] < 0), 'tcp.dstport'] = dftemp['tcp.srcport']
complete_flow = complete.groupby(
['ip.src','ip.dst','tcp.srcport','tcp.dstport','Protocol'])
df = dict(list(complete_flow))
df contains network traffic flows, which I want to split each flow using a threshold on packets timestamp gap.
Edit2
I find that counter only keep count of each key, so I iterate over new dictionary and create unique key for each, is there a pythonic way of doing this?
flows = {}
i = 1
for key in keys:
i += 1
flow_ids = (df[key]['time_epoch'] > (df[key]['time_epoch'].shift() + 4.5)).cumsum()
gp = df[key].groupby(ids)
df2 = dict(list(gp))
keys2 = df2.keys()
for i in keys2:
flows["%s, %s" % (key,i)] = df2[i]
del df2

Python string to int for classification

I am reading the book Machine Learning in Action.
One example in Chapter 2 converts string to int for classification use. For example, 'student' = 1, 'teacher' = 2, engineer = 3.
See the code below in Line 12. While an error comes up while I execute it:
invalid literal for int() with base 10: 'largeDose'
Where is my problem.
def file2matrix(filename):
fr = open(filename)
numberOfLines = len(fr.readlines()) #get the number of lines in the file
returnMat = zeros((numberOfLines,3)) #prepare matrix to return
classLabelVector = [] #prepare labels return
fr = open(filename)
index = 0
for line in fr.readlines():
line = line.strip()
listFromLine = line.split('\t')
returnMat[index,:] = listFromLine[0:3]
classLabelVector.append(int(listFromLine[-1]))
index += 1
return returnMat,classLabelVector
caller code:
from numpy import *
import kNN
datingDataMat,datingLabels = kNN.file2matrix('datingTestSet.txt')
import matplotlib
import matplotlib.pyplot as plt
fig = plt.figure()
ax = fig.add_subplot(111)
#ax.scatter(datingDataMat[:,1], datingDataMat[:,2])
ax.scatter(datingDataMat[:,1], datingDataMat[:,2], array(datingLabels), array(datingLabels))
plt.show()
Traceback and error:
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "C:\Anaconda2\lib\site-packages\spyderlib\widgets\externalshell\sitecustomize.py", line 714, in runfile
execfile(filename, namespace)
File "C:\Anaconda2\lib\site-packages\spyderlib\widgets\externalshell\sitecustomize.py", line 74, in execfile
exec(compile(scripttext, filename, 'exec'), glob, loc)
File "C:/Users/Zhiming Zhang/Documents/Machine Learning/kNN/execute.py", line 10, in <module>
datingDataMat,datingLabels = kNN.file2matrix('datingTestSet.txt')
File "kNN.py", line 48, in file2matrix
classLabelVector.append(int(listFromLine[-1]))
ValueError: invalid literal for int() with base 10: 'largeDoses'
You try to convert a string like "largeDose" to an int using the conversion function int(). But that's not how this works. The function int() converts only strings which look like integer numbers (e. g. "123") to integers.
In your case you can use either an if-elif-else cascade or a dictionary.
Cascade:
if listFromLine[-1] == 'largeDose':
result = 1
elif listFromLine[-1] == 'teacher':
result = 2
elif …
…
else:
result = 42 # or raise an exception or whatever
Dictionary:
conversion = {
'largeDose': 1,
'teacher': 2,
… }
# ...
# later, in the loop:
classLabelVector.append(conversion[listFromLine[-1]])
# The above will raise a KeyError if an unexpected value is given.
# Ir in case you want to use a default value:
classLabelVector.append(conversion.get(listFromLine[-1], 42))

Categories