Python string to int for classification - python

I am reading the book Machine Learning in Action.
One example in Chapter 2 converts strings to ints for classification use. For example, 'student' = 1, 'teacher' = 2, 'engineer' = 3.
See line 12 of the code below (the int() call). However, an error comes up when I execute it:
invalid literal for int() with base 10: 'largeDoses'
Where is my problem?
def file2matrix(filename):
    """Load a tab-separated data file into a feature matrix and a label list.

    Each line is expected to hold at least three numeric feature columns,
    with the class label in the last column.

    Parameters:
        filename: path of the tab-separated text file to read.

    Returns:
        (returnMat, classLabelVector): a NumPy array with one row per line
        and three columns, and a list holding one int label per line.

    Raises:
        ValueError: if a feature is not numeric, or if the last column is
            not an integer literal (for example the text 'largeDoses').
    """
    # Read the file once and close the handle deterministically.
    with open(filename) as fr:
        lines = fr.readlines()
    numberOfLines = len(lines)  # get the number of lines in the file
    returnMat = zeros((numberOfLines, 3))  # prepare matrix to return
    classLabelVector = []  # prepare labels return
    for index, line in enumerate(lines):
        line = line.strip()
        listFromLine = line.split('\t')
        # Convert explicitly so the row assignment never relies on NumPy's
        # implicit string-to-float casting.
        returnMat[index, :] = [float(value) for value in listFromLine[0:3]]
        # int() only parses numeric text; a textual label such as
        # 'largeDoses' raises ValueError on this line.
        classLabelVector.append(int(listFromLine[-1]))
    return returnMat, classLabelVector
caller code:
# Driver script: load the dating data set through kNN.file2matrix and plot it.
from numpy import *
import kNN
# file2matrix returns the feature matrix and the list of class labels.
datingDataMat,datingLabels = kNN.file2matrix('datingTestSet.txt')
import matplotlib
import matplotlib.pyplot as plt
fig = plt.figure()
ax = fig.add_subplot(111)
#ax.scatter(datingDataMat[:,1], datingDataMat[:,2])
# Plot matrix column 1 against column 2. The label array is passed twice, as
# the third and fourth positional arguments of scatter (marker size and color
# in matplotlib's signature), so points are sized and colored by label value.
ax.scatter(datingDataMat[:,1], datingDataMat[:,2], array(datingLabels), array(datingLabels))
plt.show()
Traceback and error:
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "C:\Anaconda2\lib\site-packages\spyderlib\widgets\externalshell\sitecustomize.py", line 714, in runfile
execfile(filename, namespace)
File "C:\Anaconda2\lib\site-packages\spyderlib\widgets\externalshell\sitecustomize.py", line 74, in execfile
exec(compile(scripttext, filename, 'exec'), glob, loc)
File "C:/Users/Zhiming Zhang/Documents/Machine Learning/kNN/execute.py", line 10, in <module>
datingDataMat,datingLabels = kNN.file2matrix('datingTestSet.txt')
File "kNN.py", line 48, in file2matrix
classLabelVector.append(int(listFromLine[-1]))
ValueError: invalid literal for int() with base 10: 'largeDoses'

You try to convert a string like "largeDose" to an int using the conversion function int(). But that's not how this works. The function int() converts only strings which look like integer numbers (e. g. "123") to integers.
In your case you can use either an if-elif-else cascade or a dictionary.
Cascade:
if listFromLine[-1] == 'largeDose':
result = 1
elif listFromLine[-1] == 'teacher':
result = 2
elif …
…
else:
result = 42 # or raise an exception or whatever
Dictionary:
conversion = {
'largeDose': 1,
'teacher': 2,
… }
# ...
# later, in the loop:
classLabelVector.append(conversion[listFromLine[-1]])
# The above will raise a KeyError if an unexpected value is given.
# Or, in case you want to use a default value:
classLabelVector.append(conversion.get(listFromLine[-1], 42))

Related

Python gives KeyError while the key is passed

I am trying to create different python file where the code is given below. While calling the method, I pass the mydata as data frame with these columns
['wage', 'educ', 'exper', 'tenure'].
import pandas as pd
import numpy as np
from prettytable import PrettyTable as pt
def LinearRegressionOLS(mydata,target_column):
    """Fit an ordinary-least-squares regression and print a summary table.

    Parameters:
        mydata: pandas DataFrame holding the target and regressor columns.
        target_column: name (str) of the column of mydata used as the
            dependent variable.

    Returns:
        None; the coefficient table and summary statistics are printed.

    Raises:
        TypeError: if mydata is not a DataFrame or target_column is not a str.
        KeyError: if target_column is not a column of mydata.
    """
    if(not isinstance(mydata,pd.DataFrame)):
        raise TypeError("Data must be of type Data Frame")
    if(not isinstance(target_column,str)):
        raise TypeError("target_column must be String")
    if(target_column not in mydata.columns):
        raise KeyError("target_column doesn't exist in Data Frame")
    # Work on a copy so the caller's frame is not modified.
    data=mydata.copy()
    # Constant column for the intercept; its length is the number of non-null
    # target values (assumes the target has no missing values - TODO confirm).
    data["one"]=np.ones(data.count()[target_column])
    column_list=["one"]
    # NOTE(review): data.columns still contains target_column and the "one"
    # column added above, so both end up in column_list ("one" twice).
    for i in data.columns:
        column_list.append(i)
    # NOTE(review): as_matrix() was removed in pandas 1.0; newer pandas
    # versions need .values or .to_numpy() instead.
    Y=data[target_column].as_matrix()
    data.drop(target_column,inplace=True,axis=1)
    # NOTE(review): target_column was just dropped from data but is still in
    # column_list, so this selection raises KeyError ("... not in index").
    X=data[column_list].as_matrix()
    del data
    # Normal equations: beta = (X'X)^-1 X'Y.
    beta = np.matmul(np.matmul(np.linalg.inv(np.matmul(X.T,X)),X.T),Y)
    predY = np.matmul(X,beta)
    # Total and residual sums of squares.
    total = np.matmul((Y-np.mean(Y)).T,(Y-np.mean(Y)))
    residual = np.matmul((Y-predY).T,(Y-predY))
    # Residual sum of squares divided by (rows of X - columns of X).
    sigma = np.matmul((Y-predY).T,(Y-predY))/(X.shape[0]-X.shape[1])
    # NOTE(review): sigma is already built from squared residuals; squaring it
    # again here looks unintended - confirm the intended covariance formula.
    omega = np.square(sigma)*np.linalg.inv(np.matmul(X.T,X))
    SE = np.sqrt(np.diag(omega))
    tstat = beta/SE
    Rsq = 1-(residual/total)
    # Assemble the results table, one row per entry of column_list.
    final = pt()
    final.add_column(" ",column_list)
    final.add_column("Coefficients",beta)
    final.add_column("Standard Error",SE)
    final.add_column("t-stat",tstat)
    print(final)
    print("Residual: ",residual)
    print("Total: ",total)
    print("Standard Error: ",sigma)
    print("R Square: ",Rsq)
After running the above code, by calling the function given below,
>>> c
['wage', 'educ', 'exper', 'tenure']
>>> import LR_OLS as inf
>>> inf.LinearRegressionOLS(file[c],"wage")
I get the following error:
Traceback (most recent call last):
File "<pyshell#182>", line 1, in <module>
inf.LinearRegressionOLS(file[c],"wage")
File "E:\python\LR_OLS.py", line 29, in LinearRegressionOLS
File "C:\Program Files\Python35\lib\site-packages\pandas\core\frame.py", line 2133, in __getitem__
return self._getitem_array(key)
File "C:\Program Files\Python35\lib\site-packages\pandas\core\frame.py", line 2177, in _getitem_array
indexer = self.loc._convert_to_indexer(key, axis=1)
File "C:\Program Files\Python35\lib\site-packages\pandas\core\indexing.py", line 1269, in _convert_to_indexer
.format(mask=objarr[mask]))
KeyError: "['wage'] not in index"
Can anyone help me understand why I am getting this error? How can I resolve it?
The problem is that 'wage' is still in column_list after you drop that column from data. To keep it out of the list, make the following change:
for i in data.columns:
    # Add this condition to your code: skip the regression target (it is
    # dropped from data before X is built) and the "one" intercept column,
    # which column_list already starts with; listing "one" twice would
    # duplicate a column of X and make X'X singular.
    if i != target_column and i != "one":
        column_list.append(i)

How to specify constraints in pulp dynamically?

I want to check whether my data is linearly separable or not. For that I am using the equations mentioned at this link. Below is the code that I am using:
# Linear-separability check: build one expression per data row, add it to a
# pulp model with a constant objective, and print the solver status.
try:
    import os
    #import random
    import traceback
    import datetime
    #import numpy as np
    import scipy.io as sio
    import pulp
    os.system('cls')
    # Load the matrix stored under key 'A1' in A1.mat.
    dicA = sio.loadmat('A1.mat')
    A = dicA.get('A1')
    # NOTE(review): the third positional parameter of LpVariable.dicts is
    # lowBound, so pulp.LpContinuous is presumably stored as the lower bound
    # here rather than as the category - confirm against the pulp signature.
    var = pulp.LpVariable.dicts("var",range(11),pulp.LpContinuous)
    # Keep only the first 10 columns of A.
    A = A[:,0:10]
    model = pulp.LpProblem("Data linearly seaparable", pulp.LpMinimize)
    # Constant objective: only feasibility is of interest.
    model+= 0
    print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
    for i in range(len(A)):
        # Accumulate sum_j var[j] * A[i][j] for row i.
        expr = pulp.LpAffineExpression()
        for j in range(len(A[i])):
            expr += var[j]*A[i][j]
        # NOTE(review): this parses as expr += (var[10] <= -1), i.e. a
        # constraint object is added to the expression; the intent was
        # presumably the constraint (expr + var[10] <= -1) - confirm.
        expr+= var[10] <= -1
        # NOTE(review): if expr is still a plain expression at this point,
        # model += expr presumably replaces the objective instead of adding a
        # constraint - confirm against the pulp documentation.
        model+= expr
    print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
    model.solve()
    print(pulp.LpStatus[model.status])
    print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
except:
    # Bare except: every failure is reported by printing its traceback.
    print('exception')
    tb = traceback.format_exc()
    print(tb)
finally:
    print('reached finally')
And here is the output that I am getting:
C:\Users\puneet\Anaconda3\lib\site-packages\pulp\pulp.py:1348: UserWarning: Overwriting previously set objective.
warnings.warn("Overwriting previously set objective.")
2017-08-29 10:06:21
exception
Traceback (most recent call last):
File "C:/Hackerearth Challenge/Machine Learning #3/LInearlySeaparblePulp.py", line 31, in <module>
model.solve()
File "C:\Users\puneet\Anaconda3\lib\site-packages\pulp\pulp.py", line 1664, in solve
status = solver.actualSolve(self, **kwargs)
File "C:\Users\puneet\Anaconda3\lib\site-packages\pulp\solvers.py", line 1362, in actualSolve
return self.solve_CBC(lp, **kwargs)
File "C:\Users\puneet\Anaconda3\lib\site-packages\pulp\solvers.py", line 1384, in solve_CBC
tmpMps, rename = 1)
File "C:\Users\puneet\Anaconda3\lib\site-packages\pulp\pulp.py", line 1484, in writeMPS
f.write(" LO BND %-8s % .12e\n" % (n, v.lowBound))
TypeError: must be real number, not str
reached finally
I am adding 0 to specify that there is no objective function, as mentioned in the link. Also, since there are about 12000 rows in the A variable, I am trying to create the constraints dynamically, but there seems to be some problem with that. So, what am I doing wrong?
var = pulp.LpVariable.dicts("var",range(11),pulp.LpContinuous)
needs to be
var = pulp.LpVariable.dicts("var",range(11),cat=pulp.LpContinuous)
because the third positional parameter of LpVariable.dicts is lowBound, not cat, as its signature shows:
def dicts(self, name, indexs, lowBound = None, upBound = None, cat = LpContinuous, indexStart = []):

Adding labels to ScatterChart in openpyxl

I have written this simple code to test things out before moving on to a more complicated one:
import sys
from openpyxl import load_workbook
from openpyxl.chart import Reference, ScatterChart, Series
from openpyxl.chart.label import DataLabel, DataLabelList
from openpyxl.chart.marker import Marker
from openpyxl.chart.shapes import GraphicalProperties
from openpyxl.drawing.colors import ColorChoice
# Add one scatter chart per worksheet, with one single-row series per data
# row, then save the workbook back to the same path.
wb = load_workbook(sys.argv[1])
for ws in wb:
    chart = ScatterChart(scatterStyle='smoothMarker')
    # Rows 2..191: one series per row.
    for x in range(2, 192):
        # Single-cell references into columns 1, 2 and 3 of row x.
        labels = Reference(ws, min_col=1, max_col=1, min_row=x, max_row=x)
        xaxis = Reference(ws, min_col=2, max_col=2, min_row=x, max_row=x)
        data = Reference(ws, min_col=3, max_col=3, min_row=x, max_row=x)
        s = Series(xaxis, data, title=ws['A' + str(x)].value)
        # Fill the marker with the preset color named in column A of row x.
        sp_color = ColorChoice(prstClr=(str(ws['A' + str(x)].value)))
        sp_sppr = GraphicalProperties(solidFill=sp_color)
        s.marker = Marker(symbol=('circle'), size=20, spPr=sp_sppr)
        s.marker.spPr.ln.noFill = True
        s.spPr.ln.noFill = True
        slabel = list() #EDITED
        # NOTE(review): this inner loop reuses the name x, shadowing the row
        # index of the enclosing loop.
        for x in labels.cells: #EDITED
            slabel.append(ws[x].value) #EDITED
        # NOTE(review): slabel holds raw cell values; dLbl presumably expects
        # DataLabel objects - confirm against the openpyxl documentation.
        s.labels = DataLabelList(dLbl=slabel, dLblPos='bestFit')
        chart.series.append(s)
    chart.dataLabels = (DataLabelList(showLegendKey=True))
    ws.add_chart(chart, 'D1')
wb.save(sys.argv[1])
The file used as sys.argv[1] looks like this (technically it is just the list of colors accepted by the prstClr argument of ColorChoice, with x increasing by 10 and y = x):
Color x y
beige 0 0
forestGreen 10 10
dkGoldenrod 20 20
lightPink 30 30
slateGrey 40 40
the line s.labels = DataLabelList(dLbl=slabel, dLblPos='bestFit') outputs the error:
Traceback (most recent call last):
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/openpyxl/descriptors/base.py", line 57, in _convert
value = expected_type(value)
ValueError: invalid literal for int() with base 10: 'beige'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/openpyxl/descriptors/base.py", line 57, in _convert
value = expected_type(value)
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/openpyxl/chart/label.py", line 100, in __init__
self.idx = idx
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/openpyxl/descriptors/nested.py", line 36, in __set__
super(Nested, self).__set__(instance, value)
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/openpyxl/descriptors/base.py", line 69, in __set__
value = _convert(self.expected_type, value)
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/openpyxl/descriptors/base.py", line 59, in _convert
raise TypeError('expected ' + str(expected_type))
TypeError: expected <class 'int'>
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "csv/test.py", line 37, in <module>
s.labels = DataLabelList(dLbl=slabel, dLblPos='bestFit')
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/openpyxl/chart/label.py", line 128, in __init__
self.dLbl = dLbl
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/openpyxl/descriptors/sequence.py", line 27, in __set__
seq = [_convert(self.expected_type, value) for value in seq]
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/openpyxl/descriptors/sequence.py", line 27, in <listcomp>
seq = [_convert(self.expected_type, value) for value in seq]
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/openpyxl/descriptors/base.py", line 59, in _convert
raise TypeError('expected ' + str(expected_type))
TypeError: expected <class 'openpyxl.chart.label.DataLabel'>
EDIT: I used a list, as Charlie pointed out, without more success. This list should contain the same number of items, with the label name of each.
Either slabel.append(ws[x].value) or slabel.append(x) outputs the same error. What should that list contain?
Versions:
- openpyxl 2.4.8
- python 3.6
- macOS Sierra 10.12.6
Regards
Sequence is a descriptor used when writing classes; you're using it incorrectly. You need to provide a Python sequence (list, tuple, etc.) whose items are DataLabel objects, as the final TypeError in your traceback indicates.

TypeError: unhashable type: 'slice'

I am trying to run a regression using the following dataframe dfMyRoll the head of the dataframe looks like:
SCORE SCORE_LAG
date
2007-10-29 -0.031551 NaN
2007-10-30 0.000100 -0.031551
2007-10-31 0.000100 0.000100
2007-11-01 0.000100 0.000100
2007-11-02 0.000100 0.000100
The code that I am using is :
import glob
import pandas as pd
import os.path
import scipy
from scipy.stats import linregress
def main():
    """Set the data location and instrument IDs, then call getCointergration."""
    dataPath = "C:/Users/Stacey/Documents/data/Roll"
    roll = 4
    # NOTE(review): 1ID / 2ID are not valid Python identifiers (a name cannot
    # start with a digit); presumably placeholders for the real variable names.
    1ID = "BBG.XNGS.AAPL.S"
    2ID = "BBG.XNGS.AMAT.S"
    # NOTE(review): prints 1ID twice; the second argument was presumably
    # meant to be 2ID.
    print(1ID,1ID)
    # getCointergration has no return statement, so this binds None.
    cointergration = getCointergration(dataPath,1ID,2ID,roll)
    return
def getCointergration(dataPath,1ID,2ID,roll):
    """Load the pair-data CSV of each roll directory and regress its scores.

    Parameters:
        dataPath: path prefix to which each roll index is appended.
        1ID, 2ID: instrument identifiers used to build the CSV file name.
        roll: upper bound (exclusive) of the roll indices to visit.

    Returns:
        None; the function has no return statement.
    """
    # Visit the four roll indices roll-4 .. roll-1.
    for myRoll in range((roll-4),roll,1):
        path = dataPath+str(myRoll)+'/'
        filename='PairData_'+1ID+'_'+2ID+'.csv'
        for fname in glob.iglob(path+filename):
            # Read CSV columns 0 and 31 as 'date' (parsed, used as index)
            # and 'SCORE'.
            dfMyRoll = pd.read_csv(fname, header=0, usecols=[0,31],parse_dates=[0], dayfirst=True,index_col=[0], names=['date', 'SCORE'])
            # Previous row's score; the first row has no predecessor (NaN).
            dfMyRoll['SCORE_LAG'] = dfMyRoll['SCORE'].shift(1)
            print('cointergration',dfMyRoll.head())
            # NOTE(review): DataFrame[...] receives the tuple
            # (slice(1, None), 'SCORE') as a single key; the slice inside it
            # is unhashable, which matches the reported TypeError.
            X = dfMyRoll[1:,'SCORE']
            Y = dfMyRoll[1:,'SCORE_LAG']
            slope,intercept,_,_,stderr=linregress(dfMyRoll[1:,'SCORE'],dfMyRoll[1:,'SCORE_LAG'])
# Script entry point: run main() and print a message on Ctrl+C instead of
# letting KeyboardInterrupt propagate.
if __name__ == "__main__":
    print ("CointergrationTest...19/05/17")
    try:
        main()
    except KeyboardInterrupt:
        print ("Ctrl+C pressed. Stopping...")
I get the error: TypeError: unhashable type: 'slice'. I have looked at previous posts on this subject and tried adding iloc to the X and Y time series in the following way:
X = dfMyRoll.iloc[1:,'SCORE']
Y = dfMyRoll.iloc[1:,'SCORE_LAG']
but unfortunately I can't seem to find a solution. Please see below for a stack trace:
Traceback (most recent call last):
File "<ipython-input-3-431422978139>", line 1, in <module>
runfile('C:/Users/Stacey/Documents/scripts/cointergrationTest.py', wdir='C:/Users/Stacey/Documents/scripts')
File "C:\Anaconda\lib\site-packages\spyder\utils\site\sitecustomize.py", line 866, in runfile
execfile(filename, namespace)
File "C:\Anaconda\lib\site-packages\spyder\utils\site\sitecustomize.py", line 102, in execfile
exec(compile(f.read(), filename, 'exec'), namespace)
File "C:/Users/Stacey/Documents/scripts/cointergrationTest.py", line 64, in <module>
main()
File "C:/Users/Stacey/Documents/scripts/cointergrationTest.py", line 23, in main
cointergration = getCointergration(dataPath,1ID,2ID,roll)
File "C:/Users/Stacey/Documents/scripts/cointergrationTest.py", line 42, in getCointergration
X = dfMyRoll[1:,'SCORE']
File "C:\Anaconda\lib\site-packages\pandas\core\frame.py", line 2059, in __getitem__
return self._getitem_column(key)
File "C:\Anaconda\lib\site-packages\pandas\core\frame.py", line 2066, in _getitem_column
return self._get_item_cache(key)
File "C:\Anaconda\lib\site-packages\pandas\core\generic.py", line 1384, in _get_item_cache
res = cache.get(item)
TypeError: unhashable type: 'slice'
You need to use loc rather than iloc:
X = dfMyRoll.loc[1:,'SCORE']
Y = dfMyRoll.loc[1:,'SCORE_LAG']
iloc is read as "integer location" and only accepts integer positions, so it rejects the column label 'SCORE'. loc is somewhat more forgiving and allows both (you can also use ix, although ix is deprecated in recent pandas versions).

Too many indices error coming up in Python

This is my first bash at using Python. I am making an array of fathers and children: I have a dataset in a .csv file which I need to load into Python so I can later take it over to JavaScript. However, the same error message keeps coming up, which I have put below. Further down is my script. I would be most grateful for any advice!
Gabriella
>>> runfile('/Users/gkountourides/Desktop/FunctionalExperiment/untitled0.py', wdir='/Users/gkountourides/Desktop/FunctionalExperiment')
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "//anaconda/lib/python3.5/site-packages/spyderlib/widgets/externalshell/sitecustomize.py", line 714, in runfile
execfile(filename, namespace)
File "//anaconda/lib/python3.5/site-packages/spyderlib/widgets/externalshell/sitecustomize.py", line 89, in execfile
exec(compile(f.read(), filename, 'exec'), namespace)
File "/Users/gkountourides/Desktop/FunctionalExperiment/untitled0.py", line 41, in <module>
father,child = fc_csv_to_javascript_arrays("/Users/gkountourides/Desktop/FunctionalExperiment/fatherChild.csv")
File "/Users/gkountourides/Desktop/FunctionalExperiment/untitled0.py", line 38, in fc_csv_to_javascript_arrays
father_str, child_str = from_fc_array_to_fc_string(full_array)
File "/Users/gkountourides/Desktop/FunctionalExperiment/untitled0.py", line 30, in from_fc_array_to_fc_string
father_array = input_2d_array[:,0]
IndexError: too many indices for array
>>>
And then my actual script:
import glob
import numpy as np
def list_all_jpgs():
    """Return the *.jpg file names of the current directory as a JavaScript array literal.

    Returns:
        A string such as '["a.jpg","b.jpg"]', or '[]' when no file matches.
    """
    # join() keeps the result well formed when nothing matches; trimming a
    # trailing comma instead would remove the opening bracket in that case.
    quoted_names = ['"' + filename + '"' for filename in glob.glob("*.jpg")]
    return "[" + ",".join(quoted_names) + "]"
def load_into_np_array(input_csv_file):
    """Read a comma-delimited file into a NumPy array of strings.

    Parameters:
        input_csv_file: path (or file-like object) handed to np.genfromtxt.

    Returns:
        The array produced by np.genfromtxt with dtype=str.
    """
    return np.genfromtxt(input_csv_file, dtype=str, delimiter=",")
def from_single_array_to_string(input_1d_array):
    """Render a one-dimensional sequence as a JavaScript array of strings.

    Parameters:
        input_1d_array: iterable whose entries are converted with str().

    Returns:
        A string such as '["a","b"]', or '[]' for an empty input.
    """
    # join() keeps the result well formed for empty input; trimming a
    # trailing comma instead would remove the opening bracket in that case.
    quoted_entries = ['"' + str(entry) + '"' for entry in input_1d_array]
    return '[' + ','.join(quoted_entries) + ']'
def from_fc_array_to_fc_string(input_2d_array):
    """Convert the first two columns of an array into JavaScript array strings.

    Parameters:
        input_2d_array: array indexed as [:, 0] and [:, 1]; an input that
            does not support two indices raises IndexError.

    Returns:
        (father_string, child_string): column 0 and column 1, each rendered
        by from_single_array_to_string.
    """
    # Slice both columns before converting either one.
    first_column, second_column = input_2d_array[:, 0], input_2d_array[:, 1]
    father_string = from_single_array_to_string(first_column)
    child_string = from_single_array_to_string(second_column)
    return father_string, child_string
def fc_csv_to_javascript_arrays(csv_input):
    """Load a father/child CSV file and return both columns as JavaScript array strings.

    Parameters:
        csv_input: path of the CSV file passed to load_into_np_array.

    Returns:
        (father_str, child_str) as produced by from_fc_array_to_fc_string.
    """
    string_pair = from_fc_array_to_fc_string(load_into_np_array(csv_input))
    father_str, child_str = string_pair
    return father_str, child_str
# Convert the father/child CSV file and print the two JavaScript array strings.
father,child = fc_csv_to_javascript_arrays("/Users/gkountourides/Desktop/FunctionalExperiment/fatherChild.csv")
print(father)
print(child)
The "too many indices" error indicates that input_2d_array is not a 2D array: genfromtxt() is not returning what you are expecting.
numpy.genfromtxt produces array of what looks like tuples, not a 2D array—why?
http://docs.scipy.org/doc/numpy/reference/generated/numpy.genfromtxt.html

Categories