I've been all day trying to solve the test_errors() function in "Exercise 48: Advanced User Input" of the book Learn Python The Hard Way.
assert_equal(), a function in the tests asks me for the tuples in order and I haven't been able to code it that way.
My loops always returns first the nouns and last the error tuples, I don't know how to break the loop so it starts again but with the right values to continue or whatever is necessary to sort this tuples in the order they should be.
Here's the code:
class Lexicon(object):
def scan(self, stringo):
vocabulary = [[('direction', 'north'), ('direction', 'south'), ('direction', 'east'), ('direction', 'west')],
[('verb', 'go'), ('verb', 'kill'), ('verb', 'eat')],
[('stop', 'the'), ('stop', 'in'), ('stop', 'of')],
[('noun', 'bear'), ('noun', 'princess')], # Remember numbers
[('error', 'ASDFADFASDF'), ('error', 'IAS')],
[('number', '1234'), ('number','3'), ('number', '91234')]]
self.stringo = stringo
got_word = ''
value = []
rompe = self.stringo.split() #split rompe en los espacios
for asigna in vocabulary:
for encuentra in asigna:
if encuentra[1] in rompe:
value.append(encuentra)
return value
eLexicon = Lexicon()
from nose.tools import *
from ex48.ex48 import eLexicon
def test_directions():
assert_equal(eLexicon.scan("north"), [('direction', 'north')])
result = eLexicon.scan("north south east")
assert_equal(result, [('direction', 'north'),
('direction', 'south'),
('direction', 'east')])
def test_verbs():
assert_equal(eLexicon.scan("go"), [('verb', 'go')])
result = eLexicon.scan("go kill eat")
assert_equal(result, [('verb', 'go'),
('verb', 'kill'),
('verb', 'eat')])
def test_stops():
assert_equal(eLexicon.scan("the"), [('stop', 'the')])
result = eLexicon.scan("the in of")
assert_equal(result, [('stop', 'the'),
('stop', 'in'),
('stop', 'of')])
def test_nouns():
assert_equal(eLexicon.scan("bear"), [('noun', 'bear')])
result = eLexicon.scan("bear princess")
assert_equal(result, [('noun', 'bear'),
('noun', 'princess')])
#def test_numbers():
# assert_equal(lexicon.scan("1234"), [('number', 1234)])
# result = lexicon.scan("3 91234")
# assert_equal(result, [('number', 3),
# ('number', 91234)])
def test_errors():
assert_equal(eLexicon.scan("ASDFADFASDF"), [('error', 'ASDFADFASDF')])
result = eLexicon.scan("bear IAS princess")
assert_equal(result, [('noun', 'bear'),
('error', 'IAS'),
('noun', 'princess')])
======================================================================
FAIL: tests.ex48_tests.test_errors
----------------------------------------------------------------------
Traceback (most recent call last):
File "/usr/local/lib/python2.7/dist-packages/nose/case.py", line 197, in runTest
self.test(*self.arg)
File "/home/totoro/Desktop/Python/projects/ex48/tests/ex48_tests.py", line 43, in test_errors
('noun', 'princess')])
AssertionError: Lists differ: [('noun', 'bear'), ('noun', 'p... != [('noun', 'bear'), ('error', '...
First differing element 1:
('noun', 'princess')
('error', 'IAS')
- [('noun', 'bear'), ('noun', 'princess'), ('error', 'IAS')]
+ [('noun', 'bear'), ('error', 'IAS'), ('noun', 'princess')]
----------------------------------------------------------------------
Ran 5 tests in 0.006s
Many thanks in advance for taking the time.
The words in the test are in the same order going in as coming out. As such you need to re-order your for-loops to iterate over the input first:
value = []
for rompe in stringo.split():
for asigna in vocabulary:
for encuentra in asigna:
if encuentra[1] == rompe:
value.append(encuentra)
This will return the encuentras in the correct order.
Note 1: You should not be hard-coding the numbers or errors.
Note 2: You can drastically reduce the complexity of this algorithm by using dictionary or two.
Example:
vocabulary = {
'direction': 'north east south west up down left right back'.split(),
'noun': 'bear princess door cabinet'.split(),
'stop': 'the in of from at it'.split(),
'verb': 'go kill eat stop'.split(),
}
'''
This creates a lookup using a dictionary-comprehension:
{'at': 'stop',
# [...]
'up': 'direction',
'west': 'direction'}
'''
classifications = {i: k for k, v in vocabulary.iteritems() for i in v}
def classify(word):
try:
return 'number', int(word)
except ValueError:
return classifications.get(word, 'error'), word
def scan(words):
return [classify(word) for word in words.split()]
for word in self.stringo.split():
for pair in vocabulary:
if pair[0][1] == word:
value.append(pair[0])
elif pair[1][1] == word:
value.append(pair[1])
elif pair[2][1] == word:
value.append(pair[2])
elif pair[3][1] == word:
value.append(pair[3])
I just finished this exercise, hope this gives some new ideas to some of you.
This is my solution:
#Set up datastructure
direction = ["north", "east", "south", "west", "up", "right", "down", "left", "back"]
verb = ["go", "stop", "kill", "eat"]
stop = ["the", "in", "of", "from", "at", "it"]
noun = ["door", "bear", "princess", "cabinet"]
vocabulary = [(direction, 'direction'), (verb, 'verb'), (stop, 'stop'), (noun, 'noun')]
def scan(sentence):
#searches the words in the datastructure, if not found checks if it is an integer, if not returns error.
results = []
words = sentence.split()
for word in words:
found = False
for category in vocabulary:
if word.lower() in category[0]:
results.append((category[1], word))
found = True
else:
pass
if found is False and isInt_str(word) is True:
results.append(('number', int(word)))
elif found is False and isInt_str(word) is False:
results.append(('error', word))
elif found is True:
pass
else:
print("I'm terribly sorry, but something you entered is neither a word nor a number.")
return results
def isInt_str(string):
#returns True or False if string equals an integer. (i.e. 2 = True, 2*-2 = True 2**0,5 = False)
string = str(string).strip()
return string=='0' or (string if string.find('..') > -1 else string.lstrip('-+').rstrip('0').rstrip('.')).isdigit()
Related
I have a dataframe where every row is a list of tuples , i.e.: tuple = (word, pos_tag). In each row, I want to change the word of some tuples by marking it and then update the tuple with the marked word. For example:
Initial dataframe row :
[('This', 'DET'), ('is', 'VERB'), ('an', 'DET'), ('example', 'NOUN'), ('text', 'NOUN'), ('that', 'DET'), ('I', 'PRON'), ('use', 'VERB'), ('in', 'ADP'), ('order', 'NOUN'), ('to', 'PART'), ('get', 'VERB'), ('an', 'DET'), ('answer', 'NOUN')]
Updated words :
updated_word : <IN>example</IN>
updated_word : <TAR>answer</TAR>
Desired output :
[('This', 'DET'), ('is', 'VERB'), ('an', 'DET'), ('<IN>example</IN>', 'NOUN'), ('text', 'NOUN'), ('that', 'DET'), ('I', 'PRON'), ('use', 'VERB'), ('in', 'ADP'), ('order', 'NOUN'), ('to', 'PART'), ('get', 'VERB'), ('an', 'DET'), ('<TAR>answer</TAR>', 'NOUN')]
But I get an error that TypeError: 'tuple' object is not callable. Can someone help? Here's the code :
for idx, row in df.iterrows():
doc = nlp(row['title'])
pos_tags = [(token.text, token.pos_) for token in doc if not token.pos_ == "PUNCT"]
for position, tuple in enumerate(pos_tags, start=1):
word = tuple[0]
spacy_pos_tag = tuple[1]
word = re.sub(r'[^\w\s]', '', word)
for col in cols:
if position in row[col]:
word = f'<{col.upper()}>{word}</{col.upper()}>'
else:
continue
tuple = tuple(word, spacy_pos_tag)
print(tuple)
>>>> Traceback (most recent call last):
>>>> tuple = tuple(word, spacy_pos_tag)
>>>> TypeError: 'tuple' object is not callable
Updated question
I have replaced tuple with tuple_ as suggested, but I still can't get back the desired output which is a list of tuples in every row. Can someone help how to update the dataframe rows? Here's the updated code :
for idx, row in df.iterrows():
doc = nlp(row['title'])
pos_tags = [(token.text, token.pos_) for token in doc if not token.pos_ == "PUNCT"]
# print(idx, "tokens, pos : ", pos_tags, "\n")
for position, tuple_ in enumerate(pos_tags, start=1):
word = tuple_[0]
spacy_pos_tag = tuple_[1]
word = re.sub(r'[^\w\s]', '', word)
for col in cols:
if position in row[col]:
word = f'<{col.upper()}>{word}</{col.upper()}>'
else:
continue
tuple_ = (word, spacy_pos_tag)
pos_tags.append(' '.join(position, tuple_))
# pos_tags.append(' '.join(tuple_))
print(idx, "tokens, pos : ", pos_tags, "\n")
>>>> Traceback (most recent call last):
>>>> pos_tag(df=df_matched)
>>>> pos_tags.append(' '.join(position, tuple_))
>>>> TypeError: join() takes exactly one argument (2 given)
Do not use tuple as a variable name, as it is a built-in python type name. Try the following instead:
for position, tuple_ in enumerate(pos_tags, start=1):
word = tuple_[0]
spacy_pos_tag = tuple_[1]
word = re.sub(r'[^\w\s]', '', word)
for col in cols:
if position in row[col]:
word = f'<{col.upper()}>{word}</{col.upper()}>'
else:
continue
tuple_ = (word, spacy_pos_tag)
print(tuple_)
Don't use "tuple" as name of a variable. It's a type name
import nltk
from itertools import groupby
def get_continuous_chunks(tagged_sent):
continuous_chunk = []
current_chunk = []
for token, tag in tagged_sent:
if tag != "O":
current_chunk.append((token, tag))
else:
if current_chunk: # if the current chunk is not empty
continuous_chunk.append(current_chunk)
current_chunk = []
# Flush the final current_chunk into the continuous_chunk, if any.
if current_chunk:
continuous_chunk.append(current_chunk)
return continuous_chunk
ne_tagged_sent = [('Rami', 'PERSON'), ('Eid', 'PERSON'), ('is', 'O'), ('studying', 'O'), ('at', 'O'), ('Stony', 'ORGANIZATION'), ('Brook', 'ORGANIZATION'), ('University', 'ORGANIZATION'), ('in', 'O'), ('NY', 'LOCATION')]
named_entities = get_continuous_chunks(ne_tagged_sent)
named_entities = get_continuous_chunks(ne_tagged_sent)
named_entities_str = [" ".join([token for token, tag in ne]) for ne in named_entities]
named_entities_str_tag = [(" ".join([token for token, tag in ne]), ne[0][1]) for ne in named_entities]
def parser(n,string):
for i in named_entities_str_tag[n]:
if i==string:
pass
else:
return i
print named_entities_str_tag
print
I got this output from the above code:
('PERSON ', 'Rami Eid')
('ORGANIZATION', 'Stony Brook University')
('LOCATION ', 'NY')
('PERSON ', 'GuruRaj Bagali')
('ORGANIZATION', 'Christ University')
But I want it should be map like PERSON WITH ORGANIZATION AND LOCATION I want to store it in json format.
It's not very clear what ne_tagged_sent list contains (Is there a LOCATION for each PERSON, ORGANIZATION ?), you must clarify it that we could answer your question.
You should format your data as a dictionary, each entry corresponds to a person like:
import json
data = {
'Rami Eid':{'job': 'engineer', 'location':'NY'},
'GuruRaj Bagali':{'job': 'professor', 'location': 'NY'}
}
#Save it in a json file
json.dump(data, open('path/to_your_file', 'w')
Give an input sentence, that has BIO chunk tags:
[('What', 'B-NP'), ('is', 'B-VP'), ('the', 'B-NP'), ('airspeed',
'I-NP'), ('of', 'B-PP'), ('an', 'B-NP'), ('unladen', 'I-NP'),
('swallow', 'I-NP'), ('?', 'O')]
I would need to extract the relevant phrases out, e.g. if I want to extract 'NP', I would need to extract the fragments of tuples that contains B-NP and I-NP.
[out]:
[('What', '0'), ('the airspeed', '2-3'), ('an unladen swallow', '5-6-7')]
(Note: the numbers in the extract tuples represent the token index.)
I have tried extracting it using the following code:
def extract_chunks(tagged_sent, chunk_type):
current_chunk = []
current_chunk_position = []
for idx, word_pos in enumerate(tagged_sent):
word, pos = word_pos
if '-'+chunk_type in pos: # Append the word to the current_chunk.
current_chunk.append((word))
current_chunk_position.append((idx))
else:
if current_chunk: # Flush the full chunk when out of an NP.
_chunk_str = ' '.join(current_chunk)
_chunk_pos_str = '-'.join(map(str, current_chunk_position))
yield _chunk_str, _chunk_pos_str
current_chunk = []
current_chunk_position = []
if current_chunk: # Flush the last chunk.
yield ' '.join(current_chunk), '-'.join(current_chunk_position)
tagged_sent = [('What', 'B-NP'), ('is', 'B-VP'), ('the', 'B-NP'), ('airspeed', 'I-NP'), ('of', 'B-PP'), ('an', 'B-NP'), ('unladen', 'I-NP'), ('swallow', 'I-NP'), ('?', 'O')]
print (list(extract_chunks(tagged_sent, chunk_type='NP')))
But when I have adjacent chunk of the same type:
tagged_sent = [('The', 'B-NP'), ('Mitsubishi', 'I-NP'), ('Electric', 'I-NP'), ('Company', 'I-NP'), ('Managing', 'B-NP'), ('Director', 'I-NP'), ('ate', 'B-VP'), ('ramen', 'B-NP')]
print (list(extract_chunks(tagged_sent, chunk_type='NP')))
It outputs this:
[('The Mitsubishi Electric Company Managing Director', '0-1-2-3-4-5'), ('ramen', '7')]
Instead of the desired:
[('The Mitsubishi Electric Company', '0-1-2-3'), ('Managing Director', '4-5'), ('ramen', '7')]
How can this be resolved from the above code?
Other than how it's done from the code above, is there a better solution to extract the desired chunks of a specific chunk_type?
Try this, it will extract all types of chunks with the indices of their respective words.
def extract_chunks(tagged_sent, chunk_type='NP'):
out_sen = []
for idx, word_pos in enumerate(tagged_sent):
word,bio = word_pos
boundary,tag = bio.split("-") if "-" in bio else ('','O')
if tag != chunk_type:continue
if boundary == "B":
out_sen.append([word, str(idx)])
elif boundary == "I":
out_sen[-1][0] += " "+ word
out_sen[-1][-1] += "-"+ str(idx)
else:
out_sen.append([word, str(idx)])
return out_sen
Demo:
>>> tagged_sent = [('The', 'B-NP'), ('Mitsubishi', 'I-NP'), ('Electric', 'I-NP'), ('Company', 'I-NP'), ('Managing', 'B-NP'), ('Director', 'I-NP'), ('ate', 'B-VP'), ('ramen', 'B-NP')]
>>> output_sent = extract_chunks(tagged_sent)
>>> print map(tuple, output_sent)
[('The Mitsubishi Electric Company', '0-1-2-3'), ('Managing Director', '4-5'), ('ramen', '7')]
def extract_chunks(tagged_sent, chunk_type):
grp1, grp2, chunk_type = [], [], "-" + chunk_type
for ind, (s, tp) in enumerate(tagged_sent):
if tp.endswith(chunk_type):
if not tp.startswith("B"):
grp2.append(str(ind))
grp1.append(s)
else:
if grp1:
yield " ".join(grp1), "-".join(grp2)
grp1, grp2 = [s], [str(ind)]
yield " ".join(grp1), "-".join(grp2)
Output:
In [2]: l = [('The', 'B-NP'), ('Mitsubishi', 'I-NP'), ('Electric', 'I-NP'), ('Company', 'I-NP'), ('Managing', 'B-NP'),
...: ('Director', 'I-NP'), ('ate', 'B-VP'), ('ramen', 'B-NP')]
In [3]: list(extract_chunks(l, "NP"))
Out[3]:
[('The Mitsubishi Electric Company', '0-1-2-3'),
('Managing Director', '4-5'),
('ramen', '7')]
In [4]: l = [('What', 'B-NP'), ('is', 'B-VP'), ('the', 'B-NP'), ('airspeed', 'I-NP'), ('of', 'B-PP'), ('an', 'B-NP'), ('unladen', 'I-NP'), ('swallow', 'I-NP'), ('?', 'O')]
In [5]: list(extract_chunks(l, "NP"))
Out[5]: [('What', '0'), ('the airspeed', '2-3'), ('an unladen swallow', '5-6-7')]
I would do it like this:
import re
def extract_chunks(tagged_sent, chunk_type):
# compiles the expression we want to match
regex = re.compile(chunk_type)
# filters matched items in a dictionary whose keys are the matched indexes
first_step = {index_:tag[0] for index_, tag in enumerate(tagged_sent) if regex.findall(tag[1])}
# builds list of lists following output format
second_step = []
for key_ in sorted(first_step.keys()):
if second_step and int(second_step [len(second_step )-1][1].split('-')[-1]) == key_ -1:
second_step[len(second_step)-1][0] += ' {0}'.format(first_step[key_])
second_step[len(second_step)-1][1] += '-{0}'.format(str(key_))
else:
second_step.append([first_step[key_], str(key_)])
# builds output in final format
return [tuple(item) for item in second_step]
You can adapt it to use generators instead of building the whole output in memory like I am doing and refactory it for better performance (I'm in a hurry so the code is far from optimal).
Hope it helps!
On exercise 48 of Learn Python the Hard Way, I'm asked to create a module to be tested by this one, lexicon_tests.py:
from nose.tools import *
from ex48 import lexicon
def test_directions():
assert_equal(lexicon.scan("north"), [('direction', 'north')])
result = lexicon.scan("north south east")
assert_equal(result, [('direction', 'north'),
('direction', 'south'),
('direction', 'east')])
def test_verbs():
assert_equal(lexicon.scan("go"), [('verb', 'go')])
result = lexicon.scan("go kill eat")
assert_equal(result, [('verb', 'go'),
('verb', 'kill'),
('verb', 'eat')])
def test_stops():
assert_equal(lexicon.scan("the"), [('stop', 'the')])
result = lexicon.scan("the in of")
assert_equal(result, [('stop', 'the'),
('stop', 'in'),
('stop', 'of')])
def test_nouns():
assert_equal(lexicon.scan("bear"), [('noun', 'bear')])
result = lexicon.scan("bear princess")
assert_equal(result, [('noun', 'bear'),
('noun', 'princess')])
def test_numbers():
assert_equal(lexicon.scan("1234"), [('number', 1234)])
result = lexicon.scan("3 91234")
assert_equal(result, [('number', 3),
('number', 91234)])
def test_errors():
assert_equal(lexicon.scan("ASDFADFASDF"), [('error', 'ASDFADFASDF')])
result = lexicon.scan("bear IAS princess")
assert_equal(result, [('noun', 'bear'),
('error', 'IAS'),
('noun', 'princess')])
So I created the module, lexicon.py, to be tested here:
def scan(words):
directions = ['north', 'south', 'east', 'west', 'down', 'up', 'left', 'right', 'back']
verbs = ['go', 'stop', 'kill', 'eat']
stop_words = ['the', 'in', 'of', 'from', 'at', 'it']
nouns = ['door', 'bear', 'princess', 'cabinet']
lex = words.split()
list1 = []
for i in lex:
if i in directions:
list1.append(('direction', i))
elif i in verbs:
list1.append(('verb', i))
elif i in stop_words:
list1.append(('stop-word', i))
elif i in nouns:
list1.append(('noun', i))
elif i.isdigit():
list1.append(('number', convert_number(i)))
else:
list1.append(('error', i))
print list1
def convert_number(s):
try:
return int(s)
except ValueError:
return None
However when I run nosetests in powershell I get this AssertionError:
Traceback (most recent call last):
File "G:\Python27\lib\site-packages\nose\case.py", line 197, in runTest
self.test(*self.arg)
File "G:\Users\Charles\dropbox\programming\lexicon_test\skeleton\tests\lexicon_tests.py", line 6, in test_directions
assert_equal(lexicon.scan("north"), [('direction', 'north')])
AssertionError: None != [('direction', 'north')]
-------------------- >> begin captured stdout << ---------------------
[('direction', 'north')]
That's the same error message I get for each test run, six of them for the six functions in lexicon_tests.py. What does this error mean? It's been irritating be for a while now. Thanks in advance.
The assert_equal function takes two arguments, and throws an error if the arguments aren't equal to each other. In this case, the result of lexicon.scan("north") is None, and since this isn't equal to [('direction', 'north')], it's throwing an error.
In other words, your lexicon.scan function isn't working properly. It might have something to do with it missing a return statement.
I have been attempting to work my way through Learn Python the Hard Way, and on Exercise 48 I continue to get an error when I run nosetests. I am using code that other people have verified on the site to work, but no matter what I continue to get this error:
======================================================================
ERROR: tests.ex48_tests.test_directions
----------------------------------------------------------------------
Traceback (most recent call last):
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/nose/case.py", line 197, in runTest
self.test(*self.arg)
File "/Users/AlexanderMariona/Documents/Home/Programming/Python/Projects/Exercise 48/tests/ex48_tests.py", line 6, in test_directions
assert_equal(lexicon.scan("north"), [('direction', 'north')])
AttributeError: 'module' object has no attribute 'scan'
I get this error 6 times, one for each of my test functions.
Here is what I'm using for my code:
lexicon.py:
class Lexicon(object):
directions = ['north', 'south', 'east', 'west', 'down', 'up', 'down', 'right']
verbs = ['go', 'stop', 'kill', 'eat']
stops = ['the', 'in', 'at', 'of', 'from', 'at', 'it']
nouns = ['door', 'bear', 'princess', 'cabinet']
def scan(thewords):
thewords = thewords.split()
sentence = []
for i in thewords:
if i in directions:
sentence.append(('direction', i))
elif i in verbs:
sentence.append(('verb', i))
elif i in stops:
sentence.append(('stop', i))
elif i in nouns:
sentence.append(('noun', i))
elif i.isdigit():
sentence.append(('number', convert_number(i)))
else:
sentence.append(('error', i))
return sentence
def convert_number(s):
try:
return int(s)
except ValueError:
return None
lexicon = Lexicon()
(This was written by Dairylee.)
ex48_tests.py:
from nose.tools import *
from ex48 import lexicon
def test_directions():
assert_equal(lexicon.scan("north"), [('direction', 'north')])
result = lexicon.scan("north south east")
assert_equal(result, [('direction', 'north'),
('direction', 'south'),
('direction', 'east')])
def test_verbs():
assert_equal(lexicon.scan("go"), [('verb', 'go')])
result = lexicon.scan("go kill eat")
assert_equal(result, [('verb', 'go'),
('verb', 'kill'),
('verb', 'eat')])
def test_stops():
assert_equal(lexicon.scan("the"), [('stop', 'the')])
result = lexicon.scan("the in of")
assert_equal(result, [('stop', 'the'),
('stop', 'in'),
('stop', 'of')])
def test_nouns():
assert_equal(lexicon.scan("bear"), [('noun', 'bear')])
result = lexicon.scan("bear princess")
assert_equal(result, [('noun', 'bear'),
('noun', 'princess')])
def test_numbers():
assert_equal(lexicon.scan("1234"), [('number', 1234)])
result = lexicon.scan("3 91234")
assert_equal(result, [('number', 3),
('number', 91234)])
def test_errors():
assert_equal(lexicon.scan("ASDFADFASDF"), [('error', 'ASDFADFASDF')])
result = lexicon.scan("bear IAS princess")
assert_equal(result, [('noun', 'bear'),
('error', 'IAS'),
('noun', 'princess')])
(This is copied verbatim from LPTHW.)
setup.py:
try:
from setuptools import setup
except ImportError:
from distutils.core import setup
config = {
'name': 'Excercise 48',
'description': 'LPTHW',
'version': '0.1',
'author': 'My Name',
'author_email': 'My E-Mail',
'url': 'None',
'download_url': 'None',
'packages': ['ex48'],
'scripts': [],
'install_requires': ['nose']
}
setup(**config)
And here is the directory of the package:
Exercise 48/
bin/
docs/
ex48/
__init__.py
lexicon.py
setup.py
tests/
__init__.py
ex48_tests.py
What exactly is causing this error?
This error happens because there's no function scan in the module lexicon. There's a method in the class Lexicon, then it should be called like a method (note that self argument is missing).
On the other hand, Lexicon does not have to exist as a class at all, scan can be a module-level function.