Python - Update tuple string element error - python

I have a dataframe where every row is a list of tuples , i.e.: tuple = (word, pos_tag). In each row, I want to change the word of some tuples by marking it and then update the tuple with the marked word. For example:
Initial dataframe row :
[('This', 'DET'), ('is', 'VERB'), ('an', 'DET'), ('example', 'NOUN'), ('text', 'NOUN'), ('that', 'DET'), ('I', 'PRON'), ('use', 'VERB'), ('in', 'ADP'), ('order', 'NOUN'), ('to', 'PART'), ('get', 'VERB'), ('an', 'DET'), ('answer', 'NOUN')]
Updated words :
updated_word : <IN>example</IN>
updated_word : <TAR>answer</TAR>
Desired output :
[('This', 'DET'), ('is', 'VERB'), ('an', 'DET'), ('<IN>example</IN>', 'NOUN'), ('text', 'NOUN'), ('that', 'DET'), ('I', 'PRON'), ('use', 'VERB'), ('in', 'ADP'), ('order', 'NOUN'), ('to', 'PART'), ('get', 'VERB'), ('an', 'DET'), ('<TAR>answer</TAR>', 'NOUN')]
But I get an error that TypeError: 'tuple' object is not callable. Can someone help? Here's the code :
# NOTE(review): the original indentation was lost; the nesting below is a
# reconstruction -- confirm against the author's code.
# For every DataFrame row: tag the title's tokens, then wrap each word whose
# 1-based position appears in row[col] in <COL>...</COL> markers.
for idx, row in df.iterrows():
    doc = nlp(row['title'])
    # (text, POS) pairs for every token whose POS is not "PUNCT".
    pos_tags = [(token.text, token.pos_) for token in doc if not token.pos_ == "PUNCT"]
    # The loop variable is called `tuple`, which rebinds the builtin name to a
    # plain (word, pos) tuple for the rest of this scope.
    for position, tuple in enumerate(pos_tags, start=1):
        word = tuple[0]
        spacy_pos_tag = tuple[1]
        # Drop every character that is neither a word character nor whitespace.
        word = re.sub(r'[^\w\s]', '', word)
        for col in cols:
            if position in row[col]:
                word = f'<{col.upper()}>{word}</{col.upper()}>'
            else:
                continue
        # `tuple` is the tuple object bound by the loop above, not the builtin
        # type, so calling it raises: TypeError: 'tuple' object is not callable.
        tuple = tuple(word, spacy_pos_tag)
        print(tuple)
>>>> Traceback (most recent call last):
>>>> tuple = tuple(word, spacy_pos_tag)
>>>> TypeError: 'tuple' object is not callable
Updated question
I have replaced tuple with tuple_ as suggested, but I still can't get back the desired output which is a list of tuples in every row. Can someone help how to update the dataframe rows? Here's the updated code :
# NOTE(review): the original indentation was lost; the nesting below is a
# reconstruction -- confirm against the author's code.
for idx, row in df.iterrows():
    doc = nlp(row['title'])
    # (text, POS) pairs for every token whose POS is not "PUNCT".
    pos_tags = [(token.text, token.pos_) for token in doc if not token.pos_ == "PUNCT"]
    # print(idx, "tokens, pos : ", pos_tags, "\n")
    for position, tuple_ in enumerate(pos_tags, start=1):
        word = tuple_[0]
        spacy_pos_tag = tuple_[1]
        # Drop every character that is neither a word character nor whitespace.
        word = re.sub(r'[^\w\s]', '', word)
        for col in cols:
            if position in row[col]:
                word = f'<{col.upper()}>{word}</{col.upper()}>'
            else:
                continue
        tuple_ = (word, spacy_pos_tag)
        # str.join() accepts exactly one iterable argument; two are passed
        # here, which raises the TypeError. This line also appends to
        # pos_tags, the very list that the enclosing loop is enumerating.
        pos_tags.append(' '.join(position, tuple_))
        # pos_tags.append(' '.join(tuple_))
    print(idx, "tokens, pos : ", pos_tags, "\n")
>>>> Traceback (most recent call last):
>>>> pos_tag(df=df_matched)
>>>> pos_tags.append(' '.join(position, tuple_))
>>>> TypeError: join() takes exactly one argument (2 given)

Do not use tuple as a variable name, as it is a built-in python type name. Try the following instead:
# Build a NEW list of (word, pos) tuples instead of rebinding the loop
# variable: tuples are immutable, and the list being enumerated must not be
# modified while it is iterated.
updated_pos_tags = []
for position, tuple_ in enumerate(pos_tags, start=1):
    word, spacy_pos_tag = tuple_
    # Drop every character that is neither a word character nor whitespace.
    word = re.sub(r'[^\w\s]', '', word)
    for col in cols:
        # Wrap the word once for every column that lists this 1-based position.
        if position in row[col]:
            word = f'<{col.upper()}>{word}</{col.upper()}>'
    tuple_ = (word, spacy_pos_tag)
    updated_pos_tags.append(tuple_)
    print(tuple_)
# updated_pos_tags now holds the full list of tuples for this row.

Don't use "tuple" as name of a variable. It's a type name

Related

None result in the grouped method in Python coding

I am trying to get result as a printed sentence in my NER code:
[(Token_1, PoS_1, Tag_1), ..., (Token_n, PoS_n, Tag_n)]
In the function 'get_next' I am getting 'None' result every time - not sure what I am doing wrong here. Code:
class SentenceGetter(object):
    """Group a token-level DataFrame by its "Forma" column and serve the groups.

    Each group becomes a list of ``(Lemat, POS, TAG)`` tuples.  ``get_next``
    looks groups up by the literal key ``"Forma: <n>"`` (n = 1, 2, ...), so
    the "Forma" column must contain exactly such strings; with any other
    values every lookup misses and ``get_next`` returns ``None``.
    """

    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False

        def agg_func(s):
            # One (lemma, POS, tag) tuple per row of the group.
            return [(w, p, t) for w, p, t in zip(s["Lemat"].values.tolist(),
                                                 s["POS"].values.tolist(),
                                                 s["TAG"].values.tolist())]

        self.grouped = self.data.groupby("Forma").apply(agg_func)
        self.sentences = [s for s in self.grouped]

    def get_next(self):
        """Return the next group as a list of tuples, or None if its key is missing."""
        key = "Forma: {}".format(self.n_sent)
        try:
            s = self.grouped[key]
        except KeyError:
            # Only a missing key means "no such sentence"; any other error is
            # a real bug and must not be silently turned into None.
            return None
        self.n_sent += 1
        return s
getter = SentenceGetter(data)
sent = getter.get_next()
print('Example sentence:')
print(sent)
Code block about definition:
# Load the tab-separated corpus and fill missing cells with the value from
# the row above.
data = pd.read_csv("nkjp-morph-named.txt", delimiter="\t")
# DataFrame.ffill() replaces the deprecated fillna(method="ffill") spelling.
data = data.ffill()
print("Form number: ", len(data.groupby(['Forma'])))
# Distinct lemmas and tags present in the corpus.
lemats = list(set(data["Lemat"].values))
n_lemats = len(lemats)
print("Lemats: ", n_lemats)
tags = list(set(data["TAG"].values))
print("TAG:", tags)
n_tags = len(tags)
print("Number of TAGs: ", n_tags)
print("Dataset:")
data.head(n=16)
Could you please help me understand why the result is still None?
Expected result:
Example sentence: [('Thousands', 'NNS', 'O'), ('of', 'IN', 'O'),
('demonstrators', 'NNS', 'O'), ('have', 'VBP', 'O'), ('marched',
'VBN', 'O'), ('through', 'IN', 'O'), ('London', 'NNP', 'B-geo'),
('to', 'TO', 'O'), ('protest', 'VB', 'O'), ('the', 'DT', 'O'), ('war',
'NN', 'O'), ('in', 'IN', 'O'), ('Iraq', 'NNP', 'B-geo'), ('and', 'CC',
'O'), ('demand', 'VB', 'O'), ('the', 'DT', 'O'), ('withdrawal', 'NN',
'O'), ('of', 'IN', 'O'), ('British', 'JJ', 'B-gpe'), ('troops', 'NNS',
'O'), ('from', 'IN', 'O'), ('that', 'DT', 'O'), ('country', 'NN',
'O'), ('.', '.', 'O')]

How to store ner result in json/ database

import nltk
from itertools import groupby
def get_continuous_chunks(tagged_sent):
    """Group consecutive tokens that share the same non-"O" tag.

    Args:
        tagged_sent: iterable of ``(token, tag)`` pairs.

    Returns:
        A list of chunks; each chunk is a list of ``(token, tag)`` tuples.
        A chunk ends at an "O" token and also when the tag changes, so two
        adjacent entities of different types are not merged into one.
    """
    return [
        [(token, tag) for token, tag in group]
        for chunk_tag, group in groupby(tagged_sent, key=lambda pair: pair[1])
        if chunk_tag != "O"
    ]
ne_tagged_sent = [('Rami', 'PERSON'), ('Eid', 'PERSON'), ('is', 'O'), ('studying', 'O'), ('at', 'O'), ('Stony', 'ORGANIZATION'), ('Brook', 'ORGANIZATION'), ('University', 'ORGANIZATION'), ('in', 'O'), ('NY', 'LOCATION')]
# Chunk the sentence once (the original computed the same result twice).
named_entities = get_continuous_chunks(ne_tagged_sent)
# Entity text only, e.g. "Stony Brook University".
named_entities_str = [" ".join([token for token, tag in ne]) for ne in named_entities]
# (entity text, tag of the chunk's first token) pairs.
named_entities_str_tag = [(" ".join([token for token, tag in ne]), ne[0][1]) for ne in named_entities]
def parser(n,string):
    """Return the first item of ``named_entities_str_tag[n]`` that differs from ``string``.

    Returns None when every item of that entry equals ``string``.
    """
    for candidate in named_entities_str_tag[n]:
        if candidate != string:
            return candidate
    return None
# Parenthesised form works on both Python 2 and Python 3.
print(named_entities_str_tag)
print("")
I got this output from the above code:
('PERSON ', 'Rami Eid')
('ORGANIZATION', 'Stony Brook University')
('LOCATION ', 'NY')
('PERSON ', 'GuruRaj Bagali')
('ORGANIZATION', 'Christ University')
But I want each PERSON to be mapped to its ORGANIZATION and LOCATION, and I want to store the result in JSON format.
It's not very clear what the ne_tagged_sent list contains (is there a LOCATION for each PERSON and ORGANIZATION?); please clarify this so that we can answer your question.
You should format your data as a dictionary, each entry corresponds to a person like:
import json
data = {
    'Rami Eid': {'job': 'engineer', 'location': 'NY'},
    'GuruRaj Bagali': {'job': 'professor', 'location': 'NY'},
}
# Save it in a json file; the `with` block closes the file even on error.
with open('path/to_your_file', 'w') as json_file:
    json.dump(data, json_file)

How to extract chunks from BIO chunked sentences? - python

Given an input sentence that has BIO chunk tags:
[('What', 'B-NP'), ('is', 'B-VP'), ('the', 'B-NP'), ('airspeed',
'I-NP'), ('of', 'B-PP'), ('an', 'B-NP'), ('unladen', 'I-NP'),
('swallow', 'I-NP'), ('?', 'O')]
I would need to extract the relevant phrases out, e.g. if I want to extract 'NP', I would need to extract the fragments of tuples that contains B-NP and I-NP.
[out]:
[('What', '0'), ('the airspeed', '2-3'), ('an unladen swallow', '5-6-7')]
(Note: the numbers in the extract tuples represent the token index.)
I have tried extracting it using the following code:
def extract_chunks(tagged_sent, chunk_type):
    """Yield ``(phrase, positions)`` for each run of tokens tagged with ``chunk_type``.

    Args:
        tagged_sent: sequence of ``(word, bio_tag)`` pairs.
        chunk_type: chunk label without the B-/I- prefix, e.g. ``'NP'``.

    Yields:
        ``(words joined by spaces, token indices joined by '-')``.

    Note:
        A run is only closed by a token of another type; a ``B-`` tag inside
        a run does NOT start a new chunk, so adjacent chunks of the same type
        are reported as one.
    """
    current_chunk = []
    current_chunk_position = []
    for idx, (word, pos) in enumerate(tagged_sent):
        if '-' + chunk_type in pos:  # Append the word to the current chunk.
            current_chunk.append(word)
            # Store the index as text so both flushes can join it directly;
            # the original final flush joined ints and raised TypeError.
            current_chunk_position.append(str(idx))
        elif current_chunk:  # Flush the full chunk when out of the chunk type.
            yield ' '.join(current_chunk), '-'.join(current_chunk_position)
            current_chunk = []
            current_chunk_position = []
    if current_chunk:  # Flush the last chunk.
        yield ' '.join(current_chunk), '-'.join(current_chunk_position)
tagged_sent = [('What', 'B-NP'), ('is', 'B-VP'), ('the', 'B-NP'), ('airspeed', 'I-NP'), ('of', 'B-PP'), ('an', 'B-NP'), ('unladen', 'I-NP'), ('swallow', 'I-NP'), ('?', 'O')]
print (list(extract_chunks(tagged_sent, chunk_type='NP')))
But when I have adjacent chunk of the same type:
tagged_sent = [('The', 'B-NP'), ('Mitsubishi', 'I-NP'), ('Electric', 'I-NP'), ('Company', 'I-NP'), ('Managing', 'B-NP'), ('Director', 'I-NP'), ('ate', 'B-VP'), ('ramen', 'B-NP')]
print (list(extract_chunks(tagged_sent, chunk_type='NP')))
It outputs this:
[('The Mitsubishi Electric Company Managing Director', '0-1-2-3-4-5'), ('ramen', '7')]
Instead of the desired:
[('The Mitsubishi Electric Company', '0-1-2-3'), ('Managing Director', '4-5'), ('ramen', '7')]
How can this be resolved from the above code?
Other than how it's done from the code above, is there a better solution to extract the desired chunks of a specific chunk_type?
Try this, it will extract all types of chunks with the indices of their respective words.
def extract_chunks(tagged_sent, chunk_type='NP'):
    """Collect the chunks of ``chunk_type`` from a BIO-tagged sentence.

    Args:
        tagged_sent: sequence of ``(word, bio_tag)`` pairs, e.g. ``('the', 'B-NP')``.
        chunk_type: chunk label without the B-/I- prefix.

    Returns:
        A list of ``[phrase, positions]`` lists, where ``positions`` is the
        '-'-joined token indices of the phrase.
    """
    out_sen = []
    # True while the previous token belonged to a chunk of chunk_type, i.e.
    # an I- tag is allowed to extend out_sen[-1].
    in_chunk = False
    for idx, (word, bio) in enumerate(tagged_sent):
        # Split on the first hyphen only, so labels that contain '-' survive.
        boundary, tag = bio.split("-", 1) if "-" in bio else ('', 'O')
        if tag != chunk_type:
            in_chunk = False
            continue
        if boundary == "I" and in_chunk:
            out_sen[-1][0] += " " + word
            out_sen[-1][-1] += "-" + str(idx)
        else:
            # B- tag, untagged match, or an I- tag with no open chunk to
            # extend (sentence start / after another chunk type).
            out_sen.append([word, str(idx)])
        in_chunk = True
    return out_sen
Demo:
>>> tagged_sent = [('The', 'B-NP'), ('Mitsubishi', 'I-NP'), ('Electric', 'I-NP'), ('Company', 'I-NP'), ('Managing', 'B-NP'), ('Director', 'I-NP'), ('ate', 'B-VP'), ('ramen', 'B-NP')]
>>> output_sent = extract_chunks(tagged_sent)
>>> print map(tuple, output_sent)
[('The Mitsubishi Electric Company', '0-1-2-3'), ('Managing Director', '4-5'), ('ramen', '7')]
def extract_chunks(tagged_sent, chunk_type):
    """Yield ``(phrase, positions)`` for every ``chunk_type`` chunk in a BIO-tagged sentence.

    Args:
        tagged_sent: sequence of ``(word, bio_tag)`` pairs.
        chunk_type: chunk label without the B-/I- prefix, e.g. ``"NP"``.

    Yields:
        ``(words joined by spaces, token indices joined by '-')``.  Nothing is
        yielded for an empty sentence or when no token matches.
    """
    suffix = "-" + chunk_type
    words, positions = [], []
    for ind, (s, tp) in enumerate(tagged_sent):
        matches = tp.endswith(suffix)
        if matches and not tp.startswith("B"):
            # Continuation token: extend the open chunk (or open one if the
            # sentence starts with an I- tag).
            words.append(s)
            positions.append(str(ind))
            continue
        # A B- tag or a token of another type closes the open chunk.
        if words:
            yield " ".join(words), "-".join(positions)
            words, positions = [], []
        if matches:
            words, positions = [s], [str(ind)]
    if words:
        yield " ".join(words), "-".join(positions)
Output:
In [2]: l = [('The', 'B-NP'), ('Mitsubishi', 'I-NP'), ('Electric', 'I-NP'), ('Company', 'I-NP'), ('Managing', 'B-NP'),
...: ('Director', 'I-NP'), ('ate', 'B-VP'), ('ramen', 'B-NP')]
In [3]: list(extract_chunks(l, "NP"))
Out[3]:
[('The Mitsubishi Electric Company', '0-1-2-3'),
('Managing Director', '4-5'),
('ramen', '7')]
In [4]: l = [('What', 'B-NP'), ('is', 'B-VP'), ('the', 'B-NP'), ('airspeed', 'I-NP'), ('of', 'B-PP'), ('an', 'B-NP'), ('unladen', 'I-NP'), ('swallow', 'I-NP'), ('?', 'O')]
In [5]: list(extract_chunks(l, "NP"))
Out[5]: [('What', '0'), ('the airspeed', '2-3'), ('an unladen swallow', '5-6-7')]
I would do it like this:
import re
def extract_chunks(tagged_sent, chunk_type):
    """Return the chunks of ``chunk_type`` found in a BIO-tagged sentence.

    Args:
        tagged_sent: sequence whose items hold the word at index 0 and the
            BIO tag at index 1.
        chunk_type: regular expression searched for inside each tag, e.g. ``'NP'``.

    Returns:
        A list of ``(phrase, positions)`` tuples; ``positions`` is the
        '-'-joined token indices.  A matching token extends the previous
        chunk only when it directly follows it AND its tag does not start
        with ``B``, so adjacent chunks of the same type stay separate.
    """
    # compiles the expression we want to match
    regex = re.compile(chunk_type)
    chunks = []
    last_index = None  # index of the most recent matching token
    for index_, item in enumerate(tagged_sent):
        word, tag = item[0], item[1]
        if not regex.search(tag):
            continue
        extends_previous = last_index == index_ - 1 and not tag.startswith('B')
        if extends_previous:
            chunks[-1][0] += ' {0}'.format(word)
            chunks[-1][1] += '-{0}'.format(index_)
        else:
            chunks.append([word, str(index_)])
        last_index = index_
    # builds output in final format
    return [tuple(chunk) for chunk in chunks]
You can adapt it to use generators instead of building the whole output in memory as I do here, and refactor it for better performance (I'm in a hurry, so the code is far from optimal).
Hope it helps!

Why can't I pass wn.ADJ_SAT as a pos when requesting synsets

I know that WordNet has an "adjective satellite" synset type, and I know that it is in the synset type enum in nltk:
from nltk.corpus import wordnet as wn
wn.ADJ_SAT
u's'
Why can't I pass it as a key to synsets?
>>> wn.synsets('dog', wn.ADJ_SAT)
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/Library/Python/2.7/site-packages/nltk/corpus/reader/wordnet.py", line 1413, in synsets
for form in self._morphy(lemma, p)
File "/Library/Python/2.7/site-packages/nltk/corpus/reader/wordnet.py", line 1627, in _morphy
substitutions = self.MORPHOLOGICAL_SUBSTITUTIONS[pos]
KeyError: u's'
From:
>>> from nltk.corpus import wordnet as wn
>>> wn.synsets('able')
[Synset('able.a.01'), Synset('able.s.02'), Synset('able.s.03'), Synset('able.s.04')]
>>> wn.synsets('able', pos=wn.ADJ)
[Synset('able.a.01'), Synset('able.s.02'), Synset('able.s.03'), Synset('able.s.04')]
>>> wn.synsets('able', pos=wn.ADJ_SAT)
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/usr/local/lib/python2.7/dist-packages/nltk/corpus/reader/wordnet.py", line 1413, in synsets
for form in self._morphy(lemma, p)
File "/usr/local/lib/python2.7/dist-packages/nltk/corpus/reader/wordnet.py", line 1627, in _morphy
substitutions = self.MORPHOLOGICAL_SUBSTITUTIONS[pos]
KeyError: u's'
From https://github.com/nltk/nltk/blob/develop/nltk/corpus/reader/wordnet.py#L1397 , we see that when you try to retrieve a synset from the NLTK wordnet API, the POS restriction is applied in the returned list comprehension, which calls the self._morphy(lemma, p) function:
def synsets(self, lemma, pos=None, lang='en'):
    """Load all synsets with a given lemma and part of speech tag.
    If no pos is specified, all synsets for all parts of speech
    will be loaded.
    If lang is specified, all the synsets associated with the lemma name
    of that language will be returned.
    """
    # All lookups below use the lower-cased lemma.
    lemma = lemma.lower()
    if lang == 'en':
        get_synset = self._synset_from_pos_and_offset
        index = self._lemma_pos_offset_map
        if pos is None:
            # No restriction given: try every POS in POS_LIST.
            pos = POS_LIST
        # `pos` is iterated, and each element p is handed to
        # self._morphy(lemma, p); every form it returns is looked up in the
        # index, keeping only the offsets recorded under p.
        return [get_synset(p, offset)
                for p in pos
                for form in self._morphy(lemma, p)
                for offset in index[form].get(p, [])]
    # NOTE(review): no branch for lang != 'en' is visible in this excerpt.
If we look at the _morphy() function, from https://github.com/nltk/nltk/blob/develop/nltk/corpus/reader/wordnet.py#L1573.
def _morphy(self, form, pos):
    """Return the variants of ``form`` that are indexed under ``pos``.

    The candidate forms come from the exception list for ``pos`` or from
    repeatedly applying the suffix substitutions for ``pos``; an empty list
    is returned when nothing matches.  Both per-POS tables are read with a
    subscript on ``pos`` (the traceback quoted in this thread shows the
    MORPHOLOGICAL_SUBSTITUTIONS lookup raising KeyError for u's').
    """
    # from jordanbg:
    # Given an original string x
    # 1. Apply rules once to the input to get y1, y2, y3, etc.
    # 2. Return all that are in the database
    # 3. If there are no matches, keep applying rules until you either
    # find a match or you can't go any further
    exceptions = self._exception_map[pos]
    substitutions = self.MORPHOLOGICAL_SUBSTITUTIONS[pos]

    def apply_rules(forms):
        # For each form, replace every suffix `old` it ends with by `new`.
        return [form[:-len(old)] + new
                for form in forms
                for old, new in substitutions
                if form.endswith(old)]

    def filter_forms(forms):
        # Keep, in order and without duplicates, the forms that are in the
        # lemma index and have an entry for `pos`.
        result = []
        seen = set()
        for form in forms:
            if form in self._lemma_pos_offset_map:
                if pos in self._lemma_pos_offset_map[form]:
                    if form not in seen:
                        result.append(form)
                        seen.add(form)
        return result

    # 0. Check the exception lists
    if form in exceptions:
        return filter_forms([form] + exceptions[form])
    # 1. Apply rules once to the input to get y1, y2, y3, etc.
    forms = apply_rules([form])
    # 2. Return all that are in the database (and check the original too)
    results = filter_forms([form] + forms)
    if results:
        return results
    # 3. If there are no matches, keep applying rules until we find a match
    while forms:
        forms = apply_rules(forms)
        results = filter_forms(forms)
        if results:
            return results
    # Return an empty list if we can't find anything
    return []
We see that it retrieves some substitution rules from substitutions = self.MORPHOLOGICAL_SUBSTITUTIONS[pos] to perform some morphological reduction before it retrieves the Synsets that are stored in the "base"/"root" form. E.g.
>>> from nltk.corpus import wordnet as wn
>>> wn._morphy('dogs', 'n')
[u'dog']
And if we look at the MORPHOLOGICAL_SUBSTITUTIONS, we see that ADJ_SAT is missing, see https://github.com/nltk/nltk/blob/develop/nltk/corpus/reader/wordnet.py#L1609 :
MORPHOLOGICAL_SUBSTITUTIONS = {
NOUN: [('s', ''), ('ses', 's'), ('ves', 'f'), ('xes', 'x'),
('zes', 'z'), ('ches', 'ch'), ('shes', 'sh'),
('men', 'man'), ('ies', 'y')],
VERB: [('s', ''), ('ies', 'y'), ('es', 'e'), ('es', ''),
('ed', 'e'), ('ed', ''), ('ing', 'e'), ('ing', '')],
ADJ: [('er', ''), ('est', ''), ('er', 'e'), ('est', 'e')],
ADV: []}
Thus, to prevent this from happening, a simple fix is to add this line after line 1609 of https://github.com/nltk/nltk/blob/develop/nltk/corpus/reader/wordnet.py#L1609:
MORPHOLOGICAL_SUBSTITUTIONS[ADJ_SAT] = MORPHOLOGICAL_SUBSTITUTIONS[ADJ]
For proof of concept:
>>> MORPHOLOGICAL_SUBSTITUTIONS = {
... 1: [('s', ''), ('ses', 's'), ('ves', 'f'), ('xes', 'x'),
... ('zes', 'z'), ('ches', 'ch'), ('shes', 'sh'),
... ('men', 'man'), ('ies', 'y')],
... 2: [('s', ''), ('ies', 'y'), ('es', 'e'), ('es', ''),
... ('ed', 'e'), ('ed', ''), ('ing', 'e'), ('ing', '')],
... 3: [('er', ''), ('est', ''), ('er', 'e'), ('est', 'e')],
... 4: []}
>>>
>>> MORPHOLOGICAL_SUBSTITUTIONS[5] = MORPHOLOGICAL_SUBSTITUTIONS[3]
>>> MORPHOLOGICAL_SUBSTITUTIONS
{1: [('s', ''), ('ses', 's'), ('ves', 'f'), ('xes', 'x'), ('zes', 'z'), ('ches', 'ch'), ('shes', 'sh'), ('men', 'man'), ('ies', 'y')], 2: [('s', ''), ('ies', 'y'), ('es', 'e'), ('es', ''), ('ed', 'e'), ('ed', ''), ('ing', 'e'), ('ing', '')], 3: [('er', ''), ('est', ''), ('er', 'e'), ('est', 'e')], 4: [], 5: [('er', ''), ('est', ''), ('er', 'e'), ('est', 'e')]}
Thanks for the suggestion, adopted in https://github.com/nltk/nltk/commit/21fd0c538b0144c5f722e4f643c9ced1deb1f15e

Learn Python the Hard Way, Exercise 48

I've been all day trying to solve the test_errors() function in "Exercise 48: Advanced User Input" of the book Learn Python The Hard Way.
assert_equal(), a function in the tests asks me for the tuples in order and I haven't been able to code it that way.
My loop always returns the nouns first and the error tuples last. I don't know how to break the loop so that it starts again with the right values, or whatever else is necessary to return these tuples in the order they should be.
Here's the code:
class Lexicon(object):
    def scan(self, stringo):
        """Return the vocabulary (type, word) tuples whose word occurs in ``stringo``.

        The result follows the order of ``vocabulary`` (outer loop), not the
        order in which the words appear in ``stringo``.
        """
        vocabulary = [[('direction', 'north'), ('direction', 'south'), ('direction', 'east'), ('direction', 'west')],
                      [('verb', 'go'), ('verb', 'kill'), ('verb', 'eat')],
                      [('stop', 'the'), ('stop', 'in'), ('stop', 'of')],
                      [('noun', 'bear'), ('noun', 'princess')], # Remember numbers
                      [('error', 'ASDFADFASDF'), ('error', 'IAS')],
                      [('number', '1234'), ('number','3'), ('number', '91234')]]
        self.stringo = stringo
        got_word = ''  # never read again in this method
        value = []
        rompe = self.stringo.split() # split breaks the string on whitespace
        for asigna in vocabulary:
            for encuentra in asigna:
                # Membership test against ALL input words at once, which is
                # why the input order is not preserved.
                if encuentra[1] in rompe:
                    value.append(encuentra)
        return value
# Module-level instance imported by the tests.
eLexicon = Lexicon()
from nose.tools import *
from ex48.ex48 import eLexicon
def test_directions():
assert_equal(eLexicon.scan("north"), [('direction', 'north')])
result = eLexicon.scan("north south east")
assert_equal(result, [('direction', 'north'),
('direction', 'south'),
('direction', 'east')])
def test_verbs():
assert_equal(eLexicon.scan("go"), [('verb', 'go')])
result = eLexicon.scan("go kill eat")
assert_equal(result, [('verb', 'go'),
('verb', 'kill'),
('verb', 'eat')])
def test_stops():
assert_equal(eLexicon.scan("the"), [('stop', 'the')])
result = eLexicon.scan("the in of")
assert_equal(result, [('stop', 'the'),
('stop', 'in'),
('stop', 'of')])
def test_nouns():
assert_equal(eLexicon.scan("bear"), [('noun', 'bear')])
result = eLexicon.scan("bear princess")
assert_equal(result, [('noun', 'bear'),
('noun', 'princess')])
#def test_numbers():
# assert_equal(lexicon.scan("1234"), [('number', 1234)])
# result = lexicon.scan("3 91234")
# assert_equal(result, [('number', 3),
# ('number', 91234)])
def test_errors():
assert_equal(eLexicon.scan("ASDFADFASDF"), [('error', 'ASDFADFASDF')])
result = eLexicon.scan("bear IAS princess")
assert_equal(result, [('noun', 'bear'),
('error', 'IAS'),
('noun', 'princess')])
======================================================================
FAIL: tests.ex48_tests.test_errors
----------------------------------------------------------------------
Traceback (most recent call last):
File "/usr/local/lib/python2.7/dist-packages/nose/case.py", line 197, in runTest
self.test(*self.arg)
File "/home/totoro/Desktop/Python/projects/ex48/tests/ex48_tests.py", line 43, in test_errors
('noun', 'princess')])
AssertionError: Lists differ: [('noun', 'bear'), ('noun', 'p... != [('noun', 'bear'), ('error', '...
First differing element 1:
('noun', 'princess')
('error', 'IAS')
- [('noun', 'bear'), ('noun', 'princess'), ('error', 'IAS')]
+ [('noun', 'bear'), ('error', 'IAS'), ('noun', 'princess')]
----------------------------------------------------------------------
Ran 5 tests in 0.006s
Many thanks in advance for taking the time.
The words in the test are in the same order going in as coming out. As such you need to re-order your for-loops to iterate over the input first:
value = []
# The outer loop walks the input words, so matches are appended in the
# order the words appear in the input.
for rompe in stringo.split():
    for asigna in vocabulary:
        for encuentra in asigna:
            # Index 1 of each vocabulary entry is compared with the current
            # input word (assumes the question's (type, word) tuples).
            if encuentra[1] == rompe:
                value.append(encuentra)
This will return the encuentras in the correct order.
Note 1: You should not be hard-coding the numbers or errors.
Note 2: You can drastically reduce the complexity of this algorithm by using a dictionary or two.
Example:
vocabulary = {
    'direction': 'north east south west up down left right back'.split(),
    'noun': 'bear princess door cabinet'.split(),
    'stop': 'the in of from at it'.split(),
    'verb': 'go kill eat stop'.split(),
}

# Reverse lookup built with a dict comprehension: word -> category, e.g.
#   {'at': 'stop',
#    # [...]
#    'up': 'direction',
#    'west': 'direction'}
# .items() works on both Python 2 and Python 3; .iteritems() exists only on
# Python 2.
classifications = {i: k for k, v in vocabulary.items() for i in v}


def classify(word):
    """Return ``('number', int)`` for integers, else ``(category, word)``.

    Words that are not in the vocabulary get the category ``'error'``.
    """
    try:
        return 'number', int(word)
    except ValueError:
        return classifications.get(word, 'error'), word


def scan(words):
    """Classify every whitespace-separated word of ``words``, in input order."""
    return [classify(word) for word in words.split()]
# Walk the input words in order; for each vocabulary group append the first
# entry whose word (index 1) matches.  Iterating over the group instead of
# using fixed indices 0..3 avoids an IndexError on groups that hold fewer
# than four entries.
for word in self.stringo.split():
    for pair in vocabulary:
        for entry in pair:
            if entry[1] == word:
                value.append(entry)
                break
I just finished this exercise, hope this gives some new ideas to some of you.
This is my solution:
# Set up the data structure: each category is a (word list, label) pair.
direction = ["north", "east", "south", "west", "up", "right", "down", "left", "back"]
verb = ["go", "stop", "kill", "eat"]
stop = ["the", "in", "of", "from", "at", "it"]
noun = ["door", "bear", "princess", "cabinet"]
vocabulary = [(direction, 'direction'), (verb, 'verb'), (stop, 'stop'), (noun, 'noun')]


def scan(sentence):
    """Classify each whitespace-separated word of ``sentence``.

    A word found (case-insensitively) in a category yields ``(label, word)``
    with the word's original casing; otherwise an integer literal yields
    ``('number', int)`` and anything else ``('error', word)``.

    Returns:
        The list of result tuples in input order.
    """
    results = []
    for word in sentence.split():
        found = False
        for words_in_category, label in vocabulary:
            if word.lower() in words_in_category:
                results.append((label, word))
                found = True
        if found:
            continue
        if isInt_str(word):
            results.append(('number', int(word)))
        else:
            results.append(('error', word))
    return results


def isInt_str(string):
    """Return True if ``int()`` can parse ``string``, else False.

    The check delegates to ``int()`` itself, so it can never disagree with
    the conversion that ``scan`` performs afterwards (inputs such as "2.0"
    or "+-5" are rejected here instead of crashing there).
    """
    try:
        int(str(string).strip())
    except ValueError:
        return False
    return True

Categories