I am trying to run WordNet from NLTK, but wordnet.py raises "NameError: name 'lemma_from_key' is not defined" at line 1680, even though the function lemma_from_key() is defined in the same class, _WordNetObject. The relevant portion of the code is as follows:
class _WordNetObject:
def lemma(self, name, lang="eng"):
"""Return lemma object that matches the name"""
# cannot simply split on first '.',
# e.g.: '.45_caliber.a.01..45_caliber'
separator = SENSENUM_RE.search(name).end()
synset_name, lemma_name = name[: separator - 1], name[separator:]
synset = self.synset(synset_name)
for lemma in synset.lemmas(lang):
if lemma._name == lemma_name:
return lemma
raise WordNetError(f"no lemma {lemma_name!r} in {synset_name!r}")
def lemma_from_key(self, key):
# Keys are case sensitive and always lower-case
key = key.lower()
lemma_name, lex_sense = key.split("%")
pos_number, lexname_index, lex_id, _, _ = lex_sense.split(":")
pos = self._pos_names[int(pos_number)]
# open the key -> synset file if necessary
if self._key_synset_file is None:
self._key_synset_file = self.open("index.sense")
# Find the synset for the lemma.
synset_line = _binary_search_file(self._key_synset_file, key)
if not synset_line:
raise WordNetError("No synset found for key %r" % key)
offset = int(synset_line.split()[1])
synset = self.synset_from_pos_and_offset(pos, offset)
# return the corresponding lemma
for lemma in synset._lemmas:
if lemma._key == key:
return lemma
raise WordNetError("No lemma found for for key %r" % key)
#############################################################
# Loading Synsets
#############################################################
def synset(self, name):
# split name into lemma, part of speech and synset number
lemma, pos, synset_index_str = name.lower().rsplit(".", 2)
synset_index = int(synset_index_str) - 1
# get the offset for this synset
try:
offset = self._lemma_pos_offset_map[lemma][pos][synset_index]
except KeyError as e:
message = "no lemma %r with part of speech %r"
raise WordNetError(message % (lemma, pos)) from e
except IndexError as e:
n_senses = len(self._lemma_pos_offset_map[lemma][pos])
message = "lemma %r with part of speech %r has only %i %s"
if n_senses == 1:
tup = lemma, pos, n_senses, "sense"
else:
tup = lemma, pos, n_senses, "senses"
raise WordNetError(message % tup) from e
# load synset information from the appropriate file
synset = self.synset_from_pos_and_offset(pos, offset)
# some basic sanity checks on loaded attributes
if pos == "s" and synset._pos == "a":
message = (
"adjective satellite requested but only plain "
"adjective found for lemma %r"
)
raise WordNetError(message % lemma)
assert synset._pos == pos or (pos == "a" and synset._pos == "s")
# Return the synset object.
return synset
def _data_file(self, pos):
"""
Return an open file pointer for the data file for the given
part of speech.
"""
if pos == ADJ_SAT:
pos = ADJ
if self._data_file_map.get(pos) is None:
fileid = "data.%s" % self._FILEMAP[pos]
self._data_file_map[pos] = self.open(fileid)
return self._data_file_map[pos]
def synset_from_pos_and_offset(self, pos, offset):
"""
- pos: The synset's part of speech, matching one of the module level
attributes ADJ, ADJ_SAT, ADV, NOUN or VERB ('a', 's', 'r', 'n', or 'v').
- offset: The byte offset of this synset in the WordNet dict file
for this pos.
>>> from nltk.corpus import wordnet as wn
>>> print(wn.synset_from_pos_and_offset('n', 1740))
Synset('entity.n.01')
"""
# Check to see if the synset is in the cache
if offset in self._synset_offset_cache[pos]:
return self._synset_offset_cache[pos][offset]
data_file = self._data_file(pos)
data_file.seek(offset)
data_file_line = data_file.readline()
# If valid, the offset equals the 8-digit 0-padded integer found at the start of the line:
line_offset = data_file_line[:8]
if line_offset.isalnum() and offset == int(line_offset):
synset = self._synset_from_pos_and_line(pos, data_file_line)
assert synset._offset == offset
self._synset_offset_cache[pos][offset] = synset
else:
synset = None
raise WordNetError(
f"No WordNet synset found for pos={pos} at offset={offset}."
)
data_file.seek(0)
return synset
#deprecated("Use public method synset_from_pos_and_offset() instead")
def _synset_from_pos_and_offset(self, *args, **kwargs):
"""
Hack to help people like the readers of
https://stackoverflow.com/a/27145655/1709587
who were using this function before it was officially a public method
"""
return self.synset_from_pos_and_offset(*args, **kwargs)
def _synset_from_pos_and_line(self, pos, data_file_line):
# Construct a new (empty) synset.
synset = Synset(self)
# parse the entry for this synset
try:
# parse out the definitions and examples from the gloss
columns_str, gloss = data_file_line.strip().split("|")
definition = re.sub(r"[\"].*?[\"]", "", gloss).strip()
examples = re.findall(r'"([^"]*)"', gloss)
for example in examples:
synset._examples.append(example)
synset._definition = definition.strip("; ")
# split the other info into fields
_iter = iter(columns_str.split())
def _next_token():
return next(_iter)
# get the offset
synset._offset = int(_next_token())
# determine the lexicographer file name
lexname_index = int(_next_token())
synset._lexname = self._lexnames[lexname_index]
# get the part of speech
synset._pos = _next_token()
# create Lemma objects for each lemma
n_lemmas = int(_next_token(), 16)
for _ in range(n_lemmas):
# get the lemma name
lemma_name = _next_token()
# get the lex_id (used for sense_keys)
lex_id = int(_next_token(), 16)
# If the lemma has a syntactic marker, extract it.
m = re.match(r"(.*?)(\(.*\))?$", lemma_name)
lemma_name, syn_mark = m.groups()
# create the lemma object
lemma = Lemma(self, synset, lemma_name, lexname_index, lex_id, syn_mark)
synset._lemmas.append(lemma)
synset._lemma_names.append(lemma._name)
# collect the pointer tuples
n_pointers = int(_next_token())
for _ in range(n_pointers):
symbol = _next_token()
offset = int(_next_token())
pos = _next_token()
lemma_ids_str = _next_token()
if lemma_ids_str == "0000":
synset._pointers[symbol].add((pos, offset))
else:
source_index = int(lemma_ids_str[:2], 16) - 1
target_index = int(lemma_ids_str[2:], 16) - 1
source_lemma_name = synset._lemmas[source_index]._name
lemma_pointers = synset._lemma_pointers
tups = lemma_pointers[source_lemma_name, symbol]
tups.append((pos, offset, target_index))
# read the verb frames
try:
frame_count = int(_next_token())
except StopIteration:
pass
else:
for _ in range(frame_count):
# read the plus sign
plus = _next_token()
assert plus == "+"
# read the frame and lemma number
frame_number = int(_next_token())
frame_string_fmt = VERB_FRAME_STRINGS[frame_number]
lemma_number = int(_next_token(), 16)
# lemma number of 00 means all words in the synset
if lemma_number == 0:
synset._frame_ids.append(frame_number)
for lemma in synset._lemmas:
lemma._frame_ids.append(frame_number)
lemma._frame_strings.append(frame_string_fmt % lemma._name)
# only a specific word in the synset
else:
lemma = synset._lemmas[lemma_number - 1]
lemma._frame_ids.append(frame_number)
lemma._frame_strings.append(frame_string_fmt % lemma._name)
# raise a more informative error with line text
except ValueError as e:
raise WordNetError(f"line {data_file_line!r}: {e}") from e
# set sense keys for Lemma objects - note that this has to be
# done afterwards so that the relations are available
for lemma in synset._lemmas:
if synset._pos == ADJ_SAT:
head_lemma = synset.similar_tos()[0]._lemmas[0]
head_name = head_lemma._name
head_id = "%02d" % head_lemma._lex_id
else:
head_name = head_id = ""
tup = (
lemma._name,
WordNetCorpusReader._pos_numbers[synset._pos],
lemma._lexname_index,
lemma._lex_id,
head_name,
head_id,
)
lemma._key = ("%s%%%d:%02d:%02d:%s:%s" % tup).lower()
# the canonical name is based on the first lemma
lemma_name = synset._lemmas[0]._name.lower()
offsets = self._lemma_pos_offset_map[lemma_name][synset._pos]
sense_index = offsets.index(synset._offset)
tup = lemma_name, synset._pos, sense_index + 1
synset._name = "%s.%s.%02i" % tup
return synset
def synset_from_sense_key(self, sense_key):
"""
Retrieves synset based on a given sense_key. Sense keys can be
obtained from lemma.key()
From https://wordnet.princeton.edu/documentation/senseidx5wn:
A sense_key is represented as::
lemma % lex_sense (e.g. 'dog%1:18:01::')
where lex_sense is encoded as::
ss_type:lex_filenum:lex_id:head_word:head_id
:lemma: ASCII text of word/collocation, in lower case
:ss_type: synset type for the sense (1 digit int)
The synset type is encoded as follows::
1 NOUN
2 VERB
3 ADJECTIVE
4 ADVERB
5 ADJECTIVE SATELLITE
:lex_filenum: name of lexicographer file containing the synset for the sense (2 digit int)
:lex_id: when paired with lemma, uniquely identifies a sense in the lexicographer file (2 digit int)
:head_word: lemma of the first word in satellite's head synset
Only used if sense is in an adjective satellite synset
:head_id: uniquely identifies sense in a lexicographer file when paired with head_word
Only used if head_word is present (2 digit int)
>>> import nltk
>>> from nltk.corpus import wordnet as wn
>>> print(wn.synset_from_sense_key("drive%1:04:03::"))
Synset('drive.n.06')
>>> print(wn.synset_from_sense_key("driving%1:04:03::"))
Synset('drive.n.06')
"""
return self.lemma_from_key(sense_key).synset()#line 1680
The full code can be found in the NLTK documentation.
I was trying to run the WordNet code to implement BERT. I installed NLTK using pip install nltk from the Anaconda command prompt, but the code gives me the error: NameError: name 'lemma_from_key' is not defined.
Since you installed using pip install nltk, it most likely installed the latest published version of the code. There seems to be a bug in that code, as can be seen in the latest version (3.7) source code here.
The issue in version 3.7 is that on line 1680, the function lemma_from_key is being called, but it does not exist. To call the class method lemma_from_key, one needs to use self.lemma_from_key.
You can try using an older version, 3.6.5, which does not have this issue. Install it by:
pip install nltk==3.6.5
I can also see that the develop branch of nltk has fixed this issue. I assume that this will be resolved in a future release, which you can later upgrade to.
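Until a fixed release is published, another option is to patch the method at runtime. A rough sketch (untested, assuming NLTK 3.7 and that the only problem is the missing self. on that line):
from nltk.corpus import wordnet as wn
from nltk.corpus.reader.wordnet import WordNetCorpusReader

def _patched_synset_from_sense_key(self, sense_key):
    # delegate to the bound lemma_from_key, which is what the fixed code does
    return self.lemma_from_key(sense_key).synset()

WordNetCorpusReader.synset_from_sense_key = _patched_synset_from_sense_key
print(wn.synset_from_sense_key("drive%1:04:03::"))  # Synset('drive.n.06')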
I have a question.
I want to create triples from a review dataset and visualize the relationships within the reviews, separately from each other.
Apologies for the huge amount of code, but it is required to successfully run the triple extractor.
import spacy
import crosslingual_coreference
DEVICE = -1 # Number of the GPU, -1 if want to use CPU
# Add coreference resolution model
coref = spacy.load('en_core_web_sm', disable=['ner', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer'])
coref.add_pipe(
"xx_coref", config={"device": DEVICE})
#part 2
import requests

def call_wiki_api(item):
try:
url = f"https://www.wikidata.org/w/api.php?action=wbsearchentities&search={item}&language=en&format=json"
data = requests.get(url).json()
# Return the first id (Could upgrade this in the future)
return data['search'][0]['id']
except:
return 'id-less'
#part 3
import re
from typing import List
from spacy import Language, util
from spacy.tokens import Doc, Span
from transformers import pipeline
def extract_triplets(text: str) -> List[str]:
"""
parses the text to triplets
1. Split the text into tokens
2. If the token is <triplet>, <subj>, or <obj>, then set the current variable to the appropriate value
3. If the token is not one of the above, then append it to the appropriate variable
4. If the current variable is <subj>, then append the triplet to the list of triplets
:param text: str - the text to be parsed
:type text: str
:return: A list of dictionaries.
"""
triplets = []
relation, subject, relation, object_ = "", "", "", ""
text = text.strip()
current = "x"
for token in text.replace("<s>", "").replace("<pad>", "").replace("</s>", "").split():
if token == "<triplet>":
current = "t"
if relation != "":
triplets.append(
{"head": subject.strip(), "type": relation.strip(), "tail": object_.strip()}
)
relation = ""
subject = ""
elif token == "<subj>":
current = "s"
if relation != "":
triplets.append(
{"head": subject.strip(), "type": relation.strip(), "tail": object_.strip()}
)
object_ = ""
elif token == "<obj>":
current = "o"
relation = ""
else:
if current == "t":
subject += " " + token
elif current == "s":
object_ += " " + token
elif current == "o":
relation += " " + token
if subject != "" and relation != "" and object_ != "":
triplets.append(
{"head": subject.strip(), "type": relation.strip(), "tail": object_.strip()}
)
return triplets
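# Usage sketch (the string below is a hypothetical REBEL-style generation, for illustration only):
#   extract_triplets("<s><triplet> Christian Drosten <subj> Germany <obj> country of citizenship</s>")
#   -> [{'head': 'Christian Drosten', 'type': 'country of citizenship', 'tail': 'Germany'}]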
@Language.factory(
"rebel",
requires=["doc.sents"],
assigns=["doc._.rel"],
default_config={
"model_name": "Babelscape/rebel-large",
"device": 0,
},
)
class RebelComponent:
def __init__(
self,
nlp,
name,
model_name: str,
device: int,
):
assert model_name is not None, ""
self.triplet_extractor = pipeline(
"text2text-generation", model=model_name, tokenizer=model_name, device=device
)
# Register custom extension on the Doc
if not Doc.has_extension("rel"):
Doc.set_extension("rel", default={})
def _generate_triplets(self, sents: List[Span]) -> List[List[dict]]:
"""
1. We pass the text of the sentence to the triplet extractor.
2. The triplet extractor returns a list of dictionaries.
3. We extract the token ids from the dictionaries.
4. We decode the token ids into text.
5. We extract the triplets from the text.
6. We return the triplets.
The triplet extractor is a model that takes a sentence as input and returns a list of dictionaries.
Each dictionary contains the token ids of the extracted triplets.
The token ids are the numbers that represent the words in the sentence.
For example, the token id of the word "the" is 2.
The token ids are decoded into text using the tokenizer.
The tokenizer is a model that takes a list of token ids as input and returns a list of words.
:param sents: List[Span]
:type sents: List[Span]
:return: A list of lists of dicts.
"""
output_ids = self.triplet_extractor(
[sent.text for sent in sents], return_tensors=True, return_text=False
) # [0]["generated_token_ids"]
extracted_texts = self.triplet_extractor.tokenizer.batch_decode(
[out["generated_token_ids"] for out in output_ids]
)
extracted_triplets = []
for text in extracted_texts:
extracted_triplets.extend(extract_triplets(text))
return extracted_triplets
def set_annotations(self, doc: Doc, triplets: List[dict]):
"""
The function takes a spacy Doc object and a list of triplets (dictionaries) as input.
For each triplet, it finds the substring in the Doc object that matches the head and tail of the triplet.
It then creates a spacy span object for each of the head and tail.
Finally, it creates a dictionary of the relation type, head span and tail span and adds it to the Doc object
:param doc: the spacy Doc object
:type doc: Doc
:param triplets: List[dict]
:type triplets: List[dict]
"""
for triplet in triplets:
# get substring to spacy span
head_span = re.search(triplet["head"], doc.text)
tail_span = re.search(triplet["tail"], doc.text)
# get spacy span
if head_span is not None:
head_span = doc.char_span(head_span.start(), head_span.end())
else:
head_span = triplet["head"]
if tail_span is not None:
tail_span = doc.char_span(tail_span.start(), tail_span.end())
else:
tail_span = triplet["tail"]
offset = (head_span.start, tail_span.start)
if offset not in doc._.rel:
doc._.rel[offset] = {
"relation": triplet["type"],
"head_span": head_span,
"tail_span": tail_span,
}
def __call__(self, doc: Doc) -> Doc:
"""
The function takes a doc object and returns a doc object
:param doc: Doc
:type doc: Doc
:return: A Doc object with the sentence triplets added as annotations.
"""
sentence_triplets = self._generate_triplets(doc.sents)
self.set_annotations(doc, sentence_triplets)
return doc
def pipe(self, stream, batch_size=128):
"""
It takes a stream of documents, and for each document,
it generates a list of sentence triplets,
and then sets the annotations for each sentence in the document
:param stream: a generator of Doc objects
:param batch_size: The number of documents to process at a time, defaults to 128 (optional)
"""
for docs in util.minibatch(stream, size=batch_size):
sents = []
for doc in docs:
sents += doc.sents
sentence_triplets = self._generate_triplets(sents)
index = 0
for doc in docs:
n_sent = len(list(doc.sents))
self.set_annotations(doc, sentence_triplets[index : index + n_sent])
index += n_sent
yield doc
The following code is also required.
# Define rel extraction model
rel_ext = spacy.load('en_core_web_sm', disable=['ner', 'lemmatizer', 'attribute_rules', 'tagger'])
rel_ext.add_pipe("rebel", config={
'device':DEVICE, # Number of the GPU, -1 if want to use CPU
'model_name':'Babelscape/rebel-large'} # Model used, will default to 'Babelscape/rebel-large' if not given
)
This creates the following triple extractor.
input_text = "Christian Drosten works in Germany. He likes to work for Google."
coref_text = coref(input_text)._.resolved_text
doc = rel_ext(coref_text)
for value, rel_dict in doc._.rel.items():
print(f"{value}: {rel_dict}")
# {'relation': 'country of citizenship', 'head_span': {'text': 'Christian Drosten', 'id': 'Q1079331'}, 'tail_span': {'text': 'Germany', 'id': 'Q183'}}
# {'relation': 'employer', 'head_span': {'text': 'Christian Drosten', 'id': 'Q1079331'}, 'tail_span': {'text': 'Google', 'id': 'Q95'}}
Now, assume that I want to process the following dictionary, with the individual reviews, separately from each other, as the input_text. Is there any way to create triples from the reviews individually? If so, how would this be possible?
reviews = {0: 'Too Heavy and Poor weld quality,',
1: 'difficult mount',
2: 'just got it installed',
3: 'Decent Little Reader, Poor Tablet',
4: 'Ok For What It Is'}
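For illustration, here is a sketch of the kind of per-review loop I have in mind (assuming the coref and rel_ext pipelines defined above), in case it helps clarify the question:
triples_per_review = {}
for review_id, review_text in reviews.items():
    resolved = coref(review_text)._.resolved_text
    doc = rel_ext(resolved)
    triples_per_review[review_id] = list(doc._.rel.values())
for review_id, triples in triples_per_review.items():
    print(review_id, triples)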
Looking forward to your response.
Kind regards
I would like to build a dictionary of abbreviations.
I have a text file with a lot of abbreviations. The text file looks like this (after import):
with open('abreviations.txt') as ab:
ab_words = ab.read().splitlines()
An extract:
'ACE',
'Access Control Entry',
'ACK',
'Acknowledgement',
'ACORN',
'A Completely Obsessive Really Nutty person',
Now I want to build the dictionary, with every odd line as a dictionary key and every even line as the corresponding dictionary value.
Hence, at the end, I should be able to write:
ab_dict['ACE']
and get the result:
'Access Control Entry'
Also, how can I make it case-insensitive?
ab_dict['ace']
should yield the same result
'Access Control Entry'
In fact, it would be perfect if the output were also lower case:
'access control entry'
Here is a link to the text file: https://www.dropbox.com/s/91afgnupk686p9y/abreviations.txt?dl=0
A complete solution with a custom ABDict class and Python's iterator protocol:
class ABDict(dict):
''' Class representing a dictionary of abbreviations'''
def __getitem__(self, key):
v = dict.__getitem__(self, key.upper())
return v.lower() if key.islower() else v
with open('abbreviations.txt') as ab:
ab_dict = ABDict()
while True:
try:
k = next(ab).strip() # `key` line
v = next(ab).strip() # `value` line
ab_dict[k] = v
except StopIteration:
break
Now, testing (the case of the output follows the case of the key):
print(ab_dict['ACE'])
print(ab_dict['ace'])
print('*' * 10)
print(ab_dict['WYTB'])
print(ab_dict['wytb'])
The output (consecutively):
Access Control Entry
access control entry
**********
Wish You The Best
wish you the best
Here's another solution based on the pairwise function from this solution:
from requests.structures import CaseInsensitiveDict
def pairwise(iterable):
"s -> (s0, s1), (s2, s3), (s4, s5), ..."
a = iter(iterable)
return zip(a, a)
with open('abreviations.txt') as reader:
abr_dict = CaseInsensitiveDict()
for abr, full in pairwise(reader):
abr_dict[abr.strip()] = full.strip()
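A quick usage check (note that CaseInsensitiveDict only ignores the case of the keys, so the stored value is returned unchanged):
print(abr_dict['ACE'])   # Access Control Entry
print(abr_dict['ace'])   # Access Control Entry (same entry, key case is ignored)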
Here is an answer that also allows abbreviations in a sentence to be replaced with their expansions from the dictionary:
import re
from requests.structures import CaseInsensitiveDict
def read_file_dict(filename):
"""
Reads file data into CaseInsensitiveDict
"""
# lists for keys and values
keys = []
values = []
# case-insensitive dict
data = CaseInsensitiveDict()
# count used for deciding which line we're on
count = 1
with open(filename) as file:
temp = file.read().splitlines()
for line in temp:
# if the line count is even, a value is being read
if count % 2 == 0:
values.append(line)
# otherwise, a key is being read
else:
keys.append(line)
count += 1
# Add to dictionary
# perhaps some error checking here would be good
for key, value in zip(keys, values):
data[key] = value
return data
def replace_word(ab_dict, sentence):
"""
Replaces sentence with words found in dictionary
"""
# not necessarily words, but you get the idea
words = re.findall(r"[\w']+|[.,!?; ]", sentence)
new_words = []
for word in words:
# if word is in dictionary, replace it and add it to resulting list
if word in ab_dict:
new_words.append(ab_dict[word])
# otherwise add it as normally
else:
new_words.append(word)
# return sentence with replaced words
return "".join(x for x in new_words)
def main():
ab_dict = read_file_dict("abreviations.txt")
print(ab_dict)
print(ab_dict['ACE'])
print(ab_dict['Ace'])
print(ab_dict['ace'])
print(replace_word(ab_dict, "The ACE is not easy to understand"))
if __name__ == '__main__':
main()
Which outputs:
{'ACE': 'Access Control Entry', 'ACK': 'Acknowledgement', 'ACORN': 'A Completely Obsessive Really Nutty person'}
Access Control Entry
Access Control Entry
Access Control Entry
The Access Control Entry is not easy to understand
se_eng_fr_dict = {'School': ['Skola', 'Ecole'], 'Ball': ['Boll', 'Ballon']}
choose_language = raw_input("Type 'English', for English. Skriv 'svenska' fo:r svenska. Pour francais, ecrit 'francais'. ")
if choose_language == 'English':
word = raw_input("Type in a word:")
swe_word = se_eng_fr_dict[word][0]
fra_word = se_eng_fr_dict[word][1]
print word, ":", swe_word, "pa. svenska," , fra_word, "en francais."
elif choose_language == 'Svenska':
word = raw_input("Vilket ord:")
for key, value in se_eng_fr_dict.iteritems():
if value == word:
print key
I want to create a dictionary (to be stored locally as a txt file) where the user can choose between entering a word in English, Swedish or French to get the translation of that word in the two other languages. The user should also be able to add data to the dictionary.
The code works when I look up the Swedish and French words with the English word. But how can I get the key and the other value if I only have one of the values?
Is there a way, or should I approach this problem differently?
A good option would be to store None for any translation that hasn't been set. While it increases the amount of memory required, you could go a step further and key each entry by the language itself.
Example:
se_eng_fr_dict = {'pencil': {'se': None, 'fr': 'crayon'}}
def translate(word, lang):
# If dict.get() finds no value with `word` it will return
# None by default. We override it with an empty dictionary `{}`
# so we can always call `.get` on the result.
translated = se_eng_fr_dict.get(word, {}).get(lang)
if translated is None:
print("No {lang} translation found for {word}.format(**locals()))
else:
print("{} is {} in {}".format(word, translated, lang))
translate('pencil', 'fr')
translate('pencil', 'se')
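Adding data then only requires writing into the nested dictionary; a small sketch using the same layout:
se_eng_fr_dict.setdefault('ball', {})['se'] = 'boll'
se_eng_fr_dict['ball']['fr'] = 'ballon'
translate('ball', 'fr')  # ball is ballon in fr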
I hope there is a better solution, but here is mine:
class Word:
def __init__(self, en, fr, se):
self.en = en
self.fr = fr
self.se = se
def __str__(self):
return '<%s,%s,%s>' % (self.en, self.fr, self.se)
Then you dump all these Words into a mapping data structure. You can use a dictionary, but if you have a huge data set it is better to use a BST; have a look at https://pypi.python.org/pypi/bintrees/2.0.1
Let's say you have all these Words loaded in a list named words, then:
en_words = {w.en: w for w in words}
fr_words = {w.fr: w for w in words}
se_words = {w.se: w for w in words}
Again, a BST is recommended here.
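For example, a reverse lookup then looks like this (a small sketch, assuming words contains Word('Ball', 'Ballon', 'Boll')):
w = fr_words['Ballon']   # look up by the French form
print(w.en, w.se)        # -> Ball Boll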
Maybe a set of nested lists would be better for this:
>>> my_list = [
[
"School", "Skola", "Ecole"
],
[
"Ball", "Boll", "Ballon"
]
]
Then you can access the set of translations by doing:
>>> position = [index for index, item in enumerate(my_list) for subitem in item if value == subitem][0]
This returns the index of the list, which you can grab:
>>> sub_list = my_list[position]
And the sublist will have all the translations in order.
For example:
>>> position = [index for index, item in enumerate(my_list) for subitem in item if "Ball" == subitem][0]
>>> print position
1
>>> my_list[position]
['Ball', 'Boll', 'Ballon']
In order to speedup word lookups and achieve a good flexibility, I'd choose a dictionary of subdictionaries: each subdictionary translates the words of a language into all the available languages and the top-level dictionary maps each language into the corresponding subdictionary.
For example, if multidict is the top-level dictionary, then multidict['english']['ball'] returns the (sub)dictionary:
{'english':'ball', 'francais':'ballon', 'svenska':'ball'}
Below is a class Multidictionary implementing such an idea.
For convenience, it assumes that all the translations are stored in a text file in CSV format, which is read at initialization time, e.g.:
english,svenska,francais,italiano
school,skola,ecole,scuola
ball,boll,ballon,palla
Any number of languages can be easily added to the CSV file.
class Multidictionary(object):
def __init__(self, fname=None):
'''Init a multidicionary from a CSV file.
The file describes a word per line, separating all the available
translations with a comma.
First file line must list the corresponding languages.
For example:
english,svenska,francais,italiano
school,skola,ecole,scuola
ball,boll,ballon,palla
'''
self.fname = fname
self.multidictionary = {}
if fname is not None:
import csv
with open(fname) as csvfile:
reader = csv.DictReader(csvfile)
for translations in reader:
for lang, word in translations.iteritems():
self.multidictionary.setdefault(lang, {})[word] = translations
def get_available_languages(self):
'''Return the list of available languages.'''
return sorted(self.multidictionary)
def translate(self, word, language):
'''Return a dictionary containing the translations of a word (in a
specified language) into all the available languages.
'''
if language in self.get_available_languages():
translations = self.multidictionary[language].get(word)
else:
print 'Invalid language %r selected' % language
translations = None
return translations
def get_translations(self, word, language):
'''Generate the string containing the translations of a word in a
language into all the other available languages.
'''
translations = self.translate(word, language)
if translations:
other_langs = (lang for lang in translations if lang != language)
lang_trans = ('%s in %s' % (translations[lang], lang) for lang in other_langs)
s = '%s: %s' % (word, ', '.join(lang_trans))
else:
print '%s word %r not found' % (language, word)
s = None
return s
if __name__ == '__main__':
multidict = Multidictionary('multidictionary.csv')
print 'Available languages:', ', '.join(multidict.get_available_languages())
language = raw_input('Choose the input language: ')
word = raw_input('Type a word: ')
translations = multidict.get_translations(word, language)
if translations:
print translations
I'm trying to get the creation date for all the photos and videos in a folder, and I'm having mixed success. I have .jpg photos and .mov and .mp4 videos in this folder.
I spent a long time looking at other posts, and I saw quite a few references to the MMPython library here: http://sourceforge.net/projects/mmpython/
Looking through the MMPython source, I think it will give me what I need, but the problem is that I don't know how to invoke it. In other words, I have my file, but I don't know how to interface with MMPython, and I can't find any examples.
Here is my script:
import os
import sys
import exifread
import hashlib
import ExifTool
if len(sys.argv) > 1:
var = sys.argv[1]
else:
var = raw_input("Please enter the directory: ")
direct = '/Users/bbarr233/Documents/Personal/projects/photoOrg/photos'
print "direct: " + direct
print "var: " + var
var = var.rstrip()
for root, dirs, filenames in os.walk(var):
print "root " + root
for f in filenames:
#make sure that we are dealing with images or videos
if f.find(".jpg") > -1 or f.find(".jpeg") > -1 or f.find(".mov") > -1 or f.find(".mp4") > -1:
print "file " + root + "/" + f
f = open(root + "/" + f, 'rb')
#Now I want to do something like this, but don't know which method to call:
#tags = mmpython.process_file(f)
# do something with the creation date
Can someone give me a hint on how I can use the MMPython library?
Thanks!!!
PS. I've looked at some other threads on this, such as:
Link to thread: this one didn't make sense to me.
Link to thread: this one worked great for .mov files but not for my .mp4s; it said the creation date was 1946.
Link to thread: this one is one of the threads that suggested MMPython, but like I said, I don't know how to use it.
Here is a well-commented code example I found which will show you how to use mmpython.
This module extracts metadata from new media files, using mmpython,
and provides utilities for converting metadata between formats.
# Copyright (C) 2005 Micah Dowty <micah#navi.cx>
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
import md5, os, cPickle
import mmpython
from mmpython.audio import mp3info
import sqlite
from RioKarma import Paths
class RidCalculator:
"""This object calculates the RID of a file- a sparse digest used by Rio Karma.
For files <= 64K, this is the file's md5sum. For larger files, this is the XOR
of three md5sums, from 64k blocks in the beginning, middle, and end.
"""
def fromSection(self, fileObj, start, end, blockSize=0x10000):
"""This needs a file-like object, as well as the offset and length of the portion
the RID is generated from. Beware that there is a special case for MP3 files.
"""
# It's a short file, compute only one digest
if end-start <= blockSize:
fileObj.seek(start)
return md5.md5(fileObj.read(end-start)).hexdigest()
# Three digests for longer files
fileObj.seek(start)
a = md5.md5(fileObj.read(blockSize)).digest()
fileObj.seek(end - blockSize)
b = md5.md5(fileObj.read(blockSize)).digest()
fileObj.seek((start + end - blockSize) / 2)
c = md5.md5(fileObj.read(blockSize)).digest()
# Combine the three digests
return ''.join(["%02x" % (ord(a[i]) ^ ord(b[i]) ^ ord(c[i])) for i in range(16)])
def fromFile(self, filename, length=None, mminfo=None):
"""Calculate the RID from a file, given its name. The file's length and
mmpython results may be provided if they're known, to avoid duplicating work.
"""
if mminfo is None:
mminfo = mmpython.parse(filename)
f = open(filename, "rb")
if length is None:
f.seek(0, 2)
length = f.tell()
f.seek(0)
# Is this an MP3 file? For some silliness we have to skip the header
# and the last 128 bytes of the file. mmpython can tell us where the
# header starts, but only in a somewhat ugly way.
if isinstance(mminfo, mmpython.audio.eyed3info.eyeD3Info):
try:
offset = mp3info.MPEG(f)._find_header(f)[0]
except ZeroDivisionError:
# This is a bit of a kludge, since mmpython seems to crash
# here on some MP3s for a currently-unknown reason.
print "WARNING, mmpython got a div0 error on %r" % filename
offset = 0
if offset < 0:
# Hmm, it couldn't find the header? Set this to zero
# so we still get a usable RID, but it probably
# won't strictly be a correct RID.
offset = 0
f.seek(0)
return self.fromSection(f, offset, length-128)
# Otherwise, use the whole file
else:
return self.fromSection(f, 0, length)
class BaseCache:
"""This is an abstract base class for objects that cache metadata
dictionaries on disk. The cache is implemented as a sqlite database,
with a 'dict' table holding administrative key-value data, and a
'files' table holding both a pickled representation of the metadata
and separate columns for all searchable keys.
"""
# This must be defined by subclasses as a small integer that changes
# when any part of the database schema or our storage format changes.
schemaVersion = None
# This is the template for our SQL schema. All searchable keys are
# filled in automatically, but other items may be added by subclasses.
schemaTemplate = """
CREATE TABLE dict
(
name VARCHAR(64) PRIMARY KEY,
value TEXT
);
CREATE TABLE files
(
%(keys)s,
_pickled TEXT NOT NULL
);
"""
# A list of searchable keys, used to build the schema and validate queries
searchableKeys = None
keyType = "VARCHAR(255)"
# The primary key is what ensures a file's uniqueness. Inserting a file
# with a primary key identical to an existing one will update that
# file rather than creating a new one.
primaryKey = None
def __init__(self, name):
self.name = name
self.connection = None
def open(self):
"""Open the cache, creating it if necessary"""
if self.connection is not None:
return
self.connection = sqlite.connect(Paths.getCache(self.name))
self.cursor = self.connection.cursor()
# See what version of the database we got. If it's empty
# or it's old, we need to reset it.
try:
version = self._dictGet('schemaVersion')
except sqlite.DatabaseError:
version = None
if version != str(self.schemaVersion):
self.empty()
def close(self):
if self.connection is not None:
self.sync()
self.connection.close()
self.connection = None
def _getSchema(self):
"""Create a complete schema from our schema template and searchableKeys"""
keys = []
for key in self.searchableKeys:
type = self.keyType
if key == self.primaryKey:
type += " PRIMARY KEY"
keys.append("%s %s" % (key, type))
return self.schemaTemplate % dict(keys=', '.join(keys))
def _encode(self, obj):
"""Encode an object that may not be a plain string"""
if type(obj) is unicode:
obj = obj.encode('utf-8')
elif type(obj) is not str:
obj = str(obj)
return "'%s'" % sqlite.encode(obj)
def _dictGet(self, key):
"""Return a value stored in the persistent dictionary. Returns None if
the key has no matching value.
"""
self.cursor.execute("SELECT value FROM dict WHERE name = '%s'" % key)
row = self.cursor.fetchone()
if row:
return sqlite.decode(row[0])
def _dictSet(self, key, value):
"""Create or update a value stored in the persistent dictionary"""
encodedValue = self._encode(value)
# First try inserting a new item
try:
self.cursor.execute("INSERT INTO dict (name, value) VALUES ('%s', %s)" %
(key, encodedValue))
except sqlite.IntegrityError:
# Violated the primary key constraint, update an existing item
self.cursor.execute("UPDATE dict SET value = %s WHERE name = '%s'" % (
encodedValue, key))
def sync(self):
"""Synchronize in-memory parts of the cache with disk"""
self.connection.commit()
def empty(self):
"""Reset the database to a default empty state"""
# Find and destroy every table in the database
self.cursor.execute("SELECT tbl_name FROM sqlite_master WHERE type='table'")
tables = [row.tbl_name for row in self.cursor.fetchall()]
for table in tables:
self.cursor.execute("DROP TABLE %s" % table)
# Apply the schema
self.cursor.execute(self._getSchema())
self._dictSet('schemaVersion', self.schemaVersion)
def _insertFile(self, d):
"""Insert a new file into the cache, given a dictionary of its metadata"""
# Make name/value lists for everything we want to update
dbItems = {'_pickled': self._encode(cPickle.dumps(d, -1))}
for column in self.searchableKeys:
if column in d:
dbItems[column] = self._encode(d[column])
# First try inserting a new row
try:
names = dbItems.keys()
self.cursor.execute("INSERT INTO files (%s) VALUES (%s)" %
(",".join(names), ",".join([dbItems[k] for k in names])))
except sqlite.IntegrityError:
# Violated the primary key constraint, update an existing item
self.cursor.execute("UPDATE files SET %s WHERE %s = %s" % (
", ".join(["%s = %s" % i for i in dbItems.iteritems()]),
self.primaryKey, self._encode(d[self.primaryKey])))
def _deleteFile(self, key):
"""Delete a File from the cache, given its primary key"""
self.cursor.execute("DELETE FROM files WHERE %s = %s" % (
self.primaryKey, self._encode(key)))
def _getFile(self, key):
"""Return a metadata dictionary given its primary key"""
self.cursor.execute("SELECT _pickled FROM files WHERE %s = %s" % (
self.primaryKey, self._encode(key)))
row = self.cursor.fetchone()
if row:
return cPickle.loads(sqlite.decode(row[0]))
def _findFiles(self, **kw):
"""Search for files. The provided keywords must be searchable.
Yields a list of details dictionaries, one for each match.
Any keyword can be None (matches anything) or it can be a
string to match. Keywords that aren't provided are assumed
to be None.
"""
constraints = []
for key, value in kw.iteritems():
if key not in self.searchableKeys:
raise ValueError("Key name %r is not searchable" % key)
constraints.append("%s = %s" % (key, self._encode(value)))
if not constraints:
constraints.append("1")
self.cursor.execute("SELECT _pickled FROM files WHERE %s" %
" AND ".join(constraints))
row = None
while 1:
row = self.cursor.fetchone()
if not row:
break
yield cPickle.loads(sqlite.decode(row[0]))
def countFiles(self):
"""Return the number of files cached"""
self.cursor.execute("SELECT COUNT(_pickled) FROM files")
return int(self.cursor.fetchone()[0])
def updateStamp(self, stamp):
"""The stamp for this cache is any arbitrary value that is expected to
change when the actual data on the device changes. It is used to
check the cache's validity. This function updates the stamp from
a value that is known to match the cache's current contents.
"""
self._dictSet('stamp', stamp)
def checkStamp(self, stamp):
"""Check whether a provided stamp matches the cache's stored stamp.
This should be used when you have a stamp that matches the actual
data on the device, and you want to see if the cache is still valid.
"""
return self._dictGet('stamp') == str(stamp)
class LocalCache(BaseCache):
"""This is a searchable metadata cache for files on the local disk.
It can be used to speed up repeated metadata lookups for local files,
but more interestingly it can be used to provide full metadata searching
on local music files.
"""
schemaVersion = 1
searchableKeys = ('type', 'rid', 'title', 'artist', 'source', 'filename')
primaryKey = 'filename'
def lookup(self, filename):
"""Return a details dictionary for the given filename, using the cache if possible"""
filename = os.path.realpath(filename)
# Use the mtime as a stamp to see if our cache is still valid
mtime = os.stat(filename).st_mtime
cached = self._getFile(filename)
if cached and int(cached.get('mtime')) == int(mtime):
# Yay, still valid
return cached['details']
# Nope, generate a new dict and cache it
details = {}
Converter().detailsFromDisk(filename, details)
generated = dict(
type = details.get('type'),
rid = details.get('rid'),
title = details.get('title'),
artist = details.get('artist'),
source = details.get('source'),
mtime = mtime,
filename = filename,
details = details,
)
self._insertFile(generated)
return details
def findFiles(self, **kw):
"""Search for files that match all given search keys. This returns an iterator
over filenames, skipping any files that aren't currently valid in the cache.
"""
for cached in self._findFiles(**kw):
try:
mtime = os.stat(cached['filename']).st_mtime
except OSError:
pass
else:
if cached.get('mtime') == mtime:
yield cached['filename']
def scan(self, path):
"""Recursively scan all files within the specified path, creating
or updating their cache entries.
"""
for root, dirs, files in os.walk(path):
for name in files:
filename = os.path.join(root, name)
self.lookup(filename)
# checkpoint this after every directory
self.sync()
_defaultLocalCache = None
def getLocalCache(create=True):
"""Get the default instance of LocalCache"""
global _defaultLocalCache
if (not _defaultLocalCache) and create:
_defaultLocalCache = LocalCache("local")
_defaultLocalCache.open()
return _defaultLocalCache
class Converter:
"""This object manages the connection between different kinds of
metadata- the data stored within a file on disk, mmpython attributes,
Rio attributes, and file extensions.
"""
# Maps mmpython classes to codec names for all formats the player
# hardware supports.
codecNames = {
mmpython.audio.eyed3info.eyeD3Info: 'mp3',
mmpython.audio.mp3info.MP3Info: 'mp3',
mmpython.audio.flacinfo.FlacInfo: 'flac',
mmpython.audio.pcminfo.PCMInfo: 'wave',
mmpython.video.asfinfo.AsfInfo: 'wma',
mmpython.audio.ogginfo.OggInfo: 'vorbis',
}
# Maps codec names to extensions. Identity mappings are the
# default, so they are omitted.
codecExtensions = {
'wave': 'wav',
'vorbis': 'ogg',
}
def filenameFromDetails(self, details,
unicodeEncoding = 'utf-8'):
"""Determine a good filename to use for a file with the given metadata
in the Rio 'details' format. If it's a data file, this will use the
original file as stored in 'title'.
Otherwise, it uses Navi's naming convention: Artist_Name/album_name/##_track_name.extension
"""
if details.get('type') == 'taxi':
return details['title']
# Start with just the artist...
name = details.get('artist', 'None').replace(os.sep, "").replace(" ", "_") + os.sep
album = details.get('source')
if album:
name += album.replace(os.sep, "").replace(" ", "_").lower() + os.sep
track = details.get('tracknr')
if track:
name += "%02d_" % track
name += details.get('title', 'None').replace(os.sep, "").replace(" ", "_").lower()
codec = details.get('codec')
extension = self.codecExtensions.get(codec, codec)
if extension:
name += '.' + extension
return unicode(name).encode(unicodeEncoding, 'replace')
def detailsFromDisk(self, filename, details):
"""Automagically load media metadata out of the provided filename,
adding entries to details. This works on any file type
mmpython recognizes, and other files should be tagged
appropriately for Rio Taxi.
"""
info = mmpython.parse(filename)
st = os.stat(filename)
# Generic details for any file. Note that we start out assuming
# all files are unreadable, and label everything for Rio Taxi.
# Later we'll mark supported formats as music.
details['length'] = st.st_size
details['type'] = 'taxi'
details['rid'] = RidCalculator().fromFile(filename, st.st_size, info)
# We get the bulk of our metadata via mmpython if possible
if info:
self.detailsFromMM(info, details)
if details['type'] == 'taxi':
# All taxi files get their filename as their title, regardless of what mmpython said
details['title'] = os.path.basename(filename)
# Taxi files also always get a codec of 'taxi'
details['codec'] = 'taxi'
# Music files that still don't get a title get their filename minus the extension
if not details.get('title'):
details['title'] = os.path.splitext(os.path.basename(filename))[0]
def detailsFromMM(self, info, details):
"""Update Rio-style 'details' metadata from MMPython info"""
# Mime types aren't implemented consistently in mmpython, but
# we can look at the type of the returned object to decide
# whether this is a format that the Rio probably supports.
# This dictionary maps mmpython clases to Rio codec names.
for cls, codec in self.codecNames.iteritems():
if isinstance(info, cls):
details['type'] = 'tune'
details['codec'] = codec
break
# Map simple keys that don't require any hackery
for fromKey, toKey in (
('artist', 'artist'),
('title', 'title'),
('album', 'source'),
('date', 'year'),
('samplerate', 'samplerate'),
):
v = info[fromKey]
if v is not None:
details[toKey] = v
# The rio uses a two-letter prefix on bit rates- the first letter
# is 'f' or 'v', presumably for fixed or variable. The second is
# 'm' for mono or 's' for stereo. There doesn't seem to be a good
# way to get VBR info out of mmpython, so currently this always
# reports a fixed bit rate. We also have to kludge a bit because
# some metadata sources give us bits/second while some give us
# kilobits/second. And of course, there are multiple ways of
# reporting stereo...
kbps = info['bitrate']
if type(kbps) in (int, float) and kbps > 0:
stereo = bool( (info['channels'] and info['channels'] >= 2) or
(info['mode'] and info['mode'].find('stereo') >= 0) )
if kbps > 8000:
kbps = kbps // 1000
details['bitrate'] = ('fm', 'fs')[stereo] + str(kbps)
# If mmpython gives us a length it seems to always be in seconds,
# whereas the Rio expects milliseconds.
length = info['length']
if length:
details['duration'] = int(length * 1000)
# mmpython often gives track numbers as a fraction- current/total.
# The Rio only wants the current track, and we might as well also
# strip off leading zeros and such.
trackNo = info['trackno']
if trackNo:
details['tracknr'] = int(trackNo.split("/", 1)[0])
Reference: http://svn.navi.cx/misc/trunk/rio-karma/python/RioKarma/Metadata.py
Further:
Including Python modules
You should look at the os.stat function:
https://docs.python.org/2/library/os.html
os.stat returns the file creation and modification times (st_ctime and st_mtime).
It should be something like this:
import os

st = os.stat(full_file_path)
file_ctime = st.st_ctime
print(file_ctime)
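For the photos specifically, the exifread package the question already imports can read the capture time from the EXIF data; a rough sketch (one possible approach, not the only one) combining it with the os.stat fallback:
import os
import exifread

def creation_date(path):
    # JPEGs usually carry the capture time in EXIF
    if path.lower().endswith((".jpg", ".jpeg")):
        with open(path, "rb") as fh:
            tags = exifread.process_file(fh, details=False)
        tag = tags.get("EXIF DateTimeOriginal")
        if tag:
            return str(tag)  # e.g. '2014:05:01 12:34:56'
    # .mov/.mp4 metadata is container-specific; fall back to the filesystem timestamp
    return os.stat(path).st_mtime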
This is a strange question, I know... I have a regular expression like:
rex = r"at (?P<hour>[0-2][0-9]) send email to (?P<name>\w*):? (?P<message>.+)"
so if I match that like this:
match = re.match(rex, "at 10 send email to bob: hi bob!")
match.groupdict() gives me this dict:
{"hour": "10", "name": "bob", "message": "hi bob!"}
My question is: given the dict above and rex, can I make a function that returns the original text? I know that many different texts can produce the same dict (in this case the ':' after the name is optional), but I just want one of the infinitely many texts that would match and produce the dict above.
Using inverse_regex:
"""
http://www.mail-archive.com/python-list#python.org/msg125198.html
"""
import itertools as IT
import sre_constants as sc
import sre_parse
import string
# Generate strings that match a given regex
category_chars = {
sc.CATEGORY_DIGIT : string.digits,
sc.CATEGORY_SPACE : string.whitespace,
sc.CATEGORY_WORD : string.digits + string.letters + '_'
}
def unique_extend(res_list, list):
for item in list:
if item not in res_list:
res_list.append(item)
def handle_any(val):
"""
This is different from normal regexp matching. It only matches
printable ASCII characters.
"""
return string.printable
def handle_branch((tok, val)):
all_opts = []
for toks in val:
opts = permute_toks(toks)
unique_extend(all_opts, opts)
return all_opts
def handle_category(val):
return list(category_chars[val])
def handle_in(val):
out = []
for tok, val in val:
out += handle_tok(tok, val)
return out
def handle_literal(val):
return [chr(val)]
def handle_max_repeat((min, max, val)):
"""
Handle a repeat token such as {x,y} or ?.
"""
subtok, subval = val[0]
if max > 5000:
# max is the number of cartesian join operations needed to be
# carried out. More than 5000 consumes way to much memory.
# raise ValueError("To many repetitions requested (%d)" % max)
max = 5000
optlist = handle_tok(subtok, subval)
iterlist = []
for x in range(min, max + 1):
joined = IT.product(*[optlist]*x)
iterlist.append(joined)
return (''.join(it) for it in IT.chain(*iterlist))
def handle_range(val):
lo, hi = val
return (chr(x) for x in range(lo, hi + 1))
def handle_subpattern(val):
return list(permute_toks(val[1]))
def handle_tok(tok, val):
"""
Returns a list of strings of possible permutations for this regexp
token.
"""
handlers = {
sc.ANY : handle_any,
sc.BRANCH : handle_branch,
sc.CATEGORY : handle_category,
sc.LITERAL : handle_literal,
sc.IN : handle_in,
sc.MAX_REPEAT : handle_max_repeat,
sc.RANGE : handle_range,
sc.SUBPATTERN : handle_subpattern}
try:
return handlers[tok](val)
except KeyError, e:
fmt = "Unsupported regular expression construct: %s"
raise ValueError(fmt % tok)
def permute_toks(toks):
"""
Returns a generator of strings of possible permutations for this
regexp token list.
"""
lists = [handle_tok(tok, val) for tok, val in toks]
return (''.join(it) for it in IT.product(*lists))
########## PUBLIC API ####################
def ipermute(p):
return permute_toks(sre_parse.parse(p))
You could apply the substitutions given rex and data, and then use inverse_regex.ipermute to generate strings that match the original regex:
import re
import itertools as IT
import inverse_regex as ire
rex = r"(?:at (?P<hour>[0-2][0-9])|today) send email to (?P<name>\w*):? (?P<message>.+)"
match = re.match(rex, "at 10 send email to bob: hi bob!")
data = match.groupdict()
del match
new_regex = re.sub(r'[(][?]P<([^>]+)>[^)]*[)]', lambda m: data.get(m.group(1)), rex)
for s in IT.islice(ire.ipermute(new_regex), 10):
print(s)
yields
today send email to bob hi bob!
today send email to bob: hi bob!
at 10 send email to bob hi bob!
at 10 send email to bob: hi bob!
Note: I modified the original inverse_regex to not raise a ValueError when the regex contains *s. Instead, the * is changed to be effectively like {,5000} so you'll at least get some permutations.
This is one of the texts that will match the regex:
'at {hour} send email to {name}: {message}'.format(**match.groupdict())