Truecasing - SpaCy - python

My intent is to capitalize words based on POS tags, which I could achieve with the help of the link below.
How can I best determine the correct capitalization for a word?
How can I achieve similar results using spaCy?
def truecase(doc):
    truecased_sents = [] # list of truecased sentences
    tagged_sent = token.tag_([word.lower() for token in doc])
    normalized_sent = [w.capitalize() if t in ["NN","NNS"] else w for (w,t) in tagged_sent]
    normalized_sent[0] = normalized_sent[0].capitalize()
    string = re.sub(" (?=[\.,'!?:;])", "", ' '.join(normalized_sent))
    return string
It throws this error:
tagged_sent = token.tag_([word.lower() for token in doc])
NameError: global name 'token' is not defined
How do I declare token as global and solve this issue? Is my approach correct?

import spacy, re
nlp = spacy.load('en_core_web_sm')
doc = nlp(u'autonomous cars shift insurance liability toward manufacturers.')
tagged_sent = [(w.text, w.tag_) for w in doc]
normalized_sent = [w.capitalize() if t in ["NN","NNS"] else w for (w,t) in tagged_sent]
normalized_sent[0] = normalized_sent[0].capitalize()
string = re.sub(" (?=[\.,'!?:;])", "", ' '.join(normalized_sent))
print(string)
Output:
Autonomous Cars shift Insurance Liability toward Manufacturers.
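For completeness, here is a sketch of the same fix folded back into the asker's truecase(doc) function; it is just the snippet above repackaged, nothing beyond the answer's approach:
import re
import spacy

nlp = spacy.load('en_core_web_sm')

def truecase(doc):
    # Pair each token's text with its fine-grained POS tag instead of the
    # undefined token.tag_(...) call from the question.
    tagged_sent = [(w.text, w.tag_) for w in doc]
    # Capitalize singular/plural nouns, then force the first word to be capitalized.
    normalized_sent = [w.capitalize() if t in ["NN", "NNS"] else w for (w, t) in tagged_sent]
    normalized_sent[0] = normalized_sent[0].capitalize()
    # Re-join and remove the space that tokenization leaves before punctuation.
    return re.sub(r" (?=[\.,'!?:;])", "", ' '.join(normalized_sent))

print(truecase(nlp(u'autonomous cars shift insurance liability toward manufacturers.')))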

Related

Iterate over spacy tokens and extract the BILOU tags

How should I annotate the following sentence with BILOU tags?
I have a function called get_dataset2. What this function does is return the tokens, POS tags, and BILOU tags, but the thing is that I am stuck at the BILOU tags.
Function:
def get_dataset2(sent):
    head_entity = ""
    candidate_entity = ""
    prv_tok_dep = ""
    prv_tok_text = ""
    prefix = ""
    words_ = []
    label_ = []
    tags_ = []
    doc = nlp(sent)
    for tok in doc:
        words_.append(tok.text)
        label_.append(tok.pos_)
        if(tok.text=='JUDGMENT'):
            tags_.append('O')
        next_token1 = doc[tok.i+1]
        #next_tok_loc1 = tok.i+1
        next_token2 = doc[tok.i+2]
        #next_tok_loc2 = tok.i+2
        if(tok.text==next_token1 and (next_token2.pos_=='PUNCT' or next_token2.pos_=='NUM')):
            tags_.append('U-Parties')
            #if(next_token1.pos_=='PROPN' and next_token2.pos_=='PROPN'):
            #tags_.append('U-Parties')
        else:
            tags_.append('O')
    return (pd.DataFrame({'Token': words_, 'POS': label_, 'Tags': tags_}))
Problem: when I pass the sentence get_dataset2('JUDGMENT Gajendragadkar, J. 1.') to that function, it successfully extracts the tokens and POS tags but not the BILOU tags.
It should look like:
Tokens POS BILOU Tags
JUDGMENT PROPN O
Gajendragadkar PROPN U-Parties
, PUNCT O
I want to iterate over the tokens so that, after JUDGMENT, I can look at the second and third tokens and then assign the BILOU tag: if the name is a single token, it gets U-Parties.
Thanks!
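A minimal sketch of one way to do that look-ahead, assuming the pattern is always JUDGMENT followed by a single-token name and then punctuation or a number; the function name get_bilou_dataset is purely illustrative:
import spacy
import pandas as pd

nlp = spacy.load('en_core_web_sm')

def get_bilou_dataset(sent):
    doc = nlp(sent)
    # Default every token to 'O', then upgrade the token right after 'JUDGMENT'
    # to 'U-Parties' when the token after that is punctuation or a number,
    # i.e. the name consists of a single token.
    tags = ['O'] * len(doc)
    for tok in doc:
        if tok.text == 'JUDGMENT' and tok.i + 2 < len(doc):
            candidate = doc[tok.i + 1]
            follower = doc[tok.i + 2]
            if follower.pos_ in ('PUNCT', 'NUM'):
                tags[candidate.i] = 'U-Parties'
    return pd.DataFrame({'Token': [t.text for t in doc],
                         'POS': [t.pos_ for t in doc],
                         'Tags': tags})

print(get_bilou_dataset('JUDGMENT Gajendragadkar, J. 1.'))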

Keywords extraction in Python - How to handle hyphenated compound words

I'm trying to perform keyphrase extraction with Python, using KeyBert and pke PositionRank. You can see an extract of my code below.
from keybert import KeyBERT
from keyphrase_vectorizers import KeyphraseCountVectorizer
import pke
text = "The life-cycle Global Warming Potential of the building resulting from the construction has been calculated for each stage in the life-cycle and is disclosed to investors and clients on demand" #text_cleaning(df_tassonomia.iloc[1077].text, sentence_adjustment, stop_words)
# Pke
extractor = pke.unsupervised.PositionRank()
extractor.load_document(text, language='en')
extractor.candidate_selection(maximum_word_number = 5)
extractor.candidate_weighting(window = 10)
keyphrases = extractor.get_n_best(n=10)
print(keyphrases)
# KeyBert
kw_model = KeyBERT(model = "all-mpnet-base-v2")
keyphrases_2 = kw_model.extract_keywords(docs=text,
                                         vectorizer=KeyphraseCountVectorizer(),
                                         keyphrase_ngram_range=(1, 5),
                                         top_n=10)
print("")
print(keyphrases_2)
and here are the results:
[('cycle global warming potential', 0.44829175082921835), ('life', 0.17858359644549557), ('cycle', 0.15775994057934534), ('building', 0.09131084381406684), ('construction', 0.08860454878871142), ('investors', 0.05426710724030216), ('clients', 0.054111700289631526), ('stage', 0.045672396861507744), ('demand', 0.039158055731066406)]
[('cycle global warming potential', 0.5444), ('building', 0.4479), ('construction', 0.3476), ('investors', 0.1967), ('clients', 0.1519), ('demand', 0.1484), ('cycle', 0.1312), ('stage', 0.0931), ('life', 0.0847)]
I would like hyphenated compound words (such as life-cycle in the example) to be treated as a single word, but I cannot understand how to exclude the - from the list of word separators.
Thank you in advance for any help.
Francesca
This could be a silly workaround, but it may help:
from keybert import KeyBERT
from keyphrase_vectorizers import KeyphraseCountVectorizer
import pke
text = "The life-cycle Global Warming Potential of the building
resulting from the construction has been calculated for each stage in
the life-cycle and is disclosed to investors and clients on demand"
# Pke
tokens = text.split()
original = set([x for x in tokens if "_" in x])  # tokens that already contained an underscore
text = text.replace("-", "_")
extractor = pke.unsupervised.PositionRank()
extractor.load_document(text, language='en')
extractor.candidate_selection(maximum_word_number=5)
extractor.candidate_weighting(window=10)
keyphrases = extractor.get_n_best(n=10)
keyphrases_replaced = []
for pair in keyphrases:
    if "_" in pair[0] and pair[0] not in original:
        keyphrases_replaced.append((pair[0].replace("_", "-"), pair[1]))
    else:
        keyphrases_replaced.append(pair)
print(keyphrases_replaced)
# KeyBert
kw_model = KeyBERT(model="all-mpnet-base-v2")  # as defined in the question
keyphrases_2 = kw_model.extract_keywords(docs=text,
                                         vectorizer=KeyphraseCountVectorizer(),
                                         keyphrase_ngram_range=(1, 5),
                                         top_n=10)
print("")
print(keyphrases_2)
The output should look like this:
[('life-cycle global warming potential', 0.5511001220016548), ('life-cycle', 0.20123353586644233), ('construction', 0.11945270995269436), ('building', 0.10637157845606555), ('investors', 0.06675114967366767), ('stage', 0.05503532672910801), ('clients', 0.0507262942318816), ('demand', 0.05056281895492815)]
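Note that the KeyBERT call above receives the underscore-replaced text, so its phrases may contain underscores too; the same mapping used for the pke results can be applied to them, for example:
# Map the underscores back to hyphens in the KeyBERT phrases as well
keyphrases_2_replaced = [(phrase.replace("_", "-") if "_" in phrase and phrase not in original else phrase, score)
                         for phrase, score in keyphrases_2]
print(keyphrases_2_replaced)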
I hope this helps :)
The issue has been fixed in the latest pke updates: https://github.com/boudinfl/pke/issues/195
import pke
extractor = pke.unsupervised.TopicRank()
extractor.load_document(input='BERT is a state-of-the-art model.', language='en')
extractor.grammar_selection(grammar="NP: {<ADJ>*<NOUN|PROPN>+}")
print(extractor.candidates.keys())
now returns this output:
dict_keys(['bert', 'state-of-the-art model'])

I need to automatize the extraction of a logical statement (SWRL) from sentences in English

(Excuse me for my English, I'm also new at this, so be gentle, thank you)
I'm trying to extract a logical statement (SWRL) from any possible sentence that contains actions and conditions.
This is the kind of logical statement I'd like to obtain:
IF (CONDITION) THEN (ACTION | NOT ACTION | ACTION OR NOT ACTION)
I've been trying to apply some NLP techniques with spaCy and the Stanford NLP library, but my lack of knowledge of English grammatical structures makes it almost impossible for me.
I'd like to know if someone could help me with this research, either with ideas or with unknown libraries for me.
For example:
import nltk
import spacy
nlp = spacy.load('en_core_web_sm')
sent="The speed limit is 90 kilometres per hour on roads outside built-up areas."
doc=nlp(sent)
Obtaining the root:
def sent_root(sent):
    for index, token in enumerate(sent):
        if token.head == token:
            return token, index
Out: (is, 3)
Obtaining the subject:
def sent_subj(sent):
    for index, token in enumerate(sent):
        if token.dep_ == 'nsubj':
            return token, index
Out: (limit, 2)
Obtaining the children (dependencies of the word):
def sent_child(token):
    complete_subj = ''
    for child in token.children:
        if(child.is_punct == False):
            if(child.dep_ == 'compound'):
                complete_subj += child.text + ' ' + token.text + ' '
            else:
                complete_subj += child.text + ' '
        for child_token in child.children:
            if(child.is_punct == False):
                complete_subj += child_token.text + ' '
    return complete_subj
Out: 'The speed limit '
Doc ents + root:
def doc_ents_root(sent, root):
    ents_root = root.text + ' '
    for token in sent.ents:
        ents_root += token.text + ' '
    return ents_root
Out: 'is 90 kilometres per hour '
Extracting the action:
def action(sent):
    # Obtaining the sent root
    root, root_idx = sent_root(sent)
    # Obtaining the subject
    subj, subj_idx = sent_subj(sent)
    # Obtaining the whole subject (subj + comps)
    complete_subj = sent_child(subj)
    complete_ents = doc_ents_root(sent, root)
    return complete_subj + complete_ents
Applying all the functions:
action(doc)
Out: 'A traffic light with signal indicates '
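As a starting point, here is a minimal sketch (not a full SWRL extractor) that fills the IF/THEN template by treating an adverbial clause introduced by 'if', 'when', or 'unless' as the CONDITION and the rest of the sentence as the ACTION. It assumes the parser attaches such clauses as advcl with a mark child and reuses the nlp object loaded above; the function name to_rule and the marker list are illustrative choices:
def to_rule(doc):
    # Collect the tokens of an adverbial clause introduced by a conditional marker.
    condition_ids = set()
    for tok in doc:
        if tok.dep_ == 'advcl' and any(child.dep_ == 'mark' and child.lower_ in ('if', 'when', 'unless')
                                       for child in tok.children):
            condition_ids = {t.i for t in tok.subtree}
    condition = ' '.join(t.text for t in doc if t.i in condition_ids and t.dep_ != 'mark')
    action_text = ' '.join(t.text for t in doc if t.i not in condition_ids and not t.is_punct)
    return "IF ({}) THEN ({})".format(condition or "TRUE", action_text)

print(to_rule(nlp("If the road is wet, the speed limit is 80 kilometres per hour.")))
# Expected (roughly): IF (the road is wet) THEN (the speed limit is 80 kilometres per hour)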

Spacy - Chunk NE tokens

Let's say that I have a document, like so:
import spacy
nlp = spacy.load('en')
doc = nlp('My name is John Smith')
[t for t in doc]
> [My, name, is, John, Smith]
Spacy is intelligent enough to realize that 'John Smith' is a multi-token named entity:
[e for e in doc.ents]
> [John Smith]
How can I make it chunk named entities into discrete tokens, like so:
> [My, name, is, John Smith]
Spacy documentation on NER says that you can access token entity annotations using the token.ent_iob_ and token.ent_type_ attributes.
https://spacy.io/usage/linguistic-features#accessing
Example:
import spacy
nlp = spacy.load('en')
doc = nlp('My name is John Smith')
ne = []
merged = []
for t in doc:
    # "O" -> current token is not part of the NE
    if t.ent_iob_ == "O":
        if len(ne) > 0:
            merged.append(" ".join(ne))
            ne = []
        merged.append(t.text)
    else:
        ne.append(t.text)
if len(ne) > 0:
    merged.append(" ".join(ne))
print(merged)
This will print:
['My', 'name', 'is', 'John Smith']
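Depending on your spaCy version, an alternative is the built-in merge_entities pipeline component, which retokenizes the Doc so each named entity becomes a single token. A sketch using the v3-style API and the en_core_web_sm model:
import spacy

nlp = spacy.load('en_core_web_sm')
nlp.add_pipe('merge_entities')  # in spaCy v2: nlp.add_pipe(nlp.create_pipe('merge_entities'))
doc = nlp('My name is John Smith')
print([t.text for t in doc])
# ['My', 'name', 'is', 'John Smith']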

I wish to extract compound noun-adjective pairs from a sentence. So, basically, I want something like:

For the adjective:
"The company's customer service was terrible."
{customer service, terrible}
For the verb:
"They kept increasing my phone bill"
{phone bill, increasing}
This is a follow-up question branching from this posting.
However, I'm trying to find adjectives and verbs corresponding to multi-token phrases/compound nouns such as "customer service" using spaCy.
I'm not sure how to do this with spacy, nltk, or any other prepackaged natural language processing software, and I'd appreciate any help!
For simple examples like this, you can use spaCy's dependency parsing with a few simple rules.
First, to identify multi-word nouns similar to the examples given, you can use the "compound" dependency. After parsing a document (e.g., a sentence) with spaCy, use a token's dep_ attribute to find its dependency.
For example, this sentence has two compound nouns:
"The compound dependency identifies compound nouns."
Each token and its dependency is shown below:
import spacy
import pandas as pd
nlp = spacy.load('en')
example_doc = nlp("The compound dependency identifies compound nouns.")
for tok in example_doc:
    print(tok.i, tok, "[", tok.dep_, "]")
>>>0 The [ det ]
>>>1 compound [ compound ]
>>>2 dependency [ nsubj ]
>>>3 identifies [ ROOT ]
>>>4 compound [ compound ]
>>>5 nouns [ dobj ]
>>>6 . [ punct ]
for tok in [tok for tok in example_doc if tok.dep_ == 'compound']:  # Get list of compounds in doc
    noun = example_doc[tok.i: tok.head.i + 1]
    print(noun)
>>>compound dependency
>>>compound nouns
The below function works for your examples. However, it will likely not work for more complicated sentences.
adj_doc = nlp("The company's customer service was terrible.")
verb_doc = nlp("They kept increasing my phone bill")
def get_compound_pairs(doc, verbose=False):
    """Return tuples of (multi-noun word, adjective or verb) for document."""
    compounds = [tok for tok in doc if tok.dep_ == 'compound'] # Get list of compounds in doc
    compounds = [c for c in compounds if c.i == 0 or doc[c.i - 1].dep_ != 'compound'] # Remove middle parts of compound nouns, but avoid index errors
    tuple_list = []
    if compounds:
        for tok in compounds:
            pair_item_1, pair_item_2 = (False, False) # initialize false variables
            noun = doc[tok.i: tok.head.i + 1]
            pair_item_1 = noun
            # If noun is in the subject, we may be looking for adjective in predicate
            # In simple cases, this would mean that the noun shares a head with the adjective
            if noun.root.dep_ == 'nsubj':
                adj_list = [r for r in noun.root.head.rights if r.pos_ == 'ADJ']
                if adj_list:
                    pair_item_2 = adj_list[0]
                if verbose == True: # For trying different dependency tree parsing rules
                    print("Noun: ", noun)
                    print("Noun root: ", noun.root)
                    print("Noun root head: ", noun.root.head)
                    print("Noun root head rights: ", [r for r in noun.root.head.rights if r.pos_ == 'ADJ'])
            if noun.root.dep_ == 'dobj':
                verb_ancestor_list = [a for a in noun.root.ancestors if a.pos_ == 'VERB']
                if verb_ancestor_list:
                    pair_item_2 = verb_ancestor_list[0]
                if verbose == True: # For trying different dependency tree parsing rules
                    print("Noun: ", noun)
                    print("Noun root: ", noun.root)
                    print("Noun root head: ", noun.root.head)
                    print("Noun root head verb ancestors: ", [a for a in noun.root.ancestors if a.pos_ == 'VERB'])
            if pair_item_1 and pair_item_2:
                tuple_list.append((pair_item_1, pair_item_2))
    return tuple_list
get_compound_pairs(adj_doc)
>>>[(customer service, terrible)]
get_compound_pairs(verb_doc)
>>>[(phone bill, increasing)]
get_compound_pairs(example_doc, verbose=True)
>>>Noun: compound dependency
>>>Noun root: dependency
>>>Noun root head: identifies
>>>Noun root head rights: []
>>>Noun: compound nouns
>>>Noun root: nouns
>>>Noun root head: identifies
>>>Noun root head verb ancestors: [identifies]
>>>[(compound nouns, identifies)]
I needed to solve a similar problem and I wanted to share my solution as a spaCy custom component.
import spacy
from spacy.tokens import Token, Span
from spacy.language import Language

@Language.component("compound_chainer")
def find_compounds(doc):
    # force=True so re-running the component does not raise if the extension already exists
    Token.set_extension("is_compound_chain", default=False, force=True)
    com_range = []
    max_ind = len(doc)
    for idx, tok in enumerate(doc):
        if((tok.dep_ == "compound") and (idx < max_ind)):
            com_range.append([idx, idx+1])
    to_remove = []
    intersections = []
    for t1 in com_range:
        for t2 in com_range:
            if(t1 != t2):
                s1 = set(t1)
                s2 = set(t2)
                if(len(s1.intersection(s2)) > 0):
                    to_remove.append(t1)
                    to_remove.append(t2)
                    union = list(s1.union(s2))
                    if union not in intersections:
                        intersections.append(union)
    r = [t for t in com_range if t not in to_remove]
    compound_ranges = r + intersections
    spans = []
    for cr in compound_ranges:
        # Example cr [[0, 1], [3, 4], [12, 13], [16, 17, 18]]
        entity = Span(doc, min(cr), max(cr)+1, label="compound_chain")
        for token in entity:
            token._.set("is_compound_chain", True)
        spans.append(entity)
    doc.ents = list(doc.ents) + spans
    return doc
Github link: https://github.com/eboraks/job-description-nlp-analysis/blob/main/src/components/compound_chainer.py
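A usage sketch for the component above, assuming spaCy v3 (the @Language.component decorator registers it under the name compound_chainer); the example sentence and label check are illustrative:
import spacy

nlp = spacy.load('en_core_web_sm')
# Assumes the component definition above has already been executed/imported,
# which registers the factory name 'compound_chainer'.
nlp.add_pipe('compound_chainer', last=True)
doc = nlp("The company's customer service was terrible.")
print([(ent.text, ent.label_) for ent in doc.ents if ent.label_ == 'compound_chain'])
print([tok.text for tok in doc if tok._.is_compound_chain])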
