Spacy - Chunk NE tokens - python

Let's say that I have a document, like so:
import spacy
nlp = spacy.load('en')
doc = nlp('My name is John Smith')
[t for t in doc]
> [My, name, is, John, Smith]
Spacy is intelligent enough to realize that 'John Smith' is a multi-token named entity:
[e for e in doc.ents]
> [John Smith]
How can I make it chunk named entities into discrete tokens, like so:
> [My, name, is, John Smith]

The spaCy documentation on NER says that you can access token entity annotations using the token.ent_iob_ and token.ent_type_ attributes:
https://spacy.io/usage/linguistic-features#accessing
Example:
import spacy
nlp = spacy.load('en')
doc = nlp('My name is John Smith')
ne = []
merged = []
for t in doc:
    # "O" -> current token is not part of a named entity
    if t.ent_iob_ == "O":
        if len(ne) > 0:
            merged.append(" ".join(ne))
            ne = []
        merged.append(t.text)
    else:
        ne.append(t.text)
# flush a named entity that ends the document
if len(ne) > 0:
    merged.append(" ".join(ne))
print(merged)
This will print:
['My', 'name', 'is', 'John Smith']
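As a side note, newer spaCy versions also ship a built-in merge_entities pipeline component that retokenizes each entity span into a single token, which gives the same result without manual bookkeeping. A minimal sketch, assuming spaCy 3.x and the en_core_web_sm model:
import spacy

nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("merge_entities")  # built-in component: merges each entity span into one token

doc = nlp("My name is John Smith")
print([t.text for t in doc])
# expected: ['My', 'name', 'is', 'John Smith']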

Extracting subject/object in Spacy

I am very new to working with spaCy in Python, and I have an issue: when identifying the subject/object, spaCy doesn't label the whole proper noun as the subject/object. For example, when working with two similar nouns in the same context (e.g. John Doe and John Smith), spaCy confuses Doe and Smith because both are preceded by John.
I was wondering how I could solve this issue, for example by injecting something along the lines of "if Doe follows John, then John Doe is the subject, and if Smith follows the word John, then John Smith is the subject".
Here's what I have so far
if lang == 'en':
    dir_path = r'/User/news/articles/en'
    nlp = en_core_web_lg.load()
    # if a name comes after these words, it is most likely the object
    indObjCombinations = re.compile(r'\b(with|for|against|to|from|without|between)(?:\W+\w+){0,3}?\W+(%s)\b' % '|'.join(names), re.IGNORECASE)
    passiveCombinations = re.compile(r'\b(by)(?:\W+\w+){0,3}?\W+(%s)\b' % '|'.join(names), re.IGNORECASE)
    objCombinations = re.compile(r'\b(received|receives|welcomes|welcomed)(?:\W+\w+){0,3}?\W+(%s)\b' % '|'.join(names), re.IGNORECASE)
    subjCombinations = re.compile(r'\b(%s)(?:\W+\w+){0,1}?\W+(in)\b' % '|'.join(names), re.IGNORECASE)
    subjCombinations_andName = re.compile(r'\b(and)(\W+\w+){0,3}\W+(%s)(\W+\w+){0,3}\W+(are)\b' % '|'.join(names), re.IGNORECASE)
    subjCombinations_nameAnd = re.compile(r'\b(%s)(\W+\w+){0,3}\W+(and)(\W+\w+){0,3}\W+(are)\b' % '|'.join(names), re.IGNORECASE)
    subjBeginning = re.compile(r'\b^(%s)(\W+\w+){0,1}\W*(:)\b' % '|'.join(names), re.IGNORECASE)

def getSubsFromConjunctions(subs):
    moreSubs = []
    for sub in subs:
        # rights is a generator
        rights = list(sub.rights)
        rightDeps = {tok.lower_ for tok in rights}
        if lang == 'en':
            if "and" in rightDeps:
                moreSubs.extend([tok for tok in rights if tok.dep_ in SUBJECTS or tok.pos_ == "NOUN"])
                if len(moreSubs) > 0:
                    moreSubs.extend(getSubsFromConjunctions(moreSubs))
    return moreSubs

def getObjsFromConjunctions(objs):
    moreObjs = []
    for obj in objs:
        # rights is a generator
        rights = list(obj.rights)
        rightDeps = {tok.lower_ for tok in rights}
        if lang == 'en':
            if "and" in rightDeps:
                moreObjs.extend([tok for tok in rights if tok.dep_ in OBJECTS or tok.pos_ == "NOUN"])
                if len(moreObjs) > 0:
                    moreObjs.extend(getObjsFromConjunctions(moreObjs))
    return moreObjs
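One possible way to sidestep the confusion, sketched below under the assumption that en_core_web_lg tags both names as entities, is to map each subject/object token back to the named-entity span that contains it, so the full name is recovered instead of just Doe or Smith (covering_entity is a hypothetical helper, not spaCy API):
import spacy

nlp = spacy.load("en_core_web_lg")
doc = nlp("John Doe praised John Smith in the article.")

def covering_entity(tok):
    # return the named-entity span containing the token, or the token itself
    for ent in tok.doc.ents:
        if ent.start <= tok.i < ent.end:
            return ent
    return tok

for tok in doc:
    if tok.dep_ in ("nsubj", "dobj"):
        print(tok.dep_, "->", covering_entity(tok).text)
# expected output, roughly: nsubj -> John Doe, dobj -> John Smith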

How can I pass a table or dataframe instead of text with entity recognition using spaCy

The following link shows how to add multiple EntityRuler patterns with spaCy. The code to do that is below:
import spacy
import pandas as pd
from spacy.pipeline import EntityRuler

nlp = spacy.load('en_core_web_sm', disable=['ner'])
ruler = nlp.add_pipe("entity_ruler")

flowers = ["rose", "tulip", "african daisy"]
for f in flowers:
    ruler.add_patterns([{"label": "flower", "pattern": f}])

animals = ["cat", "dog", "artic fox"]
for a in animals:
    ruler.add_patterns([{"label": "animal", "pattern": a}])

result = {}
doc = nlp("cat and artic fox, plant african daisy")
for ent in doc.ents:
    result[ent.label_] = ent.text

df = pd.DataFrame([result])
print(df)
The output:
      animal         flower
0  artic fox  african daisy
The problem is: how can I pass a dataframe or table instead of the text "cat and artic fox, plant african daisy"?
Imagine that your dataframe is
df = pd.DataFrame({'Text':["cat and artic fox, plant african daisy"]})
You may define a custom method to extract the entities and then use it with Series.apply:
def get_entities(x):
    result = {}
    doc = nlp(x)
    for ent in doc.ents:
        result[ent.label_] = ent.text
    return result
and then
df['Matches'] = df['Text'].apply(get_entities)
>>> df['Matches']
0 {'animal': 'artic fox', 'flower': 'african daisy'}
Name: Matches, dtype: object
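If the dataframe has many rows, it may be faster to batch the texts through nlp.pipe rather than calling nlp once per row inside apply. A rough sketch, reusing the nlp object and the Text column from the example above:
def get_entities_batch(texts):
    # process the texts as a stream and collect {label: text} per document
    results = []
    for doc in nlp.pipe(texts):
        results.append({ent.label_: ent.text for ent in doc.ents})
    return results

df['Matches'] = get_entities_batch(df['Text'])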

spacy-udpipe with pytextrank to extract keywords from non-English text

I've been using pytextrank (https://github.com/DerwenAI/pytextrank/) with spacy and English models for keywords extraction - it works great!
Now I need to process non-English texts, and I found spacy-udpipe (https://github.com/TakeLab/spacy-udpipe), but it doesn't work out of the box. After
nlp = spacy_udpipe.load("sk")
tr = pytextrank.TextRank()
nlp.add_pipe(tr.PipelineComponent, name="textrank", last=True)
doc = nlp(text)
I get tokens with POS and DEP tags, but there is nothing in doc._.phrases (doc.noun_chunks is also empty), and nlp.pipe_names is just ['textrank'].
What should I add to spaCy's pipeline to get it working? I assume pytextrank needs noun_chunks...
Any tip or suggestion where to look will help me - thanks!
Would you mind starting an issue about this on the PyTextRank repo?
https://github.com/DerwenAI/pytextrank/issues
Also, if you could please provide example text to use (in the language requested), we'll try to debug this integration.
Thanks for pointing it out!
Paco
I found a solution! I'm not sure how clean the nlp.Defaults.syntax_iterators = {"noun_chunks": get_chunks} override is, but it works (it's based on how noun_chunks are defined in syntax_iterators.py and __init__.py in spaCy/lang/en).
import spacy_udpipe, spacy, pytextrank
from spacy.matcher import Matcher
from spacy.attrs import POS
def get_chunks(doc):
    np_label = doc.vocab.strings.add("NP")
    matcher = Matcher(nlp.vocab)
    pattern = [{POS: 'ADJ', "OP": "+"}, {POS: {"IN": ["NOUN", "PROPN"]}, "OP": "+"}]
    matcher.add("Adjective(s), (p)noun", None, pattern)
    matches = matcher(doc)
    for match_id, start, end in matches:
        yield start, end, np_label

spacy_udpipe.download("sk")  # download model
nlp = spacy_udpipe.load("sk")
nlp.Defaults.syntax_iterators = {"noun_chunks": get_chunks}  # noun_chunks replacement
tr = pytextrank.TextRank(logger=None)
nlp.add_pipe(tr.PipelineComponent, name="textrank", last=True)
text = "Wikipédia je webová encyklopédia s otvoreným obsahom, ktorú možno slobodne čítať aj upravovať. Je sponzorovaná neziskovou organizáciou Wikimedia Foundation. Má 285 nezávislých jazykových vydaní vrátane slovenského a najrozsiahlejšieho anglického. Popri článkoch encyklopedického typu obsahuje, najmä anglická encyklopédia, aj články podobajúce sa almanachu, atlasu či stránky aktuálnych udalostí. Wikipédia je jedným z najpopulárnejších zdrojov informácií na webe s približne 13 miliardami zobrazení mesačne. Jej rast je skoro exponenciálny. Wikipédii (takmer 2 milióny). Wikipédia bola spustená 15. januára 2001 ako doplnok k expertmi písanej Nupedii. So stále rastúcou popularitou sa Wikipédia stala podhubím pre sesterské projekty ako Wikislovník (Wiktionary), Wikiknihy (Wikibooks) a Wikisprávy (Wikinews). Jej články sú upravované dobrovoľníkmi vo wiki štýle, čo znamená, že články môže meniť v podstate hocikto. Wikipediáni presadzujú politiku „nestranný uhol pohľadu“. Podľa nej relevantné názory ľudí sú sumarizované bez ambície určiť objektívnu pravdu. Vzhľadom na to, že Wikipédia presadzuje otvorenú filozofiu, jej najväčším problémom je vandalizmus a nepresnosť. "
doc = nlp(text)

print("Noun chunks:")
for nc in doc.noun_chunks:
    print(nc)

print("\nKeywords:")
for phrase in doc._.phrases:
    print("{:.4f} {:5d}  {}".format(phrase.rank, phrase.count, phrase.text))
    print(phrase.chunks)

Extract compound noun-adjective pairs from a sentence

I wish to extract compound noun-adjective pairs from a sentence. So, basically I want something like:

For the adjective:
"The company's customer service was terrible."
{customer service, terrible}
For the verb:
"They kept increasing my phone bill"
{phone bill, increasing}
This is a branch question from this posting.
However, I'm trying to find adjectives and verbs corresponding to multi-token phrases/compound nouns such as "customer service" using spaCy.
I'm not sure how to do this with spacy, nltk, or any other prepackaged natural language processing software, and I'd appreciate any help!
For simple examples like this, you can use spaCy's dependency parsing with a few simple rules.
First, to identify multi-word nouns similar to the examples given, you can use the "compound" dependency. After parsing a document (e.g., a sentence) with spaCy, use a token's dep_ attribute to find its dependency.
For example, this sentence has two compound nouns:
"The compound dependency identifies compound nouns."
Each token and its dependency is shown below:
import spacy
import pandas as pd

nlp = spacy.load('en')
example_doc = nlp("The compound dependency identifies compound nouns.")
for tok in example_doc:
    print(tok.i, tok, "[", tok.dep_, "]")
>>>0 The [ det ]
>>>1 compound [ compound ]
>>>2 dependency [ nsubj ]
>>>3 identifies [ ROOT ]
>>>4 compound [ compound ]
>>>5 nouns [ dobj ]
>>>6 . [ punct ]
for tok in [tok for tok in example_doc if tok.dep_ == 'compound']:  # Get list of compounds in doc
    noun = example_doc[tok.i: tok.head.i + 1]
    print(noun)
>>>compound dependency
>>>compound nouns
The below function works for your examples. However, it will likely not work for more complicated sentences.
adj_doc = nlp("The company's customer service was terrible.")
verb_doc = nlp("They kept increasing my phone bill")
def get_compound_pairs(doc, verbose=False):
    """Return tuples of (multi-noun word, adjective or verb) for document."""
    compounds = [tok for tok in doc if tok.dep_ == 'compound']  # Get list of compounds in doc
    compounds = [c for c in compounds if c.i == 0 or doc[c.i - 1].dep_ != 'compound']  # Remove middle parts of compound nouns, but avoid index errors
    tuple_list = []
    if compounds:
        for tok in compounds:
            pair_item_1, pair_item_2 = (False, False)  # initialize false variables
            noun = doc[tok.i: tok.head.i + 1]
            pair_item_1 = noun
            # If noun is in the subject, we may be looking for adjective in predicate
            # In simple cases, this would mean that the noun shares a head with the adjective
            if noun.root.dep_ == 'nsubj':
                adj_list = [r for r in noun.root.head.rights if r.pos_ == 'ADJ']
                if adj_list:
                    pair_item_2 = adj_list[0]
                if verbose == True:  # For trying different dependency tree parsing rules
                    print("Noun: ", noun)
                    print("Noun root: ", noun.root)
                    print("Noun root head: ", noun.root.head)
                    print("Noun root head rights: ", [r for r in noun.root.head.rights if r.pos_ == 'ADJ'])
            if noun.root.dep_ == 'dobj':
                verb_ancestor_list = [a for a in noun.root.ancestors if a.pos_ == 'VERB']
                if verb_ancestor_list:
                    pair_item_2 = verb_ancestor_list[0]
                if verbose == True:  # For trying different dependency tree parsing rules
                    print("Noun: ", noun)
                    print("Noun root: ", noun.root)
                    print("Noun root head: ", noun.root.head)
                    print("Noun root head verb ancestors: ", [a for a in noun.root.ancestors if a.pos_ == 'VERB'])
            if pair_item_1 and pair_item_2:
                tuple_list.append((pair_item_1, pair_item_2))
    return tuple_list
get_compound_pairs(adj_doc)
>>>[(customer service, terrible)]
get_compound_pairs(verb_doc)
>>>[(phone bill, increasing)]
get_compound_pairs(example_doc, verbose=True)
>>>Noun: compound dependency
>>>Noun root: dependency
>>>Noun root head: identifies
>>>Noun root head rights: []
>>>Noun: compound nouns
>>>Noun root: nouns
>>>Noun root head: identifies
>>>Noun root head verb ancestors: [identifies]
>>>[(compound nouns, identifies)]
I needed to solve a similar problem and I wanted to share my solution as a spaCy custom component.
import spacy
from spacy.tokens import Token, Span
from spacy.language import Language

# Register the custom token attribute once, at import time
Token.set_extension("is_compound_chain", default=False)

@Language.component("compound_chainer")
def find_compounds(doc):
    com_range = []
    max_ind = len(doc)
    for idx, tok in enumerate(doc):
        if (tok.dep_ == "compound") and (idx < max_ind):
            com_range.append([idx, idx + 1])
    to_remove = []
    intersections = []
    for t1 in com_range:
        for t2 in com_range:
            if t1 != t2:
                s1 = set(t1)
                s2 = set(t2)
                if len(s1.intersection(s2)) > 0:
                    to_remove.append(t1)
                    to_remove.append(t2)
                    union = list(s1.union(s2))
                    if union not in intersections:
                        intersections.append(union)
    r = [t for t in com_range if t not in to_remove]
    compound_ranges = r + intersections
    spans = []
    for cr in compound_ranges:
        # Example compound_ranges: [[0, 1], [3, 4], [12, 13], [16, 17, 18]]
        entity = Span(doc, min(cr), max(cr) + 1, label="compound_chain")
        for token in entity:
            token._.set("is_compound_chain", True)
        spans.append(entity)
    doc.ents = list(doc.ents) + spans
    return doc
Github link: https://github.com/eboraks/job-description-nlp-analysis/blob/main/src/components/compound_chainer.py
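For completeness, a hedged usage sketch for the component above, assuming spaCy 3.x and that the module defining find_compounds has been imported so the "compound_chainer" factory is registered:
import spacy

nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("compound_chainer", last=True)

doc = nlp("They kept increasing my phone bill")
print([ent.text for ent in doc.ents if ent.label_ == "compound_chain"])
print([tok.text for tok in doc if tok._.is_compound_chain])
# expected, roughly: ['phone bill'] and ['phone', 'bill']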

Truecasing - SpaCy

The intent is to capitalize words based on their POS tags, which I could achieve with the help of the link below:
How can I best determine the correct capitalization for a word?
How can I achieve similar results using spaCy?
def truecase(doc):
    truecased_sents = []  # list of truecased sentences
    tagged_sent = token.tag_([word.lower() for token in doc])
    normalized_sent = [w.capitalize() if t in ["NN","NNS"] else w for (w,t) in tagged_sent]
    normalized_sent[0] = normalized_sent[0].capitalize()
    string = re.sub(" (?=[\.,'!?:;])", "", ' '.join(normalized_sent))
    return string
It throws this error:
tagged_sent = token.tag_([word.lower() for token in doc])
NameError: global name 'token' is not defined
How can I declare token as global and solve this issue? Is my approach correct?
import spacy, re
nlp = spacy.load('en_core_web_sm')
doc = nlp(u'autonomous cars shift insurance liability toward manufacturers.')
tagged_sent = [(w.text, w.tag_) for w in doc]
normalized_sent = [w.capitalize() if t in ["NN","NNS"] else w for (w,t) in tagged_sent]
normalized_sent[0] = normalized_sent[0].capitalize()
string = re.sub(" (?=[\.,'!?:;])", "", ' '.join(normalized_sent))
print(string)
Output:
Autonomous Cars shift Insurance Liability toward Manufacturers.
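If you want this wrapped up as the truecase(doc) function the question asked for, here is a small sketch based on the answer above (same NN/NNS rule, same regex to reattach punctuation):
import re
import spacy

nlp = spacy.load('en_core_web_sm')

def truecase(doc):
    # capitalize tokens tagged NN/NNS, plus the first word of the sentence
    tagged_sent = [(w.text, w.tag_) for w in doc]
    normalized_sent = [w.capitalize() if t in ("NN", "NNS") else w for (w, t) in tagged_sent]
    normalized_sent[0] = normalized_sent[0].capitalize()
    return re.sub(r" (?=[\.,'!?:;])", "", " ".join(normalized_sent))

print(truecase(nlp(u'autonomous cars shift insurance liability toward manufacturers.')))
# Autonomous Cars shift Insurance Liability toward Manufacturers.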
