CV Parser name matching - python

I am using NLP with python to find the names from the string. I am able to find the if i have a full name (first name and last name) but in the string i have only first name means my code is not able to recognize as Person. Below is my code.
import re
import nltk
from nltk.corpus import stopwords
stop = stopwords.words('english')
string = """
Sriram is working as a python developer
"""
def ie_preprocess(document):
document = ' '.join([i for i in document.split() if i not in stop])
sentences = nltk.sent_tokenize(document)
sentences = [nltk.word_tokenize(sent) for sent in sentences]
sentences = [nltk.pos_tag(sent) for sent in sentences]
return sentences
def extract_names(document):
names = []
sentences = ie_preprocess(document)
#print(sentences)
for tagged_sentence in sentences:
for chunk in nltk.ne_chunk(tagged_sentence):
#print("Out Side ",chunk)
if type(chunk) == nltk.tree.Tree:
if chunk.label() == 'PERSON':
print("In Side ",chunk)
names.append(' '.join([c[0] for c in chunk]))
return names
if __name__ == '__main__':
names = extract_names(string)
print(names)

My advice is to use the StanfordNLP/Spacy NER, using nltk ne chunks is a little janky. StanfordNLP is more commonly used by researchers, but Spacy is easier to work with. Here is an example using Spacy to print the name of each named entity and its type:
>>> import spacy
>>> nlp = spacy.load('en_core_web_sm')
>>> text = 'Sriram is working as a python developer'
>>> doc = nlp(text)
>>> for ent in doc.ents:
print(ent.text,ent.label_)
Sriram ORG
>>>
Note that it classifies Sriram as an organization, which may be because it is not a common English name and Spacy is trained on English corpa. Good luck!

Related

Spacy tokenizer with only "Whitespace" rule

I would like to know if the spacy tokenizer could tokenize words only using the "space" rule.
For example:
sentence= "(c/o Oxford University )"
Normally, using the following configuration of spacy:
nlp = spacy.load("en_core_news_sm")
doc = nlp(sentence)
for token in doc:
print(token)
the result would be:
(
c
/
o
Oxford
University
)
Instead, I would like an output like the following (using spacy):
(c/o
Oxford
University
)
Is it possible to obtain a result like this using spacy?
Let's change nlp.tokenizer with a custom Tokenizer with token_match regex:
import re
import spacy
from spacy.tokenizer import Tokenizer
nlp = spacy.load('en_core_web_sm')
text = "This is it's"
print("Before:", [tok for tok in nlp(text)])
nlp.tokenizer = Tokenizer(nlp.vocab, token_match=re.compile(r'\S+').match)
print("After :", [tok for tok in nlp(text)])
Before: [This, is, it, 's]
After : [This, is, it's]
You can further adjust Tokenizer by adding custom suffix, prefix, and infix rules.
An alternative, more fine grained way would be to find out why it's token is split like it is with nlp.tokenizer.explain():
import spacy
from spacy.tokenizer import Tokenizer
nlp = spacy.load('en_core_web_sm')
text = "This is it's. I'm fine"
nlp.tokenizer.explain(text)
You'll find out that split is due to SPECIAL rules:
[('TOKEN', 'This'),
('TOKEN', 'is'),
('SPECIAL-1', 'it'),
('SPECIAL-2', "'s"),
('SUFFIX', '.'),
('SPECIAL-1', 'I'),
('SPECIAL-2', "'m"),
('TOKEN', 'fine')]
that could be updated to remove "it's" from exceptions like:
exceptions = nlp.Defaults.tokenizer_exceptions
filtered_exceptions = {k:v for k,v in exceptions.items() if k!="it's"}
nlp.tokenizer = Tokenizer(nlp.vocab, rules = filtered_exceptions)
[tok for tok in nlp(text)]
[This, is, it's., I, 'm, fine]
or remove split on apostrophe altogether:
filtered_exceptions = {k:v for k,v in exceptions.items() if "'" not in k}
nlp.tokenizer = Tokenizer(nlp.vocab, rules = filtered_exceptions)
[tok for tok in nlp(text)]
[This, is, it's., I'm, fine]
Note the dot attached to the token, which is due to the suffix rules not specified.
You can find the solution to this very question in the spaCy docs: https://spacy.io/usage/linguistic-features#custom-tokenizer-example. In a nutshell, you create a function that takes a string text and returns a Doc object, and then assign that callable function to nlp.tokenizer:
import spacy
from spacy.tokens import Doc
class WhitespaceTokenizer(object):
def __init__(self, vocab):
self.vocab = vocab
def __call__(self, text):
words = text.split(' ')
# All tokens 'own' a subsequent space character in this tokenizer
spaces = [True] * len(words)
return Doc(self.vocab, words=words, spaces=spaces)
nlp = spacy.load("en_core_web_sm")
nlp.tokenizer = WhitespaceTokenizer(nlp.vocab)
doc = nlp("What's happened to me? he thought. It wasn't a dream.")
print([t.text for t in doc])

Expected str instance, spacy.tokens.token.Token found

I am executing a data extraction use-case. To preprocess and tokenize my data, I am using both spacy English and German tokenizers, because the sentences are in both the languages. Here's my code:
import spacy
from spacy.lang.de import German
from spacy.lang.en import English
from spacy.lang.de import STOP_WORDS as stp_wrds_de
from spacy.lang.en.stop_words import STOP_WORDS as stp_wrds_en
import string
punctuations = string.punctuation
# German Parser
parser_de = German()
# English Parser
parser_en = English()
def spacy_tokenizer_de(document):
# Token object for splitting text into 'units'
tokens = parser_de(document)
# Lemmatization: Grammatical conversion of words
tokens = [word.lemma_.strip() if word.lemma_ != '-PRON-' else word for word in tokens]
# Remove punctuations
tokens = [word for word in tokens if word not in punctuations]
tokens_de_str = converttostr(tokens,' ')
tokens_en = spacy_tokenizer_en(tokens_de_str)
print("Tokens EN: {}".format(tokens_en))
tokens_en = converttostr(tokens_en,' ')
return tokens_en
def converttostr(input_seq, separator):
# Join all the strings in list
final_str = separator.join(input_seq)
return final_str
def spacy_tokenizer_en(document):
tokens = parser_en(document)
tokens = [word.lemma_.strip() if word.lemma_ != '-PRON-' else word for word in tokens]
return tokens
Here's a further elucidation of the above code:
1. spacy_tokenizer_de(): Method to parse and tokenize document in German
2. spacy_tokenizer_en(): Method to parse and tokenize document in English
3. converttostr(): Converts list of tokens to a string, so that the English spacy tokenizer can read the input (only accepts document/string format) and tokenize the data.
However, some sentences when parsed, lead to the following error:
Why is a spacy token object coming up in such scenarios, whereas, some of the sentences are being processed successfully? Can anyone please help here?
token.lemma_.strip() if token.lemma_ != '-PRON-' else token.text for token in tokens
You're supposed to get a list of words here, right? Instead, sometimes you return a string (when lemma doesn't equal to '-PRON-') but other times just token but not a string.
You may get a string from token.text.

Spacy custom sentence segmentation on line break

I'm trying to split this document into paragraphs. Specifically, I would like to split the text whenever there is a line break (<br>)
This is the code I'm using but is not producing the results I hoped
nlp = spacy.load("en_core_web_lg")
def set_custom_boundaries(doc):
for token in doc[:-1]:
if token.text == "<br>":
doc[token.i+1].is_sent_start = True
return doc
nlp.add_pipe(set_custom_boundaries, before="parser")
doc = nlp(text)
print([sent.text for sent in doc.sents])
A similar solution could be achieved by using NLTK's TextTilingTokenizer but wanted to check whether there is anything similar within Spacy
You're almost there, but the problem is that the default Tokenizer splits on '<' and '>', hence the condition token.text == "<br>" is never true. I'd add space before and after <br>. E.g.
import spacy
from spacy.symbols import ORTH
def set_custom_boundaries(doc):
for token in doc[:-1]:
if token.text == "<br>":
doc[token.i+1].is_sent_start = True
return doc
nlp = spacy.load("en_core_web_sm")
text = "the quick brown fox<br>jumps over the lazy dog"
text = text.replace('<br>', ' <br> ')
special_case = [{ORTH: "<br>"}]
nlp.tokenizer.add_special_case("<br>", special_case)
nlp.add_pipe(set_custom_boundaries, first=True)
doc = nlp(text)
print([sent.text for sent in doc.sents])
Also take a look at this PR, after it's merged to master, it'll no longer be necessary to wrap in spaces.
https://github.com/explosion/spaCy/pull/4259

How to extract noun adjective pairs from a sentence

I wish to extract noun-adjective pairs from this sentence. So, basically I want something like :
(Mark,sincere) (John,sincere).
from nltk import word_tokenize, pos_tag, ne_chunk
sentence = "Mark and John are sincere employees at Google."
print ne_chunk(pos_tag(word_tokenize(sentence)))
Spacy's POS tagging would be a better than NLTK. It's faster and better. Here is an example of what you want to do
import spacy
nlp = spacy.load('en')
doc = nlp(u'Mark and John are sincere employees at Google.')
noun_adj_pairs = []
for i,token in enumerate(doc):
if token.pos_ not in ('NOUN','PROPN'):
continue
for j in range(i+1,len(doc)):
if doc[j].pos_ == 'ADJ':
noun_adj_pairs.append((token,doc[j]))
break
noun_adj_pairs
output
[(Mark, sincere), (John, sincere)]

Getting adjective from an adverb in nltk or other NLP library

Is there a way to get an adjective corresponding to a given adverb in NLTK or other python library.
For example, for the adverb "terribly", I need to get "terrible".
Thanks.
There is a relation in wordnet that connects the adjectives to adverbs and vice versa.
>>> from itertools import chain
>>> from nltk.corpus import wordnet as wn
>>> from difflib import get_close_matches as gcm
>>> possible_adjectives = [k.name for k in chain(*[j.pertainyms() for j in chain(*[i.lemmas for i in wn.synsets('terribly')])])]
['terrible', 'atrocious', 'awful', 'rotten']
>>> gcm('terribly',possible_adjectives)
['terrible']
A more human readable way to computepossible_adjective is as followed:
possible_adj = []
for ss in wn.synsets('terribly'):
for lemmas in ss.lemmas: # all possible lemmas.
for lemma in lemmas:
for ps in lemma.pertainyms(): # all possible pertainyms.
for p in ps:
for ln in p.name: # all possible lemma names.
possible_adj.append(ln)
EDIT: In the newer version of NLTK:
possible_adj = []
for ss in wn.synsets('terribly'):
for lemmas in ss.lemmas(): # all possible lemmas
for ps in lemmas.pertainyms(): # all possible pertainyms
possible_adj.append(ps.name())
As MKoosej mentioned, nltk's lemmas is no longer an attribute but a method. I also made a little simplification to get the most possible word. Hope someone else can use it also:
wordtoinv = 'unduly'
s = []
winner = ""
for ss in wn.synsets(wordtoinv):
for lemmas in ss.lemmas(): # all possible lemmas.
s.append(lemmas)
for pers in s:
posword = pers.pertainyms()[0].name()
if posword[0:3] == wordtoinv[0:3]:
winner = posword
break
print winner # undue

Categories