Tokenizing an HTML document

Tokenizing an HTML document - python

I have an HTML document and I'd like to tokenize it using spaCy while keeping HTML tags as a single token.
Here's my code:
import spacy
from spacy.symbols import ORTH
# Load the 'en' pipeline.
nlp = spacy.load('en', vectors=False, parser=False, entity=False)
# Register the opening and closing tag as tokenizer special cases, each mapped to one token.
nlp.tokenizer.add_special_case(u'<i>', [{ORTH: u'<i>'}])
nlp.tokenizer.add_special_case(u'</i>', [{ORTH: u'</i>'}])
# The tags are glued to the word here (no surrounding whitespace).
doc = nlp('Hello, <i>world</i> !')
# Show the text of every token in the resulting Doc.
print([e.text for e in doc])
The output is:
['Hello', ',', '<', 'i', '>', 'world</i', '>', '!']
If I put spaces around the tags, like this:
doc = nlp('Hello, <i> world </i> !')
The output is as I want it:
['Hello', ',', '<i>', 'world', '</i>', '!']
but I'd like to avoid complicated pre-processing of the HTML.
Any idea how I can approach this?

You need to create a custom Tokenizer.
Your custom Tokenizer will be exactly like spaCy's default tokenizer, except that the '<' and '>' symbols are removed from the prefixes and suffixes, and one new prefix rule and one new suffix rule are added.
Code:
import spacy
from spacy.tokens import Token
Token.set_extension('tag', default=False)
def create_custom_tokenizer(nlp):
    """Build a tokenizer that keeps ``<i>`` and ``</i>`` as single tokens.

    The tokenizer is assembled from the defaults of ``nlp`` (tokenizer
    exceptions, prefixes, suffixes, infixes and token_match) with two
    changes: the bare '<' prefix and '>' suffix rules are dropped, and
    '<i>' / '</i>' are added as a prefix rule and a suffix rule.

    Args:
        nlp: A loaded spaCy language object; its ``Defaults`` and ``vocab``
            are read.

    Returns:
        A ``spacy.tokenizer.Tokenizer`` built on ``nlp.vocab``.
    """
    from spacy import util
    from spacy.tokenizer import Tokenizer

    # Build plain lists so this works whether the defaults are exposed as a
    # tuple or as a list, and filter instead of calling list.remove() so a
    # missing '<' / '>' entry does not raise ValueError.
    prefixes = [p for p in nlp.Defaults.prefixes if p != '<'] + ['^<i>']
    suffixes = [s for s in nlp.Defaults.suffixes if s != '>'] + ['</i>$']
    infixes = nlp.Defaults.infixes

    rules = nlp.Defaults.tokenizer_exceptions
    # NOTE(review): read token_match from the language defaults rather than
    # importing the TOKEN_MATCH module constant, so the value always matches
    # the loaded language -- confirm against the installed spaCy version.
    token_match = nlp.Defaults.token_match

    prefix_search = util.compile_prefix_regex(prefixes).search
    suffix_search = util.compile_suffix_regex(suffixes).search
    infix_finditer = util.compile_infix_regex(infixes).finditer

    return Tokenizer(
        nlp.vocab,
        rules=rules,
        prefix_search=prefix_search,
        suffix_search=suffix_search,
        infix_finditer=infix_finditer,
        token_match=token_match,
    )
nlp = spacy.load('en_core_web_sm')
# Swap the pipeline's tokenizer for the customized one before processing any text.
tokenizer = create_custom_tokenizer(nlp)
nlp.tokenizer = tokenizer
doc = nlp('Hello, <i>world</i> !')
# Show the text of every token in the resulting Doc.
print([e.text for e in doc])

For the record, it might be that this has become easier: With the current version of Spacy, you don't have to create a custom tokenizer anymore. It suffices to 1. extend the infixes (to ensure tags are separated from words), and 2. add the tags as special cases:
import spacy
from spacy.symbols import ORTH
from spacy.util import compile_infix_regex

nlp = spacy.load("en_core_web_trf")

# 1. Treat '<' as an infix so that a tag glued to a word is split off from it.
infixes = list(nlp.Defaults.infixes) + [r'(<)']
nlp.tokenizer.infix_finditer = compile_infix_regex(infixes).finditer

# 2. Register each tag as a special case that maps to exactly one token.
for tag in ("<i>", "</i>"):
    nlp.tokenizer.add_special_case(tag, [{ORTH: tag}])

text = "Hello, <i>world</i> !"
doc = nlp(text)
print([e.text for e in doc])
Prints:
['Hello', ',', '<i>', 'world', '</i>', '!']
(This is more or less a condensed version of https://stackoverflow.com/a/66268015/1016514)

Related

Prevent Spacy tokenizer from splitting on specific character

When using spacy to tokenize a sentence, I want it to not split into tokens on /
Example:
import en_core_web_lg

nlp = en_core_web_lg.load()
# Print one token per line to show where the tokenizer splits the sentence.
for i in nlp("Get 10ct/liter off when using our App"):
    print(i)
Output:
Get
10ct
/
liter
off
when
using
our
App
I want it to be like Get , 10ct/liter, off, when ....
I was able to find how to add more ways to split into tokens for spacy, but not how to avoid specific splitting techniques.

I suggest using a custom tokenizer, see Modifying existing rule sets:
import spacy
from spacy.lang.char_classes import ALPHA, ALPHA_LOWER, ALPHA_UPPER, HYPHENS
from spacy.lang.char_classes import CONCAT_QUOTES, LIST_ELLIPSES, LIST_ICONS
from spacy.util import compile_infix_regex

nlp = spacy.load("en_core_web_trf")
text = "Get 10ct/liter off when using our App"

# Infix rules handed to the tokenizer. The last active rule has no "/" in its
# character class; the commented-out line directly above it is the variant
# that still contains "/".
custom_infix_rules = [
    r"(?<=[0-9])[+\-\*^](?=[0-9-])",
    r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
        al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
    ),
    r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
    r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
    #r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
    r"(?<=[{a}0-9])[:<>=](?=[{a}])".format(a=ALPHA),
]
infixes = LIST_ELLIPSES + LIST_ICONS + custom_infix_rules

# Install the compiled pattern's finditer as the tokenizer's infix matcher.
infix_re = compile_infix_regex(infixes)
nlp.tokenizer.infix_finditer = infix_re.finditer

doc = nlp(text)
print([t.text for t in doc])
## => ['Get', '10ct/liter', 'off', 'when', 'using', 'our', 'App']
Note the commented #r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA), line, I simply took out the / char from the [:<>=/] character class. This rule split at / that is between a letter/digit and a letter.
If you need to still split '12/ct' into three tokens, you will need to add another line below the r"(?<=[{a}0-9])[:<>=](?=[{a}])".format(a=ALPHA) line:
r"(?<=[0-9])/(?=[{a}])".format(a=ALPHA),

spacy aggressive lemmatization and removing unexpected words

I am trying to clean some text data.
First I removed the stop words, then I tried to lemmatize the text. But some words, such as nouns, are removed.
Sample Data
https://drive.google.com/file/d/1p9SKWLSVYeNScOCU_pEu7A08jbP-50oZ/view?usp=sharing
Updated code
# Libraries
import spacy
import pandas as pd
import gensim
from gensim.utils import simple_preprocess
import nltk; nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['covid', 'COVID-19', 'coronavirus'])
article= pd.read_csv("testdata.csv")
data = article.title.values.tolist()
nlp = spacy.load('en_core_web_sm')
def sent_to_words(sentences):
    """Yield one token list per sentence, produced by gensim's simple_preprocess.

    Args:
        sentences: Iterable of sentences; each item is converted with str()
            before being tokenized.

    Yields:
        The list returned by ``gensim.utils.simple_preprocess`` for each sentence.
    """
    for sentence in sentences:
        # deacc=True strips accent marks from the tokens (it is not what removes punctuation).
        yield gensim.utils.simple_preprocess(str(sentence), deacc=True)
data_words = list(sent_to_words(data))
def remove_stopwords(texts):
    """Re-tokenize each text and drop every token found in ``stop_words``.

    Args:
        texts: Iterable of documents; each one is converted with str() and
            passed through ``simple_preprocess`` again.

    Returns:
        A list with one list of remaining tokens per input document.
    """
    return [
        [word for word in simple_preprocess(str(doc)) if word not in stop_words]
        for doc in texts
    ]
data_words_nostops = remove_stopwords(data_words)
print ("*** Text After removing Stop words: ")
print(data_words_nostops)
def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV', 'PRON')):
    """Lemmatize each text, keeping only tokens whose POS tag is allowed.

    See https://spacy.io/api/annotation for the tag set.

    Args:
        texts: Iterable of token lists; each list is joined with spaces and
            run through the module-level ``nlp`` pipeline.
        allowed_postags: Values of ``token.pos_`` to keep. The default is a
            tuple so that it cannot be mutated between calls.

    Returns:
        A list with one list of lemmas per input text.
    """
    # NOTE(review): 'PRON' selects pronouns; proper nouns such as place names
    # carry 'PROPN' in spaCy's tag set -- confirm which one is intended.
    return [
        [token.lemma_ for token in nlp(" ".join(sent)) if token.pos_ in allowed_postags]
        for sent in texts
    ]
data_lemmatized = lemmatization(data_words_nostops, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV','PRON'])
print ("*** Text After Lemmatization: ")
print(data_lemmatized)
The output after removing Stopwords is :
[['qaia', 'flags', 'amman', 'melbourne', 'jetstar', 'flights', 'recovery', 'plan'],
['western', 'amman', 'suburb', 'new', 'nsw', 'ground', 'zero', 'children'],
['flight', 'returned', 'amman', 'qaia', 'staff', 'contract', 'driving']]
The output after lemmatization:
[['flight', 'recovery', 'plan'],
['suburb', 'ground'],
['return', 'contract', 'driving']]
For each record, I do not understand the following:
- 1st record: why were these words removed: 'qaia', 'flags', 'amman', 'melbourne', 'jetstar'?
- 2nd record: essential words are removed, same as in the first record. Also, I was expecting "children" to be converted to "child".
- 3rd record: "driving" is not converted to "drive".
I was expecting that words such as "Amman" would not be removed. I was also expecting the words to be converted from plural to singular, and the verbs to be converted to the infinitive ...
What am I missing here?
Thanks in advance

I'm guessing that most of your issues are because you're not feeding spaCy full sentences and it's not assigning the correct part-of-speech tags to your words. This can cause the lemmatizer to return the wrong results. However, since you've only provided snippets of code and none of the original text, it's difficult to answer this question. Next time consider boiling down your question to a few lines of code that someone else can run on their machine EXACTLY AS WRITTEN, and providing a sample input that fails. See Minimal Reproducible Example
Here's an example that works and is close to what you're doing.
import spacy
from nltk.corpus import stopwords

# Sets give constant-time membership tests inside the filter below.
stop_words = set(stopwords.words('english'))
allow_postags = {'NOUN', 'VERB', 'ADJ', 'ADV', 'PROPN'}

nlp = spacy.load('en')
text = 'The children in Amman and Melbourne are too young to be driving.'

# Tag the full sentence, then keep the lemma of every token that is not a
# stop word and whose part of speech is in the allowed set.
words = [
    token.lemma_
    for token in nlp(text)
    if token.text not in stop_words and token.pos_ in allow_postags
]
print(' '.join(words))
This returns child Amman Melbourne young drive

Spacy tokenizer with only "Whitespace" rule

I would like to know if the spacy tokenizer could tokenize words only using the "space" rule.
For example:
sentence= "(c/o Oxford University )"
Normally, using the following configuration of spacy:
nlp = spacy.load("en_core_news_sm")
doc = nlp(sentence)
# Print one token per line to show how the sentence is split.
for token in doc:
    print(token)
the result would be:
(
c
/
o
Oxford
University
)
Instead, I would like an output like the following (using spacy):
(c/o
Oxford
University
)
Is it possible to obtain a result like this using spacy?

Let's change nlp.tokenizer with a custom Tokenizer with token_match regex:
import re
import spacy
from spacy.tokenizer import Tokenizer
nlp = spacy.load('en_core_web_sm')
text = "This is it's"
# Tokens produced by the pipeline's stock tokenizer.
print("Before:", [tok for tok in nlp(text)])
# Replace the tokenizer with a bare Tokenizer whose only rule is token_match:
# a run of non-whitespace characters (\S+).
nlp.tokenizer = Tokenizer(nlp.vocab, token_match=re.compile(r'\S+').match)
# Tokens produced by the replacement tokenizer.
print("After :", [tok for tok in nlp(text)])
Before: [This, is, it, 's]
After : [This, is, it's]
You can further adjust Tokenizer by adding custom suffix, prefix, and infix rules.
An alternative, more fine-grained way would be to find out why the "it's" token is split the way it is with nlp.tokenizer.explain():
import spacy
from spacy.tokenizer import Tokenizer
nlp = spacy.load('en_core_web_sm')
text = "This is it's. I'm fine"
nlp.tokenizer.explain(text)
You'll find out that the split is due to SPECIAL rules:
[('TOKEN', 'This'),
('TOKEN', 'is'),
('SPECIAL-1', 'it'),
('SPECIAL-2', "'s"),
('SUFFIX', '.'),
('SPECIAL-1', 'I'),
('SPECIAL-2', "'m"),
('TOKEN', 'fine')]
that could be updated to remove "it's" from exceptions like:
exceptions = nlp.Defaults.tokenizer_exceptions
filtered_exceptions = {k:v for k,v in exceptions.items() if k!="it's"}
nlp.tokenizer = Tokenizer(nlp.vocab, rules = filtered_exceptions)
[tok for tok in nlp(text)]
[This, is, it's., I, 'm, fine]
or remove split on apostrophe altogether:
filtered_exceptions = {k:v for k,v in exceptions.items() if "'" not in k}
nlp.tokenizer = Tokenizer(nlp.vocab, rules = filtered_exceptions)
[tok for tok in nlp(text)]
[This, is, it's., I'm, fine]
Note the dot attached to the token, which is due to the suffix rules not specified.

You can find the solution to this very question in the spaCy docs: https://spacy.io/usage/linguistic-features#custom-tokenizer-example. In a nutshell, you create a function that takes a string text and returns a Doc object, and then assign that callable function to nlp.tokenizer:
import spacy
from spacy.tokens import Doc
class WhitespaceTokenizer(object):
    """Tokenizer that splits text on single space characters only.

    Instances are callables taking a text and returning a ``Doc`` built on
    the stored vocab, so an instance can be assigned to ``nlp.tokenizer``.
    """

    def __init__(self, vocab):
        """Store the vocab that every produced Doc is built on."""
        self.vocab = vocab

    @staticmethod
    def _words_and_spaces(text):
        """Split *text* on ' ' and return ``(words, spaces)``.

        ``spaces[i]`` says whether ``words[i]`` is followed by a space, and
        is chosen so that joining the words with their spaces reproduces
        *text* exactly.
        """
        words = text.split(' ')
        spaces = [True] * len(words)
        # Repeated, leading or trailing spaces make str.split() return empty
        # strings; represent each one as an explicit " " token so that no
        # zero-length token is produced.
        for i, word in enumerate(words):
            if word == '':
                words[i] = ' '
                spaces[i] = False
        if words[-1] == ' ':
            # The text ended with a space (or was empty): the preceding token
            # already owns that space, so drop the placeholder.
            words = words[:-1]
            spaces = spaces[:-1]
        else:
            # The final token is not followed by a space in the text.
            spaces[-1] = False
        return words, spaces

    def __call__(self, text):
        """Tokenize *text* on spaces and return the resulting Doc."""
        words, spaces = self._words_and_spaces(text)
        return Doc(self.vocab, words=words, spaces=spaces)
nlp = spacy.load("en_core_web_sm")
nlp.tokenizer = WhitespaceTokenizer(nlp.vocab)
doc = nlp("What's happened to me? he thought. It wasn't a dream.")
print([t.text for t in doc])

Spacy - modify tokenizer for numeric patterns

I have seen some ways to create a custom tokenizer, but I am a little confused. What I am doing is using the Phrase Matcher to match patterns. However, it would match a 4-digit number pattern, say 1234, in 111-111-1234, since it splits on the dash.
All I want to do is modify the current tokenizer (from nlp = English()) and add a rule that it should not split on some characters but only for numeric patterns.

To do this you will need to overwrite spaCy's default infix tokenization scheme with your own. You can do this by modifying the infix tokenization scheme used by spaCy found here.
import spacy
from spacy.lang.char_classes import ALPHA, ALPHA_LOWER, ALPHA_UPPER, HYPHENS
from spacy.lang.char_classes import CONCAT_QUOTES, LIST_ELLIPSES, LIST_ICONS
from spacy.util import compile_infix_regex

# Tokens from the unmodified pipeline tokenizer, for comparison.
nlp = spacy.load("en_core_web_sm")
doc = nlp("111-222-1234 for abcDE")
print([t.text for t in doc])

# Replacement infix rules for the tokenizer.
custom_infix_rules = [
    # Digit-to-digit operators, with the hyphen left out of the class.
    r"(?<=[0-9])[+\*^](?=[0-9-])",
    # Lowercase/quote followed by uppercase/quote, with the dot optional.
    r"(?<=[{al}{q}])\.?(?=[{au}{q}])".format(
        al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
    ),
    r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
    r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
    r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
]
infixes = LIST_ELLIPSES + LIST_ICONS + custom_infix_rules

# Install the compiled pattern's finditer as the tokenizer's infix matcher.
infix_re = compile_infix_regex(infixes)
nlp.tokenizer.infix_finditer = infix_re.finditer

# Same text again, now with the modified infix rules.
doc = nlp("111-222-1234 for abcDE")
print([t.text for t in doc])
Output
With default tokenizer:
['111', '-', '222', '-', '1234', 'for', 'abcDE']
With custom tokenizer:
['111-222-1234', 'for', 'abc', 'DE']

CV Parser name matching

I am using NLP with Python to find names in a string. I am able to find a name if I have a full name (first name and last name), but if the string contains only a first name, my code is not able to recognize it as a PERSON. Below is my code.
import re
import nltk
from nltk.corpus import stopwords
stop = stopwords.words('english')
string = """
Sriram is working as a python developer
"""
def ie_preprocess(document):
    """Drop stop words from *document*, then sentence-split, tokenize and POS-tag it.

    Args:
        document: Raw text to process.

    Returns:
        A list with one entry per sentence, each being the result of
        ``nltk.pos_tag`` on that sentence's word tokens.
    """
    # Stop words are removed on whitespace-separated words before any NLTK
    # tokenization takes place.
    document = ' '.join(word for word in document.split() if word not in stop)
    return [
        nltk.pos_tag(nltk.word_tokenize(sent))
        for sent in nltk.sent_tokenize(document)
    ]
def extract_names(document):
    """Return the PERSON chunks that ``nltk.ne_chunk`` finds in *document*.

    Args:
        document: Raw text; it is preprocessed with ``ie_preprocess``.

    Returns:
        A list of strings, one per PERSON chunk, each made of the chunk's
        words joined by single spaces.
    """
    names = []
    for tagged_sentence in ie_preprocess(document):
        for chunk in nltk.ne_chunk(tagged_sentence):
            # Only Tree nodes labelled PERSON are collected; everything else
            # yielded by ne_chunk is skipped.
            if isinstance(chunk, nltk.tree.Tree) and chunk.label() == 'PERSON':
                print("In Side ",chunk)
                names.append(' '.join(c[0] for c in chunk))
    return names
# Script entry point: extract the names from the sample string and print them.
if __name__ == '__main__':
    names = extract_names(string)
    print(names)

My advice is to use the StanfordNLP/Spacy NER, using nltk ne chunks is a little janky. StanfordNLP is more commonly used by researchers, but Spacy is easier to work with. Here is an example using Spacy to print the name of each named entity and its type:
>>> import spacy
>>> nlp = spacy.load('en_core_web_sm')
>>> text = 'Sriram is working as a python developer'
>>> doc = nlp(text)
>>> for ent in doc.ents:
print(ent.text,ent.label_)
Sriram ORG
>>>
Note that it classifies Sriram as an organization, which may be because it is not a common English name and Spacy is trained on English corpora. Good luck!

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

Tokenizing an HTML document - python

Related

Prevent Spacy tokenizer from splitting on specific character

spacy aggressive lemmatization and removing unexpected words

Spacy tokenizer with only "Whitespace" rule

Spacy - modify tokenizer for numeric patterns

CV Parser name matching

Categories

Resources