Removing stop words from strings using spacy in different languages - python

I have an array of strings in different languages and I would like to remove stop words from these strings.
Example of the strings:
["mai fostul președinte egiptean mohamed morsi ", "em bon jovi lançou o álbum have a nice day a ", " otok škulj är en ö i kroatien den ligger i län"...]
This is the list of languages I want to support:
['French',
'Spanish',
'Thai',
'Russian',
'Persian',
'Indonesian',
'Arabic',
'Pushto',
'Kannada',
'Danish',
'Japanese',
'Malayalam',
'Latin',
'Romanian',
'Swedish',
'Portuguese',
'English',
'Turkish',
'Tamil',
'Urdu',
'Korean',
'German',
'Greek',
'Italian',
'Chinese',
'Dutch',
'Estonian',
'Hindi']
I am using the spaCy library, but I'm looking for something that supports multiple languages.
What I have tried already:
import pandas as pd
import nltk
nltk.download('punkt')
import spacy
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nlp = spacy.load("xx_ent_wiki_sm")  # multilingual NER model

doc = nlp("This is a sentence about Facebook.")
print([(ent.text, ent.label_) for ent in doc.ents])

# Note: the multilingual "xx" model defines no stop words of its own,
# so this set ends up empty
all_stopwords = nlp.Defaults.stop_words

data_text = df1['Text']  # this is where I store my strings
for x in data_text:
    text_tokens = word_tokenize(x)
    tokens_without_sw = [word for word in text_tokens if word not in all_stopwords]
    print(tokens_without_sw)
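For what it's worth, spaCy ships per-language stop-word lists under spacy.lang even for languages without a trained pipeline, so one option is to pool the lists of all the languages you expect. A minimal sketch, assuming the ISO codes below match the language list above (the code mapping and the pooling step are my own additions, not part of the question's code):

import spacy.util
from nltk.tokenize import word_tokenize

# ISO codes for a subset of the target languages; extend as needed
lang_codes = ['fr', 'es', 'ru', 'fa', 'id', 'ar', 'da', 'ro', 'sv', 'pt',
              'en', 'tr', 'ur', 'de', 'el', 'it', 'nl', 'et', 'hi']

all_stopwords = set()
for code in lang_codes:
    try:
        # each spacy.lang.<code> module carries its own STOP_WORDS set
        all_stopwords |= spacy.util.get_lang_class(code).Defaults.stop_words
    except ImportError:
        pass  # language not shipped with this spaCy install

texts = ["mai fostul președinte egiptean mohamed morsi",
         "em bon jovi lançou o álbum have a nice day a"]
for text in texts:
    tokens = word_tokenize(text)
    print([t for t in tokens if t.lower() not in all_stopwords])

Pooling the lists means a word that is a stop word in any language gets dropped in every language (e.g. the Portuguese article "a"), which is usually acceptable for cleaning but worth keeping in mind.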

Related

Text To Speech (Multi Lang) Using gTTS API In Python

I am trying to make a text-to-speech program where text written in any language is spoken out loud. The language it speaks should be selected by the user from a dropdown. I have written the code, which I am showing below.
import gtts
import os
import playsound
import googletrans
from googletrans import Translator, LANGUAGES
from tkinter import *
from tkinter import ttk, messagebox
import tkinter as tk

root = Tk()
root.title("TEXT TO SPEECH")
root.geometry("1280x720")
root.resizable(False, False)
root.configure(bg="#305065")

#Top Text
Label(root, text="TEXT TO SPEECH", font="Helvetica 20 bold", fg="white", bg='#305065').pack(pady=10)

#Frame that holds the combobox, textboxes and buttons (missing from the original snippet)
frame1 = Frame(root, bg='#305065')
frame1.pack(fill='both', expand=True)

a = tk.StringVar()
auto_select = ttk.Combobox(frame1, width=27, textvariable=a, state='readonly', font=('verdana', 10, 'bold'))
auto_select['values'] = (
'Afrikaans',
'Albanian',
'Amharic',
'Arabic',
'Armenian',
'Azerbaijani',
'Basque',
'Belarusian',
'Bengali',
'Bosnian',
'Bulgarian',
'Catalan',
'Cebuano',
'Chichewa',
'Chinese (simplified)',
'Chinese (traditional)',
'Corsican',
'Croatian',
'Czech',
'Danish',
'Dutch',
'English',
'Esperanto',
'Estonian',
'Filipino',
'Finnish',
'French',
'Frisian',
'Galician',
'Georgian',
'German',
'Greek',
'Gujarati',
'Haitian creole',
'Hausa',
'Hawaiian',
'Hebrew',
'Hindi',
'Hmong',
'Hungarian',
'Icelandic',
'Igbo',
'Indonesian',
'Irish',
'Italian',
'Japanese',
'Javanese',
'Kannada',
'Kazakh',
'Khmer',
'Korean',
'Kurdish (kurmanji)',
'Kyrgyz',
'Lao',
'Latin',
'Latvian',
'Lithuanian',
'Luxembourgish',
'Macedonian',
'Malagasy',
'Malay',
'Malayalam',
'Maltese',
'Maori',
'Marathi',
'Mongolian',
'Myanmar (burmese)',
'Nepali',
'Norwegian',
'Odia',
'Pashto',
'Persian',
'Polish',
'Portuguese',
'Punjabi',
'Romanian',
'Russian',
'Samoan',
'Scots gaelic',
'Serbian',
'Sesotho',
'Shona',
'Sindhi',
'Sinhala',
'Slovak',
'Slovenian',
'Somali',
'Spanish',
'Sundanese',
'Swahili',
'Swedish',
'Tajik',
'Tamil',
'Telugu',
'Thai',
'Turkish',
'Ukrainian',
'Urdu',
'Uyghur',
'Uzbek',
'Vietnamese',
'Welsh',
'Xhosa',
'Yiddish',
'Yoruba',
'Zulu',
)
auto_select.place(x=150, y=60)

#Textboxes & Buttons
sor_txt = Text(frame1, width=40, height=11, borderwidth=5, relief=RIDGE, font=('verdana', 15))
sor_txt.place(x=30, y=110)
dest_txt = Text(frame1, width=40, height=11, borderwidth=5, relief=RIDGE, font=('verdana', 15))
dest_txt.place(x=710, y=110)

#Translator_Code (defined before the buttons that reference it)
def translate():
    lang_1 = sor_txt.get(1.0, END)
    cl = a.get()  # language name selected in the combobox
    if lang_1.strip() == '':  # Text.get returns a trailing newline, so strip it
        messagebox.showerror("TEXT TO SPEECH", "Enter the text to translate!")
    else:
        dest_txt.delete(1.0, END)
        translator = Translator()
        output = translator.translate(lang_1, dest=cl)
        dest_txt.insert('end', output.text)
        dest_audio = gtts.gTTS(output.text, lang=cl)
        dest_audio.save("dest_audio.mp3")  # playsound needs a file path, not a gTTS object
        playsound.playsound("dest_audio.mp3")

def clear():
    sor_txt.delete(1.0, 'end')
    dest_txt.delete(1.0, 'end')

button_change = Button(frame1, text="TRANSLATE", relief=RAISED, borderwidth=2, font=('verdana', 10, 'bold'), bg='#248aa2', cursor="hand2", command=translate)
button_change.place(x=590, y=430, width=100, height=40)
button_clear = Button(frame1, text="CLEAR", relief=RAISED, borderwidth=2, font=('verdana', 10, 'bold'), bg='#248aa2', cursor="hand2", command=clear)
button_clear.place(x=590, y=480, width=100, height=40)

root.mainloop()
After executing this code I am getting an error saying the language was not found. Can someone tell me how I can solve this?
Screenshot of my Error: https://i.stack.imgur.com/6mk5A.png
Check out the supported languages here: https://cloud.google.com/text-to-speech/docs/voices
For example, "Amharic" isn't a supported language, so simply fix the list by removing the unsupported languages.
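Beyond pruning the list, note that gTTS expects a language code such as 'en' or 'fr', not a display name such as 'English', which is the other reason the lookup fails. A hedged sketch of mapping the combobox selection to a code before calling gTTS (the variable names are illustrative):

import gtts
from googletrans.constants import LANGCODES  # lowercase name -> code, e.g. 'english' -> 'en'
from gtts.lang import tts_langs              # codes gTTS can actually speak

selected_name = 'English'                    # e.g. the value read from the combobox
code = LANGCODES.get(selected_name.lower())
if code and code in tts_langs():
    audio = gtts.gTTS("Hello world", lang=code)
    audio.save("out.mp3")
else:
    print("Language not supported by gTTS:", selected_name)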

NLTK find german nouns

I want to extract all German nouns from a German text in lemmatized form with NLTK.
I also checked spaCy, but NLTK is preferred because, for English, it already works with the needed performance and the requested data structure.
I have the following working code for English:
import nltk
from nltk.stem import WordNetLemmatizer
#germanText='Jahrtausendelang ging man davon aus, dass auch der Sender einen geheimen Schlüssel, und zwar den gleichen wie der Empfänger, benötigt.'
text='For thousands of years it was assumed that the sender also needed a secret key, the same as the recipient.'
tokens = nltk.word_tokenize(text)
tokens = [tok.lower() for tok in tokens]
lemmatizer = WordNetLemmatizer()
tokens = [lemmatizer.lemmatize(tok) for tok in tokens]
tokens = [word for (word, pos) in nltk.pos_tag(tokens) if pos[0] == 'N']
print (tokens)
I get the print as expected:
['year', 'sender', 'key', 'recipient']
Now I tried to do this for German:
import nltk
from nltk.stem import WordNetLemmatizer
germanText='Jahrtausendelang ging man davon aus, dass auch der Sender einen geheimen Schlüssel, und zwar den gleichen wie der Empfänger, benötigt.'
#text='For thousands of years it was assumed that the sender also needed a secret key, the same as the recipient.'
tokens = nltk.word_tokenize(germanText, language='german')
tokens = [tok.lower() for tok in tokens]
lemmatizer = WordNetLemmatizer()
tokens = [lemmatizer.lemmatize(tok) for tok in tokens]
tokens = [word for (word, pos) in nltk.pos_tag(tokens) if pos[0] == 'N']
print (tokens)
And I get a wrong result:
['jahrtausendelang', 'man', 'davon', 'au', 'der', 'sender', 'einen', 'geheimen', 'zwar', 'den', 'gleichen', 'wie', 'der', 'empfänger', 'benötigt']
Neither the lemmatization nor the noun extraction worked.
What is the proper way to adapt this code to other languages?
I also checked other solutions like:
from nltk.stem.snowball import GermanStemmer
stemmer = GermanStemmer()  # stemmer for German; stem() takes one word at a time
tokens_ger = [stemmer.stem(tok) for tok in tokens]
But this only stems (it does not lemmatize), so it would mean starting over from the beginning.
I have found a way with the HanoverTagger:
from HanTa import HanoverTagger as ht
tagger = ht.HanoverTagger('morphmodel_ger.pgz')
words = nltk.word_tokenize(germanText, language='german')
print(tagger.tag_sent(words))
tokens = [word for (word, lemma, pos) in tagger.tag_sent(words, taglevel=1) if pos == 'NN']
I get the outcome as expected: ['Jahrtausendelang', 'Sender', 'Schlüssel', 'Empfänger']
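For completeness, a self-contained version of that approach, assuming HanTa is installed (pip install HanTa) and that its morphmodel_ger.pgz model ships with the package:

import nltk
from HanTa import HanoverTagger as ht

nltk.download('punkt')
tagger = ht.HanoverTagger('morphmodel_ger.pgz')

germanText = 'Jahrtausendelang ging man davon aus, dass auch der Sender einen geheimen Schlüssel, und zwar den gleichen wie der Empfänger, benötigt.'
words = nltk.word_tokenize(germanText, language='german')

# tag_sent(taglevel=1) yields (word, lemma, POS) tuples; keep common nouns (STTS tag 'NN')
nouns = [lemma for (word, lemma, pos) in tagger.tag_sent(words, taglevel=1) if pos == 'NN']
print(nouns)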

How does the ISRI Stemmer give better stem words than Lancaster or Snowball Stemmer

I have this sample text which I want to tokenize and subsequently find the stem words for:
sample_text = "'I am a student from the University of Alabama. \
I was born in Ontario, Canada and I am a huge fan of the United States. \
I am going to get a degree in Philosophy to improve\
my chances of becoming a Philosophy professor. \
I have been working towards this goal for 4 years. \
I am currently enrolled in a PhD program. \
It is very difficult, but I am confident that it will be a good decision'"
Using the Lancaster Stemmer I am getting the following result:
import re
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer

sentences = sent_tokenize(sample_text)
lancaster = LancasterStemmer()
for i in range(len(sentences)):
    sentences[i] = re.sub('[^A-Za-z0-9]', ' ', sentences[i])
    sentences[i] = word_tokenize(sentences[i])
    stopwds = [word.lower() for word in stopwords.words('english')]
    sentences[i] = [word.lower() for word in sentences[i] if word.lower() not in stopwds]
    sentences[i] = [lancaster.stem(word) for word in sentences[i]]
    print(sentences[i])
Output of the Lancaster Stemmer:
['stud', 'univers', 'alabam']
['born', 'ontario', 'canad', 'hug', 'fan', 'unit', 'stat']
['going', 'get', 'degr', 'philosoph', 'improvemy', 'chant', 'becom', 'philosoph', 'profess']
['work', 'toward', 'goal', '4', 'year']
['cur', 'enrol', 'phd', 'program']
['difficult', 'confid', 'good', 'decid']
Output of the Snowball Stemmer:
['student', 'univers', 'alabama']
['born', 'ontario', 'canada', 'huge', 'fan', 'unit', 'state']
['go', 'get', 'degre', 'philosophi', 'improvemi', 'chanc', 'becom', 'philosophi', 'professor']
['work', 'toward', 'goal', '4', 'year']
['current', 'enrol', 'phd', 'program']
['difficult', 'confid', 'good', 'decis']
Output of the Porter Stemmer:
['student', 'univers', 'alabama']
['born', 'ontario', 'canada', 'huge', 'fan', 'unit', 'state']
['go', 'get', 'degre', 'philosophi', 'improvemi', 'chanc', 'becom', 'philosophi', 'professor']
['work', 'toward', 'goal', '4', 'year']
['current', 'enrol', 'phd', 'program']
['difficult', 'confid', 'good', 'decis']
Whereas the ISRI Stemmer gives me almost the same results as if the words had been lemmatized:
from nltk.stem import ISRIStemmer

sentences = sent_tokenize(sample_text)
isri = ISRIStemmer()
for i in range(len(sentences)):
    sentences[i] = re.sub('[^A-Za-z0-9]', ' ', sentences[i])
    sentences[i] = word_tokenize(sentences[i])
    stopwds = [word.lower() for word in stopwords.words()]
    sentences[i] = [word.lower() for word in sentences[i] if word.lower() not in stopwds]
    sentences[i] = [isri.stem(word) for word in sentences[i]]
    print(sentences[i])
Output :
['student', 'university', 'alabama']
['born', 'ontario', 'canada', 'huge', 'fan', 'united', 'states']
['going', 'get', 'degree', 'philosophy', 'improvemy', 'chances', 'becoming', 'philosophy', 'professor']
['working', 'towards', 'goal', '4', 'years']
['currently', 'enrolled', 'phd', 'program']
['difficult', 'confident', 'good', 'decision']
Can someone explain how the ISRI Stemmer gives almost lemmatized words?
This stemmer was developed specifically for Arabic; its normalization and affix-stripping rules only match Arabic patterns, so English words pass through essentially unchanged, which is why the output looks lemmatized.
Here is a link to the NLTK source: https://www.nltk.org/_modules/nltk/stem/isri.html
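A quick way to see this, as a sketch (the expected outputs reflect the fact that the ISRI affix tables contain only Arabic strings, so they should never fire on ASCII input):

from nltk.stem import ISRIStemmer

isri = ISRIStemmer()
# No Arabic prefix/suffix matches, so the words come back untouched
print(isri.stem('universities'))  # expected: 'universities'
print(isri.stem('working'))       # expected: 'working'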

Ideas to improve language detection between Spanish and Catalan

I'm working on a text mining script in python. I need to detect the language of a natural language field from the dataset.
The thing is, 98% of the rows are in Spanish or Catalan. I tried approaches like stop-word counting and the langdetect library, but these languages share so many words that they fail a lot.
I'm looking for ideas to improve this algorithm.
One thought is to build a dictionary of words that are specific to Spanish or Catalan, so that if a text contains any of them, it is tagged as that language.
Approach 1: Distinguishing characters
Some characters occur in only one of the two languages (note: there will be exceptions for proper names and loanwords, e.g. Barça):
esp_chars = "ñÑáÁýÝ"
cat_chars = "çÇàÀèÈòÒ·ŀĿ"
Example:
sample_texts = ["El año que es abundante de poesía, suele serlo de hambre.",
                "Cal no abandonar mai ni la tasca ni l'esperança."]
for text in sample_texts:
    if any(char in text for char in esp_chars):
        print("Spanish: {}".format(text))
    elif any(char in text for char in cat_chars):
        print("Catalan: {}".format(text))
>>> Spanish: El año que es abundante de poesía, suele serlo de hambre.
Catalan: Cal no abandonar mai ni la tasca ni l'esperança.
If this isn't sufficient, you could expand this logic to search for language-exclusive digraphs, letter combinations, or words (a sketch follows the table):

                    Spanish only              Catalan only
Words               como y su con él otro     com i seva amb ell altre
Initial digraphs                              d' l'
Digraphs                                      ss tj qü l·l l.l
Terminal digraphs                             ig

Catalan letter combinations that only marginally appear in Spanish:
tx
tg            (Spanish exceptions: postgrado, postgraduado, postguerra)
ny            (Spanish exceptions: mostly prefixed in-, en-, con- + y-)
ll (terminal) (Spanish exceptions, loanwords: detall, nomparell)
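Here is how the word/digraph table could be folded into one scoring function. The regexes and the tie-breaking rule are illustrative assumptions, not a tested rule set:

import re

# Illustrative, not exhaustive: exclusive words and letter patterns per language
esp_words = {"como", "y", "su", "con", "él", "otro"}
cat_words = {"com", "i", "seva", "amb", "ell", "altre"}
cat_patterns = [r"\bd'", r"\bl'", r"l·l", r"qü", r"tx", r"ig\b"]

def guess_language(text):
    tokens = set(text.lower().split())
    esp_score = len(tokens & esp_words)
    cat_score = len(tokens & cat_words)
    cat_score += sum(1 for p in cat_patterns if re.search(p, text.lower()))
    return "Catalan" if cat_score > esp_score else "Spanish"

print(guess_language("Cal no abandonar mai ni la tasca ni l'esperança."))  # Catalan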
Approach 2: googletrans library
You could also use the googletrans library to detect the language:
from googletrans import Translator

translator = Translator()
for text in sample_texts:
    lang = translator.detect(text).lang
    print(lang, ":", text)
>>> es : El año que es abundante de poesía, suele serlo de hambre.
ca : Cal no abandonar mai ni la tasca ni l'esperança.
DicCat = ['amb','cap','dalt','damunt','des','dintre','durant','excepte','fins','per','pro','sense','sota','llei','hi','ha','més','mes','moment','órgans', 'segóns','Article','i','per','els','amb','és','com','dels','més','seu','seva','fou','també','però','als','després','aquest','fins','any','són','hi','pel','aquesta','durant','on','part','altres','anys','ciutat','cap','des','seus','tot','estat','qual','segle','quan','ja','havia','molt','rei','nom','fer','així','li','sant','encara','pels','seves','té','partit','està','mateix','pot','nord','temps','fill','només','dues','sota','lloc','això','alguns','govern','uns','aquests','mort','nou','tots','fet','sense','frança','grup','tant','terme','fa','tenir','segons','món','regne','exèrcit','segona','abans','mentre','quals','aquestes','família','catalunya','eren','poden','diferents','nova','molts','església','major','club','estats','seua','diversos','grans','què','arribar','troba','població','poble','foren','època','haver','eleccions','diverses','tipus','riu','dia','quatre','poc','regió','exemple','batalla','altre','espanya','joan','actualment','tenen','dins','llavors','centre','algunes','important','altra','terra','antic','tenia','obres','estava','pare','qui','ara','havien','començar','història','morir','majoria','qui','ara','havien','començar','història','morir','majoria']
DicEsp = ['los','y','bajo','con', 'entre','hacia','hasta','para','por','según','segun','sin','tras','más','mas','ley','capítulo','capitulo','título','titulo','momento','y','las','por','con','su','para','lo','como','más','pero','sus','le','me','sin','este','ya','cuando','todo','esta','son','también','fue','había','muy','años','hasta','desde','está','mi','porque','qué','sólo','yo','hay','vez','puede','todos','así','nos','ni','parte','tiene','él','uno','donde','bien','tiempo','mismo','ese','ahora','otro','después','te','otros','aunque','esa','eso','hace','otra','gobierno','tan','durante','siempre','día','tanto','ella','sí','dijo','sido','según','menos','año','antes','estado','sino','caso','nada','hacer','estaba','poco','estos','presidente','mayor','ante','unos','algo','hacia','casa','ellos','ayer','hecho','mucho','mientras','además','quien','momento','millones','esto','españa','hombre','están','pues','hoy','lugar','madrid','trabajo','otras','mejor','nuevo','decir','algunos','entonces','todas','días','debe','política','cómo','casi','toda','tal','luego','pasado','medio','estas','sea','tenía','nunca','aquí','ver','veces','embargo','partido','personas','grupo','cuenta','pueden','tienen','misma','nueva','cual','fueron','mujer','frente','josé','tras','cosas','fin','ciudad','he','social','tener','será','historia','muchos','juan','tipo','cuatro','dentro','nuestro','punto','dice','ello','cualquier','noche','aún','agua','parece','haber','situación','fuera','bajo','grandes','nuestra','ejemplo','acuerdo','habían','usted','estados','hizo','nadie','países','horas','posible','tarde','ley','importante','desarrollo','proceso','realidad','sentido','lado','mí','tu','cambio','allí','mano','eran','estar','san','número','sociedad','unas','centro','padre','gente','relación','cuerpo','incluso','través','último','madre','mis','modo','problema','cinco','carlos','hombres','información','ojos','muerte','nombre','algunas','público','mujeres','siglo','todavía','meses','mañana','esos','nosotros','hora','muchas','pueblo','alguna','dar','don','da','tú','derecho','verdad','maría','unidos','podría','sería','junto','cabeza','aquel','luis','cuanto','tierra','equipo','segundo','director','dicho','cierto','casos','manos','nivel','podía','familia','largo','falta','llegar','propio','ministro','cosa','primero','seguridad','hemos','mal','trata','algún','tuvo','respecto','semana','varios','real','sé','voz','paso','señor','mil','quienes','proyecto','mercado','mayoría','luz','claro','iba','éste','pesetas','orden','español','buena','quiere','aquella','programa','palabras','internacional','esas','segunda','empresa','puesto','ahí','propia','libro','igual','político','persona','últimos','ellas','total','creo','tengo','dios','española','condiciones','méxico','fuerza','solo','único','acción','amor','policía','puerta','pesar','sabe','calle','interior','tampoco','ningún','vista','campo','buen','hubiera','saber','obras','razón','niños','presencia','tema','dinero','comisión','antonio','servicio','hijo','última','ciento','estoy','hablar','dio','minutos','producción','camino','seis','quién','fondo','dirección','papel','demás','idea','especial','diferentes','dado','base','capital','ambos','europa','libertad','relaciones','espacio','medios','ir','actual','población','empresas','estudio','salud','servicios','haya','principio','siendo','cultura','anterior','alto','media','mediante','primeros','arte','paz','sector','imagen','medida','deben','datos','consejo','personal','interés','julio','grupos','miembros','ninguna','existe','cara','edad','movimiento','visto','llegó','puntos','actividad','bueno','uso','niño','difícil','joven','futuro','aquellos','mes','pronto','soy','hacía','nuevos','nuestros','estaban','posibilidad','sigue','cerca','resultados','educación','atención','gonzález','capacidad','efecto','necesario','valor','aire','investigación','siguiente','figura','central','comunidad','necesidad','serie','organizació','nuevas','calidad']
DicEng = ['all','my','have','do','and', 'or', 'what', 'can', 'you', 'the', 'on', 'it', 'at', 'since', 'for', 'ago', 'before', 'past', 'by', 'next', 'from','with', 'wich','law','is','the','of','and','to','in','is','you','that','it','he','was','for','on','are','as','with','his','they','at','be','this','have','from','or','one','had','by','word','but','not','what','all','were','we','when','your','can','said','there','use','an','each','which','she','do','how','their','if','will','up','other','about','out','many','then','them','these','so','some','her','would','make','like','him','into','time','has','look','two','more','write','go','see','number','no','way','could','people','my','than','first','water','been','call','who','oil','its','now','find','long','down','day','did','get','come','made','may','part','may','part']
def WhichLanguage(text):
    Input = text.lower().split(" ")
    CatScore = []
    EspScore = []
    EngScore = []
    for e in Input:
        if e in DicCat:
            CatScore.append(e)
        if e in DicEsp:
            EspScore.append(e)
        if e in DicEng:
            EngScore.append(e)
    if (len(EngScore) > len(EspScore)) and (len(EngScore) > len(CatScore)):
        Language = 'English'
    else:
        if (len(CatScore) > len(EspScore)):
            Language = 'Catala'
        else:
            Language = 'Espanyol'
    print(text)
    print("ESP =", len(EspScore), EspScore)
    print("Cat =", len(CatScore), CatScore)
    print("ING =", len(EngScore), EngScore)
    print('Language is =', Language)
    print("-----")
    return Language

print(WhichLanguage("Hola bon dia"))

How do I add new entity (ORG) instances in spacy nlp

I am trying to add stock symbols to the strings recognized as ORG entities. For each symbol, I do:
nlp.matcher.add(symbol, u'ORG', {}, [[{u'orth': symbol}]])
I can see that this symbol gets added to the patterns:
print "Patterns:", nlp.matcher._patterns
but any symbols that were not recognized before adding are not recognized after adding. Apparently, these tokens already exist in the vocabulary (that is why the vocab length does not change).
What should I be doing differently? What am I missing?
Thanks
Here is my example code:
"Brief snippet to practice adding stock ticker symbols as ORG entities"
from spacy.en import English
import spacy.en
from spacy.attrs import ORTH, TAG, LOWER, IS_ALPHA, FLAG63
import os
import csv
import sys

nlp = English()  # Load everything for the English model
print "Before nlp vocab length", len(nlp.matcher.vocab)
symbol_list = [u"CHK", u"JONE", u"NE", u"DO", u"ESV"]
txt = u"""drive double-digit rallies in Chesapeake Energy (NYSE: CHK), (NYSE: NE), (NYSE: DO), (NYSE: ESV), (NYSE: JONE)"""
# Alternative text: u"""Drive double-digit rallies in Chesapeake Energy (NYSE: CHK), Noble Corporation (NYSE:NE), Diamond Offshore (NYSE:DO), Ensco (NYSE:ESV), and Jones Energy (NYSE: JONE)"""
before = nlp(txt)
for tok in before:  # Before adding entities
    print tok, tok.orth, tok.tag_, tok.ent_type_

for symbol in symbol_list:
    print "adding symbol:", symbol
    print "vocab length:", len(nlp.matcher.vocab)
    print "pattern length:", nlp.matcher.n_patterns
    nlp.matcher.add(symbol, u'ORG', {}, [[{u'orth': symbol}]])

print "Patterns:", nlp.matcher._patterns
print "Entities:", nlp.matcher._entities
for ent in nlp.matcher._entities:
    print ent.label

tokens = nlp(txt)
print "\n\nAfter:"
print "After nlp vocab length", len(nlp.matcher.vocab)
for tok in tokens:
    print tok, tok.orth, tok.tag_, tok.ent_type_
Here's a working example based on the docs:
import spacy

nlp = spacy.load('en')

def merge_phrases(matcher, doc, i, matches):
    '''
    Merge a phrase. We have to be careful here because we'll change the token indices.
    To avoid problems, merge all the phrases once we're called on the last match.
    '''
    if i != len(matches) - 1:
        return None
    spans = [(ent_id, label, doc[start:end]) for ent_id, label, start, end in matches]
    for ent_id, label, span in spans:
        span.merge('NNP' if label else span.root.tag_, span.text, nlp.vocab.strings[label])

matcher = spacy.matcher.Matcher(nlp.vocab)
matcher.add(entity_key='stock-nyse', label='STOCK', attrs={}, specs=[[{spacy.attrs.ORTH: 'NYSE'}]], on_match=merge_phrases)
matcher.add(entity_key='stock-esv', label='STOCK', attrs={}, specs=[[{spacy.attrs.ORTH: 'ESV'}]], on_match=merge_phrases)
doc = nlp(u"""drive double-digit rallies in Chesapeake Energy (NYSE: CHK), (NYSE: NE), (NYSE: DO), (NYSE: ESV), (NYSE: JONE)""")
matcher(doc)
print(['%s|%s' % (t.orth_, t.ent_type_) for t in doc])
->
['drive|', 'double|', '-|', 'digit|', 'rallies|', 'in|', 'Chesapeake|ORG', 'Energy|ORG', '(|', 'NYSE|STOCK', ':|', 'CHK|', ')|', ',|', '(|', 'NYSE|STOCK', ':|', 'NE|GPE', ')|', ',|', '(|', 'NYSE|STOCK', ':|', 'DO|', ')|', ',|', '(|', 'NYSE|STOCK', ':|', 'ESV|STOCK', ')|', ',|', '(|', 'NYSE|STOCK', ':|', 'JONE|ORG', ')|']
NYSE and ESV are now marked with the STOCK entity type. Basically, on each match you should manually merge the tokens and/or assign the entity types you want. There's also an acceptor function which allows you to filter/reject matches while they are being matched.
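Note that the code above uses the old spaCy 1.x Matcher API. For readers on spaCy 2.x/3.x, the EntityRuler is now the usual way to tag fixed strings as entities; a minimal sketch, assuming the en_core_web_sm pipeline is installed:

import spacy

nlp = spacy.load("en_core_web_sm")
ruler = nlp.add_pipe("entity_ruler", before="ner")  # run before the statistical NER
patterns = [{"label": "ORG", "pattern": sym} for sym in ["CHK", "JONE", "NE", "DO", "ESV"]]
ruler.add_patterns(patterns)

doc = nlp("drive double-digit rallies in Chesapeake Energy (NYSE: CHK), (NYSE: JONE)")
print([(ent.text, ent.label_) for ent in doc.ents])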
