NLTK find german nouns - python

I want to extract all German nouns from a German text in lemmatized form with NLTK.
I also looked at spaCy, but I strongly prefer NLTK because my English pipeline already delivers the performance and the data structure I need.
I have the following working code for english:
import nltk
from nltk.stem import WordNetLemmatizer
#germanText='Jahrtausendelang ging man davon aus, dass auch der Sender einen geheimen Schlüssel, und zwar den gleichen wie der Empfänger, benötigt.'
# Extract the lemmatized nouns of an English sentence with NLTK.
text='For thousands of years it was assumed that the sender also needed a secret key, the same as the recipient.'
# Split the sentence into tokens.
tokens = nltk.word_tokenize(text)
# Lower-case every token before lemmatizing.
tokens = [tok.lower() for tok in tokens]
lemmatizer = WordNetLemmatizer()
# Lemmatize each token on its own; no part-of-speech hint is passed.
tokens = [lemmatizer.lemmatize(tok) for tok in tokens]
# Tag the lemmas and keep only the words whose tag starts with 'N'
# (presumably the noun tags of the tagger's tag set - TODO confirm).
tokens = [word for (word, pos) in nltk.pos_tag(tokens) if pos[0] == 'N']
print (tokens)
I get the print as expected:
['year', 'sender', 'key', 'recipient']
Now I tried to do this for German:
import nltk
from nltk.stem import WordNetLemmatizer
# Same steps as the English version, applied to a German sentence.
germanText='Jahrtausendelang ging man davon aus, dass auch der Sender einen geheimen Schlüssel, und zwar den gleichen wie der Empfänger, benötigt.'
#text='For thousands of years it was assumed that the sender also needed a secret key, the same as the recipient.'
# Only the tokenizer is told that the text is German.
tokens = nltk.word_tokenize(germanText, language='german')
# NOTE(review): lower-casing removes the capitalization of the German tokens
# before they are tagged.
tokens = [tok.lower() for tok in tokens]
lemmatizer = WordNetLemmatizer()
# NOTE(review): no language is passed to the lemmatizer or to pos_tag below;
# presumably both fall back to English resources - confirm against the NLTK docs.
tokens = [lemmatizer.lemmatize(tok) for tok in tokens]
# Keep only the words whose tag starts with 'N'.
tokens = [word for (word, pos) in nltk.pos_tag(tokens) if pos[0] == 'N']
print (tokens)
And I get a wrong result:
['jahrtausendelang', 'man', 'davon', 'au', 'der', 'sender', 'einen', 'geheimen', 'zwar', 'den', 'gleichen', 'wie', 'der', 'empfänger', 'benötigt']
Neither the lemmatization nor the noun extraction worked.
What is the proper way to adapt this code to other languages?
I also checked other solutions like:
from nltk.stem.snowball import GermanStemmer
# GermanStemmer is already language-specific; its constructor argument is the
# ignore_stopwords flag, so no language name is passed here.
stemmer = GermanStemmer()
# stem() works on a single word, so apply it token by token.
tokenGer = [stemmer.stem(tok) for tok in tokens]
But this would make me start from the beginning.

I have found a way with the HanoverTagger:
from HanTa import HanoverTagger as ht
# Load the tagger from the 'morphmodel_ger.pgz' model file.
tagger = ht.HanoverTagger('morphmodel_ger.pgz')
# NOTE(review): `text` is not defined in this snippet - presumably it holds the
# German sentence from the question; confirm.
words = nltk.word_tokenize(text)
# Show the tagging result for the whole sentence.
print(tagger.tag_sent(words) )
# Keep the first element of every (word, x, pos) triple whose tag is exactly 'NN'.
tokens=[word for (word,x,pos) in tagger.tag_sent(words,taglevel= 1) if pos == 'NN']
I get the outcome as expected: ['Jahrtausendelang', 'Sender', 'Schlüssel', 'Empfänger']

Related

Removing stop words from strings using spaCy in different languages

I have an array of strings in different languages and I would like to remove stop words from these strings.
example of string :
["mai fostul președinte egiptean mohamed morsi ", "em bon jovi lançou o álbum have a nice day a ", " otok škulj är en ö i kroatien den ligger i län"...]
this is the list of languages I'm willing to use :
['French',
'Spanish',
'Thai',
'Russian',
'Persian',
'Indonesian',
'Arabic',
'Pushto',
'Kannada',
'Danish',
'Japanese',
'Malayalam',
'Latin',
'Romanian',
'Swedish',
'Portugese',
'English',
'Turkish',
'Tamil',
'Urdu',
'Korean',
'German',
'Greek',
'Italian',
'Chinese',
'Dutch',
'Estonian',
'Hindi']
I am using the spaCy library, but I'm looking for something that supports multiple languages.
What I have tried already:
import pandas as pd
import nltk
nltk.download('punkt')
import spacy
nlp = spacy.load("xx_ent_wiki_sm")
from spacy.tokenizer import Tokenizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
doc = nlp("This is a sentence about Facebook.")
print([(ent.text, ent.label) for ent in doc.ents])
# Stop-word set taken from the loaded pipeline's language defaults.
all_stopwords = nlp.Defaults.stop_words
data_text = df1['Text']  # column that holds the input strings
for x in data_text:
    text_tokens = word_tokenize(x)
    # Keep every token that is not in the stop-word set.
    tokens_without_sw = [word for word in text_tokens if word not in all_stopwords]
    print(tokens_without_sw)

spacy-udpipe with pytextrank to extract keywords from non-English text

I've been using pytextrank (https://github.com/DerwenAI/pytextrank/) with spacy and English models for keywords extraction - it works great!
Now I need to process non-English texts and I found udpipe (https://github.com/TakeLab/spacy-udpipe) but it doesn't work out of the box ... after
# Load the "sk" model through spacy-udpipe.
nlp = spacy_udpipe.load("sk")
tr = pytextrank.TextRank()
# Register the TextRank component under the name "textrank" as the last pipe.
nlp.add_pipe(tr.PipelineComponent, name="textrank", last=True)
# NOTE(review): `text` is not defined in this snippet; it is the input to analyse.
doc = nlp(text)
I get tokens with POS and DEP tags, but there is nothing in doc._.phrases (doc.noun_chunks is also empty) and in nlp.pipe_names is just ['textrank']
What should I add to the spacy's pipeline to get it working? I assume pytextrank needs noun_chunks...
Any tip or suggestion where to look will help me - thanks!
would you mind starting an issue about this on the PyTextRank repo?
https://github.com/DerwenAI/pytextrank/issues
Also, if you could please provide example text to use (in the language requested)
We'll try to debug this integration.
Thanks for pointing it out!
Paco
I found a solution! I'm not sure how clean is the nlp.Defaults.syntax_iterators = {"noun_chunks" : get_chunks}, but it works (it's based on how are the noun_chunks defined in syntax_iterators.py and __init__.py in spaCy/lang/en)
import spacy_udpipe, spacy, pytextrank
from spacy.matcher import Matcher
from spacy.attrs import POS
def get_chunks(doc):
    """Yield ``(start, end, label)`` spans to use as noun chunks of *doc*.

    A chunk is one or more adjectives followed by one or more nouns or
    proper nouns, located with a token ``Matcher``. Every yielded span
    carries the label "NP".
    """
    np_label = doc.vocab.strings.add("NP")
    # Build the matcher on the document's own vocabulary so the function does
    # not depend on a module-level `nlp` object.
    matcher = Matcher(doc.vocab)
    pattern = [{POS: 'ADJ', "OP": "+"}, {POS: {"IN": ["NOUN", "PROPN"]}, "OP": "+"}]
    matcher.add("Adjective(s), (p)noun", None, pattern)
    for _match_id, start, end in matcher(doc):
        yield start, end, np_label
spacy_udpipe.download("sk") # download the "sk" model
nlp = spacy_udpipe.load("sk")
# Register get_chunks as the "noun_chunks" syntax iterator of this pipeline's defaults.
nlp.Defaults.syntax_iterators = {"noun_chunks" : get_chunks} # noun_chunks replacement
tr = pytextrank.TextRank(logger=None)
# Register the TextRank component under the name "textrank" as the last pipe.
nlp.add_pipe(tr.PipelineComponent, name="textrank", last=True)
text = "Wikipédia je webová encyklopédia s otvoreným obsahom, ktorú možno slobodne čítať aj upravovať. Je sponzorovaná neziskovou organizáciou Wikimedia Foundation. Má 285 nezávislých jazykových vydaní vrátane slovenského a najrozsiahlejšieho anglického. Popri článkoch encyklopedického typu obsahuje, najmä anglická encyklopédia, aj články podobajúce sa almanachu, atlasu či stránky aktuálnych udalostí. Wikipédia je jedným z najpopulárnejších zdrojov informácií na webe s približne 13 miliardami zobrazení mesačne. Jej rast je skoro exponenciálny. Wikipédii (takmer 2 milióny). Wikipédia bola spustená 15. januára 2001 ako doplnok k expertmi písanej Nupedii. So stále rastúcou popularitou sa Wikipédia stala podhubím pre sesterské projekty ako Wikislovník (Wiktionary), Wikiknihy (Wikibooks) a Wikisprávy (Wikinews). Jej články sú upravované dobrovoľníkmi vo wiki štýle, čo znamená, že články môže meniť v podstate hocikto. Wikipediáni presadzujú politiku „nestranný uhol pohľadu“. Podľa nej relevantné názory ľudí sú sumarizované bez ambície určiť objektívnu pravdu. Vzhľadom na to, že Wikipédia presadzuje otvorenú filozofiu, jej najväčším problémom je vandalizmus a nepresnosť. "
doc = nlp(text)

# List every noun chunk of the document.
print("Noun chunks:")
for nc in doc.noun_chunks:
    print(nc)

# List every ranked phrase with its rank, its count and its chunks.
print("\nKeywords:")
for phrase in doc._.phrases:
    print("{:.4f} {:5d} {}".format(phrase.rank, phrase.count, phrase.text))
    print(phrase.chunks)

How can I solve an attribute error when using spacy?

I am using spacy for natural language processing in german.
But I am running into this error:
AttributeError: 'str' object has no attribute 'text'
This is the text data I am working with:
tex = ['Wir waren z.B. früher auf\'m Fahrrad unterwegs in München (immer nach 11 Uhr).',
'Nun fahren wir öfter mit der S-Bahn in München herum. Tja. So ist das eben.',
'So bleibt mir nichts anderes übrig als zu sagen, vielen Dank für alles.',
'Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.']
My code:
# Remove straight (") and typographic (“ „) double quotes from every string.
data = [re.sub(r"\"", "", i) for i in tex]
data1 = [re.sub(r"\“", "", i) for i in data]
data2 = [re.sub(r"\„", "", i) for i in data1]
nlp = spacy.load('de')
spacy_doc1 = []
# NOTE(review): the loop body below appears to have lost its indentation in this listing.
for line in data2:
spac = nlp(line)
# lem is a list of plain strings (each token's lemma_), not of spaCy tokens.
lem = [tok.lemma_ for tok in spac]
# tok is a str here, so tok.text is what raises the AttributeError reported above.
no_punct = [tok.text for tok in lem if re.match('\w+', tok.text)]
# Drop the entries that start with a digit.
no_numbers = [tok for tok in no_punct if not re.match('\d+', tok)]
I am writing every string into a separate list, because I need to assign the result of the processing to the original specific string.
I also understand that the result that is written into lem is not in a format anymore that spacy can process.
So how can I do this correctly?
The problem here lies in the fact that SpaCy's token.lemma_ returns a string, and that strings have no text attribute (as the error states).
I suggest doing the same as you did when you wrote:
no_numbers = [tok for tok in no_punct if not re.match('\d+', tok)]
The only difference with this line in your code would be that you'd have to include the special string "-PRON-" in case you encounter English pronouns:
import re
import spacy
# using the web English model for practicality here
nlp = spacy.load('en_core_web_sm')
tex = ['I\'m going to get a cat tomorrow',
'I don\'t know if I\'ll be able to get him a cat house though!']
# Remove straight (") and typographic (“ „) double quotes from every string.
data = [re.sub(r"\"", "", i) for i in tex]
data1 = [re.sub(r"\“", "", i) for i in data]
data2 = [re.sub(r"\„", "", i) for i in data1]
spacy_doc1 = []
for line in data2:
    spac = nlp(line)
    # lemma_ is a plain string, so everything below works on str values.
    lem = [tok.lemma_ for tok in spac]
    # "-PRON-" starts with "-", which \w+ does not match, so it is kept explicitly.
    no_punct = [tok for tok in lem if re.match(r'\w+', tok) or tok in ["-PRON-"]]
    # Drop the lemmas that start with a digit.
    no_numbers = [tok for tok in no_punct if not re.match(r'\d+', tok)]
    print(no_numbers)
# > ['-PRON-', 'be', 'go', 'to', 'get', 'a', 'cat', 'tomorrow']
# > ['-PRON-', 'do', 'not', 'know', 'if', '-PRON-', 'will', 'be', 'able', 'to', 'get', '-PRON-', 'a', 'cat', 'house', 'though']
Please tell me if this solved your problem as I may have misunderstood your issue.

How to extract specific lemma or pos/tag using spacy?

I am using spaCy to lemmatize and parse a list of sentences. The data are contained in an Excel file.
I would like to write a function which allows me to return different lemmas of my sentences.
For example, returning only the lemmas with a specific tag ("VERB", or "VERB" + "ADJ").
This is my code :
import spacy
from spacy.lang.fr import French
from spacy_lefff import LefffLemmatizer, POSTagger
nlp = spacy.load("fr_core_news_sm")
nlp=spacy.load('fr')
parser = French()
path = 'Gold.xlsx'
my_sheet ="Gold"
df = read_excel(path, sheet_name= my_sheet)
# Turn one text into a list of (lower-cased lemma, tag, pos) tuples and filter it
# against `stopwords`.
def tokenizeTexte(sample):
# NOTE(review): this calls `parser` (the French() object created above), not the
# `nlp` model loaded with spacy.load.
tokens = parser(sample)
lemmas = []
for tok in tokens:
lemmas.append((tok.lemma_.lower(), tok.tag_, tok.pos_))
tokens = lemmas
# NOTE(review): each `tok` is a (lemma, tag, pos) tuple at this point, and
# `stopwords` is not defined anywhere in this snippet.
tokens = [tok for tok in tokens if tok not in stopwords]
return tokens
df['Preprocess_verbatim'] = df.apply(lambda row:tokenizeTexte(row['verbatim']), axis=1)
print(df)
df.to_excel('output.xlsx')
I would like to be able to return all lemmas with, for example, a "verb", "adj" or "adv" tag, and then modify the function to return all the lemmas.
I also wish to return different combinations of lemmas ("PRON" + "VERB" + "ADJ").
How can I do that with spaCy?
this is what i obtain with my code
id ... Preprocess_verbatim
0 463 ... [(ce, , ), (concept, , ), (résoudre, , ), (que...
1 2647 ... [(alors, , ), (ça, , ), (vouloir, , ), (dire, ...
2 5391 ... [(ça, , ), (ne, , ), (changer, , ), (rien, , )...
3 1120 ... [(sur, , ), (le, , ), (station, , ), (de, , ),
tok.tag_ and tok.pos_ come out empty. Do you know why?
My file :
example of my data :
id verbatim
14 L'économe originellement est donc celui qui a la responsabilité, pour des personnes d'une maison, d'une unité d'organisation donnée .
25 De leur donner des rations de ressources au temps opportun.
56 Contrairement à l'idée qu'on se fait l'économe n'est pas axé sur le capital, c'est-à-dire sur l'action de capitaliser, mais sur les individus d'une unité organisation, c'est-à-dire sur l'action de partager, de redistribuer d'une façon juste et opportune des ressources aux différents membre
First, I think your model isn't working correctly because you're defining the nlp object twice. I believe you only need it once. I am also not sure what parser is doing and I'm not sure you need it. For this code, I would use something like the following:
nlp = spacy.load("fr_core_news_sm")
doc = nlp(sample)
tokens = [tok for tok in doc]
Then, doc is a spacy Doc object, and tokens is a list of spaCy Token objects. From here, the loop that iterates over your tokens would work.
If you want to do the POS selection in your existing preprocessing function, I think you only need to change one line in your loop:
for tok in tokens:
if tok.pos_ in ("VERB", "ADJ", "ADV"):
lemmas.append((tok.lemma_.lower(), tok.tag_, tok.pos_))
This will only add tokens with those specific parts of speech to your lemmas list.
I also noticed another issue in your code on this line further down:
tokens = [tok for tok in tokens if tok not in stopwords]
At this point tok is your tuple of (lemma, tag, pos), so unless your list of stopwords is tuples of the same format, and not only lemmas or tokens you want to exclude, this step will not exclude anything.
Putting it all together, you'd have something like this, which would return a list of tuples of (lemma, tag, pos) if the POS is correct:
nlp = spacy.load("fr_core_news_sm")
stopwords = ["here", "are", "some", "stopwords"]
def tokenizeTexte(sample):
    """Return the (lemma, tag, pos) tuples of the verbs, adjectives and adverbs in *sample*.

    The text is processed with the module-level ``nlp`` pipeline. Lemmas are
    lower-cased, and tuples whose lemma appears in the module-level
    ``stopwords`` collection are dropped.
    """
    doc = nlp(sample)
    # Iterate over the parsed document itself; `tokens` does not exist yet here.
    lemmas = [
        (tok.lemma_.lower(), tok.tag_, tok.pos_)
        for tok in doc
        if tok.pos_ in ("VERB", "ADJ", "ADV")
    ]
    # Compare only the lemma against the stop words, not the whole tuple.
    tokens = [(lemma, tag, pos) for (lemma, tag, pos) in lemmas if lemma not in stopwords]
    return tokens

Ideas to improve language detection between Spanish and Catalan

I'm working on a text mining script in python. I need to detect the language of a natural language field from the dataset.
The thing is, 98% of the rows are in Spanish and Catalan. I tried using some algorithms like the stopwords one or the langdetect library, but these languages share a lot of words so they fail a lot.
I'm looking for some ideas to improve this algorithm.
One thought is, make a dictionary with some words that are specific to Spanish and Catalan, so if one text has any of these words, it's tagged as that language.
Approach 1: Distinguishing characters
Spanish and Catalan (note: there will be exceptions for proper names and loanwords e.g. Barça):
esp_chars = "ñÑáÁýÝ"
cat_chars = "çÇàÀèÈòÒ·ŀĿ"
Example:
sample_texts = ["El año que es abundante de poesía, suele serlo de hambre.",
"Cal no abandonar mai ni la tasca ni l'esperança."]
for text in sample_texts:
    # Spanish-only characters are checked first, so a text that contains
    # characters from both sets is reported as Spanish.
    if any(char in text for char in esp_chars):
        print("Spanish: {}".format(text))
    elif any(char in text for char in cat_chars):
        print("Catalan: {}".format(text))
    else:
        # No distinguishing character found; report it instead of skipping the text.
        print("Unknown: {}".format(text))
>>> Spanish: El año que es abundante de poesía, suele serlo de hambre.
Catalan: Cal no abandonar mai ni la tasca ni l'esperança.
If this isn't sufficient, you could expand this logic to search for language exclusive digraphs, letter combinations, or words:
Spanish only
Catalan only
Words
como y su con él otro
com i seva amb ell altre
Initial digraphs
d' l'
Digraphs
ss tj qü l·l l.l
Terminal digraphs
ig
Catalan letter combinations that only marginally appear in Spanish
tx
tg          (Es. exceptions postgrado, postgraduado, postguerra)
ny          (Es. exceptions mostly prefixed in-, en-, con- + y-)
ll (terminal) (Es. exceptions (loanwords): detall, nomparell)
Approach 2: googletrans library
You could also use the googletrans library to detect the language:
from googletrans import Translator
translator = Translator()
for text in sample_texts:
    # The detection result exposes the language code as `.lang`.
    lang = translator.detect(text).lang
    print(lang, ":", text)
>>> es : El año que es abundante de poesía, suele serlo de hambre.
ca : Cal no abandonar mai ni la tasca ni l'esperança.
DicCat = ['amb','cap','dalt','damunt','des','dintre','durant','excepte','fins','per','pro','sense','sota','llei','hi','ha','més','mes','moment','órgans', 'segóns','Article','i','per','els','amb','és','com','dels','més','seu','seva','fou','també','però','als','després','aquest','fins','any','són','hi','pel','aquesta','durant','on','part','altres','anys','ciutat','cap','des','seus','tot','estat','qual','segle','quan','ja','havia','molt','rei','nom','fer','així','li','sant','encara','pels','seves','té','partit','està','mateix','pot','nord','temps','fill','només','dues','sota','lloc','això','alguns','govern','uns','aquests','mort','nou','tots','fet','sense','frança','grup','tant','terme','fa','tenir','segons','món','regne','exèrcit','segona','abans','mentre','quals','aquestes','família','catalunya','eren','poden','diferents','nova','molts','església','major','club','estats','seua','diversos','grans','què','arribar','troba','població','poble','foren','època','haver','eleccions','diverses','tipus','riu','dia','quatre','poc','regió','exemple','batalla','altre','espanya','joan','actualment','tenen','dins','llavors','centre','algunes','important','altra','terra','antic','tenia','obres','estava','pare','qui','ara','havien','començar','història','morir','majoria','qui','ara','havien','començar','història','morir','majoria']
DicEsp = ['los','y','bajo','con', 'entre','hacia','hasta','para','por','según','segun','sin','tras','más','mas','ley','capítulo','capitulo','título','titulo','momento','y','las','por','con','su','para','lo','como','más','pero','sus','le','me','sin','este','ya','cuando','todo','esta','son','también','fue','había','muy','años','hasta','desde','está','mi','porque','qué','sólo','yo','hay','vez','puede','todos','así','nos','ni','parte','tiene','él','uno','donde','bien','tiempo','mismo','ese','ahora','otro','después','te','otros','aunque','esa','eso','hace','otra','gobierno','tan','durante','siempre','día','tanto','ella','sí','dijo','sido','según','menos','año','antes','estado','sino','caso','nada','hacer','estaba','poco','estos','presidente','mayor','ante','unos','algo','hacia','casa','ellos','ayer','hecho','mucho','mientras','además','quien','momento','millones','esto','españa','hombre','están','pues','hoy','lugar','madrid','trabajo','otras','mejor','nuevo','decir','algunos','entonces','todas','días','debe','política','cómo','casi','toda','tal','luego','pasado','medio','estas','sea','tenía','nunca','aquí','ver','veces','embargo','partido','personas','grupo','cuenta','pueden','tienen','misma','nueva','cual','fueron','mujer','frente','josé','tras','cosas','fin','ciudad','he','social','tener','será','historia','muchos','juan','tipo','cuatro','dentro','nuestro','punto','dice','ello','cualquier','noche','aún','agua','parece','haber','situación','fuera','bajo','grandes','nuestra','ejemplo','acuerdo','habían','usted','estados','hizo','nadie','países','horas','posible','tarde','ley','importante','desarrollo','proceso','realidad','sentido','lado','mí','tu','cambio','allí','mano','eran','estar','san','número','sociedad','unas','centro','padre','gente','relación','cuerpo','incluso','través','último','madre','mis','modo','problema','cinco','carlos','hombres','información','ojos','muerte','nombre','algunas','público','mujeres','siglo','todavía','meses','mañana','esos','nosotros','ho
ra','muchas','pueblo','alguna','dar','don','da','tú','derecho','verdad','maría','unidos','podría','sería','junto','cabeza','aquel','luis','cuanto','tierra','equipo','segundo','director','dicho','cierto','casos','manos','nivel','podía','familia','largo','falta','llegar','propio','ministro','cosa','primero','seguridad','hemos','mal','trata','algún','tuvo','respecto','semana','varios','real','sé','voz','paso','señor','mil','quienes','proyecto','mercado','mayoría','luz','claro','iba','éste','pesetas','orden','español','buena','quiere','aquella','programa','palabras','internacional','esas','segunda','empresa','puesto','ahí','propia','libro','igual','político','persona','últimos','ellas','total','creo','tengo','dios','española','condiciones','méxico','fuerza','solo','único','acción','amor','policía','puerta','pesar','sabe','calle','interior','tampoco','ningún','vista','campo','buen','hubiera','saber','obras','razón','niños','presencia','tema','dinero','comisión','antonio','servicio','hijo','última','ciento','estoy','hablar','dio','minutos','producción','camino','seis','quién','fondo','dirección','papel','demás','idea','especial','diferentes','dado','base','capital','ambos','europa','libertad','relaciones','espacio','medios','ir','actual','población','empresas','estudio','salud','servicios','haya','principio','siendo','cultura','anterior','alto','media','mediante','primeros','arte','paz','sector','imagen','medida','deben','datos','consejo','personal','interés','julio','grupos','miembros','ninguna','existe','cara','edad','movimiento','visto','llegó','puntos','actividad','bueno','uso','niño','difícil','joven','futuro','aquellos','mes','pronto','soy','hacía','nuevos','nuestros','estaban','posibilidad','sigue','cerca','resultados','educación','atención','gonzález','capacidad','efecto','necesario','valor','aire','investigación','siguiente','figura','central','comunidad','necesidad','serie','organizació','nuevas','calidad']
DicEng = ['all','my','have','do','and', 'or', 'what', 'can', 'you', 'the', 'on', 'it', 'at', 'since', 'for', 'ago', 'before', 'past', 'by', 'next', 'from','with', 'wich','law','is','the','of','and','to','in','is','you','that','it','he','was','for','on','are','as','with','his','they','at','be','this','have','from','or','one','had','by','word','but','not','what','all','were','we','when','your','can','said','there','use','an','each','which','she','do','how','their','if','will','up','other','about','out','many','then','them','these','so','some','her','would','make','like','him','into','time','has','look','two','more','write','go','see','number','no','way','could','people','my','than','first','water','been','call','who','oil','its','now','find','long','down','day','did','get','come','made','may','part','may','part']
def WhichLanguage(text):
    """Guess whether *text* is Catalan, Spanish or English from word hits.

    Every whitespace-separated, lower-cased word of *text* is looked up in the
    module-level word lists ``DicCat``, ``DicEsp`` and ``DicEng``. English wins
    only when it has strictly more hits than both other languages; otherwise
    Catalan wins when it has strictly more hits than Spanish, and Spanish is
    the fallback (ties and texts without any hit included).

    The text, the hit lists and the verdict are printed for inspection.

    Returns:
        ``'English'``, ``'Catala'`` or ``'Espanyol'``.
    """
    import string

    # split() without an argument ignores repeated or leading whitespace, and
    # stripping surrounding punctuation lets "dia," or "¿como" match the lists.
    strip_chars = string.punctuation + "¿¡«»"
    words = [raw.strip(strip_chars) for raw in text.lower().split()]
    words = [word for word in words if word]

    CatScore = [word for word in words if word in DicCat]
    EspScore = [word for word in words if word in DicEsp]
    EngScore = [word for word in words if word in DicEng]

    if len(EngScore) > len(EspScore) and len(EngScore) > len(CatScore):
        Language = 'English'
    elif len(CatScore) > len(EspScore):
        Language = 'Catala'
    else:
        Language = 'Espanyol'

    print(text)
    print("ESP= ", len(EspScore), EspScore)
    print("Cat = ", len(CatScore), CatScore)
    print("ING= ", len(EngScore), EngScore)
    print('Language is =', Language)
    print("-----")
    return Language
print(WhichLanguage("Hola bon dia"))

Categories