Resource punkt not found, but it is downloaded and installed - python

I have the following columns in a dataframe.
Unnamed: 0, title, publication, author, year, month, title.1, content, len_article, gensim_summary, split_words, first_100_words
I am trying to run this small piece of code.
import nltk
nltk.download('punkt')
# TOKENIZE
df.first_100_words = df.first_100_words.str.lower()
df['tokenized_first_100'] = df.first_100_words.apply(lambda x: word_tokenize(x, language = 'en'))
The last line of code throws an error; this is the message I get:
df.first_100_words = df.first_100_words.str.lower()
df['tokenized_first_100'] = df.first_100_words.apply(lambda x: word_tokenize(x, language = 'en'))
Traceback (most recent call last):
File "<ipython-input-129-42381e657774>", line 2, in <module>
df['tokenized_first_100'] = df.first_100_words.apply(lambda x: word_tokenize(x, language = 'en'))
File "C:\Users\ryans\Anaconda3\lib\site-packages\pandas\core\series.py", line 3848, in apply
mapped = lib.map_infer(values, f, convert=convert_dtype)
File "pandas\_libs\lib.pyx", line 2329, in pandas._libs.lib.map_infer
File "<ipython-input-129-42381e657774>", line 2, in <lambda>
df['tokenized_first_100'] = df.first_100_words.apply(lambda x: word_tokenize(x, language = 'en'))
File "C:\Users\ryans\Anaconda3\lib\site-packages\nltk\tokenize\__init__.py", line 144, in word_tokenize
sentences = [text] if preserve_line else sent_tokenize(text, language)
File "C:\Users\ryans\Anaconda3\lib\site-packages\nltk\tokenize\__init__.py", line 105, in sent_tokenize
tokenizer = load('tokenizers/punkt/{0}.pickle'.format(language))
File "C:\Users\ryans\Anaconda3\lib\site-packages\nltk\data.py", line 868, in load
opened_resource = _open(resource_url)
File "C:\Users\ryans\Anaconda3\lib\site-packages\nltk\data.py", line 993, in _open
return find(path_, path + ['']).open()
File "C:\Users\ryans\Anaconda3\lib\site-packages\nltk\data.py", line 701, in find
raise LookupError(resource_not_found)
LookupError:
**********************************************************************
Resource punkt not found.
Please use the NLTK Downloader to obtain the resource:
import nltk
nltk.download('punkt')
For more information see: https://www.nltk.org/data.html
Attempted to load tokenizers/punkt/en.pickle
Searched in:
- 'C:\\Users\\ryans/nltk_data'
- 'C:\\Users\\ryans\\Anaconda3\\nltk_data'
- 'C:\\Users\\ryans\\Anaconda3\\share\\nltk_data'
- 'C:\\Users\\ryans\\Anaconda3\\lib\\nltk_data'
- 'C:\\Users\\ryans\\AppData\\Roaming\\nltk_data'
- 'C:\\nltk_data'
- 'D:\\nltk_data'
- 'E:\\nltk_data'
- ''
**********************************************************************
I'm pretty new to all the tokenization stuff.
The sample code is from this site.
https://github.com/AustinKrause/Mod_5_Text_Summarizer/blob/master/Notebooks/Text_Cleaning_and_KMeans.ipynb

I found this issue and it helped: https://github.com/b0noI/dialog_converter/issues/7
Just add:
nltk.download('punkt')
SENT_DETECTOR = nltk.data.load('tokenizers/punkt/english.pickle')
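A minimal sketch of how this can fit together with the dataframe from the question (df and first_100_words are taken from above). Note that the traceback shows NLTK looking for tokenizers/punkt/en.pickle because language='en' was passed; the English punkt model is stored as english.pickle, so the default language='english' avoids that lookup failure once punkt is downloaded:
import nltk
from nltk.tokenize import word_tokenize

# Make sure the punkt models are available before tokenizing.
nltk.download('punkt')

# The English punkt model is 'english.pickle', so use language='english'
# (the default) rather than 'en'.
df['first_100_words'] = df['first_100_words'].str.lower()
df['tokenized_first_100'] = df['first_100_words'].apply(
    lambda x: word_tokenize(x, language='english')
)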

Related

Hugging Face Spaces failing to build using fast.ai because of Resampling on module PIL.Image

I keep getting the error below when I try to access my model on Hugging Face Spaces. I build the model in a Kaggle notebook, export it to a .pkl file, add it to my Spaces repo, and git push to HF Spaces. Below is the ImageDataLoaders call I am using, as I suspect the error is coming from here.
dls = ImageDataLoaders.from_folder(path, valid_pct = 0.2, item_tfms=Resize(460), batch_tfms=aug_transforms(size=224, min_scale=0.75))
Here is the error I am getting.
Traceback (most recent call last):
File "app.py", line 5, in <module>
learn = load_learner('new_model.pkl')
File "/home/user/.local/lib/python3.8/site-packages/fastai/learner.py", line 428, in load_learner
try: res = torch.load(fname, map_location=map_loc, pickle_module=pickle_module)
File "/home/user/.local/lib/python3.8/site-packages/torch/serialization.py", line 712, in load
return _load(opened_zipfile, map_location, pickle_module, **pickle_load_args)
File "/home/user/.local/lib/python3.8/site-packages/torch/serialization.py", line 1049, in _load
result = unpickler.load()
File "/home/user/.local/lib/python3.8/site-packages/torch/serialization.py", line 1042, in find_class
return super().find_class(mod_name, name)
AttributeError: Custom classes or functions exported with your `Learner` not available in namespace.
Re-declare/import before loading:
Can't get attribute 'Resampling' on <module 'PIL.Image' from '/home/user/.local/lib/python3.8/site-packages/PIL/Image.py'>
Here is my full app.py code.
from fastai.vision.all import *
import gradio as gr
import skimage
learn = load_learner('new_model.pkl')
categories = ('deer', 'elk', 'moose')
def classify_image(img):
    img = PILImage.create(img)
    pred, idx, probs = learn.predict(img)
    return dict(zip(categories, map(float, probs)))
image = gr.inputs.Image(type='pil', shape=(192,192))
label = gr.outputs.Label()
intf = gr.Interface(fn=classify_image, inputs=image, outputs=label)
intf.launch(inline=False)
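One hedged reading of the traceback: PIL.Image.Resampling was only introduced in Pillow 9.1, so if the learner was pickled in an environment with a newer Pillow than the one installed in the Space, unpickling cannot resolve that attribute. A small diagnostic sketch (not part of the original app.py) to check which Pillow the Space is actually running before load_learner is called:
import PIL
import PIL.Image

# If this prints False, the installed Pillow predates 9.1 and has no
# Resampling enum, which would explain the unpickling error above.
print(PIL.__version__)
print(hasattr(PIL.Image, 'Resampling'))
If that is the mismatch, aligning the Pillow version in the Space with the one used in the Kaggle notebook (for example via the Space's requirements.txt) is one way to proceed.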

Python nltk shows an error when I try to identify nouns and verbs

I am trying to identify nouns and verbs in Python. I used the nltk package and it shows a long error in red.
my code:
import nltk
text = 'This is a table. We should table this offer. The table is in the center.'
text = nltk.word_tokenize(text)
result = nltk.pos_tag(text)
result = [i for i in result if i[0].lower() == 'table']
print(result) # it needs to show: [('table', 'JJ'), ('table', 'VB'), ('table', 'NN')]
my error:
Traceback (most recent call last):
File "C:/Users/zivsi/PycharmProjects/AI/a.py", line 5, in <module>
text = nltk.word_tokenize(text)
File "C:\Users\zivsi\AppData\Local\Programs\Python\Python36\lib\site-packages\nltk\tokenize\__init__.py", line 144, in word_tokenize
sentences = [text] if preserve_line else sent_tokenize(text, language)
File "C:\Users\zivsi\AppData\Local\Programs\Python\Python36\lib\site-packages\nltk\tokenize\__init__.py", line 105, in sent_tokenize
tokenizer = load('tokenizers/punkt/{0}.pickle'.format(language))
File "C:\Users\zivsi\AppData\Local\Programs\Python\Python36\lib\site-packages\nltk\data.py", line 868, in load
opened_resource = _open(resource_url)
File "C:\Users\zivsi\AppData\Local\Programs\Python\Python36\lib\site-packages\nltk\data.py", line 993, in _open
return find(path_, path + ['']).open()
File "C:\Users\zivsi\AppData\Local\Programs\Python\Python36\lib\site-packages\nltk\data.py", line 701, in find
raise LookupError(resource_not_found)
LookupError:
**********************************************************************
Resource punkt not found.
Please use the NLTK Downloader to obtain the resource:
import nltk
nltk.download('punkt')
For more information see: https://www.nltk.org/data.html
Attempted to load tokenizers/punkt/english.pickle
Searched in:
- 'C:\\Users\\zivsi/nltk_data'
- 'C:\\Users\\zivsi\\AppData\\Local\\Programs\\Python\\Python36\\nltk_data'
- 'C:\\Users\\zivsi\\AppData\\Local\\Programs\\Python\\Python36\\share\\nltk_data'
- 'C:\\Users\\zivsi\\AppData\\Local\\Programs\\Python\\Python36\\lib\\nltk_data'
- 'C:\\Users\\zivsi\\AppData\\Roaming\\nltk_data'
- 'C:\\nltk_data'
- 'D:\\nltk_data'
- 'E:\\nltk_data'
- ''
**********************************************************************
Can you help me? Or are there other packages that can recognize nouns and verbs?
I needed to write:
import nltk
nltk.download('punkt')
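A minimal end-to-end sketch using the same text as the question. Once punkt is installed, nltk.pos_tag will typically raise a similar LookupError for the 'averaged_perceptron_tagger' resource, so it is worth downloading both:
import nltk

# punkt is needed by word_tokenize; the perceptron tagger is needed by pos_tag.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

text = 'This is a table. We should table this offer. The table is in the center.'
tokens = nltk.word_tokenize(text)
result = [pair for pair in nltk.pos_tag(tokens) if pair[0].lower() == 'table']
print(result)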

Perform stemming on csv file using Pandas

I am new to Python and Pandas and want to perform stemming on a CSV file column named 'Body'. My code is below:
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
answers= pd.read_csv('F:/mtech/Project/answer_sample.csv')
porter_stemmer = PorterStemmer()
#print(answers.head())
#print(answers.loc[0:,"Body"])
df= pd.read_csv('F:/mtech/Project/answer_sample.csv','utf-8')
df['Body'] = df['Body'].str.lower().str.split()
stop = stopwords.words('english')
df['Body']= df['Body'].apply(lambda x: [item for item in x if item not in stop])
df['Body_Tokenized']= df['Body'].apply(lambda x : filter(None,x.split(' ')))
df['Body_Stemmed']= df['Body_Tokenized'].apply(lambda x : [porter_stemmer.stem(y) for y in x])
df.to_csv('F:/mtech/Project/answer_swr_stem.csv')
print("Done..")
I am able to perform stopword removal, but when I try stemming I get the following error:
Traceback (most recent call last):
File "F:\mtech\DATASET\answer_pd.py", line 10, in <module>
df= pd.read_csv('F:/mtech/Project/answer_sample.csv','utf-8')
File "C:\Users\Ayushi Misra\Anaconda2\lib\site-packages\pandas\io\parsers.py", line 646, in parser_f
return _read(filepath_or_buffer, kwds)
File "C:\Users\Ayushi Misra\Anaconda2\lib\site-packages\pandas\io\parsers.py", line 401, in _read
data = parser.read()
File "C:\Users\Ayushi Misra\Anaconda2\lib\site-packages\pandas\io\parsers.py", line 939, in read
ret = self._engine.read(nrows)
File "C:\Users\Ayushi Misra\Anaconda2\lib\site-packages\pandas\io\parsers.py", line 1997, in read
alldata = self._rows_to_cols(content)
File "C:\Users\Ayushi Misra\Anaconda2\lib\site-packages\pandas\io\parsers.py", line 2551, in _rows_to_cols
raise ValueError(msg)
ValueError: Expected 1 fields in line 2853, saw 2. Error could possibly be due to quotes being ignored when a multi-char delimiter is used.
Need help!
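A hedged observation: the traceback points at the read_csv call rather than the stemming line. The second positional argument of pandas.read_csv is sep, so read_csv('...', 'utf-8') treats 'utf-8' as a multi-character delimiter, which matches the "multi-char delimiter" wording in the error. A minimal sketch (file paths and column name taken from the question) that passes the encoding by keyword and keeps the rest of the pipeline; note that 'Body' is already a list of words after str.split(), so it should not be split again before stemming:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

nltk.download('stopwords')  # needed once for stopwords.words('english')

porter_stemmer = PorterStemmer()

# Pass the encoding as a keyword; the second positional argument of
# read_csv is the separator, not the encoding.
df = pd.read_csv('F:/mtech/Project/answer_sample.csv', encoding='utf-8')

stop = set(stopwords.words('english'))

# Lowercase and split 'Body' into words, then remove stopwords and stem.
df['Body'] = df['Body'].str.lower().str.split()
df['Body_Tokenized'] = df['Body'].apply(lambda words: [w for w in words if w not in stop])
df['Body_Stemmed'] = df['Body_Tokenized'].apply(lambda words: [porter_stemmer.stem(w) for w in words])

df.to_csv('F:/mtech/Project/answer_swr_stem.csv')
print("Done..")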

Error using nltk word_tokenize

I am doing some exercises from the NLTK book on accessing text from the web and from disk (chapter 3). When calling word_tokenize I get an error.
This is my code:
>>> import nltk
>>> from urllib.request import urlopen
>>> url = "http://www.gutenberg.org/files/2554/2554.txt"
>>> raw = urlopen(url).read()
>>> tokens = nltk.word_tokenize(raw)
And this is the traceback:
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "C:\Users\u0084411\AppData\Local\Continuum\Anaconda3\lib\site-packages\nltk\tokenize\__init__.py", line 109, in word_tokenize
return [token for sent in sent_tokenize(text, language)
File "C:\Users\u0084411\AppData\Local\Continuum\Anaconda3\lib\site-packages\nltk\tokenize\__init__.py", line 94, in sent_tokenize
return tokenizer.tokenize(text)
File "C:\Users\u0084411\AppData\Local\Continuum\Anaconda3\lib\site-packages\nltk\tokenize\punkt.py", line 1237, in tokenize
return list(self.sentences_from_text(text, realign_boundaries))
File "C:\Users\u0084411\AppData\Local\Continuum\Anaconda3\lib\site-packages\nltk\tokenize\punkt.py", line 1285, in sentences_from_text
return [text[s:e] for s, e in self.span_tokenize(text, realign_boundaries)]
File "C:\Users\u0084411\AppData\Local\Continuum\Anaconda3\lib\site-packages\nltk\tokenize\punkt.py", line 1276, in span_tokenize
return [(sl.start, sl.stop) for sl in slices]
File "C:\Users\u0084411\AppData\Local\Continuum\Anaconda3\lib\site-packages\nltk\tokenize\punkt.py", line 1276, in <listcomp>
return [(sl.start, sl.stop) for sl in slices]
File "C:\Users\u0084411\AppData\Local\Continuum\Anaconda3\lib\site-packages\nltk\tokenize\punkt.py", line 1316, in _realign_boundaries
for sl1, sl2 in _pair_iter(slices):
File "C:\Users\u0084411\AppData\Local\Continuum\Anaconda3\lib\site-packages\nltk\tokenize\punkt.py", line 310, in _pair_iter
prev = next(it)
File "C:\Users\u0084411\AppData\Local\Continuum\Anaconda3\lib\site-packages\nltk\tokenize\punkt.py", line 1289, in _slices_from_text
for match in self._lang_vars.period_context_re().finditer(text):
TypeError: cannot use a string pattern on a bytes-like object
Can someone please explain to me what is going on here and why I cannot seem to use word_tokenize properly?
Many thanks!
You have to convert the downloaded content (which is obtained as a bytes object) into a string using decode('utf-8'):
>>> import nltk
>>> from urllib.request import urlopen
>>> url = "http://www.gutenberg.org/files/2554/2554.txt"
>>> raw = urlopen(url).read()
>>> raw = raw.decode('utf-8')
>>> tokens = nltk.word_tokenize(raw)
I was getting a 404 error for the URL, so I changed it. This works for me; you can change the URL to the one below, and maybe it will work for you as well.
from urllib import request
url = "https://ia803405.us.archive.org/21/items/crimeandpunishme02554gut/2554.txt"
raw = request.urlopen(url).read()
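A small variation on the same fix (a sketch, not taken from either answer): instead of hard-coding 'utf-8', the charset declared in the response headers can be used, falling back to UTF-8 when none is given. The URL is the one from the question; substitute the mirror above if it still returns 404:
import nltk
from urllib.request import urlopen

url = "http://www.gutenberg.org/files/2554/2554.txt"
response = urlopen(url)

# Prefer the charset declared by the server, falling back to UTF-8.
charset = response.headers.get_content_charset() or 'utf-8'
raw = response.read().decode(charset)

tokens = nltk.word_tokenize(raw)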

Use the polyglot package for Named Entity Recognition in Hebrew

I am trying to use the polyglot package for Named Entity Recognition in Hebrew.
This is my code:
# -*- coding: utf8 -*-
import polyglot
from polyglot.text import Text, Word
from polyglot.downloader import downloader
downloader.download("embeddings2.iw")
text = Text(u"in france and in germany")
print(type(text))
text2 = Text(u"נסעתי מירושלים לתל אביב")
print(type(text2))
print(text.entities)
print(text2.entities)
This is the output:
<class 'polyglot.text.Text'>
<class 'polyglot.text.Text'>
[I-LOC([u'france']), I-LOC([u'germany'])]
Traceback (most recent call last):
File "C:/Python27/Lib/site-packages/IPython/core/pyglot.py", line 15, in <module>
print(text2.entities)
File "C:\Python27\lib\site-packages\polyglot\decorators.py", line 20, in __get__
value = obj.__dict__[self.func.__name__] = self.func(obj)
File "C:\Python27\lib\site-packages\polyglot\text.py", line 132, in entities
for i, (w, tag) in enumerate(self.ne_chunker.annotate(self.words)):
File "C:\Python27\lib\site-packages\polyglot\decorators.py", line 20, in __get__
value = obj.__dict__[self.func.__name__] = self.func(obj)
File "C:\Python27\lib\site-packages\polyglot\text.py", line 100, in ne_chunker
return get_ner_tagger(lang=self.language.code)
File "C:\Python27\lib\site-packages\polyglot\decorators.py", line 30, in memoizer
cache[key] = obj(*args, **kwargs)
File "C:\Python27\lib\site-packages\polyglot\tag\base.py", line 191, in get_ner_tagger
return NEChunker(lang=lang)
File "C:\Python27\lib\site-packages\polyglot\tag\base.py", line 104, in __init__
super(NEChunker, self).__init__(lang=lang)
File "C:\Python27\lib\site-packages\polyglot\tag\base.py", line 40, in __init__
self.predictor = self._load_network()
File "C:\Python27\lib\site-packages\polyglot\tag\base.py", line 109, in _load_network
self.embeddings = load_embeddings(self.lang, type='cw', normalize=True)
File "C:\Python27\lib\site-packages\polyglot\decorators.py", line 30, in memoizer
cache[key] = obj(*args, **kwargs)
File "C:\Python27\lib\site-packages\polyglot\load.py", line 61, in load_embeddings
p = locate_resource(src_dir, lang)
File "C:\Python27\lib\site-packages\polyglot\load.py", line 43, in locate_resource
if downloader.status(package_id) != downloader.INSTALLED:
File "C:\Python27\lib\site-packages\polyglot\downloader.py", line 738, in status
info = self._info_or_id(info_or_id)
File "C:\Python27\lib\site-packages\polyglot\downloader.py", line 508, in _info_or_id
return self.info(info_or_id)
File "C:\Python27\lib\site-packages\polyglot\downloader.py", line 934, in info
raise ValueError('Package %r not found in index' % id)
ValueError: Package u'embeddings2.iw' not found in index
The English worked but not the Hebrew.
Whether or not I try to download the package u'embeddings2.iw', I get:
ValueError: Package u'embeddings2.iw' not found in index
I got it!
It seems like a bug to me.
The language detection identified the language as 'iw', which is the former ISO 639 code for Hebrew and was later changed to 'he'.
text2.entities did not recognize the 'iw' code, so I changed it like so:
text2.hint_language_code = 'he'
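Putting the answer together as a sketch. The package names embeddings2.he and ner2.he are assumptions based on the embeddings2.<lang> naming used above and on the NER tagger needing its own model in addition to the embeddings; adjust them if the downloader index uses different names:
# -*- coding: utf8 -*-
from polyglot.text import Text
from polyglot.downloader import downloader

# Assumed package names for the Hebrew ('he') models.
downloader.download("embeddings2.he")
downloader.download("ner2.he")

text2 = Text(u"נסעתי מירושלים לתל אביב")
text2.hint_language_code = 'he'  # override the detected legacy 'iw' code
print(text2.entities)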
