Strange behaviour with nltk sentence tokenizer and special characters - python

I get some strange behavior when using the sent_tokenizer for German text.
Example Code:
import nltk

sent_tokenizer = nltk.data.load('tokenizers/punkt/german.pickle')
for sent in sent_tokenizer.tokenize("Super Qualität. Tolles Teil."):
    print sent
This fails with the error:
Traceback (most recent call last):
for sent in sent_tokenize("Super Qualität. Tolles Teil."):
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/nltk/tokenize/__init__.py", line 82, in sent_tokenize
return tokenizer.tokenize(text)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/nltk/tokenize/punkt.py", line 1270, in tokenize
return list(self.sentences_from_text(text, realign_boundaries))
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/nltk/tokenize/punkt.py", line 1318, in sentences_from_text
return [text[s:e] for s, e in self.span_tokenize(text, realign_boundaries)]
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/nltk/tokenize/punkt.py", line 1309, in span_tokenize
return [(sl.start, sl.stop) for sl in slices]
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/nltk/tokenize/punkt.py", line 1348, in _realign_boundaries
for sl1, sl2 in _pair_iter(slices):
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/nltk/tokenize/punkt.py", line 354, in _pair_iter
prev = next(it)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/nltk/tokenize/punkt.py", line 1324, in _slices_from_text
if self.text_contains_sentbreak(context):
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/nltk/tokenize/punkt.py", line 1369, in text_contains_sentbreak
for t in self._annotate_tokens(self._tokenize_words(text)):
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/nltk/tokenize/punkt.py", line 1504, in _annotate_second_pass
for t1, t2 in _pair_iter(tokens):
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/nltk/tokenize/punkt.py", line 354, in _pair_iter
prev = next(it)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/nltk/tokenize/punkt.py", line 621, in _annotate_first_pass
for aug_tok in tokens:
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/nltk/tokenize/punkt.py", line 586, in _tokenize_words
for line in plaintext.split('\n'):
UnicodeDecodeError: 'ascii' codec can't decode byte 0xc3 in position 6: ordinal not in range(128)
whereas:
sent_tokenizer = nltk.data.load('tokenizers/punkt/german.pickle')
for sent in sent_tokenizer.tokenize("Super Qualität des Produktes. Tolles Teil."):
    print sent
works perfectly.

I found the solution on the nltk homepage:
Caution: when tokenizing a Unicode string, make sure you are not using an encoded version of the string (it may be necessary to decode it first, e.g. with s.decode("utf8")).
So:
text = "Super Qualität. Tolles Teil."
sent_tokenizer = nltk.data.load('tokenizers/punkt/german.pickle')
for sent in sent_tokenizer.tokenize(text.decode('utf8')):
    print sent
works like a charm.
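For what it's worth, on Python 3 string literals are already Unicode, so no explicit decode is needed; when reading from a file, decoding happens at the boundary by giving open() an encoding. A minimal sketch of the same thing on Python 3 (the file name is a placeholder):

import nltk

sent_tokenizer = nltk.data.load('tokenizers/punkt/german.pickle')

# decode at the boundary: open() with an explicit encoding returns str (Unicode)
with open('reviews_de.txt', encoding='utf-8') as f:   # hypothetical input file
    text = f.read()

for sent in sent_tokenizer.tokenize(text):
    print(sent)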

Related

anaconda navigator stuck on loading applications

(base) C:\Windows\System32>anaconda-navigator
2022-10-25 21:01:52,124 - ERROR init.global_exception_logger:19
'utf-8' codec can't decode byte 0xbb in position 0: invalid start byte
Traceback (most recent call last):
File "D:\anaconda3\lib\site-packages\anaconda_navigator\widgets\main_window_init_.py", line 497, in setup
self.post_setup(conda_data=output)
File "D:\anaconda3\lib\site-packages\anaconda_navigator\widgets\main_window_init_.py", line 525, in post_setup
self.tab_home.setup(conda_data)
File "D:\anaconda3\lib\site-packages\anaconda_navigator\widgets\tabs\home.py", line 253, in setup
self.update_applications()
File "D:\anaconda3\lib\site-packages\anaconda_navigator\widgets\tabs\home.py", line 292, in update_applications
self.api.process_apps(self._applications, prefix=self.current_prefix).values(),
File "D:\anaconda3\lib\site-packages\anaconda_navigator\api\anaconda_api.py", line 561, in process_apps
collected_applications: external_apps.ApplicationCollection = external_apps.get_applications(
File "D:\anaconda3\lib\site-packages\anaconda_navigator\api\external_apps_init.py", line 49, in get_applications
apps: typing.Sequence[typing.Union[BaseApp, AppPatch]] = config_utils.load_configuration(context=context)
File "D:\anaconda3\lib\site-packages\anaconda_navigator\api\external_apps\config_utils.py", line 217, in load_configuration
return apply_configuration(
File "D:\anaconda3\lib\site-packages\anaconda_navigator\api\external_apps\config_utils.py", line 198, in apply_configuration
addition: typing.Union[None, base.BaseApp, base.AppPatch] = base.BaseApp.parse_configuration(
File "D:\anaconda3\lib\site-packages\anaconda_navigator\api\external_apps\base.py", line 233, in parse_configuration
return target_cls._parse_configuration( # pylint: disable=protected-access
File "D:\anaconda3\lib\site-packages\anaconda_navigator\api\external_apps\base.py", line 458, in _parse_configuration
result: BaseInstallableApp = BaseInstallableApp(
File "", line 17, in init
self.attrs_post_init()
File "D:\anaconda3\lib\site-packages\anaconda_navigator\api\external_apps\base.py", line 378, in attrs_post_init
for location in self._detector(context=context):
File "D:\anaconda3\lib\site-packages\anaconda_navigator\api\external_apps\bundle\vscode_utils.py", line 58, in call
stdout, _, _ = conda_launch_utils.run_process([application.executable, '--version'])
File "D:\anaconda3\lib\site-packages\anaconda_navigator\utils\conda\launch.py", line 45, in run_process
stdout = ansi_utlils.escape_ansi(raw_stdout.decode())
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xbb in position 0: invalid start byte
Can anyone help me solve this issue?
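The traceback shows Navigator running an external tool's `--version` command and calling raw_stdout.decode(), which assumes UTF-8, while the Windows console likely produced bytes in a local code page (0xbb is '»' in cp1252, for example). As a hedged illustration only, this is the general pattern for decoding subprocess output defensively in your own code; it is not a patch you can apply through Navigator's UI:

import locale
import subprocess

def run_and_decode(cmd):
    # Capture raw bytes, then decode with the console's preferred encoding,
    # replacing anything that still does not fit instead of raising.
    raw = subprocess.run(cmd, capture_output=True).stdout
    encoding = locale.getpreferredencoding(False)  # e.g. 'cp1252' on many Windows setups
    return raw.decode(encoding, errors='replace')

# Hypothetical usage: query a tool's version the way Navigator does
print(run_and_decode(['code', '--version']))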

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xc3 in position 125: invalid continuation byte

I am reading in a file with 50000 rows and values:
2.711569 2.295490 4.141002
...
numpy.loadtxt does a good job of reading them and doing some calculations. But once I call my function:
c = setupc(2, n)  # set up the covariance matrix
with n = 499999,
it leads me to the dispatcher:
def _compile_for_args(self, *args, **kws):
    """
    For internal use. Compile a specialized version of the function
    for the given *args* and *kws*, and return the resulting callable.
    """
    assert not kws

    def error_rewrite(e, issue_type):
        """
        Rewrite and raise Exception `e` with help supplied based on the
        specified issue_type.
        """
        if config.SHOW_HELP:
            help_msg = errors.error_extras[issue_type]
            e.patch_message('\n'.join((str(e).rstrip(), help_msg)))
        if config.FULL_TRACEBACKS:
            raise e
        else:
            reraise(type(e), e, None)

    argtypes = []
    for a in args:
        if isinstance(a, OmittedArg):
            argtypes.append(types.Omitted(a.value))
        else:
            argtypes.append(self.typeof_pyval(a))
    try:
        return self.compile(tuple(argtypes))
and gives the error:
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Anaconda3\lib\runpy.py", line 193, in _run_module_as_main
"__main__", mod_spec)
File "C:\Anaconda3\lib\runpy.py", line 85, in _run_code
exec(code, run_globals)
File "C:\Anaconda3\lib\site-packages\spyder_kernels\console\__main__.py", line 11, in <module>
start.main()
File "C:\Anaconda3\lib\site-packages\spyder_kernels\console\start.py", line 318, in main
kernel.start()
File "C:\Anaconda3\lib\site-packages\ipykernel\kernelapp.py", line 583, in start
self.io_loop.start()
File "C:\Anaconda3\lib\site-packages\tornado\platform\asyncio.py", line 153, in start
self.asyncio_loop.run_forever()
File "C:\Anaconda3\lib\asyncio\base_events.py", line 538, in run_forever
self._run_once()
File "C:\Anaconda3\lib\asyncio\base_events.py", line 1782, in _run_once
handle._run()
File "C:\Anaconda3\lib\asyncio\events.py", line 100, in _run
self._loop.call_exception_handler(context)
File "C:\Anaconda3\lib\asyncio\base_events.py", line 1647, in call_exception_handler
exc_info=True)
File "C:\Anaconda3\lib\logging\__init__.py", line 1407, in error
self._log(ERROR, msg, args, **kwargs)
File "C:\Anaconda3\lib\logging\__init__.py", line 1514, in _log
self.handle(record)
File "C:\Anaconda3\lib\logging\__init__.py", line 1524, in handle
self.callHandlers(record)
File "C:\Anaconda3\lib\logging\__init__.py", line 1594, in callHandlers
lastResort.handle(record)
File "C:\Anaconda3\lib\logging\__init__.py", line 894, in handle
self.emit(record)
File "C:\Anaconda3\lib\logging\__init__.py", line 1033, in emit
self.handleError(record)
File "C:\Anaconda3\lib\logging\__init__.py", line 947, in handleError
traceback.print_exception(t, v, tb, None, sys.stderr)
File "C:\Anaconda3\lib\traceback.py", line 104, in print_exception
type(value), value, tb, limit=limit).format(chain=chain):
File "C:\Anaconda3\lib\traceback.py", line 521, in __init__
self._load_lines()
File "C:\Anaconda3\lib\traceback.py", line 533, in _load_lines
self.__context__._load_lines()
File "C:\Anaconda3\lib\traceback.py", line 533, in _load_lines
self.__context__._load_lines()
File "C:\Anaconda3\lib\traceback.py", line 533, in _load_lines
self.__context__._load_lines()
[Previous line repeated 14 more times]
File "C:\Anaconda3\lib\traceback.py", line 531, in _load_lines
frame.line
File "C:\Anaconda3\lib\traceback.py", line 285, in line
self._line = linecache.getline(self.filename, self.lineno).strip()
File "C:\Anaconda3\lib\linecache.py", line 16, in getline
lines = getlines(filename, module_globals)
File "C:\Anaconda3\lib\linecache.py", line 47, in getlines
return updatecache(filename, module_globals)
File "C:\Anaconda3\lib\linecache.py", line 137, in updatecache
lines = fp.readlines()
File "C:\Anaconda3\lib\codecs.py", line 322, in decode
(result, consumed) = self._buffer_decode(data, self.errors, final)
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xc3 in position 125: invalid continuation byte
I hope somebody can help me.
EDIT:
Function setupc:
def setupc(m, n):
    c = np.zeros((m, m))
    for i in range(1, m+1):
        for j in range(1, m+1):
            # Heaviside
            if (i + j - n - 2 >= 0):
                heav = 1.0
            else:
                heav = 0.0
            c[i-1, j-1] = (
                2.*min(i, j) * (1.+3.*i*j - min(i, j)**2) / (n - min(i, j) + 1)
                + (min(i, j)**2 - min(i, j)**4) / ((n-i+1.)*(n-j+1.))
                + heav * ((n+1.-i-j)**4 - (n+1.-i-j)**2) / ((n-i+1.)*(n-j+1.))
            )
    return c
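The last frames are the telling part: the original Numba error never gets printed, because while formatting the traceback, linecache tries to read a source file that is not valid UTF-8 (byte 0xc3 at position 125) and raises its own UnicodeDecodeError. A hedged diagnostic sketch to locate the offending bytes; 'myscript.py' is a placeholder for whichever file appears in the failing frames, and re-saving that file as UTF-8 (or declaring its real encoding with a coding cookie) is the usual remedy:

# Check whether a suspect file decodes cleanly as UTF-8 and, if not, where it breaks
with open('myscript.py', 'rb') as f:
    data = f.read()
try:
    data.decode('utf-8')
    print('file decodes cleanly as UTF-8')
except UnicodeDecodeError as err:
    print('bad byte %r at offset %d: %s' % (data[err.start:err.end], err.start, err.reason))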

UnicodeDecodeError: 'ascii' codec can't decode byte in Textranking code [duplicate]

This question already has answers here:
How to fix: "UnicodeDecodeError: 'ascii' codec can't decode byte"
(20 answers)
Closed 5 years ago.
When I execute the below code
import networkx as nx
import numpy as np
from nltk.tokenize.punkt import PunktSentenceTokenizer
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer

def textrank(document):
    sentence_tokenizer = PunktSentenceTokenizer()
    sentences = sentence_tokenizer.tokenize(document)
    bow_matrix = CountVectorizer().fit_transform(sentences)
    normalized = TfidfTransformer().fit_transform(bow_matrix)
    similarity_graph = normalized * normalized.T
    nx_graph = nx.from_scipy_sparse_matrix(similarity_graph)
    scores = nx.pagerank(nx_graph)
    return sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)

fp = open("QC")
txt = fp.read()
sents = textrank(txt)
print sents
I get the following error
Traceback (most recent call last):
File "Textrank.py", line 44, in <module>
sents = textrank(txt)
File "Textrank.py", line 10, in textrank
sentences = sentence_tokenizer.tokenize(document)
File "/usr/local/lib/python2.7/dist-packages/nltk/tokenize/punkt.py", line 1237, in tokenize
return list(self.sentences_from_text(text, realign_boundaries))
File "/usr/local/lib/python2.7/dist-packages/nltk/tokenize/punkt.py", line 1285, in sentences_from_text
return [text[s:e] for s, e in self.span_tokenize(text, realign_boundaries)]
File "/usr/local/lib/python2.7/dist-packages/nltk/tokenize/punkt.py", line 1276, in span_tokenize
return [(sl.start, sl.stop) for sl in slices]
File "/usr/local/lib/python2.7/dist-packages/nltk/tokenize/punkt.py", line 1316, in _realign_boundaries
for sl1, sl2 in _pair_iter(slices):
File "/usr/local/lib/python2.7/dist-packages/nltk/tokenize/punkt.py", line 311, in _pair_iter
for el in it:
File "/usr/local/lib/python2.7/dist-packages/nltk/tokenize/punkt.py", line 1291, in _slices_from_text
if self.text_contains_sentbreak(context):
File "/usr/local/lib/python2.7/dist-packages/nltk/tokenize/punkt.py", line 1337, in text_contains_sentbreak
for t in self._annotate_tokens(self._tokenize_words(text)):
File "/usr/local/lib/python2.7/dist-packages/nltk/tokenize/punkt.py", line 1472, in _annotate_second_pass
for t1, t2 in _pair_iter(tokens):
File "/usr/local/lib/python2.7/dist-packages/nltk/tokenize/punkt.py", line 310, in _pair_iter
prev = next(it)
File "/usr/local/lib/python2.7/dist-packages/nltk/tokenize/punkt.py", line 577, in _annotate_first_pass
for aug_tok in tokens:
File "/usr/local/lib/python2.7/dist-packages/nltk/tokenize/punkt.py", line 542, in _tokenize_words
for line in plaintext.split('\n'):
UnicodeDecodeError: 'ascii' codec can't decode byte 0xe2 in position 9: ordinal not in range(128)
I am executing the code on Ubuntu. To get the text, I referred to this website:
https://uwaterloo.ca/institute-for-quantum-computing/quantum-computing-101. I created a file QC (not QC.txt) and copy-pasted the data paragraph by paragraph into the file.
Kindly help me resolve the error.
Thank you.
Please try if the following works for you.
import networkx as nx
import numpy as np
import sys

reload(sys)
sys.setdefaultencoding('utf8')

from nltk.tokenize.punkt import PunktSentenceTokenizer
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer

def textrank(document):
    sentence_tokenizer = PunktSentenceTokenizer()
    sentences = sentence_tokenizer.tokenize(document)
    bow_matrix = CountVectorizer().fit_transform(sentences)
    normalized = TfidfTransformer().fit_transform(bow_matrix)
    similarity_graph = normalized * normalized.T
    nx_graph = nx.from_scipy_sparse_matrix(similarity_graph)
    scores = nx.pagerank(nx_graph)
    return sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)

fp = open("QC")
txt = fp.read()
sents = textrank(txt.encode('utf-8'))
print sents
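The reload(sys)/setdefaultencoding trick changes interpreter-wide behaviour. A gentler alternative, in line with the accepted answer to the first question above, is to decode the file contents once at the point of reading and pass Unicode into textrank(). A minimal Python 2 sketch, assuming the same QC file and the textrank() defined in the question:

import io

# Decode at the boundary: read the file as UTF-8 so that textrank()
# receives a unicode string and Punkt never sees raw bytes.
with io.open("QC", encoding="utf-8") as fp:
    txt = fp.read()

sents = textrank(txt)
print sents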

word_tokenize in nltk not taking a list of string as argument

from nltk.tokenize import word_tokenize
music_comments = [['So cant you just run the bot outside of the US? ', ''], ["Just because it's illegal doesn't mean it will stop. I hope it actually gets enforced. ", ''], ['Can they do something about all the fucking bots on Tinder next? \n\nEdit: Holy crap my inbox just blew up ', '']]
print(word_tokenize(music_comments[1]))
I found another question which says to pass a list of strings to word_tokenize, but in my case, after running the above, I get the following error:
Traceback (most recent call last):
File "testing.py", line 5, in <module>
print(word_tokenize(music_comments[1]))
File "C:\Users\Shraddheya Shendre\Anaconda3\lib\site-packages\nltk\tokenize\__init__.py", line 109, in word_tokenize
return [token for sent in sent_tokenize(text, language)
File "C:\Users\Shraddheya Shendre\Anaconda3\lib\site-packages\nltk\tokenize\__init__.py", line 94, in sent_tokenize
return tokenizer.tokenize(text)
File "C:\Users\Shraddheya Shendre\Anaconda3\lib\site-packages\nltk\tokenize\punkt.py", line 1237, in tokenize
return list(self.sentences_from_text(text, realign_boundaries))
File "C:\Users\Shraddheya Shendre\Anaconda3\lib\site-packages\nltk\tokenize\punkt.py", line 1285, in sentences_from_text
return [text[s:e] for s, e in self.span_tokenize(text, realign_boundaries)]
File "C:\Users\Shraddheya Shendre\Anaconda3\lib\site-packages\nltk\tokenize\punkt.py", line 1276, in span_tokenize
return [(sl.start, sl.stop) for sl in slices]
File "C:\Users\Shraddheya Shendre\Anaconda3\lib\site-packages\nltk\tokenize\punkt.py", line 1276, in <listcomp>
return [(sl.start, sl.stop) for sl in slices]
File "C:\Users\Shraddheya Shendre\Anaconda3\lib\site-packages\nltk\tokenize\punkt.py", line 1316, in _realign_boundaries
for sl1, sl2 in _pair_iter(slices):
File "C:\Users\Shraddheya Shendre\Anaconda3\lib\site-packages\nltk\tokenize\punkt.py", line 310, in _pair_iter
prev = next(it)
File "C:\Users\Shraddheya Shendre\Anaconda3\lib\site-packages\nltk\tokenize\punkt.py", line 1289, in _slices_from_text
for match in self._lang_vars.period_context_re().finditer(text):
TypeError: expected string or bytes-like object
What is the problem? What am I missing?
You are feeding a list with two items into tokenize():
["Just because it's illegal doesn't mean it will stop. I hope it actually gets enforced. ", '']
i.e. the sentence and an empty string.
Changing your code to this should do the trick:
print(word_tokenize(music_comments[1][0]))
def word_tokenize(self, s):
    """Tokenize a string to split off punctuation other than periods"""
    return self._word_tokenizer_re().findall(s)
This is part of the source code for nltk.tokenize.punkt. The input of word_tokenize() should be a string, not a list.
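If the goal is to tokenize every comment rather than just the second one, take the first element of each pair and tokenize that. A short sketch, assuming music_comments as defined in the question:

from nltk.tokenize import word_tokenize

# Each entry is [comment_text, ''], so index 0 is the string to tokenize
tokenized_comments = [word_tokenize(comment[0]) for comment in music_comments]
for tokens in tokenized_comments:
    print(tokens)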

How to do tokenization of text file in format UTF-8 in python

I want to do tokenization and have to create a file containing the tokenized words, without stop words, for sentiment analysis. I am trying with the code below, but it gives an error. The code is:
import nltk
from nltk.collocations import *
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string

stopset = set(stopwords.words('english'))
with open('Grey.txt', 'r') as text_file, open('step3.txt', 'w') as outFile:
    text = text_file.read()
    tokens = word_tokenize(str(text))
    tokens = [w for w in tokens if not w in stopset]
    print(tokens)
    outFile.write(str(tokens))
    outFile.close()
and the error is:
(C:\Users\sama\Anaconda2) C:\Users\sama\Anaconda2\Amazon Project>python sw.py
Traceback (most recent call last):
File "sw.py", line 15, in <module>
tokens=word_tokenize(str(text))
File "C:\Users\sama\Anaconda2\lib\site-packages\nltk\tokenize\__init__.py",
line 109, in word_tokenize
return [token for sent in sent_tokenize(text, language)
File "C:\Users\sama\Anaconda2\lib\site-packages\nltk\tokenize\__init__.py",
line 94, in sent_tokenize
return tokenizer.tokenize(text)
File "C:\Users\sama\Anaconda2\lib\site-packages\nltk\tokenize\punkt.py",
line 1237, in tokenize
return list(self.sentences_from_text(text, realign_boundaries))
File "C:\Users\sama\Anaconda2\lib\site-packages\nltk\tokenize\punkt.py",
line
1285, in sentences_from_text
return [text[s:e] for s, e in self.span_tokenize(text, realign_boundaries)]
File "C:\Users\sama\Anaconda2\lib\site-packages\nltk\tokenize\punkt.py",
line 1276, in span_tokenize
return [(sl.start, sl.stop) for sl in slices]
File "C:\Users\sama\Anaconda2\lib\site-packages\nltk\tokenize\punkt.py",
line
1316, in _realign_boundaries
for sl1, sl2 in _pair_iter(slices):
File "C:\Users\sama\Anaconda2\lib\site-packages\nltk\tokenize\punkt.py",
line 311, in _pair_iter
for el in it:
File "C:\Users\sama\Anaconda2\lib\site-packages\nltk\tokenize\punkt.py",
line 1291, in _slices_from_text
if self.text_contains_sentbreak(context):
File "C:\Users\sama\Anaconda2\lib\site-packages\nltk\tokenize\punkt.py",
line 1337, in text_contains_sentbreak
for t in self._annotate_tokens(self._tokenize_words(text)):
File "C:\Users\sama\Anaconda2\lib\site-packages\nltk\tokenize\punkt.py",
line 1472, in _annotate_second_pass
for t1, t2 in _pair_iter(tokens):
File "C:\Users\sama\Anaconda2\lib\site-packages\nltk\tokenize\punkt.py",
line 310, in _pair_iter
prev = next(it)
File "C:\Users\sama\Anaconda2\lib\site-packages\nltk\tokenize\punkt.py",
line 577, in _annotate_first_pass
for aug_tok in tokens:
File "C:\Users\sama\Anaconda2\lib\site-packages\nltk\tokenize\punkt.py",
line 542, in _tokenize_words
for line in plaintext.split('\n'):
UnicodeDecodeError: 'ascii' codec can't decode byte 0xe2 in position 12:
ordinal not in range(128)
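This is the same ASCII-decode failure as in the earlier questions, so the same remedy applies: decode the file to Unicode before handing it to word_tokenize. A minimal, hedged Python 2 sketch using the file names from the question; it writes the remaining tokens space-separated rather than as a Python list literal:

import io
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

stopset = set(stopwords.words('english'))

# Read the input as UTF-8 so the tokenizer receives unicode, not raw bytes
with io.open('Grey.txt', encoding='utf-8') as text_file, \
     io.open('step3.txt', 'w', encoding='utf-8') as out_file:
    text = text_file.read()
    tokens = [w for w in word_tokenize(text) if w not in stopset]
    out_file.write(u' '.join(tokens))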
