UnicodeDecodeError: 'ascii' codec can't decode byte in Textranking code [duplicate] - python

This question already has answers here:
How to fix: "UnicodeDecodeError: 'ascii' codec can't decode byte"
(20 answers)
Closed 5 years ago.
When I execute the code below
import networkx as nx
import numpy as np
from nltk.tokenize.punkt import PunktSentenceTokenizer
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
def textrank(document):
    sentence_tokenizer = PunktSentenceTokenizer()
    sentences = sentence_tokenizer.tokenize(document)
    bow_matrix = CountVectorizer().fit_transform(sentences)
    normalized = TfidfTransformer().fit_transform(bow_matrix)
    similarity_graph = normalized * normalized.T
    nx_graph = nx.from_scipy_sparse_matrix(similarity_graph)
    scores = nx.pagerank(nx_graph)
    return sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
fp = open("QC")
txt = fp.read()
sents = textrank(txt)
print sents
I get the following error
Traceback (most recent call last):
File "Textrank.py", line 44, in <module>
sents = textrank(txt)
File "Textrank.py", line 10, in textrank
sentences = sentence_tokenizer.tokenize(document)
File "/usr/local/lib/python2.7/dist-packages/nltk/tokenize/punkt.py", line 1237, in tokenize
return list(self.sentences_from_text(text, realign_boundaries))
File "/usr/local/lib/python2.7/dist-packages/nltk/tokenize/punkt.py", line 1285, in sentences_from_text
return [text[s:e] for s, e in self.span_tokenize(text, realign_boundaries)]
File "/usr/local/lib/python2.7/dist-packages/nltk/tokenize/punkt.py", line 1276, in span_tokenize
return [(sl.start, sl.stop) for sl in slices]
File "/usr/local/lib/python2.7/dist-packages/nltk/tokenize/punkt.py", line 1316, in _realign_boundaries
for sl1, sl2 in _pair_iter(slices):
File "/usr/local/lib/python2.7/dist-packages/nltk/tokenize/punkt.py", line 311, in _pair_iter
for el in it:
File "/usr/local/lib/python2.7/dist-packages/nltk/tokenize/punkt.py", line 1291, in _slices_from_text
if self.text_contains_sentbreak(context):
File "/usr/local/lib/python2.7/dist-packages/nltk/tokenize/punkt.py", line 1337, in text_contains_sentbreak
for t in self._annotate_tokens(self._tokenize_words(text)):
File "/usr/local/lib/python2.7/dist-packages/nltk/tokenize/punkt.py", line 1472, in _annotate_second_pass
for t1, t2 in _pair_iter(tokens):
File "/usr/local/lib/python2.7/dist-packages/nltk/tokenize/punkt.py", line 310, in _pair_iter
prev = next(it)
File "/usr/local/lib/python2.7/dist-packages/nltk/tokenize/punkt.py", line 577, in _annotate_first_pass
for aug_tok in tokens:
File "/usr/local/lib/python2.7/dist-packages/nltk/tokenize/punkt.py", line 542, in _tokenize_words
for line in plaintext.split('\n'):
UnicodeDecodeError: 'ascii' codec can't decode byte 0xe2 in position 9: ordinal not in range(128)
I am executing the code on Ubuntu. To get the text, I referred to this website:
https://uwaterloo.ca/institute-for-quantum-computing/quantum-computing-101. I created a file named QC (not QC.txt) and copy-pasted the data paragraph by paragraph into the file.
Kindly help me resolve the error.
Thank You

Please see if the following works for you.
import networkx as nx
import numpy as np
import sys
reload(sys)
sys.setdefaultencoding('utf8')
from nltk.tokenize.punkt import PunktSentenceTokenizer
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
def textrank(document):
    sentence_tokenizer = PunktSentenceTokenizer()
    sentences = sentence_tokenizer.tokenize(document)
    bow_matrix = CountVectorizer().fit_transform(sentences)
    normalized = TfidfTransformer().fit_transform(bow_matrix)
    similarity_graph = normalized * normalized.T
    nx_graph = nx.from_scipy_sparse_matrix(similarity_graph)
    scores = nx.pagerank(nx_graph)
    return sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
fp = open("QC")
txt = fp.read()
sents = textrank(txt.encode('utf-8'))
print sents
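As an alternative (a minimal sketch, not from the original answer, and assuming the QC file is saved as UTF-8): decode the file contents to unicode explicitly when reading, instead of changing the interpreter's default encoding, and pass the decoded text straight to textrank.
import codecs

# Sketch: read the file as UTF-8 so textrank() receives a unicode object
# (assumes QC is UTF-8 encoded; adjust the encoding if it is not).
with codecs.open("QC", encoding="utf-8") as fp:
    txt = fp.read()
sents = textrank(txt)
print sents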

Related

utf-8 encoding error in seq2seq model

Hi, I'm working on language translation using Keras. I have a text file with English text and a file with Hindi text.
I'm facing a "UnicodeDecodeError", and I believe it may be because the code is unable to convert non-Unicode input to Unicode.
Please let me know how to go about it. The GitHub link is below:
https://github.com/shashankg7/Seq2Seq/tree/master/seq2seq
Code Snippet:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from collections import defaultdict
import codecs
import pdb
import numpy as np
#from utils import preprocess_text, text2seq_generator

def preprocess_text(file_path_src, file_path_tar, max_feats):
    f_src = open(file_path_src)
    f_tar = open(file_path_tar)
    vocab = defaultdict(int)
    freq_src = defaultdict(int)
    freq_tar = defaultdict(int)
    sents_src = [line.rstrip() for line in f_src.readlines()]
    sents_tar = [line.rstrip() for line in f_tar.readlines()]

def preprocess(self):
    # Preprocessing source and target text sequence files
    self.vocab_src, self.vocab_tar, self.sents_src, self.sents_tar = \
        preprocess_text(self.path_src, self.path_tar, self.max_feat)

if __name__ == "__main__":
    pre = preprocess('C:\\Users\\anagha\\Desktop\\Language-Translation\\Seq2Seq-master\\Seq2Seq-master\\seq2seq\\training.hi-en.hi', 'C:\\Users\\anagha\\Desktop\\Language-Translation\\Seq2Seq-master\\Seq2Seq-master\\seq2seq\\training.hi-en.en', 5500, 15)
    pre.preprocess()
    for e in range(1):
        print("epoch no %d" % e)
        for X, Y in pre.gen_batch():
            print(X)
Error:
Using TensorFlow backend.
Traceback (most recent call last):
File "C:\Users\anagha\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2898, in run_code
self.showtraceback()
File "C:\Users\anagha\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 1807, in showtraceback
self.showsyntaxerror(filename)
File "C:\Users\anagha\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 1864, in showsyntaxerror
stb = self.SyntaxTB.structured_traceback(etype, value, [])
File "C:\Users\anagha\Anaconda3\lib\site-packages\IPython\core\ultratb.py", line 1441, in structured_traceback
newtext = ulinecache.getline(value.filename, value.lineno)
File "C:\Users\anagha\Anaconda3\lib\linecache.py", line 16, in getline
lines = getlines(filename, module_globals)
File "C:\Users\anagha\Anaconda3\lib\linecache.py", line 47, in getlines
return updatecache(filename, module_globals)
File "C:\Users\anagha\Anaconda3\lib\linecache.py", line 137, in updatecache
lines = fp.readlines()
File "C:\Users\anagha\Anaconda3\lib\codecs.py", line 321, in decode
(result, consumed) = self._buffer_decode(data, self.errors, final)
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xa1 in position 7588: invalid start byte
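One common way to tackle this kind of decode error (a hedged sketch, not part of the original post; the data files' actual encoding is an assumption here) is to open the data files with an explicit encoding inside preprocess_text, so that readlines() returns unicode strings:
import codecs

# Sketch: open both files with an explicit encoding (assumed UTF-8 here);
# if the Hindi file uses a different encoding, pass that instead.
f_src = codecs.open(file_path_src, encoding='utf-8')
f_tar = codecs.open(file_path_tar, encoding='utf-8')
sents_src = [line.rstrip() for line in f_src.readlines()]
sents_tar = [line.rstrip() for line in f_tar.readlines()]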

How to do tokenization of a text file in UTF-8 format in Python

I want to do tokenization and create a file containing the tokenized words, with stop words removed, for sentiment analysis. I am trying the code below, but it gives an error. The code is:
import nltk
from nltk.collocations import *
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
stopset = set(stopwords.words('english'))
with open('Grey.txt', 'r') as text_file, open('step3.txt', 'w') as outFile:
    text = text_file.read()
    tokens = word_tokenize(str(text))
    tokens = [w for w in tokens if not w in stopset]
    print(tokens)
    outFile.write(str(tokens))
    outFile.close()
and the error is:
(C:\Users\sama\Anaconda2) C:\Users\sama\Anaconda2\Amazon Project>python sw.py
Traceback (most recent call last):
File "sw.py", line 15, in <module>
tokens=word_tokenize(str(text))
File "C:\Users\sama\Anaconda2\lib\site-packages\nltk\tokenize\__init__.py",
line 109, in word_tokenize
return [token for sent in sent_tokenize(text, language)
File "C:\Users\sama\Anaconda2\lib\site-packages\nltk\tokenize\__init__.py",
line 94, in sent_tokenize
return tokenizer.tokenize(text)
File "C:\Users\sama\Anaconda2\lib\site-packages\nltk\tokenize\punkt.py",
line 1237, in tokenize
return list(self.sentences_from_text(text, realign_boundaries))
File "C:\Users\sama\Anaconda2\lib\site-packages\nltk\tokenize\punkt.py",
line
1285, in sentences_from_text
return [text[s:e] for s, e in self.span_tokenize(text, realign_boundaries)]
File "C:\Users\sama\Anaconda2\lib\site-packages\nltk\tokenize\punkt.py",
line 1276, in span_tokenize
return [(sl.start, sl.stop) for sl in slices]
File "C:\Users\sama\Anaconda2\lib\site-packages\nltk\tokenize\punkt.py",
line
1316, in _realign_boundaries
for sl1, sl2 in _pair_iter(slices):
File "C:\Users\sama\Anaconda2\lib\site-packages\nltk\tokenize\punkt.py",
line 311, in _pair_iter
for el in it:
File "C:\Users\sama\Anaconda2\lib\site-packages\nltk\tokenize\punkt.py",
line 1291, in _slices_from_text
if self.text_contains_sentbreak(context):
File "C:\Users\sama\Anaconda2\lib\site-packages\nltk\tokenize\punkt.py",
line 1337, in text_contains_sentbreak
for t in self._annotate_tokens(self._tokenize_words(text)):
File "C:\Users\sama\Anaconda2\lib\site-packages\nltk\tokenize\punkt.py",
line 1472, in _annotate_second_pass
for t1, t2 in _pair_iter(tokens):
File "C:\Users\sama\Anaconda2\lib\site-packages\nltk\tokenize\punkt.py",
line 310, in _pair_iter
prev = next(it)
File "C:\Users\sama\Anaconda2\lib\site-packages\nltk\tokenize\punkt.py",
line 577, in _annotate_first_pass
for aug_tok in tokens:
File "C:\Users\sama\Anaconda2\lib\site-packages\nltk\tokenize\punkt.py",
line 542, in _tokenize_words
for line in plaintext.split('\n'):
UnicodeDecodeError: 'ascii' codec can't decode byte 0xe2 in position 12: ordinal not in range(128)
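A hedged sketch of one likely fix (not from the original post; it assumes Grey.txt is UTF-8 encoded): read the file with an explicit encoding so that word_tokenize receives unicode rather than raw bytes, and drop the str() conversion.
import io
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

stopset = set(stopwords.words('english'))

# Sketch: io.open decodes while reading (assumes Grey.txt is UTF-8).
with io.open('Grey.txt', 'r', encoding='utf-8') as text_file, \
        io.open('step3.txt', 'w', encoding='utf-8') as outFile:
    text = text_file.read()
    tokens = word_tokenize(text)
    tokens = [w for w in tokens if w not in stopset]
    outFile.write(u' '.join(tokens))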

Error using nltk word_tokenize

I am doing some exercises from the NLTK book on accessing text from the web and from disk (chapter 3). When calling word_tokenize I get an error.
This is my code:
>>> import nltk
>>> from urllib.request import urlopen
>>> url = "http://www.gutenberg.org/files/2554/2554.txt"
>>> raw = urlopen(url).read()
>>> tokens = nltk.word_tokenize(raw)
And this is the traceback:
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "C:\Users\u0084411\AppData\Local\Continuum\Anaconda3\lib\site-packages\nltk\tokenize\__init__.py", line 109, in word_tokenize
return [token for sent in sent_tokenize(text, language)
File "C:\Users\u0084411\AppData\Local\Continuum\Anaconda3\lib\site-packages\nltk\tokenize\__init__.py", line 94, in sent_tokenize
return tokenizer.tokenize(text)
File "C:\Users\u0084411\AppData\Local\Continuum\Anaconda3\lib\site-packages\nltk\tokenize\punkt.py", line 1237, in tokenize
return list(self.sentences_from_text(text, realign_boundaries))
File "C:\Users\u0084411\AppData\Local\Continuum\Anaconda3\lib\site-packages\nltk\tokenize\punkt.py", line 1285, in sentences_from_text
return [text[s:e] for s, e in self.span_tokenize(text, realign_boundaries)]
File "C:\Users\u0084411\AppData\Local\Continuum\Anaconda3\lib\site-packages\nltk\tokenize\punkt.py", line 1276, in span_tokenize
return [(sl.start, sl.stop) for sl in slices]
File "C:\Users\u0084411\AppData\Local\Continuum\Anaconda3\lib\site-packages\nltk\tokenize\punkt.py", line 1276, in <listcomp>
return [(sl.start, sl.stop) for sl in slices]
File "C:\Users\u0084411\AppData\Local\Continuum\Anaconda3\lib\site-packages\nltk\tokenize\punkt.py", line 1316, in _realign_boundaries
for sl1, sl2 in _pair_iter(slices):
File "C:\Users\u0084411\AppData\Local\Continuum\Anaconda3\lib\site-packages\nltk\tokenize\punkt.py", line 310, in _pair_iter
prev = next(it)
File "C:\Users\u0084411\AppData\Local\Continuum\Anaconda3\lib\site-packages\nltk\tokenize\punkt.py", line 1289, in _slices_from_text
for match in self._lang_vars.period_context_re().finditer(text):
TypeError: cannot use a string pattern on a bytes-like object
Can someone please explain to me what is going on here and why I cannot seem to use word_tokenize properly?
Many thanks!
You have to convert the raw content (which is obtained as a bytes object) into a string using decode('utf-8'):
>>> import nltk
>>> from urllib.request import urlopen
>>> url = "http://www.gutenberg.org/files/2554/2554.txt"
>>> raw = urlopen(url).read()
>>> raw = raw.decode('utf-8')
>>> tokens = nltk.word_tokenize(raw)
I was getting a 404 error for that URL, so I changed it. This works for me; you can change the URL to the one below. Maybe it works for you as well.
from urllib import request
url = "https://ia803405.us.archive.org/21/items/crimeandpunishme02554gut/2554.txt"
raw = request.urlopen(url).read()

Strange behaviour with nltk sentence tokenizer and special characters

I get some strange behavior when using the sent_tokenizer for German text.
Example Code:
sent_tokenizer = nltk.data.load('tokenizers/punkt/german.pickle')
for sent in sent_tokenizer.tokenize("Super Qualität. Tolles Teil."):
    print sent
This fails with the error:
Traceback (most recent call last):
for sent in sent_tokenize("Super Qualität. Tolles Teil."):
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/nltk/tokenize/__init__.py", line 82, in sent_tokenize
return tokenizer.tokenize(text)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/nltk/tokenize/punkt.py", line 1270, in tokenize
return list(self.sentences_from_text(text, realign_boundaries))
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/nltk/tokenize/punkt.py", line 1318, in sentences_from_text
return [text[s:e] for s, e in self.span_tokenize(text, realign_boundaries)]
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/nltk/tokenize/punkt.py", line 1309, in span_tokenize
return [(sl.start, sl.stop) for sl in slices]
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/nltk/tokenize/punkt.py", line 1348, in _realign_boundaries
for sl1, sl2 in _pair_iter(slices):
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/nltk/tokenize/punkt.py", line 354, in _pair_iter
prev = next(it)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/nltk/tokenize/punkt.py", line 1324, in _slices_from_text
if self.text_contains_sentbreak(context):
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/nltk/tokenize/punkt.py", line 1369, in text_contains_sentbreak
for t in self._annotate_tokens(self._tokenize_words(text)):
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/nltk/tokenize/punkt.py", line 1504, in _annotate_second_pass
for t1, t2 in _pair_iter(tokens):
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/nltk/tokenize/punkt.py", line 354, in _pair_iter
prev = next(it)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/nltk/tokenize/punkt.py", line 621, in _annotate_first_pass
for aug_tok in tokens:
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/nltk/tokenize/punkt.py", line 586, in _tokenize_words
for line in plaintext.split('\n'):
UnicodeDecodeError: 'ascii' codec can't decode byte 0xc3 in position 6: ordinal not in range(128)
whereas:
sent_tokenizer = nltk.data.load('tokenizers/punkt/german.pickle')
for sent in sent_tokenizer.tokenize("Super Qualität des Produktes. Tolles Teil."):
    print sent
works perfectly
I found the solution on the nltk homepage.
Caution: when tokenizing a Unicode string, make sure you are not using
an encoded version of the string (it may be necessary to decode it
first, e.g. with s.decode("utf8")).
So
text = "Super Qualität. Tolles Teil."
sent_tokenizer = nltk.data.load('tokenizers/punkt/german.pickle')
for sent in sent_tokenizer.tokenize(text.decode('utf8')):
    print sent
works like a charm.

How to fix such ClientForm bug?

from mechanize import Browser
br = Browser()
page = br.open('http://wow.interzet.ru/news.php?readmore=23')
br.form = br.forms().next()
print br.form
gives me the following error:
Traceback (most recent call last):
File "C:\Users\roddik\Desktop\mech.py", line 6, in <module>
br.form = br.forms().next()
File "build\bdist.win32\egg\mechanize\_mechanize.py", line 426, in forms
File "D:\py26\lib\site-package\mechanize-0.1.11-py2.6.egg\mechanize\_html.py", line 559, in forms
File "D:\py26\lib\site-packages\mechanize-0.1.11-py2.6.egg\mechanize\_html.py", line 225, in forms
File "D:\py26\lib\site-packages\clientform-0.2.10-py2.6.egg\ClientForm.py", line 967, in ParseResponseEx
File "D:\py26\lib\site-packages\clientform-0.2.10-py2.6.egg\ClientForm.py", line 1100, in _ParseFileEx
File "D:\py26\lib\site-packages\clientform-0.2.10-py2.6.egg\ClientForm.py", line 870, in feed
File "D:\py26\lib\sgmllib.py", line 104, in feed
self.goahead(0)
File "D:\py26\lib\sgmllib.py", line 138, in goahead
k = self.parse_starttag(i)
File "D:\py26\lib\sgmllib.py", line 290, in parse_starttag
self._convert_ref, attrvalue)
File "D:\py26\lib\sgmllib.py", line 302, in _convert_ref
return self.convert_charref(match.group(2)) or \
File "D:\py26\lib\site-packages\clientform-0.2.10-py2.6.egg\ClientForm.py", line 850, in convert_charref
File "D:\py26\lib\site-packages\clientform-0.2.10-py2.6.egg\ClientForm.py", line 244, in unescape_charref
ValueError: invalid literal for int() with base 10: 'e'
How can I fix it?
Edit:
I've fixed it this way. Is this OK? If not, how should I do it instead?
import ClientForm
from mechanize import Browser
def myunescape_charref(data, encoding):
    if not str(data).isdigit(): return 0
    name, base = data, 10
    if name.startswith("x"):
        name, base = name[1:], 16
    uc = unichr(int(name, base))
    if encoding is None:
        return uc
    else:
        try:
            repl = uc.encode(encoding)
        except UnicodeError:
            repl = "&#%s;" % data
        return repl
ClientForm.unescape_charref = myunescape_charref
The problem is caused by URLs like this:
http://wow.zet/forum/index.php?showtopic=1197&pid=30419&st=0&#entry30419
ClientForm is looking for an integer after the &#.
It is OK to have the # in the URL, but it should be escaped in the HTML,
because &# marks a character reference.
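To illustrate the failure (a small sketch, not part of the original answer): the fragment after the unescaped & looks like a numeric character reference starting with the letter e, so ClientForm ends up calling int() on it, which raises exactly the ValueError shown above.
# "&#entry30419" is read as a character reference whose "number" is 'e...':
int('e', 10)   # ValueError: invalid literal for int() with base 10: 'e'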
