utf-8 encoding error in seq2seq model - python

Hi, I'm working on language translation using Keras. I have a text file with English text and a file with Hindi text.
I'm facing a UnicodeDecodeError, and I believe it may be because the code is unable to convert non-Unicode text to Unicode.
Please let me know how to go about it. The GitHub link is below:
https://github.com/shashankg7/Seq2Seq/tree/master/seq2seq
Code Snippet:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from collections import defaultdict  # needed for the defaultdicts below
import codecs
import pdb
import numpy as np
#from utils import preprocess_text, text2seq_generator

def preprocess_text(file_path_src, file_path_tar, max_feats):
    f_src = open(file_path_src)  # note: opens with the platform default encoding
    f_tar = open(file_path_tar)
    vocab = defaultdict(int)
    freq_src = defaultdict(int)
    freq_tar = defaultdict(int)
    sents_src = [line.rstrip() for line in f_src.readlines()]
    sents_tar = [line.rstrip() for line in f_tar.readlines()]

# excerpt: a method of the preprocessing class in the linked repo (note the self parameter)
def preprocess(self):
    # Preprocessing source and target text sequence files
    self.vocab_src, self.vocab_tar, self.sents_src, self.sents_tar = \
        preprocess_text(self.path_src, self.path_tar, self.max_feat)

if __name__ == "__main__":
    pre = preprocess('C:\\Users\\anagha\\Desktop\\Language-Translation\\Seq2Seq-master\\Seq2Seq-master\\seq2seq\\training.hi-en.hi',
                     'C:\\Users\\anagha\\Desktop\\Language-Translation\\Seq2Seq-master\\Seq2Seq-master\\seq2seq\\training.hi-en.en',
                     5500, 15)
    pre.preprocess()
    for e in range(1):
        print("epoch no %d" % e)
        for X, Y in pre.gen_batch():
            print(X)
Error:
Using TensorFlow backend.
Traceback (most recent call last):
File "C:\Users\anagha\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2898, in run_code
self.showtraceback()
File "C:\Users\anagha\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 1807, in showtraceback
self.showsyntaxerror(filename)
File "C:\Users\anagha\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 1864, in showsyntaxerror
stb = self.SyntaxTB.structured_traceback(etype, value, [])
File "C:\Users\anagha\Anaconda3\lib\site-packages\IPython\core\ultratb.py", line 1441, in structured_traceback
newtext = ulinecache.getline(value.filename, value.lineno)
File "C:\Users\anagha\Anaconda3\lib\linecache.py", line 16, in getline
lines = getlines(filename, module_globals)
File "C:\Users\anagha\Anaconda3\lib\linecache.py", line 47, in getlines
return updatecache(filename, module_globals)
File "C:\Users\anagha\Anaconda3\lib\linecache.py", line 137, in updatecache
lines = fp.readlines()
File "C:\Users\anagha\Anaconda3\lib\codecs.py", line 321, in decode
(result, consumed) = self._buffer_decode(data, self.errors, final)
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xa1 in position 7588: invalid start byte
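A hedged starting point (not code from the linked repo): byte 0xa1 can never start a valid UTF-8 sequence, so whatever file is being read here is not UTF-8. Note that the visible traceback is actually IPython failing while re-reading a source file to display an earlier error. Opening the corpora with an explicit encoding and a forgiving error handler usually isolates the problem:
import io

# Sketch only, assuming the corpus files are meant to be UTF-8; errors='replace'
# substitutes U+FFFD for undecodable bytes instead of raising UnicodeDecodeError.
def read_sents(path):
    with io.open(path, encoding='utf-8', errors='replace') as f:
        return [line.rstrip() for line in f]

sents_src = read_sents('training.hi-en.en')  # paths shortened from the question
sents_tar = read_sents('training.hi-en.hi')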

Related

Keep getting Unicode Error with Streamlit

I am trying to use OpenAI and Streamlit together to merge them into a little dummy website. Here is the code:
save_folder = "files"
if not os.path.exists(save_folder):
os.makedirs(save_folder)
uploaded_files = st.file_uploader("Choose a file", accept_multiple_files=True)
for uploaded_file in uploaded_files:
bytes_data = uploaded_file.read()
s = bytes_data.decode("UTF-8")
with open(f"{save_folder}/{uploaded_file.name}", "w") as f:
f.write(s)
SimpleDirectoryReader = download_loader("SimpleDirectoryReader")
loader = SimpleDirectoryReader(save_folder)
documents = loader.load_data()
index = GPTSimpleVectorIndex(documents)
index.save_to_disk('index.json')
question = st.text_input("What do you want me to do with the file uploaded?")
response = index.query(question)
st.write(response)
I keep getting the same error:
File "/home/appuser/venv/lib/python3.9/site-packages/streamlit/runtime/scriptrunner/script_runner.py", line 565, in _run_script
exec(code, module.__dict__)
File "/app/indextest/streamlit_app.py", line 17, in <module>
s = bytes_data.decode("UTF-8")
UnicodeDecodeError: 'utf-8' codec can't decode bytes in position 14-15: invalid continuation byte
Anyone know why?
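A hedged guess at the cause: st.file_uploader hands back raw bytes, and any upload that is not UTF-8 text (a PDF, an image, a Word document) will fail at bytes_data.decode("UTF-8"). A minimal sketch that sidesteps the decode by saving the bytes unchanged:
# Sketch only: persist the upload without assuming it is UTF-8 text.
bytes_data = uploaded_file.read()
with open(f"{save_folder}/{uploaded_file.name}", "wb") as f:  # "wb" writes raw bytes
    f.write(bytes_data)

# If you genuinely need a string, decode defensively:
s = bytes_data.decode("utf-8", errors="replace")  # undecodable bytes become U+FFFD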

Export fasttext vectors (korean) from fastText to spacy (UnicodeDecodeError)

Hi everyone, I downloaded the Korean fastText model from FastText Korean Model and tried to export it to spaCy using this code:
#!/usr/bin/env python
# coding: utf8
from __future__ import unicode_literals

import plac
import numpy
import spacy
from spacy.language import Language

#plac.annotations()
def main():
    nlp = spacy.blank('ko')
    with open("ko.vec", 'rb') as file_:
        header = file_.readline()
        nr_row, nr_dim = header.split()
        nlp.vocab.reset_vectors(width=int(nr_dim))
        count = 0
        for line in file_:
            count += 1
            line = line.rstrip().decode("utf-8")
            pieces = line.rsplit(' ', int(nr_dim))
            word = pieces[0]
            print("{} - {}".format(count, word))
            vector = numpy.asarray([float(v) for v in pieces[1:]], dtype='f')
            nlp.vocab.set_vector(word, vector)  # add the vectors to the vocab
    nlp.to_disk("/models/new_nlp/")

if __name__ == '__main__':
    plac.call(main)
I got this code from this answered question on Stack Overflow:
Export FastText from fasttext to spacy
But after executing it I got this error at the end:
Traceback (most recent call last):
File "C:\Users\User\fasttexttospacy\fasttexttospacy.py", line 31, in <module>
plac.call(main)
File "C:\Users\User\anaconda3\envs\fasttexttospacy\lib\site-packages\plac_core.py", line 436, in call
cmd, result = parser.consume(arglist)
File "C:\Users\User\anaconda3\envs\fasttexttospacy\lib\site-packages\plac_core.py", line 287, in consume
return cmd, self.func(*(args + varargs + extraopts), **kwargs)
File "C:\Users\User\fasttexttospacy\fasttexttospacy.py", line 21, in main
line = line.rstrip().decode("utf-8")
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xed in position 0: invalid continuation byte
I don't really understand where the problem is. Can somebody explain why I get this error? It's not clear to me.
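One hedged reading of the traceback: the crash happens on the per-line decode("utf-8"), so at least one row of ko.vec is not valid UTF-8 (a truncated or corrupted download is a common cause). A small diagnostic sketch to locate such rows before re-running the export:
# Diagnostic sketch, assuming ko.vec sits in the working directory:
# report every row that fails to decode as UTF-8, skipping the header line.
with open("ko.vec", "rb") as file_:
    file_.readline()  # header: "<rows> <dims>"
    for i, raw in enumerate(file_, start=1):
        try:
            raw.rstrip().decode("utf-8")
        except UnicodeDecodeError as err:
            print("row {} is not valid UTF-8: {}".format(i, err))
If only a handful of rows are affected, wrapping the decode in the export loop in a try/except and skipping those rows is a pragmatic workaround.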

I am getting WARNING:tensorflow:From in python

I am learning ML from AI Adventures, but I have a problem with the code. This is the output I get:
WARNING:tensorflow:From Ai1.py:12: load_csv_with_header (from tensorflow.contrib.learn.python.learn.datasets.base) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.data instead.
Traceback (most recent call last):
File "Ai1.py", line 12, in <module>
target_dtype=np.int)
File "C:\Users\Ardit\AppData\Local\Programs\Python\Python36\lib\site-packages\tensorflow\python\util\deprecation.py", line 324, in new_func
return func(*args, **kwargs)
File "C:\Users\Ardit\AppData\Local\Programs\Python\Python36\lib\site-packages\tensorflow\contrib\learn\python\learn\datasets\base.py", line 53, in load_csv_with_header
header = next(data_file)
File "C:\Users\Ardit\AppData\Local\Programs\Python\Python36\lib\site-packages\tensorflow\python\lib\io\file_io.py", line 220, in __next__
return self.next()
File "C:\Users\Ardit\AppData\Local\Programs\Python\Python36\lib\site-packages\tensorflow\python\lib\io\file_io.py", line 214, in next
retval = self.readline()
File "C:\Users\Ardit\AppData\Local\Programs\Python\Python36\lib\site-packages\tensorflow\python\lib\io\file_io.py", line 184, in readline
return self._prepare_value(self._read_buf.ReadLineAsString())
File "C:\Users\Ardit\AppData\Local\Programs\Python\Python36\lib\site-packages\tensorflow\python\lib\io\file_io.py", line 100, in _prepare_value
return compat.as_str_any(val)
File "C:\Users\Ardit\AppData\Local\Programs\Python\Python36\lib\site-packages\tensorflow\python\util\compat.py", line 107, in as_str_any
return as_str(value)
File "C:\Users\Ardit\AppData\Local\Programs\Python\Python36\lib\site-packages\tensorflow\python\util\compat.py", line 80, in as_text
return bytes_or_text.decode(encoding)
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 8: invalid start byte
I am using Python 3.6.
Can anyone help me?
Sorry, here is the code:
import tensorflow as tf
import numpy as np
from tensorflow.contrib.learn.python.learn.datasets import base

# Data files
IRIS_TRAINING = "iris_training.csv"
IRIS_TEST = "iris_test.csv"

# Load datasets.
training_set = base.load_csv_with_header(filename=IRIS_TRAINING,
                                         features_dtype=np.float32,
                                         target_dtype=np.int)
test_set = base.load_csv_with_header(filename=IRIS_TEST,
                                     features_dtype=np.float32,
                                     target_dtype=np.int)

print(training_set.data)
print(training_set.target)
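A hedged diagnosis: 0xff never appears in valid UTF-8, so iris_training.csv is probably not the plain-text CSV the loader expects (a failed download saved as an HTML error page, or a binary/UTF-16 file, would look exactly like this). A quick sketch to see what is actually on disk:
# Diagnostic sketch: peek at the raw bytes of the file the loader chokes on.
with open("iris_training.csv", "rb") as f:
    head = f.read(32)
print(head)  # a real iris CSV starts with an ASCII header line such as b'120,4,setosa,...'
If the bytes look like HTML, or start with a byte-order mark such as b'\xff\xfe' (UTF-16), re-download the CSV and the error should disappear.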

Python Unicode decode error- Not able to run script even after suggested correction

I am using Python 2.7.9 to create an Excel sheet from tab-delimited text files; however, I am getting a problem while running this Python script:
#!/usr/bin/env python
# encoding=utf8
import xlwt
import os
import sys

reload(sys)
sys.setdefaultencoding('utf8')

wb = xlwt.Workbook()
path = "/home/Final_analysis/"
#print(os.listdir())
lis = os.listdir(path)
sheetnumber = 1
for x in lis:
    if os.path.isfile(x) == True:
        extension = os.path.splitext(x)
        print(extension[1])
        if extension[1] == '.txt':
            #print("Yes")
            ws = wb.add_sheet(extension[0])
            row = 0
            column = 0
            a = open(x)
            while True:
                a1 = a.readline()
                if len(a1) == 0:
                    break
                data = a1.split("\t")
                for z in data:
                    ws.write(row, column, z)
                    column += 1
                column = 0
                row += 1
            sheetnumber += 1
        else:
            pass

wb.save("Ronic.xls")
I am getting the following error:
Traceback (most recent call last):
File "home/Final_analysis/combine_excel_v2.py", line 39, in <module>
wb.save("Ronic.xls")
File "/usr/local/lib/python2.7/site-packages/xlwt/Workbook.py", line 710, in save
doc.save(filename_or_stream, self.get_biff_data())
File "/usr/local/lib/python2.7/site-packages/xlwt/Workbook.py", line 674, in get_biff_data
shared_str_table = self.__sst_rec()
File "/usr/local/lib/python2.7/site-packages/xlwt/Workbook.py", line 636, in __sst_rec
return self.__sst.get_biff_record()
File "/usr/local/lib/python2.7/site-packages/xlwt/BIFFRecords.py", line 77, in get_biff_record
self._add_to_sst(s)
File "/usr/local/lib/python2.7/site-packages/xlwt/BIFFRecords.py", line 92, in _add_to_sst
u_str = upack2(s, self.encoding)
File "/usr/local/lib/python2.7/site-packages/xlwt/UnicodeUtils.py", line 50, in upack2
us = unicode(s, encoding)
UnicodeDecodeError: 'ascii' codec can't decode byte 0xc3 in position 83: ordinal not in range(128)
I have used the answer given in the thread How to fix: "UnicodeDecodeError: 'ascii' codec can't decode byte", but it didn't work.
The problem is at the wb.save() command.
Setting the encoding at the top of your program handles non-ASCII characters in your source code, not your data. sys.setdefaultencoding('utf8') is not intended to be used in ordinary programs and does more harm than good.
To fix the problem, tell xlwt about the encoding to use.
Change this line:
wb = xlwt.Workbook()
to this:
wb = xlwt.Workbook(encoding="UTF-8")
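As a follow-up sketch (this assumes the data's encoding and is not part of the original answer): if the .txt files are UTF-8, you can also decode each line explicitly so xlwt only ever sees unicode objects, and drop the reload(sys) hack entirely:
# Python 2 sketch, assuming the input files are UTF-8.
import codecs

a = codecs.open(x, encoding='utf-8')  # yields unicode lines
a1 = a.readline()  # ws.write() accepts unicode directly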

UnicodeDecodeError: 'ascii' codec can't decode byte in Textranking code [duplicate]

This question already has answers here:
How to fix: "UnicodeDecodeError: 'ascii' codec can't decode byte"
(20 answers)
Closed 5 years ago.
When I execute the code below
import networkx as nx
import numpy as np
from nltk.tokenize.punkt import PunktSentenceTokenizer
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer

def textrank(document):
    sentence_tokenizer = PunktSentenceTokenizer()
    sentences = sentence_tokenizer.tokenize(document)
    bow_matrix = CountVectorizer().fit_transform(sentences)
    normalized = TfidfTransformer().fit_transform(bow_matrix)
    similarity_graph = normalized * normalized.T
    nx_graph = nx.from_scipy_sparse_matrix(similarity_graph)
    scores = nx.pagerank(nx_graph)
    return sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)

fp = open("QC")
txt = fp.read()
sents = textrank(txt)
print sents
I get the following error:
Traceback (most recent call last):
File "Textrank.py", line 44, in <module>
sents = textrank(txt)
File "Textrank.py", line 10, in textrank
sentences = sentence_tokenizer.tokenize(document)
File "/usr/local/lib/python2.7/dist-packages/nltk/tokenize/punkt.py", line 1237, in tokenize
return list(self.sentences_from_text(text, realign_boundaries))
File "/usr/local/lib/python2.7/dist-packages/nltk/tokenize/punkt.py", line 1285, in sentences_from_text
return [text[s:e] for s, e in self.span_tokenize(text, realign_boundaries)]
File "/usr/local/lib/python2.7/dist-packages/nltk/tokenize/punkt.py", line 1276, in span_tokenize
return [(sl.start, sl.stop) for sl in slices]
File "/usr/local/lib/python2.7/dist-packages/nltk/tokenize/punkt.py", line 1316, in _realign_boundaries
for sl1, sl2 in _pair_iter(slices):
File "/usr/local/lib/python2.7/dist-packages/nltk/tokenize/punkt.py", line 311, in _pair_iter
for el in it:
File "/usr/local/lib/python2.7/dist-packages/nltk/tokenize/punkt.py", line 1291, in _slices_from_text
if self.text_contains_sentbreak(context):
File "/usr/local/lib/python2.7/dist-packages/nltk/tokenize/punkt.py", line 1337, in text_contains_sentbreak
for t in self._annotate_tokens(self._tokenize_words(text)):
File "/usr/local/lib/python2.7/dist-packages/nltk/tokenize/punkt.py", line 1472, in _annotate_second_pass
for t1, t2 in _pair_iter(tokens):
File "/usr/local/lib/python2.7/dist-packages/nltk/tokenize/punkt.py", line 310, in _pair_iter
prev = next(it)
File "/usr/local/lib/python2.7/dist-packages/nltk/tokenize/punkt.py", line 577, in _annotate_first_pass
for aug_tok in tokens:
File "/usr/local/lib/python2.7/dist-packages/nltk/tokenize/punkt.py", line 542, in _tokenize_words
for line in plaintext.split('\n'):
UnicodeDecodeError: 'ascii' codec can't decode byte 0xe2 in position 9: ordinal not in range(128)
I am executing the code on Ubuntu. To get the text, I referred to this website:
https://uwaterloo.ca/institute-for-quantum-computing/quantum-computing-101. I created a file QC (not QC.txt) and copy-pasted the data paragraph by paragraph into the file.
Kindly help me resolve the error.
Thank you.
Please see if the following works for you.
import networkx as nx
import numpy as np
import sys

reload(sys)
sys.setdefaultencoding('utf8')

from nltk.tokenize.punkt import PunktSentenceTokenizer
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer

def textrank(document):
    sentence_tokenizer = PunktSentenceTokenizer()
    sentences = sentence_tokenizer.tokenize(document)
    bow_matrix = CountVectorizer().fit_transform(sentences)
    normalized = TfidfTransformer().fit_transform(bow_matrix)
    similarity_graph = normalized * normalized.T
    nx_graph = nx.from_scipy_sparse_matrix(similarity_graph)
    scores = nx.pagerank(nx_graph)
    return sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)

fp = open("QC")
txt = fp.read()
sents = textrank(txt.encode('utf-8'))
print sents
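An alternative sketch that avoids the reload(sys)/setdefaultencoding hack (the xlwt answer above explains why that hack does more harm than good): decode the file once at the boundary, assuming QC is UTF-8, and pass unicode into textrank:
# Python 2 sketch: decode once, then work with unicode throughout.
import codecs

with codecs.open("QC", encoding="utf-8") as fp:
    txt = fp.read()  # unicode from here on

sents = textrank(txt)  # PunktSentenceTokenizer handles unicode input
print sents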
