UTF-8 encoding issue when trying to read a JSON file - Python

I get the error shown below when trying to read a JSON file that is supposed to be UTF-8 encoded. Does anyone know how I can resolve this issue?
import json
import pandas as pd

reviews = pd.read_csv('reviews.csv', nrows=1000)
businesses = pd.read_csv('businesses.csv', nrows=1000)

checkins = []
with open('checkins.json', encoding='utf-8') as f:
    for row in f.readlines()[:1000]:
        checkins.append(json.loads(row))
---------------------------------------------------------------------------
UnicodeDecodeError Traceback (most recent call last)
<ipython-input-10-4f54896faeca> in <module>
3 checkins = []
4 with open('checkins.json', encoding='utf-8') as f:
----> 5 for row in f.readlines()[:1000]:
6 checkins.append(json.loads(row))
~\Anaconda3\lib\codecs.py in decode(self, input, final)
320 # decode input (taking the buffer into account)
321 data = self.buffer + input
--> 322 (result, consumed) = self._buffer_decode(data, self.errors, final)
323 # keep undecoded input until the next call
324 self.buffer = data[consumed:]
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xda in position 37: invalid continuation byte
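
A common first step (not from the original question) is to check what the file is actually encoded as before forcing UTF-8. Here is a minimal sketch using the third-party chardet package; the detected encoding is a guess to verify, not a guarantee:

import json

import chardet  # third-party: pip install chardet

# Guess the real encoding from a sample of the raw bytes.
with open('checkins.json', 'rb') as f:
    guess = chardet.detect(f.read(100000))
print(guess)  # e.g. {'encoding': 'Windows-1252', 'confidence': 0.73, ...}

# Re-open with the detected encoding instead of assuming UTF-8.
checkins = []
with open('checkins.json', encoding=guess['encoding']) as f:
    for row in f.readlines()[:1000]:
        checkins.append(json.loads(row))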

Related

"'utf-8' codec can't decode byte 0xf3" while performing the sentiment lexicon

I am getting an error in the following code while running a sentiment-lexicon analysis. If someone could guide me here, that'd be really helpful.
"'utf-8' codec can't decode byte 0xf3"
from textblob import TextBlob
pos_count = 0
pos_correct = 0
with open("positive.txt","r") as f:
for line in f.read().split('\n'):
analysis = TextBlob(line)
if analysis.sentiment.polarity > 0:
pos_correct += 1
pos_count +=1
neg_count = 0
neg_correct = 0
with open("negative.txt","r") as f:
for line in f.read().split('\n'):
analysis = TextBlob(line)
if analysis.sentiment.polarity <= 0:
neg_correct += 1
neg_count +=1
print("Positive accuracy = {}% via {} samples".format(pos_correct/pos_count*100.0, pos_count))
print("Negative accuracy = {}% via {} samples".format(neg_correct/neg_count*100.0, neg_count))
Error:
UnicodeDecodeError Traceback (most recent call last)
<ipython-input-8-91f2c31897d6> in <module>()
5
6 with open("positive.txt","r") as f:
----> 7 for line in f.read().split('\n'):
8 analysis = TextBlob(line)
9 if analysis.sentiment.polarity > 0:
/usr/lib/python3.7/codecs.py in decode(self, input, final)
320 # decode input (taking the buffer into account)
321 data = self.buffer + input
--> 322 (result, consumed) = self._buffer_decode(data, self.errors, final)
323 # keep undecoded input until the next call
324 self.buffer = data[consumed:]
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xf3 in position 4645: invalid continuation byte
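
Byte 0xf3 is 'ó' in Latin-1/cp1252, which suggests the lexicon files are not UTF-8. Below is a minimal sketch of the positive-file loop, assuming the files are Latin-1 encoded; latin-1 maps every byte to a character, so it never raises UnicodeDecodeError, though a wrong guess silently yields wrong characters:

from textblob import TextBlob

pos_count = 0
pos_correct = 0
# encoding="latin-1" is an assumption about how the lexicon was saved;
# cp1252 is another likely candidate on Windows.
with open("positive.txt", "r", encoding="latin-1") as f:
    for line in f.read().split('\n'):
        analysis = TextBlob(line)
        if analysis.sentiment.polarity > 0:
            pos_correct += 1
        pos_count += 1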

BertLMDataBunch.from_raw_corpus UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe9 in position 49: invalid continuation byte

I am having trouble fine-tuning Camembert with the fast-bert library;
I get this error message when creating an LMDataBunch.
Does anyone know how to fix this?
Thanks.
P.S. logger is initialized using logging.getLogger()
databunch_lm = BertLMDataBunch.from_raw_corpus(
    data_dir=DATA_PATH,
    text_list=all_texts,
    tokenizer='camembert-base',
    batch_size_per_gpu=16,
    max_seq_length=512,
    multi_gpu=False,
    model_type='camembert-base',
    logger=logger)
07/05/2020 14:50:31 - INFO - transformers.tokenization_utils_base - loading file https://s3.amazonaws.com/models.huggingface.co/bert/camembert-base-sentencepiece.bpe.model from cache at C:\Users\Nawel/.cache\torch\hub\transformers\3715e3a4a2de48834619b2a6f48979e13ddff5cabfb1f3409db689f9ce3bb98f.28d30f926f545047fc59da64289371eef0fbdc0764ce9ec56f808a646fcfec59
07/05/2020 14:50:31 - INFO - root - Creating features from dataset file C:\Users\Desktop\Stage\Camembert\data\lm_train.txt
---------------------------------------------------------------------------
UnicodeDecodeError Traceback (most recent call last)
<ipython-input-136-5e7363fcd4d6> in <module>
7 multi_gpu=False,
8 model_type='camembert-base',
----> 9 logger=logger)
~\anaconda3\lib\site-packages\fast_bert\data_lm.py in from_raw_corpus(data_dir, text_list, tokenizer, batch_size_per_gpu, max_seq_length, multi_gpu, test_size, model_type, logger, clear_cache, no_cache)
198 logger=logger,
199 clear_cache=clear_cache,
--> 200 no_cache=no_cache,
201 )
202
~\anaconda3\lib\site-packages\fast_bert\data_lm.py in __init__(self, data_dir, tokenizer, train_file, val_file, batch_size_per_gpu, max_seq_length, multi_gpu, model_type, logger, clear_cache, no_cache)
270 cached_features_file,
271 self.logger,
--> 272 block_size=self.tokenizer.max_len_single_sentence,
273 )
274
~\anaconda3\lib\site-packages\fast_bert\data_lm.py in __init__(self, tokenizer, file_path, cache_path, logger, block_size)
131 self.examples = []
132 with open(file_path, encoding="utf-8") as f:
--> 133 text = f.read()
134
135 tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
~\anaconda3\lib\codecs.py in decode(self, input, final)
320 # decode input (taking the buffer into account)
321 data = self.buffer + input
--> 322 (result, consumed) = self._buffer_decode(data, self.errors, final)
323 # keep undecoded input until the next call
324 self.buffer = data[consumed:]
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe9 in position 63: invalid continuation byte
I'm closing this; I just had to change the encoding of the file to UTF-8.
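
For anyone who prefers to do that re-encoding in Python rather than in an editor, here is a minimal sketch; the cp1252 source encoding is an assumption (byte 0xe9 is 'é' in cp1252/Latin-1, plausible for French text), so adjust it to whatever the file actually uses:

# Path taken from the log above; src_encoding is a hypothetical guess.
src = r'C:\Users\Desktop\Stage\Camembert\data\lm_train.txt'
src_encoding = 'cp1252'

with open(src, encoding=src_encoding) as f:
    text = f.read()

# Rewrite the same file as UTF-8 so fast-bert's utf-8 open() succeeds.
with open(src, 'w', encoding='utf-8') as f:
    f.write(text)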

Error reading csv file to dataframe in dask and pandas

I am trying to read a CSV file with dask (and also pandas), but I get the error below. I tried changing the encoding format, but nothing seems to work; however, when I do "Save As CSV UTF-8" in Excel, the code starts working. I tried the same with pandas and got the same error. I also tried explicitly passing encoding='utf-16', but got an error asking me to use utf-16-le or utf-16-be instead, and those failed too.
Is there something wrong with the CSV file that I am using?
import chardet
import dask.dataframe as dd

with open(Mar_N_W, 'rb') as f:
    result = chardet.detect(f.read())
Mar_NW = dd.read_csv(Mar_N_W, encoding=result['encoding'], sep=None)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\io\parsers.py in _next_iter_line(self, row_num)
2693
2694 try:
-> 2695 return next(self.data)
2696 except csv.Error as e:
2697 if self.warn_bad_lines or self.error_bad_lines:
~\AppData\Local\Continuum\anaconda3\lib\codecs.py in decode(self, input, final)
320 # decode input (taking the buffer into account)
321 data = self.buffer + input
--> 322 (result, consumed) = self._buffer_decode(data, self.errors, final)
323 # keep undecoded input until the next call
324 self.buffer = data[consumed:]
~\AppData\Local\Continuum\anaconda3\lib\encodings\utf_16.py in _buffer_decode(self, input, errors, final)
67 raise UnicodeError("UTF-16 stream does not start with BOM")
68 return (output, consumed)
---> 69 return self.decoder(input, self.errors, final)
70
71 def reset(self):
UnicodeDecodeError: 'utf-16-le' codec can't decode byte 0x0a in position 0: truncated data
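
The "truncated data" error usually means the bytes are not really UTF-16, or that the BOM-aware codec was bypassed. Here is a sketch of two things worth trying (shown with pandas; dask's read_csv forwards the same keywords); the tab separator is an assumption, since Excel's "Unicode Text" export is UTF-16 and tab-delimited:

import pandas as pd

# 'utf-16' (no -le/-be suffix) consumes the BOM and picks the byte order
# itself; the suffixed codecs assume there is no BOM.
df = pd.read_csv(Mar_N_W, encoding='utf-16', sep='\t')

# If that still fails, the detector's guess may simply be wrong; inspect
# the first bytes manually: a UTF-16 file starts with b'\xff\xfe' or b'\xfe\xff'.
with open(Mar_N_W, 'rb') as f:
    print(f.read(32))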

Converting .arff file to .csv using Python

I have a file "LMD.rh.arff" which I am trying to convert to .csv file using the following code-
import pandas as pd
import matplotlib.pyplot as plt
from scipy.io import arff
# Read in .arff file-
data = arff.loadarff("LMD.rh.arff")
But this last line of code gives me the error:
---------------------------------------------------------------------------
UnicodeEncodeError                        Traceback (most recent call last)
----> 1 data = arff.loadarff("LMD.rh.arff")

~/.local/lib/python3.6/site-packages/scipy/io/arff/arffread.py in loadarff(f)
    539     ofile = open(f, 'rt')
    540     try:
--> 541         return _loadarff(ofile)
    542     finally:
    543         if ofile is not f:  # only close what we opened

~/.local/lib/python3.6/site-packages/scipy/io/arff/arffread.py in _loadarff(ofile)
    627     a = generator(ofile)
    628     # No error should happen here: it is a bug otherwise
--> 629     data = np.fromiter(a, descr)
    630     return data, meta
    631

UnicodeEncodeError: 'ascii' codec can't encode character '\xf3' in position 4: ordinal not in range(128)
You can download the file arff_file
Any ideas as to what's going wrong?
Thanks!
Try this
import os

path_to_directory = "./"
files = [arff for arff in os.listdir(path_to_directory) if arff.endswith(".arff")]

# ARFF headers use "@attribute" / "@data" markers; collect the attribute
# names as a CSV header, then copy the data section through unchanged.
def toCsv(content):
    data = False
    header = ""
    newContent = []
    for line in content:
        if not data:
            if "@attribute" in line:
                attri = line.split()
                columnName = attri[attri.index("@attribute") + 1]
                header = header + columnName + ","
            elif "@data" in line:
                data = True
                header = header[:-1]
                header += '\n'
                newContent.append(header)
        else:
            newContent.append(line)
    return newContent

# Main loop for reading and writing files
for zzzz, file in enumerate(files):
    with open(path_to_directory + file, "r") as inFile:
        content = inFile.readlines()
        name, ext = os.path.splitext(inFile.name)
        new = toCsv(content)
        with open(name + ".csv", "w") as outFile:
            outFile.writelines(new)
Take a look at the error trace
UnicodeEncodeError: 'ascii' codec can't encode character '\xf3' in position 4: ordinal not in range(128)
Your error suggests you have an encoding problem with the file. Consider opening the file with the correct encoding first and then passing it to the arff loader:
import codecs
import arff  # the liac-arff package, not scipy.io.arff

file_ = codecs.open('LMD.rh.arff', 'r', 'utf-8')  # or whatever encoding you have
arff.load(file_)  # now this should be fine
For reference see here
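
Note that the traceback actually fails while encoding to ASCII inside np.fromiter, so even a correctly decoded file can still trip on a character like 'ó'. A blunt workaround, assuming the accented characters are not needed downstream, is to strip non-ASCII characters before handing the text to scipy:

import io

from scipy.io import arff

# Read with the file's real encoding (latin-1 here is an assumption),
# then drop any characters scipy's ASCII string dtype cannot hold.
with open('LMD.rh.arff', encoding='latin-1') as f:
    text = f.read()

ascii_text = text.encode('ascii', errors='ignore').decode('ascii')
data, meta = arff.loadarff(io.StringIO(ascii_text))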

UnicodeDecodeError when trying to read docx file

An error occurs when opening a .docx file using Python 3.
When I tried to run:
file = open("jinuj.docx", "r", encoding="utf-8").read()
the error below occurred:
319 # decode input (taking the buffer into account)
320 data = self.buffer + input
--> 321 (result, consumed) = self._buffer_decode(data, self.errors, final)
322 # keep undecoded input until the next call
323 self.buffer = data[consumed:]
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xb6 in position 11: invalid start byte
python-docx can open a document from a so-called file-like object. It can also save to a file-like object:
from docx import Document
f = open('jinuj.docx', 'rb')
document = Document(f)
f.close()
OR
from io import BytesIO

with open('jinuj.docx', 'rb') as f:
    source_stream = BytesIO(f.read())  # a .docx is binary, so BytesIO rather than StringIO
document = Document(source_stream)
source_stream.close()
Docs
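
As a usage note, python-docx can also open the path directly, and the paragraph objects expose the text, which is presumably what the open(...).read() attempt was after (a sketch using the questioner's file name):

from docx import Document

document = Document('jinuj.docx')  # Document() accepts a path or a binary stream

# Pull the plain text out, paragraph by paragraph.
text = '\n'.join(paragraph.text for paragraph in document.paragraphs)
print(text[:200])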
