UnicodeDecodeError when trying to read docx file - python

This error occurs when opening a .docx file using Python 3.
When I tried to run:
file = open("jinuj.docx", "r", encoding="utf-8").read()
I got the error below:
319 # decode input (taking the buffer into account)
320 data = self.buffer + input
--> 321 (result, consumed) = self._buffer_decode(data, self.errors, final)
322 # keep undecoded input until the next call
323 self.buffer = data[consumed:]
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xb6 in position 11: invalid start byte

Your open() call fails because a .docx file is a ZIP archive, not plain text, so it cannot be decoded as UTF-8. Use python-docx instead: it can open a document from a so-called file-like object (and can also save to one):
from docx import Document

f = open('jinuj.docx', 'rb')
document = Document(f)
f.close()
OR
from io import BytesIO

with open('jinuj.docx', 'rb') as f:
    source_stream = BytesIO(f.read())  # binary content, so BytesIO rather than StringIO
document = Document(source_stream)
source_stream.close()
Docs
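For background, a quick hedged check (using the filename from the question) that confirms the file is a ZIP container rather than text:
import zipfile

# An intact .docx is a ZIP archive; its first two bytes are the b'PK' magic.
print(zipfile.is_zipfile('jinuj.docx'))  # True for a valid .docx
with open('jinuj.docx', 'rb') as f:
    print(f.read(2))  # b'PK'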

Related

Encoding UTF-8 issue when trying to read a JSON file

I get the error shown below when trying to read a JSON file that should be UTF-8 encoded. Does anyone know how I can resolve this issue?
import json
import pandas as pd

reviews = pd.read_csv('reviews.csv', nrows=1000)
businesses = pd.read_csv('businesses.csv', nrows=1000)
checkins = []
with open('checkins.json', encoding='utf-8') as f:
    for row in f.readlines()[:1000]:
        checkins.append(json.loads(row))
---------------------------------------------------------------------------
UnicodeDecodeError Traceback (most recent call last)
<ipython-input-10-4f54896faeca> in <module>
3 checkins = []
4 with open('checkins.json', encoding='utf-8') as f:
----> 5 for row in f.readlines()[:1000]:
6 checkins.append(json.loads(row))
~\Anaconda3\lib\codecs.py in decode(self, input, final)
320 # decode input (taking the buffer into account)
321 data = self.buffer + input
--> 322 (result, consumed) = self._buffer_decode(data, self.errors, final)
323 # keep undecoded input until the next call
324 self.buffer = data[consumed:]
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xda in position 37: invalid continuation byte
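A byte like 0xda is not valid UTF-8 at that position, which usually means the file was written in some other encoding. A minimal, hedged workaround that degrades gracefully instead of raising (errors='replace' substitutes each undecodable byte with U+FFFD; whether the affected lines still parse as JSON depends on where the bad bytes sit):
import json

checkins = []
with open('checkins.json', encoding='utf-8', errors='replace') as f:
    for row in f.readlines()[:1000]:
        checkins.append(json.loads(row))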

BertLMDataBunch.from_raw_corpus UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe9 in position 49: invalid continuation byte

I'm having trouble fine-tuning Camembert using the fast-bert library;
I get this error message when creating an LMDataBunch.
Does anyone know how to fix this?
Thanks.
PS: logger is initialized using logging.getLogger()
databunch_lm = BertLMDataBunch.from_raw_corpus(
    data_dir=DATA_PATH,
    text_list=all_texts,
    tokenizer='camembert-base',
    batch_size_per_gpu=16,
    max_seq_length=512,
    multi_gpu=False,
    model_type='camembert-base',
    logger=logger)
07/05/2020 14:50:31 - INFO - transformers.tokenization_utils_base - loading file
https://s3.amazonaws.com/models.huggingface.co/bert/camembert-base-sentencepiece.bpe.model from cache at C:\Users\Nawel/.cache\torch\hub\transformers\3715e3a4a2de48834619b2a6f48979e13ddff5cabfb1f3409db689f9ce3bb98f.28d30f926f545047fc59da64289371eef0fbdc0764ce9ec56f808a646fcfec59
07/05/2020 14:50:31 - INFO - root - Creating features from dataset file C:\Users\Desktop\Stage\Camembert\data\lm_train.txt
---------------------------------------------------------------------------
UnicodeDecodeError Traceback (most recent call last)
<ipython-input-136-5e7363fcd4d6> in <module>
7 multi_gpu=False,
8 model_type='camembert-base',
----> 9 logger=logger)
~\anaconda3\lib\site-packages\fast_bert\data_lm.py in from_raw_corpus(data_dir, text_list, tokenizer, batch_size_per_gpu, max_seq_length, multi_gpu, test_size, model_type, logger, clear_cache, no_cache)
198 logger=logger,
199 clear_cache=clear_cache,
--> 200 no_cache=no_cache,
201 )
202
~\anaconda3\lib\site-packages\fast_bert\data_lm.py in __init__(self, data_dir, tokenizer, train_file, val_file, batch_size_per_gpu, max_seq_length, multi_gpu, model_type, logger, clear_cache, no_cache)
270 cached_features_file,
271 self.logger,
--> 272 block_size=self.tokenizer.max_len_single_sentence,
273 )
274
~\anaconda3\lib\site-packages\fast_bert\data_lm.py in __init__(self, tokenizer, file_path, cache_path, logger, block_size)
131 self.examples = []
132 with open(file_path, encoding="utf-8") as f:
--> 133 text = f.read()
134
135 tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
~\anaconda3\lib\codecs.py in decode(self, input, final)
320 # decode input (taking the buffer into account)
321 data = self.buffer + input
--> 322 (result, consumed) = self._buffer_decode(data, self.errors, final)
323 # keep undecoded input until the next call
324 self.buffer = data[consumed:]
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe9 in position 63: invalid continuation byte
I'm closing this; I just had to change the encoding of the file to UTF-8.
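For anyone hitting the same thing, a minimal sketch of that re-encoding step, assuming the corpus file is cp1252/latin-1 (a guess based on byte 0xe9, which is 'é' in those encodings; lm_train.txt is the file named in the log):
# Read with the assumed legacy encoding, then write back as UTF-8.
with open('lm_train.txt', encoding='cp1252') as src:
    text = src.read()
with open('lm_train.txt', 'w', encoding='utf-8') as dst:
    dst.write(text)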

Error reading a CSV file into a dataframe in dask and pandas

I am trying to read a CSV file using dask (and also pandas), but I get the error below. I tried changing the encoding format, but nothing seemed to work; yet when I do "Save As" CSV UTF-8 in Excel, the code starts working. I tried the same with pandas and got the same error. I also tried passing the encoding explicitly as utf-16, but got an error telling me to use utf-16-le or utf-16-be, and those failed as well.
Is there something wrong with the CSV file that I am using?
import chardet
import dask.dataframe as dd

with open(Mar_N_W, 'rb') as f:
    result = chardet.detect(f.read())
Mar_NW = dd.read_csv(Mar_N_W, encoding=result['encoding'], sep=None)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\io\parsers.py in _next_iter_line(self, row_num)
2693
2694 try:
-> 2695 return next(self.data)
2696 except csv.Error as e:
2697 if self.warn_bad_lines or self.error_bad_lines:
~\AppData\Local\Continuum\anaconda3\lib\codecs.py in decode(self, input, final)
320 # decode input (taking the buffer into account)
321 data = self.buffer + input
--> 322 (result, consumed) = self._buffer_decode(data, self.errors, final)
323 # keep undecoded input until the next call
324 self.buffer = data[consumed:]
~\AppData\Local\Continuum\anaconda3\lib\encodings\utf_16.py in _buffer_decode(self, input, errors, final)
67 raise UnicodeError("UTF-16 stream does not start with BOM")
68 return (output, consumed)
---> 69 return self.decoder(input, self.errors, final)
70
71 def reset(self):
UnicodeDecodeError: 'utf-16-le' codec can't decode byte 0x0a in position 0: truncated data
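The "truncated data" error means the decoder found a UTF-16 BOM, switched to utf-16-le, and then hit an incomplete two-byte code unit (here a lone 0x0a), so the file is probably UTF-16 with a stray or mis-encoded byte, or chardet mis-detected it. Since re-saving as UTF-8 in Excel fixes it, the same conversion can be scripted; a hedged sketch, assuming a UTF-16 source (Mar_N_W is the path variable from the question, and mar_nw_utf8.csv is a hypothetical output name):
import dask.dataframe as dd

# Convert the file to UTF-8 once; errors='replace' survives the truncated tail.
with open(Mar_N_W, encoding='utf-16', errors='replace') as src:
    text = src.read()
with open('mar_nw_utf8.csv', 'w', encoding='utf-8') as dst:
    dst.write(text)

Mar_NW = dd.read_csv('mar_nw_utf8.csv')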

Trouble scanning a list for duplicates

Hey, so I want to scan this text file of emails: if the same email appears twice I want it printed, and if an email is on the list only once I don't want it printed.
It worked for a different text file, but now it's raising a traceback error.
# note: make sure found.txt and list.txt are in the 'include' folder for PyCharm
from collections import Counter

print("Welcome DADDY")
with open('myheritage-1-million.txt') as f:
    c = Counter(c.strip().lower() for c in f if c.strip())  # for case-insensitive search
for line in c:
    if c[line] > 1:
        print(line)
ERROR:
rs/dcaputo/PycharmProjects/searchtoolforrhys/venv/include/search.py
Welcome DADDY
Traceback (most recent call last):
File "/Users/dcaputo/PycharmProjects/searchtoolforrhys/venv/include/search.py", line 5, in <module>
c = Counter(c.strip().lower() for c in f if c.strip()) #for case-insensitive search
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/collections/__init__.py", line 566, in __init__
self.update(*args, **kwds)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/collections/__init__.py", line 653, in update
_count_elements(self, iterable)
File "/Users/dcaputo/PycharmProjects/searchtoolforrhys/venv/include/search.py", line 5, in <genexpr>
c = Counter(c.strip().lower() for c in f if c.strip()) #for case-insensitive search
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/codecs.py", line 322, in decode
(result, consumed) = self._buffer_decode(data, self.errors, final)
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xc5 in position 2668: invalid continuation byte
Process finished with exit code 1
Expected output: a list of all emails that show up at least twice in the whole text file.
The key is the error message at the end:
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xc5 in position 2668: invalid continuation byte
This error occurs when a file that isn't valid UTF-8 is read as UTF-8 text. Your file may be corrupted somehow, or it contains some data (at position 2668) that can't be decoded as text.
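A hedged workaround, assuming the file is mostly text with a few stray non-UTF-8 bytes: errors='replace' swaps each undecodable byte for U+FFFD instead of raising (opening with encoding='latin-1' is an alternative, since latin-1 accepts every byte value):
from collections import Counter

with open('myheritage-1-million.txt', encoding='utf-8', errors='replace') as f:
    c = Counter(line.strip().lower() for line in f if line.strip())

for email, count in c.items():
    if count > 1:
        print(email)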

Converting .arff file to .csv using Python

I have a file "LMD.rh.arff" which I am trying to convert to a .csv file using the following code:
import pandas as pd
import matplotlib.pyplot as plt
from scipy.io import arff

# Read in .arff file
data = arff.loadarff("LMD.rh.arff")
But this last line of code gives me the error below:
---------------------------------------------------------------------------
UnicodeEncodeError                        Traceback (most recent call last)
----> 1 data = arff.loadarff("LMD.rp.arff")

~/.local/lib/python3.6/site-packages/scipy/io/arff/arffread.py in loadarff(f)
    539     ofile = open(f, 'rt')
    540     try:
--> 541         return _loadarff(ofile)
    542     finally:
    543         if ofile is not f:  # only close what we opened

~/.local/lib/python3.6/site-packages/scipy/io/arff/arffread.py in _loadarff(ofile)
    627     a = generator(ofile)
    628     # No error should happen here: it is a bug otherwise
--> 629     data = np.fromiter(a, descr)
    630     return data, meta
    631

UnicodeEncodeError: 'ascii' codec can't encode character '\xf3' in position 4: ordinal not in range(128)
You can download the file arff_file
Any ideas as to what's going wrong?
Thanks!
Try this:
import os

path_to_directory = "./"
files = [arff for arff in os.listdir(path_to_directory) if arff.endswith(".arff")]

def toCsv(content):
    data = False
    header = ""
    newContent = []
    for line in content:
        if not data:
            if "@attribute" in line:  # ARFF declarations use @, not #
                attri = line.split()
                columnName = attri[attri.index("@attribute") + 1]
                header = header + columnName + ","
            elif "@data" in line:
                data = True
                header = header[:-1]
                header += '\n'
                newContent.append(header)
        else:
            newContent.append(line)
    return newContent

# Main loop for reading and writing files
for file in files:
    with open(path_to_directory + file, "r") as inFile:
        content = inFile.readlines()
        name, ext = os.path.splitext(inFile.name)
        new = toCsv(content)
        with open(name + ".csv", "w") as outFile:
            outFile.writelines(new)
Take a look at the error trace:
UnicodeEncodeError: 'ascii' codec can't encode character '\xf3' in position 4: ordinal not in range(128)
It suggests an encoding problem with the file. Consider opening the file with the correct encoding first and then passing it to the arff loader:
import codecs
import arff  # the liac-arff package, not scipy.io.arff

file_ = codecs.open('LMD.rh.arff', 'r', encoding='utf-8')  # or whatever encoding you have
arff.load(file_)  # now this should be fine
For reference see here
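To finish the conversion to CSV, a hedged follow-up sketch assuming the liac-arff package (pip install liac-arff), whose load() returns a dict with 'attributes' (name/type pairs) and 'data' rows:
import codecs
import arff  # liac-arff
import pandas as pd

with codecs.open('LMD.rh.arff', 'r', encoding='utf-8') as f:
    d = arff.load(f)

# Column names come from the ARFF attribute declarations.
df = pd.DataFrame(d['data'], columns=[name for name, _ in d['attributes']])
df.to_csv('LMD.rh.csv', index=False)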
