Error reading a CSV file into a dataframe in dask and pandas - python

I am trying to read a CSV file using dask (and also pandas), but I get the error below. I tried changing the encoding, but nothing seems to work; however, when I do "Save As CSV UTF-8" in Excel, the code starts working. I tried the same with pandas and got the same error. I also tried explicitly passing encoding='utf-16', but then I got an error telling me to use utf-16-le or utf-16-be, and those failed as well.
Is there something wrong with the CSV file that I am using?
import chardet
import dask.dataframe as dd

# Detect the file's encoding from its raw bytes
with open(Mar_N_W, 'rb') as f:
    result = chardet.detect(f.read())

Mar_NW = dd.read_csv(Mar_N_W, encoding=result['encoding'], sep=None)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\io\parsers.py in _next_iter_line(self, row_num)
2693
2694 try:
-> 2695 return next(self.data)
2696 except csv.Error as e:
2697 if self.warn_bad_lines or self.error_bad_lines:
~\AppData\Local\Continuum\anaconda3\lib\codecs.py in decode(self, input, final)
320 # decode input (taking the buffer into account)
321 data = self.buffer + input
--> 322 (result, consumed) = self._buffer_decode(data, self.errors, final)
323 # keep undecoded input until the next call
324 self.buffer = data[consumed:]
~\AppData\Local\Continuum\anaconda3\lib\encodings\utf_16.py in _buffer_decode(self, input, errors, final)
67 raise UnicodeError("UTF-16 stream does not start with BOM")
68 return (output, consumed)
---> 69 return self.decoder(input, self.errors, final)
70
71 def reset(self):
UnicodeDecodeError: 'utf-16-le' codec can't decode byte 0x0a in position 0: truncated data
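A likely explanation, for what it's worth: dask splits the CSV into byte blocks and decodes each block independently. With UTF-16, only the first block starts with the BOM, and a block boundary can land in the middle of a 2-byte code unit, which matches both errors above ("does not start with BOM", "truncated data"). A minimal workaround sketch, assuming the file really is UTF-16: re-encode it to UTF-8 once (effectively what Excel's "Save As CSV UTF-8" does), then read the converted copy. The file paths here are hypothetical, not taken from the question.

import dask.dataframe as dd

src = 'Mar_N_W.csv'        # hypothetical path to the original UTF-16 file
dst = 'Mar_N_W_utf8.csv'   # hypothetical path for the UTF-8 copy

# Stream-convert UTF-16 -> UTF-8 so dask's byte-based block splitting works.
with open(src, 'r', encoding='utf-16') as fin, \
        open(dst, 'w', encoding='utf-8', newline='') as fout:
    for chunk in iter(lambda: fin.read(1 << 20), ''):  # ~1 MB of text at a time
        fout.write(chunk)

Mar_NW = dd.read_csv(dst)  # assuming a comma delimiter; every block now decodes cleanly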

Related

"'utf-8' codec can't decode byte 0xf3" while performing the sentiment lexicon

I am getting an error in the following code while building a sentiment lexicon. If someone could guide me here, that would be really helpful.
"'utf-8' codec can't decode byte 0xf3"
from textblob import TextBlob

pos_count = 0
pos_correct = 0
with open("positive.txt", "r") as f:
    for line in f.read().split('\n'):
        analysis = TextBlob(line)
        if analysis.sentiment.polarity > 0:
            pos_correct += 1
        pos_count += 1

neg_count = 0
neg_correct = 0
with open("negative.txt", "r") as f:
    for line in f.read().split('\n'):
        analysis = TextBlob(line)
        if analysis.sentiment.polarity <= 0:
            neg_correct += 1
        neg_count += 1

print("Positive accuracy = {}% via {} samples".format(pos_correct/pos_count*100.0, pos_count))
print("Negative accuracy = {}% via {} samples".format(neg_correct/neg_count*100.0, neg_count))
Error:
UnicodeDecodeError Traceback (most recent call last)
<ipython-input-8-91f2c31897d6> in <module>()
5
6 with open("positive.txt","r") as f:
----> 7 for line in f.read().split('\n'):
8 analysis = TextBlob(line)
9 if analysis.sentiment.polarity > 0:
/usr/lib/python3.7/codecs.py in decode(self, input, final)
320 # decode input (taking the buffer into account)
321 data = self.buffer + input
--> 322 (result, consumed) = self._buffer_decode(data, self.errors, final)
323 # keep undecoded input until the next call
324 self.buffer = data[consumed:]
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xf3 in position 4645: invalid continuation byte
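For this one, byte 0xf3 is 'ó' in Latin-1/Windows-1252, so positive.txt is most likely not UTF-8 at all. A hedged sketch of the usual fix: open the files with an explicit single-byte encoding (or errors="replace" if a few mangled characters are acceptable). The choice of latin-1 is an assumption; chardet can confirm the real encoding.

from textblob import TextBlob

pos_count = 0
pos_correct = 0
# latin-1 maps every byte to a character, so this cannot raise a
# UnicodeDecodeError; whether it is the *right* decoding is an assumption
# worth verifying with chardet. Apply the same change to negative.txt.
with open("positive.txt", "r", encoding="latin-1") as f:
    for line in f.read().split('\n'):
        analysis = TextBlob(line)
        if analysis.sentiment.polarity > 0:
            pos_correct += 1
        pos_count += 1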

Encoding UTF-8 issue when trying to read a JSON file

I get the error shown below when trying to read a JSON file that is supposedly UTF-8 encoded. Does anyone know how I can resolve this issue?
import json
import pandas as pd

reviews = pd.read_csv('reviews.csv', nrows=1000)
businesses = pd.read_csv('businesses.csv', nrows=1000)

checkins = []
with open('checkins.json', encoding='utf-8') as f:
    for row in f.readlines()[:1000]:
        checkins.append(json.loads(row))
---------------------------------------------------------------------------
UnicodeDecodeError Traceback (most recent call last)
<ipython-input-10-4f54896faeca> in <module>
3 checkins = []
4 with open('checkins.json', encoding='utf-8') as f:
----> 5 for row in f.readlines()[:1000]:
6 checkins.append(json.loads(row))
~\Anaconda3\lib\codecs.py in decode(self, input, final)
320 # decode input (taking the buffer into account)
321 data = self.buffer + input
--> 322 (result, consumed) = self._buffer_decode(data, self.errors, final)
323 # keep undecoded input until the next call
324 self.buffer = data[consumed:]
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xda in position 37: invalid continuation byte
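Despite the file being "supposed to be" UTF-8, byte 0xda ('Ú' in Latin-1/Windows-1252) says otherwise. A small sketch using chardet to guess the actual encoding before reading; the 100 kB sample size is an arbitrary choice.

import json
import chardet

# Guess the real encoding from a sample of the raw bytes.
with open('checkins.json', 'rb') as f:
    guess = chardet.detect(f.read(100_000))
print(guess)  # e.g. {'encoding': 'ISO-8859-1', 'confidence': 0.73, ...}

checkins = []
with open('checkins.json', encoding=guess['encoding']) as f:
    for row in f.readlines()[:1000]:
        checkins.append(json.loads(row))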

BertLMDataBunch.from_raw_corpus UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe9 in position 49: invalid continuation byte

I am having trouble fine-tuning Camembert using the fast-bert library; I get this error message when creating a LMDataBunch.
Does anyone know how to fix this?
Thanks
PS: logger is initialized using logging.getLogger()
databunch_lm = BertLMDataBunch.from_raw_corpus(
    data_dir=DATA_PATH,
    text_list=all_texts,
    tokenizer='camembert-base',
    batch_size_per_gpu=16,
    max_seq_length=512,
    multi_gpu=False,
    model_type='camembert-base',
    logger=logger)
07/05/2020 14:50:31 - INFO - transformers.tokenization_utils_base - loading file
https://s3.amazonaws.com/models.huggingface.co/bert/camembert-base-sentencepiece.bpe.model from cache at C:\Users\Nawel/.cache\torch\hub\transformers\3715e3a4a2de48834619b2a6f48979e13ddff5cabfb1f3409db689f9ce3bb98f.28d30f926f545047fc59da64289371eef0fbdc0764ce9ec56f808a646fcfec59
07/05/2020 14:50:31 - INFO - root - Creating features from dataset file C:\Users\Desktop\Stage\Camembert\data\lm_train.txt
---------------------------------------------------------------------------
UnicodeDecodeError Traceback (most recent call last)
<ipython-input-136-5e7363fcd4d6> in <module>
7 multi_gpu=False,
8 model_type='camembert-base',
----> 9 logger=logger)
~\anaconda3\lib\site-packages\fast_bert\data_lm.py in from_raw_corpus(data_dir, text_list, tokenizer, batch_size_per_gpu, max_seq_length, multi_gpu, test_size, model_type, logger, clear_cache, no_cache)
198 logger=logger,
199 clear_cache=clear_cache,
--> 200 no_cache=no_cache,
201 )
202
~\anaconda3\lib\site-packages\fast_bert\data_lm.py in __init__(self, data_dir, tokenizer, train_file, val_file, batch_size_per_gpu, max_seq_length, multi_gpu, model_type, logger, clear_cache, no_cache)
270 cached_features_file,
271 self.logger,
--> 272 block_size=self.tokenizer.max_len_single_sentence,
273 )
274
~\anaconda3\lib\site-packages\fast_bert\data_lm.py in __init__(self, tokenizer, file_path, cache_path, logger, block_size)
131 self.examples = []
132 with open(file_path, encoding="utf-8") as f:
--> 133 text = f.read()
134
135 tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
~\anaconda3\lib\codecs.py in decode(self, input, final)
320 # decode input (taking the buffer into account)
321 data = self.buffer + input
--> 322 (result, consumed) = self._buffer_decode(data, self.errors, final)
323 # keep undecoded input until the next call
324 self.buffer = data[consumed:]
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe9 in position 63: invalid continuation byte
I'm closing this; I just had to change the encoding of the file to UTF-8.
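For anyone hitting the same thing, a minimal re-encoding sketch matching that fix. Byte 0xe9 is 'é' in Latin-1/Windows-1252, which is plausible for a French corpus, so cp1252 is the assumed source encoding here; the path is the one from the log above.

src = r'C:\Users\Desktop\Stage\Camembert\data\lm_train.txt'

# Read with the assumed original encoding, then rewrite the file as UTF-8.
with open(src, 'r', encoding='cp1252') as f:
    text = f.read()
with open(src, 'w', encoding='utf-8') as f:
    f.write(text)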

UnicodeDecodeError when trying to read docx file

The error occurs when opening a .docx file with Python 3.
When I tried to run:
file = open("jinuj.docx", "r", encoding="utf-8").read()
the error below occurred:
319 # decode input (taking the buffer into account)
320 data = self.buffer + input
--> 321 (result, consumed) = self._buffer_decode(data, self.errors, final)
322 # keep undecoded input until the next call
323 self.buffer = data[consumed:]
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xb6 in position 11: invalid start byte
A .docx file is a zipped XML archive, not plain text, so it cannot be decoded with a text codec like UTF-8. python-docx can open a document from a so-called file-like object. It can also save to a file-like object:
from docx import Document

f = open('jinuj.docx', 'rb')
document = Document(f)
f.close()

OR

from io import BytesIO
from docx import Document

# The stream must hold bytes, not str, so use BytesIO rather than StringIO.
with open('jinuj.docx', 'rb') as f:
    source_stream = BytesIO(f.read())
document = Document(source_stream)
source_stream.close()
Docs
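Once the document is open, its text is reachable through the paragraphs collection; a short usage sketch:

# Join the text of every paragraph into one string.
text = '\n'.join(p.text for p in document.paragraphs)
print(text)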

Pandas read_sas error: 'ascii' codec can't decode byte 0xd8 in position 0: ordinal not in range(128)

I am using pandas 0.18 to open a sas7bdat dataset.
I simply use:
df = pd.read_sas('P:/myfile.sas7bdat')
and I get the following error
buf[0:text_block_size].rstrip(b"\x00 ").decode())
UnicodeDecodeError: 'ascii' codec can't decode byte 0xd8 in position 0: ordinal not in range(128)
If I use
import sys
reload(sys)
sys.setdefaultencoding("utf-8")
I get
UnicodeDecodeError: 'utf8' codec can't decode byte 0xd8 in position 0: invalid continuation byte
Other sas7bdat files in my folder are handled just fine by Pandas.
When I open the file in SAS I see that the column names are very long and span several lines, but otherwise the files look just fine.
There are not many possible options in read_sas... what should I do?
Many thanks!
You probably have to set the encoding to UTF-8. Something like this (according to the docs):
df = pd.read_sas('P:/myfile.sas7bdat', encoding='utf-8')
I have the same problem. Even though I pass encoding='utf-8', I still get the error below:
---------------------------------------------------------------------------
UnicodeDecodeError Traceback (most recent call last)
<ipython-input-20-5deb45266124> in <module>
----> 1 df = pd.read_sas("/workspace/em_data1/dev/sas_data/bureau/data_validation/dnb/freq_202008/_freq_2138_201503_202009.sas7bdat",encoding='utf-8')
/opt/Anaconda/2018.12/lib/python3.7/site-packages/pandas/io/sas/sasreader.py in read_sas(filepath_or_buffer, format, index, encoding, chunksize, iterator)
121
122 reader = SAS7BDATReader(
--> 123 filepath_or_buffer, index=index, encoding=encoding, chunksize=chunksize
124 )
125 else:
/opt/Anaconda/2018.12/lib/python3.7/site-packages/pandas/io/sas/sas7bdat.py in __init__(self, path_or_buf, index, convert_dates, blank_missing, chunksize, encoding, convert_text, convert_header_text)
144
145 self._get_properties()
--> 146 self._parse_metadata()
147
148 def column_data_lengths(self):
/opt/Anaconda/2018.12/lib/python3.7/site-packages/pandas/io/sas/sas7bdat.py in _parse_metadata(self)
349 self.close()
350 raise ValueError("Failed to read a meta data page from the SAS file.")
--> 351 done = self._process_page_meta()
352
353 def _process_page_meta(self):
/opt/Anaconda/2018.12/lib/python3.7/site-packages/pandas/io/sas/sas7bdat.py in _process_page_meta(self)
355 pt = [const.page_meta_type, const.page_amd_type] + const.page_mix_types
356 if self._current_page_type in pt:
--> 357 self._process_page_metadata()
358 is_data_page = self._current_page_type & const.page_data_type
359 is_mix_page = self._current_page_type in const.page_mix_types
/opt/Anaconda/2018.12/lib/python3.7/site-packages/pandas/io/sas/sas7bdat.py in _process_page_metadata(self)
390 subheader_signature, pointer.compression, pointer.ptype
391 )
--> 392 self._process_subheader(subheader_index, pointer)
393
394 def _get_subheader_index(self, signature, compression, ptype):
/opt/Anaconda/2018.12/lib/python3.7/site-packages/pandas/io/sas/sas7bdat.py in _process_subheader(self, subheader_index, pointer)
458 raise ValueError("unknown subheader index")
459
--> 460 processor(offset, length)
461
462 def _process_rowsize_subheader(self, offset, length):
/opt/Anaconda/2018.12/lib/python3.7/site-packages/pandas/io/sas/sas7bdat.py in _process_columntext_subheader(self, offset, length)
512 cname = cname_raw
513 if self.convert_header_text:
--> 514 cname = cname.decode(self.encoding or self.default_encoding)
515 self.column_names_strings.append(cname)
516
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe4 in position 0: invalid continuation byte
From my Unix shell, I get this:
echo $LANG
en_US.UTF-8
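A hedged suggestion: byte 0xe4 is 'ä' in Latin-1/Windows-1252, and SAS datasets are very often written with the WLATIN1 (cp1252) session encoding, so the shell's en_US.UTF-8 locale is beside the point. Passing that encoding instead of utf-8 is worth trying; cp1252 here is an assumption about the file, not something the traceback proves.

import pandas as pd

df = pd.read_sas(
    "/workspace/em_data1/dev/sas_data/bureau/data_validation/dnb/"
    "freq_202008/_freq_2138_201503_202009.sas7bdat",
    encoding="cp1252",  # assumed real encoding; 'latin-1' is another candidate
)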
