Unicode error ascii can't encode character - python

I am trying to import a csv file in order to train my classifier but I keep receiving this error
traceback (most recent call last):
File "updateClassif.py", line 17, in <module>
myClassif = NaiveBayesClassifier(fp, format="csv")
File "C:\Python27\lib\site-packages\textblob\classifiers.py", line 191, in __init__
super(NLTKClassifier, self).__init__(train_set, feature_extractor, format, **kwargs)
File "C:\Python27\lib\site-packages\textblob\classifiers.py", line 123, in __init__
self.train_set = self._read_data(train_set, format)
File "C:\Python27\lib\site-packages\textblob\classifiers.py", line 143, in _read_data
return format_class(dataset, **self.format_kwargs).to_iterable()
File "C:\Python27\lib\site-packages\textblob\formats.py", line 68, in __init__
self.data = [row for row in reader]
File "C:\Python27\lib\site-packages\textblob\unicodecsv\__init__.py", line 106, in next
row = self.reader.next()
UnicodeEncodeError: 'ascii' codec can't encode character u'\xe6' in position 55: ordinal not in range(128)
The CSV file contains 1600000 lines of tweets so I believe some tweets contain special characters. I have tried saving it using open office as someone recommended but still the same result. I also tried using latin encoding but the same result.
This is my code :
with codecs.open('tr.csv', 'r' ,encoding='latin-1') as fp:
myClassif = NaiveBayesClassifier(fp, format="csv")
This is the code from the library I am using:
def __init__(self, csvfile, fieldnames=None, restkey=None, restval=None,
dialect='excel', encoding='utf-8', errors='strict', *args,
**kwds):
if fieldnames is not None:
fieldnames = _stringify_list(fieldnames, encoding)
csv.DictReader.__init__(self, csvfile, fieldnames, restkey, restval, dialect, *args, **kwds)
self.reader = UnicodeReader(csvfile, dialect, encoding=encoding,
errors=errors, *args, **kwds)
if fieldnames is None and not hasattr(csv.DictReader, 'fieldnames'):
# Python 2.5 fieldnames workaround. (http://bugs.python.org/issue3436)
reader = UnicodeReader(csvfile, dialect, encoding=encoding, *args, **kwds)
self.fieldnames = _stringify_list(reader.next(), reader.encoding)
self.unicode_fieldnames = [_unicodify(f, encoding) for f in
self.fieldnames]
self.unicode_restkey = _unicodify(restkey, encoding)
def next(self):
row = csv.DictReader.next(self)
result = dict((uni_key, row[str_key]) for (str_key, uni_key) in
izip(self.fieldnames, self.unicode_fieldnames))
rest = row.get(self.restkey)

In Python2, the csv module does not support unicode. So you must pass in some kind of iterator object (such as a file) which only produces byte-strings.
This means that your code should look like this:
with open('tr.csv', 'rb') as fp:
myClassif = NaiveBayesClassifier(fp, format="csv")
But note that the csv file must be encoded as UTF-8. If it's not, you will obviously need to convert it to UTF-8 first, in order for the code above to work.

Note that the traceback says EncodeError, not DecodeError. It looks like the NaiveBayesClassifier is expecting ascii. Either make it accept Unicode, or, if this is OK for your application, replace non-ascii characters with '?' or something.

Related

Keep getting Unicode Error with Streamlit

I am trying to use OpenAi and Streamlit together to merge them into a little dummy website. Here is the coding:
save_folder = "files"
if not os.path.exists(save_folder):
os.makedirs(save_folder)
uploaded_files = st.file_uploader("Choose a file", accept_multiple_files=True)
for uploaded_file in uploaded_files:
bytes_data = uploaded_file.read()
s = bytes_data.decode("UTF-8")
with open(f"{save_folder}/{uploaded_file.name}", "w") as f:
f.write(s)
SimpleDirectoryReader = download_loader("SimpleDirectoryReader")
loader = SimpleDirectoryReader(save_folder)
documents = loader.load_data()
index = GPTSimpleVectorIndex(documents)
index.save_to_disk('index.json')
question = st.text_input("What do you want me to do with the file uploaded?")
response = index.query(question)
st.write(response)
I keep getting the same error:
File "/home/appuser/venv/lib/python3.9/site-packages/streamlit/runtime/scriptrunner/script_runner.py", line 565, in _run_script
exec(code, module.__dict__)
File "/app/indextest/streamlit_app.py", line 17, in <module>
s = bytes_data.decode("UTF-8")
UnicodeDecodeError: 'utf-8' codec can't decode bytes in position 14-15: invalid continuation byte
Anyone know why?

Program crashes during reading text file

def process_file(self):
error_flag = 0
line_count = 0
log_file = self.file_name
pure_name = log_file.strip()
# print('Before opening file ',pure_name)
logfile_in = open(pure_name, 'r') # Read file
lines = logfile_in.readlines()
# print('After reading file enteries ', pure_name)
Error Message
Traceback (most recent call last):
File "C:\Users\admin\PycharmProjects\BackupLogCheck\main.py", line 49, in <module>
backupLogs.process_file()
File "C:\Users\admin\PycharmProjects\BackupLogCheck\main.py", line 20, in process_file
lines = logfile_in.readlines()
File "C:\Users\admin\AppData\Local\Programs\Python\Python39\lib\encodings\cp1252.py", line 23, in decode
return codecs.charmap_decode(input,self.errors,decoding_table)[0]
UnicodeDecodeError: 'charmap' codec can't decode byte 0x90 in position 350: character maps to <undefined>
Process finished with exit code 1
Line 49 is where I call above method. But I have traced that it crashes at reading the file. I have checked the file; it has just text in it. I don't know if there are some characters which it doesn't like on reading entries. I am running on Windows 10.
I am new to Python, any suggestion how to find/correct the issue?
Try the file name in string format
logfile_in = open('pure_name', 'r') # Read file
lines = logfile_in.readlines()
print(lines)
output
['test line one\n', 'test line two']
or
logfile_in = open('pure_name', 'r') # Read file
lines = logfile_in.readlines()
for line in lines:
print(line)
output
test line one
test line two

Converting .arff file to .csv using Python

I have a file "LMD.rh.arff" which I am trying to convert to .csv file using the following code-
import pandas as pd
import matplotlib.pyplot as plt
from scipy.io import arff
# Read in .arff file-
data = arff.loadarff("LMD.rh.arff")
But this last line of code gives me the error-
--------------------------------------------------------------------------- UnicodeEncodeError Traceback (most recent call
last) in
----> 1 data = arff.loadarff("LMD.rp.arff")
~/.local/lib/python3.6/site-packages/scipy/io/arff/arffread.py in
loadarff(f)
539 ofile = open(f, 'rt')
540 try:
--> 541 return _loadarff(ofile)
542 finally:
543 if ofile is not f: # only close what we opened
~/.local/lib/python3.6/site-packages/scipy/io/arff/arffread.py in
_loadarff(ofile)
627 a = generator(ofile)
628 # No error should happen here: it is a bug otherwise
--> 629 data = np.fromiter(a, descr)
630 return data, meta
631
UnicodeEncodeError: 'ascii' codec can't encode character '\xf3' in
position 4: ordinal not in range(128)
In [6]: data = arff.loadarff("LMD.rh.arff")
--------------------------------------------------------------------------- UnicodeEncodeError Traceback (most recent call
last) in
----> 1 data = arff.loadarff("LMD.rh.arff")
~/.local/lib/python3.6/site-packages/scipy/io/arff/arffread.py in
loadarff(f)
539 ofile = open(f, 'rt')
540 try:
--> 541 return _loadarff(ofile)
542 finally:
543 if ofile is not f: # only close what we opened
~/.local/lib/python3.6/site-packages/scipy/io/arff/arffread.py in
_loadarff(ofile)
627 a = generator(ofile)
628 # No error should happen here: it is a bug otherwise
--> 629 data = np.fromiter(a, descr)
630 return data, meta
631
UnicodeEncodeError: 'ascii' codec can't encode character '\xf3' in
position 4: ordinal not in range(128)
You can download the file arff_file
Any ideas as to what's going wrong?
Thanks!
Try this
path_to_directory="./"
files = [arff for arff in os.listdir(path_to_directory) if arff.endswith(".arff")]
def toCsv(content):
data = False
header = ""
newContent = []
for line in content:
if not data:
if "#attribute" in line:
attri = line.split()
columnName = attri[attri.index("#attribute")+1]
header = header + columnName + ","
elif "#data" in line:
data = True
header = header[:-1]
header += '\n'
newContent.append(header)
else:
newContent.append(line)
return newContent
# Main loop for reading and writing files
for zzzz,file in enumerate(files):
with open(path_to_directory+file , "r") as inFile:
content = inFile.readlines()
name,ext = os.path.splitext(inFile.name)
new = toCsv(content)
with open(name+".csv", "w") as outFile:
outFile.writelines(new)
Take a look at the error trace
UnicodeEncodeError: 'ascii' codec can't encode character '\xf3' in position 4: ordinal not in range(128)
Your error suggests you have some encoding problem with the file. Consider first opening the file with the correct encoding and then loading it to the arff loader
import codecs
import arff
file_ = codecs.load('LMD.rh.arff', 'rb', 'utf-8') # or whatever encoding you have
arff.load(file_) # now this should be fine
For reference see here

Python Unicode decode error- Not able to run script even after suggested correction

I am using python 2.7.9 to create excel sheet using tab delimited text files; however I am getting problem while running this python script
#!/usr/bin/env python
# encoding=utf8
import xlwt
import os
import sys
reload(sys)
sys.setdefaultencoding('utf8')
wb = xlwt.Workbook()
path = "/home/Final_analysis/"
#print(os.listdir())
lis = os.listdir(path)
sheetnumber = 1
for x in lis:
if os.path.isfile(x)==True:
extension = os.path.splitext(x)
print(extension[1])
if extension[1] == '.txt':
#print("Yes")
ws = wb.add_sheet(extension[0])
row = 0
column = 0
a = open(x)
while True:
a1 = a.readline()
if len(a1)==0:
break
data = a1.split("\t")
for z in data:
ws.write(row,column,z)
column += 1
column = 0
row += 1
sheetnumber+=1
else:
pass
wb.save("Ronic.xls")
I am getting following error
Traceback (most recent call last):
File "home/Final_analysis/combine_excel_v2.py", line 39, in <module>
wb.save("Ronic.xls")
File "/usr/local/lib/python2.7/site-packages/xlwt/Workbook.py", line 710, in save
doc.save(filename_or_stream, self.get_biff_data())
File "/usr/local/lib/python2.7/site-packages/xlwt/Workbook.py", line 674, in get_biff_data
shared_str_table = self.__sst_rec()
File "/usr/local/lib/python2.7/site-packages/xlwt/Workbook.py", line 636, in __sst_rec
return self.__sst.get_biff_record()
File "/usr/local/lib/python2.7/site-packages/xlwt/BIFFRecords.py", line 77, in get_biff_record
self._add_to_sst(s)
File "/usr/local/lib/python2.7/site-packages/xlwt/BIFFRecords.py", line 92, in _add_to_sst
u_str = upack2(s, self.encoding)
File "/usr/local/lib/python2.7/site-packages/xlwt/UnicodeUtils.py", line 50, in upack2
us = unicode(s, encoding)
UnicodeDecodeError: 'ascii' codec can't decode byte 0xc3 in position 83: ordinal not in range(128)
I have used answer given in thread How to fix: "UnicodeDecodeError: 'ascii' codec can't decode byte"
But it didn't work.
problem is at wb.save() command
Setting the encoding at the top of your program is to handle non-ascii characters in your code, not your data. sys.setdefaultencoding('utf8') is not intended to be used in ordinary programs and does more harm than good.
To fix the problem, tell xlwt about the encoding to use.
Change this line:
wb = xlwt.Workbook()
to this:
wb = xlwt.Workbook(encoding="UTF-8")

script to merge multiple csv's into single xslx not working

Have read all of the threads on this but I'm still hitting a dead end. Just trying to take all the csv's in a directory and add them as new sheets to a new xlsx workbook. Here's what I've got:
import xlwt, csv, os, glob
def make_excel_workbook(path):
wb = xlwt.Workbook()
for filename in os.listdir(folder_path):
if filename.endswith('.csv'):
ws = wb.add_sheet(os.path.splitext(filename)[0])
with open('{}\\{}'.format(folder_path, filename), 'rb') as csvfile:
reader = csv.reader(csvfile, delimiter=',')
for rowx, row in enumerate(reader):
for colx, value in enumerate(row):
ws.write(rowx, colx, value)
return wb
csvDir = "C:\\Temp\\Data\\outfiles"
outDir = "C:\\Temp\\Data\\output"
os.chdir(csvDir)
csvFileList = []
searchTerm = "character string"
for file in glob.glob('*.csv'):
csvFileList.append(file)
for i in csvFileList: # search a set of extant csv files for a string and make new csv files filtered on the search term
csv_file = csv.reader(open(i, 'rb'), delimiter=',')
rowList = []
for row in csv_file:
for field in row:
if searchTerm in field:
rowList.append(row)
outputCsvFile = os.path.join(rootDir, i)
with open(outputCsvFile, 'wb') as newCsvFile:
wr = csv.writer(newCsvFile, quoting=csv.QUOTE_ALL)
wr.writerows(rowList)
So far, it works, and creates the new csv files from the original, much larger ones. Here's where it breaks:
if __name__ == '__main__':
xls = make_excel_workbook(outDir)
xls_name = "My_Team_Tasks"
xls.save('{}\\{}{}.'format(outDir, xls_name, '.xls'))
print('{}\\{}{} saved successfully'.format(outDir, xls_name, '.xls'))
when it gets to xls.save, it gives me the following error:
Update: here's the entire traceback:
Traceback (most recent call last):
File"M:/Testing/scripts/csv_parse.py", line 44, in <module>
xls.save('{}\\{}{}'.format(rootDir, xls_name, '.xls'))
File "C:\Python27\ArcGIS10.4\lib\site-packages\xlwt\Workbook.py", line 696, in save
doc.save(filename_or_stream, self.get_biff_data())
File "C:\Python27\ArcGIS10.4\lib\site-packages\xlwt\Workbook.py", line 660, in get_biff_data
shared_str_table = self.__sst_rec()
File "C:\Python27\ArcGIS10.4\lib\site-packages\xlwt\Workbook.py", line 662, in __sst_rec
return self.__sst.get_biff_record()
File "C:\Python27\ArcGIS10.4\lib\site-packages\xlwt\BIFFRecords.py", line 77, in get_biff_record
self._add_to_sst(s)
File "C:\Python27\ArcGIS10.4\lib\site-packages\xlwt\BIFFRecords.py", line 92, in _add_to_sst
u_str = upack2(s, self.encoding)
File "C:\Python27\ArcGIS10.4\lib\site-packages\xlwt\UnicodeUtils.py", line 50, in upack2
us = unicode(s, encoding)
UnicodeDecodeError: 'ascii' codec can't decode byte 0xc2 in position 69: ordinal not in range (128)
Do you know how the input CSV files are encoded? It appears from the error message to be unicode?
You can try:
wb = xlwt.Workbook(encoding='utf-8')
Failing that, as per this answer (xlwt module - saving xls unicode error) it seems another possible way to get around this issue is to encode your text into unicode before writing out.
ws.write(rowx, colx, value.decode('utf-8'))
Again, it depends on how your inputs are encoded.

Categories