Python - SQLite to CSV Writer Error - ASCII values not parsed

Afternoon,
I am having some trouble with a SQLite to CSV Python script. I have searched high and low for an answer, but none of the suggestions have worked for me, or I am having a problem with my syntax.
I want to replace characters in the SQLite database which fall outside of the ASCII range (code points above 127).
Here is the script I have been using:
#!/opt/local/bin/python
import sqlite3
import csv, codecs, cStringIO

class UnicodeWriter:
    """
    A CSV writer which will write rows to CSV file "f",
    which is encoded in the given encoding.
    """
    def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
        # Redirect output to a queue
        self.queue = cStringIO.StringIO()
        self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
        self.stream = f
        self.encoder = codecs.getincrementalencoder(encoding)()

    def writerow(self, row):
        self.writer.writerow([unicode(s).encode("utf-8") for s in row])
        # Fetch UTF-8 output from the queue ...
        data = self.queue.getvalue()
        data = data.decode("utf-8")
        # ... and reencode it into the target encoding
        data = self.encoder.encode(data)
        # write to the target stream
        self.stream.write(data)
        # empty queue
        self.queue.truncate(0)

    def writerows(self, rows):
        for row in rows:
            self.writerow(row)

conn = sqlite3.connect('test.db')
c = conn.cursor()

# Select whichever rows you want in whatever order you like
c.execute('select ROWID, Name, Type, PID from PID')

writer = UnicodeWriter(open("ProductListing.csv", "wb"))
# Make sure the list of column headers you pass in is in the same order as your SELECT
writer.writerow(["ROWID", "Product Name", "Product Type", "PID"])
writer.writerows(c)
I have tried adding 'replace' as indicated in Python: Convert Unicode to ASCII without errors for CSV file, but I got the same error.
The error is a UnicodeDecodeError:
Traceback (most recent call last):
  File "SQLite2CSV1.py", line 53, in <module>
    writer.writerows(c)
  File "SQLite2CSV1.py", line 32, in writerows
    self.writerow(row)
  File "SQLite2CSV1.py", line 19, in writerow
    self.writer.writerow([unicode(s).encode("utf-8") for s in row])
UnicodeDecodeError: 'ascii' codec can't decode byte 0xa0 in position 65: ordinal not in range(128)
Obviously I want the code to be robust enough that if it encounters characters outside of these bounds, it replaces them with a character such as '?' (\x3f).
Is there a way to do this within the UnicodeWriter class? And is there a way I can make the code robust enough that it won't produce these errors?
Your help is greatly appreciated.

If you just want to write an ASCII CSV, simply use the stock csv.writer(). To ensure that all values passed are indeed ASCII, use encode('ascii', errors='replace').
Example:
import csv

rows = [
    [u'some', u'other', u'more'],
    [u'umlaut:\u00fd', u'euro sign:\u20ac', '']
]

with open('/tmp/test.csv', 'wb') as csvFile:
    writer = csv.writer(csvFile)
    for row in rows:
        asciifiedRow = [item.encode('ascii', errors='replace') for item in row]
        print '%r --> %r' % (row, asciifiedRow)
        writer.writerow(asciifiedRow)
The console output for this is:
[u'some', u'other', u'more'] --> ['some', 'other', 'more']
[u'umlaut:\xfd', u'euro sign:\u20ac', ''] --> ['umlaut:?', 'euro sign:?', '']
The resulting CSV file contains:
some,other,more
umlaut:?,euro sign:?,
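If you would rather keep the UnicodeWriter class from the question, the same idea can be applied inside writerow. This is only a minimal sketch, and it assumes the offending byte strings are UTF-8 encoded (the 0xa0 byte in the traceback could equally mean Latin-1 or Windows-1252, in which case swap the codec):

    def writerow(self, row):
        cleaned = []
        for s in row:
            if isinstance(s, str):
                # bytes that are not valid UTF-8 become u'\ufffd' instead of raising
                s = s.decode('utf-8', 'replace')
            # force everything into ASCII; anything outside it becomes '?'
            cleaned.append(unicode(s).encode('ascii', 'replace'))
        self.writer.writerow(cleaned)
        # the queue now holds pure ASCII, so it can be flushed as-is
        self.stream.write(self.queue.getvalue())
        self.queue.truncate(0)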

With access to a Unix environment, here's what worked for me:
sqlite3.exe a.db .dump > a.sql;
tr -d "[\\200-\\377]" < a.sql > clean.sql;
sqlite3.exe clean.db < clean.sql;
(It's not a Python solution, but maybe its brevity will help someone else. Note that this solution STRIPS OUT all non-ASCII characters rather than trying to replace them.)

Related

How do I deal with non-ASCII characters from CSV when using json.loads in Python?

I looked at some answers, including this one, but none seem to answer my question.
Here are some example lines from CSV:
_id category
ObjectId(56266da778d34fdc048b470b) [{"group":"Home","id":"53cea0be763f4a6f4a8b459e","name":"Cleaning Services","name_singular":"Cleaning Service"}]
ObjectId(56266e0c78d34f22058b46de) [{"group":"Local","id":"5637a1b178d34f20158b464f","name":"Balloon Dí©cor","name_singular":"Balloon Dí©cor"}]
Here is my code:
import csv
import sys
from sys import argv
import json
def ReadCSV(csvfile):
    with open('newCSVFile.csv', 'wb') as g:
        filewriter = csv.writer(g)  # , delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
        with open(csvfile, 'rb') as f:
            reader = csv.reader(f)  # create reader object
            next(reader)  # skip first row
            for row in reader:  # go through all the rows
                listForExport = []  # initialize list that will have two items: id and list of categories

                # ID section
                vendorId = str(row[0])  # pull the raw vendor id out of the first column of the csv
                vendorId = vendorId[9:33]  # slice to remove the ObjectId label and parentheses
                listForExport.append(vendorId)  # add vendor ID as the first item in the list

                # categories section
                tempCatList = []  # temporary list of categories for the second item in listForExport

                # this is line 41, where the error stems from
                categories = json.loads(row[1])  # creates a dict with the categories from a given row
                for names in categories:  # loop through the category names using the key 'name'
                    print names['name']
Here's what I get:
Cleaning Services
Traceback (most recent call last):
  File "csvtesting.py", line 57, in <module>
    ReadCSV(csvfile)
  File "csvtesting.py", line 41, in ReadCSV
    categories = json.loads(row[1])  # creates a dict with the categories from a given row
  File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/json/__init__.py", line 338, in loads
    return _default_decoder.decode(s)
  File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/json/decoder.py", line 366, in decode
    obj, end = self.raw_decode(s, idx=_w(s, 0).end())
  File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/json/decoder.py", line 382, in raw_decode
    obj, end = self.scan_once(s, idx)
UnicodeDecodeError: 'utf8' codec can't decode bytes in position 9-10: invalid continuation byte
So the code pulls out the first category, Cleaning Services, but then fails when it gets to the non-ASCII characters.
How do I deal with this? I'm happy to just remove any non-ASCII items.
Since you open the input CSV file in rb mode, I assume you are using Python 2.x. The good news is that the csv part is not the problem, because the csv reader reads plain bytes without trying to interpret them. But the json module insists on decoding the text to unicode, and by default it uses UTF-8. As your input file is not UTF-8 encoded, it chokes and raises a UnicodeDecodeError.
Latin1 has a nice property: the Unicode code point of any byte is just the value of the byte, so you are sure to be able to decode anything; whether the result makes sense then depends on the actual encoding really being Latin1...
So you could just do:
categories = json.loads(row[1], encoding="Latin1")
Alternatively, if you want to ignore non-ASCII characters, you could first decode the byte string to unicode, ignoring errors, and only then load the json:
categories = json.loads(row[1].decode(errors='ignore'))  # ignore all non-ASCII characters
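For a quick feel for the difference between the two approaches, here is a small standalone demo; the sample byte string is hypothetical, standing in for a Latin-1 encoded cell like the Décor row above:

import json

raw = '[{"name": "Balloon D\xe9cor"}]'  # Latin-1 bytes, not valid UTF-8

# option 1: tell json which 8-bit encoding the bytes use
print repr(json.loads(raw, encoding="Latin1")[0]['name'])
# u'Balloon D\xe9cor'

# option 2: strip everything non-ASCII before parsing
print repr(json.loads(raw.decode('ascii', 'ignore'))[0]['name'])
# u'Balloon Dcor'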
Most probably you have some non-ASCII characters in your CSV content.
import re

def remove_unicode(text):
    if not text:
        return text
    if isinstance(text, str):
        text = str(text.decode('ascii', 'ignore'))
    else:
        text = text.encode('ascii', 'ignore')
    remove_ctrl_chars_regex = re.compile(r'[^\x20-\x7e]')
    return remove_ctrl_chars_regex.sub('', text)

...
vendorId = remove_unicode(row[0])
...
categories = json.loads(remove_unicode(row[1]))

how to write a unicode csv in Python 2.7

I want to write data to files where a row from the CSV should look like this list (taken directly from the Python console):
row = ['\xef\xbb\xbft_11651497', 'http://kozbeszerzes.ceu.hu/entity/t/11651497.xml', "Szabolcs Mag '98 Kft.", 'ny\xc3\xadregyh\xc3\xa1za', 'ny\xc3\xadregyh\xc3\xa1za', '4400', 't\xc3\xbcnde utca 20.', 47.935175, 21.744975, u'Ny\xedregyh\xe1za', u'Borb\xe1nya', u'Szabolcs-Szatm\xe1r-Bereg', u'Ny\xedregyh\xe1zai', u'20', u'T\xfcnde utca', u'Magyarorsz\xe1g', u'4405']
Py2k's csv module does not do Unicode, but I had a UnicodeWriter wrapper:
import csv
import cStringIO, codecs

class UnicodeWriter:
    """
    A CSV writer which will write rows to CSV file "f",
    which is encoded in the given encoding.
    """
    def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
        # Redirect output to a queue
        self.queue = cStringIO.StringIO()
        self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
        self.stream = f
        self.encoder = codecs.getincrementalencoder(encoding)()

    def writerow(self, row):
        self.writer.writerow([unicode(s).encode("utf-8") for s in row])
        # Fetch UTF-8 output from the queue ...
        data = self.queue.getvalue()
        data = data.decode("utf-8")
        # ... and reencode it into the target encoding
        data = self.encoder.encode(data)
        # write to the target stream
        self.stream.write(data)
        # empty queue
        self.queue.truncate(0)

    def writerows(self, rows):
        for row in rows:
            self.writerow(row)
However, these lines still produce the dreaded encoding error message below:
f.write(codecs.BOM_UTF8)
writer = UnicodeWriter(f)
writer.writerow(row)
UnicodeEncodeError: 'ascii' codec can't encode character u'\xfc' in position 9: ordinal not in range(128)
What is there to do? Thanks!
You are passing bytestrings containing non-ASCII data in, and these are being decoded to Unicode using the default codec at this line:
self.writer.writerow([unicode(s).encode("utf-8") for s in row])
unicode(bytestring) with data that cannot be decoded as ASCII fails:
>>> unicode('\xef\xbb\xbft_11651497')
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
UnicodeDecodeError: 'ascii' codec can't decode byte 0xef in position 0: ordinal not in range(128)
Decode the data to Unicode before passing it to the writer:
row = [v.decode('utf8') if isinstance(v, str) else v for v in row]
This assumes that your bytestring values contain UTF-8 data. If you have a mix of encodings, try to decode to Unicode at the point of origin, where your program first sourced the data. You really want to do that anyway, regardless of where the data came from and even if it was already encoded as UTF-8.
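Put together, a minimal usage sketch; it assumes the UnicodeWriter class and the row value from the question, that the byte strings really are UTF-8, and a hypothetical output path:

import codecs

with open('output.csv', 'wb') as f:  # hypothetical output file
    f.write(codecs.BOM_UTF8)
    writer = UnicodeWriter(f)
    # byte strings are decoded to unicode first; unicode values pass through untouched
    row = [v.decode('utf8') if isinstance(v, str) else v for v in row]
    writer.writerow(row)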

DictReader and UnicodeError

import csv
import io

def openFile(fileName):
    try:
        trainFile = io.open(fileName, "r", encoding="utf-8")
    except IOError as e:
        print("File could not be opened: {}".format(e))
    else:
        trainData = csv.DictReader(trainFile)
        print trainData
        return trainData

def computeTFIDF(trainData):
    bodyList = []
    print "Inside computeTFIDF"
    for row in trainData:
        for key, value in row.iteritems():
            print key, unicode(value, "utf-8", "ignore")
    print "Done"
    return

if __name__ == "__main__":
    print "Main"
    trainData = openFile("../Data/TrainSample.csv")
    print "File Opened"
    computeTFIDF(trainData)
Error:
Traceback (most recent call last):
  File "C:\DebSeal\IUB MS Program\IUB Sem III\Facebook Kaggle Comp\Src\facebookChallenge.py", line 62, in <module>
    computeTFIDF(trainData)
  File "C:\DebSeal\IUB MS Program\IUB Sem III\Facebook Kaggle Comp\Src\facebookChallenge.py", line 42, in computeTFIDF
    for row in trainData:
  File "C:\Python27\lib\csv.py", line 104, in next
    row = self.reader.next()
UnicodeEncodeError: 'ascii' codec can't encode character u'\u201c' in position 215: ordinal not in range(128)
TrainSample.csv is a CSV file with 4 columns (with a header).
OS: Windows 7 64 bit.
Using Python 2.x
I don't know what is going wrong here. I told it to ignore encoding errors, but it still throws the same error.
I think the error is thrown before control even reaches my decoding call.
Can anybody tell me where I am going wrong?
The Python 2 CSV module does not handle Unicode input.
Open the file in binary mode, and decode after parsing it as CSV. This is safe for the UTF-8 codec as newlines, delimiters and quotes all encode to 1 byte.
The csv module documentation includes a UnicodeReader wrapper class in the example section that will do the decoding for you; it is easily adapted to the DictReader class:
import csv

class UnicodeDictReader:
    """
    A CSV reader which will iterate over lines in the CSV file "f",
    which is encoded in the given encoding.
    """
    def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
        self.encoding = encoding
        self.reader = csv.DictReader(f, dialect=dialect, **kwds)

    def next(self):
        row = self.reader.next()
        # decode each value using the configured encoding
        return {k: unicode(v, self.encoding) for k, v in row.iteritems()}

    def __iter__(self):
        return self
Use this with the file opened in binary mode:
def openFile(fileName):
    try:
        trainFile = open(fileName, "rb")
    except IOError as e:
        print "File could not be opened: {}".format(e)
    else:
        return UnicodeDictReader(trainFile)
I can't comment on Martijn's answer, but his solution worked perfectly for me after a little upgrade, which I leave here for others:
def next(self):
    row = self.reader.next()
    try:
        d = dict((unicode(k, self.encoding), unicode(v, self.encoding))
                 for k, v in row.iteritems())
    except TypeError:
        d = row
    return d
One thing to note is that Python 2.6 and lower don't support dict comprehensions.
Another is that dict values can be of different types, which the unicode() call can't always handle, so it's worth catching TypeError in case of a null or a number.
One more thing that drove me crazy: it doesn't work when you open the file with an encoding! Just keep it a simple open().

UnicodeDecodeError: while writing into a file

I get this error while writing into a file. How can I handle it?
Traceback (most recent call last):
  File "C:\Python27\AureusBAXProjectFB.py", line 278, in <module>
    rows = [[unicode(x) for x in row] for row in outlist]
UnicodeDecodeError: 'ascii' codec can't decode byte 0xe0 in position 0: ordinal not in range(128)
Code for writing into a file
import csv, codecs, cStringIO

class UnicodeWriter:
    """
    A CSV writer which will write rows to CSV file "f",
    which is encoded in the given encoding.
    """
    def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
        # Redirect output to a queue
        self.queue = cStringIO.StringIO()
        self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
        self.stream = f
        self.encoder = codecs.getincrementalencoder(encoding)()

    def writerow(self, row):
        self.writer.writerow([s.encode("utf-8") for s in row])
        # Fetch UTF-8 output from the queue ...
        data = self.queue.getvalue()
        data = data.decode("utf-8")
        # ... and reencode it into the target encoding
        data = self.encoder.encode(data)
        # write to the target stream
        self.stream.write(data)
        # empty queue
        self.queue.truncate(0)

    def writerows(self, rows):
        for row in rows:
            self.writerow(row)

# outlist is built elsewhere (from the BeautifulSoup parsing)
with open('C:/Users/Desktop/fboutput.csv', 'wb') as f:
    writer = UnicodeWriter(f)
    rows = [[unicode(x) for x in row] for row in outlist]
    writer.writerows(rows)
I am using BeautifulSoup to parse the HTML data and that's working fine. I only get an error while writing into a file.
The unicode() constructor is defined as unicode(string[, encoding, errors]), and the encoding defaults to ascii. If outlist contains multi-byte strings, you should specify the encoding explicitly, e.g. utf-8.
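For example, here is a hedged sketch of the failing line with the encoding made explicit. It assumes the byte strings scraped with BeautifulSoup are UTF-8 encoded; the 0xe0 byte could equally be Latin-1, in which case use that codec instead:

# decode byte strings (and stringified numbers) explicitly instead of relying
# on the implicit ASCII default; undecodable bytes become u'\ufffd'
rows = [[x if isinstance(x, unicode) else unicode(str(x), 'utf-8', 'replace') for x in row]
        for row in outlist]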

python csv unicode 'ascii' codec can't encode character u'\xf6' in position 1: ordinal not in range(128)

I have copied this script from the Python web site. This is another question, but now the problem is with encoding:
import sqlite3
import csv
import codecs
import cStringIO
import sys

class UTF8Recoder:
    """
    Iterator that reads an encoded stream and reencodes the input to UTF-8
    """
    def __init__(self, f, encoding):
        self.reader = codecs.getreader(encoding)(f)

    def __iter__(self):
        return self

    def next(self):
        return self.reader.next().encode("utf-8")

class UnicodeReader:
    """
    A CSV reader which will iterate over lines in the CSV file "f",
    which is encoded in the given encoding.
    """
    def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
        f = UTF8Recoder(f, encoding)
        self.reader = csv.reader(f, dialect=dialect, **kwds)

    def next(self):
        row = self.reader.next()
        return [unicode(s, "utf-8") for s in row]

    def __iter__(self):
        return self

class UnicodeWriter:
    """
    A CSV writer which will write rows to CSV file "f",
    which is encoded in the given encoding.
    """
    def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
        # Redirect output to a queue
        self.queue = cStringIO.StringIO()
        self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
        self.stream = f
        self.encoder = codecs.getincrementalencoder(encoding)()

    def writerow(self, row):
        self.writer.writerow([s.encode("utf-8") for s in row])
        # Fetch UTF-8 output from the queue ...
        data = self.queue.getvalue()
        data = data.decode("utf-8")
        # ... and reencode it into the target encoding
        data = self.encoder.encode(data)
        # write to the target stream
        self.stream.write(data)
        # empty queue
        self.queue.truncate(0)

    def writerows(self, rows):
        for row in rows:
            self.writerow(row)
This time the problem is with encoding. When I ran this, it gave me this error:
Traceback (most recent call last):
  File "makeCSV.py", line 87, in <module>
    uW.writerow(d)
  File "makeCSV.py", line 54, in writerow
    self.writer.writerow([s.encode("utf-8") for s in row])
AttributeError: 'int' object has no attribute 'encode'
Then I converted all integers to strings, but this time I got this error:
Traceback (most recent call last):
  File "makeCSV.py", line 87, in <module>
    uW.writerow(d)
  File "makeCSV.py", line 54, in writerow
    self.writer.writerow([str(s).encode("utf-8") for s in row])
UnicodeEncodeError: 'ascii' codec can't encode character u'\xf6' in position 1: ordinal not in range(128)
I implemented the above to deal with Unicode characters, but it still gives me this error. What is the problem and how do I fix it?
Then I converted all integers to strings,
You converted both integers and strings to byte strings. For strings this will use the default character encoding which happens to be ASCII, and this fails when you have non-ASCII characters. You want unicode instead of str.
self.writer.writerow([unicode(s).encode("utf-8") for s in row])
It might be better to convert everything to unicode before calling that method. The class is designed specifically for parsing Unicode strings. It was not designed to support other data types.
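For instance, here is a sketch of normalising a mixed row of numbers and byte strings before handing it to the writer (uW and d are the names from the traceback; the UTF-8 assumption is mine):

def to_unicode(value, encoding='utf-8'):
    # byte strings get an explicit decode so the ASCII default is never used;
    # numbers and other objects go through plain unicode()
    if isinstance(value, str):
        return value.decode(encoding)
    return unicode(value)

uW.writerow([to_unicode(s) for s in d])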
From the documentation:
http://docs.python.org/library/stringio.html?highlight=cstringio#cStringIO.StringIO
Unlike the StringIO module, this module is not able to accept Unicode strings that cannot be encoded as plain ASCII strings.
I.e. only 7-bit clean strings can be stored.
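A quick interactive session illustrating that limitation (traceback abridged):

>>> import cStringIO
>>> buf = cStringIO.StringIO()
>>> buf.write(u'ascii only')  # fine: the string can be encoded as ASCII
>>> buf.write(u'\xf6')        # non-ASCII unicode
Traceback (most recent call last):
  ...
UnicodeEncodeError: 'ascii' codec can't encode character u'\xf6' in position 0: ordinal not in range(128)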
If you are using Python 2, write the encoding as str(s.encode("utf-8")), i.e.:
def writerow(self, row):
    self.writer.writerow([str(s.encode("utf-8")) for s in row])
    # Fetch UTF-8 output from the queue ...
    data = self.queue.getvalue()
    data = data.decode("utf-8")
    # ... and reencode it into the target encoding
    data = self.encoder.encode(data)
    # write to the target stream
    self.stream.write(data)
    # empty queue
    self.queue.truncate(0)
