Stream Bytes chunks to csv rows in python - python

I need to process a large remote CSV line by line without downloading it entirely.
Below is the closest I got.
I iterate byte chunks from Azure, and have some code to handle truncated lines.
But this cannot work if csv values contain a newline, as I am not able to distinguish between newlines inside values and the newlines that terminate CSV records.
# this does not work
def azure_iter_lines(logger_scope, client, file_path):
    """Yield decoded text lines from a remote file, streaming it chunk by chunk.

    Lines split across chunk boundaries are reassembled before being yielded.

    NOTE(review): this still splits on every '\n', so CSV values containing
    quoted newlines will be broken apart — use a csv reader over a file-like
    adapter for that case.
    NOTE(review): chunk.decode("utf-8") can fail if a multi-byte character
    straddles a chunk boundary — TODO confirm Azure chunk alignment.
    """
    # get a StorageStreamDownloader
    # https://learn.microsoft.com/en-us/python/api/azure-storage-file-datalake/azure.storage.filedatalake.storagestreamdownloader?view=azure-python
    file_client = client.get_file_client(file_path)
    file_handle = file_client.download_file()
    truncated_line = ''
    for chunk in file_handle.chunks():
        # have the previous truncated line prepended to the next block
        chunk_txt = truncated_line + chunk.decode("utf-8")
        lines = chunk_txt.split('\n')
        # Every element except the last is a complete line. The original
        # sliced lines[0:len(lines)-2], silently dropping one line per chunk.
        for line in lines[:-1]:
            yield line
        truncated_line = lines[-1]
    # Flush a final line that had no trailing newline (the original re-split
    # the tail and dropped lines here too).
    if truncated_line:
        yield truncated_line
Ideally I would use csv.DictReader() but I was not able to do so, as it downloads the file entirely.
# this does not work
def azure_iter_lines(logger_scope, client, file_path):
    """Attempted DictReader variant — defeats streaming entirely.

    readinto() pulls the whole remote file into memory before parsing,
    and csv.DictReader is handed a BytesIO (it expects an iterable of
    text lines). Kept for illustration only.
    """
    file_client = client.get_file_client(file_path)
    file_handle = file_client.download_file()
    buffer = io.BytesIO()
    file_handle.readinto(buffer)  # THIS DOWNLOADS THE FILE ENTIRELY
    csvreader = csv.DictReader(buffer, delimiter=";")
    return csvreader
Here is an update using some hints by #H.Leger
Please note that this still does not work
# Stream the chunks, decode them lazily, and feed csv.DictReader directly.
file_client = client.get_file_client(file_path)
file_handle = file_client.download_file()
# iterdecode yields decoded *chunks*, not lines
stream = codecs.iterdecode(file_handle.chunks(), 'utf-8')
csvreader = csv.DictReader(stream, delimiter=";")
for row in csvreader:
    print(row)
# => _csv.Error: new-line character seen in unquoted field - do you need to open the file in universal-newline mode?
# NOTE(review): the csv module wants a line iterator with newline handling
# disabled (newline=''); presumably '\r' characters in the data trigger this
# error — verify against the source file.
EDIT: Final solution based on #paiv answer
EDIT: Updated solution to use io instead of codecs for faster parsing
import io
import csv
import ctypes as ct
# bytes chunk iterator to python stream adapter
# https://stackoverflow.com/a/67547597/2523414
class ChunksAdapter:
    """Minimal readable file object over an iterator of byte chunks.

    Implements just enough of the stream interface (read / readable /
    writable / seekable / close) for io.TextIOWrapper to sit on top of it.
    """

    def __init__(self, chunks):
        self.chunks = chunks   # iterator yielding bytes
        self.buf = b''         # leftover bytes from the last chunk
        self.closed = False

    def readable(self):
        return True

    def writable(self):
        return False

    def seekable(self):
        return False

    def close(self):
        self.closed = True

    def read(self, size=-1):
        """Return up to *size* bytes, or everything remaining when size < 0.

        Fixes two defects in the original:
        - an empty chunk arriving mid-stream was mistaken for EOF;
        - a negative size silently truncated the buffer (buf[:-1]).
        """
        if size is None or size < 0:
            data = b''.join([self.buf, *self.chunks])
            self.buf = b''
            return data
        while not self.buf:
            chunk = next(self.chunks, None)
            if chunk is None:
                return b''  # iterator exhausted: true EOF
            self.buf = chunk  # may be b'' — keep pulling
        res, self.buf = self.buf[:size], self.buf[size:]
        return res
# get the downloader object
file_client = client.get_file_client(file_path)
downloader = file_client.download_file()
# adapt the downloader iterator to a byte stream
file_object = ChunksAdapter(downloader.chunks())
# decode bytes stream to utf-8
# newline='' defers newline handling to the csv module, so quoted
# newlines inside fields survive intact
text_stream = io.TextIOWrapper(file_object, encoding='utf-8', newline='')
# update csv field limit to handle large fields
# https://stackoverflow.com/a/54517228/2523414
# (largest value the C parser accepts: the platform's max signed long)
csv.field_size_limit(int(ct.c_ulong(-1).value // 2))
csvreader = csv.DictReader(text_stream, delimiter=";", quotechar='"', quoting=csv.QUOTE_MINIMAL)
for row in csvreader:
    print(row)

Disclaimer: I know little Azure specifics. Ultimately, you would want to stream separate chunks too.
In Python, given a file object, you can set up CSV streaming this way:
import codecs
import csv
codec = codecs.getreader('utf-8')
text_stream = codec(file_object)
csvreader = csv.DictReader(text_stream)
Now you can iterate over csvreader, and it will read from file_object in a streaming fashion.
Edit: as #Martijn Pieters suggested, we can gain performance with TextIOWrapper instead of codecs:
text_stream = io.TextIOWrapper(file_object, encoding='utf-8', newline='')
Check the comment in csv module on newline parameter.
But Azure's StorageStreamDownloader does not provide python's file object interface. It has .chunks() generator (which I assume will invoke separate HTTP request to retrieve next chunk).
You can adapt .chunks() into a file object with a simple adapter:
class ChunksAdapter:
    """Present an iterator of byte chunks as a read()-able file object."""

    def __init__(self, chunks):
        self.chunks = chunks
        self.buf = b''

    def read(self, size):
        """Return up to *size* bytes; an exhausted iterator reads as b''."""
        if not self.buf:
            # buffer empty: pull the next chunk (b'' once exhausted)
            self.buf = next(self.chunks, b'')
        head = self.buf[:size]
        self.buf = self.buf[size:]
        return head
And use like
# Start the download and wrap its chunk iterator in the adapter.
downloader = file_client.download_file()
file_object = ChunksAdapter(downloader.chunks())
Be sure to configure DictReader for the appropriate CSV dialect.
And set appropriate values for max_single_get_size, max_chunk_get_size on the blob client.

I believe the requests package can be useful for you. Using the stream option while getting your file and the Response.iter_lines() function should do what you need :
import codecs
import csv
import requests
url = "https://navitia.opendatasoft.com//explore/dataset/all-datasets/download?format=csv"
r = requests.get(url, stream=True)  # using the stream option to avoid loading everything
try:
    buffer = r.iter_lines()  # iter_lines() will feed you the distant file line by line
    # NOTE(review): iter_lines() splits on newlines, so quoted CSV values
    # containing embedded newlines would still be broken apart — confirm
    # against the actual data.
    reader = csv.DictReader(codecs.iterdecode(buffer, 'utf-8'), delimiter=';')
    for row in reader:
        print(row)  # Do stuff here
finally:
    r.close()

Related

How to read CSV after metadata?

I have a CSV file like this:
#Description
#Param1: value
#Param2: value
...
#ParamN: value
Time (s),Header1,Header2
243.41745,3,1
243.417455,3,5
243.41746,7,6
...
I need to read it with Python without using Pandas as requirement. How to read the CSV data itself ignoring the initial lines until the empty one? I am using the code below that successfully reads the metadata.
def read(file_path: str):
    '''Read the data of the Digilent WaveForms Logic Analyzer Acquisition
    (model Discovery2).

    Parameter: File path.
    Returns: (data, meta) where data is a list of row dicts and meta maps
    parameter names (plus 'Description') to their values.
    '''
    meta = {}
    RE_CONFIG = re.compile(r'^#(?P<name>[^:]+)(: *(?P<value>.+)\s*$)*')
    with open(file_path, 'r') as fh:
        # Read the metadata and description at the beginning of the file.
        # Iterate the file object directly — NOT fh.readlines(), which slurps
        # the whole file and leaves fh at EOF — so the csv reader below
        # starts exactly at the header line after the blank separator.
        for line in fh:
            line = line.strip()
            if not line:
                break
            config = RE_CONFIG.match(line)
            if config:
                if not config.group('value'):
                    meta.update({'Description': config.group('name')})
                else:
                    meta.update({config.group('name'): config.group('value')})
        # Materialize the rows while the file is still open: returning the
        # DictReader itself would hand back a reader bound to a closed file.
        data = list(csv.DictReader(fh, delimiter=','))
    return data, meta
This seems to work. I had to change for line in fh.readlines(): to for line in fh: the portion that reads the meta-data so line with data wouldn't be read, then create the DictReader and use it to get the data.
import csv
from pprint import pprint, pp
import re
def read(file_path: str):
    '''Read the data of the Digilent WaveForms Logic Analyzer Acquisition
    (model Discovery2).

    Parameter: File path.
    Returns: (data, meta) — the parsed CSV rows and the leading metadata.
    '''
    meta = {}
    RE_CONFIG = re.compile(r'^#(?P<name>[^:]+)(: *(?P<value>.+)\s*$)*')
    with open(file_path, 'r') as fh:
        # Consume the '#'-prefixed metadata block; stop at the blank line
        # so the file position is left at the CSV header.
        for raw in fh:
            stripped = raw.strip()
            if not stripped:
                break
            matched = RE_CONFIG.match(stripped)
            if not matched:
                continue
            key, value = matched.group('name'), matched.group('value')
            if value:
                meta[key] = value
            else:
                meta['Description'] = key
        # Parse the remaining lines as CSV while the handle is open.
        reader = csv.DictReader(fh, delimiter=',')
        data = list(reader)
    return data, meta
# Exercise the reader against a sample capture and dump the parsed result.
res = read('mixed.csv')
pprint(res)

How to read lines from arbitrary BZ2 streams for CSV?

The bz2 module provides a standard open() method from which one can call readline(). However, my situation is one where I have a stream (pointing to a large amount of data) that I want to decompress lines from on the fly. My current implementation is as follows but I know there must be a more succinct way to do this.
import bz2
import csv
# Module-level parser state, shared via `global` by the two functions below.
BZ2_BUFFER = ''          # decompressed text not yet handed out as lines
BZ2_DECOMPRESSOR = None  # bz2.BZ2Decompressor instance for the current file
BZ2_FILE = None          # the compressed source stream
BZ2_READ_SIZE = 100 * 1024  # compressed bytes pulled per read
def bz2_csv_rows(fp):
    """Yield CSV rows decompressed on the fly from the bz2 stream *fp*.

    Resets the module-level state, then drives csv.reader with a
    line-at-a-time callable (bz2_line_reader).

    NOTE(review): Python-2 era code — BZ2_BUFFER is a str while
    decompress() returns bytes on Python 3, and the b'' sentinel below
    never matches the str lines; confirm the target Python version.
    """
    global BZ2_BUFFER, BZ2_DECOMPRESSOR, BZ2_FILE, BZ2_READ_SIZE
    BZ2_BUFFER = ''
    BZ2_DECOMPRESSOR = bz2.BZ2Decompressor()
    BZ2_FILE = fp
    # iter(callable, sentinel) calls bz2_line_reader until it returns b''
    for row in csv.reader(iter(bz2_line_reader, b'')):
        yield row
def bz2_line_reader():
    """Return the next decompressed line, or None once the tail was consumed."""
    global BZ2_BUFFER, BZ2_DECOMPRESSOR, BZ2_FILE, BZ2_READ_SIZE
    if BZ2_BUFFER is None:
        # a previous call already handed out the final partial line
        return None
    # Keep decompressing blocks until the buffer holds at least one newline.
    while '\n' not in BZ2_BUFFER:
        bindata = BZ2_FILE.read(BZ2_READ_SIZE)
        try:
            data = BZ2_DECOMPRESSOR.decompress(bindata)
        except EOFError:
            break
        except IOError:
            # NOTE(review): on IOError `data` keeps its previous value (or is
            # unbound on the first iteration) before being appended — likely
            # unintended; verify the intended recovery behavior.
            pass
        BZ2_BUFFER += data
        # NOTE(review): compares *decompressed* length against the
        # *compressed* read size to detect EOF — TODO confirm this heuristic.
        if len(data) < BZ2_READ_SIZE:
            BZ2_FILE = None
            break
    i = BZ2_BUFFER.find('\n')
    # str.find() returns -1 (never None) when nothing is found
    if i is None or i < 0:
        # no newline left: hand back the tail and mark the buffer exhausted
        line = BZ2_BUFFER
        BZ2_BUFFER = None
        return line
    line = BZ2_BUFFER[:i]
    BZ2_BUFFER = BZ2_BUFFER[i + 1:]
    return line
Thoughts?
Here's something that's a little more succinct, and (in my opinion) it's more readable and gets rid of all those nasty global variables your code uses:
import bz2
import csv
from functools import partial
class BZ2_CSV_LineReader(object):
    """Stream CSV rows from a bz2 file, decompressing fixed-size blocks lazily."""

    def __init__(self, filename, buffer_size=4*1024):
        self.filename = filename
        self.buffer_size = buffer_size  # compressed bytes read per iteration

    def readlines(self):
        """Yield parsed CSV rows from the compressed file."""
        with open(self.filename, 'rb') as file:
            for row in csv.reader(self._line_reader(file)):
                yield row

    def _line_reader(self, file):
        """Yield decoded text lines, buffering partial lines between blocks."""
        buffer = ''
        decompressor = bz2.BZ2Decompressor()
        reader = partial(file.read, self.buffer_size)
        for bindata in iter(reader, b''):
            # NOTE(review): decoding each block independently can fail if a
            # multi-byte UTF-8 character straddles a block boundary — an
            # incremental decoder would be needed to cover that case.
            block = decompressor.decompress(bindata).decode('utf-8')
            buffer += block
            if '\n' in buffer:
                lines = buffer.splitlines(True)
                if lines:
                    # keep an unterminated trailing line for the next block
                    buffer = '' if lines[-1].endswith('\n') else lines.pop()
                    for line in lines:
                        yield line
        if buffer:
            # Fix: the original dropped a final line with no trailing newline.
            yield buffer
if __name__ == '__main__':
    # Demo: stream rows out of a sample compressed CSV.
    bz2_csv_filename = 'test_csv.bz2'
    for row in BZ2_CSV_LineReader(bz2_csv_filename).readlines():
        print(row)
Maybe it'll be useful: I use Python 3 and I have a large csv.bz2 file.
I handle it this way:
import bz2
import csv
def bz2_csv_rows(fp):
    """Yield CSV rows from the bz2-compressed file at *fp*, streamed in text mode.

    Opening with mode='rt' and newline='' lets bz2/io do the decompression
    and line handling, so the csv module sees an ordinary text stream.
    """
    with bz2.open(fp, mode='rt', newline='') as handle:
        yield from csv.reader(handle)
The key feature is to open the stream in text mode (mode='rt' in the bz2.open() call) instead of manually searching for "\n" in binary mode. But I'm not sure this will work for non-physical files (e.g. sockets or other streams).

Can you skip non utf-8 data in python csv?

I am dealing with a very large csv file in python where some lines are throwing an error "'utf-8' codec can't decode byte 0x9b in position 7657: invalid start byte". Is there a way to skip lines that aren't utf-8 without going by hand and deleting or fixing data?
for filename in filenames:
    f = open(filename, 'rt')  # NOTE(review): handle is never closed
    reader = csv.reader(f, delimiter = ',')
    # iterating raises UnicodeDecodeError as soon as a non-UTF-8 line is read
    for row in reader:
        #process data for future use
I can't use the non-utf8 data because of later processes that require utf-8 use
You could use a filter that reads a line as raw bytes, tries to convert it to unicode as UTF8 and then :
if successful, passes it down to the csv reader
if not, stores it for later analyzing
Assuming that you are using Python2, you could use something like :
class MyFilter:
    """Iterator wrapper that yields only lines decodable as UTF-8.

    Lines that fail to decode are written to *errstr* for later analysis.
    Written for Python 2 byte streams (decode() is called on raw lines);
    for Python 3, return t.strip() instead of line.strip() in __next__.
    """
    def __init__(self, instr, errstr):
        self.instr = instr    # input stream of raw lines
        self.errstr = errstr  # sink for undecodable lines
    def __enter__(self):
        print("ENTERING filter")
        return self
    def __exit__(self, a, b, c):
        print("EXITING filter")
        self.instr.close()
        self.errstr.close()
        return False  # do not suppress exceptions
    def __next__(self):
        line = next(self.instr)
        while True:
            try:
                # probe the line: decodable means it is clean UTF-8
                t = line.decode('utf8')
                return line.strip()
            except UnicodeDecodeError:
                # stash the bad line and move on to the next one
                self.errstr.write(line)
                line = next(self.instr)
        return line  # NOTE(review): unreachable — the loop only exits via return/StopIteration
    def __iter__(self):
        return self
    def next(self):
        # Python 2 iterator protocol delegates to __next__
        return self.__next__()
You could then use it that way (assuming Python 2.7), getting all offending lines in err.txt :
# Python 2.7 usage: offending lines accumulate in err.txt.
with open('file.csv') as istream, open("err.txt", 'w') as err, MyFilter(istream, err) as fd:
    c = csv.reader(fd)
    for i in c:
        # do your stuff, eg: print i
If you use Python 3, you can use almost same filter class, simply replacing line return line.strip() with return t.strip(), in order to return a string and not bytes.
Usage is again almost the same :
# Python 3 usage: open both streams in binary mode so MyFilter receives bytes.
with open('file.csv', 'rb') as istream, open("err.txt", 'wb') as err, MyFilter(istream, err) as fd:
    c = csv.reader(fd)
    for i in c:
        # do your stuff, eg: print (i)
Per your comment, you want to also filter lines containing null characters. This only needs a slight change in filter, the while block becoming (Python 3 version) :
# Replacement for the while block in __next__ (Python 3 version): lines
# containing NUL bytes are also routed to the error stream.
while True:
    if b'\x00' not in line:
        try:
            t = line.decode('utf8')
            return t.strip()
        except UnicodeDecodeError:
            pass
    # bad line (NUL byte or undecodable): log it and fetch the next one
    self.errstr.write(line)
    line = next(self.instr)

how to open a csv in universal new line mode through django upload?

I am trying to upload a csv file in a django form:
class CSVUploadForm(forms.Form):
    """Upload form that tries to parse the submitted CSV during validation."""
    csv_file = forms.FileField(label='Select a CSV file to import:',)
    def clean(self):
        file_csv = self.cleaned_data['csv_file']
        # NOTE(review): re-opens the file by name from a hard-coded path
        # instead of reading the uploaded file object; the handle is never
        # closed; and the 'U' open-mode flag was removed in Python 3.11.
        records = csv.reader(open('/mypath/'+file_csv.name, 'rU'), dialect=csv.excel_tab)
I need to open the file in universal new line mode. I can do that with "open" method above, but that will not work for this form because the file I am dealing with is an in memory uploaded version of the csv.
How do I pass the universal new line mode flag rU to something like this:
records = csv.reader(file_csv, dialect=csv.excel_tab)
?
You can use str.splitlines() -- which automatically splits on universale line-breaks -- in the following manner:
def clean(self):
    """Validate the upload by splitting it on universal newlines and parsing as tab-delimited CSV."""
    file_csv = self.cleaned_data['csv_file']
    # splitlines() breaks on \n, \r\n and \r alike, so no 'rU' mode is needed
    records = csv.reader(file_csv.read().splitlines(), dialect=csv.excel_tab)
If you are worried about the memory cost of creating the lines variable, you can force Django to save the file to a local file on disk changing the FILE_UPLOAD_MAX_MEMORY_SIZE variable in settings.py (more on this variable here):
# add to your settings.py
FILE_UPLOAD_MAX_MEMORY_SIZE = 0  # force every upload to be spooled to disk
FILE_UPLOAD_TEMP_DIR = '/tmp'    # where the temporary upload files land
Then to process the file from its tmp folder using universal newline mode:
def clean(self):
    """Parse the uploaded CSV from the temporary file Django spooled to disk."""
    # Fix: temporary_file_path is a *method* — the original passed the bound
    # method object itself to open(), which raises TypeError.
    path = self.cleaned_data['csv_file'].temporary_file_path()
    # Fix: 'rU' was removed in Python 3.11; newline='' is what the csv
    # module documents for correct universal-newline handling.
    # NOTE(review): the handle is left open, as in the original snippet.
    file_csv = open(path, 'r', newline='')
    records = csv.reader(file_csv, dialect=csv.excel_tab)
Problem with the solution above is that it reads the whole file all at once, make it not acceptable when processing large csv file. For small CSV files files will be saved to disk instead of being kept in memory which is also not so great.
I've created a class to handle new lines
class FileWithUniversalNewLine(object):
    """Wrap a file object so iteration yields lines split on any newline
    convention (\\r\\n, \\r or \\n), reading in fixed-size chunks."""

    def __init__(self, file_obj):
        self.file = file_obj

    def lines(self):
        """Generator over complete lines; buffers a trailing partial line."""
        buff = ""  # holds an incomplete line between reads
        while True:
            chunk = self.file.read(2048)
            if not chunk:
                if buff:
                    yield buff
                # Fix: the original did `raise StopIteration`, which PEP 479
                # turns into RuntimeError inside a generator on Python 3.7+.
                return
            # Normalize all newline conventions to '\n'.
            # NOTE(review): a '\r\n' pair split across two reads yields a
            # spurious empty line — a limitation shared with the original.
            chunk = buff + chunk.replace("\r\n", "\n").replace("\r", "\n")
            pieces = chunk.split("\n")
            buff = pieces.pop()  # last piece may be an incomplete line
            for sline in pieces:
                yield sline

    def close(self):
        self.file.close()

    def __exit__(self, *args, **kwargs):
        return self.file.__exit__(*args, **kwargs)

    def __enter__(self, *args, **kwargs):
        return self

    def __iter__(self):
        return self.lines()
Usage:
# Wrap the in-memory upload and parse it tab-delimited.
csvfile = FileWithUniversalNewLine(file_csv)
records = csv.reader(csvfile, dialect=csv.excel_tab)

Read and Write CSV files including unicode with Python 2.7

I am new to Python, and I have a question about how to use Python to read and write CSV files. My file contains like Germany, French, etc. According to my code, the files can be read correctly in Python, but when I write it into a new CSV file, the unicode becomes some strange characters.
The data is like:
And my code is:
import csv
f=open('xxx.csv','rb')
reader=csv.reader(f)
wt=open('lll.csv','wb')
writer=csv.writer(wt,quoting=csv.QUOTE_ALL)
# NOTE(review): nothing is ever read from `reader` or written via `writer`
# between these lines — the actual copy loop is missing from the snippet.
wt.close()
f.close()
And the result is like:
What should I do to solve the problem?
Another alternative:
Use the code from the unicodecsv package ...
https://pypi.python.org/pypi/unicodecsv/
>>> import unicodecsv as csv
>>> from io import BytesIO
>>> f = BytesIO()
>>> w = csv.writer(f, encoding='utf-8')
>>> _ = w.writerow((u'é', u'ñ'))
>>> _ = f.seek(0)
>>> r = csv.reader(f, encoding='utf-8')
>>> next(r) == [u'é', u'ñ']
True
This module is API compatible with the STDLIB csv module.
Make sure you encode and decode as appropriate.
This example will roundtrip some example text in utf-8 to a csv file and back out to demonstrate:
# -*- coding: utf-8 -*-
import csv
# Sample unicode words keyed by language (Python 2: u'' literals).
tests={'German': [u'Straße',u'auslösen',u'zerstören'],
'French': [u'français',u'américaine',u'épais'],
'Chinese': [u'中國的',u'英語',u'美國人']}
with open('/tmp/utf.csv','w') as fout:
    writer=csv.writer(fout)
    # header row: the language names
    writer.writerows([tests.keys()])
    # one row per word triple, UTF-8-encoded before writing (Python 2 csv
    # handles bytes only)
    for row in zip(*tests.values()):
        row=[s.encode('utf-8') for s in row]
        writer.writerows([row])
with open('/tmp/utf.csv','r') as fin:
    reader=csv.reader(fin)
    for row in reader:
        temp=list(row)
        # left-align each cell in a 15-character column
        fmt=u'{:<15}'*len(temp)
        # Python 2 print statement; decode bytes back to unicode for display
        print fmt.format(*[s.decode('utf-8') for s in temp])
Prints:
German Chinese French
Straße 中國的 français
auslösen 英語 américaine
zerstören 美國人 épais
There is an example at the end of the csv module documentation that demonstrates how to deal with Unicode. Below is copied directly from that example. Note that the strings read or written will be Unicode strings. Don't pass a byte string to UnicodeWriter.writerows, for example.
import csv,codecs,cStringIO
class UTF8Recoder:
    """Iterator that re-encodes lines from *encoding* to UTF-8 (Python 2)."""
    def __init__(self, f, encoding):
        self.reader = codecs.getreader(encoding)(f)
    def __iter__(self):
        return self
    def next(self):
        # Python 2 iterator protocol: decode one line, hand back UTF-8 bytes
        return self.reader.next().encode("utf-8")
class UnicodeReader:
    """CSV reader for Python 2 that yields rows of unicode strings.

    Wraps the byte stream *f* in UTF8Recoder, then decodes each parsed
    cell from UTF-8.
    """
    def __init__(self, f, dialect=csv.excel, encoding="utf-8-sig", **kwds):
        f = UTF8Recoder(f, encoding)
        self.reader = csv.reader(f, dialect=dialect, **kwds)
    def next(self):
        '''next() -> unicode
        This function reads and returns the next line as a Unicode string.
        '''
        row = self.reader.next()
        return [unicode(s, "utf-8") for s in row]
    def __iter__(self):
        return self
class UnicodeWriter:
    """CSV writer for Python 2 that accepts rows of unicode strings.

    Rows are first written UTF-8-encoded into an in-memory queue, then
    re-encoded to the target *encoding* and flushed to the stream *f*.
    """
    def __init__(self, f, dialect=csv.excel, encoding="utf-8-sig", **kwds):
        self.queue = cStringIO.StringIO()  # staging buffer for one row
        self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
        self.stream = f
        self.encoder = codecs.getincrementalencoder(encoding)()
    def writerow(self, row):
        '''writerow(unicode) -> None
        This function takes a Unicode string and encodes it to the output.
        '''
        self.writer.writerow([s.encode("utf-8") for s in row])
        data = self.queue.getvalue()
        data = data.decode("utf-8")
        # re-encode from UTF-8 to the requested target encoding
        data = self.encoder.encode(data)
        self.stream.write(data)
        # empty the queue for the next row
        self.queue.truncate(0)
    def writerows(self, rows):
        for row in rows:
            self.writerow(row)
# Round-trip: decode xxx.csv, write it fully quoted to lll.csv.
with open('xxx.csv','rb') as fin, open('lll.csv','wb') as fout:
    reader = UnicodeReader(fin)
    writer = UnicodeWriter(fout,quoting=csv.QUOTE_ALL)
    for line in reader:
        writer.writerow(line)
Input (UTF-8 encoded):
American,美国人
French,法国人
German,德国人
Output:
"American","美国人"
"French","法国人"
"German","德国人"
Because str in python2 is bytes actually. So if want to write unicode to csv, you must encode unicode to str using utf-8 encoding.
def py2_unicode_to_str(u):
    """Encode a Python 2 `unicode` object to a UTF-8 `str` (byte string)."""
    # the unicode type only exists in python2
    assert isinstance(u, unicode)
    return u.encode('utf-8')
Use class csv.DictWriter(csvfile, fieldnames, restval='', extrasaction='raise', dialect='excel', *args, **kwds):
py2
The csvfile: open(fp, 'w')
pass key and value in bytes which are encoded with utf-8
writer.writerow({py2_unicode_to_str(k): py2_unicode_to_str(v) for k,v in row.items()})
py3
The csvfile: open(fp, 'w')
pass normal dict contains str as row to writer.writerow(row)
Finally code
import sys

# True when running under the Python 2 interpreter.
is_py2 = sys.version_info[0] == 2

def py2_unicode_to_str(u):
    """Encode a Python 2 unicode object to a UTF-8 byte string."""
    # the unicode type only exists in python2
    assert isinstance(u, unicode)
    return u.encode('utf-8')

with open('file.csv', 'w') as f:
    # `data` is one CSV row expressed as a mapping of field name -> value.
    if is_py2:
        data = {u'Python中国': u'Python中国', u'Python中国2': u'Python中国2'}
        # just one more line to handle this: DictWriter needs bytes on py2
        data = {py2_unicode_to_str(k): py2_unicode_to_str(v) for k, v in data.items()}
    else:
        data = {'Python中国': 'Python中国', 'Python中国2': 'Python中国2'}
    # Fix: the original called list(data[0]) — a KeyError on a dict — and
    # then iterated the dict (yielding bare keys) as if it were a list of
    # rows. The dict's keys are the field names; the dict itself is the
    # single row to write.
    fields = list(data)
    writer = csv.DictWriter(f, fieldnames=fields)
    writer.writerow(data)
Conclusion
In python3, just use the unicode str.
In python2, use unicode handle text, use str when I/O occurs.
I had the very same issue. The answer is that you are doing it right already. It is the problem of MS Excel. Try opening the file with another editor and you will notice that your encoding was successful already. To make MS Excel happy, move from UTF-8 to UTF-16. This should work:
class UnicodeWriter:
    """Python 2 CSV writer that emits UTF-16 (with a BOM) so MS Excel opens it correctly."""
    def __init__(self, f, dialect=csv.excel_tab, encoding="utf-16", **kwds):
        # Redirect output to a queue
        self.queue = StringIO.StringIO()
        self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
        self.stream = f
        # Force BOM
        if encoding=="utf-16":
            import codecs
            f.write(codecs.BOM_UTF16)
        self.encoding = encoding
    def writerow(self, row):
        # Modified from original: now using unicode(s) to deal with e.g. ints
        self.writer.writerow([unicode(s).encode("utf-8") for s in row])
        # Fetch UTF-8 output from the queue ...
        data = self.queue.getvalue()
        data = data.decode("utf-8")
        # ... and reencode it into the target encoding
        data = data.encode(self.encoding)
        # strip BOM (already written once in __init__; encode() re-adds it per row)
        if self.encoding == "utf-16":
            data = data[2:]
        # write to the target stream
        self.stream.write(data)
        # empty queue
        self.queue.truncate(0)
    def writerows(self, rows):
        for row in rows:
            self.writerow(row)
I couldn't respond to Mark above, but I just made one modification which fixed the error that was caused if data in the cells was not unicode, e.g. float or int data. I replaced this line into the UnicodeWriter function: "self.writer.writerow([s.encode("utf-8") if type(s)==types.UnicodeType else s for s in row])" so that it became:
class UnicodeWriter:
    """Variant of the docs' UnicodeWriter that tolerates non-unicode cells.

    Cells that are not unicode (e.g. ints, floats) are passed through for
    the csv module to stringify. Python 2 only; requires `import types`
    for types.UnicodeType.
    """
    def __init__(self, f, dialect=csv.excel, encoding="utf-8-sig", **kwds):
        self.queue = cStringIO.StringIO()  # staging buffer for one row
        self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
        self.stream = f
        self.encoder = codecs.getincrementalencoder(encoding)()
    def writerow(self, row):
        '''writerow(unicode) -> None
        This function takes a Unicode string and encodes it to the output.
        '''
        # encode only unicode cells; leave other types for csv to stringify
        self.writer.writerow([s.encode("utf-8") if type(s)==types.UnicodeType else s for s in row])
        data = self.queue.getvalue()
        data = data.decode("utf-8")
        data = self.encoder.encode(data)
        self.stream.write(data)
        self.queue.truncate(0)
    def writerows(self, rows):
        for row in rows:
            self.writerow(row)
You will also need to "import types".
I don't think this is the best answer, but it's probably the most self-contained answer and also the funniest.
UTF7 is a 7-bit ASCII encoding of unicode. It just so happens that UTF7 makes no special use of commas, quotes, or whitespace. It just passes them through from input to output. So really it makes no difference if you UTF7-encode first and then parse as CSV, or if you parse as CSV first and then UTF7-encode. Python 2's CSV parser can't handle unicode, but python 2 does have a UTF-7 encoder. So you can encode, parse, and then decode, and it's as if you had a unicode-capable parser.
import csv
import io
def read_csv(path):
    """Yield unicode CSV rows by round-tripping each line through UTF-7.

    UTF-7 output is pure ASCII and passes commas, quotes and whitespace
    through unchanged, so a byte-only CSV parser can handle it; cells are
    decoded back to unicode afterwards. (Quoted fields containing literal
    newlines are still unsupported, since lines are split before parsing.)
    """
    with io.open(path, 'rt', encoding='utf8') as f:
        # Fix: universal-newline mode already translates '\r\n' to '\n' on
        # read, so the original split("\r\n") never split anything and
        # multi-line files failed to parse. splitlines() handles all styles.
        lines = f.read().splitlines()
    lines = [l.encode('utf7').decode('ascii') for l in lines]
    reader = csv.reader(lines, dialect=csv.excel)
    for row in reader:
        yield [x.encode('ascii').decode('utf7') for x in row]
# Demo: parse the sample file and show each decoded row.
for row in read_csv("lol.csv"):
    print(repr(row))
lol.csv
foo,bar,foo∆bar,"foo,bar"
output:
[u'foo', u'bar', u'foo\u2206bar', u'foo,bar']

Categories