How to read lines from arbitrary BZ2 streams for CSV? - python

The bz2 module provides a standard open() function, which returns a file object you can call readline() on. However, my situation is one where I have a stream (pointing to a large amount of data) that I want to decompress line by line, on the fly. My current implementation is below, but I know there must be a more succinct way to do this.
import bz2
import csv
BZ2_BUFFER = ''
BZ2_DECOMPRESSOR = None
BZ2_FILE = None
BZ2_READ_SIZE = 100 * 1024
def bz2_csv_rows(fp):
    global BZ2_BUFFER, BZ2_DECOMPRESSOR, BZ2_FILE, BZ2_READ_SIZE
    BZ2_BUFFER = ''
    BZ2_DECOMPRESSOR = bz2.BZ2Decompressor()
    BZ2_FILE = fp
    for row in csv.reader(iter(bz2_line_reader, b'')):
        yield row

def bz2_line_reader():
    global BZ2_BUFFER, BZ2_DECOMPRESSOR, BZ2_FILE, BZ2_READ_SIZE
    if BZ2_BUFFER is None:
        return None
    while '\n' not in BZ2_BUFFER:
        bindata = BZ2_FILE.read(BZ2_READ_SIZE)
        try:
            data = BZ2_DECOMPRESSOR.decompress(bindata)
        except EOFError:
            break
        except IOError:
            pass
        BZ2_BUFFER += data
        if len(data) < BZ2_READ_SIZE:
            BZ2_FILE = None
            break
    i = BZ2_BUFFER.find('\n')
    if i is None or i < 0:
        line = BZ2_BUFFER
        BZ2_BUFFER = None
        return line
    line = BZ2_BUFFER[:i]
    BZ2_BUFFER = BZ2_BUFFER[i + 1:]
    return line
Thoughts?

Here's something that's a little more succinct, and (in my opinion) it's more readable and gets rid of all those nasty global variables your code uses:
import bz2
import csv
from functools import partial
class BZ2_CSV_LineReader(object):
    def __init__(self, filename, buffer_size=4*1024):
        self.filename = filename
        self.buffer_size = buffer_size

    def readlines(self):
        with open(self.filename, 'rb') as file:
            for row in csv.reader(self._line_reader(file)):
                yield row

    def _line_reader(self, file):
        buffer = ''
        decompressor = bz2.BZ2Decompressor()
        reader = partial(file.read, self.buffer_size)
        for bindata in iter(reader, b''):
            block = decompressor.decompress(bindata).decode('utf-8')
            buffer += block
            if '\n' in buffer:
                lines = buffer.splitlines(True)
                if lines:
                    buffer = '' if lines[-1].endswith('\n') else lines.pop()
                    for line in lines:
                        yield line

if __name__ == '__main__':
    bz2_csv_filename = 'test_csv.bz2'
    for row in BZ2_CSV_LineReader(bz2_csv_filename).readlines():
        print(row)

Maybe it'll be useful: I use Python 3 and I have a large csv.bz2 file.
I handle it this way:
import bz2
import csv
def bz2_csv_rows(fp):
    with bz2.open(fp, mode='rt', newline='') as bzfp:
        for row in csv.reader(bzfp):
            yield row
The key is to open the stream in text mode (mode='rt' in the bz2.open() call) instead of manually searching for "\n" in binary mode. I'm not sure this will work for anything other than physical files, though.
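For what it's worth, bz2.open() accepts an existing binary file object as well as a filename, so the same approach should work for a non-seekable stream too, since only sequential read() calls are needed. A minimal sketch, where stream stands in for any binary file-like object:
import bz2
import csv

def bz2_csv_rows_from_stream(stream):
    # stream is assumed to be any binary file-like object exposing read();
    # bz2.open() wraps it and decodes to text on the fly
    with bz2.open(stream, mode='rt', newline='') as bzfp:
        for row in csv.reader(bzfp):
            yield row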

Related

So I made a file editing program in python... and one of the functions isn't working right

As the title says, I made a file editing program with Python.
Here is the code that I'm having a problem with:
#fileEditing.py
def fileError(file):
    raise OSError("file {} does not exist".format(file))

class AccessFile():
    def fileExists(self, file):
        import os
        return bool(os.path.exists(file))

    def filecreate(self, file):
        if not self.fileExists(file):
            with open(file, "w") as f:
                f.close()
        else: raise OSError("file {} already exists".format(file))

    def filedelete(self, file):
        import os
        if self.fileExists(file):
            os.remove(file)
        else: fileError(file)

    def fileread(self, file):
        #check if file exists
        if self.fileExists(file):
            #detect length of file
            with open(file, "r") as f:
                line = " "
                x = 0
                while line != "":
                    line = f.readline()
                    x += 1
            #piece lines together in a list
            filelines = []
            with open(file, "r") as f:
                for i in range(x - 1):
                    filelines.append(str(f.readline()))
            #return a tuple
            return tuple(filelines)
        else: fileError(file)

    def filewrite(self, file, line, text):
        ''' BUG: apparently this either overwrites the line its writing or appends
        to the line its writing... make up your mind!'''
        if self.fileExists(file):
            #get file contents
            filelines = list(self.fileread(file))
            #see if line parameter is out of range or not
            try:
                filelines[line] = text
            except IndexError:
                for i in range(line - len(filelines)):
                    filelines.append("")
                filelines.append(str(text) + "\n")
            #apply changes
            with open(file, "w") as f:
                f.write("") #delete contents
            with open(file, "w") as f:
                for l in filelines:
                    f.write(l)
        else: fileError(file)

    def fileoverwrite(self, file, data):
        #if there is no file to delete, it will make a new one
        try:
            self.filedelete(file)
        except:
            pass
        self.filecreate(file)
        x = 0
        for line in data:
            print(line)
            self.filewrite(file, x, line)
            x += 1

accessfile = AccessFile()
The bug is in the filewrite(self, file, line, text) function. When called, it either writes a new line (which is what I want it to do), appends to the line it's supposed to replace, or just doesn't write any lines at all.
Say I want to write a Python file with this program:
#pytesting.py
from fileEditing import *
file = "/Users/ashton/Desktop/Atom/Python/FileEditing/FileManager.py"
data = [
    "from fileEditing import *",
    "",
    "class FileEditing():",
    " def __init__(options, immutable_files):",
    " self.options, self.immutable_files = options, immutable_files",
    " ",
    " def prompt():",
    " ",
    "",
    "while True:",
    " pass"
]
accessfile.fileoverwrite(file, data)
When I run it, it makes a file with accessfile.fileoverwrite(file, data), like it's supposed to.
But that's where things get wacky.
(FileManager.py below)
from fileEditing import *
class FileEditing():
def __init__(options, immutable_files): self.options, self.immutable_files = options, immutable_files
def prompt():
while True:
If you know how to fix the filewrite(self, file, line, text), please let me know.
(I use python 2.7 but python 3 is fine)
This is a Python 3.x solution, but you said that's fine. I don't know whether it will work in Python 2.x, but it is so simple that it should:
def file_overwrite(self, file, data):
    with open(file, 'w') as file:
        file.write('\n'.join(data))
You also seemingly need to fix that data list, because it is missing a few commas. The fact that this is all in a class is also a bit odd: you do nothing with the instance, so these might as well be separate functions, or @classmethods or @staticmethods (see the small sketch after the next snippet). Several other things could be improved as well. For example, you shouldn't open the file twice and count its lines just to read it. Just call file.readlines() and it will return a list of all lines:
def fileread(self, file):
    if self.fileExists(file):
        with open(file) as file:
            return file.readlines()
    else:
        fileError(file)
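On that class-design point, here is a small sketch of what fileExists could look like as a @staticmethod, since it doesn't use any instance state:
import os

class AccessFile():
    @staticmethod
    def fileExists(file):
        # no self needed: the check only depends on the path
        return os.path.exists(file)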
Also, import os once at the start of the file; you don't need to import it in every function where you use os. Finally, regarding:
with open(file, "w") as f:
    f.close()
the f.close() is completely pointless because the context manager closes the file anyway. There is also mode "x", which is made specifically for file creation and will raise an error if the file already exists: https://www.w3schools.com/python/python_file_handling.asp
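For example, filecreate could be reduced to something like this (a sketch; FileExistsError is a subclass of OSError, so it matches the error you currently raise by hand):
def filecreate(self, file):
    # "x" creates the file and raises FileExistsError if it already exists
    with open(file, "x"):
        pass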

Stream Bytes chunks to csv rows in python

I need to process a large remote CSV line by line without downloading it entirely.
Below is the closest I got.
I iterate over byte chunks from Azure, and have some code to handle truncated lines.
But this cannot work if CSV values contain a newline, as I am not able to distinguish between newlines inside values and newlines that end a record.
# this does not work
def azure_iter_lines(logger_scope, client, file_path):
    # get a StorageStreamDownloader
    # https://learn.microsoft.com/en-us/python/api/azure-storage-file-datalake/azure.storage.filedatalake.storagestreamdownloader?view=azure-python
    file_client = client.get_file_client(file_path)
    file_handle = file_client.download_file()
    truncated_line = ''
    for chunk in file_handle.chunks():
        # have the previous truncated line appended to the next block
        chunk_txt = truncated_line + chunk.decode("utf-8")
        lines = chunk_txt.split('\n') # THIS CANNOT WORK AS VALUES CONTAIN NEWLINES
        for line in lines[0:len(lines)-2]:
            yield line
        truncated_line = lines[len(lines)-1]
    # process the last chunk (same code)
    chunk_txt = truncated_line
    lines = chunk_txt.split('\n') # THIS CANNOT WORK AS VALUES CONTAIN NEWLINES
    for line in lines[0:len(lines)-2]:
        yield line
    truncated_line = lines[len(lines)-1]
Ideally I would use csv.DictReader(), but I was not able to do so as it downloads the file entirely.
# this does not work
def azure_iter_lines(logger_scope, client, file_path):
    file_client = client.get_file_client(file_path)
    file_handle = file_client.download_file()
    buffer = io.BytesIO()
    file_handle.readinto(buffer) # THIS DOWNLOADS THE FILE ENTIRELY
    csvreader = csv.DictReader(buffer, delimiter=";")
    return csvreader
Here is an update using some hints by @H.Leger.
Please note that this still does not work:
file_client = client.get_file_client(file_path)
file_handle = file_client.download_file()
stream = codecs.iterdecode(file_handle.chunks(), 'utf-8')
csvreader = csv.DictReader(stream, delimiter=";")
for row in csvreader:
    print(row)
# => _csv.Error: new-line character seen in unquoted field - do you need to open the file in universal-newline mode?
EDIT: Final solution based on @paiv's answer
EDIT: Updated solution to use io instead of codecs for faster parsing
import io
import csv
import ctypes as ct
# bytes chunk iterator to python stream adapter
# https://stackoverflow.com/a/67547597/2523414
class ChunksAdapter:
    def __init__(self, chunks):
        self.chunks = chunks
        self.buf = b''
        self.closed = False

    def readable(self):
        return True

    def writable(self):
        return False

    def seekable(self):
        return False

    def close(self):
        self.closed = True

    def read(self, size):
        if not self.buf:
            self.buf = next(self.chunks, b'')
        res, self.buf = self.buf[:size], self.buf[size:]
        return res

# get the downloader object
file_client = client.get_file_client(file_path)
downloader = file_client.download_file()

# adapt the downloader iterator to a byte stream
file_object = ChunksAdapter(downloader.chunks())

# decode bytes stream to utf-8
text_stream = io.TextIOWrapper(file_object, encoding='utf-8', newline='')

# update csv field limit to handle large fields
# https://stackoverflow.com/a/54517228/2523414
csv.field_size_limit(int(ct.c_ulong(-1).value // 2))

csvreader = csv.DictReader(text_stream, delimiter=";", quotechar='"', quoting=csv.QUOTE_MINIMAL)
for row in csvreader:
    print(row)
Disclaimer: I know little about the Azure specifics. Ultimately, you would want to stream separate chunks too.
In Python, given a file object, you can set up CSV streaming this way:
import codecs
import csv
codec = codecs.getreader('utf-8')
text_stream = codec(file_object)
csvreader = csv.DictReader(text_stream)
Now you can iterate over csvreader, and it will read from file_object in a streaming fashion.
Edit: as @Martijn Pieters suggested, we can gain performance with TextIOWrapper instead of codecs:
text_stream = io.TextIOWrapper(file_object, encoding='utf-8', newline='')
Check the note in the csv module documentation on the newline parameter.
But Azure's StorageStreamDownloader does not provide Python's file object interface. It has a .chunks() generator (which, I assume, will issue a separate HTTP request to retrieve each chunk).
You can adapt .chunks() into a file object with a simple adapter:
class ChunksAdapter:
    def __init__(self, chunks):
        self.chunks = chunks
        self.buf = b''

    def read(self, size):
        if not self.buf:
            self.buf = next(self.chunks, b'')
        res, self.buf = self.buf[:size], self.buf[size:]
        return res
And use it like:
downloader = file_client.download_file()
file_object = ChunksAdapter(downloader.chunks())
Be sure to configure DictReader for the appropriate CSV dialect.
And set appropriate values for max_single_get_size, max_chunk_get_size on the blob client.
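For reference, a rough sketch of where those settings go. The keyword names below are the ones documented for the azure-storage-blob clients; I'm assuming the Data Lake clients accept and forward them the same way:
from azure.storage.blob import BlobServiceClient

# larger values mean fewer round trips per .chunks() iteration
# (keyword names assumed from the azure-storage-blob documentation)
service = BlobServiceClient(
    account_url="https://<account>.blob.core.windows.net",
    credential="<sas-token-or-key>",        # placeholder credential
    max_single_get_size=32 * 1024 * 1024,   # size of the initial download request
    max_chunk_get_size=4 * 1024 * 1024,     # size of each subsequent chunk request
)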
I believe the requests package can be useful for you. Using the stream option while getting your file and the Response.iter_lines() function should do what you need:
import codecs
import csv
import requests
url = "https://navitia.opendatasoft.com//explore/dataset/all-datasets/download?format=csv"
r = requests.get(url, stream=True) # using the stream option to avoid loading everything
try:
    buffer = r.iter_lines() # iter_lines() will feed you the distant file line by line
    reader = csv.DictReader(codecs.iterdecode(buffer, 'utf-8'), delimiter=';')
    for row in reader:
        print(row) # Do stuff here
finally:
    r.close()

How to read a complete file using pickle.load()?

Suppose I want to read the complete file using pickle.load(), not just a single object at a time. I know I can use try/except, but is there any other method to read it?
I am using this:
import pickle
d = {}
for i in range(2):
    roll_no = int(input("Enter roll no: "))
    name = input("Enter name: ")
    d[roll_no] = name

f = open("test.dat", "ab")
pickle.dump(d, f)
f.close()

f = open("test.dat", "rb")
while True:
    try:
        print(pickle.load(f))
    except EOFError:
        break
The official Python library does not support this within a single instruction. You can define your own helper function though:
import io
import pickle
from typing import List
def unpickle(file: io.IOBase) -> List[object]:
    result = []
    while True:
        try:
            result.append(pickle.load(file))
        except EOFError:
            break
    return result
You can then call it like this:
with open('data.pickle', 'rb') as f:
    objects = unpickle(f)
objects will contain all the objects that have been serialized in data.pickle here.
You can use file.tell() to see whether you are at EOF:
f = open("test.dat", "rb")
# go to end of file and get position
size = f.seek(0, 2)
# now return to the front and pull pickle records
f.seek(0)
while f.tell() < size:
    print(pickle.load(f))

File processing using multiprocessing - python

I am a beginner with Python and am trying to add a few lines of code to convert JSON to CSV and back to JSON. I have thousands of files (around 300 MB each) to be converted and processed. With the current program (using 1 CPU), I am not able to use the server's 16 CPUs, and I need suggestions on how to fine-tune the program for multiprocessing. Below is my code, using Python 3.7.
import json
import csv
import os
os.chdir('/stagingData/Scripts/test')
for JsonFile in os.listdir(os.getcwd()):
    PartialFileName = JsonFile.split('.')[0]

    j = 1
    with open(PartialFileName + ".csv", 'w', newline='') as Output_File:
        with open(JsonFile) as fileHandle:
            i = 1
            for Line in fileHandle:
                try:
                    data = json.loads(Line, parse_float=str)
                except:
                    print("Can't load line {}".format(i))
                if i == 1:
                    header = data.keys()
                    output = csv.writer(Output_File)
                    output.writerow(header) #Writes header row
                    i += 1
                output.writerow(data.values()) #writes values row
    j += 1
I would appreciate suggestions on the multiprocessing logic.
If you have a single big file that you want to process more effectively, I suggest the following:
Split file into chunks
Create a process to process each chunk
(if necessary) merge the processed chunks back into a single file
Something like this:
import csv
import json
from pathlib import Path
from concurrent.futures import ProcessPoolExecutor

source_big_file = Path('/path/to/file')

def chunk_file_by_line(source_filepath: Path, chunk_size: int = 10_000):
    chunk_line_size = chunk_size
    intermediate_file_handlers = {}
    last_chunk_filepath = None
    with source_filepath.open('r', encoding='utf8') as big:
        for line_number, line in enumerate(big):
            group = line_number - (line_number % chunk_line_size)
            chunk_filename = f'{source_filepath.stem}.g{group}{source_filepath.suffix}'
            chunk_filepath = source_filepath.parent / chunk_filename
            if chunk_filepath not in intermediate_file_handlers:
                file_handler = chunk_filepath.open('w', encoding='utf8')
                intermediate_file_handlers[chunk_filepath] = file_handler
                if last_chunk_filepath:
                    last_file_handler = intermediate_file_handlers[last_chunk_filepath]
                    last_file_handler.close()
                    yield last_chunk_filepath
            else:
                file_handler = intermediate_file_handlers[chunk_filepath]
            file_handler.write(line)
            last_chunk_filepath = chunk_filepath
    # output last one
    if last_chunk_filepath:
        intermediate_file_handlers[last_chunk_filepath].close()
        yield last_chunk_filepath

def json_to_csv(json_filepath: Path) -> Path:
    csv_filename = f'{json_filepath.stem}.csv'
    csv_filepath = json_filepath.parent / csv_filename
    with csv_filepath.open('w', encoding='utf8') as csv_out, json_filepath.open('r', encoding='utf8') as json_in:
        dwriter = None
        headers_written = False
        for json_line in json_in:
            data = json.loads(json_line)
            if not headers_written:
                # build the writer from the first record's keys and write the header row
                dwriter = csv.DictWriter(csv_out, fieldnames=list(data.keys()))
                dwriter.writeheader()
                headers_written = True
            dwriter.writerow(data)
    return csv_filepath

with ProcessPoolExecutor() as pool:
    futures = []
    for chunk_filepath in chunk_file_by_line(source_big_file):
        future = pool.submit(json_to_csv, chunk_filepath)
        futures.append(future)

    # wait for all to finish
    for future in futures:
        csv_filepath = future.result(timeout=None) # waits until complete
        print(f'conversion complete> csv filepath: {csv_filepath}')
Since you have many files, the simplest multiprocessing example from the documentation should work for you. https://docs.python.org/3.4/library/multiprocessing.html?highlight=process
from multiprocessing import Pool

def f(JsonFile):
    ... # open input, output files and convert

with Pool(16) as p:
    p.map(f, os.listdir(os.getcwd()))
You could also try replacing listdir with os.scandir(), which doesn't have to return all directory entries before starting.
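A quick sketch of that variant, assuming the same directory as in the question and a hypothetical convert() function holding the per-file conversion logic:
import os
from multiprocessing import Pool

def convert(json_path):
    ... # open input/output files and convert, as in the original loop

if __name__ == '__main__':
    with os.scandir('/stagingData/Scripts/test') as entries:
        json_files = [e.path for e in entries if e.is_file() and e.name.endswith('.json')]
    with Pool(16) as p:
        p.map(convert, json_files)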

Can you skip non-UTF-8 data in Python csv?

I am dealing with a very large CSV file in Python where some lines throw the error "'utf-8' codec can't decode byte 0x9b in position 7657: invalid start byte". Is there a way to skip lines that aren't UTF-8 without going through by hand and deleting or fixing data?
for filename in filenames:
    f = open(filename, 'rt')
    reader = csv.reader(f, delimiter = ',')
    for row in reader:
        #process data for future use
I can't use the non-UTF-8 data because later processes require UTF-8.
You could use a filter that reads a line as raw bytes, tries to decode it as UTF-8, and then:
if successful, passes it down to the csv reader
if not, stores it for later analysis
Assuming that you are using Python 2, you could use something like:
class MyFilter:
    def __init__(self, instr, errstr):
        self.instr = instr
        self.errstr = errstr

    def __enter__(self):
        print("ENTERING filter")
        return self

    def __exit__(self, a, b, c):
        print("EXITING filter")
        self.instr.close()
        self.errstr.close()
        return False

    def __next__(self):
        line = next(self.instr)
        while True:
            try:
                t = line.decode('utf8')
                return line.strip()
            except UnicodeDecodeError:
                self.errstr.write(line)
                line = next(self.instr)
        return line

    def __iter__(self):
        return self

    def next(self):
        return self.__next__()
You could then use it this way (assuming Python 2.7), getting all offending lines in err.txt:
with open('file.csv') as istream, open("err.txt", 'w') as err, MyFilter(istream, err) as fd:
    c = csv.reader(fd)
    for i in c:
        # do your stuff, e.g.: print i
If you use Python 3, you can use almost the same filter class, simply replacing the line return line.strip() with return t.strip(), in order to return a string and not bytes.
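For reference, here is __next__ with that one change applied:
def __next__(self):
    line = next(self.instr)
    while True:
        try:
            t = line.decode('utf8')
            return t.strip()  # return decoded text, not bytes
        except UnicodeDecodeError:
            self.errstr.write(line)
            line = next(self.instr)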
Usage is again almost the same:
with open('file.csv', 'rb') as istream, open("err.txt", 'wb') as err, MyFilter(istream, err) as fd:
    c = csv.reader(fd)
    for i in c:
        # do your stuff, e.g.: print(i)
Per your comment, you also want to filter out lines containing null characters. This only needs a slight change in the filter, the while block becoming (Python 3 version):
while True:
    if b'\x00' not in line:
        try:
            t = line.decode('utf8')
            return t.strip()
        except UnicodeDecodeError:
            pass
    self.errstr.write(line)
    line = next(self.instr)
