Python: Why is DictWriter writing 'NULL' bytes?

import csv

class WhatsGoingOn:
    def __init__(self, filename, fieldNames, maxLines):
        self.file_to_write = filename
        self.fieldNames = fieldNames
        self.maxLines = maxLines
        # Open the file for reading and writing. Create it if it doesn't exist,
        # and truncate it if it does.
        self.file = open(self.file_to_write, 'w+b')
        self.csvReader = csv.DictReader(self.file, fieldnames=self.fieldNames)
        self.csvWriter = csv.DictWriter(self.file, fieldnames=self.fieldNames, extrasaction='ignore')

    def looper(self):
        # Infinitely (don't worry about that - this is a daemon),
        # write to the file. When a certain number of lines have been written,
        # read the file and then truncate it.
        try:
            numRowsWritten = 0
            while True:
                # Nevermind what's being written
                self.csvWriter.writerow({'name_of_field_0': str(numRowsWritten), 'name_of_field_1': str(numRowsWritten)})
                numRowsWritten += 1
                if numRowsWritten >= self.maxLines:
                    # Iterate through each row of the file
                    self.file.seek(0)
                    # This only works the first time...
                    for row in self.csvReader:
                        print row['name_of_field_0']
                    # Truncate the file, and restart the count
                    self.file.truncate(0)
                    numRowsWritten = 0
        except Exception as e:
            print 'Exception!: {0}'.format(e)
            self.file.close()
Output: Exception!: line contains NULL byte
The second time the for row in self.csvReader: line gets hit, the exception gets thrown, and when I look at the file, the file does indeed have a whole bunch of NULL bytes at the start. Apparently, after the file got truncated, the DictWriter wrote a whole bunch of NULL bytes (or at least that's my assumption). How do I prevent NULL bytes from being written to my file?

What actually goes wrong is that truncate() does not move the file position: after self.file.truncate(0) the position is still sitting at the end of the old data, so the next writerow() extends the file back out to that offset and the gap gets filled with NUL bytes. Instead of truncating in place, close and re-open the file (mode w+b truncates and puts the position back at 0), then re-initialize the csvReader and csvWriter.
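A minimal sketch of that fix, reusing the attributes of the class above (the reset_file helper name is mine, not from the question):

def reset_file(self):
    # Hypothetical helper: closing and re-opening in 'w+b' empties the file
    # and leaves the write position at offset 0, so no zero-filled gap can appear.
    self.file.close()
    self.file = open(self.file_to_write, 'w+b')
    self.csvReader = csv.DictReader(self.file, fieldnames=self.fieldNames)
    self.csvWriter = csv.DictWriter(self.file, fieldnames=self.fieldNames, extrasaction='ignore')

Calling self.reset_file() in looper where self.file.truncate(0) is now should avoid the NULL bytes; a self.file.seek(0) right after the truncate would achieve the same thing by keeping the write position and the file length in sync.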

Pandas Read Continuously Growing CSV File

I have a continuously growing CSV File, that I want to periodically read. I am also only interested in new values.
I was hoping to do something like:
file_chunks = pd.read_csv('file.csv', chunksize=1)
while True:
    do_something(next(file_chunks))
    time.sleep(0.1)
at a frequency that is faster than the .csv file grows.
However, as soon as the iterator fails to return a value once, it "breaks" and never returns values again, even if the .csv file has grown in the meantime.
Is there a way to read continuously growing .csv files line by line?
You could build a try/except around it, or add an if statement that first checks whether file_chunks returned something.
That way it shouldn't break anymore; it only sleeps when there are no more chunks left, and re-creating the reader in the outer loop picks up newly appended rows.
while True:
    file_chunks = pd.read_csv('file.csv', chunksize=1)
    while True:
        try:
            do_something(next(file_chunks))
        except StopIteration:
            # the current iterator is exhausted: wait a bit, then re-open the file
            time.sleep(0.1)
            break
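A possible refinement of that idea (not part of the original answer; it assumes the file keeps its header row and that already-processed rows can simply be skipped): count the data rows handled so far and pass the count to skiprows, so each re-read only yields new values.

import time
import pandas as pd

rows_seen = 0
while True:
    # skip the data rows handled on earlier passes, but keep the header (row 0)
    file_chunks = pd.read_csv('file.csv', chunksize=1,
                              skiprows=range(1, rows_seen + 1))
    for chunk in file_chunks:
        do_something(chunk)   # placeholder from the question
        rows_seen += 1
    time.sleep(0.1)           # give the file time to grow before re-reading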
This is easier to do with the standard csv module, where you can write your own line iterator that knows how to read an updating file. The generator below reads in binary mode so that it can track the file position, closes the file at EOF and polls its size for appended data. This can fail if the reader gets a partial file update because the other side hasn't flushed yet, or if a CSV cell contains an embedded newline that invalidates the reader's assumption that a binary-mode newline always terminates a row.
import csv
import time
import os
import threading
import random

def rolling_reader(filename, poll_period=.1, encoding="utf-8"):
    pos = 0
    while True:
        # wait until the file exists and has grown past the last read position
        while True:
            try:
                if os.stat(filename).st_size > pos:
                    break
            except FileNotFoundError:
                pass
            time.sleep(poll_period)
        fp = open(filename, "rb")
        fp.seek(pos)
        for line in fp:
            if line.strip():
                yield line.decode(encoding)
        pos = fp.tell()
        fp.close()

# ---- TEST - thread updates test.csv periodically

class GenCSVThread(threading.Thread):
    def __init__(self, csv_name):
        super().__init__(daemon=True)
        self.csv_name = csv_name
        self.start()

    def run(self):
        val = 1
        while True:
            with open(self.csv_name, "a") as fp:
                for _ in range(random.randrange(4)):
                    fp.write(",".join(str(val) for _ in range(4)) + "\n")
                    val += 1
            time.sleep(random.random())

if os.path.exists("test.csv"):
    os.remove("test.csv")

test_gen = GenCSVThread("test.csv")
reader = csv.reader(rolling_reader("test.csv"))
for row in reader:
    print(row)
A platform-dependent improvement would be to use a facility such as inotify to trigger reads off of a file-close event instead of polling, which reduces the risk of picking up partial data.
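A rough sketch of that variant, assuming the third-party watchdog package (which wraps inotify on Linux). The handler just sets a threading.Event that the reading loop could wait on instead of sleeping; on_modified is used here because the close-write notification is inotify-specific.

import threading
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler

class CSVChangeHandler(FileSystemEventHandler):
    """Sets an event whenever the watched CSV file changes on disk."""
    def __init__(self, filename, changed):
        self.filename = filename
        self.changed = changed

    def on_modified(self, event):
        # event.src_path is the path of the file that triggered the event
        if event.src_path.endswith(self.filename):
            self.changed.set()

changed = threading.Event()
observer = Observer()
observer.schedule(CSVChangeHandler("test.csv", changed), path=".", recursive=False)
observer.start()

# in rolling_reader, the time.sleep(poll_period) call could then be replaced by
#     changed.wait()
#     changed.clear()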

Python3 How to split a large text file into smaller files based on line content

I have a file with the data
# FULL_ID BJD MAG UNCERT FLAG
and nearly 12,000 rows. This table contains data for 32 objects, each identified by a unique FULL_ID. So for instance it may say
# FULL_ID BJD MAG UNCERT FLAG
2_543 3215.52 19.78 0.02937 OO
2_543 3215.84 19.42 0.02231 OO
3_522 3215.52 15.43 0.01122 OO
3_522 3222.22 16.12 0.01223 OO
What I want is to run this file BigData.dat through the code, and end up with multiple files e.g. 2_543.dat, 3_522.dat etc, each containing:
# BJD MAG UNCERT FLAG
for all rows of BigData.dat that belonged to that FULL_ID.
Currently I'm doing this:
with open(path, 'r') as BigFile:
    line = BigFile.readline()
    for line in BigFile:
        fields = line.split(None)
        id = fields[0]
        output = open(id+".dat", 'a')
        writeline = str(fields[1])+' '+str(fields[2])+' '+str(fields[3])+' '+str(fields[4])+'\n'
        output.write(writeline)
        output.close()
which does produce the correct outputs but they don't have the header line:
# BJD MAG UNCERT FLAG
How can I ensure this line is at the top of each file?
Opening a file is an expensive operation, and repeatedly doing so for each input line is not efficient. I would instead keep a mapping of seen FULL_ID values to a file object. If a FULL_ID is not present, then the file has to be opened in "w" mode and the header should be immediately added. This way:
the header is correctly written to the output files
if the script is run more than once, the old values in the output files are correctly erased
Code could be:
with open(path) as bigFile:
    outfiles = {}   # mapping FULL_ID -> output file
    header = ' '.join(['#'] + next(bigFile).split()[2:])   # compute output header
    for line in bigFile:
        row = line.split()
        try:
            output = outfiles[row[0]]
        except KeyError:
            output = open(f'{row[0]}.dat', 'w')
            print(header, file=output)
            outfiles[row[0]] = output
        print(' '.join(row[1:]), file=output)
    for output in outfiles.values():   # close all files before exiting
        output.close()
The limit is that you have to keep all the files open until the end of the input file. That should work for 32 objects, but would break for much larger numbers. The efficient way would then be to change the simple dict into a more sophisticated cache, capable of closing the oldest open file when capacity is exhausted and reopening it (in append mode) when needed.
Here is a possible cache implementation:
class FileCache:
    """Caches a number of open files referenced by string ids.
    (by default the id is the name)"""

    def __init__(self, size, namemapping=None, header=None):
        """Creates a new cache of size size.
        namemapping is a function that gives the filename from an id;
        header is an optional header that will be written at creation time.
        """
        self.size = size
        self.namemapping = namemapping if namemapping is not None \
            else lambda x: x
        self.header = header
        self.map = {}                         # dict id -> slot number
        self.slots = [(None, None)] * size    # list of pairs (id, file object)
        self.curslot = 0                      # next slot to be used

    def getFile(self, id):
        """Gets an open file from the cache.
        Returns it directly if it is still active, reopening it in append
        mode if it had been evicted. Adds it to the cache if absent and
        opens it in truncate mode."""
        try:
            slot = self.map[id]
            if slot != -1:
                return self.slots[slot][1]    # found and active
            mode = 'a'                        # known id: needs re-opening
        except KeyError:
            mode = 'w'                        # new id: create the file
        slot = self.curslot
        self.curslot = (slot + 1) % self.size
        if self.slots[slot][0] is not None:   # close the slot's previous occupant, if any
            self.slots[slot][1].close()
            self.map[self.slots[slot][0]] = -1
        fd = open(self.namemapping(id), mode)
        # if the file is new, write the optional header
        if (mode == 'w') and self.header is not None:
            print(self.header, file=fd)
        self.slots[slot] = (id, fd)
        self.map[id] = slot
        return fd

    def close(self):
        """Closes any cached file."""
        for i in self.slots:
            if i[0] is not None:
                i[1].close()
                self.map[i[0]] = -1
        self.slots = [(None, None)] * self.size
Above code would become:
with open(path) as bigFile:
    header = ' '.join(['#'] + next(bigFile).split()[2:])    # compute output header
    outfiles = FileCache(10, lambda x: x + '.dat', header)  # cache FULL_ID -> file
    for line in bigFile:
        row = line.split()
        output = outfiles.getFile(row[0])
        print(' '.join(row[1:]), file=output)
    outfiles.close()   # close all files before exiting
You are overwriting the header line in the for loop; keep it in a separate variable. Additionally, you could remember whether the header was already written to a given file:
path = 'big.dat'
header_written = []

with open(path, 'r') as BigFile:
    header = BigFile.readline()   # keep the header separately!
    for line in BigFile:
        fields = line.split(None)
        _id = fields[0]
        output = open(_id+".dat", 'a')
        if _id not in header_written:   # track whether the header was already written for this ID
            output.write(header)
            header_written.append(_id)
        writeline = str(fields[1])+' '+str(fields[2])+' '+str(fields[3])+' '+str(fields[4])+'\n'
        output.write(writeline)
        output.close()
Resulting output file (e.g. 2_543.dat):
# FULL_ID BJD MAG UNCERT FLAG
3215.52 19.78 0.02937 OO
3215.84 19.42 0.02231 OO

Add header and footer to each line in a file. Function returns only first line

I have a problem with some of my Python code. I want it to open a file with a few lines of text, and add a header + footer to each line in that file.
The problem is that the create_output() function returns only the first line with the additional content. If I switch 'return' to 'print' at the end of this function, it properly displays all lines from my file. What could be the reason? I want to understand what I am doing wrong here.
file_path = '/home/user/Desktop/text.txt'
file_path_edited = '/home/user/Desktop/text_new.txt'
header = 'http://'
footer = '.com'

def open_file():
    opened_file = open(file_path)
    return opened_file

def edit_file():
    edited_file = open(file_path_edited, 'w')
    return edited_file

def create_output():
    for line in open_file():
        line = line.strip()
        edited_line = header+line+footer
        to_file = edit_file()
        to_file.writelines(edited_line)
        to_file.close()
        return edited_line

print (create_output())
OK, I changed it to something like this, and now it works fine.
Thanks for your feedback, now I know what I was doing wrong.
file_path = '/home/user/Desktop/text.txt'
file_path_edited = '/home/user/Desktop/text_new.txt'
header = 'http://'
footer = '.com'

def CreateContent():
    with open(file_path) as read_file:
        with open(file_path_edited, 'w') as write_file:
            for line in read_file.readlines():
                new_line = "{}{}{}".format(header, line.strip(), footer)
                print(new_line)
                write_file.write("{}\n".format(new_line))

CreateContent()
You get only one line because you return from inside the for loop, and you reopen the write-file on every iteration instead of keeping it open; 'w' truncates the file on open, so only the most recently written line survives and the rest is wasted IO. Also, you never close your reader, as far as I can see.
open(filename, mode) from https://docs.python.org/2/tutorial/inputoutput.html#reading-and-writing-files:
mode can be 'r' when the file will only be read, 'w' for only writing (an existing file with the same name will be erased), and 'a' opens the file for appending; any data written to the file is automatically added to the end. 'r+' opens the file for both reading and writing. The mode argument is optional; 'r' will be assumed if it’s omitted.
Do not split the file opening into extra functions; use with open(...) as bla: bla.write(...) so the files get closed as soon as you leave the block or an exception happens.
Use string formatting - either 'this {} is replaced'.format("something") or the inline f-string variant - see below.
def create_output():
    modLines = []
    with open('/home/user/Desktop/text.txt', "r") as reader, \
         open('/home/user/Desktop/text_new.txt', "w") as writer:
        for line in reader:
            line = line.strip()              # line.rstrip('\n') might be better if you only want to cut the \n
            modline = f'http://{line}.com'   # 3.6 inline string formatting, for 2.7 use
            modLines.append(modline)         # 'http://{}.com'.format(line)
            writer.write(modline + "\n")     # write does not auto-append \n
    return modLines                          # return the list of written lines

print(create_output())
Should do the trick.
Links:
Format string syntax
Reading and writing files
You could further improve your code as follows:
file_path = '/home/user/Desktop/text.txt'
file_path_edited = '/home/user/Desktop/text_new.txt'
header = 'http://'
footer = '.com'

def CreateContent():
    with open(file_path) as read_file, open(file_path_edited, 'w') as write_file:
        for line in read_file:
            write_file.write("{}{}{}\n".format(header, line.strip(), footer))

CreateContent()

How can I retain ASCII hex code points when writing an ElementTree in Python?

I've loaded an XML file (Rhythmbox's database file) into Python 3 via the ElementTree parser. After modifying the tree and writing it to disk (ElementTree.write()) using the ascii encoding, all of the characters that were stored as hexadecimal character references get converted to decimal character references. For example, here is a diff containing the copyright symbol:
< <copyright>&#xA9; WNYC</copyright>
---
> <copyright>&#169; WNYC</copyright>
Is there any way to tell Python/ElementTree not to do this? I'd like all the character references to stay in hex form.
I found a solution. First I created a new codec error handler and then monkey patched ElementTree._get_writer() to use the new error handler. Looks like:
from xml.etree import ElementTree
import io
import contextlib
import codecs

def lower_first(s):
    return s[:1].lower() + s[1:] if s else ''

def html_replace(exc):
    if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
        s = []
        for c in exc.object[exc.start:exc.end]:
            s.append('&#%s;' % lower_first(hex(ord(c))[1:].upper()))
        return ''.join(s), exc.end
    else:
        raise TypeError("can't handle %s" % type(exc).__name__)

codecs.register_error('html_replace', html_replace)

# monkey patch this python function to prevent it from using xmlcharrefreplace
@contextlib.contextmanager
def _get_writer(file_or_filename, encoding):
    # returns text write method and release all resources after using
    try:
        write = file_or_filename.write
    except AttributeError:
        # file_or_filename is a file name
        if encoding == "unicode":
            file = open(file_or_filename, "w")
        else:
            file = open(file_or_filename, "w", encoding=encoding,
                        errors="html_replace")
        with file:
            yield file.write
    else:
        # file_or_filename is a file-like object
        # encoding determines if it is a text or binary writer
        if encoding == "unicode":
            # use a text writer as is
            yield write
        else:
            # wrap a binary writer with TextIOWrapper
            with contextlib.ExitStack() as stack:
                if isinstance(file_or_filename, io.BufferedIOBase):
                    file = file_or_filename
                elif isinstance(file_or_filename, io.RawIOBase):
                    file = io.BufferedWriter(file_or_filename)
                    # Keep the original file open when the BufferedWriter is
                    # destroyed
                    stack.callback(file.detach)
                else:
                    # This is to handle passed objects that aren't in the
                    # IOBase hierarchy, but just have a write method
                    file = io.BufferedIOBase()
                    file.writable = lambda: True
                    file.write = write
                    try:
                        # TextIOWrapper uses these methods to determine
                        # if BOM (for UTF-16, etc) should be added
                        file.seekable = file_or_filename.seekable
                        file.tell = file_or_filename.tell
                    except AttributeError:
                        pass
                file = io.TextIOWrapper(file,
                                        encoding=encoding,
                                        errors='html_replace',
                                        newline="\n")
                # Keep the original file open when the TextIOWrapper is
                # destroyed
                stack.callback(file.detach)
                yield file.write

ElementTree._get_writer = _get_writer
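With the patch installed, writing the tree with an explicit encoding goes through the replacement error handler; a usage sketch (the file names here are placeholders, not from the question):

tree = ElementTree.parse('rhythmdb.xml')
# ... modify the tree ...
tree.write('rhythmdb-new.xml', encoding='ascii', xml_declaration=True)
# non-ASCII characters such as the copyright symbol should now come out
# as &#xA9; rather than &#169;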

Can you skip non utf-8 data in python csv?

I am dealing with a very large csv file in Python where some lines throw the error "'utf-8' codec can't decode byte 0x9b in position 7657: invalid start byte". Is there a way to skip lines that aren't utf-8 without fixing or deleting the data by hand?
for filename in filenames:
    f = open(filename, 'rt')
    reader = csv.reader(f, delimiter=',')
    for row in reader:
        pass   # process data for future use
I can't use the non-utf-8 data anyway, because later processing steps require utf-8.
You could use a filter that reads a line as raw bytes, tries to convert it to unicode as UTF-8 and then:
if successful, passes it down to the csv reader
if not, stores it for later analysis
Assuming that you are using Python 2, you could use something like:
class MyFilter:
    def __init__(self, instr, errstr):
        self.instr = instr
        self.errstr = errstr

    def __enter__(self):
        print("ENTERING filter")
        return self

    def __exit__(self, a, b, c):
        print("EXITING filter")
        self.instr.close()
        self.errstr.close()
        return False

    def __next__(self):
        line = next(self.instr)
        while True:
            try:
                t = line.decode('utf8')
                return line.strip()
            except UnicodeDecodeError:
                self.errstr.write(line)
                line = next(self.instr)

    def __iter__(self):
        return self

    def next(self):
        return self.__next__()
You could then use it this way (assuming Python 2.7), getting all offending lines in err.txt:
with open('file.csv') as istream, open("err.txt", 'w') as err, MyFilter(istream, err) as fd:
    c = csv.reader(fd)
    for i in c:
        print i   # do your stuff with each row
If you use Python 3, you can use almost the same filter class, simply replacing the line return line.strip() with return t.strip(), in order to return a string and not bytes.
Usage is again almost the same:
with open('file.csv', 'rb') as istream, open("err.txt", 'wb') as err, MyFilter(istream, err) as fd:
    c = csv.reader(fd)
    for i in c:
        print(i)   # do your stuff with each row
Per your comment, you also want to filter out lines containing NUL characters. This only needs a slight change in the filter, the while block becoming (Python 3 version):
while True:
    if b'\x00' not in line:
        try:
            t = line.decode('utf8')
            return t.strip()
        except UnicodeDecodeError:
            pass
    self.errstr.write(line)
    line = next(self.instr)
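For reference, this is how the full Python 3 variant could look once those two changes (returning t.strip() and rejecting NUL-containing lines) are folded into the class; it is simply the answer's pieces assembled, not a separately tested version:

class MyFilter:
    def __init__(self, instr, errstr):
        self.instr = instr     # binary input stream (the csv file, opened 'rb')
        self.errstr = errstr   # binary output stream for rejected lines ('wb')

    def __enter__(self):
        return self

    def __exit__(self, a, b, c):
        self.instr.close()
        self.errstr.close()
        return False

    def __iter__(self):
        return self

    def __next__(self):
        line = next(self.instr)
        while True:
            # reject lines with NUL bytes or invalid utf-8; log them to errstr
            if b'\x00' not in line:
                try:
                    return line.decode('utf8').strip()
                except UnicodeDecodeError:
                    pass
            self.errstr.write(line)
            line = next(self.instr)

It is used exactly as in the 'rb'/'wb' example above.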
