Python remove entry from zipfile - python

I'm currently writing an open source library for a container format, which involves modifying zip archives. Therefore I utilized pythons build-in zipfile module. Due to some limitations I decided to modify the module and ship it with my library. These modifications include a patch for removing entries from the zip file from the python issue tracker: https://bugs.python.org/issue6818
To be more specific I included the zipfile.remove.2.patch from ubershmekel.
After some modifications for Python-2.7 the patch works just fine according to the shipped unit-tests.
But nevertheless I'm running into some problems, when removing, adding and removing + adding files without closing the zipfile in between.
Error
Traceback (most recent call last):
File "/home/martin/git/pyCombineArchive/tests/test_zipfile.py", line 1590, in test_delete_add_no_close
self.assertEqual(zf.read(fname), data)
File "/home/martin/git/pyCombineArchive/combinearchive/custom_zip.py", line 948, in read
with self.open(name, "r", pwd) as fp:
File "/home/martin/git/pyCombineArchive/combinearchive/custom_zip.py", line 1003, in open
% (zinfo.orig_filename, fname))
BadZipFile: File name in directory 'foo.txt' and header 'bar.txt' differ.
Meaning the zip file is ok, but somehow the central dictionary/entry header gets messed up.
This unittest reproduces this error:
def test_delete_add_no_close(self):
fname_list = ["foo.txt", "bar.txt", "blu.bla", "sup.bro", "rollah"]
data_list = [''.join([chr(randint(0, 255)) for i in range(100)]) for i in range(len(fname_list))]
# add some files to the zip
with zipfile.ZipFile(TESTFN, "w") as zf:
for fname, data in zip(fname_list, data_list):
zf.writestr(fname, data)
for no in range(0, 2):
with zipfile.ZipFile(TESTFN, "a") as zf:
zf.remove(fname_list[no])
zf.writestr(fname_list[no], data_list[no])
zf.remove(fname_list[no+1])
zf.writestr(fname_list[no+1], data_list[no+1])
# try to access prior deleted/added file and prior last file (which got moved, while delete)
for fname, data in zip(fname_list, data_list):
self.assertEqual(zf.read(fname), data)
My modified zipfile module and the complete unittest file can be found in this gist: https://gist.github.com/FreakyBytes/30a6f9866154d82f1c3863f2e4969cc4

After some intensive debugging, I'm quite sure something went wrong with moving the remaining chunks. (The ones stored after the removed file) So I went ahead and rewrote this code part, so it copies these files/chunks each at a time. Also I rewrite the file header for each of them (to make sure it is valid) and the central directory at the end of the zipfile.
My remove function now looks like this:
def remove(self, member):
"""Remove a file from the archive. Only works if the ZipFile was opened
with mode 'a'."""
if "a" not in self.mode:
raise RuntimeError('remove() requires mode "a"')
if not self.fp:
raise RuntimeError(
"Attempt to modify ZIP archive that was already closed")
fp = self.fp
# Make sure we have an info object
if isinstance(member, ZipInfo):
# 'member' is already an info object
zinfo = member
else:
# Get info object for member
zinfo = self.getinfo(member)
# start at the pos of the first member (smallest offset)
position = min([info.header_offset for info in self.filelist]) # start at the beginning of first file
for info in self.filelist:
fileheader = info.FileHeader()
# is member after delete one?
if info.header_offset > zinfo.header_offset and info != zinfo:
# rewrite FileHeader and copy compressed data
# Skip the file header:
fp.seek(info.header_offset)
fheader = fp.read(sizeFileHeader)
if fheader[0:4] != stringFileHeader:
raise BadZipFile("Bad magic number for file header")
fheader = struct.unpack(structFileHeader, fheader)
fname = fp.read(fheader[_FH_FILENAME_LENGTH])
if fheader[_FH_EXTRA_FIELD_LENGTH]:
fp.read(fheader[_FH_EXTRA_FIELD_LENGTH])
if zinfo.flag_bits & 0x800:
# UTF-8 filename
fname_str = fname.decode("utf-8")
else:
fname_str = fname.decode("cp437")
if fname_str != info.orig_filename:
if not self._filePassed:
fp.close()
raise BadZipFile(
'File name in directory %r and header %r differ.'
% (zinfo.orig_filename, fname))
# read the actual data
data = fp.read(fheader[_FH_COMPRESSED_SIZE])
# modify info obj
info.header_offset = position
# jump to new position
fp.seek(info.header_offset, 0)
# write fileheader and data
fp.write(fileheader)
fp.write(data)
if zinfo.flag_bits & _FHF_HAS_DATA_DESCRIPTOR:
# Write CRC and file sizes after the file data
fp.write(struct.pack("<LLL", info.CRC, info.compress_size,
info.file_size))
# update position
fp.flush()
position = fp.tell()
elif info != zinfo:
# move to next position
position = position + info.compress_size + len(fileheader) + self._get_data_descriptor_size(info)
# Fix class members with state
self.start_dir = position
self._didModify = True
self.filelist.remove(zinfo)
del self.NameToInfo[zinfo.filename]
# write new central directory (includes truncate)
fp.seek(position, 0)
self._write_central_dir()
fp.seek(self.start_dir, 0) # jump to the beginning of the central directory, so it gets overridden at close()
You can find the complete code in the latest revision of the gist: https://gist.github.com/FreakyBytes/30a6f9866154d82f1c3863f2e4969cc4
or in the repo of the library I'm writing: https://github.com/FreakyBytes/pyCombineArchive

Related

How to edit a file in python without a temporary sacrificial file? [duplicate]

I'm using Python, and would like to insert a string into a text file without deleting or copying the file. How can I do that?
Unfortunately there is no way to insert into the middle of a file without re-writing it. As previous posters have indicated, you can append to a file or overwrite part of it using seek but if you want to add stuff at the beginning or the middle, you'll have to rewrite it.
This is an operating system thing, not a Python thing. It is the same in all languages.
What I usually do is read from the file, make the modifications and write it out to a new file called myfile.txt.tmp or something like that. This is better than reading the whole file into memory because the file may be too large for that. Once the temporary file is completed, I rename it the same as the original file.
This is a good, safe way to do it because if the file write crashes or aborts for any reason, you still have your untouched original file.
Depends on what you want to do. To append you can open it with "a":
with open("foo.txt", "a") as f:
f.write("new line\n")
If you want to preprend something you have to read from the file first:
with open("foo.txt", "r+") as f:
old = f.read() # read everything in the file
f.seek(0) # rewind
f.write("new line\n" + old) # write the new line before
The fileinput module of the Python standard library will rewrite a file inplace if you use the inplace=1 parameter:
import sys
import fileinput
# replace all occurrences of 'sit' with 'SIT' and insert a line after the 5th
for i, line in enumerate(fileinput.input('lorem_ipsum.txt', inplace=1)):
sys.stdout.write(line.replace('sit', 'SIT')) # replace 'sit' and write
if i == 4: sys.stdout.write('\n') # write a blank line after the 5th line
Rewriting a file in place is often done by saving the old copy with a modified name. Unix folks add a ~ to mark the old one. Windows folks do all kinds of things -- add .bak or .old -- or rename the file entirely or put the ~ on the front of the name.
import shutil
shutil.move(afile, afile + "~")
destination= open(aFile, "w")
source= open(aFile + "~", "r")
for line in source:
destination.write(line)
if <some condition>:
destination.write(<some additional line> + "\n")
source.close()
destination.close()
Instead of shutil, you can use the following.
import os
os.rename(aFile, aFile + "~")
Python's mmap module will allow you to insert into a file. The following sample shows how it can be done in Unix (Windows mmap may be different). Note that this does not handle all error conditions and you might corrupt or lose the original file. Also, this won't handle unicode strings.
import os
from mmap import mmap
def insert(filename, str, pos):
if len(str) < 1:
# nothing to insert
return
f = open(filename, 'r+')
m = mmap(f.fileno(), os.path.getsize(filename))
origSize = m.size()
# or this could be an error
if pos > origSize:
pos = origSize
elif pos < 0:
pos = 0
m.resize(origSize + len(str))
m[pos+len(str):] = m[pos:origSize]
m[pos:pos+len(str)] = str
m.close()
f.close()
It is also possible to do this without mmap with files opened in 'r+' mode, but it is less convenient and less efficient as you'd have to read and temporarily store the contents of the file from the insertion position to EOF - which might be huge.
As mentioned by Adam you have to take your system limitations into consideration before you can decide on approach whether you have enough memory to read it all into memory replace parts of it and re-write it.
If you're dealing with a small file or have no memory issues this might help:
Option 1)
Read entire file into memory, do a regex substitution on the entire or part of the line and replace it with that line plus the extra line. You will need to make sure that the 'middle line' is unique in the file or if you have timestamps on each line this should be pretty reliable.
# open file with r+b (allow write and binary mode)
f = open("file.log", 'r+b')
# read entire content of file into memory
f_content = f.read()
# basically match middle line and replace it with itself and the extra line
f_content = re.sub(r'(middle line)', r'\1\nnew line', f_content)
# return pointer to top of file so we can re-write the content with replaced string
f.seek(0)
# clear file content
f.truncate()
# re-write the content with the updated content
f.write(f_content)
# close file
f.close()
Option 2)
Figure out middle line, and replace it with that line plus the extra line.
# open file with r+b (allow write and binary mode)
f = open("file.log" , 'r+b')
# get array of lines
f_content = f.readlines()
# get middle line
middle_line = len(f_content)/2
# overwrite middle line
f_content[middle_line] += "\nnew line"
# return pointer to top of file so we can re-write the content with replaced string
f.seek(0)
# clear file content
f.truncate()
# re-write the content with the updated content
f.write(''.join(f_content))
# close file
f.close()
Wrote a small class for doing this cleanly.
import tempfile
class FileModifierError(Exception):
pass
class FileModifier(object):
def __init__(self, fname):
self.__write_dict = {}
self.__filename = fname
self.__tempfile = tempfile.TemporaryFile()
with open(fname, 'rb') as fp:
for line in fp:
self.__tempfile.write(line)
self.__tempfile.seek(0)
def write(self, s, line_number = 'END'):
if line_number != 'END' and not isinstance(line_number, (int, float)):
raise FileModifierError("Line number %s is not a valid number" % line_number)
try:
self.__write_dict[line_number].append(s)
except KeyError:
self.__write_dict[line_number] = [s]
def writeline(self, s, line_number = 'END'):
self.write('%s\n' % s, line_number)
def writelines(self, s, line_number = 'END'):
for ln in s:
self.writeline(s, line_number)
def __popline(self, index, fp):
try:
ilines = self.__write_dict.pop(index)
for line in ilines:
fp.write(line)
except KeyError:
pass
def close(self):
self.__exit__(None, None, None)
def __enter__(self):
return self
def __exit__(self, type, value, traceback):
with open(self.__filename,'w') as fp:
for index, line in enumerate(self.__tempfile.readlines()):
self.__popline(index, fp)
fp.write(line)
for index in sorted(self.__write_dict):
for line in self.__write_dict[index]:
fp.write(line)
self.__tempfile.close()
Then you can use it this way:
with FileModifier(filename) as fp:
fp.writeline("String 1", 0)
fp.writeline("String 2", 20)
fp.writeline("String 3") # To write at the end of the file
If you know some unix you could try the following:
Notes: $ means the command prompt
Say you have a file my_data.txt with content as such:
$ cat my_data.txt
This is a data file
with all of my data in it.
Then using the os module you can use the usual sed commands
import os
# Identifiers used are:
my_data_file = "my_data.txt"
command = "sed -i 's/all/none/' my_data.txt"
# Execute the command
os.system(command)
If you aren't aware of sed, check it out, it is extremely useful.

Generator Reading Log File While Name Changes Python

The goal is to read a log file in real time line by line (standard generator stuff) but the catch is, the file name changes at various intervals. The name change can't be helped (application dictated appended with a time string) and the name is changed when the log file size reaches ~2MB (guesstimate).
My approach was to create a file getter function that got the file (or new file) and then passed that to the generator. I thought that when the file changed names I would get a 'File not found' error, but what my test showed, is that the file name change is prevented entirely as 'another program is using this file'. The name change must be allowed, and this reader code cannot interfere with the application logging process at all.
import os
import time
import fnmatch
directory = '\\foo\\'
def fileGenerator(logFile):
""" Run a line generator """
logFile.seek(0,2)
while True:
line = logFile.readline()
if not line:
time.sleep(0.1)
continue
yield line
def fileGetter():
""" Get the Logging File """
matchedFiles = []
for afile in os.listdir(directory):
if fnmatch.fnmatch(afile,'amc_*.txt'):
matchedFiles.append(afile)
if len(matchedFiles)==1:
#There was exactly one matching file found send it to the generator
return os.path.join(directory,matchedFiles[0])
else:
#There either wasn't a file found or many matching
#Error out and stop process... critical error
if __name__ == '__main__':
filePath = fileGetter()
try:
logFile = open(filePath,"r")
except Exception as e:
#Catch the file not found and go back to the file path getter
#Send the file back to the generator
print e
if logFile:
loglines = fileGenerator(logFile)
for line in loglines:
#handle the line
print line,
If you can't hold the file open while waiting for new content to be written to it, I suggest saving the file position you were last at and closing the file before you sleep, and then reopening the file and seeking to that point afterwards. You could also investigate filesystem notification systems if you care about spotting file additions or renames immediately.
def log_reader():
filename = "does_not_exist"
filepos = 0
while True:
try:
file = open(filename)
except FileNotFoundError:
filename = fileGetter()
# if renamed files start empty, set filepos to zero here!
continue
file.seek(filepos)
while True:
line = file.readline()
if not line:
filepos = file.tell()
file.close()
sleep(0.1) # you may want to test different sleep lengths to avoid FS thrash
break
yield line
The opening and closing of the file may stress out your filesystem if you do it too much, so I'd suggest sleeping longer than your previous code did (but you may want to test to see how well your OS handles it if you care about how responsive your log reader is).

Creating new text file in Python?

Is there a method of creating a text file without opening a text file in "w" or "a" mode? For instance If I wanted to open a file in "r" mode but the file does not exist then when I catch IOError I want a new file to be created
e.g.:
while flag == True:
try:
# opening src in a+ mode will allow me to read and append to file
with open("Class {0} data.txt".format(classNo),"r") as src:
# list containing all data from file, one line is one item in list
data = src.readlines()
for ind,line in enumerate(data):
if surname.lower() and firstName.lower() in line.lower():
# overwrite the relevant item in data with the updated score
data[ind] = "{0} {1}\n".format(line.rstrip(),score)
rewrite = True
else:
with open("Class {0} data.txt".format(classNo),"a") as src:
src.write("{0},{1} : {2}{3} ".format(surname, firstName, score,"\n"))
if rewrite == True:
# reopen src in write mode and overwrite all the records with the items in data
with open("Class {} data.txt".format(classNo),"w") as src:
src.writelines(data)
flag = False
except IOError:
print("New data file created")
# Here I want a new file to be created and assigned to the variable src so when the
# while loop iterates for the second time the file should successfully open
At the beginning just check if the file exists and create it if it doesn't:
filename = "Class {0} data.txt"
if not os.path.isfile(filename):
open(filename, 'w').close()
From this point on you can assume the file exists, this will greatly simplify your code.
No operating system will allow you to create a file without actually writing to it. You can encapsulate this in a library so that the creation is not visible, but it is impossible to avoid writing to the file system if you really want to modify the file system.
Here is a quick and dirty open replacement which does what you propose.
def open_for_reading_create_if_missing(filename):
try:
handle = open(filename, 'r')
except IOError:
with open(filename, 'w') as f:
pass
handle = open(filename, 'r')
return handle
Better would be to create the file if it doesn't exist, e.g. Something like:
import sys, os
def ensure_file_exists(file_name):
""" Make sure that I file with the given name exists """
(the_dir, fname) = os.path.split(file_name)
if not os.path.exists(the_dir):
sys.mkdirs(the_dir) # This may give an exception if the directory cannot be made.
if not os.path.exists(file_name):
open(file_name, 'w').close()
You could even have a safe_open function that did something similar prior to opening for read and returning the file handle.
The sample code provided in the question is not very clear, specially because it invokes multiple variables that are not defined anywhere. But based on it here is my suggestion. You can create a function similar to touch + file open, but which will be platform agnostic.
def touch_open( filename):
try:
connect = open( filename, "r")
except IOError:
connect = open( filename, "a")
connect.close()
connect = open( filename, "r")
return connect
This function will open the file for you if it exists. If the file doesn't exist it will create a blank file with the same name and the open it. An additional bonus functionality with respect to import os; os.system('touch test.txt') is that it does not create a child process in the shell making it faster.
Since it doesn't use the with open(filename) as src syntax you should either remember to close the connection at the end with connection = touch_open( filename); connection.close() or preferably you could open it in a for loop. Example:
file2open = "test.txt"
for i, row in enumerate( touch_open( file2open)):
print i, row, # print the line number and content
This option should be preferred to data = src.readlines() followed by enumerate( data), found in your code, because it avoids looping twice through the file.

How to redirect stdout to only the console when within fileinput loop

Currently I have this piece of code for python 2.7:
h = 0
for line in fileinput.input('HISTORY',inplace=1):
if line[0:2] == x:
h = h + 1
if h in AU:
line = line.replace(x,'AU')
if 'timestep' in line:
h = 0
sys.stdout.write(('\r%s%% ') % format(((os.stat('HISTORY').st_size / os.stat('HISTORY.bak').st_size)*100),'.1f'))
sys.stdout.write(line)
What I am having trouble with is the following line:
sys.stdout.write(('\r%s%% ') % format(((os.stat('HISTORY').st_size / os.stat('HISTORY.bak').st_size)*100),'.1f'))
I need this information to be outputted to the console ONLY and not into the HISTORY file.
This code creates a temporary copy of the input file, then scans this and rewrites the original file. It handles errors during processing the file so that the original data isn't lost during the re-write. It demonstrates how to write some data to stdout occasionally and other data back to the original file.
The temporary file creation was taken from this SO answer.
import fileinput
import os, shutil, tempfile
# create a copy of the source file into a system specified
# temporary directory. You could just put this in the original
# folder, if you wanted
def create_temp_copy(src_filename):
temp_dir = tempfile.gettempdir()
temp_path = os.path.join(temp_dir, 'temp-history.txt')
shutil.copy2(src_filename,temp_path)
return temp_path
# create a temporary copy of the input file
temp = create_temp_copy('HISTORY.txt')
# open up the input file for writing
dst = open('HISTORY.txt','w+')
for line in fileinput.input(temp):
# Added a try/catch to handle errors during processing.
# If this isn't present, any exceptions that are raised
# during processing could cause unrecoverable loss of
# the HISTORY file
try:
# some sort of replacement
if line.startswith('e'):
line = line.strip() + '#\n' # notice the newline here
# occasional status updates to stdout
if '0' in line:
print 'info:',line.strip() # notice the removal of the newline
except:
# when a problem occurs, just output a message
print 'Error processing input file'
finally:
# re-write the original input file
# even if there are exceptions
dst.write(line)
# deletes the temporary file
os.remove(temp)
# close the original file
dst.close()
If you only want the information to go to the console could you just use print instead?

python: read file continuously, even after it has been logrotated

I have a simple python script, where I read logfile continuosly (same as tail -f)
while True:
line = f.readline()
if line:
print line,
else:
time.sleep(0.1)
How can I make sure that I can still read the logfile, after it has been rotated by logrotate?
i.e. I need to do the same what tail -F would do.
I am using python 2.7
As long as you only plan to do this on Unix, the most robust way is probably to check so that the open file still refers to the same i-node as the name, and reopen it when that is no longer the case. You can get the i-number of the file from os.stat and os.fstat, in the st_ino field.
It could look like this:
import os, sys, time
name = "logfile"
current = open(name, "r")
curino = os.fstat(current.fileno()).st_ino
while True:
while True:
buf = current.read(1024)
if buf == "":
break
sys.stdout.write(buf)
try:
if os.stat(name).st_ino != curino:
new = open(name, "r")
current.close()
current = new
curino = os.fstat(current.fileno()).st_ino
continue
except IOError:
pass
time.sleep(1)
I doubt this works on Windows, but since you're speaking in terms of tail, I'm guessing that's not a problem. :)
You can do it by keeping track of where you are in the file and reopening it when you want to read. When the log file rotates, you notice that the file is smaller and since you reopen, you handle any unlinking too.
import time
cur = 0
while True:
try:
with open('myfile') as f:
f.seek(0,2)
if f.tell() < cur:
f.seek(0,0)
else:
f.seek(cur,0)
for line in f:
print line.strip()
cur = f.tell()
except IOError, e:
pass
time.sleep(1)
This example hides errors like file not found because I'm not sure of logrotate details such as small periods of time where the file is not available.
NOTE: In python 3, things are different. A regular open translates bytes to str and the interim buffer used for that conversion means that seek and tell don't operate properly (except when seeking to 0 or the end of file). Instead, open in binary mode ("rb") and do the decode manually line by line. You'll have to know the file encoding and what that encoding's newline looks like. For utf-8, its b"\n" (one of the reasons utf-8 is superior to utf-16, btw).
Thanks to #tdelaney and #Dolda2000's answers, I ended up with what follows. It should work on both Linux and Windows, and also handle logrotate's copytruncate or create options (respectively copy then truncate size to 0 and move then recreate file).
file_name = 'my_log_file'
seek_end = True
while True: # handle moved/truncated files by allowing to reopen
with open(file_name) as f:
if seek_end: # reopened files must not seek end
f.seek(0, 2)
while True: # line reading loop
line = f.readline()
if not line:
try:
if f.tell() > os.path.getsize(file_name):
# rotation occurred (copytruncate/create)
f.close()
seek_end = False
break
except FileNotFoundError:
# rotation occurred but new file still not created
pass # wait 1 second and retry
time.sleep(1)
do_stuff_with(line)
A limitation when using copytruncate option is that if lines are appended to the file while time-sleeping, and rotation occurs before wake-up, the last lines will be "lost" (they will still be in the now "old" log file, but I cannot see a decent way to "follow" that file to finish reading it). This limitation is not relevant with "move and create" create option because f descriptor will still point to the renamed file and therefore last lines will be read before the descriptor is closed and opened again.
Using 'tail -F
man tail
-F same as --follow=name --retr
-f, --follow[={name|descriptor}] output appended data as the file grows;
--retry keep trying to open a file if it is inaccessible
-F option will follow the name of the file not descriptor.
So when logrotate happens, it will follow the new file.
import subprocess
def tail(filename: str) -> Generator[str, None, None]:
proc = subprocess.Popen(["tail", "-F", filename], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
while True:
line = proc.stdout.readline()
if line:
yield line.decode("utf-8")
else:
break
for line in tail("/config/logs/openssh/current"):
print(line.strip())
I made a variation of the awesome above one by #pawamoy into a generator function one for my log monitoring and following needs.
def tail_file(file):
"""generator function that yields new lines in a file
:param file:File Path as a string
:type file: str
:rtype: collections.Iterable
"""
seek_end = True
while True: # handle moved/truncated files by allowing to reopen
with open(file) as f:
if seek_end: # reopened files must not seek end
f.seek(0, 2)
while True: # line reading loop
line = f.readline()
if not line:
try:
if f.tell() > os.path.getsize(file):
# rotation occurred (copytruncate/create)
f.close()
seek_end = False
break
except FileNotFoundError:
# rotation occurred but new file still not created
pass # wait 1 second and retry
time.sleep(1)
yield line
Which can be easily used like the below
import os, time
access_logfile = '/var/log/syslog'
loglines = tail_file(access_logfile)
for line in loglines:
print(line)

Categories