I have archive.zip with two files in it: hello.txt and world.txt.
I want to overwrite the hello.txt file with a new one, using this code:
import zipfile
z = zipfile.ZipFile('archive.zip','a')
z.write('hello.txt')
z.close()
but it won't overwrite the file; somehow it creates another instance of hello.txt (a WinZip screenshot of the archive shows both entries).
Since there is nothing like zipfile.remove(), what's the best way to handle this problem?
There's no way to do that with the Python zipfile module. You have to create a new zip file and recompress everything again from the first file, plus the new modified file.
Below is some code to do just that. Note that it isn't efficient, since it decompresses and then recompresses all data.
import tempfile
import zipfile
import shutil
import os

def remove_from_zip(zipfname, *filenames):
    tempdir = tempfile.mkdtemp()
    try:
        tempname = os.path.join(tempdir, 'new.zip')
        with zipfile.ZipFile(zipfname, 'r') as zipread:
            with zipfile.ZipFile(tempname, 'w') as zipwrite:
                for item in zipread.infolist():
                    if item.filename not in filenames:
                        data = zipread.read(item.filename)
                        zipwrite.writestr(item, data)
        shutil.move(tempname, zipfname)
    finally:
        shutil.rmtree(tempdir)
Usage:
remove_from_zip('archive.zip', 'hello.txt')
with zipfile.ZipFile('archive.zip', 'a') as z:
    z.write('hello.txt')
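If you do this often, a small convenience wrapper keeps the two steps together (replace_in_zip is my name for it, building directly on remove_from_zip above):

def replace_in_zip(zipfname, *filenames):
    # Replace entries in the archive with local files of the same names
    remove_from_zip(zipfname, *filenames)
    with zipfile.ZipFile(zipfname, 'a') as z:
        for filename in filenames:
            z.write(filename)

replace_in_zip('archive.zip', 'hello.txt')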
Building on nosklo's answer:
UpdateableZipFile is a class that inherits from ZipFile, maintains the same interface, but adds the ability to overwrite files (via writestr or write) and to remove files.
import os
import shutil
import tempfile
from zipfile import ZipFile, ZIP_STORED, ZipInfo


class UpdateableZipFile(ZipFile):
    """
    Adds delete (via remove_file) and update (via the writestr and write methods).
    To enable the update features, use UpdateableZipFile with the 'with' statement.
    Upon __exit__ (if updates were applied) a new zip file will overwrite the existing one with the updates.
    """

    class DeleteMarker(object):
        pass

    def __init__(self, file, mode="r", compression=ZIP_STORED, allowZip64=False):
        # Init base
        super(UpdateableZipFile, self).__init__(file, mode=mode,
                                                compression=compression,
                                                allowZip64=allowZip64)
        # track files to override in the zip
        self._replace = {}
        # Whether the with statement was used
        self._allow_updates = False

    def writestr(self, zinfo_or_arcname, bytes, compress_type=None):
        if isinstance(zinfo_or_arcname, ZipInfo):
            name = zinfo_or_arcname.filename
        else:
            name = zinfo_or_arcname
        # If the file exists and needs to be overridden,
        # mark the entry and create a temp file for it.
        # We allow this only if the with statement is used.
        if self._allow_updates and name in self.namelist():
            temp_file = self._replace[name] = self._replace.get(name,
                                                                tempfile.TemporaryFile())
            temp_file.write(bytes)
        # Otherwise just act normally
        else:
            super(UpdateableZipFile, self).writestr(zinfo_or_arcname,
                                                    bytes, compress_type=compress_type)

    def write(self, filename, arcname=None, compress_type=None):
        arcname = arcname or filename
        # If the file exists and needs to be overridden,
        # mark the entry and create a temp file for it.
        # We allow this only if the with statement is used.
        if self._allow_updates and arcname in self.namelist():
            temp_file = self._replace[arcname] = self._replace.get(arcname,
                                                                   tempfile.TemporaryFile())
            with open(filename, "rb") as source:
                shutil.copyfileobj(source, temp_file)
        # Otherwise just act normally
        else:
            super(UpdateableZipFile, self).write(filename,
                                                 arcname=arcname, compress_type=compress_type)

    def __enter__(self):
        # Allow updates
        self._allow_updates = True
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # call base to close the zip file, organically
        try:
            super(UpdateableZipFile, self).__exit__(exc_type, exc_val, exc_tb)
            if len(self._replace) > 0:
                self._rebuild_zip()
        finally:
            # In case rebuilding the zip failed,
            # be sure to still release all the temp files
            self._close_all_temp_files()
            self._allow_updates = False

    def _close_all_temp_files(self):
        for temp_file in self._replace.values():
            if hasattr(temp_file, 'close'):
                temp_file.close()

    def remove_file(self, path):
        self._replace[path] = self.DeleteMarker()

    def _rebuild_zip(self):
        tempdir = tempfile.mkdtemp()
        try:
            temp_zip_path = os.path.join(tempdir, 'new.zip')
            with ZipFile(self.filename, 'r') as zip_read:
                # Create the new zip with the assigned properties
                with ZipFile(temp_zip_path, 'w', compression=self.compression,
                             allowZip64=self._allowZip64) as zip_write:
                    for item in zip_read.infolist():
                        # Check if the file should be replaced or deleted
                        replacement = self._replace.get(item.filename, None)
                        # If marked for deletion, do not copy the file to the new zipfile
                        if isinstance(replacement, self.DeleteMarker):
                            del self._replace[item.filename]
                            continue
                        # If marked for replacement, copy the temp file instead of the old file
                        elif replacement is not None:
                            del self._replace[item.filename]
                            # Write the replacement to the archive,
                            # and then close it (deleting the temp file)
                            replacement.seek(0)
                            data = replacement.read()
                            replacement.close()
                        else:
                            data = zip_read.read(item.filename)
                        zip_write.writestr(item, data)
            # Overwrite the archive with the updated one
            shutil.move(temp_zip_path, self.filename)
        finally:
            shutil.rmtree(tempdir)
Usage example:
with UpdateableZipFile(r"C:\Temp\Test2.docx", "a") as o:
    # Overwrite a file with a string
    o.writestr("word/document.xml", "Some data")
    # Exclude an existing file from the zip
    o.remove_file("word/fontTable.xml")
    # Write a new file (with no conflict) to the zip
    o.writestr("new_file", "more data")
    # Overwrite a file with a file
    o.write(r"C:\Temp\example.png", "word/settings.xml")
Based on this answer, here's a quick and dirty way to monkey-patch the stock zipfile module to support file deletion (while we wait for it to be accepted into python:main):
from zipfile import ZipFile, ZipInfo
from operator import attrgetter
import functools


def enable_zip_remove(func):
    def _zipfile_remove_member(self, member):
        # get a sorted filelist by header offset, in case the dir order
        # doesn't match the actual entry order
        fp = self.fp
        entry_offset = 0
        filelist = sorted(self.filelist, key=attrgetter('header_offset'))
        for i in range(len(filelist)):
            info = filelist[i]
            # find the target member
            if info.header_offset < member.header_offset:
                continue
            # get the total size of the entry
            entry_size = None
            if i == len(filelist) - 1:
                entry_size = self.start_dir - info.header_offset
            else:
                entry_size = filelist[i + 1].header_offset - info.header_offset
            # found the member, set the entry offset
            if member == info:
                entry_offset = entry_size
                continue
            # Move entry
            # read the actual entry data
            fp.seek(info.header_offset)
            entry_data = fp.read(entry_size)
            # update the header
            info.header_offset -= entry_offset
            # write the entry to the new position
            fp.seek(info.header_offset)
            fp.write(entry_data)
            fp.flush()
        # update state
        self.start_dir -= entry_offset
        self.filelist.remove(member)
        del self.NameToInfo[member.filename]
        self._didModify = True
        # seek to the start of the central dir
        fp.seek(self.start_dir)

    def zipfile_remove(self, member):
        """Remove a file from the archive. The archive must be open with mode 'a'."""
        if self.mode != 'a':
            raise RuntimeError("remove() requires mode 'a'")
        if not self.fp:
            raise ValueError(
                "Attempt to write to ZIP archive that was already closed")
        if self._writing:
            raise ValueError(
                "Can't write to ZIP archive while an open writing handle exists."
            )
        # Make sure we have an info object
        if isinstance(member, ZipInfo):
            # 'member' is already an info object
            zinfo = member
        else:
            # get the info object
            zinfo = self.getinfo(member)
        return self._zipfile_remove_member(zinfo)

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        if not hasattr(ZipFile, "remove"):
            setattr(ZipFile, "_zipfile_remove_member", _zipfile_remove_member)
            setattr(ZipFile, "remove", zipfile_remove)
        return func(*args, **kwargs)
    return wrapper
Usage:
@enable_zip_remove
def replace_zip_file():
    with ZipFile("archive.zip", "a") as z:
        z.remove("hello.txt")
        z.write("hello.txt")
My solution is similar to the other answers but uses SQLite to manage the intermediate files and provides __getitem__, __setitem__ and __delitem__ for an easy interface.
By default the database is in-memory, but you can provide a temporary file path if you have a zip larger than available memory.
SQLite is built into Python, and for this kind of many-small-blobs workload it can be faster than going through the file system.
import sqlite3
import zipfile
from pathlib import Path

# sql is the author's helper module holding the SQL statement strings (not shown here)
from sql import CREATE_TABLE, DELETE_FILE, INSERT_FILE, SELECT_CONTENT


class EditableZip:
    """Intended to make editing files inside a zip archive easy. This class is capable of loading files
    from a zip file into a sqlite database, facilitates editing/removing/adding files, and saving
    to a zip.

    The database can be in-memory (default) or in a temporary on-disk file if
    temp_db_path is provided.

    If an on-disk file is used, EditableZip.close can be called to remove the file, or EditableZip
    can be used as a context manager.

    If auto_save is set to True and an initial zip_path was provided, then the file will
    be overwritten when EditableZip closes. If you wish to save to a different file,
    or no zip_path is used in instantiation, auto_save can take a file path.

    Files can be added by item assignment:

        with EditableZip(auto_save="example.zip") as ez:
            ez["thing.txt"] = "stuff"
            # empty dir
            ez["empty/"] = None

    Assignment accepts non-text files as bytes.

    EditableZip is subscriptable. If the subscript is a path in the db, the data will be returned.
    EditableZip.files can be used to iterate over files in the db.
    """

    def __init__(
        self,
        zip_path: None | str | Path = None,
        temp_db_path: None | Path = None,
        auto_save: bool | str | Path = False,
    ):
        self.temp_db_path, self.auto_save, self.file_path = (
            temp_db_path,
            auto_save,
            zip_path,
        )
        self.db = sqlite3.connect(
            str(temp_db_path if temp_db_path is not None else ":memory:")
        )
        self.db.execute(CREATE_TABLE)
        if self.file_path:
            self.load(self.file_path)

    @property
    def files(self):
        "Returns a generator of all file paths in the database."
        try:
            return (
                i[0] for i in self.db.execute("SELECT file_path FROM files").fetchall()
            )
        except TypeError:
            return None

    def load(self, zip_path: str | Path) -> None:
        "Add all files from the zip at zip_path to the db."
        with zipfile.ZipFile(zip_path, mode="r") as archive:
            for item in archive.infolist():
                self[item.filename] = (
                    None if item.filename[-1] == "/" else archive.read(item)
                )

    def save(self, zip_path: None | str | Path) -> Path:
        "Save all files from the db to the zip at zip_path."
        zip_path = self.file_path if zip_path is None else zip_path
        with zipfile.ZipFile(zip_path, "w") as archive:
            for file in self.files:
                if file_data := self.fetch(file):
                    archive.writestr(file, file_data)
                else:
                    archive.writestr(zipfile.ZipInfo(file), "")
        return zip_path

    def close(self):
        "Auto save if applicable and close + remove the db."
        if self.auto_save:
            self.save(
                zip_path=self.auto_save
                if isinstance(self.auto_save, (str, Path))
                else None
            )
        self.db.close()
        if isinstance(self.temp_db_path, Path):
            self.temp_db_path.unlink(missing_ok=True)

    def fetch(self, file_path: str) -> bytes | None:
        "Get the content of the db entry for file_path."
        try:
            return self.db.execute(SELECT_CONTENT, {"file_path": file_path}).fetchone()[
                0
            ]
        except TypeError:
            return None

    def __getitem__(self, key):
        result = self.fetch(key)
        try:
            return result.decode("utf-8")
        except AttributeError:
            return result

    def __setitem__(self, file_path, content: str | bytes):
        if isinstance(content, str):
            content = content.encode("utf-8")
        self.db.execute(
            INSERT_FILE,
            {"file_path": file_path, "file_content": content},
        )

    def __delitem__(self, file_path):
        self.db.execute(DELETE_FILE, {"file_path": file_path})

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()


if __name__ == "__main__":
    # A use case: editing epub files.
    # File source:
    # https://archiveofourown.org/downloads/13795605/Victoria%20Potter%20and%20the.epub?updated_at=1650231615
    file_path = Path("Victoria Potter and the.epub")
    new_file = (file_path.parent / (file_path.stem + "- lowercase")).with_suffix(
        file_path.suffix
    )

    # Create a copy of the epub with all letters lowercase
    with EditableZip(zip_path=file_path, auto_save=new_file) as ez:
        for file in ez.files:
            if Path(file).suffix in [".html", ".xhtml"]:
                ez[file] = ez[file].lower()
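The sql module imported at the top just holds the SQL statement strings; it isn't shown above, but a plausible sketch of it (only the four exported names are known, the exact SQL is my assumption) could be:

# sql.py -- hypothetical contents
CREATE_TABLE = """
CREATE TABLE IF NOT EXISTS files (
    file_path TEXT PRIMARY KEY,
    file_content BLOB
)
"""

# INSERT OR REPLACE gives upsert behaviour, since __setitem__ is also used to update entries
INSERT_FILE = """
INSERT OR REPLACE INTO files (file_path, file_content)
VALUES (:file_path, :file_content)
"""

SELECT_CONTENT = "SELECT file_content FROM files WHERE file_path = :file_path"

DELETE_FILE = "DELETE FROM files WHERE file_path = :file_path"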
Reference: Delete file from zipfile with the ZipFile Module
In short:
You can take the code from https://github.com/python/cpython/blob/659eb048cc9cac73c46349eb29845bc5cd630f09/Lib/zipfile.py and create a separate file from it. After that, just reference it from your project instead of the built-in Python library: import myproject.zipfile as zipfile.
Usage:
with zipfile.ZipFile(f"archive.zip", "a") as z:
z.remove(f"firstfile.txt")
So I want to avoid overwriting a file name that already exists, but I don't know how to combine that code with my code. Please help me.
Here's my code for writing the file:
def filepass(f):
    print(f)
    with open('media/pass/' + 'filepass.txt', 'a') as fo:
        fo.write(f)
    fo.close()
    return fo
And here's the code to create a numbered filepass name:
def build_filename(name, num=0):
    root, ext = os.path.splitext(name)
    print(root)
    return '%s%d%s' % (root, num, ext) if num else name

def find_next_filename(name, max_tries=20):
    if not os.path.exists(name):
        return name
    else:
        for i in range(max_tries):
            test_name = build_filename(name, i+1)
            if not os.path.exists(test_name):
                return test_name
        return None
All I want is to create the filenames filepass.txt, filepass1.txt, filepass2.txt, and so on.
Something like this?
def filepass(f):
    print(f)
    filename = find_next_filename('media/pass/filepass.txt')
    with open(filename, 'a') as fo:
        fo.write(f)
    # you don't need to close when you use "with open";
    # but then it doesn't make sense to return a closed file handle,
    # so let's return the filename instead
    return filename
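Called repeatedly (and assuming the media/pass/ directory already exists), this produces exactly the sequence you asked for:

print(filepass("first run\n"))    # media/pass/filepass.txt
print(filepass("second run\n"))   # media/pass/filepass1.txt
print(filepass("third run\n"))    # media/pass/filepass2.txt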
My code below creates exception log files at this location:
C:/Users/Desktop/SampleTestFiles/ProjectFiles/ExceptionLogFiles/
Initially the code keeps writing into the ExceptionLog_1.txt file whenever an exception occurs, and when the size of that file exceeds 1 MB it starts writing to ExceptionLog_2.txt until its size reaches 1 MB. So far, it works only for these first two files. When the size of the second file exceeds 1 MB it should log exceptions into a third file, ExceptionLog_3.txt, but it does not: the code keeps writing into the second file.
How do I modify my code to make sure a new file is created whenever the size of the latest log file exceeds 1 MB?
def WriteExceptionToFile(self, traceback):
    count = 1
    fileDir = 'C:/Users/Desktop/SampleTestFiles/ProjectFiles/ExceptionLogFiles/'
    # check if the path exists, create the directory if not
    if not (os.path.exists):
        os.mkdir(fileDir)
    filename = "ExceptionLog_" + str(count) + ".txt"
    filepath = os.path.join(fileDir, filename)
    try:
        if os.path.getsize(filepath) < 1048576:  # if file size is less than 1 MB
            filename = "ExceptionLog_" + str(count) + ".txt"
        else:
            filename = "ExceptionLog_" + str(count + 1) + ".txt"
    except OSError:
        print("Path '%s' does not exist or is inaccessible" % filepath)
        filename = "ExceptionLog_1.txt"
    filepath = os.path.join(fileDir, filename)
    with open(filepath, 'a+') as f:
        traceback.print_exc(file=f)
    f.close()
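The reason the third file never appears is that count is reset to 1 on every call, so only ExceptionLog_1.txt's size is ever checked. One minimal sketch of a fix (Python 3.8+, keeping the question's naming scheme; _next_log_file is just a name I made up) is to scan for the highest existing index first:

import glob
import os
import re

def _next_log_file(fileDir, max_bytes=1048576):
    # find the highest-numbered ExceptionLog_N.txt that already exists
    nums = [int(m.group(1))
            for f in glob.glob(os.path.join(fileDir, "ExceptionLog_*.txt"))
            if (m := re.search(r"ExceptionLog_(\d+)\.txt$", f))]
    count = max(nums, default=1)
    filepath = os.path.join(fileDir, "ExceptionLog_%d.txt" % count)
    # roll over to the next number once the latest file reaches the limit
    if os.path.exists(filepath) and os.path.getsize(filepath) >= max_bytes:
        count += 1
    return os.path.join(fileDir, "ExceptionLog_%d.txt" % count)

The answers below sidestep this bookkeeping entirely, which is usually the better option.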
You could also try an approach using rotating files from the logging module.
Example directly from the documentation (https://docs.python.org/3/howto/logging-cookbook.html):
import glob
import logging
import logging.handlers

LOG_FILENAME = 'logging_rotatingfile_example.out'

# Set up a specific logger with our desired output level
my_logger = logging.getLogger('MyLogger')
my_logger.setLevel(logging.DEBUG)

# Add the log message handler to the logger, HERE YOU CAN SPECIFY THE FILE SIZE
handler = logging.handlers.RotatingFileHandler(
    LOG_FILENAME, maxBytes=20, backupCount=5)
my_logger.addHandler(handler)

# Log some messages
for i in range(20):
    my_logger.debug('i = %d' % i)

# See what files are created
logfiles = glob.glob('%s*' % LOG_FILENAME)
for filename in logfiles:
    print(filename)
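Applied to your setup (1 MB per file, exception tracebacks), a sketch could look like this; the file name and backupCount are assumptions:

import logging
import logging.handlers

log_path = 'C:/Users/Desktop/SampleTestFiles/ProjectFiles/ExceptionLogFiles/ExceptionLog.txt'
handler = logging.handlers.RotatingFileHandler(log_path, maxBytes=1048576, backupCount=10)

exc_logger = logging.getLogger('ExceptionLogger')
exc_logger.setLevel(logging.ERROR)
exc_logger.addHandler(handler)

try:
    1 / 0
except ZeroDivisionError:
    # exc_info=True writes the full traceback into the current log file,
    # and the handler rotates the file automatically at 1 MB
    exc_logger.error("Unhandled exception", exc_info=True)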
I would suggest using a class; that way you won't have to worry about maintaining the correct count elsewhere.
Check out the solution below:
import os

class GenericExceptionWriter:
    def __init__(self):
        self.count = 1
        self.fileDir = 'C:/Users/Desktop/SampleTestFiles/ProjectFiles/ExceptionLogFiles/'
        os.makedirs(self.fileDir, exist_ok=True)
        self.currentFilePath = "".join(["ExceptionLog_", str(self.count), ".txt"])
        self.maxSize = 1048576

    def checkSize(self):
        # join with the directory so the size check looks at the file we actually write to
        if os.path.getsize(os.path.join(self.fileDir, self.currentFilePath)) > self.maxSize:
            self.count += 1
            self.currentFilePath = "".join(["ExceptionLog_", str(self.count), ".txt"])

    def WriteExceptionToFile(self, traceback):
        try:
            self.checkSize()
        except OSError:
            print("Path '%s' does not exist or is inaccessible" % self.currentFilePath)
        filepath = os.path.join(self.fileDir, self.currentFilePath)
        with open(filepath, 'a+') as f:
            traceback.print_exc(file=f)
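Usage might look like this (risky_operation is just a placeholder; the traceback module itself is passed in, exactly as in your original code):

import traceback

writer = GenericExceptionWriter()
try:
    risky_operation()   # placeholder for the code that can raise
except Exception:
    writer.WriteExceptionToFile(traceback)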
I'm writing a Python program to create a .html file in a directory. The directory is created correctly, and I use the open function to create the .html file and try to write some content to it, but the .html file cannot be created.
def save_public_figure_page(self, type, p_f_name):
    glovar.date = time.strftime("%Y%m%d", time.localtime())
    p_f_page_file_directory = os.path.join("dataset", "html", type, glovar.date, p_f_name)
    data_storage.directory_create(p_f_page_file_directory)
    html_user_page = glovar.webdriver_browser.page_source
    p_f_page_file = os.path.join(p_f_page_file_directory, type + "_" + p_f_name + ".html")
    html_file = open(p_f_page_file, "w", encoding='utf-8')
    html_file.write(html_user_page)
    html_file.close()
The directory_create function in data_storage is:

# create the file storage directory
def directory_create(path):
    directory = os.path.join(os.path.dirname(__file__), path)
    if not os.path.exists(directory):
        os.makedirs(directory)
It errors with:
<class 'FileNotFoundError'> at /public_figure_name_sub
[Errno 2] No such file or directory: 'dataset\\html\\public_figure\\20170404\\Donald Trump \\public_figure_Donald Trump .html'
The current directory is under /dataset/. I found that the directory
F:\MyDocument\F\My Document\Training\Python\PyCharmProject\FaceBookCrawl\dataset\html\public_figure\20170404\Donald Trump
has been created correctly, but the file public_figure_Donald Trump .html cannot be created. Could you please tell me the reason and how to correct it?
As suggested by Jean-François Fabre, your file name has a space just before the ".html".
To solve this, call strip() on the variable p_f_name in your 7th line:

# Added strip() to p_f_name
p_f_page_file = os.path.join(p_f_page_file_directory, type +
                             "_" + p_f_name.strip() + ".html")
This will create the file:
public_figure_Donald Trump.html
instead of
public_figure_Donald Trump .html
P.S.: Your filename also has extra spaces between Donald and Trump. I don't know where the file name comes from, but you might want to fix that too.
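If you also want to collapse those repeated inner spaces (this goes beyond the original suggestion), a small regex does it:

import re

p_f_name = "Donald  Trump "
p_f_name = re.sub(r"\s+", " ", p_f_name).strip()   # 'Donald Trump'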
Function save_public_figure_page:

class public_figure:
    def save_public_figure_page(self, type, p_f_name):
        glovar.date = time.strftime("%Y%m%d", time.localtime())
        p_f_name = p_f_name.strip()  # strip the name to get rid of the extra spaces
        p_f_page_name = '{t}_{pfn}.html'.format(t=type, pfn=p_f_name)
        p_f_page_file_directory = os.path.join(
            directory,  # add the directory from the data_storage.directory property
            "dataset", "html",
            type, glovar.date, p_f_name,
        )
        if data_storage.directory_create(p_f_page_file_directory):
            html_user_page = glovar.webdriver_browser.page_source
            p_f_page_file = os.path.join(p_f_page_file_directory, p_f_page_name)
            html_file = open(p_f_page_file, "w", encoding='utf-8')
            html_file.write(html_user_page)
            html_file.close()
directory_create method of data_storage:

# create the file storage directory
class data_storage:
    def directory_create(self, path):
        self.directory = os.path.join(os.path.dirname(__file__), path)
        if not os.path.exists(self.directory):
            try:
                os.makedirs(self.directory)
            except:
                raise
            else:
                return True
        else:
            return True
I wrote a script to read PDF metadata to ease a task at work. The current working version is not very usable in the long run:
from pyPdf import PdfFileReader

BASEDIR = ''
PDFFiles = []

def extractor():
    output = open('windoutput.txt', 'r+')
    for file in PDFFiles:
        try:
            pdf_toread = PdfFileReader(open(BASEDIR + file, 'r'))
            pdf_info = pdf_toread.getDocumentInfo()
            #print str(pdf_info) #print full metadata if you want
            x = file + "~" + pdf_info['/Title'] + " ~ " + pdf_info['/Subject']
            print x
            output.write(x + '\n')
        except:
            x = file + '~' + ' ERROR: Data missing or corrupt'
            print x
            output.write(x + '\n')
            pass
    output.close()

if __name__ == "__main__":
    extractor()
Currently, as you can see, I have to manually input the working directory and manually populate the list of PDF files. It also just prints out the data in the terminal in a format that I can copy/paste/separate into a spreadsheet.
I'd like the script to work automatically in whichever directory I throw it in and populate a CSV file for easier use. So far:
from pyPdf import PdfFileReader
import csv
import os

def extractor():
    basedir = os.getcwd()
    extension = '.pdf'
    pdffiles = [filter(lambda x: x.endswith('.pdf'), os.listdir(basedir))]
    with open('pdfmetadata.csv', 'wb') as csvfile:
        for f in pdffiles:
            try:
                pdf_to_read = PdfFileReader(open(f, 'r'))
                pdf_info = pdf_to_read.getDocumentInfo()
                title = pdf_info['/Title']
                subject = pdf_info['/Subject']
                csvfile.writerow([file, title, subject])
                print 'Metadata for %s written successfully.' % (f)
            except:
                print 'ERROR reading file %s.' % (f)
                #output.writerow(x + '\n')
                pass

if __name__ == "__main__":
    extractor()
In its current state it seems to just print a single error message (as in, the message from my exception handler, not an error raised by Python) and then stop. I've been staring at it for a while and I'm not really sure where to go from here. Can anyone point me in the right direction?
writerow([file, title, subject]) should be writerow([f, title, subject])
You can use sys.exc_info() to print the details of your error
http://docs.python.org/2/library/sys.html#sys.exc_info
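For example, a small helper you could call from inside your except block (describe_error is a name I made up; f is the loop variable from your code):

import sys

def describe_error(f):
    """Return a one-line description of the exception currently being handled, for file f."""
    exc_type, exc_value, _ = sys.exc_info()
    return 'ERROR reading file %s: %s: %s' % (f, exc_type.__name__, exc_value)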
Did you check that the pdffiles variable contains what you think it does? I was getting a list inside a list... so maybe try:

for files in pdffiles:
    for f in files:
        # do stuff with f
I personally like glob. Notice I add * before the .pdf in the extension variable:
import os
import glob
basedir = os.getcwd()
extension = '*.pdf'
pdffiles = glob.glob(os.path.join(basedir, extension))
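Putting these fixes together (a sketch in the question's Python 2 / pyPdf setup; note that writerow lives on a csv.writer object, not on the file itself):

import csv
import glob
import os
from pyPdf import PdfFileReader

def extractor():
    pdffiles = glob.glob(os.path.join(os.getcwd(), '*.pdf'))
    with open('pdfmetadata.csv', 'wb') as csvfile:
        writer = csv.writer(csvfile)
        for f in pdffiles:
            try:
                pdf_info = PdfFileReader(open(f, 'rb')).getDocumentInfo()
                writer.writerow([f, pdf_info['/Title'], pdf_info['/Subject']])
                print('Metadata for %s written successfully.' % f)
            except Exception:
                print('ERROR reading file %s.' % f)

if __name__ == "__main__":
    extractor()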
Figured it out. The script I used to download the files was saving the files with '\r\n' trailing after the file name, which I didn't notice until I actually ls'd the directory to see what was up. Thanks for everyone's help.