Efficiently zipping files using Python

I need to zip directories:
directory -> directory.zip.
It should be possible to easily open that file on Windows, but creation of the file should be as fast as possible, i.e. something like tar.
The original directory may then be deleted. What would be the best option for that? The only reason for "zipping" the directory is to be able to download it over HTTP as one file.

You can use the built-in zipfile module. In particular, passing ZIP_STORED as the compression method disables compression, which keeps archive creation fast.
For extra performance, you could also send the generated zip output directly into the HTTP response, without first creating a file on disk or a buffer in memory.
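A minimal sketch of the ZIP_STORED idea, assuming an in-memory buffer is acceptable as an intermediate step (the directory name and the helper zip_dir_stored are placeholders, not part of the original answer); true on-the-fly streaming into the response would need an incremental writer, which is not shown here:

import io
import os
import zipfile

def zip_dir_stored(src_dir):
    """Archive src_dir without compression (ZIP_STORED) and return the raw bytes."""
    buf = io.BytesIO()
    with zipfile.ZipFile(buf, "w", zipfile.ZIP_STORED) as zf:
        for dirname, _subdirs, files in os.walk(src_dir):
            for name in files:
                path = os.path.join(dirname, name)
                # Store entries relative to src_dir so the archive has no absolute paths.
                zf.write(path, os.path.relpath(path, src_dir))
    return buf.getvalue()

# The returned bytes can then be sent as the body of the HTTP response.
payload = zip_dir_stored("directory")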

For zipping directories I have always used the zipfile module, and used it like so:
import os
import zipfile

def zip(src, dst):
    zf = zipfile.ZipFile("%s.zip" % (dst), "w", zipfile.ZIP_DEFLATED)
    abs_src = os.path.abspath(src)
    for dirname, subdirs, files in os.walk(src):
        for filename in files:
            absname = os.path.abspath(os.path.join(dirname, filename))
            arcname = absname[len(abs_src) + 1:]
            print('zipping %s as %s' % (os.path.join(dirname, filename),
                                        arcname))
            zf.write(absname, arcname)
    zf.close()
Then call it like this:
zip(str(source), str(destination))

Related

Python zipfile extract files from directory inside a zip file

I need to extract some files inside a directory in a zip file.
The main problem is that I want to extract only the contents from this directory, not the directory itself with all the files inside.
I've tried iterating over namelist() and tweaking it with zipfile.Path(), without success.
The code below works, but it extracts the directory along with the files (like extractall() does). zipfile.Path doesn't work because it raises a KeyError saying the item doesn't exist, even though it does.
for zip_file in zip_files:
    with zipfile.ZipFile(os.path.join(home_path, zip_file), 'r') as zip_ref:
        files = [n for n in zip_ref.namelist()]
        zip_ref.extractall(os.path.join(home_path, 'dir'), members=files)
Written from my mobile, but I expect it to work:
import shutil
from pathlib import Path
from zipfile import ZipFile

# zipfile_path and target_dir (a pathlib.Path) are assumed to be defined elsewhere
with ZipFile(zipfile_path, "r") as zf:
    for f in zf.namelist():
        if f.endswith('/'):
            # skip directory entries, only extract the files themselves
            continue
        source = zf.open(f)
        target = open(target_dir / Path(f).name, "wb")
        with source, target:
            shutil.copyfileobj(source, target)

Python doesn't recognize zip files as zip files

I iterate through the directories and want to find all zip files and add them to download_all.zip.
I am sure there are zip files, but Python doesn't recognize those zip files as zip files. Why is that?
my code:
os.chdir(boardpath)
# zf = zipfile.ZipFile('download_all.zip', mode='w')
z = zipfile.ZipFile('download_all.zip', 'w')  # creating the download_all.zip file
for path, dirs, files in os.walk(boardpath):
    for file in files:
        print file
        if file.endswith('.zip'):  # find all zip files
            print ('adding', file)
            z.write(file)  # error: file is a str object, not a zip file
z.close()
z = zipfile.ZipFile("download_all.zip")
z.printdir()
I tried:
file.printdir()
# I got the following error: AttributeError: 'str' object has no attribute 'printdir'
In zipfile.ZipFile.write(name), name is actually the full file path, not just the filename.
import os  # at the top

if file.endswith('.zip'):  # find all zip files
    filepath = os.path.join(path, file)
    print ('adding', filepath)
    z.write(filepath)  # no error
As stated in ZipFile.write's documentation, the filename argument must be relative to the archive root. So the following line:
z.write(file)
Should be:
z.write(os.path.relpath(os.path.join(path, file)))
The files that os.walk() yields are lists of filenames. These filenames are just strings (which don't have a printdir() method).
You want to use a context manager when opening the zip archive and writing to it for each file you find, hence the use of with. In addition, since you're walking a directory structure, you need to fully qualify each file's path.
import os
import zipfile

with zipfile.ZipFile('download_all.zip', 'w') as zf:
    for path, dirs, files in os.walk('/some_path'):
        for file in files:
            if file.endswith('.zip'):
                zf.write(os.path.join(path, file))

Python copy files script

I built a script in Python to copy any files from a list of folders to an existing destination folder.
source = ['c:/test/source/', ]
destination = 'c:/test/destination/'

def copy(source, destination):
    import os, shutil
    try:
        for folder in source:
            files = os.listdir(folder)
            for file in files:
                current_file = os.path.join(folder, file)
                shutil.copy(os.path.join(folder, file), destination)
    except:
        pass
The problem with this script is that it doesn't copy the subfolders. Any suggestions on how to fix it?
Thanks
I think you need to use shutil.copytree
shutil.copytree(os.path.join(folder, file), destination)
but shutil.copytree won't overwrite if the folder already exists.
If you want to overwrite everything, use distutils.dir_util.copy_tree:
from distutils import dir_util
dir_util.copy_tree(os.path.join(folder, file), destination)
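A minimal sketch of the copytree-based fix described above, using the question's folder layout; the dirs_exist_ok flag (Python 3.8+) is an assumption used here instead of distutils to allow copying into an existing destination:

import os
import shutil

source = ['c:/test/source/', ]
destination = 'c:/test/destination/'

def copy(source, destination):
    """Copy each source folder, including its subfolders, into destination."""
    for folder in source:
        for entry in os.listdir(folder):
            src_path = os.path.join(folder, entry)
            if os.path.isdir(src_path):
                # Recursively copy subfolders; dirs_exist_ok avoids the
                # "destination already exists" error (Python 3.8+).
                shutil.copytree(src_path, os.path.join(destination, entry),
                                dirs_exist_ok=True)
            else:
                shutil.copy(src_path, destination)

copy(source, destination)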

Unzip nested zip files in python

I am looking for a way to unzip nested zip files in python. For example, consider the following structure (hypothetical names for ease):
Folder
    ZipfileA.zip
        ZipfileA1.zip
        ZipfileA2.zip
    ZipfileB.zip
        ZipfileB1.zip
        ZipfileB2.zip
...etc. I am trying to access text files that are within the second-level zips. I certainly don't want to extract everything, as the sheer numbers would crash the computer (there are several hundred zips in the first layer, and almost 10,000 in the second layer, per zip).
I have been playing around with the zipfile module, and I am able to open the first level of zip files. E.g.:
zipfile_obj = zipfile.ZipFile("/Folder/ZipfileA.zip")
next_layer_zip = zipfile_obj.open("ZipfileA1.zip")
However, this returns a "ZipExtFile" instance (not a file or ZipFile instance), and I can't then go on and open this particular data type. That is, I can't do this:
data = next_layer_zip.open("data.txt")
I can however "read" this zip file with:
next_layer_zip.read()
But this is entirely useless! (i.e. it only returns the compressed data/gobbledygook).
Does anyone have any ideas on how I might go about this (without using ZipFile.extract)??
I came across http://pypi.python.org/pypi/zip_open/, which looks to do exactly what I want, but it doesn't seem to work for me (I keep getting "[Errno 2] No such file or directory:" for the files I am trying to process using that module).
Any ideas would be much appreciated!! Thanks in advance
ZipFile needs a file-like object, so you can use StringIO to turn the data you read from the nested zip into such an object. The caveat is that you'll be loading the full (still compressed) inner zip into memory.
import cStringIO
import zipfile

with zipfile.ZipFile('foo.zip') as z:
    with z.open('nested.zip') as z2:
        z2_filedata = cStringIO.StringIO(z2.read())
        with zipfile.ZipFile(z2_filedata) as nested_zip:
            print nested_zip.open('data.txt').read()
Unfortunately decompressing zip files requires random access to the archive, and the ZipFile methods (not to mention the DEFLATE algorithm itself) only provide streams. It is therefore impossible to decompress nested zip files without extracting them.
Here's a function I came up with.
import os
import zipfile
from StringIO import StringIO

def extract_nested_zipfile(path, parent_zip=None):
    """Returns a ZipFile specified by path, even if the path contains
    intermediary ZipFiles. For example, /root/gparent.zip/parent.zip/child.zip
    will return a ZipFile that represents child.zip
    """
    def extract_inner_zipfile(parent_zip, child_zip_path):
        """Returns a ZipFile specified by child_zip_path that exists inside
        parent_zip.
        """
        memory_zip = StringIO()
        memory_zip.write(parent_zip.open(child_zip_path).read())
        return zipfile.ZipFile(memory_zip)

    if ('.zip' + os.sep) in path:
        (parent_zip_path, child_zip_path) = os.path.relpath(path).split(
            '.zip' + os.sep, 1)
        parent_zip_path += '.zip'

        if not parent_zip:
            # This is the top level, so read from disk
            parent_zip = zipfile.ZipFile(parent_zip_path)
        else:
            # We're already in a zip, so pull it out and recurse
            parent_zip = extract_inner_zipfile(parent_zip, parent_zip_path)

        return extract_nested_zipfile(child_zip_path, parent_zip)
    else:
        if parent_zip:
            return extract_inner_zipfile(parent_zip, path)
        else:
            # If there is no nesting, it's easy!
            return zipfile.ZipFile(path)
Here's how I tested it:
echo hello world > hi.txt
zip wrap1.zip hi.txt
zip wrap2.zip wrap1.zip
zip wrap3.zip wrap2.zip
print extract_nested_zipfile('/Users/mattfaus/dev/dev-git/wrap1.zip').open('hi.txt').read()
print extract_nested_zipfile('/Users/mattfaus/dev/dev-git/wrap2.zip/wrap1.zip').open('hi.txt').read()
print extract_nested_zipfile('/Users/mattfaus/dev/dev-git/wrap3.zip/wrap2.zip/wrap1.zip').open('hi.txt').read()
For those looking for a function that extracts a nested zip file (any level of nesting) and cleans up the original zip files:
import zipfile, re, os

def extract_nested_zip(zippedFile, toFolder):
    """ Unzip a zip file and its contents, including nested zip files.
        Delete the zip file(s) after extraction.
    """
    with zipfile.ZipFile(zippedFile, 'r') as zfile:
        zfile.extractall(path=toFolder)
    os.remove(zippedFile)
    for root, dirs, files in os.walk(toFolder):
        for filename in files:
            if re.search(r'\.zip$', filename):
                fileSpec = os.path.join(root, filename)
                extract_nested_zip(fileSpec, root)
I use python 3.7.3
import zipfile
import io

with zipfile.ZipFile('all.zip') as z:
    with z.open('nested.zip') as z2:
        z2_filedata = io.BytesIO(z2.read())
        with zipfile.ZipFile(z2_filedata) as nested_zip:
            print(nested_zip.open('readme.md').read())
This works for me. Just place this script in the same directory as the nested zip. It will also count the total number of files within the nested zip.
import os
from zipfile import ZipFile

def unzip(path, total_count):
    for root, dirs, files in os.walk(path):
        for file in files:
            file_name = os.path.join(root, file)
            if not file_name.endswith('.zip'):
                total_count += 1
            else:
                currentdir = file_name[:-4]
                if not os.path.exists(currentdir):
                    os.makedirs(currentdir)
                with ZipFile(file_name) as zipObj:
                    zipObj.extractall(currentdir)
                os.remove(file_name)
                total_count = unzip(currentdir, total_count)
    return total_count

total_count = unzip('.', 0)
print(total_count)
My approach to such a problem is this; it assigns each inner file to its own object:
import os
import re
import zipfile
import pandas as pd
# import numpy as np

path = r'G:\Important\Data\EKATTE'

# DESCRIBE
archives = os.listdir(path)
archives = [ar for ar in archives if ar.endswith(".zip")]

contents = pd.DataFrame({'elec_date': [], 'files': []})
for a in archives:
    archive = zipfile.ZipFile(path + '\\' + a)
    filelist = archive.namelist()
    # archive.infolist()
    for i in archive.namelist():
        if re.match('.*zip', i):
            sub_arch = zipfile.ZipFile(archive.open(i))
            sub_names = [x for x in sub_arch.namelist()]
            for s in sub_names:
                exec(f"{s.split('.')[0]} = pd.read_excel(sub_arch.open(s), squeeze=True)")
The archive can be found on Bulgaria's National Statistics Institute page (direct link):
https://www.nsi.bg/sites/default/files/files/EKATTE/Ekatte.zip

python tarfile without full path

I made a small script, shown below, to read a group of files and tar them. It's all working fine, except that the archive contains the full path of the files when uncompressed. Is there a way to do it without the directory structure?
compressor = tarfile.open(PATH_TO_ARCHIVE + re.sub('[\s.:"-]+', '',
                          str(datetime.datetime.now())) + '.tar.gz', 'w:gz')
for file in os.listdir(os.path.join(settings.MEDIA_ROOT, PATH_CSS_DB_OUT)):
    compressor.add(os.path.join(settings.MEDIA_ROOT, PATH_CSS_DB_OUT) + file)
compressor.close()
Take a look at the TarFile.add signature:
... If given, arcname specifies an alternative name for the file in the archive.
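For example, a hedged sketch of how arcname could be applied to the loop from the question (the settings and path constants are the ones assumed there):

import os

base_dir = os.path.join(settings.MEDIA_ROOT, PATH_CSS_DB_OUT)
for file in os.listdir(base_dir):
    # arcname stores just the file name, so no directory structure ends up in the archive.
    compressor.add(os.path.join(base_dir, file), arcname=file)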
I created a context manager that changes the current working directory to handle this with tar files.
import contextlib
import os

@contextlib.contextmanager
def cd_change(tmp_location):
    cd = os.getcwd()
    os.chdir(tmp_location)
    try:
        yield
    finally:
        os.chdir(cd)
Then, to package everything up in your case:
with cd_change(os.path.join(settings.MEDIA_ROOT, PATH_CSS_DB_OUT)):
    for file in os.listdir('.'):
        compressor.add(file)
