Unzip nested zip files in Python

I am looking for a way to unzip nested zip files in python. For example, consider the following structure (hypothetical names for ease):
Folder
    ZipfileA.zip
        ZipfileA1.zip
        ZipfileA2.zip
    ZipfileB.zip
        ZipfileB1.zip
        ZipfileB2.zip
...etc. I am trying to access text files that are within the second zip. I certainly don't want to extract everything, as the sheer numbers would crash the computer (there are several hundred zips in the first layer, and almost 10,000 in the second layer (per zip)).
I have been playing around with the 'zipfile' module - I am able to open the 1st level of zipfiles. E.g.:
zipfile_obj = zipfile.ZipFile("/Folder/ZipfileA.zip")
next_layer_zip = zipfile_obj.open("ZipfileA1.zip")
However, this returns a "ZipExtFile" instance (not a file or ZipFile instance), and I can't then go on and open this particular data type. That is, I can't do this:
data = next_layer_zip.open("data.txt")
I can however "read" this zip file with:
next_layer_zip.read()
But this is entirely useless! (i.e. it can only read the compressed data/gobbledygook).
Does anyone have any ideas on how I might go about this (without using ZipFile.extract)??
I came across this, http://pypi.python.org/pypi/zip_open/ - which looks to do exactly what I want, but it doesn't seem to work for me. (keep getting "[Errno 2] No such file or directory:" for the files I am trying to process, using that module).
Any ideas would be much appreciated!! Thanks in advance

ZipFile needs a file-like object, so you can use StringIO to turn the data you read from the nested zip into such an object. The caveat is that you'll be loading the full (still compressed) inner zip into memory.
import cStringIO  # Python 2; use io.BytesIO on Python 3
import zipfile

with zipfile.ZipFile('foo.zip') as z:
    with z.open('nested.zip') as z2:
        z2_filedata = cStringIO.StringIO(z2.read())
        with zipfile.ZipFile(z2_filedata) as nested_zip:
            print nested_zip.open('data.txt').read()

Unfortunately, decompressing zip files requires random access to the archive, while ZipFile.open (not to mention the DEFLATE algorithm itself) only gives you a stream, so you cannot open a nested zip straight from that stream. You have to materialise the inner archive first, either on disk or in memory as the other answers do.
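For what it's worth, here is a minimal sketch (my addition, assuming Python 3 and the hypothetical names from the question) of materialising the inner archive in memory rather than on disk. On Python 3.7 and later, ZipExtFile objects gained seek support, so passing the inner stream to ZipFile directly may also work, but buffering with BytesIO is the portable approach.

import io
import zipfile

with zipfile.ZipFile("Folder/ZipfileA.zip") as outer:
    # Buffer the whole inner archive in memory so ZipFile gets a seekable object.
    with outer.open("ZipfileA1.zip") as inner_stream:
        inner_bytes = io.BytesIO(inner_stream.read())
    with zipfile.ZipFile(inner_bytes) as inner:
        print(inner.read("data.txt"))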

Here's a function I came up with.
import os
import zipfile
from StringIO import StringIO  # Python 2

def extract_nested_zipfile(path, parent_zip=None):
    """Returns a ZipFile specified by path, even if the path contains
    intermediary ZipFiles. For example, /root/gparent.zip/parent.zip/child.zip
    will return a ZipFile that represents child.zip
    """
    def extract_inner_zipfile(parent_zip, child_zip_path):
        """Returns a ZipFile specified by child_zip_path that exists inside
        parent_zip.
        """
        memory_zip = StringIO()
        memory_zip.write(parent_zip.open(child_zip_path).read())
        return zipfile.ZipFile(memory_zip)

    if ('.zip' + os.sep) in path:
        (parent_zip_path, child_zip_path) = os.path.relpath(path).split(
            '.zip' + os.sep, 1)
        parent_zip_path += '.zip'

        if not parent_zip:
            # This is the top-level, so read from disk
            parent_zip = zipfile.ZipFile(parent_zip_path)
        else:
            # We're already in a zip, so pull it out and recurse
            parent_zip = extract_inner_zipfile(parent_zip, parent_zip_path)

        return extract_nested_zipfile(child_zip_path, parent_zip)
    else:
        if parent_zip:
            return extract_inner_zipfile(parent_zip, path)
        else:
            # If there is no nesting, it's easy!
            return zipfile.ZipFile(path)
Here's how I tested it:
echo hello world > hi.txt
zip wrap1.zip hi.txt
zip wrap2.zip wrap1.zip
zip wrap3.zip wrap2.zip
print extract_nested_zipfile('/Users/mattfaus/dev/dev-git/wrap1.zip').open('hi.txt').read()
print extract_nested_zipfile('/Users/mattfaus/dev/dev-git/wrap2.zip/wrap1.zip').open('hi.txt').read()
print extract_nested_zipfile('/Users/mattfaus/dev/dev-git/wrap3.zip/wrap2.zip/wrap1.zip').open('hi.txt').read()
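If you are on Python 3, a rough adaptation of the same idea (my sketch, not part of the original answer): swap StringIO for io.BytesIO and call print() as a function. The helper name open_inner_zip is hypothetical, and it reuses the wrap*.zip test files created above.

import io
import zipfile

def open_inner_zip(parent_zip, child_name):
    """Return a ZipFile for child_name, which lives inside parent_zip."""
    return zipfile.ZipFile(io.BytesIO(parent_zip.open(child_name).read()))

with zipfile.ZipFile("wrap3.zip") as outer:
    inner = open_inner_zip(open_inner_zip(outer, "wrap2.zip"), "wrap1.zip")
    print(inner.read("hi.txt"))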

For those looking for a function that extracts a nested zip file (any level of nesting) and cleans up the original zip files:
import zipfile, re, os

def extract_nested_zip(zippedFile, toFolder):
    """ Unzip a zip file and its contents, including nested zip files
        Delete the zip file(s) after extraction
    """
    with zipfile.ZipFile(zippedFile, 'r') as zfile:
        zfile.extractall(path=toFolder)
    os.remove(zippedFile)
    for root, dirs, files in os.walk(toFolder):
        for filename in files:
            if re.search(r'\.zip$', filename):
                fileSpec = os.path.join(root, filename)
                extract_nested_zip(fileSpec, root)
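A hypothetical call, assuming ZipfileA.zip from the question sits in the current working directory and 'extracted' is a target folder of your choosing. Note that, per its docstring, the function deletes each zip after extracting it, so keep backups if you need the originals.

# Extracts everything under ./extracted; the source zips are removed as they are processed.
extract_nested_zip('ZipfileA.zip', 'extracted')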

I use Python 3.7.3:
import zipfile
import io

with zipfile.ZipFile('all.zip') as z:
    with z.open('nested.zip') as z2:
        z2_filedata = io.BytesIO(z2.read())
        with zipfile.ZipFile(z2_filedata) as nested_zip:
            print(nested_zip.open('readme.md').read())

This works for me. Just place this script in the same directory as the nested zip. It also counts the total number of files within the nested zips.
import os
from zipfile import ZipFile

def unzip(path, total_count):
    for root, dirs, files in os.walk(path):
        for file in files:
            file_name = os.path.join(root, file)
            if not file_name.endswith('.zip'):
                total_count += 1
            else:
                currentdir = file_name[:-4]
                if not os.path.exists(currentdir):
                    os.makedirs(currentdir)
                with ZipFile(file_name) as zipObj:
                    zipObj.extractall(currentdir)
                os.remove(file_name)
                total_count = unzip(currentdir, total_count)
    return total_count

total_count = unzip('.', 0)
print(total_count)

My approach to such a problem is this; it assigns each inner file to its own variable (via exec):
import os
import re
import zipfile
import pandas as pd
# import numpy as np

path = r'G:\Important\Data\EKATTE'

# DESCRIBE
archives = os.listdir(path)
archives = [ar for ar in archives if ar.endswith(".zip")]

contents = pd.DataFrame({'elec_date': [], 'files': []})
for a in archives:
    archive = zipfile.ZipFile(path + '\\' + a)
    filelist = archive.namelist()
    # archive.infolist()
    for i in archive.namelist():
        if re.match('.*zip', i):
            sub_arch = zipfile.ZipFile(archive.open(i))
            sub_names = [x for x in sub_arch.namelist()]
            for s in sub_names:
                exec(f"{s.split('.')[0]} = pd.read_excel(sub_arch.open(s), squeeze=True)")
The archive can be found on Bulgaria's National Statistics Institute page (direct link):
https://www.nsi.bg/sites/default/files/files/EKATTE/Ekatte.zip
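If the exec bothers you, here is a variant (my sketch, under the same assumptions about the archive layout and that the inner members are Excel files, as above) that collects the inner spreadsheets into a plain dict instead of creating variables dynamically:

import os
import re
import zipfile
import pandas as pd

path = r'G:\Important\Data\EKATTE'
frames = {}  # base file name -> DataFrame, instead of exec-created variables

for a in [ar for ar in os.listdir(path) if ar.endswith(".zip")]:
    with zipfile.ZipFile(os.path.join(path, a)) as archive:
        for inner in archive.namelist():
            if re.match('.*zip', inner):
                with zipfile.ZipFile(archive.open(inner)) as sub_arch:
                    for s in sub_arch.namelist():
                        # squeeze=True dropped here; it is deprecated in newer pandas
                        frames[s.split('.')[0]] = pd.read_excel(sub_arch.open(s))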

Related

Extracting Zip From HTTP Response in Python

I'm able to get the zip from the HTTPS response and store it in a specific folder using the code snippet below:
z = zipfile.ZipFile(io.BytesIO(statement_resp.content))
z.extractall("/pathtostore")
However, in /pathtostore the zip file gets extracted with some random name. Is there a way to control the name of zip files created while extracting itself?
Currently, after zip extraction, below is the directory structure:
/pathtostore/ZaXyzzz
--> ZaXyzzz is the zip name.
I'm looking for something as below:
/pathtostore/1234_2020_03_02
--> 1234_2020_03_02 (cid_curdate) is the zip name which I want.
PS: I cannot read the zip and rename it, as there could be multiple zips present inside /pathtostore.
You can get the names with z.namelist(), read each file separately with z.read(), and write it under a new name using the standard open(), write(), close().
Minimal example.
It may need more code if the zipfile has folders (see the sketch after this code for that case).
import zipfile
import datetime
import os

z = zipfile.ZipFile('input.zip')

folder = '/pathtostore'
os.makedirs(folder, exist_ok=True)

today = datetime.date.today().strftime('%Y_%m_%d')

cid = 0
for old_name in z.namelist():
    cid += 1
    new_name = os.path.join(folder, '{:04}_{}'.format(cid, today))
    print(old_name, '->', new_name)
    data = z.read(old_name)
    with open(new_name, 'wb') as fh:
        fh.write(data)
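For the folders case mentioned above, a sketch (my guess at the "more code", same assumptions and placeholder paths): recreate each member's directory part under the target folder and skip pure directory entries.

import datetime
import os
import zipfile

z = zipfile.ZipFile('input.zip')
folder = '/pathtostore'
today = datetime.date.today().strftime('%Y_%m_%d')

cid = 0
for old_name in z.namelist():
    if old_name.endswith('/'):                # directory entry, nothing to write
        continue
    cid += 1
    subdir = os.path.dirname(old_name)        # folder path inside the zip
    target_dir = os.path.join(folder, subdir)
    os.makedirs(target_dir, exist_ok=True)    # recreate the nested folders
    new_name = os.path.join(target_dir, '{:04}_{}'.format(cid, today))
    print(old_name, '->', new_name)
    with open(new_name, 'wb') as fh:
        fh.write(z.read(old_name))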
You can iterate over the zipfile's ZipInfo structures and modify each one's filename attribute before extracting:
import io
import zipfile
from pathlib import Path

z = zipfile.ZipFile(io.BytesIO(statement_resp.content))
for info in z.infolist():
    # implement your extraction policy here. Remove the original root
    # path name and add our own
    zip_path = Path(info.filename)
    info.filename = str(Path("1234_2020_03_02").joinpath(*zip_path.parts[1:]))
    z.extract(info, "/pathtostore")

Moving oldest file names with different extensions to common folder. Python

I have a folder that contains 15 .jpg files and 15 .pdf files. The file names are the same with just the extensions being different. Example ABC123.jpg and ABC123.pdf. I have spent the better part of the last few days trying to use shutil to move the oldest .pdf file to a new folder then finding the matching .jpg file and moving it to the same folder as the .pdf. I was able to move the oldest file or move all files of a given type. Just couldn't get the oldest of a specific type. I tried moving all .pdfs to a new folder1 and all .jpgs to a new folder2 and then moving oldest from each of those to a common folder. However, they don't always match. The oldest .jpg might be different than the oldest .pdf. I am sure there is a simple solution, I have just been working it in circles so long I can no longer see the forest through the trees.
Use the os.path.getmtime function as the key to sort your files.
import os

def oldest_file(dir, type):
    return min(
        [name for name in os.listdir(dir) if name.endswith(type)],
        key=lambda name: os.path.getmtime(os.path.join(dir, name)))

print(oldest_file('/your/folder', '.jpg'))
If you need to search the entire tree, use os.walk instead of os.listdir:
import os
from itertools import chain

def oldest_file(dir, type):
    return min(
        list(chain(*[[os.path.join(root, file) for file in files if file.endswith(type)]
                     for root, _, files in os.walk(dir)])),
        key=lambda file: os.path.getmtime(file))

print(oldest_file('/your/folder', '.jpg'))
I'm sure you can handle the rest of the code that deals with moving files.
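For completeness, a sketch of that moving part (the paths are placeholders, and it reuses the oldest_file helper above): find the oldest .pdf, derive the matching .jpg name, and move both with shutil.

import os
import shutil

source = '/your/folder'            # placeholder
destination = '/your/destination'  # placeholder

oldest_pdf = oldest_file(source, '.pdf')
base = os.path.splitext(oldest_pdf)[0]  # e.g. 'ABC123'
shutil.move(os.path.join(source, oldest_pdf), destination)
shutil.move(os.path.join(source, base + '.jpg'), destination)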
I found oldest_file_in_tree from this answer.
import os
import shutil

def oldest_file_in_tree(rootfolder, extension=".avi"):
    return min(
        (os.path.join(dirname, filename)
         for dirname, dirnames, filenames in os.walk(rootfolder)
         for filename in filenames
         if filename.endswith(extension)),
        key=lambda fn: os.stat(fn).st_mtime)

oldest_pdf = oldest_file_in_tree('/var/somedir', '.pdf')
name = oldest_pdf[:-4]  # strip the '.pdf' extension; keeps the full path
matching_jpg = '{}.jpg'.format(name)

shutil.move(oldest_pdf, "path/to/new/destination/")
shutil.move(matching_jpg, "path/to/new/destination/")
Here is how I was able to make it work:
import os, shutil

todir = '/var/somedir/'

def oldest_file_in_tree(rootfolder, extension=".pdf"):
    return min(
        (os.path.join(dirname, filename)
         for dirname, dirnames, filenames in os.walk(rootfolder)
         for filename in filenames
         if filename.endswith(extension)),
        key=lambda fn: os.stat(fn).st_mtime)

oldest_pdf = oldest_file_in_tree('/var/somedir/', '.pdf')
name = oldest_pdf[:-4]
matching_jpg = '{}.jpg'.format(name)

shutil.move(oldest_pdf, todir)
shutil.move(matching_jpg, todir)

Read .txt from multiple .zip in folder

I have a folder (not zipped) containing multiple zip files (no other file type within folder). Each zip has the same type of text files containing different data saved within.
I know how to read in each separately, but I am looking to loop the process without having to type in each zip name. The zipfile archive does not seem to allow wild cards, so I cannot loop using this method. Is it possible to loop the process using glob?
The goal is to get the agency names without extracting all the zipfiles.
Single file read
import os
os.listdir('C:\\NTM\\Test\\')
['00003_32_332.zip', '00011_273_569.zip', '00012_258_276.zip']
import glob
glob.glob('C:\\NTM\\Test\\*.zip')
['C:\\NTM\\Test\\00003_32_332.zip', 'C:\\NTM\\Test\\00011_273_569.zip', 'C:\\NTM\\Test\\00012_258_276.zip']
import zipfile
archive=zipfile.ZipFile('C:\\NTM\\Test\\00011_273_569.zip')
testagency=archive.open('agency.txt')
testagency.read()
'agency_id,agency_name,nVRT,ValleyRide'
Update:
Now, that I can loop through the zip files and loop through to get the text file - I cannot print the agency_name from all of the zip files in the folder. My current code only prints the name of the last agency from the text file of the last zip file in the folder. Am I missing some compound statement structure?
import csv

def csv_dict_reader(file_obj):
    reader = csv.DictReader(file_obj, delimiter=',')
    for row in reader:
        print(row['agency_name'])

if __name__ == '__main__':
    with archive.open('agency.txt') as f_obj:
        csv_dict_reader(f_obj)
Whatcom Transportation Authority
Sample Code
import glob
import zipfile

dirName = '/backup/'
zipList = glob.glob(dirName + '*.zip')

for zipname in zipList:
    archive = zipfile.ZipFile(zipname)
    fileList = archive.namelist()
    for fileName in fileList:
        if fileName.endswith('.txt'):
            archive.extract(fileName)
    archive.close()
Thanks Jean-Francois!
for archive_name in glob.glob('C:\\NTM\\Test\\*.zip'):
    archive = zipfile.ZipFile(archive_name)
    testagency = archive.open('agency.txt')
    testagency.read()
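Regarding the update (only the last agency printing): the reading has to happen inside the loop, otherwise archive only refers to the last zip by the time you read. A sketch combining the pieces, assuming Python 3, where ZipFile.open returns bytes and needs io.TextIOWrapper before handing it to csv:

import csv
import glob
import io
import zipfile

for archive_name in glob.glob('C:\\NTM\\Test\\*.zip'):
    with zipfile.ZipFile(archive_name) as archive:
        with archive.open('agency.txt') as f_obj:
            # wrap the binary member so csv gets text
            reader = csv.DictReader(io.TextIOWrapper(f_obj, 'utf-8'), delimiter=',')
            for row in reader:
                print(row['agency_name'])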
As I could not comment on Fuji Komalan's comment, here is the fixed code.
import glob
import zipfile

dirName = 'C:/test/'
zipList = glob.glob(dirName + '*.zip')
print(zipList)

for zipname in zipList:
    archive = zipfile.ZipFile(zipname)
    fileList = archive.namelist()
    for fileName in fileList:
        if fileName.endswith('.txt'):
            archive.extract(fileName)
            print(fileName)
    archive.close()

How to extract zip file recursively?

I have a zip file which contains three zip files in it like this:
zipfile.zip\
    dirA.zip\
        a
    dirB.zip\
        b
    dirC.zip\
        c
I want to extract all the inner zip files that are inside the zip file in directories with these names (dirA, dirB, dirC).
Basically, I want to end up with the following schema:
output\
    dirA\
        a
    dirB\
        b
    dirC\
        c
I have tried the following:
import os, re
from zipfile import ZipFile

os.makedirs(directory)  # where directory is "\output"

with ZipFile(self.archive_name, "r") as archive:
    for id, files in data.items():
        if files:
            print("Creating", id)
            dirpath = os.path.join(directory, id)
            os.mkdir(dirpath)
            for file in files:
                match = pattern.match(filename)
                new = match.group(2)
                new_filename = os.path.join(dirpath, new)
                content = archive.open(file).read()
                with open(new_filename, "wb") as outfile:
                    outfile.write(content)
But it only extracts the zip file and I end up with:
output\
    dirA\
        dirA.zip
    dirB\
        dirB.zip
    dirC\
        dirC.zip
Any suggestions, including code segments, will be much appreciated, because I have tried so many different things and read the docs without success.
When extracting the zip file, you would want to write the inner zip files to memory instead of writing them to disk. To do this, I've used BytesIO.
Check out this code:
import os
import io
import zipfile

def extract(filename):
    z = zipfile.ZipFile(filename)
    for f in z.namelist():
        # get directory name from file
        dirname = os.path.splitext(f)[0]
        # create new directory
        os.mkdir(dirname)
        # read inner zip file into bytes buffer
        content = io.BytesIO(z.read(f))
        zip_file = zipfile.ZipFile(content)
        for i in zip_file.namelist():
            zip_file.extract(i, dirname)
If you run extract("zipfile.zip") with zipfile.zip as:
zipfile.zip/
    dirA.zip/
        a
    dirB.zip/
        b
    dirC.zip/
        c
Output should be:
dirA/
    a
dirB/
    b
dirC/
    c
For a function that extracts a nested zip file (any level of nesting) and cleans up the original zip files:
import zipfile, re, os

def extract_nested_zip(zippedFile, toFolder):
    """ Extract a zip file including any nested zip files
        Delete the zip file(s) after extraction
    """
    with zipfile.ZipFile(zippedFile, 'r') as zfile:
        zfile.extractall(path=toFolder)
    os.remove(zippedFile)
    for root, dirs, files in os.walk(toFolder):
        for filename in files:
            if re.search(r'\.zip$', filename):
                fileSpec = os.path.join(root, filename)
                extract_nested_zip(fileSpec, root)
I tried some of the other solutions but couldn't get them to work "in place". I'll post my solution to handle the "in place" version. Note: it deletes the zip files and 'replaces' them with identically named directories, so back up your zip files if you want to keep them.
Strategy is simple. Unzip all zip files in the directory (and subdirectories) and rinse and repeat until no zip files remain. The rinse and repeat is needed if the zip files contain zip files.
import os
import io
import zipfile
import re

def unzip_directory(directory):
    """ This function unzips (and then deletes) all zip files in a directory """
    for root, dirs, files in os.walk(directory):
        for filename in files:
            if re.search(r'\.zip$', filename):
                to_path = os.path.join(root, filename.split('.zip')[0])
                zipped_file = os.path.join(root, filename)
                if not os.path.exists(to_path):
                    os.makedirs(to_path)
                    with zipfile.ZipFile(zipped_file, 'r') as zfile:
                        zfile.extractall(path=to_path)
                    # deletes zip file
                    os.remove(zipped_file)

def exists_zip(directory):
    """ This function returns T/F whether any .zip file exists within the directory, recursively """
    is_zip = False
    for root, dirs, files in os.walk(directory):
        for filename in files:
            if re.search(r'\.zip$', filename):
                is_zip = True
    return is_zip

def unzip_directory_recursively(directory, max_iter=1000):
    """ Calls unzip_directory until all contained zip files (and new ones from previous calls)
        are unzipped
    """
    print("Does the directory path exist? ", os.path.exists(directory))
    iterate = 0
    while exists_zip(directory) and iterate < max_iter:
        unzip_directory(directory)
        iterate += 1
    pre = "Did not " if iterate < max_iter else "Did"
    print(pre, "time out based on max_iter limit of", max_iter, ". Took iterations:", iterate)
Assuming your zip files are backed up, you make this all work by calling unzip_directory_recursively(your_directory).
This works for me. Just place this script in the same directory as the nested zip. It will extract each zip into a directory with the same name as the original zip and clean up the original zips. It also counts the total number of files within the nested zips.
import os
from zipfile import ZipFile

def unzip(path, total_count):
    for root, dirs, files in os.walk(path):
        for file in files:
            file_name = os.path.join(root, file)
            if not file_name.endswith('.zip'):
                total_count += 1
            else:
                currentdir = file_name[:-4]
                if not os.path.exists(currentdir):
                    os.makedirs(currentdir)
                with ZipFile(file_name) as zipObj:
                    zipObj.extractall(currentdir)
                os.remove(file_name)
                total_count = unzip(currentdir, total_count)
    return total_count

total_count = unzip('.', 0)
print(total_count)

How do I zip the contents of a folder using python (version 2.5)?

Once I have all the files I require in a particular folder, I would like my python script to zip the folder contents.
Is this possible?
And how could I go about doing it?
On Python 2.7 you might use shutil.make_archive(base_name, format[, root_dir[, base_dir[, verbose[, dry_run[, owner[, group[, logger]]]]]]]), where:
base_name: archive name minus extension
format: format of the archive
root_dir: directory to compress
For example
shutil.make_archive(target_file, format="bztar", root_dir=compress_me)
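As a concrete (hypothetical) example for the zip format, with made-up names for the archive and source folder:

import shutil

# Produces 'folder_backup.zip' containing everything under 'path/to/folder';
# the folder's contents end up at the root of the archive.
shutil.make_archive('folder_backup', format='zip', root_dir='path/to/folder')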
An adapted version of the script is:
#!/usr/bin/env python
from __future__ import with_statement
from contextlib import closing
from zipfile import ZipFile, ZIP_DEFLATED
import os

def zipdir(basedir, archivename):
    assert os.path.isdir(basedir)
    with closing(ZipFile(archivename, "w", ZIP_DEFLATED)) as z:
        for root, dirs, files in os.walk(basedir):
            #NOTE: ignore empty directories
            for fn in files:
                absfn = os.path.join(root, fn)
                zfn = absfn[len(basedir)+len(os.sep):]  #XXX: relative path
                z.write(absfn, zfn)

if __name__ == '__main__':
    import sys
    basedir = sys.argv[1]
    archivename = sys.argv[2]
    zipdir(basedir, archivename)
Example:
C:\zipdir> python -mzipdir c:\tmp\test test.zip
It creates 'C:\zipdir\test.zip' archive with the contents of the 'c:\tmp\test' directory.
Here is a recursive version:
import os
import zipfile

def zipfolder(path, relname, archive):
    paths = os.listdir(path)
    for p in paths:
        p1 = os.path.join(path, p)
        p2 = os.path.join(relname, p)
        if os.path.isdir(p1):
            zipfolder(p1, p2, archive)
        else:
            archive.write(p1, p2)

def create_zip(path, relname, archname):
    archive = zipfile.ZipFile(archname, "w", zipfile.ZIP_DEFLATED)
    if os.path.isdir(path):
        zipfolder(path, relname, archive)
    else:
        archive.write(path, relname)
    archive.close()
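A hypothetical usage of the recursive version, assuming a folder named my_folder next to the script; the second argument controls the prefix the entries get inside the archive:

# Zips the contents of 'my_folder', storing them under 'my_folder/...' in the archive.
create_zip('my_folder', 'my_folder', 'my_folder.zip')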
Both jfs's solution and Kozyarchuk's solution could work for the OP's use case, however:
jfs's solution zips all of the files in a source folder and stores them in the zip at the root level (not preserving the original source folder within the structure of the zip).
Kozyarchuk's solution inadvertently puts the newly-created zip file into itself since it is a recursive solution (e.g. creating new zip file "myzip.zip" with this code will result in the archive "myzip.zip" itself containing an empty file "myzip.zip")
Thus, here is a solution that will simply add a source folder (and any subfolders, to any depth) to a zip archive. This is motivated by the fact that you cannot pass a folder name to the built-in method ZipFile.write() -- the function below, add_folder_to_zip(), offers a simple way to add a folder and all of its contents to a zip archive. The code below works for Python 2 and Python 3.
import zipfile
import os

def add_folder_to_zip(src_folder_name, dst_zip_archive):
    """ Adds a folder and its contents to a zip archive

    Args:
        src_folder_name (str): Source folder name to add to the archive
        dst_zip_archive (ZipFile): Destination zip archive

    Returns:
        None
    """
    for walk_item in os.walk(src_folder_name):
        for file_item in walk_item[2]:
            # walk_item[2] is a list of files in the folder entry
            # walk_item[0] is the folder entry full path
            fn_to_add = os.path.join(walk_item[0], file_item)
            dst_zip_archive.write(fn_to_add)

if __name__ == '__main__':
    zf = zipfile.ZipFile('myzip.zip', mode='w')
    add_folder_to_zip('zip_this_folder', zf)
    zf.close()
