reading zip files within tar file - python

I am trying to extract some xml files within zipped files, from a tar file. Actually, there is a large tar file, in which there are multiple "zip" files. Each zipfile contains another zipfile with the xml file.
import tarfile, os
import sys
from zipfile import ZipFile
os.chdir("C://.../temp/foo")
tar = tarfile.open("C://....")
for member in tar.getmembers():
if member.name.endswith(".zip"):
f=tar.extractfile(member)
content=ZipFile(f, 'r')
content = content.extract(content)
tar.close()
The script above does not extract the right file.

You have to do a bit of manipulation to get the second ZipFile into a ZipFile rather than a ZipExtFile, as you would have noticed.
The key thing is that the second-level ZipFile has to be instantiated out of a bytestream object, and then it will behave properly. I created a test file with your spec and it works fine (tar - zip - zip - textfile) and if you have deeper levels of zipfile nesting you can just generalise the code some more.
import io
import tarfile
from zipfile import ZipFile, is_zipfile

# Walk a tar archive whose members are zip files that themselves contain zip
# files (tar -> zip -> zip -> file) and print every innermost file's content.
# Context managers make sure the tar file and every ZipFile are closed even
# if one of the archives turns out to be corrupt.
with tarfile.open('mytar.tar') as mytar:
    print('Opening tar file, members:')
    for member in mytar.getnames():
        print('>%s' % member)
        if not member.endswith('zip'):
            continue
        # get the tarfile object; extractfile() returns None for members
        # that are not regular files (e.g. directories)
        tf = mytar.extractfile(member)
        if tf is None:
            continue
        # this is what the first-level ZipFile will be
        with ZipFile(tf) as myzip1:
            print(myzip1.namelist())
            # now let's get at those second-level ZipFiles, which currently
            # exist only as ZipExtFile members
            for zipfile2name in myzip1.namelist():
                # read the member into bytes and wrap it in a seekable
                # bytestream, which is what ZipFile needs
                f = io.BytesIO(myzip1.read(zipfile2name))
                # skip directory entries and anything that is not a zip
                if not is_zipfile(f):
                    continue
                # now instantiate a ZipFile object we can use like any other
                with ZipFile(f) as zipfile2:
                    print(zipfile2.namelist())
                    for textfile in zipfile2.namelist():
                        with zipfile2.open(textfile) as myfile:
                            print(myfile.read())
print('--finished--')

Related

Create zipfile at local and write files from s3

I am creating a zipfile on my local machine and would like to write files from s3. So far I'm unable to do it. Here's what I have in the mean time.
import os
import zipfile
from fs import open_fs
fs = open_fs(os.getenv('s3_sample_folder'))
file_names = file_names() #list of file names
with zipfile.ZipFile('zipfile.zip', mode='w') as zf:
for file in file_names:
with fs.open('/'+file, 'rb') as remote_file:
content = remote_file.read()
zf.write(content, basename(content))
The ZipFile.write method accepts a file name, not file content. You should use the ZipFile.writestr method instead to write file content to the zip file:
zf.writestr(file, content)
Since you are using PyFilesystem, you can open a S3 filesystem and a Zip filesystem, then use copy_file to copy between them.
Something like the following should work:
import os

from fs import open_fs
from fs.copy import copy_file

# Copy every file straight from the S3 filesystem into a newly created zip
# filesystem; both filesystems are closed when their `with` blocks exit.
# NOTE(review): file_names() is not defined in this snippet -- it is the
# asker's own helper returning the list of file names to copy.
with open_fs(os.getenv('s3_sample_folder')) as s3_fs:
    with open_fs('zip://zipfile.zip', create=True) as zip_fs:
        for filename in file_names():
            copy_file(s3_fs, filename, zip_fs, filename)

Extracting zip with password to another dir without foldername

I have this password protected zip folder:
folder_1\1.zip
When I extract this it gives me
1\image.png
How can I extract this to another folder without its folder name? Just the contents of it: image.png
So far I have tried every Stack Overflow solution I could find, and I have spent 11 hours straight trying to solve this.
import zipfile
zip = zipfile.ZipFile('C:\\Users\\Desktop\\folder_1\\1.zip', 'r')
zip.setpassword(b"virus")
zip.extractall('C:\\Users\\Desktop') <--target dir to extract all contents
zip.close()
EDIT:
This code worked for me (now I want to extract many paths at once -- any ideas?):
import os
import shutil
import zipfile

# Raw strings already keep backslashes literal, so they are written once.
my_dir = r"C:\Users\Desktop"
my_zip = r"C:\Users\Desktop\test\folder_1\1.zip"

# Extract every file of a password-protected archive directly into my_dir,
# dropping whatever folder structure the archive stores.
with zipfile.ZipFile(my_zip) as zip_file:
    zip_file.setpassword(b"virus")
    for member in zip_file.namelist():
        filename = os.path.basename(member)
        # skip directories (their basename is empty)
        if not filename:
            continue
        # copy file (taken from zipfile's extract); open() replaces the
        # Python 2-only file() builtin, and both handles are closed together
        target_path = os.path.join(my_dir, filename)
        with zip_file.open(member) as source, open(target_path, "wb") as target:
            shutil.copyfileobj(source, target)
You can use the ZipFile.read() method to read the specific file in the archive, open your target file for writing by joining the target directory with the base name of the source file, and then write what you read to it:
import os
import zipfile

# Extract every file of a password-protected archive into target_dir without
# the folder names stored in the archive.
target_dir = 'C:\\Users\\Desktop'

# `archive` instead of `zip` so the builtin zip() is not shadowed; the
# context manager closes the archive even if a read or write fails.
with zipfile.ZipFile('C:\\Users\\Desktop\\folder_1\\1.zip', 'r') as archive:
    archive.setpassword(b"virus")
    for name in archive.namelist():
        # directory entries end with a path separator -- nothing to write
        if name.endswith(('/', '\\')):
            continue
        with open(os.path.join(target_dir, os.path.basename(name)), 'wb') as f:
            f.write(archive.read(name))
And if you have several paths containing 1.zip for extraction:
import os
import zipfile

# Extract the files of 1.zip from several folders into one target directory,
# dropping the folder names stored inside each archive.
target_dir = 'C:\\Users\\Desktop'
source_dirs = (
    'C:\\Users\\Desktop\\folder_1',
    'C:\\Users\\Desktop\\folder_2',
    'C:\\Users\\Desktop\\folder_3',
)

for path in source_dirs:
    # `archive` instead of `zip` so the builtin zip() is not shadowed; the
    # context manager closes each archive even if extraction fails.
    with zipfile.ZipFile(os.path.join(path, '1.zip'), 'r') as archive:
        archive.setpassword(b"virus")
        for name in archive.namelist():
            # directory entries end with a path separator -- nothing to write
            if name.endswith(('/', '\\')):
                continue
            with open(os.path.join(target_dir, os.path.basename(name)), 'wb') as f:
                f.write(archive.read(name))

Replace/Add a class file to subfolder in jar using python zipfile command

I have a jar file and I have a path which represents a location inside Jar file.
Using this location, I need to replace a class file inside the jar (or, in some cases, add one). The class file is in another folder next to the jar, and I have to move it into the jar.
Code which I am trying to achieve above objective :
import zipfile
import os
zf = zipfile.ZipFile(os.path.normpath('D:\mystuff\test.jar'),mode='a')
try:
print('adding testclass.class')
zf.write(os.path.normpath('D:\mystuff\testclass.class'))
finally:
print('closing')
zf.close()
After executing the code above, the jar had the following layout:
Jar
|----META-INF
|----com.XYZ
|----Mystuff
|--testclass.class
Actual Output I need is -
Jar
|----META-INF
|----com.XYZ
|--ABC
|-testclass.class
How can I achieve this using the zipfile.write command or any other way in Python?
I didn't find any params in write command where i can provide destination file location inside Jar/Zip file.
ZipFile.write(filename, arcname=None, compress_type=None)
Specify arcname to change the name of the file in the archive.
import os
import zipfile

# Append a class file to an existing jar under a chosen path inside the
# archive. `arcname` is the name stored in the jar; without it the file's
# own filesystem path would be used.
# Raw strings keep the backslashes literal ('\t' would otherwise be a TAB).
with zipfile.ZipFile(os.path.normpath(r'D:\mystuff\test.jar'), mode='a') as zf:
    try:
        print('adding testclass.class')
        zf.write(os.path.normpath(r'D:\mystuff\testclass.class'),
                 arcname="com.XYZ/ABC/testclass.class")
    finally:
        # the archive itself is closed by the `with` block right after this
        print('closing')
Note: I doubt test.jar is your real jar name, since you didn't protect your string against special chars and the jar file opened would have been 'D:\mystuff\<TAB>est.jar' (well, it doesn't work :))
EDIT: if you want to add the new file but remove the old one, you have to do it differently: you cannot delete from a zipfile, so you have to rebuild another one (inspired by Delete file from zipfile with the ZipFile Module)
import os
import zipfile

# Rebuild the jar without the old testclass.class, add the new one under the
# desired archive path, then swap the rebuilt jar into place.
infile = os.path.normpath(r'D:\mystuff\test.jar')
outfile = os.path.normpath(r'D:\mystuff\test_new.jar')

# Both archives are closed even if copying fails part-way through.
with zipfile.ZipFile(infile, mode='r') as zin, \
        zipfile.ZipFile(outfile, mode='w') as zout:
    for item in zin.infolist():
        if os.path.basename(item.filename) == "testclass.class":
            continue  # skip the entry being replaced
        # write the item to the new archive, keeping its original metadata
        zout.writestr(item, zin.read(item.filename))
    print('adding testclass.class')
    zout.write(os.path.normpath(r'D:\mystuff\testclass.class'),
               arcname="com.XYZ/ABC/testclass.class")

# os.replace overwrites the destination in one step, so there is no moment
# where the original jar is deleted but the new one is not yet in place.
os.replace(outfile, infile)

How to read text files in a zipped folder in Python

I have a compressed data file (all in a folder, then zipped). I want to read each file without unzipping. I tried several methods but nothing works for entering the folder in the zip file. How should I achieve that?
Without folder in the zip file:
with zipfile.ZipFile('data.zip') as z:
for filename in z.namelist():
data = filename.readlines()
With one folder:
with zipfile.ZipFile('data.zip') as z:
for filename in z.namelist():
if filename.endswith('/'):
# Here is what I was stucked
namelist() returns a list of all items in an archive recursively.
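You can check whether an item is a directory by calling os.path.isdir() (note that this consults the local filesystem rather than the archive; directory entries returned by namelist() always end with '/'):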
import os
import zipfile

# Print every line of every file stored in the archive, including files
# inside folders, without extracting anything to disk.
with zipfile.ZipFile('archive.zip') as z:
    for filename in z.namelist():
        # Directory entries are recognised by their trailing '/'.
        # os.path.isdir() would test the local filesystem, not the archive.
        if filename.endswith('/'):
            continue
        # read the file
        with z.open(filename) as f:
            for line in f:
                # single-argument print(...) runs on both Python 2 and 3
                print(line)
Hope that helps.
I got Alec's code to work. I made some minor edits: (note, this won't work with password-protected zipfiles)
import os
import sys
import zipfile

# Print every line of every file stored in the archive named on the command
# line.
z = zipfile.ZipFile(sys.argv[1])  # Flexibility with regard to zipfile
try:
    for filename in z.namelist():
        # Directory entries are recognised by their trailing '/'.
        # os.path.isdir() would test the local filesystem, not the archive.
        if filename.endswith('/'):
            continue
        # read the file
        for line in z.open(filename):
            # single-argument print(...) runs on both Python 2 and 3
            print(line)
finally:
    z.close()  # Close the file after opening it, even if reading failed
del z  # Cleanup (in case there's further work after this)
I got RichS' code to work. I made some minor edits:
import os
import sys
import zipfile

# Print every line of every text file stored in the archive named on the
# command line, decoded as UTF-8.
archive = sys.argv[1]  # assuming launched with `python my_script.py archive.zip`
with zipfile.ZipFile(archive) as z:
    for info in z.infolist():
        # ZipInfo.is_dir() asks the archive entry itself; os.path.isdir()
        # would test the local filesystem instead.
        if info.is_dir():
            continue
        # read the file; the member handle is closed when the block exits
        with z.open(info) as member:
            for line in member:
                print(line.decode('utf-8'))
As you can see the edits are minor. I've switched to Python 3, the ZipFile class has a capital F, and the output is converted from b-strings to unicode strings. Only decode if you are trying to unzip a text file.
PS I'm not dissing RichS at all. I just thought it would be hilarious. Both useful and a mild shitpost.
PPS You can get file from an archive with a password: ZipFile.open(name, mode='r', pwd=None, *, force_zip64=False) or ZipFile.read(name, pwd=None). If you use .read then there's no context manager so you would simply do
# read the file
print(z.read(filename).decode('utf-8'))

Python gzip folder structure when zipping single file

I'm using Python's gzip module to gzip content for a single file, using code similar to the example in the docs:
import gzip
content = "Lots of content here"
f = gzip.open('/home/joe/file.txt.gz', 'wb')
f.write(content)
f.close()
If I open the gz file in 7-zip, I see a folder hierarchy matching the path I wrote the gz to and my content is nested several folders deep, like /home/joe in the example above, or C: -> Documents and Settings -> etc in Windows.
How can I get the one file that I'm zipping to just be in the root of the gz file?
It looks like you will have to use GzipFile directly:
import gzip

# Write gzip data to /home/joe/file.txt.gz while recording only the bare
# file name in the gzip header: the destination is opened as a plain file
# and handed to GzipFile as `fileobj`, so the name passed to GzipFile is
# independent of where the data is written.
content = b"Lots of content here"  # bytes: the stream is opened in binary mode
with open('/home/joe/file.txt.gz', 'wb') as real_f:
    # The class is spelled GzipFile; its mode is taken from real_f ('wb').
    with gzip.GzipFile('file.txt.gz', fileobj=real_f) as f:
        f.write(content)
It looks like open doesn't allow you to specify the fileobj separate from the filename.
You must use gzip.GzipFile and supply a fileobj. If you do that, you can specify an arbitrary filename for the header of the gz file.
Why not just open the file without specifying a directory hierarchy (just do gzip.open("file.txt.gz"))?. Seems to me like that works. You can always copy the file to another location, if you need to.
If you set your current working directory to your output folder, you can then call gzip.open("file.txt.gz") and the gz file will be created without the hierarchy
import gzip
import os

# Create the .gz from inside its destination folder so gzip.open() is given
# only the bare file name, then return to the original working directory.
content = b"Lots of content here"  # bytes: the file is opened in binary mode
outputPath = '/home/joe/file.txt.gz'
origDir = os.getcwd()
os.chdir(os.path.dirname(outputPath))
try:
    with gzip.open(os.path.basename(outputPath), 'wb') as f:
        f.write(content)
finally:
    # restore the working directory even if writing failed
    os.chdir(origDir)

Categories