python convert compressed zip to uncompressed tar - python

I have a compressed .zip file that I want to convert to an uncompressed .tar file.
I made this function to do it:
def decompress(filepath):
    """Convert the .zip at ``filepath`` into an uncompressed .tar.

    The archive is extracted into
    ``<start_dir>/Downloads/<devname>/<zip basename minus its last 4 chars>``,
    the extracted entries are packed into ``<that directory>.tar``, and then
    the extracted directory and the original .zip are deleted.

    ``devname`` is group 1 of the ``_(.+?)_.+?\\.zip`` pattern matched against
    ``filepath``.  ``start_dir`` and ``onError`` are taken from module scope.

    Returns the path of the .tar file.
    """
    # Raw string: ``\.`` must reach the regex engine as an escaped dot.
    devname = re.search(r"_(.+?)_.+?\.zip", filepath).group(1)
    # [:-4] drops the trailing ".zip" from the file name.
    path = os.path.join(start_dir, "Downloads", devname,
                        str(os.path.basename(filepath))[:-4])
    tar_path = path + ".tar"

    with zipfile.ZipFile(filepath, 'r') as archive:
        archive.extractall(path)

    with tarfile.open(tar_path, 'w') as tar:
        # Add each top-level entry under its own name so the tar does not
        # contain the absolute extraction path.
        for entry in os.listdir(path):
            tar.add(os.path.join(path, entry), arcname=entry)

    time.sleep(2)
    shutil.rmtree(path, ignore_errors=False, onerror=onError)
    time.sleep(0.5)
    os.remove(filepath)
    return tar_path
I am getting this error when running it:
Traceback (most recent call last):
File "File.py", line 195, in <module>
main()
File "File.py", line 184, in main
dld = download()
File "File.py", line 132, in download
filename = decompress(os.path.join(start_dir, "Downloads", devname, filename
))
File "File.py", line 103, in decompress
shutil.rmtree(path, ignore_errors=False, onerror=onError)
File "C:\Program Files (x86)\python27\lib\shutil.py", line 247, in rmtree
rmtree(fullname, ignore_errors, onerror)
File "C:\Program Files (x86)\python27\lib\shutil.py", line 247, in rmtree
rmtree(fullname, ignore_errors, onerror)
File "C:\Program Files (x86)\python27\lib\shutil.py", line 256, in rmtree
onerror(os.rmdir, path, sys.exc_info())
File "C:\Program Files (x86)\python27\lib\shutil.py", line 254, in rmtree
os.rmdir(path)
WindowsError: [Error 145] The directory is not empty: 'C:\\Users\\Vaibhav\\Deskt
op\\Folder\\Downloads\\toro\\_toro_nightly_test\\system\\app'
Here is my onError that I got from https://stackoverflow.com/a/2656405/2518263:
def onError(func, path, exc_info):
    """
    Error handler for ``shutil.rmtree``.

    If the error is due to an access error (read only file)
    it attempts to add write permission and then retries.

    If the error is for another reason it re-raises the error.

    Usage : ``shutil.rmtree(path, onerror=onError)``
    """
    if not os.access(path, os.W_OK):
        # Is the error an access error ?
        os.chmod(path, stat.S_IWUSR)
        func(path)
    else:
        # Bare ``raise`` re-raises the exception currently being handled by
        # the caller; ``exc_info`` itself is not consulted.
        raise
I only get the error on random occasions. The decompress function works 75% of the time.
I don't know why I'm getting this error. Can someone suggest a better way to do this, or a way to solve this error?
I just replaced:
shutil.rmtree(path, ignore_errors=False, onerror=onError)
with:
# Bottom-up walk: a directory's files and sub-directories are removed before
# the walk reaches the directory itself.
for root, dirs, files in os.walk(path, topdown=False):
    for name in files:
        filename = os.path.join(root, name)
        # Make the file writable first so os.remove is not refused.
        os.chmod(filename, stat.S_IWUSR)
        os.remove(filename)
    for name in dirs:
        os.rmdir(os.path.join(root, name))
time.sleep(0.5)
# os.walk never removes the top directory; do it last.
os.rmdir(path)
Now it works like a charm.
EDIT:
Never mind! I still get the error, just less often; now it's about 20% of the time! Please help!
EDIT 2:
OK, so I don't get the error if i extract to a temporary file so I am using this code:
import tempfile
def decompress(filepath, settle_delay=1):
    """Convert the .zip at ``filepath`` into an uncompressed .tar next to it.

    The archive is extracted into a fresh temporary directory created inside
    the current working directory, its top-level entries are packed into
    ``<filepath without extension>.tar``, the temporary directory is removed
    and finally the original .zip is deleted.

    Parameters:
        filepath: path of the .zip file to convert.
        settle_delay: seconds to wait between closing the tar and deleting
            the temporary directory (default 1, the previously hard-coded
            value).

    Returns the path of the .tar file.
    """
    tar_path = os.path.splitext(filepath)[0] + ".tar"
    tmp = tempfile.mkdtemp(dir=os.getcwd())
    try:
        with zipfile.ZipFile(filepath) as archive:
            archive.extractall(tmp)
        with tarfile.open(tar_path, 'w') as tar:
            # arcname keeps the temporary directory name out of the tar.
            for entry in os.listdir(tmp):
                tar.add(os.path.join(tmp, entry), arcname=entry)
        time.sleep(settle_delay)
    except Exception:
        # Do not leave the temporary directory behind when extraction or
        # tarring fails; cleanup here is best-effort so the original error
        # is the one that propagates.
        shutil.rmtree(tmp, ignore_errors=True)
        raise
    shutil.rmtree(tmp)
    os.remove(filepath)
    return tar_path
It works now! (hopefully the error won't occur again)
EDIT 3:
I got the error again!!!!!!!!!! this is really getting on my nerves. Please help someone.

It looks like some other process in your OS manages to create another file in the directory while you are in the process of deleting it.
I suggest avoiding temporary files altogether and feeding the decompressed members of the original ZIP directly into the new TAR file.
This requires mapping the ZipInfo fields to the corresponding TarInfo fields, but that should be straightforward.
Here's my take on it:
def zip2tar(zipname, tarname):
    """Copy every member of the ZIP ``zipname`` into a new uncompressed TAR.

    Members are streamed straight from the ZIP into ``tarname``; nothing is
    extracted to disk.  Directory entries (names ending in ``/``) become tar
    directories with mode 0777 and no data; everything else becomes a regular
    file with mode 0666.  Modification times are taken from the ZIP entry,
    interpreted as local time.

    Parameters:
        zipname: path of the existing .zip file.
        tarname: path of the .tar file to create (overwritten if present).
    """
    import time  # local import: only needed for the timestamp conversion

    with zipfile.ZipFile(zipname, 'r') as zipf, \
            tarfile.TarFile(tarname, 'w') as tarf:
        for zipinfo in zipf.infolist():
            tarinfo = tarfile.TarInfo(zipinfo.filename)
            # ZIP stores naive local wall-clock time; mktime with
            # tm_isdst=-1 lets the C library pick the right UTC offset for
            # that date instead of applying today's offset to every entry.
            tarinfo.mtime = int(time.mktime(zipinfo.date_time + (0, 0, -1)))
            # A trailing slash is how ZIP marks a directory.  Bit 0 of
            # internal_attr only means "looks like a text file" and must not
            # be used to tell files from directories.
            if zipinfo.filename.endswith('/'):
                tarinfo.type = tarfile.DIRTYPE
                tarinfo.mode = 0o777
                tarf.addfile(tarinfo)
            else:
                tarinfo.type = tarfile.REGTYPE
                tarinfo.mode = 0o666
                tarinfo.size = zipinfo.file_size
                with zipf.open(zipinfo) as infile:
                    tarf.addfile(tarinfo, infile)

Related

Decompressing .bz2 files in a directory in python

I would like to decompress a bunch of .bz2 files contained in a folder (where there are also .zst files). What I am doing is the following:
destination_folder = "/destination_folder_path/"
compressed_files_path="/compressedfiles_folder_path/"
# os.listdir returns bare entry names, without the directory part.
dirListing = os.listdir(compressed_files_path)
for file in dirListing:
    if ".bz2" in file:
        unpackedfile = bz2.BZ2File(file)
        data = unpackedfile.read()
        open(destination_folder, 'wb').write(data)
But I keep on getting the following error message:
Traceback (most recent call last):
File "mycode.py", line 34, in <module>
unpackedfile = bz2.BZ2File(file)
File ".../miniconda3/lib/python3.9/bz2.py", line 85, in __init__
self._fp = _builtin_open(filename, mode)
FileNotFoundError: [Errno 2] No such file or directory: 'filename.bz2'
Why do I receive this error?
You must be sure that all the file paths you are using exist.
It is better to use the full path to the file being opened.
import bz2
import os
import shutil

# this path must exist
destination_folder = "/full_path_to/folder/"
compressed_files_path = "/full_path_to_other/folder/"

# get list with filenames (strings)
dirListing = os.listdir(compressed_files_path)

for file in dirListing:
    # ^ this is only filename.ext
    if file.endswith(".bz2"):
        # concatenation of directory path and filename.bz2
        existing_file_path = os.path.join(compressed_files_path, file)
        # output keeps the original name without the ".bz2" suffix
        new_file_path = os.path.join(destination_folder,
                                     os.path.splitext(file)[0])
        # Read through BZ2File (decompresses) and write with plain open():
        # writing through bz2.open() would compress the data again.
        with bz2.BZ2File(existing_file_path) as unpackedfile, \
                open(new_file_path, 'wb') as f:
            # copy in chunks so large files are never held fully in memory
            shutil.copyfileobj(unpackedfile, f)
You can also use the shutil module to copy or move files.
os.path.exists
os.path.join
shutil
bz2 examples

Using python magic to filter files based on MIME type

I have a set of files in a directory and I'm making use of python-magic library to filter out files that are of type "text/plain" and remove all the non 'text/plain' files. Below is the code I'm using
import os
import magic
def ftype(path):
    """Delete every file under ``path`` whose MIME type does not end in 'plain'.

    Walks ``path`` recursively, asks python-magic for each file's MIME type
    and removes the file (printing its name) unless the type ends with
    ``'plain'`` (e.g. ``text/plain``).
    """
    mime = magic.Magic(mime=True)
    for root, dirs, fnames in os.walk(path):
        for fname in fnames:
            # File names yielded by os.walk are relative to ``root`` (the
            # directory currently being visited), not to the top-level path.
            full_path = os.path.join(root, fname)
            mi = mime.from_file(full_path)
            if not mi.endswith('plain'):
                os.remove(full_path)
                print(fname)
ftype('filepath')
I'm able to run the script successfully on a small set of files. However when I ran the script on a directory that had about 40000 files I get the below error.
Traceback (most recent call last):
File "C:\Users\dmg\AppData\Local\Programs\Python\Python37\lib\site-packages\magic\magic.py", line 91, in from_file
return self._handle509Bug(e)
File "C:\Users\dmg\AppData\Local\Programs\Python\Python37\lib\site-packages\magic\magic.py", line 100, in _handle509Bug
raise e
File "C:\Users\dmg\AppData\Local\Programs\Python\Python37\lib\site-packages\magic\magic.py", line 89, in from_file
return maybe_decode(magic_file(self.cookie, filename))
File "C:\Users\dmg\AppData\Local\Programs\Python\Python37\lib\site-packages\magic\magic.py", line 255, in magic_file
return _magic_file(cookie, coerce_filename(filename))
File "C:\Users\dmg\AppData\Local\Programs\Python\Python37\lib\site-packages\magic\magic.py", line 196, in errorcheck_null
raise MagicException(err)
magic.magic.MagicException: b"line I64u: regex error 14 for `^[[:space:]]*class[[:space:]]+[[:digit:][:alpha:]:_]+[[:space:]]*\\{(.*[\n]*)*\\}(;)?$', (failed to get memory)"
I'm not sure what is the issue. Can someone help me with this or if there are any alternative approaches to do the above stated operation.
Update : Issue still exists after trying out some methods stated in the below comments.

extracting a .ppm.bz2 from a custom path to a custom path

as the title says, I have several folders, several .ppm.bz2 files and I want to extract them exactly where they are using python.
Directory structure image
I am traversing in the folders as this:
import tarfile
import os
path = '/Users/ankitkumar/Downloads/colorferet/dvd1/data/images/'
folders = os.listdir(path)
for folder in folders:  # the folders starting like 00001
    if not folder.startswith("0"):
        # skip entries that are not numbered folders
        continue
    path2 = path + folder
    zips = os.listdir(path2)
    for archive_name in zips:
        if not archive_name.startswith("0"):
            continue
        path3 = path2 + "/" + archive_name
        fh = tarfile.open(path3, 'r:bz2')
        # extract next to the archive itself
        outpath = path2 + "/"
        fh.extractall(outpath)
        fh.close()
Then I get this error:
Traceback (most recent call last):
File "ZIP.py", line 16, in <module>
fh = tarfile.open(path3, 'r:bz2')
File "/anaconda2/lib/python2.7/tarfile.py", line 1693, in open
return func(name, filemode, fileobj, **kwargs)
File "/anaconda2/lib/python2.7/tarfile.py", line 1778, in bz2open
t = cls.taropen(name, mode, fileobj, **kwargs)
File "/anaconda2/lib/python2.7/tarfile.py", line 1723, in taropen
return cls(name, mode, fileobj, **kwargs)
File "/anaconda2/lib/python2.7/tarfile.py", line 1587, in __init__
self.firstmember = self.next()
File "/anaconda2/lib/python2.7/tarfile.py", line 2370, in next
raise ReadError(str(e))
tarfile.ReadError: invalid header
The tarfile module is for tar files, including tar.bz2. If your file is not a tar archive, you should use the bz2 module directly.
Also, try using os.walk instead of multiple listdir as it can traverse the tree
import os
import bz2
import shutil
# ``dirpath`` is the directory currently being visited; a separate name keeps
# the starting ``path`` intact after the loop.
for dirpath, dirs, files in os.walk(path):
    for filename in files:
        basename, ext = os.path.splitext(filename)
        if ext.lower() != '.bz2':
            continue
        fullname = os.path.join(dirpath, filename)
        # same directory, same name minus the ".bz2" suffix
        newname = os.path.join(dirpath, basename)
        # bz2.BZ2File works on both Python 2.7 and Python 3, whereas
        # bz2.open() only exists on Python 3.3+.
        with bz2.BZ2File(fullname) as fh, open(newname, 'wb') as fw:
            shutil.copyfileobj(fh, fw)
This will uncompress all .bz2 files in all subfolders, in the same place they are. All other files will stay the same. If the uncompressed file already exists it will be overwritten.
Please backup your data before running destructive code

Using os.path.join with os.path.getsize, returning FileNotFoundError

In conjunction with my last question, I'm now working on printing the filenames with their sizes next to them in a sort of list. Basically, I am reading filenames from one file (they are added by the user), taking each filename and joining it onto the path of the working directory to print its size, one by one. However, I'm having an issue with the following block:
print("\n--- Stats ---\n")
with open('userdata/addedfiles', 'r') as read_files:
    file_lines = read_files.readlines()
# get path for each file and find in trackedfiles
# use path to get size
print(len(file_lines), "files\n")
for file_name in file_lines:
    # the actual files should be in the same working directory
    cwd = os.getcwd()
    fpath = os.path.join(cwd, file_name)
    fsize = os.path.getsize(fpath)
    print(file_name.strip(), "-- size:", fsize)
which is returning this error:
tolbiac wpm-public → ./main.py --filestatus
--- Stats ---
1 files
Traceback (most recent call last):
File "./main.py", line 332, in <module>
main()
File "./main.py", line 323, in main
parseargs()
File "./main.py", line 317, in parseargs
tracking()
File "./main.py", line 204, in tracking
fsize = os.path.getsize(fpath)
File "/usr/lib/python3.4/genericpath.py", line 50, in getsize
return os.stat(filename).st_size
FileNotFoundError: [Errno 2] No such file or directory: '/home/tolbiac/code/wpm-public/file.txt\n'
tolbiac wpm-public →
So it looks like something is adding a \n to the end of file_name. I'm not sure if that's something done by getsize; I tried this with os.stat, but it did the same thing.
Any suggestions? Thanks.
When you're reading in a file, you need to be aware of how the data is separated. In this case, the file you read has one filename per line, and readlines() keeps the trailing \n newline character on each line. You need to strip it before you use the name.
for file_name in file_lines:
    # drop the trailing newline (and any surrounding whitespace)
    file_name = file_name.strip()
    # rest of for loop

How to move all .log and .txt files to a new folder

I'm having trouble figuring out how to move all .log and .txt files in a certain folder and its subdirectories to a new folder. I understand how to move one file with shutil, but I tried, unsuccessfully, to use a loop to move them all. Can someone help me with this? Thanks.
import os, os.path
import re
def print_tgzLogs (arg, dir, files):
    # Visitor for os.path.walk: called once per directory with that
    # directory's path (``dir``) and the names it contains (``files``).
    for file in files:
        path = os.path.join (dir, file)
        path = os.path.normcase (path)
        defaultFolder = "Log_Text_Files"
        if not defaultFolder.endswith(':') and not os.path.exists('c:\\Extracted\\Log_Text_Files'):
            os.mkdir('C:\\Extracted\\Log_Text_Files')
        if re.search(r".*\.txt$", path) or re.search(r".*\.log$", path):
            os.rename(path, 'C:\\Extracted\\Log_Text_Files')
            print(path)
os.path.walk('C:\\Extracted\\storage', print_tgzLogs, 0)
Below is the trace back error:
Traceback (most recent call last):
File "C:\SQA_log\scan.py", line 20, in <module>
os.path.walk('C:\\Extracted\\storage', print_tgzLogs, 0)
File "C:\Python27\lib\ntpath.py", line 263, in walk
walk(name, func, arg)
File "C:\Python27\lib\ntpath.py", line 263, in walk
walk(name, func, arg)
File "C:\Python27\lib\ntpath.py", line 263, in walk
walk(name, func, arg)
File "C:\Python27\lib\ntpath.py", line 259, in walk
func(arg, top, names)
File "C:\SQA_log\scan.py", line 16, in print_tgzLogs
os.rename(path, 'C:\\Extracted\\Log_Text_Files')
WindowsError: [Error 183] Cannot create a file when that file already exists
According to the traceback, the log files already exist at the destination. The Python docs for os.rename say:
On Windows, if dst already exists, OSError will be raised [...].
Now you can either:
delete the files manually or
delete the files automatically using os.remove(path)
If you want the files to be automatically deleted, the code would look like this (notice that I replaced your regular expression with the python endswith as suggested by utdemir):
import os, os.path
def print_tgzLogs (arg, dir, files, dest='C:\\Extracted\\Log_Text_Files'):
    """Visitor for ``os.path.walk`` that moves .txt/.log files into ``dest``.

    Parameters:
        arg: opaque value passed through by ``os.path.walk``; unused.
        dir: directory currently being visited.
        files: names of the entries inside ``dir``.
        dest: folder that receives the files.  It is created on first use;
            an existing file with the same name is replaced.

    Prints the (case-normalised) source path of every file moved.
    """
    for file in files:
        path = os.path.normcase(os.path.join(dir, file))
        if not path.endswith((".txt", ".log")):
            continue
        if not os.path.isdir(dest):
            os.makedirs(dest)
        target = os.path.join(dest, file)
        # On Windows os.rename refuses to overwrite, so clear the way first.
        if os.path.exists(target):
            os.remove(target)
        os.rename(path, target)
        print(path)
os.path.walk('C:\\Extracted\\storage', print_tgzLogs, 0)
It looks like you are trying to use
os.rename(path, 'C:\\Extracted\\Log_Text_Files')
to move the file path into the directory C:\Extracted\Log_Text_Files, but rename doesn't work like this: it's going to try to make a new file named C:\Extracted\Log_Text_Files. You probably want something more like this:
os.rename(path, os.path.join('C:\\Extracted\\Log_Text_Files', os.path.basename(path)))

Categories