Exclude a directory from getting zipped using zipfile module in python - python

I am trying to zip a directory using Python's zipfile module, and it is working well. But now I want to exclude some folders, i.e. if my directory tree is like
abc
def
ghi
jkl
mno
then i want to archive all to myfile.zip but excluding "ghi"
I am trying to zip files using
zf = zipfile.ZipFile("Application server.zip", "w")
for dirname, subdirs, files in os.walk("D:\\review docs"):
zf.write(dirname)
for filename in files:
zf.write(os.path.join(dirname, filename))
zf.close()
so this is archiving everything under "D:\review docs" to "Application server.zip" but i want to exclude some directories from the zip.
In fact i can use linux commands to do the same but i want to use zipfile module.
Also, if I pop the excluded folder name from the "dirname" list obtained from os.walk, will that work?
Furthermore, adding a check before zipping, like if "dirname" == "exclude folder", would also work, I think, but I want a neat solution for doing the same using the module. I read somewhere that the zipfile module provides this functionality, but I didn't find any code example for it.

Yes, you can remove elements from subdirs; that makes sure that os.walk() does not descend into those directories. Example -
# os.walk() is top-down by default, so editing `subdirs` in place before the
# walk moves on keeps it from descending into the excluded directory.
for dirname, subdirs, files in os.walk("D:\\review docs"):
    subdirs[:] = [d for d in subdirs if d != 'exclude directory']
    # Store the directory entry itself, then each file directly inside it.
    zf.write(dirname)
    for filename in files:
        member = os.path.join(dirname, filename)
        zf.write(member)
zf.close()

I wrote a more complete version, which is able to filter folders and extensions.
We can't simply delete folders like .svn before zipping. The following code can help.
It zips a folder to a zip file, maintaining its structure and filtering out certain folders and extensions, as you would naturally expect.
def IsPathValid(path, ignoreDir, ignoreExt):
    """Return True if *path* should be kept, False if a filter excludes it.

    Args:
        path: File or directory path to check. Both ``/`` and ``\\`` are
            treated as component separators.
        ignoreDir: Collection of directory names to exclude, or None/empty
            to disable directory filtering.
        ignoreExt: Collection of file extensions as returned by
            ``os.path.splitext`` (leading dot included, e.g. ``".zip"``),
            or None/empty to disable extension filtering.

    Returns:
        False when *path* is an existing file whose extension is in
        *ignoreExt*, or when any inspected path component is in *ignoreDir*;
        True otherwise.
    """
    if os.path.isfile(path):
        if ignoreExt:
            _, ext = os.path.splitext(path)
            if ext in ignoreExt:
                return False
        # For a file only the containing directories are matched against
        # ignoreDir; the file name itself is not.
        dirPart = os.path.dirname(path)
    else:
        dirPart = path
    if not ignoreDir:
        return True
    # Normalise both separator styles before splitting.  The previous
    # split('\\/') looked for the literal two-character string "\/" and
    # therefore never broke the path into its components.
    components = dirPart.replace('\\', '/').split('/')
    return not any(part in ignoreDir for part in components)
def zipDirHelper(path, rootDir, zf, ignoreDir=None, ignoreExt=None):
    """Recursively add *path* to the zip handle *zf*.

    Args:
        path: File or directory to add.
        rootDir: Directory that archive member names are made relative to.
        zf: Zip file handle; only its ``write(path, arcname)`` method is used.
        ignoreDir: Passed through to ``IsPathValid``.
        ignoreExt: Passed through to ``IsPathValid``.
    """
    if not os.path.isfile(path):
        # Directory: filter each entry by its bare name (as returned by
        # os.listdir) and recurse into the ones that pass.
        for entry in os.listdir(path):
            if IsPathValid(entry, ignoreDir, ignoreExt):
                child = os.path.join(path, entry)
                zipDirHelper(child, rootDir, zf, ignoreDir, ignoreExt)
        return
    # File: validated again with its full path before being written.
    # NOTE(review): the full path includes the components of rootDir itself,
    # so an ignored name anywhere above rootDir filters every file - confirm
    # whether that is intended.
    if IsPathValid(path, ignoreDir, ignoreExt):
        arcname = os.path.relpath(path, rootDir)
        zf.write(path, arcname)
def ZipDir(path, zf, ignoreDir=None, ignoreExt=None, close=False):
    """Add *path* (a file or a directory tree) to the zip handle *zf*.

    Args:
        path: File or directory to archive.
        zf: Zip file handle, passed on to ``zipDirHelper``.
        ignoreDir: Directory names to skip, or None.
        ignoreExt: File extensions to skip, or None.
        close: When true, ``zf.close()`` is called once archiving ends,
            whether it succeeded or raised.
    """
    # The root is the path itself for a directory, its parent for a file.
    if os.path.isdir(path):
        rootDir = path
    else:
        rootDir = os.path.dirname(path)
    try:
        zipDirHelper(path, rootDir, zf, ignoreDir, ignoreExt)
    finally:
        if close:
            zf.close()
use it like this:
import zipfile
theZipFile = zipfile.ZipFile(targetZipFile, 'w')
Util.ZipDir(target_dir, theZipFile, ignoreDir=[".svn"], ignoreExt=[".zip"], close=True)
# If you like to zip more files, just close=False and manually close the file or use "with xxx" on your own

Related

Copying files in python using shutil

I have the following directory structure:
-mailDir
-folderA
-sub1
-sub2
-inbox
-1.txt
-2.txt
-89.txt
-subInbox
-subInbox2
-folderB
-sub1
-sub2
-inbox
-1.txt
-2.txt
-200.txt
-577.txt
The aim is to copy all the txt files under inbox folder into another folder.
For this I tried the below code
import os
from os import path
import shutil
rootDir = "mailDir"
destDir = "destFolder"
eachInboxFolderPath = []
for root, dirs, files in os.walk(rootDir):
for dirName in dirs:
if(dirName=="inbox"):
eachInboxFolderPath.append(root+"\\"+dirName)
for ii in eachInboxFolderPath:
for i in os.listdir(ii):
shutil.copy(path.join(ii,i),destDir)
If the inbox directory only has .txt files then the above code works fine. Since the inbox folder under folderA directory has other sub directory along with .txt files, the code returns permission denied error. What I understood is shutil.copy won't allow to copy the folders.
The aim is to copy only the txt files in every inbox folder to some other location. If the file names are same in different inbox folder I have to keep both file names. How we can improve the code in this case ? Please note other than .txt all others are folders only.
One simple solution is to filter for any i that does not have the .txt extension by using the string endswith() method.
import os
from os import path
import shutil
rootDir = "mailDir"
destDir = "destFolder"
eachInboxFolderPath = []
# Collect every directory named "inbox" anywhere below rootDir.
for root, dirs, files in os.walk(rootDir):
    for dirName in dirs:
        if dirName == "inbox":
            # path.join picks the right separator for the platform; the
            # previous hard-coded "\\" only worked on Windows.
            eachInboxFolderPath.append(path.join(root, dirName))
# Copy only the regular .txt files found directly inside each inbox.
for ii in eachInboxFolderPath:
    for i in os.listdir(ii):
        src = path.join(ii, i)
        # The isfile check skips sub-folders whose name ends in ".txt".
        if i.endswith('.txt') and path.isfile(src):
            shutil.copy(src, destDir)
This should ignore any folders and non-txt files that are found with os.listdir(ii). I believe that is what you are looking for.
Just remembered that I once wrote several files to solve this exact problem before. You can find the source code here on my Github.
In short, there are two functions of interest here:
list_files(loc, return_dirs=False, return_files=True, recursive=False, valid_exts=None)
copy_files(loc, dest, rename=False)
For your case, you could copy and paste these functions into your project and modify copy_files like this:
def copy_files(loc, dest, rename=False):
    """Copy every ``.txt`` file found under *loc* into *dest*.

    Args:
        loc: Directory passed to ``list_files`` as the search location.
        dest: Destination directory.
        rename: When true, each copy is named after its source path with
            the path separators replaced by hyphens, so files with the same
            base name in different folders do not overwrite each other.

    Returns:
        The list returned by ``list_files``; when *rename* is true each
        entry is replaced by the path of the copy that was written.
    """
    # get files with full path
    files = list_files(loc, return_dirs=False, return_files=True, recursive=True, valid_exts=('.txt',))
    # copy files in list to dest
    for i, this_file in enumerate(files):
        if rename:
            # Strip a leading "./" or ".\".  The dot is escaped: the old
            # pattern r'^./' matched ANY first character followed by a
            # slash and so chopped "a/" off "a/b.txt".
            out_file = sub(r'^\.[\\/]', '', this_file)
            # replace slashes with hyphens to preserve unique name
            out_file = sub(r'[\\/]', '-', out_file)
            out_file = join(dest, out_file)
            copy(this_file, out_file)
            files[i] = out_file
        else:
            copy(this_file, dest)
    return files
Then just call it like so:
copy_files('mailDir', 'destFolder', rename=True)
The renaming scheme might not be exactly what you want, but it will at least not override your files. I believe this should solve all your problems.
Here you go:
import os
from os import path
import shutil
destDir = '<absolute-path>'
for root, dirs, files in os.walk(os.getcwd()):
    # Only directories that are themselves named 'inbox' contribute files.
    # (Pruning `dirs` to 'inbox' at every level, as before, stopped the walk
    # from ever reaching e.g. mailDir/folderA/inbox.)
    if path.basename(root) != 'inbox':
        continue
    # Do not descend into the inbox's own sub-folders.
    dirs[:] = []
    for f in files:
        if f.endswith('.txt'):
            shutil.copy(path.join(root, f), destDir)
Quick and simple.
sorry, I forgot the part where, you also need unique file names as well. The above solution only works for distinct file names in a single inbox folder.
For copying files from multiple inboxes and having a unique name in the destination folder, you can try this:
import os
from os import path
import shutil
sourceDir = os.getcwd()
destDir = '<absolute-path>'
filteredFiles = []
for root, dirs, files in os.walk(sourceDir):
    # Collect the '.txt' files of every directory named exactly 'inbox'
    # (endswith('inbox') would also match e.g. 'subinbox').
    if path.basename(root) == 'inbox':
        # join the file name to the full path while filtering txt files
        filteredFiles.extend(path.join(root, f) for f in files if f.endswith('.txt'))
# Pair each source path with a flat, unique destination name built from its
# path relative to sourceDir; os.sep covers both '/' and '\\' platforms.
filteredFiles = [(f, path.relpath(f, sourceDir).replace(os.sep, '-')) for f in filteredFiles]
for (f, n) in filteredFiles:
    print('copying file...', f)
    # copying from the path to the dest directory with specific name
    shutil.copy(f, path.join(destDir, n))
print('copied', str(len(filteredFiles)), 'files to', destDir)
If you need to copy all files instead of just txt files, then just change the condition f.endswith('.txt') to os.path.isfile(f) while filtering out the files.

How to extract zip file recursively?

I have a zip file which contains three zip files in it like this:
zipfile.zip\
dirA.zip\
a
dirB.zip\
b
dirC.zip\
c
I want to extract all the inner zip files that are inside the zip file in directories with these names (dirA, dirB, dirC).
Basically, I want to end up with the following schema:
output\
dirA\
a
dirB\
b
dirC\
c
I have tried the following:
import os, re
from zipfile import ZipFile
os.makedirs(directory) # where directory is "\output"
with ZipFile(self.archive_name, "r") as archive:
for id, files in data.items():
if files:
print("Creating", id)
dirpath = os.path.join(directory, id)
os.mkdir(dirpath)
for file in files:
match = pattern.match(filename)
new = match.group(2)
new_filename = os.path.join(dirpath, new)
content = archive.open(file).read()
with open(new_filename, "wb") as outfile:
outfile.write(content)
But it only extracts the zip file and I end up with:
output\
dirA\
dirA.zip
dirB\
dirB.zip
dirC\
dirC.zip
Any suggestions including code-segments will be much appreciated cause I have tried so many different things and read the docs without success.
When extracting the zip file, you would want to write the inner zip files to memory instead of writing them to disk. To do this, I've used BytesIO.
Check out this code:
import os
import io
import zipfile
def extract(filename):
    """Extract every inner zip archive of *filename* into its own directory.

    Each member of the outer archive is read into memory, opened as a zip
    file, and extracted into a directory named after the member without its
    extension (relative to the current working directory).

    Args:
        filename: Path of the outer zip file.

    Raises:
        zipfile.BadZipFile: If a member of the outer archive is not itself
            a zip file.
    """
    # Both archives are opened as context managers so their handles are
    # released even if extraction fails half-way.
    with zipfile.ZipFile(filename) as outer:
        for member in outer.namelist():
            # get directory name from file
            dirname = os.path.splitext(member)[0]
            # makedirs(exist_ok=True) tolerates re-runs and nested member
            # names, where a plain os.mkdir would raise.
            os.makedirs(dirname, exist_ok=True)
            # read inner zip file into bytes buffer
            content = io.BytesIO(outer.read(member))
            with zipfile.ZipFile(content) as inner:
                inner.extractall(dirname)
If you run extract("zipfile.zip") with zipfile.zip as:
zipfile.zip/
dirA.zip/
a
dirB.zip/
b
dirC.zip/
c
Output should be:
dirA/
a
dirB/
b
dirC/
c
For a function that extracts a nested zip file (any level of nesting) and cleans up the original zip files:
import zipfile, re, os
def extract_nested_zip(zippedFile, toFolder):
    """Extract a zip file, including any nested zip files.

    The zip file(s) are deleted after extraction.

    Args:
        zippedFile: Path of the archive to extract; it is removed afterwards.
        toFolder: Directory to extract into. Every ``.zip`` file found below
            it afterwards is extracted in place (into the directory that
            contains it) and removed as well.
    """
    with zipfile.ZipFile(zippedFile, 'r') as zfile:
        zfile.extractall(path=toFolder)
    os.remove(zippedFile)
    for root, dirs, files in os.walk(toFolder):
        for filename in files:
            if re.search(r'\.zip$', filename):
                fileSpec = os.path.join(root, filename)
                # `files` was listed before any recursion ran, so an earlier
                # nested call may already have extracted and deleted this
                # archive; recursing again would raise FileNotFoundError.
                if os.path.isfile(fileSpec):
                    extract_nested_zip(fileSpec, root)
I tried some of the other solutions but couldn't get them to work "in place". I'll post my solution to handle the "in place" version. Note: it deletes the zip files and 'replaces' them with identically named directories, so back up your zip files if you want to keep.
Strategy is simple. Unzip all zip files in the directory (and subdirectories) and rinse and repeat until no zip files remain. The rinse and repeat is needed if the zip files contain zip files.
import os
import io
import zipfile
import re
def unzip_directory(directory):
    """Unzip (and then delete) all zip files found under *directory*.

    Each ``name.zip`` is extracted into a sibling directory ``name`` and the
    archive is removed. Archives that appear as a result of this extraction
    are not processed by the same call.

    Args:
        directory: Root of the tree to scan.
    """
    suffix = '.zip'
    for root, dirs, files in os.walk(directory):
        for filename in files:
            if not filename.endswith(suffix):
                continue
            # Strip only the trailing ".zip"; split('.zip')[0] cut the name
            # at the FIRST ".zip", so "my.zipped.zip" became "my".
            to_path = os.path.join(root, filename[:-len(suffix)])
            zipped_file = os.path.join(root, filename)
            os.makedirs(to_path, exist_ok=True)
            with zipfile.ZipFile(zipped_file, 'r') as zfile:
                zfile.extractall(path=to_path)
            # deletes zip file
            os.remove(zipped_file)
def exists_zip(directory):
    """Return True if any ``.zip`` file exists within *directory*, recursively.

    Args:
        directory: Root of the tree to scan.

    Returns:
        True as soon as the first file whose name ends in ``.zip`` is found,
        False if the whole tree contains none.
    """
    # any() stops the walk at the first hit instead of scanning the rest of
    # the tree just to return the same answer.
    return any(
        filename.endswith('.zip')
        for _root, _dirs, files in os.walk(directory)
        for filename in files
    )
def unzip_directory_recursively(directory, max_iter=1000):
    """Call unzip_directory until no zip files remain under *directory*.

    Repeating is needed because extracted archives may themselves contain
    zip files, which only show up after the previous pass.

    Args:
        directory: Root of the tree to unzip in place.
        max_iter: Upper bound on the number of passes.
    """
    # The docstring must be the first statement; placed after this print it
    # was just a discarded string expression.
    print("Does the directory path exist? ", os.path.exists(directory))
    iterate = 0
    while exists_zip(directory) and iterate < max_iter:
        unzip_directory(directory)
        iterate += 1
    # Hitting the limit is only a time-out if archives are still left; the
    # extra scan runs only when the limit was actually reached.
    timed_out = iterate >= max_iter and exists_zip(directory)
    pre = "Did" if timed_out else "Did not "
    print(pre, "time out based on max_iter limit of", max_iter, ". Took iterations:", iterate)
Assuming your zip files are backed up, you make this all work by calling unzip_directory_recursively(your_directory).
This works for me. Just place this script with the nested zip under the same directory. It will extract zip into directory with the same name as the original zip and clean up the original zip. It will also count the total number of files within the nested zip as well
import os
from zipfile import ZipFile
def unzip (path, total_count):
    """Extract all (nested) zip files under *path* in place and count files.

    Every ``name.zip`` is extracted into a sibling directory ``name`` and
    then deleted; the extracted content is processed recursively.

    Args:
        path: Root of the tree to process.
        total_count: Running count of non-zip files seen so far.

    Returns:
        *total_count* plus the number of non-zip files found under *path*
        after all archives have been expanded.
    """
    for root, dirs, files in os.walk(path):
        for name in files:
            file_name = os.path.join(root, name)
            if not file_name.endswith('.zip'):
                total_count += 1
                continue
            currentdir = file_name[:-4]
            os.makedirs(currentdir, exist_ok=True)
            with ZipFile(file_name) as zipObj:
                zipObj.extractall(currentdir)
            os.remove(file_name)
            # The recursive call below counts everything in currentdir.  If
            # that directory already existed it is also queued in `dirs`;
            # drop it there so its files are not counted a second time.
            stem = name[:-4]
            if stem in dirs:
                dirs.remove(stem)
            total_count = unzip(currentdir, total_count)
    return total_count
total_count = unzip ('.', 0)
print(total_count)

Python folder names in the directory

How can I get the names of the folders existing in a directory using Python?
I want to save all the subfolders into a list to work with the names afterwards, but I don't know how to read the subfolder names.
Thanks for you help
You can use os.walk()
# !/usr/bin/python
import os
directory_list = list()
for root, dirs, files in os.walk("/path/to/your/dir", topdown=False):
for name in dirs:
directory_list.append(os.path.join(root, name))
print directory_list
EDIT
If you only want the first level and not actually "walk" through the subdirectories, it is even less code:
import os
# The built-in next() works on Python 2.6+ and 3; generators have no .next()
# method on Python 3.  The default tuple is used when the walk yields
# nothing, so a missing directory gives an empty list instead of raising
# StopIteration.
root, dirs, files = next(os.walk("/path/to/your/dir"), (None, [], []))
print(dirs)
This is not really what os.walk is made for. If you really only want one level of subdirectories, you can also use os.listdir() like Yannik Ammann suggested:
root='/path/to/my/dir'
dirlist = [ item for item in os.listdir(root) if os.path.isdir(os.path.join(root, item)) ]
print dirlist
Starting with Python 3.4, you can also use the new pathlib module:
from pathlib import Path
p = Path('some/folder')
subdirectories = [x for x in p.iterdir() if x.is_dir()]
print(subdirectories)
You can use os.listdir() (see the os module documentation).
Warning: it returns both files and directories.
example:
import os
path = 'pyth/to/dir/'
dir_list = os.listdir(path)
update: you need to check if the returned names are directories or files
import os
path = 'pyth/to/dir/'
# list of all content in a directory, filtered so only directories are returned
# (os.path.join works whether or not `path` ends with a separator; plain
# string concatenation only worked when it did)
dir_list = [directory for directory in os.listdir(path) if os.path.isdir(os.path.join(path, directory))]
You should import os first.
import os
files=[]
files = [f for f in sorted(os.listdir(FileDirectoryPath))]
This would give you list with all files in the FileDirectoryPath sorted.
I use os.listdir
Get all folder names of a directory
folder_names = []
for entry_name in os.listdir(MYDIR):
entry_path = os.path.join(MYDIR, entry_name)
if os.path.isdir(entry_path):
folder_names.append(entry_name)
Get all folder paths of a directory
folder_paths = []
for entry_name in os.listdir(MYDIR):
entry_path = os.path.join(MYDIR, entry_name)
if os.path.isdir(entry_path):
folder_paths.append(entry_path)
Get all file names of a directory
file_names = []
for file_name in os.listdir(MYDIR):
file_path = os.path.join(MYDIR, file_name)
if os.path.isfile(file_path):
file_names.append(file_name)
Get all file paths of a directory
file_paths = []
for file_name in os.listdir(MYDIR):
file_path = os.path.join(MYDIR, file_name)
if os.path.isfile(file_path):
file_paths.append(file_path)
For python 3 I'm using this script
import os
root='./'
dirlist = [ item for item in os.listdir(root) if os.path.isdir(os.path.join(root, item)) ]
for dir in dirlist:
print(dir)
Use os.walk(path)
import os
path = 'C:\\'
for root, directories, files in os.walk(path):
for directory in directories:
print os.path.join(root, directory)
Python 3.x: If you want only the directories in a given directory, try:
import os
search_path = '.' # set your path here.
root, dirs, files = next(os.walk(search_path), ([],[],[]))
print(dirs)
The above example will print out a list of the directories in the current directory like this:
['dir1', 'dir2', 'dir3']
The output contains only the sub-directory names.
If the directory does not have sub-directories, it will print:
[]
os.walk() is a generator function, so use next() to take only its first result. The 3-tuple of empty lists is the default that next() returns when the generator yields nothing at all, which happens when search_path does not exist or cannot be read; an existing directory without sub-directories still yields one 3-tuple whose dirs list is empty. Without that default, next() would raise a StopIteration exception in the error case.
For a more compact version:
dirs = next(os.walk(search_path), ([],[],[]))[1]

All Files in Dir & Sub-Dir

I would like to find all the files in a directory and all sub-directories.
code used:
import os
import sys
path = "C:\\"
dirs = os.listdir(path)
filename = "C.txt"
FILE = open(filename, "w")
FILE.write(str(dirs))
FILE.close()
print dirs
The problem is - this code only lists files in directories, not sub-directories. What do I need to change in order to also list files in subdirectories?
To traverse a directory tree you want to use os.walk() for this.
Here's an example to get you started:
import os
searchdir = r'C:\root_dir' # traversal starts in this directory (the root)
for root, dirs, files in os.walk(searchdir):
for name in files:
(base, ext) = os.path.splitext(name) # split base and extension
print base, ext
which would give you access to the file names and the components.
You'll find the functions in the os and os.path module to be of great use for this sort of work.
This function will help you: os.path.walk() http://docs.python.org/library/os.path.html#os.path.walk (Python 2 only; os.path.walk() was removed in Python 3, where os.walk() should be used instead).

Get absolute paths of all files in a directory

How do I get the absolute paths of all the files in a directory that could have many sub-folders in Python?
I know os.walk() recursively gives me a list of directories and files, but that doesn't seem to get me what I want.
os.path.abspath makes sure a path is absolute. Use the following helper function:
import os
def absoluteFilePaths(directory):
    """Yield the absolute path of every file under *directory*, recursively.

    Args:
        directory: Root of the tree to walk; may be relative.

    Yields:
        One absolute path string per file, in os.walk() order.
    """
    for dirpath, _, filenames in os.walk(directory):
        # Resolve the directory once instead of once per file; joining a
        # bare file name onto it gives the same absolute path.
        abs_dir = os.path.abspath(dirpath)
        for f in filenames:
            yield os.path.join(abs_dir, f)
If you have Python 3.4 or newer you can use pathlib (or a third-party backport if you have an older Python version):
import pathlib
for filepath in pathlib.Path(directory).glob('**/*'):
print(filepath.absolute())
If the argument given to os.walk is absolute, then the root dir names yielded during iteration will also be absolute. So, you only need to join them with the filenames:
import os
for root, dirs, files in os.walk(os.path.abspath("../path/to/dir/")):
for file in files:
print(os.path.join(root, file))
Try:
import os
for root, dirs, files in os.walk('.'):
for file in files:
p=os.path.join(root,file)
print p
print os.path.abspath(p)
print
You can use os.path.abspath() to turn relative paths into absolute paths:
file_paths = []
for folder, subs, files in os.walk(rootdir):
for filename in files:
file_paths.append(os.path.abspath(os.path.join(folder, filename)))
Starting with python 3.5 the idiomatic solution would be:
import os
def absolute_file_paths(directory):
    """Return the absolute paths of the files directly inside *directory*.

    Sub-directories are not descended into.

    Args:
        directory: Directory to list; may be relative.

    Returns:
        A list of absolute path strings, one per entry for which
        ``is_file()`` is true.
    """
    path = os.path.abspath(directory)
    # Using scandir() as a context manager closes the iterator right away
    # instead of leaving it to the garbage collector.
    with os.scandir(path) as entries:
        return [entry.path for entry in entries if entry.is_file()]
This not just reads nicer but also is faster in many cases.
For more details (like ignoring symlinks) see original python docs:
https://docs.python.org/3/library/os.html#os.scandir
All files and folders:
x = [os.path.abspath(os.path.join(directory, p)) for p in os.listdir(directory)]
Images (.jpg | .png):
x = [os.path.abspath(os.path.join(directory, p)) for p in os.listdir(directory) if p.endswith(('jpg', 'png'))]
from glob import glob
from os.path import abspath, isfile, join


def absolute_file_paths(directory):
    """Return the absolute paths of all files under *directory*, recursively.

    Args:
        directory: Root directory to search; may be relative.

    Returns:
        A list of absolute path strings for every regular file matched by
        the recursive ``**`` pattern (glob skips names starting with a dot).
    """
    # "**" only recurses when recursive=True is passed; anchoring the
    # pattern at abspath(directory) makes every result absolute.
    pattern = join(abspath(directory), "**")
    # The pattern also matches directories, so keep regular files only.
    return [p for p in glob(pattern, recursive=True) if isfile(p)]
Try:
from pathlib import Path
path = 'Desktop'
files = filter(lambda filepath: filepath.is_file(), Path(path).glob('*'))
for file in files:
print(file.absolute())
I wanted to keep the subdirectory details and not the files and wanted only subdirs with one xml file in them. I can do it this way:
for rootDirectory, subDirectories, files in os.walk(eventDirectory):
for subDirectory in subDirectories:
absSubDir = os.path.join(rootDirectory, subDirectory)
if len(glob.glob(os.path.join(absSubDir, "*.xml"))) == 1:
print "Parsing information in " + absSubDir
for root, directories, filenames in os.walk(directory):
for directory in directories:
print os.path.join(root, directory)
for filename in filenames:
if filename.endswith(".JPG"):
print filename
print os.path.join(root,filename)
Try This
pth = ''  # set this to the directory that holds one sub-folder per type
# The accumulator has to exist before "+=" can extend it.
train_folder = []
# os.listdir('') raises, so an unset pth falls back to the current directory.
types = os.listdir(pth or '.')
for type_ in types:
    type_dir = os.path.join(pth, type_)
    # os.path.join avoids the leading "/" that f'{pth}/{type_}' produced
    # when pth was empty.
    file_names = [os.path.join(type_dir, x) for x in os.listdir(type_dir)]
    train_folder += file_names

Categories