A Python walker that can ignore directories

A Python walker that can ignore directories - python

I need a file system walker that I could instruct to ignore traversing
directories that I want to leave untouched, including all subdirectories
below that branch.
The os.walk and os.path.walk just don't do it.

Actually, os.walk may do exactly what you want. Say I have a list (perhaps a set) of directories to ignore in ignore. Then this should work:
def my_walk(top_dir, ignore):
for dirpath, dirnames, filenames in os.walk(top_dir):
dirnames[:] = [
dn for dn in dirnames
if os.path.join(dirpath, dn) not in ignore ]
yield dirpath, dirnames, filenames

It is possible to modify the second element of os.walk's return values in-place:
[...] the caller can modify the dirnames list in-place (perhaps using del or slice assignment), and walk() will only recurse into the subdirectories whose names remain in dirnames; this can be used to prune the search [...]
def fwalk(root, predicate):
for dirpath, dirnames, filenames in os.walk(root):
dirnames[:] = [d for d in dirnames if predicate(r, d)]
yield dirpath, dirnames, filenames
Now, you can just hand in a predicate for subdirectories:
>>> ignore_list = [...]
>>> list(fwalk("some/root", lambda r, d: d not in ignore_list))

Here's the best and simple solution.
def walk(ignores):
global ignore
path = os.getcwd()
for root, dirs, files in os.walk(path):
for ignore in ignores:
if(ignore in dirs):
dirs.remove(ignore)
print root
print dirs
print files
walk(['.git', '.svn'])
Remember, if you remove the folder name from dirs, it won't be explore by os.walk.
hope it helps

So I made this home-roles walker function:
import os
from os.path import join, isdir, islink, isfile
def mywalk(top, topdown=True, onerror=None, ignore_list=('.ignore',)):
try:
# Note that listdir and error are globals in this module due
# to earlier import-*.
names = os.listdir(top)
except Exception, err:
if onerror is not None:
onerror(err)
return
if len([1 for x in names if x in ignore_list]):
return
dirs, nondirs = [], []
for name in names:
if isdir(join(top, name)):
dirs.append(name)
else:
nondirs.append(name)
if topdown:
yield top, dirs, nondirs
for name in dirs:
path = join(top, name)
if not islink(path):
for x in mywalk(path, topdown, onerror, ignore_list):
yield x
if not topdown:
yield top, dirs, nondirs

Related

How to loop over folders and skip one

I have this code below, I want to skip "NetIQ" folder in my iteration, but I think I'm missing something there because I still get files from that folder when I run my code, please help.
path = "C:\User\Work\Identity\TestFolders"
def list_files(dir):
r = []
skip = ["NetIQ"]
for root, dirs, files in os.walk(dir):
if dirs in skip:
continue
else:
for name in files:
r.append(os.path.join(root, name))
return r
print(list_files(path))

Try out a list comprehension:
def list_files(dir):
r = []
skip = ["NetIQ"]
for root, dirs, files in os.walk(dir):
dirs[:] = [d for d in dirs if d not in skip]
for name in files:
r.append(os.path.join(root, name))
return r
print(list_files(path))

I think your expectation of the output of this for loop is wrong. os.walk does a depth first search through your directories and will yield a new output (root, dirs and files) for every subdirectory. So if you say
if dirs in skip:
continue
This will only apply for the directory where NetIQ is actually located - check, the files in this folder should be missing.
For a solution, see the other answers...

That's because dirs is not a string but a tuple. You can have a take a look at the doc here.
You can either check if the first item in the tuple is in skip:
def list_files(dir):
r = []
skip = ["omegleClient"]
for root, dirs, files in os.walk(dir):
if dirs[0] not in skip:
for name in files:
r.append(os.path.join(root, name))
return r
or check if any item in the tuple is in skip like so:
def list_files(dir):
r = []
skip = ["omegleClient"]
for root, dirs, files in os.walk(dir):
if not any(d in dirs for d in skip):
for name in files:
r.append(os.path.join(root, name))
return r

Python, copy only directories

I have a program that has a list of some files. I have to copy only the directories and the subdirectories from the list to a specified directories and don't need to copy the files. I tried this, but it doesn't work.
def copiarDirs():
items = list.curselection()
desti = tkFileDialog.askdirectory()
for dirs in os.walk(items, topdown=False):
for name in dirs:
#for i in items :
aux=root+"/"+list.get(i)
tryhard=("cp "+str(aux)+" "+str(desti))
os.system(tryhard)

Try this:
import os
def copyDirs(source, destination):
for subdir, dirs, files in os.walk(source):
for f in files:
dir = destination + os.path.join(subdir).split(':')[1]
if not os.path.exists(dir):
os.makedirs(dir)
sourceDir = 'D:\\Work\\'
destDir = 'D:\\Dest\\'
copyDirs(sourceDir, destDir) #calling function

How can I use os.walk or any other alternative to traverse folders recursively by the natural name order?

In python if I iterate all the folders by os.walk recursievely to find any filr with the defined extension. this is my present code;
def get_data_paths(root_path, ext = '*.jpg'):
import os
import fnmatch
matches = []
classes = []
class_names = []
for root, dirnames, filenames in os.walk(root_path):
for filename in fnmatch.filter(filenames, ext):
matches.append(os.path.join(root, filename))
class_name = os.path.basename(os.path.dirname(os.path.join(root, filename)))
if class_name not in class_names:
class_names.append(class_name)
classes.append(class_names.index(class_name))
print "There are ",len(matches), " files're found!!"
return matches, classes, class_names
However the problem here is, this function visits folders in a strange python order of the folder names. Instead I would like to traverse them through A-Z. How should I modify this code or use any other alternative to do this?

By default, the topdown parameter to os.walk is True, so a directory triplet is reported before its own directories are descended. The docs state:
the caller can modify the dirnames list in-place (perhaps using del or slice assignment), and walk() will only recurse into the subdirectories whose names remain in dirnames; this can be used to prune the search, impose a specific order of visiting, or even to inform walk() about directories the caller creates or renames before it resumes walk() again.
Boldface mine. So all you need to do is something like:
for root, dirnames, filenames in os.walk(root_path):
dirnames[:] = natsort.natsorted(dirnames)
# continue with other directory processing...
Since you need to edit the list in place, you need to use the [:] slice notation.
Here's an example of os.walk's operation. Given a directory tree that looks like:
$ ls -RF cm3mm/SAM3/src
Applets/ RTC.cc SAM3X/
DBGUWriteString.cc SAM3A/ SMC.cc.in
EEFC.cc SAM3N/ SoftBoot.cc
Memories.txt SAM3S/
PIO.cc SAM3U/
cm3mm/SAM3/src/Applets:
AppletAPI.cc IntFlash.cc Main.cc MessageSink.cc Runtime.cc
cm3mm/SAM3/src/SAM3A:
Map.txt Pins.txt
cm3mm/SAM3/src/SAM3N:
Map.txt Pins.txt
cm3mm/SAM3/src/SAM3S:
Map.txt Pins.txt
cm3mm/SAM3/src/SAM3U:
Map.txt Pins.txt
cm3mm/SAM3/src/SAM3X:
Map.txt Pins.txt
Now, let's see what os.walk does:
>>> import os
>>> for root, dirnames, filenames in os.walk("cm3mm/SAM3/src"):
... print "-----"
... print "root =", root
... print "dirnames =", dirnames
... print "filenames =", filenames
...
-----
root = cm3mm/SAM3/src
dirnames = ['Applets', 'SAM3A', 'SAM3N', 'SAM3S', 'SAM3U', 'SAM3X']
filenames = ['DBGUWriteString.cc', 'EEFC.cc', 'Memories.txt', 'PIO.cc', 'RTC.cc', 'SMC.cc.in', 'SoftBoot.cc']
-----
root = cm3mm/SAM3/src/Applets
dirnames = []
filenames = ['AppletAPI.cc', 'IntFlash.cc', 'Main.cc', 'MessageSink.cc', 'Runtime.cc']
-----
root = cm3mm/SAM3/src/SAM3A
dirnames = []
filenames = ['Map.txt', 'Pins.txt']
-----
root = cm3mm/SAM3/src/SAM3N
dirnames = []
filenames = ['Map.txt', 'Pins.txt']
-----
root = cm3mm/SAM3/src/SAM3S
dirnames = []
filenames = ['Map.txt', 'Pins.txt']
-----
root = cm3mm/SAM3/src/SAM3U
dirnames = []
filenames = ['Map.txt', 'Pins.txt']
-----
root = cm3mm/SAM3/src/SAM3X
dirnames = []
filenames = ['Map.txt', 'Pins.txt']
Each time through the loop, you get the directories and files for one directory. We know exactly which file belongs to which folder: the files in filenames belong to the folder root.

I changed the code like this;
def get_data_paths(root_path, ext = '*.jpg'):
import os
import fnmatch
import natsort # import this
matches = []
classes = []
class_names = []
dir_list= natsort.natsorted(list(os.walk(root_path))) # add this
for root, dirnames, filenames in dir_list:
for filename in fnmatch.filter(filenames, ext):
matches.append(os.path.join(root, filename))
class_name = os.path.basename(os.path.dirname(os.path.join(root, filename)))
if class_name not in class_names:
class_names.append(class_name)
classes.append(class_names.index(class_name))
print "There are ",len(matches), " files're found!!"
return matches, classes, class_names

Python OS.WALK Remove Directories

I'm trying to remove directories from os.walk (I don't need the files from those dirs)
My code:
def findit(root, exclude_files=[], exclude_dirs=[]):
exclude_files = (fnmatch.translate(i) for i in exclude_files)
exclude_files = '('+')|('.join(exclude_files)+')'
exclude_files = re.compile(exclude_files)
exclude_dirs = (os.path.normpath(i) for i in exclude_dirs)
exclude_dirs = (os.path.normcase(i) for i in exclude_dirs)
exclude_dirs = set(exclude_dirs)
return (os.path.join(r,f)
for r,_,f in os.walk(root)
if os.path.normpath(os.path.normcase(r)) not in exclude_dirs
for f in f
if not exclude_files.match(os.path.normcase(f)))
It works filtering the files, when I try to filter out c:/windows it will still show my files from windows dirs am I missing something?
filelist = list(findit('c:/',exclude_files = ['*.dll', '*.dat', '*.log', '*.exe'], exclude_dirs = ['c:/windows', 'c:/program files', 'c:/else']))

When filtering out directories, you are not preventing os.walk() from going into subdirectories.
You'll need to clear the dirs list for this to happen:
def findit(root, exclude_files=[], exclude_dirs=[]):
exclude_files = (fnmatch.translate(i) for i in exclude_files)
exclude_files = '('+')|('.join(exclude_files)+')'
exclude_files = re.compile(exclude_files)
exclude_dirs = (os.path.normpath(i) for i in exclude_dirs)
exclude_dirs = (os.path.normcase(i) for i in exclude_dirs)
exclude_dirs = set(exclude_dirs)
for current, dirs, files in os.walk(root):
if os.path.normpath(os.path.normcase(current)) in exclude_dirs:
# exclude this dir and subdirectories
dirs[:] = []
continue
for f in files:
if not exclude_files.match(os.path.normcase(f)):
yield os.path.join(current, f)
The dirs[:] = [] assignment clears the list in place; it removes all dirnames from the list. As this list is shared with os.walk() and the latter uses this list to subsequently visit sub-directories, this effectively stops os.walk() from visiting those subdirectories.

Reading the reply above made me wonder. Seemed to me the os.walk was missing and the root parameter did not seem to be used as needed. Also, the case of either of the optional arguments being the empty list should work. Suggesting a slight variation with less namespace look-up and exclude wildcards for directories at each directory level:
import os
import re
import fnmatch
import os.path
def findit(root, exclude_files=[], exclude_dirs=[], exclude_dirs_wc=[]):
"""Generate all files found under root excluding some.
Excluded files are given as a list of Unix shell-style wildcards
that exclude matches in each directory. Excluded directories are
assumed to be paths starting at root; no wildcards. Directory
wildcards at each level can be supplied.
"""
# Less namespace look-up.
join = os.path.join
normpath = os.path.normpath; normcase = os.path.normcase
#
def make_exclude_regex_from(lst):
if len(lst):
lst = (fnmatch.translate(i) for i in lst)
lst = "({})".format(")|(".join(lst))
lst = re.compile(lst)
return lst
#
exclude_files = make_exclude_regex_from(exclude_files)
exclude_dirs_wc = make_exclude_regex_from(exclude_dirs_wc)
if len(exclude_dirs):
exclude_dirs = (normpath(i) for i in exclude_dirs)
exclude_dirs = (normcase(i) for i in exclude_dirs)
exclude_dirs = set(exclude_dirs)
for current, dirs, files in os.walk(root):
current_dir = normpath(normcase(current))
if exclude_dirs and current_dir in exclude_dirs:
# Prune set of dirs to exclude.
exclude_dirs.discard(current_dir)
# Disregard sub-directories.
dirs[:] = [] # IN PLACE, since it is a loop var.
continue
if exclude_dirs_wc:
for dd in dirs[:]:
if exclude_dirs_wc.match(normcase(dd)):
dirs.remove(dd) # IN PLACE
if exclude_files:
for ff in files[:]:
if exclude_files.match(normcase(ff)):
files.remove(ff) # IN PLACE; also a loop var.
for f in files:
yield join(current,f)

You can use the keyword "continue" to skip the iteration while traversing using os.walk("pathName")
for dirpath, dirnames, filenames in os.walk(pathName):
# Write regular expression or a string to skip the desired folder
dirpath_pat = re.search(pattern, dirpath)
if dirpath_pat:
if dirpath_pat.group(0):
continue

Use fnmatch.filter to filter files by more than one possible file extension

Given the following piece of python code:
for root, dirs, files in os.walk(directory):
for filename in fnmatch.filter(files, '*.png'):
pass
How can I filter for more than one extension? In this special case I want to get all files ending with *.png, *.gif, *.jpg or *.jpeg.
For now I came up with
for root, dirs, files in os.walk(directory):
for extension in ['jpg', 'jpeg', 'gif', 'png']:
for filename in fnmatch.filter(files, '*.' + extension):
pass
But I think it is not very elegant and performant.
Someone has a better idea?

If you only need to check extensions (i.e. no further wildcards), why don't you simply use basic string operations?
for root, dirs, files in os.walk(directory):
for filename in files:
if filename.endswith(('.jpg', '.jpeg', '.gif', '.png')):
pass

I think your code is actually fine. If you want to touch every filename only once, define your own filtering function:
def is_image_file(filename, extensions=['.jpg', '.jpeg', '.gif', '.png']):
return any(filename.endswith(e) for e in extensions)
for root, dirs, files in os.walk(directory):
for filename in filter(is_image_file, files):
pass

I've been using this with a lot of success.
import fnmatch
import functools
import itertools
import os
# Remove the annotations if you're not on Python3
def find_files(dir_path: str=None, patterns: [str]=None) -> [str]:
"""
Returns a generator yielding files matching the given patterns
:type dir_path: str
:type patterns: [str]
:rtype : [str]
:param dir_path: Directory to search for files/directories under. Defaults to current dir.
:param patterns: Patterns of files to search for. Defaults to ["*"]. Example: ["*.json", "*.xml"]
"""
path = dir_path or "."
path_patterns = patterns or ["*"]
for root_dir, dir_names, file_names in os.walk(path):
filter_partial = functools.partial(fnmatch.filter, file_names)
for file_name in itertools.chain(*map(filter_partial, path_patterns)):
yield os.path.join(root_dir, file_name)
Examples:
for f in find_files(test_directory):
print(f)
yields:
.\test.json
.\test.xml
.\test.ini
.\test_helpers.py
.\__init__.py
Testing with multiple patterns:
for f in find_files(test_directory, ["*.xml", "*.json", "*.ini"]):
print(f)
yields:
.\test.json
.\test.xml
.\test.ini

This would be a better way, perhaps because you are not calling + repeatedly and using a tuple instead of list.
for root, dirs, files in os.walk(directory):
for extension in ('*.jpg', '*.jpeg', '*.gif', '*.png'):
for filename in fnmatch.filter(files, extension):
pass
A tuple is better because you are not going to modify the extension once you have created them. You are just using to iterate over them.

This isn't really elegant either, but it works:
for root, dirs, files in os.walk(directory):
for filename in fnmatch.filter(files, '*.png') + fnmatch.filter(files, '*.jpg') + fnmatch.filter(files, '*.jpeg') + fnmatch.filter(files, '*.gif'):
pass

Here is what I am using to filter files in apache log directories.
Here I exclude errors flles
rep_filters = [now.strftime("%Y%m%d")]
def files_filter(liste_fic, filters = rep_filters):
s = "(fic for fic in liste_fic if fic.find('error') < 0"
for filter in filters:
s += " and fic.find('%s') >=0 " % filter
s += ")"
return eval(s)

Please try this:
# pattern_list = ['*.jpg', '__.*']
def checkFilepatter(filename, pattern_list):
for pattern in pattern_list:
if fnmatch.fnmatch(filename, pattern):
return True
return False

You can use a list comprehension to check if my_file matches any of the file masks defined in patterns:
import fnmatch
my_file = 'my_precious.txt'
patterns = ('*.txt', '*.html', '*.mp3')
if [pat for pat in patterns if fnmatch.fnmatch(my_file, pat)]:
print('We have a match!')
else:
print('No match')

Internally, fnmatch users regular expressions. And there's a method that makes a regex from an fnmatch pattern — fnmatch.translate. This may also give a little speed-up.
import fnmatch
import os
import re
image_exts = ['jpg', 'jpeg', 'gif', 'png']
image_re = re.compile('|'.join(fnmatch.translate('*.' + e) for e in image_exts))
for root, dirs, files in os.walk(directory):
for filename in files:
if image_re.match(filename):
...

The clearest solution is:
import os
for root, dirs, files in os.walk(directory):
for filename in files:
_, ext = os.path.splitext(filename)
if ext in ['.jpg', '.jpeg', '.gif', '.png']:
...
or, using pathlib,
for path in pathlib.Path(directory).glob('**/*'):
if path.suffix in ['.jpg', '.jpeg', '.gif', '.png']:
...

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

A Python walker that can ignore directories - python

I need a file system walker that I could instruct to ignore traversing directories that I want to leave untouched, including all subdirectories below that branch. The os.walk and os.path.walk just don't do it.

Related

How to loop over folders and skip one

Python, copy only directories

How can I use os.walk or any other alternative to traverse folders recursively by the natural name order?

Python OS.WALK Remove Directories

Use fnmatch.filter to filter files by more than one possible file extension

Categories

Resources