How do you count subdirectories in a folder? - python

I figured out how to count directories in a folder, but not sure how I could edit my code to recursively count subdirectories. Any help would be appreciated.
This is my code so far.
def nestingLevel(path):
count = 0
for item in os.listdir(path):
if item[0] != '.':
n = os.path.join(path,item)
if os.path.isdir(n):
count += 1 + nestingLevel(n)
return count

I think you may want to use os.walk:
import os
def fcount(path):
count1 = 0
for root, dirs, files in os.walk(path):
count1 += len(dirs)
return count1
path = "/home/"
print fcount(path)

You can use a glob here - the ** pattern indicates a recursive glob. The trailing slash matches on directories, excluding other types of files.
from pathlib import Path
def recursive_subdir_count(path):
dirs = Path(path).glob('**/')
result = sum(1 for dir in dirs)
result -= 1 # discount `path` itself
Using / works on windows, macOS, and Linux, so don't worry about putting os.sep instead.
Beware of a weird edge case: shell globs typically exclude hidden directories, i.e. those which begin with a ., but pathlib includes those (it's a feature, not a bug: see issue26096). If you care about discounting hidden directories, filter them out in the expression when calling sum. Or, use the older module glob which excludes them by default.

If you want to count them all without the root, this will do it:
len([i for i, j, k in os.walk('.')])-1

Related

python glob read images with all types of extentions [duplicate]

Is there a better way to use glob.glob in python to get a list of multiple file types such as .txt, .mdown, and .markdown? Right now I have something like this:
projectFiles1 = glob.glob( os.path.join(projectDir, '*.txt') )
projectFiles2 = glob.glob( os.path.join(projectDir, '*.mdown') )
projectFiles3 = glob.glob( os.path.join(projectDir, '*.markdown') )
Maybe there is a better way, but how about:
import glob
types = ('*.pdf', '*.cpp') # the tuple of file types
files_grabbed = []
for files in types:
files_grabbed.extend(glob.glob(files))
# files_grabbed is the list of pdf and cpp files
Perhaps there is another way, so wait in case someone else comes up with a better answer.
glob returns a list: why not just run it multiple times and concatenate the results?
from glob import glob
project_files = glob('*.txt') + glob('*.mdown') + glob('*.markdown')
So many answers that suggest globbing as many times as number of extensions, I'd prefer globbing just once instead:
from pathlib import Path
files = (p.resolve() for p in Path(path).glob("**/*") if p.suffix in {".c", ".cc", ".cpp", ".hxx", ".h"})
from glob import glob
files = glob('*.gif')
files.extend(glob('*.png'))
files.extend(glob('*.jpg'))
print(files)
If you need to specify a path, loop over match patterns and keep the join inside the loop for simplicity:
from os.path import join
from glob import glob
files = []
for ext in ('*.gif', '*.png', '*.jpg'):
files.extend(glob(join("path/to/dir", ext)))
print(files)
Chain the results:
import itertools as it, glob
def multiple_file_types(*patterns):
return it.chain.from_iterable(glob.iglob(pattern) for pattern in patterns)
Then:
for filename in multiple_file_types("*.txt", "*.sql", "*.log"):
# do stuff
For example, for *.mp3 and *.flac on multiple folders, you can do:
mask = r'music/*/*.[mf][pl][3a]*'
glob.glob(mask)
The idea can be extended to more file extensions, but you have to check that the combinations won't match any other unwanted file extension you may have on those folders. So, be careful with this.
To automatically combine an arbitrary list of extensions into a single glob pattern, you can do the following:
def multi_extension_glob_mask(mask_base, *extensions):
mask_ext = ['[{}]'.format(''.join(set(c))) for c in zip(*extensions)]
if not mask_ext or len(set(len(e) for e in extensions)) > 1:
mask_ext.append('*')
return mask_base + ''.join(mask_ext)
mask = multi_extension_glob_mask('music/*/*.', 'mp3', 'flac', 'wma')
print(mask) # music/*/*.[mfw][pml][a3]*
with glob it is not possible. you can use only:
* matches everything
? matches any single character
[seq] matches any character in seq
[!seq] matches any character not in seq
use os.listdir and a regexp to check patterns:
for x in os.listdir('.'):
if re.match('.*\.txt|.*\.sql', x):
print x
While Python's default glob doesn't really follow after Bash's glob, you can do this with other libraries. We can enable braces in wcmatch's glob.
>>> from wcmatch import glob
>>> glob.glob('*.{md,ini}', flags=glob.BRACE)
['LICENSE.md', 'README.md', 'tox.ini']
You can even use extended glob patterns if that is your preference:
from wcmatch import glob
>>> glob.glob('*.#(md|ini)', flags=glob.EXTGLOB)
['LICENSE.md', 'README.md', 'tox.ini']
Same answer as #BPL (which is computationally efficient) but which can handle any glob pattern rather than extension:
import os
from fnmatch import fnmatch
folder = "path/to/folder/"
patterns = ("*.txt", "*.md", "*.markdown")
files = [f.path for f in os.scandir(folder) if any(fnmatch(f, p) for p in patterns)]
This solution is both efficient and convenient. It also closely matches the behavior of glob (see the documentation).
Note that this is simpler with the built-in package pathlib:
from pathlib import Path
folder = Path("/path/to/folder")
patterns = ("*.txt", "*.md", "*.markdown")
files = [f for f in folder.iterdir() if any(f.match(p) for p in patterns)]
Here is one-line list-comprehension variant of Pat's answer (which also includes that you wanted to glob in a specific project directory):
import os, glob
exts = ['*.txt', '*.mdown', '*.markdown']
files = [f for ext in exts for f in glob.glob(os.path.join(project_dir, ext))]
You loop over the extensions (for ext in exts), and then for each extension you take each file matching the glob pattern (for f in glob.glob(os.path.join(project_dir, ext)).
This solution is short, and without any unnecessary for-loops, nested list-comprehensions, or functions to clutter the code. Just pure, expressive, pythonic Zen.
This solution allows you to have a custom list of exts that can be changed without having to update your code. (This is always a good practice!)
The list-comprehension is the same used in Laurent's solution (which I've voted for). But I would argue that it is usually unnecessary to factor out a single line to a separate function, which is why I'm providing this as an alternative solution.
Bonus:
If you need to search not just a single directory, but also all sub-directories, you can pass recursive=True and use the multi-directory glob symbol ** 1:
files = [f for ext in exts
for f in glob.glob(os.path.join(project_dir, '**', ext), recursive=True)]
This will invoke glob.glob('<project_dir>/**/*.txt', recursive=True) and so on for each extension.
1 Technically, the ** glob symbol simply matches one or more characters including forward-slash / (unlike the singular * glob symbol). In practice, you just need to remember that as long as you surround ** with forward slashes (path separators), it matches zero or more directories.
Python 3
We can use pathlib; .glob still doesn't support globbing multiple arguments or within braces (as in POSIX shells) but we can easily filter the result.
For example, where you might ideally like to do:
# NOT VALID
Path(config_dir).glob("*.{ini,toml}")
# NOR IS
Path(config_dir).glob("*.ini", "*.toml")
you can do:
filter(lambda p: p.suffix in {".ini", ".toml"}, Path(config_dir).glob("*"))
which isn't too much worse.
A one-liner, Just for the hell of it..
folder = "C:\\multi_pattern_glob_one_liner"
files = [item for sublist in [glob.glob(folder + ext) for ext in ["/*.txt", "/*.bat"]] for item in sublist]
output:
['C:\\multi_pattern_glob_one_liner\\dummy_txt.txt', 'C:\\multi_pattern_glob_one_liner\\dummy_bat.bat']
files = glob.glob('*.txt')
files.extend(glob.glob('*.dat'))
By the results I've obtained from empirical tests, it turned out that glob.glob isn't the better way to filter out files by their extensions. Some of the reason are:
The globbing "language" does not allows perfect specification of multiple extension.
The former point results in obtaining incorrect results depending on file extensions.
The globbing method is empirically proven to be slower than most other methods.
Even if it's strange even other filesystems objects can have "extensions", folders too.
I've tested (for correcteness and efficiency in time) the following 4 different methods to filter out files by extensions and puts them in a list:
from glob import glob, iglob
from re import compile, findall
from os import walk
def glob_with_storage(args):
elements = ''.join([f'[{i}]' for i in args.extensions])
globs = f'{args.target}/**/*{elements}'
results = glob(globs, recursive=True)
return results
def glob_with_iteration(args):
elements = ''.join([f'[{i}]' for i in args.extensions])
globs = f'{args.target}/**/*{elements}'
results = [i for i in iglob(globs, recursive=True)]
return results
def walk_with_suffixes(args):
results = []
for r, d, f in walk(args.target):
for ff in f:
for e in args.extensions:
if ff.endswith(e):
results.append(path_join(r,ff))
break
return results
def walk_with_regs(args):
reg = compile('|'.join([f'{i}$' for i in args.extensions]))
results = []
for r, d, f in walk(args.target):
for ff in f:
if len(findall(reg,ff)):
results.append(path_join(r, ff))
return results
By running the code above on my laptop I obtained the following auto-explicative results.
Elapsed time for '7 times glob_with_storage()': 0.365023 seconds.
mean : 0.05214614
median : 0.051861
stdev : 0.001492152
min : 0.050864
max : 0.054853
Elapsed time for '7 times glob_with_iteration()': 0.360037 seconds.
mean : 0.05143386
median : 0.050864
stdev : 0.0007847381
min : 0.050864
max : 0.052859
Elapsed time for '7 times walk_with_suffixes()': 0.26529 seconds.
mean : 0.03789857
median : 0.037899
stdev : 0.0005759071
min : 0.036901
max : 0.038896
Elapsed time for '7 times walk_with_regs()': 0.290223 seconds.
mean : 0.04146043
median : 0.040891
stdev : 0.0007846776
min : 0.04089
max : 0.042885
Results sizes:
0 2451
1 2451
2 2446
3 2446
Differences between glob() and walk():
0 E:\x\y\z\venv\lib\python3.7\site-packages\Cython\Includes\numpy
1 E:\x\y\z\venv\lib\python3.7\site-packages\Cython\Utility\CppSupport.cpp
2 E:\x\y\z\venv\lib\python3.7\site-packages\future\moves\xmlrpc
3 E:\x\y\z\venv\lib\python3.7\site-packages\Cython\Includes\libcpp
4 E:\x\y\z\venv\lib\python3.7\site-packages\future\backports\xmlrpc
Elapsed time for 'main': 1.317424 seconds.
The fastest way to filter out files by extensions, happens even to be the ugliest one. Which is, nested for loops and string comparison using the endswith() method.
Moreover, as you can see, the globbing algorithms (with the pattern E:\x\y\z\**/*[py][pyc]) even with only 2 extension given (py and pyc) returns also incorrect results.
I have released Formic which implements multiple includes in a similar way to Apache Ant's FileSet and Globs.
The search can be implemented:
import formic
patterns = ["*.txt", "*.markdown", "*.mdown"]
fileset = formic.FileSet(directory=projectDir, include=patterns)
for file_name in fileset.qualified_files():
# Do something with file_name
Because the full Ant glob is implemented, you can include different directories with each pattern, so you could choose only those .txt files in one subdirectory, and the .markdown in another, for example:
patterns = [ "/unformatted/**/*.txt", "/formatted/**/*.mdown" ]
I hope this helps.
This is a Python 3.4+ pathlib solution:
exts = ".pdf", ".doc", ".xls", ".csv", ".ppt"
filelist = (str(i) for i in map(pathlib.Path, os.listdir(src)) if i.suffix.lower() in exts and not i.stem.startswith("~"))
Also it ignores all file names starting with ~.
After coming here for help, I made my own solution and wanted to share it. It's based on user2363986's answer, but I think this is more scalable. Meaning, that if you have 1000 extensions, the code will still look somewhat elegant.
from glob import glob
directoryPath = "C:\\temp\\*."
fileExtensions = [ "jpg", "jpeg", "png", "bmp", "gif" ]
listOfFiles = []
for extension in fileExtensions:
listOfFiles.extend( glob( directoryPath + extension ))
for file in listOfFiles:
print(file) # Or do other stuff
Not glob, but here's another way using a list comprehension:
extensions = 'txt mdown markdown'.split()
projectFiles = [f for f in os.listdir(projectDir)
if os.path.splitext(f)[1][1:] in extensions]
The following function _glob globs for multiple file extensions.
import glob
import os
def _glob(path, *exts):
"""Glob for multiple file extensions
Parameters
----------
path : str
A file name without extension, or directory name
exts : tuple
File extensions to glob for
Returns
-------
files : list
list of files matching extensions in exts in path
"""
path = os.path.join(path, "*") if os.path.isdir(path) else path + "*"
return [f for files in [glob.glob(path + ext) for ext in exts] for f in files]
files = _glob(projectDir, ".txt", ".mdown", ".markdown")
From previous answer
glob('*.jpg') + glob('*.png')
Here is a shorter one,
from glob import glob
extensions = ['jpg', 'png'] # to find these filename extensions
# Method 1: loop one by one and extend to the output list
output = []
[output.extend(glob(f'*.{name}')) for name in extensions]
print(output)
# Method 2: even shorter
# loop filename extension to glob() it and flatten it to a list
output = [p for p2 in [glob(f'*.{name}') for name in extensions] for p in p2]
print(output)
You can try to make a manual list comparing the extension of existing with those you require.
ext_list = ['gif','jpg','jpeg','png'];
file_list = []
for file in glob.glob('*.*'):
if file.rsplit('.',1)[1] in ext_list :
file_list.append(file)
import os
import glob
import operator
from functools import reduce
types = ('*.jpg', '*.png', '*.jpeg')
lazy_paths = (glob.glob(os.path.join('my_path', t)) for t in types)
paths = reduce(operator.add, lazy_paths, [])
https://docs.python.org/3.5/library/functools.html#functools.reduce
https://docs.python.org/3.5/library/operator.html#operator.add
To glob multiple file types, you need to call glob() function several times in a loop. Since this function returns a list, you need to concatenate the lists.
For instance, this function do the job:
import glob
import os
def glob_filetypes(root_dir, *patterns):
return [path
for pattern in patterns
for path in glob.glob(os.path.join(root_dir, pattern))]
Simple usage:
project_dir = "path/to/project/dir"
for path in sorted(glob_filetypes(project_dir, '*.txt', '*.mdown', '*.markdown')):
print(path)
You can also use glob.iglob() to have an iterator:
Return an iterator which yields the same values as glob() without actually storing them all simultaneously.
def iglob_filetypes(root_dir, *patterns):
return (path
for pattern in patterns
for path in glob.iglob(os.path.join(root_dir, pattern)))
One glob, many extensions... but imperfect solution (might match other files).
filetypes = ['tif', 'jpg']
filetypes = zip(*[list(ft) for ft in filetypes])
filetypes = ["".join(ch) for ch in filetypes]
filetypes = ["[%s]" % ch for ch in filetypes]
filetypes = "".join(filetypes) + "*"
print(filetypes)
# => [tj][ip][fg]*
glob.glob("/path/to/*.%s" % filetypes)
I had the same issue and this is what I came up with
import os, sys, re
#without glob
src_dir = '/mnt/mypics/'
src_pics = []
ext = re.compile('.*\.(|{}|)$'.format('|'.join(['png', 'jpeg', 'jpg']).encode('utf-8')))
for root, dirnames, filenames in os.walk(src_dir):
for filename in filter(lambda name:ext.search(name),filenames):
src_pics.append(os.path.join(root, filename))
Use a list of extension and iterate through
from os.path import join
from glob import glob
files = []
extensions = ['*.gif', '*.png', '*.jpg']
for ext in extensions:
files.extend(glob(join("path/to/dir", ext)))
print(files)
You could use filter:
import os
import glob
projectFiles = filter(
lambda x: os.path.splitext(x)[1] in [".txt", ".mdown", ".markdown"]
glob.glob(os.path.join(projectDir, "*"))
)
You could also use reduce() like so:
import glob
file_types = ['*.txt', '*.mdown', '*.markdown']
project_files = reduce(lambda list1, list2: list1 + list2, (glob.glob(t) for t in file_types))
this creates a list from glob.glob() for each pattern and reduces them to a single list.
Yet another solution (use glob to get paths using multiple match patterns and combine all paths into a single list using reduce and add):
import functools, glob, operator
paths = functools.reduce(operator.add, [glob.glob(pattern) for pattern in [
"path1/*.ext1",
"path2/*.ext2"]])
If you use pathlib try this:
import pathlib
extensions = ['.py', '.txt']
root_dir = './test/'
files = filter(lambda p: p.suffix in extensions, pathlib.Path(root_dir).glob('**/*'))
print(list(files))

Using glob to get txt and png files only from folder [duplicate]

Is there a better way to use glob.glob in python to get a list of multiple file types such as .txt, .mdown, and .markdown? Right now I have something like this:
projectFiles1 = glob.glob( os.path.join(projectDir, '*.txt') )
projectFiles2 = glob.glob( os.path.join(projectDir, '*.mdown') )
projectFiles3 = glob.glob( os.path.join(projectDir, '*.markdown') )
Maybe there is a better way, but how about:
import glob
types = ('*.pdf', '*.cpp') # the tuple of file types
files_grabbed = []
for files in types:
files_grabbed.extend(glob.glob(files))
# files_grabbed is the list of pdf and cpp files
Perhaps there is another way, so wait in case someone else comes up with a better answer.
glob returns a list: why not just run it multiple times and concatenate the results?
from glob import glob
project_files = glob('*.txt') + glob('*.mdown') + glob('*.markdown')
So many answers that suggest globbing as many times as number of extensions, I'd prefer globbing just once instead:
from pathlib import Path
files = (p.resolve() for p in Path(path).glob("**/*") if p.suffix in {".c", ".cc", ".cpp", ".hxx", ".h"})
from glob import glob
files = glob('*.gif')
files.extend(glob('*.png'))
files.extend(glob('*.jpg'))
print(files)
If you need to specify a path, loop over match patterns and keep the join inside the loop for simplicity:
from os.path import join
from glob import glob
files = []
for ext in ('*.gif', '*.png', '*.jpg'):
files.extend(glob(join("path/to/dir", ext)))
print(files)
Chain the results:
import itertools as it, glob
def multiple_file_types(*patterns):
return it.chain.from_iterable(glob.iglob(pattern) for pattern in patterns)
Then:
for filename in multiple_file_types("*.txt", "*.sql", "*.log"):
# do stuff
For example, for *.mp3 and *.flac on multiple folders, you can do:
mask = r'music/*/*.[mf][pl][3a]*'
glob.glob(mask)
The idea can be extended to more file extensions, but you have to check that the combinations won't match any other unwanted file extension you may have on those folders. So, be careful with this.
To automatically combine an arbitrary list of extensions into a single glob pattern, you can do the following:
def multi_extension_glob_mask(mask_base, *extensions):
mask_ext = ['[{}]'.format(''.join(set(c))) for c in zip(*extensions)]
if not mask_ext or len(set(len(e) for e in extensions)) > 1:
mask_ext.append('*')
return mask_base + ''.join(mask_ext)
mask = multi_extension_glob_mask('music/*/*.', 'mp3', 'flac', 'wma')
print(mask) # music/*/*.[mfw][pml][a3]*
with glob it is not possible. you can use only:
* matches everything
? matches any single character
[seq] matches any character in seq
[!seq] matches any character not in seq
use os.listdir and a regexp to check patterns:
for x in os.listdir('.'):
if re.match('.*\.txt|.*\.sql', x):
print x
While Python's default glob doesn't really follow after Bash's glob, you can do this with other libraries. We can enable braces in wcmatch's glob.
>>> from wcmatch import glob
>>> glob.glob('*.{md,ini}', flags=glob.BRACE)
['LICENSE.md', 'README.md', 'tox.ini']
You can even use extended glob patterns if that is your preference:
from wcmatch import glob
>>> glob.glob('*.#(md|ini)', flags=glob.EXTGLOB)
['LICENSE.md', 'README.md', 'tox.ini']
Same answer as #BPL (which is computationally efficient) but which can handle any glob pattern rather than extension:
import os
from fnmatch import fnmatch
folder = "path/to/folder/"
patterns = ("*.txt", "*.md", "*.markdown")
files = [f.path for f in os.scandir(folder) if any(fnmatch(f, p) for p in patterns)]
This solution is both efficient and convenient. It also closely matches the behavior of glob (see the documentation).
Note that this is simpler with the built-in package pathlib:
from pathlib import Path
folder = Path("/path/to/folder")
patterns = ("*.txt", "*.md", "*.markdown")
files = [f for f in folder.iterdir() if any(f.match(p) for p in patterns)]
Here is one-line list-comprehension variant of Pat's answer (which also includes that you wanted to glob in a specific project directory):
import os, glob
exts = ['*.txt', '*.mdown', '*.markdown']
files = [f for ext in exts for f in glob.glob(os.path.join(project_dir, ext))]
You loop over the extensions (for ext in exts), and then for each extension you take each file matching the glob pattern (for f in glob.glob(os.path.join(project_dir, ext)).
This solution is short, and without any unnecessary for-loops, nested list-comprehensions, or functions to clutter the code. Just pure, expressive, pythonic Zen.
This solution allows you to have a custom list of exts that can be changed without having to update your code. (This is always a good practice!)
The list-comprehension is the same used in Laurent's solution (which I've voted for). But I would argue that it is usually unnecessary to factor out a single line to a separate function, which is why I'm providing this as an alternative solution.
Bonus:
If you need to search not just a single directory, but also all sub-directories, you can pass recursive=True and use the multi-directory glob symbol ** 1:
files = [f for ext in exts
for f in glob.glob(os.path.join(project_dir, '**', ext), recursive=True)]
This will invoke glob.glob('<project_dir>/**/*.txt', recursive=True) and so on for each extension.
1 Technically, the ** glob symbol simply matches one or more characters including forward-slash / (unlike the singular * glob symbol). In practice, you just need to remember that as long as you surround ** with forward slashes (path separators), it matches zero or more directories.
Python 3
We can use pathlib; .glob still doesn't support globbing multiple arguments or within braces (as in POSIX shells) but we can easily filter the result.
For example, where you might ideally like to do:
# NOT VALID
Path(config_dir).glob("*.{ini,toml}")
# NOR IS
Path(config_dir).glob("*.ini", "*.toml")
you can do:
filter(lambda p: p.suffix in {".ini", ".toml"}, Path(config_dir).glob("*"))
which isn't too much worse.
A one-liner, Just for the hell of it..
folder = "C:\\multi_pattern_glob_one_liner"
files = [item for sublist in [glob.glob(folder + ext) for ext in ["/*.txt", "/*.bat"]] for item in sublist]
output:
['C:\\multi_pattern_glob_one_liner\\dummy_txt.txt', 'C:\\multi_pattern_glob_one_liner\\dummy_bat.bat']
files = glob.glob('*.txt')
files.extend(glob.glob('*.dat'))
By the results I've obtained from empirical tests, it turned out that glob.glob isn't the better way to filter out files by their extensions. Some of the reason are:
The globbing "language" does not allows perfect specification of multiple extension.
The former point results in obtaining incorrect results depending on file extensions.
The globbing method is empirically proven to be slower than most other methods.
Even if it's strange even other filesystems objects can have "extensions", folders too.
I've tested (for correcteness and efficiency in time) the following 4 different methods to filter out files by extensions and puts them in a list:
from glob import glob, iglob
from re import compile, findall
from os import walk
def glob_with_storage(args):
elements = ''.join([f'[{i}]' for i in args.extensions])
globs = f'{args.target}/**/*{elements}'
results = glob(globs, recursive=True)
return results
def glob_with_iteration(args):
elements = ''.join([f'[{i}]' for i in args.extensions])
globs = f'{args.target}/**/*{elements}'
results = [i for i in iglob(globs, recursive=True)]
return results
def walk_with_suffixes(args):
results = []
for r, d, f in walk(args.target):
for ff in f:
for e in args.extensions:
if ff.endswith(e):
results.append(path_join(r,ff))
break
return results
def walk_with_regs(args):
reg = compile('|'.join([f'{i}$' for i in args.extensions]))
results = []
for r, d, f in walk(args.target):
for ff in f:
if len(findall(reg,ff)):
results.append(path_join(r, ff))
return results
By running the code above on my laptop I obtained the following auto-explicative results.
Elapsed time for '7 times glob_with_storage()': 0.365023 seconds.
mean : 0.05214614
median : 0.051861
stdev : 0.001492152
min : 0.050864
max : 0.054853
Elapsed time for '7 times glob_with_iteration()': 0.360037 seconds.
mean : 0.05143386
median : 0.050864
stdev : 0.0007847381
min : 0.050864
max : 0.052859
Elapsed time for '7 times walk_with_suffixes()': 0.26529 seconds.
mean : 0.03789857
median : 0.037899
stdev : 0.0005759071
min : 0.036901
max : 0.038896
Elapsed time for '7 times walk_with_regs()': 0.290223 seconds.
mean : 0.04146043
median : 0.040891
stdev : 0.0007846776
min : 0.04089
max : 0.042885
Results sizes:
0 2451
1 2451
2 2446
3 2446
Differences between glob() and walk():
0 E:\x\y\z\venv\lib\python3.7\site-packages\Cython\Includes\numpy
1 E:\x\y\z\venv\lib\python3.7\site-packages\Cython\Utility\CppSupport.cpp
2 E:\x\y\z\venv\lib\python3.7\site-packages\future\moves\xmlrpc
3 E:\x\y\z\venv\lib\python3.7\site-packages\Cython\Includes\libcpp
4 E:\x\y\z\venv\lib\python3.7\site-packages\future\backports\xmlrpc
Elapsed time for 'main': 1.317424 seconds.
The fastest way to filter out files by extensions, happens even to be the ugliest one. Which is, nested for loops and string comparison using the endswith() method.
Moreover, as you can see, the globbing algorithms (with the pattern E:\x\y\z\**/*[py][pyc]) even with only 2 extension given (py and pyc) returns also incorrect results.
I have released Formic which implements multiple includes in a similar way to Apache Ant's FileSet and Globs.
The search can be implemented:
import formic
patterns = ["*.txt", "*.markdown", "*.mdown"]
fileset = formic.FileSet(directory=projectDir, include=patterns)
for file_name in fileset.qualified_files():
# Do something with file_name
Because the full Ant glob is implemented, you can include different directories with each pattern, so you could choose only those .txt files in one subdirectory, and the .markdown in another, for example:
patterns = [ "/unformatted/**/*.txt", "/formatted/**/*.mdown" ]
I hope this helps.
This is a Python 3.4+ pathlib solution:
exts = ".pdf", ".doc", ".xls", ".csv", ".ppt"
filelist = (str(i) for i in map(pathlib.Path, os.listdir(src)) if i.suffix.lower() in exts and not i.stem.startswith("~"))
Also it ignores all file names starting with ~.
After coming here for help, I made my own solution and wanted to share it. It's based on user2363986's answer, but I think this is more scalable. Meaning, that if you have 1000 extensions, the code will still look somewhat elegant.
from glob import glob
directoryPath = "C:\\temp\\*."
fileExtensions = [ "jpg", "jpeg", "png", "bmp", "gif" ]
listOfFiles = []
for extension in fileExtensions:
listOfFiles.extend( glob( directoryPath + extension ))
for file in listOfFiles:
print(file) # Or do other stuff
Not glob, but here's another way using a list comprehension:
extensions = 'txt mdown markdown'.split()
projectFiles = [f for f in os.listdir(projectDir)
if os.path.splitext(f)[1][1:] in extensions]
The following function _glob globs for multiple file extensions.
import glob
import os
def _glob(path, *exts):
"""Glob for multiple file extensions
Parameters
----------
path : str
A file name without extension, or directory name
exts : tuple
File extensions to glob for
Returns
-------
files : list
list of files matching extensions in exts in path
"""
path = os.path.join(path, "*") if os.path.isdir(path) else path + "*"
return [f for files in [glob.glob(path + ext) for ext in exts] for f in files]
files = _glob(projectDir, ".txt", ".mdown", ".markdown")
From previous answer
glob('*.jpg') + glob('*.png')
Here is a shorter one,
from glob import glob
extensions = ['jpg', 'png'] # to find these filename extensions
# Method 1: loop one by one and extend to the output list
output = []
[output.extend(glob(f'*.{name}')) for name in extensions]
print(output)
# Method 2: even shorter
# loop filename extension to glob() it and flatten it to a list
output = [p for p2 in [glob(f'*.{name}') for name in extensions] for p in p2]
print(output)
You can try to make a manual list comparing the extension of existing with those you require.
ext_list = ['gif','jpg','jpeg','png'];
file_list = []
for file in glob.glob('*.*'):
if file.rsplit('.',1)[1] in ext_list :
file_list.append(file)
import os
import glob
import operator
from functools import reduce
types = ('*.jpg', '*.png', '*.jpeg')
lazy_paths = (glob.glob(os.path.join('my_path', t)) for t in types)
paths = reduce(operator.add, lazy_paths, [])
https://docs.python.org/3.5/library/functools.html#functools.reduce
https://docs.python.org/3.5/library/operator.html#operator.add
To glob multiple file types, you need to call glob() function several times in a loop. Since this function returns a list, you need to concatenate the lists.
For instance, this function do the job:
import glob
import os
def glob_filetypes(root_dir, *patterns):
return [path
for pattern in patterns
for path in glob.glob(os.path.join(root_dir, pattern))]
Simple usage:
project_dir = "path/to/project/dir"
for path in sorted(glob_filetypes(project_dir, '*.txt', '*.mdown', '*.markdown')):
print(path)
You can also use glob.iglob() to have an iterator:
Return an iterator which yields the same values as glob() without actually storing them all simultaneously.
def iglob_filetypes(root_dir, *patterns):
return (path
for pattern in patterns
for path in glob.iglob(os.path.join(root_dir, pattern)))
One glob, many extensions... but imperfect solution (might match other files).
filetypes = ['tif', 'jpg']
filetypes = zip(*[list(ft) for ft in filetypes])
filetypes = ["".join(ch) for ch in filetypes]
filetypes = ["[%s]" % ch for ch in filetypes]
filetypes = "".join(filetypes) + "*"
print(filetypes)
# => [tj][ip][fg]*
glob.glob("/path/to/*.%s" % filetypes)
I had the same issue and this is what I came up with
import os, sys, re
#without glob
src_dir = '/mnt/mypics/'
src_pics = []
ext = re.compile('.*\.(|{}|)$'.format('|'.join(['png', 'jpeg', 'jpg']).encode('utf-8')))
for root, dirnames, filenames in os.walk(src_dir):
for filename in filter(lambda name:ext.search(name),filenames):
src_pics.append(os.path.join(root, filename))
Use a list of extension and iterate through
from os.path import join
from glob import glob
files = []
extensions = ['*.gif', '*.png', '*.jpg']
for ext in extensions:
files.extend(glob(join("path/to/dir", ext)))
print(files)
You could use filter:
import os
import glob
projectFiles = filter(
lambda x: os.path.splitext(x)[1] in [".txt", ".mdown", ".markdown"]
glob.glob(os.path.join(projectDir, "*"))
)
You could also use reduce() like so:
import glob
file_types = ['*.txt', '*.mdown', '*.markdown']
project_files = reduce(lambda list1, list2: list1 + list2, (glob.glob(t) for t in file_types))
this creates a list from glob.glob() for each pattern and reduces them to a single list.
Yet another solution (use glob to get paths using multiple match patterns and combine all paths into a single list using reduce and add):
import functools, glob, operator
paths = functools.reduce(operator.add, [glob.glob(pattern) for pattern in [
"path1/*.ext1",
"path2/*.ext2"]])
If you use pathlib try this:
import pathlib
extensions = ['.py', '.txt']
root_dir = './test/'
files = filter(lambda p: p.suffix in extensions, pathlib.Path(root_dir).glob('**/*'))
print(list(files))

Count all files in all folders/subfolders with Python

Which is the most efficient way to count all files in all folders and subfolders in Python? I want to use this on Linux systems.
Example output:
(Path files)
/ 2
/bin 100
/boot 20
/boot/efi/EFI/redhat 1
....
/root 34
....
Paths without a file should be ignored.
Thanks.
You can do it with os.walk();
import os
for root, dirs, files in os.walk('/some/path'):
if files:
print('{0} {1}'.format(root, len(files)))
Note that this will also include hidden files, i.e. those that begin with a dot (.).
import os
print [(item[0], len(item[2])) for item in os.walk('/path') if item[2]]
It returns a list of tuples of folders/subfolders and files count in /path.
OR
import os
for item in os.walk('/path'):
if item[2]:
print item[0], len(item[2])
It prints folders/subfolders and files count in /path.
If you want try faster solution, then you had to try to combine:
os.scandir() # from python 3.5.2
iterate recursively and use:
from itertools import count
counter = count()
counter.next() # returns at first 0, next 1, 2, 3 ...
if counter.next() > 1000:
print 'dir with file count over 1000' # and use continue in for loop
Maybe that will be faster, because I think in os.walk function are unnecessary things for you.

get all folders (os.walk) that are older than x days, delete

How can I concisely express "get all folders older than x days"
I have a method getOldDirs(dirPath, olderThanDays), it must walk through a given root folder and return a list of folders that are older than say 7 days.
I call the above function from another function cleanOldFolders(). cleanOldFolders() will delete those folders similar to "rm -Rf
code that I have, how can I modify the loops concisely:
"""
Clean oldFolders
"""
def cleanOldFolders(self):
pathString = self.folderRoot + '/' + self.configMode + '/' + self.appId
oldDirList = self.getOldDirs(pathString, 7);
# Notify user that the following folders are deleted
# remove all old dirs perhaps using shutil.removetree for each folder oldDirList, rm -Rf
return
Get old dirs:
"""
get all subfolders under dirPath older than olderThanDays
"""
def getOldDirs(self,dirPath, olderThanDays):
# What is the concise way of expressing Get me list of all dir/subdirs from "dirPath" that are older than "olderThanDays"
# I know I have to use os.walk,
# I want a concise loop like this - but should recurse using os.walk
a = [os.path.join(dirPath, myfile) for myfile in os.listdir(dirPath)
if (os.path.isdir(os.path.join(dirPath, myfile)) and
(self.isOlder(os.path.join(dirPath, myfile), olderThanDays))
)]
# for root, dirs, files in os.walk(dirPath):
# for name in dirs:
# print os.path.join(root, name)
return a
One of the nice things about os.walk() is that it does the recursing for you. For its usage in your application it's important to specify the optional keyword argument topdown as False because its default is True and os.rmdir() won't delete non-empty directories.
This means your code will need to delete all the files and subdirectories in each subdirectory it encounters before removing the subdirectory itself. To facilitate doing that, the directory list getOldDirs() returns should be in the order that the subdirectories need to be deleted in.
It's also important to note that in the following, the directory's age is calculated in fractional, not whole, days, which means that seconds count and that one that was only say, 6 days and 23 hours and 59 seconds old won't get put on the list to be deleted even though it is only two seconds away from being old enough.
import os
import time
def getOldDirs(self, dirPath, olderThanDays):
"""
return a list of all subfolders under dirPath older than olderThanDays
"""
olderThanDays *= 86400 # convert days to seconds
present = time.time()
for root, dirs, files in os.walk(dirPath, topdown=False):
for name in dirs:
subDirPath = os.path.join(root, name)
if (present - os.path.getmtime(subDirPath)) > olderThanDays:
yield subDirPath
This should be a starting point.
import os
from time import time as _time
SEVEN_DAYS = 60*60*24*7
def get_old_dirs(dir_path, older_than=SEVEN_DAYS):
time_now = _time()
for path, folders, files in os.walk(dir_path):
for folder in folders:
folder_path = os.path.join(path, folder)
if (time_now - os.path.getmtime(folder_path)) > older_than:
yield folder_path
list_of_folders = list(get_old_dirs("/some/path"))
Also, if you don't want to walk into folders that are older than older_than days (because you're going to delete them) you can prune the search tree be removing folder names from the folders list
def get_old_dirs(dir_path, older_than=SEVEN_DAYS):
time_now = _time()
for path, folders, files in os.walk(dir_path):
for folder in folders[:]:
folder_path = os.path.join(path, folder)
if (time_now - os.path.getmtime(folder_path)) > older_than:
yield folder_path
folders.remove(folder)
This uses os.walk and gets you the list of files older than 7 days
import os
from datetime import date
old_dirs = []
today = date.today()
for root, dirs, files in os.walk(start_path):
for name in dirs:
filedate = date.fromtimestamp(os.path.getmtime(os.path.join(root, name)))
if (today - filedate).days > 7:
old_dirs.append(name)

Python glob multiple filetypes

Is there a better way to use glob.glob in python to get a list of multiple file types such as .txt, .mdown, and .markdown? Right now I have something like this:
projectFiles1 = glob.glob( os.path.join(projectDir, '*.txt') )
projectFiles2 = glob.glob( os.path.join(projectDir, '*.mdown') )
projectFiles3 = glob.glob( os.path.join(projectDir, '*.markdown') )
Maybe there is a better way, but how about:
import glob
types = ('*.pdf', '*.cpp') # the tuple of file types
files_grabbed = []
for files in types:
files_grabbed.extend(glob.glob(files))
# files_grabbed is the list of pdf and cpp files
Perhaps there is another way, so wait in case someone else comes up with a better answer.
glob returns a list: why not just run it multiple times and concatenate the results?
from glob import glob
project_files = glob('*.txt') + glob('*.mdown') + glob('*.markdown')
So many answers that suggest globbing as many times as number of extensions, I'd prefer globbing just once instead:
from pathlib import Path
files = (p.resolve() for p in Path(path).glob("**/*") if p.suffix in {".c", ".cc", ".cpp", ".hxx", ".h"})
from glob import glob
files = glob('*.gif')
files.extend(glob('*.png'))
files.extend(glob('*.jpg'))
print(files)
If you need to specify a path, loop over match patterns and keep the join inside the loop for simplicity:
from os.path import join
from glob import glob
files = []
for ext in ('*.gif', '*.png', '*.jpg'):
files.extend(glob(join("path/to/dir", ext)))
print(files)
Chain the results:
import itertools as it, glob
def multiple_file_types(*patterns):
return it.chain.from_iterable(glob.iglob(pattern) for pattern in patterns)
Then:
for filename in multiple_file_types("*.txt", "*.sql", "*.log"):
# do stuff
For example, for *.mp3 and *.flac on multiple folders, you can do:
mask = r'music/*/*.[mf][pl][3a]*'
glob.glob(mask)
The idea can be extended to more file extensions, but you have to check that the combinations won't match any other unwanted file extension you may have on those folders. So, be careful with this.
To automatically combine an arbitrary list of extensions into a single glob pattern, you can do the following:
def multi_extension_glob_mask(mask_base, *extensions):
mask_ext = ['[{}]'.format(''.join(set(c))) for c in zip(*extensions)]
if not mask_ext or len(set(len(e) for e in extensions)) > 1:
mask_ext.append('*')
return mask_base + ''.join(mask_ext)
mask = multi_extension_glob_mask('music/*/*.', 'mp3', 'flac', 'wma')
print(mask) # music/*/*.[mfw][pml][a3]*
with glob it is not possible. you can use only:
* matches everything
? matches any single character
[seq] matches any character in seq
[!seq] matches any character not in seq
use os.listdir and a regexp to check patterns:
for x in os.listdir('.'):
if re.match('.*\.txt|.*\.sql', x):
print x
While Python's default glob doesn't really follow after Bash's glob, you can do this with other libraries. We can enable braces in wcmatch's glob.
>>> from wcmatch import glob
>>> glob.glob('*.{md,ini}', flags=glob.BRACE)
['LICENSE.md', 'README.md', 'tox.ini']
You can even use extended glob patterns if that is your preference:
from wcmatch import glob
>>> glob.glob('*.#(md|ini)', flags=glob.EXTGLOB)
['LICENSE.md', 'README.md', 'tox.ini']
Same answer as #BPL (which is computationally efficient) but which can handle any glob pattern rather than extension:
import os
from fnmatch import fnmatch
folder = "path/to/folder/"
patterns = ("*.txt", "*.md", "*.markdown")
files = [f.path for f in os.scandir(folder) if any(fnmatch(f, p) for p in patterns)]
This solution is both efficient and convenient. It also closely matches the behavior of glob (see the documentation).
Note that this is simpler with the built-in package pathlib:
from pathlib import Path
folder = Path("/path/to/folder")
patterns = ("*.txt", "*.md", "*.markdown")
files = [f for f in folder.iterdir() if any(f.match(p) for p in patterns)]
Here is one-line list-comprehension variant of Pat's answer (which also includes that you wanted to glob in a specific project directory):
import os, glob
exts = ['*.txt', '*.mdown', '*.markdown']
files = [f for ext in exts for f in glob.glob(os.path.join(project_dir, ext))]
You loop over the extensions (for ext in exts), and then for each extension you take each file matching the glob pattern (for f in glob.glob(os.path.join(project_dir, ext)).
This solution is short, and without any unnecessary for-loops, nested list-comprehensions, or functions to clutter the code. Just pure, expressive, pythonic Zen.
This solution allows you to have a custom list of exts that can be changed without having to update your code. (This is always a good practice!)
The list-comprehension is the same used in Laurent's solution (which I've voted for). But I would argue that it is usually unnecessary to factor out a single line to a separate function, which is why I'm providing this as an alternative solution.
Bonus:
If you need to search not just a single directory, but also all sub-directories, you can pass recursive=True and use the multi-directory glob symbol ** 1:
files = [f for ext in exts
for f in glob.glob(os.path.join(project_dir, '**', ext), recursive=True)]
This will invoke glob.glob('<project_dir>/**/*.txt', recursive=True) and so on for each extension.
1 Technically, the ** glob symbol simply matches one or more characters including forward-slash / (unlike the singular * glob symbol). In practice, you just need to remember that as long as you surround ** with forward slashes (path separators), it matches zero or more directories.
Python 3
We can use pathlib; .glob still doesn't support globbing multiple arguments or within braces (as in POSIX shells) but we can easily filter the result.
For example, where you might ideally like to do:
# NOT VALID
Path(config_dir).glob("*.{ini,toml}")
# NOR IS
Path(config_dir).glob("*.ini", "*.toml")
you can do:
filter(lambda p: p.suffix in {".ini", ".toml"}, Path(config_dir).glob("*"))
which isn't too much worse.
A one-liner, Just for the hell of it..
folder = "C:\\multi_pattern_glob_one_liner"
files = [item for sublist in [glob.glob(folder + ext) for ext in ["/*.txt", "/*.bat"]] for item in sublist]
output:
['C:\\multi_pattern_glob_one_liner\\dummy_txt.txt', 'C:\\multi_pattern_glob_one_liner\\dummy_bat.bat']
files = glob.glob('*.txt')
files.extend(glob.glob('*.dat'))
By the results I've obtained from empirical tests, it turned out that glob.glob isn't the better way to filter out files by their extensions. Some of the reason are:
The globbing "language" does not allows perfect specification of multiple extension.
The former point results in obtaining incorrect results depending on file extensions.
The globbing method is empirically proven to be slower than most other methods.
Even if it's strange even other filesystems objects can have "extensions", folders too.
I've tested (for correcteness and efficiency in time) the following 4 different methods to filter out files by extensions and puts them in a list:
from glob import glob, iglob
from re import compile, findall
from os import walk
def glob_with_storage(args):
elements = ''.join([f'[{i}]' for i in args.extensions])
globs = f'{args.target}/**/*{elements}'
results = glob(globs, recursive=True)
return results
def glob_with_iteration(args):
elements = ''.join([f'[{i}]' for i in args.extensions])
globs = f'{args.target}/**/*{elements}'
results = [i for i in iglob(globs, recursive=True)]
return results
def walk_with_suffixes(args):
results = []
for r, d, f in walk(args.target):
for ff in f:
for e in args.extensions:
if ff.endswith(e):
results.append(path_join(r,ff))
break
return results
def walk_with_regs(args):
reg = compile('|'.join([f'{i}$' for i in args.extensions]))
results = []
for r, d, f in walk(args.target):
for ff in f:
if len(findall(reg,ff)):
results.append(path_join(r, ff))
return results
By running the code above on my laptop I obtained the following auto-explicative results.
Elapsed time for '7 times glob_with_storage()': 0.365023 seconds.
mean : 0.05214614
median : 0.051861
stdev : 0.001492152
min : 0.050864
max : 0.054853
Elapsed time for '7 times glob_with_iteration()': 0.360037 seconds.
mean : 0.05143386
median : 0.050864
stdev : 0.0007847381
min : 0.050864
max : 0.052859
Elapsed time for '7 times walk_with_suffixes()': 0.26529 seconds.
mean : 0.03789857
median : 0.037899
stdev : 0.0005759071
min : 0.036901
max : 0.038896
Elapsed time for '7 times walk_with_regs()': 0.290223 seconds.
mean : 0.04146043
median : 0.040891
stdev : 0.0007846776
min : 0.04089
max : 0.042885
Results sizes:
0 2451
1 2451
2 2446
3 2446
Differences between glob() and walk():
0 E:\x\y\z\venv\lib\python3.7\site-packages\Cython\Includes\numpy
1 E:\x\y\z\venv\lib\python3.7\site-packages\Cython\Utility\CppSupport.cpp
2 E:\x\y\z\venv\lib\python3.7\site-packages\future\moves\xmlrpc
3 E:\x\y\z\venv\lib\python3.7\site-packages\Cython\Includes\libcpp
4 E:\x\y\z\venv\lib\python3.7\site-packages\future\backports\xmlrpc
Elapsed time for 'main': 1.317424 seconds.
The fastest way to filter out files by extensions, happens even to be the ugliest one. Which is, nested for loops and string comparison using the endswith() method.
Moreover, as you can see, the globbing algorithms (with the pattern E:\x\y\z\**/*[py][pyc]) even with only 2 extension given (py and pyc) returns also incorrect results.
I have released Formic which implements multiple includes in a similar way to Apache Ant's FileSet and Globs.
The search can be implemented:
import formic
patterns = ["*.txt", "*.markdown", "*.mdown"]
fileset = formic.FileSet(directory=projectDir, include=patterns)
for file_name in fileset.qualified_files():
# Do something with file_name
Because the full Ant glob is implemented, you can include different directories with each pattern, so you could choose only those .txt files in one subdirectory, and the .markdown in another, for example:
patterns = [ "/unformatted/**/*.txt", "/formatted/**/*.mdown" ]
I hope this helps.
This is a Python 3.4+ pathlib solution:
exts = ".pdf", ".doc", ".xls", ".csv", ".ppt"
filelist = (str(i) for i in map(pathlib.Path, os.listdir(src)) if i.suffix.lower() in exts and not i.stem.startswith("~"))
Also it ignores all file names starting with ~.
After coming here for help, I made my own solution and wanted to share it. It's based on user2363986's answer, but I think this is more scalable. Meaning, that if you have 1000 extensions, the code will still look somewhat elegant.
from glob import glob
directoryPath = "C:\\temp\\*."
fileExtensions = [ "jpg", "jpeg", "png", "bmp", "gif" ]
listOfFiles = []
for extension in fileExtensions:
listOfFiles.extend( glob( directoryPath + extension ))
for file in listOfFiles:
print(file) # Or do other stuff
Not glob, but here's another way using a list comprehension:
extensions = 'txt mdown markdown'.split()
projectFiles = [f for f in os.listdir(projectDir)
if os.path.splitext(f)[1][1:] in extensions]
The following function _glob globs for multiple file extensions.
import glob
import os
def _glob(path, *exts):
"""Glob for multiple file extensions
Parameters
----------
path : str
A file name without extension, or directory name
exts : tuple
File extensions to glob for
Returns
-------
files : list
list of files matching extensions in exts in path
"""
path = os.path.join(path, "*") if os.path.isdir(path) else path + "*"
return [f for files in [glob.glob(path + ext) for ext in exts] for f in files]
files = _glob(projectDir, ".txt", ".mdown", ".markdown")
From previous answer
glob('*.jpg') + glob('*.png')
Here is a shorter one,
from glob import glob
extensions = ['jpg', 'png'] # to find these filename extensions
# Method 1: loop one by one and extend to the output list
output = []
[output.extend(glob(f'*.{name}')) for name in extensions]
print(output)
# Method 2: even shorter
# loop filename extension to glob() it and flatten it to a list
output = [p for p2 in [glob(f'*.{name}') for name in extensions] for p in p2]
print(output)
You can try to make a manual list comparing the extension of existing with those you require.
ext_list = ['gif','jpg','jpeg','png'];
file_list = []
for file in glob.glob('*.*'):
if file.rsplit('.',1)[1] in ext_list :
file_list.append(file)
import os
import glob
import operator
from functools import reduce
types = ('*.jpg', '*.png', '*.jpeg')
lazy_paths = (glob.glob(os.path.join('my_path', t)) for t in types)
paths = reduce(operator.add, lazy_paths, [])
https://docs.python.org/3.5/library/functools.html#functools.reduce
https://docs.python.org/3.5/library/operator.html#operator.add
To glob multiple file types, you need to call glob() function several times in a loop. Since this function returns a list, you need to concatenate the lists.
For instance, this function do the job:
import glob
import os
def glob_filetypes(root_dir, *patterns):
return [path
for pattern in patterns
for path in glob.glob(os.path.join(root_dir, pattern))]
Simple usage:
project_dir = "path/to/project/dir"
for path in sorted(glob_filetypes(project_dir, '*.txt', '*.mdown', '*.markdown')):
print(path)
You can also use glob.iglob() to have an iterator:
Return an iterator which yields the same values as glob() without actually storing them all simultaneously.
def iglob_filetypes(root_dir, *patterns):
return (path
for pattern in patterns
for path in glob.iglob(os.path.join(root_dir, pattern)))
One glob, many extensions... but imperfect solution (might match other files).
filetypes = ['tif', 'jpg']
filetypes = zip(*[list(ft) for ft in filetypes])
filetypes = ["".join(ch) for ch in filetypes]
filetypes = ["[%s]" % ch for ch in filetypes]
filetypes = "".join(filetypes) + "*"
print(filetypes)
# => [tj][ip][fg]*
glob.glob("/path/to/*.%s" % filetypes)
I had the same issue and this is what I came up with
import os, sys, re
#without glob
src_dir = '/mnt/mypics/'
src_pics = []
ext = re.compile('.*\.(|{}|)$'.format('|'.join(['png', 'jpeg', 'jpg']).encode('utf-8')))
for root, dirnames, filenames in os.walk(src_dir):
for filename in filter(lambda name:ext.search(name),filenames):
src_pics.append(os.path.join(root, filename))
Use a list of extension and iterate through
from os.path import join
from glob import glob
files = []
extensions = ['*.gif', '*.png', '*.jpg']
for ext in extensions:
files.extend(glob(join("path/to/dir", ext)))
print(files)
You could use filter:
import os
import glob
projectFiles = filter(
lambda x: os.path.splitext(x)[1] in [".txt", ".mdown", ".markdown"]
glob.glob(os.path.join(projectDir, "*"))
)
You could also use reduce() like so:
import glob
file_types = ['*.txt', '*.mdown', '*.markdown']
project_files = reduce(lambda list1, list2: list1 + list2, (glob.glob(t) for t in file_types))
this creates a list from glob.glob() for each pattern and reduces them to a single list.
Yet another solution (use glob to get paths using multiple match patterns and combine all paths into a single list using reduce and add):
import functools, glob, operator
paths = functools.reduce(operator.add, [glob.glob(pattern) for pattern in [
"path1/*.ext1",
"path2/*.ext2"]])
If you use pathlib try this:
import pathlib
extensions = ['.py', '.txt']
root_dir = './test/'
files = filter(lambda p: p.suffix in extensions, pathlib.Path(root_dir).glob('**/*'))
print(list(files))

Categories