The test is failing because it's getting the files from hidden folders too. How can I modify the code so that it skips the hidden folders?
def get_files_not_in_hidden_folder(parent_folder: str, extension: str) -> List[str]:
"""
Get all files recursively from parent folder,
except for the ones that are in hidden folders
"""
files = []
for root, _, filenames in os.walk(parent_folder):
for filename in filenames:
if filename.endswith(extension) and not root.startswith('.'):
files.append(os.path.join(root, filename))
logger.debug(f"get_files_not_in_hidden_folder: {parent_folder}, {extension} -> {files}")
return files
def test_get_files_not_in_hidden_folder():
Path('tmp').mkdir(parents=True, exist_ok=True)
Path('tmp/test.json').touch()
Path('tmp/tmp/.tmp').mkdir(parents=True, exist_ok=True)
Path('tmp/tmp/.tmp/test.json').touch()
Path('tmp/.tmp/tmp').mkdir(parents=True, exist_ok=True)
Path('tmp/.tmp/tmp/test.json').touch()
assert get_files_not_in_hidden_folder('tmp', '.json') == ['tmp/test.json']
shutil.rmtree(Path('tmp'))
What you call root is the full path, including parent names.
If you want to convert to just the directory name, you can use os.path.basename, like:
for root, _, filenames in os.walk(parent_folder):
for filename in filenames:
if filename.endswith(extension) and "/." not in root:
files.append(os.path.join(root, filename))
I would implement this something like as follows ...
def my_walk(root_dir):
files,dirs = [],[]
try:
for fname in os.listdir(root_dir):
if not fname.startswith("."):
fpath = os.path.join(root_dir,fname)
if os.path.isdir(fpath):
dirs.append(fpath)
else:
files.append(fpath)
except:
print("SKIP:",root_dir)
yield root_dir,dirs,files
for d in dirs:
yield from my_walk(d)
I think should work ...
for root, _, filenames in my_walk(parent_folder):
print(f"{root} contains {filenames}")
Related
I am new to python and have been trying to do some project but i have been stuck on this for a while now i hope i can get help.
import os
def find_files(filename, search_path):
result = []
for root, dir, files in os.walk(search_path):
if filename in files:
result.append(os.path.join(root, filename))
return result
find = find_files("Among Us.exe","D:")
os.startfile(find)
The error is:
Exception has occurred: TypeError
startfile: filepath should be string, bytes or os.PathLike, not list
You problem is rather simple: you're trying to give a function expecting a string, a list.
Either pick an index from the list or make find_files return a string. If there's supposed to be one index, pick 0
Example
import os
def find_files(filename, search_path):
result = []
for root, dir, files in os.walk(search_path):
if filename in files:
result.append(os.path.join(root, filename))
return result
find = find_files("Among Us.exe","D:")
os.startfile(find[0])
However I think this is more what you're looking for:
import os
def find_files(filename, search_path):
for root, dir, files in os.walk(search_path):
if filename in files:
return os.path.join(root, filename)
find = find_files("Among Us.exe","D:")
os.startfile(find)
I have a directory that consists of other directories. Each of those sub-directories have files that I need the absolute path for. For example, let's say the parent directory is /home/Documents and each of the sub-directories is 1, 2,..., 10. I have tried something like files = [os.path.abspath(f) for d in os.listdir('/home/Documents') for f in os.listdir(d)], but that gets me something like (for a file) /home/Documents/file1, when it should be /home/Documents/1/file1. Is there a way to do this with the sub-directory in there?
Yes. You can try os.walk.
Consider the following path which has 3 sub directories: '1', '2', '3'.
- '1' has a file ("123.txt")
- '2' is empty
- '3' has 2 files ("123.txt", "1234.txt")
path = r"C:\Users\hvasala\Documents\Udemy Course\project\del"
import os
for dirname, _, filenames in os.walk(path):
for filename in filenames:
print(os.path.join(dirname, filename))
Output:
C:\Users\hvasala\Documents\Udemy Course\project\del\1\123.txt
C:\Users\hvasala\Documents\Udemy Course\project\del\3\123.txt
C:\Users\hvasala\Documents\Udemy Course\project\del\3\1234.txt
Use os.path.join:
root = '/tmp/project'
files = [os.path.join(root, d, f) for d in os.listdir(root) for f in os.listdir(os.path.join(root, d))]
print files
Output:
['/tmp/project/auth/__init__.py', '/tmp/project/controllers/__init__.py']
Try this code below:
import os
def find_file_name(path=None):
paths = []
if not path:
path = os.getcwd()
for element in os.listdir(path):
full_path = os.path.join(path, element)
if os.path.isdir(full_path):
paths += find_file_name(path=full_path)
else:
paths.append(full_path)
else:
return paths
def find_file_name(path=None, extention=".pdf"):
pdf_files = []
if not path:
path = os.getcwd()
for element in os.listdir(path):
full_path = os.path.join(path, element)
file = full_path.split("/")[-1]
if os.path.isdir(full_path):
pdf_files += find_file_name(path=full_path)
else:
if extention in file:
pdf_files.append(file)
return pdf_files
Basically I have 4 subfolders in my directory, and these are present in an array with the following structure:
path_list = [path1, path2, path3, path4]
When I run this code, I can scan all files in one of the folders.
for file_name in os.listdir(path_list[2]):
full_path = os.path.join(path_list[2], file_name)
...
new_sub = os.path.join(new_path, subdir_list[2])
final_path = os.path.join(new_sub, file_name)
imsave(final_path, img_norm)
I would like to find a way for the loop to scan the folder and once it has finished, an i++ occurred in path_list[i] and subdir_list[i] which it could change the value of the path. The loop should stop when it have scanned and modified all the files in the 4 folders.
You can wrap the entire thing in a for loop.
path_list = [path1, path2, path3]
subdir_list = [subdir1, subdir2, subdir3]
for i, _path in enumerate(path_list):
for file_name in os.listdir(_path):
full_path = os.path.join(_path, file_name)
...
new_sub = os.path.join(new_path, subdir_list[i])
final_path = os.path.join(new_sub, file_name)
imsave(final_path, img_norm)
In Python 2.7.4 on Windows, if I have a directory structure that follows:
test/foo/a.bak
test/foo/b.bak
test/foo/bar/c.bak
test/d.bak
And I use the following to add them to an existing archive such that 'd.bak' is at the root of the archive:
import zipfile
import os.path
import fnmatch
def find_files(directory, pattern):
for root, dirs, files in os.walk(directory):
for basename in files:
if fnmatch.fnmatch(basename, pattern):
filename = os.path.join(root, basename)
yield filename
if __name__=='__main__':
z = zipfile.ZipFile("testarch.zip", "a", zipfile.ZIP_DEFLATED)
for filename in find_files('test', '*.*'):
print 'Found file:', filename
z.write(filename, os.path.basename(filename), zipfile.ZIP_DEFLATED)
z.close()
The directory of the zip file is flat. It creates the foo/ directory only if a sub-directory exists in it (If I exclude test/foo/bar/c.bak, it does not create the directory. If it is included, foo/ is created but not foo/bar/ if that makes sense), but no sub-directories or files:
foo/
a.bak
b.bak
c.bak
d.bak
Am I missing something?
The problem is that you're explicitly asking it to flatten all the paths:
z.write(filename, os.path.basename(filename), zipfile.ZIP_DEFLATED)
If you look at the docs, the default arcname is:
the same as filename, but without a drive letter and with leading path separators removed
But you're overriding that with os.path.basename(filename). (If you don't know what basename does, it returns "the last pathname component". If you don't want just the last pathname component, don't call basename.)
If you just do z.write('test/foo/bar/c.bak'), it will create a zip entry named test/foo/bar/c.bak, but if you do z.write('test/foo/bar/c.bak', 'c.bak'), it will create a zip entry named c.bak. Since you do that for all of the entries, the whole thing ends up flattened.
I figured it out. As abarnet pointed out, I had misread the docs on zipfiles. Using the following function, I can create the correct archive name for the zip file:
def createArchName(path):
line = path
if "\\" in line:
''' windows '''
discard, val = line.split("\\", 1)
return val
else:
''' unix '''
discard, val = line.split("/", 1)
return val
For those interested, the full code is as follows:
import urllib2
import zipfile
import os.path
import fnmatch
def find_files(directory, pattern):
for root, dirs, files in os.walk(directory):
for basename in files:
if fnmatch.fnmatch(basename, pattern):
filename = os.path.join(root, basename)
yield filename
def createArchName(path):
line = path
if "\\" in line:
''' windows '''
discard, val = line.split("\\", 1)
return val
else:
''' unix '''
discard, val = line.split("/", 1)
return val
if __name__=='__main__':
if not os.path.exists("test"):
os.mkdir("test")
z = zipfile.ZipFile("testarch.zip", "a", zipfile.ZIP_DEFLATED)
for filename in find_files('test', '*.*'):
archname = createArchName(filename)
print 'Found file:', archname
z.write(filename, archname, zipfile.ZIP_DEFLATED)
z.close()
Given the following piece of python code:
for root, dirs, files in os.walk(directory):
for filename in fnmatch.filter(files, '*.png'):
pass
How can I filter for more than one extension? In this special case I want to get all files ending with *.png, *.gif, *.jpg or *.jpeg.
For now I came up with
for root, dirs, files in os.walk(directory):
for extension in ['jpg', 'jpeg', 'gif', 'png']:
for filename in fnmatch.filter(files, '*.' + extension):
pass
But I think it is not very elegant and performant.
Someone has a better idea?
If you only need to check extensions (i.e. no further wildcards), why don't you simply use basic string operations?
for root, dirs, files in os.walk(directory):
for filename in files:
if filename.endswith(('.jpg', '.jpeg', '.gif', '.png')):
pass
I think your code is actually fine. If you want to touch every filename only once, define your own filtering function:
def is_image_file(filename, extensions=['.jpg', '.jpeg', '.gif', '.png']):
return any(filename.endswith(e) for e in extensions)
for root, dirs, files in os.walk(directory):
for filename in filter(is_image_file, files):
pass
I've been using this with a lot of success.
import fnmatch
import functools
import itertools
import os
# Remove the annotations if you're not on Python3
def find_files(dir_path: str=None, patterns: [str]=None) -> [str]:
"""
Returns a generator yielding files matching the given patterns
:type dir_path: str
:type patterns: [str]
:rtype : [str]
:param dir_path: Directory to search for files/directories under. Defaults to current dir.
:param patterns: Patterns of files to search for. Defaults to ["*"]. Example: ["*.json", "*.xml"]
"""
path = dir_path or "."
path_patterns = patterns or ["*"]
for root_dir, dir_names, file_names in os.walk(path):
filter_partial = functools.partial(fnmatch.filter, file_names)
for file_name in itertools.chain(*map(filter_partial, path_patterns)):
yield os.path.join(root_dir, file_name)
Examples:
for f in find_files(test_directory):
print(f)
yields:
.\test.json
.\test.xml
.\test.ini
.\test_helpers.py
.\__init__.py
Testing with multiple patterns:
for f in find_files(test_directory, ["*.xml", "*.json", "*.ini"]):
print(f)
yields:
.\test.json
.\test.xml
.\test.ini
This would be a better way, perhaps because you are not calling + repeatedly and using a tuple instead of list.
for root, dirs, files in os.walk(directory):
for extension in ('*.jpg', '*.jpeg', '*.gif', '*.png'):
for filename in fnmatch.filter(files, extension):
pass
A tuple is better because you are not going to modify the extension once you have created them. You are just using to iterate over them.
This isn't really elegant either, but it works:
for root, dirs, files in os.walk(directory):
for filename in fnmatch.filter(files, '*.png') + fnmatch.filter(files, '*.jpg') + fnmatch.filter(files, '*.jpeg') + fnmatch.filter(files, '*.gif'):
pass
Here is what I am using to filter files in apache log directories.
Here I exclude errors flles
rep_filters = [now.strftime("%Y%m%d")]
def files_filter(liste_fic, filters = rep_filters):
s = "(fic for fic in liste_fic if fic.find('error') < 0"
for filter in filters:
s += " and fic.find('%s') >=0 " % filter
s += ")"
return eval(s)
Please try this:
# pattern_list = ['*.jpg', '__.*']
def checkFilepatter(filename, pattern_list):
for pattern in pattern_list:
if fnmatch.fnmatch(filename, pattern):
return True
return False
You can use a list comprehension to check if my_file matches any of the file masks defined in patterns:
import fnmatch
my_file = 'my_precious.txt'
patterns = ('*.txt', '*.html', '*.mp3')
if [pat for pat in patterns if fnmatch.fnmatch(my_file, pat)]:
print('We have a match!')
else:
print('No match')
Internally, fnmatch users regular expressions. And there's a method that makes a regex from an fnmatch pattern — fnmatch.translate. This may also give a little speed-up.
import fnmatch
import os
import re
image_exts = ['jpg', 'jpeg', 'gif', 'png']
image_re = re.compile('|'.join(fnmatch.translate('*.' + e) for e in image_exts))
for root, dirs, files in os.walk(directory):
for filename in files:
if image_re.match(filename):
...
The clearest solution is:
import os
for root, dirs, files in os.walk(directory):
for filename in files:
_, ext = os.path.splitext(filename)
if ext in ['.jpg', '.jpeg', '.gif', '.png']:
...
or, using pathlib,
for path in pathlib.Path(directory).glob('**/*'):
if path.suffix in ['.jpg', '.jpeg', '.gif', '.png']:
...