Recursively append files to zip archive in python

Recursively append files to zip archive in python - python

In Python 2.7.4 on Windows, if I have a directory structure that follows:
test/foo/a.bak
test/foo/b.bak
test/foo/bar/c.bak
test/d.bak
And I use the following to add them to an existing archive such that 'd.bak' is at the root of the archive:
import zipfile
import os.path
import fnmatch
def find_files(directory, pattern):
for root, dirs, files in os.walk(directory):
for basename in files:
if fnmatch.fnmatch(basename, pattern):
filename = os.path.join(root, basename)
yield filename
if __name__=='__main__':
z = zipfile.ZipFile("testarch.zip", "a", zipfile.ZIP_DEFLATED)
for filename in find_files('test', '*.*'):
print 'Found file:', filename
z.write(filename, os.path.basename(filename), zipfile.ZIP_DEFLATED)
z.close()
The directory of the zip file is flat. It creates the foo/ directory only if a sub-directory exists in it (If I exclude test/foo/bar/c.bak, it does not create the directory. If it is included, foo/ is created but not foo/bar/ if that makes sense), but no sub-directories or files:
foo/
a.bak
b.bak
c.bak
d.bak
Am I missing something?

The problem is that you're explicitly asking it to flatten all the paths:
z.write(filename, os.path.basename(filename), zipfile.ZIP_DEFLATED)
If you look at the docs, the default arcname is:
the same as filename, but without a drive letter and with leading path separators removed
But you're overriding that with os.path.basename(filename). (If you don't know what basename does, it returns "the last pathname component". If you don't want just the last pathname component, don't call basename.)
If you just do z.write('test/foo/bar/c.bak'), it will create a zip entry named test/foo/bar/c.bak, but if you do z.write('test/foo/bar/c.bak', 'c.bak'), it will create a zip entry named c.bak. Since you do that for all of the entries, the whole thing ends up flattened.

I figured it out. As abarnet pointed out, I had misread the docs on zipfiles. Using the following function, I can create the correct archive name for the zip file:
def createArchName(path):
line = path
if "\\" in line:
''' windows '''
discard, val = line.split("\\", 1)
return val
else:
''' unix '''
discard, val = line.split("/", 1)
return val
For those interested, the full code is as follows:
import urllib2
import zipfile
import os.path
import fnmatch
def find_files(directory, pattern):
for root, dirs, files in os.walk(directory):
for basename in files:
if fnmatch.fnmatch(basename, pattern):
filename = os.path.join(root, basename)
yield filename
def createArchName(path):
line = path
if "\\" in line:
''' windows '''
discard, val = line.split("\\", 1)
return val
else:
''' unix '''
discard, val = line.split("/", 1)
return val
if __name__=='__main__':
if not os.path.exists("test"):
os.mkdir("test")
z = zipfile.ZipFile("testarch.zip", "a", zipfile.ZIP_DEFLATED)
for filename in find_files('test', '*.*'):
archname = createArchName(filename)
print 'Found file:', archname
z.write(filename, archname, zipfile.ZIP_DEFLATED)
z.close()

Related

Copy directory structure and files excluding file types, sub folders... Is it possible using the standard library without writing a whole script?

I'm looking for a quick way to copy the entire directory structure (including sub folders and files), with the following conditions:
Copy file if it does not exist in the destination or source is newer
Allow excluding a list of sub folders i.e. ['temp', '.git']
Allow excluding files by type i.e. ['.txt', '.pyc', '*.zip']
I have seen some of the answers using shutil.copy and copytree but none is doing what I was looking for...
I am hoping this could by done by using one of the standard utilities by providing arguments etc. If not I will write a script to do it...
This is what I ended up writing... it does the job, I was hoping this basic functionality would be provided by one of the standard libraries...
import os, sys, pathlib, shutil
def copy_files_on_tree(srcdir, dstdir, sub_folder_to_include, extensions_to_include):
srcdir = str(pathlib.Path(srcdir)).replace('\\', '/')
dstdir = str(pathlib.Path(dstdir)).replace('\\', '/')
for dirpath, dirs, files in os.walk(pathlib.Path(srcdir)):
this_dir = dirpath.replace('\\', "/")
if os.path.basename(this_dir) in sub_folder_to_include:
dest_dir = this_dir.replace(srcdir, dstdir)
# create folder in the destinatin if it does not exist
pathlib.Path(dest_dir).mkdir(parents=True, exist_ok=True)
for filename in files:
dest_file = os.path.join(dest_dir, os.path.basename(filename))
source_file = os.path.join(this_dir, filename)
if os.path.isfile(source_file) and filename.endswith(extensions_to_include):
# copy file if destination is older by more than a second, or does not exist
if (not os.path.exists(dest_file)) or (os.stat(source_file).st_mtime - os.stat(dest_file).st_mtime > 1):
print (f'Copying {source_file} to {dest_dir}')
shutil.copy2(source_file, dest_dir)
else:
print (f'.....Skipping {source_file} to {dest_dir}')
srcdir = 'c:/temp/a'
dstdir = 'c:/temp/j'
sub_folder_to_include = ('a', 'aa','bb')
extensions_to_include = ('.py', '.png', '.gif', '.txt')
copy_files_on_tree(srcdir, dstdir, sub_folder_to_include, extensions_to_include)

This is the solution:
import os, sys, pathlib, shutil
def copy_files_on_tree(srcdir, dstdir, sub_folder_to_include, extensions_to_include):
srcdir = str(pathlib.Path(srcdir)).replace('\\', '/')
dstdir = str(pathlib.Path(dstdir)).replace('\\', '/')
for dirpath, dirs, files in os.walk(pathlib.Path(srcdir)):
this_dir = dirpath.replace('\\', "/")
if os.path.basename(this_dir) in sub_folder_to_include:
dest_dir = this_dir.replace(srcdir, dstdir)
# create folder in the destinatin if it does not exist
pathlib.Path(dest_dir).mkdir(parents=True, exist_ok=True)
for filename in files:
dest_file = os.path.join(dest_dir, os.path.basename(filename))
source_file = os.path.join(this_dir, filename)
if os.path.isfile(source_file) and filename.endswith(extensions_to_include):
# copy file if destination is older by more than a second, or does not exist
if (not os.path.exists(dest_file)) or (os.stat(source_file).st_mtime - os.stat(dest_file).st_mtime > 1):
print (f'Copying {source_file} to {dest_dir}')
shutil.copy2(source_file, dest_dir)
else:
print (f'.....Skipping {source_file} to {dest_dir}')
srcdir = 'c:/temp/a'
dstdir = 'c:/temp/j'
sub_folder_to_include = ('a', 'aa','bb')
extensions_to_include = ('.py', '.png', '.gif', '.txt')
copy_files_on_tree(srcdir, dstdir, sub_folder_to_include, extensions_to_include)

Wildcard for read_template [duplicate]

This question's answers are a community effort. Edit existing answers to improve this post. It is not currently accepting new answers or interactions.
How can I find all the files in a directory having the extension .txt in python?

You can use glob:
import glob, os
os.chdir("/mydir")
for file in glob.glob("*.txt"):
print(file)
or simply os.listdir:
import os
for file in os.listdir("/mydir"):
if file.endswith(".txt"):
print(os.path.join("/mydir", file))
or if you want to traverse directory, use os.walk:
import os
for root, dirs, files in os.walk("/mydir"):
for file in files:
if file.endswith(".txt"):
print(os.path.join(root, file))

Use glob.
>>> import glob
>>> glob.glob('./*.txt')
['./outline.txt', './pip-log.txt', './test.txt', './testingvim.txt']

Something like that should do the job
for root, dirs, files in os.walk(directory):
for file in files:
if file.endswith('.txt'):
print(file)

You can simply use pathlibs glob 1:
import pathlib
list(pathlib.Path('your_directory').glob('*.txt'))
or in a loop:
for txt_file in pathlib.Path('your_directory').glob('*.txt'):
# do something with "txt_file"
If you want it recursive you can use .glob('**/*.txt')
1The pathlib module was included in the standard library in python 3.4. But you can install back-ports of that module even on older Python versions (i.e. using conda or pip): pathlib and pathlib2.

Something like this will work:
>>> import os
>>> path = '/usr/share/cups/charmaps'
>>> text_files = [f for f in os.listdir(path) if f.endswith('.txt')]
>>> text_files
['euc-cn.txt', 'euc-jp.txt', 'euc-kr.txt', 'euc-tw.txt', ... 'windows-950.txt']

import os
path = 'mypath/path'
files = os.listdir(path)
files_txt = [i for i in files if i.endswith('.txt')]

I like os.walk():
import os
for root, dirs, files in os.walk(dir):
for f in files:
if os.path.splitext(f)[1] == '.txt':
fullpath = os.path.join(root, f)
print(fullpath)
Or with generators:
import os
fileiter = (os.path.join(root, f)
for root, _, files in os.walk(dir)
for f in files)
txtfileiter = (f for f in fileiter if os.path.splitext(f)[1] == '.txt')
for txt in txtfileiter:
print(txt)

Here's more versions of the same that produce slightly different results:
glob.iglob()
import glob
for f in glob.iglob("/mydir/*/*.txt"): # generator, search immediate subdirectories
print f
glob.glob1()
print glob.glob1("/mydir", "*.tx?") # literal_directory, basename_pattern
fnmatch.filter()
import fnmatch, os
print fnmatch.filter(os.listdir("/mydir"), "*.tx?") # include dot-files

Try this this will find all your files recursively:
import glob, os
os.chdir("H:\\wallpaper")# use whatever directory you want
#double\\ no single \
for file in glob.glob("**/*.txt", recursive = True):
print(file)

Python v3.5+
Fast method using os.scandir in a recursive function. Searches for all files with a specified extension in folder and sub-folders. It is fast, even for finding 10,000s of files.
I have also included a function to convert the output to a Pandas Dataframe.
import os
import re
import pandas as pd
import numpy as np
def findFilesInFolderYield(path, extension, containsTxt='', subFolders = True, excludeText = ''):
""" Recursive function to find all files of an extension type in a folder (and optionally in all subfolders too)
path: Base directory to find files
extension: File extension to find. e.g. 'txt'. Regular expression. Or 'ls\d' to match ls1, ls2, ls3 etc
containsTxt: List of Strings, only finds file if it contains this text. Ignore if '' (or blank)
subFolders: Bool. If True, find files in all subfolders under path. If False, only searches files in the specified folder
excludeText: Text string. Ignore if ''. Will exclude if text string is in path.
"""
if type(containsTxt) == str: # if a string and not in a list
containsTxt = [containsTxt]
myregexobj = re.compile('\.' + extension + '$') # Makes sure the file extension is at the end and is preceded by a .
try: # Trapping a OSError or FileNotFoundError: File permissions problem I believe
for entry in os.scandir(path):
if entry.is_file() and myregexobj.search(entry.path): #
bools = [True for txt in containsTxt if txt in entry.path and (excludeText == '' or excludeText not in entry.path)]
if len(bools)== len(containsTxt):
yield entry.stat().st_size, entry.stat().st_atime_ns, entry.stat().st_mtime_ns, entry.stat().st_ctime_ns, entry.path
elif entry.is_dir() and subFolders: # if its a directory, then repeat process as a nested function
yield from findFilesInFolderYield(entry.path, extension, containsTxt, subFolders)
except OSError as ose:
print('Cannot access ' + path +'. Probably a permissions error ', ose)
except FileNotFoundError as fnf:
print(path +' not found ', fnf)
def findFilesInFolderYieldandGetDf(path, extension, containsTxt, subFolders = True, excludeText = ''):
""" Converts returned data from findFilesInFolderYield and creates and Pandas Dataframe.
Recursive function to find all files of an extension type in a folder (and optionally in all subfolders too)
path: Base directory to find files
extension: File extension to find. e.g. 'txt'. Regular expression. Or 'ls\d' to match ls1, ls2, ls3 etc
containsTxt: List of Strings, only finds file if it contains this text. Ignore if '' (or blank)
subFolders: Bool. If True, find files in all subfolders under path. If False, only searches files in the specified folder
excludeText: Text string. Ignore if ''. Will exclude if text string is in path.
"""
fileSizes, accessTimes, modificationTimes, creationTimes , paths = zip(*findFilesInFolderYield(path, extension, containsTxt, subFolders))
df = pd.DataFrame({
'FLS_File_Size':fileSizes,
'FLS_File_Access_Date':accessTimes,
'FLS_File_Modification_Date':np.array(modificationTimes).astype('timedelta64[ns]'),
'FLS_File_Creation_Date':creationTimes,
'FLS_File_PathName':paths,
})
df['FLS_File_Modification_Date'] = pd.to_datetime(df['FLS_File_Modification_Date'],infer_datetime_format=True)
df['FLS_File_Creation_Date'] = pd.to_datetime(df['FLS_File_Creation_Date'],infer_datetime_format=True)
df['FLS_File_Access_Date'] = pd.to_datetime(df['FLS_File_Access_Date'],infer_datetime_format=True)
return df
ext = 'txt' # regular expression
containsTxt=[]
path = 'C:\myFolder'
df = findFilesInFolderYieldandGetDf(path, ext, containsTxt, subFolders = True)

path.py is another alternative: https://github.com/jaraco/path.py
from path import path
p = path('/path/to/the/directory')
for f in p.files(pattern='*.txt'):
print f

To get all '.txt' file names inside 'dataPath' folder as a list in a Pythonic way:
from os import listdir
from os.path import isfile, join
path = "/dataPath/"
onlyTxtFiles = [f for f in listdir(path) if isfile(join(path, f)) and f.endswith(".txt")]
print onlyTxtFiles

Python has all tools to do this:
import os
the_dir = 'the_dir_that_want_to_search_in'
all_txt_files = filter(lambda x: x.endswith('.txt'), os.listdir(the_dir))

I did a test (Python 3.6.4, W7x64) to see which solution is the fastest for one folder, no subdirectories, to get a list of complete file paths for files with a specific extension.
To make it short, for this task os.listdir() is the fastest and is 1.7x as fast as the next best: os.walk() (with a break!), 2.7x as fast as pathlib, 3.2x faster than os.scandir() and 3.3x faster than glob.
Please keep in mind, that those results will change when you need recursive results. If you copy/paste one method below, please add a .lower() otherwise .EXT would not be found when searching for .ext.
import os
import pathlib
import timeit
import glob
def a():
path = pathlib.Path().cwd()
list_sqlite_files = [str(f) for f in path.glob("*.sqlite")]
def b():
path = os.getcwd()
list_sqlite_files = [f.path for f in os.scandir(path) if os.path.splitext(f)[1] == ".sqlite"]
def c():
path = os.getcwd()
list_sqlite_files = [os.path.join(path, f) for f in os.listdir(path) if f.endswith(".sqlite")]
def d():
path = os.getcwd()
os.chdir(path)
list_sqlite_files = [os.path.join(path, f) for f in glob.glob("*.sqlite")]
def e():
path = os.getcwd()
list_sqlite_files = [os.path.join(path, f) for f in glob.glob1(str(path), "*.sqlite")]
def f():
path = os.getcwd()
list_sqlite_files = []
for root, dirs, files in os.walk(path):
for file in files:
if file.endswith(".sqlite"):
list_sqlite_files.append( os.path.join(root, file) )
break
print(timeit.timeit(a, number=1000))
print(timeit.timeit(b, number=1000))
print(timeit.timeit(c, number=1000))
print(timeit.timeit(d, number=1000))
print(timeit.timeit(e, number=1000))
print(timeit.timeit(f, number=1000))
Results:
# Python 3.6.4
0.431
0.515
0.161
0.548
0.537
0.274

import os
import sys
if len(sys.argv)==2:
print('no params')
sys.exit(1)
dir = sys.argv[1]
mask= sys.argv[2]
files = os.listdir(dir);
res = filter(lambda x: x.endswith(mask), files);
print res

To get an array of ".txt" file names from a folder called "data" in the same directory I usually use this simple line of code:
import os
fileNames = [fileName for fileName in os.listdir("data") if fileName.endswith(".txt")]

This code makes my life simpler.
import os
fnames = ([file for root, dirs, files in os.walk(dir)
for file in files
if file.endswith('.txt') #or file.endswith('.png') or file.endswith('.pdf')
])
for fname in fnames: print(fname)

Use fnmatch: https://docs.python.org/2/library/fnmatch.html
import fnmatch
import os
for file in os.listdir('.'):
if fnmatch.fnmatch(file, '*.txt'):
print file

A copy-pastable solution similar to the one of ghostdog:
def get_all_filepaths(root_path, ext):
"""
Search all files which have a given extension within root_path.
This ignores the case of the extension and searches subdirectories, too.
Parameters
----------
root_path : str
ext : str
Returns
-------
list of str
Examples
--------
>>> get_all_filepaths('/run', '.lock')
['/run/unattended-upgrades.lock',
'/run/mlocate.daily.lock',
'/run/xtables.lock',
'/run/mysqld/mysqld.sock.lock',
'/run/postgresql/.s.PGSQL.5432.lock',
'/run/network/.ifstate.lock',
'/run/lock/asound.state.lock']
"""
import os
all_files = []
for root, dirs, files in os.walk(root_path):
for filename in files:
if filename.lower().endswith(ext):
all_files.append(os.path.join(root, filename))
return all_files
You can also use yield to create a generator and thus avoid assembling the complete list:
def get_all_filepaths(root_path, ext):
import os
for root, dirs, files in os.walk(root_path):
for filename in files:
if filename.lower().endswith(ext):
yield os.path.join(root, filename)

I suggest you to use fnmatch and the upper method. In this way you can find any of the following:
Name.txt;
Name.TXT;
Name.Txt
.
import fnmatch
import os
for file in os.listdir("/Users/Johnny/Desktop/MyTXTfolder"):
if fnmatch.fnmatch(file.upper(), '*.TXT'):
print(file)

Here's one with extend()
types = ('*.jpg', '*.png')
images_list = []
for files in types:
images_list.extend(glob.glob(os.path.join(path, files)))

Functional solution with sub-directories:
from fnmatch import filter
from functools import partial
from itertools import chain
from os import path, walk
print(*chain(*(map(partial(path.join, root), filter(filenames, "*.txt")) for root, _, filenames in walk("mydir"))))

In case the folder contains a lot of files or memory is an constraint, consider using generators:
def yield_files_with_extensions(folder_path, file_extension):
for _, _, files in os.walk(folder_path):
for file in files:
if file.endswith(file_extension):
yield file
Option A: Iterate
for f in yield_files_with_extensions('.', '.txt'):
print(f)
Option B: Get all
files = [f for f in yield_files_with_extensions('.', '.txt')]

use Python OS module to find files with specific extension.
the simple example is here :
import os
# This is the path where you want to search
path = r'd:'
# this is extension you want to detect
extension = '.txt' # this can be : .jpg .png .xls .log .....
for root, dirs_list, files_list in os.walk(path):
for file_name in files_list:
if os.path.splitext(file_name)[-1] == extension:
file_name_path = os.path.join(root, file_name)
print file_name
print file_name_path # This is the full path of the filter file

Many users have replied with os.walk answers, which includes all files but also all directories and subdirectories and their files.
import os
def files_in_dir(path, extension=''):
"""
Generator: yields all of the files in <path> ending with
<extension>
\param path Absolute or relative path to inspect,
\param extension [optional] Only yield files matching this,
\yield [filenames]
"""
for _, dirs, files in os.walk(path):
dirs[:] = [] # do not recurse directories.
yield from [f for f in files if f.endswith(extension)]
# Example: print all the .py files in './python'
for filename in files_in_dir('./python', '*.py'):
print("-", filename)
Or for a one off where you don't need a generator:
path, ext = "./python", ext = ".py"
for _, _, dirfiles in os.walk(path):
matches = (f for f in dirfiles if f.endswith(ext))
break
for filename in matches:
print("-", filename)
If you are going to use matches for something else, you may want to make it a list rather than a generator expression:
matches = [f for f in dirfiles if f.endswith(ext)]

Python: search for a file in current directory and all it's parents

Is there an inbuilt module to search for a file in the current directory, as well as all the super-directories?
Without the module, I'll have to list all the files in the current directory, search for the file in question, and recursively move up if the file isn't present. Is there an easier way to do this?

Well this is not so well implemented, but will work
use listdir to get list of files/folders in current directory and then in the list search for you file.
If it exists loop breaks but if it doesn't it goes to parent directory using os.path.dirname and listdir.
if cur_dir == '/' the parent dir for "/" is returned as "/" so if cur_dir == parent_dir it breaks the loop
import os
import os.path
file_name = "test.txt" #file to be searched
cur_dir = os.getcwd() # Dir from where search starts can be replaced with any path
while True:
file_list = os.listdir(cur_dir)
parent_dir = os.path.dirname(cur_dir)
if file_name in file_list:
print "File Exists in: ", cur_dir
break
else:
if cur_dir == parent_dir: #if dir is root dir
print "File not found"
break
else:
cur_dir = parent_dir

Here's another one, using pathlib:
from pathlib import Path
def find_upwards(cwd: Path, filename: str) -> Path | None:
if cwd == Path(cwd.root) or cwd == cwd.parent:
return None
fullpath = cwd / filename
return fullpath if fullpath.exists() else find_upwards(cwd.parent, filename)
# usage example:
find_upwards(Path.cwd(), "helloworld.txt")
(using some Python 3.10 typing syntax here, you can safely skip that if you are using an earlier version)

Another option, using pathlib:
from pathlib import Path
def search_upwards_for_file(filename):
"""Search in the current directory and all directories above it
for a file of a particular name.
Arguments:
---------
filename :: string, the filename to look for.
Returns
-------
pathlib.Path, the location of the first file found or
None, if none was found
"""
d = Path.cwd()
root = Path(d.root)
while d != root:
attempt = d / filename
if attempt.exists():
return attempt
d = d.parent
return None

The parent question was to walk parent directories (not descend into children like the find command):
# walk PARENT directories looking for `filename`:
f = 'filename'
d = os.getcwd()
while d != "/" and f not in os.listdir(d):
d = os.path.abspath(d + "/../")
if os.path.isfile(os.path.join(d,f)):
do_something(f)
Here's a version that uses shell globbing to match multiple files:
# walk PARENT directories looking for any *.csv files,
# stopping when a directory that contains any:
f = '*.csv'
d = os.getcwd()
while d != "/" and not glob.glob(os.path.join(d, f)):
d = os.path.abspath(d + "/../")
files = glob.glob(os.path.join(d,f))
for filename in files:
do_something(filename)

Here a function that does an upward search:
import sys, os, os.path
def up_dir(match,start=None):
"""
Find a parent path producing a match on one of its entries.
Without match an empty string is returned.
:param match: a function returning a bool on a directory entry
:param start: absolute path or None
:return: directory with a match on one of its entries
>>> up_dir(lambda x: False)
''
"""
if start is None:
start = os.getcwd()
if any(match(x) for x in os.listdir(start)):
return start
parent = os.path.dirname(start)
if start == parent:
rootres = start.replace('\\','/').strip('/').replace(':','')
if len(rootres)==1 and sys.platform=='win32':
rootres = ''
return rootres
return up_dir(match,start=parent)

Here is an example that will find all the .csv files in a specified directory "path" and all its root directories and print them:
import os
for root, dirs, files in os.walk(path):
for file in files:
if file.endswith(".csv"):
path_file = os.path.join(root,file)
print(path_file)
If you want to start at one directory and work your way through the parents then this would work for finding all the .csv files (for example):
import os
import glob
last_dir = ''
dir = r'c:\temp\starting_dir'
os.chdir(dir)
while last_dir != dir:
dir = os.getcwd()
print(glob.glob('*.csv'))
os.chdir('..')
last_dir = os.getcwd()

I was looking for this too, since os.walk is exactly the opposite of what I wanted. That searches subdirectories. I wanted to search backwards through parent directories until I hit the drive root.
Bumming some inspiration from previous answers, below is what I am using. It doesn't require changing the working directory and it has a place for you to do something when you find a match. And you can change how the match is found. I'm using regex but a basic string compare would work fine too.
# Looking for a file with the string 'lowda' in it (like beltalowda or inyalowda)
import os
import re # only if you want to use regex
# Setup initial directories
starting_dir = 'C:\\Users\\AvasaralaC\\Documents\\Projects'
last_dir = ''
curr_dir = starting_dir
filename = ''
# Loop through parent directories until you hit the end or find a match
while last_dir != curr_dir:
for item in os.listdir(curr_dir):
if re.compile('.*lowda.*').search(item): # Here you can do your own comparison
filename = (curr_dir + os.path.sep + item)
break
if filename:
break
last_dir = curr_dir
curr_dir = os.path.abspath(curr_dir + os.path.sep + os.pardir)
Other comparisons you could do are item.lower().endswith('.txt') or some other string comparison.

Just wrote this to find the "images" directory, note '/' is Linux style
dir = os.getcwd()
while dir != '/' and not glob.glob( dir + '/images' ):
dir = os.path.dirname(dir)

Use fnmatch.filter to filter files by more than one possible file extension

Given the following piece of python code:
for root, dirs, files in os.walk(directory):
for filename in fnmatch.filter(files, '*.png'):
pass
How can I filter for more than one extension? In this special case I want to get all files ending with *.png, *.gif, *.jpg or *.jpeg.
For now I came up with
for root, dirs, files in os.walk(directory):
for extension in ['jpg', 'jpeg', 'gif', 'png']:
for filename in fnmatch.filter(files, '*.' + extension):
pass
But I think it is not very elegant and performant.
Someone has a better idea?

If you only need to check extensions (i.e. no further wildcards), why don't you simply use basic string operations?
for root, dirs, files in os.walk(directory):
for filename in files:
if filename.endswith(('.jpg', '.jpeg', '.gif', '.png')):
pass

I think your code is actually fine. If you want to touch every filename only once, define your own filtering function:
def is_image_file(filename, extensions=['.jpg', '.jpeg', '.gif', '.png']):
return any(filename.endswith(e) for e in extensions)
for root, dirs, files in os.walk(directory):
for filename in filter(is_image_file, files):
pass

I've been using this with a lot of success.
import fnmatch
import functools
import itertools
import os
# Remove the annotations if you're not on Python3
def find_files(dir_path: str=None, patterns: [str]=None) -> [str]:
"""
Returns a generator yielding files matching the given patterns
:type dir_path: str
:type patterns: [str]
:rtype : [str]
:param dir_path: Directory to search for files/directories under. Defaults to current dir.
:param patterns: Patterns of files to search for. Defaults to ["*"]. Example: ["*.json", "*.xml"]
"""
path = dir_path or "."
path_patterns = patterns or ["*"]
for root_dir, dir_names, file_names in os.walk(path):
filter_partial = functools.partial(fnmatch.filter, file_names)
for file_name in itertools.chain(*map(filter_partial, path_patterns)):
yield os.path.join(root_dir, file_name)
Examples:
for f in find_files(test_directory):
print(f)
yields:
.\test.json
.\test.xml
.\test.ini
.\test_helpers.py
.\__init__.py
Testing with multiple patterns:
for f in find_files(test_directory, ["*.xml", "*.json", "*.ini"]):
print(f)
yields:
.\test.json
.\test.xml
.\test.ini

This would be a better way, perhaps because you are not calling + repeatedly and using a tuple instead of list.
for root, dirs, files in os.walk(directory):
for extension in ('*.jpg', '*.jpeg', '*.gif', '*.png'):
for filename in fnmatch.filter(files, extension):
pass
A tuple is better because you are not going to modify the extension once you have created them. You are just using to iterate over them.

This isn't really elegant either, but it works:
for root, dirs, files in os.walk(directory):
for filename in fnmatch.filter(files, '*.png') + fnmatch.filter(files, '*.jpg') + fnmatch.filter(files, '*.jpeg') + fnmatch.filter(files, '*.gif'):
pass

Here is what I am using to filter files in apache log directories.
Here I exclude errors flles
rep_filters = [now.strftime("%Y%m%d")]
def files_filter(liste_fic, filters = rep_filters):
s = "(fic for fic in liste_fic if fic.find('error') < 0"
for filter in filters:
s += " and fic.find('%s') >=0 " % filter
s += ")"
return eval(s)

Please try this:
# pattern_list = ['*.jpg', '__.*']
def checkFilepatter(filename, pattern_list):
for pattern in pattern_list:
if fnmatch.fnmatch(filename, pattern):
return True
return False

You can use a list comprehension to check if my_file matches any of the file masks defined in patterns:
import fnmatch
my_file = 'my_precious.txt'
patterns = ('*.txt', '*.html', '*.mp3')
if [pat for pat in patterns if fnmatch.fnmatch(my_file, pat)]:
print('We have a match!')
else:
print('No match')

Internally, fnmatch users regular expressions. And there's a method that makes a regex from an fnmatch pattern — fnmatch.translate. This may also give a little speed-up.
import fnmatch
import os
import re
image_exts = ['jpg', 'jpeg', 'gif', 'png']
image_re = re.compile('|'.join(fnmatch.translate('*.' + e) for e in image_exts))
for root, dirs, files in os.walk(directory):
for filename in files:
if image_re.match(filename):
...

The clearest solution is:
import os
for root, dirs, files in os.walk(directory):
for filename in files:
_, ext = os.path.splitext(filename)
if ext in ['.jpg', '.jpeg', '.gif', '.png']:
...
or, using pathlib,
for path in pathlib.Path(directory).glob('**/*'):
if path.suffix in ['.jpg', '.jpeg', '.gif', '.png']:
...

Filtering os.walk() dirs and files

I'm looking for a way to include/exclude files patterns and exclude directories from a os.walk() call.
Here's what I'm doing by now:
import fnmatch
import os
includes = ['*.doc', '*.odt']
excludes = ['/home/paulo-freitas/Documents']
def _filter(paths):
for path in paths:
if os.path.isdir(path) and not path in excludes:
yield path
for pattern in (includes + excludes):
if not os.path.isdir(path) and fnmatch.fnmatch(path, pattern):
yield path
for root, dirs, files in os.walk('/home/paulo-freitas'):
dirs[:] = _filter(map(lambda d: os.path.join(root, d), dirs))
files[:] = _filter(map(lambda f: os.path.join(root, f), files))
for filename in files:
filename = os.path.join(root, filename)
print(filename)
Is there a better way to do this? How?

This solution uses fnmatch.translate to convert glob patterns to regular expressions (it assumes the includes only is used for files):
import fnmatch
import os
import os.path
import re
includes = ['*.doc', '*.odt'] # for files only
excludes = ['/home/paulo-freitas/Documents'] # for dirs and files
# transform glob patterns to regular expressions
includes = r'|'.join([fnmatch.translate(x) for x in includes])
excludes = r'|'.join([fnmatch.translate(x) for x in excludes]) or r'$.'
for root, dirs, files in os.walk('/home/paulo-freitas'):
# exclude dirs
dirs[:] = [os.path.join(root, d) for d in dirs]
dirs[:] = [d for d in dirs if not re.match(excludes, d)]
# exclude/include files
files = [os.path.join(root, f) for f in files]
files = [f for f in files if not re.match(excludes, f)]
files = [f for f in files if re.match(includes, f)]
for fname in files:
print fname

From docs.python.org:
os.walk(top[, topdown=True[, onerror=None[, followlinks=False]]])
When topdown is True, the caller can modify the dirnames list in-place … this can be used to prune the search …
for root, dirs, files in os.walk('/home/paulo-freitas', topdown=True):
# excludes can be done with fnmatch.filter and complementary set,
# but it's more annoying to read.
dirs[:] = [d for d in dirs if d not in excludes]
for pat in includes:
for f in fnmatch.filter(files, pat):
print os.path.join(root, f)
I should point out that the above code assumes excludes is a pattern, not a full path. You would need to adjust the list comprehension to filter if os.path.join(root, d) not in excludes to match the OP case.

why fnmatch?
import os
excludes=....
for ROOT,DIR,FILES in os.walk("/path"):
for file in FILES:
if file.endswith(('doc','odt')):
print file
for directory in DIR:
if not directory in excludes :
print directory
not exhaustively tested

dirtools is perfect for your use-case:
from dirtools import Dir
print(Dir('.', exclude_file='.gitignore').files())

Here is one way to do that
import fnmatch
import os
excludes = ['/home/paulo-freitas/Documents']
matches = []
for path, dirs, files in os.walk(os.getcwd()):
for eachpath in excludes:
if eachpath in path:
continue
else:
for result in [os.path.abspath(os.path.join(path, filename)) for
filename in files if fnmatch.fnmatch(filename,'*.doc') or fnmatch.fnmatch(filename,'*.odt')]:
matches.append(result)
print matches

import os
includes = ['*.doc', '*.odt']
excludes = ['/home/paulo-freitas/Documents']
def file_search(path, exe):
for x,y,z in os.walk(path):
for a in z:
if a[-4:] == exe:
print os.path.join(x,a)
for x in includes:
file_search(excludes[0],x)

This is an example of excluding directories and files with os.walk():
ignoreDirPatterns=[".git"]
ignoreFilePatterns=[".php"]
def copyTree(src, dest, onerror=None):
src = os.path.abspath(src)
src_prefix = len(src) + len(os.path.sep)
for root, dirs, files in os.walk(src, onerror=onerror):
for pattern in ignoreDirPatterns:
if pattern in root:
break
else:
#If the above break didn't work, this part will be executed
for file in files:
for pattern in ignoreFilePatterns:
if pattern in file:
break
else:
#If the above break didn't work, this part will be executed
dirpath = os.path.join(dest, root[src_prefix:])
try:
os.makedirs(dirpath,exist_ok=True)
except OSError as e:
if onerror is not None:
onerror(e)
filepath=os.path.join(root,file)
shutil.copy(filepath,dirpath)
continue;#If the above else didn't executed, this will be reached
continue;#If the above else didn't executed, this will be reached
python >=3.2 due to exist_ok in makedirs

The above methods had not worked for me.
So, This is what I came up with an expansion of my original answer to another question.
What worked for me was:
if (not (str(root) + '/').startswith(tuple(exclude_foldr)))
which compiled a path and excluded the tuple of my listed folders.
This gave me the exact result I was looking for.
My goal for this was to keep my mac organized.
I can Search any folder by path, locate & move specific file.types, ignore subfolders and i preemptively prompt the user if they want to move the files.
NOTE: the Prompt is only one time per run and is NOT per file
By Default the prompt defaults to NO when you hit enter instead of [y/N], and will just list the Potential files to be moved.
This is only a snippet of my GitHub Please visit for the total script.
HINT: Read the script below as I added info per line as to what I had done.
#!/usr/bin/env python3
# =============================================================================
# Created On : MAC OSX High Sierra 10.13.6 (17G65)
# Created On : Python 3.7.0
# Created By : Jeromie Kirchoff
# =============================================================================
"""THE MODULE HAS BEEN BUILD FOR KEEPING YOUR FILES ORGANIZED."""
# =============================================================================
from os import walk
from os import path
from shutil import move
import getpass
import click
mac_username = getpass.getuser()
includes_file_extensn = ([".jpg", ".gif", ".png", ".jpeg", ])
search_dir = path.dirname('/Users/' + mac_username + '/Documents/')
target_foldr = path.dirname('/Users/' + mac_username + '/Pictures/Archive/')
exclude_foldr = set([target_foldr,
path.dirname('/Users/' + mac_username +
'/Documents/GitHub/'),
path.dirname('/Users/' + mac_username +
'/Documents/Random/'),
path.dirname('/Users/' + mac_username +
'/Documents/Stupid_Folder/'),
])
if click.confirm("Would you like to move files?",
default=False):
question_moving = True
else:
question_moving = False
def organize_files():
"""THE MODULE HAS BEEN BUILD FOR KEEPING YOUR FILES ORGANIZED."""
# topdown=True required for filtering.
# "Root" had all info i needed to filter folders not dir...
for root, dir, files in walk(search_dir, topdown=True):
for file in files:
# creating a directory to str and excluding folders that start with
if (not (str(root) + '/').startswith(tuple(exclude_foldr))):
# showcase only the file types looking for
if (file.endswith(tuple(includes_file_extensn))):
# using path.normpath as i found an issue with double //
# in file paths.
filetomove = path.normpath(str(root) + '/' +
str(file))
# forward slash required for both to split
movingfileto = path.normpath(str(target_foldr) + '/' +
str(file))
# Answering "NO" this only prints the files "TO BE Moved"
print('Files To Move: ' + str(filetomove))
# This is using the prompt you answered at the beginning
if question_moving is True:
print('Moving File: ' + str(filetomove) +
"\n To:" + str(movingfileto))
# This is the command that moves the file
move(filetomove, movingfileto)
pass
# The rest is ignoring explicitly and continuing
else:
pass
pass
else:
pass
else:
pass
if __name__ == '__main__':
organize_files()
Example of running my script from terminal:
$ python3 organize_files.py
Exclude list: {'/Users/jkirchoff/Pictures/Archive', '/Users/jkirchoff/Documents/Stupid_Folder', '/Users/jkirchoff/Documents/Random', '/Users/jkirchoff/Documents/GitHub'}
Files found will be moved to this folder:/Users/jkirchoff/Pictures/Archive
Would you like to move files?
No? This will just list the files.
Yes? This will Move your files to the target folder.
[y/N]:
Example of listing files:
Files To Move: /Users/jkirchoff/Documents/Archive/JayWork/1.custom-award-768x512.jpg
Files To Move: /Users/jkirchoff/Documents/Archive/JayWork/10351458_318162838331056_9023492155204267542_n.jpg
...etc
Example of moving files:
Moving File: /Users/jkirchoff/Documents/Archive/JayWork/1.custom-award-768x512.jpg
To: /Users/jkirchoff/Pictures/Archive/1.custom-award-768x512.jpg
Moving File: /Users/jkirchoff/Documents/Archive/JayWork/10351458_318162838331056_9023492155204267542_n.jpg
To: /Users/jkirchoff/Pictures/Archive/10351458_318162838331056_9023492155204267542_n.jpg
...

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

Recursively append files to zip archive in python - python

Related

Copy directory structure and files excluding file types, sub folders... Is it possible using the standard library without writing a whole script?

Wildcard for read_template [duplicate]

Python: search for a file in current directory and all it's parents

Use fnmatch.filter to filter files by more than one possible file extension

Filtering os.walk() dirs and files

Categories

Resources