Find first occurrence of file in list of directories - python

I have a list of directories. In this list I want to find the first directory with a certain file and return the abspath of the file. I currently have the following code that works:
from os.path import exists, join, abspath
path = ["/some/where", "/some/where/else", "/another/location"]
file_name = "foo.bar"
try:
file = [abspath(join(d, file_name)) for d in path if exists(join(d, file_name))][0]
except IndexError:
file = ""
How can I do this more elegant? What i in particular dislike are the two joins.

You could pull the join out into a genexp:
>>> paths = ["/some/where", "/some/where/else", "/another/location", "/tmp"]
>>> file_name = "foo.bar"
>>> joined = (join(p, file_name) for p in paths)
>>> next((abspath(f) for f in joined if exists(f)), '')
'/tmp/foo.bar'
(You could trivially make this a one-liner if you wanted by inlining it.)
Note that this differs from your code because it stops after finding the first one, whereas your code finds them all.

Even if you joined the the directories with the filename before hand to avoid joining twice, you are still joining all directories. For example, if your list has 10 directories, you will call os.path.join() 10 times, even if the directory which contains the file may be first in the list. Worse yet, when you have to do this several thousands or millions of times, it adds up.
I could not see any elegant solution using list comprehension, so I designed an iterative one. In my solution, as soon as we find a directory which contains the file, we immediately return the full, absolute path to that file and do not process any further. This solution is not elegant, but it is faster.
The downside of this solution is the overhead of calling a function. If what you find is at the end of the list, my solution might be slower than the list comprehension solution.
import os
def find_first(directories, filename):
'''
Given a list of directories and a file name, find first existent
occurrence.
'''
for directory in directories:
fullpath = os.path.abspath(os.path.join(directory, filename))
if os.path.exists(fullpath):
return fullpath
return False
directories = ['/foo', '/bin', '/usr/bin']
filename = 'bash'
print find_first(directories, filename) # /bin/bash

Related

How to iterate over files in specific directories?

I'd like to iterate over files in two folders in a directory only, and ignore any other files/directories.
e.g in path: "dirA/subdirA/folder1" and "dirA/subdirA/folder2"
I tried passing both to pathlib as:
root_dir_A = "dirA/subdirA/folder1"
root_dir_B = "dirA/subdirA/folder2"
for file in Path(root_dir_A,root_dir_B).glob('**/*.json'):
json_data = open(file, encoding="utf8")
...
But it only iterates over the 2nd path in Path(root_dir_A,root_dir_B).
You can't pass two separate directories to Path(). You'll need to loop over them.
for dirpath in (root_dir_A, root_dir_B):
for file in Path(dirpath).glob('**/*.json'):
...
According to the documentation, Path("foo", "bar") should produce "foo/bar"; but it seems to actually use only the second path segment if it is absolute. Either way, it doesn't do what you seemed to hope it would.
Please check the output of Path(root_dir_A,root_dir_B) to see if it returns what you want.
In your specific case this should work:
path_root = Path('dirA')
for path in path_root.glob('subdirA/folder[12]/*/*.json'):
...
If your paths aren't homogeneous enough you might have to chain generators. I. e.:
from itertools import chain
content_dir_A = Path(root_dir_A).glob('**/*.json')
content_dir_B = Path(root_dir_B).glob('**/*.json')
content_all = chain(content_dir_A, content_dir_B)
for path in content_all:
...

In Python how to find a specific directory by name and walk through the files? [duplicate]

I feel that assigning files, and folders and doing the += [item] part is a bit hackish. Any suggestions? I'm using Python 3.2
from os import *
from os.path import *
def dir_contents(path):
contents = listdir(path)
files = []
folders = []
for i, item in enumerate(contents):
if isfile(contents[i]):
files += [item]
elif isdir(contents[i]):
folders += [item]
return files, folders
os.walk and os.scandir are great options, however, I've been using pathlib more and more, and with pathlib you can use the .glob() or .rglob() (recursive glob) methods:
root_directory = Path(".")
for path_object in root_directory.rglob('*'):
if path_object.is_file():
print(f"hi, I'm a file: {path_object}")
elif path_object.is_dir():
print(f"hi, I'm a dir: {path_object}")
Take a look at the os.walk function which returns the path along with the directories and files it contains. That should considerably shorten your solution.
For anyone looking for a solution using pathlib (python >= 3.4)
from pathlib import Path
def walk(path):
for p in Path(path).iterdir():
if p.is_dir():
yield from walk(p)
continue
yield p.resolve()
# recursively traverse all files from current directory
for p in walk(Path('.')):
print(p)
# the function returns a generator so if you need a list you need to build one
all_files = list(walk(Path('.')))
However, as mentioned above, this does not preserve the top-down ordering given by os.walk
Since Python >= 3.4 the exists the generator method Path.rglob.
So, to process all paths under some/starting/path just do something such as
from pathlib import Path
path = Path('some/starting/path')
for subpath in path.rglob('*'):
# do something with subpath
To get all subpaths in a list do list(path.rglob('*')).
To get just the files with sql extension, do list(path.rglob('*.sql')).
If you want to recursively iterate through all the files, including all files in the subfolders, I believe this is the best way.
import os
def get_files(input):
for fd, subfds, fns in os.walk(input):
for fn in fns:
yield os.path.join(fd, fn)
## now this will print all full paths
for fn in get_files(fd):
print(fn)
Since Python 3.4 there is new module pathlib. So to get all dirs and files one can do:
from pathlib import Path
dirs = [str(item) for item in Path(path).iterdir() if item.is_dir()]
files = [str(item) for item in Path(path).iterdir() if item.is_file()]
Another solution how to walk a directory tree using the pathlib module:
from pathlib import Path
for directory in Path('.').glob('**'):
for item in directory.iterdir():
print(item)
The pattern ** matches current directory and all subdirectories, recursively, and the method iterdir then iterates over each directory's contents. Useful when you need more control when traversing the directory tree.
def dir_contents(path):
files,folders = [],[]
for p in listdir(path):
if isfile(p): files.append(p)
else: folders.append(p)
return files, folders
Indeed using
items += [item]
is bad for many reasons...
The append method has been made exactly for that (appending one element to the end of a list)
You are creating a temporary list of one element just to throw it away. While raw speed should not your first concern when using Python (otherwise you're using the wrong language) still wasting speed for no reason doesn't seem the right thing.
You are using a little asymmetry of the Python language... for list objects writing a += b is not the same as writing a = a + b because the former modifies the object in place, while the second instead allocates a new list and this can have a different semantic if the object a is also reachable using other ways. In your specific code this doesn't seem the case but it could become a problem later when someone else (or yourself in a few years, that is the same) will have to modify the code. Python even has a method extend with a less subtle syntax that is specifically made to handle the case in which you want to modify in place a list object by adding at the end the elements of another list.
Also as other have noted seems that your code is trying to do what os.walk already does...
Instead of the built-in os.walk and os.path.walk, I use something derived from this piece of code I found suggested elsewhere which I had originally linked to but have replaced with inlined source:
import os
import stat
class DirectoryStatWalker:
# a forward iterator that traverses a directory tree, and
# returns the filename and additional file information
def __init__(self, directory):
self.stack = [directory]
self.files = []
self.index = 0
def __getitem__(self, index):
while 1:
try:
file = self.files[self.index]
self.index = self.index + 1
except IndexError:
# pop next directory from stack
self.directory = self.stack.pop()
self.files = os.listdir(self.directory)
self.index = 0
else:
# got a filename
fullname = os.path.join(self.directory, file)
st = os.stat(fullname)
mode = st[stat.ST_MODE]
if stat.S_ISDIR(mode) and not stat.S_ISLNK(mode):
self.stack.append(fullname)
return fullname, st
if __name__ == '__main__':
for file, st in DirectoryStatWalker("/usr/include"):
print file, st[stat.ST_SIZE]
It walks the directories recursively and is quite efficient and easy to read.
Try using the append method.
While googling for the same info, I found this question.
I am posting here the smallest, clearest code which I found at http://www.pythoncentral.io/how-to-traverse-a-directory-tree-in-python-guide-to-os-walk/ (rather than just posting the URL, in case of link rot).
The page has some useful info and also points to a few other relevant pages.
# Import the os module, for the os.walk function
import os
# Set the directory you want to start from
rootDir = '.'
for dirName, subdirList, fileList in os.walk(rootDir):
print('Found directory: %s' % dirName)
for fname in fileList:
print('\t%s' % fname)
I've not tested this extensively yet, but I believe
this will expand the os.walk generator, join dirnames to all the file paths, and flatten the resulting list; To give a straight up list of concrete files in your search path.
import itertools
import os
def find(input_path):
return itertools.chain(
*list(
list(os.path.join(dirname, fname) for fname in files)
for dirname, _, files in os.walk(input_path)
)
)

Changing name of file until it is unique

I have a script that downloads files (pdfs, docs, etc) from a predetermined list of web pages. I want to edit my script to alter the names of files with a trailing _x if the file name already exists, since it's possible files from different pages will share the same filename but contain different contents, and urlretrieve() appears to automatically overwrite existing files.
So far, I have:
urlfile = 'https://www.foo.com/foo/foo/foo.pdf'
filename = urlfile.split('/')[-1]
filename = foo.pdf
if os.path.exists(filename):
filename = filename('.')[0] + '_' + 1
That works fine for one occurrence, but it looks like after one foo_1.pdf it will start saving as foo_1_1.pdf, and so on. I would like to save the files as foo_1.pdf, foo_2.pdf, and so on.
Can anybody point me in the right direction on how to I can ensure that file names are stored in the correct fashion as the script runs?
Thanks.
So what you want is something like this:
curName = "foo_0.pdf"
while os.path.exists(curName):
num = int(curName.split('.')[0].split('_')[1])
curName = "foo_{}.pdf".format(str(num+1))
Here's the general scheme:
Assume you start from the first file name (foo_0.pdf)
Check if that name is taken
If it is, iterate the name by 1
Continue looping until you find a name that isn't taken
One alternative: Generate a list of file numbers that are in use, and update it as needed. If it's sorted you can say name = "foo_{}.pdf".format(flist[-1]+1). This has the advantage that you don't have to run through all the files every time (as the above solution does). However, you need to keep the list of numbers in memory. Additionally, this will not fill any gaps in the numbers
Why not just use the tempfile module:
fileobj = tempfile.NamedTemporaryFile(suffix='.pdf', prefix='', delete = False)
Now your filename will be available in fileobj.name and you can manipulate to your heart's content. As an added benefit, this is cross-platform.
Since you're dealing with multiple pages, this seeems more like a "global archive" than a per-page archive. For a per-page archive, I would go with the answer from #wnnmaw
For a global archive, I would take a different approch...
Create a directory for each filename
Store the file in the directory as "1" + extension
write the current "number" to the directory as "_files.txt"
additional files are written as 2,3,4,etc and increment the value in _files.txt
The benefits of this:
The directory is the original filename. If you keep turning "Example-1.pdf" into "Example-2.pdf" you run into a possibility where you download a real "Example-2.pdf", and can't associate it to the original filename.
You can grab the number of like-named files either by reading _files.txt or counting the number of files in the directory.
Personally, I'd also suggest storing the files in a tiered bucketing system, so that you don't have too many files/directories in any one directory (hundreds of files makes it annoying as a user, thousands of files can affect OS performance ). A bucketing system might turn a filename into a hexdigest, then drop the file into `/%s/%s/%s" % ( hex[0:3], hex[3:6], filename ). The hexdigest is used to give you a more even distribution of characters.
import os
def uniquify(path, sep=''):
path = os.path.normpath(path)
num = 0
newpath = path
dirname, basename = os.path.split(path)
filename, ext = os.path.splitext(basename)
while os.path.exists(newpath):
newpath = os.path.join(dirname, '{f}{s}{n:d}{e}'
.format(f=filename, s=sep, n=num, e=ext))
num += 1
return newpath
filename = uniquify('foo.pdf', sep='_')
Possible problems with this include:
If you call to uniquify many many thousands of times with the same
path, each subsequent call may get a bit slower since the
while-loop starts checking from num=0 each time.
uniquify is vulnerable to race conditions whereby a file may not
exist at the time os.path.exists is called, but may exist at the
time you use the value returned by uniquify. Use
tempfile.NamedTemporaryFile to avoid this problem. You won't get
incremental numbering, but you will get files with unique names,
guaranteed not to already exist. You could use the prefix parameter to
specify the original name of the file. For example,
import tempfile
import os
def uniquify(path, sep='_', mode='w'):
path = os.path.normpath(path)
if os.path.exists(path):
dirname, basename = os.path.split(path)
filename, ext = os.path.splitext(basename)
return tempfile.NamedTemporaryFile(prefix=filename+sep, suffix=ext, delete=False,
dir=dirname, mode=mode)
else:
return open(path, mode)
Which could be used like this:
In [141]: f = uniquify('/tmp/foo.pdf')
In [142]: f.name
Out[142]: '/tmp/foo_34cvy1.pdf'
Note that to prevent a race-condition, the opened filehandle -- not merely the name of the file -- is returned.

WxPython - building a directory tree based on file availability

I do atomistic modelling, and use Python to analyze simulation results. To simplify work with a whole bunch of Python scripts used for different tasks, I decided to write simple GUI to run scripts from it.
I have a (rather complex) directory structure beginning from some root (say ~/calc), and I want to populate wx.TreeCtrl control with directories containing calculation results preserving their structure. The folder contains the results if it contains a file with .EXT extension. What i try to do is walk through dirs from root and in each dir check whether it contains .EXT file. When such dir is reached, add it and its ancestors to the tree:
def buildTree(self, rootdir):
root = rootdir
r = len(rootdir.split('/'))
ids = {root : self.CalcTree.AddRoot(root)}
for (dirpath, dirnames, filenames) in os.walk(root):
for dirname in dirnames:
fullpath = os.path.join(dirpath, dirname)
if sum([s.find('.EXT') for s in filenames]) > -1 * len(filenames):
ancdirs = fullpath.split('/')[r:]
ad = rootdir
for ancdir in ancdirs:
d = os.path.join(ad, ancdir)
ids[d] = self.CalcTree.AppendItem(ids[ad], ancdir)
ad = d
But this code ends up with many second-level nodes with the same name, and that's definitely not what I want. So I somehow need to see if the node is already added to the tree, and in positive case add new node to the existing one, but I do not understand how this could be done. Could you please give me a hint?
Besides, the code contains 2 dirty hacks I'd like to get rid of:
I get the list of ancestor dirs with splitting the full path in \
positions, and this is Linux-specific;
I find if .EXT file is in the directory by trying to find the extension in the strings from filenames list, taking in account that s.find returns -1 if the substring is not found.
Is there a way to make these chunks of code more readable?
First of all the hacks:
To get the path seperator for whatever os your using you can use os.sep.
Use str.endswith() and use the fact that in Python the empty list [] evaluates to False:
if [ file for file in filenames if file.endswith('.EXT') ]:
In terms of getting them all nicely nested you're best off doing it recursively. So the pseudocode would look something like the following. Please note this is just provided to give you an idea of how to do it, don't expect it to work as it is!
def buildTree(self, rootdir):
rootId = self.CalcTree.AddRoot(root)
self.buildTreeRecursion(rootdir, rootId)
def buildTreeRecursion(self, dir, parentId)
# Iterate over the files in dir
for file in dirFiles:
id = self.CalcTree.AppendItem(parentId, file)
if file is a directory:
self.buildTreeRecursion(file, id)
Hope this helps!

Delete multiple files matching a pattern

I have made an online gallery using Python and Django. I've just started to add editing functionality, starting with a rotation. I use sorl.thumbnail to auto-generate thumbnails on demand.
When I edit the original file, I need to clean up all the thumbnails so new ones are generated. There are three or four of them per image (I have different ones for different occasions).
I could hard-code in the file-varients... But that's messy and if I change the way I do things, I'll need to revisit the code.
Ideally I'd like to do a regex-delete. In regex terms, all my originals are named like so:
^(?P<photo_id>\d+)\.jpg$
So I want to delete:
^(?P<photo_id>\d+)[^\d].*jpg$
(Where I replace photo_id with the ID I want to clean.)
Using the glob module:
import glob, os
for f in glob.glob("P*.jpg"):
os.remove(f)
Alternatively, using pathlib:
from pathlib import Path
for p in Path(".").glob("P*.jpg"):
p.unlink()
Try something like this:
import os, re
def purge(dir, pattern):
for f in os.listdir(dir):
if re.search(pattern, f):
os.remove(os.path.join(dir, f))
Then you would pass the directory containing the files and the pattern you wish to match.
If you need recursion into several subdirectories, you can use this method:
import os, re, os.path
pattern = "^(?P<photo_id>\d+)[^\d].*jpg$"
mypath = "Photos"
for root, dirs, files in os.walk(mypath):
for file in filter(lambda x: re.match(pattern, x), files):
os.remove(os.path.join(root, file))
You can safely remove subdirectories on the fly from dirs, which contains the list of the subdirectories to visit at each node.
Note that if you are in a directory, you can also get files corresponding to a simple pattern expression with glob.glob(pattern). In this case you would have to substract the set of files to keep from the whole set, so the code above is more efficient.
How about this?
import glob, os, multiprocessing
p = multiprocessing.Pool(4)
p.map(os.remove, glob.glob("P*.jpg"))
Mind you this does not do recursion and uses wildcards (not regex).
UPDATE
In Python 3 the map() function will return an iterator, not a list. This is useful since you will probably want to do some kind processing on the items anyway, and an iterator will always be more memory-efficient to that end.
If however, a list is what you really need, just do this:
...
list(p.map(os.remove, glob.glob("P*.jpg")))
I agree it's not the most functional way, but it's concise and does the job.
It's not clear to me that you actually want to do any named-group matching -- in the use you describe, the photoid is an input to the deletion function, and named groups' purpose is "output", i.e., extracting certain substrings from the matched string (and accessing them by name in the match object). So, I would recommend a simpler approach:
import re
import os
def delete_thumbnails(photoid, photodirroot):
matcher = re.compile(r'^%s\d+\D.*jpg$' % photoid)
numdeleted = 0
for rootdir, subdirs, filenames in os.walk(photodirroot):
for name in filenames:
if not matcher.match(name):
continue
path = os.path.join(rootdir, name)
os.remove(path)
numdeleted += 1
return "Deleted %d thumbnails for %r" % (numdeleted, photoid)
You can pass the photoid as a normal string, or as a RE pattern piece if you need to remove several matchable IDs at once (e.g., r'abc[def] to remove abcd, abce, and abcf in a single call) -- that's the reason I'm inserting it literally in the RE pattern, rather than inserting the string re.escape(photoid) as would be normal practice. Certain parts such as counting the number of deletions and returning an informative message at the end are obviously frills which you should remove if they give you no added value in your use case.
Others, such as the "if not ... // continue" pattern, are highly recommended practice in Python (flat is better than nested: bailing out to the next leg of the loop as soon as you determine there is nothing to do on this one is better than nesting the actions to be done within an if), although of course other arrangements of the code would work too.
My recomendation:
def purge(dir, pattern, inclusive=True):
regexObj = re.compile(pattern)
for root, dirs, files in os.walk(dir, topdown=False):
for name in files:
path = os.path.join(root, name)
if bool(regexObj.search(path)) == bool(inclusive):
os.remove(path)
for name in dirs:
path = os.path.join(root, name)
if len(os.listdir(path)) == 0:
os.rmdir(path)
This will recursively remove every file that matches the pattern by default, and every file that doesn't if inclusive is true. It will then remove any empty folders from the directory tree.
import os, sys, glob, re
def main():
mypath = "<Path to Root Folder to work within>"
for root, dirs, files in os.walk(mypath):
for file in files:
p = os.path.join(root, file)
if os.path.isfile(p):
if p[-4:] == ".jpg": #Or any pattern you want
os.remove(p)
I find Popen(["rm " + file_name + "*.ext"], shell=True, stdout=PIPE).communicate() to be a much simpler solution to this problem. Although this is prone to injection attacks, I don't see any issues if your program is using this internally.
def recursive_purge(dir, pattern):
for f in os.listdir(dir):
if os.path.isdir(os.path.join(dir, f)):
recursive_purge(os.path.join(dir, f), pattern)
elif re.search(pattern, os.path.join(dir, f)):
os.remove(os.path.join(dir, f))

Categories