This is a function that lists all the files in the current folder and the subfolder which is recursive, I'm not able to write it non-recursively
I tried using while and for loops in nested format but I was not able to make it work.
def recur_files(start_dir):
files = []
original_path = os.getcwd()
os.chdir(start_dir)
items = os.listdir()
for item in items:
if "." in item:
files.append(os.path.abspath(item))
else:
files.extend(recur_files(os.path.abspath(item)))
os.chdir(original_path)
return files
Example:
from os import walk
files = []
for _, _, filenames in walk(your_path):
files.extend(filenames)
print("Files: {}".format(files))
You can get all files recursively in you path.
If you want to list your files in a Depth First Search fashion without using the program stack (aka via recursion), you can always create your own stack (just a list in Python) and write a simple DFS algorithm as follows.
import os
def recur_files():
original_path = os.getcwd()
stack = os.listdir()
results = []
for item in stack:
os.path.join(original_path, item)
while stack:
elem = stack.pop(0)
if os.path.isdir(elem):
results.append(elem)
for item in os.listdir(elem):
stack = [os.path.join(elem, item)] + stack
else:
results.append(elem)
return results
You can use os.walk() to get all files in folder and subfolders.
But if you want to create own function then you need list for dirs. Loop should get dirname from this list and you should add new dirs to this list instead of running function with new start_dir
import os
def recur_files(start_dir):
files = []
dirs = [start_dir]
for dirname in dirs:
for item in os.listdir(dirname):
fullpath = os.path.join(dirname, item)
if os.path.isdir(fullpath): #and fullpath not in ('.', '..'):
dirs.append(fullpath)
else:
files.append(fullpath)
return files, dirs
recur_files('.')
Related
I am creating a python function which will take "n" numbers of arguments as list and based on their position it should create folders and nested folder
For eg.
path = Path("Path to Directory")
root_folders = ['f1','f2','f3']
yr = ['2018', '2019']
mnth = ['JAN','FEB','MAR','APR','MAY','JUN','JUL','AUG','SEP','OCT','NOV','DEC']
"root_folders" should be main folder within "path"
"yr" should be sub-folders for each of "root_folders"
"mnth" be sub-folders for each of "yr"
I have been able to achieve this using following code:
def folder(root,*args):
for a in args[0]:
path = os.path.join(root, a)
if not os.path.exists(path):
os.mkdir(path)
for b in args[1] :
path2 = os.path.join(path, b)
if not os.path.exists(path2):
os.mkdir(path2)
for c in args[2] :
path3 = os.path.join(path2, c)
if not os.path.exists(path3):
os.mkdir(path3)
folder(path,root_folders,yr,mnth)
But this has certain limitation as the nested folders increase the scalability will be a problem. So is there any solution that this can be achieved using Recursion or any other method
You can use recursion to create the folders, passing the folder creation function a running path at each call:
import os
def create_folders(root, *args):
if args:
for i in args[0]:
os.mkdir(p:=os.path.join(root, i))
create_folders(p, *args[1:])
root_folders = ['f1','f2','f3']
yr = ['2018', '2019']
mnth = ['JAN','FEB','MAR','APR','MAY','JUN','JUL','AUG','SEP','OCT','NOV','DEC']
create_folders(os.getcwd(), root_folders, yr, mnth)
Not sure if this is what you wanted, but I whipped up a short function for you which uses recursion to create a folderstructure from a dictionary:
def folder(root, folderdict, subdir=""):
"""
The folderdict could look something like this:
folderdict = {folder1: None, folder2: None, folder3:{subfolder1: None, subfolder2: {subsubfolder1}}}
"""
# set up path progression for recursion
current_directory = os.path.join(root, subdir)
for foldername, subfolders in folderdict.items():
path = os.path.join(current_directory, foldername)
if not os.path.exists(path):
os.mkdir(path)
# check if newly created path should contain subfolders
# if so: start recursion
if subfolders is not None:
new_subdir = os.path.join(subdir, foldername)
folder(root, subfolders, new_subdir)
Be careful though. Recursion usually doesn't scale very well.
I have a main directory(root) which countain 6 sub directory.
I would like to count the number of files present in each sub directory and add all to a simple python list.
For this result : mylist = [497643, 5976, 3698, 12, 456, 745]
I'm blocked on that code:
import os, sys
list = []
# Open a file
path = "c://root"
dirs = os.listdir( path )
# This would print all the files and directories
for file in dirs:
print (file)
#fill a list with each sub directory number of elements
for sub_dir in dirs:
list = dirs.append(len(sub_dir))
My trying for the list fill doesn't work and i'm dramaticaly at my best...
Finding a way to iterate sub-directory of a main directory and fill a list with a function applied on each sub directory would sky rocket the speed of my actual data science project!
Thanks for your help
Abel
You can use os.path.isfile and os.path.isdir
res = [len(list(map(os.path.isfile, os.listdir(os.path.join(path, name))))) for name in os.listdir(path) if os.path.isdir(os.path.join(path, name))]
print(res)
Using the for loop
res = []
for name in os.listdir(path):
dir_path = os.path.join(path, name)
if os.path.isdir(dir_path):
res.append(len(list(map(os.path.isfile, os.listdir(dir_path)))))
You need to use os.listdir on each subdirectory. The current code simply takes the length of a filepath.
import os, sys
list = []
# Open a file
path = "c://root"
dirs = os.listdir( path )
# This would print all the files and directories
for file in dirs:
print (file)
#fill a list with each sub directory number of elements
for sub_dir in dirs:
temp = os.listdir(sub_dir)
list = dirs.append(len(temp))
Adding this line to the code will list out the subdirectory
You were almost there:
import os, sys
list = []
# Open a file
path = "c://root"
dirs = os.listdir(path)
# This would print all the files and directories
for file in dirs:
print(file)
for sub_dir in dirs:
if os.path.isdir(sub_dir):
list.append(len(os.listdir(os.path.join(path, sub_dir))))
print(list)
As an alternative, you can also utilize glob module for this and other related tasks.
I have created a test directory containing 3 subdirectories l,m and k containing 3 test files each.
import os, glob
list = []
path = "test" # you can leave this "." if you want files in the current directory
for root, dirs, files in os.walk(path, topdown=True):
for name in dirs:
list.append(len(glob.glob(root + '/' + name + '/*')))
print(list)
Output :
[3, 3, 3]
I've got a directory which can have many folders within other folders, and txt files in them. I'd like to return a list of all directories which contain at least one .txt file.
I was attempting the following recursive approach, but it's not working:
def getDirectoryList(dir):
directoryList = []
# return nothing if dir is a file
if not os.path.isdir(dir):
return []
# add dir to directorylist if it contains .txt files
if len([file for file in os.listdir(dir) if file.endswith('.txt')])>0:
directoryList.append(dir)
for d in os.listdir(dir):
for x in getDirectoryList(d):
directoryList.append[x]
return directoryList
def getDirectoryList(path):
directoryList = []
#return nothing if path is a file
if os.path.isfile(path):
return []
#add dir to directorylist if it contains .txt files
if len([f for f in os.listdir(path) if f.endswith('.txt')])>0:
directoryList.append(path)
for d in os.listdir(path):
new_path = os.path.join(path, d)
if os.path.isdir(new_path):
directoryList += getDirectoryList(new_path)
return directoryList
here is the code that worked. the important differences are the if "os.path.isdir(os.path.join(path, d)):" check and the addition of path and d before recursive calls, because os.listdir() gives names, not paths
I need to iterate over a folder tree. I have to check each subfolder, which looks like this:
moduleA-111-date
moduleA-112-date
moduleA-113-date
moduleB-111-date
moduleB-112-date
etc.
I figured out how to iterate over a folder tree. I can also use stat with mtime to get the date of the folder which seems easier than parsing the name of the date.
How do I single out modules with the same prefix (such as "moduleA") and compare their mtime's so I can delete the oldest?
Since you have no code, I assume that you're looking for design help. I'd lead my students to something like:
Make a list of the names
From each name, find the prefix, such as "moduleA. Put those in a set.
For each prefix in the set
Find all names with that prefix; put these in a temporary list
Sort this list.
For each file in this list *except* the last (newest)
delete the file
Does this get you moving?
I'm posting the code (answer) here, I suppose my question wasn't clear since I'm getting minus signs but anyway the solution wasn't as straight forward as I thought, I'm sure the code could use some fine tuning but it get's the job done.
#!/usr/bin/python
import os
import sys
import fnmatch
import glob
import re
import shutil
##########################################################################################################
#Remove the directory
def remove(path):
try:
shutil.rmtree(path)
print "Deleted : %s" % path
except OSError:
print OSError
print "Unable to remove folder: %s" % path
##########################################################################################################
#This function will look for the .sh files in a given path and returns them as a list.
def searchTreeForSh(path):
full_path = path+'*.sh'
listOfFolders = glob.glob(full_path)
return listOfFolders
##########################################################################################################
#Gets the full path to files containig .sh and returns a list of folder names (prefix) to be acted upon.
#listOfScripts is a list of full paths to .sh file
#dirname is the value that holds the root directory where listOfScripts is operating in
def getFolderNames(listOfScripts):
listOfFolders = []
folderNames = []
for foldername in listOfScripts:
listOfFolders.append(os.path.splitext(foldername)[0])
for folders in listOfFolders:
folder = folders.split('/')
foldersLen=len(folder)
folderNames.append(folder[foldersLen-1])
folderNames.sort()
return folderNames
##########################################################################################################
def minmax(items):
return max(items)
##########################################################################################################
#This function will check the latest entry in the tuple provided, and will then send "everything" to the remove function except that last entry
def sortBeforeDelete(statDir, t):
count = 0
tuple(statDir)
timeNotToDelete = minmax(statDir)
for ff in t:
if t[count][1] == timeNotToDelete:
count += 1
continue
else:
remove(t[count][0])
count += 1
##########################################################################################################
#A loop to run over the fullpath which is broken into items (see os.listdir above), elemenates the .sh and the .txt files, leaves only folder names, then matches it to one of the
#name in the "folders" variable
def coolFunction(folderNames, path):
localPath = os.listdir(path)
for folder in folderNames:
t = () # a tuple to act as sort of a dict, it will hold the folder name and it's equivalent st_mtime
statDir = [] # a list that will hold the st_mtime for all the folder names in subDirList
for item in localPath:
if os.path.isdir(path + item) == True:
if re.search(folder, item):
mtime = os.stat(path + '/' + item)
statDir.append(mtime.st_mtime)
t = t + ((path + item,mtime.st_mtime),)# the "," outside the perenthasis is how to make t be a list of lists and not set the elements one after theother.
if t == ():continue
sortBeforeDelete(statDir, t)
##########################################################################################################
def main(path):
dirs = os.listdir(path)
for component in dirs:
if os.path.isdir(component) == True:
newPath = path + '/' + component + '/'
listOfFolders= searchTreeForSh(newPath)
folderNames = getFolderNames(listOfFolders)
coolFunction(folderNames, newPath)
##########################################################################################################
if __name__ == "__main__":
main(sys.argv[1])
I'm trying to return a unique list (set) of all directories if they do not contain certain file types. If that file type is NOT found, add that directory name to a list for further auditing.
The function below will find all valid folders and add it to a set for further comparison. I'd like to extend this to only return those directories that DO NOT contain files in the out_list. These directories MAY contain sub-directories with file in the out_list. If that's TRUE, I only want that path of the folder name of the valid dir.
# directory = r'w:\workorder'
#
# Example:
# w:\workorder\region1\12345678\hi.pdf
# w:\workorder\region2\23456789\test\bye.pdf
# w:\workorder\region3\34567891\<empty>
# w:\workorder\region4\45678912\Final.doc
#
# Results:
# ['34567891', '45678912']
job_folders = set([]) #set list is unique
out_list = [".pdf", ".ppt", ".txt"]
def get_filepaths(directory):
"""
This function will generate the file names in a directory
tree by walking the tree either top-down or bottom-up. For each
directory in the tree rooted at directory top (including top itself),
it yields a 3-tuple (dirpath, dirnames, filenames).
"""
folder_paths = [] # List which will store all of the full filepaths.
# Walk the tree.
for item in os.listdir(directory):
if os.path.isdir(os.path.join(directory, item)):
folderpath = os.path.join(directory, item) # Join the two strings in order to form the full folderpath.
if re.search('^[0-9]', item):
job_folders.add(item[:8])
folder_paths.append(folderpath) # Add it to the list.
return folder_paths
Does this do what you want?
import os
def main():
exts = {'.pdf', '.ppt', '.txt'}
for directory in get_directories_without_exts('W:\\workorder', exts):
print(directory)
def get_directories_without_exts(root, exts):
for root, dirs, files in os.walk(root):
for file in files:
if os.path.splitext(file)[1] in exts:
break
else:
yield root
if __name__ == '__main__':
main()
Edit: After looking at your requirements, I decided to create a tree object to analyze your directory structure. Once created, it is simple to make a recursive query with caching that to find out if a directory "is okay." From there, creating a generator that only finds top-level directories that are "not okay" is fairly simple. There is probably a better way to do this, but the code should at least work.
import os
def main():
exts = {'.pdf', '.ppt', '.txt'}
for directory in Tree('W:\\workorder', exts).not_okay:
print(directory)
class Tree:
def __init__(self, root, exts):
if not os.path.isdir(root):
raise ValueError('root must be a directory')
self.name = root
self.exts = exts
self.files = set()
self.directories = []
try:
names = os.listdir(root)
except OSError:
pass
else:
for child in names:
path = os.path.join(root, child)
if os.path.isfile(path):
self.files.add(os.path.splitext(child)[1])
elif os.path.isdir(path):
self.directories.append(self.__class__(path, exts))
self._is_okay = None
#property
def is_okay(self):
if self._is_okay is None:
self._is_okay = any(c.is_okay for c in self.directories) or \
any(c in self.exts for c in self.files)
return self._is_okay
#property
def not_okay(self):
if self.is_okay:
for child in self.directories:
for not_okay in child.not_okay:
yield not_okay
else:
yield self.name
if __name__ == '__main__':
main()
Did you copy and paste the existing code from somewhere else? Because the docstring appears to be that of os.walk...
Your question is unclear on several points:
You state that the goal of the code is to "return a unique list (set) of all directories if they do not contain certain file types".
First of all list and set are different data structures.
Secondly, your code creates one of each: job_folders is a set of folder names containing numbers, while folder_paths is a list of complete paths to folders regardless of whether or not they contain numbers.
What do you actually want as output here?
Should "those directories that DO NOT contain files in the out_list" be defined recursively, or only include first-level contents of those directories? My solution assumes the latter
Your example is contradictory on this point: it shows 34567891 in the results, but not region3 in the results. Whether or not the definition is recursive, region3 should be included in the results because region3 does not contain any files with the listed extensions under it.
Should job_folders be populated only with directories that satisfy the criterion about their contents, or with all folder names containing numbers? My solution assumes the latter
One poor practice in your code that I'd highlight is your use of global variables, out_list and job_folders. I've changed the former to a second parameter of get_filepaths and the latter to a second return value.
Anyway, here goes the solution...
import os, re
ext_list = [".pdf", ".ppt", ".txt"]
def get_filepaths(directory, ext_list):
folder_paths = [] # List which will store all of the full filepaths.
job_folders = set([])
# Walk the tree.
for dir, subdirs, files in os.walk(directory):
_, lastlevel = os.path.split(dir)
if re.search('^[0-9]', lastlevel):
job_folders.add(lastlevel[:8])
for item in files:
root, ext = os.path.splitext(item)
if ext in ext_list:
break
else:
# Since none of the file extensions matched ext_list, add it to the list of folder_paths
folder_paths.append(os.path.relpath(dir, directory))
return folder_paths, job_folders
I created a directory structure identical to yours under /tmp and ran the following:
folder_paths, job_folders = get_filepaths( os.path.expandvars(r"%TEMP%\workorder"), ext_list )
print "folder_paths =", folder_paths
print "job_folders =", job_folders
Here's the output:
folder_paths = ['.', 'region1', 'region2', 'region2\\23456789', 'region3', 'region3\\34567891', 'region4', 'region4\\456789123']
job_folders = set(['12345678', '23456789', '34567891', '45678912'])
As you can see, region1\12345678 and region2\23456789\test are not included in the output folder_paths because they do directly contain files of the specified extensions; all the other subdirectories are included in the output because they do not directly contain files of the specified extensions.
To get the file extension:
name,ext = os.path.splitext(os.path.join(directory,item))
if ext not in out_list:
job_folders.add(item[:8])
thanks to #DanLenski and #NoctisSkytower I was able to get this worked out.
My WorkOrder directory is always the 7th folder down when walking in_path and I found that using os.sep.
I borrowed from both of your solutions and came up with the following:
import os, re
ext_list = [".pdf"]
in_path = r'\\server\E\Data\WorkOrder'
def get_filepaths(directory, ext_list):
not_okay = set([]) # Set which will store Job folder where no ext_list files found
okay = set([]) # Set which will store Job folder where ext_list files found
job_folders = set([]) #valid Job ID folder
# Walk the tree.
for dir, subdirs, files in os.walk(directory):
for item in files:
root, ext = os.path.splitext(item)
if len(dir.split(os.sep)) >= 8: #Tree must contain Job ID folder
job_folder = dir.split(os.sep)[7]
if ext in ext_list:
okay.add(job_folder)
else: # Since none of the file extensions matched ext_list, add it to the list of folder_paths
not_okay.add(job_folder)
bad_list = list(not_okay - okay)
bad_list.sort()
return bad_list
bad_list = get_filepaths( os.path.expandvars(in_path), ext_list )