I've got a directory which can have many folders within other folders, and txt files in them. I'd like to return a list of all directories which contain at least one .txt file.
I was attempting the following recursive approach, but it's not working:
def getDirectoryList(dir):
# Attempt at collecting `dir` and its sub-directories when they hold .txt entries.
# The NOTE(review) comments below mark the spots that keep it from working.
directoryList = []
# return nothing if dir is a file
if not os.path.isdir(dir):
return []
# add dir to directorylist if it contains .txt files
if len([file for file in os.listdir(dir) if file.endswith('.txt')])>0:
directoryList.append(dir)
# NOTE(review): os.listdir() yields bare entry names, so `d` reaches the
# recursive call without being joined onto `dir`.
for d in os.listdir(dir):
for x in getDirectoryList(d):
# NOTE(review): square brackets subscript the bound method instead of calling it.
directoryList.append[x]
return directoryList
def getDirectoryList(path):
    """Return every directory at or below *path* that directly holds a .txt entry.

    Args:
        path: Directory to start from.

    Returns:
        A list of directory paths, parent before children.  Each path is
        built by joining onto *path*, so it is relative or absolute exactly
        as *path* is.  An empty list is returned when *path* is not a
        directory (a plain file, or a path that does not exist).
    """
    # Anything that is not a directory has nothing to report; this also
    # keeps os.listdir() from raising on a missing path.
    if not os.path.isdir(path):
        return []

    # List the directory once and reuse the result for both passes.
    entries = os.listdir(path)

    directoryList = []
    if any(name.endswith('.txt') for name in entries):
        directoryList.append(path)

    for name in entries:
        # os.listdir() gives names, not paths, so join before recursing.
        new_path = os.path.join(path, name)
        if os.path.isdir(new_path):
            directoryList += getDirectoryList(new_path)
    return directoryList
Here is the code that worked. The important differences are the os.path.isdir() check on the joined path and joining path and d (os.path.join(path, d)) before each recursive call, because os.listdir() returns bare names, not paths.
Related
I want to move all files containing a special string into another folder.
For example, all files from testfolder/folder1 and its subdirectories should be moved into testfolder/movedfiles.
I already wrote the code to extract all files from the folder and subfolder, but I am not quite sure how to move the files with the string (for example): abcd
import os
from fnmatch import fnmatch
# Top-level folder handed to getListOfFiles(root) further down.
root = "Directory"
# Substring the question wants to match; not referenced by the snippet below.
string = "abcd"
def getListOfFiles(dirName):
    """Return the paths of all non-directory entries below *dirName*.

    Args:
        dirName: Directory to scan; sub-directories are scanned recursively.

    Returns:
        A list of paths, each formed by joining onto *dirName*, in the order
        os.listdir() reports the entries (a sub-directory's files appear at
        the position of that sub-directory).
    """
    allFiles = []
    for entry in os.listdir(dirName):
        # os.listdir() yields bare names; build the full path before testing it.
        fullPath = os.path.join(dirName, entry)
        if os.path.isdir(fullPath):
            # Extend in place rather than `allFiles = allFiles + ...`, which
            # copies everything gathered so far for every sub-directory.
            allFiles.extend(getListOfFiles(fullPath))
        else:
            allFiles.append(fullPath)
    return allFiles
# just to remove all the Directories before the current one
remove_foldername = "Directory Prefix"
allFiles = getListOfFiles(root)
# Pre-declared here; the for loop below rebinds it on every iteration.
filename = ""
filelist = []
for filename in allFiles:
# str.replace() removes every occurrence of the prefix text, not only a leading one.
if remove_foldername in filename:
filename = filename.replace(remove_foldername, "")
# NOTE(review): indentation was lost in this listing, so it is unclear whether
# paths that lack the prefix are appended as well — confirm against the original.
filelist.append(filename)
Now after all that, now the filelist would look like this:
['testfolder/folder/abcd.rtf', 'test/folder/folder1/abcd.rtf'] and so on.
Now I want to have all files moved to another directory (lets say to .../testfolder/movedfiles)
How would I do that?
I want to delete files from all folders recursively, using Python on a Windows filesystem. I need to keep only the .parquet files and remove all other files (those ending with .crc, .bak, etc.).
The problem is that I have folders named files1, files2, files3 ... files100, and I have to remove the .bak, .crc, etc. files from every one of them while keeping only the .parquet files. Can anyone help me with this, please?
i tried this
# Folder to clean up; os.listdir() returns only its direct entries, so nothing
# inside sub-folders is visited.
mydir='c/users/name/files'
for f in os.listdir(mydir):
# Keep anything whose name ends in .parquet.
if f.endswith(".parquet"):
continue
# Every other entry is removed.
# NOTE(review): sub-folder entries (files1, files2, ...) reach this call too,
# while os.remove() is documented for files only — confirm the intended handling.
os.remove(os.path.join(mydir, f))
import os
def getListOfFiles(dirName):
    """Return the paths of all non-directory entries below *dirName*.

    Args:
        dirName: Directory to scan; sub-directories are scanned recursively.

    Returns:
        A list of paths, each formed by joining onto *dirName*, in the order
        os.listdir() reports the entries (a sub-directory's files appear at
        the position of that sub-directory).
    """
    allFiles = []
    for entry in os.listdir(dirName):
        # os.listdir() yields bare names; build the full path before testing it.
        fullPath = os.path.join(dirName, entry)
        if os.path.isdir(fullPath):
            # Extend in place rather than `allFiles = allFiles + ...`, which
            # copies everything gathered so far for every sub-directory.
            allFiles.extend(getListOfFiles(fullPath))
        else:
            allFiles.append(fullPath)
    return allFiles
## select only files in all dirr
onlyfiles = getListOfFiles("C:/Users")
to_keep = ["parquet"] #list of format to keep
# os.path.splitext() inspects only the last path component, so a dot in a
# folder name is never mistaken for an extension, and a file without any
# extension yields '' instead of reusing the previous file's dot position.
# The [1:] drops the leading dot so the value compares against to_keep.
to_delete = [
    name for name in onlyfiles
    if os.path.splitext(name)[1][1:] not in to_keep
]
# Deletion is irreversible: everything whose extension is not listed in
# to_keep is removed.
for to_del in to_delete:
    os.remove(to_del)
Don't forget to modify this line :
onlyfiles = getListOfFiles("C:/Users")
Be careful with this code: you could delete a lot of files by mistake.
I have a main directory (root) which contains 6 subdirectories.
I would like to count the number of files present in each sub directory and add all to a simple python list.
For this result : mylist = [497643, 5976, 3698, 12, 456, 745]
I'm blocked on that code:
import os, sys
# NOTE(review): this name shadows the built-in list type for the rest of the script.
list = []
# Open a file
path = "c://root"
dirs = os.listdir( path )
# This would print all the files and directories
for file in dirs:
print (file)
#fill a list with each sub directory number of elements
for sub_dir in dirs:
# NOTE(review): len(sub_dir) is the length of the entry's name, not a file
# count, and dirs.append() returns None while growing the list being iterated.
list = dirs.append(len(sub_dir))
My attempt at filling the list doesn't work, and I have reached the limit of what I can figure out on my own...
Finding a way to iterate sub-directory of a main directory and fill a list with a function applied on each sub directory would sky rocket the speed of my actual data science project!
Thanks for your help
Abel
You can use os.path.isfile and os.path.isdir
# One count per sub-directory of `path`: the number of regular files directly
# inside it.  os.listdir() returns bare names, so each entry is joined back
# onto its folder before os.path.isfile() is asked about it; summing the
# resulting booleans gives the count (taking len() of them would count every
# entry, files or not).
res = [
    sum(
        os.path.isfile(os.path.join(path, name, entry))
        for entry in os.listdir(os.path.join(path, name))
    )
    for name in os.listdir(path)
    if os.path.isdir(os.path.join(path, name))
]
print(res)
Using the for loop
res = []
for name in os.listdir(path):
    dir_path = os.path.join(path, name)
    if os.path.isdir(dir_path):
        # Count only regular files.  Entries are joined onto dir_path because
        # os.listdir() returns bare names, and the booleans are summed so
        # that non-file entries do not inflate the count.
        res.append(sum(
            os.path.isfile(os.path.join(dir_path, entry))
            for entry in os.listdir(dir_path)
        ))
You need to use os.listdir on each subdirectory. The current code simply takes the length of a filepath.
import os, sys
# Result: one entry count per sub-directory of `path`.
# (The name is kept from the question; note that it shadows the built-in list.)
list = []
# Open a file
path = "c://root"
dirs = os.listdir( path )
# This would print all the files and directories
for file in dirs:
    print (file)
#fill a list with each sub directory number of elements
for sub_dir in dirs:
    # os.listdir() returns bare names, so join onto `path` before using them.
    sub_path = os.path.join(path, sub_dir)
    # Skip plain files sitting directly in `path`; os.listdir() would raise on them.
    if os.path.isdir(sub_path):
        # Append to the result list, not to `dirs`, which is being iterated.
        list.append(len(os.listdir(sub_path)))
Adding this line to the code will list out the subdirectory
You were almost there:
import os, sys
# Result: one entry count per sub-directory of `path`.
# (The name is kept from the question; note that it shadows the built-in list.)
list = []
# Open a file
path = "c://root"
dirs = os.listdir(path)
# This would print all the files and directories
for file in dirs:
    print(file)
for sub_dir in dirs:
    # Test the joined path: a bare name would be resolved against the current
    # working directory instead of `path`.
    sub_path = os.path.join(path, sub_dir)
    if os.path.isdir(sub_path):
        list.append(len(os.listdir(sub_path)))
print(list)
As an alternative, you can also utilize glob module for this and other related tasks.
I have created a test directory containing 3 subdirectories l,m and k containing 3 test files each.
import os, glob
# NOTE(review): shadows the built-in list type.
list = []
path = "test" # you can leave this "." if you want files in the current directory
# os.walk() visits `path` and every directory below it, so a count is appended
# for each directory found at any depth, not only for the first level.
for root, dirs, files in os.walk(path, topdown=True):
for name in dirs:
# Number of glob matches for "<root>/<name>/*", i.e. entries directly inside
# that sub-directory.
list.append(len(glob.glob(root + '/' + name + '/*')))
print(list)
Output :
[3, 3, 3]
I want to perform:
iterate over the content of the folder
if content is file, append to list
if content is folder, goto 1
if folder name is "depth" or "ir", ignore
I am using python. Can you help?
ended up doing something like:
# Full paths of every file below `dir`, leaving out anything that lives in a
# folder named "depth" or "ir" (at any level, including their sub-folders).
_files = []
dir = "path/to/folder"
for root, dirs, files in os.walk(dir):
    # Top-down traversal (the default) lets us prune in place: os.walk() only
    # descends into the names still present in `dirs`, so the ignored folders
    # and everything beneath them are never visited.
    dirs[:] = [d for d in dirs if d not in ("depth", "ir")]
    for name in files:
        _files.append(os.path.join(root, name))
print(_files)
The os.walk() will recurse for you.
import os
# Names of every file below the start folder, leaving out anything that lives
# in a folder named "depth" or "ir" (at any level, including their sub-folders).
res = []
for (root, dirs, files) in os.walk('/path/to/dir'):
    # Prune in place: with the default top-down traversal os.walk() only
    # descends into the names still present in `dirs`, so the ignored folders
    # and everything beneath them are skipped regardless of path separator.
    dirs[:] = [d for d in dirs if d not in ("depth", "ir")]
    # files is a list of bare file names found directly in `root`
    res.extend(files)
print(res)
Try this
import os
basepath = "<path>"
files = []


def loopover(path):
    """Collect the names of the files found under *path*.

    Bare file names (not paths) are appended to the module-level ``files``
    list.  Sub-directories are entered recursively unless they are named
    "depth" or "ir".
    """
    for entry in os.listdir(path):
        candidate = os.path.join(path, entry)
        if os.path.isfile(candidate):
            files.append(entry)
        # Recurse into every sub-directory except the two ignored names.
        if os.path.isdir(candidate) and entry not in ("depth", "ir"):
            loopover(candidate)


loopover(basepath)
I'm trying to return a unique list (set) of all directories if they do not contain certain file types. If that file type is NOT found, add that directory name to a list for further auditing.
The function below will find all valid folders and add it to a set for further comparison. I'd like to extend this to only return those directories that DO NOT contain files in the out_list. These directories MAY contain sub-directories with file in the out_list. If that's TRUE, I only want that path of the folder name of the valid dir.
# directory = r'w:\workorder'
#
# Example:
# w:\workorder\region1\12345678\hi.pdf
# w:\workorder\region2\23456789\test\bye.pdf
# w:\workorder\region3\34567891\<empty>
# w:\workorder\region4\45678912\Final.doc
#
# Results:
# ['34567891', '45678912']
# Module-level accumulator filled as a side effect of get_filepaths().
job_folders = set([]) #set list is unique
# Extensions of interest; not referenced inside get_filepaths() as written.
out_list = [".pdf", ".ppt", ".txt"]
def get_filepaths(directory):
"""
This function will generate the file names in a directory
tree by walking the tree either top-down or bottom-up. For each
directory in the tree rooted at directory top (including top itself),
it yields a 3-tuple (dirpath, dirnames, filenames).
"""
# NOTE(review): the docstring above describes an os.walk()-style traversal,
# but the body only lists the direct entries of `directory` via os.listdir()
# and returns a list of sub-directory paths.
folder_paths = [] # List which will store all of the full filepaths.
# Walk the tree.
for item in os.listdir(directory):
# Only sub-directories are considered; plain files are skipped.
if os.path.isdir(os.path.join(directory, item)):
folderpath = os.path.join(directory, item) # Join the two strings in order to form the full folderpath.
# Names that start with a digit contribute their first eight characters to job_folders.
if re.search('^[0-9]', item):
job_folders.add(item[:8])
# NOTE(review): indentation was lost in this listing, so it is unclear whether
# this append applies to every sub-directory or only to digit-led ones.
folder_paths.append(folderpath) # Add it to the list.
return folder_paths
Does this do what you want?
import os
def main():
    """Print each directory yielded by get_directories_without_exts() for the hard-coded root."""
    unwanted = {'.pdf', '.ppt', '.txt'}
    matches = get_directories_without_exts('W:\\workorder', unwanted)
    for folder in matches:
        print(folder)
def get_directories_without_exts(root, exts):
    """Yield each directory at or below *root* holding no file with an extension in *exts*.

    Only the files directly inside a directory are examined; what its
    sub-directories contain does not affect whether it is yielded.
    Extensions are compared as returned by os.path.splitext(), i.e.
    including the leading dot.
    """
    for current, _subdirs, filenames in os.walk(root):
        has_listed_ext = any(
            os.path.splitext(filename)[1] in exts for filename in filenames
        )
        if not has_listed_ext:
            yield current
if __name__ == '__main__':
main()
Edit: After looking at your requirements, I decided to create a tree object to analyze your directory structure. Once created, it is simple to make a recursive query with caching that to find out if a directory "is okay." From there, creating a generator that only finds top-level directories that are "not okay" is fairly simple. There is probably a better way to do this, but the code should at least work.
import os
def main():
    """Build a Tree for the hard-coded root and print each item of its not_okay attribute."""
    unwanted = {'.pdf', '.ppt', '.txt'}
    tree = Tree('W:\\workorder', unwanted)
    for folder in tree.not_okay:
        print(folder)
class Tree:
    """Snapshot of a directory tree, used to find folders lacking certain file types.

    A node is "okay" when it, or any directory below it, directly contains
    a file whose extension is in ``exts``.

    Attributes set by the constructor:
        name: Path of the directory this node represents.
        exts: Collection of extensions (with leading dot) that make a node okay.
        files: Set of the extensions of the files directly in this directory.
        directories: Child Tree nodes, one per sub-directory.
    """

    def __init__(self, root, exts):
        """Scan *root* recursively.

        Raises:
            ValueError: If *root* is not a directory.
        """
        if not os.path.isdir(root):
            raise ValueError('root must be a directory')
        self.name = root
        self.exts = exts
        self.files = set()
        self.directories = []
        try:
            names = os.listdir(root)
        except OSError:
            # Best effort: a directory that cannot be listed is treated as empty.
            pass
        else:
            for child in names:
                path = os.path.join(root, child)
                if os.path.isfile(path):
                    # Only the extension matters for the later query.
                    self.files.add(os.path.splitext(child)[1])
                elif os.path.isdir(path):
                    self.directories.append(self.__class__(path, exts))
        # Cache for is_okay; None means "not computed yet".
        self._is_okay = None

    # The decorators below must be real @property decorators: main() iterates
    # over `.not_okay` and not_okay tests `self.is_okay` as plain attributes.
    @property
    def is_okay(self):
        """True when this directory or any descendant holds a file with a listed extension."""
        if self._is_okay is None:
            self._is_okay = any(c.is_okay for c in self.directories) or \
                            any(c in self.exts for c in self.files)
        return self._is_okay

    @property
    def not_okay(self):
        """Yield the paths of the topmost directories whose whole subtree is not okay.

        An okay node yields nothing itself and delegates to its children; a
        node that is not okay yields its own path and is not descended into.
        """
        if self.is_okay:
            for child in self.directories:
                for not_okay in child.not_okay:
                    yield not_okay
        else:
            yield self.name
if __name__ == '__main__':
main()
Did you copy and paste the existing code from somewhere else? Because the docstring appears to be that of os.walk...
Your question is unclear on several points:
You state that the goal of the code is to "return a unique list (set) of all directories if they do not contain certain file types".
First of all list and set are different data structures.
Secondly, your code creates one of each: job_folders is a set of folder names containing numbers, while folder_paths is a list of complete paths to folders regardless of whether or not they contain numbers.
What do you actually want as output here?
Should "those directories that DO NOT contain files in the out_list" be defined recursively, or only include first-level contents of those directories? My solution assumes the latter
Your example is contradictory on this point: it shows 34567891 in the results, but not region3 in the results. Whether or not the definition is recursive, region3 should be included in the results because region3 does not contain any files with the listed extensions under it.
Should job_folders be populated only with directories that satisfy the criterion about their contents, or with all folder names containing numbers? My solution assumes the latter
One poor practice in your code that I'd highlight is your use of global variables, out_list and job_folders. I've changed the former to a second parameter of get_filepaths and the latter to a second return value.
Anyway, here goes the solution...
import os, re
ext_list = [".pdf", ".ppt", ".txt"]
def get_filepaths(directory, ext_list):
    """Walk *directory* and report folders without listed file types, plus job-folder names.

    Returns:
        A 2-tuple ``(folder_paths, job_folders)``:
        folder_paths is a list of paths, relative to *directory*, of every
        visited folder that directly contains no file whose extension is in
        *ext_list*; job_folders is a set holding the first eight characters
        of every visited folder name that starts with a digit.
    """
    folder_paths = []
    job_folders = set()
    for current, _subdirs, filenames in os.walk(directory):
        # Last path component of the folder being visited.
        leaf = os.path.basename(current)
        if re.search('^[0-9]', leaf):
            job_folders.add(leaf[:8])

        contains_listed = any(
            os.path.splitext(filename)[1] in ext_list for filename in filenames
        )
        if not contains_listed:
            # None of the file extensions matched ext_list.
            folder_paths.append(os.path.relpath(current, directory))
    return folder_paths, job_folders
I created a directory structure identical to yours under /tmp and ran the following:
# Run get_filepaths() on the sample tree; os.path.expandvars() substitutes the
# %TEMP% environment variable in the raw path string.
folder_paths, job_folders = get_filepaths( os.path.expandvars(r"%TEMP%\workorder"), ext_list )
# Python 2 print statements (no parentheses).
print "folder_paths =", folder_paths
print "job_folders =", job_folders
Here's the output:
folder_paths = ['.', 'region1', 'region2', 'region2\\23456789', 'region3', 'region3\\34567891', 'region4', 'region4\\456789123']
job_folders = set(['12345678', '23456789', '34567891', '45678912'])
As you can see, region1\12345678 and region2\23456789\test are not included in the output folder_paths because they do directly contain files of the specified extensions; all the other subdirectories are included in the output because they do not directly contain files of the specified extensions.
To get the file extension:
# os.path.splitext() returns (path without extension, extension including the dot).
name,ext = os.path.splitext(os.path.join(directory,item))
# Record the first eight characters of the entry name when its extension is
# not one of those in out_list.
if ext not in out_list:
job_folders.add(item[:8])
Thanks to @DanLenski and @NoctisSkytower I was able to get this worked out.
My WorkOrder job folder is always the 7th component of the path when walking in_path, and I pick it out by splitting the path on os.sep.
I borrowed from both of your solutions and came up with the following:
import os, re
ext_list = [".pdf"]
in_path = r'\\server\E\Data\WorkOrder'
def get_filepaths(directory, ext_list, job_depth=7):
    """Return the sorted job folders under *directory* holding no file with a listed extension.

    The job folder of a visited directory is the path component at index
    *job_depth* after splitting the walked path on ``os.sep``.  Directories
    whose path has no such component are ignored.

    Args:
        directory: Root of the tree to walk.
        ext_list: Extensions (with leading dot) that mark a job folder as fine.
        job_depth: Index of the job-folder component in the split path.  The
            default of 7 matches the layout the function was written for.

    Returns:
        A sorted list of job-folder names for which no file with an
        extension in *ext_list* was found anywhere at or below the job
        folder.  Job folders that contain no files at all are included.
    """
    not_okay = set()  # every job folder seen
    okay = set()      # job folders where a listed extension was found
    for dir_path, _subdirs, files in os.walk(directory):
        # Split once per directory rather than once per file.
        parts = dir_path.split(os.sep)
        if len(parts) <= job_depth:
            continue  # still above the job-folder level
        job_folder = parts[job_depth]
        # Register the folder even when it holds no files, so that empty job
        # folders are reported instead of silently dropped.
        not_okay.add(job_folder)
        if any(os.path.splitext(item)[1] in ext_list for item in files):
            okay.add(job_folder)
    return sorted(not_okay - okay)
bad_list = get_filepaths( os.path.expandvars(in_path), ext_list )