Python keep multiple counters in one recursion function - python

I am trying to count the number of python files and non-python files in a path recursively.
import os
def main():
#path = input('Enter an existing path to a file or directory: ')
path ='/Users/ziyuanhan/PycharmProjects/lab6/'
print(count_file(path, counter={'py':0, 'non_py':0}))
def count_file(path,counter):
if os.path.isfile(path):
if path.endswith('.py') :
counter['py']+=1
return path, counter
else:
counter['non_py']+=1
return path, counter
elif os.path.isdir(path):
for files in os.listdir(path):
print(files)
path = os.path.abspath(files)
print(path)
count_file(path, counter)
return path, counter
main()
The few problems I have is
I had trouble in keeping multiple counters in one recursion function.
Also the return I want is a dictionary format, but I can only do it this way because I have to return it with path.
I use print(files) to check if the function is working alright, but it shows a lot more files(the top 7 files) I never seen in my folder, why is this happening?
When print(files)
/Library/Frameworks/Python.framework/Versions/3.5/bin/python3.5
/Users/ziyuanhan/PycharmProjects/lab7/recursive_dir_traversal.py
.DS_Store
/Users/ziyuanhan/PycharmProjects/lab7/.DS_Store
.idea
/Users/ziyuanhan/PycharmProjects/lab7/.idea
lab7.iml
/Users/ziyuanhan/PycharmProjects/lab7/lab7.iml
misc.xml
/Users/ziyuanhan/PycharmProjects/lab7/misc.xml
modules.xml
/Users/ziyuanhan/PycharmProjects/lab7/modules.xml
workspace.xml
/Users/ziyuanhan/PycharmProjects/lab7/workspace.xml
km_mi_table.py
/Users/ziyuanhan/PycharmProjects/lab7/km_mi_table.py
km_to_miles.py
/Users/ziyuanhan/PycharmProjects/lab7/km_to_miles.py
wordfrequency.py
/Users/ziyuanhan/PycharmProjects/lab7/wordfrequency.py
('/Users/ziyuanhan/PycharmProjects/lab7/wordfrequency.py', {'non_py': 0, 'py': 0})
BTW we have to use recursive function, it is mandatory as the Prof requested.

You don't need to iterate directory recursively yourself. You can use os.walk which yields directories, files for you:
You cannot change local variable / argument of caller. How about returns total_python, total_non_python and use in caller like below?
def count_file(path):
total_python, total_non_python = 0, 0
for parent, directories, files in os.walk(path):
for filename in files:
if filename.lower().endswith('.py'):
total_python += 1
else:
total_non_python += 1
return total_python, total_non_python
def main():
path = input('Enter a path to a file or directory: ')
total_python, total_non_python = count_file(path)
print(path, total_python, total_non_python)
Alternatively, os.scandir is also available since Python 3.5.

You can pass a dictionary as an argument to the function and change the values of the items in the dictionary.
First intialize the dictionary:
counters = {'py': 0, 'other': 0}
Then modify it inside the recursive function:
counters['py'] += 1
This will work because dictionaries are mutable.

This function takes a pathname and returns (total_python, total_not_python). It calls itself on each entries in directories. This is meant to be as close to the given code as reasonable.
def count_file(path):
if os.path.isfile(path):
if path.endswith('.py') :
return 1, 0
else:
return 0, 1
elif os.path.isdir(path):
total_python, total_not_python = 0, 0
for files in os.listdir(path):
print(files)
path = os.path.join(path, files)
subtotal_python, subtotal_python = count_file(path)
total_python += subtotal_python
total_not_python += subtotal_not_python
return total_python, total_not_python

Related

Python Function that take n number of arguments as list and create folder and sub folders based on their index

I am creating a python function which will take "n" numbers of arguments as list and based on their position it should create folders and nested folder
For eg.
path = Path("Path to Directory")
root_folders = ['f1','f2','f3']
yr = ['2018', '2019']
mnth = ['JAN','FEB','MAR','APR','MAY','JUN','JUL','AUG','SEP','OCT','NOV','DEC']
"root_folders" should be main folder within "path"
"yr" should be sub-folders for each of "root_folders"
"mnth" be sub-folders for each of "yr"
I have been able to achieve this using following code:
def folder(root,*args):
for a in args[0]:
path = os.path.join(root, a)
if not os.path.exists(path):
os.mkdir(path)
for b in args[1] :
path2 = os.path.join(path, b)
if not os.path.exists(path2):
os.mkdir(path2)
for c in args[2] :
path3 = os.path.join(path2, c)
if not os.path.exists(path3):
os.mkdir(path3)
folder(path,root_folders,yr,mnth)
But this has certain limitation as the nested folders increase the scalability will be a problem. So is there any solution that this can be achieved using Recursion or any other method
You can use recursion to create the folders, passing the folder creation function a running path at each call:
import os
def create_folders(root, *args):
if args:
for i in args[0]:
os.mkdir(p:=os.path.join(root, i))
create_folders(p, *args[1:])
root_folders = ['f1','f2','f3']
yr = ['2018', '2019']
mnth = ['JAN','FEB','MAR','APR','MAY','JUN','JUL','AUG','SEP','OCT','NOV','DEC']
create_folders(os.getcwd(), root_folders, yr, mnth)
Not sure if this is what you wanted, but I whipped up a short function for you which uses recursion to create a folderstructure from a dictionary:
def folder(root, folderdict, subdir=""):
"""
The folderdict could look something like this:
folderdict = {folder1: None, folder2: None, folder3:{subfolder1: None, subfolder2: {subsubfolder1}}}
"""
# set up path progression for recursion
current_directory = os.path.join(root, subdir)
for foldername, subfolders in folderdict.items():
path = os.path.join(current_directory, foldername)
if not os.path.exists(path):
os.mkdir(path)
# check if newly created path should contain subfolders
# if so: start recursion
if subfolders is not None:
new_subdir = os.path.join(subdir, foldername)
folder(root, subfolders, new_subdir)
Be careful though. Recursion usually doesn't scale very well.

Zero all filesize in a large directory tree (delete file content, keep files)

How can I delete the content (zero the filesize) of a large directory tree (10 GB, 1K files) but keep the entire tree structure, filenames, extensions. (If I can keep the original last write time [last content modification time] that's a bonus).
I have seen several suggestions for individual files, but can not figure out the way to make this work for the entire CWD.
def deleteContent(fName):
with open(fName, "w"):
pass
Running following as administrator should reset all content to an empty file and retain the lastwritetime's of the files
gci c:\temp\test\*.* -recurse | % {
$LastWriteTime = $PSItem.LastWriteTime
clear-content $PSItem;
$PSItem.LastWriteTime = $LastWriteTime
}
os.walk() returns all directories as a list of following tuple:
(directory, list of folders in the directory, list of files in the directory)
When we combine your code with os.walk():
import os
for tuple in os.walk("top_directory"):
files = tuple[2]
dir = tuple[0]
for file in files:
with open(os.path.join(dir, file), "w"):
pass
All good answers, but I can see two more challenges with the answers provided:
When traversing over a directory tree, you may want to limit the depth it goes to, this to protect you from very large directory trees. And secondly Windows has a limitation (enforced by Explorer) of 256 characters in the filename and path. While this limitation will produce various OS errors, there is a workaround for this.
Lets start with the workaround for the maximum length of the filepath, you can do something like the following as a workaround:
import os
import platform
def full_path_windows(filepath):
"""
Filenames and paths have a default limitation of 256 characters in Windows.
By inserting '\\\\?\\' at the start of the path it removes this limitation.
This function inserts '\\\\?\\' at the start of the path, on Windows only
Only if the path starts with '<driveletter>:\\' e.g 'C:\\'.
It will also normalise the characters/case of the path.
"""
if platform.system() == 'Windows':
if filepath[1:3] == ':\\':
return u'\\\\?\\' + os.path.normcase(filepath)
return os.path.normcase(filepath)
There are mentions of write protect, or file in use, or any other condition which may result in not being able to write to the file, this can be checked (without actually writing) by the following:
import os
def write_access(filepath):
"""
Usage:
read_access(filepath)
This function returns True if Write Access is obtained
This function returns False if Write Access is not obtained
This function returns False if the filepath does not exists
filepath = must be an existing file
"""
if os.path.isfile(filepath):
return os.access(filepath, os.W_OK)
return False
For setting minimum depth or maximum depth, you can do something like this:
import os
def get_all_files(rootdir, mindepth = 1, maxdepth = float('inf')):
"""
Usage:
get_all_files(rootdir, mindepth = 1, maxdepth = float('inf'))
This returns a list of all files of a directory, including all files in
subdirectories. Full paths are returned.
WARNING: this may create a very large list if many files exists in the
directory and subdirectories. Make sure you set the maxdepth appropriately.
rootdir = existing directory to start
mindepth = int: the level to start, 1 is start at root dir, 2 is start
at the sub direcories of the root dir, and-so-on-so-forth.
maxdepth = int: the level which to report to. Example, if you only want
in the files of the sub directories of the root dir,
set mindepth = 2 and maxdepth = 2. If you only want the files
of the root dir itself, set mindepth = 1 and maxdepth = 1
"""
file_paths = []
root_depth = rootdir.rstrip(os.path.sep).count(os.path.sep) - 1
for dirpath, dirs, files in os.walk(rootdir):
depth = dirpath.count(os.path.sep) - root_depth
if mindepth <= depth <= maxdepth:
for filename in files:
file_paths.append(os.path.join(dirpath, filename))
elif depth > maxdepth:
del dirs[:]
return file_paths
Now to roll the above code up in a single function, this should give you an idea:
import os
def clear_all_files_content(rootdir, mindepth = 1, maxdepth = float('inf')):
not_cleared = []
root_depth = rootdir.rstrip(os.path.sep).count(os.path.sep) - 1
for dirpath, dirs, files in os.walk(rootdir):
depth = dirpath.count(os.path.sep) - root_depth
if mindepth <= depth <= maxdepth:
for filename in files:
filename = os.path.join(dirpath, filename)
if filename[1:3] == ':\\':
filename = u'\\\\?\\' + os.path.normcase(filename)
if (os.path.isfile(filename) and os.access(filename, os.W_OK)):
with open(filename, 'w'):
pass
else:
not_cleared.append(filename)
elif depth > maxdepth:
del dirs[:]
return not_cleared
This does not maintain the "last write time".
It will return the list not_cleared, which you can check for files which encountered a write access problem.

Python call the recursive function on every element in the list

I am working on this recursive function called traverseDir, everything was going well until here, I don't know how to iterate over the list and call my traverseDir function on every element in the list. Thanks a lot if you can help!
path = sys.argv[1]
def traverseDir(path):
allFile = 0
someFile = 0
if os.path.isfile(path): # base case
print(sys.argv[0])
allFile += 1
if path.endswith('.some'):
someFile += 1
else:
files = os.listdir(path)
return files
#if len(files[0]) <= 1: these 2 lines are where I can't figure out
# return traverseDir(item)
A built in function called os.walk already does this. However, for the sake of your question, you need to iterate over your files list. You will also need to pass all_files and some_files down through the recursion so they can accumulate as they go. You'll also need to return all_files and some_files (python lets you return multiple values as a tuple). You can then add the recursively returned values for all_files and some_files.
def traverseDir(path, all_files=0, some_files=0):
# ... your existing code
files = os.listdir(path)
for f in files:
# extend the path
full_path = os.path.join(path, f)
# unroll the returned values from the recursion
rec_all_files, rec_some_files = traverseDir(full_path, all_files, some_files)
# accumulate the values
all_files += rec_all_files
some_files += rec_some_files
return all_files, some_files
This will call traverseDir() on every file listed. In turn, that recursion will call traverseDir() on every file it lists.
os.listdir
import sys
import os
path = sys.argv[1]
def traverseDir(path):
allFile = 0
someFile = 0
if os.path.isfile(path):
allFile += 1
if path.endswith('.some'):
someFile += 1
else:
for file in os.listdir(path):
agAllFile, agSomeFile = traverseDir(os.path.join(path, file))
allFile += agAllFile
someFile += agSomeFile
return allFile, someFile
print traverseDir(path)
os.walk
import sys
import os
path = sys.argv[1]
def traverseDir(path):
files = [file for dirFiles in os.walk(path) for file in dirFiles[2]]
return len(files), len([file for file in files if file.endswith('.some')])
print traverseDir(path)

delete older folder with similar name using python

I need to iterate over a folder tree. I have to check each subfolder, which looks like this:
moduleA-111-date
moduleA-112-date
moduleA-113-date
moduleB-111-date
moduleB-112-date
etc.
I figured out how to iterate over a folder tree. I can also use stat with mtime to get the date of the folder which seems easier than parsing the name of the date.
How do I single out modules with the same prefix (such as "moduleA") and compare their mtime's so I can delete the oldest?
Since you have no code, I assume that you're looking for design help. I'd lead my students to something like:
Make a list of the names
From each name, find the prefix, such as "moduleA. Put those in a set.
For each prefix in the set
Find all names with that prefix; put these in a temporary list
Sort this list.
For each file in this list *except* the last (newest)
delete the file
Does this get you moving?
I'm posting the code (answer) here, I suppose my question wasn't clear since I'm getting minus signs but anyway the solution wasn't as straight forward as I thought, I'm sure the code could use some fine tuning but it get's the job done.
#!/usr/bin/python
import os
import sys
import fnmatch
import glob
import re
import shutil
##########################################################################################################
#Remove the directory
def remove(path):
try:
shutil.rmtree(path)
print "Deleted : %s" % path
except OSError:
print OSError
print "Unable to remove folder: %s" % path
##########################################################################################################
#This function will look for the .sh files in a given path and returns them as a list.
def searchTreeForSh(path):
full_path = path+'*.sh'
listOfFolders = glob.glob(full_path)
return listOfFolders
##########################################################################################################
#Gets the full path to files containig .sh and returns a list of folder names (prefix) to be acted upon.
#listOfScripts is a list of full paths to .sh file
#dirname is the value that holds the root directory where listOfScripts is operating in
def getFolderNames(listOfScripts):
listOfFolders = []
folderNames = []
for foldername in listOfScripts:
listOfFolders.append(os.path.splitext(foldername)[0])
for folders in listOfFolders:
folder = folders.split('/')
foldersLen=len(folder)
folderNames.append(folder[foldersLen-1])
folderNames.sort()
return folderNames
##########################################################################################################
def minmax(items):
return max(items)
##########################################################################################################
#This function will check the latest entry in the tuple provided, and will then send "everything" to the remove function except that last entry
def sortBeforeDelete(statDir, t):
count = 0
tuple(statDir)
timeNotToDelete = minmax(statDir)
for ff in t:
if t[count][1] == timeNotToDelete:
count += 1
continue
else:
remove(t[count][0])
count += 1
##########################################################################################################
#A loop to run over the fullpath which is broken into items (see os.listdir above), elemenates the .sh and the .txt files, leaves only folder names, then matches it to one of the
#name in the "folders" variable
def coolFunction(folderNames, path):
localPath = os.listdir(path)
for folder in folderNames:
t = () # a tuple to act as sort of a dict, it will hold the folder name and it's equivalent st_mtime
statDir = [] # a list that will hold the st_mtime for all the folder names in subDirList
for item in localPath:
if os.path.isdir(path + item) == True:
if re.search(folder, item):
mtime = os.stat(path + '/' + item)
statDir.append(mtime.st_mtime)
t = t + ((path + item,mtime.st_mtime),)# the "," outside the perenthasis is how to make t be a list of lists and not set the elements one after theother.
if t == ():continue
sortBeforeDelete(statDir, t)
##########################################################################################################
def main(path):
dirs = os.listdir(path)
for component in dirs:
if os.path.isdir(component) == True:
newPath = path + '/' + component + '/'
listOfFolders= searchTreeForSh(newPath)
folderNames = getFolderNames(listOfFolders)
coolFunction(folderNames, newPath)
##########################################################################################################
if __name__ == "__main__":
main(sys.argv[1])

How to list all directories that do not contain a file type?

I'm trying to return a unique list (set) of all directories if they do not contain certain file types. If that file type is NOT found, add that directory name to a list for further auditing.
The function below will find all valid folders and add it to a set for further comparison. I'd like to extend this to only return those directories that DO NOT contain files in the out_list. These directories MAY contain sub-directories with file in the out_list. If that's TRUE, I only want that path of the folder name of the valid dir.
# directory = r'w:\workorder'
#
# Example:
# w:\workorder\region1\12345678\hi.pdf
# w:\workorder\region2\23456789\test\bye.pdf
# w:\workorder\region3\34567891\<empty>
# w:\workorder\region4\45678912\Final.doc
#
# Results:
# ['34567891', '45678912']
job_folders = set([]) #set list is unique
out_list = [".pdf", ".ppt", ".txt"]
def get_filepaths(directory):
"""
This function will generate the file names in a directory
tree by walking the tree either top-down or bottom-up. For each
directory in the tree rooted at directory top (including top itself),
it yields a 3-tuple (dirpath, dirnames, filenames).
"""
folder_paths = [] # List which will store all of the full filepaths.
# Walk the tree.
for item in os.listdir(directory):
if os.path.isdir(os.path.join(directory, item)):
folderpath = os.path.join(directory, item) # Join the two strings in order to form the full folderpath.
if re.search('^[0-9]', item):
job_folders.add(item[:8])
folder_paths.append(folderpath) # Add it to the list.
return folder_paths
Does this do what you want?
import os
def main():
exts = {'.pdf', '.ppt', '.txt'}
for directory in get_directories_without_exts('W:\\workorder', exts):
print(directory)
def get_directories_without_exts(root, exts):
for root, dirs, files in os.walk(root):
for file in files:
if os.path.splitext(file)[1] in exts:
break
else:
yield root
if __name__ == '__main__':
main()
Edit: After looking at your requirements, I decided to create a tree object to analyze your directory structure. Once created, it is simple to make a recursive query with caching that to find out if a directory "is okay." From there, creating a generator that only finds top-level directories that are "not okay" is fairly simple. There is probably a better way to do this, but the code should at least work.
import os
def main():
exts = {'.pdf', '.ppt', '.txt'}
for directory in Tree('W:\\workorder', exts).not_okay:
print(directory)
class Tree:
def __init__(self, root, exts):
if not os.path.isdir(root):
raise ValueError('root must be a directory')
self.name = root
self.exts = exts
self.files = set()
self.directories = []
try:
names = os.listdir(root)
except OSError:
pass
else:
for child in names:
path = os.path.join(root, child)
if os.path.isfile(path):
self.files.add(os.path.splitext(child)[1])
elif os.path.isdir(path):
self.directories.append(self.__class__(path, exts))
self._is_okay = None
#property
def is_okay(self):
if self._is_okay is None:
self._is_okay = any(c.is_okay for c in self.directories) or \
any(c in self.exts for c in self.files)
return self._is_okay
#property
def not_okay(self):
if self.is_okay:
for child in self.directories:
for not_okay in child.not_okay:
yield not_okay
else:
yield self.name
if __name__ == '__main__':
main()
Did you copy and paste the existing code from somewhere else? Because the docstring appears to be that of os.walk...
Your question is unclear on several points:
You state that the goal of the code is to "return a unique list (set) of all directories if they do not contain certain file types".
First of all list and set are different data structures.
Secondly, your code creates one of each: job_folders is a set of folder names containing numbers, while folder_paths is a list of complete paths to folders regardless of whether or not they contain numbers.
What do you actually want as output here?
Should "those directories that DO NOT contain files in the out_list" be defined recursively, or only include first-level contents of those directories? My solution assumes the latter
Your example is contradictory on this point: it shows 34567891 in the results, but not region3 in the results. Whether or not the definition is recursive, region3 should be included in the results because region3 does not contain any files with the listed extensions under it.
Should job_folders be populated only with directories that satisfy the criterion about their contents, or with all folder names containing numbers? My solution assumes the latter
One poor practice in your code that I'd highlight is your use of global variables, out_list and job_folders. I've changed the former to a second parameter of get_filepaths and the latter to a second return value.
Anyway, here goes the solution...
import os, re
ext_list = [".pdf", ".ppt", ".txt"]
def get_filepaths(directory, ext_list):
folder_paths = [] # List which will store all of the full filepaths.
job_folders = set([])
# Walk the tree.
for dir, subdirs, files in os.walk(directory):
_, lastlevel = os.path.split(dir)
if re.search('^[0-9]', lastlevel):
job_folders.add(lastlevel[:8])
for item in files:
root, ext = os.path.splitext(item)
if ext in ext_list:
break
else:
# Since none of the file extensions matched ext_list, add it to the list of folder_paths
folder_paths.append(os.path.relpath(dir, directory))
return folder_paths, job_folders
I created a directory structure identical to yours under /tmp and ran the following:
folder_paths, job_folders = get_filepaths( os.path.expandvars(r"%TEMP%\workorder"), ext_list )
print "folder_paths =", folder_paths
print "job_folders =", job_folders
Here's the output:
folder_paths = ['.', 'region1', 'region2', 'region2\\23456789', 'region3', 'region3\\34567891', 'region4', 'region4\\456789123']
job_folders = set(['12345678', '23456789', '34567891', '45678912'])
As you can see, region1\12345678 and region2\23456789\test are not included in the output folder_paths because they do directly contain files of the specified extensions; all the other subdirectories are included in the output because they do not directly contain files of the specified extensions.
To get the file extension:
name,ext = os.path.splitext(os.path.join(directory,item))
if ext not in out_list:
job_folders.add(item[:8])
thanks to #DanLenski and #NoctisSkytower I was able to get this worked out.
My WorkOrder directory is always the 7th folder down when walking in_path and I found that using os.sep.
I borrowed from both of your solutions and came up with the following:
import os, re
ext_list = [".pdf"]
in_path = r'\\server\E\Data\WorkOrder'
def get_filepaths(directory, ext_list):
not_okay = set([]) # Set which will store Job folder where no ext_list files found
okay = set([]) # Set which will store Job folder where ext_list files found
job_folders = set([]) #valid Job ID folder
# Walk the tree.
for dir, subdirs, files in os.walk(directory):
for item in files:
root, ext = os.path.splitext(item)
if len(dir.split(os.sep)) >= 8: #Tree must contain Job ID folder
job_folder = dir.split(os.sep)[7]
if ext in ext_list:
okay.add(job_folder)
else: # Since none of the file extensions matched ext_list, add it to the list of folder_paths
not_okay.add(job_folder)
bad_list = list(not_okay - okay)
bad_list.sort()
return bad_list
bad_list = get_filepaths( os.path.expandvars(in_path), ext_list )

Categories