Python count files in a directory and all its subdirectories - python

I am trying to count all the files in a folder and all its subfolders.
For example, if my folder looks like this:
file1.txt
subfolder1/
├── file2.txt
├── subfolder2/
│ ├── file3.txt
│ ├── file4.txt
│ └── subfolder3/
│ └── file5.txt
└── file6.txt
file7.txt
I would like to get the number 7.
The first thing I tried is a recursive function that counts all files and calls itself for each folder:
def get_file_count(directory: str) -> int:
    """Count the regular files in ``directory`` and all of its subfolders.

    Entries that are neither a file nor a directory (for example broken
    symlinks) are not counted.
    """
    count = 0
    for filename in os.listdir(directory):
        file = os.path.join(directory, filename)
        if os.path.isfile(file):
            count += 1
        elif os.path.isdir(file):
            # Recurse: every subfolder contributes its own file count.
            count += get_file_count(file)
    return count
This way works but takes a lot of time for big directories.
I also remembered this post, which shows a quick way to compute the total size of a folder using win32com, and I wondered if this library also offered a way to do what I was looking for.
But after searching, I only found this
fso = com.Dispatch("Scripting.FileSystemObject")
folder = fso.GetFolder(".")
size = folder.Files.Count
But this only returns the number of files in only the targeted folder (and not in its subfolders)
So, do you know if there is an optimal function in python that returns the number of files in a folder and all its subfolders?

IIUC, you can just do
sum(len(files) for _, _, files in os.walk('path/to/folder'))
or perhaps, to avoid the len for probably slightly better performance:
sum(1 for _, _, files in os.walk('folder_test') for f in files)

This code will reveal a count of all directory entries that are not directories (e.g., plain files, symlinks) from a specified root.
Includes timing and an actual pathname used in the test:
from glob import glob, escape
import os
import time
def get_file_count(directory: str) -> int:
    """Count every entry below ``directory`` that is not a directory.

    Plain files and symlinks each count as one entry; real directories are
    descended into and are not counted themselves.
    """
    count = 0
    base = escape(directory)
    # A bare '*' never matches names starting with a dot, so hidden files
    # and hidden directories need their own pattern.
    for pattern in ('*', '.*'):
        for filename in glob(os.path.join(base, pattern)):
            # A symlink to a directory is counted as an entry instead of
            # being followed; following it could loop forever on a cycle.
            if os.path.isdir(filename) and not os.path.islink(filename):
                count += get_file_count(filename)
            else:
                count += 1
    return count
start = time.perf_counter()
count = get_file_count('/Volumes/G-DRIVE Thunderbolt 3')
end = time.perf_counter()
print(count)
print(f'{end-start:.2f}s')
Output:
166231
2.38s

I used os.walk().
Here is my sample; I hope it helps you:
def file_dir(root=None, suffix=".tsv"):
    """Collect the paths of all files below ``root`` whose name ends with ``suffix``.

    Args:
        root: Directory to walk. Defaults to the current working directory.
        suffix: File-name ending to keep. Defaults to ``".tsv"``.

    Returns:
        A dict with a single key ``'dir'`` holding the list of matching
        paths; ``len(result['dir'])`` is the number of matching files.
    """
    if root is None:
        root = os.getcwd()
    directories = [
        os.path.join(folder, file)
        for folder, _dirs, files in os.walk(root)
        for file in files
        if file.endswith(suffix)
    ]
    return {'dir': directories}

you could also directly use the command:
find DIR_NAME -type f | wc -l
this returns the count of all files
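This can be run from Python with the subprocess module (for example subprocess.check_output(..., shell=True) so that the pipe works and the count is returned); os.system() would only print the count and return the exit status.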

Another solution using os.path and pathlib:
from pathlib import Path
from os.path import isfile
len([x for x in Path('./dir1').rglob('*') if isfile(x)])

The proper way is to use os.walk as others have pointed out, but to give another solution which resembles your original as much as possible:
You can use os.scandir to avoid the cost of constructing the entire list, it should be substantially faster:
def get_file_count(directory: str) -> int:
    """Count the files in ``directory`` and all of its subfolders.

    Uses ``os.scandir`` so the file/directory checks come from the directory
    entry itself instead of building a full name list first.
    """
    count = 0
    # The context manager closes the scandir iterator (and its OS handle)
    # as soon as this level has been processed.
    with os.scandir(directory) as entries:
        for entry in entries:
            if entry.is_file():
                count += 1
            elif entry.is_dir():
                # entry.path is already directory joined with entry.name.
                count += get_file_count(entry.path)
    return count

Related

How to get the latest folder in a directory using Python

I need to retrieve the directory of the most recently created folder. I am using a program that will output a new run## folder each time it is executed (i.e. run01, run02, run03 and so on). Within any one run## folder resides a data file that I want to analyze (file-i-want.txt).
# Name of the run folder to read from (hard-coded for now).
folder_numb = 'run01'
dir = os.path.dirname(__file__)
# Path components are passed separately: no backslash escapes to get wrong,
# and the same variable that was defined above is the one that is used.
filepath = os.path.join(dir, '..', 'data', 'directory', folder_numb, 'file-i-want.txt')
In short I want to skip having to hardcode in run## and just get the directory of a file within the most recently created run## folder.
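You can get the creation date with os.stat (note that st_birthtime is platform-dependent: it exists on macOS/BSD but not on Linux, where st_mtime is the usual fallback):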
path = '/a/b/c'
#newest
newest = max([f for f in os.listdir(path)], key=lambda x: os.stat(os.path.join(path,x)).st_birthtime)
# all files sorted
sorted_files = sorted([f for f in os.listdir(path)],key=lambda x: os.stat(os.path.join(path, x)).st_birthtime, reverse=True)
pathlib is recommended over os for filesystem-related tasks.
reference
You can try:
filepath = Path(__file__).parent / 'data/directory'
fnames = sorted(list(Path(filepath).rglob('file-i-want.txt')), key=lambda x: Path.stat(x).st_mtime, reverse=True)
filepath = str(fnames[0])
filepath
glob.glob('run*') will return the list of files/directories that match the pattern. The order of that list is arbitrary, so sort it by name (sorted(glob.glob('run*'))) before picking an element.
So if you want the latest run, your code will be:
import glob
# glob returns matches in arbitrary order, so sort by name before taking the last one.
print(sorted(glob.glob('run*'))[-1]) # raises index error if there are no runs
IMPORTANT: the names are compared as strings, so, for example, 'run21' will come AFTER 'run100'. You therefore need to zero-pad the run number with enough digits to avoid this problem, or just count the number of matched files and recreate the name of the folder from this number.
you can use glob to check the number of files with the same name pattern:
import glob
n = len(glob.glob('run*')) # number of files which name starts with 'run'
new_run_name = 'run' + str(n)
Note: with this code the file names starts from 0, if you want to start from 1 just add 1 to n.
if you want always double digit run number (00, 01, 02) instead of 'str(n)' use 'str(n).zfill(2)'
example:
import glob
n = len(glob.glob('run*')) # number of files which name starts with 'run'
new_run_name = 'run' + str(n + 1).zfill(2)

How do you count subdirectories in a folder?

I figured out how to count directories in a folder, but not sure how I could edit my code to recursively count subdirectories. Any help would be appreciated.
This is my code so far.
def nestingLevel(path):
    """Return the number of non-hidden directories nested anywhere below ``path``.

    Names starting with a dot are skipped, along with everything inside them.
    """
    count = 0
    for item in os.listdir(path):
        if item.startswith('.'):
            continue
        n = os.path.join(path, item)
        if os.path.isdir(n):
            # One for the directory itself plus everything nested inside it.
            count += 1 + nestingLevel(n)
    return count
I think you may want to use os.walk:
import os
def fcount(path):
    """Return the total number of directories found anywhere below ``path``."""
    # os.walk visits every directory once; the `dirs` list of each visit
    # holds that directory's immediate subdirectories.
    return sum(len(dirs) for _root, dirs, _files in os.walk(path))
path = "/home/"
# print is a function in Python 3.
print(fcount(path))
You can use a glob here - the ** pattern indicates a recursive glob. The trailing slash matches on directories, excluding other types of files.
from pathlib import Path
def recursive_subdir_count(path):
    """Return the number of directories nested anywhere below ``path``.

    Hidden directories (names starting with a dot) are included.
    """
    dirs = Path(path).glob('**/')
    result = sum(1 for dir in dirs)
    result -= 1 # discount `path` itself
    # Without this return the function always produced None.
    return result
Using / works on windows, macOS, and Linux, so don't worry about putting os.sep instead.
Beware of a weird edge case: shell globs typically exclude hidden directories, i.e. those which begin with a ., but pathlib includes those (it's a feature, not a bug: see issue26096). If you care about discounting hidden directories, filter them out in the expression when calling sum. Or, use the older module glob which excludes them by default.
If you want to count them all without the root, this will do it:
len([i for i, j, k in os.walk('.')])-1

Count all files in all folders/subfolders with Python

Which is the most efficient way to count all files in all folders and subfolders in Python? I want to use this on Linux systems.
Example output:
(Path files)
/ 2
/bin 100
/boot 20
/boot/efi/EFI/redhat 1
....
/root 34
....
Paths without a file should be ignored.
Thanks.
You can do it with os.walk();
import os
for root, dirs, files in os.walk('/some/path'):
if files:
print('{0} {1}'.format(root, len(files)))
Note that this will also include hidden files, i.e. those that begin with a dot (.).
import os
# One (folder, file count) tuple per folder that contains at least one file.
print([(root, len(files)) for root, _dirs, files in os.walk('/path') if files])
It returns a list of tuples of folders/subfolders and files count in /path.
OR
import os
for root, _dirs, files in os.walk('/path'):
    # Folders without any file are skipped.
    if files:
        print(root, len(files))
It prints folders/subfolders and files count in /path.
If you want to try a faster solution, you could try combining:
os.scandir() # from python 3.5.2
iterate recursively and use:
from itertools import count
counter = count()
# Iterators are advanced with the built-in next(); the .next() method is Python 2 only.
next(counter) # returns at first 0, next 1, 2, 3 ...
if next(counter) > 1000:
    print('dir with file count over 1000') # and use continue in for loop
Maybe that will be faster, because I think os.walk does things that are unnecessary for you.

get all folders (os.walk) that are older than x days, delete

How can I concisely express "get all folders older than x days"
I have a method getOldDirs(dirPath, olderThanDays), it must walk through a given root folder and return a list of folders that are older than say 7 days.
I call the above function from another function cleanOldFolders(). cleanOldFolders() will delete those folders similar to "rm -Rf
code that I have, how can I modify the loops concisely:
"""
Clean oldFolders
"""
def cleanOldFolders(self):
    """Look up the folders older than 7 days under this app's folder.

    The folder is built from ``self.folderRoot``, ``self.configMode`` and
    ``self.appId``. Deleting the folders is still to be written.
    """
    pathString = self.folderRoot + '/' + self.configMode + '/' + self.appId
    oldDirList = self.getOldDirs(pathString, 7)
    # Notify user that the following folders are deleted
    # remove all old dirs perhaps using shutil.rmtree for each folder in oldDirList, rm -Rf
    return
Get old dirs:
"""
get all subfolders under dirPath older than olderThanDays
"""
def getOldDirs(self,dirPath, olderThanDays):
    """Return the immediate subfolders of dirPath that self.isOlder() accepts.

    Only the first level is inspected; nothing is recursed into.
    """
    # What is the concise way of expressing Get me list of all dir/subdirs from "dirPath" that are older than "olderThanDays"
    # I know I have to use os.walk,
    # I want a concise loop like this - but should recurse using os.walk
    a = []
    for myfile in os.listdir(dirPath):
        # Build the full path once and reuse it for both checks.
        candidate = os.path.join(dirPath, myfile)
        if os.path.isdir(candidate) and self.isOlder(candidate, olderThanDays):
            a.append(candidate)
    # for root, dirs, files in os.walk(dirPath):
    #     for name in dirs:
    #         print os.path.join(root, name)
    return a
One of the nice things about os.walk() is that it does the recursing for you. For its usage in your application it's important to specify the optional keyword argument topdown as False because its default is True and os.rmdir() won't delete non-empty directories.
This means your code will need to delete all the files and subdirectories in each subdirectory it encounters before removing the subdirectory itself. To facilitate doing that, the directory list getOldDirs() returns should be in the order that the subdirectories need to be deleted in.
It's also important to note that in the following, the directory's age is calculated in fractional, not whole, days, which means that seconds count and that one that was only say, 6 days and 23 hours and 59 seconds old won't get put on the list to be deleted even though it is only two seconds away from being old enough.
import os
import time
def getOldDirs(self, dirPath, olderThanDays):
    """
    Yield every subfolder under dirPath that is older than olderThanDays.

    This is a generator, not a list: wrap the call in list() if a list is
    needed. The tree is walked bottom-up, so a folder's subfolders are
    always produced before the folder itself. Age is the time since the
    last modification, compared in seconds (fractional days).
    """
    maxAgeSeconds = olderThanDays * 86400  # convert days to seconds
    present = time.time()
    for root, dirs, files in os.walk(dirPath, topdown=False):
        for name in dirs:
            subDirPath = os.path.join(root, name)
            if (present - os.path.getmtime(subDirPath)) > maxAgeSeconds:
                yield subDirPath
This should be a starting point.
import os
from time import time as _time
SEVEN_DAYS = 60*60*24*7
def get_old_dirs(dir_path, older_than=SEVEN_DAYS):
    """Yield each folder below ``dir_path`` last modified more than ``older_than`` seconds ago.

    ``older_than`` defaults to seven days. The tree is walked top-down, so a
    folder is produced before any of its subfolders.
    """
    time_now = _time()
    for path, folders, files in os.walk(dir_path):
        for folder in folders:
            folder_path = os.path.join(path, folder)
            if (time_now - os.path.getmtime(folder_path)) > older_than:
                yield folder_path
list_of_folders = list(get_old_dirs("/some/path"))
Also, if you don't want to walk into folders that are older than older_than days (because you're going to delete them), you can prune the search tree by removing folder names from the folders list:
def get_old_dirs(dir_path, older_than=SEVEN_DAYS):
    """Yield old folders below ``dir_path`` without descending into them.

    A folder counts as old when it was last modified more than
    ``older_than`` seconds ago.
    """
    time_now = _time()
    for path, folders, files in os.walk(dir_path):
        # Iterate over a copy: `folders` itself is edited inside the loop.
        for folder in folders[:]:
            folder_path = os.path.join(path, folder)
            if (time_now - os.path.getmtime(folder_path)) > older_than:
                yield folder_path
                # Removing the name in place stops os.walk from entering it.
                folders.remove(folder)
This uses os.walk and gets you the list of directories older than 7 days:
import os
from datetime import date
old_dirs = []
today = date.today()
# start_path is the root directory to scan; set it before running this.
for root, dirs, files in os.walk(start_path):
    for name in dirs:
        dir_path = os.path.join(root, name)
        filedate = date.fromtimestamp(os.path.getmtime(dir_path))
        if (today - filedate).days > 7:
            # Store the full path: a bare name is ambiguous for nested
            # folders and cannot be used to delete or open the directory.
            old_dirs.append(dir_path)

Python list directory, subdirectory, and files

I'm trying to make a script to list all directory, subdirectory, and files in a given directory.
I tried this:
import sys, os
root = "/home/patate/directory/"
path = os.path.join(root, "targetdirectory")
for r, d, f in os.walk(path):
for file in f:
print(os.path.join(root, file))
Unfortunately it doesn't work properly.
I get all the files, but not their complete paths.
For example if the dir struct would be:
/home/patate/directory/targetdirectory/123/456/789/file.txt
It would print:
/home/patate/directory/targetdirectory/file.txt
What I need is the first result. Any help would be greatly appreciated! Thanks.
Use os.path.join to concatenate the directory and file name:
for path, subdirs, files in os.walk(root):
for name in files:
print(os.path.join(path, name))
Note the usage of path and not root in the concatenation, since using root would be incorrect.
In Python 3.4, the pathlib module was added for easier path manipulations. So the equivalent to os.path.join would be:
pathlib.PurePath(path, name)
The advantage of pathlib is that you can use a variety of useful methods on paths. If you use the concrete Path variant you can also do actual OS calls through them, like changing into a directory, deleting the path, opening the file it points to and much more.
Just in case... Getting all files in the directory and subdirectories matching some pattern (*.py for example):
import os
from fnmatch import fnmatch
root = '/some/directory'
pattern = "*.py"
for path, subdirs, files in os.walk(root):
for name in files:
if fnmatch(name, pattern):
print(os.path.join(path, name))
Couldn't comment so writing answer here. This is the clearest one-line I have seen:
import os
[os.path.join(path, name) for path, subdirs, files in os.walk(root) for name in files]
Here is a one-liner:
import os
[val for sublist in [[os.path.join(i[0], j) for j in i[2]] for i in os.walk('./')] for val in sublist]
# Meta comment to ease selecting text
The outer most val for sublist in ... loop flattens the list to be one dimensional. The j loop collects a list of every file basename and joins it to the current path. Finally, the i loop iterates over all directories and sub directories.
This example uses the hard-coded path ./ in the os.walk(...) call, you can supplement any path string you like.
Note: os.path.expanduser and/or os.path.expandvars can be used for paths strings like ~/
Extending this example:
Its easy to add in file basename tests and directoryname tests.
For Example, testing for *.jpg files:
... for j in i[2] if j.endswith('.jpg')] ...
Additionally, excluding the .git directory:
... for i in os.walk('./') if '.git' not in i[0].split('/')]
Another option would be using the glob module from the standard lib:
import glob
path = "/home/patate/directory/targetdirectory/**"
for path in glob.glob(path, recursive=True):
print(path)
If you need an iterator you can use iglob as an alternative:
for file in glob.iglob(my_path, recursive=True):
# ...
A bit simpler one-liner:
import os
from itertools import product, chain
chain.from_iterable([[os.sep.join(w) for w in product([i[0]], i[2])] for i in os.walk(dir)])
You can take a look at this sample I made. It was written around the os.path.walk function, which is deprecated (and no longer exists in Python 3), so beware. It uses a list to store all the file paths.
root = "Your root directory"
ex = ".txt"
where_to = "Wherever you wanna write your file to"
def fileWalker(ext,dirname,names):
    '''
    Collect the entries of one directory whose name matches an extension.

    ext is a two-item sequence: ext[0] is the extension to match (such as
    ".txt") and ext[1] is the list that matching paths are appended to.
    dirname is the directory being visited and names the entries in it.
    '''
    pat = "*" + ext[0]
    for f in names:
        if fnmatch.fnmatch(f,pat):
            ext[1].append(os.path.join(dirname,f))
def writeTo(fList):
with open(where_to,"w") as f:
for di_r in fList:
f.write(di_r + "\n")
if __name__ == '__main__':
    li = []
    # os.path.walk() no longer exists in Python 3, so the same callback is
    # driven from os.walk(): called once per directory with all its entries.
    for dirname, subdirs, files in os.walk(root):
        fileWalker([ex, li], dirname, subdirs + files)
    writeTo(li)
Since every example here is just using walk (with join), i'd like to show a nice example and comparison with listdir:
import os, time
def listFiles1(root): # listdir
    """Return every file below root, breadth-first, as '/'-joined paths."""
    from collections import deque  # local import keeps this snippet self-contained
    allFiles = []
    walk = deque([root])  # folders still to visit; popleft() is O(1), list.pop(0) is O(n)
    while walk:
        folder = walk.popleft() + "/"
        for name in os.listdir(folder):  # folders + files
            item = folder + name
            if os.path.isdir(item):
                walk.append(item)
            else:
                allFiles.append(item)
    return allFiles
def listFiles2(root): # listdir/join (takes ~1.4x as long) (and uses '\\' instead)
    """Return every file below root, breadth-first, joined with os.path.join."""
    from collections import deque  # local import keeps this snippet self-contained
    allFiles = []
    walk = deque([root])  # folders still to visit; popleft() is O(1), list.pop(0) is O(n)
    while walk:
        folder = walk.popleft()
        for name in os.listdir(folder):  # folders + files
            item = os.path.join(folder, name)
            if os.path.isdir(item):
                walk.append(item)
            else:
                allFiles.append(item)
    return allFiles
def listFiles3(root): # walk (takes ~1.5x as long)
    """Return every file below root via os.walk, as '/'-joined paths."""
    allFiles = []
    for folder, folders, files in os.walk(root):
        # Normalise the folder once per directory instead of once per file.
        prefix = folder.replace("\\","/")+"/" # folder+"\\"+file still ~1.5x
        allFiles.extend(prefix + file for file in files)
    return allFiles
def listFiles4(root): # walk/join (takes ~1.6x as long) (and uses '\\' instead)
    """Return every file below root via os.walk, joined with os.path.join."""
    allFiles = []
    for folder, folders, files in os.walk(root):
        allFiles.extend(os.path.join(folder, file) for file in files)
    return allFiles
for i in range(100): files = listFiles1("src") # warm up
start = time.time()
for i in range(100): files = listFiles1("src") # listdir
print("Time taken: %.2fs"%(time.time()-start)) # 0.28s
start = time.time()
for i in range(100): files = listFiles2("src") # listdir and join
print("Time taken: %.2fs"%(time.time()-start)) # 0.38s
start = time.time()
for i in range(100): files = listFiles3("src") # walk
print("Time taken: %.2fs"%(time.time()-start)) # 0.42s
start = time.time()
for i in range(100): files = listFiles4("src") # walk and join
print("Time taken: %.2fs"%(time.time()-start)) # 0.47s
So as you can see for yourself, the listdir version is much more efficient. (and that join is slow)
Using any supported Python version (3.4+), you should use Path.rglob from pathlib to recursively list the contents of the current directory and all subdirectories:
from pathlib import Path
def generate_all_files(root: Path, only_files: bool = True):
    """Yield every path below ``root``, recursively.

    Args:
        root: Directory to start from.
        only_files: When true (the default), yield only paths for which
            ``is_file()`` is true; otherwise directories are yielded too.
    """
    for p in root.rglob("*"):
        if only_files and not p.is_file():
            continue
        yield p
for p in generate_all_files(Path("."), only_files=False):
print(p)
If you want something copy-pasteable:
Example
Folder structure:
$ tree . -a
.
├── a.txt
├── bar
├── b.py
├── collect.py
├── empty
├── foo
│   └── bar.bz.gz2
├── .hidden
│   └── secrect-file
└── martin
└── thoma
└── cv.pdf
gives:
$ python collect.py
bar
empty
.hidden
collect.py
a.txt
b.py
martin
foo
.hidden/secrect-file
martin/thoma
martin/thoma/cv.pdf
foo/bar.bz.gz2
And this is how you list it in case you want to list the files on SharePoint. Your path will probably start after the "\teams\" part
import os
root = r"\\mycompany.sharepoint.com#SSL\DavWWWRoot\teams\MyFolder\Policies and Procedures\Deal Docs\My Deals"
# Named file_paths rather than `list` so the builtin list type stays usable.
file_paths = [os.path.join(path, name) for path, subdirs, files in os.walk(root) for name in files]
print(file_paths)
It's just an addition, with this you can get the data into CSV format
import os
import sys
from fnmatch import fnmatch
try:
    import pandas as pd
except ImportError:
    # Install into the interpreter that is running this script, then import
    # again; without the second import `pd` would still be undefined below.
    import subprocess
    subprocess.check_call([sys.executable, "-m", "pip", "install", "pandas"])
    import pandas as pd
root = "/home/kiran/Downloads/MainFolder" # it may have many subfolders and files inside
lst = []
pattern = "*.csv" #I want to get only csv files
pattern = "*.*" # Note: Use this pattern to get all types of files and folders
for path, subdirs, files in os.walk(root):
    for name in files:
        if fnmatch(name, pattern):
            lst.append(os.path.join(path, name))
df = pd.DataFrame({"filePaths":lst})
df.to_csv("filepaths.csv")
Pretty simple solution would be to run a couple of sub process calls to export the files into CSV format:
import subprocess
# Global variables for directory being mapped
location = '.' # Enter the path here.
pattern = '*.py' # Use this if you want to only return certain filetypes
rootDir = location.rpartition('/')[-1]
outputFile = rootDir + '_directory_contents.csv'
# Find the requested data and export to CSV, specifying a pattern if needed.
# The command is an argument list run without a shell. In a shell string the
# unquoted pattern would be expanded by the shell whenever a matching file
# exists in the current directory, and paths with spaces would be split.
find_cmd = [
    'find', location,
    '-name', pattern,
    '-fprintf', outputFile, '%Y%M,%n,%u,%g,%s,%A+,%P\\n',  # find itself expands \n
]
subprocess.call(find_cmd)
That command produces comma separated values that can be easily analyzed in Excel.
f-rwxrwxrwx,1,cathy,cathy,2642,2021-06-01+00:22:00.2970880000,content-audit.py
The resulting CSV file doesn't have a header row, but you can use a second command to add them.
# Add headers to the CSV
headers_cmd = 'sed -i.bak 1i"Permissions,Links,Owner,Group,Size,ModifiedTime,FilePath" ' + outputFile
subprocess.call(headers_cmd, shell=True)
Depending on how much data you get back, you can massage it further using Pandas. Here are some things I found useful, especially if you're dealing with many levels of directories to look through.
Add these to your imports:
import numpy as np
import pandas as pd
Then add this to your code:
# Create DataFrame from the csv file created above.
# NOTE: convert_bytes() (shown further down) must be defined before this code runs.
df = pd.read_csv(outputFile)
# Format columns
# Get the filename and file extension from the filepath
# (the split limit is passed as n=...; recent pandas versions reject it positionally)
df['FileName'] = df['FilePath'].str.rsplit("/", n=1).str[-1]
df['FileExt'] = df['FileName'].str.rsplit('.', n=1).str[1]
# Get the full path to the files. If the path doesn't include a "/" it's the root directory
df['FullPath'] = df["FilePath"].str.rsplit("/", n=1).str[0]
# Test the original FilePath: a file one level down ("a/x.py") has a FullPath
# without "/" ("a") and would otherwise be filed under the root directory.
df['FullPath'] = np.where(df['FilePath'].str.contains("/"), df['FullPath'], rootDir)
# Split the path into columns for the parent directory and its children
df['ParentDir'] = df['FullPath'].str.split("/", n=1).str[0]
df['SubDirs'] = df['FullPath'].str.split("/", n=1).str[1]
# Account for NaN returns, indicates the path is the root directory
df['SubDirs'] = df['SubDirs'].fillna('')
# Determine if the item is a directory or file.
df['Type'] = np.where(df['Permissions'].str.startswith('d'), 'Dir', 'File')
# Split the time stamp into date and time columns
df[['ModifiedDate', 'Time']] = df.ModifiedTime.str.rsplit('+', n=1, expand=True)
df['Time'] = df['Time'].str.split('.').str[0]
# Show only files, output includes paths so you don't necessarily need to display the individual directories.
df = df[df['Type'] == 'File']
# Set columns to show and their order.
# ('FileExt' is the extension column created above; no 'DocType' column exists.)
df = df[['FileName','ParentDir','SubDirs','FullPath','FileExt','ModifiedDate','Time', 'Size']].copy()
# Convert the filesize from bytes to something more readable.
df['Size'] = df['Size'].map(convert_bytes)
# Send the data to an Excel workbook with sheets by parent directory
with pd.ExcelWriter("scripts_directory_contents.xlsx") as writer:
    for directory, data in df.groupby('ParentDir'):
        data.to_excel(writer, sheet_name = directory, index=False)
# To convert sizes to be more human readable
def convert_bytes(size):
    """Format a size in bytes as a string such as ``"1.5 K"``.

    Units go from ``b`` to ``T`` in steps of 1024. Anything of 1024 T or more
    is still reported in ``T``, so the result is always a string.
    """
    units = ['b', 'K', 'M', 'G', 'T']
    for x in units[:-1]:
        if size < 1024:
            return "%3.1f %s" % (size, x)
        size /= 1024
    # Largest unit: format whatever is left instead of returning a bare number.
    return "%3.1f %s" % (size, units[-1])

Categories