How can I concisely express "get all folders older than x days"
I have a method getOldDirs(dirPath, olderThanDays); it must walk through a given root folder and return a list of folders that are older than, say, 7 days.
I call this function from another function, cleanOldFolders(), which deletes those folders, similar to "rm -Rf".
Here is the code I have; how can I make the loops more concise?
"""
Clean oldFolders
"""
def cleanOldFolders(self):
pathString = self.folderRoot + '/' + self.configMode + '/' + self.appId
oldDirList = self.getOldDirs(pathString, 7);
# Notify user that the following folders are deleted
# remove all old dirs perhaps using shutil.removetree for each folder oldDirList, rm -Rf
return
Get old dirs:
"""
get all subfolders under dirPath older than olderThanDays
"""
def getOldDirs(self,dirPath, olderThanDays):
# What is the concise way of expressing Get me list of all dir/subdirs from "dirPath" that are older than "olderThanDays"
# I know I have to use os.walk,
# I want a concise loop like this - but should recurse using os.walk
a = [os.path.join(dirPath, myfile) for myfile in os.listdir(dirPath)
if (os.path.isdir(os.path.join(dirPath, myfile)) and
(self.isOlder(os.path.join(dirPath, myfile), olderThanDays))
)]
# for root, dirs, files in os.walk(dirPath):
# for name in dirs:
# print os.path.join(root, name)
return a
One of the nice things about os.walk() is that it does the recursing for you. For your application it's important to specify the optional keyword argument topdown as False, because its default is True and os.rmdir() won't delete non-empty directories.
This means your code will need to delete all the files and subdirectories in each subdirectory it encounters before removing the subdirectory itself. To facilitate that, the directory list getOldDirs() returns should be in the order the subdirectories need to be deleted in.
It's also important to note that in the following, the directory's age is calculated in fractional, not whole, days, which means that seconds count: a directory that is, say, only 6 days, 23 hours, 59 minutes, and 58 seconds old won't get put on the list to be deleted, even though it is only two seconds away from being old enough.
import os
import time

def getOldDirs(self, dirPath, olderThanDays):
    """
    return a list of all subfolders under dirPath older than olderThanDays
    """
    olderThanDays *= 86400  # convert days to seconds
    present = time.time()
    for root, dirs, files in os.walk(dirPath, topdown=False):
        for name in dirs:
            subDirPath = os.path.join(root, name)
            if (present - os.path.getmtime(subDirPath)) > olderThanDays:
                yield subDirPath
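For the deletion side, here is a minimal sketch of cleanOldFolders() consuming this generator, using shutil.rmtree (which, unlike os.rmdir, removes non-empty directories); the attribute names are the ones from the question:

import shutil

def cleanOldFolders(self):
    pathString = os.path.join(self.folderRoot, self.configMode, self.appId)
    for oldDir in self.getOldDirs(pathString, 7):
        print("Deleting:", oldDir)  # notify the user which folders are deleted
        shutil.rmtree(oldDir)       # recursive delete, like rm -Rf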
This should be a starting point.
import os
from time import time as _time

SEVEN_DAYS = 60*60*24*7

def get_old_dirs(dir_path, older_than=SEVEN_DAYS):
    time_now = _time()
    for path, folders, files in os.walk(dir_path):
        for folder in folders:
            folder_path = os.path.join(path, folder)
            if (time_now - os.path.getmtime(folder_path)) > older_than:
                yield folder_path

list_of_folders = list(get_old_dirs("/some/path"))
Also, if you don't want to walk into folders that are older than older_than days (because you're going to delete them), you can prune the search tree by removing folder names from the folders list:
def get_old_dirs(dir_path, older_than=SEVEN_DAYS):
    time_now = _time()
    for path, folders, files in os.walk(dir_path):
        for folder in folders[:]:
            folder_path = os.path.join(path, folder)
            if (time_now - os.path.getmtime(folder_path)) > older_than:
                yield folder_path
                folders.remove(folder)
This uses os.walk and gets you the list of directories older than 7 days:
import os
from datetime import date

old_dirs = []
today = date.today()
for root, dirs, files in os.walk(start_path):
    for name in dirs:
        filedate = date.fromtimestamp(os.path.getmtime(os.path.join(root, name)))
        if (today - filedate).days > 7:
            old_dirs.append(os.path.join(root, name))  # append the full path, not just the name
Related
I need to retrieve the directory of the most recently created folder. I am using a program that will output a new run## folder each time it is executed (i.e. run01, run02, run03 and so on). Within any one run## folder resides a data file that I want to analyze (file-i-want.txt).
run_numb = 'run01'
dir = os.path.dirname(__file__)
filepath = os.path.join(dir, r'..\data\directory', run_numb, 'file-i-want.txt')
In short, I want to skip having to hardcode run## and just get the directory of a file within the most recently created run## folder.
You can get the creation date with os.stat (note that st_birthtime is only available on some platforms, e.g. macOS and other BSDs):
path = '/a/b/c'

# newest
newest = max(os.listdir(path), key=lambda x: os.stat(os.path.join(path, x)).st_birthtime)

# all files sorted
sorted_files = sorted(os.listdir(path), key=lambda x: os.stat(os.path.join(path, x)).st_birthtime, reverse=True)
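On platforms without st_birthtime (e.g. Linux), a common fallback, assuming modification time is an acceptable stand-in for creation time, is st_mtime. A minimal sketch (newest_entry is a hypothetical helper name):

import os

def newest_entry(path):
    # use modification time where creation time isn't available
    return max(os.listdir(path),
               key=lambda x: os.stat(os.path.join(path, x)).st_mtime)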
pathlib is recommended over os for filesystem-related tasks.
You can try:
from pathlib import Path

filepath = Path(__file__).parent / 'data/directory'
fnames = sorted(Path(filepath).rglob('file-i-want.txt'), key=lambda x: x.stat().st_mtime, reverse=True)
filepath = str(fnames[0])
print(filepath)
glob.glob('run*') will return the list of files/directories that match the pattern, but in arbitrary order, so sort the result by name.
So if you want the latest run, your code would be:
import glob

print(sorted(glob.glob('run*'))[-1])  # raises IndexError if there are no runs
IMPORTANT: the entries are ordered by name, so, for example, 'run21' will come AFTER 'run100'. You will need to use a high enough number of digits to not see this error, or just count the number of matched files and recreate the name of the folder with this number.
You can use glob to count the number of files with the same name pattern:
import glob

n = len(glob.glob('run*'))  # number of files whose name starts with 'run'
new_run_name = 'run' + str(n)
Note: with this code the file names start from 0; if you want to start from 1, just add 1 to n.
If you always want a double-digit run number (00, 01, 02), use 'str(n).zfill(2)' instead of 'str(n)'.
Example:
import glob

n = len(glob.glob('run*'))  # number of files whose name starts with 'run'
new_run_name = 'run' + str(n + 1).zfill(2)
I need your advice on this problem.
I have collected what I need into these two lists, simpl2 and astik, with this code:
simpl2 = []
astik = []
for path, subdirs, files in os.walk(rootfolder):
    for name in files:
        if 'sim2.shp' == name:
            simpl2.append(os.path.join(path, name))
        elif 'ASTIK.shp' == name:
            astik.append(os.path.join(path, name))
The code above searches in a rootfolder that contains the folders v1, v2, v3, v4.
So using this:
for i, j in zip(simpl2, astik):
    print(i, j)
gives this:
CONTENT
C:\Users\user\Desktop\pl\v1\exported\sim2.shp C:\Users\user\Desktop\pl\v1\ASTIK\ASTIK.shp
C:\Users\user\Desktop\pl\v2\exported\sim2.shp C:\Users\user\Desktop\pl\v4\ASTIK\ASTIK.shp
Question
How can I ensure that the pairs come from the same folder (like the first row, where both come from v1), and, when they don't (like the second row, where one is from v2 and the other from v4), that they are not paired at all?
This matters because the pairs will be used later and they have to be correct; I already have code with an exception for entries that don't have a pair, so the problem is just fixing the pairing described above.
Explanation
The rootfolder is:
C:\Users\user\Desktop\pl
Under pl there are the folders v1, v2, v3, v4. Each of these folders has some files that are common to all four folders; the only difference is that some will be empty. I just want to check that correct pairs from the same v folder are created in the lists.
OK, seeing your update, maybe you are interested in something more like this:
import os

simpl2 = []
astik = []
rootfolder = r'C:\Users\user\Desktop\pl'
subfolders = [os.path.join(rootfolder, i) for i in ['v1', 'v2', 'v3', 'v4']]

for folder in subfolders:
    temp = {name: os.path.join(path, name)
            for path, subdirs, files in os.walk(folder)
            for name in files
            if name in ['sim2.shp', 'ASTIK.shp']}
    if len(temp) == 2:
        simpl2.append(temp['sim2.shp'])
        astik.append(temp['ASTIK.shp'])
OLD CODE
But... if this is your end goal, you could also just store the paths: if a path is stored, you know it contains both files. You can then easily build the full file paths with os.path.join() when needed.
paths = []
for path, subdirs, files in os.walk(rootfolder):
    if ('sim2.shp' in files) and ('ASTIK.shp' in files):
        paths.append(path)
Or a more compact format:
lookfor = ['sim2.shp', 'ASTIK.shp']
paths = [p for p, s, f in os.walk(rootfolder) if all(i in f for i in lookfor)]
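From the stored paths, correctly matched pairs can then be rebuilt, for example:

simpl2 = [os.path.join(p, 'sim2.shp') for p in paths]
astik = [os.path.join(p, 'ASTIK.shp') for p in paths]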
this is my first post, so be gentle. ;)
PROBLEM: I would like to be able to use os.walk as a directory walker, but not go into certain folders. Ex:
Tree:
\Proj1_0
    \Load001
        \lib
        \src
\Proj2_0
    \Load001
        \lib
        \src
    \Load002
        \lib
        \src
I want to show the projects and loads, but not the sub-directories under loads. I can do that using the following code.
import os

for root, subFolders, files in os.walk('.'):
    # root does NOT contain 'Load'
    if root.find('Load') == -1:
        print("\nPROJECT: " + root + "\n")
        for folder in subFolders:
            print("    " + folder)
However, the list is a big list, so I tried using del but could not get it to work right, and the same thing using lists, such as this (which I got from another post here):
def my_walk(top_dir, ignore):
    for dirpath, dirnames, filenames in os.walk(top_dir):
        dirnames[:] = [
            dn for dn in dirnames
            if os.path.join(dirpath, dn) not in ignore]
        yield dirpath, dirnames, filename

list my_walk('.','Load')
But I could not get the return to work properly, either. I am new to Python and appreciate any help. Thanks!
Try:
dirnames[:] = [
    dn for dn in dirnames
    if ignore not in os.path.join(dirpath, dn)]
You want to keep directories where os.path.join(dirpath, dn) does not contain the string ignore.
By the way, you are right to use dirnames[:] on the left-hand side of the assignment. To prune the directories visited by os.walk, you have to modify the same list dirnames.
dirnames[:] = ... modifies the same list in-place.
dirnames = ... would rebind the name dirnames to a different object, leaving the list that os.walk uses untouched.
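Putting it together, here is a corrected version of the question's my_walk; note the question's code also had a typo (filename vs. filenames), and list my_walk(...) needs parentheses:

import os

def my_walk(top_dir, ignore):
    for dirpath, dirnames, filenames in os.walk(top_dir):
        # prune in place: drop directories whose path contains the ignore string
        dirnames[:] = [dn for dn in dirnames
                       if ignore not in os.path.join(dirpath, dn)]
        yield dirpath, dirnames, filenames

results = list(my_walk('.', 'Load'))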
You can try the following:
for x in os.walk('.', topdown=True):
    dirpath, dirnames, dirfiles = x
    print(dirpath, dirnames)
    dirnames[:] = filter(lambda x: not x.startswith('Load'), dirnames)
From help(os.walk), you can modify the names if topdown is True, in order to restrict the search.
BTW, this is what I ended up with...
import os

path = '.'
path = os.path.normpath(path)
res = []
for root, dirs, files in os.walk(path, topdown=True):
    depth = root[len(path) + len(os.path.sep):].count(os.path.sep)
    if depth == 2:
        # We're currently two directories in, so all subdirs have depth 3
        res += [os.path.join(root, d) for d in dirs]
        dirs[:] = []  # Don't recurse any deeper
print(res)
I know this is an old post, but thought I should update it with my answer. In case, anyone else finds it useful.
I have a directory logfiles. I want to process each file inside this directory using a Python script.
for file in directory:
    # do something
How do I do this?
With os.listdir() or os.walk(), depending on whether you want to do it recursively.
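A minimal sketch of both approaches, assuming the directory is named logfiles as in the question:

import os

# non-recursive: only the entries directly inside logfiles
for name in os.listdir('logfiles'):
    path = os.path.join('logfiles', name)
    if os.path.isfile(path):
        print(path)  # do something with the file

# recursive: logfiles and all of its subdirectories
for dirpath, dirnames, filenames in os.walk('logfiles'):
    for name in filenames:
        print(os.path.join(dirpath, name))  # do something with the file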
In Python 2, you can try something like:
import os.path

def print_it(x, dir_name, files):
    print dir_name
    print files

os.path.walk(your_dir, print_it, 0)
Note: the 3rd argument of os.path.walk is whatever you want. You'll get it as the 1st arg of the callback.
In Python 3 os.path.walk has been removed; use os.walk instead. Instead of taking a callback, you just pass it a directory and it yields (dirpath, dirnames, filenames) triples. So a rough equivalent of the above becomes
import os

for dirpath, dirnames, filenames in os.walk(your_dir):
    print(dirpath)
    print(dirnames)
    print(filenames)
You can list every file from a directory recursively like this.
from os import listdir
from os.path import isfile, join, isdir

def getAllFilesRecursive(root):
    files = [join(root, f) for f in listdir(root) if isfile(join(root, f))]
    dirs = [d for d in listdir(root) if isdir(join(root, d))]
    for d in dirs:
        files_in_d = getAllFilesRecursive(join(root, d))
        if files_in_d:
            for f in files_in_d:
                files.append(f)  # f is already a full path from the recursive call
    return files
import os

# location of directory you want to scan
loc = '/home/sahil/Documents'

# global dictionary used to store all results
k1 = {}

# scan() recursively scans through all the directories in loc and returns a dictionary
def scan(element, loc):
    for name in element:
        try:
            temp = loc + '/' + name
            second_list = os.listdir(temp)
            print(".....")
            print("Directory %s " % temp)
            print(" ")
            print(second_list)
            k1[temp] = second_list
            scan(second_list, temp)
        except OSError:
            pass
    return k1  # return the dictionary

# initial steps
try:
    initial_list = os.listdir(loc)
    print(initial_list)
except OSError:
    print("error")

k = scan(initial_list, loc)
print(" ...................................................................................")
print(k)
You could try glob:
import glob

for file in glob.glob('log-*-*.txt'):
    pass  # etc.
But glob originally didn't work recursively (recursive matching with ** was added in Python 3.5), so if your logs are in folders inside of that directory, you'd be better off looking at what Ignacio Vazquez-Abrams posted.
If you need to check for multiple file types, use
glob.glob("*.jpg") + glob.glob("*.png")
Glob doesn't care about the ordering of the files in the list. If you need files sorted by filename, use
sorted(glob.glob("*.jpg"))
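Also worth noting: since Python 3.5, glob can recurse on its own using the ** pattern with recursive=True, for example:

import glob

# find matching logs anywhere under the current directory (Python 3.5+)
for path in glob.glob('**/log-*-*.txt', recursive=True):
    print(path)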
import os

rootDir = '.'
for dirName, subdirList, fileList in os.walk(rootDir):
    print('Found directory: %s' % dirName)
    for fname in fileList:
        print('\t%s' % fname)
    # Remove the first entry in the list of sub-directories
    # if there are any sub-directories present
    if len(subdirList) > 0:
        del subdirList[0]
Here's my version of the recursive file walker, based on Matheus Araujo's answer, that can take optional exclusion-list arguments, which happens to be very helpful when dealing with tree copies where some directories / files / file extensions aren't wanted.
import os

def get_files_recursive(root, d_exclude_list=[], f_exclude_list=[], ext_exclude_list=[], primary_root=None):
    """
    Walk a path to recursively find files
    Modified version of https://stackoverflow.com/a/24771959/2635443 that includes exclusion lists

    :param root: path to explore
    :param d_exclude_list: list of root-relative directory paths to exclude
    :param f_exclude_list: list of filenames without paths to exclude
    :param ext_exclude_list: list of file extensions to exclude, ex: ['.log', '.bak']
    :param primary_root: only used for internal recursive exclusion lookup, don't pass an argument here
    :return: list of files found in path
    """
    # Make sure we use a valid os separator for exclusion lists, this is done recursively :(
    d_exclude_list = [os.path.normpath(d) for d in d_exclude_list]

    files = [os.path.join(root, f) for f in os.listdir(root) if os.path.isfile(os.path.join(root, f))
             and f not in f_exclude_list and os.path.splitext(f)[1] not in ext_exclude_list]
    dirs = [d for d in os.listdir(root) if os.path.isdir(os.path.join(root, d))]
    for d in dirs:
        p_root = os.path.join(primary_root, d) if primary_root is not None else d
        if p_root not in d_exclude_list:
            files_in_d = get_files_recursive(os.path.join(root, d), d_exclude_list, f_exclude_list,
                                             ext_exclude_list, primary_root=p_root)
            files.extend(files_in_d)  # these are already full paths from the recursive call
    return files
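Hypothetical usage of this version (the excluded names here are just examples):

files = get_files_recursive('.', d_exclude_list=['.git'], ext_exclude_list=['.bak'])
for f in files:
    print(f)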
This is an update of my last version that accepts glob style wildcards in exclude lists.
The function basically walks into every subdirectory of the given path and returns the list of all files from those directories, as relative paths.
The function works like Matheus' answer, and may use the optional exclude lists.
Eg:
files = get_files_recursive('/some/path')
files = get_files_recursive('/some/path', f_exclude_list=['.cache', '*.bak'])
files = get_files_recursive('C:\\Users', d_exclude_list=['AppData', 'Temp'])
files = get_files_recursive('/some/path', ext_exclude_list=['.log', '.db'])
Hope this helps someone like the initial answer of this thread helped me :)
import os
from fnmatch import fnmatch

def glob_path_match(path, pattern_list):
    """
    Checks if path is in a list of glob style wildcard paths
    :param path: path of file / directory
    :param pattern_list: list of wildcard patterns to check for
    :return: Boolean
    """
    return any(fnmatch(path, pattern) for pattern in pattern_list)

def get_files_recursive(root, d_exclude_list=None, f_exclude_list=None, ext_exclude_list=None, primary_root=None):
    """
    Walk a path to recursively find files
    Modified version of https://stackoverflow.com/a/24771959/2635443 that includes exclusion lists
    and accepts glob style wildcards on files and directories
    :param root: path to explore
    :param d_exclude_list: list of root-relative directory paths to exclude
    :param f_exclude_list: list of filenames without paths to exclude
    :param ext_exclude_list: list of file extensions to exclude, ex: ['.log', '.bak']
    :param primary_root: only used for internal recursive exclusion lookup, don't pass an argument here
    :return: list of files found in path
    """
    if d_exclude_list is not None:
        # Make sure we use a valid os separator for exclusion lists, this is done recursively :(
        d_exclude_list = [os.path.normpath(d) for d in d_exclude_list]
    else:
        d_exclude_list = []
    if f_exclude_list is None:
        f_exclude_list = []
    if ext_exclude_list is None:
        ext_exclude_list = []

    files = [os.path.join(root, f) for f in os.listdir(root) if os.path.isfile(os.path.join(root, f))
             and not glob_path_match(f, f_exclude_list) and os.path.splitext(f)[1] not in ext_exclude_list]
    dirs = [d for d in os.listdir(root) if os.path.isdir(os.path.join(root, d))]
    for d in dirs:
        p_root = os.path.join(primary_root, d) if primary_root is not None else d
        if not glob_path_match(p_root, d_exclude_list):
            files_in_d = get_files_recursive(os.path.join(root, d), d_exclude_list, f_exclude_list,
                                             ext_exclude_list, primary_root=p_root)
            files.extend(files_in_d)  # these are already full paths from the recursive call
    return files
I'm trying to make a script to list all directories, subdirectories, and files in a given directory.
I tried this:
import sys, os

root = "/home/patate/directory/"
path = os.path.join(root, "targetdirectory")

for r, d, f in os.walk(path):
    for file in f:
        print(os.path.join(root, file))
Unfortunately it doesn't work properly.
I get all the files, but not their complete paths.
For example, if the dir structure is:
/home/patate/directory/targetdirectory/123/456/789/file.txt
It would print:
/home/patate/directory/targetdirectory/file.txt
What I need is the first result. Any help would be greatly appreciated! Thanks.
Use os.path.join to concatenate the directory and file name:
for path, subdirs, files in os.walk(root):
    for name in files:
        print(os.path.join(path, name))
Note the usage of path and not root in the concatenation, since using root would be incorrect.
In Python 3.4, the pathlib module was added for easier path manipulations. So the equivalent to os.path.join would be:
pathlib.PurePath(path, name)
The advantage of pathlib is that you can use a variety of useful methods on paths. If you use the concrete Path variant you can also do actual OS calls through them, like changing into a directory, deleting the path, opening the file it points to and much more.
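For instance, a pathlib-only sketch of the same recursive listing, reusing the question's target directory:

from pathlib import Path

root = Path("/home/patate/directory/targetdirectory")
for p in root.rglob("*"):  # recurses through all subdirectories
    if p.is_file():
        print(p)  # p is already a full path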
Just in case... Getting all files in the directory and subdirectories matching some pattern (*.py for example):
import os
from fnmatch import fnmatch

root = '/some/directory'
pattern = "*.py"

for path, subdirs, files in os.walk(root):
    for name in files:
        if fnmatch(name, pattern):
            print(os.path.join(path, name))
Couldn't comment, so writing an answer here. This is the clearest one-liner I have seen:
import os
[os.path.join(path, name) for path, subdirs, files in os.walk(root) for name in files]
Here is a one-liner:
import os
[val for sublist in [[os.path.join(i[0], j) for j in i[2]] for i in os.walk('./')] for val in sublist]
The outermost val for sublist in ... loop flattens the list to be one-dimensional. The j loop collects a list of every file basename and joins it to the current path. Finally, the i loop iterates over all directories and subdirectories.
This example uses the hard-coded path ./ in the os.walk(...) call; you can substitute any path string you like.
Note: os.path.expanduser and/or os.path.expandvars can be used for path strings like ~/
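For instance (the '~/projects' path is just a placeholder):

import os

# expand '~' to the user's home directory before walking
for path, subdirs, files in os.walk(os.path.expanduser('~/projects')):
    print(path)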
Extending this example:
It's easy to add file basename tests and directory name tests.
For example, testing for *.jpg files:
... for j in i[2] if j.endswith('.jpg')] ...
Additionally, excluding the .git directory:
... for i in os.walk('./') if '.git' not in i[0].split('/')]
Another option would be using the glob module from the standard lib:
import glob

path = "/home/patate/directory/targetdirectory/**"
for path in glob.glob(path, recursive=True):
    print(path)
If you need an iterator you can use iglob as an alternative:
for file in glob.iglob(my_path, recursive=True):
    ...  # process the file
A bit simpler one-liner:
import os
from itertools import product, chain

chain.from_iterable([[os.sep.join(w) for w in product([i[0]], i[2])] for i in os.walk(dir_path)])
You can take a look at this sample I made. It uses the os.path.walk function, which is deprecated (and removed in Python 3), so beware. It uses a list to store all the file paths:
root = "Your root directory"
ex = ".txt"
where_to = "Wherever you wanna write your file to"
def fileWalker(ext,dirname,names):
'''
checks files in names'''
pat = "*" + ext[0]
for f in names:
if fnmatch.fnmatch(f,pat):
ext[1].append(os.path.join(dirname,f))
def writeTo(fList):
with open(where_to,"w") as f:
for di_r in fList:
f.write(di_r + "\n")
if __name__ == '__main__':
li = []
os.path.walk(root,fileWalker,[ex,li])
writeTo(li)
Since every example here just uses walk (with join), I'd like to show a nice example and comparison with listdir:
import os, time

def listFiles1(root):  # listdir
    allFiles = []; walk = [root]
    while walk:
        folder = walk.pop(0)+"/"; items = os.listdir(folder)  # items = folders + files
        for i in items:
            i = folder+i
            (walk if os.path.isdir(i) else allFiles).append(i)
    return allFiles

def listFiles2(root):  # listdir/join (takes ~1.4x as long) (and uses '\\' instead)
    allFiles = []; walk = [root]
    while walk:
        folder = walk.pop(0); items = os.listdir(folder)  # items = folders + files
        for i in items:
            i = os.path.join(folder, i)
            (walk if os.path.isdir(i) else allFiles).append(i)
    return allFiles

def listFiles3(root):  # walk (takes ~1.5x as long)
    allFiles = []
    for folder, folders, files in os.walk(root):
        for file in files:
            allFiles += [folder.replace("\\", "/")+"/"+file]  # folder+"\\"+file still ~1.5x
    return allFiles

def listFiles4(root):  # walk/join (takes ~1.6x as long) (and uses '\\' instead)
    allFiles = []
    for folder, folders, files in os.walk(root):
        for file in files:
            allFiles += [os.path.join(folder, file)]
    return allFiles

for i in range(100): files = listFiles1("src")  # warm up

start = time.time()
for i in range(100): files = listFiles1("src")  # listdir
print("Time taken: %.2fs" % (time.time()-start))  # 0.28s

start = time.time()
for i in range(100): files = listFiles2("src")  # listdir and join
print("Time taken: %.2fs" % (time.time()-start))  # 0.38s

start = time.time()
for i in range(100): files = listFiles3("src")  # walk
print("Time taken: %.2fs" % (time.time()-start))  # 0.42s

start = time.time()
for i in range(100): files = listFiles4("src")  # walk and join
print("Time taken: %.2fs" % (time.time()-start))  # 0.47s
So as you can see for yourself, the listdir version is much more efficient (and os.path.join is slow).
Using any supported Python version (3.4+), you should use pathlib.Path.rglob to recursively list the contents of the current directory and all subdirectories:
from pathlib import Path

def generate_all_files(root: Path, only_files: bool = True):
    for p in root.rglob("*"):
        if only_files and not p.is_file():
            continue
        yield p

for p in generate_all_files(Path("."), only_files=False):
    print(p)
If you want something copy-pasteable:
Example
Folder structure:
$ tree . -a
.
├── a.txt
├── bar
├── b.py
├── collect.py
├── empty
├── foo
│   └── bar.bz.gz2
├── .hidden
│   └── secrect-file
└── martin
    └── thoma
        └── cv.pdf
gives:
$ python collect.py
bar
empty
.hidden
collect.py
a.txt
b.py
martin
foo
.hidden/secrect-file
martin/thoma
martin/thoma/cv.pdf
foo/bar.bz.gz2
And this is how you list it in case you want to list the files on SharePoint; your path will probably start after the "\teams\" part:
import os

root = r"\\mycompany.sharepoint.com#SSL\DavWWWRoot\teams\MyFolder\Policies and Procedures\Deal Docs\My Deals"
file_list = [os.path.join(path, name) for path, subdirs, files in os.walk(root) for name in files]
print(file_list)
Just an addition: with this you can get the data into CSV format.
import os
from fnmatch import fnmatch

try:
    import pandas as pd
except ImportError:
    os.system("pip3 install pandas")
    import pandas as pd

root = "/home/kiran/Downloads/MainFolder"  # it may have many subfolders and files inside
lst = []

pattern = "*.csv"  # I want to get only csv files
pattern = "*.*"    # Note: use this pattern to get all types of files and folders

for path, subdirs, files in os.walk(root):
    for name in files:
        if fnmatch(name, pattern):
            lst.append(os.path.join(path, name))

df = pd.DataFrame({"filePaths": lst})
df.to_csv("filepaths.csv")
A pretty simple solution would be to run a couple of subprocess calls to export the file list into CSV format:
import subprocess

# Global variables for the directory being mapped
location = '.'    # Enter the path here.
pattern = '*.py'  # Use this if you want to only return certain filetypes
rootDir = location.rpartition('/')[-1]
outputFile = rootDir + '_directory_contents.csv'

# Find the requested data and export to CSV, specifying a pattern if needed.
# The pattern is quoted so the shell passes the wildcard to find instead of expanding it.
find_cmd = 'find ' + location + ' -name "' + pattern + '" -fprintf ' + outputFile + ' "%Y%M,%n,%u,%g,%s,%A+,%P\n"'
subprocess.call(find_cmd, shell=True)
That command produces comma separated values that can be easily analyzed in Excel.
f-rwxrwxrwx,1,cathy,cathy,2642,2021-06-01+00:22:00.2970880000,content-audit.py
The resulting CSV file doesn't have a header row, but you can use a second command to add them.
# Add headers to the CSV
headers_cmd = 'sed -i.bak 1i"Permissions,Links,Owner,Group,Size,ModifiedTime,FilePath" ' + outputFile
subprocess.call(headers_cmd, shell=True)
Depending on how much data you get back, you can massage it further using Pandas. Here are some things I found useful, especially if you're dealing with many levels of directories to look through.
Add these to your imports:
import numpy as np
import pandas as pd
Then add this to your code:
# Create DataFrame from the csv file created above.
df = pd.read_csv(outputFile)

# Format columns
# Get the filename and file extension from the filepath
df['FileName'] = df['FilePath'].str.rsplit("/", n=1).str[-1]
df['FileExt'] = df['FileName'].str.rsplit('.', n=1).str[1]

# Get the full path to the files. If the path doesn't include a "/" it's the root directory
df['FullPath'] = df["FilePath"].str.rsplit("/", n=1).str[0]
df['FullPath'] = np.where(df['FullPath'].str.contains("/"), df['FullPath'], rootDir)

# Split the path into columns for the parent directory and its children
df['ParentDir'] = df['FullPath'].str.split("/", n=1).str[0]
df['SubDirs'] = df['FullPath'].str.split("/", n=1).str[1]
# Account for NaN returns, which indicate the path is the root directory
df['SubDirs'] = df['SubDirs'].fillna('')

# Determine if the item is a directory or file.
df['Type'] = np.where(df['Permissions'].str.startswith('d'), 'Dir', 'File')

# Split the time stamp into date and time columns
df[['ModifiedDate', 'Time']] = df.ModifiedTime.str.rsplit('+', n=1, expand=True)
df['Time'] = df['Time'].str.split('.').str[0]

# Show only files; output includes paths so you don't necessarily need to display the individual directories.
df = df[df['Type'].str.contains('File')]

# Set columns to show and their order.
df = df[['FileName', 'ParentDir', 'SubDirs', 'FullPath', 'FileExt', 'ModifiedDate', 'Time', 'Size']]
filesize = []  # Create an empty list to store file sizes to convert them to something more readable.
# Go through the items and convert the file size from bytes to something more readable.
# (convert_bytes() is defined below; when running this as a script, define it before this point.)
for items in df['Size'].items():
    filesize.append(convert_bytes(items[1]))
df['Size'] = filesize

# Send the data to an Excel workbook with sheets by parent directory
with pd.ExcelWriter("scripts_directory_contents.xlsx") as writer:
    for directory, data in df.groupby('ParentDir'):
        data.to_excel(writer, sheet_name=directory, index=False)
# To convert sizes to be more human readable
def convert_bytes(size):
    for x in ['b', 'K', 'M', 'G', 'T']:
        if size < 1024:
            return "%3.1f %s" % (size, x)
        size /= 1024
    return size
return size