Python list directory, subdirectory, and files

I'm trying to make a script to list all directories, subdirectories, and files in a given directory.
I tried this:
import sys, os

root = "/home/patate/directory/"
path = os.path.join(root, "targetdirectory")

for r, d, f in os.walk(path):
    for file in f:
        print(os.path.join(root, file))
Unfortunately, it doesn't work properly.
I get all the files, but not their complete paths.
For example, if the directory structure were:
/home/patate/directory/targetdirectory/123/456/789/file.txt
It would print:
/home/patate/directory/targetdirectory/file.txt
What I need is the first result. Any help would be greatly appreciated! Thanks.

Use os.path.join to concatenate the directory and file name:
for path, subdirs, files in os.walk(root):
    for name in files:
        print(os.path.join(path, name))
Note the use of path rather than root in the concatenation: root stays fixed at the top of the tree, so joining it with the file name would produce the wrong path for anything in a subdirectory.
In Python 3.4, the pathlib module was added for easier path manipulations. So the equivalent to os.path.join would be:
pathlib.PurePath(path, name)
The advantage of pathlib is that you can use a variety of useful methods on paths. If you use the concrete Path variant you can also do actual OS calls through them, like changing into a directory, deleting the path, opening the file it points to and much more.
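For instance, a small sketch of what a concrete Path gives you (the directory layout is just the question's example):
from pathlib import Path

p = Path("/home/patate/directory/targetdirectory")
print(p.name)      # "targetdirectory"
print(p.parent)    # "/home/patate/directory"
if p.is_dir():     # concrete paths can make actual OS calls
    for child in p.iterdir():
        print(child)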

Just in case... Getting all files in the directory and subdirectories matching some pattern (*.py for example):
import os
from fnmatch import fnmatch

root = '/some/directory'
pattern = "*.py"

for path, subdirs, files in os.walk(root):
    for name in files:
        if fnmatch(name, pattern):
            print(os.path.join(path, name))

Couldn't comment, so writing an answer here. This is the clearest one-liner I have seen:
import os
[os.path.join(path, name) for path, subdirs, files in os.walk(root) for name in files]

Here is a one-liner:
import os
[val for sublist in [[os.path.join(i[0], j) for j in i[2]] for i in os.walk('./')] for val in sublist]
The outermost val for sublist in ... loop flattens the list to one dimension. The j loop collects each file basename and joins it to the current path. Finally, the i loop iterates over all directories and subdirectories.
This example uses the hard-coded path ./ in the os.walk(...) call; you can substitute any path string you like.
Note: os.path.expanduser and/or os.path.expandvars can be used for path strings like ~/
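For example (the expanded results naturally depend on your environment):
import os

print(os.path.expanduser("~/logs"))        # e.g. /home/patate/logs
print(os.path.expandvars("$HOME/logs"))    # the same path, resolved from the environment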
Extending this example:
It's easy to add file basename tests and directory name tests.
For example, testing for *.jpg files:
... for j in i[2] if j.endswith('.jpg')] ...
Additionally, excluding the .git directory:
... for i in os.walk('./') if '.git' not in i[0].split('/')]

Another option would be using the glob module from the standard lib:
import glob

path = "/home/patate/directory/targetdirectory/**"

for file_path in glob.glob(path, recursive=True):
    print(file_path)
If you need an iterator, you can use iglob as an alternative:
for file_path in glob.iglob(path, recursive=True):
    # ...

A bit simpler one-liner:
import os
from itertools import product, chain

root = "./"  # any starting directory
chain.from_iterable([[os.sep.join(w) for w in product([i[0]], i[2])] for i in os.walk(root)])

You can take a look at this sample I made. It uses os.path.walk, which is deprecated (and removed in Python 3), so beware. It uses a list to store all the file paths:
import os, fnmatch

root = "Your root directory"
ex = ".txt"
where_to = "Wherever you wanna write your file to"

def fileWalker(ext, dirname, names):
    '''
    checks files in names'''
    pat = "*" + ext[0]
    for f in names:
        if fnmatch.fnmatch(f, pat):
            ext[1].append(os.path.join(dirname, f))

def writeTo(fList):
    with open(where_to, "w") as f:
        for di_r in fList:
            f.write(di_r + "\n")

if __name__ == '__main__':
    li = []
    os.path.walk(root, fileWalker, [ex, li])
    writeTo(li)

Since every example here just uses walk (with join), I'd like to show a nice example and comparison with listdir:
import os, time

def listFiles1(root):  # listdir
    allFiles = []; walk = [root]
    while walk:
        folder = walk.pop(0) + "/"; items = os.listdir(folder)  # items = folders + files
        for i in items:
            i = folder + i
            (walk if os.path.isdir(i) else allFiles).append(i)
    return allFiles

def listFiles2(root):  # listdir/join (takes ~1.4x as long) (and uses '\\' instead)
    allFiles = []; walk = [root]
    while walk:
        folder = walk.pop(0); items = os.listdir(folder)  # items = folders + files
        for i in items:
            i = os.path.join(folder, i)
            (walk if os.path.isdir(i) else allFiles).append(i)
    return allFiles

def listFiles3(root):  # walk (takes ~1.5x as long)
    allFiles = []
    for folder, folders, files in os.walk(root):
        for file in files:
            allFiles += [folder.replace("\\", "/") + "/" + file]  # folder+"\\"+file still ~1.5x
    return allFiles

def listFiles4(root):  # walk/join (takes ~1.6x as long) (and uses '\\' instead)
    allFiles = []
    for folder, folders, files in os.walk(root):
        for file in files:
            allFiles += [os.path.join(folder, file)]
    return allFiles

for i in range(100): files = listFiles1("src")  # warm up

start = time.time()
for i in range(100): files = listFiles1("src")  # listdir
print("Time taken: %.2fs" % (time.time() - start))  # 0.28s

start = time.time()
for i in range(100): files = listFiles2("src")  # listdir and join
print("Time taken: %.2fs" % (time.time() - start))  # 0.38s

start = time.time()
for i in range(100): files = listFiles3("src")  # walk
print("Time taken: %.2fs" % (time.time() - start))  # 0.42s

start = time.time()
for i in range(100): files = listFiles4("src")  # walk and join
print("Time taken: %.2fs" % (time.time() - start))  # 0.47s
So as you can see for yourself, the listdir version is the most efficient here (and os.path.join is what slows things down).
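For completeness, here is a sketch of the same walker built on os.scandir (Python 3.5+), which reuses the type information the OS already returned; it is not part of the original benchmark, but it typically competes well with the listdir variants:
import os

def listFiles5(root):  # scandir (a sketch; not measured above)
    allFiles = []; walk = [root]
    while walk:
        folder = walk.pop(0)
        for entry in os.scandir(folder):  # DirEntry caches is_dir() from the directory scan
            (walk if entry.is_dir() else allFiles).append(entry.path)
    return allFiles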

Using any supported Python version (3.4+), you should use pathlib's Path.rglob to recursively list the contents of the current directory and all subdirectories:
from pathlib import Path

def generate_all_files(root: Path, only_files: bool = True):
    for p in root.rglob("*"):
        if only_files and not p.is_file():
            continue
        yield p

for p in generate_all_files(Path("."), only_files=False):
    print(p)
If you want something copy-pasteable:
Example
Folder structure:
$ tree . -a
.
├── a.txt
├── bar
├── b.py
├── collect.py
├── empty
├── foo
│   └── bar.bz.gz2
├── .hidden
│   └── secrect-file
└── martin
    └── thoma
        └── cv.pdf
gives:
$ python collect.py
bar
empty
.hidden
collect.py
a.txt
b.py
martin
foo
.hidden/secrect-file
martin/thoma
martin/thoma/cv.pdf
foo/bar.bz.gz2

And this is how you list the files in case you want to do it on SharePoint. Your path will probably start after the "\teams\" part:
import os

root = r"\\mycompany.sharepoint.com#SSL\DavWWWRoot\teams\MyFolder\Policies and Procedures\Deal Docs\My Deals"
paths = [os.path.join(path, name) for path, subdirs, files in os.walk(root) for name in files]
print(paths)

Just as an addition: with this you can get the data into CSV format.
import sys, os
try:
    import pandas as pd
except ImportError:
    os.system("pip3 install pandas")
    import pandas as pd

root = "/home/kiran/Downloads/MainFolder"  # it may have many subfolders and files inside
lst = []
from fnmatch import fnmatch

pattern = "*.csv"  # I want to get only csv files
pattern = "*.*"    # Note: use this pattern instead to get all types of files and folders

for path, subdirs, files in os.walk(root):
    for name in files:
        if fnmatch(name, pattern):
            lst.append(os.path.join(path, name))

df = pd.DataFrame({"filePaths": lst})
df.to_csv("filepaths.csv")

A pretty simple solution is to run a couple of subprocess calls to export the files into CSV format:
import subprocess

# Global variables for the directory being mapped
location = '.'    # Enter the path here.
pattern = '*.py'  # Use this if you want to only return certain filetypes
rootDir = location.rpartition('/')[-1]
outputFile = rootDir + '_directory_contents.csv'

# Find the requested data and export to CSV, specifying a pattern if needed.
# The pattern is quoted so the shell doesn't expand it before find sees it,
# and %T+ (modification time) matches the ModifiedTime header added below.
find_cmd = 'find ' + location + ' -name "' + pattern + '" -fprintf ' + outputFile + ' "%Y%M,%n,%u,%g,%s,%T+,%P\n"'
subprocess.call(find_cmd, shell=True)
That command produces comma-separated values that can be easily analyzed in Excel.
f-rwxrwxrwx,1,cathy,cathy,2642,2021-06-01+00:22:00.2970880000,content-audit.py
The resulting CSV file doesn't have a header row, but you can use a second command to add them.
# Add headers to the CSV
headers_cmd = 'sed -i.bak 1i"Permissions,Links,Owner,Group,Size,ModifiedTime,FilePath" ' + outputFile
subprocess.call(headers_cmd, shell=True)
Depending on how much data you get back, you can massage it further using Pandas. Here are some things I found useful, especially if you're dealing with many levels of directories to look through.
Add these to your imports:
import numpy as np
import pandas as pd
Then add this to your code:
# Create a DataFrame from the CSV file created above.
df = pd.read_csv(outputFile)

# Format columns
# Get the filename and file extension from the filepath
df['FileName'] = df['FilePath'].str.rsplit("/", n=1).str[-1]
df['FileExt'] = df['FileName'].str.rsplit('.', n=1).str[1]

# Get the full path to the files. If the path doesn't include a "/" it's the root directory
df['FullPath'] = df["FilePath"].str.rsplit("/", n=1).str[0]
df['FullPath'] = np.where(df['FullPath'].str.contains("/"), df['FullPath'], rootDir)

# Split the path into columns for the parent directory and its children
df['ParentDir'] = df['FullPath'].str.split("/", n=1).str[0]
df['SubDirs'] = df['FullPath'].str.split("/", n=1).str[1]
# Account for NaN returns, which indicate the path is the root directory
df['SubDirs'] = np.where(df['SubDirs'].isna(), '', df['SubDirs'])

# Determine if the item is a directory or file.
df['Type'] = np.where(df['Permissions'].str.startswith('d'), 'Dir', 'File')

# Split the timestamp into date and time columns
df[['ModifiedDate', 'Time']] = df.ModifiedTime.str.rsplit('+', n=1, expand=True)
df['Time'] = df['Time'].str.split('.').str[0]

# Show only files; the output includes paths, so you don't necessarily need to display the individual directories.
df = df[df['Type'].str.contains('File')]

# Set the columns to show and their order.
df = df[['FileName', 'ParentDir', 'SubDirs', 'FullPath', 'FileExt', 'ModifiedDate', 'Time', 'Size']]
filesize = []  # Create an empty list to store file sizes to convert them to something more readable.
# Go through the items and convert the filesize from bytes to something more readable.
for items in df['Size'].items():
    filesize.append(convert_bytes(items[1]))
df['Size'] = filesize

# Send the data to an Excel workbook with sheets by parent directory
with pd.ExcelWriter("scripts_directory_contents.xlsx") as writer:
    for directory, data in df.groupby('ParentDir'):
        data.to_excel(writer, sheet_name=directory, index=False)
# To convert sizes to be more human readable (define this helper before the loop above runs)
def convert_bytes(size):
    for x in ['b', 'K', 'M', 'G', 'T']:
        if size < 1024:
            return "%3.1f %s" % (size, x)
        size /= 1024
    return size
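As a side note, the filesize loop above could be written more compactly with Series.apply, which gives the same result as long as convert_bytes is already defined:
df['Size'] = df['Size'].apply(convert_bytes)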

Related

How do I read files from two folders in the same order in Python?

I have two folders with the same file names, but when I try to read all the text files from the folders in Python, they are read in a different order. I need to read the files from the two folders in the same order because they correspond. I used the following code to read all the text files in a folder:
dir_psnr=current_path+'\\'+dir_psnr+'\\'
os.chdir(dir_psnr) #change directory to downloads folder
files_path =[os.path.abspath(x) for x in os.listdir()]
fnames_psnr_tmp = [x for x in files_path if x.endswith(".txt")]
The addresses of the folders are as follows:
F:\RD_data_from_twitch_system\RD_data_from_twitch_system\psnr
F:\RD_data_from_twitch_system\RD_data_from_twitch_system\bitrate
The names of the text files in both folders are as follows:
asmr_1.txt
asmr_2.txt
Counter_strike_1.txt
Counter_strike_2.txt
dota2_1.txt
What is the problem, and how can I read the files in the same order?
The full code is:
def reading_file_to_array(dir_psnr, current_path):
    dir_psnr = current_path + '\\' + dir_psnr + '\\'
    os.chdir(dir_psnr)  # change directory to downloads folder
    files_path = [os.path.abspath(x) for x in os.listdir()]
    fnames_psnr_tmp = [x for x in files_path if x.endswith(".txt")]
    .
    .
    .
    return()
current_path = 'F:/RD_data_from_twitch_system/RD_data_from_twitch_system'
current_dir = 'F:/RD_data_from_twitch_system/RD_data_from_twitch_system'
all_sub_dir_paths = glob(str(current_dir) + '/*/')
all_sub_dir_names = [Path(sub_dir).name for sub_dir in all_sub_dir_paths]
for i in range(len(all_sub_dir_names)):
    if all_sub_dir_names[i] == 'bitrate':
        bitrate_1080p, bitrate_720p, bitrate_480p, bitrate_360p, bitrate_160p = reading_file_to_array(all_sub_dir_names[i], current_path)
    else:
        psnr_1080p, psnr_720p, psnr_480p, psnr_360p, psnr_160p = reading_file_to_array(all_sub_dir_names[i], current_path)
Since the file names are the same, you could list the files in one directory and then join the base names to both paths for processing. This can be done in a generator that you use in a loop. For example:
import os

folder1 = r"F:\RD_data_from_twitch_system\RD_data_from_twitch_system\psnr"
folder2 = r"F:\RD_data_from_twitch_system\RD_data_from_twitch_system\bitrate"

def list_directories(primary, secondary):
    primary = os.path.abspath(primary)
    secondary = os.path.abspath(secondary)
    for fn in os.listdir(primary):
        if fn.endswith(".txt"):
            yield (os.path.join(primary, fn),
                   os.path.join(secondary, fn))

# print files for test
for f1, f2 in list_directories(folder1, folder2):
    print(f1, f2)
It's usually a bad idea to os.chdir(), especially without remembering which directory you came from. As long as your code builds absolute path names, the current working directory doesn't matter.
The easiest way would be to use listdir and to append the path to the front of every element of the list.
import os

# hardcoded folders
def reading_file_to_array(dir_1, dir_2):
    list_1 = [f"{dir_1}/" + f for f in os.listdir(dir_1)]
    list_2 = [f"{dir_2}/" + f for f in os.listdir(dir_2)]
    # Add more lists
    # Do sorting stuff here if needed
    return zip(list_1, list_2)

for f1, f2 in reading_file_to_array("./f_1", "./f_2"):
    print(f1, f2)

# more dynamic approach
def reading_file_to_array_dyn(dirs):
    results = list()
    for directory in dirs:
        results.append([f"{directory}/" + f for f in os.listdir(directory)])
        # Do sorting stuff here if needed
    return zip(*results)

for f1, f2 in reading_file_to_array_dyn(["./f_1", "./f_2"]):
    print(f1, f2)
The result of this test code looks like this for me:
./f_1/a.txt ./f_2/a.txt
./f_1/b.txt ./f_2/b.txt
./f_1/c.txt ./f_2/c.txt
If you want to filter the files in the folder based on type, I recommend the package glob.
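For instance, a minimal sketch using the ./f_1 test folder from above:
from glob import glob

txt_files = sorted(glob("./f_1/*.txt"))  # only .txt files, sorted for a stable order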

Python count files in a directory and all its subdirectories

I am trying to count all the files in a folder and all its subfolders.
For example, if my folder looks like this:
file1.txt
subfolder1/
├── file2.txt
├── subfolder2/
│   ├── file3.txt
│   ├── file4.txt
│   └── subfolder3/
│       └── file5.txt
└── file6.txt
file7.txt
I would like to get the number 7.
The first thing I tried is a recursive function that counts all files and calls itself for each folder:
def get_file_count(directory: str) -> int:
    count = 0
    for filename in os.listdir(directory):
        file = os.path.join(directory, filename)
        if os.path.isfile(file):
            count += 1
        elif os.path.isdir(file):
            count += get_file_count(file)
    return count
This way works but takes a lot of time for big directories.
I also remembered this post, which shows a quick way to compute the total size of a folder using win32com, and I wondered if this library also offered a way to do what I was looking for.
But after searching, I only found this
fso = com.Dispatch("Scripting.FileSystemObject")
folder = fso.GetFolder(".")
size = folder.Files.Count
But this only returns the number of files in the targeted folder (and not in its subfolders).
So, do you know if there is an optimal function in python that returns the number of files in a folder and all its subfolders?
IIUC, you can just do
sum(len(files) for _, _, files in os.walk('path/to/folder'))
or perhaps, to avoid the len for probably slightly better performance:
sum(1 for _, _, files in os.walk('folder_test') for f in files)
This code will reveal a count of all directory entries that are not directories (e.g., plain files, symlinks) from a specified root.
Includes timing and an actual pathname used in the test:
from glob import glob, escape
import os
import time

def get_file_count(directory: str) -> int:
    count = 0
    for filename in glob(os.path.join(escape(directory), '*')):
        if os.path.isdir(filename):
            count += get_file_count(filename)
        else:
            count += 1
    return count

start = time.perf_counter()
count = get_file_count('/Volumes/G-DRIVE Thunderbolt 3')
end = time.perf_counter()

print(count)
print(f'{end-start:.2f}s')
Output:
166231
2.38s
I used os.walk(). Here is my sample; I hope it helps you:
import os

def file_dir():
    directories = []
    res = {}
    cwd = os.getcwd()
    for root, dirs, files in os.walk(cwd):
        for file in files:
            if file.endswith(".tsv"):
                directories.append(os.path.join(root, file))
    res['dir'] = directories
    return res
You could also directly use the command:
find DIR_NAME -type f | wc -l
This returns the count of all files.
From Python, the command can be run with os.system().
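Note that os.system() only returns the command's exit status. To capture the actual count, subprocess can be used instead; a sketch, assuming a POSIX shell with find and wc available (DIR_NAME is a placeholder):
import subprocess

output = subprocess.check_output("find DIR_NAME -type f | wc -l", shell=True)
print(int(output))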
Another solution, using the pathlib and os.path libraries:
from pathlib import Path
from os.path import isfile
len([x for x in Path('./dir1').rglob('*') if isfile(x)])
The proper way is to use os.walk as others have pointed out, but to give another solution which resembles your original as much as possible:
You can use os.scandir to avoid the cost of constructing the entire list, it should be substantially faster:
import os

def get_file_count(directory: str) -> int:
    count = 0
    for entry in os.scandir(directory):
        if entry.is_file():
            count += 1
        elif entry.is_dir():
            count += get_file_count(os.path.join(directory, entry.name))
    return count

Python - Truncate unknown file names

Let's say I have the following files in a directory:
snackbox_1a.dat
zebrabar_3z.dat
cornrows_00.dat
meatpack_z2.dat
I have SEVERAL of these directories, in which all of the files are of the same format, ie:
snackbox_xx.dat
zebrabar_xx.dat
cornrows_xx.dat
meatpack_xx.dat
So what I KNOW about these files is the first bit (snackbox, zebrabar, cornrows, meatpack). What I don't know is the bit right before the file extension (the 'xx'). This changes both within the directory across the files, and across the directories (so another directory might have different xx values, like 12, yy, 2m, 0t, whatever).
Is there a way for me to rename all of these files, or truncate them all (since the xx.dat will always be the same length), for ease of use when attempting to call them? For instance, I'd like to rename them so that I can, in another script, use a simple index to step through and find the file I want (instead of having to go into each directory and pull the file out manually).
In other words, I'd like to change the file names to:
snackbox.dat
zebrabar.dat
cornrows.dat
meatpack.dat
Thanks!
You can use shutil.move to move files. To calculate the new filename, you can use Python's string split method:
import shutil

original_name = "snackbox_12.dat"
truncated_name = original_name.split("_")[0] + ".dat"
shutil.move(original_name, truncated_name)
Try re.sub:
import re
filename = 'snackbox_xx.dat'
filename_new = re.sub(r'_[A-Za-z0-9]{2}', '', filename)
You should get 'snackbox.dat' for filename_new
This assumes the two characters after the "_" are either a number or lowercase/uppercase letter, but you could choose to expand the classes included in the regular expression.
EDIT: including moving and recursive search:
import shutil, re, os, fnmatch

directory = 'your_path'
for root, dirnames, filenames in os.walk(directory):
    for filename in fnmatch.filter(filenames, '*.dat'):
        filename_new = re.sub(r'_[A-Za-z0-9]{2}', '', filename)
        shutil.move(os.path.join(root, filename), os.path.join(root, filename_new))
This solution renames all files in the current directory that match the pattern in the function call.
What the function does
snackbox_5R.txt >>> snackbox.txt
snackbox_6y.txt >>> snackbox_0.txt
snackbox_a2.txt >>> snackbox_1.txt
snackbox_Tm.txt >>> snackbox_2.txt
Let's look at the function's inputs and some examples.
list_of_files_names: a list of strings, where each string is the filename without the _?? part.
Examples:
['snackbox.txt', 'zebrabar.txt', 'cornrows.txt', 'meatpack.txt', 'calc.txt']
['text.dat']
upper_bound=1000: an integer. When the ideal filename is already taken, e.g. snackbox.dat already exists, it will create snackbox_0.dat, all the way up to snackbox_999.dat if need be. You shouldn't have to change the default.
The Code
import re
import os
import os.path

def find_and_rename(dir, list_of_files_names, upper_bound=1000):
    """
    :param list_of_files_names: List. A list of strings: filename (without the _??) + extension, EX: snackbox.txt
    Renames snackbox_R5.dat to snackbox.dat, etc.
    """
    # split each item in list_of_files_names into two parts, filename and extension: "snackbox.dat" -> "snackbox", "dat"
    list_of_files_names = [(prefix.split('.')[0], prefix.split('.')[1]) for prefix in list_of_files_names]
    # store the content of the dir in a list
    list_of_files_in_dir = os.listdir(dir)
    for file_in_dir in list_of_files_in_dir:  # list all files and folders in current dir
        file_in_dir_full_path = os.path.join(dir, file_in_dir)  # we need the full path to use .isfile()
        print()  # DEBUG
        print('Is "{}" a file?: '.format(file_in_dir), end='')  # DEBUG
        print(os.path.isfile(file_in_dir_full_path))  # DEBUG
        if os.path.isfile(file_in_dir_full_path):  # filters out the folders, only files are needed
            # file_name is a tuple containing the prefix filename and the extension
            for file_name in list_of_files_names:  # check if the file matches one of our renaming prefixes
                # match both the file name (e.g. "snackbox") and the extension (e.g. "dat");
                # it finds "snackbox_5R.dat" by matching "snackbox" at the front and "dat" at the rear
                if re.match(r'{}_\w+\.{}'.format(file_name[0], file_name[1]), file_in_dir):
                    print('\nOriginal File: ' + file_in_dir)  # printing this is not necessary
                    print('.'.join(file_name))
                    ideal_new_file_name = '.'.join(file_name)  # name might already be taken
                    if os.path.isfile(os.path.join(dir, ideal_new_file_name)):  # file already exists
                        # go up a name, e.g. "snackbox.dat" --> "snackbox_0.dat" --> "snackbox_1.dat"
                        for index in range(upper_bound):
                            # check if this new name already exists as well
                            next_best_name = file_name[0] + '_' + str(index) + '.' + file_name[1]
                            if not os.path.isfile(os.path.join(dir, next_best_name)):  # name is free
                                print('Renaming with next best name')
                                os.rename(file_in_dir_full_path, os.path.join(dir, next_best_name))
                                break
                            # this name exists as well, keep increasing the index
                    else:
                        # file with the ideal name does not already exist, rename with the ideal name (no _##)
                        print('Renaming with ideal name')
                        os.rename(file_in_dir_full_path, os.path.join(dir, ideal_new_file_name))

def find_and_rename_include_sub_dirs(master_dir, list_of_files_names, upper_bound=1000):
    for path, subdirs, files in os.walk(master_dir):
        print(path)  # DEBUG
        find_and_rename(path, list_of_files_names, upper_bound)

find_and_rename_include_sub_dirs('C:/Users/Oxen/Documents/test_folder', ['snackbox.txt', 'zebrabar.txt', 'cornrows.txt', 'meatpack.txt', 'calc.txt'])

get all folders (os.walk) that are older than x days, delete

How can I concisely express "get all folders older than x days"
I have a method getOldDirs(dirPath, olderThanDays); it must walk through a given root folder and return a list of folders that are older than, say, 7 days.
I call the above function from another function, cleanOldFolders(). cleanOldFolders() will delete those folders, similar to rm -Rf.
Here is the code I have; how can I modify the loops concisely?
"""
Clean oldFolders
"""
def cleanOldFolders(self):
    pathString = self.folderRoot + '/' + self.configMode + '/' + self.appId
    oldDirList = self.getOldDirs(pathString, 7)
    # Notify user that the following folders are deleted
    # remove all old dirs, perhaps using shutil.rmtree for each folder in oldDirList (rm -Rf)
    return
Get old dirs:
"""
get all subfolders under dirPath older than olderThanDays
"""
def getOldDirs(self, dirPath, olderThanDays):
    # What is the concise way of expressing: get me a list of all dirs/subdirs from "dirPath" that are older than "olderThanDays"?
    # I know I have to use os.walk.
    # I want a concise loop like this - but it should recurse using os.walk:
    a = [os.path.join(dirPath, myfile) for myfile in os.listdir(dirPath)
         if (os.path.isdir(os.path.join(dirPath, myfile)) and
             (self.isOlder(os.path.join(dirPath, myfile), olderThanDays))
             )]
    # for root, dirs, files in os.walk(dirPath):
    #     for name in dirs:
    #         print os.path.join(root, name)
    return a
One of the nice things about os.walk() is that it does the recursing for you. For this application, it's important to specify the optional keyword argument topdown as False, because its default is True and os.rmdir() won't delete non-empty directories.
This means your code will need to delete all the files and subdirectories in each subdirectory it encounters before removing the subdirectory itself. To facilitate that, the directory list getOldDirs() returns should be in the order the subdirectories need to be deleted in.
It's also important to note that in the following, the directory's age is calculated in fractional, not whole, days, which means that seconds count: a directory that is, say, 6 days, 23 hours, 59 minutes, and 58 seconds old won't get put on the list to be deleted, even though it is only two seconds away from being old enough.
import os
import time

def getOldDirs(self, dirPath, olderThanDays):
    """
    yield all subfolders under dirPath older than olderThanDays
    """
    olderThanDays *= 86400  # convert days to seconds
    present = time.time()
    for root, dirs, files in os.walk(dirPath, topdown=False):
        for name in dirs:
            subDirPath = os.path.join(root, name)
            if (present - os.path.getmtime(subDirPath)) > olderThanDays:
                yield subDirPath
This should be a starting point.
import os
from time import time as _time

SEVEN_DAYS = 60 * 60 * 24 * 7

def get_old_dirs(dir_path, older_than=SEVEN_DAYS):
    time_now = _time()
    for path, folders, files in os.walk(dir_path):
        for folder in folders:
            folder_path = os.path.join(path, folder)
            if (time_now - os.path.getmtime(folder_path)) > older_than:
                yield folder_path

list_of_folders = list(get_old_dirs("/some/path"))
Also, if you don't want to walk into folders that are older than older_than days (because you're going to delete them), you can prune the search tree by removing those folder names from the folders list:
def get_old_dirs(dir_path, older_than=SEVEN_DAYS):
    time_now = _time()
    for path, folders, files in os.walk(dir_path):
        for folder in folders[:]:
            folder_path = os.path.join(path, folder)
            if (time_now - os.path.getmtime(folder_path)) > older_than:
                yield folder_path
                folders.remove(folder)
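From there, cleanOldFolders() reduces to deleting whatever the generator yields; a minimal sketch, using shutil.rmtree as the rm -Rf equivalent (the pruned variant above keeps os.walk from descending into a folder once it has been yielded, so removing it right away is safe):
import shutil

for folder in get_old_dirs("/some/path"):
    shutil.rmtree(folder)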
This uses os.walk and gets you the list of directories older than 7 days:
import os
from datetime import date

old_dirs = []
today = date.today()
start_path = '.'  # the directory to scan

for root, dirs, files in os.walk(start_path):
    for name in dirs:
        filedate = date.fromtimestamp(os.path.getmtime(os.path.join(root, name)))
        if (today - filedate).days > 7:
            old_dirs.append(name)

How to traverse through the files in a directory?

I have a directory logfiles. I want to process each file inside this directory using a Python script.
for file in directory:
    # do something
How do I do this?
With os.listdir() or os.walk(), depending on whether you want to do it recursively.
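A minimal sketch of both, using the question's logfiles directory:
import os

# non-recursive: the entries of one directory only
for name in os.listdir("logfiles"):
    print(name)

# recursive: visits every subdirectory as well
for dirpath, dirnames, filenames in os.walk("logfiles"):
    for name in filenames:
        print(os.path.join(dirpath, name))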
In Python 2, you can try something like:
import os.path

def print_it(x, dir_name, files):
    print dir_name
    print files

os.path.walk(your_dir, print_it, 0)
Note: the 3rd argument of os.path.walk is whatever you want. You'll get it as the 1st arg of the callback.
In Python 3, os.path.walk has been removed; use os.walk instead. Instead of taking a callback, you just pass it a directory, and it yields (dirpath, dirnames, filenames) triples. So a rough equivalent of the above becomes:
import os

for dirpath, dirnames, filenames in os.walk(your_dir):
    print(dirpath)
    print(dirnames)
    print(filenames)
You can list every file from a directory recursively like this.
from os import listdir
from os.path import isfile, join, isdir

def getAllFilesRecursive(root):
    files = [join(root, f) for f in listdir(root) if isfile(join(root, f))]
    dirs = [d for d in listdir(root) if isdir(join(root, d))]
    for d in dirs:
        files_in_d = getAllFilesRecursive(join(root, d))
        if files_in_d:
            for f in files_in_d:
                files.append(f)  # f is already joined with its subdirectory path
    return files
import os

# location of directory you want to scan
loc = '/home/sahil/Documents'

# global dictionary element used to store all results
global k1
k1 = {}

# scan() recursively scans through all the directories in loc and returns a dictionary
def scan(element, loc):
    le = len(element)
    for i in range(le):
        try:
            second_list = os.listdir(loc + '/' + element[i])
            temp = loc + '/' + element[i]
            print(".....")
            print("Directory %s " % (temp))
            print(" ")
            print(second_list)
            k1[temp] = second_list
            scan(second_list, temp)
        except OSError:
            pass
    return k1  # return the dictionary element

# initial steps
try:
    initial_list = os.listdir(loc)
    print(initial_list)
except OSError:
    print("error")

k = scan(initial_list, loc)
print(" ...................................................................................")
print(k)
I made this code as a directory scanner for a playlist feature in my audio player; it recursively scans all the subdirectories present in the directory.
You could try glob:
import glob

for file in glob.glob('log-*-*.txt'):
    pass  # Etc.
Glob originally didn't work recursively, but since Python 3.5 it does (via the recursive=True flag and ** patterns, as in the glob answer above). On older versions, if your logs are in folders inside of that directory, you'd be better off looking at what Ignacio Vazquez-Abrams posted.
If you need to check for multiple file types, use
glob.glob("*.jpg") + glob.glob("*.png")
Glob doesn't care about the ordering of the files in the list. If you need files sorted by filename, use
sorted(glob.glob("*.jpg"))
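And here is the recursive form on Python 3.5+, shown with the question's logfiles directory:
for file in glob.glob('logfiles/**/log-*-*.txt', recursive=True):
    print(file)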
import os

rootDir = '.'
for dirName, subdirList, fileList in os.walk(rootDir):
    print('Found directory: %s' % dirName)
    for fname in fileList:
        print('\t%s' % fname)
    # Remove the first entry in the list of sub-directories
    # if there are any sub-directories present
    if len(subdirList) > 0:
        del subdirList[0]
Here's my version of the recursive file walker, based on the answer of Matheus Araujo, that can take optional exclusion list arguments, which happens to be very helpful when dealing with tree copies where some directories / files / file extensions aren't wanted.
import os

def get_files_recursive(root, d_exclude_list=[], f_exclude_list=[], ext_exclude_list=[], primary_root=None):
    """
    Walk a path to recursively find files
    Modified version of https://stackoverflow.com/a/24771959/2635443 that includes exclusion lists
    :param root: path to explore
    :param d_exclude_list: list of root relative directories paths to exclude
    :param f_exclude_list: list of filenames without paths to exclude
    :param ext_exclude_list: list of file extensions to exclude, ex: ['.log', '.bak']
    :param primary_root: only used for internal recursive exclusion lookup, don't pass an argument here
    :return: list of files found in path
    """
    # Make sure we use a valid os separator for exclusion lists; this is done recursively :(
    d_exclude_list = [os.path.normpath(d) for d in d_exclude_list]

    files = [os.path.join(root, f) for f in os.listdir(root) if os.path.isfile(os.path.join(root, f))
             and f not in f_exclude_list and os.path.splitext(f)[1] not in ext_exclude_list]
    dirs = [d for d in os.listdir(root) if os.path.isdir(os.path.join(root, d))]
    for d in dirs:
        p_root = os.path.join(primary_root, d) if primary_root is not None else d
        if p_root not in d_exclude_list:
            files_in_d = get_files_recursive(os.path.join(root, d), d_exclude_list, f_exclude_list, ext_exclude_list, primary_root=p_root)
            if files_in_d:
                for f in files_in_d:
                    files.append(f)  # f is already joined with its subdirectory path
    return files
This is an update of my last version that accepts glob style wildcards in exclude lists.
The function basically walks into every subdirectory of the given path and returns the list of all files from those directories, as relative paths.
Function works like Matheus' answer, and may use optional exclude lists.
Eg:
files = get_files_recursive('/some/path')
files = get_files_recursive('/some/path', f_exclude_list=['.cache', '*.bak'])
files = get_files_recursive('C:\\Users', d_exclude_list=['AppData', 'Temp'])
files = get_files_recursive('/some/path', ext_exclude_list=['.log', '.db'])
Hope this helps someone like the initial answer of this thread helped me :)
import os
from fnmatch import fnmatch

def glob_path_match(path, pattern_list):
    """
    Checks if path is in a list of glob style wildcard paths
    :param path: path of file / directory
    :param pattern_list: list of wildcard patterns to check for
    :return: Boolean
    """
    return any(fnmatch(path, pattern) for pattern in pattern_list)

def get_files_recursive(root, d_exclude_list=None, f_exclude_list=None, ext_exclude_list=None, primary_root=None):
    """
    Walk a path to recursively find files
    Modified version of https://stackoverflow.com/a/24771959/2635443 that includes exclusion lists
    and accepts glob style wildcards on files and directories
    :param root: path to explore
    :param d_exclude_list: list of root relative directories paths to exclude
    :param f_exclude_list: list of filenames without paths to exclude
    :param ext_exclude_list: list of file extensions to exclude, ex: ['.log', '.bak']
    :param primary_root: only used for internal recursive exclusion lookup, don't pass an argument here
    :return: list of files found in path
    """
    if d_exclude_list is not None:
        # Make sure we use a valid os separator for exclusion lists; this is done recursively :(
        d_exclude_list = [os.path.normpath(d) for d in d_exclude_list]
    else:
        d_exclude_list = []
    if f_exclude_list is None:
        f_exclude_list = []
    if ext_exclude_list is None:
        ext_exclude_list = []

    files = [os.path.join(root, f) for f in os.listdir(root) if os.path.isfile(os.path.join(root, f))
             and not glob_path_match(f, f_exclude_list) and os.path.splitext(f)[1] not in ext_exclude_list]
    dirs = [d for d in os.listdir(root) if os.path.isdir(os.path.join(root, d))]
    for d in dirs:
        p_root = os.path.join(primary_root, d) if primary_root is not None else d
        if not glob_path_match(p_root, d_exclude_list):
            files_in_d = get_files_recursive(os.path.join(root, d), d_exclude_list, f_exclude_list, ext_exclude_list,
                                             primary_root=p_root)
            if files_in_d:
                for f in files_in_d:
                    files.append(f)  # f is already joined with its subdirectory path
    return files
