Excluding all but a single subdirectory from a file search - python

I have a directory structure that resembles the following:
Dir1
Dir2
Dir3
Dir4
L SubDir4.1
L SubDir4.2
L SubDir4.3
I want to generate a list of files (with full paths) that include all the contents of Dirs1-3, but only SubDir4.2 inside Dir4. The code I have so far is
import fnmatch
import os
for root, dirs, files in os.walk( '.' )
if 'Dir4' in dirs:
if not 'SubDir4.2' in 'Dir4':
dirs.remove( 'Dir4' )
for file in files
print os.path.join( root, file )
My problem is that the part where I attempt to exclude any file that does not have SubDir4.2 in its path is excluding everything in Dir4, including the things I would like to remain. How should I amend the above to do what I desire?
Update 1: I should add that there are a lot of directories below Dir4, so manually listing them in an excludes list isn't a practical option. I'd like to be able to specify SubDir4.2 as the only subdirectory within Dir4 to be read.
Update 2: For reasons outside of my control, I only have access to Python version 2.4.3.

There are a few typos in your snippet. I propose this:
import os
def any_p(iterable):
    """Return True if at least one element of *iterable* is truthy.

    Stand-in for the built-in ``any()`` on Python versions older than 2.5.
    Iteration stops at the first truthy element; an empty iterable yields
    False.
    """
    for element in iterable:
        if element:
            return True
    return False
include_dirs = ['Dir4/SubDir4.2', 'Dir1/SubDir4.2', 'Dir3', 'Dir2'] # List all your included folder names in that
for root, dirs, files in os.walk( '.' ):
dirs[:] = [d for d in dirs if any_p(d in os.path.join(root, q_inc) for q_inc in include_dirs)]
for file in files:
print file
EDIT: According to the comments, I have changed this so that it is an include list instead of an exclude one.
EDIT2: Added any_p (an any() equivalent function for Python versions < 2.5).
EDIT3bis: if you have other subfolders with the same name 'SubDir4.2' in other folders, you can use the following to specify the location:
include_dirs = ['Dir4/SubDir4.2', 'Dir1/SubDir4.2']
Assuming you have a Dir1/SubDir4.2.
If there are a lot of those, then you may want to refine this approach with fnmatch, or probably a regex query.

I altered mstud's solution to give you what you are looking for:
import os;
for root, dirs, files in os.walk('.'):
# Split the root into its path parts
tmp = root.split(os.path.sep)
# If the length of the path is long enough to be your path AND
# The second to last part of the path is Dir4 AND
# The last part of the path is NOT SubDir4.2 THEN
# Stop processing this pass.
if (len(tmp) > 2) and (tmp[-2] == 'Dir4') and (tmp[-1] != 'SubDir4.2'):
continue
# If we aren't in Dir4, print the file paths.
if tmp[-1] != 'Dir4':
for file in files:
print os.path.join(root, file)
In short, the first "if" skips the printing of any directory contents under Dir4 that aren't SubDir4.2. The second "if" skips the printing of the contents of the Dir4 directory.

for root, dirs, files in os.walk('.'):
tmp = root.split(os.path.sep)
if len(tmp)>2 and tmp[-2]=="Dir4" and tmp[-1]=="SubDir4.2":
continue
for file in files:
print os.path.join(root, file)

Related

Check multiple files exist for each folders

I am looking for a way to print out the files that do not exist in each directory.
So far I could do
QA_files_pattern = '*QA.xlsx'
EP_files_pattern = '*EP.xlsx'
AD_files_pattern = '*AD.xlsx'
filelist = [QA_files_pattern,EP_files_pattern,AD_files_pattern]
path = os.path.abspath(os.getcwd())
for (path, dir, files) in os.walk(path): # Get all files in current file's path
for a_file in filelist:
if fnmatch.filter(os.listdir(path), a_file):
print(fnmatch.filter(os.listdir(path), a_file))
else:
print("missing"+path+a_file)
The problem with this approach is that it checks whether the file patterns exist not only in the folders I want to look at, but also in every directory it passes through to reach those folders.
Is there a way to make it search only the final (end) folders?
EDIT: I cannot specify how many subfolders there will be, but I am sure the files are only contained at the end folders and the common name for end folder is 'QS'
You can set your path so that it is explicitly on the file path you desire.
Right now you are setting the path on the current directory, which I believe is the root.
path = os.path.abspath(os.getcwd())
You could make it a more explicit path to limit the search:
path = os.path.abspath(os.getcwd())
extended_path= os.path.join(path, "specific_directory", "subdirectory", "etc")
And then sub in the extended_path
for (extended_path, dir, files) in os.walk(extended_path): # Get all files in current file's path
for a_file in filelist:
if fnmatch.filter(os.listdir(extended_path), a_file):
print(fnmatch.filter(os.listdir(extended_path), a_file))
else:
print("missing"+extended_path+a_file)
You can use glob recursively for something like:
from glob import glob
from os import path
QA = "*QA.xlsx"
EP = "*EP.xlsx"
AD = "*AD.xlsx"
base_path = "/base/path/**/*" # change /base/path to the dir you want to check
for d in glob(base_path, recursive=1): # get all files/dirs recursively inside base_path
if path.isdir(d): # If is a dir
to_match = {"QA":glob(f"{d}/{QA}"), "EP":glob(f"{d}/{EP}"), "AD":glob(f"{d}/{AD}")}
for k, v in to_match.items():
if not v:
print(f"Dir '{d}' Missing {k}")
OK, so I was able to solve my own question, in a much simpler way than I expected:
QA_files_pattern = '*QA.xlsx'
EP_files_pattern = '*EP.xlsx'
AD_files_pattern = '*AD.xlsx'
filelist = [QA_files_pattern,EP_files_pattern,AD_files_pattern]
path = os.path.abspath(os.getcwd())
for (path, dir, files) in os.walk(path): # Get all files in current file's path
if "QS" in path:
for a_file in filelist:
if fnmatch.filter(os.listdir(path), a_file):
print(fnmatch.filter(os.listdir(path), a_file))
else:
print("missing"+path+a_file)

Exclude subfolders and files

I have a little problem with excluding files and subfolders.
for x in os.walk('core'):
for y in glob.glob(os.path.join(x[0], '*.py')):
s = y.replace('\\', '.')
x = s.replace('.py', '')
cogs.append(x)
My code takes all files from every folder, and now I just want to exclude the files __init__ and models, and the subfolder migrations with files like 0002_auto etc. Right now I just delete them from the list manually, like:
cogs.remove('core.rpg.models')
cogs.remove('core.rpg.__init__')
cogs.remove('core.rpg.migrations.__init__')
Normally you would do for root,dirs,files in os.walk('core'): .. and operate on dirs or files and combine them with root to get the full path to them.
Using glob on top is akin to doing something to x[2] (aka files - which is the list of files inside root)
import os
what_i_want = []
skip_files = {"__init__.py"}
for root, dirs, files in os.walk('core'):
for f in files:
# skip the subdirs models and migrations
if root.endswith("models") or root.endswith("migrations"):
continue
# skip any non .py file
if not f.endswith(".py"):
continue
# skip certain .py files
if f in skip_files:
continue
# remove .py from filename
f = f[:-3]
# add filename including full root and subst \ to .
what_i_want.append(os.path.join(root,f).replace("\\","."))
This would need some more slicing to only include the starting dir ("core") and not the full path to it.

python: collect files with one extention from all sub-dir

I am trying to collect all files with all sub-directories and move to another directory
Code used
#collects all mp3 files from folders to a new folder
import os
from pathlib import Path
import shutil
#run once
path = os.getcwd()
os.mkdir("empetrishki")
empetrishki = path + "/empetrishki" #destination dir
print(path)
print(empetrishki)
#recursive collection
for root, dirs, files in os.walk(path, topdown=True, onerror=None, followlinks=True):
for name in files:
filePath = Path(name)
if filePath.suffix.lower() == ".mp3":
print(filePath)
os.path.join
filePath.rename(empetrishki.joinpath(filePath))
I have trouble with the last line of moving files: filePath.rename() nor shutil.move nor joinpath() have worked for me. Maybe that's because I am trying to change the element in the tuple - the output from os.walk
Similar code works with os.scandir but this would collect files only in the current directory
How can I fix that, thanks!
If you use pathlib.Path(name) that doesn't mean that something exists called name. Hence, you do need to be careful that you have a full path, or relative path, and you need to make sure to resolve those. In particular I am noting that you don't change your working directory and have a line like this:
filePath = Path(name)
This means that while you may be walking down the directory, your working directory may not be changing. You should make your path from the root and the name, it is also a good idea to resolve so that the full path is known.
filePath = Path(root).joinpath(name).resolve()
You can also place the Path(root) outside the inner loop as well. Now you have an absolute path from '/home/' to the filename. Hence, you should be able to rename with .rename(), like:
filePath.rename(x.parent.joinpath(newname))
#Or to another directory
filePath.rename(other_dir.joinpath(newname))
All together:
import os
from pathlib import Path

# Walk from the current working directory, as the question's script does.
path = Path.cwd()
empetrishki = path.joinpath("empetrishki").resolve()  # destination dir
# rename() fails if the destination directory does not exist yet.
empetrishki.mkdir(exist_ok=True)

for root, dirs, files in os.walk(path, topdown=True, onerror=None, followlinks=True):
    root = Path(root).resolve()
    # Prune the destination in place so files already collected there are
    # not visited (and renamed onto themselves) again.
    dirs[:] = [d for d in dirs if root.joinpath(d).resolve() != empetrishki]
    for name in files:
        file = root.joinpath(name)
        if file.suffix.lower() == ".mp3":
            # NOTE: files with the same name in different folders map to the
            # same destination path.
            file.rename(empetrishki.joinpath(file.name))
for root, dirs, files in os.walk(path, topdown=True, onerror=None, followlinks=True):
if root == empetrishki:
continue # skip the destination dir
for name in files:
basename, extension = os.path.splitext(name)
if extension.lower() == ".mp3":
oldpath = os.path.join(root, name)
newpath = os.path.join(empetrishki, name)
print(oldpath)
shutil.move(oldpath, newpath)
This is what I suggest. Your code is running on the current directory, and the file is at the path os.path.join(root, name) and you need to provide such path to your move function.
Besides, I would also suggest to use os.path.splitext for extracting the file extension. More pythonic. And also you might want to skip scanning your target directory.

Python: Loop to open multiple folders and files in python

I am new to python and currently work on data analysis.
I am trying to open multiple folders in a loop and read all files in folders.
Ex. working directory contains 10 folders needed to open and each folder contains 10 files.
My code for open each folder with .txt file;
file_open = glob.glob("home/....../folder1/*.txt")
I want to open folder 1 and read all files, then go to folder 2 and read all files... until folder 10 and read all files.
Can anyone help me how to write loop to open folder, included library needed to be used?
I have my background in R, for example, in R I could write loop to open folders and files use code below.
folder_open <- dir("......./main/")
for (n in 1 to length of (folder_open)){
file_open <-dir(paste0("......./main/",folder_open[n]))
for (k in 1 to length of (file_open){
file_open<-readLines(paste0("...../main/",folder_open[n],"/",file_open[k]))
//Finally I can read all folders and files.
}
}
This recursive method will scan all directories within a given directory and then print the names of the txt files. I kindly invite you to take it forward.
import os
def scan_folder(parent):
    """Recursively print the name of every ``.txt`` entry below *parent*.

    Only the bare entry name is printed, not its full path.  Any entry whose
    name does not end in ``.txt`` is descended into when it is a directory.
    """
    # iterate over all the entries in directory 'parent'
    for file_name in os.listdir(parent):
        if file_name.endswith(".txt"):
            # if it's a txt file, print its name (or do whatever you want)
            print(file_name)
            continue
        current_path = os.path.join(parent, file_name)
        if os.path.isdir(current_path):
            # if we're checking a sub-directory, recursively call this method
            scan_folder(current_path)
scan_folder("/example/path") # Insert parent direcotry's path
Given the following folder/file tree:
C:.
├───folder1
│ file1.txt
│ file2.txt
│ file3.csv
│
└───folder2
file4.txt
file5.txt
file6.csv
The following code will recursively locate all .txt files in the tree:
import os
import fnmatch
for path,dirs,files in os.walk('.'):
for file in files:
if fnmatch.fnmatch(file,'*.txt'):
fullname = os.path.join(path,file)
print(fullname)
Output:
.\folder1\file1.txt
.\folder1\file2.txt
.\folder2\file4.txt
.\folder2\file5.txt
Your glob() pattern is almost correct. Try one of these:
file_open = glob.glob("home/....../*/*.txt")
file_open = glob.glob("home/....../folder*/*.txt")
The first one will examine all of the text files in any first-level subdirectory of home/......, whatever that is. The second will limit itself to subdirectories named like "folder1", "folder2", etc.
I don't speak R, but this might translate your code:
for filename in glob.glob("......../main/*/*.txt"):
with open(filename) as file_handle:
for line in file_handle:
# perform data on each line of text
I think nice way to do that would be to use os.walk. That will generate tree and you can then iterate through that tree.
import os
directory = './'
for d in os.walk(directory):
print(d)
This code will look for all directories inside a directory, printing out the names of all files found there:
#--------*---------*---------*---------*---------*---------*---------*---------*
# Desc: print filenames one level down from starting folder
#--------*---------*---------*---------*---------*---------*---------*---------*
import os, fnmatch, sys
def find_dirs(directory, pattern):
    """Yield the path of each sub-directory of *directory* matching *pattern*.

    Only the immediate children of *directory* are examined (no recursion).
    *pattern* is an ``fnmatch``-style wildcard tested against the entry name;
    each yielded value is ``os.path.join(directory, name)``.
    """
    for item in os.listdir(directory):
        candidate = os.path.join(directory, item)
        if os.path.isdir(candidate) and fnmatch.fnmatch(item, pattern):
            yield candidate
def find_files(directory, pattern):
    """Yield the path of each regular file in *directory* matching *pattern*.

    Only the immediate children of *directory* are examined (no recursion).
    *pattern* is an ``fnmatch``-style wildcard tested against the entry name;
    each yielded value is ``os.path.join(directory, name)``.
    """
    for item in os.listdir(directory):
        candidate = os.path.join(directory, item)
        if os.path.isfile(candidate) and fnmatch.fnmatch(item, pattern):
            yield candidate
#--------*---------*---------*---------*---------*---------*---------*---------#
# M A I N L I N E
#--------*---------*---------*---------*---------*---------*---------*---------#
# Set directory (every backslash in the Windows path is escaped)
os.chdir("C:\\Users\\Mike\\Desktop")
# List each directory directly below the starting folder, then the files
# found directly inside it.
for filedir in find_dirs('.', '*'):
    print('Got directory:', filedir)
    for filename in find_files(filedir, '*'):
        print(filename)
sys.exit()  # END PROGRAM
pathlib is a good choice
from pathlib import Path
# or use: glob('**/*.txt')
for txt_path in [_ for _ in Path('demo/test_dir').rglob('*.txt') if _.is_file()]:
print(txt_path.absolute())

All Files in Dir & Sub-Dir

I would like to find all the files in a directory and all sub-directories.
code used:
import os
import sys
path = "C:\\"
dirs = os.listdir(path)
filename = "C.txt"
FILE = open(filename, "w")
FILE.write(str(dirs))
FILE.close()
print dirs
The problem is - this code only lists files in directories, not sub-directories. What do I need to change in order to also list files in subdirectories?
To traverse a directory tree you want to use os.walk() for this.
Here's an example to get you started:
import os
searchdir = r'C:\root_dir' # traversal starts in this directory (the root)
for root, dirs, files in os.walk(searchdir):
for name in files:
(base, ext) = os.path.splitext(name) # split base and extension
print base, ext
which would give you access to the file names and the components.
You'll find the functions in the os and os.path module to be of great use for this sort of work.
This function will help you: os.path.walk() http://docs.python.org/library/os.path.html#os.path.walk (Python 2 only; it was removed in Python 3, where os.walk() should be used instead)

Categories