Match file names containing a given list of substrings

Match file names containing a given list of substrings - python

I'm writing a module that will take in an array of strings among other command line arguments. The array would be something like:
['PUPSFF', 'PCASPE', 'PCASEN']
My module has a method that will search for files matching a possible format in a directory:
def search(self, fundCode, type):
funds_string = '_'.join(fundCode)
files = set(os.listdir(self.unmappedDir))
file_match = 'citco_unmapped_{type}_{funds}_{start}_{end}.csv'.format(type=type, funds=funds_string, start=self.startDate, end=self.endDate)
if file_match in files:
filename = os.path.join(self.unmappedDir, file_match)
return self.read_file(filename)
else:
Logger.error('No {type} file/s found for {funds}, between {start} and {end}'.format(type=type, funds=fundCode, start=self.startDate, end=self.endDate))
So if my directory has a file like this one:
citco_unmapped_positions_PUPSFF_PCASPE_PCASEN_2018-07-01_2018-07-11.csv
And I pass this array as the cmd line argument: ['PUPSFF', 'PCASPE', 'PCASEN']
After calling my method (and passing in the rest of the self arguments) like this:
positions = alerter.search(alerter.fundCodes, 'positions')
It will search, find that file, and do whatever in needs to do.
However, I want it to be independent of the order. so that it will still find the file if the command line arguments are written like this:
['PCASPE', 'PCASEN', 'PUPSFF'] or
['PCASEN', 'PUPSFF', 'PCASPE'] or whatever
Any ideas on how to go on about this?

Use the all function to see the each of the needed tags is in the file name. This example should get you going:
files = [
"citco_unmapped_positions_PUPSFF_PCASPE_PCASEN_2018-07-01_2018-07-11.csv", # yes
"citco_unmapped_positions_PUPSFF_NO_WAY_PCASEN_2018-07-01_2018-07-11.csv", # no
"citco_unmapped_positions_PCASEN_PCASEN_PUPSFF_2018-07-01_2018-07-11.csv", # no
"citco_unmapped_positions_PCASPE_PCASEN_PUPSFF_2018-07-01_2018-07-11.csv", # yes
]
tags = ['PUPSFF', 'PCASPE', 'PCASEN']
for fname in files:
if (all(tag in fname for tag in tags)):
# the file is a match.
print("Match", fname)
Output:
Match citco_unmapped_positions_PUPSFF_PCASPE_PCASEN_2018-07-01_2018-07-11.csv
Match citco_unmapped_positions_PCASPE_PCASEN_PUPSFF_2018-07-01_2018-07-11.csv

Found a possible solution with permutations from itertools
def search(self, fundCodes, type):
permutations = self.find_permutations(fundCodes)
files = set(os.listdir(self.unmappedDir))
for perm in permutations:
fund_codes = '_'.join(perm)
file_match = 'citco_unmapped_{type}_{funds}_{start}_{end}.csv'.format(type=type, funds=fund_codes, start=self.startDate, end=self.endDate)
if file_match in files:
filename = os.path.join(self.unmappedDir, file_match)
return self.read_file(filename)
else:
Logger.error('No {type} file/s found for {funds}, between {start} and {end}'.format(type=type, funds=fund_codes, start=self.startDate, end=self.endDate))
def find_permutations(self, list):
perms = [p for p in permutations(list)]
return perms
Probably really slow though.

Related

How can I avoid repeating a for loop in two different functions

I am writing an ImageCollection class in python that should hold a dictionary with a name and the image-object (pygame.image object).
In one case I want to load all images inside a folder to the dictionary and in another case just specific files, for example only button-files.
What I have written so far is this:
class ImageCollection:
def __init__(self):
self.dict = {}
def load_images(self, path):
directory = os.fsencode(path)
for file in os.listdir(directory):
file_name = os.fsdecode(file)
img_path = path + "/" + file_name
if file_name.endswith(".jpg") or file_name.endswith(".png"):
# Remove extension for dictionary entry name and add image to dictionary
#-----------------------------------------------------------------------
dict_entry_name = file_name.removesuffix(".jpg").removesuffix(".png")
self.dict.update({dict_entry_name: image.Image(img_path, 0)})
def load_specific_images(self, path, contains_str):
directory = os.fsencode(path)
for file in os.listdir(directory):
file_name = os.fsdecode(file)
img_path = path + "/" + file_name
if file_name.endswith(".jpg") or file_name.endswith(".png"):
if file_name.rfind(contains_str):
# Remove extension for dictionary entry name and add image to dictionary
#-----------------------------------------------------------------------
dict_entry_name = file_name.removesuffix(".jpg").removesuffix(".png")
self.dict.update({dict_entry_name: image.Image(img_path, 0)})
The only problem is that this is probably bad programming pattern, right? In this case it probably doesnt matter but I would like to know what the best-practice in this case would be.
How can I avoid repeating myself in two different functions when the only difference is just a single if condition?
I have tried creating a "dict_add" function that creates the entry.
Then I was thinking I could create two different functions, one which directly calls "dict_add" and the other one checks for the specific condition and then calls "dict_add".
Then I thought I could add create just a single function with the for-loop but pass a function as an argument (which would be a callback I assume?). But one callback would need an additional argument so thats where I got stuck and wondered if my approach was correct.

You could make the contains_str an optional argument.
In cases where you want to load_images - you just provide the path
In cases where you want to load specific images - you provide the path and the contains_str argument
In both cases you call load_images(...)
Code:
class ImageCollection:
def __init__(self):
self.dict = {}
def load_images(self, path, contains_str=""):
directory = os.fsencode(path)
for file in os.listdir(directory):
file_name = os.fsdecode(file)
img_path = path + "/" + file_name
if file_name.endswith(".jpg") or file_name.endswith(".png"):
if contains_str == "" or (contains_str != "" and file_name.rfind(contains_str)):
# Remove extension for dictionary entry name and add image to dictionary
#-----------------------------------------------------------------------
dict_entry_name = file_name.removesuffix(".jpg").removesuffix(".png")
self.dict.update({dict_entry_name: image.Image(img_path, 0)})

Use Python Regex to search files and return filename

Please help.
I'm searching several .txt files, in several directories for a pattern. If there is a match, I would like to print the filename and location of the match.
Here is my code:
a = ('Z:/rodney/020year/2020-04/')
b = []
for y in os.listdir(a):
b.append(a+y+'/')
for filename in b:
path = filename
for filenames in listdir(path):
with open(path+filenames) as currentfile:
text = currentfile.read()
loan = re.compile(r'2 NNN \d LOANS')
bb = loan.search(text)
with open('z:/rodney/results.txt','a') as f:
f.write(os.path.dirname(path)+' ')
f.write(filenames[:-4]+'\n')
f.write(bb)
Error message = "TypeError: write() argument must be str, not None"
If there is a match, I would like to see only the filename and location of a match. I do not need to see "None" in every file where there is no match.

You have:
bb = loan.search(text)
But if the string you are looking for is not found in text, bb will ne None and consequently f.write(bb) will raise an exception (you did not indicate which line of code was raising the exception, so this is an educated guess).
You need to modify your code to be:
bb = loan.search(text)
if bb:
with open('z:/rodney/results.txt','a') as f:
f.write(os.path.dirname(path)+' ')
f.write(filenames[:-4]+'\n')
As an aside:
You have the statement loan = re.compile(r'2 NNN \d LOANS') in a loop. There is no need for that to be in a loop since it is invariant.

You can avoid using string slicing and bunch of functions to parse file path by using pathlib, where most of needed cases are already implemented. Also you can optimize your code by moving re.compile() out of loop (create once and use). Same with writing result - you don't need to reopen file every time, just open it once before loop start.
Optimized code:
from pathlib import Path
import re
src_dir = Path(r"Z:\rodney\020year\2020-04")
res_fn = r"z:\rodney\results.txt"
with open(res_fn, "w+") as res_f:
search_re = re.compile(r"2\sN{3}\s{28}\d\sLOANS")
for directory in src_dir.iterdir():
if directory.is_dir():
for file in directory.iterdir():
if file.is_file():
with open(file) as of:
bb = search_re.search(of.read())
if bb:
print(file.parent, file.stem, file=res_f)
print(bb.group(), file=res_f)
# res_f.write(file.parent + " " + file.stem + "\n" + bb.group())

Based on your source code, I optimized it.
I use os.walk to access each .txt file and then read it line by line in those txt files and save it in an enum. Then I will check each line in that enum with regex (I referenced Olvin Roght-san). If there is a match, it will print out the exact file location and line for you.
import os
import re
extension = [".txt"]
regex = r"2\sN{3}\s{28}\d\sLOANS"
re_Search = re.compile(regex)
path = "Z:\rodney\020year\2020-04"
for subdir, dirs, files in os.walk(path):
for file in files:
file_path = os.path.join(subdir, file)
ext = os.path.splitext(file)[-1].lower()
if ext in extension:
with open(file_path, "r") as f:
try:
f_content = f.readlines()
except Exception as e:
print(e)
for l_idx, line in enumerate(f_content):
if re_Search.search(line):
print(file_path)
print("At line: {l_idx}".format(l_idx = l_idx+1))
else:
print("Nothing!!")

delete older folder with similar name using python

I need to iterate over a folder tree. I have to check each subfolder, which looks like this:
moduleA-111-date
moduleA-112-date
moduleA-113-date
moduleB-111-date
moduleB-112-date
etc.
I figured out how to iterate over a folder tree. I can also use stat with mtime to get the date of the folder which seems easier than parsing the name of the date.
How do I single out modules with the same prefix (such as "moduleA") and compare their mtime's so I can delete the oldest?

Since you have no code, I assume that you're looking for design help. I'd lead my students to something like:
Make a list of the names
From each name, find the prefix, such as "moduleA. Put those in a set.
For each prefix in the set
Find all names with that prefix; put these in a temporary list
Sort this list.
For each file in this list *except* the last (newest)
delete the file
Does this get you moving?

I'm posting the code (answer) here, I suppose my question wasn't clear since I'm getting minus signs but anyway the solution wasn't as straight forward as I thought, I'm sure the code could use some fine tuning but it get's the job done.
#!/usr/bin/python
import os
import sys
import fnmatch
import glob
import re
import shutil
##########################################################################################################
#Remove the directory
def remove(path):
try:
shutil.rmtree(path)
print "Deleted : %s" % path
except OSError:
print OSError
print "Unable to remove folder: %s" % path
##########################################################################################################
#This function will look for the .sh files in a given path and returns them as a list.
def searchTreeForSh(path):
full_path = path+'*.sh'
listOfFolders = glob.glob(full_path)
return listOfFolders
##########################################################################################################
#Gets the full path to files containig .sh and returns a list of folder names (prefix) to be acted upon.
#listOfScripts is a list of full paths to .sh file
#dirname is the value that holds the root directory where listOfScripts is operating in
def getFolderNames(listOfScripts):
listOfFolders = []
folderNames = []
for foldername in listOfScripts:
listOfFolders.append(os.path.splitext(foldername)[0])
for folders in listOfFolders:
folder = folders.split('/')
foldersLen=len(folder)
folderNames.append(folder[foldersLen-1])
folderNames.sort()
return folderNames
##########################################################################################################
def minmax(items):
return max(items)
##########################################################################################################
#This function will check the latest entry in the tuple provided, and will then send "everything" to the remove function except that last entry
def sortBeforeDelete(statDir, t):
count = 0
tuple(statDir)
timeNotToDelete = minmax(statDir)
for ff in t:
if t[count][1] == timeNotToDelete:
count += 1
continue
else:
remove(t[count][0])
count += 1
##########################################################################################################
#A loop to run over the fullpath which is broken into items (see os.listdir above), elemenates the .sh and the .txt files, leaves only folder names, then matches it to one of the
#name in the "folders" variable
def coolFunction(folderNames, path):
localPath = os.listdir(path)
for folder in folderNames:
t = () # a tuple to act as sort of a dict, it will hold the folder name and it's equivalent st_mtime
statDir = [] # a list that will hold the st_mtime for all the folder names in subDirList
for item in localPath:
if os.path.isdir(path + item) == True:
if re.search(folder, item):
mtime = os.stat(path + '/' + item)
statDir.append(mtime.st_mtime)
t = t + ((path + item,mtime.st_mtime),)# the "," outside the perenthasis is how to make t be a list of lists and not set the elements one after theother.
if t == ():continue
sortBeforeDelete(statDir, t)
##########################################################################################################
def main(path):
dirs = os.listdir(path)
for component in dirs:
if os.path.isdir(component) == True:
newPath = path + '/' + component + '/'
listOfFolders= searchTreeForSh(newPath)
folderNames = getFolderNames(listOfFolders)
coolFunction(folderNames, newPath)
##########################################################################################################
if __name__ == "__main__":
main(sys.argv[1])

How to list all directories that do not contain a file type?

I'm trying to return a unique list (set) of all directories if they do not contain certain file types. If that file type is NOT found, add that directory name to a list for further auditing.
The function below will find all valid folders and add it to a set for further comparison. I'd like to extend this to only return those directories that DO NOT contain files in the out_list. These directories MAY contain sub-directories with file in the out_list. If that's TRUE, I only want that path of the folder name of the valid dir.
# directory = r'w:\workorder'
#
# Example:
# w:\workorder\region1\12345678\hi.pdf
# w:\workorder\region2\23456789\test\bye.pdf
# w:\workorder\region3\34567891\<empty>
# w:\workorder\region4\45678912\Final.doc
#
# Results:
# ['34567891', '45678912']
job_folders = set([]) #set list is unique
out_list = [".pdf", ".ppt", ".txt"]
def get_filepaths(directory):
"""
This function will generate the file names in a directory
tree by walking the tree either top-down or bottom-up. For each
directory in the tree rooted at directory top (including top itself),
it yields a 3-tuple (dirpath, dirnames, filenames).
"""
folder_paths = [] # List which will store all of the full filepaths.
# Walk the tree.
for item in os.listdir(directory):
if os.path.isdir(os.path.join(directory, item)):
folderpath = os.path.join(directory, item) # Join the two strings in order to form the full folderpath.
if re.search('^[0-9]', item):
job_folders.add(item[:8])
folder_paths.append(folderpath) # Add it to the list.
return folder_paths

Does this do what you want?
import os
def main():
exts = {'.pdf', '.ppt', '.txt'}
for directory in get_directories_without_exts('W:\\workorder', exts):
print(directory)
def get_directories_without_exts(root, exts):
for root, dirs, files in os.walk(root):
for file in files:
if os.path.splitext(file)[1] in exts:
break
else:
yield root
if __name__ == '__main__':
main()
Edit: After looking at your requirements, I decided to create a tree object to analyze your directory structure. Once created, it is simple to make a recursive query with caching that to find out if a directory "is okay." From there, creating a generator that only finds top-level directories that are "not okay" is fairly simple. There is probably a better way to do this, but the code should at least work.
import os
def main():
exts = {'.pdf', '.ppt', '.txt'}
for directory in Tree('W:\\workorder', exts).not_okay:
print(directory)
class Tree:
def __init__(self, root, exts):
if not os.path.isdir(root):
raise ValueError('root must be a directory')
self.name = root
self.exts = exts
self.files = set()
self.directories = []
try:
names = os.listdir(root)
except OSError:
pass
else:
for child in names:
path = os.path.join(root, child)
if os.path.isfile(path):
self.files.add(os.path.splitext(child)[1])
elif os.path.isdir(path):
self.directories.append(self.__class__(path, exts))
self._is_okay = None
#property
def is_okay(self):
if self._is_okay is None:
self._is_okay = any(c.is_okay for c in self.directories) or \
any(c in self.exts for c in self.files)
return self._is_okay
#property
def not_okay(self):
if self.is_okay:
for child in self.directories:
for not_okay in child.not_okay:
yield not_okay
else:
yield self.name
if __name__ == '__main__':
main()

Did you copy and paste the existing code from somewhere else? Because the docstring appears to be that of os.walk...
Your question is unclear on several points:
You state that the goal of the code is to "return a unique list (set) of all directories if they do not contain certain file types".
First of all list and set are different data structures.
Secondly, your code creates one of each: job_folders is a set of folder names containing numbers, while folder_paths is a list of complete paths to folders regardless of whether or not they contain numbers.
What do you actually want as output here?
Should "those directories that DO NOT contain files in the out_list" be defined recursively, or only include first-level contents of those directories? My solution assumes the latter
Your example is contradictory on this point: it shows 34567891 in the results, but not region3 in the results. Whether or not the definition is recursive, region3 should be included in the results because region3 does not contain any files with the listed extensions under it.
Should job_folders be populated only with directories that satisfy the criterion about their contents, or with all folder names containing numbers? My solution assumes the latter
One poor practice in your code that I'd highlight is your use of global variables, out_list and job_folders. I've changed the former to a second parameter of get_filepaths and the latter to a second return value.
Anyway, here goes the solution...
import os, re
ext_list = [".pdf", ".ppt", ".txt"]
def get_filepaths(directory, ext_list):
folder_paths = [] # List which will store all of the full filepaths.
job_folders = set([])
# Walk the tree.
for dir, subdirs, files in os.walk(directory):
_, lastlevel = os.path.split(dir)
if re.search('^[0-9]', lastlevel):
job_folders.add(lastlevel[:8])
for item in files:
root, ext = os.path.splitext(item)
if ext in ext_list:
break
else:
# Since none of the file extensions matched ext_list, add it to the list of folder_paths
folder_paths.append(os.path.relpath(dir, directory))
return folder_paths, job_folders
I created a directory structure identical to yours under /tmp and ran the following:
folder_paths, job_folders = get_filepaths( os.path.expandvars(r"%TEMP%\workorder"), ext_list )
print "folder_paths =", folder_paths
print "job_folders =", job_folders
Here's the output:
folder_paths = ['.', 'region1', 'region2', 'region2\\23456789', 'region3', 'region3\\34567891', 'region4', 'region4\\456789123']
job_folders = set(['12345678', '23456789', '34567891', '45678912'])
As you can see, region1\12345678 and region2\23456789\test are not included in the output folder_paths because they do directly contain files of the specified extensions; all the other subdirectories are included in the output because they do not directly contain files of the specified extensions.

To get the file extension:
name,ext = os.path.splitext(os.path.join(directory,item))
if ext not in out_list:
job_folders.add(item[:8])

thanks to #DanLenski and #NoctisSkytower I was able to get this worked out.
My WorkOrder directory is always the 7th folder down when walking in_path and I found that using os.sep.
I borrowed from both of your solutions and came up with the following:
import os, re
ext_list = [".pdf"]
in_path = r'\\server\E\Data\WorkOrder'
def get_filepaths(directory, ext_list):
not_okay = set([]) # Set which will store Job folder where no ext_list files found
okay = set([]) # Set which will store Job folder where ext_list files found
job_folders = set([]) #valid Job ID folder
# Walk the tree.
for dir, subdirs, files in os.walk(directory):
for item in files:
root, ext = os.path.splitext(item)
if len(dir.split(os.sep)) >= 8: #Tree must contain Job ID folder
job_folder = dir.split(os.sep)[7]
if ext in ext_list:
okay.add(job_folder)
else: # Since none of the file extensions matched ext_list, add it to the list of folder_paths
not_okay.add(job_folder)
bad_list = list(not_okay - okay)
bad_list.sort()
return bad_list
bad_list = get_filepaths( os.path.expandvars(in_path), ext_list )

Python - Loop through list within regex

Right, i'm relatively new to Python, which you will likely see in my code, but is there any way to iterate through a list within regex?
Basically, i'm looping through each filename within a folder, getting a code (2-6 digits) from the filename, and i'm wanting to compare it with a list of codes in a text file, which have a name attached, in the format "1234_Name" (without the quotation marks). If the code exists in both lists, I want to print out the list entry, i.e. 1234_Name. Currently my code only seems to look at the first entry in the text file's list and i'm not sure how to make it look through them all to find matches.
import os, re
sitesfile = open('C:/Users/me/My Documents/WORK_PYTHON/Renaming/testnames.txt', 'r')
filefolder = r'C:/Users/me/My Documents/WORK_PYTHON/Renaming/files/'
sites = sitesfile.read()
site_split = re.split('\n', sites)
old = []
newname = []
for site in site_split:
newname.append(site)
for root, dirs, filenames in os.walk(filefolder):
for filename in filenames:
fullpath = os.path.join(root, filename)
filename_split = os.path.splitext(fullpath)
filename_zero, fileext = filename_split
filename_zs = re.split("/", filename_zero)
filenm = re.search(r"[\w]+", str(filename_zs[-1:]))#get only filename, not path
filenmgrp = filenm.group()
pacode = re.search('\d\d+', filenmgrp)
if pacode:
pacodegrp = pacode.group()
match = re.match(pacodegrp, site)
if match:
print site
Hope this makes sense - thanks a lot in advance!

So, use this code instead:
import os
import re
def locate(pattern = r'\d+[_]', root=os.curdir):
for path, dirs, files in os.walk(os.path.abspath(root)):
for filename in re.findall(pattern, ' '.join(files)):
yield os.path.join(path, filename)
..this will only return files in a folder that match a given regex pattern.
with open('list_file.txt', 'r') as f:
lines = [x.split('_')[0] for x in f.readlines()]
print_out = []
for f in locate(<your code regex>, <your directory>):
if f in lines: print_out.append(f)
print(print_out)
...find the valid codes in your list_file first, then compare the files that come back with your given regex.

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

Match file names containing a given list of substrings - python

Related

How can I avoid repeating a for loop in two different functions

Use Python Regex to search files and return filename

delete older folder with similar name using python

How to list all directories that do not contain a file type?

Python - Loop through list within regex

Categories

Resources