Python - Loop through list within regex - python

Right, I'm relatively new to Python, which you will likely see in my code, but is there any way to iterate through a list within a regex?
Basically, I'm looping through each filename within a folder, getting a code (2-6 digits) from the filename, and I want to compare it with a list of codes in a text file, which have a name attached, in the format "1234_Name" (without the quotation marks). If the code exists in both lists, I want to print out the list entry, i.e. 1234_Name. Currently my code only seems to look at the first entry in the text file's list and I'm not sure how to make it look through them all to find matches.
import os, re
sitesfile = open('C:/Users/me/My Documents/WORK_PYTHON/Renaming/testnames.txt', 'r')
filefolder = r'C:/Users/me/My Documents/WORK_PYTHON/Renaming/files/'
sites = sitesfile.read()
site_split = re.split('\n', sites)
old = []
newname = []
for site in site_split:
newname.append(site)
for root, dirs, filenames in os.walk(filefolder):
for filename in filenames:
fullpath = os.path.join(root, filename)
filename_split = os.path.splitext(fullpath)
filename_zero, fileext = filename_split
filename_zs = re.split("/", filename_zero)
filenm = re.search(r"[\w]+", str(filename_zs[-1:]))#get only filename, not path
filenmgrp = filenm.group()
pacode = re.search('\d\d+', filenmgrp)
if pacode:
pacodegrp = pacode.group()
match = re.match(pacodegrp, site)
if match:
print site
Hope this makes sense - thanks a lot in advance!

So, use this code instead:
import os
import re
def locate(pattern=r'\d+[_]', root=os.curdir):
    """Yield the full path of every file under *root* whose name matches *pattern*.

    Args:
        pattern: Regular expression searched for (``re.search``) in each
            file name; the default looks for digits followed by ``_``.
        root: Directory to walk recursively; defaults to the current one.

    Yields:
        The absolute path of each matching file.
    """
    matcher = re.compile(pattern)
    for path, dirs, files in os.walk(os.path.abspath(root)):
        for filename in files:
            # Test each name on its own: running findall over the names
            # joined with spaces yields only the matched fragments
            # (e.g. "1234_"), which are not real file names.
            if matcher.search(filename):
                yield os.path.join(path, filename)
..this will only return files in a folder that match a given regex pattern.
with open('list_file.txt', 'r') as f:
lines = [x.split('_')[0] for x in f.readlines()]
print_out = []
for f in locate(<your code regex>, <your directory>):
if f in lines: print_out.append(f)
print(print_out)
...find the valid codes in your list_file first, then compare the files that come back with your given regex.

Related

Can't get python script to pick up all terms in text search list

I am trying to make a python 3 script that will search the user-defined folder for certain search terms and 1. copy files with matching words to a new folder on the desktop and 2. create a text file that contains the matched search terms.
The script is working, but for whatever reason, it seems to be maxing out with 1-2 search terms added to the "search_text_list". For example, in the example below, I only get matches for "Displayed" and "x exception". However, when I modify the list to just have "ERR:", it will pick it up, where it would miss it in my longer list of search terms.
I'm very new to python/coding, but anyone have an idea of what might be going on?
CODE
import os
import shutil
import datetime
source_folder = input("Enter the source folder path: ")
search_text_list = ["x exception", "Displayed", "!!!!!", "thermal event", "ERR:", "WRN", "InstrumentMonitorEvent"]
target_folder_name = "NovaSeq 6000 Parsing"
match_file_folder_name = "NovaSeq 6000 Analyzer Output_" + str(datetime.datetime.now().strftime("%Y-%m-%d %H-%M-%S"))
match_file_info = "Matched Search Terms.txt"
desktop = os.path.join(os.path.join(os.environ['USERPROFILE']), 'Desktop')
target_folder = os.path.join(desktop, target_folder_name)
match_file_folder = os.path.join(target_folder, match_file_folder_name)
match_file_path = os.path.join(match_file_folder, match_file_info)
if not os.path.exists(target_folder):
os.makedirs(target_folder)
if not os.path.exists(match_file_folder):
os.makedirs(match_file_folder)
#Find all the matched search terms defined in the Search_text_list and copy them to the folder as well as make a text file with the matched fw
matched_search_terms = []
for root, dirs, files in os.walk(source_folder):
if "ETF" in dirs:
dirs.remove("ETF")
for file in files:
if "Warnings_And_Errors" in file:
continue
if "RunSetup" in file:
continue
if "Wash" in file:
continue
full_path = os.path.join(root, file)
with open(full_path, 'r', errors='ignore') as f:
file_content = f.read().lower()
for search_text in search_text_list:
if search_text.lower() in file_content:
matched_search_terms.append(search_text)
shutil.copy2(full_path, match_file_folder)
break
with open(match_file_path, 'w') as f:
if len(matched_search_terms) > 0:
f.write("\n".join(matched_search_terms))
else:
f.write("NA")
I have confirmed that my search text list works, but there seems to be a limit on the number of terms I can add to my list that will match successfully.
You only get the first matching search term for each file, because you break out of the loop after the first match.
Instead of a loop, use a list comprehension to get all the matches, then add that to the list.
# Read the file once, lower-cased, so the comparison is case-insensitive.
with open(full_path, 'r', errors='ignore') as f:
    file_content = f.read().lower()
# Collect every search term found in this file, not just the first one.
matching_terms = [term for term in search_text_list if term.lower() in file_content]
if matching_terms:
    matched_search_terms.extend(matching_terms)
    shutil.copy2(full_path, match_file_folder)
You might also want to make matched_search_terms a set rather than a list, so you don't get lots of duplicates.
Here is the updated code that is only matching 1 file now. I'm now matching Displayed, WRN, InstrumentMonitorEvent, which I was previously missing.
import os
import shutil
import datetime
source_folder = input("Enter the source folder path: ")
search_text_list = ["x exception", "Displayed", "!!!!!", "thermal event", "ERR:", "WRN", "InstrumentMonitorEvent"]
target_folder_name = "NovaSeq 6000 Parsing"
match_file_folder_name = "NovaSeq 6000 Analyzer Output_" + str(datetime.datetime.now().strftime("%Y-%m-%d %H-%M-%S"))
match_file_info = "Matched Search Terms.txt"
desktop = os.path.join(os.path.join(os.environ['USERPROFILE']), 'Desktop')
target_folder = os.path.join(desktop, target_folder_name)
match_file_folder = os.path.join(target_folder, match_file_folder_name)
match_file_path = os.path.join(match_file_folder, match_file_info)
if not os.path.exists(target_folder):
os.makedirs(target_folder)
if not os.path.exists(match_file_folder):
os.makedirs(match_file_folder)
#Find all the matched search terms defined in the Search_text_list and copy them to the folder as well as make a text file with the matched fw
matched_search_terms = []
for root, dirs, files in os.walk(source_folder):
for file in files:
if "Warnings_And_Errors" in file:
continue
if "RunSetup" in file:
continue
if "Wash" in file:
continue
full_path = os.path.join(root, file)
with open(full_path, 'r', errors='ignore') as f:
file_content = f.read().lower()
matching_terms = [term for term in search_text_list if term.lower() in file_content]
if matching_terms:
matched_search_terms.extend(matching_terms)
shutil.copy2(full_path, match_file_folder)
break
with open(match_file_path, 'w') as f:
if len(matched_search_terms) > 0:
f.write("\n".join(matched_search_terms))
else:
f.write("NA")

I need to find all text files in all subfolders created on a certain date, open them, and copy the contents into a single text file

My goal is to take the contents of all text files in subfolders created today and move them to a single existing report.txt but I can't seem to find a good way to go about it. I'm not very experienced in coding so any help would be much appreciated. Here is what I have so far (I know it's rubbish):
if getmtime == today:
with open(glob.iglob(drive + "://CADIQ//CADIQ_JOBS//?????????????????????")) as f:
for line in f:
content += line
with open(reportFile, "a") as f:
f.write(content)
Try this, based on How do I list all files of a directory?
import os, time
def last_mod_today(path):
    """Return True when *path* was last modified on today's local date.

    The year, month and day of the file's modification time
    (``os.path.getmtime``) are compared with those of the current local
    time; the time of day is ignored.
    """
    modified = time.localtime(os.path.getmtime(path))
    today = time.localtime(time.time())
    return (modified.tm_year, modified.tm_mon, modified.tm_mday) == (
        today.tm_year, today.tm_mon, today.tm_mday)
def name_to_path(d, x):
    """Return the normalised path of entry *x* in directory *d*, resolved against the current working directory."""
    base = os.path.join(os.getcwd(), d)
    return os.path.normpath(os.path.join(base, x))
def log_files(d):
    """Append the content of every file in *d* modified today to ``log.txt``.

    Only the files directly inside *d* are considered (no recursion); *d*
    is resolved against the current working directory.  Each logged file
    is wrapped in separator lines and preceded by its path.

    WARNING: files are opened in text mode; binary files (a JPEG, say)
    are not handled specially.
    """
    scand_dir = os.path.join(os.getcwd(), d)
    print(f"scanning {scand_dir}...")
    (_, _, filenames) = next(os.walk(scand_dir))
    log_path = os.path.abspath("log.txt")
    # The with-block closes the log even if reading one of the files fails.
    with open("log.txt", 'a') as log:
        for f in filenames:
            file_path = name_to_path(d, f)
            # Skip the log itself: on a second run the same day it counts as
            # "modified today" and would be copied back into itself.
            if file_path == log_path:
                continue
            if last_mod_today(file_path):
                with open(file_path, 'r') as todays_file:
                    log.write('##############################\n')
                    log.write(f"file : {file_path}\n")
                    log.write(todays_file.read())
                    log.write('\n')
                    log.write('##############################\n')
# First log the files that sit directly in the current directory ...
(_, dirnames, _) = next(os.walk('./'))
log_files('./')
# ... then those in each immediate sub-directory (one level deep only).
for d in dirnames:
    log_files(d)
I would start by creating a desired_date object, which is a datetime.date. You can then format that date into a string, which makes up the pattern you want to look for in your glob. The glob pattern doesn't care about the time, just the date.
from pathlib import Path
import datetime

# Build the folder-name pattern from the date only; the trailing "_*"
# matches whatever follows the date in the folder name.
desired_date = datetime.date(year=2020, month=12, day=22)
pattern = "13.2.1_" + desired_date.strftime("%y_%m_%d") + "_*"

for path in Path("path/to/folders").glob(pattern):
    # Only directories are of interest; plain files matching the pattern are skipped.
    if path.is_dir():
        print(path)
From there, you can visit each path, glob all text files in the current path, and accumulate the lines in each text file. Finally, write everything to one file.
import glob
import shutil

# Stream every text file found one directory level down into report.txt.
# Files are handled as bytes, so no decoding/encoding step can alter the data,
# and copyfileobj copies in chunks instead of holding everything in memory.
# NOTE: 'wb' replaces any existing report.txt; open with 'ab' to append instead.
with open('report.txt', 'wb') as rep:
    for file in glob.glob('./*/*.txt'):  # u can change as per your directory
        # Open the path exactly as glob returned it; stripping the directory
        # part would make open() look in the current directory instead.
        with open(file, 'rb') as f1:
            shutil.copyfileobj(f1, rep)
Hope this helps :)
It is better to read and write the files as bytes, because converting the content to text and back can sometimes corrupt the data.

Use Python Regex to search files and return filename

Please help.
I'm searching several .txt files, in several directories for a pattern. If there is a match, I would like to print the filename and location of the match.
Here is my code:
a = ('Z:/rodney/020year/2020-04/')
b = []
for y in os.listdir(a):
b.append(a+y+'/')
for filename in b:
path = filename
for filenames in listdir(path):
with open(path+filenames) as currentfile:
text = currentfile.read()
loan = re.compile(r'2 NNN \d LOANS')
bb = loan.search(text)
with open('z:/rodney/results.txt','a') as f:
f.write(os.path.dirname(path)+' ')
f.write(filenames[:-4]+'\n')
f.write(bb)
Error message = "TypeError: write() argument must be str, not None"
If there is a match, I would like to see only the filename and location of a match. I do not need to see "None" in every file where there is no match.
You have:
bb = loan.search(text)
But if the string you are looking for is not found in text, bb will be None and consequently f.write(bb) will raise an exception (you did not indicate which line of code was raising the exception, so this is an educated guess).
You need to modify your code to be:
bb = loan.search(text)
if bb:
with open('z:/rodney/results.txt','a') as f:
f.write(os.path.dirname(path)+' ')
f.write(filenames[:-4]+'\n')
As an aside:
You have the statement loan = re.compile(r'2 NNN \d LOANS') in a loop. There is no need for that to be in a loop since it is invariant.
You can avoid string slicing and a bunch of path-parsing functions by using pathlib, where most of the needed cases are already implemented. You can also optimize your code by moving re.compile() out of the loop (create it once and reuse it). The same goes for writing the result - you don't need to reopen the file every time, just open it once before the loop starts.
Optimized code:
from pathlib import Path
import re

src_dir = Path(r"Z:\rodney\020year\2020-04")
res_fn = r"z:\rodney\results.txt"
# Compiled once, before any file is touched.
search_re = re.compile(r"2\sN{3}\s{28}\d\sLOANS")

# The result file is opened a single time for the whole scan.
with open(res_fn, "w+") as res_f:
    for directory in src_dir.iterdir():
        if not directory.is_dir():
            continue
        for file in directory.iterdir():
            if not file.is_file():
                continue
            bb = search_re.search(file.read_text())
            if bb:
                # One line with folder and file name (without extension),
                # followed by the matched text.
                print(file.parent, file.stem, file=res_f)
                print(bb.group(), file=res_f)
Based on your source code, I optimized it.
I use os.walk to visit each .txt file, read its lines into a list, and then check every line (numbered with enumerate) against the regex (taken from Olvin Roght's answer). If there is a match, it prints the exact file location and the line number for you.
import os
import re

extension = [".txt"]
regex = r"2\sN{3}\s{28}\d\sLOANS"
re_Search = re.compile(regex)
# Raw string: in a normal literal "\r" is a carriage return and "\020" /
# "\202" are octal escapes, which silently corrupts this Windows path.
path = r"Z:\rodney\020year\2020-04"

for subdir, dirs, files in os.walk(path):
    for file in files:
        ext = os.path.splitext(file)[-1].lower()
        if ext not in extension:
            continue
        file_path = os.path.join(subdir, file)
        try:
            with open(file_path, "r") as f:
                f_content = f.readlines()
        except (OSError, UnicodeDecodeError) as e:
            # Report the unreadable file and move on; without this the loop
            # below would use an unset (or stale) f_content.
            print(e)
            continue
        for l_idx, line in enumerate(f_content):
            if re_Search.search(line):
                print(file_path)
                print("At line: {l_idx}".format(l_idx=l_idx + 1))
            else:
                # NOTE(review): printed for every non-matching line; drop this
                # branch if only matches should be reported.
                print("Nothing!!")

How to search through both zipped and unzipped folders for a specific line

I'm trying to implement a Python script that takes a folder from the user (it can be zipped or unzipped) and searches through all the files in the folder to output the specific lines that my regular expression matches. My code below works for regular unzipped folders, but I can't figure out how to do the same with zipped folders that are passed to the function. Below is my code, thanks in advance!
def myFunction(folder_name):
path = folder_name
for (path, subdirs, files) in os.walk(path):
files = [f for f in os.listdir(path) if f.endswith('.txt') or f.endswith('.log') or f.endswith('-release') or f.endswith('.out') or f.endswith('messages') or f.endswith('.zip')] # Specify here the format of files you hope to search from (ex: ".txt" or ".log")
files.sort() # file is sorted list
files = [os.path.join(path, name) for name in files] # Joins the path and the name, so the files can be opened and scanned by the open() function
# The following for loop searches all files with the selected format
for filename in files:
#print('start parsing... ' + str(datetime.datetime.now()))
matched_line = []
try:
with open(filename, 'r', encoding = 'utf-8') as f:
f = f.readlines()
except:
with open(filename, 'r') as f:
f = f.readlines()
# print('Finished parsing... ' + str(datetime.datetime.now()))
for line in f:
#0strip out \x00 from read content, in case it's encoded differently
line = line.replace('\x00', '')
RE2 = r'^Version: \d.+\d.+\d.\w\d.+'
RE3 = r'^.+version.(\d+.\d+.\d+.\d+)'
pattern2 = re.compile('('+RE2+'|'+RE3+')', re.IGNORECASE)
for match2 in pattern2.finditer(line):
matched_line.append(line)
print(line)
#Calling the function to use it
myFunction(r"SampleZippedFolder.zip")
The try and except block of my code was my attempt to open the zipped folder and read it. I'm still not very clear with how to open the zipped folder or how it works. Please let me know how I can modify my code to make it work, much appreciated!
One possibility is to first determine what kind of object folder_name is, using zipfile.is_zipfile() and os.path.isdir(); whichever one succeeds, get the list of files and proceed. Maybe something like this:
import zipfile, os, re
def myFunction(folder_name):
    """Collect the file names in *folder_name*, a zip archive or a folder.

    Args:
        folder_name: Path to a zip archive or to a directory.

    Raises:
        ValueError: If *folder_name* is neither a zip archive nor a directory.
    """
    path = folder_name
    if zipfile.is_zipfile(path):
        print('ZipFile: {}'.format(path))
        # The with-block closes the archive once its member names are read.
        with zipfile.ZipFile(path) as f:
            files = f.namelist()
    elif os.path.isdir(path):
        print('Folder: {}'.format(path))
        files = os.listdir(path)
    else:
        # Fail with a clear message instead of iterating over None below.
        raise ValueError('Not a zip file or folder: {}'.format(path))
    # should now have a list of files
    # proceed processing the files
    for filename in files:
        ...

Python - Iterating over all text files recursively

I am creating a text parser with python 3.6. I have a file layout like below:
(The real file structure I will be using is much more extensive than this.)
-Directory(main folder)
-amerigroup.txt
-bcbs.txt
childfolder
-medicare.txt
I need to extract text into 2 different lists (going through and appending to my ever-growing lists). Whenever I run my current code, I can't seem to get my program to open up my medicare.txt file to read and extract the information. I get an error stating that there is no such file or directory: 'medicare.txt'.
My goal is to get the data from the 3 files and extract it in one go. How do I get the amerigroup and bcbs data then go into the childfolder and get medicare.txt, then repeat that for all branches of my file path?
I am simply trying to open and close my text files in this code snippet. Here's what I have so far:
import re
import os
import pandas as pd
#change active directory
os.chdir(r'\\company\Files\HomeDrive\user\My Documents\claimstest')
#rootdir = r'\\company\Files\HomeDrive\user\My Documents\claimstest'
#set up Regular Expression objects to parse X12
claimidRegex = re.compile(r'(CLM\*)(\d+)')
dxRegex = re.compile(r'(ABK:)(\w\d+)(\*|~)(ABF:)?(\w\d+)?(\*|~)?(ABF:)?(\w\d+)?(\*|~)?(ABF:)?(\w\d+)?(\*|~)?(ABF:)?(\w\d+)?(\*|~)?(ABF:)?(\w\d+)?(\*|~)?(ABF:)?(\w\d+)?(\*|~)?(ABF:)?(\w\d+)?(\*|~)?')
claimids = []
dxinfo = []
for dirpath, dirnames, files in os.walk(topdir):
for name in files:
cid = []
dx = []
if name.lower().endswith(exten):
data = open(name, 'r')
data.close()
Thank you so much for taking your time to assist me on this!
edit: I have tried using walk to no avail so far. My most recent attempt (I tried using txtfile_full_path as well--did not work):
for dirpath, dirnames, filename in os.walk(base_dir):
for filename in filename:
#defining file type
txtfile=open(filename,"r")
txtfile_full_path = os.path.join(dirpath, filename)
print(filename)
edit2 for anyone interested. This was my final solution to the problem:
import re
import os
import pandas as pd
#change active directory
os.chdir(r'\\company\Files\HomeDrive\user\My Documents\claimstest')
base_dir = (r'\\company\Files\HomeDrive\user\My Documents\claimstest')
#set up Regular Expression objects to parse X12
claimidRegex = re.compile(r'(CLM\*)(\d+)')
dxRegex = re.compile(r'(ABK:)(\w\d+)(\*|~)(ABF:)?(\w\d+)?(\*|~)?(ABF:)?(\w\d+)?(\*|~)?(ABF:)?(\w\d+)?(\*|~)?(ABF:)?(\w\d+)?(\*|~)?(ABF:)?(\w\d+)?(\*|~)?(ABF:)?(\w\d+)?(\*|~)?(ABF:)?(\w\d+)?(\*|~)?')
claimids = []
dxinfo = []
for dirpath, dirnames, filename in os.walk(base_dir):
for filename in filename:
txtfile_full_path = os.path.join(dirpath, filename)
x12 = open(txtfile_full_path, 'r')
for i in x12:
match = claimidRegex.findall(i)
for word in match:
claimids.append(word[1])
x12.seek(0)
for i in x12:
match = dxRegex.findall(i)
for word in match:
dxinfo.append(word)
x12.close()
datadic = dict(zip(claimids, dxinfo))
You need to pass the full path to open. Just creating a string variable somewhere won't do anything for you! So the following should avoid your error:
txt_list = []
for dirpath, dirnames, filenames in os.walk(base_dir):
    # A separate name for the list keeps the loop variable from shadowing it.
    for filename in filenames:
        # create full path
        txtfile_full_path = os.path.join(dirpath, filename)
        with open(txtfile_full_path) as f:
            txt_list.append(f.read())
It should be easy enough to integrate the segregation based on your regexes now...

Categories