I have several .txt files and I need to extract certain data from them. The files look similar, but each of them stores different data. Here is an example of such a file:
Start Date: 21/05/2016
Format: TIFF
Resolution: 300dpi
Source: X Company
...
There is more information in the text files, but I need to extract the start date, format and the resolution. Files are in the same parent directory ("E:\Images") but each file has its own folder. Therefore I need a script for recursive reading of these files. Here is my script so far:
#importing a library
import os
#defining location of parent folder
# NOTE(review): backslash in a non-raw string; '\I' happens to stay literal
# here, but r'E:\Images' is the safe form.
BASE_DIRECTORY = 'E:\Images'
#scanning through subfolders
for dirpath, dirnames, filenames in os.walk(BASE_DIRECTORY):
    for filename in filenames:
        #defining file type
        # NOTE(review): this opens the bare filename relative to the current
        # working directory and fails for files inside subfolders; the joined
        # txtfile_full_path below is what must be opened (after the join).
        txtfile=open(filename,"r")
        txtfile_full_path = os.path.join(dirpath, filename)
        # NOTE(review): this `try:` has no matching except/finally clause,
        # so the block is a SyntaxError as written.
        try:
            for line in txtfile:
                # NOTE(review): str has no method "startswidth" -- these three
                # calls raise AttributeError; the method is startswith().
                if line.startswidth('Start Date:'):
                    start_date = line.split()[-1]
                elif line.startswidth('Format:'):
                    data_format = line.split()[-1]
                elif line.startswidth('Resolution:'):
                    resolution = line.split()[-1]
            # NOTE(review): start_date / data_format / resolution are unbound
            # here if the file lacks any of the three lines.
            print(
                txtfile_full_path,
                start_date,
                data_format,
                resolution)
Ideally it might be better if Python extracts the data together with the name of each file and saves it in a text file. Because I don't have much experience in Python, I don't know how to progress any further.
Here is the code I've used:
# Collect the "Start Date" / "Format" / "Resolution" lines of every .txt file
# under BASE_DIRECTORY and write them, grouped per file, into output.txt.
# importing libraries
import os

# defining location of parent folder (raw string keeps the backslash literal)
BASE_DIRECTORY = r'E:\Images'

output = {}      # file path -> list of extracted lines
file_list = []   # all .txt files found under BASE_DIRECTORY

# scanning through sub folders
for (dirpath, dirnames, filenames) in os.walk(BASE_DIRECTORY):
    for f in filenames:
        # endswith('.txt') is the real intent; the original `'txt' in f`
        # also matched names that merely contained "txt" anywhere.
        if f.endswith('.txt'):
            file_list.append(os.path.join(dirpath, f))

for f in file_list:
    print(f)
    # with-statement guarantees the file is closed
    # (the original open() without close() leaked one handle per file)
    with open(f, 'r') as txtfile:
        output[f] = []
        for line in txtfile:
            # keep only the three fields of interest
            if 'Start Date:' in line or 'Format' in line or 'Resolution' in line:
                output[f].append(line)

# Write the report, one section per file, in sorted path order.
with open('output.txt', 'w') as output_file:
    for tab in sorted(output):
        output_file.write(tab + '\n')
        output_file.write('\n')
        for row in output[tab]:
            output_file.write(row)
        output_file.write('\n')
        output_file.write('----------------------------------------------------------\n')

# pause so the console window stays open (Python 3 spelling of raw_input)
input()
You do not need regular expressions. You can use
basic string functions:
# Plain string methods are enough -- no regex needed.
txtfile = open(filename, "r")
for line in txtfile:
    # fixed typo: the method is startswith, not "startswidth"
    if line.startswith("Start Date:"):
        start_date = line.split()[-1]
    ...
Use `break` once you have collected all the information.
To grab the Start Date, you can use the following regex:
^(?:Start Date:)\D*(\d+/\d+/\d+)$
# ^ anchor the regex to the start of the line
# capture the string "Start Date:" in a group
# followed by non digits zero or unlimited times
# followed by a group with the start date in it
In Python this would be:
import re
regex = r"^(?:Start Date:)\D*(\d+/\d+/\d+)$"
# the variable line points to your line in the file
if re.search(regex, line):
# do sth. useful here
See a demo on regex 101.
Related
I am trying to make a python 3 script that will search the user-defined folder for certain search terms and 1. copy files with matching words to a new folder on the desktop and 2. create a text file that contains the matched search terms.
The script is working, but for whatever reason, it seems to be maxing out with 1-2 search terms added to the "search_text_list". For example, in the example below, I only get matches for "Displayed" and "x exception". However, when I modify the list to just have "ERR:", it will pick it up, where it would miss it in my longer list of search terms.
I'm very new to python/coding, but anyone have an idea of what might be going on?
CODE
# Copy files containing any search term to a dated folder on the Desktop and
# record which terms matched in a report file.
import os
import shutil
import datetime

source_folder = input("Enter the source folder path: ")
search_text_list = ["x exception", "Displayed", "!!!!!", "thermal event", "ERR:", "WRN", "InstrumentMonitorEvent"]
target_folder_name = "NovaSeq 6000 Parsing"
match_file_folder_name = "NovaSeq 6000 Analyzer Output_" + str(datetime.datetime.now().strftime("%Y-%m-%d %H-%M-%S"))
match_file_info = "Matched Search Terms.txt"
# Desktop path on Windows (relies on USERPROFILE being set)
desktop = os.path.join(os.path.join(os.environ['USERPROFILE']), 'Desktop')
target_folder = os.path.join(desktop, target_folder_name)
match_file_folder = os.path.join(target_folder, match_file_folder_name)
match_file_path = os.path.join(match_file_folder, match_file_info)
if not os.path.exists(target_folder):
    os.makedirs(target_folder)
if not os.path.exists(match_file_folder):
    os.makedirs(match_file_folder)
#Find all the matched search terms defined in the Search_text_list and copy them to the folder as well as make a text file with the matched fw
matched_search_terms = []
for root, dirs, files in os.walk(source_folder):
    # pruning "ETF" from dirs stops os.walk from descending into it
    if "ETF" in dirs:
        dirs.remove("ETF")
    for file in files:
        # skip report-like files that would self-match
        if "Warnings_And_Errors" in file:
            continue
        if "RunSetup" in file:
            continue
        if "Wash" in file:
            continue
        full_path = os.path.join(root, file)
        with open(full_path, 'r', errors='ignore') as f:
            file_content = f.read().lower()
        for search_text in search_text_list:
            if search_text.lower() in file_content:
                matched_search_terms.append(search_text)
                shutil.copy2(full_path, match_file_folder)
                # NOTE(review): this break leaves the term loop after the
                # FIRST matching term, so at most one term is recorded per
                # file -- this is why longer search lists appear to max out.
                break
with open(match_file_path, 'w') as f:
    if len(matched_search_terms) > 0:
        f.write("\n".join(matched_search_terms))
    else:
        f.write("NA")
I have confirmed that my search text list works, but there seems to be a limit on the number of terms I can add to my list that will match successfully.
You only get the first matching search term for each file, because you break out of the loop after the first match.
Instead of a loop, use a list comprehension to get all the matches, then add that to the list.
with open(full_path, 'r', errors='ignore') as f:
    file_content = f.read().lower()
# Collect ALL matching terms at once instead of stopping at the first one.
matching_terms = [term for term in search_text_list if term.lower() in file_content]
if matching_terms:
    # fixed typo: was "matched_seach_terms", which would raise NameError
    matched_search_terms.extend(matching_terms)
    shutil.copy2(full_path, match_file_folder)
You might also want to make matched_search_terms a set rather than a list, so you don't get lots of duplicates.
Here is the updated code that is only matching 1 file now. I'm now matching Displayed, WRN, InstrumentMonitorEvent, which I was previously missing.
# Search every file under source_folder for the terms in search_text_list,
# copy each matching file to a dated output folder on the Desktop, and write
# the full list of matched terms to a report file.
import os
import shutil
import datetime

source_folder = input("Enter the source folder path: ")
search_text_list = ["x exception", "Displayed", "!!!!!", "thermal event", "ERR:", "WRN", "InstrumentMonitorEvent"]
target_folder_name = "NovaSeq 6000 Parsing"
match_file_folder_name = "NovaSeq 6000 Analyzer Output_" + str(datetime.datetime.now().strftime("%Y-%m-%d %H-%M-%S"))
match_file_info = "Matched Search Terms.txt"
# Desktop path on Windows (relies on USERPROFILE being set)
desktop = os.path.join(os.environ['USERPROFILE'], 'Desktop')
target_folder = os.path.join(desktop, target_folder_name)
match_file_folder = os.path.join(target_folder, match_file_folder_name)
match_file_path = os.path.join(match_file_folder, match_file_info)
# makedirs creates target_folder and match_file_folder in one call
os.makedirs(match_file_folder, exist_ok=True)

# Find all the matched search terms and copy the files that contain them.
matched_search_terms = []
for root, dirs, files in os.walk(source_folder):
    for file in files:
        # skip report-like files that would self-match
        if "Warnings_And_Errors" in file or "RunSetup" in file or "Wash" in file:
            continue
        full_path = os.path.join(root, file)
        with open(full_path, 'r', errors='ignore') as f:
            file_content = f.read().lower()
        matching_terms = [term for term in search_text_list if term.lower() in file_content]
        if matching_terms:
            matched_search_terms.extend(matching_terms)
            shutil.copy2(full_path, match_file_folder)
            # BUG FIX: a stray `break` here ended the file loop after the
            # first matching file, so only one file was ever processed.

with open(match_file_path, 'w') as f:
    if matched_search_terms:
        # de-duplicate while preserving first-seen order
        f.write("\n".join(dict.fromkeys(matched_search_terms)))
    else:
        f.write("NA")
My goal is to take the contents of all text files in subfolders created today and move them to a single existing report.txt but I can't seem to find a good way to go about it. I'm not very experienced in coding so any help would be much appreciated. Here is what I have so far (I know it's rubbish):
if getmtime == today:
with open(glob.iglob(drive + "://CADIQ//CADIQ_JOBS//?????????????????????")) as f:
for line in f:
content += line
with open(reportFile, "a") as f:
f.write(content)
Try this, based on How do I list all files of a directory?
import os, time
def last_mod_today(path):
    """Return True when *path* was last modified on the current calendar day.

    Compares the year, month and day fields of the file's mtime against
    the local time of "now".
    """
    mtime = time.localtime(os.path.getmtime(path))
    now = time.localtime(time.time())
    return (mtime.tm_year, mtime.tm_mon, mtime.tm_mday) == (now.tm_year, now.tm_mon, now.tm_mday)
def name_to_path(d, x):
    """Join the working directory, directory *d* and filename *x* into one
    normalized absolute path."""
    return os.path.normpath(os.path.join(os.path.join(os.getcwd(), d), x))
def log_files(d):
    '''
    Append to log.txt the content of every file directly inside *d*
    (relative to the current working directory) whose last-modified
    date is today. Subdirectories of *d* are not recursed into.

    WARNING : files are read as text -- a binary file (e.g. a JPEG) will
    produce garbage or a UnicodeDecodeError.
    '''
    scand_dir = os.path.join(os.getcwd(), d)
    print(f"scanning {scand_dir}...")
    # next(os.walk(...)) yields only the top level of scand_dir
    (_, _, filenames) = next(os.walk(scand_dir))
    # BUG FIX: the original used a bare open()/close() pair, leaking the log
    # handle whenever a read raised; the with-statement always closes it.
    with open("log.txt", 'a') as log:
        for f in filenames:
            full = name_to_path(d, f)
            if last_mod_today(full):
                with open(full, 'r') as todays_file:
                    log.write('##############################\n')
                    log.write(f"file : {full}\n")
                    log.write(todays_file.read())
                    log.write('\n')
                    log.write('##############################\n')
# Scan the current directory first, then each immediate subdirectory
# (one level deep only).
_, dirnames, _ = next(os.walk('./'))
for folder in ['./'] + dirnames:
    log_files(folder)
I would start by creating a desired_date object, which is a datetime.date. You can then format that date into a string, which makes up the pattern you want to look for in your glob. The glob pattern doesn't care about the time, just the date.
from pathlib import Path
import datetime

# Folders for the desired date follow the pattern "13.2.1_YY_MM_DD_*";
# the glob pattern encodes the date only, not the time.
desired_date = datetime.date(year=2020, month=12, day=22)
pattern = "13.2.1_" + desired_date.strftime("%y_%m_%d") + "_*"
# Print every matching directory; plain files are skipped.
for path in Path("path/to/folders").glob(pattern):
    if path.is_dir():
        print(path)
From there, you can visit each path, glob all text files in the current path, and accumulate the lines in each text file. Finally, write everything to one file.
import glob

# Concatenate every .txt file one level down into report.txt.
# Read and write in binary so no decode/encode step can corrupt the data.
contents = b''
for file in glob.glob('./*/*.txt'):  # u can change as per your directory
    # BUG FIX: the original split the path on a literal backslash pair and
    # opened only the base name, which fails for files inside subfolders;
    # glob already returns a path that can be opened directly.
    with open(file, 'rb') as f1:
        contents += f1.read()
with open('report.txt', 'wb') as rep:
    rep.write(contents)
Hope this helps so :)
Better try to read or write files in terms of bytes because sometimes there may be a chance of corrupting data.
Please help.
I'm searching several .txt files, in several directories for a pattern. If there is a match, I would like to print the filename and location of the match.
Here is my code:
# Search every file two directory levels below `a` for the LOANS pattern and
# append results to results.txt.
# NOTE(review): relies on os, re and listdir being imported elsewhere.
a = ('Z:/rodney/020year/2020-04/')
b = []
for y in os.listdir(a):
    b.append(a+y+'/')
for filename in b:
    path = filename
    for filenames in listdir(path):
        with open(path+filenames) as currentfile:
            text = currentfile.read()
            # NOTE(review): re.compile runs for every file although the
            # pattern never changes; it is loop-invariant and can be hoisted.
            loan = re.compile(r'2 NNN \d LOANS')
            bb = loan.search(text)
            with open('z:/rodney/results.txt','a') as f:
                f.write(os.path.dirname(path)+' ')
                f.write(filenames[:-4]+'\n')
                # NOTE(review): bb is a Match object, or None when there is
                # no hit -- f.write(bb) raises "TypeError: write() argument
                # must be str, not None"; guard with `if bb:` and write
                # bb.group() instead.
                f.write(bb)
Error message = "TypeError: write() argument must be str, not None"
If there is a match, I would like to see only the filename and location of a match. I do not need to see "None" in every file where there is no match.
You have:
bb = loan.search(text)
But if the string you are looking for is not found in text, bb will be None and consequently f.write(bb) will raise an exception (you did not indicate which line of code was raising the exception, so this is an educated guess).
You need to modify your code to be:
bb = loan.search(text)
if bb:
with open('z:/rodney/results.txt','a') as f:
f.write(os.path.dirname(path)+' ')
f.write(filenames[:-4]+'\n')
As an aside:
You have the statement loan = re.compile(r'2 NNN \d LOANS') in a loop. There is no need for that to be in a loop since it is invariant.
You can avoid using string slicing and bunch of functions to parse file path by using pathlib, where most of needed cases are already implemented. Also you can optimize your code by moving re.compile() out of loop (create once and use). Same with writing result - you don't need to reopen file every time, just open it once before loop start.
Optimized code:
from pathlib import Path
import re

# Source tree to scan and the report file to produce.
source_root = Path(r"Z:\rodney\020year\2020-04")
report_path = r"z:\rodney\results.txt"

with open(report_path, "w+") as report:
    # compile once -- the pattern is identical for every file
    loan_pattern = re.compile(r"2\sN{3}\s{28}\d\sLOANS")
    for subdir in source_root.iterdir():
        if not subdir.is_dir():
            continue
        for entry in subdir.iterdir():
            if not entry.is_file():
                continue
            with open(entry) as handle:
                hit = loan_pattern.search(handle.read())
            if hit:
                # parent dir and file stem, then the matched text
                print(entry.parent, entry.stem, file=report)
                print(hit.group(), file=report)
Based on your source code, I optimized it.
I use os.walk to access each .txt file, read those txt files line by line, and enumerate the lines. Then I check each line against the regex (I referenced Olvin Roght-san's pattern). If there is a match, it will print out the exact file location and line number for you.
import os
import re

# File extensions to scan and the loan-record pattern to look for.
extension = [".txt"]
regex = r"2\sN{3}\s{28}\d\sLOANS"
re_Search = re.compile(regex)
# BUG FIX: the original non-raw string "Z:\rodney\020year\2020-04" contained
# the escape sequences \r (carriage return) and \020 (octal 16), silently
# corrupting the path; a raw string keeps every backslash literal.
path = r"Z:\rodney\020year\2020-04"
for subdir, dirs, files in os.walk(path):
    for file in files:
        file_path = os.path.join(subdir, file)
        ext = os.path.splitext(file)[-1].lower()
        if ext not in extension:
            continue
        with open(file_path, "r") as f:
            try:
                f_content = f.readlines()
            except Exception as e:
                print(e)
                # BUG FIX: skip this file -- in the original, f_content was
                # left unbound after a read error and the loop below raised
                # NameError.
                continue
        for l_idx, line in enumerate(f_content):
            if re_Search.search(line):
                print(file_path)
                print("At line: {l_idx}".format(l_idx=l_idx + 1))
            else:
                print("Nothing!!")
I'm trying to implement a Python script that takes a folder from the user (can be zipped or unzipped), and search through all the files in the folder to output the specific lines that my regular expression matches. My code below works for regular unzipped folders, but I can't figure out how to do the same with zipped folders that are inputted to function. Below are my code, thanks in advance!
def myFunction(folder_name):
    # Walks folder_name and prints every line of its text/log files that
    # matches one of two version-string patterns.
    # NOTE(review): passing a .zip path does not work -- plain open() cannot
    # read archive members; zipfile.ZipFile would be required for that.
    path = folder_name
    for (path, subdirs, files) in os.walk(path):
        # NOTE(review): this os.listdir(path) call overrides os.walk's own
        # `files` list for the current directory, making it redundant.
        files = [f for f in os.listdir(path) if f.endswith('.txt') or f.endswith('.log') or f.endswith('-release') or f.endswith('.out') or f.endswith('messages') or f.endswith('.zip')] # Specify here the format of files you hope to search from (ex: ".txt" or ".log")
        files.sort() # file is sorted list
        files = [os.path.join(path, name) for name in files] # Joins the path and the name, so the files can be opened and scanned by the open() function
        # The following for loop searches all files with the selected format
        for filename in files:
            #print('start parsing... ' + str(datetime.datetime.now()))
            matched_line = []
            try:
                with open(filename, 'r', encoding = 'utf-8') as f:
                    f = f.readlines()
            except:
                # NOTE(review): bare except; retrying without an encoding is a
                # fallback for non-UTF-8 text files, not real zip support.
                with open(filename, 'r') as f:
                    f = f.readlines()
            # print('Finished parsing... ' + str(datetime.datetime.now()))
            for line in f:
                # strip out \x00 from read content, in case it's encoded differently
                line = line.replace('\x00', '')
                # NOTE(review): both patterns are rebuilt and recompiled for
                # every single line; they are loop-invariant and could be
                # hoisted above the loops.
                RE2 = r'^Version: \d.+\d.+\d.\w\d.+'
                RE3 = r'^.+version.(\d+.\d+.\d+.\d+)'
                pattern2 = re.compile('('+RE2+'|'+RE3+')', re.IGNORECASE)
                for match2 in pattern2.finditer(line):
                    matched_line.append(line)
                    print(line)
#Calling the function to use it
myFunction(r"SampleZippedFolder.zip")
The try and except block of my code was my attempt to open the zipped folder and read it. I'm still not very clear with how to open the zipped folder or how it works. Please let me know how I can modify my code to make it work, much appreciated!
One possibility is first determine what object type folder_name is using zipfile and os.isdir() and whichever one succeeds, get the list of files and proceed. Maybe something like this:
import zipfile, os, re
def myFunction(folder_name):
    """List the contents of *folder_name*, which may be either a zip archive
    or a plain directory, then process the resulting file names.

    Prints which kind of container was detected; warns and returns early
    when the path is neither.
    """
    files = None  # nothing yet
    path = folder_name
    if zipfile.is_zipfile(path):
        print('ZipFile: {}'.format(path))
        f = zipfile.ZipFile(path)
        files = f.namelist()
    elif os.path.isdir(path):
        print('Folder: {}'.format(path))
        files = os.listdir(path)
    # BUG FIX: the original fell through and iterated `files` even when the
    # path was neither a zip nor a directory, raising TypeError on None.
    if files is None:
        print('Not a zip file or folder: {}'.format(path))
        return
    # should now have a list of files
    # proceed processing the files
    for filename in files:
        ...
Right, i'm relatively new to Python, which you will likely see in my code, but is there any way to iterate through a list within regex?
Basically, i'm looping through each filename within a folder, getting a code (2-6 digits) from the filename, and i'm wanting to compare it with a list of codes in a text file, which have a name attached, in the format "1234_Name" (without the quotation marks). If the code exists in both lists, I want to print out the list entry, i.e. 1234_Name. Currently my code only seems to look at the first entry in the text file's list and i'm not sure how to make it look through them all to find matches.
import os, re

# Compare the numeric code found in each filename under `filefolder` with the
# codes listed in testnames.txt (lines formatted "1234_Name"); print matches.
# NOTE(review): Python 2 code (`print site` statement below).
sitesfile = open('C:/Users/me/My Documents/WORK_PYTHON/Renaming/testnames.txt', 'r')
filefolder = r'C:/Users/me/My Documents/WORK_PYTHON/Renaming/files/'
sites = sitesfile.read()
# NOTE(review): sitesfile is never closed
site_split = re.split('\n', sites)
old = []
newname = []
for site in site_split:
    newname.append(site)
for root, dirs, filenames in os.walk(filefolder):
    for filename in filenames:
        fullpath = os.path.join(root, filename)
        filename_split = os.path.splitext(fullpath)
        filename_zero, fileext = filename_split
        filename_zs = re.split("/", filename_zero)
        filenm = re.search(r"[\w]+", str(filename_zs[-1:]))#get only filename, not path
        filenmgrp = filenm.group()
        pacode = re.search('\d\d+', filenmgrp)
        if pacode:
            pacodegrp = pacode.group()
            # NOTE(review): `site` here is whatever value the earlier
            # `for site in site_split` loop left behind -- its LAST entry --
            # so only one list entry is ever compared. This is why matches
            # against the rest of the list are missed; the comparison needs
            # its own loop over site_split (or newname).
            match = re.match(pacodegrp, site)
            if match:
                print site
Hope this makes sense - thanks a lot in advance!
So, use this code instead:
import os
import re
def locate(pattern = r'\d+[_]', root=os.curdir):
    """Yield the full path of every file under *root* whose name matches
    the regex *pattern*.

    BUG FIX: the original ran re.findall over all the names joined into one
    string, which yielded only the matched SUBSTRINGS (e.g. "123_") rather
    than the actual filenames, so the joined paths never pointed at real
    files. Each filename is now tested individually with re.search.
    """
    for path, dirs, files in os.walk(os.path.abspath(root)):
        for filename in files:
            if re.search(pattern, filename):
                yield os.path.join(path, filename)
..this will only return files in a folder that match a given regex pattern.
with open('list_file.txt', 'r') as f:
lines = [x.split('_')[0] for x in f.readlines()]
print_out = []
for f in locate(<your code regex>, <your directory>):
if f in lines: print_out.append(f)
print(print_out)
...find the valid codes in your list_file first, then compare the files that come back with your given regex.