I'm searching for occurrences of different words in a text file. I'm not good at Python, but I did this on Google Colab:
import os
from google.colab import drive

drive.mount('/content/drive/', force_remount=True)

if not os.path.exists('/content/drive/My Drive/Miserables'):
    os.makedirs('/content/drive/My Drive/Miserables')
root_dir = '/content/drive/My Drive/Miserables/'
os.listdir('/content/drive/My Drive/Miserables')

with open("/content/drive/My Drive/Miserables/miserable.txt", 'r') as f:
    myString = f.readlines()
    print(len(myString))

searchWords = ["example"]
for word in searchWords:
    print(f"Word '{word}' appeared {myString.count(word)} time/s.")
The thing is, Python doesn't actually count the words: I get 0 in the results even though I know those words are in the text. Can somebody help me, please? Thank you.
I guess the problem is that you use f.readlines() to get the file content.
This function returns a list of every line in the file.
e.g.
["foo foo faa", "faa foo faa"]
This means .count looks for list elements that are exactly equal to the word, not for occurrences inside the text.
Try f.read() instead.
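A quick way to see the difference, using the example list above:

lines = ["foo foo faa", "faa foo faa"]  # what f.readlines() gives you
print(lines.count("foo"))               # 0 -- no list element equals "foo"

text = "foo foo faa\nfaa foo faa"       # what f.read() gives you
print(text.count("foo"))                # 3 -- counts substring occurrences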
f.readlines() gives you a list with every element representing one line.
For example, if the text is:
I'm here
this is here too
Bye buddy
will give you the list:
[
"I'm here",
"this is here too",
"Bye buddy"
]
To solve it, either use f.read() instead of f.readlines():
import os
from google.colab import drive

drive.mount('/content/drive/', force_remount=True)

if not os.path.exists('/content/drive/My Drive/Miserables'):
    os.makedirs('/content/drive/My Drive/Miserables')
root_dir = '/content/drive/My Drive/Miserables/'
os.listdir('/content/drive/My Drive/Miserables')

with open("/content/drive/My Drive/Miserables/miserable.txt", 'r') as f:
    myString = f.read()
    print(len(myString))

searchWords = ["example"]
for word in searchWords:
    print(f"Word '{word}' appeared {myString.count(word)} time/s.")
Or loop over the lines:
import os
from google.colab import drive

drive.mount('/content/drive/', force_remount=True)

if not os.path.exists('/content/drive/My Drive/Miserables'):
    os.makedirs('/content/drive/My Drive/Miserables')
root_dir = '/content/drive/My Drive/Miserables/'
os.listdir('/content/drive/My Drive/Miserables')

with open("/content/drive/My Drive/Miserables/miserable.txt", 'r') as f:
    myString = f.readlines()
    print(len(myString))

searchWord = "example"
count = 0
for line in myString:
    count += line.count(searchWord)
print(f"Word '{searchWord}' appeared {count} time/s.")
I want to search for groups of strings inside a text file (.txt or .log). A line must include group A or B (or C, D, E, ...): within a group, every word has to appear on the same line, though not necessarily adjacent, e.g. ["123456", "Login"] or ["123457", "Login"]. If a line matches, save it to a new txt file.
Some example output lines:
20221110,1668057560.965,AE111,123457,0,"Action=Account Login,XXX,XXX",XXX,XXX
20221110,1668057560.965,AE112,123458,0,"Action=Account Login,XXX,XXX",XXX,XXX
20221111,1668057560.965,AE113,123458,0,"Action=Order,XXX,XXX",XXX,XXX
Below is my code:
import os, re

path = "Log\\"
file_list = [path + f for f in os.listdir(path) if f.endswith('.log')]

keep_phrases1 = ["123456", "Login"]
keep_phrases2 = ["123457", "Login"]

pat = r"\b.*?\b".join([re.escape(word) for word in keep_phrases1])
pat = re.compile(r"\b" + pat + r"\b")
pat2 = r"\b.*?\b".join([re.escape(word) for word in keep_phrases2])
pat2 = re.compile(r"\b" + pat2 + r"\b")
print(pat2, pat)

if len(file_list) != 0:
    for infile in sorted(file_list):
        with open(infile, encoding="latin-1") as f:
            f = f.readlines()
            for line in f:
                found1 = pat.search(line)
                found2 = pat2.search(line)
                if found1 or found2:
                    with open(outfile, "a") as wf:
                        wf.write(line)
It works for me, but it's not easy to add more groups of words, and I think the code is hard to understand. My problem is: how can I simplify the code, and how can I make it easier to add other groups to search? e.g. ["123458", "Login"], ["123456", "order"], ["123457", "order"]
import os, re
path = "Log\\"
file_list = [path + f for f in os.listdir(path) if f.endswith('.log')]
Put all the keep_phrases in one container. I chose a dictionary, but since the groups are only identified by their order, a list would have worked too:
keep_phrases = {'keep_phrases1': ["123456", "Login"], 'keep_phrases2':["123457", "Login"]}
# Alternative, a list would work:
# keep_phrases = [["123456", "Login"], ["123457", "Login"]]
Now let's generate a list with the compiled patterns:
def compile_pattern(keep_phrase):
    pat = r"\b.*?\b".join([re.escape(word) for word in keep_phrase])
    pat = re.compile(r"\b" + pat + r"\b")
    return pat
patterns = [compile_pattern(keep_phrases[keep_phrase]) for keep_phrase in keep_phrases.keys()]
# if keep_phrases had been a list, we would do
# patterns = [compile_pattern(keep_phrase) for keep_phrase in keep_phrases]
Finally, we look for matches for every pattern, and if there is any finding, we write the line to the file.
if len(file_list) != 0:
    for infile in sorted(file_list):
        with open(infile, encoding="latin-1") as f:
            f = f.readlines()
            for line in f:
                findings = [pat.search(line) for pat in patterns]  # works because patterns is a list
                if any(findings):
                    with open(outfile, "a") as wf:
                        wf.write(line)
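With this structure, adding the extra groups from the question is just more entries in the container, e.g.:

keep_phrases = {
    'keep_phrases1': ["123456", "Login"],
    'keep_phrases2': ["123457", "Login"],
    'keep_phrases3': ["123458", "Login"],
    'keep_phrases4': ["123456", "order"],
    'keep_phrases5': ["123457", "order"],
}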
Try this. I read the whole file into a string to make the code fast and readable; findall will return a list with all matching lines for the file.
If memory is a problem, the pattern also works on individual lines:
import re

file_list = ["sit.txt"]
keep_phrases = [["123456", "Login"], ["123457", "Login"]]

pat = [r"(?:.*?(?:" + p1 + r"\b.*?" + p2 + r".*?(?:\n|$)))" for p1, p2 in keep_phrases]
pat = r"|".join(pat)

for infile in sorted(file_list):
    with open(infile, encoding="latin-1") as f:
        text = f.read()
    print(re.findall(pat, text))
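If you want the matches saved rather than printed, you could append them to the output file ("output.txt" is a hypothetical name):

matches = re.findall(pat, text)
with open("output.txt", "a") as wf:  # hypothetical output file name
    wf.writelines(m if m.endswith("\n") else m + "\n" for m in matches)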
Without regex, checking each word of a group as a substring of the line:

def match_words(line, words):
    # True if every word of the group occurs somewhere in the line
    return all(word in line for word in words)

with open(infile, encoding="latin-1") as f:
    for line in f:
        if any(match_words(line, phrases) for phrases in [keep_phrases1, keep_phrases2]):
            with open(outfile, "a") as wf:
                wf.write(line)
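As with the regex versions, adding groups gets easier if they live in a single list; the check then stays one line (phrase_groups is an assumed name):

phrase_groups = [
    ["123456", "Login"],
    ["123457", "Login"],
    ["123458", "Login"],  # adding a group is just one more entry
    ["123456", "order"],
]
matched = any(match_words(line, phrases) for phrases in phrase_groups)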
I am trying to filter all txt documents in my directory with multiple regular expressions, and the output should show all the files that contain any of the four regular expressions below, along with their matches per file.
Here is what I have, but I am getting errors running the code. Any ideas?
import glob
import re

folder_path = "/home"
file_pattern = "/*.txt"
match_list = []
folder_contents = glob.glob(folder_path + file_pattern)

# Search for Emails
regex1 = re.compile(r'\S+#\S+')
# Search for Phone Numbers
regex2 = re.compile(r'\d\d\d[-]\d\d\d[-]\d\d\d\d')
# Search for Physician's Name
regex3 = re.compile(r'\b\w\w\.\w+\b')
# Search for SSN's
regex4 = re.compile(r'\d\d\d-\d\d-\d\d\d\

combined_pat = r'|'.join((regex1, regex2, regex3, regex4))

for file in folder_contents:
    read_file = open(file, 'rt').read()
    matches = combined_pat.findall(read_file)
    if matches:
        match_list.append(file)
        print('This file contains PHI:', file)
        print('PHI detected:', matches)
Looks like you're missing a ') on the line where you assign regex4. Does that fix it? Note that you'll then hit a second error: r'|'.join(...) is called on compiled pattern objects, but str.join needs strings, and the joined result is a plain string with no .findall method. Join the raw pattern strings instead and compile the combined pattern once.
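A sketch of the corrected script; the last pattern is an assumption, completing the SSN regex to the usual ddd-dd-dddd shape:

import glob
import re

folder_path = "/home"
file_pattern = "/*.txt"
match_list = []
folder_contents = glob.glob(folder_path + file_pattern)

# keep the patterns as raw strings so they can be joined, then compile once
patterns = [
    r'\S+#\S+',                     # emails
    r'\d\d\d[-]\d\d\d[-]\d\d\d\d',  # phone numbers
    r'\b\w\w\.\w+\b',               # physician's name
    r'\d\d\d-\d\d-\d\d\d\d',        # SSNs; closing quote restored (assumed ending)
]
combined_pat = re.compile(r'|'.join(patterns))

for file in folder_contents:
    with open(file, 'rt') as fh:
        read_file = fh.read()
    matches = combined_pat.findall(read_file)
    if matches:
        match_list.append(file)
        print('This file contains PHI:', file)
        print('PHI detected:', matches)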
I am trying to open a text file, remove certain words that have a ] after them, and then write the new contents to a new file. With the following code, new_content contains what I need, and a new file is created, but it's empty. I cannot figure out why. I've tried indenting differently and passing in an encoding type, with no luck. Any help greatly appreciated.
import glob
import os
import nltk, re, pprint
from nltk import word_tokenize, sent_tokenize
import pandas
import string
import collections

path = "/pathtofiles"

for file in glob.glob(os.path.join(path, '*.txt')):
    if file.endswith(".txt"):
        f = open(file, 'r')
        flines = f.readlines()
        for line in flines:
            content = line.split()
            for word in content:
                if word.endswith(']'):
                    content.remove(word)
            new_content = ' '.join(content)
            f2 = open((file.rsplit(".", 1)[0]) + "_preprocessed.txt", "w")
            f2.write(new_content)
        f.close
This should work, #firefly. Your original code reopened the output file in "w" mode for every line, truncating it each time (so only the final, often empty, line survived), and content.remove(word) modifies the list while you're iterating over it, which skips words. Happy to answer questions if you have them.
import glob
import os

path = "/pathtofiles"

for file in glob.glob(os.path.join(path, '*.txt')):
    if file.endswith(".txt"):
        with open(file, 'r') as f:
            flines = f.readlines()
        new_content = []
        for line in flines:
            content = line.split()
            new_content_line = []
            for word in content:
                if not word.endswith(']'):
                    new_content_line.append(word)
            new_content.append(' '.join(new_content_line))
        # open the output file once per input file; the with-blocks close both files
        with open(file.rsplit(".", 1)[0] + "_preprocessed.txt", "w") as f2:
            f2.write('\n'.join(new_content))
This script reads and rewrites all the individual html files in a directory: it iterates over them, highlights the search terms, and writes the output. The issue is that after highlighting the last instance of the search item, the script drops all the remaining content after that last instance in each output file. Any help here is appreciated.
import os
import sys
import re

source = raw_input("Enter the source files path:")
listfiles = os.listdir(source)

for f in listfiles:
    filepath = os.path.join(source + '\\' + f)
    infile = open(filepath, 'r+')
    source_content = infile.read()
    color = ('red')
    regex = re.compile(r"(\b in \b)|(\b be \b)|(\b by \b)|(\b user \b)|(\bmay\b)|(\bmight\b)|(\bwill\b)|(\b's\b)|(\bdon't\b)|(\bdoesn't\b)|(\bwon't\b)|(\bsupport\b)|(\bcan't\b)|(\bkill\b)|(\betc\b)|(\b NA \b)|(\bfollow\b)|(\bhang\b)|(\bbelow\b)", re.I)
    i = 0; output = ""
    for m in regex.finditer(source_content):
        output += "".join([source_content[i:m.start()],
                           "<strong><span style='color:%s'>" % color[0:],
                           source_content[m.start():m.end()],
                           "</span></strong>"])
        i = m.end()
    outfile = open(filepath, 'w')
    outfile.seek(0, 2)
    outfile.write(output)

print "\nProcess Completed!\n"
infile.close()
outfile.close()
raw_input()
After your for loop is over, you need to append whatever is left after the last match:

...
    i = m.end()
output += source_content[i:]  # here's the rest of the file, after the last match
outfile = open(filepath, 'w')
...
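Put together, the loop from your question with just that one line added at the end:

i = 0; output = ""
for m in regex.finditer(source_content):
    output += "".join([source_content[i:m.start()],
                       "<strong><span style='color:%s'>" % color[0:],
                       source_content[m.start():m.end()],
                       "</span></strong>"])
    i = m.end()
output += source_content[i:]  # the tail after the last match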