Hello, I am trying to filter the bad words out of a word list. The input for this script is usually 5 to 10 million lines of words. I tried threading to make it fast, but after the first 20k words it gets slower and slower. Why is that, and would it be faster if I used multiprocessing instead?
I run this script on Ubuntu with 48 CPU cores and 200 GB of RAM.
from tqdm import tqdm
import queue
import threading

a = input("The List: ") + ".txt"
thr = input('Threads: ')
c = input("clear old[y]: ")
inputQueue = queue.Queue()

if c == 'y' or c == 'Y':  # clear the old output file
    open("goodWord.txt", 'w').close()

s = ["bad_word"]  # bad words list

class myclass:
    def dem(self, my_word):
        for key in s:
            if key in my_word:
                return 1
        return 0

    def chk(self):
        while 1:
            old = open("goodWord.txt", "r", encoding='utf-8', errors='ignore').readlines()
            my_word = inputQueue.get()
            if my_word not in old:
                rez = self.dem(my_word)
                if rez == 0:
                    sav = open("goodWord.txt", "a+")
                    sav.write(my_word + "\n")
                    sav.close()
                self.pbar.update(1)
            else:
                self.pbar.update(1)
            inputQueue.task_done()

    def run_thread(self):
        for y in tqdm(open(a, 'r', encoding='utf-8', errors='ignore').readlines()):
            inputQueue.put(y)
        tqdm.write("All in the Queue")
        self.pbar = tqdm(total=inputQueue.qsize(), unit_divisor=1000)
        for x in range(int(thr)):
            t = threading.Thread(target=self.chk)
            t.setDaemon(True)
            t.start()
        inputQueue.join()

try:
    open("goodWord.txt", "a")
except:
    open("goodWord.txt", "w")

old = open("goodWord.txt", "r", encoding='utf-8', errors='ignore').readlines()

myclass = myclass()
myclass.run_thread()
For the sake of curiosity and education, I wrote a virtually identical (in function) program:
import pathlib
from tqdm import tqdm

# check_words_file_path = pathlib.Path(input("Enter the path of the file which contains the words to check: "))
check_words_file_path = pathlib.Path("/Users/****/Documents/Projects/AdHoc/resources/temp/check_words.txt")
good_words_file_path = pathlib.Path("/Users/****/Documents/Projects/AdHoc/resources/temp/good_words.txt")

bad_words = {"abadword", "anotherbadword"}

# load the list of good words
with open(good_words_file_path) as good_words_file:
    stripped_lines = (line.rstrip() for line in good_words_file)
    good_words = set(stripped_line for stripped_line in stripped_lines if stripped_line)

# check each word to see if it is one of the bad words
# if it isn't, add it to the good words
with open(check_words_file_path) as check_words_file:
    for curr_word in tqdm(check_words_file):
        curr_word = curr_word.rstrip()
        if curr_word not in bad_words:
            good_words.add(curr_word)

# write the new/expanded list of good words back to file
with open(good_words_file_path, "w") as good_words_file:
    for good_word in good_words:
        good_words_file.write(good_word + "\n")
It is based on my understanding of the original program, which, as I already mentioned, I find far too complex. The main reason the original slows down is that for every word pulled from the queue it re-reads the entire goodWord.txt file and scans that ever-growing list, so the work per word keeps increasing as the output grows.
I hope that this version is clearer, and it is almost certainly much faster. In fact, it might be fast enough that there is no need to consider things like multiprocessing.
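For intuition, here is a small micro-benchmark of my own (not part of either program above) showing why the membership test matters at this scale: checking whether a word is in a list scans the whole list, while checking a set is a hash lookup.

import timeit

good_list = [f"word{i}" for i in range(100_000)]
good_set = set(good_list)

# looking up a word near the end of the list forces a full scan each time
print(timeit.timeit(lambda: "word99999" in good_list, number=1_000))

# the set does a hash lookup, which takes roughly constant time
print(timeit.timeit(lambda: "word99999" in good_set, number=1_000))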
I have a large txt file and I'm trying to pull out every instance of a specific word, as well as the 15 words on either side. I'm running into a problem when there are two instances of that word within 15 words of each other, which I'm trying to get as one large snippet of text.
I'm trying to get chunks of text to analyze about a specific topic. So far, I have working code for all instances except the scenario mentioned above.
def occurs(word1, word2, filename):
    import os
    infile = open(filename, 'r')    # opens the file and reads it
    lines = infile.read().splitlines()
    infile.close()
    wordlist = [word1, word2]       # this list allows for multiple words
    wordsString = ' '.join(lines)   # joins the lines back into one string
    words = wordsString.split()     # splits the text into individual words
    f = open(filename, 'w')
    f.write("start")
    f.write(os.linesep)
    for word in wordlist:
        matches = [i for i, w in enumerate(words) if w.lower().find(word) != -1]
        for m in matches:
            l = " ".join(words[m-15:m+16])
            f.write(f"...{l}...")   # writes the data to the external file
            f.write(os.linesep)
    f.close()
So far, when two of the same word are too close together, the program simply skips one of them. Instead, I want to get a single longer chunk of text that extends 15 words behind the earliest occurrence and 15 words beyond the latest one.
This snippet will get the given number of words around the chosen keyword. If several keywords fall close together, it will join them into one snippet:
s = '''xxx I have a large txt file and I'm xxx trying to pull out every instance of a specific word, as well as the 15 words on either side. I'm running into a problem when there are two instances of that word within 15 words of each other, which I'm trying to get as one large snippet of text.
I'm trying to xxx get chunks of text to analyze about a specific topic. So far, I have working code for all instances except the scenario mentioned above. xxx'''
words = s.split()
from itertools import groupby, chain
word = 'xxx'
def get_snippets(words, word, l):
    snippets, current_snippet, cnt = [], [], 0
    for v, g in groupby(words, lambda w: w != word):
        w = [*g]
        if v:
            if len(w) < l:
                current_snippet += [w]
            else:
                current_snippet += [w[:l] if cnt % 2 else w[-l:]]
                snippets.append([*chain.from_iterable(current_snippet)])
                current_snippet = [w[-l:] if cnt % 2 else w[:l]]
                cnt = 0
            cnt += 1
        else:
            if current_snippet:
                current_snippet[-1].extend(w)
            else:
                current_snippet += [w]

    if current_snippet[-1][-1] == word or len(current_snippet) > 1:
        snippets.append([*chain.from_iterable(current_snippet)])

    return snippets

for snippet in get_snippets(words, word, 15):
    print(' '.join(snippet))
Prints:
xxx I have a large txt file and I'm xxx trying to pull out every instance of a specific word, as well as the 15
other, which I'm trying to get as one large snippet of text. I'm trying to xxx get chunks of text to analyze about a specific topic. So far, I have working
topic. So far, I have working code for all instances except the scenario mentioned above. xxx
With the same data and a different length:
for snippet in get_snippets(words, word, 2):
    print(' '.join(snippet))
Prints:
xxx and I'm
I have xxx trying to
trying to xxx get chunks
mentioned above. xxx
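To run the same function over a file instead of the sample string, something like the following works (a sketch with placeholder file and word names, not part of the original answer):

with open("yourfile.txt", encoding="utf-8") as f:
    words = f.read().split()

with open("snippets.txt", "w", encoding="utf-8") as out:
    for snippet in get_snippets(words, "yourword", 15):
        out.write("..." + " ".join(snippet) + "...\n")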
As always, a variety of solutions are available here. A fun one would be a recursive wordFind, where it searches the next 15 words and, if it finds the target word, calls itself.
A simpler, though perhaps not efficient, solution would be to add words one at a time:
for m in matches:
    l = " ".join(words[m-15:m])
    pos, i = m, 1
    while i < 16 and pos + i < len(words):
        l += " " + words[pos + i]
        if words[pos + i].lower() == word:
            pos, i = pos + i, 1   # another occurrence: restart the 15-word window from it
        else:
            i += 1
    f.write(f"...{l}...")  # writes the data to the external file
    f.write(os.linesep)
Or, if you want subsequent occurrences to be folded into the same snippet rather than written out again...
b_extend = False
for m in matches:
    if not b_extend:
        l = " ".join(words[m-15:m])
        f.write("...")
    else:
        l = ""                      # continuing the previous snippet, which has already been written
    b_extend = False
    i = 1
    while i < 16 and m + i < len(words):
        l += " " + words[m + i]
        if words[m + i].lower() == word:
            b_extend = True         # the next match will extend this snippet further
            break
        i += 1
    f.write(l)
    if not b_extend:
        f.write("...")
        f.write(os.linesep)
Note that I have not tested this, so it may require a bit of debugging. But the gist is clear: add words piecemeal and extend the addition process when a target word is encountered. With a small addition to the second conditional, this also lets you extend on target words other than the current one.
I have written a compression program and tested it on 10 KB text files, which took no less than 3 minutes. However, when I test it with a 1 MB file, which is the assessment assigned by my teacher, it takes longer than half an hour. Compared to my classmates', mine takes unusually long. It might be my computer or my code, but I have no idea. Does anyone know any tips or shortcuts for making my code run faster? My compression code is below; if there are any quicker ways of doing the loops, etc., please send me an answer (:
(By the way, my code DOES work, so I'm not asking for corrections, just enhancements or tips, thanks!)
import re #used to enable functions(loops, etc.) to find patterns in text file
import os #used for anything referring to directories(files)
from collections import Counter #used to keep track on how many times values are added
size1 = os.path.getsize('file.txt') #find the size(in bytes) of your file, INCLUDING SPACES
print('The size of your file is ', size1,)
words = re.findall(r'\w+', open('file.txt').read())
wordcounts = Counter(words) #counts how many times each word appears
common100 = [x for x, it in Counter(words).most_common(100)] #identifies the 100 most common words

keyword = []
kcount = []
z = dict(wordcounts)
for key, value in z.items():
    keyword.append(key) #adds each keyword to the array called keywords
    kcount.append(value)
characters =['$','#','#','!','%','^','&','*','(',')','~','-','/','{','[', ']', '+','=','}','|', '?','cb',
'dc','fd','gf','hg','kj','mk','nm','pn','qp','rq','sr','ts','vt','wv','xw','yx','zy','bc',
'cd','df','fg','gh','jk','km','mn','np','pq','qr','rs','st','tv','vw','wx','xy','yz','cbc',
'dcd','fdf','gfg','hgh','kjk','mkm','nmn','pnp','qpq','rqr','srs','tst','vtv','wvw','xwx',
'yxy','zyz','ccb','ddc','ffd','ggf','hhg','kkj','mmk','nnm','ppn','qqp','rrq','ssr','tts','vvt',
'wwv','xxw','yyx','zzy','cbb','dcc','fdd','gff','hgg','kjj','mkk','nmm','pnn','qpp','rqq','srr',
'tss','vtt','wvv','xww','yxx','zyy','bcb','cdc','dfd','fgf','ghg','jkj','kmk','mnm','npn','pqp',
'qrq','rsr','sts','tvt','vwv','wxw','xyx','yzy','QRQ','RSR','STS','TVT','VWV','WXW','XYX','YZY',
'DC','FD','GF','HG','KJ','MK','NM','PN','QP','RQ','SR','TS','VT','WV','XW','YX','ZY','BC',
'CD','DF','FG','GH','JK','KM','MN','NP','PQ','QR','RS','ST','TV','VW','WX','XY','YZ','CBC',
'DCD','FDF','GFG','HGH','KJK','MKM','NMN','PNP','QPQ','RQR','SRS','TST','VTV','WVW','XWX',
'YXY','ZYZ','CCB','DDC','FFD','GGF','HHG','KKJ','MMK','NNM','PPN','QQP','RRQ','SSR','TTS','VVT',
'WWV','XXW','YYX','ZZY','CBB','DCC','FDD','GFF','HGG','KJJ','MKK','NMM','PNN','QPP','RQQ','SRR',
'TSS','VTT','WVV','XWW','YXX','ZYY','BCB','CDC','DFD','FGF','GHG','JKJ','KMK','MNM','NPN','PQP',] #characters which I can use
symbols_words = []
char = 0
for i in common100:
    symbols_words.append(characters[char]) #pairs each common word with its own symbol
    char = char + 1

print("Compression has now started")

f = 0
g = 0
no = 0
while no < 100:
    for i in common100:
        for w in words:
            if i == w and len(i) > 1: #if the values in common100 are ACTUALLY in words
                place = words.index(i) #find exactly where the most common words are in the text
                symbols = symbols_words[common100.index(i)] #assigns one symbol to one common word
                words[place] = symbols #replaces the word with the symbol
                g = g + 1
    no = no + 1

string = words
stringMade = ' '.join(map(str, string)) #makes the list into a string so you can put it into a text file
file = open("compression.txt", "w")
file.write(stringMade) #writes everything in the variable 'words' into the new file
file.close()

size2 = os.path.getsize('compression.txt')
no1 = int(size1)
no2 = int(size2)
print('Compression has finished.')
print('Your original file size has been compressed by', 100 - ((100/no1) * no2), 'percent.'
      ' The size of your file now is ', size2)
Using something like
word_substitutes = dict(zip(common100, characters))
will give you a dict that maps common words to their corresponding symbol.
Then you can simply iterate over the words:
# Iterate over all the words
# Use enumerate because we're going to modify the word in-place in the words list
for word_idx, word in enumerate(words):
    # If the current word is in the `word_substitutes` dict, then we know it's in the
    # 'common' words, and can be replaced by the symbol
    if word in word_substitutes:
        # Replace the word in-place
        replacement_symbol = word_substitutes[word]
        words[word_idx] = replacement_symbol
This will give much better performance, because a dictionary lookup is a hash-table lookup that takes roughly constant time on average, not a linear scan. So the overall complexity will be closer to O(N) than the O(N^3)-ish behaviour you get from the nested loops with the .index() call inside them.
The first thing I see that is bad for performance is:
for i in common100:
    for w in words:
        if i == w and len(i) > 1:
            ...
What you are doing is seeing if the word w is in your list of common100 words. However, this check can be done in O(1) time by using a set and not looping through all of your top 100 words for each word.
common_words = set(common100)
for w in words:
    if w in common_words:
        ...
Generally you would do the following:
Measure how much time each "part" of your program needs. You could use a profiler (e.g. the one in the standard library, as sketched below) or simply sprinkle some times.append(time.time()) calls into your code and compute the differences. Then you know which part of your code is slow.
See if you can improve the algorithm of the slow part. gnicholas' answer shows one possibility to speed things up. The while no < 100 loop seems suspicious; maybe that can be improved. This step needs an understanding of the algorithms you use. Be careful to select the best data structures for your use case.
If you can't use a better algorithm (because you already use the best way to calculate something), you need to speed up the computations themselves. Numerical code benefits from NumPy; with Cython you can essentially compile Python code to C, and Numba uses LLVM to compile it.
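Here is a minimal sketch of the profiling step using the standard-library cProfile and pstats modules; compress() is just a hypothetical stand-in for the code you want to measure.

import cProfile
import pstats

def compress():
    ...  # the code you want to measure goes here

# run the function under the profiler and dump the raw statistics to a file
cProfile.run("compress()", "profile.out")

# show the 10 entries with the highest cumulative time
stats = pstats.Stats("profile.out")
stats.sort_stats("cumulative").print_stats(10)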
I need some advice on improving the performance of my code.
I have two files (Keyword.txt, description.txt). Keyword.txt consists of a list of keywords (11,000+, to be specific) and descriptions.txt consists of very large text descriptions (9,000+).
I am trying to read keywords from keyword.txt one at a time and check if the keyword exists in the description. If the keyword exists, I am writing it to a new file. So this is like a many-to-many relationship (11,000 * 9,000).
Sample Keywords:
Xerox
VMWARE CLOUD
Sample Description(it's huge):
Planning and implementing entire IT Infrastructure. Cyberoam firewall implementation and administration in head office and branch office. Report generation and analysis. Including band width conception, internet traffic and application performance. Windows 2003/2008 Server Domain controller implementation and managing. VERITAS Backup for Clients backup, Daily backup of applications and database. Verify the backed up database for data integrity. Send backup tapes to remote location for safe storage Installing and configuring various network devices; Routers, Modems, Access Points, Wireless ADSL+ modems / Routers Monitoring, managing & optimizing Network. Maintaining Network Infrastructure for various clients. Creating Users and maintaining the Linux Proxy servers for clients. Trouble shooting, diagnosing, isolating & resolving Windows / Network Problems. Configuring CCTV camera, Biometrics attendance machine, Access Control System Kaspersky Internet Security / ESET NOD32
Below is the code which I've written:
import csv
import nltk
import re

wr = open(OUTPUTFILENAME, 'w')

def match():
    c = 0
    ft = open('DESCRIPTION.TXT', 'r')
    ky2 = open('KEYWORD.TXT', 'r')
    reader = csv.reader(ft)
    keywords = []
    keyword_reader2 = csv.reader(ky2)
    for x in keyword_reader2:  # Storing all the keywords to a list
        keywords.append(x[1].lower())
    string = ' '
    c = 0
    for row in reader:
        sentence = row[1].lower()
        id = row[0]
        for word in keywords:
            if re.search(r'\b{}\b'.format(re.escape(word.lower())), sentence):
                string = string + id + '$' + word.lower() + '$' + sentence + '\n'
                c = c + 1
                if c > 5000:  # I am writing 5000 lines at a time.
                    print("Batch printed")
                    c = 0
                    wr.write(string)
                    string = ' '
    wr.write(string)
    ky2.close()
    ft.close()
    wr.close()

match()
This code currently takes around 120 minutes to complete. I tried a couple of ways to improve the speed.
At first I was writing one line at a time; then I changed it to 5000 lines at a time, since it is a small file and I can afford to keep everything in memory. I did not see much improvement.
I also pushed everything to stdout and used a pipe from the console to append everything to the file. This was even slower.
I want to know if there is a better way of doing this, since I may have done something wrong in the code.
My PC specs: RAM: 15 GB, Processor: 4th-gen i7
If all your search-word phrases consist of whole words (i.e. they begin and end on a word boundary), then parallel indexing into a word tree is about as efficient as it gets.
Something like
# keep lowercase characters and digits
# keep apostrophes for contractions (isn't, couldn't, etc)
# convert uppercase characters to lowercase
# replace all other printable symbols with spaces
TO_ALPHANUM_LOWER = str.maketrans(
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ'!#$%&()*+,-./:;<=>?@[]^_`{|}~ \t\n\r\x0b\x0c\"\\",
    "abcdefghijklmnopqrstuvwxyz'" + " " * 37   # padded with spaces so both arguments are the same length
)

def clean(s):
    """
    Convert string `s` to canonical form for searching
    """
    return s.translate(TO_ALPHANUM_LOWER)
class WordTree:
    __slots__ = ["children", "terminal"]

    def __init__(self, phrases=None):
        self.children = {}   # {"word": WordTrie}
        self.terminal = ''   # if end of search phrase, full phrase is stored here

        # preload tree
        if phrases:
            for phrase in phrases:
                self.add_phrase(phrase)

    def add_phrase(self, phrase):
        tree = self
        words = clean(phrase).split()
        for word in words:
            ch = tree.children
            if word in ch:
                tree = ch[word]
            else:
                tree = ch[word] = WordTree()
        tree.terminal = " ".join(words)

    def inc_search(self, word):
        """
        Search one level deeper into the tree

        Returns
            (None,    ''    ) if word not found
            (subtree, ''    ) if word found but not terminal
            (subtree, phrase) if word found and completes a search phrase
        """
        ch = self.children
        if word in ch:
            wt = ch[word]
            return wt, wt.terminal
        else:
            return (None, '')

    def parallel_search(self, text):
        """
        Return all search phrases found in text
        """
        found = []
        fd = found.append
        partials = []
        for word in clean(text).split():
            new_partials = []
            np = new_partials.append

            # new search from root
            wt, phrase = self.inc_search(word)
            if wt: np(wt)
            if phrase: fd(phrase)

            # continue existing partial matches
            for partial in partials:
                wt, phrase = partial.inc_search(word)
                if wt: np(wt)
                if phrase: fd(phrase)

            partials = new_partials

        return found

    def tree_repr(self, depth=0, indent=" ", terminal=" *"):
        for word, tree in self.children.items():
            yield indent * depth + word + (terminal if tree.terminal else '')
            yield from tree.tree_repr(depth + 1, indent, terminal)

    def __repr__(self):
        return "\n".join(self.tree_repr())
then your program becomes
import csv

SEARCH_PHRASES = "keywords.csv"
SEARCH_INTO = "descriptions.csv"
RESULTS = "results.txt"

# get search phrases, build WordTree
with open(SEARCH_PHRASES) as inf:
    wt = WordTree(phrase for _, phrase in csv.reader(inf))

with open(SEARCH_INTO) as inf, open(RESULTS, "w") as outf:
    # bound methods (save some look-ups)
    find_phrases = wt.parallel_search
    fmt = "{}${}${}\n".format
    write = outf.write
    # sentences to search
    for id, sentence in csv.reader(inf):
        # search phrases found
        for found in find_phrases(sentence):
            # store each result
            write(fmt(id, found, sentence))
which should be something like a thousand times faster.
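As a quick sanity check (my own illustration using the sample keywords from the question, not part of the code above), you can build a small tree by hand and search a sentence directly:

wt = WordTree(["Xerox", "VMWARE CLOUD"])
print(wt.parallel_search("Planning a VMware Cloud rollout alongside the existing Xerox fleet"))
# prints ['vmware cloud', 'xerox']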
I'm guessing you want to make your searches faster. In which case, if you don't care about the frequency of the keywords in the description, only that they exist, you could try the following:
For each description, split the text into individual words and generate a set of unique words.
Then, for each keyword in your list of keywords, check whether that set contains the keyword, and write it to the file if it does.
That should make your iterations faster. It should also help you skip the regexes, which are also likely part of your performance problem.
PS: My approach assumes that you filter out punctuation.
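A rough sketch of that approach (my illustration, not code from the original post; the file names are placeholders, each description is assumed to sit on its own line, keywords are assumed to be single words, and punctuation is assumed to be stripped already):

with open("KEYWORD.TXT") as kf:
    keywords = {line.strip().lower() for line in kf if line.strip()}

with open("DESCRIPTION.TXT") as df, open("matches.txt", "w") as out:
    for line_no, description in enumerate(df, 1):
        # the set of unique lowercase words in this description
        description_words = set(description.lower().split())
        for keyword in keywords:
            if keyword in description_words:
                out.write(f"{line_no}${keyword}${description.strip()}\n")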