How to do a parallel search for strings in files - Python

I wrote this simple code for searching for strings in a collection of files. It works but is not optimal:
I need to hardcode the file names in a dictionary
some of my files are about 60 MB in size, so the search takes a while
Could someone please optimize my code to do the following:
read all files in a given directory without the need to hardcode the file names
parallelize the search for speed
write the search results to an output.txt file
my_file = {"File1.xml", "File2.xml", "File3.xml"}
my_string = {"John", "Mary", "Clara"}
for f in my_file:
    for s in my_string:
        with open(f) as fp:
            a = fp.read().count(s)
        fp.close()
        print f,',',s,',',a
Thank you

1. Read the files:
import os
from queue import Queue, Empty
from threading import Thread

files_queue = Queue()
for root, dirs, files in os.walk(start_path):
    for file in files:
        files_queue.put(os.path.join(root, file))
2. Parallel search:
res_queue = Queue()
threads = []
words = ["John", "Mary", "Clara"]  # the strings to search for, from the question

def search(files_queue, words, res_queue):
    while True:
        try:
            file = files_queue.get(block=False)
        except Empty:
            return  # queue drained, this worker is done
        with open(file) as fp:
            content = fp.read()
        results = {}
        for word in words:
            results[word] = content.count(word)
        res_queue.put((file, results))  # remember which file these counts came from
        files_queue.task_done()

# use 10 workers
for _ in range(10):
    thread = Thread(target=search, args=(files_queue, words, res_queue))
    threads.append(thread)
    thread.start()
3. Collect results:
# wait until all files are processed
files_queue.join()
# collect (file, counts) results from the queue
results = []
while not res_queue.empty():
    results.append(res_queue.get())
# profit
...
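For completeness, here is one way to tie the three steps together and also write the counts to output.txt, as the question asks. This is a minimal sketch, not the exact approach above: it uses concurrent.futures.ThreadPoolExecutor instead of a hand-rolled queue, and the directory and word list are placeholders.

import os
from concurrent.futures import ThreadPoolExecutor

SEARCH_DIR = "."                   # placeholder: directory to scan
WORDS = ["John", "Mary", "Clara"]  # placeholder: strings to count

def count_words(path):
    # return (path, {word: count}) for one file
    with open(path, errors="ignore") as fp:
        content = fp.read()
    return path, {w: content.count(w) for w in WORDS}

def all_files(start_path):
    # yield the full path of every file under start_path
    for root, dirs, files in os.walk(start_path):
        for name in files:
            yield os.path.join(root, name)

if __name__ == "__main__":
    with ThreadPoolExecutor(max_workers=10) as pool, open("output.txt", "w") as out:
        for path, counts in pool.map(count_words, all_files(SEARCH_DIR)):
            for word, n in counts.items():
                out.write("%s,%s,%d\n" % (path, word, n))

Because str.count on 60 MB files is CPU-bound work under CPython's GIL, swapping ThreadPoolExecutor for ProcessPoolExecutor in the same sketch may give a bigger speedup than threads.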

Related

How to combine multiple txt files into one txt file using multi-threading?

I have multiple txt files:
file1.txt
file2.txt
file3.txt
file4.txt
...
They're in the same folder, each file has a few lines. I want to merge them into one file using multithreading in Python.
Here is my code without thread:
import glob

filenames = glob.glob(DATA_DIR + '/*.txt')
with open('final.txt', 'w') as outputfile:
    for fname in filenames:
        with open(fname) as infile:
            for line in infile:
                outputfile.write(line)
            outputfile.write('\n')
Here is a very simple example demonstrating the basic concept. You will have to play with what to do with Lock() to get the result you need. In short, this code shows that the "threaded" version (use_thread = True) writes the files in an inconsistent order and interleaving, whereas the "non-threaded" version (use_thread = False) always produces a consistent result.
import threading

use_thread = True

def read_and_write_file(file_name):
    if use_thread:
        print(threading.current_thread().name)
    # note: creating a new Lock() inside the function means the threads never
    # actually block each other, which is part of why the output is inconsistent
    with threading.Lock() as lock:
        with open('file_all.txt', 'a') as out, open(file_name) as in_:
            out.writelines(in_)

if __name__ == '__main__':
    file_names = ['file1.txt', 'file2.txt', 'file3.txt']
    if use_thread:
        for file_name in file_names:
            thread = threading.Thread(target=read_and_write_file, args=(file_name,), name=file_name)
            thread.start()
    else:
        for file_name in file_names:
            read_and_write_file(file_name)
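Following up on the "what to do with Lock()" hint, here is a minimal sketch (using the same file names as the snippet above) that shares one module-level lock, so each input file is copied into file_all.txt as an uninterrupted block even in the threaded version:

import threading

write_lock = threading.Lock()  # one lock shared by every worker thread

def merge_file(file_name):
    # copy one input file into file_all.txt while holding the shared lock
    with write_lock:
        with open('file_all.txt', 'a') as out, open(file_name) as in_:
            out.writelines(in_)

if __name__ == '__main__':
    threads = [threading.Thread(target=merge_file, args=(name,), name=name)
               for name in ['file1.txt', 'file2.txt', 'file3.txt']]
    for t in threads:
        t.start()
    for t in threads:
        t.join()

The order of the files in file_all.txt is still whatever order the threads happen to acquire the lock in; only the per-file blocks are kept contiguous.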

How to split a big file (5GB) into small files according to its contents?

I have 100 big files and each is about 5GB.
I need to split them into smaller files based on their contents.
The big files have many lines, and each line looks like this:
{"task_op_id": 143677789, "task_op_time": 1530927931, "task_op_tag": 1, "create_time": 1530923701, "status": 2}
I need to split the content based on the task_op_id; each big file has 350 different task_op_id values, so each should generate 350 different small files, each containing the lines for one task_op_id.
The method I tried is:
import json
from os import listdir
from os.path import isfile, join
from multiprocessing import Pool

def split_to_id_file(original_file):
    destination_file = 'processed_data2/data_over_one_id/break_into_ids/'
    with open(original_file) as f1:
        for line in f1:
            data_dict = json.loads(line)
            task_op_id = data_dict['task_op_id']
            with open(destination_file + str(task_op_id), 'a+') as f2:
                json.dump(data_dict, f2, ensure_ascii=False)
                f2.write('\n')

# multiprocessing with pool
def multiprocessing_pool(workers_number, job, files_list):
    p = Pool(workers_number)
    p.map(job, files_list)

def main():
    input_path = 'processed_data2/data_over_one_id'
    files_list = [join(input_path, f) for f in listdir(input_path)
                  if isfile(join(input_path, f))
                  and join(input_path, f).split('/')[-1].startswith('uegaudit')]
    multiprocessing_pool(80, split_to_id_file, files_list)

if __name__ == '__main__':
    main()
But the speed is too low; processing 10GB of data takes 2 hours.
So is there a better way to process the data?
Thank you very much for helping.
I suspect that most of the time is spent on file IO operations. Can you break down the running time and check that?
Another reason could be the JSON parser. Check out this thread for more information.
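To get that breakdown, here is a rough sketch (the file name in the example call is a hypothetical placeholder) that times line reading and JSON parsing separately; whatever wall time is left over in the full run is mostly the per-line open/append of the output files:

import json
import time

def profile_split(original_file):
    # accumulate time spent reading lines vs. parsing them as JSON
    read_time = parse_time = 0.0
    with open(original_file) as f1:
        while True:
            t0 = time.perf_counter()
            line = f1.readline()
            read_time += time.perf_counter() - t0
            if not line:
                break
            t0 = time.perf_counter()
            json.loads(line)
            parse_time += time.perf_counter() - t0
    print('read: %.1f s, json parse: %.1f s' % (read_time, parse_time))

profile_split('uegaudit_sample')  # hypothetical input file name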
Can you sort these files?
If yes, try not to parse every line as JSON, only those with a new ID.
Something like this?
def get_id(json_line):
    data_dict = json.loads(json_line)
    return data_dict['task_op_id']

def split_to_id_file(original_file):
    current_id = 'blabla_xxxxxxxx'
    destination_file = 'processed_data2/data_over_one_id/break_into_ids/'
    f2 = None
    with open(original_file) as f1:
        for line in f1:
            if current_id not in line:
                if f2 is not None and not f2.closed:
                    f2.close()
                task_op_id = get_id(line)
                current_id = '"task_op_id": ' + str(task_op_id)
                f2 = open(destination_file + str(task_op_id), 'a+')
            f2.write(line)  # the line already ends with '\n'
    if f2 is not None:
        f2.close()
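If the files cannot be sorted, another option (my suggestion, not part of the answer above) is to stop reopening the destination file for every line and keep one open handle per task_op_id; with roughly 350 ids per input file that stays well within the usual per-process open-file limits:

import json

def split_to_id_file(original_file):
    destination_dir = 'processed_data2/data_over_one_id/break_into_ids/'
    handles = {}  # task_op_id -> already-open output file handle
    with open(original_file) as f1:
        for line in f1:
            task_op_id = json.loads(line)['task_op_id']
            out = handles.get(task_op_id)
            if out is None:
                out = open(destination_dir + str(task_op_id), 'a')
                handles[task_op_id] = out
            out.write(line)  # the input line already ends with '\n'
    for out in handles.values():
        out.close()

Since the line is written back verbatim, there is no json.dump cost on the output side either.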

How to read n files into n variables and then add those variables to a list?

This is my code:
file_input1 = open('Amazon_Indi_Seller.py', 'r')
f1 = file_input1.read().lower()
file_input2 = open('Amazon_Prices.py', 'r')
f2 = file_input2.read().lower()
documents = [f1, f2]

import nltk, string, numpy
stemmer = nltk.stem.porter.PorterStemmer()
lemmer = nltk.stem.WordNetLemmatizer()

def LemTokens(tokens):
    return [lemmer.lemmatize(token) for token in tokens]

remove_punct_dict = dict((ord(punct), None) for punct in string.punctuation)

def LemNormalize(text):
    return LemTokens(nltk.word_tokenize(text.lower().translate(remove_punct_dict)))

from sklearn.feature_extraction.text import CountVectorizer
LemVectorizer = CountVectorizer(tokenizer=LemNormalize, stop_words='english')
LemVectorizer.fit_transform(documents)
Instead of reading 2 files I want to read all the files in a directory, and read them individually so that later I can add those variables to a list named documents.
You can use the code below:
import os

def read_files(file):
    file_input1 = open(file, 'r')
    f1 = file_input1.read()
    return f1

files = ['sample.py', 'Amazon_Indi_Seller.py']
data = list()
for file in files:
    data.append(read_files(file))
print(data)
The above code reads the files mentioned in the list.
import os

def read_files(file):
    file_input1 = open(file, 'r')
    f1 = file_input1.read()
    return f1

src = r'DIRECTORY PATH'
data = list()
for file in os.listdir(src):
    data.append(read_files(os.path.join(src, file)))
print(data)
And the above code reads all the files from the directory mentioned.
You could collect all the file contents in a list, for example:
import os

lst = []
for file in os.listdir():
    file_input = open(file, "r")
    lst.append(file_input.read())
One extra recommendation: in general it may be wise to store the contents of a file as a collection of its lines, for example by using file_input.readlines(), which returns a list of lines.
Create a list of all filenames, then iterate over that list and add each file's content to a dictionary.
from collections import defaultdict  # imported default dictionary

result = defaultdict()  # created empty default dictionary
filenames = ['name1.py', 'name2.py', 'name3.py']  # added filenames to a list
for name in filenames:  # iterate over filename list
    with open(name, 'r') as stream:  # open each file
        data = stream.readlines()  # read contents line by line (readlines returns a list of lines)
        result[name] = data  # set name as key and content as value in dictionary
print(result)
In this way you will have a dictionary with keys as filenames and values as their contents
If the directory may include other directories with files that you also want to read, use os.walk.
Here is sample code from the official documentation:
import os
from os.path import join, getsize
for root, dirs, files in os.walk('python/Lib/email'):
    print(root, "consumes", end=" ")
    print(sum(getsize(join(root, name)) for name in files), end=" ")
    print("bytes in", len(files), "non-directory files")
    if 'CVS' in dirs:
        dirs.remove('CVS')  # don't visit CVS directories
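To tie this back to the question, here is a short sketch (the directory path and the .py filter are placeholders) that walks a directory tree and collects every matching file's text into the documents list fed to CountVectorizer:

import os

documents = []
for root, dirs, files in os.walk('DIRECTORY PATH'):  # placeholder path, as above
    for name in files:
        if name.endswith('.py'):  # assumed filter; adjust to the files you need
            with open(os.path.join(root, name), 'r') as fp:
                documents.append(fp.read().lower())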

Sum of values in a function called from thread

I have a few text files containing a number on each line. I read each file and display the total of its numbers in the function.
from Queue import Queue
from threading import Thread
import os

enclosure_queue = Queue()
list = []

def getAllFiles():
    for root, dirs, files in os.walk("C:/Users/test"):
        for file in files:
            if file.endswith(".txt"):
                file_path = os.path.join(root, file)
                list.append(file_path)

def calc(i, q):
    while True:
        file = q.get()
        fileData = open(file, 'r')
        add = 0
        for line in fileData:
            add = add + int(line)
        print str(add) + '\n'
        q.task_done()

getAllFiles()
num_fetch_threads = len(list)
for i in range(num_fetch_threads):
    worker = Thread(target=calc, args=(i, enclosure_queue,))
    worker.setDaemon(True)
    worker.start()

for ind_file in list:
    enclosure_queue.put(ind_file)
enclosure_queue.join()
It displays the sum of the numbers in each individual file, but I need to add up the results.
For example, if the calc function's add values are 300, 200 and 500, I want the final result to be 1000. I thought of appending each result to a list and then adding them up in another function. Is there any better solution?
There is no need to use a Queue here. Use multiprocessing.Pool.map and change your calc method accordingly. Also, threading.Thread does not return results, whereas multiprocessing.Pool.map does.
import multiprocessing
import os

def getAllFiles():
    my_files = list()
    for root, dirs, files in os.walk("C:/Users/test"):
        for file in files:
            if file.endswith(".txt"):
                file_path = os.path.join(root, file)
                my_files.append(file_path)
    return my_files

def calc(file):
    with open(file, 'r') as f:
        return sum(map(int, f.readlines()))

if __name__ == '__main__':
    my_files = getAllFiles()
    num_fetch_threads = len(my_files)
    pool = multiprocessing.Pool(processes=num_fetch_threads)
    results = pool.map(calc, my_files)
    result = sum(results)
    print result
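If you would rather keep the threading/Queue structure from the question, here is a sketch (written for Python 3, the worker count is arbitrary) where each worker pushes its per-file sum onto a second queue that is totalled at the end:

import os
from queue import Queue
from threading import Thread

file_queue = Queue()
sum_queue = Queue()

def calc(q, out):
    # each worker takes file paths from q and pushes one per-file sum onto out
    while True:
        path = q.get()
        with open(path) as f:
            out.put(sum(int(line) for line in f if line.strip()))
        q.task_done()

paths = [os.path.join(root, name)
         for root, dirs, files in os.walk("C:/Users/test")
         for name in files if name.endswith(".txt")]

for _ in range(4):
    Thread(target=calc, args=(file_queue, sum_queue), daemon=True).start()
for path in paths:
    file_queue.put(path)
file_queue.join()

total = sum(sum_queue.get() for _ in paths)
print(total)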

How to open multiple files without repeating the code

I am making a program that will open multiple files, and they are all very similar: each contains a few one-word lines in lowercase, written in Notepad. I do not want to repeat the code multiple times. Ideally I want to use a while loop to repeat the code but change which file it opens on each pass. Is there a way to do it?
This is the current code:
File = open("Key Words\Audio.txt","r") #This will open the file called Audio.
Audio = [] #This creates the Audio list
Audio = File.read().splitlines() #This saves everything on each line of the Audio file to a different section of the Audio list.
File = open("Key Words\Calls.txt","r") #This will open the file called Calls.
Calls = [] #This creates the Calls list
Calls = File.read().splitlines() #This saves everything on each line of the Calls file to a different section of the Calls list.
File = open("Key Words\Charging.txt","r") #This will open the file called Charging.
Charging = [] #This creates the Charging list
Charging = File.read().splitlines() #This saves everything on each line of the Charging file to a different section of the Charging list.
File.close() #This closes the File(s).
This is what functions are for:
def readfile(filepath):
    with open(filepath, 'r') as f:
        return f.read().splitlines()

audio = readfile('Key Words\Audio.txt')
calls = readfile('Key Words\Calls.txt')
charging = readfile('Key Words\Charging.txt')
Make a list of the files you need to open:
files_to_open = [
    'file_1.txt',
    'file_2.txt'
]

calls_info = {}
Iterate over the list, and open and process:
for file_ in files_to_open:
    with open(file_) as f:
        calls_info[file_] = f.read().splitlines()
Here, I created a calls_info variable to store everything in a dictionary. Dictionaries hold keys and values; to access the contents of a file, simply index it like so:
calls_info[file_path] # Make sure file_path is the right path you put in the list!
Put the code in a function:
def openandread(filename):
# No need to close the file if you use with:
with open(filename,"r") as File:
return_this = File.read().splitlines()
return return_this
and then just call this function multiple times:
Audio = openandread("Key Words\Audio.txt")
Calls = openandread("Key Words\Calls.txt")
Charging = openandread("Key Words\Charging.txt")
or if you want to make it even shorter:
Audio, Calls, Charging = (openandread(i) for i in ["Key Words\Audio.txt", "Key Words\Calls.txt", "Key Words\Charging.txt"])
Try this
Audio = []
Calls = []
Charging = []
FILES_LISTS = (
    ( "Key Words\Audio.txt", Audio ),
    ( "Key Words\Calls.txt", Calls ),
    ( "Key Words\Charging.txt", Charging )
)
for file_name, list_var in FILES_LISTS:
    File = open( file_name, 'r' )
    list_var += File.read().splitlines()
    File.close()
Make sure to type list_var += and not list_var =. This works because lists are mutable and because Python works with references.
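To see why += matters here, a tiny illustration (the names are made up):

a = []
alias = a             # alias and a refer to the same list object
alias += [1, 2]       # in-place extend: a sees the change too
alias = alias + [3]   # rebinding: alias now points to a brand-new list
print(a)              # [1, 2]
print(alias)          # [1, 2, 3]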
You can try unipath
# Install
$easy_install unipath
# In python
from unipath import Path
t1 = Path('Key Words\Audio.txt')
...
