Can I accelerate python file method `read()` by parallelism? - python

I have a lot of files(300~500) to read, and I want to accelerate this task.
The idealization is:
from multiprocessing import Pool
import os
import _io
filelist = map(open,os.listdir())
if __name__ == '__main__':
with Pool() as pool:
a = pool.map(_io.TextIOWrapper.read,filelist)
Of course, I got an error:
TypeError: cannot serialize '_io.TextIOWrapper' object
The question is: Can I accelerate I/O process by parallelism? If yes, how to?
UPDATE conclusion:
Now I get the way to parallelism and have tested my code:
I used 22 items, totalling 63.2 MB
from multiprocessing import Pool
import os
import _io
def my_read(file_name):
with open(file_name) as f:
return f.read()
def mul():
with Pool() as pool:
a = pool.map(my_read, os.listdir())
def single():
a = []
for i in os.listdir():
with open(i) as f:
r = f.read()
a.append(r)
if __name__ == '__main__':
mul()
# single()
Sadly, single() costs 0.4s while mul() costs 0.8s.
UPDATE 1:
Some people said it's an IO-bound task so I can not improve it by parallelism。
However, I can find these words in Python doc:
However, threading is still an appropriate model if you want to run multiple I/O-bound tasks simultaneously.
The full code is here:
My purpose is to transfer Epub to txt.
I have parallelized char2text and now I want to accelerate readall:
import zipfile
from multiprocessing import Pool
import bs4
def char2text(i):
soup = bs4.BeautifulSoup(i)
chapter = soup.body.getText().splitlines()
chapter = "\n".join(chapter).strip() + "\n\n"
return chapter
class Epub(zipfile.ZipFile):
def __init__(self, file, mode='r', compression=0, allowZip64=False):
zipfile.ZipFile.__init__(self, file, mode, compression, allowZip64)
if mode == 'r':
self.opf = self.read('OEBPS/content.opf').decode()
opf_soup = bs4.BeautifulSoup(self.opf)
self.author = opf_soup.find(name='dc:creator').getText()
self.title = opf_soup.find(name='dc:title').getText()
try:
self.description = opf_soup.find(name='dc:description').getText()
except:
self.description = ''
try:
self.chrpattern = opf_soup.find(name='dc:chrpattern').getText()
except:
self.chrpattern = ''
self.cover = self.read('OEBPS/images/cover.jpg')
elif mode == 'w':
pass
def get_text(self):
self.tempread = ""
charlist = self.readall(self.namelist())
with Pool() as pool:
txtlist = pool.map(char2text, charlist)
self.tempread = "".join(txtlist)
return self.tempread
def readall(self, namelist):
charlist = []
for i in namelist:
if i.startswith('OEBPS/') and i.endswith('.xhtml'):
r = self.read(i).decode()
charlist.append(r)
return charlist
def epub2txt(self):
tempread = self.get_text()
with open(self.title + '.txt', 'w', encoding='utf8') as f:
f.write(tempread)
if __name__ == "__main__":
e = Epub("assz.epub")
import cProfile
cProfile.run("e.epub2txt()")

Did you try something like:
from multiprocessing import Pool
import os
import _io
def my_read(file_name):
with open(file_name) as f:
return _io.TextIOWrapper.read(f)
if __name__ == '__main__':
with Pool() as pool:
a = pool.map(my_read, os.listdir('some_dir'))
Is sounds more logical to me to open/close the file in the sub-process and string are easily serializable.
for your readall method try:
def readall(self, namelist):
filter_func = lambda i: i.startswith('OEBPS/') and i.endswith('.xhtml')
read_fun= lambda i: self.read(i).decode()
with Pool() as pool:
a = pool.map(read_fun, filter(filter_func, namelist))
return a

Related

Reading thousands of json file and process them using python multiprocessing

I'm trying to read thousands of json file from directory and process each file separately and store the result in a dictionary. I already write a working code for sequential execution. Now i want to take the leverage of multi-processing for speed up the whole process.
So far what i did -
import json
import os
from multiprocessing import Process, Manager
def read_file(file_name):
'''
Read the given json file and return data
'''
with open(file_name) as file :
data = json.load(file)
return data
def do_some_process(data):
'''
Some calculation will be done here
and return the result
'''
return some_result
def process_each_file(file, result):
file_name = file.split('.')[0]
# reading data from file
data = read_file('../data/{}'.format(file))
processed_result = do_some_process(data)
result[file_name] = processed_result
if __name__ == '__main__':
manager = Manager()
result = manager.dict()
file_list = os.listdir("../data")
all_process = [Process(target=process_each_file, args=(file, result, ))
for file in file_list if file.endswith(".json")]
for p in all_process:
p.start()
for p in all_process:
p.join()
'''
Do some further work with 'rusult' variable
'''
When i run this code it shows OSError: [Errno 24] Too many open files
How can i achive my goal ?
To read and process multiple JSON files using Python's multiprocessing module, you can use the following approach:
import os
import json
from multiprocessing import Pool
# List all the JSON files in the current directory
json_files = [f for f in os.listdir('.') if f.endswith('.json')]
def process_data(data):
return data
def process_json_file(filename):
with open(filename, 'r') as f:
data = json.load(f)
# Process the data here...
processed_data = process_data(data)
return processed_data
# Create a pool of workers to process the files concurrently
with Pool() as pool:
# Apply the processing function to each JSON file concurrently
results = pool.map(process_json_file, json_files)
# Do something with the results
for result in results:
print(result)

Why is my memory usage steadily increasing despite my variables not changing significantly in size?

When I run the code below on a large corpus of many gigabytes of parquet files, the memory usage steadily increases, e.g. from 3% per core at 10 minutes to 7% at 30 minutes to 10%+ at 6 hours. This despite my explicit call to gc.collect() and the fact that the files are all approximately the same size, and that only one file should be loaded at a time. I don't see any persistent references to old files, so I am stumped.
This problem forces me to limit the number of cores I use to tokenize my data, doubling my processing time.
from nltk.tokenize import RegexpTokenizer
import pandas as pd
import re
import string
import os
from multiprocessing import Pool
import gc
import glob
def adjust_tokens(tokens):
new_tokens = []
number_regex = '[0-9]+(?:[.][0-9]+)?'
for token in tokens:
if token in string.punctuation:
continue
if token == '...':
continue
if re.match(number_regex, token):
new_tokens.append('aquantity')
else:
new_tokens.append(token.lower())
return new_tokens
class ParquetsIterable():
def __init__(self, tokenizer=None, indir=None, infiles=None,
filters=None, keys=None):
if tokenizer is not None:
self.tokenizer = tokenizer
else:
self.tokenizer = RegexpTokenizer(
r'(?:\w+)(?:[.-]\w+)*[-+]?|(?:[.][.][.])'
+ r'|(?:[!"#$%&\'()*+,-./:;<=>?#[\\]^_`{|}~])'
)
if infiles is not None:
self.infiles = infiles
else:
self.infiles = [os.path.join(indir, f) for f in os.listdir(indir)]
self.filters = filters
self.keys = keys
def __iter__(self):
for filename in self.infiles:
try:
df = pd.read_parquet(filename)
except Exception:
print('Invalid parquet file %s\n' % filename)
continue
if self.keys is not None:
try:
assert (self.keys[filename].values
== df['key'].values).all()
except AssertionError:
# This never happens
print('Skipping %s because of reorderinging'
' and my unwillingness to write in a join'
% filename)
continue
if self.filters is not None:
df = df[self.filters[filename].values]
gc.collect()
for text in df['sentences']:
for sentence in text.split('\n'):
yield adjust_tokens(self.tokenizer.tokenize(sentence))
def write_iterables_to_files(iterables, files, n_cores=8):
pairs = zip(iterables, files)
with Pool(n_cores) as p:
p.map(write_iterable_to_file, pairs)
def write_iterable_to_file(iterable_and_file):
iterable, file_ = iterable_and_file
with open(file_, 'w') as f:
for tokens in iterable:
f.write(' '.join(tokens) + '\n')
if '__name__' == '__main__':
files = sorted([f for f in glob.glob('data/sentence-parquets/*')
if 'parts' in f and 'sampled' not in f])
os.makedirs('data/processed-sentences-text', exist_ok=True)
iterables = [ParquetsIterable(infiles=files[i::8]) for i in range(8)]
outfiles = ['data/processed-sentences-text/sentences-%d.txt' % i
for i in range(8)]
write_iterables_to_files(iterables, outfiles, n_cores=8)

Storing result of a thread or process with concurrent.futures

I'm writing a utility I can use to check ports on many subnets. Currently I'm adding my results to a csv file and then sorting the file. I would like to instead add my results to a single list and then output the list so I'm doing fewer file open/close operations. I cannot seem to figure out how to make my results persist between threads. Below is my code:
import csv
import test_ports
import pandas
import ipaddress
import concurrent.futures
import time
import os
class check_subnets(object):
def __init__(self):
self.tested_list = []
def setup(self, l_subnets):
with concurrent.futures.ProcessPoolExecutor(max_workers=4) as executor:
executor.map(self.subnet_search, l_subnets)
return self.tested_list
def subnet_search(self, sub):
print("Testing the " + sub + " subnet.")
with concurrent.futures.ThreadPoolExecutor(max_workers=16) as executor2:
executor2.map(self.ip_search, ipaddress.IPv4Network(sub))
def ip_search(self, ip):
test = test_ports.TestPort()
s_ip_addr = str(ip)
print("Tested " + s_ip_addr)
test_ssh = test.test_ssh(s_ip_addr)
test_rdp = test.test_rdp(s_ip_addr)
this_list = [s_ip_addr, test_ssh, test_rdp]
self.tested_list.append(this_list)
with open('tested.csv', 'a') as file:
writer = csv.writer(file)
writer.writerow(this_list)
file.close()
if __name__ == '__main__':
subnets = pandas.read_csv('hosts.csv')
list_subnets = subnets['Subnet'].values.tolist()
fields = ['IP_Addr', "SSH(22)", "RDP(443)"]
with open('tested.csv', 'w') as f:
write = csv.writer(f)
write.writerow(fields)
f.close()
t0 = time.time()
checker = check_subnets()
results = checker.setup(list_subnets)
print(results)
t1 = time.time()
print(t1-t0)
with open("tested.csv", 'r',newline='') as f_input:
csv_input = csv.DictReader(f_input)
data = sorted(csv_input, key=lambda row: (row['IP_Addr']))
f_input.close()
with open("sorted.csv", 'w', newline='') as f_output:
csv_output = csv.DictWriter(f_output, fieldnames=csv_input.fieldnames)
csv_output.writeheader()
csv_output.writerows(data)
f_output.close()
if os.path.exists("tested.csv"):
os.remove("tested.csv")
else:
print("The file does not exist")
I'm using the class to try and create some kind of location each method would see. I have a feeling the class-specific tested_list is not available to each thread, rather each thread is seeing one instance of tested_list and not a shared list.
The test_ports module is just a wrapper for some socket operations.
I figured out that there is a small difference in concurrent.futures.ProcessPoolExecutor
and
concurrent.futures.ThreadPoolExecutor
ThreadPoolExecutor is doing exactly what I wanted, preserving data between threads. New code looks like this:
import csv
import test_ports
import pandas
import ipaddress
import concurrent.futures
import time
class check_subnets(object):
def __init__(self):
self.tested_list = []
def setup(self, l_subnets):
with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
executor.map(self.subnet_search, l_subnets)
return self.tested_list
def subnet_search(self, sub):
with concurrent.futures.ThreadPoolExecutor(max_workers=16) as executor2:
executor2.map(self.ip_search, ipaddress.IPv4Network(sub))
def ip_search(self, ip):
test = test_ports.TestPort()
s_ip_addr = str(ip)
test_ssh = test.test_ssh(s_ip_addr)
test_rdp = test.test_rdp(s_ip_addr)
this_list = [s_ip_addr, test_ssh, test_rdp]
self.tested_list.append(this_list)
if __name__ == '__main__':
subnets = pandas.read_csv('hosts.csv')
list_subnets = subnets['Subnet'].values.tolist()
t0 = time.time()
checker = check_subnets()
results = checker.setup(list_subnets)
t1 = time.time()
print(t1-t0)
sorted_list = (sorted(results, key=lambda x: x[0]))
fields = ['IP_Addr', "SSH(22)", "RDP(443)"]
with open('tested.csv', 'w') as f:
write = csv.writer(f)
write.writerow(fields)
write.writerows(sorted_list)
f.close()
The end result is a sorted list of opened and closed ssh and rdp ports.

How to process access log using python multiprocessing library?

I have to parse 30 days access logs from the server based on client IP and accessed hosts and need to know top 10 accessed sites. The log file will be around 10-20 GB in size which takes lots of time for single threaded execution of script. Initially, I wrote a script which was working fine but it is taking a lot of time to due to large log file size. Then I tried to implement multiprocessing library for parallel processing but it is not working. It seems implementation of multiprocessing is repeating tasks instead of doing parallel processing. Not sure, what is wrong in the code. Can some one please help on this? Thank you so much in advance for your help.
Code:
from datetime import datetime, timedelta
import commands
import os
import string
import sys
import multiprocessing
def ipauth (slave_list, static_ip_list):
file_record = open('/home/access/top10_domain_accessed/logs/combined_log.txt', 'a')
count = 1
while (count <=30):
Nth_days = datetime.now() - timedelta(days=count)
date = Nth_days.strftime("%Y%m%d")
yr_month = Nth_days.strftime("%Y/%m")
file_name = 'local2' + '.' + date
with open(slave_list) as file:
for line in file:
string = line.split()
slave = string[0]
proxy = string[1]
log_path = "/LOGS/%s/%s" %(slave, yr_month)
try:
os.path.exists(log_path)
file_read = os.path.join(log_path, file_name)
with open(file_read) as log:
for log_line in log:
log_line = log_line.strip()
if proxy in log_line:
file_record.write(log_line + '\n')
except IOError:
pass
count = count + 1
file_log = open('/home/access/top10_domain_accessed/logs/ipauth_logs.txt', 'a')
with open(static_ip_list) as ip:
for line in ip:
with open('/home/access/top10_domain_accessed/logs/combined_log.txt','r') as f:
for content in f:
log_split = content.split()
client_ip = log_split[7]
if client_ip in line:
content = str(content).strip()
file_log.write(content + '\n')
return
if __name__ == '__main__':
slave_list = sys.argv[1]
static_ip_list = sys.argv[2]
jobs = []
for i in range(5):
p = multiprocessing.Process(target=ipauth, args=(slave_list, static_ip_list))
jobs.append(p)
p.start()
p.join()
UPDATE AFTER CONVERSATION WITH OP, PLEASE SEE COMMENTS
My take: Split the file into smaller chunks and use a process pool to work on those chunks:
import multiprocessing
def chunk_of_lines(fp, n):
# read n lines from file
# then yield
pass
def process(lines):
pass # do stuff to a file
p = multiprocessing.Pool()
fp = open(slave_list)
for f in chunk_of_lines(fp,10):
p.apply_async(process, [f,static_ip_list])
p.close()
p.join() # Wait for all child processes to close.
There are many ways to implement the chunk_of_lines method, you could iterate over the file lines using a simple for or do something more advance like call fp.read().

Writing to a file with multiprocessing

I'm having the following problem in python.
I need to do some calculations in parallel whose results I need to be written sequentially in a file. So I created a function that receives a multiprocessing.Queue and a file handle, do the calculation and print the result in the file:
import multiprocessing
from multiprocessing import Process, Queue
from mySimulation import doCalculation
# doCalculation(pars) is a function I must run for many different sets of parameters and collect the results in a file
def work(queue, fh):
while True:
try:
parameter = queue.get(block = False)
result = doCalculation(parameter)
print >>fh, string
except:
break
if __name__ == "__main__":
nthreads = multiprocessing.cpu_count()
fh = open("foo", "w")
workQueue = Queue()
parList = # list of conditions for which I want to run doCalculation()
for x in parList:
workQueue.put(x)
processes = [Process(target = writefh, args = (workQueue, fh)) for i in range(nthreads)]
for p in processes:
p.start()
for p in processes:
p.join()
fh.close()
But the file ends up empty after the script runs. I tried to change the worker() function to:
def work(queue, filename):
while True:
try:
fh = open(filename, "a")
parameter = queue.get(block = False)
result = doCalculation(parameter)
print >>fh, string
fh.close()
except:
break
and pass the filename as parameter. Then it works as I intended. When I try to do the same thing sequentially, without multiprocessing, it also works normally.
Why it didn't worked in the first version? I can't see the problem.
Also: can I guarantee that two processes won't try to write the file simultaneously?
EDIT:
Thanks. I got it now. This is the working version:
import multiprocessing
from multiprocessing import Process, Queue
from time import sleep
from random import uniform
def doCalculation(par):
t = uniform(0,2)
sleep(t)
return par * par # just to simulate some calculation
def feed(queue, parlist):
for par in parlist:
queue.put(par)
def calc(queueIn, queueOut):
while True:
try:
par = queueIn.get(block = False)
print "dealing with ", par, ""
res = doCalculation(par)
queueOut.put((par,res))
except:
break
def write(queue, fname):
fhandle = open(fname, "w")
while True:
try:
par, res = queue.get(block = False)
print >>fhandle, par, res
except:
break
fhandle.close()
if __name__ == "__main__":
nthreads = multiprocessing.cpu_count()
fname = "foo"
workerQueue = Queue()
writerQueue = Queue()
parlist = [1,2,3,4,5,6,7,8,9,10]
feedProc = Process(target = feed , args = (workerQueue, parlist))
calcProc = [Process(target = calc , args = (workerQueue, writerQueue)) for i in range(nthreads)]
writProc = Process(target = write, args = (writerQueue, fname))
feedProc.start()
for p in calcProc:
p.start()
writProc.start()
feedProc.join ()
for p in calcProc:
p.join()
writProc.join ()
You really should use two queues and three separate kinds of processing.
Put stuff into Queue #1.
Get stuff out of Queue #1 and do calculations, putting stuff in Queue #2. You can have many of these, since they get from one queue and put into another queue safely.
Get stuff out of Queue #2 and write it to a file. You must have exactly 1 of these and no more. It "owns" the file, guarantees atomic access, and absolutely assures that the file is written cleanly and consistently.
If anyone is looking for a simple way to do the same, this can help you.
I don't think there are any disadvantages to doing it in this way. If there are, please let me know.
import multiprocessing
import re
def mp_worker(item):
# Do something
return item, count
def mp_handler():
cpus = multiprocessing.cpu_count()
p = multiprocessing.Pool(cpus)
# The below 2 lines populate the list. This listX will later be accessed parallely. This can be replaced as long as listX is passed on to the next step.
with open('ExampleFile.txt') as f:
listX = [line for line in (l.strip() for l in f) if line]
with open('results.txt', 'w') as f:
for result in p.imap(mp_worker, listX):
# (item, count) tuples from worker
f.write('%s: %d\n' % result)
if __name__=='__main__':
mp_handler()
Source: Python: Writing to a single file with queue while using multiprocessing Pool
There is a mistake in the write worker code, if the block is false, the worker will never get any data. Should be as follows:
par, res = queue.get(block = True)
You can check it by adding line
print "QSize",queueOut.qsize()
after the
queueOut.put((par,res))
With block=False you would be getting ever increasing length of the queue until it fills up, unlike with block=True where you get always "1".

Categories