Can linecache be used for concurrent reading? - python

I want to read and process a file by using multiprocessing with low memory consumption, high throughput (sentence/s), and - especially important - ordered results.
I was wondering whether we can use linecache's getline for this purpose. The following code reads a file, hopefully in parallel, and executes some function on the lines that are gathered in the subprocess. Here I opted for running some tokenisation on the files with spaCy.
import datetime
from multiprocessing import Pool, current_process
from os import cpu_count
from pathlib import Path
from functools import partial
from linecache import getline
import spacy
class Processor:
def __init__(self, spacy_model='en_core_web_sm', batch_size=2048):
self.nlp = spacy.load(spacy_model, disable=['ner', 'textcat'])
self.batch_size = batch_size
#staticmethod
def get_n_lines(pfin):
with pfin.open(encoding='utf-8') as fhin:
for line_idx, _ in enumerate(fhin, 1):
pass
return line_idx
def process_file(self, fin):
pfin = Path(fin).resolve()
total_lines = self.get_n_lines(pfin)
start_time = datetime.datetime.now()
procfunc = partial(self.process_batch, pfin)
with Pool(cpu_count() - 1) as pool:
# map the starting indexex to the processes
for _ in pool.imap(procfunc, range(0, total_lines+1, self.batch_size)):
pass
print('done', (datetime.datetime.now() - start_time).total_seconds())
def process_batch(self, pfin, start):
lines = [getline(str(pfin), i) for i in range(start, start+self.batch_size)]
# Parse text with spaCy
docs = list(self.nlp.pipe(lines))
# Chop into sentences
spacy_sents = [str(sent) for doc in docs for sent in doc.sents]
return str(current_process()), spacy_sents
if __name__ == '__main__':
fn = r'data/train.tok.low.en'
proc = Processor()
proc.process_file(fn)
I found that on my work laptop, running with 3 active cores on a file of 140K sentences the duration is 261 seconds. When running with a single core (n_workers=1), the processing time is 431 seconds. I am not sure how to interpret this difference but I guess it comes down to the question: does linecache.getline allow for concurrent reading? Parallel execution s faster, but considering getline expects a file name (rather than a file object), I expect it to have to open the file every time and as such blocking access for other processes. Is this assumption correct because parallel execution still seems much faster? Is there a better way to read files fast and in parallel whilst also keeping the results ordered?

You don't need linecache, and it doesn't help.
First, you don't need any special tricks to read the same file simultaneously from multiple processes. You can just do it. It'll work.
Second, linecache loads a whole file immediately as soon as a single line is requested from that file. You're not splitting the work of reading the file at all. You're doing more I/O than if you just had the parent process read the file and let the workers inherit the data. If you're getting any speedup from multiprocessing, it's probably due to parallelizing the NLP work, not the file reading.
Third, linecache is designed to support the traceback module, and it does a lot of stuff that doesn't make sense for a general-purpose file reading module, including searching the import path for a file if it doesn't find the file in the current directory.

Related

Python multitprocessing to process files

I've never done anything with multiprocessing before, but I recently ran into a problem with one of my projects taking an excessive amount of time to run. I have about 336,000 files I need to process, and a traditional for loop would likely take about a week to run.
There are two loops to do this, but they are effectively identical in what they return so I've only included one.
import json
import os
from tqdm import tqdm
import multiprocessing as mp
jsons = os.listdir('/content/drive/My Drive/mrp_workflow/JSONs')
materials = [None] * len(jsons)
def asyncJSONs(file, index):
try:
with open('/content/drive/My Drive/mrp_workflow/JSONs/{}'.format(file)) as f:
data = json.loads(f.read())
properties = process_dict(data, {})
properties['name'] = file.split('.')[0]
materials[index] = properties
except:
print("Error parsing at {}".format(file))
process_list = []
i = 0
for file in tqdm(jsons):
p = mp.Process(target=asyncJSONs,args=(file,i))
p.start()
process_list.append(p)
i += 1
for process in process_list:
process.join()
Everything in that relating to multiprocessing was cobbled together from a collection of google searches and articles, so I wouldn't be surprised if it wasn't remotely correct. For example, the 'i' variable is a dirty attempt to keep the information in some kind of order.
What I'm trying to do is load information from those JSON files and store it in the materials variable. But when I run my current code nothing is stored in materials.
As you can read in other answers - processes don't share memory and you can't set value directly in materials. Function has to use return to send result back to main process and it has to wait for result and get it.
It can be simpler with Pool. It doesn't need to use queue manually. And it should return results in the same order as data in all_jsons. And you can set how many processes to run at the same time so it will not block CPU for other processes in system.
But it can't use tqdm.
I couldn't test it but it can be something like this
import os
import json
from multiprocessing import Pool
# --- functions ---
def asyncJSONs(filename):
try:
fullpath = os.path.join(folder, filename)
with open(fullpath) as f:
data = json.loads(f.read())
properties = process_dict(data, {})
properties['name'] = filename.split('.')[0]
return properties
except:
print("Error parsing at {}".format(filename))
# --- main ---
# for all processes (on some systems it may have to be outside `__main__`)
folder = '/content/drive/My Drive/mrp_workflow/JSONs'
if __name__ == '__main__':
# code only for main process
all_jsons = os.listdir(folder)
with Pool(5) as p:
materials = p.map(asyncJSONs, all_jsons)
for item in materials:
print(item)
BTW:
Other modules: concurrent.futures, joblib, ray,
Going to mention a totally different way of solving this problem. Don't bother trying to append all the data to the same list. Extract the data you need, and append it to some target file in ndjson/jsonlines format. That's just where, instead of objects part of a json array [{},{}...], you have separate objects on each line.
{"foo": "bar"}
{"foo": "spam"}
{"eggs": "jam"}
The workflow looks like this:
spawn N workers with a manifest of files to process and the output file to write to. You don't even need MP, you could use a tool like rush to parallelize.
worker parses data, generates the output dict
worker opens the output file with append flag. dump the data and flush immediately:
with open(out_file, 'a') as fp:
print(json.dumps(data), file=fp, flush=True)
Flush ensure that as long as your data is less than the buffer size on your kernel (usually several MB), your different processes won't stomp on each other and conflict writes. If they do get conflicted, you may need to write to a separate output file for each worker, and then join them all.
You can join the files and/or convert to regular JSON array if needed using jq. To be honest, just embrace jsonlines. It's a way better data format for long lists of objects, since you don't have to parse the whole thing in memory.
You need to understand how multiprocessing works. It starts a brand new process for EACH task, each with a brand new Python interpreter, which runs your script all over again. These processes do not share memory in any way. The other processes get a COPY of your globals, but they obviously can't be the same memory.
If you need to send information back, you can using a multiprocessing.queue. Have the function stuff the results in a queue, while your main code waits for stuff to magically appear in the queue.
Also PLEASE read the instructions in the multiprocessing docs about main. Each new process will re-execute all the code in your main file. Thus, any one-time stuff absolutely must be contained in a
if __name__ == "__main__":
block. This is one case where the practice of putting your mainline code into a function called main() is a "best practice".
What is taking all the time here? Is it reading the files? If so, then you might be able to do this with multithreading instead of multiprocessing. However, if you are limited by disk speed, then no amount of multiprocessing is going to reduce your run time.

Python multiprocessing: Global objects not being copied to children properly

A few days back I answered a question on SO regarding reading a tar file in parallel.
This was the gist of the question:
import bz2
import tarfile
from multiprocessing import Pool
tr = tarfile.open('data.tar')
def clean_file(tar_file_entry):
if '.bz2' not in str(tar_file_entry):
return
with tr.extractfile(tar_file_entry) as bz2_file:
with bz2.open(bz2_file, "rt") as bzinput:
# Reading bz2 file
....
....
def process_serial():
members = tr.getmembers()
processed_files = []
for i, member in enumerate(members):
processed_files.append(clean_file(member))
print(f'done {i}/{len(members)}')
def process_parallel():
members = tr.getmembers()
with Pool() as pool:
processed_files = pool.map(clean_file, members)
print(processed_files)
def main():
process_serial() # No error
process_parallel() # Error
if __name__ == '__main__':
main()
We were able to make the error disappear by just opening the tar file inside the child process rather than in the parent, as mentioned in the answer.
I am not able to understand why did this work.
Even if we open the tarfile in the parent process, the child process will get a new copy.
So why does opening the tarfile in the child process explicitly make any difference?
Does this mean that in the first case, the child processes were somehow mutating the common tarfile object and causing memory corruption due to concurrent writes?
FWIW, the answer in the comments wrt open is actually incorrect on UNIX-like systems regarding file handle numbers.
If multiprocessing uses fork() (which it does under Linux and similar, although I read there was an issue with forking on macOS), the file handles and everything else are happily copied to child processes (by "happily" I mean it's complicated in many edge cases such as forking threads, but still it works fine for file handles).
The following works fine for me:
import multiprocessing
this = open(__file__, 'r')
def read_file():
print(len(this.read()))
def main():
process = multiprocessing.Process(target=read_file)
process.start()
process.join()
if __name__ == '__main__':
main()
The problem is likely that tarfile has an internal structure and/or buffering while reading, also you can simply run into conflicts by trying to seek and read different parts of the same archive simultaneously. I.e., I'm speculating that using a threadpool without any synchronization is likely to run into exactly the same issues in this case.
Edit: to clarify, extracting a file from a Tar archive is likely (I haven't checked the exact details) done as follows: (1) seek to the offset of the encapsulated part (file), (2) read a chunk of the encapsulated file, write the chunk to the destination file (or pipe, or w/e), (3) repeat (2) until the whole file is extracted.
By attempting to do this in a non-synchronized way from parallel processes using the same file handle, will likely result in mixing of these steps, i.e. starting to process file #2 will seek away from file #1, while we are in the middle of reading file #1, etc.
Edit2 answering the comment below: Memory representation is forked afresh for child processes, that's true; but resources managed on the kernel side (such as file handles, and kernel buffers) are shared.
To illustrate:
import multiprocessing
this = open(__file__, 'rb')
def read_file(worker):
print(worker, this.read(80))
def main():
processes = []
for number in (1, 2):
processes.append(
multiprocessing.Process(target=read_file, args=(number,)))
for process in processes:
process.start()
for process in processes:
process.join()
if __name__ == '__main__':
main()
Running this on Linux I get:
$ python3.8 test.py
1 b"import multiprocessing\n\nthis = open(__file__, 'rb')\n\n\ndef read_file(worker):\n "
2 b''
If seeking and reading were independent, both processes would print an identical result, but they don't. Since this is a small file, and Python opts to buffer a small amount of data (8 KiB), the first process reads to the EOF, and the second process has no data left to read (unless it of course seeks back).

How to speed up multiple http requsts in Python

I want to fetch whois data from a txt file with 50000 urls. It's working but takes at least 20 minutes. What can i do to improve performance of this
import whois
from concurrent.futures import ThreadPoolExecutor
import threading
import time
pool = ThreadPoolExecutor(max_workers=2500)
def query(domain):
while True:
try:
w = whois.whois(domain)
fwrite = open("whoISSuccess.txt", "a")
fwrite.write('\n{0} : {1}'.format(w.domain, w.expiration_date))
fwrite.close()
except:
time.sleep(3)
continue
else:
break
with open('urls.txt') as f:
for line in f:
lines = line.rstrip("\n\r")
pool.submit(query, lines)
pool.shutdown(wait=True)
You can do two things to improve the speed:
Use multiprocessing, rather than threading as python Threads do not really run in parallel, while processes will be managed the OS and truly run in parallel.
Secondly, have each process write to its own file, e.g. <url>.txt, as having all processes write to the same file will cause lock contention on the file writing, which significantly slows your program. After all processes have completed you can then aggregate all files to a single one if this is a critical requirement. Alternatively, you can just keep the whois result in memory and then write it out to a file in the end.

Python line-by-line memory profiler?

I'm looking to generate, from a large Python codebase, a summary of heap usage or memory allocations over the course of a function's run.
I'm familiar with heapy, and it's served me well for taking "snapshots" of the heap at particular points in my code, but I've found it difficult to generate a "memory-over-time" summary with it. I've also played with line_profiler, but that works with run time, not memory.
My fallback right now is Valgrind with massif, but that lacks a lot of the contextual Python information that both Heapy and line_profiler give. Is there some sort of combination of the latter two that give a sense of memory usage or heap growth over the execution span of a Python program?
I would use sys.settrace at program startup to register a custom tracer function. The custom_trace_function will be called for each line of code. Then you can use that function to store information gathered by heapy or meliae in a file for later processing.
Here is a very simple example which logs the output of hpy.heap() each second to a plain text file:
import sys
import time
import atexit
from guppy import hpy
_last_log_time = time.time()
_logfile = open('logfile.txt', 'w')
def heapy_profile(frame, event, arg):
currtime = time.time()
if currtime - _last_log_time < 1:
return
_last_log_time = currtime
code = frame.f_code
filename = code.co_filename
lineno = code.co_firstlineno
idset = hpy().heap()
logfile.write('%s %s:%s\n%s\n\n' % (currtime, filename, lineno, idset))
logfile.flush()
atexit.register(_logfile.close)
sys.settrace(heapy_profile)
You might be interested by memory_profiler.

python: how to create persistent in-memory structure for debugging

[Python 3.1]
My program takes a long time to run just because of the pickle.load method on a huge data structure. This makes debugging very annoying and time-consuming: every time I make a small change, I need to wait for a few minutes to see if the regression tests passed.
I would like replace pickle with an in-memory data structure.
I thought of starting a python program in one process, and connecting to it from another; but I am afraid the inter-process communication overhead will be huge.
Perhaps I could run a python function from the interpreter to load the structure in memory. Then as I modify the rest of the program, I can run it many times (without exiting the interpreter in between). This seems like it would work, but I'm not sure if I will suffer any overhead or other problems.
You can use mmap to open a view on the same file in multiple processes, with access at almost the speed of memory once the file is loaded.
First you can pickle different parts of the hole object using this method:
# gen_objects.py
import random
import pickle
class BigBadObject(object):
def __init__(self):
self.a_dictionary={}
for x in xrange(random.randint(1, 1000)):
self.a_dictionary[random.randint(1,98675676)]=random.random()
self.a_list=[]
for x in xrange(random.randint(1000, 10000)):
self.a_list.append(random.random())
self.a_string=''.join([chr(random.randint(65, 90))
for x in xrange(random.randint(100, 10000))])
if __name__=="__main__":
output=open('lotsa_objects.pickled', 'wb')
for i in xrange(10000):
pickle.dump(BigBadObject(), output, pickle.HIGHEST_PROTOCOL)
output.close()
Once you generated the BigFile in various separate parts you can read it with a python program with several running at the same time reading each one different parts.
# reader.py
from threading import Thread
from Queue import Queue, Empty
import cPickle as pickle
import time
import operator
from gen_objects import BigBadObject
class Reader(Thread):
def __init__(self, filename, q):
Thread.__init__(self, target=None)
self._file=open(filename, 'rb')
self._queue=q
def run(self):
while True:
try:
one_object=pickle.load(self._file)
except EOFError:
break
self._queue.put(one_object)
class uncached(object):
def __init__(self, filename, queue_size=100):
self._my_queue=Queue(maxsize=queue_size)
self._my_reader=Reader(filename, self._my_queue)
self._my_reader.start()
def __iter__(self):
while True:
if not self._my_reader.is_alive():
break
# Loop until we get something or the thread is done processing.
try:
print "Getting from the queue. Queue size=", self._my_queue.qsize()
o=self._my_queue.get(True, timeout=0.1) # Block for 0.1 seconds
yield o
except Empty:
pass
return
# Compute an average of all the numbers in a_lists, just for show.
list_avg=0.0
list_count=0
for x in uncached('lotsa_objects.pickled'):
list_avg+=reduce(operator.add, x.a_list)
list_count+=len(x.a_list)
print "Average: ", list_avg/list_count
This way of reading the pickle file will take 1% of the time it takes in the other way. This is because you are running 100 parallel threads at the same time.

Categories