Translating python code from unix to windows - Multiprocessing Calls

Can someone help me? I'm trying to translate the code from the link below so it works on Windows.
http://wilson.bronger.org/lens_calibration_tutorial/calibrate.py
import subprocess, os, os.path, sys, multiprocessing, math, re, contextlib, glob, codecs, struct, numpy
from scipy.optimize.minpack import leastsq

candidate_groups = [['distortion\\Standard--13.3--5.5.DNG'], ['distortion\\Standard--17.5--5.7.DNG']]

def call_exiv2(candidate_group):
    exiv2_process = subprocess.Popen(
        ["exiv2", "-PEkt", "-g", "Exif.Photo.LensModel", "-g", "Exif.Photo.FocalLength", "-g", "Exif.Photo.FNumber"]
        + candidate_group, stdout=subprocess.PIPE)
    lines = exiv2_process.communicate()[0].splitlines()
    assert exiv2_process.returncode in [0, 253]
    result = {}
    for line in lines:
        filename, data = line.split("Exif.Photo.")
        filename = filename.rstrip()
        if not filename:
            assert len(candidate_group) == 1
            filename = candidate_group[0]
        fieldname, field_value = data.split(None, 1)
        exif_data = result.setdefault(filename, [None, float("nan"), float("nan")])
        if fieldname == "LensModel":
            exif_data[0] = field_value
        elif fieldname == "FocalLength":
            exif_data[1] = float(field_value.partition("mm")[0])
        elif fieldname == "FNumber":
            exif_data[2] = float(field_value.partition("F")[2])
    for filename, exif_data in list(result.copy().items()):
        result[filename] = tuple(exif_data)
    return result

if __name__ == '__main__':
    multiprocessing.freeze_support()
    pool = multiprocessing.Pool()
    file_exif_data = {}  # defined earlier in the full calibrate.py
    for group_exif_data in pool.map(call_exiv2, candidate_groups):
        file_exif_data.update(group_exif_data)
    pool.close()
    pool.join()
However, there seem to be issues with the multiprocessing sections. I initially used 2to3 to convert the code to Python 3, as I'm using 3.4. I also added the code below above the pool = multiprocessing.Pool() call:
if __name__ == '__main__':
    multiprocessing.freeze_support()
That got rid of one error, but I still get the following:
Traceback (most recent call last):
  File "calibrate.py", line 166, in <module>
    for group_exif_data in pool.map(call_exiv2, candidate_groups):
  File "C:\Python34\lib\multiprocessing\pool.py", line 260, in map
    return self._map_async(func, iterable, mapstar, chunksize).get()
  File "C:\Python34\lib\multiprocessing\pool.py", line 599, in get
    raise self._value
TypeError: Type str doesn't support the buffer API
I initially thought it might have to do with pickling, but I couldn't figure it out. Any ideas?
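Aside (not part of the original question): that particular TypeError is Python 3's way of saying a bytes object is being mixed with a str. exiv2_process.communicate()[0] returns bytes under Python 3, so line.split("Exif.Photo.") fails inside the worker, and pool.map re-raises the exception in the parent, which is why it looks like a multiprocessing problem. A minimal sketch of one way to address it, assuming exiv2 prints plain UTF-8/ASCII text:

    # decode the captured stdout once, so the later str operations work under Python 3
    lines = exiv2_process.communicate()[0].decode("utf-8").splitlines()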

Related

Python 3.9: multiprocessing process start() got an error| TypeError: cannot pickle 'weakref' object

I'm trying to decrease running time by using multiprocessing.
I got a weird error: TypeError: cannot pickle 'weakref' object.
I'm not quite sure why it occurs, because I also use this approach to run another program and it runs normally. Can someone explain why this error occurs?
I already followed this solution, but it did not work for me.
import multiprocessing
from scipy import stats
import numpy as np
import pandas as pd

class T_TestFeature:
    def __init__(self, data, classes):
        self.data = data
        self.classes = classes
        self.manager = multiprocessing.Manager()
        self.pval = self.manager.list()

    def preform(self):
        process = []
        for i in range(10):
            process.append(multiprocessing.Process(target=self.t_test, args=(i,)))
        for p in process:
            p.start()
        for p in process:
            p.join()

    def t_test(self, k):
        index_samples = np.array(self.data)[:, k]
        rs1 = [index_samples[i] for i in range(len(index_samples)) if self.classes[i] == "Virginia"]
        rs2 = [index_samples[i] for i in range(len(index_samples)) if self.classes[i] != "Virginia"]
        self.pval.append(stats.ttest_ind(rs1, rs2, equal_var=False).pvalue)

def main():
    df = pd.read_excel("/Users/xxx/Documents/Project/src/flattened.xlsx")
    flattened = df.values.T
    y = df.columns
    result = T_TestFeature(flattened, y)
    result.preform()
    print(result.pval)

if __name__ == "__main__":
    main()
Traceback (most recent call last):
  File "/Users/xxx/Documents/Project/src/t_test.py", line 41, in <module>
    main()
  File "/Users/xxx/Documents/Project/src/t_test.py", line 37, in main
    result.preform()
  File "/Users/xxx/Documents/Project/src/t_test.py", line 21, in preform
    p.start()
  File "/Users/xxx/opt/anaconda3/lib/python3.9/multiprocessing/process.py", line 121, in start
    self._popen = self._Popen(self)
  File "/Users/xxx/opt/anaconda3/lib/python3.9/multiprocessing/context.py", line 284, in _Popen
    return Popen(process_obj)
  File "/Users/xxx/opt/anaconda3/lib/python3.9/multiprocessing/popen_spawn_posix.py", line 32, in __init__
    super().__init__(process_obj)
  File "/Users/xxx/opt/anaconda3/lib/python3.9/multiprocessing/popen_fork.py", line 19, in __init__
    self._launch(process_obj)
  File "/Users/xxx/opt/anaconda3/lib/python3.9/multiprocessing/popen_spawn_posix.py", line 47, in _launch
    reduction.dump(process_obj, fp)
  File "/Users/xxx/opt/anaconda3/lib/python3.9/multiprocessing/reduction.py", line 60, in dump
    ForkingPickler(file, protocol).dump(obj)
TypeError: cannot pickle 'weakref' object
Here is a simpler way to reproduce your issue:
from multiprocessing import Manager, Process

class A:
    def __init__(self):
        self.manager = Manager()

    def start(self):
        print("started")

if __name__ == "__main__":
    a = A()
    proc = Process(target=a.start)
    proc.start()
    proc.join()
You cannot pickle instances containing manager objects, because they hold a reference to the manager process they started (and, in general, you can't pickle instances containing Process objects).
A simple fix is not to store the manager. It will automatically be garbage collected once no references to the managed list remain:
def __init__(self, data, classes):
    self.data = data
    self.classes = classes
    manager = multiprocessing.Manager()
    self.pval = manager.list()
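Aside (not in the original answer): macOS defaults to the spawn start method from Python 3.8 on, so everything that reaches the child process, including the instance behind a bound-method target, has to be picklable. You can see the problem without starting any process at all; this sketch reuses the simplified class A from above:

import pickle
from multiprocessing import Manager

class A:
    def __init__(self):
        self.manager = Manager()

    def start(self):
        print("started")

if __name__ == "__main__":
    a = A()
    # Pickling the bound method pickles `a`, which drags the manager (and the
    # process it started) along; this is essentially what Process.start()
    # attempts under the spawn start method, and it fails.
    pickle.dumps(a.start)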

How to recover the return value of a function passed to multiprocessing.Process?

I have looked at this question to get started, and it works just fine: How can I recover the return value of a function passed to multiprocessing.Process?
In my case I would like to write a small tool that connects to many computers and gathers some statistics, with each stat gathered in its own Process to make it snappy. But as soon as I wrap the multiprocessing calls in a class for a machine, it fails.
Here is my code:
import multiprocessing
import pprint
import subprocess

def run_task(command):
    p = subprocess.Popen(command, stdout=subprocess.PIPE, universal_newlines=True, shell=False)
    result = p.communicate()[0]
    return result

MACHINE_NAME = "cptr_name"
A_STAT = "some_stats_A"
B_STAT = "some_stats_B"

class MachineStatsGatherer():
    def __init__(self, machineName):
        self.machineName = machineName
        manager = multiprocessing.Manager()
        self.localStats = manager.dict()  # creating a shared resource for the sub processes to use
        self.localStats[MACHINE_NAME] = machineName

    def gatherStats(self):
        self.runInParallel(
            self.GatherSomeStatsA,
            self.GatherSomeStatsB,
        )
        self.printStats()

    def printStats(self):
        pprint.pprint(self.localStats)

    def runInParallel(self, *fns):
        processes = []
        for fn in fns:
            process = multiprocessing.Process(target=fn, args=(self.localStats))
            processes.append(process)
            process.start()
        for process in processes:
            process.join()

    def GatherSomeStatsA(self, returnStats):
        # do some remote command, simplified here for the sake of debugging
        result = "Windows"
        returnStats[A_STAT] = result.find("Windows") != -1

    def GatherSomeStatsB(self, returnStats):
        # do some remote command, simplified here for the sake of debugging
        result = "Windows"
        returnStats[B_STAT] = result.find("Windows") != -1

def main():
    machine = MachineStatsGatherer("SOMEMACHINENAME")
    machine.gatherStats()
    return

if __name__ == '__main__':
    main()
And here is the error message
Traceback (most recent call last):
  File "C:\Users\mesirard\AppData\Local\Programs\Python\Python37\lib\multiprocessing\process.py", line 297, in _bootstrap
    self.run()
  File "C:\Users\mesirard\AppData\Local\Programs\Python\Python37\lib\multiprocessing\process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "d:\workdir\trunks6\Tools\VTKAppTester\Utils\NXMachineMonitorShared.py", line 45, in GatherSomeStatsA
    returnStats[A_STAT] = result.find("Windows") != -1
TypeError: 'str' object does not support item assignment
Process Process-3:
Traceback (most recent call last):
  File "C:\Users\mesirard\AppData\Local\Programs\Python\Python37\lib\multiprocessing\process.py", line 297, in _bootstrap
    self.run()
  File "C:\Users\mesirard\AppData\Local\Programs\Python\Python37\lib\multiprocessing\process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "d:\workdir\trunks6\Tools\VTKAppTester\Utils\NXMachineMonitorShared.py", line 50, in GatherSomeStatsB
    returnStats[B_STAT] = result.find("Windows") != -1
TypeError: 'str' object does not support item assignment
The issue is coming from this line:
process = multiprocessing.Process(target=fn, args=(self.localStats))
It needs an extra comma at the end of args, so that args is a one-element tuple:
process = multiprocessing.Process(target=fn, args=(self.localStats,))
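To spell out why the comma matters (an aside, not part of the original answer): without it, the parentheses are just grouping, so args is the shared dict proxy itself rather than a tuple containing it. Process stores tuple(args), i.e. a tuple of the dict's keys, so each worker receives the string "cptr_name" as returnStats, which is exactly why the traceback complains that a str does not support item assignment. A quick illustration:

d = {"cptr_name": "SOMEMACHINENAME"}
print((d))       # {'cptr_name': 'SOMEMACHINENAME'} -- still just the dict
print((d,))      # ({'cptr_name': 'SOMEMACHINENAME'},) -- a one-element tuple
print(tuple(d))  # ('cptr_name',) -- what ends up in the process args without the comma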

too many files open error with multiprocessing

I have code that uses multiprocessing over about 10000 files on a 12-core vCPU machine on Ubuntu.
import os
import multiprocessing
import nltk

def process_file(name):
    inp = open(name)
    out = open(name.split('.')[0] + 'wikiout.txt', 'a')
    for row in inp:
        text = row.strip()
        sent_text = nltk.sent_tokenize(text)
        for sent in sent_text:
            pass  # process sentence
    inp.close()
    out.close()

if __name__ == '__main__':
    processes = []
    for i in 'ABCDEF':
        for j in 'ABCDEFGHIJKLMNOPQRSTUVWXYZ':
            for k in range(100):
                filename = os.path.join(os.path.dirname(__file__), (i + j + '/' + 'wiki_' + str(k) + '.txt'))
                p = multiprocessing.Process(target=process_file, args=(filename,))
                processes.append(p)
                p.start()
    for process in processes:
        process.join()
For some reason I get this issue
File "wikirules.py", line 37, in <module>
p.start()
File "/usr/lib/python3.8/multiprocessing/process.py", line 121, in start
self._popen = self._Popen(self)
File "/usr/lib/python3.8/multiprocessing/context.py", line 224, in _Popen
return _default_context.get_context().Process._Popen(process_obj)
File "/usr/lib/python3.8/multiprocessing/context.py", line 277, in _Popen
return Popen(process_obj)
File "/usr/lib/python3.8/multiprocessing/popen_fork.py", line 19, in __init__
self._launch(process_obj)
File "/usr/lib/python3.8/multiprocessing/popen_fork.py", line 69, in _launch
child_r, parent_w = os.pipe()
OSError: [Errno 24] Too many open files
Traceback (most recent call last):
File "/usr/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
File "/usr/lib/python3.8/multiprocessing/process.py", line 108, in run
File "wikirules.py", line 13, in process_file
File "/usr/local/lib/python3.8/dist-packages/nltk/tokenize/__init__.py", line 106, in sent_tokenize
File "/usr/local/lib/python3.8/dist-packages/nltk/data.py", line 752, in load
File "/usr/local/lib/python3.8/dist-packages/nltk/data.py", line 877, in _open
File "/usr/local/lib/python3.8/dist-packages/nltk/data.py", line 327, in open
OSError: [Errno 24] Too many open files: '/root/nltk_data/tokenizers/punkt/PY3/english.pickle'
Any clue why this might be happening? I'm still new to multiprocessing, so shouldn't this open no more than 12 files at once?
Your code is trying to run
len('ABCDEF') * len('ABCD...Z') * len(range(100)) = 6 * 26 * 100 = 15,600
operating system processes simultaneously.
The multiprocessing module actually contains relatively low-level primitives for working with processes; for basic tasks the standard library offers a safer and more convenient option, the concurrent.futures module, which provides pool implementations for both threads and processes and is very useful for "embarrassingly parallel" workloads.
Here is an example of how the code from your question could be transformed using concurrent.futures and some other Python features such as generators, context managers and the pathlib module.
import concurrent.futures as futures
import itertools
import pathlib

import nltk

BASE_PATH = pathlib.Path(__file__).parent.absolute()

def filename_generator():
    """produce filenames sequence"""
    for i, j, k in itertools.product("ABCDEF", "ABCDEFGHIJKLMNOPQRSTUVWXYZ", range(100)):
        yield BASE_PATH / f"{i}{j}/wiki_{k}.txt"

def worker(filename: pathlib.Path):
    """do all the job"""
    out_filename = filename.with_suffix('.wikiout.txt')
    with open(filename) as inp, open(out_filename, "a") as out:
        for row in inp:
            text = row.strip()
            sent_text = nltk.sent_tokenize(text)
            for sent in sent_text:
                """process sentence"""

def main():
    with futures.ProcessPoolExecutor() as pool:
        # mapping future->filename, useful in case of error
        task_to_filename = {pool.submit(worker, f): f for f in filename_generator()}
        for f in futures.as_completed(task_to_filename):
            try:
                f.result()
            except Exception as e:
                filename = task_to_filename[f]
                print(f"{filename} processing failed: {e}")

if __name__ == "__main__":
    main()
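As a follow-up note (not part of the original answer): ProcessPoolExecutor caps the number of worker processes at os.cpu_count() by default, so on the 12-vCPU machine from the question at most 12 files are processed at a time. The cap can also be set explicitly:

# explicit worker cap; the default is os.cpu_count()
with futures.ProcessPoolExecutor(max_workers=12) as pool:
    task_to_filename = {pool.submit(worker, f): f for f in filename_generator()}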

How to get the input from the subprocess?

I have a Python program that will be called by another Python program via a subprocess.
The called program is like this:
import argparse
import sys
from base64 import b64decode
import json

def set_key(args, **kwargs):
    path = "/tmp/key"
    print(kwargs)
    try:
        with open(path, 'wb') as open_file:
            b = b64decode(kwargs)
            open_file.write(b)
        return Result(True)
    except OSError as e:
        return Result(False)
    return Result(True)

commands_map = {
    "set-key": set_key,
}

def main(sys_argv=None):
    parser = argparse.ArgumentParser(
        description="a util command"
    )
    parser.add_argument("-N", dest="check_testing", action="store_true",
                        help="check testing")
    parser.add_argument("cmd", choices=sorted(list(commands_map)))
    args, remaining = parser.parse_known_args(sys_argv)
    # Not entirely sure there's much benefit to this check.
    if args.check_testing:
        return 1 if args.testing else 0
    result, vals = commands_map[args.cmd](
        remaining, testing=args.testing,
        check_testing=args.check_testing)
    print(json.dumps(vals, indent=2, sort_keys=True))
    return 0 if result else 1

if __name__ == "__main__":
    sys.exit(main())
And the caller program is:
import subprocess

input = b'{\n "keys": "l0Pu3TlknxWqTZwDG1yJcjUDGcBH7c8F19fkxeNmBl/2wXQoochlbxLTKhgkzeQNRDvFkQfMBcdlsbcxrrEQX+IydyiLkU5o8Gmhe2JGP56CNPLIefl9WPvLlPQBdvjEWO2UBaBjo2VW3Xsd1Ng+xFSUbP/ls7dso+h5/Ty37Rw="\n}'
cmda = ['python3', 'test.py', 'set-key']

try:
    s = subprocess.run(cmda, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, input=input)
except subprocess.CalledProcessError as e:
    print(e.stderr)
When I execute the caller program, it complains that kwargs in the subprocess program is a dict:
b'Traceback (most recent call last):\n File "test.py", line 46, in <module>\n sys.exit(main())\n File "test.py", line 40, in main\n check_testing=args.check_testing)\n File "test.py", line 12, in set_key\n b = b64decode(kwargs)\n File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/base64.py", line 80, in b64decode\n s = _bytes_from_decode_data(s)\n File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/base64.py", line 46, in _bytes_from_decode_data\n "string, not %r" % s.__class__.__name__) from None\nTypeError: argument should be a bytes-like object or ASCII string, not \'dict\'\n'
How can I properly pass the input to the subprocess program so that I can write it to a file?
Also, args and kwargs seem to be empty (I expected them to be filled from the input passed via the subprocess call). Does anyone know why they are empty?
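A hedged reading, since this section carries no accepted answer: set_key never reads the bytes that the caller pipes in via input=; that data sits on the child's stdin. The kwargs it receives are only the keyword arguments that main() itself passes (testing=..., check_testing=...), which is why b64decode() complains about getting a dict. A minimal sketch of set_key reading the JSON payload from stdin instead; the (ok, payload) return shape stands in for the undefined Result class and is purely illustrative:

import json
import sys
from base64 import b64decode

def set_key(args, **kwargs):
    path = "/tmp/key"
    payload = json.loads(sys.stdin.read())  # the bytes passed via subprocess.run(..., input=...)
    try:
        with open(path, 'wb') as open_file:
            open_file.write(b64decode(payload["keys"]))
        return True, {"written": path}
    except (OSError, KeyError, ValueError) as e:
        return False, {"error": str(e)}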

Shared arrays in multiprocessing Python

I'm trying to write to the same shared array from a parallel processing Python script.
When I do it outside a class, in a normal script, everything works fine. But when I try to do it through a class (using the same code), I get
RuntimeError: SynchronizedArray objects should only be shared between processes through inheritance.
My script is the following (without a class):
import numpy
import ctypes
from multiprocessing import Pool, Array, cpu_count

n = 2

total_costs_matrix_base = Array(ctypes.c_double, n*n)
total_costs_matrix = numpy.ctypeslib.as_array(
    total_costs_matrix_base.get_obj())
total_costs_matrix = total_costs_matrix.reshape(n, n)

def set_total_costs_matrix(i, j, def_param=total_costs_matrix_base):
    total_costs_matrix[i, j] = i * j

if __name__ == "__main__":
    pool = Pool(processes=cpu_count())
    iterable = []
    for i in range(n):
        for j in range(i+1, n):
            iterable.append((i, j))
    pool.starmap(set_total_costs_matrix, iterable)
    total_costs_matrix.dump('some/path/to/file')
That script works well. The one that doesn't is the following (which uses a class):
import numpy
import ctypes
from multiprocessing import Pool, Array, cpu_count

class CostComputation(object):
    """Computes the cost matrix."""

    def __init__(self):
        self.n = 2
        self.total_costs_matrix_base = Array(ctypes.c_double, self.n*self.n)
        self.total_costs_matrix = numpy.ctypeslib.as_array(
            self.total_costs_matrix_base.get_obj())
        self.total_costs_matrix = self.total_costs_matrix.reshape(self.n, self.n)

    def set_total_costs_matrix(self, i, j, def_param=None):
        def_param = self.total_costs_matrix_base
        self.total_costs_matrix[i, j] = i * j

    def write_cost_matrix(self):
        pool = Pool(processes=cpu_count())
        iterable = []
        for i in range(self.n):
            for j in range(i+1, self.n):
                iterable.append((i, j))
        pool.starmap(self.set_total_costs_matrix, iterable)
        self.total_costs_matrix.dump('some/path/to/file')
After this, I call write_cost_matrix from another file, after creating an instance of CostComputation.
I read this answer but still couldn't solve my problem.
I'm using Python 3.4.2 on Mac OS X Yosemite 10.10.4.
EDIT
When using the class CostComputation, the script I'm using is:
from cost_computation import CostComputation

cc = CostComputation()
cc.write_cost_matrix()
The whole error is:
Traceback (most recent call last):
  File "app.py", line 65, in <module>
    cc.write_cost_matrix()
  File "/path/to/cost_computation.py", line 75, in write_cost_matrix
    pool.starmap(self.set_total_costs_matrix, iterable)
  File "/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/multiprocessing/pool.py", line 268, in starmap
    return self._map_async(func, iterable, starmapstar, chunksize).get()
  File "/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/multiprocessing/pool.py", line 599, in get
    raise self._value
  File "/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/multiprocessing/pool.py", line 383, in _handle_tasks
    put(task)
  File "/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/multiprocessing/connection.py", line 206, in send
    self._send_bytes(ForkingPickler.dumps(obj))
  File "/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/multiprocessing/reduction.py", line 50, in dumps
    cls(buf, protocol).dump(obj)
  File "/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/multiprocessing/sharedctypes.py", line 192, in __reduce__
    assert_spawning(self)
  File "/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/multiprocessing/context.py", line 347, in assert_spawning
    ' through inheritance' % type(obj).__name__
RuntimeError: SynchronizedArray objects should only be shared between processes through inheritance
Try creating a second class which contains the shared data only. Then use that class object in your main class.
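A minimal sketch of that idea (not from the original answer): keep the shared Array in a small holder class, hand only the raw Array to the pool workers through the pool initializer (that is, through inheritance, when the worker processes are created), and give the pool a module-level worker function so the CostComputation instance itself never has to be pickled. The names SharedCostMatrix, _init_worker and _set_cell are illustrative, not from the question.

import ctypes
import numpy
from multiprocessing import Pool, Array, cpu_count

class SharedCostMatrix:
    """Second class that holds only the shared data."""
    def __init__(self, n):
        self.n = n
        self.base = Array(ctypes.c_double, n * n)

    def as_numpy(self):
        # build the numpy view on demand, so only the raw Array is ever handed around
        return numpy.ctypeslib.as_array(self.base.get_obj()).reshape(self.n, self.n)

_worker_matrix = None  # set in each pool worker by the initializer

def _init_worker(base, n):
    # re-create the numpy view inside the worker on top of the same shared buffer
    global _worker_matrix
    _worker_matrix = numpy.ctypeslib.as_array(base.get_obj()).reshape(n, n)

def _set_cell(ij):
    i, j = ij
    _worker_matrix[i, j] = i * j

class CostComputation(object):
    """Computes the cost matrix without giving the pool anything unpicklable."""
    def __init__(self, n=2):
        self.shared = SharedCostMatrix(n)

    def write_cost_matrix(self):
        n = self.shared.n
        iterable = [(i, j) for i in range(n) for j in range(i + 1, n)]
        pool = Pool(processes=cpu_count(),
                    initializer=_init_worker,
                    initargs=(self.shared.base, n))
        pool.map(_set_cell, iterable)
        pool.close()
        pool.join()
        self.shared.as_numpy().dump('some/path/to/file')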
