I'm facing some issues with string parsing and the multiprocessing library. Here is my code; below it I outline the function calls and the error.
def semi_func(tile):
with open(tile, 'rb') as f:
img = Image.open(BytesIO(f.read()))
resized_im, seg_map = MODEL.run(img)
vis_segmentation_tiles(str(tile),resized_im, seg_map)
x = np.unique(seg_map)
x = x.tolist()
print("THIS IS X", x)
ans_tiles[str(tile)] = x
print(x)
return ans_tiles
def split_tiles_new(image_path, tiledir):
print("1")
pool = Pool(processes=5)
print("2")
num_tiles = 9
tiles = image_slicer.slice(image_path, num_tiles, save=False)
print("3")
print(tiles)
image_slicer.save_tiles(tiles, directory=tiledir)
print(tiles)
print("TILES ABOEVE")
onlytiles = [os.path.join(tiledir,f) for f in listdir(tiledir) if isfile(join(tiledir, f))]
ans_tiles = {}
print(onlytiles)
onlytiles = list(map(str, onlytiles))
for t in onlytiles:
print(t)
for tile in onlytiles:
print(tile)
pool.map(semi_func,tile)
pool.close()
pool.join()
print(ans_tiles)
return ans_tiles
Here's how I call the function:
local_jpg = 'wheat044146108.jpg'
tiledir = 'tiles044146108'
ans_tiles = split_tiles_new(local_jpg, tiledir)
Inside tiledir (the directory), there's a bunch of tiled images:
['tiles044146108/_03_02.png', 'tiles044146108/_03_01.png', 'tiles044146108/_02_02.png', 'tiles044146108/_01_01.png', 'tiles044146108/_03_03.png', 'tiles044146108/_01_02.png', 'tiles044146108/_02_01.png', 'tiles044146108/_02_03.png', 'tiles044146108/_01_03.png']
That's what is in the variable 'onlytiles'.
But my issue is this error:
multiprocessing.pool.RemoteTraceback:
"""
Traceback (most recent call last):
File "/usr/lib/python3.7/multiprocessing/pool.py", line 121, in worker
result = (True, func(*args, **kwds))
File "/usr/lib/python3.7/multiprocessing/pool.py", line 44, in mapstar
return list(map(*args))
File "serve_wh.py", line 128, in semi_func
with open(tile, 'rb') as f:
FileNotFoundError: [Errno 2] No such file or directory: 't'
"""
I am not sure why it is slicing the string further. Any idea how I can make it grab each file from the 'onlytiles' list separately?
Your iterable is a single filename string, which is why it tries to open a file named 't'. Check Pool.map's second argument:
pool.map(semi_func,tile)
You should use
pool.map(semi_func,onlytiles)
without the for loop, so that it iterates over the list rather than over a string.
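Putting that together, here is a minimal sketch of the corrected functions (assuming the question's MODEL, vis_segmentation_tiles, image_slicer, and imports). Note also that worker processes do not share memory with the parent, so writing into ans_tiles inside semi_func would never update the parent's dict; returning a (key, value) pair and building the dict from pool.map's results is more reliable:
def semi_func(tile):
    # tile is a single file path; pool.map hands each worker one element
    with open(tile, 'rb') as f:
        img = Image.open(BytesIO(f.read()))
    resized_im, seg_map = MODEL.run(img)
    vis_segmentation_tiles(str(tile), resized_im, seg_map)
    # return the result instead of mutating a dict the parent can't see
    return str(tile), np.unique(seg_map).tolist()
def split_tiles_new(image_path, tiledir):
    tiles = image_slicer.slice(image_path, 9, save=False)
    image_slicer.save_tiles(tiles, directory=tiledir)
    onlytiles = [os.path.join(tiledir, f) for f in os.listdir(tiledir)
                 if os.path.isfile(os.path.join(tiledir, f))]
    with Pool(processes=5) as pool:
        results = pool.map(semi_func, onlytiles)  # the whole list, no loop
    return dict(results)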
I have a function that calls a sub-function to open up a file. I am trying to test the parent function, but I want to patch the sub-function and have it return the data I pass in (as if it read from a file).
tests.py
# Read in the sample data
__SAMPLE_LOG = os.path.join(settings.BASE_DIR, "apps/tests/log_viewer/sample_logs/sample_manager_log.log")
sample_data = []
for line in reversed_lines(open(__SAMPLE_LOG)):
sample_data.append(line)
sample_data = ('').join(sample_data)
class ReadLog(TestCase):
@patch('apps.log_viewer.utils.reversed_lines', new_callable = mock_open, read_data = sample_data)
def test_returnsDictionaryContainingListOfDictionaries(self, mock_file):
activity = read_log()
# Make sure the sample data was read ==> this fails.
self.assertEqual(open(settings.ACTIVITY_LOG_FILE).read(), sample_data)
utils.py
def read_log():
# This is the line I am trying to patch
for line in reversed_lines(open(settings.ACTIVITY_LOG_FILE)):
# process data
# see: https://stackoverflow.com/questions/260273/most-efficient-way-to-search-the-last-x-lines-of-a-file-in-python/260433#260433
def reversed_lines(file):
"Generate the lines of file in reverse order."
part = ''
for block in reversed_blocks(file):
for c in reversed(block):
if c == '\n' and part:
yield part[::-1]
part = ''
part += c
if part: yield part[::-1]
def reversed_blocks(file, blocksize=4096):
"Generate blocks of file's contents in reverse order."
file.seek(0, os.SEEK_END)
here = file.tell()
while 0 < here:
delta = min(blocksize, here)
here -= delta
file.seek(here, os.SEEK_SET)
yield file.read(delta)
The error
I am trying to patch reversed_lines() in utils.py within the read_log() method, but read_log() is still reading from the actual log, indicating that I am not patching reversed_lines() correctly.
When I change
@patch('apps.log_viewer.utils.reversed_lines', new_callable = mock_open, read_data = sample_data)
to
@patch('builtins.open', new_callable = mock_open, read_data = sample_data)
I get
======================================================================
ERROR: test_returnsDictionaryContainingListOfDictionaries
(tests.log_viewer.test_utils.ReadLog)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/usr/local/Cellar/python/3.7.4/Frameworks/Python.framework/Versions/3.7/lib/python3.7/unittest/mock.py", line 1209, in patched
return func(*args, **keywargs)
File "/webapp/apps/tests/log_viewer/test_utils.py", line 32, in test_returnsDictionaryContainingListOfDictionaries
activity = read_log()
File "/webapp/apps/log_viewer/utils.py", line 64, in read_log
for line in reversed_lines(open(settings.ACTIVITY_LOG_FILE)):
File "/webapp/apps/log_viewer/utils.py", line 173, in reversed_lines
for block in reversed_blocks(file):
File "/webapp/apps/log_viewer/utils.py", line 164, in reversed_blocks
while 0 < here:
TypeError: '<' not supported between instances of 'int' and 'MagicMock'
Where am I going wrong?
Following the example from the docs at https://docs.python.org/3.3/library/unittest.mock.html#mock-open, I think you want
@patch('builtins.open', mock_open(read_data = sample_data), create=True)
However, reading through the source of mock_open: https://github.com/python/cpython/blob/3.7/Lib/unittest/mock.py#L2350
It appears that the tell method for filehandles is not implemented by the mock. The only supported methods are read, readline, readlines, write, and iterating over the contents, so you'll need to set up the mock for the tell method manually. For patch to hand the mock to your test as mock_file, pass it via new_callable rather than positionally. This is not a general implementation, but it will work in your specific case:
class ReadLog(TestCase):
    @patch('builtins.open', new_callable = mock_open, read_data = sample_data)
    def test_returnsDictionaryContainingListOfDictionaries(self, mock_file):
        mock_file.return_value.tell.return_value = len(sample_data)
...
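As an aside, the question's reversed_blocks also calls seek(), which a bare MagicMock silently accepts without moving any data, and read(delta), whose size argument mock_open ignores in 3.7. If you ever need those to behave realistically, one option is to delegate the handle's file operations to a real io.StringIO. A sketch, where make_seekable_mock is a hypothetical helper of mine, not part of unittest.mock:
import io
from unittest.mock import mock_open, patch
def make_seekable_mock(data):
    # back the mocked handle with a real in-memory file so that
    # seek/tell/read(size) all behave like a genuine text file
    m = mock_open(read_data=data)
    backing = io.StringIO(data)
    handle = m.return_value
    handle.seek.side_effect = backing.seek
    handle.tell.side_effect = backing.tell
    handle.read.side_effect = backing.read
    return m
# usage:
# with patch('builtins.open', make_seekable_mock(sample_data)):
#     activity = read_log()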
I'm working through https://testdriven.io/developing-an-asynchronous-task-queue-in-python. I've also taken a look at "sys.argv[1] meaning in script" for clarification on sys.argv.
From the former I have:
def save_file(filename, data):
random_str = uuid.uuid4().hex
outfile = f'{filename}_{random_str}.txt'
with open(os.path.join(OUTPUT_DIRECTORY, outfile), 'w') as outfile:
outfile.write(data)
def get_word_counts(filename):
wordcount = collections.Counter()
# get counts
with open(os.path.join(DATA_DIRECTORY, filename), 'r') as f:
for line in f:
wordcount.update(line.split())
for word in set(COMMON_WORDS):
del wordcount[word]
# save file
save_file(filename, json.dumps(dict(wordcount.most_common(20))))
# simulate long-running task
time.sleep(2)
proc = os.getpid()
print(f'Processed {filename} with process id: {proc}')
if __name__ == '__main__':
print(sys.argv, len(sys.argv))
# print(sys.argv[1], len(sys.argv))
get_word_counts(sys.argv[1])
When I run it directly, I get:
$ python tasks.py
['tasks.py'] 1
Traceback (most recent call last):
File "tasks.py", line 46, in <module>
get_word_counts(sys.argv[1])
IndexError: list index out of range
Given that you can see there is only one element in the list, why did the author write the code this way?
sys.argv[0] is always the name of the script itself, so the list has at least one element; sys.argv[1] is the first command-line argument after the script name. The author wrote
get_word_counts(sys.argv[1])
because the script is meant to be invoked with a data filename, e.g. python tasks.py somefile.txt. Running plain python tasks.py leaves sys.argv with a single element, hence the IndexError. (Changing it to sys.argv[0] would only make the script count the words of tasks.py itself.)
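If you want a clearer failure than an IndexError when the filename is missing, a small guard helps; a sketch of the __main__ block:
if __name__ == '__main__':
    if len(sys.argv) < 2:
        sys.exit('usage: python tasks.py <filename>')
    get_word_counts(sys.argv[1])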
I have a large dataframe that I would like to write to different files depending on the value in a particular column.
The first function takes a dictionary where the key is the file to write out to and the value is a numpy array which is a subset of the original dataframe.
def write_in_parallel(inputDict):
for key,value in inputDict.items():
df = pd.DataFrame(value)
with open(baseDir + outDir + outputFileName + key + outputFileType, 'a') as oFile:
            df.to_csv(oFile, sep = '|', index = False, header = False)
print("Finished writing month: " + outputFileName + key)
The second function takes the column values used to partition the dataframe, plus the dataframe itself, and returns a dictionary of slices.
def make_slices(files, df):
outlist = dict()
for item in files:
data = np.array(df[df.iloc[:,1] == item])
outlist[item] = data
return outlist
The final function uses multiprocessing to call write_in_parallel, iterating over the dictionary from make_slices, hopefully in parallel.
def make_dynamic_columns():
perfPath = baseDir + rawDir
perfFiles = glob.glob(perfPath + "/*" + inputFileType)
perfFrame = pd.DataFrame()
for file_ in perfFiles:
df = pd.read_table(file_, delimiter = '|', header = None)
df.fillna(missingDataChar,inplace=True)
df.iloc[:,1] = df.iloc[:,1].astype(str)
fileList = list(df.iloc[:, 1].astype('str').unique())
with mp.Pool(processes=10) as pool:
pool.map(write_in_parallel, make_slices(fileList, df))
The error I am getting is 'str' object has no attribute 'items', which leads me to believe that write_in_parallel is not receiving the dictionary from pool.map. I am not sure how to solve this issue. Any help is greatly appreciated.
multiprocessing.pool.RemoteTraceback:
"""
Traceback (most recent call last):
File "/home/ssun/library/python/Python-3.5.2/build/lib/python3.5/multiprocessing/pool.py", line 119, in worker
result = (True, func(*args, **kwds))
File "/home/ssun/library/python/Python-3.5.2/build/lib/python3.5/multiprocessing/pool.py", line 44, in mapstar
return list(map(*args))
File "_FHLMC_LLP_dataprep.py", line 22, in write_in_parallel
for key,value in dict.items():
AttributeError: 'str' object has no attribute 'items'
"""
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "_FHLMC_LLP_dataprep.py", line 59, in <module>
make_dynamic_columns_freddie()
File "_FHLMC_LLP_dataprep.py", line 55, in make_dynamic_columns_freddie
pool.map(write_in_parallel, dictinput)
File "/home/ssun/library/python/Python-3.5.2/build/lib/python3.5/multiprocessing/pool.py", line 260, in map
return self._map_async(func, iterable, mapstar, chunksize).get()
File "/home/ssun/library/python/Python-3.5.2/build/lib/python3.5/multiprocessing/pool.py", line 608, in get
raise self._value
AttributeError: 'str' object has no attribute 'items'
Your problem is that make_slices returns a dictionary, not a list, and pool.map() does not like that: it just passes your dictionary's keys to the workers, so each worker receives a string, not the dictionary (try printing what you receive as inputDict).
def make_slices(files, df):
outlist = []
for item in files:
data = df + item
outlist.append({item: data})
return outlist
Could you try something like this, so that you actually return a list? Its members are then single-item dictionaries. (I had to modify your code to just put something in data for testing.)
This way you can receive a key and a related data item in your worker if that is what you want to do.
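Adapting that back to the question's own slicing, a sketch (assuming the question's module-level baseDir, outDir, outputFileName, and outputFileType): make_slices returns a list of single-item dictionaries, and each worker writes one of them out:
def make_slices(files, df):
    # a list of {key: slice} dicts, so pool.map gives each worker one dict
    outlist = []
    for item in files:
        data = np.array(df[df.iloc[:, 1] == item])
        outlist.append({item: data})
    return outlist
def write_in_parallel(inputDict):
    for key, value in inputDict.items():
        frame = pd.DataFrame(value)
        path = baseDir + outDir + outputFileName + key + outputFileType
        with open(path, 'a') as oFile:
            frame.to_csv(oFile, sep='|', index=False, header=False)
        print("Finished writing month: " + outputFileName + key)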
I have a folder called pdfs. I first obtain a list of the files and print them:
import ghostscript, os
from os import listdir
from os.path import isfile, join
def get_files(path):
input_files = [f for f in listdir(path) if isfile(join(path, f))]
return input_files
def pdf2jpeg(pdf_input_path, jpeg_output_path):
args = ["pdf2jpeg", # actual value doesn't matter
"-dNOPAUSE",
"-sDEVICE=jpeg",
"-dJPEGQ=95",
"-r600x600",
"-sOutputFile=" + jpeg_output_path,
pdf_input_path]
ghostscript.Ghostscript(*args)
if __name__ == '__main__':
input_files = get_files("pdfs")
# pdf2jpeg("pdfs/test1.pdf", "jpgs/test1.jpg")
for input_file in input_files:
input_file_name = str("pdfs/"+str(input_file))
output_file_name = str('jpgs/'+str(input_file).replace(" ", "_").replace("pdf", "jpg"))#split(".")[0]
print input_file_name
print output_file_name
# pdf2jpeg(input_file_name, output_file_name)
OUTPUT:
pdfs/test1 (5th copy).pdf
jpgs/test1_(5th_copy).jpg
pdfs/test1 (copy).pdf
jpgs/test1_(copy).jpg
pdfs/test1 (4th copy).pdf
jpgs/test1_(4th_copy).jpg
pdfs/test1 (3rd copy).pdf
jpgs/test1_(3rd_copy).jpg
pdfs/test1 (another copy).pdf
jpgs/test1_(another_copy).jpg
Also, when I execute pdf2jpeg("pdfs/test1.pdf", "jpgs/test1.jpg") the code works and I get the converted jpg.
Now I want to loop through the list and uncomment the last line: pdf2jpeg(input_file_name, output_file_name)
if __name__ == '__main__':
input_files = get_files("pdfs")
# pdf2jpeg("pdfs/test1.pdf", "jpgs/test1.jpg")
for input_file in input_files:
input_file_name = str("pdfs/"+str(input_file))
output_file_name = str('jpgs/'+str(input_file).replace(" ", "_").replace("pdf", "jpg"))#split(".")[0]
print input_file_name
print output_file_name
pdf2jpeg(input_file_name, output_file_name)
I GET THIS ERROR:
Traceback (most recent call last):
File "gsPdf2Jpg.py", line 28, in <module>
pdf2jpeg(input_file_name, output_file_name)
File "gsPdf2Jpg.py", line 17, in pdf2jpeg
ghostscript.Ghostscript(*args)
File "/home/trackstarz/prohealth/phenv/local/lib/python2.7/site-packages/ghostscript/__init__.py", line 157, in Ghostscript
stderr=kw.get('stderr', None))
File "/home/trackstarz/prohealth/phenv/local/lib/python2.7/site-packages/ghostscript/__init__.py", line 72, in __init__
rc = gs.init_with_args(instance, args)
File "/home/trackstarz/prohealth/phenv/local/lib/python2.7/site-packages/ghostscript/_gsprint.py", line 177, in init_with_args
raise GhostscriptError(rc)
ghostscript._gsprint.GhostscriptError: limitcheck
I went through and changed the loop to process input_files[0] and input_files[1] individually, and they work; the moment I loop over them all, they stop working. The only thing I can think of is that I have to clear something from memory or disconnect from the file. I am just taking wild guesses here.
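One way to test the "clear something from memory" guess is to give every conversion its own process by shelling out to the gs binary instead of calling the ghostscript bindings in-process. A sketch, assuming gs is on the PATH (flag values copied from the question):
import subprocess
def pdf2jpeg_cli(pdf_input_path, jpeg_output_path):
    # each call runs in a fresh Ghostscript process, so no interpreter
    # state survives from one conversion to the next
    subprocess.check_call([
        "gs", "-dNOPAUSE", "-dBATCH", "-dSAFER",
        "-sDEVICE=jpeg", "-dJPEGQ=95", "-r600x600",
        "-sOutputFile=" + jpeg_output_path,
        pdf_input_path,
    ])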
I'm trying to write a Python script that will crawl through a directory, find all files that are duplicates, and report back the duplicates. What's the best way to solve this?
import os, sys
def crawlDirectories(directoryToCrawl):
crawledDirectory = [os.path.join(path, subname) for path, dirnames, filenames in os.walk(directoryToCrawl) for subname in dirnames + filenames]
return crawledDirectory
#print 'Files crawled',crawlDirectories(sys.argv[1])
directoriesWithSize = {}
def getByteSize(crawledDirectory):
for eachFile in crawledDirectory:
size = os.path.getsize(eachFile)
directoriesWithSize[eachFile] = size
return directoriesWithSize
getByteSize(crawlDirectories(sys.argv[1]))
#print directoriesWithSize.values()
duplicateItems = {}
def compareSizes(dictionaryDirectoryWithSizes):
for key,value in dictionaryDirectoryWithSizes.items():
if directoriesWithSize.values().count(value) > 1:
duplicateItems[key] = value
compareSizes(directoriesWithSize)
#print directoriesWithSize.values().count(27085)
print duplicateItems
Why does this throw this error?
Traceback (most recent call last):
File "main.py", line 16, in <module>
getByteSize(crawlDirectories(sys.argv[1]))
File "main.py", line 12, in getByteSize
size = os.path.getsize(eachFile)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/genericpath.py", line 49, in getsize
OSError: [Errno 2] No such file or directory: '../Library/Containers/com.apple.ImageKit.RecentPictureService/Data/Documents/iChats'
It seems to me that your crawlDirectories function is too complicated:
def crawlDirectories(directoryToCrawl):
output = []
for path, dirnames, filenames in os.walk(directoryToCrawl):
for fname in filenames:
output.append(os.path.join(path,fname))
return output
I'd suggest trying:
def crawlDirectories(directoryToCrawl):
    crawledDirectory = [os.path.realpath(os.path.join(p, name))
                        for (p, d, filenames) in os.walk(directoryToCrawl)
                        for name in filenames]
    return crawledDirectory
That is, use a canonical path instead of relative paths in your crawl.
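Note that equal sizes only suggest duplication; the usual refinement is to group by size and then compare a content hash within each group. A sketch of that approach (my own, not part of the answer above):
import hashlib, os, sys
from collections import defaultdict
def find_duplicates(root):
    by_size = defaultdict(list)
    for path, dirnames, filenames in os.walk(root):
        for name in filenames:
            full = os.path.realpath(os.path.join(path, name))
            try:
                by_size[os.path.getsize(full)].append(full)
            except OSError:
                pass  # broken symlink or unreadable entry; skip it
    by_hash = defaultdict(list)
    for size, files in by_size.items():
        if len(files) > 1:  # only hash files that share a size
            for fname in files:
                with open(fname, 'rb') as fh:
                    by_hash[hashlib.md5(fh.read()).hexdigest()].append(fname)
    return dict((h, fs) for h, fs in by_hash.items() if len(fs) > 1)
if __name__ == '__main__':
    print(find_duplicates(sys.argv[1]))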