I often find myself writing a Python script which takes parameters:
python my_script.py input_file output_file other_parameter_a other_parameter_b optional_parameter_c
Now, I want the option to either run the script on a single file, as the above would do, or run it on every single file in a directory. I find myself writing a new script my_script_run_on_directory.py that looks up every file in a directory and then calls my_script.py. So, I would have:
python my_script_run_on_directory.py directory_input directory_output other_parameter_a other_parameter_b optional_parameter_c
I need to do this often and I keep writing a new directory script for each my_script. Is there a better way to do this? I thought of using decorators but not sure what the best way to do this is.
I suppose what I want is something like
python general_run_on_directory_script.py my_script directory_input directory_output <and all other parameters needed for my_script>
As for your question on what to use. In general, I'd say abstract the generic code away in a function that takes a specific function as an argument. Using a decorator is a rather clean way to do this. So in my opinion, yes it is a good solution.
Simple case (always expecting the same argument for your function):
import os
#Define decorator, takes the function to execute as an argument
def dir_or_file_decorator(func):
    """Make a function that handles one path also accept a directory.

    The returned wrapper takes a single ``path`` argument.  If ``path`` is
    a directory, ``func`` is called once for every entry ``os.listdir``
    reports for it (joined onto ``path``, in sorted order); otherwise
    ``func`` is called once with ``path`` itself.  Return values of
    ``func`` are discarded and the wrapper returns ``None``.
    """
    # Local import: the surrounding snippet only imports os.
    import functools

    @functools.wraps(func)  # keep func's __name__/__doc__ on the wrapper
    def newFunc(path):
        if os.path.isdir(path):
            # Sorted so the processing order does not depend on the filesystem.
            for filename in sorted(os.listdir(path)):
                func(os.path.join(path, filename))
        else:
            func(path)

    return newFunc
#Define the function we want to decorate
@dir_or_file_decorator
def print_file_name(filepath):
    """Print the path it receives; the decorator fans a directory out into its entries."""
    print(filepath)
#Run some tests
print('Testing file')
print_file_name(r'c:\testdir\testfile1.txt')
print('Testing dir')
print_file_name(r'c:\testdir')
#The @decorator syntax is just syntactic sugar. The code below shows what actually happens
def print_file_name2(filepath):
    """Print ``filepath``; left undecorated so the wrapping can be applied by hand."""
    print(filepath)
# Applying the decorator by hand: call it with the function and keep the wrapper.
decorated_func = dir_or_file_decorator(print_file_name2)
print('Testing file')
decorated_func(r'c:\testdir\testfile1.txt')
print('Testing dir')
decorated_func(r'c:\testdir')
#Output:
# Testing file
# c:\testdir\testfile1.txt
# Testing dir
# c:\testdir\testfile1.txt
# c:\testdir\testfile2.txt
More complicated cases:
Extra arguments in your functions:
import os
def dir_or_file_decorator(func):
    """Make a path-taking function also accept a directory, forwarding extra arguments.

    The wrapper's first parameter is ``path`` (positional or by keyword);
    every other positional and keyword argument is passed through to
    ``func`` unchanged.  For a directory, ``func`` is called once per
    entry of ``os.listdir(path)`` (joined onto ``path``, in sorted order);
    otherwise it is called once with ``path``.  The wrapper returns ``None``.
    """
    # Local import: the surrounding snippet only imports os.
    import functools

    @functools.wraps(func)  # keep func's __name__/__doc__ on the wrapper
    def newFunc(path, *args, **kwargs):
        if os.path.isdir(path):
            # Sorted so the processing order does not depend on the filesystem.
            for filename in sorted(os.listdir(path)):
                func(os.path.join(path, filename), *args, **kwargs)
        else:
            func(path, *args, **kwargs)

    return newFunc
@dir_or_file_decorator
def print_file_name_and_args(path, extra):
    """Print ``extra`` followed by ``path``, separated by one space."""
    print('%s %s' % (extra, path))
# Positional call: the decorator's wrapper takes the path as its first positional argument.
print_file_name_and_args(r'c:\testdir', 'extra for test 1')
# Keyword call: the wrapper's first parameter is named ``path``, so it can also be passed by name.
print_file_name_and_args(extra='extra for test 1', path=r'c:\testdir')
# Mixed call: positional path plus keyword extra (possible, but more complicated and hence more error-prone).
print_file_name_and_args(r'c:\testdir', extra='extra for test 1')
#Output (in all 3 cases):
# extra for test 1 c:\testdir\testfile1.txt
# extra for test 1 c:\testdir\testfile2.txt
Having to return values as well:
import os
def dir_or_file_decorator_with_results(concatenateResultFunc):
    """Build a file-or-directory decorator whose wrapper returns results.

    ``concatenateResultFunc`` receives the list of per-entry results when
    the wrapped function is called on a directory, and its return value
    becomes the wrapper's return value.  For a non-directory path the
    wrapper returns whatever the wrapped function returns, unmodified.
    """
    # Local import: the surrounding snippet only imports os.
    import functools

    def dir_or_file_decorator(func):
        @functools.wraps(func)  # keep func's __name__/__doc__ on the wrapper
        def newFunc(path, *args, **kwargs):
            if not os.path.isdir(path):
                return func(path, *args, **kwargs)
            # Sorted so the order of the combined results is reproducible.
            results = [
                func(os.path.join(path, filename), *args, **kwargs)
                for filename in sorted(os.listdir(path))
            ]
            return concatenateResultFunc(results)

        return newFunc

    return dir_or_file_decorator
#Our function to concatenate the results in case of a directory
def concatenate_results(results):
    """Join the per-file result strings into one comma-separated string."""
    return ','.join(results)
#We pass the function used to concatenate the results in case of a directory when we apply the decorator
#What happens is that we create a new dir_or_file_decorator that uses the specified concatenateResultFunc
#That newly created decorator is then applied to our function
@dir_or_file_decorator_with_results(concatenate_results)
def get_file_name_and_args(path, extra):
    """Return ``path`` and ``extra`` joined by ``' -> '``.

    ``path`` is the first parameter because the decorator passes the file
    path as the first positional argument; with the names the other way
    round, a keyword call such as ``extra=...`` collided with it.
    """
    return path + ' -> ' + extra
#Test again
print(get_file_name_and_args(r'c:\testdir', 'extra for test 1'))
#Output:
# c:\testdir\testfile1.txt -> extra for test 1,c:\testdir\testfile2.txt -> extra for test 1
Related
I have a function like below.
# in retrieve_data.py
import logging
import os
def create_output_csv_file_path_and_name(output_folder='outputs') -> str:
    """
    Return the path of the output CSV file, creating its folder if needed.

    If ``output_folder`` does not exist yet it is created (including any
    missing parents) and the creation is logged at INFO level.  The
    returned path is ``results.csv`` inside ``output_folder``.
    """
    if not os.path.exists(output_folder):
        # exist_ok covers the folder appearing between the check and the call.
        os.makedirs(output_folder, exist_ok=True)
        # Lazy %-style arguments: the message is only formatted if it is emitted.
        logging.info("New folder created for output file: %s", output_folder)
    return os.path.join(output_folder, 'results.csv')
I also created a unit test file like below.
# in test_retrieve_data.py
class OutputCSVFilePathAndNameCreationTest(unittest.TestCase):
    """Test from the question, checking the folder-creation branch."""

    @patch('path.to.retrieve_data.os.path.exists')
    @patch('path.to.retrieve_data.os.makedirs')
    def test_create_output_csv_file_path_and_name_calls_exists_and_makedirs_once_when_output_folder_is_not_created_yet(
        self,
        os_path_exists_mock,
        os_makedirs_mock
    ):
        # NOTE(review): stacked @patch decorators pass their mocks bottom-up,
        # so these two parameters are bound to the opposite mocks (the first
        # one receives the makedirs mock).  The order is kept as posted
        # because it is the failure this question is about.
        os_path_exists_mock.return_value = False

        retrieve_cradle_profile_details.create_output_csv_file_path_and_name()

        os_path_exists_mock.assert_called_once()
        os_makedirs_mock.assert_called_once()
But when I run the above unit test, I get the following error.
def assert_called_once(self):
"""assert that the mock was called only once.
"""
if not self.call_count == 1:
msg = ("Expected '%s' to have been called once. Called %s times.%s"
% (self._mock_name or 'mock',
self.call_count,
self._calls_repr()))
raise AssertionError(msg)
AssertionError: Expected 'makedirs' to have been called once. Called 0 times.
I tried poking around with pdb.set_trace() in the create_output_csv_file_path_and_name method and I'm sure it is receiving a mocked object for os.path.exists(), but the code never goes past that os.path.exists(output_folder) check (output_folder was already created in the program folder, but I do not use it for unit testing purposes and want to leave it alone). What could I possibly be doing wrong here to mock os.path.exists() and os.makedirs()? Thank you in advance for your answers!
You have the arguments to your test function reversed. When you have stacked decorators, like:
@patch("retrieve_data.os.path.exists")
@patch("retrieve_data.os.makedirs")
def test_create_output_csv_file_path_...():
They apply bottom to top, so you need to write:
@patch("retrieve_data.os.path.exists")
@patch("retrieve_data.os.makedirs")
def test_create_output_csv_file_path_and_name_calls_exists_and_makedirs_once_when_output_folder_is_not_created_yet(
self, os_makedirs_mock, os_path_exists_mock
):
With this change, if I have this in retrieve_data.py:
import os
import logging
def create_output_csv_file_path_and_name(output_folder='outputs') -> str:
    """
    Return the path of the output CSV file, creating its folder if needed.

    If ``output_folder`` does not exist yet it is created (including any
    missing parents) and the creation is logged at INFO level.  The
    returned path is ``results.csv`` inside ``output_folder``.
    """
    if not os.path.exists(output_folder):
        # exist_ok covers the folder appearing between the check and the call.
        os.makedirs(output_folder, exist_ok=True)
        # Lazy %-style arguments: the message is only formatted if it is emitted.
        logging.info("New folder created for output file: %s", output_folder)
    return os.path.join(output_folder, 'results.csv')
And this is test_retrieve_data.py:
import unittest
from unittest.mock import patch
import retrieve_data
class OutputCSVFilePathAndNameCreationTest(unittest.TestCase):
    """Checks the folder-creation branch of create_output_csv_file_path_and_name."""

    # Stacked patches pass their mocks bottom-up: the decorator closest to
    # the function (makedirs) supplies the first mock argument, exists the second.
    @patch("retrieve_data.os.path.exists")
    @patch("retrieve_data.os.makedirs")
    def test_create_output_csv_file_path_and_name_calls_exists_and_makedirs_once_when_output_folder_is_not_created_yet(
        self, os_makedirs_mock, os_path_exists_mock
    ):
        # Report the folder as missing so the creation branch is taken.
        os_path_exists_mock.return_value = False

        retrieve_data.create_output_csv_file_path_and_name()

        os_path_exists_mock.assert_called_once()
        os_makedirs_mock.assert_called_once()
Then the tests run successfully:
$ python -m unittest -v
test_create_output_csv_file_path_and_name_calls_exists_and_makedirs_once_when_output_folder_is_not_created_yet (test_retrieve_data.OutputCSVFilePathAndNameCreationTest.test_create_output_csv_file_path_and_name_calls_exists_and_makedirs_once_when_output_folder_is_not_created_yet) ... ok
----------------------------------------------------------------------
Ran 1 test in 0.001s
OK
Update: I wanted to leave a comment on the diagnostics I performed here, because I didn't initially spot the reversed arguments, either, but the problem became immediately apparent when I added a breakpoint() at the beginning of the test and printed out the values of the mocks:
(Pdb) p os_path_exists_mock
<MagicMock name='makedirs' id='140113966613456'>
(Pdb) p os_makedirs_mock
<MagicMock name='exists' id='140113966621072'>
The fact that the names were swapped made the underlying problem easy to spot.
I'm trying to build a routine that calls a Pytest class for each PDF document in the current directory... Let me explain.
Lets say i have this test file
import pytest
class TestHeader:
#asserts...
class TestBody:
#asserts...
This script needs to test each pdf document in my cwd
Here is my best attempt:
import glob
import pytest
class TestHeader:
#asserts...
class TestBody:
#asserts...
filelist = glob.glob('*.pdf')
for file in filelist:
#magically call pytest for each file
How would i approach this?
EDIT: Complementing my question.
I have a huge function that extracts each document's data, lets call it extract_pdf
this function returns a tuple (header, body).
Current attempt looks like this:
import glob
import pytest
class TestHeader:
#asserts...
class TestBody:
#asserts...
filelist = glob.glob('*.pdf')
for file in filelist:
header, body = extract_pdf(file)
pytest.main(<pass header and body as args for pytest>)
I need to parse each document prior to testing. Can it be done this way?
The best way to do this is through dynamic parametrization of the test cases.
This can be achieved using the pytest_generate_tests hook..
def pytest_generate_tests(metafunc):
    """Parametrize every test that takes ``fileName`` with the PDFs in the cwd.

    Tests that do not declare a ``fileName`` argument are left untouched;
    parametrizing them unconditionally makes pytest report an error for
    an argument the test does not use.
    """
    # Local import: the snippet shows no import for glob.
    import glob

    if "fileName" not in metafunc.fixturenames:
        return
    # Sorted so test ids and ordering are stable between runs.
    metafunc.parametrize("fileName", sorted(glob.glob('*.pdf')))
NOTE: fileName should be one of the argument to your test function.
This will result in executing the testcase for each of the file in the directory and the testcase will be like
TestFunc[File1]
TestFunc[File2]
TestFunc[File3]
.
.
and so on..
This is expanding on the existing answer by @ArunKalirajaBaskaran.
The problem is that you have different test classes that want to use the same data, but you want to parse the data only once. If it is ok for you to read all data at once, you could read them into global variables and use these for parametrizing your tests:
def extract_data():
    """Parse every PDF in the current directory once.

    Returns three parallel lists ``(filenames, headers, bodies)``: entry
    ``i`` of ``headers`` and ``bodies`` is what ``extract_pdf`` returned
    for ``filenames[i]``.
    """
    # Sorted so the parametrization order (and the test ids) are stable.
    filenames = sorted(glob.glob('*.pdf'))
    headers = []
    bodies = []
    for filename in filenames:
        header, body = extract_pdf(filename)
        headers.append(header)
        bodies.append(body)
    return filenames, headers, bodies


# Evaluated at import time: parametrization needs the values during collection.
filenames, headers, bodies = extract_data()
def pytest_generate_tests(metafunc):
    """Parametrize tests that take ``header`` and/or ``body``.

    A test asking for only one of the two gets one case per document.
    A test asking for both gets matching (header, body) pairs; previously
    only ``header`` was parametrized and ``body`` was left unresolved.
    Tests asking for neither are left untouched.
    """
    wanted = [name for name in ("header", "body") if name in metafunc.fixturenames]
    if not wanted:
        return
    if len(wanted) == 2:
        # Pair the values so each case sees the header and body of the same file.
        metafunc.parametrize("header,body", list(zip(headers, bodies)), ids=filenames)
    elif wanted[0] == "header":
        # use the filename as ID for better test names
        metafunc.parametrize("header", headers, ids=filenames)
    else:
        metafunc.parametrize("body", bodies, ids=filenames)
class TestHeader:
    """Header checks; ``header`` is supplied per document by the parametrization."""

    # Methods of a test class receive the instance first, so ``self`` is required.
    def test_1(self, header):
        ...

    def test_2(self, header):
        ...
class TestBody:
    """Body checks; ``body`` is supplied per document by the parametrization."""

    # Methods of a test class receive the instance first, so ``self`` is required.
    def test_1(self, body):
        ...
This is the same as using
class TestHeader:
    """Header checks with the parametrization written out on every test."""

    @pytest.mark.parametrize("header", headers, ids=filenames)
    def test_1(self, header):
        ...

    @pytest.mark.parametrize("header", headers, ids=filenames)
    def test_2(self, header):
        ...
pytest_generate_tests just adds a bit of convenience so you don't have to repeat the parametrize decorator for each test.
The downside of this is of course that you will read in all of the data at once, which may cause a problem with memory usage if there are a lot of files. Your approach with pytest.main will not work, because that is the same as calling pytest on the command line with the given parameters. Parametrization can be done at the fixture level or on the test level (like here), but both need the parameters already evaluated at load time, so I don't see a possibility to do this lazily (apart from putting it all into one test). Maybe someone else has a better idea...
I want to test a Python function that reads a gzip file and extracts something from the file (using pytest).
import gzip
def my_function(file_path):
    """Read a gzip file in text mode and return one item per line.

    ``file_path`` is handed straight to ``gzip.open``.  The result holds
    the placeholder string ``'something from line'`` once for every line.
    """
    with gzip.open(file_path, 'rt') as f:
        return ['something from line' for _line in f]
Can I create a gzip file like object that I can pass to my_function? The object should have defined content and should work with gzip.open()
I know that I can create a temporary gzip file in a fixture but this depends on the filesystem and other properties of the environment. Creating a file-like object from code would be more portable.
You can use the io and gzip libraries to create in-memory file objects. Example:
import io, gzip
def inmem(content=b'spam\neggs\n'):
    """Return an in-memory gzip stream holding ``content``, rewound to the start.

    The returned ``io.BytesIO`` can be passed to ``gzip.open`` in place of
    a file name.  ``content`` must be bytes; it defaults to the two sample
    lines used in the example.
    """
    stream = io.BytesIO()
    with gzip.open(stream, 'wb') as f:
        f.write(content)
    # Rewind only after the with-block: closing the gzip writer is what
    # finishes the compressed data in the stream.
    stream.seek(0)
    return stream
You should never try to test outside code in a unit test. Only test the code you wrote. If you're testing gzip, then gzip is doing something wrong (they should be writing their own unit tests). Instead, do something like this:
from unittest import mock
# mock.patch (not mock.Mock) is the decorator that swaps gzip.open for a mock
# while the test runs and passes that mock in as the argument.
@mock.patch('gzip.open')
def test_my_function(mock_gzip_open):
    """my_function opens the path in text mode and returns one item per line."""
    # The patched gzip.open returns a MagicMock, which supports the
    # context-manager protocol; this is what ``with gzip.open(...) as f``
    # binds to ``f``.
    mock_gzip_open.return_value.__enter__.return_value = ['line 1\n', 'line 2\n']
    file_path = 'testpath'
    output = my_function(file_path=file_path)
    mock_gzip_open.assert_called_with(file_path, 'rt')
    assert output == ['something from line', 'something from line']
That's your whole unit test. All you want to know is that gzip.open() was called (and you assume it works or else gzip is failing and that's their problem) and that you got back what you expected from the method being tested. You specify what gzip returns based on what you expect it to return, but you don't actually call the function in your test.
It's a bit verbose but I'd do something like this (I have assumed that you saved my_function to a file called patch_one.py):
import patch_one # this is the file with my_function in it
from unittest.mock import patch
from unittest import TestCase
class MyTestCase(TestCase):
    """Tests patch_one.my_function with a hand-written stand-in for gzip.open."""

    def test_my_function(self):
        # my_function uses "with gzip.open(...) as f", so the stand-in for
        # gzip.open has to return a context manager.
        class MyContext:
            def __enter__(self, *args, **kwargs):
                return [1, 2]  # note the two items

            def __exit__(self, *args, **kwargs):
                return None

        # in case we want to know the arguments to open()
        open_args = None

        # patch() calls new_callable with no arguments to build the
        # replacement, so f is a factory that returns the fake open.
        def f(*args, **kwargs):
            def my_open(*args, **kwargs):
                nonlocal open_args
                open_args = args
                return MyContext()
            return my_open

        # patch the gzip.open in our file under test
        with patch('patch_one.gzip.open', new_callable=f):
            # finally, we can call the function we want to test
            ret_val = patch_one.my_function('not a real file path')

        # note the two items, corresponding to the list in __enter__()
        self.assertListEqual(['something from line', 'something from line'], ret_val)
        # check the arguments, just for fun
        self.assertEqual('not a real file path', open_args[0])
        self.assertEqual('rt', open_args[1])
If you want to try anything more complicated, I would recommend reading the unittest mock docs because how you import the "patch_one" file matters as does the string you pass to patch().
There will definitely be a way to do this with Mock or MagicMock but I find them a bit hard to debug so I went the long way round.
Obviously I'm doing something stupid. But what?
I get:
File "<path>", line 71, in args
filename = filename
NameError: name 'filename' is not defined
...on the next-to-last line below ("filename = filename"):
def parseLog(filename, explain=False, omitminor=False, omitexpected=False,
omitgdocs=False, args=None):
print(filename)
if not args:
class args:
filename = filename
explain = explain
Yet the 2nd line above ("print(filename)") works fine. So, why the error?
In case you're wondering why I'm doing this in the first place, it's because the function parseLog() can also be called by the command line, like so:
def parseLogCLI():
    ''' Parse the command line and hand the result to parseLog().

    Takes one positional ``filename`` and the boolean switches
    ``-explain``, ``-omitminor``, ``-omitexpected`` and ``-omitgdocs``.
    '''
    parser = argparse.ArgumentParser()
    parser.add_argument('filename')
    parser.add_argument('-explain', action="store_true", help='Explain what program has done')
    parser.add_argument('-omitminor', action="store_true", help='Omit minor errors ' + repr(minor_errors))
    parser.add_argument('-omitexpected', action="store_true", help='Omit machines expected to be often offline')
    parser.add_argument('-omitgdocs', action="store_true", help='Omit errors on Google Docs native files (not copyable) ' + repr(gdocs))
    args = parser.parse_args()
    # The parsed namespace is bound to ``args``; ``arg`` was an undefined name.
    parseLog(args.filename, args=args)
...so I'm trying to construct an 'arg' class (as argparse does) to pass to my function. If there's a better way to do this, I'm interested.
With class args: you are starting the definition of a class.
In that context, the first occurrence of filename defines a class attribute, which you are trying to assign from its own value before it is fully defined.
First of all, I think you should investigate in more details the concepts of classes, scopes, and instances.
This will help you understand why your function argument filename is hidden by the new definition inside the class scope.
It seems to me like you have a scope issue, in that the body of the inner class 'args' doesn't have access to the scope of the enclosing function. One solution would be to use the 'global' keyword like so:
def parseLog(filename, explain=False, omitminor=False, omitexpected=False,
             omitgdocs=False, args=None):
    """Excerpt: print ``filename`` and build a default ``args`` holder if none is given.

    Side effect: the module-level names ``fname`` and ``expl`` are
    overwritten on every call.
    """
    # Copy the parameters to module-level names under different spellings,
    # so the class body below can read them without assigning a name to itself.
    global fname, expl
    fname = filename
    expl = explain
    print(filename)
    if not args:
        class args:
            filename = fname
            explain = expl
You can read more about Python variable scopes here.
I have a luigi preprocessing task that splits my raw data into smaller files. These Files will then be processed by the actual pipeline.
So regarding the parameters, I would like to require each pipeline with one preprocessed file id as parameter. However, this file id is only generated in the preprocessing step and is thus only known at runtime. To illustrate my idea I provide this not-working code:
import luigi
import subprocess
import random
class GenPipelineFiles(luigi.Task):
    """Preprocessing step from the question: split ``input_file`` into chunk files.

    NOTE(review): still illustrative rather than working -- ``output()``
    draws a new random count on every call, so successive calls do not
    agree on which targets exist.
    """

    input_file = luigi.Parameter()

    def requires(self):
        pass

    def output(self):
        for i in range(random.randint(0, 10)):
            yield luigi.LocalTarget("output/{}_{}.txt".format(self.input_file, i))

    def run(self):
        # ``output`` is a method and has to be called before iterating.
        for iout in self.output():
            # Argument list instead of a shell string: no quoting problems
            # if the path contains spaces or shell metacharacters.
            subprocess.call(["touch", iout.path])
class RunPipelineOnSmallChunk(luigi.Task):
    """Placeholder for the pipeline that processes one preprocessed chunk."""

    # Declared because Experiment instantiates this task with both keywords.
    directory = luigi.Parameter()
    file_id = luigi.Parameter()
class Experiment(luigi.WrapperTask):
    """Wrapper from the question: one chunk pipeline per preprocessed file id."""

    input_file = luigi.Parameter(default="ex")

    def requires(self):
        # NOTE(review): kept as posted -- this is the part the question
        # marks as not working.  ``file_ids`` is a task object that is
        # neither returned nor yielded, and the loop iterates over the
        # task itself rather than over file ids it produced.
        file_ids = GenPipelineFiles(input_file=self.input_file)
        for file_id in file_ids:
            yield RunPipelineOnSmallChunk(directory=self.input_file, file_id=file_id)
luigi.run()
The wrapper task Experiment should
first, somehow require the splitting of the raw data into documents
secondly, require the actual pipeline with the obtained file id of the preprocessing.
The random number of output files in the GenPipelineFiles indicates that this cannot be hard-coded into the Experiment's requires.
A question that is probably related to this one is the fact, that a luigi task properly only has one input target and one output target. Probably a note on how to model multiple outputs in GenPipelineFiles could also solve the problem.
One simple approach to dealing with multiple outputs is to create a directory named after the input file, and put the output files from the split into that directory. That way the dependent task can just check for the existence of the directory. Let's say I have an input file 123.txt, I then make a directory 123_split with files 1.txt, 2.txt, 3.txt as the output of GenPipelineFiles, and then a directory 123_processed with 1.txt, 2.txt, 3.txt as the output of RunPipelineOnSmallChunk.
For your requires method in Experiment, you have to return the tasks you want to run, in a list for example. The way you have written file_ids = GenPipelineFiles(input_file=self.input_file) makes me think the run method of that object is not being called, because it is not being returned by the method.
Here's some sample code that works with targets on a per-file basis (but not a task-per-file basis). I still think it is safer to have a single output target of a directory or a sentinel file of some kind to indicate you are done. Atomicity is lost unless the task ensures each target is created.
PYTHONPATH=. luigi --module sampletask RunPipelineOnSmallChunk --local-scheduler
sampletask.py
import luigi
import os
import subprocess
import random
class GenPipelineFiles(luigi.Task):
    """Split step: creates ``num_targets`` empty files under ``split_<prefix>/``."""

    inputfile = luigi.Parameter()
    # Drawn once when the class body executes, so every output() call in
    # this process reports the same number of targets.
    num_targets = random.randint(0, 10)

    def requires(self):
        return []

    def get_prefix(self):
        """Return the input file name up to its first dot."""
        return self.inputfile.split(".")[0]

    def get_dir(self):
        """Return the directory the split files are written to."""
        return "split_{}".format(self.get_prefix())

    def output(self):
        """Return one LocalTarget per split file."""
        # No leading space in the pattern: the target path has to be the
        # path of the file that run() creates.
        return [
            luigi.LocalTarget("{}/{}_{}.txt".format(self.get_dir(), self.get_prefix(), i))
            for i in range(self.num_targets)
        ]

    def run(self):
        if not os.path.exists(self.get_dir()):
            os.makedirs(self.get_dir())
        for iout in self.output():
            # Argument list instead of a shell string: the path reaches
            # touch exactly as written.
            subprocess.call(["touch", iout.path])
class RunPipelineOnSmallChunk(luigi.Task):
    """Processing step: creates one ``processed_...`` file per split file."""

    inputfile = luigi.Parameter(default="test")

    def get_prefix(self):
        """Return the input file name up to its first dot."""
        return self.inputfile.split(".")[0]

    def get_dir(self):
        """Return the directory the processed files are written to."""
        return "processed_{}".format(self.get_prefix())

    @staticmethod
    def clean_input_path(path):
        """Map a split-file path to the matching processed-file path."""
        # Replace only the first "split" (the directory name) so a file
        # prefix that happens to contain "split" is left alone.
        return path.replace("split", "processed", 1)

    def requires(self):
        return GenPipelineFiles(self.inputfile)

    def output(self):
        """Return one LocalTarget per input target, under the processed directory."""
        return [
            luigi.LocalTarget(RunPipelineOnSmallChunk.clean_input_path(target.path))
            for target in self.input()
        ]

    def run(self):
        if not os.path.exists(self.get_dir()):
            os.makedirs(self.get_dir())
        for iout in self.output():
            # Argument list instead of a shell string: the path reaches
            # touch exactly as written.
            subprocess.call(["touch", iout.path])