I'm trying to build a routine that calls a Pytest class for each PDF document in the current directory... Let me explain.
Let's say I have this test file:
import pytest

class TestHeader:
    # asserts...

class TestBody:
    # asserts...
This script needs to test each PDF document in my cwd.
Here is my best attempt:
import glob
import pytest

class TestHeader:
    # asserts...

class TestBody:
    # asserts...

filelist = glob.glob('*.pdf')
for file in filelist:
    # magically call pytest for each file
How would I approach this?
EDIT: Complementing my question.
I have a huge function that extracts each document's data; let's call it extract_pdf. This function returns a tuple (header, body).
My current attempt looks like this:
import glob
import pytest

class TestHeader:
    # asserts...

class TestBody:
    # asserts...

filelist = glob.glob('*.pdf')
for file in filelist:
    header, body = extract_pdf(file)
    pytest.main(<pass header and body as args for pytest>)
I need to parse each document prior to testing. Can it be done this way?
The best way to do this is through dynamic parametrization of the test cases. This can be achieved using the pytest_generate_tests hook.
import glob

def pytest_generate_tests(metafunc):
    filelist = glob.glob('*.pdf')
    metafunc.parametrize("fileName", filelist)
NOTE: fileName should be one of the arguments to your test function.
This will result in executing the test case once for each file in the directory, and the generated test cases will look like:
TestFunc[File1]
TestFunc[File2]
TestFunc[File3]
and so on.
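To make the note above concrete, here is a minimal sketch (my own, not part of this answer) of a test that receives the generated fileName parameter; extract_pdf and the assert are assumptions taken from the question:

class TestHeader:
    def test_header_present(self, fileName):
        # each collected test receives one PDF path via the parametrized fileName
        header, body = extract_pdf(fileName)
        assert header  # placeholder for the question's "# asserts..."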
This is expanding on the existing answer by @ArunKalirajaBaskaran.
The problem is that you have different test classes that want to use the same data, but you want to parse the data only once. If it is ok for you to read all data at once, you could read them into global variables and use these for parametrizing your tests:
import glob

def extract_data():
    filenames = []
    headers = []
    bodies = []
    for filename in glob.glob('*.pdf'):
        header, body = extract_pdf(filename)
        filenames.append(filename)
        headers.append(header)
        bodies.append(body)
    return filenames, headers, bodies

filenames, headers, bodies = extract_data()

def pytest_generate_tests(metafunc):
    if "header" in metafunc.fixturenames:
        # use the filename as ID for better test names
        metafunc.parametrize("header", headers, ids=filenames)
    elif "body" in metafunc.fixturenames:
        metafunc.parametrize("body", bodies, ids=filenames)

class TestHeader:
    def test_1(self, header):
        ...

    def test_2(self, header):
        ...

class TestBody:
    def test_1(self, body):
        ...
This is the same as using
class TestHeader:
    @pytest.mark.parametrize("header", headers, ids=filenames)
    def test_1(self, header):
        ...

    @pytest.mark.parametrize("header", headers, ids=filenames)
    def test_2(self, header):
        ...
pytest_generate_tests just adds a bit of convenience so you don't have to repeat the parametrize decorator for each test.
The downside of this is of course that you will read in all of the data at once, which may cause a problem with memory usage if there are a lot of files. Your approach with pytest.main will not work, because that is the same as calling pytest on the command line with the given parameters. Parametrization can be done at the fixture level or at the test level (like here), but both need the parameters already evaluated at load time, so I don't see a possibility to do this lazily (apart from putting it all into one test). Maybe someone else has a better idea...
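To illustrate the pytest.main point (my addition, not this answer's code): pytest.main only accepts command-line style arguments, so there is no way to hand it a parsed header/body object; the test_pdfs.py name below is just a placeholder:

import pytest

# equivalent to running `pytest -q test_pdfs.py` in a shell
pytest.main(["-q", "test_pdfs.py"])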
Related
I am trying to test a function which reads a file and returns the content of the file, or returns None if the file is not found.
import os
from pathlib import Path
from typing import Dict

import yaml
from yaml import SafeLoader

def read_yaml_from_cwd(file: str) -> Dict:
    """[reads a yaml file from current working directory]
    Args:
        file ([type]): [.yaml or .yml file]
    Returns:
        [type]: [Dictionary]
    """
    path = os.path.join(Path.cwd().resolve(), file)
    if os.path.isfile(path):
        with open(path) as f:
            content = yaml.load(f, Loader=SafeLoader)
        return content
    else:
        return None
This is my test:
from unittest import mock, TestCase
from project import fs

class TextExamples(TestCase):
    def test_read_yaml_from_cwd():
        with mock.patch('os.listdir') as mocked_listdir:
            mocked_listdir.return_value = ['test-config.yml']
            val = fs.read_yaml_from_cwd("false-config.yml")
            assert val == None
            val2 = fs.read_yaml_from_cwd("false-config.yml")
            assert val2 != None
I guess I am fundamentally doing something wrong with these tests and what these mocks do. Can somebody help me with this?
One possibility to test this is to patch both os.path.isfile and open. To patch open, there is already a special mock helper, mock_open, which gives you the possibility to set the contents of the mocked file. This means that you don't have to mock yaml.load, as it will just return the mocked file content. This could look something like:
from unittest import mock, TestCase
from unittest.mock import mock_open

class YamlTest(TestCase):
    @mock.patch("builtins.open", mock_open(read_data="data"))
    @mock.patch("os.path.isfile")
    def test_read_yaml_from_cwd(self, patched_isfile):
        # valid file case
        patched_isfile.return_value = True
        result = read_yaml_from_cwd("some_file.yaml")
        self.assertEqual("data", result)

        # invalid file case
        patched_isfile.return_value = False
        result = read_yaml_from_cwd("some_file.yaml")
        self.assertEqual(None, result)
In this case you test that the function returns the file content if you pass a valid file name, and None for an invalid file name, which is probably all you want to test here.
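As a small variation (my own sketch, not part of the original answer), mock_open can also supply real YAML content, which yaml.load inside read_yaml_from_cwd then parses into a dict; this would be another method on the same YamlTest class:

    @mock.patch("builtins.open", mock_open(read_data="key: value"))
    @mock.patch("os.path.isfile")
    def test_read_yaml_content(self, patched_isfile):
        # the mocked file content is parsed by yaml.load inside read_yaml_from_cwd
        patched_isfile.return_value = True
        self.assertEqual({"key": "value"}, read_yaml_from_cwd("some_file.yaml"))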
For completeness, and because I mentioned it in the comments: using pyfakefs instead would replace the file system with a fake file system that you can handle like a real one. In this case the test could look like:
from pyfakefs import fake_filesystem_unittest

class YamlTest(fake_filesystem_unittest.TestCase):
    def setUp(self) -> None:
        self.setUpPyfakefs()
        self.fs.create_file("some_file.yaml", contents="data")

    def test_read_yaml_from_cwd(self):
        # valid file case
        result = read_yaml_from_cwd("some_file.yaml")
        self.assertEqual("data", result)

        # invalid file case
        result = read_yaml_from_cwd("non_existing.yaml")
        self.assertEqual(None, result)
This makes sense if you have many filesystem related tests, though in your case this would probably be overkill.
Disclaimer: I'm a contributor to pyfakefs.
I have a method that contains external REST API calls, for example:
import requests

def get_dataset():
    url = requests.get("http://api:5001/get_trainingdata")
    filename = url.text[0]
    return filename
When I use @patch for this function, I am able to unit test it, but coverage does not cover the whole function.
How can I write a unit test case for this method with full coverage?
My test case:
@mock.patch('api.get_dataset')
def test_using_decorator1(self, mocked_get_dataset):
    file = [{"file": "ddddd"}]
    mocked_get_dataset.return_value = Mock()
    mocked_get_dataset.return_value.json.return_value = file
    filename = file[0]
    self.assertEqual(filename, file[0])
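As a hedged sketch of one common approach (my own, assuming get_dataset lives in a module named api): patching requests.get instead of get_dataset itself lets the real function body run, which is what coverage measures; the return value below is a made-up placeholder:

import unittest
from unittest import mock

import api  # hypothetical module containing get_dataset

class TestGetDataset(unittest.TestCase):
    @mock.patch('api.requests.get')
    def test_get_dataset(self, mocked_get):
        # the real get_dataset() executes; only the HTTP call is faked
        mocked_get.return_value.text = ["trainingdata.csv"]
        self.assertEqual("trainingdata.csv", api.get_dataset())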
I want to test a Python function that reads a gzip file and extracts something from the file (using pytest).
import gzip

def my_function(file_path):
    output = []
    with gzip.open(file_path, 'rt') as f:
        for line in f:
            output.append('something from line')
    return output
Can I create a gzip file-like object that I can pass to my_function? The object should have defined content and should work with gzip.open().
I know that I can create a temporary gzip file in a fixture but this depends on the filesystem and other properties of the environment. Creating a file-like object from code would be more portable.
You can use the io and gzip libraries to create in-memory file objects. Example:
import io, gzip

def inmem():
    stream = io.BytesIO()
    with gzip.open(stream, 'wb') as f:
        f.write(b'spam\neggs\n')
    stream.seek(0)
    return stream
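A possible usage sketch (my addition, assuming my_function from the question): gzip.open() also accepts file objects, so the in-memory stream can be passed straight to the function under test:

def test_my_function_with_inmem_stream():
    # two compressed lines in, so two extracted items out
    assert my_function(inmem()) == ['something from line', 'something from line']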
You should never try to test outside code in a unit test. Only test the code you wrote. If you're testing gzip, then gzip is doing something wrong (they should be writing their own unit tests). Instead, do something like this:
from unittest import mock

@mock.patch('gzip.open')
def test_my_function(mock_gzip_open):
    file_path = 'testpath'
    # make the mocked file yield whatever lines you expect gzip to return
    mock_gzip_open.return_value.__enter__.return_value = ['<whatever you expect to be returned from gzip>']
    output = my_function(file_path=file_path)
    mock_gzip_open.assert_called_with(file_path, 'rt')
    assert output == ['<whatever you expect to be returned from your method>']
That's your whole unit test. All you want to know is that gzip.open() was called (and you assume it works or else gzip is failing and that's their problem) and that you got back what you expected from the method being tested. You specify what gzip returns based on what you expect it to return, but you don't actually call the function in your test.
It's a bit verbose but I'd do something like this (I have assumed that you saved my_function to a file called patch_one.py):
import patch_one  # this is the file with my_function in it
from unittest.mock import patch
from unittest import TestCase

class MyTestCase(TestCase):
    def test_my_function(self):
        # because you used "with open(...) as f", we need a mock context
        class MyContext:
            def __enter__(self, *args, **kwargs):
                return [1, 2]  # note the two items

            def __exit__(self, *args, **kwargs):
                return None

        # in case we want to know the arguments to open()
        open_args = None

        def f(*args, **kwargs):
            def my_open(*args, **kwargs):
                nonlocal open_args
                open_args = args
                return MyContext()
            return my_open

        # patch the gzip.open in our file under test
        with patch('patch_one.gzip.open', new_callable=f):
            # finally, we can call the function we want to test
            ret_val = patch_one.my_function('not a real file path')

        # note the two items, corresponding to the list in __enter__()
        self.assertListEqual(['something from line', 'something from line'], ret_val)
        # check the arguments, just for fun
        self.assertEqual('rt', open_args[1])
If you want to try anything more complicated, I would recommend reading the unittest mock docs because how you import the "patch_one" file matters as does the string you pass to patch().
There will definitely be a way to do this with Mock or MagicMock but I find them a bit hard to debug so I went the long way round.
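For comparison, here is a shorter sketch using mock.patch and the default MagicMock (my own, under the same patch_one assumption), in case the hand-written context manager above feels heavy:

from unittest import mock

import patch_one

def test_my_function_with_magicmock():
    with mock.patch('patch_one.gzip.open') as mocked_open:
        # the mocked context manager yields two lines
        mocked_open.return_value.__enter__.return_value = ['line 1\n', 'line 2\n']
        assert patch_one.my_function('not a real file path') == ['something from line', 'something from line']
        mocked_open.assert_called_once_with('not a real file path', 'rt')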
I have a luigi preprocessing task that splits my raw data into smaller files. These files will then be processed by the actual pipeline.
So regarding the parameters, I would like to require each pipeline run with one preprocessed file id as a parameter. However, this file id is only generated in the preprocessing step and is thus only known at runtime. To illustrate my idea I provide this not-working code:
import luigi
import subprocess
import random

class GenPipelineFiles(luigi.Task):
    input_file = luigi.Parameter()

    def requires(self):
        pass

    def output(self):
        for i in range(random.randint(0, 10)):
            yield luigi.LocalTarget("output/{}_{}.txt".format(self.input_file, i))

    def run(self):
        for iout in self.output:
            command = "touch {}".format(iout.fname)
            subprocess.call(command, shell=True)

class RunPipelineOnSmallChunk(luigi.Task):
    pass

class Experiment(luigi.WrapperTask):
    input_file = luigi.Parameter(default="ex")

    def requires(self):
        file_ids = GenPipelineFiles(input_file=self.input_file)
        for file_id in file_ids:
            yield RunPipelineOnSmallChunk(directory=self.input_file, file_id=file_id)

luigi.run()
The wrapper task Experiment should
first, somehow require the splitting of the raw data into documents
secondly, require the actual pipeline with the obtained file id of the preprocessing.
The random number of output files in the GenPipelineFiles indicates that this cannot be hard-coded into the Experiment's requires.
A question that is probably related to this one is that a luigi task seems to only have one input target and one output target. Perhaps a note on how to model multiple outputs in GenPipelineFiles could also solve the problem.
One simple approach to dealing with multiple outputs is to create a directory named after the input file and put the output files from the split into that directory. That way the dependent task can just check for the existence of the directory. Say I have an input file 123.txt; I then make a directory 123_split with files 1.txt, 2.txt, 3.txt as the output of GenPipelineFiles, and then a directory 123_processed with 1.txt, 2.txt, 3.txt as the output of RunPipelineOnSmallChunk.
For your requires method in Experiment, you have to return the tasks you want to run, in a list for example. The way you have written file_ids = GenPipelineFiles(input_file=self.input_file) makes me think the run method of that object is not being called, because it is not being returned by the method.
Here's some sample code that works with targets on a per-file basis (but not a task per file). I still think it is safer to have a single output target, such as a directory or a sentinel file of some kind, to indicate you are done. Atomicity is lost unless the task ensures each target is created.
PYTHONPATH=. luigi --module sampletask RunPipelineOnSmallChunk --local-scheduler
sampletask.py
import luigi
import os
import subprocess
import random

class GenPipelineFiles(luigi.Task):
    inputfile = luigi.Parameter()
    num_targets = random.randint(0, 10)

    def requires(self):
        pass

    def get_prefix(self):
        return self.inputfile.split(".")[0]

    def get_dir(self):
        return "split_{}".format(self.get_prefix())

    def output(self):
        targets = []
        for i in range(self.num_targets):
            targets.append(luigi.LocalTarget("{}/{}_{}.txt".format(self.get_dir(), self.get_prefix(), i)))
        return targets

    def run(self):
        if not os.path.exists(self.get_dir()):
            os.makedirs(self.get_dir())
        for iout in self.output():
            command = "touch {}".format(iout.path)
            subprocess.call(command, shell=True)

class RunPipelineOnSmallChunk(luigi.Task):
    inputfile = luigi.Parameter(default="test")

    def get_prefix(self):
        return self.inputfile.split(".")[0]

    def get_dir(self):
        return "processed_{}".format(self.get_prefix())

    @staticmethod
    def clean_input_path(path):
        return path.replace("split", "processed")

    def requires(self):
        return GenPipelineFiles(self.inputfile)

    def output(self):
        targets = []
        for target in self.input():
            targets.append(luigi.LocalTarget(RunPipelineOnSmallChunk.clean_input_path(target.path)))
        return targets

    def run(self):
        if not os.path.exists(self.get_dir()):
            os.makedirs(self.get_dir())
        for iout in self.output():
            command = "touch {}".format(iout.path)
            subprocess.call(command, shell=True)
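To sketch the sentinel-file idea mentioned above (my own illustration, not part of the original answer), the splitting task could expose a single target that is only written once all chunk files exist:

class GenPipelineFilesSentinel(luigi.Task):
    inputfile = luigi.Parameter()

    def output(self):
        # one target for the whole split, e.g. split_123/.done
        return luigi.LocalTarget("split_{}/.done".format(self.inputfile.split(".")[0]))

    def run(self):
        # ... write the chunk files here, then mark the split as complete ...
        with self.output().open("w") as f:
            f.write("done")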
I am using Pytest to test an executable. This .exe file reads a configuration file on startup.
I have written a fixture to spawn this .exe file at the start of each test and close it down at the end of the test. However, I cannot work out how to tell the fixture which configuration file to use. I want the fixture to copy a specified config file to a directory before spawning the .exe file.
@pytest.fixture
def session(request):
    copy_config_file(specific_file)  # how do I specify the file to use?
    link = spawn_exe()

    def fin():
        close_down_exe()

    request.addfinalizer(fin)
    return link
# needs to use config file foo.xml
def test_1(session):
    session.talk_to_exe()

# needs to use config file bar.xml
def test_2(session):
    session.talk_to_exe()
How do I tell the fixture to use foo.xml for test_1 function and bar.xml for test_2 function?
Thanks
John
One solution is to use pytest.mark for that:
import pytest

@pytest.fixture
def session(request):
    m = request.node.get_closest_marker('session_config')
    if m is None:
        pytest.fail('please use "session_config" marker')
    specific_file = m.args[0]
    copy_config_file(specific_file)
    link = spawn_exe()
    yield link
    close_down_exe(link)

@pytest.mark.session_config("foo.xml")
def test_1(session):
    session.talk_to_exe()

@pytest.mark.session_config("bar.xml")
def test_2(session):
    session.talk_to_exe()
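One small addition of my own (not part of the original answer): registering the custom marker in a conftest.py keeps pytest from warning about the unknown session_config mark:

# conftest.py
def pytest_configure(config):
    config.addinivalue_line(
        "markers",
        "session_config(filename): config file to copy before spawning the exe",
    )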
Another approach would be to just change your session fixture slightly to delegate the creation of the link to the test function:
import pytest

@pytest.fixture
def session_factory(request):
    links = []

    def make_link(specific_file):
        copy_config_file(specific_file)
        link = spawn_exe()
        links.append(link)
        return link

    yield make_link
    for link in links:
        close_down_exe(link)

def test_1(session_factory):
    session = session_factory('foo.xml')
    session.talk_to_exe()

def test_2(session_factory):
    session = session_factory('bar.xml')
    session.talk_to_exe()
I prefer the latter as it's simpler to understand and allows for more improvements later, for example if you need to use @pytest.mark.parametrize in a test based on the config value. Also notice that the latter allows spawning more than one executable in the same test.
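A sketch of the parametrize combination mentioned above (my illustration of that remark, not the answer's code):

import pytest

@pytest.mark.parametrize("config_file", ["foo.xml", "bar.xml"])
def test_with_any_config(session_factory, config_file):
    session = session_factory(config_file)
    session.talk_to_exe()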