I am working on a Python project where we read Parquet files from Azure Data Lake and perform the required operations. We have defined a common Parquet file reader function which we use to read such files. I am using MagicMock to mock objects in my test cases, but when I ran the full test suite, some tests failed because they picked up mocked values from other test cases. I don't completely understand this behavior. Below is my code:
utils/common.py
from typing import Dict, Tuple

import pyarrow.parquet as pq

def parquet_reader(parquet_path: str) -> Tuple[Dict, int]:
    table = pq.read_table(parquet_path)
    return table.to_pydict(), table.num_rows
-------Test01-------------
PARQUET_DATA = {
    "se_vl": ["530"],
    "l_depart": ["028"],
    "r_code": ["f5"],
    "r_dir": ["W"],
}
NUM_RECORDS = 1
TEST_CAF_DATA = (PARQUET_DATA, NUM_RECORDS)
def test_read_from_source():
    common.parquet_reader = MagicMock(return_value=TEST_CAF_DATA)
    obj = next(read_from_source('path to file'))
    assert obj.se_vl == "530"
-------Test02-------------
from utils import common
SAMPLE_DATA_PATH = "src/schedule/sample_data"
def test_parquet_reader():
    rows, _ = common.parquet_reader(SAMPLE_DATA_PATH)
    assert rows["key_id"][0] == 345689865
    assert rows["com"][0] == "UP"
When I run all the tests (240 in total), the variable 'rows' in Test02 holds the data from Test01 (PARQUET_DATA).
I fixed the issue using patch, but I am still confused: why does MagicMock behave this way?
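As for the "why": assigning common.parquet_reader = MagicMock(...) rebinds the attribute on the utils.common module object itself, and modules are cached in sys.modules for the whole test run, so every test that executes afterwards still calls the mock instead of the real function. patch makes the same swap but records the original attribute and restores it when the test finishes. A minimal sketch of the patch-based fix, reusing read_from_source and TEST_CAF_DATA from the question above:

from unittest.mock import patch

from utils import common

def test_read_from_source():
    # patch.object swaps the attribute only inside this block and restores
    # the real parquet_reader on exit, so later tests see the original again
    with patch.object(common, 'parquet_reader', return_value=TEST_CAF_DATA):
        obj = next(read_from_source('path to file'))
        assert obj.se_vl == "530"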
I'm using VS Code as my IDE. I have written a very simple usage of a pytest fixture, but it doesn't work, while the basic fixture examples found in the pytest documentation work well:
@pytest.fixture
def declare_hexidict():
    hd = hexidict()
    rvc = ReferenceValueCluster()
    rv = ReferenceValue(init=3)
    hd_var = (hd, rvc, rv)
    return hd_var
def setitem_getitem(declare_hexidict):
    print('start')
    # hd = hexidict()
    # rvc = ReferenceValueCluster()
    # rv = ReferenceValue(init=3)
    hd, rvc, rv = declare_hexidict
    print('datastruct defined')
    hd[rvc("key1").reflink] = rv[0].reflink
    hd[rvc["key1"]] == rv[0]
    assert rvc["key1"] in hd.keys(), "key :{} is not in this hexidict".format(
        rvc("key1")
    )
    assert hd[rvc["key1"]] == rv[0], "key :{} return {} instead of {}".format(
        rvc["key1"], hd[rvc["key1"]], rv[0]
    )
    # set a non-value item (here we set a list)
    hd[rvc("key2").reflink] = [rv[1].reflink]
    hd[rvc["key2"]]
    assert type(hd[rvc["key2"]]) == list
    # check that the item in the list is indeed the one that comes from rv
    assert hd[rvc["key2"]][0] in rv
I get in the test summary info :
ERROR test/process/hexidict/test_hd_basic_function.py - TypeError: setitem_getitem() missing 1 required positional argument: 'declare_hexidict'
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! Interrupted: 1 error during collection !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
pytest does not recognize setitem_getitem as a test, so you should rename it to test_setitem_getitem and try again:
def test_setitem_getitem(declare_hexidict):
The problem is that your test is not detected by Pytest's test discovery.
Depending on how you execute your tests (whether you provide a full path to your test file, a path with subdirectories and multiple test files, or want to execute all tests matching a specific mark across the entire project), you will want to make sure all test modules, classes, and functions are discovered properly. By default, test files need to match test_*.py or *_test.py, classes Test*, and functions test*.
https://docs.pytest.org/en/7.1.x/explanation/goodpractices.html#conventions-for-python-test-discovery
Test discovery can also be configured to match your needs in pytest.ini.
Example pytest.ini:
[pytest]
python_files = *_pytest.py
python_functions = mytest_*
python_classes = *Tests
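With that example configuration, a file like the following would be collected even though it matches none of the default patterns (the file and function names here are made up for illustration):

# sample_pytest.py -- collected because of python_files = *_pytest.py

def mytest_addition():
    # collected because of python_functions = mytest_*
    assert 1 + 1 == 2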
I want to load multiple files into Spark DataFrames in parallel using a Luigi workflow and store them in a dictionary.
Once all the files are loaded, I want to be able to access these DataFrames from the dictionary in the main task and do further processing. This works when I run Luigi with one worker; when I run Luigi with more than one worker, the dictionary is empty in the main method.
Any suggestion will be helpful.
import luigi
from luigi import LocalTarget
from pyspark import SQLContext
from src.etl.SparkAbstract import SparkAbstract
from src.util.getSpark import get_spark_session
from src.util import getSpark, read_json
import configparser as cp
import datetime
from src.input.InputCSVFileComponent import InputCSVFile
import os
from src.etl.Component import ComponentInfo

class fileloadTask(luigi.Task):
    compinfo = luigi.Parameter()

    def output(self):
        return luigi.LocalTarget("src/workflow_output/" + str(datetime.date.today().isoformat()) + "-" + str(self.compinfo.id) + ".csv")

    def run(self):
        # this class builds the Spark dataframe object and puts it in the dictionary
        a = InputCSVFile(self.compinfo)
        a.execute()
        with self.output().open('w') as f:
            f.write("done")

class EnqueueTask(luigi.WrapperTask):
    compinfo = read_json.read_json_config('path to json file')

    def requires(self):
        folders = [
            comp.id for comp in list(self.compinfo) if comp.component_type == 'INPUTFILE'
        ]
        print(folders)
        newcominfo = []
        for index, objid in enumerate(folders):
            newcominfo.append(self.compinfo[index])
        for i in newcominfo:
            print(f" in compinfo..{i.id}")
        callmethod = [fileloadTask(compinfo) for compinfo in newcominfo]
        print(callmethod)
        return callmethod

class MainTask(luigi.WrapperTask):
    def requires(self):
        return EnqueueTask()

    def output(self):
        return luigi.LocalTarget("src/workflow_output/" + str(datetime.date.today().isoformat()) + "-" + "maintask" + ".csv")

    def run(self):
        print(f"printing mapdf..{SparkAbstract.mapDf}")
        res = not SparkAbstract.mapDf
        print("Is dictionary empty ? : " + str(res))  # ----> this is empty when workers > 1
        for key, value in SparkAbstract.mapDf.items():
            print("printing from dict")
            print(key, value.show(10))
        with self.output().open('w') as f:
            f.write("done")

"""
entry point for spark application
"""
if __name__ == "__main__":
    luigi.build([MainTask()], workers=2, local_scheduler=True)
Each worker runs in its own process. That means workers can't share Python objects (in this instance, the dictionary into which you put the results).
Generally speaking, Luigi is best at orchestrating tasks with side effects (like writing to files).
If you are trying to parallelise tasks that load data into memory, I'd recommend using dask instead of luigi, as sketched below.
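A minimal sketch of that idea with dask.delayed (the file paths and the pandas-based loader are placeholders, not taken from the question; the question's Spark loading would slot in the same way):

import dask
import pandas as pd

@dask.delayed
def load_file(path):
    # placeholder loader; returns one dataframe per input file
    return pd.read_csv(path)

paths = ["data/a.csv", "data/b.csv"]                    # hypothetical inputs
frames = dask.compute(*[load_file(p) for p in paths])   # loads in parallel
dataframes = dict(zip(paths, frames))                   # all results live in one process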
I have a Django app route that will run a pytest.main() command if some conditions are met:
def run_single_test(request, single_test_name):
    # get dict of test names, test paths
    test_dict = get_single_test_names()

    # check to see if test is in the dict
    if single_test_name in test_dict:
        for test_name, test_path in test_dict.items():
            # if testname is valid run associated test
            if test_name == single_test_name:
                os.chdir('/lib/tests/')
                run_test = pytest.main(['-v', '--json-report', test_path])
    else:
        return 'The requested test could not be found.'
I would like to include a unit test that validates run_test has been executed.
What is the best approach to doing this? Mock and unittest are new to me.
I tried messing around with stdout:
def test_run_single_test_flow_control(self):
    mock_get = patch('test_automation_app.views.get_single_test_names')
    mock_get = mock_get.start()
    mock_get.return_value = {'test_search': 'folder/test_file.py::TestClass::test'}
    results = run_single_test('this-request', 'test_search')
    output = sys.stdout
    self.assertEqual(output, '-v --json-report folder/test_file.py::TestClass::test')
but this returns:
<_pytest.capture.EncodedFile object at XXXXXXXXXXXXXX>
Here are two example tests that verify that pytest.main is invoked when a valid test name is passed and not invoked otherwise. I also added a few different invocations of mock_pytest_main.assert_called as an example; they all do pretty much the same thing, with extra checks on the args that were passed in the call. Hope this helps you write more complex tests!
from unittest.mock import patch
from test_automation_app.views import run_single_test
def test_pytest_invoked_when_test_name_valid():
    with patch('pytest.main') as mock_pytest_main, patch('test_automation_app.views.get_single_test_names') as mock_get:
        mock_get.return_value = {'test_search': 'folder/test_file.py::TestClass::test'}
        results = run_single_test('this-request', 'test_search')
        mock_pytest_main.assert_called()
        mock_pytest_main.assert_called_with(['-v', '--json-report', 'folder/test_file.py::TestClass::test'])
        mock_pytest_main.assert_called_once()
        mock_pytest_main.assert_called_once_with(['-v', '--json-report', 'folder/test_file.py::TestClass::test'])

def test_pytest_not_invoked_when_test_name_invalid():
    with patch('pytest.main') as mock_pytest_main, patch('test_automation_app.views.get_single_test_names') as mock_get:
        mock_get.return_value = {'test_search': 'folder/test_file.py::TestClass::test'}
        results = run_single_test('this-request', 'test_non_existent')
        mock_pytest_main.assert_not_called()
The code below is unit-test code for testing the original code:
import unittest
from drones import Drone, DroneStore

class DroneTest(unittest.TestCase):
    def test_add_success(self):
        drone = Drone('Drone1', 1, False)
        store = DroneStore()
        store.add(drone)
        self.assertEqual(drone, store.get(1))
        print(store.get(1).name)

    def test_add_raiseException_true(self):
        drone = Drone('Drone1', 1, False)
        store = DroneStore()
        store.add(drone)
        self.assertEqual(drone, store.get(1))
        with self.assertRaises(Exception) as context:
            store.add(drone)
        self.assertTrue('Drone already exists in store' in str(context.exception))

    def test_remove_success(self):
        drone = Drone('Drone1', 1, False)
        store = DroneStore()
        store.add(drone)
        self.assertEqual(drone, store.get(1))
        print(store.get(1).name)
        store.remove(drone)

if __name__ == '__main__':
    unittest.main()
This is the partial original code I got:
It looks like I get an error from the function named "test_add_raiseException_true". This function was created to test whether the 'add' function from the original code raises an exception when a drone already exists in the store.
Consider this basic example for the use of a with statement from Jeff Knupp's blog:
class File():
    def __init__(self, filename, mode):
        self.filename = filename
        self.mode = mode

    def __enter__(self):
        self.open_file = open(self.filename, self.mode)
        return self.open_file

    def __exit__(self, *args):
        self.open_file.close()
I have a testfile which contains two lines, 'ABC' and 'DEF'. Everything works as expected:
with File('/tmp/testfile', 'r') as f:
    txt = [x.strip() for x in f.readlines()]
print(txt)
# ['ABC', 'DEF']
Calling a class method outside the with block gives me the expected error:
f.readlines()

ValueError                                Traceback (most recent call last)
----> 1 f.readlines()

ValueError: I/O operation on closed file.
Now to my question:
How can I achieve the same behavior with a gdal object instead of a file?
I have a class method which should read data from disk and put it into a gdal raster for further processing. Once this is done, I would like to close the generated gdal raster appropriately. Normally this is done by setting it to None and calling gdal.Unlink.
However, when I put everything into a context structure as in the previous example, I can still interact with the dataset outside of the with block.
Here's a reproducible example:
from osgeo import gdal
import numpy as np

class Raster:
    '''Raster class with sidelength s'''
    def __init__(self, s):
        self.sidelength = s

    def __enter__(self):
        # create raster in memory
        driver = gdal.GetDriverByName('GTiff')
        self.raster = driver.Create('/vsimem/inmem.tif', self.sidelength, self.sidelength, 1, gdal.GDT_Float32)
        self.raster.GetRasterBand(1).WriteArray(np.random.rand(self.sidelength, self.sidelength))
        return self.raster

    def __exit__(self, *args):
        # close file and unlink
        self.raster = None
        gdal.Unlink('/vsimem/inmem.tif')
The with block works as expected:
with Raster(5) as r:
    print(r)
# <osgeo.gdal.Dataset; proxy of <Swig Object of type 'GDALDatasetShadow *' at 0x7f8078862ae0> >
But after the block the object is still there and I can still read the values:
print(r)
#<osgeo.gdal.Dataset; proxy of <Swig Object of type 'GDALDatasetShadow *' at 0x7f8078044a20> >
print(r.ReadAsArray())
#[[0.2549882 0.80292517 0.23358545 0.6284887 0.7294142 ]
# [0.9310723 0.21535267 0.9054575 0.60967094 0.9937953 ]
# [0.69144976 0.01727938 0.16800325 0.61249655 0.1785022 ]
# [0.16179436 0.43245795 0.7042811 0.4809799 0.85534436]
# [0.67751276 0.7560658 0.9594516 0.6294476 0.3539126 ]]
You cannot really close a gdal dataset while keeping a reference to it. You can keep a reference to the Raster instance instead and use r.raster to access the dataset inside the with block.
class Raster:
    '''Raster class with sidelength s'''
    def __init__(self, s):
        self.sidelength = s

    def __enter__(self):
        # create raster in memory
        driver = gdal.GetDriverByName('GTiff')
        self.raster = driver.Create('/vsimem/inmem.tif', self.sidelength, self.sidelength, 1, gdal.GDT_Float32)
        self.raster.GetRasterBand(1).WriteArray(np.random.rand(self.sidelength, self.sidelength))
        return self  # return the Raster instance, not the gdal dataset

    def __exit__(self, *args):
        # close file and unlink
        self.raster = None
        gdal.Unlink('/vsimem/inmem.tif')

with Raster(5) as r:
    print(r.raster)
Output:
<osgeo.gdal.Dataset; proxy of <Swig Object of type 'GDALDatasetShadow *' at 0x04564728> >
Outside the with block the dataset is unreachable:
r.raster is None
Output:
True
There will be problems again if you bind another variable to r.raster, so you might want to completely encapsulate it inside the Raster instance, along with any functionality you need (see the sketch below). At this point you would be more or less reinventing Rasterio, but if your needs are simple that might be better than depending on it.
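A minimal sketch of that encapsulation idea, assuming the gdal and numpy imports from above (the read method is an invented example, not part of the original answer): the instance keeps the dataset private and refuses access after the with block, mirroring the closed-file ValueError.

class Raster:
    '''Raster that only exposes its data through its own methods.'''
    def __init__(self, s):
        self.sidelength = s
        self._raster = None

    def __enter__(self):
        driver = gdal.GetDriverByName('GTiff')
        self._raster = driver.Create('/vsimem/inmem.tif', self.sidelength, self.sidelength, 1, gdal.GDT_Float32)
        self._raster.GetRasterBand(1).WriteArray(np.random.rand(self.sidelength, self.sidelength))
        return self

    def __exit__(self, *args):
        self._raster = None
        gdal.Unlink('/vsimem/inmem.tif')

    def read(self):
        # mimic file objects: fail loudly once the with block has exited
        if self._raster is None:
            raise ValueError('I/O operation on closed raster.')
        return self._raster.ReadAsArray()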
As suggested in the comments, Rasterio via rasterio.open implements the behaviour you are looking for. For example:
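A short sketch of that behaviour (the file path here is a placeholder):

import rasterio

with rasterio.open('example.tif') as src:  # placeholder path
    data = src.read(1)                     # works inside the block

src.read(1)  # raises an error outside the block: the dataset is closed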