How to return calling object from mocked pandas function - python

I am attempting to write a test (using pytest-mock) for some code that uses pandas for I/O. Ideally, instead of writing the contents to a file with the pandas to_excel() function, I would rather have the dataframe returned to my test function for comparison.
So my questions are:
Is what I want even possible?
If so, how do I return the dataframe from the code under test to my test function?
Here's my code so far.
mymodule.py
from pathlib import Path

import pandas

# NOTE: DBConnection is imported from elsewhere in the real project (source not shown)

def my_module_func(input_file: Path, dbc: DBConnection) -> None:
    db_data = dbc.get_from_db()
    spreadsheet_data = pandas.read_excel(input_file, engine='openpyxl')
    # Do some stuff to modify the spreadsheet_data dataframe
    # Generate the path to the output file and write updated info to it
    output_name = f"{input_file.stem}_Checked{input_file.suffix}"
    output_file = input_file.parent.joinpath(output_name)
    spreadsheet_data.to_excel(output_file, index=False)
test_mymodule.py
from pathlib import Path

import pytest
from pandas import DataFrame

from mymodule import my_module_func

dummy_data = {'values': ['val1']}

@pytest.fixture()
def fake_file():
    return Path("./mocked.xlsx")

def test_my_func(mocker, fake_file):
    # Patch the db class function so we don't actually hit the db
    mock_db_connector = mocker.patch('mymodule.DBConnection', autospec=True)
    mock_db_connector.get_from_db.return_value = ["val1", "val2"]
    # Patch the pandas read function for getting the data from the input file
    mocker.patch('mymodule.pandas.read_excel', return_value=DataFrame(data=dummy_data))
    # Patch the pandas to_excel function for writing data to file
    # Ideally I'd like to tell this to return the dataframe instead of write it
    output_patch = mocker.patch('mymodule.pandas.DataFrame.to_excel')
    my_module_func(fake_file, mock_db_connector)
    # End Goal
    # pandas.testing.assert_frame_equal(actual, expected)
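One way to get there (a sketch based on mock's documented autospec behaviour, not on anything from the original post): to_excel cannot easily be made to "return" the dataframe, but patching it with autospec=True makes the mock record the bound instance, i.e. the dataframe itself, as the first positional argument of the call. The test can then pull it out of call_args:

import pandas
from pandas import DataFrame

def test_my_func(mocker, fake_file):
    mock_db_connector = mocker.patch('mymodule.DBConnection', autospec=True)
    mock_db_connector.get_from_db.return_value = ["val1", "val2"]
    mocker.patch('mymodule.pandas.read_excel', return_value=DataFrame(data=dummy_data))
    # autospec=True makes the mock record 'self' (the dataframe) in call_args
    output_patch = mocker.patch('mymodule.pandas.DataFrame.to_excel', autospec=True)
    my_module_func(fake_file, mock_db_connector)
    actual = output_patch.call_args[0][0]  # the dataframe to_excel was called on
    expected = DataFrame(data=dummy_data)  # adjust for whatever the "do some stuff" step changes
    pandas.testing.assert_frame_equal(actual, expected)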

Related

Call function and parameters based on condition in Python

I have written a function in Python that reads either a .csv or an .xls file and returns it as a pandas dataframe.
Based on the passed file_type, the function uses either the pandas.read_csv() or pandas.read_excel() function, with (just one slight) difference in the parameters.
It works without issue, but it is obviously repeated code that I would like to reduce.
So how could I best:
Have just one function call that is dynamically changed to the specific one defined by the file_type variable?
Dynamically change the parameters of the then-called function based on the same variable?
Here is my current code.
Thanks for your help.
import numpy as np
import pandas as pd

def file_to_df(file_name, fields=None, file_type=None, encoding=None):
    """Read stock level from csv or xlsx file. Filter SKU and Qty. Return dataframe."""
    if file_type in ('csv', 'xls'):
        if file_type == 'csv':
            data_frame = pd.read_csv(
                file_name,
                encoding=encoding,
                converters={'Barcode': str, 'Qty': int},
                usecols=fields
            )
        elif file_type == 'xls':
            data_frame = pd.read_excel(
                file_name,
                converters={'Barcode': str, 'Qty': int},
                usecols=fields
            )
        # Remove empty rows
        data_frame.replace('', np.nan, inplace=True)
        data_frame.dropna(axis=0, how='any', subset=None, inplace=True)
        return data_frame
    else:
        print('no csv or xls filetype was handed to file_to_df')
For the parameters, I tried using two tuples that are put into the function call.
You can modify your function signature to use keyword-only arguments (PEP 3102). After that, create a dict of parameters: add your fixed parameters (converters), rename some parameters (fields -> usecols), and pass all other parameters through as is:
import pandas as pd
import pathlib

def file_to_df(file_name, **kwargs):
    xfile = pathlib.Path(file_name)
    params = {
        'converters': {'Barcode': str, 'Qty': int},  # add fixed parameters
        'usecols': kwargs.pop('fields', None)        # convert fields to usecols
    } | kwargs  # pass all other parameters as is
    # determine the right function according to the extension
    funcs = {'.csv': pd.read_csv, '.xlsx': pd.read_excel}
    try:
        df = funcs[xfile.suffix](xfile, **params)
    except KeyError:
        raise RuntimeError('no csv or xls filetype was handed to file_to_df')
    return df
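A quick usage sketch (the file names here are hypothetical); note that the dict-merge operator | requires Python 3.9+:

df1 = file_to_df('stock.csv', fields=['Barcode', 'Qty'], encoding='utf-8')
df2 = file_to_df('stock.xlsx', fields=['Barcode', 'Qty'])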
Don't pass a string that has to be mapped to a particular function; just pass the correct function.
import numpy as np
import pandas as pd

def file_to_df(file_name, fields=None, *, converter, **kwargs):
    """Read stock level from csv or xlsx file. Filter SKU and Qty. Return dataframe."""
    data_frame = converter(file_name, converters={'Barcode': str, 'Qty': int}, usecols=fields, **kwargs)
    data_frame.replace('', np.nan, inplace=True)
    data_frame.dropna(axis=0, how='any', subset=None, inplace=True)
    return data_frame

df1 = file_to_df('foo.csv', converter=pd.read_csv)
df2 = file_to_df('foo.xlsx', converter=pd.read_excel, encoding='...')

How to query an external csv file uploaded through StringIO in S3 in Snowflake with the right format?

I've written these Python methods in my custom Airflow operator to convert a dictionary to a dataframe, then to a StringIO object, and upload it to S3 as a CSV file without saving it locally.
def execute(self, context):
    s3_hook = S3Hook(aws_conn_id=self.s3_conn_id)
    retailer, d1 = context['task_instance'].xcom_pull(self.data_source)
    self._upload_file(d1, retailer, s3_hook)

def _upload_to_s3(self, df, s3_hook):
    csv_buffer = StringIO()
    df.to_csv(csv_buffer)
    s3_hook.load_string(string_data=csv_buffer.getvalue(),
                        key=self.s3_key,
                        bucket_name=self.s3_bucket,
                        replace=True)

def _upload_file(self, d, retailer, s3_hook):
    self.s3_key = f"S3_STAGING/{retailer}/{retailer}_summary.csv"
    df = pd.DataFrame.from_dict(d, orient="index")
    df.index.name = 'product_code'
    self._upload_to_s3(df, s3_hook)
The DAG runs and uploads the file successfully, and the file looks normal when using S3 query on it. But when I try to query it in Snowflake:
select t.$1 as description,
       t.$2 as parent_company
from @S3_STAGING/S3_STAGING/sample/sample_summary.csv as t

All columns are concatenated into one for some reason. Is there any way to fix this?
Can you check if you defined a specific field_delimiter for the stage? To be sure, you can create a file format and use it:

create file format myformat type = 'csv';

select t.$1 as description,
       t.$2 as parent_company
from @S3_STAGING/S3_STAGING/sample/sample_summary.csv (file_format => 'myformat') t;
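On the Python side, for reference, a small sketch (with made-up data) of what the uploaded buffer contains with the operator code above: to_csv defaults to a comma delimiter and writes the named index as the first column, so the stage or file format delimiter has to match.

import pandas as pd
from io import StringIO

df = pd.DataFrame.from_dict({'A1': {'description': 'foo', 'parent_company': 'bar'}}, orient='index')
df.index.name = 'product_code'
csv_buffer = StringIO()
df.to_csv(csv_buffer)
print(csv_buffer.getvalue())
# product_code,description,parent_company
# A1,foo,bar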

How can I mock pathlib.Path.open and pathlib.Path.unlink with the same syntax?

I use pathlib.Path.open() and pathlib.Path.unlink() in my productive code. The unit tests for that work, but I use two different ways to patch(): one with the @patch decorator and one with a context manager, mock.patch().
I would like to use @patch only, like this:
class MyTest(unittest.TestCase):
    @mock.patch('pathlib.Path.unlink')
    @mock.patch('pathlib.Path.open')
    def test_foobar(self, mock_open, mock_unlink):
But the real code currently looks like this:
import unittest
from unittest import mock
import pathlib

class MyTest(unittest.TestCase):
    @mock.patch('pathlib.Path.unlink')
    def test_foobar(self, mock_unlink):
        # simulated CSV file
        opener = mock.mock_open(read_data='A;B\n1;2')
        with mock.patch('pathlib.Path.open', opener):
            result = validate_csv(file_path=pathlib.Path('foo.csv'))
        self.assertTrue(result)
Technically, my problem is that I do not know how to add my CSV content to mock_open when using the @patch decorator.
It could look like this:
class MyTest(unittest.TestCase):
    @mock.patch('pathlib.Path.open')
    @mock.patch('pathlib.Path.unlink')
    def test_foobar(self, mymock_unlink, mymock_open):
        # simulated CSV file
        opener = mock.mock_open(read_data='A;B\n1;2')
        # QUESTION: How do I bring 'opener' and 'mymock_open'
        # together now?
        result = validate_csv(file_path=pathlib.Path('foo.csv'))
        self.assertTrue(result)
But the goal of my question is to improve readability and maintainability of the code. Using two decorators would reduce the indentation. Choosing one way (decorators or context managers) would IMHO be easier to read.
For learning purposes
Q: How do I bring 'opener' and 'mymock_open' together now?
A: Assign side_effect and return_value of mymock_open to those of opener.
@mock.patch('pathlib.Path.open')
@mock.patch('pathlib.Path.unlink')
def test_foobar(self, mymock_unlink, mymock_open):
    # simulated CSV file
    opener = mock.mock_open(read_data='A;B\n1;2')
    # QUESTION: How do I bring 'opener' and 'mymock_open'
    # together now?
    mymock_open.side_effect = opener.side_effect      # +
    mymock_open.return_value = opener.return_value    # +
    result = validate_csv(file_path=pathlib.Path('foo.csv'))
    opener.assert_not_called()          # +
    mymock_open.assert_called_once()    # +
    mymock_unlink.assert_called_once()  # +
    self.assertTrue(result)
But this is hardly a readability improvement.
Both using decorators
@mock.patch('pathlib.Path.open', new_callable=lambda: mock.mock_open(read_data='A;B\n1;2'))  # +
@mock.patch('pathlib.Path.unlink')
def test_foobar(self, mock_unlink, mock_open):
    result = validate_csv(file_path=pathlib.Path('foo.csv'))
    mock_open.assert_called_once()    # +
    mock_unlink.assert_called_once()  # +
    self.assertTrue(result)
Passing just mock.mock_open(read_data='A;B\n1;2') (as positional argument new) instead of new_callable=lambda: ... works too, but then @mock.patch won't pass mock_open to test_foobar.
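For illustration, a short sketch of that positional-argument variant (using the same validate_csv shown under "Verifying the solutions" below); since the mock is created inline, only mock_unlink is injected into the test:

@mock.patch('pathlib.Path.open', mock.mock_open(read_data='A;B\n1;2'))
@mock.patch('pathlib.Path.unlink')
def test_foobar(self, mock_unlink):  # no mock_open argument is passed in
    result = validate_csv(file_path=pathlib.Path('foo.csv'))
    mock_unlink.assert_called_once()
    self.assertTrue(result)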
Both using context managers
def test_foobar(self):
    # simulated CSV file
    opener = mock.mock_open(read_data='A;B\n1;2')
    with mock.patch('pathlib.Path.unlink') as mock_unlink, \
         mock.patch('pathlib.Path.open', opener) as mock_open:  # +
        self.assertIs(mock_open, opener)  # +
        result = validate_csv(file_path=pathlib.Path('foo.csv'))
    mock_open.assert_called_once()    # +
    mock_unlink.assert_called_once()  # +
    self.assertTrue(result)
Notice that mock_open is the same instance as opener.
Verifying the solutions
Sample implementation of validate_csv for a minimal, reproducible example:
def validate_csv(file_path):
    """
    :param pathlib.Path file_path:
    :rtype: bool
    """
    with file_path.open() as f:
        data = f.read()
    file_path.unlink()
    return data == 'A;B\n1;2'

How to use a pandas data frame in a unit test

I am developing a set of Python scripts to pre-process a dataset and then produce a series of machine learning models using scikit-learn. I would like to develop a set of unit tests to check the data pre-processing functions, and I would like to be able to use a small test pandas dataframe for which I can determine the answers and use it in assert statements.
I cannot seem to get it to load the dataframe and to pass it to the unit tests using self. My code looks something like this;
import unittest
import pandas as pd

class MyTests(unittest.TestCase):  # enclosing TestCase, implied by the use of self
    def setUp(self):
        TEST_INPUT_DIR = 'data/'
        test_file_name = 'testdata.csv'
        try:
            data = pd.read_csv(TEST_INPUT_DIR + test_file_name,
                               sep=',',
                               header=0)
        except IOError:
            print('cannot open file')
        self.fixture = data

    def tearDown(self):
        del self.fixture

    def test1(self):
        self.assertEqual(somefunction(self.fixture), somevalue)

if __name__ == '__main__':
    unittest.main()
Thanks for the help.
Pandas has some utilities for testing.
import unittest
import pandas as pd
from pandas.util.testing import assert_frame_equal  # <-- for testing dataframes

class DFTests(unittest.TestCase):
    """ class for running unittests """

    def setUp(self):
        """ Your setUp """
        TEST_INPUT_DIR = 'data/'
        test_file_name = 'testdata.csv'
        try:
            data = pd.read_csv(TEST_INPUT_DIR + test_file_name,
                               sep=',',
                               header=0)
        except IOError:
            print('cannot open file')
        self.fixture = data

    def test_dataFrame_constructedAsExpected(self):
        """ Test that the dataframe read in equals what you expect """
        foo = pd.DataFrame()
        assert_frame_equal(self.fixture, foo)
If you are using a recent version of pandas, I think the following way is a bit cleaner:
import pandas as pd
pd.testing.assert_frame_equal(my_df, expected_df)
pd.testing.assert_series_equal(my_series, expected_series)
pd.testing.assert_index_equal(my_index, expected_index)
Each of these functions will raise AssertionError if they are not "equal".
For more information and options: https://pandas.pydata.org/pandas-docs/stable/reference/general_utility_functions.html#testing-functions
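For instance, a minimal (made-up) check that passes silently when the frames match and raises AssertionError when they don't:

import pandas as pd

my_df = pd.DataFrame({'values': ['val1']})
expected_df = pd.DataFrame({'values': ['val1']})

# Passes silently because the frames match
pd.testing.assert_frame_equal(my_df, expected_df)

# Would raise AssertionError because the values differ:
# pd.testing.assert_frame_equal(my_df, pd.DataFrame({'values': ['val2']}))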
You could do something like this as well with snapshottest.
https://stackoverflow.com/a/64070787/3384609

pandas: Writing to an existing excel file (xlsx) using to_excel

I have a simple use case for df.to_excel() that I'm struggling with. I want to write to a specific worksheet tab (let's call it "Data") of an existing XLSX workbook, which could be referenced by formulas and pivots on other tabs.
I've tried to modify ExcelWriter in two ways but both produce errors from openpyxl.
Read an existing sheet using get_sheet_by_name (This errors: "NotImplementedError: use 'iter_rows()' instead".)
Create a new sheet using create_sheet. (This errors:"ReadOnlyWorkbookException: Cannot create new sheet in a read-only workbook")
df = DataFrame()
from openpyxl.reader.excel import load_workbook
book = load_workbook('my_excel_file.xlsx', use_iterators=True)  # Assume my_excel_file.xlsx contains a sheet called 'Data'

class temp_excel_writer(ExcelWriter):  # I need this to inherit the other methods of ExcelWriter in io/parsers.py
    def __init__(self, path, book):
        self.book = book
        test_sheet = self.book.create_sheet(title='Test')  # This errors: ReadOnlyWorkbookException
        self.use_xlsx = True
        self.sheet_names = self.book.get_sheet_names()
        self.actual_sheets = self.book.worksheets
        self.sheets = {}
        for i, j in enumerate(self.sheet_names):
            self.sheets[j] = (self.actual_sheets[i], 1)
        self.cur_sheet = None
        self.path = save

my_temp_writer = temp_excel_writer('my_excel_file.xlsx', book)
df.to_excel(my_temp_writer, sheet_name='Data')
Any thoughts? Am I missing something obvious? I'm still on pandas 0.7.2.
When you load your workbook with use_iterators=True, load_workbook() calls _set_optimized_read() on the Workbook object, which causes it to be loaded read-only.
Thus, with the following code:
from openpyxl.reader.excel import load_workbook
book = load_workbook('t.xlsx', use_iterators=False)  # Assume t.xlsx contains ['Data', 'Feuil2', 'Feuil3']
print(book.get_sheet_names())

class temp_excel_writer():
    def __init__(self, path, book):
        self.book = book
        test_sheet = self.book.create_sheet(title='Test')  # No exception here now
        self.book.save(path)
        self.use_xlsx = True
        self.sheet_names = self.book.get_sheet_names()
        print(self.sheet_names)
        self.actual_sheets = self.book.worksheets
        self.sheets = {}
        for i, j in enumerate(self.sheet_names):
            self.sheets[j] = (self.actual_sheets[i], 1)
        self.cur_sheet = None
        self.path = path  # I had to modify this line also

my_temp_writer = temp_excel_writer('my_excel_file.xlsx', book)
It creates a file named my_excel_file.xlsx and produces the following output:
['Data', 'Feuil2', 'Feuil3']
['Data', 'Feuil2', 'Feuil3', 'Test']
Hope it helps
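As an aside, a minimal sketch of the same task on modern pandas (1.3+), where ExcelWriter can append to an existing workbook directly (assuming my_excel_file.xlsx already exists and openpyxl is installed):

import pandas as pd

df = pd.DataFrame({'values': ['val1']})
# mode='a' opens the existing workbook; if_sheet_exists='replace' rewrites the 'Data' tab
with pd.ExcelWriter('my_excel_file.xlsx', engine='openpyxl', mode='a', if_sheet_exists='replace') as writer:
    df.to_excel(writer, sheet_name='Data', index=False)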
