Call function and parameters based on condition in Python - python

i have written a function in Python that reads either a .csv or .xls and return it as a pandas dataframe.
Based on the passed file_type the function uses either the pandas.read_csv() or pandas.read_excel() function with (just one slight) difference in the parameters.
It works without an issue but its obviously repeated code i would like to reduce.
So how could i best:
Have just one function call that is dynamically changed to the specific one defined by the file_type variable
Dynamically change the parameters of the then called function based on the same variable?
Here is my current code.
Thanks for your help.
def file_to_df(file_name, fields= None, file_type = None, encoding = None):
"""Read stock level from csv or xlsx file.Filter SKU and Qty.Return dataframe."""
if file_type == 'csv' or 'xls':
if file_type == 'csv':
data_frame = pd.read_csv(
file_name,
encoding = encoding,
converters={'Barcode':str,'Qty':int},
usecols=fields
)
elif file_type == 'xls':
data_frame = pd.read_excel(
file_name,
converters={'Barcode':str,'Qty':int},
usecols=fields
)
# Remove empty rows
data_frame.replace('', np_nan, inplace=True)
data_frame.dropna(axis=0, how='any', subset=None, inplace=True)
return data_frame
else:
print('no csv or xls filetype was handed to file_to_df')
For the parameters i tried using two tuples that are put into the function call.

You can modify your signature function and use the keyword-only arguments (PEP 3102). After that, create a dict of parameters, add your fixed parameters (converters), rename some parameters (fields -> usecols) and add other parameters as it:
import pandas as pd
import pathlib
def file_to_df(file_name, **kwargs):
xfile = pathlib.Path(file_name)
params = {
'converters': {'Barcode': str, 'Qty': int}, # add fixed parameters
'usecols': kwargs.pop('fields', None) # convert fields to usecols
} | kwargs # pass all other parameters as it
# determine the right function according the extension
funcs = {'.csv': pd.read_csv, '.xlsx': pd.read_excel}
try:
df = funcs[xfile.suffix](xfile, **params)
except KeyError:
raise RuntimeError('no csv or xls filetype was handed to file_to_df')
return df

Don't pass a string that has to be mapped to a particular function; just pass the correct function.
def file_to_df(file_name, fields=None, *, converter, **kwargs):
"""Read stock level from csv or xlsx file.Filter SKU and Qty.Return dataframe."""
data_frame = converter(file_name, , converters={'Barcode': str, 'Qty': int}, usecols=fields, **kwargs)
data_frame.replace('', np_nan, inplace=True)
data_frame.dropna(axis=0, how='any', subset=None, inplace=True)
return data_frame
df1 = file_to_df('foo.csv', converter=pd.read_csv)
df2 = file_to_df('foo.xlsx', converter=pd.read_excel, encoding='...')

Related

How to return calling object from mocked pandas function

I am attempting to write a test (using pytest-mock) for some code that uses pandas for I/O. Ideally instead of writing the contents to a file using pandas.to_excel() function, I would rather return the dataframe to my test function for comparison.
So my questions are:
Is what I want even possible
If so, how do I return the dataframe from the code under test to my test function
Here's my code so far.
mymodule.py
def my_module_func(input_file: Path, dbc: DBConnection) -> None:
db_data = dbc.get_from_db()
spreadsheet_data = pandas.read_excel(input_file, engine='openpyxl')
# Do some stuff to modify the spreadsheet_data dataframe
# Generate the path to the output file and write updated info to it
output_name = f"{input_file.stem}_Checked{input_file.suffix}"
output_file = input_file.parent.joinpath(output_name)
spreadsheet_data.to_excel(output_file, index=False)
test_mymodule.py
from mymodule import my_module_func
dummy_data = {'values': ['val1']}
#pytest.fixture()
def fake_file():
return Path("./mocked.xlsx")
def test_my_func(mocker, fake_file)
# Patch the db class function so we don't actually hit the db
mock_db_connector = mocker.patch('mymodule.DBConnection', autospec=True)
mock_db_connector.get_from_db.return_value = ["val1", "val2"]
# Patch the pandas read function for getting the data from the input file
mocker.patch('mymodule.pandas.read_excel', return_value=DataFrame(data=dummy_data))
# Patch the pandas to_excel function for writing data to file
# Ideally I'd like to tell this to return the dataframe instead of write it
output_patch = mocker.patch('mymodule.pandas.DataFrame.to_excel')
my_module_func(fake_file, mock_db_connector)
# End Goal
# pandas.testing.assert_frame_equal(actual, expected)

Hand over settings in a class to a pd.read_csv() function

Hi i am pretty new to python. I developed the following class:
import pandas as pd
import os
class Class1:
def __init__(self, path, cols = None, settings = {"sep" : ";", "encoding" : "unicode_escape", "header" : "infer", "decimal" :"."
, "skiprows" : None, "names" : None, "skipfooter" : 0, "engine" : "python"} ):
self.raw = self._load_raw(path = path, s = settings, cols = cols)
def _load_raw(self, path, s, cols = None):
df = pd.read_csv(path, sep = s["sep"], encoding = s["encoding"], decimal = s["decimal"], skiprows = s["skiprows"], skipfooter = s["skipfooter"]
, engine = s["engine"], header = s["header"], names = s["names"], usecols = cols)
return df
Inside of the class is a function which reads a csv file into a pd.DataFrame. I am wondering if there is a smart way of developing the class without handing over such a setting dictionary to read the dataframe later on when creating an object. Lets suppose the csv file is much more easy and just need 1 argument e.g. "sep" and not all the other arguments, but then the class needs also to be able to read csv files which require more arguments. Is there a pythonic way to just hand over as many as required ?
for example 1 object just needs "sep", and another object neeeds all of the settings parameters defined in the docs of pd.read_csv for example, but they can be both created with the same class

Class that returns a transformed dataframe

I'm trying to create a class that takes the path and name of the CSV file, converts it to a dataframe, deletes some columns, converts another one to datetime, as in the code
import os
from pathlib import Path
import pandas as pd
import datetime
class Plans:
def __init__(self, file , path):
self.file = file
self.path = path
self.df = pd.Dataframe()
def get_dataframe(self):
os.chdir(self.path)
self.df = pd.read_csv(self.file, encoding="latin-1", low_memory=False, sep=';')
if 'data' in df.columns:
self.tipo = 'sales'
self.df['data'] = pd.to_datetime(df['data'])
return clean_unused_data()
def clean_unused_data(self):
columns = ['id', 'docs', 'sequence','data_in','received', 'banc', 'return', 'status', 'return_cod',
'bank_account_return', 'id_transcript', 'id_tx','type_order']
for item in columns:
del self.df[item]
del columns[:]
return self.df
When I call an object of the class it gives an error with the clean_unused_data function
returns the following error:
__getattr__ raise AttributeError(f"module 'pandas' has no attribute '{name}'")
Also, I would like to do more dataframe transformations in the Plans class. but since this first one failed, I was a little lost.
Thanks for the help and I apologize for the lack of intimacy with python
I think the error refers to calling an attribute that does not exist in Pandas. From what I can see you wrote pd.DataFrame as pd.Dataframe. Notice the capitalization.
Try the following:
def __init__(self, file , path):
self.file = file
self.path = path
self.df = pd.DataFrame()
Probably one of the columns you are trying to delete is not actually in your file. You can handle the exception or remove this column label from your array.

TypeError: __init__() missing 1 required positional argument: 'df'

I want to call df["ID"] in the dataset_csv function and then call the dataset_csv function using dataset = RawToCSV.dataset_csv(input_path). df["ID"] was defined in the raw_file_processing function.
My code raised TypeError: __init__() missing 1 required positional argument: 'df' error.
import re
import pandas as pd
import os
import numpy as np
input_path = "../input_data"
class RawToCSV:
def __init__(self, path_, df):
self.measurement_df = None
self.cls = None
self.path_ = path_
self.df = df
def raw_file_processing(self, path_):
# Open all the subfolders within path
for root, dirs, files in os.walk(path_):
for file in files:
with open(os.path.join(root, file), "r") as data:
self.df = pd.read_csv(data)
# 'Class' refers to the independent variable
cls_info = self.df.iloc[2]
# Dummy-code the classes
cls = pd.get_dummies(cls_info)
# Create the ID series by concatenating columns 1-3
self.df = self.df.assign(
ID=self.df[['cell_id:cell_id', 'region:region', 'tile_num:tile_num']].apply(
lambda row: '_'.join([str(each) for each in row]), axis=1))
self.df = self.df.drop(columns=['cell_id:cell_id', 'region:region', 'tile_num:tile_num'])
# Obtain measurement info
# Normalize data against blank/empty columns
# log-transform the data
for col in self.df[9:]:
if re.findall(r"Blank|Empty", col):
background = col
else:
line = col.readline()
for dat in line:
norm_data = dat / background
self.measurement_df = np.log2(norm_data)
return self.df["ID"], cls, self.measurement_df
def dataset_csv(self):
"""Col 1: ID
Col 2: class
Col 3-n: measurements"""
ids = self.df["ID"]
id_col = ids.to_frame()
cls_col = self.cls.to_frame()
frames = [id_col, cls_col, self.measurement_df]
dataset_df = pd.concat(frames)
data_csv = dataset_df.to_csv("../input_data/dataset.csv")
return data_csv
raw = RawToCSV(input_path)
three_tuple = raw.raw_file_processing(input_path)
dataset = raw.data_csv()
Traceback:
> --------------------------------------------------------------------------- TypeError Traceback (most recent call
> last) /tmp/ipykernel_136/323215226.py in <module>
> ----> 1 raw = RawToCSV(input_path)
> 2 three_tuple = raw.raw_file_processing(input_path)
>
> TypeError: __init__() missing 1 required positional argument: 'df'
In this part of code:
dataset = RawToCSV.dataset_csv(input_path)
You are using the class itself, however you should first instantiate from the class RawToCSV, like this:
rawToCSV = RawTOCSV(input_path)
dataset = rawToCSV.data_csv()
But still you have another mistake ,too. In the constructor of the class , __init__ you've initiated the self.df with self.df, which the latter one hasn't been defined ,yet.
Therefore in this part of code, you'll get another error (AttributeError: 'RawToCSV' object has no attribute 'df'):
def __init__(self, path_):
self.measurement_df = None
self.cls = None
self.path_ = path_
self.df = self.df # <-----
On this line:
dataset = RawToCSV.dataset_csv(input_path)
you're calling dataset_csv as if it were a static method (calling it on the class not an instance). You are passing in input_path, which I assume is a string. Since you're calling the method as if it were static, it is not invisibly adding the actual self value into the call (you have to have an object to even be sent as self).
This means that your one parameter of dataset_csv, which you named self, is receiving the (string) value of input_path.
The error message is telling you that the string input_path has no member .df because it doesn't.
With the way your class and its methods are currently set up, you'll need your entry point code at the bottom to be something like this:
raw = RawToCSV(input_path)
three_tuple = raw.raw_file_processing(input_path)
dataset = raw.dataset_csv()
Though, you may want to restructure your class and its methods

How to compare an installation path with a wildcarded application name in python

I want to traverse a dataframe in python and, by this, to get rid of the full installation path. So only the application name, e.g. firefox.exe or firefox shall be explicated and not "C:\Program Files\Mozilla Firefox\firefox.exe".
I created a function which takes the respective dataframe and a dcitonary containing key value pairs with a application name as key (firefox) and the wildcarded path as value (*firefox.exe).
def transform_process_name(mid_result_df, name_dict):
for row, col in mid_result_df.iterrows():
for name, value in name_dict.items():
print(name)
print(value)
if col['ProcessName'] == value:
col['ProcessName'] = name
return mid_result_df
this is called in a function as follows:
transform_process_name(mid_result_df, __name_of_processes)
where
__name_of_processes =
{
'firefox': '*firefox.exe',
}
The comparison does not work. so the output is still "C:\Program Files\Mozilla Firefox\firefox.exe" and not "firefox".
Thank you :)
EDIT:
It now works with
def transform_process_name(mid_result_df, name_dict):
for row, col in mid_result_df.iterrows():
for name, value in name_dict.items():
string_check = col['ProcessName']
if string_check.endswith(value):
mid_result_df.at[row, 'ProcessName'] = name
return mid_result_df
I don't think you can use == to compare "C:\Program Files\Mozilla Firefox\firefox.exe" and "*firefox.exe". I think you probably want to replace
if col['ProcessName'] == value:
with
if value in col['ProcessName']:
or
if col['ProcessName'].endswith(value):
and value should be 'firefox.exe'.
EDIT:
This is my code:
import pandas as pd
def transform_process_name(mid_result_df, name_dict):
for row, col in mid_result_df.iterrows():
for name, value in name_dict.items():
if value in col['ProcessName']:
# if col['ProcessName'].endswith(value):
col['ProcessName'] = name
else:
col['ProcessName'] = col['ProcessName']
return mid_result_df
mid_result_df = pd.DataFrame({'ProcessName': ['C:\\Program Files\\Mozilla Firefox\\firefox.exe']})
name_dict = {'firefox': 'firefox.exe'}
result = transform_process_name(mid_result_df, name_dict)
print(result)
Maybe I did not understand well the problem, but if you have a dataframe containing the paths to each application, why don't you use the native functions in the os module to extract the name of the application themselves?
For example:
import pandas as pd
import os
def get_application(path):
return os.path.splitext(os.path.basename(path))[0]
path = "C:/Program Files/Mozilla Firefox/"
df = pd.DataFrame([os.path.join(path + "firefox.exe"), os.path.join(path + "myapp.exe")],
columns=["full_path"])
df["application"] = df["full_path"].apply(get_application)
print(df)
Here, I created a dummy dataframe with paths to the applications. Then I define a function that given a path, returns only the app name (basename) without the dot (splitext).
The apply method applies the function to each element of the dataframe, and the output is stored in a new column.
Result:
full_path application
0 C:/Program Files/Mozilla Firefox/firefox.exe firefox
1 C:/Program Files/Mozilla Firefox/myapp.exe myapp

Categories