What parameters should I pass to load a dataframe using this class?
class SafeLoadExcel:
    """Load a ``.csv`` or ``.xlsx`` file into a pandas DataFrame.

    The reader is chosen from the file extension when the object is
    created; ``load()`` performs the actual read.
    """

    def __init__(self, file_name):
        """Remember ``file_name`` and select the matching pandas reader.

        Raises:
            ValueError: if the extension is neither ``.csv`` nor ``.xlsx``.
        """
        self.file = file_name
        # Compare case-insensitively so "DATA.CSV" is accepted as well.
        extension = Path(file_name).suffix.lower()
        if extension == '.csv':
            self.load_f = pd.read_csv
        elif extension == '.xlsx':
            self.load_f = pd.read_excel
        else:
            # Without this branch ``load_f`` was never assigned and the
            # failure only surfaced later, in ``load()``, as an AttributeError.
            raise ValueError("File must be csv or xlsx")

    def load(self):
        """Read the file and return the DataFrame.

        The default decoding is tried first; on ``UnicodeDecodeError`` the
        read is repeated with ``encoding='iso-8859-1'``.
        """
        try:
            df = self.load_f(self.file)
        except UnicodeDecodeError:
            df = self.load_f(self.file, encoding='iso-8859-1')
        return df
Assuming there are only 2 methods on this class, you can get a df by
df = SafeLoadExcel("some/file/name.xlsx").load()
And that's a good hint that this should not have been a class in the first place. It would make as much sense to do
def safe_load_excel(file_name):
    """Load a ``.csv`` or ``.xlsx`` file into a pandas DataFrame.

    The default decoding is tried first; on ``UnicodeDecodeError`` the read
    is repeated with ``encoding='iso-8859-1'``.

    Raises:
        ValueError: if the extension is neither ``.csv`` nor ``.xlsx``.
    """
    extension = Path(file_name).suffix
    if extension == '.csv':
        load_f = pd.read_csv
    elif extension == '.xlsx':
        load_f = pd.read_excel
    else:
        raise ValueError("File must be csv or xlsx")
    # This is a plain function, so there is no ``self``; read ``file_name``.
    try:
        return load_f(file_name)
    except UnicodeDecodeError:
        return load_f(file_name, encoding='iso-8859-1')
Assuming SafeLoadExcel provides some other methods for working with the resulting data frame, I would define two separate class methods, and maybe a third that detects which of the first two to use depending on extension present in the file name. SafeLoadExcel.__init__ itself is reserved for when you already have a dataframe, regardless of its original source.
class SafeLoadExcel:
    """Wrapper around a DataFrame with alternate constructors for files.

    ``__init__`` takes an already-loaded DataFrame; use ``from_csv``,
    ``from_excel`` or ``from_file`` to build an instance from a file name.
    """

    def __init__(self, df):
        self.df = df

    @classmethod
    def _from_file(cls, filename, reader, **reader_kwargs):
        """Load a dataframe from a file.

        filename - name of file to read
        reader - a function to parse the file and return a dataframe
        reader_kwargs - extra keyword arguments forwarded to ``reader``
        """
        return cls(reader(filename, **reader_kwargs))

    # CSV-specific wrapper around _from_file
    @classmethod
    def from_csv(cls, filename):
        """Build an instance from a CSV file decoded as ISO-8859-1."""
        return cls._from_file(filename, pd.read_csv, encoding='iso-8859-1')

    # Excel-specific wrapper around _from_file
    @classmethod
    def from_excel(cls, filename):
        """Build an instance from an Excel workbook."""
        # pd.read_excel takes no ``encoding`` argument, so none is passed.
        return cls._from_file(filename, pd.read_excel)

    # Not to be confused with the private method with a similar name.
    # Detect which public class method to use.
    @classmethod
    def from_file(cls, filename):
        """Dispatch to ``from_csv`` or ``from_excel`` based on the extension.

        Raises:
            ValueError: if ``filename`` ends with neither ``.csv`` nor ``.xlsx``.
        """
        if filename.endswith('.csv'):
            f = cls.from_csv
        elif filename.endswith('.xlsx'):
            f = cls.from_excel
        else:
            raise ValueError(f"Cannot determine type of {filename}")
        return f(filename)
Then you can use
f1 = SafeLoadExcel.from_csv("foo.csv")
f2 = SafeLoadExcel.from_excel("foo.xlsx")
f3 = SafeLoadExcel.from_file("foo.csv")
f4 = SafeLoadExcel.from_file("foo.xlsx")
f1 and f3 should be effectively the same thing, as should be f2 and f4.
Related
I am trying to use my code to open a file after searching for it on either operating system. However, when I assign the variable inside the function, I can't use it outside of the function. And when I keep the second function outside of the first one, Python doesn't recognize the function.
I tried to declare df_location as global, but this doesn't work.
When I use df = pd.read_csv(df_location[0], index_col=0) inside the function, I am not able to use df anywhere else in my code.
if platform.system() == 'windows':
def find_file(root_folder, rex):
for root, dirs, files in os.walk(root_folder):
for f in files:
result = rex.search(f)
if result:
file_path = os.path.join(root, f)
return file_path
def find_file_in_all_drives(file_name):
matching_files = list()
# create a regular expression for the file
rex = re.compile(file_name)
for drive in win32api.GetLogicalDriveStrings().split('\000')[:-1]:
file_path = find_file(drive, rex)
if file_path:
matching_files.append(file_path)
return matching_files
global df_location
df_location = find_file_in_all_drives("AB_NYC_2019.csv")
if platform.system() == 'mac':
df_location = find_file("/", "AB_NYC_2019.csv")
df = pd.read_csv(df_location[0], index_col=0)
I would like to be able to use the file that is retrieved through the functions.
Thank you!
ideally it should be like this
if platform.system() == 'windows':
def find_file(root_folder, rex):
for root, dirs, files in os.walk(root_folder):
for f in files:
result = rex.search(f)
if result:
file_path = os.path.join(root, f)
return file_path
def find_file_in_all_drives(file_name):
matching_files = list()
# create a regular expression for the file
rex = re.compile(file_name)
for drive in win32api.GetLogicalDriveStrings().split('\000')[:-1]:
file_path = find_file(drive, rex)
if file_path:
matching_files.append(file_path)
return matching_files
df_location = find_file_in_all_drives("AB_NYC_2019.csv")
if platform.system() == 'mac':
df_location = find_file("/", "AB_NYC_2019.csv")
df = pd.read_csv(df_location[0], index_col=0)
but this gives the error message:
"NameError: name 'find_file_in_all_drives' is not defined"
You define find_file_in_all_drives only for Windows, but you should also define it for the other systems - each system will need different code inside find_file_in_all_drives. Then you can call find_file_in_all_drives on every system.
# all systems use it so it should be defined for all
def find_file(root_folder, rex):
    """Return the path of the first file under ``root_folder`` matching ``rex``.

    ``rex`` is a compiled regular expression that is searched against each
    bare file name (not the full path). ``None`` is returned when the walk
    finishes without a match.
    """
    for current_dir, _subdirs, names in os.walk(root_folder):
        hit = next((name for name in names if rex.search(name)), None)
        if hit is not None:
            return os.path.join(current_dir, hit)
    return None
# define different `find_file_in_all_drives` for different systems
# platform.system() reports 'Windows', 'Darwin' (macOS), 'Linux', ... so the
# comparison has to use that exact spelling.
if platform.system() == 'Windows':
    def find_file_in_all_drives(file_name):
        """Search every logical drive; return a list of matching paths.

        ``file_name`` is compiled as a regular expression. At most one match
        per drive is collected, because ``find_file`` stops at its first hit.
        """
        # create a regular expression for the file
        rex = re.compile(file_name)
        matching_files = []
        for drive in win32api.GetLogicalDriveStrings().split('\000')[:-1]:
            file_path = find_file(drive, rex)
            if file_path:
                matching_files.append(file_path)
        return matching_files
else:
    def find_file_in_all_drives(file_name):
        """Search from the filesystem root; return a list of matching paths.

        Mirrors the Windows variant: ``file_name`` is compiled as a regular
        expression and the result is always a list (empty when nothing is
        found), so callers can index it the same way on every system.
        """
        file_path = find_file("/", re.compile(file_name))
        return [file_path] if file_path else []
# now you can use `find_file_in_all_drives` on every system
df_location = find_file_in_all_drives("AB_NYC_2019.csv")
df = pd.read_csv(df_location[0], index_col=0)
You didn't show all your code. Presumably, you have find_file and find_file_in_all_drives function implementations for mac as well, yes? At least that's what I would expect just from looking at the code you've posted.
If that really is ALL the code you have, then the way it's written now, you're only defining find_file and find_file_in_all_drives if platform.system() returns "windows" (side note: just tried this, on my Windows 7 system it returns "Windows" with a capital 'W'.) If that condition is not satisfied these function definitions are not visible anywhere else in your code, because you've put them inside the body of the if-statement.
It looks like you are trying to get different behavior depending on the contents of a string (platform.system()). Since you can't avoid having to implement the varying behavior for both operating systems, you can use polymorphism for this:
import abc
class DataFrameFinder(abc.ABC):
    """Interface every platform-specific finder has to implement.

    Both methods are abstract, so the base class itself cannot be
    instantiated and a subclass must override both.
    """

    @abc.abstractmethod
    def find_file(self, root_folder, rex):
        """Look for a file matching ``rex`` below ``root_folder``."""
        raise NotImplementedError

    @abc.abstractmethod
    def find_file_in_all_drives(self, file_name):
        """Look for ``file_name`` everywhere the platform allows."""
        raise NotImplementedError


class DataFrameFinderWindows(DataFrameFinder):
    """Finder used on Windows (placeholder implementation)."""

    def find_file(self, root_folder, rex):
        # Do windows things...
        pass

    def find_file_in_all_drives(self, file_name):
        # Do windows things...
        pass


class DataFrameFinderMac(DataFrameFinder):
    """Finder used on macOS (placeholder implementation)."""

    def find_file(self, root_folder, rex):
        # Do mac things...
        pass

    def find_file_in_all_drives(self, file_name):
        # Do mac things...
        pass
def main():
    """Pick the finder for the current platform and run a search.

    Returns:
        0 on success, 1 when no finder exists for the current platform.
    """
    import platform
    import sys

    # Keys must match what platform.system() actually returns:
    # 'Windows' on Windows and 'Darwin' on macOS.
    finder_factory = {
        "Windows": DataFrameFinderWindows,
        "Darwin": DataFrameFinderMac
    }
    system = platform.system()
    finder_class = finder_factory.get(system)
    if finder_class is None:
        print("No DataFrameFinder available for platform %r" % system,
              file=sys.stderr)
        return 1
    finder = finder_class()
    # Example call; the file name is the one used in the question.
    finder.find_file_in_all_drives("AB_NYC_2019.csv")
    return 0
if __name__ == "__main__":
import sys
sys.exit(main())
I have csv file having contents below
101,item_1
101,item_1
if it is csv my below code will execute
import csv
fName = input()
def read_csv(fName):
try:
with open(fName, 'r') as f:
reader = csv.reader(f)
for row in reader:
print (row)
read_csv(fName)
How can I move the exception handling into a decorator function and apply it on top of read_csv?
first decorator
If fName does not end with .txt or .csv, it should print "not accept".
Second decorator
If fName is a text file (e.g. file.txt), the operations below should be performed:
def read_txt(fName):
f = open(fName, "r")
print(f.readline())
If the file is a csv, the first function should execute; if it is a txt, the second function should execute. How can I achieve this using decorators? I could use an if condition to handle this, but that is not what I am after.
My whole code without decorator is below
fName = input()
def read_csv(fName):
if fName.endswith('.csv'):
#print ('hi')
try:
with open(fName, 'r') as f:
reader = csv.reader(f)
for row in reader:
print (row)
except IOError:
print ("Could not read file:", fName)
#SECOND DECORATOR
if fName.endswith('.txt'):
f = open(fName, "r")
print(f.readline())
#FIRST DECORATOR
if not(fName.endswith('.csv')) and not(fName.endswith('.txt')):
print ('not accept')
read_csv(fName)
You can do it like this with decorators:
import functools
def check_arguments(func):
    """Decorator that rejects file names that are neither ``.csv`` nor ``.txt``.

    The file name is taken from the ``fname`` keyword argument or, if that
    is absent, from the first positional argument. For a rejected name the
    wrapper prints ``not accept`` and returns ``None`` without calling
    ``func``; otherwise it returns whatever ``func`` returns.
    """
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        fname = kwargs['fname'] if 'fname' in kwargs else args[0]
        if not fname.endswith(('.csv', '.txt')):
            print('not accept')
            # A rejected file must not reach the wrapped function.
            return None
        return func(*args, **kwargs)
    return wrapper
def set_file_processor(func):
    """Decorator that runs the reader matching the file extension first.

    The file name is taken from the ``fname`` keyword argument or, if that
    is absent, from the first positional argument. ``.csv`` names go to the
    local ``read_csv``, ``.txt`` names to ``read_txt``; any other name runs
    no reader. ``func`` is always called afterwards and its result returned.
    """
    def read_csv(fname):
        print('read_csv', fname)

    def read_txt(fname):
        print('read_txt', fname)

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        fname = kwargs['fname'] if 'fname' in kwargs else args[0]
        if fname.endswith('.csv'):
            read_csv(fname)
        elif fname.endswith('.txt'):
            read_txt(fname)
        return func(*args, **kwargs)
    return wrapper
@check_arguments
@set_file_processor
def process(fname):
    """Entry point; all of the work is done by the two decorators."""
    pass
process(fname='input.csv')
Your problem doesn't seem to come under decorator but under factory pattern i.e. process differently based on the input file.
The below code is a very simple and basic Factory pattern solution to your problem, this should be modified accordingly as per your need,
import os
from abc import ABC, abstractmethod
class FileProcessor(ABC):
    """Interface for all file processors."""

    @abstractmethod
    def process(self, file_path):
        """Process the file located at ``file_path``."""


class TextFileProcessor(FileProcessor):
    """Processor for ``.txt`` files."""

    def process(self, file_path):
        print("Text file processing goes here")


class CsvFileProcessor(FileProcessor):
    """Processor for ``.csv`` files."""

    def process(self, file_path):
        print("CSV file processing goes here")


class DefaultFileProcessor(FileProcessor):
    """Fallback processor: rejects every file it is given."""

    def process(self, file_path):
        raise ValueError("File %s is not valid" % file_path)


class FileFactory:
    """Pick the processor matching a file's extension and run it."""

    # Extension (lower case, without the dot) -> processor class.
    processors = {
        'txt': TextFileProcessor,
        'csv': CsvFileProcessor,
        'default': DefaultFileProcessor
    }

    def __init__(self, file_path):
        """Store ``file_path``.

        Raises:
            IOError: if ``file_path`` does not exist.
        """
        if not os.path.exists(file_path):
            raise IOError("File not found")
        self.file_path = file_path

    def process(self):
        """Run the processor registered for the file's extension.

        Files with no extension or an unregistered one go to the
        ``'default'`` processor.
        """
        # splitext only considers the last path component, and lower-casing
        # lets "DATA.CSV" find the 'csv' processor.
        ext = os.path.splitext(self.file_path)[1].lstrip(".").lower()
        processor_class = self.processors.get(ext, self.processors['default'])
        return processor_class().process(self.file_path)
FileFactory(file_path).process()
In later stage if you would like to add json processor then it can also be done easily by adding
processors = {
'txt': TextFileProcessor,
'csv': CsvFileProcessor,
'json': JsonFileProcessor,
'default': DefaultFileProcessor
}
and creating new Json processor class,
class JsonFileProcessor(FileProcessor):
    """Processor registered under the ``'json'`` key."""

    def process(self, file_path):
        message = "JSON file processing goes here"
        print(message)
Based on your code and this very useful guide, here is a possible solution:
def read_file_decorator(fName):
    """Return a zero-argument reader for ``fName``, chosen by extension.

    ``.csv`` names get a reader that prints every parsed row, ``.txt`` names
    one that prints every line. For any other name ``None`` is returned.
    The file name is captured in the closure, so the caller does not pass
    it again when invoking the reader.
    """
    def read_csv():
        print('read_csv')
        # newline='' is what the csv module recommends for files it reads.
        with open(fName, 'r', newline='') as f:
            for row in csv.reader(f):
                print(row)

    def read_txt():
        print('read_txt')
        # The with-block closes the handle, which was left open before.
        with open(fName, 'r') as f:
            for row in f:
                print(row)

    if fName.endswith('.csv'):
        return read_csv
    if fName.endswith('.txt'):
        return read_txt
    return None
# NOTE(review): fileName is not defined in this snippet - presumably it is
# the name read from the user; confirm before running.
reader_function = read_file_decorator(fileName)
# None is the marker for an unsupported file type; compare by identity.
if reader_function is not None:
    reader_function()
else:
    print('not accept')
I use a stateful decorator remembering the file name inside the reader function before actually executing it (in order not to pass it twice); and I use the fixed value None for invalid file types.
Based on the requirements, using a decorator would be overkill. But if it is mandatory to implement this with a decorator, this is how it can be done:
We can create a dummy function called read_file and a decorator function called reader
User will always call read_file with filename as argument and decorator function reader will check passed filename extension and call the required function - read_csv or read_text
def reader(fun):
    """Decorator that replaces ``fun`` with an extension-based dispatcher.

    The first positional argument is the file name. ``.csv`` names are
    passed to ``read_csv`` and ``.txt`` names to ``read_text``, and the
    reader's result is returned. Any other name prints ``not accepted`` and
    returns ``None``. ``fun`` itself is never called.
    """
    import functools

    @functools.wraps(fun)
    def wrapper(*args):
        fname = args[0]
        if fname.endswith('.csv'):
            return read_csv(fname)
        if fname.endswith('.txt'):
            return read_text(fname)
        print('not accepted')
        return None
    return wrapper
def read_csv(fname):
    """Print a marker showing the CSV reader ran; ``fname`` is not used."""
    banner = 'In read_csv()'
    print(banner)
def read_text(fname):
    """Print a marker showing the text reader ran; ``fname`` is not used."""
    banner = 'In read_text()'
    print(banner)
@reader
def read_file(fname):
    """Dummy entry point; the ``reader`` decorator does the dispatching."""
    pass
read_file('a.csv')
read_file('a.txt')
read_file('filename.py')
Output
In read_csv()
In read_text()
not accepted
I have the following base class:
class ClientRepo(Repository):
def __init__(self) -> None:
self.__clientList = []
def hasClientWithId(self, clientId):
for client in self.__clientList:
if client.getId() == clientId:
return True
return False
def addClient(self, client):
if type(client).__name__ == 'ClientDAO':
if not self.hasClientWithId(client.getId()):
client.setClientId(self.__maximumIndexInClientList() + 1)
self.__clientList.append(client)
else:
raise ObjectAlreadyInCollectionException
else:
raise TypeError
which basically only holds a list and can add a ClientDAO to it.
And the following, which derives from it:
class ClientFileRepository(ClientRepo):
def __init__(self, fileName) -> None:
super().__init__()
self.__fileName = fileName
self.__file = None
def hasClientWithId(self, clientId):
self.__loadRepo()
hasClientWithId = super().hasClientWithId(clientId)
super().clean()
return hasClientWithId
def addClient(self, client):
self.__loadRepo()
super().addClient(client)
self.__storeRepo()
super().clean()
def __loadFileReadMode(self):
self.__file = open(self.__fileName, "r")
def __loadFileWriteMode(self):
self.__file = open(self.__fileName, "w")
def __closeFile(self):
self.__file.close()
def __loadRepo(self):
self.__loadFileReadMode()
for line in self.__file:
splitLine = line.split()
clientToAdd = ClientDAO(splitLine[1])
clientToAdd.setClientId(int(splitLine[0]))
super().addClientWithId(clientToAdd)
self.__closeFile()
def __storeRepo(self):
self.__loadFileWriteMode()
self.__file.write("")
for client in super().getList():
self.__file.write(self.clientToString(client))
self.__closeFile()
def clientToString(self, clientDAO):
return str(clientDAO.getId()) + " " + clientDAO.getName() + "\n"
a class which should load the list from a file, call addClient from parent, and store the updated list in the file. The problem is that after child class loads the file in addClient, it calls the method in the parent, which calls hasClientWithId, from the child, again. But I want it to call hasClientWithId, from the parent, that is, the context it is in. Can I achieve that?
I can think of several ways to achieve your goal. I ranked them from worst to best
1. Exactly what you asked for
You wanted that ClientRepo.addClient calls ClientRepo.hasClientWithId instead of ClientFileRepository.hasClientWithId. It is possible to enforce that:
class ClientRepo(Repository):
def addClient(self, client):
if type(client).__name__ == 'ClientDAO':
if not ClientRepo.hasClientWithId(self, client.getId()):
client.setClientId(self.__maximumIndexInClientList() + 1)
self.__clientList.append(client)
else:
raise ObjectAlreadyInCollectionException
else:
raise TypeError
This is not a good approach, because it's unintuitive and breaks the principles of OOP. Any other programmer writing a subclass of ClientRepo that overrides hasClientWithId would expect that this will have an effect for every call to hasClientWithId even inside of addClient
2. Let ClientFileRepository decide which function to use
Add a variable
self.__isFileOpen = False
in ClientFileRepository.__init__, set it to True when you open the file and to False when you close the file. Then change the hasClientWithId within ClientFileRepository to
def hasClientWithId(self, clientId):
    """Check for ``clientId``, loading the file first unless it is already open.

    When the open-file flag is set, the lookup goes straight to the parent
    class. Otherwise the repository is loaded, the parent is asked, and the
    parent's ``clean()`` is called before the answer is returned.
    """
    if self.__isFileOpen:
        return super().hasClientWithId(clientId)
    self.__loadRepo()
    found = super().hasClientWithId(clientId)
    super().clean()
    return found
to avoid opening the same file again. This works, but it is pretty difficult to write new functions for this class, because you always need to be aware if the function call is a call from within your class or from somewhere else. Also this seems pretty inefficient, because you read and write the entire file, even when you only add one client.
3. Read the file only once and modify the underlying ClientRepo
class ClientFileRepository(ClientRepo):
    """Client repository that is read from a file once, at construction.

    Every ``addClient`` call rewrites the whole file from the in-memory
    list held by the parent class.
    """

    def __init__(self, fileName) -> None:
        super().__init__()
        self.__fileName = fileName
        self.__loadRepo()

    # No hasClientWithId needed

    def addClient(self, client):
        """Add ``client`` through the parent class, then persist the list."""
        super().addClient(client)
        self.__storeRepo()

    def __loadRepo(self):
        """Fill the parent repository from the file (``<id> <name>`` per line)."""
        # The attribute is spelled __fileName in __init__; the lower-case
        # spelling used here before raised AttributeError.
        with open(self.__fileName) as file:
            for line in file:
                splitLine = line.split()
                if not splitLine:
                    # Ignore blank lines instead of failing on splitLine[1].
                    continue
                clientToAdd = ClientDAO(splitLine[1])
                clientToAdd.setClientId(int(splitLine[0]))
                super().addClientWithId(clientToAdd)

    def __storeRepo(self):
        """Overwrite the file with the current client list."""
        # Opening with "w" already truncates the file.
        with open(self.__fileName, "w") as file:
            for client in super().getList():
                file.write(self.clientToString(client))

    def clientToString(self, clientDAO):
        """Serialize one client as ``"<id> <name>\\n"``."""
        return str(clientDAO.getId()) + " " + clientDAO.getName() + "\n"
This obviously assumes that the file is not changed by someone else between calls to addClient and the program still overwrites the entire file for every addClient. If this is a problem for you it is best to be explicit and make loadRepo and storeRepo public. Then the programmer using this class can decide when loading and saving are necessary and useful. You can use context managers for this.
Extra: Read and save the file for every method
You can use function decorators to use solution 2 without writing the same code for every function:
import functools
def loadAndStore(function):
    """Decorator: load the repository before the call and store it after.

    Nested calls (made while the flag ``_isFileOpen`` is set) run the
    wrapped method directly, so the file is read and written only once per
    outermost call.

    The helper attributes use a single leading underscore on purpose:
    double-underscore names are only mangled inside a class body, so this
    module-level function could never reach ``self.__isFileOpen`` and
    friends as written in the class.
    """
    @functools.wraps(function)
    def wrappedFunction(self, *args, **kwargs):
        if self._isFileOpen:
            return function(self, *args, **kwargs)
        self._isFileOpen = True
        try:
            self._loadRepo()
            try:
                return function(self, *args, **kwargs)
            finally:
                # Store even when the call raised, as before; but never
                # after a failed load, which would overwrite the file.
                self._storeRepo()
        finally:
            self.clean()  # some cleanup
            self._isFileOpen = False
    return wrappedFunction


class ClientFileRepository(ClientRepo):
    """Client repository that reloads and rewrites its file on every call."""

    def __init__(self, fileName) -> None:
        super().__init__()
        self._fileName = fileName
        # Read and written by the loadAndStore decorator.
        self._isFileOpen = False

    @loadAndStore
    def hasClientWithId(self, clientId):
        return super().hasClientWithId(clientId)

    @loadAndStore
    def addClient(self, client):
        super().addClient(client)

    def _loadRepo(self):
        """Fill the parent repository from the file (``<id> <name>`` per line)."""
        with open(self._fileName) as file:
            for line in file:
                splitLine = line.split()
                if not splitLine:
                    # Ignore blank lines instead of failing on splitLine[1].
                    continue
                clientToAdd = ClientDAO(splitLine[1])
                clientToAdd.setClientId(int(splitLine[0]))
                super().addClientWithId(clientToAdd)

    def _storeRepo(self):
        """Overwrite the file with the current client list."""
        # Opening with "w" already truncates the file.
        with open(self._fileName, "w") as file:
            for client in super().getList():
                file.write(self.clientToString(client))

    def clientToString(self, clientDAO):
        """Serialize one client as ``"<id> <name>\\n"``."""
        return str(clientDAO.getId()) + " " + clientDAO.getName() + "\n"
Be careful here: using this is not very intuitive. For example, the flag that records whether the file is open is defined in __init__, but none of the methods below use it directly. Instead, its use is hidden in the loadAndStore decorator.
Some quick hints at the end:
type(client).__name__ == 'ClientDAO' is bad practice. Use isinstance(client, ClientDAO) to fully adopt OOP
If this is not part of a bigger project with given naming conventions use the python style guide
Using private variables like __fileName is generally considered unnecessary, just prefix the variable with one underscore to indicate "internal use". The same is true for functions.
It is ok to output() zip files like this:
def output(self):
    """Return the local target for ``data/<date_path>/<date_path>.zip``."""
    stamp = self.search['date_path']
    target_path = "data/%s/%s.zip" % (stamp, stamp)
    return luigi.LocalTarget(target_path)
But how do I pass this zip file in the run() method?
class ZeroTask(luigi.Task):
path_in = luigi.Parameter()
textfiles = []
path_to_zip = ''
def requires(self):
return []
def run(self):
# Get a bunch of text files
# Do some manipulations with textfiles
# Create a result.zip
# self.path_to_zip = '~/Project/result.zip'
def output(self):
zip_fn = self.path_to_result.zip
return luigi.LocalTarget(zip_fn)
What should I do in the run() method?
You should be able to use zipfile to build the file however you'd like.
class MyTask(luigi.Task):
    """Task whose output is a zip archive at ``data/<date>/<date>.zip``."""

    def output(self):
        """Return the local target for the finished archive."""
        date_path = self.search['date_path']
        zip_fn = "data/%s/%s.zip" % (date_path, date_path)
        return luigi.LocalTarget(zip_fn)

    def run(self):
        """Build the archive in a temporary file, then move it into place.

        The temporary file is created next to the final path so the last
        step is a same-filesystem rename.
        """
        target_path = self.output().path
        target_dir = os.path.dirname(target_path) or "."
        os.makedirs(target_dir, exist_ok=True)
        # delete=False: the file is renamed below, so it must survive close()
        # and must not be unlinked by the temporary-file machinery.
        ztemp = tempfile.NamedTemporaryFile(
            mode='wb', suffix='.zip', dir=target_dir, delete=False)
        try:
            with ztemp, zipfile.ZipFile(ztemp, 'w') as z:
                # build the zip file
                pass
            os.replace(ztemp.name, target_path)
        except BaseException:
            # Do not leave a half-written temporary archive behind.
            try:
                os.unlink(ztemp.name)
            except OSError:
                pass
            raise
From the docs on FileSystemTarget,
I'm getting the following error when trying to read the row and column count of a CSV:
> coercing to Unicode: need string or buffer, S3BotoStorageFile found
import csv
class CSV:
def __init__(self, file=None):
self.file = file
def read_file(self):
data = []
file_read = read_file(self.file)
return file_read
def get_row_count(self):
return len(self.read_file())
def get_column_count(self):
new_data = self.read_file()
return len(new_data[0])
def get_data(self, rows=1):
data = self.read_file()
return data[:rows]
def read_file(self):
with open(self.file, 'r') as f:
data = [row for row in csv.reader(f.read().splitlines())]
return data
How do I resolve?
Well, after reading your code my first reaction was OMG! How many times does he open that poor file?
Here's a new version of your class
class CSV:
    """Parse a CSV file once and answer questions about its contents."""

    def __init__(self, file=None):
        """Open ``file`` (a path) and parse all rows into ``self.data``."""
        self.file = file
        # newline='' is what the csv module recommends for files it reads.
        with open(self.file, 'r', newline='') as f:
            self.data = list(csv.reader(f))

    def get_row_count(self):
        """Return the number of parsed rows."""
        return len(self.data)

    def get_column_count(self):
        """Return the number of fields in the first row (0 if there are no rows)."""
        return len(self.data[0]) if self.data else 0

    def get_data(self, rows=1):
        """Return the first ``rows`` parsed rows."""
        # ``rows`` was accepted but ignored; honour it.
        return self.data[:rows]
I also fixed your csv.reader() handling. It accepts a file object, no need to .read() or .read().splitlines(), it can only lead to errors. Which may be the reason why it failed.
Ok, given from what you say, you're working on AWS, and your file is not a string path to a file, but already a file object. So you don't need the open() part as is. You may want to modify your code so it is as follows:
class CSV:
    """Parse CSV data from a path or an already-open file object."""

    def __init__(self, f=None):
        """Parse all rows of ``f`` into ``self.data``.

        ``f`` may be a string path, which is opened here, or any object
        with a ``read`` attribute, which is handed to ``csv.reader`` as is.

        Raises:
            Exception: if ``f`` is neither of those.
        """
        self.file = f
        if isinstance(self.file, str):  # if the file is a string, it's a path that has to be opened
            with open(self.file, 'r', newline='') as fh:
                self.data = list(csv.reader(fh))
        elif hasattr(self.file, 'read'):  # if that's a file-like object, no need to open
            # NOTE(review): csv.reader needs text lines - presumably the
            # storage file object yields str; confirm for binary-mode files.
            self.data = list(csv.reader(self.file))
        else:  # otherwise, I don't know what to do, so aaaaaaaargh!
            raise Exception("File object type unknown: %s %s" % (type(self.file), self.file,))

    def get_row_count(self):
        """Return the number of parsed rows."""
        return len(self.data)

    def get_column_count(self):
        """Return the number of fields in the first row (0 if there are no rows)."""
        return len(self.data[0]) if self.data else 0

    def get_data(self, rows=1):
        """Return the first ``rows`` parsed rows."""
        # ``rows`` was accepted but ignored; honour it.
        return self.data[:rows]
Reading the S3BotoStorage.py, the S3BotoStorage class inherits from django.core.files.base.File, which inherits from django.core.files.utils.FileProxyMixin, which is a composition of attributes of the global python file class.
So a File object is not an instance of file, but it has a compatible interface. Therefore, in the previous code I have tested whether the self.file is a str, then it shall be a path that we open() so we get a file() and parse it. Otherwise, self.file is a File object or a file() object, and we just need to parse it. If it's neither of those, then it's an error, and we shall except.