How to output a zipfile in Luigi Task? - python

It is ok to output() zip files like this:
def output(self):
date_path = self.search['date_path']
zip_fn = "data/%s/%s.zip" % (date_path, date_path)
return luigi.LocalTarget(zip_fn)
But how to pas this zip in run() method?
class ZeroTask(luigi.Task):
path_in = luigi.Parameter()
textfiles = []
path_to_zip = ''
def requires(self):
return []
def run(self):
# Get a bunch of text files
# Do some manipulations with textfiles
# Create a result.zip
# self.path_to_zip = '~/Project/result.zip'
def output(self):
zip_fn = self.path_to_result.zip
return luigi.LocalTarget(zip_fn)
What should I do in the run() method?

You should be able to use zipfile to build the file however you'd like.
class MyTask(luigi.Task):
def output(self):
date_path = self.search['date_path']
zip_fn = "data/%s/%s.zip" % (date_path, date_path)
return luigi.LocalTarget(zip_fn)
def run(self):
ztemp = tempfile.NamedTemporaryFile(mode='wb')
z = zipfile.ZipFile(ztemp, 'w')
# build the zip file
z.close()
os.rename(ztemp.name, self.output().path)
From the docs on FileSystemTarget,

Related

How can i execute a class with a separate load method?

What parameters should I pass to load a dataframe using this class?
class SafeLoadExcel:
def __init__(self, file_name):
self.file = file_name
extension = Path(file_name).suffix
if extension == '.csv':
self.load_f = pd.read_csv
elif extension == '.xlsx':
self.load_f = pd.read_excel
def load(self):
try:
df = self.load_f(self.file)
except UnicodeDecodeError:
df = self.load_f(self.file, encoding='iso-8859-1')
return df
Assuming there are only 2 methods on this class, you can get a df by
df = SafeLoadExcel("some/file/name.xlsx").load()
And that's a good hint that this should not have been a class in the first place. It would make as much sense to do
def safe_load_excel(file_name):
extension = Path(file_name).suffix
if extension == '.csv':
load_f = pd.read_csv
elif extension == '.xlsx':
load_f = pd.read_excel
else:
raise ValueError("File must be csv or xlsx")
try:
return load_f(self.file)
except UnicodeDecodeError:
return load_f(self.file, encoding='iso-8859-1')
Assuming SafeLoadExcel provides some other methods for working with the resulting data frame, I would define two separate class methods, and maybe a third that detects which of the first two to use depending on extension present in the file name. SafeLoadExcel.__init__ itself is reserved for when you already have a dataframe, regardless of its original source.
class SafeLoadExcel:
def __init__(self, df):
self.df = df
#classmethod
def _from_file(cls, filename, reader):
"""Load a dataframe from a file.
filename - name of file to read
reader - a function to parse the file and return a dataframe
"""
return cls(reader(filename, encoding='iso-8859-1'))
# CSV-specific wrapper around _from_file
#classmethod
def from_csv(cls, filename):
return cls._from_file(filename, pd.read_csv)
# Excel-specific wrapper around _from_file
#classmethod
def from_excel(cls, filename):
return cls._from_file(filename, pd.read_excel)
# Not to be confused with the private method with a similar name.
# Detect which public class method to use.
#classmethod
def from_file(cls, filename):
if filename.endswith('.csv'):
f = cls.read_csv
elif filename.endswith('.xslx'):
f = cls.read_excel
else:
raise ValueError(f"Cannot determine type of {filename}")
return f(filename)
Then you can use
f1 = SafeLoadExcel.from_csv("foo.csv")
f2 = SafeLoadExcel.from_excel("foo.xlsx")
f3 = SafeLoadExcel.from_file("foo.csv")
f4 = SafeLoadExcel.from_file("foo.xlsx")
f1 and f3 should be effectively the same thing, as should be f2 and f4.

Python: Use local variable globally does not work

I am trying to use my code to open a file after searching for it in either operating system. However when I assign the variable inside the function, i cant use it outside of the function. And when I keep the 2nd function out of 1st function, it doesnt recognize the function.
I tried to assign the df_location globally, but this doesnt work.
When i use df = pd.read_csv(df_location[0], index_col=0) inside the function, I am not able to use df anywhere else in my code.
if platform.system() == 'windows':
def find_file(root_folder, rex):
for root, dirs, files in os.walk(root_folder):
for f in files:
result = rex.search(f)
if result:
file_path = os.path.join(root, f)
return file_path
def find_file_in_all_drives(file_name):
matching_files = list()
# create a regular expression for the file
rex = re.compile(file_name)
for drive in win32api.GetLogicalDriveStrings().split('\000')[:-1]:
file_path = find_file(drive, rex)
if file_path:
matching_files.append(file_path)
return matching_files
global df_location
df_location = find_file_in_all_drives("AB_NYC_2019.csv")
if platform.system() == 'mac':
df_location = find_file("/", "AB_NYC_2019.csv")
df = pd.read_csv(df_location[0], index_col=0)
I would like to be able to use the file that is retrieved through the functions.
Thank you!
ideally it should be like this
if platform.system() == 'windows':
def find_file(root_folder, rex):
for root, dirs, files in os.walk(root_folder):
for f in files:
result = rex.search(f)
if result:
file_path = os.path.join(root, f)
return file_path
def find_file_in_all_drives(file_name):
matching_files = list()
# create a regular expression for the file
rex = re.compile(file_name)
for drive in win32api.GetLogicalDriveStrings().split('\000')[:-1]:
file_path = find_file(drive, rex)
if file_path:
matching_files.append(file_path)
return matching_files
df_location = find_file_in_all_drives("AB_NYC_2019.csv")
if platform.system() == 'mac':
df_location = find_file("/", "AB_NYC_2019.csv")
df = pd.read_csv(df_location[0], index_col=0)
but this gives the error message:
"NameError: name 'find_file_in_all_drives' is not defined"
You define find_file_in_all_drives for Window but you should define find_file_in_all_drives also for other systems - but every system will have different code in find_file_in_all_drives. And then you can use find_file_in_all_drives on every system
# all systems use it so it should be defined for all
def find_file(root_folder, rex):
for root, dirs, files in os.walk(root_folder):
for f in files:
result = rex.search(f)
if result:
file_path = os.path.join(root, f)
return file_path
# define different `find_file_in_all_drives` for different systems
if platform.system() == 'windows':
def find_file_in_all_drives(file_name):
matching_files = list()
# create a regular expression for the file
rex = re.compile(file_name)
for drive in win32api.GetLogicalDriveStrings().split('\000')[:-1]:
file_path = find_file(drive, rex)
if file_path:
matching_files.append(file_path)
return matching_files
if platform.system() in ('mac', 'linux'):
def find_file_in_all_drives(file_name):
return find_file("/", file_name)
# now you can use `find_file_in_all_drives` on every system
df_location = find_file_in_all_drives("AB_NYC_2019.csv")
df = pd.read_csv(df_location[0], index_col=0)
You didn't show all your code. Presumably, you have find_file and find_file_in_all_drives function implementations for mac as well, yes? At least that's what I would expect just from looking at the code you've posted.
If that really is ALL the code you have, then the way it's written now, you're only defining find_file and find_file_in_all_drives if platform.system() returns "windows" (side note: just tried this, on my Windows 7 system it returns "Windows" with a capital 'W'.) If that condition is not satisfied these function definitions are not visible anywhere else in your code, because you've put them inside the body of the if-statement.
It looks like you are trying to get different behavior depending on the contents of a string (platform.system()). Since you can't avoid having to implement the varying behavior for both operating systems, you can use polymorphism for this:
import abc
class DataFrameFinder(abc.ABC):
def __init__(self):
pass
#abc.abstractmethod
def find_file(self, root_folder, rex):
raise NotImplementedError
#abc.abstractmethod
def find_file_in_all_drives(self, file_name):
raise NotImplementedError
class DataFrameFinderWindows(DataFrameFinder):
def __init__(self, *args, **kwargs):
DataFrameFinder.__init__(self, *args, **kwargs)
def find_file(self, root_folder, rex):
# Do windows things...
pass
def find_file_in_all_drives(self, file_name):
# Do windows things...
pass
class DataFrameFinderMac(DataFrameFinder):
def __init__(self, *args, **kwargs):
DataFrameFinder.__init__(self, *args, **kwargs)
def find_file(self, root_folder, rex):
# Do mac things...
pass
def find_file_in_all_drives(self, file_name):
# Do mac things...
pass
def main():
import platform
finder_factory = {
"Windows": DataFrameFinderWindows,
"Mac": DataFrameFinderMac
}
finder = finder_factory[platform.system()]()
finder.find_file(...)
return 0
if __name__ == "__main__":
import sys
sys.exit(main())

How to write two decorator in file operations

I have csv file having contents below
101,item_1
101,item_1
if it is csv my below code will execute
import csv
fName = input()
def read_csv(fName):
try:
with open(fName, 'r') as f:
reader = csv.reader(f)
for row in reader:
print (row)
read_csv(fName)
Here how to write the exception in decorator function and call on the top of that.
first decorator
if fName not endswith .txt or .csv then it has to generate output not accept
Second decorator
if fName = file.txt text file then below operations has to taken care
def read_txt(fName):
f = open(fName, "r")
print(f.readline())
if csv then first function to execute and if txt next function to execute. How to achieve using decorator. I can put if conditon to achieve the situation, but that is not the case
My whole code without decorator is below
fName = input()
def read_csv(fName):
if fName.endswith('.csv'):
#print ('hi')
try:
with open(fName, 'r') as f:
reader = csv.reader(f)
for row in reader:
print (row)
except IOError:
print ("Could not read file:", fName)
#SECOND DECORATOR
if fName.endswith('.txt'):
f = open(fName, "r")
print(f.readline())
#FIRST DECORATOR
if not(fName.endswith('.csv')) and not(fName.endswith('.txt')):
print ('not accept')
read_csv(fName)
You can do it like this with decorators:
import functools
def check_arguments(func):
#functools.wraps(func)
def wrapper(*args, **kwargs):
fname = kwargs['fname']
if not fname.endswith('.csv') and not fname.endswith('.txt'):
print('not accept')
return func(*args, **kwargs)
return wrapper
def set_file_processor(func):
def read_csv(fname):
print('read_csv', fname)
def read_txt(fname):
print('read_txt', fname)
#functools.wraps(func)
def wrapper(*args, **kwargs):
fname = kwargs['fname']
if fname.endswith('.csv'):
read_csv(fname)
elif fname.endswith('.txt'):
read_txt(fname)
return func(*args, **kwargs)
return wrapper
#check_arguments
#set_file_processor
def process(fname):
pass
process(fname='input.csv')
Your problem doesn't seem to come under decorator but under factory pattern i.e. process differently based on the input file.
The below code is a very simple and basic Factory pattern solution to your problem, this should be modified accordingly as per your need,
import os
from abc import ABC, abstractmethod
class FileProcessor(ABC):
#abstractmethod
def process():
pass
class TextFileProcessor(FileProcessor):
def process(self, file_path):
print("Text file processing goes here")
class CsvFileProcessor(FileProcessor):
def process(self, file_path):
print("CSV file processing goes here")
class DefaultFileProcessor(FileProcessor):
def process(self, file_path):
raise ValueError("File %s is not valid" % file_path)
class FileFactory:
processors = {
'txt': TextFileProcessor,
'csv': CsvFileProcessor,
'default': DefaultFileProcessor
}
def __init__(self, file_path):
if not os.path.exists(file_path):
raise IOError("File not found")
self.file_path = file_path
def process(self):
dot_splits = self.file_path.split(".")
ext = dot_splits[-1] if len(dot_splits) > 1 else "default"
ext = ext if ext in self.processors else "default"
processor_class = self.processors.get(ext)
return processor_class().process(self.file_path)
FileFactory(file_path).process()
In later stage if you would like to add json processor then it can also be done easily by adding
processors = {
'txt': TextFileProcessor,
'csv': CsvFileProcessor,
'json': JsonFileProcessor,
'default': DefaultFileProcessor
}
and creating new Json processor class,
class JsonFileProcessor(FileProcessor):
def process(self, file_path):
print("JSON file processing goes here")
Based on your code and this very useful guide, here is a possible solution:
def read_file_decorator(fName):
def read_csv():
print('read_csv')
with open(fName, 'r') as f:
reader = csv.reader(f)
for row in reader:
print(row)
def read_txt():
print('read_txt')
f = open(fName, 'r')
for row in f:
print(row)
if fName.endswith('.csv'):
return read_csv
elif fName.endswith('.txt'):
return read_txt
else:
return None
reader_function = read_file_decorator(fileName)
if reader_function != None:
reader_function()
else:
print('not accept')
I use a stateful decorator remembering the file name inside the reader function before actually executing it (in order not to pass it twice); and I use the fixed value None for invalid file types.
Based on the requirements use of decorator would be an overkill of decorators. But if it's mandatory to implement this using decorator, this is how we can implement:
We can create a dummy function called read_file and a decorator function called reader
User will always call read_file with filename as argument and decorator function reader will check passed filename extension and call the required function - read_csv or read_text
def reader(fun):
def wrapper(*args):
fname = args[0]
if fname.endswith('.csv'):
read_csv(fname)
elif fname.endswith('.txt'):
read_text(fname)
else:
print('not accepted')
return wrapper
def read_csv(fname):
print('In read_csv()')
def read_text(fname):
print('In read_text()')
#reader
def read_file(fname):
pass
read_file('a.csv')
read_file('a.txt')
read_file('filename.py')
Output
In read_csv()
In read_text()
not accepted

Invalid syntax. I save my text file on my desktop calling it file.

Create graph:-
def loadGraphFile(file):
graph = []
for line in file:
contents = line.split()
movieName = contents[0]
actorNames = [contents[i]+ " " + contents[i+1] for i in range(1, len(contents), 2)]
movieNode = findNode(graph, movieName)
if movieNode == None:
movieNode = mkNode(movieName)
graph.append(movieNode)
for actorName in actorNames:
actorNode = findNode(graph,actorName)
if actorNode == None:
actorNode = mkNode(actorName)
graph.append(actorNode)
actorNode.neighbor.append(movieNode)
movieNode.neighbor.append(actorNode)
return graph
def loadGraphFileName('file.text'):
return loadGraphFile(Open('file.text'))
You declared your function wrong:
def loadGraphFileName('file.text'): # change this
return loadGraphFile(Open('file.text'))
To this:
def loadGraphFileName(): # You don't use it anyway
return loadGraphFile(Open('file.text'))
Or:
def loadGraphFileName(filename='file.text'): # file.text will be the default. if you give an parameter with it, filename will change to that parameter
return loadGraphFile(Open(filename)) # And use it here
You cannot have literals as function params
You can instead do
def loadGraphFileName(f = 'file.txt'):
return loadGraphFile(Open(f))

Lazy parameter binding in Python

I tried to design a workflow using composite pattern, the sample codes looks like this:
class CommandInterface(object):
def __init__(self, name, template=None, tool=None, param : dict={},input=None, output=None ):
self.name = name
self.input = input
self.output = output
self.param = param
self.template = template
def before_invoke(self):
pass # check before invoke
def invoke(self):
pass
def after_invoke(self):
pass # check after invoke
class ShellCommand(CommandInterface):
def render(self):
cmd = self.template.format(input=self.input,
output=self.output,
param=self.param,
)
return cmd
def before_invoke(self):
pass
def invoke(self):
os.system(self.render())
class Workflow(CommandInterface):
def __init__(self, name):
self.name = name
self._input = []
self._output = []
self._commands = []
self._tool = []
def add(self, command : CommandInterface):
self._commands.append(command)
return self
def invoke(self):
for command in self._commands:
command.invoke()
And can be used like this:
workflow = Workflow()
raw_files = ["file1", "file2", "file3"]
for i in raw_files:
head_command = ShellCommand("head {input} {output}",
input = i,
output = i+"_head")
workflow.add(head_command)
merge_command = ShellCommand("cat {input} > {output}",
input=" ".join([i+"_head" for i in rawfiles]),
output="merged")
workflow.add(merge_command)
workflow.invoke()
However, sometimes a command's input might be another command's result, for example:
class PythonCommand(CommandInterface):
def invoke(self):
self._result = self.template(input=self.input, output=self.output, param=self.param)
def a_command(input, output, param):
take_long_time(input, output, param)
return {"a_result": "some result will be used in b_command"}
def b_command(input, output, param):
def need_the_result_of_a_command(input, output, param):
return param["a_result"]
need_the_result_of_a_command(input, output, param)
return "success"
a = PythonCommand(a_command, input_, output_, param_)
a.invoke()
b = PythonCommand(b_command, input_, output_, param= a._result)
b.invoke()
I can't use a workflow to composite these 2 command because b uses a's result as parameter, which is only visiable after a is invoked. However, a workflow manager like the codes before is still necessary in some situations. I think I need something like this:
workflow = Workflow()
a = PythonCommand(a_command, input_, output_, param_)
workflow.add(a)
b = PythonCommand(b_command, input_, output_, param= {"a_result": a.lazy()._result})
workflow.add(b)
workflow.invoke()
Does anyone have ideas about how to design a workflow with .lazy() implementation like this?

Categories