I need to run a Hadoop jar job using Luigi from Python. I searched and found examples of writing a mapper and reducer in Luigi, but nothing that directly runs a Hadoop jar.
I need to run an already compiled Hadoop jar directly. How can I do that?
You need to use the luigi.contrib.hadoop_jar package (code).
In particular, you need to extend HadoopJarJobTask. For example, like this:
from luigi.contrib.hadoop_jar import HadoopJarJobTask
from luigi.contrib.hdfs.target import HdfsTarget

class TextExtractorTask(HadoopJarJobTask):
    def output(self):
        return HdfsTarget('data/processed/')

    def jar(self):
        return 'jobfile.jar'

    def main(self):
        return 'com.ololo.HadoopJob'

    def args(self):
        return ['--param1', '1', '--param2', '2']
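To actually trigger the task you can use Luigi's usual entry points; a minimal sketch of the programmatic way, assuming the task above is importable and your Hadoop/HDFS configuration is already in place:

import luigi

# run the task with the local scheduler (fine for testing; use the central
# luigid scheduler in production)
if __name__ == '__main__':
    luigi.build([TextExtractorTask()], local_scheduler=True)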
You can also include building the jar file with Maven in the workflow:
import luigi
from luigi.contrib.hadoop_jar import HadoopJarJobTask
from luigi.contrib.hdfs.target import HdfsTarget
from luigi.file import LocalTarget
import subprocess
import os

class BuildJobTask(luigi.Task):
    def output(self):
        return LocalTarget('target/jobfile.jar')

    def run(self):
        subprocess.call(['mvn', 'clean', 'package', '-DskipTests'])

class YourHadoopTask(HadoopJarJobTask):
    def output(self):
        return HdfsTarget('data/processed/')

    def jar(self):
        return self.input().fn

    def main(self):
        return 'com.ololo.HadoopJob'

    def args(self):
        return ['--param1', '1', '--param2', '2']

    def requires(self):
        return BuildJobTask()
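One small note on jar() above: self.input() is the LocalTarget('target/jobfile.jar') produced by BuildJobTask. On current Luigi releases the documented attribute for the file name is path (fn is older style), so the method could equally be written as:

def jar(self):
    # self.input() is the LocalTarget produced by BuildJobTask (via requires())
    return self.input().path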
I have a Python application whose interface is implemented in Flask, and I have a backend module that uses the pyswip library. The module works perfectly when I run it separately from the application. From what I found, it seems that pyswip is not thread-safe.
I get this error on the consult function:
swipl_fid = PL_open_foreign_frame()
OSError: exception: access violation reading 0x00000028
I could try to use another SWI-Prolog library, but in my application I need to consult an external .pl file.
Is there any way I could make it work?
Here's how I use the pyswip library:
from pyswip_alt import Prolog

class My_Prolog():
    def __init__(self, query):
        self.query = query.split()
        self.query = ', '.join(self.query)
        self.query = '[' + self.query + ']'
        self.documents_path = "my/path"
        self.prolog = Prolog()
        self.prolog.consult("facts.pl")
        self.prolog_results = []
        self.final_result = ''

    def process(self):
        for res in self.prolog.query("complex_phrase(" + self.query + ", F)."):
            result = []
            for atom in res['F']:
                result.append(atom.value)
            self.prolog_results.append(result)

    def run(self):
        self.process()
        self.final_result = ' '.join(self.final_result)
        return self.final_result
And this is how I use the class:
nl = My_Prolog(query)
nl_query = nl.run()
All of this is inside a function that is run by the Flask module.
Simply use a lock to serialize access to SWI-Prolog:
from multiprocessing import Lock

prologlock = Lock()

@app.route(...)
def handle_x():
    with prologlock:
        return MyProlog.handle_x()
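For instance, tied to the My_Prolog class from the question, a view could look roughly like this (the route name and query parameter are made up for illustration; threading.Lock would also do if everything runs in a single process):

from multiprocessing import Lock
from flask import Flask, request

app = Flask(__name__)
prologlock = Lock()

@app.route('/parse')
def handle_parse():
    query = request.args.get('q', '')
    with prologlock:  # serialize every call into SWI-Prolog
        nl = My_Prolog(query)  # My_Prolog is the class from the question
        return nl.run()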
I am trying to mock a class which is instantiated in the constructor of the class I am trying to test. If I define the class I am trying to mock in the same module as the one I am trying to test, everything works fine, but when they are in separate modules, I get errors.
Here's my example, taken from here. (Note that in my real example, the test class is in a "tests" submodule and the other two files are in an "app.src.code..." module.)
What am I missing?
helper.py:
import os

class Helper:
    def __init__(self, path):
        self.path = path

    def get_path(self):
        base_path = os.getcwd()
        return os.path.join(base_path, self.path)
worker.py:
from helper import Helper

class Worker:
    def __init__(self):
        self.helper = Helper('db')

    def work(self):
        path = self.helper.get_path()
        print(f'Working on {path}')
        return path
test_worker.py:
import unittest
from unittest.mock import patch

from worker import Worker

class WorkerTest(unittest.TestCase):
    def test_patching_class(self):
        with patch('helper.Helper') as MockHelper:
            MockHelper.return_value.get_path.return_value = 'testing'
            worker = Worker()
            MockHelper.assert_called_once_with('db')
            self.assertEqual(worker.work(), 'testing')
You need to patch the Helper class where it is looked up, i.e. in the worker module (patch target 'worker.Helper'), not in the helper module where it is defined.
E.g.
helper.py:
import os

class Helper:
    def __init__(self, path):
        self.path = path

    def get_path(self):
        base_path = os.getcwd()
        return os.path.join(base_path, self.path)
worker.py:
from helper import Helper

class Worker:
    def __init__(self):
        self.helper = Helper('db')

    def work(self):
        path = self.helper.get_path()
        print(f'Working on {path}')
        return path
test_worker.py:
import unittest
from unittest.mock import patch

from worker import Worker

class TestWorker(unittest.TestCase):
    def test_work(self):
        with patch('worker.Helper') as mock_Helper:
            mock_helper_instance = mock_Helper.return_value
            mock_helper_instance.get_path.return_value = 'testing'
            worker = Worker()
            mock_Helper.assert_called_once_with('db')
            self.assertEqual(worker.work(), 'testing')

if __name__ == '__main__':
    unittest.main()
unit test results with coverage report:
Working on testing
.
----------------------------------------------------------------------
Ran 1 test in 0.001s
OK
Name Stmts Miss Cover Missing
-------------------------------------------------------------------------
src/stackoverflow/61008064/helper.py 7 3 57% 6, 9-10
src/stackoverflow/61008064/test_worker.py 13 0 100%
src/stackoverflow/61008064/worker.py 8 0 100%
-------------------------------------------------------------------------
TOTAL 28 3 89%
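If you prefer the decorator form of patch over the context manager, the equivalent test (same patch target, 'worker.Helper') looks roughly like this; the mock class is injected as an extra argument to the test method:

import unittest
from unittest.mock import patch

from worker import Worker

class TestWorker(unittest.TestCase):
    @patch('worker.Helper')
    def test_work(self, mock_Helper):
        # configure the instance returned by Worker's call to Helper('db')
        mock_Helper.return_value.get_path.return_value = 'testing'
        worker = Worker()
        mock_Helper.assert_called_once_with('db')
        self.assertEqual(worker.work(), 'testing')

if __name__ == '__main__':
    unittest.main()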
I'm trying to batch up the processing of a few Jupyter notebooks using Luigi, and I've run into a problem.
I have two classes. The first, transform.py:
import nbformat
import nbconvert
import luigi
from nbconvert.preprocessors.execute import CellExecutionError

class Transform(luigi.Task):
    """Foo."""
    notebook = luigi.Parameter()
    requirements = luigi.ListParameter()

    def requires(self):
        return self.requirements

    def run(self):
        nb = nbformat.read(self.notebook, nbformat.current_nbformat)
        # https://nbconvert.readthedocs.io/en/latest/execute_api.html
        ep = nbconvert.preprocessors.ExecutePreprocessor(timeout=600, kernel_name='python3')
        try:
            ep.preprocess(nb, {'metadata': {'path': "/".join(self.notebook.split("/")[:-1])}})
            with self.output().open('w') as f:
                nbformat.write(nb, f)
        except CellExecutionError:
            pass  # TODO

    def output(self):
        return luigi.LocalTarget(self.notebook)
This defines a Luigi task that takes a notebook as input (along with possible prior requirements to running this task) and ought to run that notebook and report a success or failure as output.
To run Transform tasks I have a tiny Runner class:
import luigi

class Runner(luigi.Task):
    requirements = luigi.ListParameter()

    def requires(self):
        return self.requirements
To run my little job, I do:
from transform import Transform
trans = Transform("../tests/fixtures/empty_valid_errorless_notebook.ipynb", [])
from runner import Runner
run_things = Runner([trans])
But this raises TypeError: Object of type 'Transform' is not JSON serializable!
Is my luigi task format correct? If so, is it obvious what component in run is making the entire class unserializable? If not, how should I go about debugging this?
requires() is supposed to return a task or tasks, not a parameter.
e.g.,
class Runner(luigi.Task):
    notebooks = luigi.ListParameter()

    def requires(self):
        required_tasks = []
        for notebook in self.notebooks:
            required_tasks.append(Transform(notebook))
        return required_tasks

class Transform(luigi.Task):
    notebook = luigi.Parameter()

    def requires(self):
        return []

# then to run at the command line
luigi --module YourModule Runner --notebooks '["notebook1.ipynb","notebook2.ipynb"]'
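If you would rather trigger it from Python (as in the question) than from the command line, luigi.build accepts task instances built from plain parameter values, e.g.:

import luigi

# notebook paths are ordinary strings, so the ListParameter serializes cleanly
luigi.build(
    [Runner(notebooks=['notebook1.ipynb', 'notebook2.ipynb'])],
    local_scheduler=True,
)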
I'm writing unit tests to validate my project's functionality. I need to replace some of the functions with mock functions, and I thought I'd use the Python mock library. The implementation I used doesn't seem to work properly, though, and I don't understand where I'm going wrong. Here's a simplified scenario:
root/connector.py
from ftp_utils.py import *

def main():
    config = yaml.safe_load("vendor_sftp.yaml")
    downloaded_files = []
    downloaded_files = get_files(config)
    for f in downloaded_files:
        # do something
root/utils/ftp_utils.py
import os
import sys
import pysftp

def get_files(config):
    sftp = pysftp.Connection(config['host'], username=config['username'])
    sftp.chdir(config['remote_dir'])
    down_files = sftp.listdir()
    if down_files is not None:
        for f in down_files:
            sftp.get(f, os.path.join(config['local_dir'], f), preserve_mtime=True)
    return down_files
root/tests/connector_tester.py
import unittest
import mock
import ftp_utils
import connector

def get_mock_files():
    return ['digital_spend.csv', 'tv_spend.csv']

class ConnectorTester(unittest.TestCase):
    @mock.patch('ftp_utils.get_files', side_effect=get_mock_files)
    def test_main_process(self, get_mock_files_function):
        # I want to use a mock version of the get_files function
        connector.main()
When I debug my test I expect that the get_files function called inside connector.main() is get_mock_files(), but instead it is ftp_utils.get_files(). What am I doing wrong here? What should I change in my code to properly call the get_mock_files() mock?
Thanks,
Alessio
I think there are several problems with your scenario:
connector.py cannot import from ftp_utils.py that way
nor can connector_tester.py
as a habit, it is better to name your test files test_xxx.py
to use unittest with patching, see this example
In general, try to provide working minimal examples so that it is easier for everyone to run your code.
I modified your example rather heavily to make it work, but basically the problem is that you patch 'ftp_utils.get_files', while that is not the reference actually called inside connector.main(); because of the star import, the call goes through 'connector.get_files'.
Here is the modified example's directory:
test_connector.py
ftp_utils.py
connector.py
test_connector.py:
import unittest
import sys
import mock

import connector

def get_mock_files(*args, **kwargs):
    return ['digital_spend.csv', 'tv_spend.csv']

class ConnectorTester(unittest.TestCase):
    def setUp(self):
        self.patcher = mock.patch('connector.get_files', side_effect=get_mock_files)
        self.patcher.start()

    def test_main_process(self):
        # I want to use a mock version of the get_files function
        connector.main()

suite = unittest.TestLoader().loadTestsFromTestCase(ConnectorTester)

if __name__ == "__main__":
    unittest.main()
NB: what is called when running connector.main() is 'connector.get_files'
connector.py:
from ftp_utils import *

def main():
    config = None
    downloaded_files = []
    downloaded_files = get_files(config)
    for f in downloaded_files:
        print(f)
ftp_utils.py is unchanged.
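As an aside, the original patch target 'ftp_utils.get_files' would work if connector.py referenced the function through the module instead of star-importing it; a sketch under that assumption:

# alternative connector.py
import ftp_utils

def main():
    config = None
    downloaded_files = []
    # looked up on the module at call time, so mock.patch('ftp_utils.get_files', ...)
    # replaces exactly what main() calls
    downloaded_files = ftp_utils.get_files(config)
    for f in downloaded_files:
        print(f)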
I want to move functions into separate Python files because I have a lot of them, but when I do that, it doesn't work.
I tried:
File: server.py:
import os, cherrypy, json
from customers.py import *

class application(object):

def get_webpage(self):
    ....

def get_data(self):
    ....
File: customers.py:
import os, cherrypy, json

def get_customer_data(self):
    ....
I use Python as the server. In this case the data in get_customer_data is not processed; I get a 404 Not Found, which means the function is not included in the main file (server.py).
I removed the self from get_webpage() because it was not indented, which means it was not part of the class.
application.py:
class application(object):
    def __init__(self):
        pass

def get_webpage():
    print('From application')
customers.py:
from application import *
get_webpage() # From application
You could indent get_webpage() and make it part of the class. The way you call it would change. (I put the self back and capitalized the name of the class.)
application.py:
class Application(object):
    def __init__(self):
        pass

    def get_webpage(self):
        print('From application')
customers.py:
from application import *
a = Application()
a.get_webpage() # From application
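Applied to the CherryPy setup from the question, a minimal sketch could look like this (handler names come from the question; the bodies are placeholders, and get_customer_data is assumed to be a plain module-level function in customers.py without self):

# server.py
import cherrypy
from customers import get_customer_data  # note: no ".py" in the import

class Application(object):
    @cherrypy.expose
    def get_webpage(self):
        return '<html><body>customer page</body></html>'

    @cherrypy.expose
    def get_data(self):
        # delegate to the helper function that now lives in customers.py
        return get_customer_data()

if __name__ == '__main__':
    cherrypy.quickstart(Application())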