I am writing a data processing pipeline with Luigi but I get an error - python

import os
import luigi
import pandas as pd
import requests as req
from bs4 import BeautifulSoup

class DownloadData(luigi.Task):
    def run(self):
        site = req.get("http://www.gutenberg.org/browse/scores/top").text
        with self.output().open("w") as f:
            f.write(site)

    def output(self):
        return luigi.LocalTarget("raw_data.txt")

    def complete(self):
        return os.path.exists(self.output().path)

class PrePData(luigi.Task):
    def requires(self):
        return DownloadData()

    def run(self):
        data = self.requires()
        bs4ed_data = []
        if data.contains("<!DOCTYPE html>"):
            bs4ed_data.append()(data, "html.parser")
        else:
            print("can not found any problem in this data")
        return bs4ed_data

    def output(self):
        return luigi.local_target("data.txt")

    def complete(self):
        return os.path.exists(self.output().path)

    def on_success(self):
        print("data preprocessing completed successfully")

    def on_failure(self):
        print("data preprocessing failed")

class RunAllTasks(luigi.WrapperTask):
    def requires(self):
        return [DownloadData(), PrePData()]
I run this Python file with this command in my terminal:
python -m luigi --module PipeLineofETL-A RunAllTasks --local-scheduler --workers 4
and I get this error:
python -m luigi --module PipeLineofETL-A RunAllTasks --local-scheduler --workers 4
DEBUG: Checking if RunAllTasks() is complete
WARNING: Will not run RunAllTasks() or any dependencies due to error in complete() method:
Traceback (most recent call last):
File "/home/tuna/.local/lib/python3.10/site-packages/luigi/worker.py", line 429, in check_complete
is_complete = check_complete_cached(task, completion_cache)
File "/home/tuna/.local/lib/python3.10/site-packages/luigi/worker.py", line 414, in check_complete_cached
is_complete = task.complete()
File "/home/tuna/.local/lib/python3.10/site-packages/luigi/task.py", line 845, in complete
return all(r.complete() for r in flatten(self.requires()))
File "/home/tuna/.local/lib/python3.10/site-packages/luigi/task.py", line 845, in <genexpr>
return all(r.complete() for r in flatten(self.requires()))
File "/home/tuna/Belgeler/GitLab/extractdata/ChatGPT's Basic tasks/PipeLineofETL-A.py", line 40, in complete
return os.path.exists(self.output().path)
File "/home/tuna/Belgeler/GitLab/extractdata/ChatGPT's Basic tasks/PipeLineofETL-A.py", line 37, in output
return luigi.local_target("data.txt")
TypeError: 'module' object is not callable
INFO: Informed scheduler that task RunAllTasks__99914b932b has status UNKNOWN
INFO: Done scheduling tasks
INFO: Running Worker with 4 processes
DEBUG: Asking scheduler for work...
DEBUG: Done
DEBUG: There are no more tasks to run at this time
INFO: Worker Worker(salt=1404147006, workers=4, host=tunapc, username=tuna, pid=9077) was stopped. Shutting down Keep-Alive thread
INFO:
===== Luigi Execution Summary =====
Scheduled 1 tasks of which:
* 1 failed scheduling:
- 1 RunAllTasks()
Did not run any tasks
This progress looks :( because there were tasks whose scheduling failed
===== Luigi Execution Summary =====
import os
import luigi
import pandas as pd
import requests as req
from bs4 import BeautifulSoup

class DownloadData(luigi.Task):
    def run(self):
        site = req.get("http://www.gutenberg.org/browse/scores/top").text
        with self.output().open("w") as f:
            f.write(site)

    def output(self):
        return luigi.LocalTarget("raw_data.txt")

    def complete(self):
        return os.path.exists(self.output().path)

class PrePData(luigi.Task):
    def requires(self):
        return DownloadData()

    def run(self):
        data = self.requires()
        bs4ed_data = []
        if data.contains("<!DOCTYPE html>"):
            bs4ed_data.append()(data, "html.parser")
        else:
            print("can not found any problem in this data")
        return bs4ed_data

class RunAllTasks(luigi.WrapperTask):
    def requires(self):
        return [DownloadData(), PrePData()]
I run the same command in the terminal and get this error:
DEBUG: Checking if RunAllTasks() is complete
/home/tuna/.local/lib/python3.10/site-packages/luigi/task.py:845: UserWarning: Task PrePData() without outputs has no custom complete() method
return all(r.complete() for r in flatten(self.requires()))
DEBUG: Checking if DownloadData() is complete
DEBUG: Checking if PrePData() is complete
/home/tuna/.local/lib/python3.10/site-packages/luigi/worker.py:414: UserWarning: Task PrePData() without outputs has no custom complete() method
is_complete = task.complete()
INFO: Informed scheduler that task RunAllTasks__99914b932b has status PENDING
INFO: Informed scheduler that task PrePData__99914b932b has status PENDING
INFO: Informed scheduler that task DownloadData__99914b932b has status DONE
INFO: Done scheduling tasks
INFO: Running Worker with 4 processes
DEBUG: Asking scheduler for work...
DEBUG: Pending tasks: 2
DEBUG: Asking scheduler for work...
DEBUG: Done
DEBUG: There are no more tasks to run at this time
DEBUG: PrePData__99914b932b is currently run by worker Worker(salt=3997262702, workers=4, host=tunapc, username=tuna, pid=10617)
INFO: [pid 10624] Worker Worker(salt=3997262702, workers=4, host=tunapc, username=tuna, pid=10617) running PrePData()
ERROR: [pid 10624] Worker Worker(salt=3997262702, workers=4, host=tunapc, username=tuna, pid=10617) failed PrePData()
Traceback (most recent call last):
File "/home/tuna/.local/lib/python3.10/site-packages/luigi/worker.py", line 198, in run
new_deps = self._run_get_new_deps()
File "/home/tuna/.local/lib/python3.10/site-packages/luigi/worker.py", line 138, in _run_get_new_deps
task_gen = self.task.run()
File "/home/tuna/Belgeler/GitLab/extractdata/ChatGPT's Basic tasks/PipeLineofETL-A.py", line 28, in run
if data.contains("<!DOCTYPE html>"):
AttributeError: 'DownloadData' object has no attribute 'contains'
INFO: Informed scheduler that task PrePData__99914b932b has status FAILED
DEBUG: Asking scheduler for work...
DEBUG: Done
DEBUG: There are no more tasks to run at this time
DEBUG: There are 2 pending tasks possibly being run by other workers
DEBUG: There are 2 pending tasks unique to this worker
DEBUG: There are 2 pending tasks last scheduled by this worker
INFO: Worker Worker(salt=3997262702, workers=4, host=tunapc, username=tuna, pid=10617) was stopped. Shutting down Keep-Alive thread
INFO:
===== Luigi Execution Summary =====
Scheduled 3 tasks of which:
* 1 complete ones were encountered:
- 1 DownloadData()
* 1 failed:
- 1 PrePData()
* 1 were left pending, among these:
* 1 had failed dependencies:
- 1 RunAllTasks()
This progress looks :( because there were failed tasks
===== Luigi Execution Summary =====
When I added a call to DownloadData's output() in the requires() function, I got this error:
DEBUG: Checking if RunAllTasks() is complete
/home/tuna/.local/lib/python3.10/site-packages/luigi/task.py:845: UserWarning: Task PrePData() without outputs has no custom complete() method
return all(r.complete() for r in flatten(self.requires()))
DEBUG: Checking if DownloadData() is complete
DEBUG: Checking if PrePData() is complete
/home/tuna/.local/lib/python3.10/site-packages/luigi/worker.py:414: UserWarning: Task PrePData() without outputs has no custom complete() method
is_complete = task.complete()
INFO: Informed scheduler that task RunAllTasks__99914b932b has status PENDING
ERROR: Luigi unexpected framework error while scheduling RunAllTasks()
Traceback (most recent call last):
File "/home/tuna/.local/lib/python3.10/site-packages/luigi/worker.py", line 794, in add
for next in self._add(item, is_complete):
File "/home/tuna/.local/lib/python3.10/site-packages/luigi/worker.py", line 892, in _add
self._validate_dependency(d)
File "/home/tuna/.local/lib/python3.10/site-packages/luigi/worker.py", line 917, in _validate_dependency
raise Exception('requires() can not return Target objects. Wrap it in an ExternalTask class')
Exception: requires() can not return Target objects. Wrap it in an ExternalTask class
INFO: Worker Worker(salt=6506578324, workers=4, host=tunapc, username=tuna, pid=10710) was stopped. Shutting down Keep-Alive thread
ERROR: Uncaught exception in luigi
Traceback (most recent call last):
File "/home/tuna/.local/lib/python3.10/site-packages/luigi/retcodes.py", line 75, in run_with_retcodes
worker = luigi.interface._run(argv).worker
File "/home/tuna/.local/lib/python3.10/site-packages/luigi/interface.py", line 213, in _run
return _schedule_and_run([cp.get_task_obj()], worker_scheduler_factory)
File "/home/tuna/.local/lib/python3.10/site-packages/luigi/interface.py", line 171, in _schedule_and_run
success &= worker.add(t, env_params.parallel_scheduling, env_params.parallel_scheduling_processes)
File "/home/tuna/.local/lib/python3.10/site-packages/luigi/worker.py", line 794, in add
for next in self._add(item, is_complete):
File "/home/tuna/.local/lib/python3.10/site-packages/luigi/worker.py", line 892, in _add
self._validate_dependency(d)
File "/home/tuna/.local/lib/python3.10/site-packages/luigi/worker.py", line 917, in _validate_dependency
raise Exception('requires() can not return Target objects. Wrap it in an ExternalTask class')
Exception: requires() can not return Target objects. Wrap it in an ExternalTask class

You are getting the error in the first block because luigi.local_target is a module while luigi.LocalTarget is the class you were looking for.
The second error is because you most likely don't want to be using self.requires directly in PrePData.run, but instead want to use self.input() (take a look at https://luigi.readthedocs.io/en/stable/tasks.html#task-run). self.input() will return the outputs of the required task, which in this case is DownloadData.
Finally, there are a couple of optimizations you can make to your code:
In luigi, if a LocalTarget is specified as an output, its mere existence signifies that the task is complete. This is actually the default implementation of Task.complete, so you don't need to reimplement it yourself.
You don't need to specify all the tasks in RunAllTasks. Luigi will automatically discover required tasks and construct the requirements tree before resolving the entire tree. Therefore, you only need to specify the top-level task, which in this case is just PrePData.
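Putting those points together, a minimal sketch of the corrected pipeline could look like the following. The BeautifulSoup step that writes soup.get_text() to data.txt is just an illustration of reading self.input(); substitute whatever preprocessing you actually need:

import luigi
import requests as req
from bs4 import BeautifulSoup

class DownloadData(luigi.Task):
    def output(self):
        return luigi.LocalTarget("raw_data.txt")

    def run(self):
        site = req.get("http://www.gutenberg.org/browse/scores/top").text
        with self.output().open("w") as f:
            f.write(site)

class PrePData(luigi.Task):
    def requires(self):
        return DownloadData()

    def output(self):
        return luigi.LocalTarget("data.txt")

    def run(self):
        # self.input() is the LocalTarget produced by DownloadData.output()
        with self.input().open("r") as f:
            raw = f.read()
        soup = BeautifulSoup(raw, "html.parser")
        with self.output().open("w") as f:
            f.write(soup.get_text())

class RunAllTasks(luigi.WrapperTask):
    def requires(self):
        # only the top-level task; Luigi discovers DownloadData through PrePData
        return PrePData()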

Related

Module not found when running scrapy in celery

EDIT: I found out it only happens on a Windows computer. Everything works fine on a Linux server.
I am running a scrapy crawler in a celery process and keep getting this error. Any ideas what I am doing wrong?
[2021-08-18 11:28:42,294: INFO/MainProcess] Connected to sqla+sqlite:///celerydb.sqlite
[2021-08-18 11:28:42,313: INFO/MainProcess] celery@NP45086 ready.
[2021-08-18 09:46:58,330: INFO/MainProcess] Received task: app_celery.scraping_process_cli[e94dc192-e10e-4921-ad0c-bb932be9b568]
Traceback (most recent call last):
File "<string>", line 1, in <module>
File "C:\Program Files\Python374\lib\multiprocessing\spawn.py", line 105, in spawn_main
exitcode = _main(fd)
File "C:\Program Files\Python374\lib\multiprocessing\spawn.py", line 115, in _main
self = reduction.pickle.load(from_parent)
ModuleNotFoundError: No module named 'app_celery'
[2021-08-18 09:46:58,773: INFO/MainProcess] Task app_celery.scraping_process_cli[e94dc192-e10e-4921-ad0c-bb932be9b568] succeeded in 0.4380000000091968s: None
My app_celery looks like this:
app = Celery('app_celery', backend=..., broker=...)

def scrape_data():
    process = CrawlerProcess(get_project_settings())
    crawler = process.create_crawler(spider_cls)
    process.crawl(spider_cls, **kwargs)
    process.start()

@app.task(name='app_celery.scraping_process_cli', time_limit=1200, max_retries=3)
def scraping_process_cli(company_id):
    import multiprocessing
    a = multiprocessing.Process(target=scrape_data())
    a.start()
    a.join()
I am running the celery as:
celery -A app_celery worker -c 4 -n worker1 --pool threads
Before doing step 2, change the directory to the project root of your scrapy project.
You have to tell CrawlerProcess() to load settings.py:
import os
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

def scrape_data():
    os.chdir("/path/to/your/scrapy/project_root")
    process = CrawlerProcess(get_project_settings())
    process.crawl('spider_name')
    process.start()
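For completeness, a minimal sketch of how the task side could call this scrape_data (reusing the task name from the question; the Redis URLs are placeholders, and note that multiprocessing.Process expects the function itself, not its return value):

import multiprocessing
from celery import Celery

# placeholder backend/broker URLs for illustration
app = Celery('app_celery', backend='redis://localhost:6379/0', broker='redis://localhost:6379/0')

@app.task(name='app_celery.scraping_process_cli', time_limit=1200, max_retries=3)
def scraping_process_cli(company_id):
    # run the crawl in a child process so Scrapy's reactor can start cleanly
    p = multiprocessing.Process(target=scrape_data)  # pass the function, do not call it
    p.start()
    p.join()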

rqscheduler docker is stopping with timemismatch error

I have created 3 Docker containers on the same network:
redis queue
rq scheduler
Python-based docker
The error appears when Redis tries to schedule the task on the scheduler.
docker ps output
b18b7d21894f redis "docker-entrypoint.s…" 27 minutes ago Up 27 minutes 6379/tcp test_redis_1
140a7c31b87d python "python3" 13 hours ago Up 13 hours pyRed5
55dc5bcd3f57 anarchy/rq-scheduler "rqscheduler --host …" 27 minutes ago Exited (1) 13 minutes ago boring_bohr
I am trying to schedule the periodic task.
File iss.py
from rq_scheduler import Scheduler
from redis import Redis
from datetime import datetime, timedelta, timezone
import pytz
import mail

scheduler = Scheduler(connection=Redis("test_redis_1"))

def get_next_pass():
    x = datetime.now() + timedelta(minutes=1)
    return x.replace(tzinfo=timezone.utc)
    # .strftime("%Y-%m-%dT%H:%M:%SZ")

def send_text_message(time):
    mail.mail()
    scheduler.enqueue_at(time + 100, iss.send_text_message, time + 100)
File scheduler.py
from datetime import datetime
from redis import Redis
from rq_scheduler import Scheduler
import iss

scheduler = Scheduler(connection=Redis("test_redis_1"))  # Get a scheduler for the "default" queue
next_pass = iss.get_next_pass()
if next_pass:
    print(next_pass)
    next_pass
    print("reached here")
    scheduler.enqueue_at(next_pass, iss.send_text_message, next_pass)
I am calling scheduler.py from the Python docker. The task goes to RQ, but it fails at the RQ scheduler with the error below:
root@healthbot-build-vm1:~/redis# docker logs 55dc5bcd3f57
19:09:55 Running RQ scheduler...
19:09:55 Checking for scheduled jobs...
19:10:55 Checking for scheduled jobs...
19:11:55 Checking for scheduled jobs...
19:12:55 Checking for scheduled jobs...
19:13:55 Checking for scheduled jobs...
19:14:55 Checking for scheduled jobs...
19:15:56 Checking for scheduled jobs...
19:16:56 Checking for scheduled jobs...
19:17:56 Checking for scheduled jobs...
19:18:56 Checking for scheduled jobs...
19:19:56 Checking for scheduled jobs...
19:20:56 Checking for scheduled jobs...
19:21:56 Checking for scheduled jobs...
19:22:56 Checking for scheduled jobs...
19:23:56 Checking for scheduled jobs...
Traceback (most recent call last):
File "/usr/local/lib/python3.5/site-packages/rq/utils.py", line 164, in utcparse
return datetime.datetime.strptime(string, '%Y-%m-%dT%H:%M:%SZ')
File "/usr/local/lib/python3.5/_strptime.py", line 510, in _strptime_datetime
tt, fraction = _strptime(data_string, format)
File "/usr/local/lib/python3.5/_strptime.py", line 343, in _strptime
(data_string, format))
ValueError: time data '2021-01-14T19:22:07.242474Z' does not match format '%Y-%m-%dT%H:%M:%SZ'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/local/bin/rqscheduler", line 11, in <module>
sys.exit(main())
File "/usr/local/lib/python3.5/site-packages/rq_scheduler/scripts/rqscheduler.py", line 53, in main
scheduler.run(burst=args.burst)
File "/usr/local/lib/python3.5/site-packages/rq_scheduler/scheduler.py", line 340, in run
self.enqueue_jobs()
File "/usr/local/lib/python3.5/site-packages/rq_scheduler/scheduler.py", line 322, in enqueue_jobs
jobs = self.get_jobs_to_queue()
File "/usr/local/lib/python3.5/site-packages/rq_scheduler/scheduler.py", line 271, in get_jobs_to_queue
return self.get_jobs(to_unix(datetime.utcnow()), with_times=with_times)
File "/usr/local/lib/python3.5/site-packages/rq_scheduler/scheduler.py", line 254, in get_jobs
job = Job.fetch(job_id, connection=self.connection)
File "/usr/local/lib/python3.5/site-packages/rq/job.py", line 294, in fetch
job.refresh()
File "/usr/local/lib/python3.5/site-packages/rq/job.py", line 410, in refresh
self.created_at = to_date(as_text(obj.get('created_at')))
File "/usr/local/lib/python3.5/site-packages/rq/job.py", line 403, in to_date
return utcparse(as_text(date_str))
File "/usr/local/lib/python3.5/site-packages/rq/utils.py", line 167, in utcparse
return datetime.datetime.strptime(string, '%Y-%m-%dT%H:%M:%S.%f+00:00')
File "/usr/local/lib/python3.5/_strptime.py", line 510, in _strptime_datetime
tt, fraction = _strptime(data_string, format)
File "/usr/local/lib/python3.5/_strptime.py", line 343, in _strptime
(data_string, format))
ValueError: time data '2021-01-14T19:22:07.242474Z' does not match format '%Y-%m-%dT%H:%M:%S.%f+00:00'

Celery could not start worker processes when using scrapy-Djangoitem

Question detail: https://github.com/celery/celery/issues/3598
I want to run a scrapy spider with celery, which contains DjangoItems.
this is my celery task:
# coding_task.py
import sys
from celery import Celery
from collector.collector.crawl_agent import crawl
app = Celery('coding.net', backend='redis', broker='redis://localhost:6379/0')
app.config_from_object('celery_config')
#app.task
def period_task():
crawl()
collector.collector.crawl_agent.crawl contains a scrapy crawler that uses DjangoItem as its item.
The item looks like:
import os
import django
import scrapy

os.environ['DJANGO_SETTINGS_MODULE'] = 'RaPo3.settings'
django.setup()

from scrapy_djangoitem import DjangoItem
from xxx.models import Collection

class CodingItem(DjangoItem):
    django_model = Collection
    amount = scrapy.Field(default=0)
    role = scrapy.Field()
    type = scrapy.Field()
    duration = scrapy.Field()
    detail = scrapy.Field()
    extra = scrapy.Field()
When I run celery -A coding_task worker --loglevel=info --concurrency=1, I get the errors below:
[2016-11-16 17:33:41,934: ERROR/Worker-1] Process Worker-1
Traceback (most recent call last):
File "/usr/local/lib/python2.7/site-packages/billiard/process.py", line 292, in _bootstrap
self.run()
File "/usr/local/lib/python2.7/site-packages/billiard/pool.py", line 292, in run
self.after_fork()
File "/usr/local/lib/python2.7/site-packages/billiard/pool.py", line 395, in after_fork
self.initializer(*self.initargs)
File "/usr/local/lib/python2.7/site-packages/celery/concurrency/prefork.py", line 80, in process_initializer
signals.worker_process_init.send(sender=None)
File "/usr/local/lib/python2.7/site-packages/celery/utils/dispatch/signal.py", line 151, in send
response = receiver(signal=self, sender=sender, **named)
File "/usr/local/lib/python2.7/site-packages/celery/fixups/django.py", line 152, in on_worker_process_init
self._close_database()
File "/usr/local/lib/python2.7/site-packages/celery/fixups/django.py", line 181, in _close_database
funs = [self._db.close_connection] # pre multidb
AttributeError: 'module' object has no attribute 'close_connection'
[2016-11-16 17:33:41,942: INFO/MainProcess] Connected to redis://localhost:6379/0
[2016-11-16 17:33:41,957: INFO/MainProcess] mingle: searching for neighbors
[2016-11-16 17:33:42,962: INFO/MainProcess] mingle: all alone
/usr/local/lib/python2.7/site-packages/celery/fixups/django.py:199: UserWarning: Using settings.DEBUG leads to a memory leak, never use this setting in production environments!
warnings.warn('Using settings.DEBUG leads to a memory leak, never '
[2016-11-16 17:33:42,968: WARNING/MainProcess] /usr/local/lib/python2.7/site-packages/celery/fixups/django.py:199: UserWarning: Using settings.DEBUG leads to a memory leak, never use this setting in production environments!
warnings.warn('Using settings.DEBUG leads to a memory leak, never '
[2016-11-16 17:33:42,968: WARNING/MainProcess] celery@MacBook-Pro.local ready.
[2016-11-16 17:33:42,969: ERROR/MainProcess] Process 'Worker-1' pid:2777 exited with 'exitcode 1'
[2016-11-16 17:33:42,991: ERROR/MainProcess] Unrecoverable error: WorkerLostError('Could not start worker processes',)
Traceback (most recent call last):
File "/usr/local/lib/python2.7/site-packages/celery/worker/__init__.py", line 208, in start
self.blueprint.start(self)
File "/usr/local/lib/python2.7/site-packages/celery/bootsteps.py", line 127, in start
step.start(parent)
File "/usr/local/lib/python2.7/site-packages/celery/bootsteps.py", line 378, in start
return self.obj.start()
File "/usr/local/lib/python2.7/site-packages/celery/worker/consumer.py", line 271, in start
blueprint.start(self)
File "/usr/local/lib/python2.7/site-packages/celery/bootsteps.py", line 127, in start
step.start(parent)
File "/usr/local/lib/python2.7/site-packages/celery/worker/consumer.py", line 766, in start
c.loop(*c.loop_args())
File "/usr/local/lib/python2.7/site-packages/celery/worker/loops.py", line 50, in asynloop
raise WorkerLostError('Could not start worker processes')
WorkerLostError: Could not start worker processes
If I delete DjangoItem from the item:
import scrapy
from scrapy.item import Item

class CodingItem(Item):
    amount = scrapy.Field(default=0)
    role = scrapy.Field()
    type = scrapy.Field()
    duration = scrapy.Field()
    detail = scrapy.Field()
    extra = scrapy.Field()
the task runs fine and doesn't throw any error.
What should I do if I want to use DjangoItem in this celery-scrapy task?
Thanks!
You should check the RAM usage. It is possible that celery is not getting enough RAM.
Upgrading Celery to 4.0 will solve the problem.
More detail: https://github.com/celery/celery/issues/3598

Redis loosing connection when used with Celery groups or chain - throwing ERROR/MainProcess] Connection to Redis lost: Retry (0/20) now

My problem is that Redis shows the following error in the terminal when I start my worker using a celery task.
I was able to reproduce the error using a modified version of the chord example in the celery documentation. It seems the problem happens with chord, or when I have too many groups running in parallel, i.e. chord(add.s(i, i) for i in range(1, num))(list_add.s()) or group(add.s(i, i) for i in range(1, num))()
Below is my code sample
@task
def add(x, y):
    return [x, y, x + y, "Next"]

@task
def list_add(nums):
    numbers = []
    count = 1
    for i in nums:
        print("{}). {}".format(count, i))
        numbers.extend(i)
        count += 1
    print(numbers)
    return numbers

@task
def foo(num):
    return chord(add.s(i, i) for i in range(1, num))(list_add.s())
Below is part of my terminal output with traceback.
[2015-11-04 20:36:14,912: INFO/MainProcess] Received task: b2b.tasks.add[b87fdc44-e759-4224-bce4-11f9468d12b3]
[2015-11-04 20:36:14,913: INFO/MainProcess] Received task: b2b.tasks.add[120f5bf2-b962-4424-894b-d6f0ca56102b]
[2015-11-04 20:36:14,914: INFO/MainProcess] Task b2b.tasks.bar[9dc93c75-6404-4db3-a685-ff91460e1adb] succeeded in 1.00830382s: <AsyncResult: c891df3e-aa5c-4c9f-ad3f-30abe3b3ccc1>
[2015-11-04 19:44:34,922: ERROR/MainProcess] Connection to Redis lost: Retry (0/20) now.
[2015-11-04 19:44:34,922: ERROR/MainProcess] Connection to Redis lost: Retry (1/20) in 1.00 second.
...
/Users/Me/.virtualenvs/djangoscrape/lib/python2.7/site-packages/celery/app/trace.py:365: RuntimeWarning: Exception raised outside body: ConnectionError('Error 8 connecting to localhost:6379. nodename nor servname provided, or not known.',):
Traceback (most recent call last):
File "/Users/Me/.virtualenvs/djangoscrape/lib/python2.7/site-packages/celery/app/trace.py", line 235, in trace_task
File "/Users/Me/.virtualenvs/djangoscrape/lib/python2.7/site-packages/celery/backends/base.py", line 256, in store_result
File "/Users/Me/.virtualenvs/djangoscrape/lib/python2.7/site-packages/celery/backends/base.py", line 490, in _store_result
File "/Users/Me/.virtualenvs/djangoscrape/lib/python2.7/site-packages/celery/backends/redis.py", line 160, in set
File "/Users/Me/.virtualenvs/djangoscrape/lib/python2.7/site-packages/celery/backends/redis.py", line 149, in ensure
File "/Users/Me/.virtualenvs/djangoscrape/lib/python2.7/site-packages/kombu/utils/__init__.py", line 243, in retry_over_time
File "/Users/Me/.virtualenvs/djangoscrape/lib/python2.7/site-packages/celery/backends/redis.py", line 169, in _set
File "/Users/Me/.virtualenvs/djangoscrape/lib/python2.7/site-packages/redis/client.py", line 2593, in execute
File "/Users/Me/.virtualenvs/djangoscrape/lib/python2.7/site-packages/redis/client.py", line 2447, in _execute_transaction
File "/Users/Me/.virtualenvs/djangoscrape/lib/python2.7/site-packages/redis/connection.py", line 532, in send_packed_command
File "/Users/Michael/.virtualenvs/djangoscrape/lib/python2.7/site-packages/redis/connection.py", line 436, in connect
ConnectionError: Error 8 connecting to localhost:6379. nodename nor servname provided, or not known.
This is the command I use to start my worker in the terminal:
celery -A scraper worker -P eventlet -c 1000 -l info
It works fine when foo() is passed 20, but with 1000 or greater the error shows up.
>>> a = foo.delay(20) # works
>>> a = foo.delay(1000) # fails
Please kindly suggest how this can be solved if you have an idea. Thanks in advance.
I finally figured it out with help from IRC and Greg from the Redis user group.
By changing the limit of open files on my Mac OS X Yosemite machine to 65536, the problem was solved.
I hope this helps someone someday.
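If you want to inspect or raise that limit from Python itself, here is a small sketch (my own addition, not part of the original answer) using the standard resource module; the hard limit still has to be high enough at the OS level, e.g. via ulimit:

import resource

# current (soft, hard) limits on open file descriptors for this process
soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
print("open file limit:", soft, hard)

# raise the soft limit as far as the hard limit allows
resource.setrlimit(resource.RLIMIT_NOFILE, (min(65536, hard), hard))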

Using multiprocessing pool from celery task raises exception

FOR THOSE READING THIS: I have decided to use RQ instead which doesn't fail when running code that uses the multiprocessing module. I suggest you use that.
I am trying to use a multiprocessing pool from within a celery task using Python 3 and redis as the broker (running it on a Mac). However, I don't seem to be able to even create a multiprocessing Pool object from within the Celery task! Instead, I get a strange exception that I really don't know what to do with.
Can anyone tell me how to accomplish this?
The task:
from celery import Celery
from multiprocessing.pool import Pool
app = Celery('tasks', backend='redis', broker='redis://localhost:6379/0')
@app.task
def test_pool():
    with Pool() as pool:
        # perform some task using the pool
        pool.close()
    return 'Done!'
for which I start a Celery worker using:
celery -A tasks worker --loglevel=info
and then running it via the following python script:
import tasks
tasks.test_pool.delay()
that returns the following celery output:
[2015-01-12 15:08:57,571: INFO/MainProcess] Connected to redis://localhost:6379/0
[2015-01-12 15:08:57,583: INFO/MainProcess] mingle: searching for neighbors
[2015-01-12 15:08:58,588: INFO/MainProcess] mingle: all alone
[2015-01-12 15:08:58,598: WARNING/MainProcess] celery@Simons-MacBook-Pro.local ready.
[2015-01-12 15:09:02,425: INFO/MainProcess] Received task: tasks.test_pool[38cab553-3a01-4512-8f94-174743b05369]
[2015-01-12 15:09:02,436: ERROR/MainProcess] Task tasks.test_pool[38cab553-3a01-4512-8f94-174743b05369] raised unexpected: AttributeError("'Worker' object has no attribute '_config'",)
Traceback (most recent call last):
File "/usr/local/lib/python3.4/site-packages/celery/app/trace.py", line 240, in trace_task
R = retval = fun(*args, **kwargs)
File "/usr/local/lib/python3.4/site-packages/celery/app/trace.py", line 438, in __protected_call__
return self.run(*args, **kwargs)
File "/Users/simongray/Code/etilbudsavis/offer-sniffer/tasks.py", line 17, in test_pool
with Pool() as pool:
File "/usr/local/Cellar/python3/3.4.2_1/Frameworks/Python.framework/Versions/3.4/lib/python3.4/multiprocessing/pool.py", line 150, in __init__
self._setup_queues()
File "/usr/local/Cellar/python3/3.4.2_1/Frameworks/Python.framework/Versions/3.4/lib/python3.4/multiprocessing/pool.py", line 243, in _setup_queues
self._inqueue = self._ctx.SimpleQueue()
File "/usr/local/Cellar/python3/3.4.2_1/Frameworks/Python.framework/Versions/3.4/lib/python3.4/multiprocessing/context.py", line 111, in SimpleQueue
return SimpleQueue(ctx=self.get_context())
File "/usr/local/Cellar/python3/3.4.2_1/Frameworks/Python.framework/Versions/3.4/lib/python3.4/multiprocessing/queues.py", line 336, in __init__
self._rlock = ctx.Lock()
File "/usr/local/Cellar/python3/3.4.2_1/Frameworks/Python.framework/Versions/3.4/lib/python3.4/multiprocessing/context.py", line 66, in Lock
return Lock(ctx=self.get_context())
File "/usr/local/Cellar/python3/3.4.2_1/Frameworks/Python.framework/Versions/3.4/lib/python3.4/multiprocessing/synchronize.py", line 163, in __init__
SemLock.__init__(self, SEMAPHORE, 1, 1, ctx=ctx)
File "/usr/local/Cellar/python3/3.4.2_1/Frameworks/Python.framework/Versions/3.4/lib/python3.4/multiprocessing/synchronize.py", line 59, in __init__
kind, value, maxvalue, self._make_name(),
File "/usr/local/Cellar/python3/3.4.2_1/Frameworks/Python.framework/Versions/3.4/lib/python3.4/multiprocessing/synchronize.py", line 117, in _make_name
return '%s-%s' % (process.current_process()._config['semprefix'],
AttributeError: 'Worker' object has no attribute '_config'
This is a known issue with celery. It stems from an issue introduced in the billiard dependency. A work-around is to manually set the _config attribute for the current process. Thanks to user @martinth for the work-around below.
from celery.signals import worker_process_init
from multiprocessing import current_process

@worker_process_init.connect
def fix_multiprocessing(**kwargs):
    try:
        current_process()._config
    except AttributeError:
        current_process()._config = {'semprefix': '/mp'}
The worker_process_init hook will execute the code upon worker process initialization. We simply check to see if _config exists, and set it if it does not.
Via a useful comment in the Celery issue report linked to in Davy's comment, I was able to solve this by importing the billiard module's Pool class instead.
Replace
from multiprocessing import Pool
with
from billiard.pool import Pool
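Applied to the task from the question, a minimal sketch with billiard's Pool (my adaptation, under the assumption that billiard is available as it ships with Celery) would be:

from celery import Celery
from billiard.pool import Pool

app = Celery('tasks', backend='redis', broker='redis://localhost:6379/0')

@app.task
def test_pool():
    pool = Pool()
    # do some work with the pool here, e.g. pool.map(...)
    pool.close()
    pool.join()
    return 'Done!'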
A quick solution is to use the thread-based "dummy" multiprocessing implementation. Change
from multiprocessing import Pool # or whatever you're using
to
from multiprocessing.dummy import Pool
However since this parallelism is thread-based, the usual caveats (GIL) apply.
