How do I distribute tasks to multiple celery workers in parallel? - python

It seems every Celery question out there is 5 to 10 years old and uses old Celery versions and design patterns.
I am using Celery version 5.0.5.
I have a Celery task that queries the database and then performs some computations on each row returned by the query.
The issue is that this task takes several minutes to complete because the query returns thousands of rows, so I am trying to distribute the work to multiple Celery workers in parallel.
@celery.task()
def send_sms(to, body):
    from twilio.rest import Client

    account_sid = os.environ["ACCOUNT_SID"]
    auth_token = os.environ["AUTH_TOKEN"]
    from_ = os.environ["NUMBER"]
    client = Client(
        account_sid,
        auth_token,
    )
    message = client.messages.create(
        to=to,
        from_=from_,
        body=body,
    )
@celery.task()
def notify_users():
    session = create_session()
    query = session.query(Rentals).filter(Rentals.enabled == True)
    today = datetime.now()

    for q in query:
        if q.returned_date is not None:
            if (today - q.returned_date).total_seconds() < q.rental_period:
                continue
        user = session.query(Users).filter(User.id == q.user_id).one()
        to = send_notification_get_to.get(q.notification_method)(user)
        body = f"sending email to {user.email}"
        send_sms.delay(to, body)
What is the best way to distribute these tasks to multiple workers, as opposed to letting one worker run for several minutes, which only gets slower as the number of rows returned grows from a few thousand to tens of thousands?

I had the same use case earlier. What I did was paginate the query (break the records into smaller chunks) so that each page could be processed by a separate Celery worker.
You can also try different worker pools, such as the gevent or eventlet pools, for better performance.
The code would look like this:
@celery.task()
def send_sms(to, body):
    from twilio.rest import Client

    account_sid = os.environ["ACCOUNT_SID"]
    auth_token = os.environ["AUTH_TOKEN"]
    from_ = os.environ["NUMBER"]
    client = Client(
        account_sid,
        auth_token,
    )
    message = client.messages.create(
        to=to,
        from_=from_,
        body=body,
    )
@celery.task()
def notify_range_of_users(num_chunks, skip):
    session = create_session()
    today = datetime.now()
    query = session.query(Rentals).filter(Rentals.enabled == True)
    # fetch only one page: `num_chunks` rows, starting `skip` pages in
    paginated_query = query.limit(num_chunks).offset(skip * num_chunks)

    for q in paginated_query:
        if q.returned_date is not None:
            if (today - q.returned_date).total_seconds() < q.rental_period:
                continue
        user = session.query(Users).filter(User.id == q.user_id).one()
        to = send_notification_get_to.get(q.notification_method)(user)
        body = f"sending email to {user.email}"
        send_sms.delay(to, body)
@celery.task()
def notify_users():
    session = create_session()
    today = datetime.now()
    query = session.query(Rentals).filter(Rentals.enabled == True)
    total_rentals = query.count()

    # each chunk will contain 100 rows/objects
    num_chunks = 100

    # find the total number of chunks, i.e. the number of jobs to dispatch
    quo, remainder = divmod(total_rentals, num_chunks)
    jobs = quo
    if remainder:
        jobs = jobs + 1

    skip = 0
    for i in range(jobs):
        notify_range_of_users.delay(num_chunks, skip)
        # increment skip to go to the next page
        skip = skip + 1
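An alternative worth mentioning: Celery can manage the fan-out itself with a group. This is only a minimal sketch under the same assumptions as the code above (the same `celery` app object, `create_session`, `Rentals`, and the `notify_range_of_users` task defined earlier):
from celery import group

@celery.task()
def notify_users_grouped():
    session = create_session()
    total_rentals = session.query(Rentals).filter(Rentals.enabled == True).count()
    chunk_size = 100
    pages, remainder = divmod(total_rentals, chunk_size)
    if remainder:
        pages += 1
    # group() sends one notify_range_of_users task per page to the broker;
    # any idle worker can pick up any page, so the pages run in parallel
    group(notify_range_of_users.s(chunk_size, skip) for skip in range(pages))()
Either way, the pages only actually execute in parallel if several worker processes (or one worker with higher concurrency, or the gevent/eventlet pools mentioned above) are consuming the queue.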

Related

Kivy - Schedule Something for the future

I'm trying to write an app where you can input tasks. When you input a task, you give a time too, for example 10:30 AM. When that time comes, you get alerted about the task. I figured out the alerting part, but I just don't know how to call a function at some point in the future without interrupting the rest of the code while it's running. Kivy's built-in Clock seems too small-scale for something like this.
What I tried (which interrupted the rest of the code):
def submit_button(self):
    # account id and auth token omitted
    account_id = ""
    auth_token = ""
    client = Client(account_id, auth_token)
    self.name_in_q = self.ids.the_name.text
    self.time = f'{self.ids.time_spinner_1.text}:{self.ids.time_spinner_2.text} {self.ids.time_spinner_3.text}'
    waiting_for_task = True
    while waiting_for_task:
        tz_hous = pytz.timezone('America/Chicago')
        datetime_houston = datetime.now(tz_hous)
        ds = datetime_houston.strftime("%H:%M")
        t = time.strptime(ds, "%H:%M")
        ds = time.strftime("%I:%M %p", t)
        if ds == self.time:
            client.messages.create(
                body=f"Complete your task: {self.name_in_q} for {self.time}",
                # phone numbers omitted
                from_="+",
                to="+"
            )
            break
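For reference, a non-blocking approach is to compute the delay once and let Kivy's Clock fire a callback later, instead of spinning in a while loop. This is just a rough sketch, not the poster's code; `schedule_alert` and the `callback` argument are made-up names, and parsing the spinner text into a datetime is left out:
from datetime import datetime, timedelta
from kivy.clock import Clock

def schedule_alert(self, target_time, callback):
    # target_time is a datetime for today; if it has already passed, push it to tomorrow
    now = datetime.now()
    if target_time <= now:
        target_time += timedelta(days=1)
    delay_seconds = (target_time - now).total_seconds()
    # Clock.schedule_once calls the callback on Kivy's main thread after the delay,
    # so the UI keeps running instead of blocking in a loop
    Clock.schedule_once(lambda dt: callback(), delay_seconds)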

"TypeError: Cannot pickle 'SSL Object' " When using concurrent.futuresProcessPoolExecutor() with IMAP

I am using Python 3.9 with imaplib in order to retrieve emails and scrape links from them. It works fine but can become quite slow for large amounts of emails (I'm doing ~40,000). In order to speed it up I'd like to use some concurrency so I can get all the emails at once.
To do this I get the IDs of all the emails beforehand, then assign each ID to a task in my pool. I close the previously used imaplib connection before scrape_link_mp() is called. I have tried to use a lock and a manager lock but I still get the same error.
Am I missing something fundamental here? Let me know if anything else needs to be explained, thanks.
My code looks like this:
def scrape_link_mp(self):
    self.file_counter = 0
    self.login_session.logout()
    self.Manager = multiprocessing.Manager()
    self.lock = self.Manager.Lock()
    futures = []
    with concurrent.futures.ProcessPoolExecutor() as Executor:
        for self.num_message in self.arr_of_emails[self.start_index:]:
            task_params = self.current_user, self.current_password, self.counter, self.imap_url, self.num_message, self.substring_filter, self.link_regex, self.lock
            futures.append(
                Executor.submit(
                    self.scrape_link_from_email_single,
                    *task_params
                )
            )
        for future in concurrent.futures.as_completed(futures):
            self.counter += 1
            self.timestamp = time.strftime('%H:%M:%S')
            print(f'[{self.timestamp}] DONE: {self.counter}/{len(self.num_mails)}')
            print(future.result())

def scrape_link_from_email_single(self, current_user, current_password, counter, imap_url, num_message, substring_filter, link_regex, lock):
    login_session_mp.logout()
    current_user_mp = self.current_user
    current_password_mp = self.current_password
    self.lock.acquire()
    login_session_mp = imaplib.IMAP4_SSL(self.imap_url, 993)
    login_session_mp.login(current_user_mp, current_password_mp)
    self.search_mail_status, self.amount_matching_criteria = login_session_mp.search(Mail.CHARSET, search_criteria)
    _, individual_response_data = login_session_mp.fetch(self.num_message, '(RFC822)')
    self.lock().release
    raw = email.message_from_bytes(individual_response_data[0][1])
    scraped_email_value = str(email.message_from_bytes(Mail.scrape_email(raw)))
    print(scraped_email_value)
    returned_links = str(link_regex.findall(scraped_email_value))
    for i in returned_links:
        if substring_filter:
            self.lock.acquire()
            with open('out.txt', 'a+') as link_file:
                link_file.write(i + '\n')
                link_file.close()
            self.lock.release()
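A likely cause of the pickle error with this shape of code: submitting the bound method `self.scrape_link_from_email_single` means the whole `self` gets pickled to send it to the worker process, and `self` still holds the old `imaplib.IMAP4_SSL` connection (logging out does not remove the attribute), whose SSL socket cannot be pickled. A hedged sketch of the alternative shape, a module-level function that receives only plain, picklable arguments and opens its own connection (the function name and the 'INBOX' mailbox are assumptions):
import imaplib

def fetch_one_message(current_user, current_password, imap_url, num_message):
    # each worker process opens and closes its own connection, so no
    # SSL-backed object ever has to cross the process boundary
    session = imaplib.IMAP4_SSL(imap_url, 993)
    try:
        session.login(current_user, current_password)
        session.select('INBOX')
        _, data = session.fetch(num_message, '(RFC822)')
        return data[0][1]
    finally:
        session.logout()
It would then be submitted as Executor.submit(fetch_one_message, current_user, current_password, imap_url, num_message), with the link scraping done on the returned bytes either in the worker or in the parent process.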

Process a queue with parallel/async requests

I want to implement a parallel requests.get() function that processes a queue of requests and puts the results in a list which, when finished, is processed by standard sequential code. I tried the following, but my code doesn't end and does not print the IDs.
import requests
from queue import Queue
from threading import Thread

BASE = 'http://www.uniprot.org'
KB_ENDPOINT = '/uniprot/'
FORMAT = ".xml"
num_threads = 10

ID_q = Queue()
ID_data = Queue()

# worker function
def get_ID_data(ID_q, ID_data, BASE, KB_ENDPOINT, FORMAT):
    while True:
        ID = ID_q.get()
        print(ID)
        ID_data.put(requests.get(BASE + KB_ENDPOINT + ID + FORMAT))
        ID_q.task_done()
        ID_data.task_done()

# initialize workers
for i in range(num_threads):
    worker = Thread(target=get_ID_data, args=(ID_q, ID_data, BASE, KB_ENDPOINT, FORMAT))
    worker.setDaemon(True)
    worker.start()

# load IDs and put them in the queue
ID_list = ["A6ZMA9", "N1P5E6",
           "H0GM11", "H0GZ91",
           "A0A0L8VK54", "G2WKA0",
           "C8ZEQ4", "B5VPH8",
           "B3LLU5", "C7GL72",
           "J8QFS9", "J8Q1C1",
           "A0A0L8RDV1"]
for ID in ID_list:
    ID_q.put(ID)

ID_q.join()

# work with ID_data
print(ID_data)
Update:
I changed @pkqxdd's answer using asyncio and aiohttp to this:
import asyncio, aiohttp

IDs = ["A6ZMA9", "N1P5E6",
       "H0GM11", "H0GZ91",
       "A0A0L8VK54", "G2WKA0",
       "C8ZEQ4", "B5VPH8",
       "B3LLU5", "C7GL72",
       "J8QFS9", "J8Q1C1",
       "A0A0L8RDV1"]

BASE = 'http://www.uniprot.org'
KB_ENDPOINT = '/uniprot/'
FORMAT = ".xml"

async def get_data_coroutine(session, ID):
    async with session.get(BASE + KB_ENDPOINT + ID + FORMAT) as response:
        res = await response.text()
        print(ID)
        if not res:
            raise NameError('{} is not available'.format(ID))
        return res

async def main(loop):
    async with aiohttp.ClientSession(loop=loop) as session:
        tasks = [get_data_coroutine(session, ID) for ID in IDs]
        return await asyncio.gather(*tasks)

loop = asyncio.get_event_loop()
result = loop.run_until_complete(main(loop))
Since you've mentioned async, I'm assuming you are using Python 3.6 or higher.
The requests library doesn't really support async programming, and it's kind of a dead end trying to make it async. A better idea is to use aiohttp instead.
You can achieve your goal with simple code like this:
import asyncio, aiohttp

BASE = 'http://www.uniprot.org'
KB_ENDPOINT = '/uniprot/'
FORMAT = ".xml"
ID_list = ["A6ZMA9", "N1P5E6",
           "H0GM11", "H0GZ91",
           "A0A0L8VK54", "G2WKA0",
           "C8ZEQ4", "B5VPH8",
           "B3LLU5", "C7GL72",
           "J8QFS9", "J8Q1C1",
           "A0A0L8RDV1"]

session = aiohttp.ClientSession()

async def get_data(ID):
    async with session.get(BASE + KB_ENDPOINT + ID + FORMAT) as response:
        return await response.text()

coros = []
for ID in ID_list:
    coros.append(get_data(ID))

loop = asyncio.get_event_loop()
fut = asyncio.gather(*coros)
loop.run_until_complete(fut)
print(fut.result())
(Yes, I see the warning. But I don't really want to make the answer more complicated. You should change it to suit your purpose better.)
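For what it's worth, the warning the answer alludes to is most likely about the ClientSession being created outside a coroutine and never closed. On Python 3.7+ both concerns go away if the session lives in an async with block and the whole thing is driven by asyncio.run, which is essentially what the update in the question already does. A minimal sketch, reusing the same BASE, KB_ENDPOINT, FORMAT, and ID_list as above:
import asyncio, aiohttp

async def fetch_all(id_list):
    # the session is closed automatically when the block exits
    async with aiohttp.ClientSession() as session:
        async def get_data(ID):
            async with session.get(BASE + KB_ENDPOINT + ID + FORMAT) as response:
                return await response.text()
        return await asyncio.gather(*(get_data(ID) for ID in id_list))

results = asyncio.run(fetch_all(ID_list))
print(results)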

Google App Engine: Using cron to expire (or 'unpublish') entities

I would like to mimic the 'published/unpublished' functionality of common CMS platforms like Wordpress or Drupal.
So I have this Job(ndb.Model):
class Job(ndb.Model):
    title = ndb.StringProperty()
    published = ndb.StringProperty(default="on")
    created = ndb.DateTimeProperty(auto_now_add=True)
    expire = ndb.DateTimeProperty()
The NewJob handler looks like this:
class NewJob(JobHandler):
    def get(self):
        self.render('new-job.html')

    def post(self):
        title = self.request.get('title')
        published = "on"
        expire = datetime.datetime.now() + datetime.timedelta(days=30)
        if title:
            j = Job(
                title=title,
                published=published,
                expire=expire,
                created=created)
            j.put()
            self.redirect('/job/%s' % str(j.key.id()))
        else:
            self.redirect('/login')
And the saved entity looks something like this:
Job(key=Key('Job', 5910974510923776), created=datetime.datetime(2014, 1, 17, 19, 0, 52, 12379), expire=datetime.datetime(2014, 2, 17, 19, 1, 52, 12174), published=u'on', title=u'Sous Chef')
What I am aiming to do is set all Job entities to published == "off" once their expire time has passed (is now or earlier).
So I've set up a task in the cron.yaml
cron:
- description: expire job entities after 30 days
  url: /cron/job-expire
  schedule: every day 00:00
...and the /cron/job-expire url is handled by:
class CronJobExpire(BaseHandler):
    def get(self):
        jobs = Job.query(Job.published == "on").fetch()
        now = datetime.datetime.now()
        for job in jobs:
            if job.expire < now or job.expire == now:
                job.published = "off"
The aim of the CronJobExpire handler above is to:
Check through the list of Job entities which are currently published == "on", then
check whether their expire dates are now or earlier, and if so, set published == "off".
This doesn't work. I am following the documentation. Any help would be appreciated - thank you.
You need to save the jobs that you changed:
class CronJobExpire(BaseHandler):
    def get(self):
        jobs = Job.query(Job.published == "on").fetch()
        now = datetime.datetime.now()
        for job in jobs:
            if job.expire <= now:
                job.published = "off"
                job.put()
Also, I suggest fetching only the expired jobs:
now = datetime.datetime.now()
jobs = Job.query(Job.published == "on", Job.expire <= now).fetch()
for job in jobs:
    job.published = "off"
    job.put()
Or, to reduce API calls and therefore improve speed:
now = datetime.datetime.now()
jobs = Job.query(Job.published == "on", Job.expire <= now).fetch()
for job in jobs:
    job.published = "off"
ndb.put_multi(jobs)
Lastly, consider making Job.published a boolean:
published = ndb.BooleanProperty(default=True)
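With a boolean property, the cron handler's query and update would look something like this (a sketch, assuming the rest of the model stays the same):
now = datetime.datetime.now()
# fetch only published jobs that have already expired
jobs = Job.query(Job.published == True, Job.expire <= now).fetch()
for job in jobs:
    job.published = False
ndb.put_multi(jobs)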

Consolidation of query (task) in Django

I have the following task. This task can take a few seconds to complete.
How can I make the task below make fewer database trips and run faster?
class SendMessage(Task):
    name = "Sending SMS"
    max_retries = 10
    default_retry_delay = 3

    def run(self, message_id, gateway_id=None, **kwargs):
        logging.debug("About to send a message.")
        # Because we don't always have control over transactions
        # in our calling code, we will retry up to 10 times, every 3
        # seconds, in order to try to allow for the commit to the database
        # to finish. That gives the server 30 seconds to write all of
        # the data to the database, and finish the view.
        try:
            message = Message.objects.get(pk=message_id)
        except Exception as exc:
            raise SendMessage.retry(exc=exc)

        if not gateway_id:
            if hasattr(message.billee, 'sms_gateway'):
                gateway = message.billee.sms_gateway
            else:
                gateway = Gateway.objects.all()[0]
        else:
            gateway = Gateway.objects.get(pk=gateway_id)

        # Check we have enough credits to send the message
        account = Account.objects.get(user=message.sender)

        # I'm getting the non-cached version here, check performance!!!!!
        if account._balance() >= message.length:
            response = gateway._send(message)
            if response.status == 'Sent':
                # Take credit from user's account.
                transaction = Transaction(
                    account=account,
                    amount=-message.charge,
                    description="Debit: SMS Sent",
                )
                transaction.save()
                message.billed = True
                message.save()
        else:
            pass

        logging.debug("Done sending message.")
