Google App Engine: Using cron to expire (or 'unpublish') entities - python

I would like to mimic the 'published/unpublished' functionality of common CMS platforms like WordPress or Drupal.
So I have this Job model:
class Job(ndb.Model):
    title = ndb.StringProperty()
    published = ndb.StringProperty(default="on")
    created = ndb.DateTimeProperty(auto_now_add=True)
    expire = ndb.DateTimeProperty()
The NewJob handler looks like this:
class NewJob(JobHandler):
    def get(self):
        self.render('new-job.html')

    def post(self):
        title = self.request.get('title')
        published = "on"
        expire = datetime.datetime.now() + datetime.timedelta(days=30)
        if title:
            j = Job(
                title=title,
                published=published,
                expire=expire)  # 'created' is filled in automatically by auto_now_add
            j.put()
            self.redirect('/job/%s' % str(j.key.id()))
        else:
            self.redirect('/login')
And the saved entity looks something like this:
Job(key=Key('Job', 5910974510923776), created=datetime.datetime(2014, 1, 17, 19, 0, 52, 12379), expire=datetime.datetime(2014, 2, 17, 19, 1, 52, 12174), published=u'on', title=u'Sous Chef')
What I am aiming to do is set published to "off" on every Job entity whose expire time has been reached (i.e. expire is now or in the past).
So I've set up a task in cron.yaml:
cron:
- description: expire job entities after 30 days
  url: /cron/job-expire
  schedule: every day 00:00
...and the /cron/job-expire url is handled by:
class CronJobExpire(BaseHandler):
    def get(self):
        jobs = Job.query(Job.published == "on").fetch()
        now = datetime.datetime.now()
        for job in jobs:
            if job.expire < now or job.expire == now:
                job.published = "off"
The aim of the CronJobExpire handler above is to check through the Job entities which are currently published == "on", then check whether their expire dates are now or in the past, and if so, set published == "off".
This doesn't work. I am following the documentation. Any help would be appreciated - thank you.

You need to save the jobs that you changed:
class CronJobExpire(BaseHandler):
    def get(self):
        jobs = Job.query(Job.published == "on").fetch()
        now = datetime.datetime.now()
        for job in jobs:
            if job.expire <= now:
                job.published = "off"
                job.put()
Also, I suggest fetching only the expired jobs:
now = datetime.datetime.now()
jobs = Job.query(Job.published == "on", Job.expire <= now).fetch()
for job in jobs:
    job.published = "off"
    job.put()
Or, to reduce API calls and therefore improve speed:
now = datetime.datetime.now()
jobs = Job.query(Job.published == "on", Job.expire <= now).fetch()
for job in jobs:
    job.published = "off"
ndb.put_multi(jobs)
Lastly, consider making Job.published a boolean:
published = ndb.BooleanProperty(default=True)
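For illustration, a hedged sketch of how the model and the cron handler might look with the boolean flag; BaseHandler and the rest of the app are assumed from the question:
import datetime
from google.appengine.ext import ndb

class Job(ndb.Model):
    title = ndb.StringProperty()
    published = ndb.BooleanProperty(default=True)
    created = ndb.DateTimeProperty(auto_now_add=True)
    expire = ndb.DateTimeProperty()

class CronJobExpire(BaseHandler):
    def get(self):
        now = datetime.datetime.now()
        # fetch only the expired, still-published jobs and unpublish them in one batch
        jobs = Job.query(Job.published == True, Job.expire <= now).fetch()
        for job in jobs:
            job.published = False
        ndb.put_multi(jobs)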

Related

Kivy - Schedule Something for the future

I'm trying to write an app where you can input tasks. When you input these tasks, you give a time too, for example 10:30 AM. When that time comes, you get alerted of the task. I figured out the alerting part, but I just don't know how to call a function at some point in the future without blocking the rest of the code while it's running. Kivy's built-in Clock seems too small-scale for something like this.
What I tried, which blocked the rest of the code:
def submit_button(self):
    # account id and auth token omitted
    account_id = ""
    auth_token = ""
    client = Client(account_id, auth_token)
    self.name_in_q = self.ids.the_name.text
    self.time = f'{self.ids.time_spinner_1.text}:{self.ids.time_spinner_2.text} {self.ids.time_spinner_3.text}'
    waiting_for_task = True
    while waiting_for_task:
        tz_hous = pytz.timezone('America/Chicago')
        datetime_houston = datetime.now(tz_hous)
        ds = datetime_houston.strftime("%H:%M")
        t = time.strptime(ds, "%H:%M")
        ds = time.strftime("%I:%M %p", t)
        if ds == self.time:
            client.messages.create(
                body=f"Complete your task: {self.name_in_q} for {self.time}",
                # phone numbers omitted
                from_="+",
                to="+"
            )
            break
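No answer is recorded here, but one non-blocking approach is to compute the delay in seconds until the target time and hand a callback to Kivy's Clock.schedule_once, which fires from the event loop instead of busy-waiting. A hedged sketch, assuming the same widget attributes (self.time, self.name_in_q) as in the question and a hypothetical send_alert method:
from datetime import datetime, timedelta
import pytz
from kivy.clock import Clock

def schedule_alert(self):
    # schedule the alert instead of busy-waiting in a while loop
    tz_hous = pytz.timezone('America/Chicago')
    now = datetime.now(tz_hous)
    target_time = datetime.strptime(self.time, "%I:%M %p").time()
    target = tz_hous.localize(datetime.combine(now.date(), target_time))
    if target < now:
        target += timedelta(days=1)  # time already passed today, fire tomorrow
    delay = (target - now).total_seconds()
    Clock.schedule_once(lambda dt: self.send_alert(), delay)

def send_alert(self):
    # the Twilio client.messages.create(...) call from the question would go here
    print(f"Complete your task: {self.name_in_q} for {self.time}")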

How do I distribute tasks to multiple celery workers in parallel?

It seems every celery question is 5 to 10 years old and uses old celery versions and design patterns.
I am using celery version 5.0.5.
I have a celery task that queries the database and then performs some computations on each of the rows returned by the query.
The issue is that this task takes several minutes to complete because of the thousands of rows returned, so I am trying to distribute the work to multiple celery workers in parallel.
@celery.task()
def send_sms(to, body):
    from twilio.rest import Client
    account_sid = os.environ["ACCOUNT_SID"]
    auth_token = os.environ["AUTH_TOKEN"]
    from_ = os.environ["NUMBER"]
    client = Client(
        account_sid,
        auth_token,
    )
    message = client.messages.create(
        to=to,
        from_=from_,
        body=body,
    )
@celery.task()
def notify_users():
    session = create_session()
    query = session.query(Rentals).filter(Rentals.enabled == True)
    today = datetime.now()
    for q in query:
        if q.returned_date is not None:
            if (today - q.returned_date).total_seconds() < q.rental_period:
                continue
            user = session.query(Users).filter(User.id == q.user_id).one()
            to = send_notification_get_to.get(q.notification_method)(user)
            body = f"sending email to {user.email}"
            send_sms.delay(to, body)
What would be the best way to distribute these tasks to multiple workers, as opposed to letting one worker run for several minutes, which gets slower and slower as the number of rows returned grows from a few thousand to tens of thousands?
I had the same use case earlier. What I did was paginate the query (break the records into smaller chunks) and have each page processed by a celery worker.
You can also try using different worker pools, like the gevent or eventlet pools, for better performance.
The code would look like this:
@celery.task()
def send_sms(to, body):
    from twilio.rest import Client
    account_sid = os.environ["ACCOUNT_SID"]
    auth_token = os.environ["AUTH_TOKEN"]
    from_ = os.environ["NUMBER"]
    client = Client(
        account_sid,
        auth_token,
    )
    message = client.messages.create(
        to=to,
        from_=from_,
        body=body,
    )
@celery.task()
def notify_range_of_users(num_chunks, skip):
    session = create_session()
    today = datetime.now()
    query = session.query(Rentals).filter(Rentals.enabled == True)
    paginated_query = query.limit(num_chunks).offset(skip * num_chunks)
    for q in paginated_query:
        if q.returned_date is not None:
            if (today - q.returned_date).total_seconds() < q.rental_period:
                continue
            user = session.query(Users).filter(User.id == q.user_id).one()
            to = send_notification_get_to.get(q.notification_method)(user)
            body = f"sending email to {user.email}"
            send_sms.delay(to, body)
@celery.task()
def notify_users():
    session = create_session()
    today = datetime.now()
    query = session.query(Rentals).filter(Rentals.enabled == True)
    total_rentals = query.count()
    # each chunk will contain 100 rows/objects
    num_chunks = 100
    # find the total number of chunks
    quo, remainder = divmod(total_rentals, num_chunks)
    # each job will process one chunk
    jobs = quo
    if remainder:
        jobs = jobs + 1
    skip = 0
    for i in range(jobs):
        notify_range_of_users.delay(num_chunks, skip)
        # increment skip to go to the next page
        skip = skip + 1
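As a usage sketch only (celery, Rentals, create_session and notify_range_of_users are reused from the answer; everything else is an assumption), the dispatch loop can also be written with celery's group primitive, which submits all page tasks at once so any idle worker can pick up any page:
from celery import group

@celery.task()
def dispatch_notify_jobs():
    # same chunk math as notify_users above, written with ceiling division
    session = create_session()
    total_rentals = session.query(Rentals).filter(Rentals.enabled == True).count()
    num_chunks = 100  # rows per page, as in the answer
    num_jobs = (total_rentals + num_chunks - 1) // num_chunks
    # build one signature per page and submit them all in a single group
    group(notify_range_of_users.s(num_chunks, skip) for skip in range(num_jobs)).apply_async()
The parallelism still comes from running more than one worker process, for example with the gevent or eventlet pool mentioned above.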

AWS Lambda for cost explorer

I'm a new learner of AWS Lambda.
I'm trying to get data from AWS Cost Explorer and send it to Slack daily.
The data should be in descending order, but it's not.
Can you please give me some tips for my code?
AWS Lambda does not seem to allow using .getValue() in the sort function.
def lambda_handler(event, context):
    client = boto3.client('ce')
    # get cost for each service daily
    serviceCost = get_daily_serviceCost(client)
    (title, detail) = create_message(totalCost, serviceCost)
    # transfer message to slack
    post_slack(title, detail)

def get_daily_serviceCost(client) -> list:
    today = datetime.date.today()
    yesterday = datetime.date.today() - datetime.timedelta(days=1)
    price = client.get_cost_and_usage(
        TimePeriod={
            'Start': datetime.date.strftime(yesterday, '%Y-%m-%d'),
            'End': datetime.date.strftime(today, '%Y-%m-%d')
        },
        Granularity='DAILY',
        Metrics=['BlendedCost'],
        GroupBy=[
            {
                'Type': 'DIMENSION',
                'Key': 'SERVICE'
            }
        ]
    )
    billings = []
    for item in price['ResultsByTime'][0]['Groups']:
        billings.append({
            'service_name': item['Keys'][0],
            'billing': item['Metrics']['BlendedCost']['Amount']
        })
    return billings
def create_message(serviceCost: list) -> (str):
    yesterday = datetime.date.today() - datetime.timedelta(days=1)
    details = []
    for item in serviceCost:
        service_name = item['service_name']
        billing = round(float(item['billing']), 2)
        if billing == 0.00:
            continue
        details.append({service_name, billing})
    for check in details:
        print(check)
    test = []
    for what in serviceCost:
        test.append({service_name, billing})
        # print(what)
    test.sort(key=lambda k: k[1], reverse=True)
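No answer is recorded here, but as a reference point, a minimal sketch of the sorting step under the assumption that each entry stays a dict as returned by get_daily_serviceCost; the set literals {service_name, billing} used in the question have no defined order, which is why sorting by index fails. sort_costs_descending is a hypothetical helper name:
def sort_costs_descending(service_cost: list) -> list:
    """Return (service_name, billing) tuples sorted by cost, highest first."""
    rows = []
    for item in service_cost:
        billing = round(float(item['billing']), 2)
        if billing == 0.00:
            continue
        rows.append((item['service_name'], billing))
    # sort on the numeric amount rather than on a set, so descending order is well defined
    rows.sort(key=lambda row: row[1], reverse=True)
    return rows
The resulting tuples can then be formatted into the Slack message in that order.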

How to exit the process if any job will fail in python?

I am running jobs in parallel based on a sequence number, and I record the status of every job (Success or Failed). After getting the status of every job, I send a mail with the status of each one.
But the mail is only generated after the whole process finishes. What I want is that if any job fails, the process stops there and the mail is generated immediately.
Can you please help me with how to do this?
The code I am running:
df_mail_final = pd.DataFrame()
df_mail_final1 = pd.DataFrame()

'''Getting the status of every job'''
for m_job in df_main4.master_job.unique():
    list_df = []
    dict_mail = OrderedDict()
    temp_df1 = df_main4[df_main4['master_job'] == m_job].copy()
    temp_df1['duration'] = pd.to_datetime(temp_df1['end_time'].unique()[-1]) - pd.to_datetime(temp_df1['start_time'].unique()[0])
    temp_df1['duration'] = temp_df1['duration'].replace('0 days', '')
    status_list = temp_df1.status.unique()
    if (0 in status_list):
        dict_mail['Master Job Name'] = m_job
        idx = temp_df1['status'] == 0
        dict_mail['Execution_Seq'] = temp_df1.loc[idx]["exec_seq"].unique()[0]
        dict_mail['Start_time'] = temp_df1.loc[idx]["start_time"].unique()[0]
        dict_mail['End_time'] = temp_df1.loc[idx]["end_time"].unique()[-1]
        dict_mail['Status'] = 'Failed'
        dict_mail['Duration'] = temp_df1.loc[idx]["duration"].unique()[-1]
        dict_mail['Reason'] = temp_df1.loc[idx]["error_msg"].unique()[0]
        dict_mail['Function_Name'] = temp_df1.loc[idx]["error_func"].unique()[0]
        list_df.append(dict_mail)
        df_mail = pd.DataFrame(list_df)
    if (0 not in status_list):
        print(m_job)
        dict_mail['Master Job Name'] = m_job
        dict_mail['Execution_Seq'] = temp_df1.exec_seq.unique()[0]
        dict_mail['Start_time'] = temp_df1.start_time.unique()[0]
        dict_mail['End_time'] = temp_df1.end_time.unique()[-1]
        dict_mail['Status'] = 'Success'
        dict_mail['Duration'] = temp_df1.duration.unique()[-1]
        dict_mail['Reason'] = ''
        dict_mail['Function_Name'] = ''
        list_df.append(dict_mail)
        df_mail = pd.DataFrame(list_df)
    df_mail_final = pd.concat([df_mail_final, df_mail], axis=0, ignore_index=True)
    # if (df_mail_final['Status'].iloc[-1] == 'Failed'):
    #     break

'''Printing the Final Dataframe with status of all the jobs'''
print(df_mail_final)
df_mail_final = df_mail_final[['Master Job Name', 'Execution_Seq', 'Start_time', 'End_time', 'Status', 'Duration', 'Reason', 'Function_Name']]
exec_end_dt = datetime.datetime.now().strftime("%H:%M:%S")
# total_duration = pd.to_datetime(exec_end_dt) - pd.to_datetime(exec_start_dt)
total_duration = pd.to_datetime(df_mail_final['End_time']).max() - pd.to_datetime(df_mail_final['Start_time']).min()
total_duration = str(total_duration)
total_duration = total_duration.replace('0 days', '')
send_mail(df_mail_final, LOG_FILE, total_duration)
Sharing a gist / system-design sketch of how to implement this:
def my_parallel_job(*args, **kwargs):
    # do your stuff here
    pass

def parallel_job_wrapper(*args, **kwargs):
    try:
        my_parallel_job(*args, **kwargs)
        # if it raises an error, the following will not run
        return "success"
    except:
        # an error occurred
        return "fail"

def main(*args, **kwargs):
    # call your parallel jobs from here
    p1 = parallel_job_wrapper(*args, **kwargs)
    # preferably you are using something like python's multiprocessing/threading pool methods
In the above code, the second function acts as a cushion in case the first function fails. This ensures that your main does not stop even when a parallel job fails.
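Building on that gist, here is a hedged sketch of the fail-fast behaviour the question asks for, using concurrent.futures; run_jobs_fail_fast, jobs (assumed to be hashable labels such as sequence numbers) and send_mail are simplified placeholders, not names from the original code. Results are checked as they complete, and the first failure cancels the remaining pending work and triggers the mail right away:
from concurrent.futures import ThreadPoolExecutor, as_completed

def run_jobs_fail_fast(jobs, send_mail):
    statuses = {}
    with ThreadPoolExecutor(max_workers=4) as pool:
        # parallel_job_wrapper (from the gist above) returns "success" or "fail"
        futures = {pool.submit(parallel_job_wrapper, job): job for job in jobs}
        for future in as_completed(futures):
            job = futures[future]
            statuses[job] = future.result()
            if statuses[job] == "fail":
                # cancel anything that has not started yet and report immediately
                for f in futures:
                    f.cancel()
                break
    send_mail(statuses)  # runs as soon as a failure is seen, or after all jobs finish
    return statuses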

Is it possible to fetch many objects together from Google App Engine?

We have about 5,000 objects of class Domain in Google App Engine, and we want to export the list of domains to CSV. Each domain is linked to an object of class DomainStateData:
class DomainStateData(db.Expando, ExpandoEntity):
    plan = db.ReferenceProperty(Plan)
    plan_expiration = db.DateTimeProperty()
    trial_expiration = db.DateTimeProperty()
    date_created = db.DateTimeProperty(auto_now_add=True, indexed=True)
    last_modified = db.DateTimeProperty(auto_now=True)

class Domain(db.Expando, ExpandoEntity, SocialIconsEntity):
    """
    Domain Model
    """
    domain = db.StringProperty(required=True)
    ...
    _state_data = db.ReferenceProperty(DomainStateData)

    @property
    def state_data(self):
        try:
            if not self._state_data:
                # try to get it, if not, build it
                sd = DomainStateData.get_by_key_name(self.key().name())
                if not sd:
                    sd = DomainStateData(key_name=self.key().name()).put()
                self._state_data = sd
                self.put()
                return self._state_data
            else:
                return self._state_data
        except ReferencePropertyResolveError:
            self._state_data = DomainStateData(key_name=self.key().name()).put()
            self.put()
            return self._state_data
I wrote code which exports 100 domains to CSV (it takes 5 seconds), but if I try to fetch all 5,000 domains I hit the 60-second timeout. Is it possible to fetch all the DomainStateData objects together without a timeout? Here is my code that exports the domains to CSV:
import sys
sys.path.insert(0, 'libs')
import webapp2
import datetime
import csv
from models import Domain

class ExportAllDomainsToCsvHandler(webapp2.RequestHandler):
    def get(self):
        self.response.headers['Content-Type'] = 'text/csv'
        self.response.headers['Content-Disposition'] = 'attachment; filename="All Domains [{0}].csv"'.format(str(datetime.date.today()))
        writer = csv.writer(self.response.out)
        writer.writerow(["Domain", "Current state", "Plan expiration date", "Trial expiration date", "Current oauth user"])
        all_domains = Domain.all().fetch(100)
        all_domains.sort(key=lambda domain: (0 if domain.state_data.plan_expiration is None else 1, domain.state_data.plan_expiration, 0 if domain.state_data.trial_expiration is None else 1, domain.state_data.trial_expiration, domain.domain))
        for domain in all_domains:
            if (domain.state_data.plan_expiration is None):
                domain_plan_expiration = "No plan expiration date"
            else:
                domain_plan_expiration = domain.state_data.plan_expiration.strftime('%Y-%m-%d')
            if (domain.state_data.trial_expiration is None):
                domain_trial_expiration = "No trial expiration date"
            else:
                domain_trial_expiration = domain.state_data.trial_expiration.strftime('%Y-%m-%d')
            writer.writerow([domain.domain, domain.cur_state.name, domain_plan_expiration, domain_trial_expiration, domain.admin])

app = webapp2.WSGIApplication([
    ("/csv/export_all_domains_to_csv", ExportAllDomainsToCsvHandler)
], debug=True)
OK, I found a solution. I fetched all the DomainStateData objects directly from the database, and now it takes 35 seconds to create the CSV with all the domains. Here is my code; I didn't change the models:
import sys
sys.path.insert(0, 'libs')
import webapp2
import datetime
import csv
from models import DomainStateData, Domain

class ExportAllDomainsToCsvHandler(webapp2.RequestHandler):
    def get(self):
        self.response.headers['Content-Type'] = 'text/csv'
        self.response.headers['Content-Disposition'] = 'attachment; filename="All Domains [{0}].csv"'.format(str(datetime.date.today()))
        writer = csv.writer(self.response.out)
        writer.writerow(["Domain", "Current state", "Plan expiration date", "Trial expiration date", "Current oauth user"])
        all_domain_state_data_dict = dict()
        all_domain_state_data = DomainStateData.all().fetch(1000000)
        all_domains = Domain.all().fetch(1000000)
        for domain_state_data in all_domain_state_data:
            all_domain_state_data_dict[domain_state_data.key().name()] = domain_state_data
        for domain in all_domains:
            if (domain.key().name() in all_domain_state_data_dict):
                domain.__state_data = all_domain_state_data_dict[domain.key().name()]
        all_domains.sort(key=lambda domain: (0 if domain.__state_data.plan_expiration is None else 1, domain.__state_data.plan_expiration, 0 if domain.__state_data.trial_expiration is None else 1, domain.__state_data.trial_expiration, domain.domain))
        for domain in all_domains:
            if (domain.__state_data.plan_expiration is None):
                domain_plan_expiration = "No plan expiration date"
            else:
                domain_plan_expiration = domain.__state_data.plan_expiration.strftime('%Y-%m-%d')
            if (domain.__state_data.trial_expiration is None):
                domain_trial_expiration = "No trial expiration date"
            else:
                domain_trial_expiration = domain.__state_data.trial_expiration.strftime('%Y-%m-%d')
            writer.writerow([domain.domain, domain.cur_state.name, domain_plan_expiration, domain_trial_expiration, domain.admin])

app = webapp2.WSGIApplication([
    ("/csv/export_all_domains_to_csv", ExportAllDomainsToCsvHandler)
], debug=True)
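As a footnote to this self-answer: since the DomainStateData entities share their key names with the corresponding Domain entities, the state objects could also be batch-fetched just for the domains being exported. A minimal sketch of that idea, relying on the old db API's Model.get_by_key_name accepting a list of key names; fetch_domains_with_state and _prefetched_state are hypothetical names:
def fetch_domains_with_state(limit=1000000):
    # fetch domains and attach their DomainStateData in one extra batch round trip
    domains = Domain.all().fetch(limit)
    key_names = [d.key().name() for d in domains]
    # get_by_key_name returns entities in the same order, with None for missing keys
    states = DomainStateData.get_by_key_name(key_names)
    for domain, state in zip(domains, states):
        if state is not None:
            domain._prefetched_state = state
    return domains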
