I have a model structure as below,
ACTIVE_STATUS = ['waiting', 'loading', 'starting', 'running', 'stopping']
INACTIVE_STATUS = ['stopped', 'finished', 'failed', 'lost']
ALL_STATUS = ACTIVE_STATUS + INACTIVE_STATUS
class Task(models.Model):
name = models.CharField(max_length=20)
class Job(models.Model):
task = models.ForeignKey(Task, related_name='jobs')
timestamp = models.DateTimeField(auto_now_add=True)
status = models.CharField(choices=zip(ALL_STATUS, ALL_STATUS), max_length=20)
How can I annotate the "latest timestamp and its status" into Task queryset?
I have managed to obtain the latest timestamp by,
Task.objects.annotate(latest_ts=models.Max(models.F('job__timestamp')))
So, how can I get the corresponding status?
Update-1
The ultimate aim of this query is to sort the Task queryset in the following order:
with zero Jobs ( say Task.objects.filter(job__isnull=True) )
latest_job=='running'
Update-2
TaskManager class that used to obtain the sorted queryset
class TaskManager(models.Manager):
    """Manager whose default queryset is annotated and sorted by job state.

    Every Task row carries three annotations:

    * ``latest_job``    -- timestamp of the most recent related Job.
    * ``latest_status`` -- status of that most recent Job (NULL without jobs).
    * ``qs_order``      -- 2 for tasks without jobs, 1 when the latest job
      is 'running', 0 otherwise.

    The queryset is ordered by ``qs_order`` descending.
    """

    def get_queryset(self):
        """Return the annotated Task queryset ordered by ``-qs_order``."""
        qs = super().get_queryset()
        # Job.task declares related_name='jobs', so the reverse lookup from
        # Task is 'jobs'; 'job__...' cannot be resolved by the ORM.
        latest_job = models.Max(models.F('jobs__timestamp'))
        # Correlated subquery: status of the newest Job of the outer Task.
        latest_status = models.Subquery(
            Job.objects.filter(
                task_id=models.OuterRef('pk')
            ).values('status').order_by('-timestamp')[:1]
        )
        qs_order = models.Case(
            models.When(jobs__isnull=True, then=models.Value(2)),
            models.When(latest_status='running', then=models.Value(1)),
            default=models.Value(0),
            output_field=models.IntegerField()
        )
        # latest_status must be annotated before qs_order, which refers to it.
        return qs.annotate(
            latest_job=latest_job,
            latest_status=latest_status,
            qs_order=qs_order,
        ).order_by('-qs_order')
You can work with a Subquery expression [Django-doc]:
from django.db.models import OuterRef, Subquery
Task.objects.annotate(
latest_status=Subquery(
Job.objects.filter(
task_id=OuterRef('pk')
).values('status').order_by('-timestamp')[:1]
)
)
Based on this, you can probably also filter on the latest status:
from django.db.models import Q
from django.db.models import OuterRef, Subquery
Task.objects.annotate(
latest_status=Subquery(
Job.objects.filter(
task_id=OuterRef('pk')
).values('status').order_by('-timestamp')[:1]
)
).filter(
Q(jobs=None) | Q(latest_status='running')
)
Or we can order by the existence of a Job, etc., with:
from django.db.models import BooleanField, Exists, ExpressionWrapper, Max, Q
from django.db.models import OuterRef, Subquery
Task.objects.annotate(
    # Status of the most recent Job of each Task (NULL when it has none).
    latest_status=Subquery(
        Job.objects.filter(
            task_id=OuterRef('pk')
        ).values('status').order_by('-timestamp')[:1]
    ),
    latest_job=Max('jobs__timestamp')
).order_by(
    # Tasks without any Job sort first (False sorts before True).
    Exists(Job.objects.filter(task_id=OuterRef('pk'))).asc(),
    # Among tasks that have jobs, those whose latest job is 'running' come
    # first, so the flag has to be sorted descending (True before False).
    ExpressionWrapper(Q(latest_status='running'), output_field=BooleanField()).desc(),
    # Final tie-breaker so the ordering is deterministic.
    'pk'
)
It might be a good idea to order by the primary key as the last criterion, to make the ordering deterministic.
Willem's answer looks promising for what I want, but I have managed to obtain the ordering by annotating the count of jobs.
This is the model manager at the end,
class TaskManager(models.Manager):
    """Default manager returning Tasks annotated with job info and pre-sorted.

    Annotations added to every row: ``job_count``, ``latest_job``,
    ``latest_status`` and ``qs_order`` (2 = no jobs, 1 = latest job is
    'running', 0 = anything else). Rows are ordered by ``-qs_order``, ``-pk``.
    """

    def get_queryset(self):
        """Build the annotated, ordered Task queryset."""
        # Correlated subquery picking the status of the newest Job per Task.
        newest_status = models.Subquery(
            Job.objects
            .filter(task_id=models.OuterRef('pk'))
            .values('status')
            .order_by('-timestamp')[:1]
        )
        rank = models.Case(
            models.When(job_count=0, then=models.Value(2)),
            models.When(latest_status='running', then=models.Value(1)),
            default=models.Value(0),
            output_field=models.IntegerField(),
        )
        # Insertion order matters: `qs_order` refers to the annotations
        # `job_count` and `latest_status`, so they must be added first.
        annotations = {
            'job_count': models.Count('jobs'),
            'latest_job': models.Max(models.F('jobs__timestamp')),
            'latest_status': newest_status,
            'qs_order': rank,
        }
        base = super().get_queryset()
        return base.annotate(**annotations).order_by('-qs_order', '-pk')
Result screenshot
Related
I have three models:
class BaseModel(Model):
deleted = BooleanField(default=False)
class Document(BaseModel):
    """A document whose price is the sum of the lines in its sections."""

    def total_price(self):
        """Return the total price of this document.

        Only lines with ``deleted=False`` that belong to sections of this
        document with ``deleted=False`` are counted; the summation itself is
        delegated to ``LineQuerySet.total_price``.
        """
        # `Line` is the line model of this module; it is looked up when the
        # method runs, so it may be defined further down in the file.
        return Line.objects.filter(
            section__in=self.sections.filter(deleted=False),
            deleted=False,
        ).total_price()
class Section(BaseModel):
document = ForeignKey(Document, on_delete=CASCADE, related_name='sections')
class LineQuerySet(QuerySet):
    """QuerySet for lines with helpers to compute prices in the database."""

    def with_total_price(self):
        """Annotate each line with ``total_price`` = quantity * price."""
        total_price = F('quantity') * F('price')
        return self.annotate(
            total_price=ExpressionWrapper(total_price, output_field=DecimalField())
        )

    def total_price(self):
        """Return the sum of ``total_price`` over all lines in this queryset.

        Returns ``Decimal(0)`` when the queryset is empty, because the
        aggregate yields ``None`` in that case.
        """
        # Call the annotation helper by its real name (`with_total_price`).
        return self.with_total_price().aggregate(
            Sum('total_price', output_field=DecimalField())
        )['total_price__sum'] or Decimal(0)
class Line(BaseModel):
objects = LineQuerySet.as_manager()
section = ForeignKey(Section, on_delete=CASCADE, related_name='lines')
price = DecimalField()
quantity = DecimalField()
As you can see on the LineQuerySet, there is a method that will annotate the queryset with the total price of each line, based on the price and quantity.
Now I can easily get the total price of an entire document doing something like this (Note that lines and sections with deleted=True are ignored):
document = Document.objects.get(pk=1)
total_price = document.total_price()
However, now I would like to generate a queryset of multiple documents, and annotate that with each document's total price. I've tried a combination of annotates, aggregates, making use of prefetch_related (using Prefetch), and OuterRef, but I can't quite seem to be able to get the result I want without it throwing an error.
Is there some way to perform this operation in a queryset, making it then possible to filter or order by this total_price field?
You can annotate with:
from django.db.models import F, Sum
# Keep only non-deleted documents, sections and lines, then sum
# price * quantity over the remaining lines of each document.
Document.objects.filter(
    deleted=False,
    sections__deleted=False,
    # The reverse name from Document to Section is 'sections'
    # (related_name='sections'), so the lookup starts with `sections__`.
    sections__lines__deleted=False
).annotate(
    total_price=Sum(F('sections__lines__price')*F('sections__lines__quantity'))
)
Each Document that arises from this queryset will have an attribute .total_price which is the sum of the price times the quantity of all related lines of all related sections of that Document.
An alternative is to work with a Subquery expression [Django-doc] to determine the sum, so:
from django.db.models import F, OuterRef, Subquery, Sum
Document.objects.annotate(
    total_price=Subquery(
        # Group the lines per document and sum price * quantity per group.
        Line.objects.values(
            document_id=F('section__document_id')
        ).filter(
            deleted=False,
            section__deleted=False,
            # Line reaches Document only through Section, so the lookup has
            # to span the `section` relation.
            section__document__deleted=False
        ).annotate(
            total_price=Sum(F('price') * F('quantity'))
        ).order_by('document_id').filter(document_id=OuterRef('pk')).values('total_price')[:1]
    )
)
my model looks like this
class Model(models.Model):
user_id = models.ForeignKey()
date = models.DateField()
field1 = models.FloatField()
field2 = models.FloatField()
I have a below queryset
queryset = Model.objects.filter(user_id__exact=5) \
.annotate(weekstartdate=Trunc('date', 'week')) \
.values('weekstartdate') \
.annotate(avg_field1=Avg('field1')) \
.annotate(avg_field2=Avg('field2')) \
.order_by('-weekstartdate')
which is working perfectly. Now I want to add a weekenddate field to the above queryset, where weekenddate = weekstartdate + 6 days. I added the line below to the above query:
.annotate(weekenddate=Trunc('date', 'week') + timedelta(days=7), output_field=DateField())
but it is complaining :-
TypeError: QuerySet.annotate() received non-expression(s):
<django.db.models.fields.DateField>
Relevant imports
from django.db.models import Avg, Q
from django.db.models.functions import Trunc
from django.db.models import DateTimeField, ExpressionWrapper, F, DateField
Note :-
A simple for loop over the queryset afterwards is not what I am looking for: after a field has been assigned manually, filtering the queryset still runs a new query against the original queryset, because querysets are lazy.
If the answer can use relativedelta from the dateutil library, that would be even better.
You need to use ExpressionWrapper around it.
YourModel.objects.annotate(
weekenddate=models.ExpressionWrapper(
Trunc('date', 'week') + timedelta(days=7),
output_field=models.DateField(),
),
)
models:
class A(models.Model):
created_on = models.DateTimeField()
class B(models.Model):
a = models.ForeignKey('A', verbose_name='bs')
class C(models.Model):
a = models.ForeignKey('A', verbose_name='cs')
I want to count the number of B and C by A and group them.
Here is my attempt, but the result is not correct.
from django.db.models import Count, Q
from django.db.models.functions import Trunc
a_qs = A.objects.filter(Q(created_on__gte=start_date, created_on__lte=end_date))
# a_qs is already a QuerySet; only model classes expose the `.objects`
# manager, so annotate() is called on the queryset directly.
g = a_qs.annotate(time=Trunc('created_on', 'month')).values('time').order_by('time')
result = g.annotate(a_total=Count('id'), b_total=Count('bs'), c_total=Count('cs'))
Although this will not report an error, the result will be incorrect. I don't want to loop queryset.
I have an idea that can meet my needs, but in the end I need to merge queryset.
a_qs = A.objects.filter(Q(created_on__gte=start_date, created_on__lte=end_date))
a_g = a_qs.annotate(time=Trunc('created_on', 'month')).values('time').order_by('time')
a_result = a_g.annotate(a_total=Count('id'))
b_g = B.objects.filter(a__in=a_qs).annotate(time=Trunc('a__created_on', 'month')).values('time').order_by('time')
b_result = b_g.annotate(b_total=Count('id'))
c_g = ...
c_result = ...
The main problem here is that you make two JOINs, and thus the JOINs act as a "multiplier" of each other. You can count the distinct related objects with:
A.objects.filter(
created_on__range=(start_date, end_date)
).annotate(
time=Trunc('created_on', 'month')
).values('time').annotate(
a_total=Count('id', distinct=True),
b_total=Count('b', distinct=True),
c_total=Count('c', distinct=True)
).order_by('time')
I have a View that returns some statistics about email lists growth. The models involved are:
models.py
class Contact(models.Model):
email_list = models.ForeignKey(EmailList, related_name='contacts')
customer = models.ForeignKey('Customer', related_name='contacts')
status = models.CharField(max_length=8)
create_date = models.DateTimeField(auto_now_add=True)
class EmailList(models.Model):
customers = models.ManyToManyField('Customer',
related_name='lists',
through='Contact')
class Customer(models.Model):
is_unsubscribed = models.BooleanField(default=False, db_index=True)
unsubscribe_date = models.DateTimeField(null=True, blank=True, db_index=True)
In the View what I'm doing is iterating over all EmailLists objects and getting some metrics: the following way:
view.py
class ListHealthView(View):
    """Report contact growth metrics for every EmailList."""

    def get(self, request, *args, **kwargs):
        """Return ``{'data': [...]}`` with one metrics dict per EmailList.

        Each dict holds counts of active contacts, unsubscribed customers and
        deleted contacts, both before ``start_date`` (``past_*``) and within
        ``(start_date, end_date)`` (``new_*``). The dates come from
        ``get_dates_from_querystring(request)``.
        """
        start_date, end_date = get_dates_from_querystring(request)
        period = (start_date, end_date)

        def contact_count(email_list, status, **date_lookup):
            # Contacts of the list with the given status and date condition.
            return email_list.contacts.filter(status=status, **date_lookup).count()

        def unsubscribe_count(email_list, **date_lookup):
            # Unsubscribed customers of the list that have an active contact.
            return email_list.customers.filter(
                is_unsubscribed=True,
                contacts__status='active',
                **date_lookup
            ).count()

        data = []
        # NOTE: six COUNT queries are issued per EmailList.
        for email_list in EmailList.objects.all():
            # Historic data up to start_date.
            past_contacts = contact_count(
                email_list, 'active', create_date__lt=start_date)
            past_unsubscribes = unsubscribe_count(
                email_list, unsubscribe_date__lt=start_date)
            past_deleted = contact_count(
                email_list, 'deleted', modify_date__lt=start_date)

            # Data for the requested timeframe.
            new_contacts = contact_count(
                email_list, 'active', create_date__range=period)
            new_unsubscribes = unsubscribe_count(
                email_list, unsubscribe_date__range=period)
            new_deleted = contact_count(
                email_list, 'deleted', modify_date__range=period)

            data.append(dict(
                new_contacts=new_contacts,
                new_unsubscribes=new_unsubscribes,
                new_deleted=new_deleted,
                past_contacts=past_contacts,
                past_unsubscribes=past_unsubscribes,
                past_deleted=past_deleted,
            ))
        return Response({'data': data})
Now this works fine, but as my DB started growing, the response time of this view rose above 1s, and it occasionally causes long-running queries in the database. I think the most obvious improvement would be to index EmailList.customers, but maybe it needs to be a compound index? Also, is there a better way of doing this, maybe using aggregates?
EDIT
After #bdoubleu answer I tried the following:
data = (
EmailList.objects.annotate(
past_contacts=Count(Subquery(
Contact.objects.values('id').filter(
email_list=F('pk'),
status='active',
create_date__lt=start_date)
)),
past_deleted=Count(Subquery(
Contact.objects.values('id').filter(
email_list=F('pk'),
status='deleted',
modify_date__lt=start_date)
)),
)
.values(
'past_contacts', 'past_deleted',
)
)
I had to change to use F instead of OuterRef because I realized that my model EmailList has id = HashidAutoField(primary_key=True, salt='...') was causing ProgrammingError: more than one row returned by a subquery used as an expression but I'm not completely sure about it.
Now the query works but sadly all counts are returned as 0
As is, your code produces 6 queries for every EmailList instance. For 100 instances that's a minimum of 600 queries, which slows things down.
You can optimize by using Subquery() expressions and .values().
from django.db.models import Count, OuterRef, Subquery
data = (
EmailList.objects
.annotate(
past_contacts=Count(Subquery(
Contact.objects.filter(
email_list=OuterRef('pk'),
status='active',
create_date__lt=start_date
).values('id')
)),
past_unsubscribes=...,
past_deleted=...,
new_contacts=...,
new_unsubscribes=...,
new_deleted=...,
)
.values(
'past_contacts', 'past_unsubscribes',
'past_deleted', 'new_contacts',
'new_unsubscribes', 'new_deleted',
)
)
Update: for older versions of Django your subquery may need to look like below
customers = (
Customer.objects
.annotate(
template_count=Subquery(
CustomerTemplate.objects
.filter(customer=OuterRef('pk'))
.values('customer')
.annotate(count=Count('*')).values('count')
)
).values('name', 'template_count')
)
I am trying to build a complex(for me) query for one of my projects. Django version is 1.11.4 and PostgreSQL version is 9.6.
Here are the models.
class Event(models.Model):
...
name = models.CharField(max_length=256)
classification = models.ForeignKey("events.Classification", related_name="events", null=True, blank=True)
...
class Classification(models.Model):
...
segment = models.ForeignKey("events.ClassificationSegment", related_name="classifications", blank=True, null=True)
...
class ClassificationSegment(models.Model):
...
name = models.CharField(max_length=256)
...
I got stuck somewhere around here and can't make further progress.
from django.db.models import CharField, Value as V
from django.db.models.functions import Concat
from django.contrib.postgres.aggregates import ArrayAgg
from django.db.models import OuterRef, Subquery
import events.models
event_subquery = events.models.Event.objects.filter(classification__segment=OuterRef('pk')) \
.annotate(event=Concat(V('{id:'), 'id', V(', name:"'), 'name', V('"}'), output_field=CharField()))
final_list = events.models.ClassificationSegment.objects.annotate(
event_list=ArrayAgg(Subquery(event_subquery.values('event')[:6])))
I have a raw query. Here it is.
final_events = events.models.ClassificationSegment.objects.raw('SELECT "events_classificationsegment"."id", "events_classificationsegment"."name", (SELECT ARRAY(SELECT CONCAT(\'{id:\', CONCAT(U0."id", CONCAT(\',\', \'name:"\', U0."name", \'"}\'))) AS "event" FROM "events_event" U0 INNER JOIN "events_classification" U1 ON (U0."classification_id" = U1."id") WHERE U1."segment_id" = ("events_classificationsegment"."id") LIMIT 6)) AS "event_list" FROM "events_classificationsegment"')
You can see the result in the screenshot. I guess I am on the right way. Can anyone help me?
Thanks.
Postgres has a really nice way of making an array from a subquery:
SELECT foo.id, ARRAY(SELECT bar FROM baz WHERE foo_id = foo.id) AS bars
FROM foo
To do this within the ORM, you can define a subclass of Subquery:
class Array(Subquery):
    """Subquery variant whose SQL is wrapped in ``ARRAY(...)``."""

    # %(subquery)s is replaced with the SQL of the wrapped queryset.
    template = 'ARRAY(%(subquery)s)'
and use this in your queryset:
queryset = ClassificationSegment.objects.annotate(
event_list=Array(event_subquery.values('event')[:6])
)