I am new to MongoDB and still sitting on the same pipeline thing. I don't understand why my usage of $project did not generate any output at all?
def make_pipeline():
    """Build the tweet aggregation pipeline.

    Returns the list of pipeline stages: match active Brasilia users,
    group per user keeping their max follower count, sort descending,
    project the display fields, and keep the top document.

    Fix: the original version built `pipeline` but never returned it, so
    callers received ``None`` — which is why $project appeared to produce
    no output at all.
    """
    pipeline = [
        # Keep only tweets from Brasilia users with > 99 statuses.
        {
            "$match": {
                "user.statuses_count": {"$gt": 99},
                "user.time_zone": "Brasilia",
            }
        },
        # One bucket per user id; track that user's max follower count.
        {
            "$group": {
                "_id": "$user.id",
                "followers": {"$max": "$user.followers_count"},
            }
        },
        {"$sort": {"followers": -1}},
        # NOTE(review): after $group only `_id` and `followers` exist in the
        # stream, so "$user.id", "$user.screen_name" and "$retweet_count"
        # resolve to missing fields here. To surface them, carry the source
        # document through the $group (e.g. with $$ROOT) before projecting.
        {
            "$project": {
                "userId": "$user.id",
                "screen_name": "$user.screen_name",
                "retweet_count": "$retweet_count",
            }
        },
        {"$limit": 1},
    ]
    return pipeline
Any ideas?
Try this aggregation pipeline below, it should give you the desired output.
Using Mongo shell:
Test documents (with minimum test case):
db.tweet.insert([
{
"retweet_count" : 23,
"user" : {
"time_zone" : "Brasilia",
"statuses_count" : 2475,
"screen_name" : "Catherinemull",
"followers_count" : 169,
"id" : 37486277
},
"id" : NumberLong("22819398300")
},
{
"retweet_count" : 7,
"user" : {
"time_zone" : "Lisbon",
"statuses_count" : 4532,
"screen_name" : "foo",
"followers_count" : 43,
"id" : 37486278
},
"id" : NumberLong("22819398301")
},
{
"retweet_count" : 12,
"user" : {
"time_zone" : "Brasilia",
"statuses_count" : 132,
"screen_name" : "test2",
"followers_count" : 4,
"id" : 37486279
},
"id" : NumberLong("22819398323")
},
{
"retweet_count" : 4235,
"user" : {
"time_zone" : "Brasilia",
"statuses_count" : 33,
"screen_name" : "test4",
"followers_count" : 2,
"id" : 37486280
},
"id" : NumberLong("22819398308")
},
{
"retweet_count" : 562,
"user" : {
"time_zone" : "Kenya",
"statuses_count" : 672,
"screen_name" : "Kiptot",
"followers_count" : 169,
"id" : 37486281
},
"id" : NumberLong("22819398374")
},
{
"retweet_count" : 789,
"user" : {
"time_zone" : "Brasilia",
"statuses_count" : 5263,
"screen_name" : "test231",
"followers_count" : 8282,
"id" : 37486
},
"id" : NumberLong("22819398331")
}
]);
The Magic:
db.tweet.aggregate([
{
'$match': {
"user.statuses_count": {"$gt":99 },
"user.time_zone": "Brasilia"
}
},
{
"$group": {
"_id": "$user.id",
"followers": { "$max": "$user.followers_count" },
"doc": {
"$addToSet": "$$ROOT"
}
}
},
{
"$sort": { "followers": -1 }
},
{
"$unwind": "$doc"
},
{
"$project": {
"_id": 0,
"userId": "$_id",
"screen_name": "$doc.user.screen_name",
"retweet_count": "$doc.retweet_count",
"followers": 1
}
},
{
"$limit": 1
}
]);
Output:
/* 1 */
{
"result" : [
{
"userId" : 37486,
"screen_name" : "test231",
"retweet_count" : 789,
"followers" : 8282
}
],
"ok" : 1
}
-- UPDATE --
Python implementation:
>>> from bson.son import SON
>>> pipeline = [
... {"$match": {"user.statuses_count": {"$gt": 99}, "user.time_zone": "Brasilia"}},
... {"$group": {"_id": "$user.id", "followers": { "$max": "$user.followers_count" }, "doc": {"$addToSet": "$$ROOT"}}},
... {"$sort": {"followers": -1 }},
... {"$unwind": "$doc"}, {"$project": {"_id": 0, "userId": "$_id", "screen_name": "$doc.user.screen_name", "retweet_count": "$doc.retweet_count", "followers": 1}},
... {"$limit": 1}
... ]
>>> list(db.tweet.aggregate(pipeline))
[{u'userId': 37486, u'screen_name': u'test231', u'retweet_count': 789, u'followers': 8282}]
Related
I have a collection of 24.8 million documents (tweet objects). Each tweet object represents single tweet. Example of a tweet object:
{'_id': ObjectId('5ff0798da1fb5219b93ef4ca'),
'tweet_id': 1233904784635256833,
'user_id': 1092190045,
'user_followers_count': 1653,
'user_friends_count': 24,
'user_tweets_count': 62340,
'user_mentions': [],
'coordinates': {'type': 'Point', 'coordinates': [13.435, 52.481388]},
'created_at': 'Sun Mar 01 00:00:00 +0000 2020',
'lang': 'de',
'text': 'schepper'}
I want to find the total number of tweets on weekdays (Mon - Fri) and weekends (Sat and Sun).
I tried this:
tweets_by_weekday = [{
"$group": {
"_id": {
"day": {
"$dayOfWeek": "created_at"
}
},
"count":{"$sum":1}
}
}]
list(tweetsData.aggregate(tweets_by_weekday))
What about:
search_request = { "$or":
[
{ "created_at": { "$regex": "^Mon .*" } },
{ "created_at": { "$regex": "^Tue .*" } },
{ "created_at": { "$regex": "^Wed .*" } },
{ "created_at": { "$regex": "^Thu .*" } },
{ "created_at": { "$regex": "^Fri .*" } }
]
}
mycol.find(search_request).count() #Older pymongo
mycol.count_documents(search_request) #Newer pymongo
and:
search_request = { "$or":
[
{ "created_at": { "$regex": "^Sat .*" } },
{ "created_at": { "$regex": "^Sun .*" } }
]
}
mycol.find(search_request).count() #Older pymongo
mycol.count_documents(search_request) #Newer pymongo
Another option:
tweetsData.aggregate([{$project:{_id:0,d:{$substr:["$created_at",0,3]}}} , {$group:{_id:"$d" ,cnt:{$sum:1} } } ])
{ "_id" : "Sat", "cnt" : 1 }
{ "_id" : "Sun", "cnt" : 1 }
{ "_id" : "Thu", "cnt" : 1 }
{ "_id" : "Wed", "cnt" : 1 }
{ "_id" : "Tue", "cnt" : 1 }
{ "_id" : "Fri", "cnt" : 1 }
{ "_id" : "Mon", "cnt" : 1 }
and distributed by weekends & working days:
tweetsData.aggregate([{$project:{_id:0,d:{$substr:["$created_at",0,3]}}} , {$group:{_id:"$d" ,cnt:{$sum:1} } } , {$facet:{ "weekends":[ {$match:{_id:{$in:['Sat','Sun' ]}}} ,{$group:{_id:"weekends" , total:{$sum:"$cnt"}}} ],"week":[ {$match:{_id:{$in:['Mon','Tue','Wed','Thu','Fri']} } } , {$group:{_id:"week" , total:{$sum:"$cnt"}}} ] } } ])
{ "weekends" : [ { "_id" : "weekends", "total" : 2 } ], "week" : [ { "_id" : "week", "total" : 5 } ] }
# NOTE(review): this is the FAILING version from the question.  Two defects:
#   1. $cond takes exactly three operands (if, then, else); the extra
#      condition/value pairs below are what makes Mongo raise
#      "An object representing an expression must have exactly one field".
#   2. In aggregation-expression context a bare "pm_date" is a string
#      literal, not a field path — it would need the "$" prefix.
# `timezone` is presumably pytz and `relativedelta` from dateutil — TODO
# confirm against this module's imports (not visible here).
CURRENT_TZ = timezone(bp.BaseModel.__timezone__ or "Asia/Shanghai")
NOW = CURRENT_TZ.localize(datetime.utcnow())
# Cut-off 5 days from now for the "about to expire" bucket.
EXPIRY_DATE = NOW + relativedelta(days=5)
# `await` implies this runs inside an async function defined elsewhere.
res = await Fixture.aggregate(
[
# Drop end-of-life fixtures.
{"$match": dict(eol={"$nin": [True, ""]})},
{
"$group": {
"_id": {
"$cond": [
{"$lt": ["pm_date", start_date]},
"PENDING",
# Invalid from here on: $cond has already consumed its 3 operands.
{
"$gte": ["pm_date", start_date],
"$lt": ["pm_date", end_date],
},
"DONE",
{
"$gte": ["pm_due_date", start_date],
"$lte": ["pm_due_date", EXPIRY_DATE],
},
"WILL EXPIRED",
{"$lte": ["pm_due_date", NOW]},
"EXPIRED",
]
},
"count": {"$sum": 1},
}
},
]
)
from the above code, I expected output for example like
{
"_id" : "PENDING",
"qty": 50
},
{
"_id" : "DONE",
"qty": 50
},
{
"_id" : "WILL BE EXPIRE",
"qty": 40
}
{
"_id" : "EXPIRED",
"qty": 10
}
but my console shows the following error; can someone help me fix the pymongo pipeline for grouping on multiple conditions?
raise OperationFailure(msg % errmsg, code, response)
pymongo.errors.OperationFailure: An object representing an expression must have exactly one field: { $gte: [ "pm_date", new Date(1596240000000) ], $lt: [ "pm_date", new Date(1598918400000) ] }
Update: I got the result by using $switch (aggregation)
Refer to: https://docs.mongodb.com/manual/reference/operator/aggregation/switch/
# Working rewrite: $switch evaluates its branches in order and returns the
# first "then" whose "case" is truthy, so each document lands in exactly one
# status bucket.  Field paths now correctly carry the "$" prefix.
# `await` implies this runs inside an async function defined elsewhere;
# NOW / EXPIRY_DATE / start_date / end_date come from the surrounding scope.
res = await Fixture.aggregate(
[
# Drop end-of-life fixtures.
{"$match": dict(eol={"$nin": [True, ""]})},
{
# Compute a per-document "status" label, then count per label below.
"$project": {
"pm_due_date": 1,
"status": {
"$switch": {
"branches": [
# Branch order matters: EXPIRED is checked first.
{
"case": {"$lt": ["$pm_due_date", NOW]},
"then": "EXPIRED",
},
{
"case": {
"$and": [
{
"$gte": [
"$pm_due_date",
start_date,
]
},
{
"$lte": [
"$pm_due_date",
EXPIRY_DATE,
]
},
]
},
"then": "WILL EXPIRE",
},
{
"case": {"$lt": ["$pm_date", start_date]},
"then": "PENDING",
},
{
"case": {
"$and": [
{"$gte": ["$pm_date", start_date]},
{"$lt": ["$pm_date", end_date]},
]
},
"then": "DONE",
},
],
# Fallback when no branch matches.
"default": "NA",
}
},
}
},
# One output document per status with its document count.
{"$group": {"_id": "$status", "count": {"$sum": 1}}},
]
)
You should put your $cond in a $project stage instead of the $group
# Move the $cond into a $project stage, nesting one $cond per additional
# status (each $cond takes exactly if/then/else).
# Fix: inside aggregation expressions a bare "pm_date" is a string literal,
# not a field path — the "$" prefix is required for the comparisons to read
# the document fields (the original answer omitted it, so every comparison
# tested the literal string against a date).
[
    {"$match": dict(eol={"$nin": [True, ""]})},
    {"$project": {
        "status": {
            "$cond": [
                # Before the window opened -> still pending.
                {"$lt": ["$pm_date", start_date]},
                "PENDING",
                {"$cond": [
                    # Inside the [start_date, end_date) window -> done.
                    {
                        "$and": [
                            {"$gte": ["$pm_date", start_date]},
                            {"$lt": ["$pm_date", end_date]}
                        ]
                    },
                    "DONE",
                    {"$cond": [
                        # Approaching the expiry cut-off -> about to expire.
                        {
                            "$and": [
                                {"$gte": ["$pm_date", start_date]},
                                {"$lt": ["$pm_date", EXPIRY_DATE]}
                            ]
                        },
                        "WILL EXPIRED",
                        "EXPIRED"
                    ]}
                ]}
            ]
        }
    }},
    # Count documents per computed status label.
    {
        "$group": {
            "_id": "$status",
            "count": {"$sum": 1},
        }
    },
]
Following is the kibana JSON of a single row,
{
"_index": "questionanswers",
"_type": "doc",
"_id": "3",
"_version": 1,
"_score": 0,
"_source": {
"question": {
"id": 3,
"text": "Your first salary",
"answer_type": "FL",
"question_type": "BQ"
},
"candidate": {
"id": 13
},
"job": {
"id": 6
},
"id": 3,
"status": "AN",
"answered_on": "2019-07-12T09:26:01+00:00",
"answer": "12222222"
},
"fields": {
"answered_on": [
"2019-07-12T09:26:01.000Z"
]
}
}
I have an sql query like,
Select * from questionanswers where question.id = 3 and answer between 1250 and 1253666
I have converted this to elasticsearch query as follows,
{
"size": 1000,
"query": {
"bool": {
"must": [
{
"term": {
"question.id":3
}
},
{
"range": {
"answer": {
"from": 1250,
"to": 1253666999,
"include_lower": true,
"include_upper": true,
"boost": 1
}
}
}
],
"adjust_pure_negative": true,
"boost": 1
}
}
}
Here answer is declared as String, but it holds Date, Float and String values.
"question": {
"id": 3,
"text": "Your first salary",
"answer_type": "FL",
"question_type": "BQ"
},
Here answer_type tells which type of answer it is expecting.
When I try to run this query I am not getting desired results. I am getting an empty response on this hit.
But actually, there is a row that satisfies this query.
How my elasticsearch query should be so that I can filter with
question.id = 3, question.answer_type = "FL" and answer between 1250 and 1253666
See your document again. The answer is a string value, and you are treating it as a number in your query, so it obviously does not work.
change the mapping for this field to number.
Here is the document I indexed in a test index and ran your query again and it works
Indexing the document ( see the field answer)
POST /so-index4/_doc/1
{
"question": {
"id": 3,
"text": "Your first salary",
"answer_type": "FL",
"question_type": "BQ"
},
"candidate": {
"id": 13
},
"job": {
"id": 6
},
"id": 3,
"status": "AN",
"answered_on": "2019-07-12T09:26:01+00:00",
"answer": 12222222,
"fields": {
"answered_on": [
"2019-07-12T09:26:01.000Z"
]
}
}
and the query (same query that you provided above)
GET /so-index4/_search
{
"size": 1000,
"query": {
"bool": {
"must": [
{
"term": {
"question.id":3
}
},
{
"range": {
"answer": {
"from": 1250,
"to": 1253666999,
"include_lower": true,
"include_upper": true,
"boost": 1
}
}
}
],
"adjust_pure_negative": true,
"boost": 1
}
}
}
the result
{
"took" : 5,
"timed_out" : false,
"_shards" : {
"total" : 5,
"successful" : 5,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : 1,
"max_score" : 2.0,
"hits" : [
{
"_index" : "so-index4",
"_type" : "_doc",
"_id" : "1",
"_score" : 2.0,
"_source" : {
"question" : {
"id" : 3,
"text" : "Your first salary",
"answer_type" : "FL",
"question_type" : "BQ"
},
"candidate" : {
"id" : 13
},
"job" : {
"id" : 6
},
"id" : 3,
"status" : "AN",
"answered_on" : "2019-07-12T09:26:01+00:00",
"answer" : 12222222,
"fields" : {
"answered_on" : [
"2019-07-12T09:26:01.000Z"
]
}
}
}
]
}
}
I am a newbie in Python. I have some difficulties generating a nested JSON using a for loop in Python. To generate the nested JSON, I get the length of a dictionary at runtime and, based on that length, I want to generate the nested JSON. E.g. the length of my dictionary is 4, but it may vary. Here is my data_dict dictionary:
data_dict = {"PHOTO_1" : {"key1" : "PHOTO_2", "key2" : "PHOTO_3", "key3" : "PHOTO_4"}, "PHOTO_2" : {"key1" : "PHOTO_1", "key2" : "PHOTO_3"},"PHOTO_3" : {"key1" : "PHOTO_2"},"PHOTO_4" : {"key1" : "PHOTO_1", "key2" : "PHOTO_2", "key3" : "PHOTO_3"}}
Expected result :
{
"Requests": [
{
"photo": {
"photoId": {
"id": "PHOTO_1"
},
"connections": {
"target": {
"id": "PHOTO_2"
}
}
},
"updateData": "connections"
},
{
"photo": {
"photoId": {
"id": "PHOTO_1"
},
"connections": {
"target": {
"id": "PHOTO_3"
}
}
},
"updateData": "connections"
},
{
"photo": {
"photoId": {
"id": "PHOTO_1"
},
"connections": {
"target": {
"id": "PHOTO_4"
}
}
},
"updateData": "connections"
},
{
"photo": {
"photoId": {
"id": "PHOTO_2"
},
"connections": {
"target": {
"id": "PHOTO_1"
},
}
},
"updateData": "connections"
},
{
"photo": {
"photoId": {
"id": "PHOTO_2"
},
"connections": {
"target": {
"id": "PHOTO_3"
},
}
},
"updateData": "connections"
},
{
"photo": {
"photoId": {
"id": "PHOTO_3"
},
"connections": {
"target": {
"id": "PHOTO_2"
},
}
},
"updateData": "connections"
},
{
"photo": {
"photoId": {
"id": "PHOTO_4"
},
"connections": {
"target": {
"id": "PHOTO_1"
},
}
},
"updateData": "connections"
},
{
"photo": {
"photoId": {
"id": "PHOTO_4"
},
"connections": {
"target": {
"id": "PHOTO_2"
},
}
},
"updateData": "connections"
},
{
"photo": {
"photoId": {
"id": "PHOTO_4"
},
"connections": {
"target": {
"id": "PHOTO_3"
},
}
},
"updateData": "connections"
}
]
}
Please help. I can't figure out how to solve this. Please don't mark it as a duplicate — I have already checked all the existing answers and my JSON structure is quite different.
The solution using itertools.permutations() function:
import itertools, json

# Every ordered pair of distinct photo ids becomes one connection request.
data_dict = {"first_photo": "PHOTO_1", "second_photo": "PHOTO_2", "Thrid": "PHOTO_3"}

requests = [
    {
        "photo": {
            "photoId": {"id": source},
            "connections": {"target": {"id": target}},
        },
        "updateData": "connections",
    }
    for source, target in sorted(itertools.permutations(data_dict.values(), 2))
]
result = {"Requests": requests}
print(json.dumps(result, indent=4))
The additional approach for the new input dict:
# Source photo id -> {key: target photo id}; keys sorted for stable output.
data_dict = {"PHOTO_1": {"key1": "PHOTO_2", "key2": "PHOTO_3", "key3": "PHOTO_4"}, "PHOTO_2": {"key1": "PHOTO_1", "key2": "PHOTO_3"}, "PHOTO_3": {"key1": "PHOTO_2"}, "PHOTO_4": {"key1": "PHOTO_1", "key2": "PHOTO_2", "key3": "PHOTO_3"}}

# One request per (source photo, connection target) pair.
result = {
    "Requests": [
        {
            "photo": {
                "photoId": {"id": photo_id},
                "connections": {"target": {"id": target_id}},
            },
            "updateData": "connections",
        }
        for photo_id, links in sorted(data_dict.items())
        for target_id in sorted(links.values())
    ]
}
print(json.dumps(result, indent=4))
I'm using Elasticsearch through the python requests library. I've set up my analysers like so:
"analysis" : {
"analyzer": {
"my_basic_search": {
"type": "standard",
"stopwords": []
},
"my_autocomplete": {
"type": "custom",
"tokenizer": "keyword",
"filter": ["lowercase", "autocomplete"]
}
},
"filter": {
"autocomplete": {
"type": "edge_ngram",
"min_gram": 1,
"max_gram": 20,
}
}
}
I've got a list of artists who I'd like to search for using autocomplete: my current test case is 'bill w', which should match 'bill withers' etc - the artist mapping looks like this (this is a output of GET http://localhost:9200/my_index/artist/_mapping):
{
"my_index" : {
"mappings" : {
"artist" : {
"properties" : {
"clean_artist_name" : {
"type" : "string",
"analyzer" : "my_basic_search",
"fields" : {
"autocomplete" : {
"type" : "string",
"index_analyzer" : "my_autocomplete",
"search_analyzer" : "my_basic_search"
}
}
},
"submitted_date" : {
"type" : "date",
"format" : "basic_date_time"
},
"total_count" : {
"type" : "integer"
}
}
}
}
}
}
...and then I run this query to do the autocomplete:
"query": {
"function_score": {
"query": {
"bool": {
"must" : { "match": { "clean_artist_name.autocomplete": "bill w" } },
"should" : { "match": { "clean_artist_name": "bill w" } },
}
},
"functions": [
{
"script_score": {
"script": "artist-score"
}
}
]
}
}
This seems to match artists that contain either 'bill' or 'w' as well as 'bill withers': I only wanted to match artists that contain that exact string. The analyser seems to be working fine, here is the output of http://localhost:9200/my_index/_analyze?analyzer=my_autocomplete&text=bill%20w:
{
"tokens" : [ {
"token" : "b",
"start_offset" : 0,
"end_offset" : 6,
"type" : "word",
"position" : 1
}, {
"token" : "bi",
"start_offset" : 0,
"end_offset" : 6,
"type" : "word",
"position" : 1
}, {
"token" : "bil",
"start_offset" : 0,
"end_offset" : 6,
"type" : "word",
"position" : 1
}, {
"token" : "bill",
"start_offset" : 0,
"end_offset" : 6,
"type" : "word",
"position" : 1
}, {
"token" : "bill ",
"start_offset" : 0,
"end_offset" : 6,
"type" : "word",
"position" : 1
}, {
"token" : "bill w",
"start_offset" : 0,
"end_offset" : 6,
"type" : "word",
"position" : 1
} ]
}
So why is this not excluding matches with just 'bill' or 'w' in there? Is there something in my query that is allowing the results that only match with the my_basic_search analyser?
I believe you need a "term" filter instead of a "match" one for your "must". You already have split your artist names in ngrams so your searching text should match exactly one of the ngrams. For this to happen you need a "term" that will match exactly the ngrams:
"query": {
"function_score": {
"query": {
"bool": {
"must" : { "term": { "clean_artist_name.autocomplete": "bill w" } },
"should" : { "match": { "clean_artist_name": "bill w" } },
}
},
"functions": [
{
"script_score": {
"script": "artist-score"
}
}
]
}
}