MongoDB count total number of tweets on weekdays and weekends (Python) - python

I have a collection of 24.8 million documents (tweet objects). Each tweet object represents a single tweet. Example of a tweet object:
{'_id': ObjectId('5ff0798da1fb5219b93ef4ca'),
'tweet_id': 1233904784635256833,
'user_id': 1092190045,
'user_followers_count': 1653,
'user_friends_count': 24,
'user_tweets_count': 62340,
'user_mentions': [],
'coordinates': {'type': 'Point', 'coordinates': [13.435, 52.481388]},
'created_at': 'Sun Mar 01 00:00:00 +0000 2020',
'lang': 'de',
'text': 'schepper'}
I want to find the total number of tweets on weekdays (Mon - Fri) and weekends (Sat and Sun).
I tried this:
tweets_by_weekday = [{
"$group": {
"_id": {
"day": {
"$dayOfWeek": "created_at"
}
},
"count":{"$sum":1}
}
}]
list(tweetsData.aggregate(tweets_by_weekday))

What about:
search_request = { "$or":
[
{ "created_at": { "$regex": "^Mon .*" } },
{ "created_at": { "$regex": "^Tue .*" } },
{ "created_at": { "$regex": "^Wed .*" } },
{ "created_at": { "$regex": "^Thu .*" } },
{ "created_at": { "$regex": "^Fri .*" } }
]
}
mycol.find(search_request).count() #Older pymongo
mycol.count_documents(search_request) #Newer pymongo
and:
search_request = { "$or":
[
{ "created_at": { "$regex": "^Sat .*" } },
{ "created_at": { "$regex": "^Sun .*" } }
]
}
mycol.find(search_request).count() #Older pymongo
mycol.count_documents(search_request) #Newer pymongo

Another option:
// Count tweets per day name. created_at is a string such as
// 'Sun Mar 01 00:00:00 +0000 2020', so its first three characters
// ("Mon", "Tue", ...) are used as the grouping key.
tweetsData.aggregate([
  // $substrCP is used instead of the deprecated $substr alias.
  { $project: { _id: 0, d: { $substrCP: ["$created_at", 0, 3] } } },
  { $group: { _id: "$d", cnt: { $sum: 1 } } }
])
{ "_id" : "Sat", "cnt" : 1 }
{ "_id" : "Sun", "cnt" : 1 }
{ "_id" : "Thu", "cnt" : 1 }
{ "_id" : "Wed", "cnt" : 1 }
{ "_id" : "Tue", "cnt" : 1 }
{ "_id" : "Fri", "cnt" : 1 }
{ "_id" : "Mon", "cnt" : 1 }
and distributed by weekends & working days:
// Total tweets on weekends (Sat, Sun) versus working days (Mon - Fri).
tweetsData.aggregate([
  // Keep only the three-letter day name taken from the created_at string.
  // $substrCP is used instead of the deprecated $substr alias.
  { $project: { _id: 0, d: { $substrCP: ["$created_at", 0, 3] } } },
  // One document per day name with its tweet count.
  { $group: { _id: "$d", cnt: { $sum: 1 } } },
  // Split the (at most seven) per-day documents into two buckets and
  // add up the counts inside each bucket.
  { $facet: {
      "weekends": [
        { $match: { _id: { $in: ['Sat', 'Sun'] } } },
        { $group: { _id: "weekends", total: { $sum: "$cnt" } } }
      ],
      "week": [
        { $match: { _id: { $in: ['Mon', 'Tue', 'Wed', 'Thu', 'Fri'] } } },
        { $group: { _id: "week", total: { $sum: "$cnt" } } }
      ]
  } }
])
{ "weekends" : [ { "_id" : "weekends", "total" : 2 } ], "week" : [ { "_id" : "week", "total" : 5 } ] }

Related

Aggregation $match within a $sum

I was wondering if it was possible to somehow use the $match operator within the $sum function for aggregation.
{ "$unwind": "$info.avatarInfoList" },
{ "$unwind": "$info.avatarInfoList.equipList" },
{ "$unwind": "$info.avatarInfoList.equipList.flat.reliquarySubstats" },
{
"$project": {
"name" : "$name",
"character" : "$info.avatarInfoList.avatarId",
"artifact" : "$info.avatarInfoList.equipList.itemId",
"statValue" : {
"$sum": [
{"$match" : { "$info.avatarInfoList.equipList.flat.reliquarySubstats.appendPropId" : "FIGHT_PROP_CRITICAL_HURT" } },
{"$multiply": [2, {"$match" : { "$info.avatarInfoList.equipList.flat.reliquarySubstats.appendPropId" : "FIGHT_PROP_CRITICAL" } }]}
]
},
}
},
{ "$sort": { "statValue": -1 }},
{ '$limit' : 30 }
]).to_list(length=None)
print(data)
I want to be able to use the value of the $sum operator within the project fields somehow, I just don't really understand what the right approach would be for this.
Sample Input (may be too long):
https://www.toptal.com/developers/hastebin/ixamekaxoq.json
Sample Output:
( 2 * FIGHT_PROP_CRITICAL ) + FIGHT_PROP_CRITICAL_HURT sorted from highest to lowest for each item.
{name: hat, character: Slayer, artifact: 13, statValue : 25.6}
There are still a few ambiguities about how you want to aggregate your data, but using the full document from your link, here's one way to produce the output you want.
N.B.: Weapons in the "equipList" don't have "reliquarySubstats" so they show a "statValue" of null in the output.
db.collection.aggregate([
{"$unwind": "$info.avatarInfoList"},
{"$unwind": "$info.avatarInfoList.equipList"},
{
"$project": {
"_id": 0,
"name": 1,
"character": "$info.avatarInfoList.avatarId",
"artifact": "$info.avatarInfoList.equipList.itemId",
"statValue": {
"$reduce": {
"input": "$info.avatarInfoList.equipList.flat.reliquarySubstats",
"initialValue": 0,
"in": {
"$switch": {
"branches": [
{
"case": {"$eq": ["$$this.appendPropId", "FIGHT_PROP_CRITICAL"]},
"then": {
"$add": [
"$$value",
{"$multiply": [2, "$$this.statValue"]}
]
}
},
{
"case": {"$eq": ["$$this.appendPropId", "FIGHT_PROP_CRITICAL_HURT"]},
"then": {"$add": ["$$value", "$$this.statValue"]}
}
],
"default": "$$value"
}
}
}
}
}
},
{"$sort": {"statValue": -1}}
])
Try it on mongoplayground.net.
It's not quite clear what you want to achieve, but as mentioned you want to be using $cond here.
like so:
{
"$project": {
"statValue": {
"$sum": [
{
$cond: [
{ // if this condition is true (prop id = prop critical hurt )
$eq: [
"$info.avatarInfoList.equipList.flat.reliquarySubstats.appendPropId",
"FIGHT_PROP_CRITICAL_HURT"
]
},
{ // then use this value for the "$sum"
"$multiply": [
2,
"$info.avatarInfoList.equipList.flat.reliquarySubstats.statValue"
]
},
0 // otherwise use this value for the sum.
]
}
]
}
}
Mongo Playground

generate unique id in nested document - Pymongo

generate unique id in nested document - Pymongo
my database looks like this...
{
"_id":"5ea661d6213894a6082af6d1",
"blog_id":"blog_one",
"comments": [
{
"user_id":"1",
"comment":"comment for blog one this is good"
},
{
"user_id":"2",
"comment":"other for blog one"
},
]
}
I want to add unique id in each and every comment,
I want it to output like this,
{
"_id":"5ea661d6213894a6082af6d1",
"blog_id":"blog_one",
"comments": [
{
"id" : "something" (auto generate unique),
"user_id":"1",
"comment":"comment for blog one this is good"
},
{
"id" : "something" (auto generate unique),
"user_id":"2",
"comment":"other for blog one"
},
]
}
I'm using PyMongo, is there a way to update this kind of document?
it's possible or not?
This update adds a unique id value to each nested document in the comments array. The id value is derived from the current time in milliseconds, and it is incremented for each successive array element to produce that element's id.
The code runs with MongoDB version 4.2 and PyMongo 3.10.
# Aggregation-pipeline update that rebuilds the "comments" array, giving
# every nested comment an "id" field.
pipeline = [
{
"$set": {
"comments": {
"$map": {
# Iterate over the array indexes 0 .. len(comments) - 1.
"input": { "$range": [ 0, { "$size": "$comments" } ] },
"in": {
"$mergeObjects": [
# id = timestamp converted to a long, plus the element's index.
# datetime.datetime.now() is evaluated once, in Python, when this
# list is built, so every element shares the same base value.
{ "id": { "$add": [ { "$toLong" : datetime.datetime.now() }, "$$this" ] } },
# The original comment at this index, merged with the new id.
{ "$arrayElemAt": [ "$comments", "$$this" ] }
]
}
}
}
}
}
]
# The empty filter matches any document; update_one modifies a single one.
collection.update_one( { }, pipeline )
The updated document:
{
"_id" : "5ea661d6213894a6082af6d1",
"blog_id" : "blog_one",
"comments" : [
{
"id" : NumberLong("1588179349566"),
"user_id" : "1",
"comment" : "comment for blog one this is good"
},
{
"id" : NumberLong("1588179349567"),
"user_id" : "2",
"comment" : "other for blog one"
}
]
}
[ EDIT ADD ]
The following works from mongo shell. It adds unique id for the comments array's nested documents - unique across the documents.
// Give every nested comment a comments_id that is unique across ALL
// documents, then write the rebuilt comments array back to each document.
// NOTE(review): the first $group gathers every comment of the collection
// into a single document, so this presumably only suits collections small
// enough to stay under the BSON document size limit - confirm before use.
db.collection.aggregate( [
  // One document per comment.
  { "$unwind": "$comments" },
  // Collect all unwound comments into one array, count them, and capture
  // a single timestamp to use as the base of the ids.
  {
    "$group": {
      "_id": null,
      "count": { "$sum": 1 },
      "docs": { "$push": "$$ROOT" },
      "now": { "$first": "$$NOW" }
    }
  },
  // Pair each collected entry with (timestamp in ms + its position).
  {
    "$addFields": {
      "docs": {
        "$map": {
          "input": { "$range": [ 0, "$count" ] },
          "in": {
            "$mergeObjects": [
              { "comments_id": { "$add": [ { "$toLong": "$now" }, "$$this" ] } },
              { "$arrayElemAt": [ "$docs", "$$this" ] }
            ]
          }
        }
      }
    }
  },
  // Back to one document per comment.
  { "$unwind": "$docs" },
  // Move the generated id inside the comment sub-document.
  { "$addFields": { "docs.comments.comments_id": "$docs.comments_id" } },
  { "$replaceRoot": { "newRoot": "$docs" } },
  // Reassemble the comments array of each original document.
  {
    "$group": {
      "_id": { "_id": "$_id", "blog_id": "$blog_id" },
      "comments": { "$push": "$comments" }
    }
  },
  // Restore the original top-level shape. The former "_id": 0 entry was a
  // duplicate key that the following "_id" entry silently overrode.
  {
    "$project": {
      "_id": "$_id._id",
      "blog_id": "$_id.blog_id",
      "comments": 1
    }
  }
// Write back to the same collection the pipeline read from.
] ).forEach(doc => db.collection.updateOne( { _id: doc._id }, { $set: { comments: doc.comments } } ) )
You can use ObjectId constructor to create the ids and place them in your nested documents.

Filter MongoDB query to find documents only if a field in a list of objects is not empty

I have a MongoDB document structure like following:
Structure
{
"stores": [
{
"items": [
{
"feedback": [],
"item_category": "101",
"item_id": "10"
},
{
"feedback": [],
"item_category": "101",
"item_id": "11"
}
]
},
{
"items": [
{
"feedback": [],
"item_category": "101",
"item_id": "10"
},
{
"feedback": ["A feedback"],
"item_category": "101",
"item_id": "11"
},
{
"feedback": [],
"item_category": "101",
"item_id": "12"
},
{
"feedback": [],
"item_category": "102",
"item_id": "13"
},
{
"feedback": [],
"item_category": "102",
"item_id": "14"
}
],
"store_id": 500
}
]
}
This is a single document in a collection. Some fields were removed to produce a minimal representation of the data.
What I want is to get items only if the feedback field in the items array is not empty. The expected result is:
Expected result
{
"stores": [
{
"items": [
{
"feedback": ["A feedback"],
"item_category": "101",
"item_id": "11"
}
],
"store_id": 500
}
]
}
This is what I tried, based on the examples in this link, which I think describe pretty much the same situation, but it didn't work. What's wrong with my query? Isn't it the same situation as the zipcode search example in the link? It returns everything, as in the first JSON snippet (Structure):
What I tried
query = {
'date': {'$gte': since, '$lte': until},
'stores.items': {"$elemMatch": {"feedback": {"$ne": []}}}
}
Thanks.
Please try this :
db.yourCollectionName.aggregate([
{ $match: { 'date': { '$gte': since, '$lte': until }, 'stores.items': { "$elemMatch": { "feedback": { "$ne": [] } } } } },
{ $unwind: '$stores' },
{ $match: { 'stores.items': { "$elemMatch": { "feedback": { "$ne": [] } } } } },
{ $unwind: '$stores.items' },
{ $match: { 'stores.items.feedback': { "$ne": [] } } },
{ $group: { _id: { _id: '$_id', store_id: '$stores.store_id' }, items: { $push: '$stores.items' } } },
{ $project: { _id: '$_id._id', store_id: '$_id.store_id', items: 1 } },
{ $group: { _id: '$_id', stores: { $push: '$$ROOT' } } },
{ $project: { 'stores._id': 0 } }
])
All of these stages are needed because you are operating on an array of arrays. The query is written assuming you are dealing with a large data set. Since you are filtering on dates, if the number of documents is much smaller after the first $match, you can drop the $match stage that sits between the two $unwind stages.
Ref 's :
$match,
$unwind,
$project,
$group
This aggregate query gets the needed result (using the provided sample document and run from the mongo shell):
db.stores.aggregate( [
{ $unwind: "$stores" },
{ $unwind: "$stores.items" },
{ $addFields: { feedbackExists: { $gt: [ { $size: "$stores.items.feedback" }, 0 ] } } },
{ $match: { feedbackExists: true } },
{ $project: { _id: 0, feedbackExists: 0 } }
] )

Nothing happens when trying to use $project

I am new to MongoDB and still stuck on the same pipeline. I don't understand why my usage of $project did not generate any output at all.
def make_pipeline():
# complete the aggregation pipeline
pipeline = [
{
'$match': {
"user.statuses_count": {"$gt":99 },
"user.time_zone": "Brasilia"
}
},
{
"$group": {
"_id": "$user.id",
"followers": { "$max": "$user.followers_count" }
}
},
{
"$sort": { "followers": -1 }
},{"$project": {
"userId": "$user.id",
"screen_name": "$user.screen_name",
"retweet_count": "$retweet_count"}},
{
"$limit" : 1
}
]
Any ideas?
Try this aggregation pipeline below, it should give you the desired output.
Using Mongo shell:
Test documents (with minimum test case):
db.tweet.insert([
{
"retweet_count" : 23,
"user" : {
"time_zone" : "Brasilia",
"statuses_count" : 2475,
"screen_name" : "Catherinemull",
"followers_count" : 169,
"id" : 37486277
},
"id" : NumberLong("22819398300")
},
{
"retweet_count" : 7,
"user" : {
"time_zone" : "Lisbon",
"statuses_count" : 4532,
"screen_name" : "foo",
"followers_count" : 43,
"id" : 37486278
},
"id" : NumberLong("22819398301")
},
{
"retweet_count" : 12,
"user" : {
"time_zone" : "Brasilia",
"statuses_count" : 132,
"screen_name" : "test2",
"followers_count" : 4,
"id" : 37486279
},
"id" : NumberLong("22819398323")
},
{
"retweet_count" : 4235,
"user" : {
"time_zone" : "Brasilia",
"statuses_count" : 33,
"screen_name" : "test4",
"followers_count" : 2,
"id" : 37486280
},
"id" : NumberLong("22819398308")
},
{
"retweet_count" : 562,
"user" : {
"time_zone" : "Kenya",
"statuses_count" : 672,
"screen_name" : "Kiptot",
"followers_count" : 169,
"id" : 37486281
},
"id" : NumberLong("22819398374")
},
{
"retweet_count" : 789,
"user" : {
"time_zone" : "Brasilia",
"statuses_count" : 5263,
"screen_name" : "test231",
"followers_count" : 8282,
"id" : 37486
},
"id" : NumberLong("22819398331")
}
]);
The Magic:
// Return the Brasilia-time-zone user (with more than 99 statuses) who has
// the most followers, together with their screen name and retweet count.
db.tweet.aggregate([
{
'$match': {
"user.statuses_count": {"$gt":99 },
"user.time_zone": "Brasilia"
}
},
{
"$group": {
"_id": "$user.id",
"followers": { "$max": "$user.followers_count" },
// Carry the source documents along: after $group only _id and the
// accumulator fields exist, so later stages could not read "$user.*"
// or "$retweet_count" without this.
"doc": {
"$addToSet": "$$ROOT"
}
}
},
{
// Highest follower count first.
"$sort": { "followers": -1 }
},
{
// One output document per carried source document.
"$unwind": "$doc"
},
{
"$project": {
"_id": 0,
"userId": "$_id",
"screen_name": "$doc.user.screen_name",
"retweet_count": "$doc.retweet_count",
"followers": 1
}
},
{
"$limit": 1
}
]);
Output:
/* 1 */
{
"result" : [
{
"userId" : 37486,
"screen_name" : "test231",
"retweet_count" : 789,
"followers" : 8282
}
],
"ok" : 1
}
-- UPDATE --
Python implementation:
>>> from bson.son import SON
>>> pipeline = [
... {"$match": {"user.statuses_count": {"$gt": 99}, "user.time_zone": "Brasilia"}},
... {"$group": {"_id": "$user.id", "followers": { "$max": "$user.followers_count" }, "doc": {"$addToSet": "$$ROOT"}}},
... {"$sort": {"followers": -1 }},
... {"$unwind": "$doc"}, {"$project": {"_id": 0, "userId": "$_id", "screen_name": "$doc.user.screen_name", "retweet_count": "$doc.retweet_count", "followers": 1}},
... {"$limit": 1}
... ]
>>> list(db.tweet.aggregate(pipeline))
[{u'userId': 37486, u'screen_name': u'test231', u'retweet_count': 789, u'followers': 8282}]

Elasticsearch full-text autocomplete

I'm using Elasticsearch through the python requests library. I've set up my analysers like so:
"analysis" : {
"analyzer": {
"my_basic_search": {
"type": "standard",
"stopwords": []
},
"my_autocomplete": {
"type": "custom",
"tokenizer": "keyword",
"filter": ["lowercase", "autocomplete"]
}
},
"filter": {
"autocomplete": {
"type": "edge_ngram",
"min_gram": 1,
"max_gram": 20,
}
}
}
I've got a list of artists who I'd like to search for using autocomplete: my current test case is 'bill w', which should match 'bill withers' etc - the artist mapping looks like this (this is a output of GET http://localhost:9200/my_index/artist/_mapping):
{
"my_index" : {
"mappings" : {
"artist" : {
"properties" : {
"clean_artist_name" : {
"type" : "string",
"analyzer" : "my_basic_search",
"fields" : {
"autocomplete" : {
"type" : "string",
"index_analyzer" : "my_autocomplete",
"search_analyzer" : "my_basic_search"
}
}
},
"submitted_date" : {
"type" : "date",
"format" : "basic_date_time"
},
"total_count" : {
"type" : "integer"
}
}
}
}
}
}
...and then I run this query to do the autocomplete:
"query": {
"function_score": {
"query": {
"bool": {
"must" : { "match": { "clean_artist_name.autocomplete": "bill w" } },
"should" : { "match": { "clean_artist_name": "bill w" } },
}
},
"functions": [
{
"script_score": {
"script": "artist-score"
}
}
]
}
}
This seems to match artists that contain either 'bill' or 'w' as well as 'bill withers': I only wanted to match artists that contain that exact string. The analyser seems to be working fine, here is the output of http://localhost:9200/my_index/_analyze?analyzer=my_autocomplete&text=bill%20w:
{
"tokens" : [ {
"token" : "b",
"start_offset" : 0,
"end_offset" : 6,
"type" : "word",
"position" : 1
}, {
"token" : "bi",
"start_offset" : 0,
"end_offset" : 6,
"type" : "word",
"position" : 1
}, {
"token" : "bil",
"start_offset" : 0,
"end_offset" : 6,
"type" : "word",
"position" : 1
}, {
"token" : "bill",
"start_offset" : 0,
"end_offset" : 6,
"type" : "word",
"position" : 1
}, {
"token" : "bill ",
"start_offset" : 0,
"end_offset" : 6,
"type" : "word",
"position" : 1
}, {
"token" : "bill w",
"start_offset" : 0,
"end_offset" : 6,
"type" : "word",
"position" : 1
} ]
}
So why is this not excluding matches with just 'bill' or 'w' in there? Is there something in my query that is allowing the results that only match with the my_basic_search analyser?
I believe you need a "term" filter instead of a "match" one for your "must". You already have split your artist names in ngrams so your searching text should match exactly one of the ngrams. For this to happen you need a "term" that will match exactly the ngrams:
"query": {
"function_score": {
"query": {
"bool": {
"must" : { "term": { "clean_artist_name.autocomplete": "bill w" } },
"should" : { "match": { "clean_artist_name": "bill w" } },
}
},
"functions": [
{
"script_score": {
"script": "artist-score"
}
}
]
}
}

Categories