How to Build a Keyword Historgam in MongoDB? - python

I am using MongoDB 3.4 and PyMongo. I have a set of keywords:
keywords = [ 'bar', 'foo', ..., 'zoo' ]
I also have a collection:
docs = { 'data' : ' ... bar foo ... ',
'data' : ' ... foo ... ',
'data' : ' ... zoo ... ' }
I am looking for a PyMongo aggregation query which is going to give me a dict:
{ 'bar' : 0, 'foo' : 2, ..., 'zoo' : 0 }

There isn't anything language specific about this, as the only solutions are either all aggregate or using mapReduce, where the latter is defined in JavaScript functions
Just setting up some sample data:
db.wordstuff.insertMany([
{ 'data': "foo brick bar" },
{ 'data': "brick foo" },
{ 'data': "bar brick baz" },
{ 'data': "bax" },
{ 'data': "brin brok fu foo" }
])
Aggregation Framework
Then you can run the aggregation statement:
db.wordstuff.aggregate([
{ "$project": {
"_id": 0,
"split": {
"$filter": {
"input": { "$split": [ "$data", " " ] },
"cond": { "$in": [ "$$this", ["bar","foo","baz","blat"] ] }
}
}
}},
{ "$unwind": "$split" },
{ "$group": { "_id": "$split", "count": { "$sum": 1 } }},
{ "$group": {
"_id": null,
"data": { "$push": { "k": "$_id", "v": "$count" } }
}},
{ "$replaceRoot": {
"newRoot": {
"$arrayToObject": {
"$map": {
"input": ["bar","foo","baz","blat"],
"as": "d",
"in": {
"$cond": {
"if": { "$ne": [{ "$indexOfArray": ["$data.k","$$d"] },-1] },
"then": {
"$arrayElemAt": [
"$data",
{ "$indexOfArray": ["$data.k","$$d"] }
]
},
"else": { "k": "$$d", "v": 0 }
}
}
}
}
}
}}
])
In reality, all of the real work is done by this point:
db.wordstuff.aggregate([
{ "$project": {
"_id": 0,
"split": {
"$filter": {
"input": { "$split": [ "$data", " " ] },
"cond": { "$in": [ "$$this", ["bar","foo","baz","blat"] ] }
}
}
}},
{ "$unwind": "$split" },
{ "$group": { "_id": "$split", "count": { "$sum": 1 } }},
])
Which gives you output like:
{ "_id" : "baz", "count" : 1.0 }
{ "_id" : "bar", "count" : 2.0 }
{ "_id" : "foo", "count" : 3.0 }
So the real work here is being done by $split and that is the main dependency on using the aggregation framework, so you need MongoDB 3.4 at least in order to do this. The very simple premise is to $split the words out individually as array members, then $filter the content to match the input array of words to match.
That $filter uses $in, which is another addition as of MongoDB 3.4 to match against each listed word. There are other operators that can do this with longer syntax, but we know we already need MongoDB 3.4 so this is the shortest syntax.
All that is really done after that is to $unwind the matched array of words from each document, then $group to obtain those matched words as a distinct list, along with the count of the occurrences.
That really is all there is to it from the main perspective of the database.
The following parts are actually "optional" since these are easy to reproduce in code, and probably look a lot clearer and cleaner by doing so. But just to demonstrate the newer operators that would require MongoDB 3.4.4 at least for the introduction of $arrayToObject.
Again the basics are that the next $group "rolls up" the matched words from the cursor into an array within a single document. There is also a very specific key naming applied of "k" and "v" for later reasons.
Then you use a $replaceRoot stage since the content of the document returned is evaluated from an expression. This expression uses $map to iterate over the "input array" of words and matches those to the entries created from the aggregation. This matching is done using $indexOfArray do return the matched index of the compared value.
You use this within $cond as you either want to transform that value into a matched elment using $arrayElemAt, or alternately recognize the index was not a match. This either returns the aggregated entry ( obtained from earlier matches ) or a "default" value of 0 for the given word.
The final part uses $arrayToObject which transforms an array of objects with properties "k" and "v" in to "key/value" pairs as an object.
So you can ask MongoDB to do it, but the data is actually reduced by the minimal pipeline as shown, so you may as well do it in client code. It's pretty simple, and for JavaScript you just do:
var words = db.wordstuff.aggregate([
{ "$project": {
"_id": 0,
"split": {
"$filter": {
"input": { "$split": [ "$data", " " ] },
"cond": { "$in": [ "$$this", ["bar","foo","baz","blat"] ] }
}
}
}},
{ "$unwind": "$split" },
{ "$group": { "_id": "$split", "count": { "$sum": 1 } }},
]).toArray();
var result = ["bar","foo","baz","blat"].map(
w => ( words.map(wd => wd._id).indexOf(w) !== -1)
? words[words.map(wd => wd._id).indexOf(w)]
: { _id: w, count: 0 }
).reduce((acc,curr) => Object.assign(acc,{ [curr._id]: curr.count }),{})
So if there is anything that's language specific at all, then that would be the part. So if you choose to run the aggregation at it's basics and process the resulting cursor, then the python code would be:
input = ["bar","foo","baz","blat"]
words = list(db.wordstuff.aggregate([
{ "$project": {
"_id": 0,
"split": {
"$filter": {
"input": { "$split": [ "$data", " " ] },
"cond": { "$in": [ "$$this", input ] }
}
}
}},
{ "$unwind": "$split" },
{ "$group": { "_id": "$split", "count": { "$sum": 1 } }},
]))
result = reduce(
lambda x,y:
dict(x.items() + { y['_id']: y['count'] }.items()),
map(lambda w: words[map(lambda wd: wd['_id'],words).index(w)]
if w in map(lambda wd: wd['_id'],words)
else { '_id': w, 'count': 0 },
input
),
{}
)
And either method pulls out the same result:
{
"bar" : 2.0,
"foo" : 3.0,
"baz" : 1.0,
"blat" : 0.0
}
MapReduce
The alternate case where you don't even have the minimum MongoDB 3.4.0 available is to use mapReduce for the process instead. Again, this needs to be sent to the server as JavaScript, which is generally represented within "strings" in most language implementations ( other than JavaScript itself ):
db.wordstuff.mapReduce(
function() {
this.data.split(' ')
.filter( w => words.indexOf(w) !== -1 )
.forEach( w => emit(null,{ [w]: 1 }) );
},
function(key,values) {
return [].concat.apply([],
values.map(v => Object.keys(v).map(k => ({ k: k, v: v[k] })))
).reduce((acc,curr) => Object.assign(acc,{
[curr.k]: (acc.hasOwnProperty(curr.k))
? acc[curr.k] + curr.v : curr.v
}),{});
},
{
"out": { "inline": 1 },
"scope": { "words": ["bar","foo","baz","blat"] },
"finalize": function(key,value) {
return words.map( w => (value.hasOwnProperty(w))
? { [w]: value[w] } : { [w]: 0 }
).reduce((acc,curr) => Object.assign(acc,curr),{})
}
}
)
And that gives you the same results and really does exactly the same thing. Just a little slower because MongoDB needs to evaluate and process the JavaScript as compared to using it's own native coded methods with the aggregation framework.

Related

Aggregation $match within a $sum

I was wondering if it was possible to somehow use the $match operator within the $sum function for aggregation.
{ "$unwind": "$info.avatarInfoList" },
{ "$unwind": "$info.avatarInfoList.equipList" },
{ "$unwind": "$info.avatarInfoList.equipList.flat.reliquarySubstats" },
{
"$project": {
"name" : "$name",
"character" : "$info.avatarInfoList.avatarId",
"artifact" : "$info.avatarInfoList.equipList.itemId",
"statValue" : {
"$sum": [
{"$match" : { "$info.avatarInfoList.equipList.flat.reliquarySubstats.appendPropId" : "FIGHT_PROP_CRITICAL_HURT" } },
{"$multiply": [2, {"$match" : { "$info.avatarInfoList.equipList.flat.reliquarySubstats.appendPropId" : "FIGHT_PROP_CRITICAL" } }]}
]
},
}
},
{ "$sort": { "statValue": -1 }},
{ '$limit' : 30 }
]).to_list(length=None)
print(data)
I want to be able to use the value of the $sum operator within the project fields somehow, I just don't really understand what the right approach would be for this.
Sample Input (may be too long):
https://www.toptal.com/developers/hastebin/ixamekaxoq.json
Sample Output:
( 2 * FIGHT_PROP_CRITICAL ) + FIGHT_PROP_CRITICAL_HURT sorted from highest to lowest for each item.
{name: hat, character: Slayer, artifact: 13, statValue : 25.6}
There are still a few ambiguities about how you want to aggregate your data, but using the full document from your link, here's one way to produce the output you want.
N.B.: Weapons in the "equipList" don't have "reliquarySubstats" so they show a "statValue" of null in the output.
db.collection.aggregate([
{"$unwind": "$info.avatarInfoList"},
{"$unwind": "$info.avatarInfoList.equipList"},
{
"$project": {
"_id": 0,
"name": 1,
"character": "$info.avatarInfoList.avatarId",
"artifact": "$info.avatarInfoList.equipList.itemId",
"statValue": {
"$reduce": {
"input": "$info.avatarInfoList.equipList.flat.reliquarySubstats",
"initialValue": 0,
"in": {
"$switch": {
"branches": [
{
"case": {"$eq": ["$$this.appendPropId", "FIGHT_PROP_CRITICAL"]},
"then": {
"$add": [
"$$value",
{"$multiply": [2, "$$this.statValue"]}
]
}
},
{
"case": {"$eq": ["$$this.appendPropId", "FIGHT_PROP_CRITICAL_HURT"]},
"then": {"$add": ["$$value", "$$this.statValue"]}
}
],
"default": "$$value"
}
}
}
}
}
},
{"$sort": {"statValue": -1}}
])
Try it on mongoplayground.net.
It's not quite clear what you want to achieve, but as mentioned you want to be using $cond here.
like so:
{
"$project": {
"statValue": {
"$sum": [
{
$cond: [
{ // if this condition is true (prop id = prop critical hurt )
$eq: [
"$info.avatarInfoList.equipList.flat.reliquarySubstats.appendPropId",
"FIGHT_PROP_CRITICAL_HURT"
]
},
{ // then use this value for the "$sum"
"$multiply": [
2,
"$info.avatarInfoList.equipList.flat.reliquarySubstats.statValue"
]
},
0 // otherwise use this value for the sum.
]
}
]
}
}
Mongo Playground

Elasticsearch: highlight on query terms, not filter terms?

Say I have this:
search_object = {
'query': {
'bool' : {
'must' : {
'simple_query_string' : {
'query': search_text,
'fields': [ 'french_no_accents', 'def_no_accents', ],
},
},
'filter' : [
{ 'term' : { 'def_no_accents' : 'court', }, },
{ 'term' : { 'def_no_accents' : 'bridge', }, },
],
},
},
'highlight': {
'encoder': 'html',
'fields': {
'french_no_accents': {},
'def_no_accents': {},
},
'number_of_fragments' : 0,
},
}
... whatever search string I enter as search_text, its constituent terms, but also "court" and "bridge" are highlighted. I don't want "court" or "bridge" to be highlighted.
I've tried putting the "highlight" key-value in a different spot in the structure... nothing seems to work (i.e. syntax exception thrown).
More generally, is there a formal grammar anywhere specifying what you can and can't do with ES (v7) queries?
You could add a highlight query to limit what should and shouldn't get highlighted:
{
"query": {
"bool": {
"must": {
"simple_query_string": {
"query": "abc",
"fields": [
"french_no_accents",
"def_no_accents"
]
}
},
"filter": [
{ "term": { "def_no_accents": "court" } },
{ "term": { "def_no_accents": "bridge" } }
]
}
},
"highlight": {
"encoder": "html",
"fields": {
"*_no_accents": { <--
"highlight_query": {
"simple_query_string": {
"query": "abc",
"fields": [ "french_no_accents", "def_no_accents" ]
}
}
}
},
"number_of_fragments": 0
}
}
I've used a wildcard for the two fields (*_no_accents) -- if that matches unwanted fields too, you'll need to duplicate the highlight query on two separate, non-wilcard highlight fields like you originally had. Though I can't think of a scenario where that'd happen since your multi_match query targets two concrete fields.
As to:
More generally, is there a formal grammar anywhere specifying what you can and can't do with ES (v7) queries?
what exactly are you looking for?

Removing a list entry in a list in pyMongo

I have a database collection that has objects like this:
{
"_id": ObjectId("something"),
"name_lower": "total",
"name": "Total",
"mounts": [
[
"mount1",
"instance1"
],
[
"mount2",
"instance1"
],
[
"mount1",
"instance2"
],
[
"mount2",
"instance2"
]
]
}
Say I want to remove every mount that has the instance instance2, How would I go about doing that? I have been searching for quite a while.
You can do something like this
[
{
$unwind: "$mounts"
},
{
$match: {
"mounts": {
$ne: "instance2"
}
}
},
{
$group: {
_id: "$_id",
name: {
$first: "$name"
},
mounts: {
$push: "$mounts"
}
}
}
]
Working Mongo playground
This answer is based on #varman answer but more pythonic and efficient.
The first stage should be a $match condition to filter out documents that don't need to be updated.
Since the mounts key consists of a nested array, we have to $unwind it, so that we can remove array elements that need to be removed.
We have to apply the $match condition again to filter out the element that has to be removed.
Finally, we have to $group the pipeline by _id key, so that the documents which got $unwind in the previous stage will be groupped into a single document.
from pymongo import MongoClient
client = MongoClient("<URI-String>")
col = client["<DB-Name"]["<Collection-Name>"]
count = 0
for cursor in col.aggregate([
{
"$match": {
"mounts": {"$ne": "instance2"}
}
},
{
"$unwind": "$mounts"
},
{
"$match": {
"mounts": {"$ne": "instance2"}
}
},
{
"$group": {
"_id": "$_id",
"newMounts": {
"$push": "$mounts"
}
}
},
]):
# print(cursor)
col.update_one({
"_id": cursor["_id"]
}, {
"$set": {
"mounts": cursor["newMounts"]
}
})
count += 1
print("\r", count, end="")
print("\n\nDone!!!")

generate unique id in nested document - Pymongo

generate unique id in nested document - Pymongo
my database looks like this...
{
"_id":"5ea661d6213894a6082af6d1",
"blog_id":"blog_one",
"comments": [
{
"user_id":"1",
"comment":"comment for blog one this is good"
},
{
"user_id":"2",
"comment":"other for blog one"
},
]
}
I want to add unique id in each and every comment,
I want it to output like this,
{
"_id":"5ea661d6213894a6082af6d1",
"blog_id":"blog_one",
"comments": [
{
"id" : "something" (auto generate unique),
"user_id":"1",
"comment":"comment for blog one this is good"
},
{
"id" : "something" (auto generate unique),
"user_id":"2",
"comment":"other for blog one"
},
]
}
I'm using PyMongo, is there a way to update this kind of document?
it's possible or not?
This update will add an unique id value to each of the comments array with nested documents. The id value is calculated based upon the present time as milliseconds. This value is incremented for each array element to get the new id value for the nested documents of the array.
The code runs with MongoDB version 4.2 and PyMongo 3.10.
pipeline = [
{
"$set": {
"comments": {
"$map": {
"input": { "$range": [ 0, { "$size": "$comments" } ] },
"in": {
"$mergeObjects": [
{ "id": { "$add": [ { "$toLong" : datetime.datetime.now() }, "$$this" ] } },
{ "$arrayElemAt": [ "$comments", "$$this" ] }
]
}
}
}
}
}
]
collection.update_one( { }, pipeline )
The updated document:
{
"_id" : "5ea661d6213894a6082af6d1",
"blog_id" : "blog_one",
"comments" : [
{
"id" : NumberLong("1588179349566"),
"user_id" : "1",
"comment" : "comment for blog one this is good"
},
{
"id" : NumberLong("1588179349567"),
"user_id" : "2",
"comment" : "other for blog one"
}
]
}
[ EDIT ADD ]
The following works from mongo shell. It adds unique id for the comments array's nested documents - unique across the documents.
db.collection.aggregate( [
{
"$unwind": "$comments" },
{
"$group": {
"_id": null,
"count": { "$sum": 1 },
"docs": { "$push": "$$ROOT" },
"now": { $first: "$$NOW" }
}
},
{
"$addFields": {
"docs": {
"$map": {
"input": { "$range": [ 0, "$count" ] },
"in": {
"$mergeObjects": [
{ "comments_id": { "$add": [ { "$toLong" : "$now" }, "$$this" ] } },
{ "$arrayElemAt": [ "$docs", "$$this" ] }
]
}
}
}
}
},
{
"$unwind": "$docs"
},
{
"$addFields": {
"docs.comments.comments_id": "$docs.comments_id"
}
},
{
"$replaceRoot": { "newRoot": "$docs" }
},
{
"$group": {
"_id": { "_id": "$_id", "blog_id": "$blog_id" },
"comments": { "$push": "$comments" }
}
},
{
$project: {
"_id": 0,
"_id": "$_id._id",
"blog_id": "$_id.blog_id",
"comments": 1
}
}
] ).forEach(doc => db.blogs.updateOne( { _id: doc._id }, { $set: { comments: doc.comments } } ) )
You can use ObjectId constructor to create the ids and place them in your nested documents.

Elastic Search nested object query

I have a elastic search index collection like below,
"_index":"test",
"_type":"abc",
"_source":{
"file_name":"xyz.ex"
"metadata":{
"format":".ex"
"profile":[
{"date_value" : "2018-05-30T00:00:00",
"key_id" : "1",
"type" : "date",
"value" : [ "30-05-2018" ]
},
{
"key_id" : "2",
"type" : "freetext",
"value" : [ "New york" ]
}
}
Now I need to search for document by matching key_id to its value. (key_id is some field whose value is stored in "value")
Ex. For key_id='1'field, if it's value = "30-05-2018" it should match the above document.
I tried mapping this as a nested object, But I am not able to write query to search with 2 or more key_id matching its respective value.
This is how I would do it. You need to AND together via bool/filter (or bool/must) two nested queries for each of the condition pair, since you want to match two different nested elements from the same parent document.
{
"query": {
"bool": {
"filter": [
{
"nested": {
"path": "metadata.profile",
"query": {
"bool": {
"filter": [
{
"term": {
"metadata.profile.f1": "a"
}
},
{
"term": {
"metadata.profile.f2": true
}
}
]
}
}
}
},
{
"nested": {
"path": "metadata.profile",
"query": {
"bool": {
"filter": [
{
"term": {
"metadata.profile.f1": "b"
}
},
{
"term": {
"metadata.profile.f2": false
}
}
]
}
}
}
}
]
}
}
}

Categories