I'm trying to do a PyMongo aggregate with $group to average arrays element-wise, and I cannot find any examples that match my problem.
Data example
{
Subject: "Dave",
Strength: [1,2,3,4]
},
{
Subject: "Dave",
Strength: [1,2,3,5]
},
{
Subject: "Dave",
Strength: [1,2,3,6]
},
{
Subject: "Stuart",
Strength: [4,5,6,7]
},
{
Subject: "Stuart",
Strength: [6,5,6,7]
},
{
Subject: "Kevin",
Strength: [1,2,3,4]
},
{
Subject: "Kevin",
Strength: [9,4,3,4]
}
Wanted results
{
Subject: "Dave",
mean_strength = [1,2,3,5]
},
{
Subject: "Stuart",
mean_strength = [5,5,6,7]
},
{
Subject: "Kevin",
mean_strength = [5,3,3,4]
}
I have tried the approach below, but MongoDB seems to interpret the arrays as null:
pipe = [{'$group': {'_id': 'Subject', 'mean_strength': {'$avg': '$Strength'}}}]
results = db.Walk.aggregate(pipeline=pipe)
Out: [{'_id': 'SubjectID', 'total': None}]
I've looked through the MongoDB documentation and cannot work out whether there is any way to do this.
You could use $unwind with includeArrayIndex. As the name suggests, includeArrayIndex adds the array index to the output. This allows for grouping by Subject and array position in Strength. After calculating the average, the results need to be sorted to ensure the second $group and $push add the results back into the right order. Finally there is a $project to include and rename the relevant columns.
db.test.aggregate([{
"$unwind": {
"path": "$Strength",
"includeArrayIndex": "rownum"
}
},
{
"$group": {
"_id": {
"Subject": "$Subject",
"rownum": "$rownum"
},
"mean_strength": {
"$avg": "$Strength"
}
}
},
{
"$sort": {
"_id.Subject": 1,
"_id.rownum": 1
}
},
{
"$group": {
"_id": "$_id.Subject",
"mean_strength": {
"$push": "$mean_strength"
}
}
},
{
"$project": {
"_id": 0,
"Subject": "$_id",
"mean_strength": 1
}
}
])
For your test input, this returns:
{ "mean_strength" : [ 5, 5, 6, 7 ], "Subject" : "Stuart" }
{ "mean_strength" : [ 5, 3, 3, 4 ], "Subject" : "Kevin" }
{ "mean_strength" : [ 1, 2, 3, 5 ], "Subject" : "Dave" }
You can try the aggregation below.
For example, Dave has [[1,2,3,4], [1,2,3,5], [1,2,3,6]] after the $group stage.
Here is how the $reduce expression walks those arrays:
Pass | Current value (c) | Accumulated value (b) | Next value
First | [1,2,3,5] | [[1],[2],[3],[4]] | [[1,1],[2,2],[3,3],[5,4]]
Second | [1,2,3,6] | [[1,1],[2,2],[3,3],[5,4]] | [[1,1,1],[2,2,2],[3,3,3],[5,4,6]]
The $map expression then calculates the average of each inner array from the reduce stage to output [1,2,3,5].
[{"$group":{"_id":"$Subject","Strength":{"$push":"$Strength"}}}, //Push all arrays
{"$project":{"mean_strength":{
"$map":{//Calculate avg for each reduced indexed pairs.
"input":{
"$reduce":{
"input":{"$slice":["$Strength",1,{"$subtract":[{"$size":"$Strength"},1]}]}, //Start from second array.
"initialValue":{ //Initialize to the first array with all elements transformed to array of single values.
"$map":{
"input":{"$range":[0,{"$size":{"$arrayElemAt":["$Strength",0]}}]},
"as":"a",
"in":[{"$arrayElemAt":[{"$arrayElemAt":["$Strength",0]},"$$a"]}]
}
},
"in":{
"$let":{"vars":{"c":"$$this","b":"$$value"}, //Create variables for current and accumulated values
"in":{"$map":{ //Creates map of same indexed values from each iteration
"input":{"$range":[0,{"$size":"$$b"}]},
"as":"d",
"in":{
"$concatArrays":[ //Concat values at same index
{"$arrayElemAt":["$$c","$$d"]}, //current
[{"$arrayElemAt":["$$b","$$d"]}] //accumulated
]
}
}
}
}
}
}
},
"as":"e",
"in":{"$avg":"$$e"}
}
}}}
]
Based on the description in the question above, try executing the following aggregate query:
db.collection.aggregate(
// Pipeline
[
// Stage 1
{
$unwind: { path: "$Strength", includeArrayIndex: "arrayIndex" }
},
// Stage 2
{
$group: {
_id:{Subject:'$Subject',arrayIndex:'$arrayIndex'},
mean_strength:{$avg:'$Strength'}
}
},
// Stage 3
{
$group: {
_id:{'Subject':'$_id.Subject'},
mean_strength:{$push:'$mean_strength'}
}
},
// Stage 4
{
$project: {
Subject:'$_id.Subject',
mean_strength:'$mean_strength',
_id:0
}
}
]
);
I was wondering if it was possible to somehow use the $match operator within the $sum function for aggregation.
{ "$unwind": "$info.avatarInfoList" },
{ "$unwind": "$info.avatarInfoList.equipList" },
{ "$unwind": "$info.avatarInfoList.equipList.flat.reliquarySubstats" },
{
"$project": {
"name" : "$name",
"character" : "$info.avatarInfoList.avatarId",
"artifact" : "$info.avatarInfoList.equipList.itemId",
"statValue" : {
"$sum": [
{"$match" : { "$info.avatarInfoList.equipList.flat.reliquarySubstats.appendPropId" : "FIGHT_PROP_CRITICAL_HURT" } },
{"$multiply": [2, {"$match" : { "$info.avatarInfoList.equipList.flat.reliquarySubstats.appendPropId" : "FIGHT_PROP_CRITICAL" } }]}
]
},
}
},
{ "$sort": { "statValue": -1 }},
{ '$limit' : 30 }
]).to_list(length=None)
print(data)
I want to be able to use the value of the $sum operator within the project fields somehow, I just don't really understand what the right approach would be for this.
Sample Input (may be too long):
https://www.toptal.com/developers/hastebin/ixamekaxoq.json
Sample Output:
( 2 * FIGHT_PROP_CRITICAL ) + FIGHT_PROP_CRITICAL_HURT sorted from highest to lowest for each item.
{name: hat, character: Slayer, artifact: 13, statValue : 25.6}
There are still a few ambiguities about how you want to aggregate your data, but using the full document from your link, here's one way to produce the output you want.
N.B.: Weapons in the "equipList" don't have "reliquarySubstats" so they show a "statValue" of null in the output.
db.collection.aggregate([
{"$unwind": "$info.avatarInfoList"},
{"$unwind": "$info.avatarInfoList.equipList"},
{
"$project": {
"_id": 0,
"name": 1,
"character": "$info.avatarInfoList.avatarId",
"artifact": "$info.avatarInfoList.equipList.itemId",
"statValue": {
"$reduce": {
"input": "$info.avatarInfoList.equipList.flat.reliquarySubstats",
"initialValue": 0,
"in": {
"$switch": {
"branches": [
{
"case": {"$eq": ["$$this.appendPropId", "FIGHT_PROP_CRITICAL"]},
"then": {
"$add": [
"$$value",
{"$multiply": [2, "$$this.statValue"]}
]
}
},
{
"case": {"$eq": ["$$this.appendPropId", "FIGHT_PROP_CRITICAL_HURT"]},
"then": {"$add": ["$$value", "$$this.statValue"]}
}
],
"default": "$$value"
}
}
}
}
}
},
{"$sort": {"statValue": -1}}
])
Try it on mongoplayground.net.
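If you are driving this from Python (the snippet in the question uses .to_list, which suggests Motor), the same pipeline can be passed as a list of dicts. A minimal PyMongo sketch, where collection is an assumed handle to the same data, might look like:
pipeline = [
    {"$unwind": "$info.avatarInfoList"},
    {"$unwind": "$info.avatarInfoList.equipList"},
    {"$project": {
        "_id": 0,
        "name": 1,
        "character": "$info.avatarInfoList.avatarId",
        "artifact": "$info.avatarInfoList.equipList.itemId",
        "statValue": {"$reduce": {
            "input": "$info.avatarInfoList.equipList.flat.reliquarySubstats",
            "initialValue": 0,
            "in": {"$switch": {
                "branches": [
                    {"case": {"$eq": ["$$this.appendPropId", "FIGHT_PROP_CRITICAL"]},
                     "then": {"$add": ["$$value", {"$multiply": [2, "$$this.statValue"]}]}},
                    {"case": {"$eq": ["$$this.appendPropId", "FIGHT_PROP_CRITICAL_HURT"]},
                     "then": {"$add": ["$$value", "$$this.statValue"]}},
                ],
                "default": "$$value",
            }},
        }},
    }},
    {"$sort": {"statValue": -1}},
    {"$limit": 30},  # the question also limited the output to 30 results
]
data = list(collection.aggregate(pipeline))  # with Motor: await collection.aggregate(pipeline).to_list(length=None)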
It's not quite clear what you want to achieve, but as mentioned, you want to be using $cond here, like so:
{
"$project": {
"statValue": {
"$sum": [
{
$cond: [
{ // if this condition is true (prop id = prop critical hurt )
$eq: [
"$info.avatarInfoList.equipList.flat.reliquarySubstats.appendPropId",
"FIGHT_PROP_CRITICAL_HURT"
]
},
{ // then use this value for the "$sum"
"$multiply": [
2,
"$info.avatarInfoList.equipList.flat.reliquarySubstats.statValue"
]
},
0 // otherwise use this value for the sum.
]
}
]
}
}
Mongo Playground
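The snippet above only covers one of the two substats. A hedged sketch of the full expression, summing (2 * FIGHT_PROP_CRITICAL) + FIGHT_PROP_CRITICAL_HURT with two $cond terms, written as Python dicts (field paths copied from the question; they assume the same preceding $unwind stages):
substat = "$info.avatarInfoList.equipList.flat.reliquarySubstats"
stat_value = {"$sum": [
    {"$cond": [
        {"$eq": [substat + ".appendPropId", "FIGHT_PROP_CRITICAL_HURT"]},
        substat + ".statValue",                      # add the raw CRIT_HURT value
        0,
    ]},
    {"$cond": [
        {"$eq": [substat + ".appendPropId", "FIGHT_PROP_CRITICAL"]},
        {"$multiply": [2, substat + ".statValue"]},  # add double the CRIT value
        0,
    ]},
]}
project_stage = {"$project": {"statValue": stat_value}}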
My code below groups by ID and creates, for each group, a list of values where each value is the length of a parameters array. But how can I return the id that has the largest sum of the numbers in its list?
Original JSON read into df (not the same data as in the printed output below, since the real data was too long to include):
{
"kind":"admin#reports#activities",
"etag":"\"5g8\"",
"nextPageToken":"A:1651795128914034:-4002873813067783265:151219070090:C02f6wppb",
"items":[
{
"kind":"admin#reports#activity",
"id":{
"time":"2022-05-05T23:59:39.421Z",
"uniqueQualifier":"5526793068617678141",
"applicationName":"token",
"customerId":"cds"
},
"etag":"\"jkYcURYoi8\"",
"actor":{
"email":"blah#blah.net",
"profileId":"1323"
},
"ipAddress":"107.178.193.87",
"events":[
{
"type":"auth",
"name":"activity",
"parameters":[
{
"name":"api_name",
"value":"admin"
},
{
"name":"method_name",
"value":"directory.users.list"
},
{
"name":"client_id",
"value":"722230783769-dsta4bi9fkom72qcu0t34aj3qpcoqloq.apps.googleusercontent.com"
},
{
"name":"num_response_bytes",
"intValue":"7158"
},
{
"name":"product_bucket",
"value":"GSUITE_ADMIN"
},
{
"name":"app_name",
"value":"Untitled project"
},
{
"name":"client_type",
"value":"WEB"
}
]
}
]
},
{
"kind":"admin#reports#activity",
"id":{
"time":"2022-05-05T23:58:48.914Z",
"uniqueQualifier":"-4002873813067783265",
"applicationName":"token",
"customerId":"df"
},
"etag":"\"5T53xK7dpLei95RNoKZd9uz5Xb8LJpBJb72fi2HaNYM/9DTdB8t7uixvUbjo4LUEg53_gf0\"",
"actor":{
"email":"blah.blah#bebe.net",
"profileId":"1324"
},
"ipAddress":"54.80.168.30",
"events":[
{
"type":"auth",
"name":"activity",
"parameters":[
{
"name":"api_name",
"value":"gmail"
},
{
"name":"method_name",
"value":"gmail.users.messages.list"
},
{
"name":"client_id",
"value":"927538837578.apps.googleusercontent.com"
},
{
"name":"num_response_bytes",
"intValue":"2"
},
{
"name":"product_bucket",
"value":"GMAIL"
},
{
"name":"client_type",
"value":"WEB"
}
]
}
]
}
]
}
current code:
df = pd.json_normalize(response['items'])
df['test'] = df.groupby('actor.profileId')['events'].apply(lambda x: [len(x.iloc[i][0]['parameters']) for i in range(len(x))])
output:
ID
1002306 [7, 7, 7, 5]
1234444 [3,5,6]
1222222 [1,3,4,5]
desired output
id total
1002306 26
There's no need to construct the intermediate df and do a groupby on it. You can pass the record and meta paths to json_normalize to directly flatten the JSON data. Then your job seems to be about counting the number of rows per actor.profileId and finding the maximum.
df = pd.json_normalize(response['items'], ['events','parameters'], ['actor'])
df['actor.profileId'] = df['actor'].str['profileId']
out = df.value_counts('actor.profileId').pipe(lambda x: x.iloc[[0]])
Output:
actor.profileId
1323 7
dtype: int64
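If you also want the full per-ID totals rather than just the single largest one (the desired output pairs an id with a total), a small variation on the same flattened frame could be:
# Each row of df is one flattened parameter entry, so the group size per
# profileId equals the sum of the parameters-list lengths for that ID.
totals = df.groupby('actor.profileId').size().rename('total')
print(totals.sort_values(ascending=False))
print(totals.idxmax(), totals.max())  # the ID with the largest total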
I would like to use an aggregate pipeline to get the most common value given another value.
How can I use an aggregate pipeline to find what the most common StudentId is for TeacherId 212?
Have been attempting code below, but not getting desired outcome.
pl = [
    {'$project': {
        '_id': 1,
        'StudentId': 1,
        'TeacherID': 1,
        '$group': {
            '__id': 'TeacherID',
            '__id': {
                '$first': 'StudentID',
            },
        },
    }}
]
db.collection.aggregate(pl)
Demo - https://mongoplayground.net/p/ksay82IaGHs
Group by TeacherID and StudentID and count the occurrences of each combination, then $sort by occurrence in descending order.
db.collection.aggregate([
{ $group: { _id: { TeacherID: "$TeacherID", StudentID: "$StudentID" }, occurrence: { $sum: 1 } } },
{ $sort: { "occurrence": -1 } }
]);
Output
[
{
"_id": {
"StudentID": 2,
"TeacherID": 212
},
"occurrence": 3
},
{
"_id": {
"StudentID": 4,
"TeacherID": 223
},
"occurrence": 1
}, .....
]
If you want only the top record, add a $limit stage:
Demo - https://mongoplayground.net/p/zBsGdAOdYwy
{
"$limit": 1
}
Demo - https://mongoplayground.net/p/G2KIVcjtYII
If you want to check for a specific TeacherID, use $match (a sketch follows below).
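Putting those pieces together for the concrete question (the most common StudentID for TeacherID 212), a minimal PyMongo sketch, assuming the field names used above, would be:
pipeline = [
    {"$match": {"TeacherID": 212}},                                # only this teacher
    {"$group": {"_id": "$StudentID", "occurrence": {"$sum": 1}}},  # count per student
    {"$sort": {"occurrence": -1}},                                 # most common first
    {"$limit": 1},
]
top = list(db.collection.aggregate(pipeline))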
I have a collection where the objects have a structure similar to
{'_id': ObjectId('5e691cb9e73282f624362221'),
'created_at': 'Tue Mar 10 09:23:54 +0000 2020',
'id': 1237308186757120001,
'id_str': '1237308186757120001',
'full_text': 'See you in July'}
I am struggling to keep only objects which have a unique full text. Using distinct only gives me a list of the distinct full_text field values, whereas I want to keep only the objects in the collection with unique full texts.
There is a way; the code could look like this:
dict = {"a": 1, "b": 2, "c": 3, "a": 5, "d": 4, "e": 5, "c": 8}
#New clean dictionary
unique = {}
#Go through the original dictionary's items
for key, value in dict.items():
if(key in unique.keys()):
#If the key already exists in the new dictionary
continue
else:
#Otherwise
unique[key] = value
print(unique)
I hope this helps you!
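Applied to the actual MongoDB question, the same first-seen-wins idea could be sketched with PyMongo (collection is an assumed handle; note this keeps the first document for each full_text and deletes later duplicates, which is slightly different from dropping every duplicated full_text entirely):
seen = set()
for doc in collection.find({}, {"full_text": 1}):
    if doc["full_text"] in seen:
        collection.delete_one({"_id": doc["_id"]})  # a later duplicate, remove it
    else:
        seen.add(doc["full_text"])                  # first occurrence, keep it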
There are two ways:
MongoDB way
We perform a MongoDB aggregation (in the shell) that groups records by full_text, keeps only documents whose full_text appears exactly once, and inserts them into a new collection.
db.collection.aggregate([
{
$group: {
_id: "$full_text",
data: {
$push: "$$ROOT"
},
count: {
$sum: 1
}
}
},
{
$match: {
count: {
$eq: 1
}
}
},
{
$addFields: {
data: {
$arrayElemAt: [
"$data",
0
]
}
}
},
{
$replaceRoot: {
newRoot: "$data"
}
},
{
$out: "tmp"
}
])
When you run this query, it will create a new collection containing only the documents with unique full_text values. You can drop the old collection and rename the new one.
You may also put your original collection name into the $out operator, like {$out: "collection"}, but there is no going back.
Python way
We perform a MongoDB aggregation that groups by the full_text field, keeps only the duplicated documents, and builds a single array of all the _id values to be removed. Once MongoDB returns the results, we delete those duplicate documents.
db.collection.aggregate([
{
$group: {
_id: "$full_text",
data: {
$push: "$_id"
},
count: {
$sum: 1
}
}
},
{
$match: {
count: {
$gt: 1
}
}
},
{
$group: {
_id: null,
data: {
$push: "$data"
}
}
},
{
$addFields: {
data: {
$reduce: {
input: "$data",
initialValue: [],
in: {
$concatArrays: [
"$$value",
"$$this"
]
}
}
}
}
}
])
MongoPlayground
Pseudocode
data = list(collection.aggregate(...))
if len(data) > 0:
    collection.delete_many({'_id': {'$in': data[0]['data']}})
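A more concrete PyMongo version of that pseudocode, with the pipeline above translated to Python dicts (collection is an assumed handle), might look like:
pipeline = [
    {"$group": {"_id": "$full_text", "data": {"$push": "$_id"}, "count": {"$sum": 1}}},
    {"$match": {"count": {"$gt": 1}}},
    {"$group": {"_id": None, "data": {"$push": "$data"}}},
    {"$addFields": {"data": {"$reduce": {
        "input": "$data",
        "initialValue": [],
        "in": {"$concatArrays": ["$$value", "$$this"]},
    }}}},
]
data = list(collection.aggregate(pipeline))
if data:
    collection.delete_many({"_id": {"$in": data[0]["data"]}})  # remove every duplicated document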
I am using MongoDB 3.4 and PyMongo. I have a set of keywords:
keywords = [ 'bar', 'foo', ..., 'zoo' ]
I also have a collection:
docs = [ { 'data' : ' ... bar foo ... ' },
         { 'data' : ' ... foo ... ' },
         { 'data' : ' ... zoo ... ' } ]
I am looking for a PyMongo aggregation query which is going to give me a dict:
{ 'bar' : 0, 'foo' : 2, ..., 'zoo' : 0 }
There isn't anything language-specific about this, as the only solutions either use the aggregation framework or mapReduce, where the latter is defined with JavaScript functions.
Just setting up some sample data:
db.wordstuff.insertMany([
{ 'data': "foo brick bar" },
{ 'data': "brick foo" },
{ 'data': "bar brick baz" },
{ 'data': "bax" },
{ 'data': "brin brok fu foo" }
])
Aggregation Framework
Then you can run the aggregation statement:
db.wordstuff.aggregate([
{ "$project": {
"_id": 0,
"split": {
"$filter": {
"input": { "$split": [ "$data", " " ] },
"cond": { "$in": [ "$$this", ["bar","foo","baz","blat"] ] }
}
}
}},
{ "$unwind": "$split" },
{ "$group": { "_id": "$split", "count": { "$sum": 1 } }},
{ "$group": {
"_id": null,
"data": { "$push": { "k": "$_id", "v": "$count" } }
}},
{ "$replaceRoot": {
"newRoot": {
"$arrayToObject": {
"$map": {
"input": ["bar","foo","baz","blat"],
"as": "d",
"in": {
"$cond": {
"if": { "$ne": [{ "$indexOfArray": ["$data.k","$$d"] },-1] },
"then": {
"$arrayElemAt": [
"$data",
{ "$indexOfArray": ["$data.k","$$d"] }
]
},
"else": { "k": "$$d", "v": 0 }
}
}
}
}
}
}}
])
In reality, all of the real work is done by this point:
db.wordstuff.aggregate([
{ "$project": {
"_id": 0,
"split": {
"$filter": {
"input": { "$split": [ "$data", " " ] },
"cond": { "$in": [ "$$this", ["bar","foo","baz","blat"] ] }
}
}
}},
{ "$unwind": "$split" },
{ "$group": { "_id": "$split", "count": { "$sum": 1 } }},
])
Which gives you output like:
{ "_id" : "baz", "count" : 1.0 }
{ "_id" : "bar", "count" : 2.0 }
{ "_id" : "foo", "count" : 3.0 }
So the real work here is being done by $split, and that is the main dependency on using the aggregation framework, so you need at least MongoDB 3.4 in order to do this. The very simple premise is to $split the words out individually as array members, then $filter the content against the input array of words.
That $filter uses $in, which is another addition as of MongoDB 3.4 to match against each listed word. There are other operators that can do this with longer syntax, but we know we already need MongoDB 3.4 so this is the shortest syntax.
All that is really done after that is to $unwind the matched array of words from each document, then $group to obtain those matched words as a distinct list, along with the count of the occurrences.
That really is all there is to it from the main perspective of the database.
The following parts are actually "optional", since they are easy to reproduce in code and would probably look a lot clearer and cleaner that way. They are shown here just to demonstrate the newer operators; $arrayToObject requires at least MongoDB 3.4.4.
Again the basics are that the next $group "rolls up" the matched words from the cursor into an array within a single document. There is also a very specific key naming applied of "k" and "v" for later reasons.
Then you use a $replaceRoot stage, since the content of the document returned is evaluated from an expression. This expression uses $map to iterate over the "input array" of words and matches those to the entries created by the aggregation. This matching is done using $indexOfArray to return the matched index of the compared value.
You use this within $cond, as you either want to transform that value into a matched element using $arrayElemAt, or alternately recognize that the index was not a match. This either returns the aggregated entry (obtained from the earlier matches) or a "default" value of 0 for the given word.
The final part uses $arrayToObject, which transforms an array of objects with "k" and "v" properties into "key/value" pairs as an object.
So you can ask MongoDB to do it, but the data is actually reduced by the minimal pipeline as shown, so you may as well do it in client code. It's pretty simple, and for JavaScript you just do:
var words = db.wordstuff.aggregate([
{ "$project": {
"_id": 0,
"split": {
"$filter": {
"input": { "$split": [ "$data", " " ] },
"cond": { "$in": [ "$$this", ["bar","foo","baz","blat"] ] }
}
}
}},
{ "$unwind": "$split" },
{ "$group": { "_id": "$split", "count": { "$sum": 1 } }},
]).toArray();
var result = ["bar","foo","baz","blat"].map(
w => ( words.map(wd => wd._id).indexOf(w) !== -1)
? words[words.map(wd => wd._id).indexOf(w)]
: { _id: w, count: 0 }
).reduce((acc,curr) => Object.assign(acc,{ [curr._id]: curr.count }),{})
So if there is anything language-specific at all, that would be the part. If you choose to run the basic aggregation and process the resulting cursor, the Python code would be:
input = ["bar","foo","baz","blat"]
words = list(db.wordstuff.aggregate([
{ "$project": {
"_id": 0,
"split": {
"$filter": {
"input": { "$split": [ "$data", " " ] },
"cond": { "$in": [ "$$this", input ] }
}
}
}},
{ "$unwind": "$split" },
{ "$group": { "_id": "$split", "count": { "$sum": 1 } }},
]))
result = reduce(
lambda x,y:
dict(x.items() + { y['_id']: y['count'] }.items()),
map(lambda w: words[map(lambda wd: wd['_id'],words).index(w)]
if w in map(lambda wd: wd['_id'],words)
else { '_id': w, 'count': 0 },
input
),
{}
)
And either method pulls out the same result:
{
"bar" : 2.0,
"foo" : 3.0,
"baz" : 1.0,
"blat" : 0.0
}
MapReduce
The alternate case where you don't even have the minimum MongoDB 3.4.0 available is to use mapReduce for the process instead. Again, this needs to be sent to the server as JavaScript, which is generally represented within "strings" in most language implementations ( other than JavaScript itself ):
db.wordstuff.mapReduce(
function() {
this.data.split(' ')
.filter( w => words.indexOf(w) !== -1 )
.forEach( w => emit(null,{ [w]: 1 }) );
},
function(key,values) {
return [].concat.apply([],
values.map(v => Object.keys(v).map(k => ({ k: k, v: v[k] })))
).reduce((acc,curr) => Object.assign(acc,{
[curr.k]: (acc.hasOwnProperty(curr.k))
? acc[curr.k] + curr.v : curr.v
}),{});
},
{
"out": { "inline": 1 },
"scope": { "words": ["bar","foo","baz","blat"] },
"finalize": function(key,value) {
return words.map( w => (value.hasOwnProperty(w))
? { [w]: value[w] } : { [w]: 0 }
).reduce((acc,curr) => Object.assign(acc,curr),{})
}
}
)
And that gives you the same results and really does exactly the same thing, just a little slower, because MongoDB needs to evaluate and process the JavaScript rather than using its own natively coded methods with the aggregation framework.
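For completeness, if you wanted to drive the mapReduce variant from Python, here is a heavily hedged sketch for PyMongo 3.x (contemporary with MongoDB 3.4); the map_reduce helpers were removed in PyMongo 4, where you would run the mapReduce database command instead or simply prefer the aggregation approach. Passing scope and finalize through as command options is an assumption about the 3.x helper's keyword arguments.
from bson.code import Code

# Sketch for PyMongo 3.x only; the JavaScript mirrors the shell version above, in ES5 syntax.
mapf = Code("""function() {
    this.data.split(' ')
        .filter(function(w) { return words.indexOf(w) !== -1; })
        .forEach(function(w) { var o = {}; o[w] = 1; emit(null, o); });
}""")
reducef = Code("""function(key, values) {
    var out = {};
    values.forEach(function(v) {
        Object.keys(v).forEach(function(k) {
            out[k] = (out.hasOwnProperty(k) ? out[k] : 0) + v[k];
        });
    });
    return out;
}""")
finalizef = Code("""function(key, value) {
    var out = {};
    words.forEach(function(w) { out[w] = value.hasOwnProperty(w) ? value[w] : 0; });
    return out;
}""")

result = db.wordstuff.inline_map_reduce(
    mapf, reducef,
    scope={"words": ["bar", "foo", "baz", "blat"]},
    finalize=finalizef,
)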