Pymongo aggregate pipeline - python

I would like to use an aggregate pipeline to get the most common value given another value.
How can I use an aggregate pipeline to find what the most common StudentId is for TeacherId 212?
Have been attempting code below, but not getting desired outcome.
pl= [
'$project': {
'_id': 1,
'StudentId': 1,
"TeacherID: 1,
"$group": {
"__id": 'TeacherID',
"__id": {
"$first": "StudentID",
}
}
}
]
db.collection.aggregate(pl)

Demo - https://mongoplayground.net/p/ksay82IaGHs
Group by TeacherID and StudentID to get the number of occurrences of each combination, then $sort by occurrence in descending order.
db.collection.aggregate([
{ $group: { _id: { TeacherID: "$TeacherID", StudentID: "$StudentID" }, occurrence: { $sum: 1 } } },
{ $sort: { "occurrence": -1 } }
]);
Output
[
{
"_id": {
"StudentID": 2,
"TeacherID": 212
},
"occurrence": 3
},
{
"_id": {
"StudentID": 4,
"TeacherID": 223
},
"occurrence": 1
}, .....
]
If you want the top record
Demo - https://mongoplayground.net/p/zBsGdAOdYwy
{
"$limit": 1
}
Demo - https://mongoplayground.net/p/G2KIVcjtYII
If you want to check for a specific TeacherID, add a $match stage (for example { $match: { TeacherID: 212 } }) at the start of the pipeline.

Related

Aggregation $match within a $sum

I was wondering if it was possible to somehow use the $match operator within the $sum function for aggregation.
{ "$unwind": "$info.avatarInfoList" },
{ "$unwind": "$info.avatarInfoList.equipList" },
{ "$unwind": "$info.avatarInfoList.equipList.flat.reliquarySubstats" },
{
"$project": {
"name" : "$name",
"character" : "$info.avatarInfoList.avatarId",
"artifact" : "$info.avatarInfoList.equipList.itemId",
"statValue" : {
"$sum": [
{"$match" : { "$info.avatarInfoList.equipList.flat.reliquarySubstats.appendPropId" : "FIGHT_PROP_CRITICAL_HURT" } },
{"$multiply": [2, {"$match" : { "$info.avatarInfoList.equipList.flat.reliquarySubstats.appendPropId" : "FIGHT_PROP_CRITICAL" } }]}
]
},
}
},
{ "$sort": { "statValue": -1 }},
{ '$limit' : 30 }
]).to_list(length=None)
print(data)
I want to be able to use the value of the $sum operator within the project fields somehow, I just don't really understand what the right approach would be for this.
Sample Input (may be too long):
https://www.toptal.com/developers/hastebin/ixamekaxoq.json
Sample Output:
( 2 * FIGHT_PROP_CRITICAL ) + FIGHT_PROP_CRITICAL_HURT sorted from highest to lowest for each item.
{name: hat, character: Slayer, artifact: 13, statValue : 25.6}
There are still a few ambiguities about how you want to aggregate your data, but using the full document from your link, here's one way to produce the output you want.
N.B.: Weapons in the "equipList" don't have "reliquarySubstats" so they show a "statValue" of null in the output.
db.collection.aggregate([
{"$unwind": "$info.avatarInfoList"},
{"$unwind": "$info.avatarInfoList.equipList"},
{
"$project": {
"_id": 0,
"name": 1,
"character": "$info.avatarInfoList.avatarId",
"artifact": "$info.avatarInfoList.equipList.itemId",
"statValue": {
"$reduce": {
"input": "$info.avatarInfoList.equipList.flat.reliquarySubstats",
"initialValue": 0,
"in": {
"$switch": {
"branches": [
{
"case": {"$eq": ["$$this.appendPropId", "FIGHT_PROP_CRITICAL"]},
"then": {
"$add": [
"$$value",
{"$multiply": [2, "$$this.statValue"]}
]
}
},
{
"case": {"$eq": ["$$this.appendPropId", "FIGHT_PROP_CRITICAL_HURT"]},
"then": {"$add": ["$$value", "$$this.statValue"]}
}
],
"default": "$$value"
}
}
}
}
}
},
{"$sort": {"statValue": -1}}
])
Try it on mongoplayground.net.
It's not quite clear what you want to achieve, but as mentioned you want to be using $cond here.
like so:
{
"$project": {
"statValue": {
"$sum": [
{
$cond: [
{ // if this condition is true (prop id = prop critical hurt )
$eq: [
"$info.avatarInfoList.equipList.flat.reliquarySubstats.appendPropId",
"FIGHT_PROP_CRITICAL_HURT"
]
},
{ // then use this value for the "$sum"
"$multiply": [
2,
"$info.avatarInfoList.equipList.flat.reliquarySubstats.statValue"
]
},
0 // otherwise use this value for the sum.
]
}
]
}
}
Mongo Playground

MongoDB - Get SUM of values INSIDE of the array

I have JSON document recorded to MongoDB with structure like so:
[{ "SessionKey": "172e3b6b-509e-4ef3-950c-0c1dc5c83bab",
"Query": {"Date": "2020-03-04"},
"Flights": [
{"LegId":"13235",
"PricingOptions": [
{"Agents": [1963108],
"Price": 61763.64 },
{"Agents": [4035868],
"Price": 62395.83 }]},
{"LegId": "13236",
"PricingOptions": [{
"Agents": [2915951],
"Price": 37188.0}]}
...
The result I'm trying to get is "LegId":"sum_per_flight", in this case -> {'13235': (61763.64+62395.83), '13236': 37188.0} and then get flights with price < N
I've tried to run this pipeline for aggregation step (but it returns list of ALL prices - I don't know how to sum them up properly):
result = collection.aggregate([
{'$match': {'Query.Date': '2020-03-01'}},
{'$group': {'_id': {'Flight':'$Flights.LegId', 'Price':'$Flights.PricingOptions.Price'}}} ])
Also I've tried this pipeline, but it returns 0 for 'total_price_per_flight':
result = collection.aggregate({'$project': {
'Flights.LegId':1,
'total_price_per_flight': {'$sum': '$Flights.PricingOptions.Price'}
}})
You need to use $unwind to flatten the Flights array so that each flight can be processed individually.
With the $reduce operator, we iterate over the PricingOptions array and sum the Price fields (accumulating the prices).
In the last step, we regroup the documents into their original structure. Before that step, you may apply the "get flights with price < N" filter.
db.collection.aggregate([
{
"$match": {
"Query.Date": "2020-03-04"
}
},
{
$unwind: "$Flights"
},
{
$addFields: {
"Flights.LegId": {
$arrayToObject: [
[
{
k: "$Flights.LegId",
v: {
$reduce: {
input: "$Flights.PricingOptions",
initialValue: 0,
in: {
$add: [
"$$value",
"$$this.Price"
]
}
}
}
}
]
]
}
}
},
{
$group: {
_id: "$_id",
SessionKey: {
$first: "$SessionKey"
},
Query: {
$first: "$Query"
},
Flights: {
$push: "$Flights"
}
}
}
])
MongoPlayground

Mongodb group average array

I'm trying to do PyMongo aggregate - $group averages of arrays, and I cannot find any examples that matches my problem.
Data example
{
Subject: "Dave",
Strength: [1,2,3,4]
},
{
Subject: "Dave",
Strength: [1,2,3,5]
},
{
Subject: "Dave",
Strength: [1,2,3,6]
},
{
Subject: "Stuart",
Strength: [4,5,6,7]
},
{
Subject: "Stuart",
Strength: [6,5,6,7]
},
{
Subject: "Kevin",
Strength: [1,2,3,4]
},
{
Subject: "Kevin",
Strength: [9,4,3,4]
}
Wanted results
{
Subject: "Dave",
mean_strength = [1,2,3,5]
},
{
Subject: "Stuart",
mean_strength = [5,5,6,7]
},
{
Subject: "Kevin",
mean_strength = [5,3,3,4]
}
I have tried this approach but MongoDB is interpreting the arrays as Null?
pipe = [{'$group': {'_id': 'Subject', 'mean_strength': {'$avg': '$Strength'}}}]
results = db.Walk.aggregate(pipeline=pipe)
Out: [{'_id': 'SubjectID', 'total': None}]
I've looked through the MongoDB documentation and I cannot find or understand if there is any way to do this?
You could use $unwind with includeArrayIndex. As the name suggests, includeArrayIndex adds the array index to the output. This allows for grouping by Subject and array position in Strength. After calculating the average, the results need to be sorted to ensure the second $group and $push add the results back into the right order. Finally there is a $project to include and rename the relevant columns.
db.test.aggregate([{
"$unwind": {
"path": "$Strength",
"includeArrayIndex": "rownum"
}
},
{
"$group": {
"_id": {
"Subject": "$Subject",
"rownum": "$rownum"
},
"mean_strength": {
"$avg": "$Strength"
}
}
},
{
"$sort": {
"_id.Subject": 1,
"_id.rownum": 1
}
},
{
"$group": {
"_id": "$_id.Subject",
"mean_strength": {
"$push": "$mean_strength"
}
}
},
{
"$project": {
"_id": 0,
"Subject": "$_id",
"mean_strength": 1
}
}
])
For your test input, this returns:
{ "mean_strength" : [ 5, 5, 6, 7 ], "Subject" : "Stuart" }
{ "mean_strength" : [ 5, 3, 3, 4 ], "Subject" : "Kevin" }
{ "mean_strength" : [ 1, 2, 3, 5 ], "Subject" : "Dave" }
You can try below aggregation.
For example, Dave has [[1,2,3,4], [1,2,3,5], [1,2,3,6]] after group stage.
Here is the matrix
Reduce function
Pass Current Value (c) Accumulated Value (b) Next Value
First: [1,2,3,5] [[1],[2],[3],[4]] [[1,1],[2,2],[3,3],[5, 4]]
Second: [1,2,3,6] [[1,1],[2,2],[3,3],[5, 4]] [[1,1,1],[2,2,2],[3,3,3],[5, 4, 6]]
Map function - Calculates avg for each array value from reduce stage to output [1,2,3,5]
[
  // Push all Strength arrays of a Subject into one array of arrays.
  {"$group": {"_id": "$Subject", "Strength": {"$push": "$Strength"}}},
  {"$project": {"mean_strength": {
    "$map": { // Calculate avg for each list of same-index values.
      "input": {
        "$reduce": {
          // Start from the second array. The count is the full size rather than
          // size - 1: $slice requires a positive count, and size - 1 is 0 when a
          // Subject has a single document. Overshooting the end is harmless.
          "input": {"$slice": ["$Strength", 1, {"$size": "$Strength"}]},
          // Initialize to the first array with every element wrapped in a
          // single-value array, e.g. [1,2,3,4] -> [[1],[2],[3],[4]].
          "initialValue": {
            "$map": {
              "input": {"$range": [0, {"$size": {"$arrayElemAt": ["$Strength", 0]}}]},
              "as": "a",
              "in": [{"$arrayElemAt": [{"$arrayElemAt": ["$Strength", 0]}, "$$a"]}]
            }
          },
          "in": {
            // c = current Strength array, b = accumulated array of per-index lists
            "$let": {
              "vars": {"c": "$$this", "b": "$$value"},
              "in": {
                "$map": { // Build the per-index lists for this iteration
                  "input": {"$range": [0, {"$size": "$$b"}]},
                  "as": "d",
                  "in": {
                    // $concatArrays only accepts arrays: the current scalar is
                    // wrapped in [], the accumulated entry is already an array.
                    "$concatArrays": [
                      [{"$arrayElemAt": ["$$c", "$$d"]}], // current
                      {"$arrayElemAt": ["$$b", "$$d"]}    // accumulated
                    ]
                  }
                }
              }
            }
          }
        }
      },
      "as": "e",
      "in": {"$avg": "$$e"}
    }
  }}}
]
Based on the description in the question, try executing the following aggregate query:
db.collection.aggregate(
  // Pipeline
  [
    // Stage 1: one document per Strength element, remembering its array position
    {
      $unwind: { path: "$Strength", includeArrayIndex: "arrayIndex" }
    },
    // Stage 2: average the values that share a Subject and an array position
    {
      $group: {
        _id: { Subject: '$Subject', arrayIndex: '$arrayIndex' },
        mean_strength: { $avg: '$Strength' }
      }
    },
    // Stage 3: $group output order is not guaranteed, so sort by position
    // before the averages are pushed back into an array
    {
      $sort: { '_id.Subject': 1, '_id.arrayIndex': 1 }
    },
    // Stage 4: rebuild one array of averages per Subject
    {
      $group: {
        _id: { 'Subject': '$_id.Subject' },
        mean_strength: { $push: '$mean_strength' }
      }
    },
    // Stage 5: expose Subject as a top-level field and drop _id
    {
      $project: {
        Subject: '$_id.Subject',
        mean_strength: '$mean_strength',
        _id: 0
      }
    }
  ]
);

How Iterate or remove MongoDb array list item using pymongo?

I want to iterate over the items of an array stored in a MongoDB document (the TRANSACTION list) and remove a specific item from that array using pymongo.
I created the Mongo collection using Python and pymongo. How can I iterate over the array items with pymongo and remove only the final item in the array?
Data insert query using Python pymongo
# added new method create block chain_structure
def addCoinWiseTransaction(self, senz, coin, format_date):
    """Record a coin transaction in the ``block_chain`` collection.

    If a document whose ``_id`` is ``str(coin)`` already exists, a new
    SENDER/RECIVER entry is pushed onto its ``TRANSACTION`` array.
    Otherwise a new coin document is inserted. Its first transaction
    takes the miner and receiver from the senz attributes when the ``#f``
    flag is ``"ccb"``; for any other flag the miner is ``"M_1"`` and the
    receiver is ``senz.sender``.

    Args:
        senz: Message object exposing an ``attributes`` mapping and a
            ``sender`` attribute.
        coin: Coin identifier; stored as ``str(coin)`` in ``_id``.
        format_date: Value stored unchanged in ``FORMAT_DATE``.

    Returns:
        The string ``'DONE'``.
    """
    self.collection = self.db.block_chain
    coin_id = str(coin)
    # count_documents() replaces cursor.count(), which newer PyMongo removed.
    coinValexists = self.collection.count_documents({"_id": coin_id})
    print('coin exists : ', coinValexists)

    if coinValexists > 0:
        print('coin hash exists')
        newTransaction = {
            "$push": {
                "TRANSACTION": {
                    "SENDER": senz.attributes["#SENDER"],
                    "RECIVER": senz.attributes["#RECIVER"],
                    "T_NO_COIN": 1,
                    "DATE": datetime.datetime.utcnow(),
                }
            }
        }
        self.collection.update_one({"_id": coin_id}, newTransaction)
        return 'DONE'

    flag = senz.attributes["#f"]
    print(flag)
    # Only the miner and receiver of the first transaction depend on the flag.
    if flag == "ccb":
        print('new coin mined othir minner')
        miner = senz.attributes["#M_S_ID"]
        receiver = senz.attributes["#RECIVER"]
    else:
        print('new coin mined')
        miner = "M_1"
        receiver = senz.sender

    root = {
        "_id": coin_id,
        "S_ID": int(senz.attributes["#S_ID"]),
        "S_PARA": senz.attributes["#S_PARA"],
        "FORMAT_DATE": format_date,
        "NO_COIN": 1,
        "TRANSACTION": [
            {
                "MINER": miner,
                "RECIVER": receiver,
                "T_NO_COIN": 1,
                "DATE": datetime.datetime.utcnow(),
            }
        ],
    }
    self.collection.insert_one(root)
    return 'DONE'
To remove the last entry, the general idea (as you have mentioned) is to iterate the array and grab the index of the last element as denoted by its DATE field, then update the collection by removing it using $pull. So the crucial piece of data you need for this to work is the DATE value and the document's _id.
One approach you could take is to first use the aggregation framework to get this data. With this, you can run a pipeline where the first step is filtering the documents in the collection by using the $match operator, which uses standard MongoDB queries.
The next stage after filtering the documents is to flatten the TRANSACTION array i.e. denormalise the documents in the list so that you can filter the final item i.e. get the last document by the DATE field. This is made possible with the $unwind operator, which for each input document, outputs n documents where n is the number of array elements and can be zero for an empty array.
After deconstructing the array, in order to get the last document, use the $group operator where you can regroup the flattened documents and in the process use the group accumulator operators to obtain
the last TRANSACTION date by using the $max operator applied to its embedded DATE field.
So in essence, run the following pipeline and use the results to update the collection. For example, you can run the following pipeline:
mongo shell
db.block_chain.aggregate([
{ "$match": { "_id": coin_id } },
{ "$unwind": "$TRANSACTION" },
{
"$group": {
"_id": "$_id",
"last_transaction_date": { "$max": "$TRANSACTION.DATE" }
}
}
])
You can then get the document with the update data from this aggregate operation using the toArray() method or the aggregate cursor and update your collection:
var docs = db.block_chain.aggregate([
{ "$match": { "_id": coin_id } },
{ "$unwind": "$TRANSACTION" },
{
"$group": {
"_id": "$_id",
"LAST_TRANSACTION_DATE": { "$max": "$TRANSACTION.DATE" }
}
}
]).toArray()
db.block_chain.updateOne(
{ "_id": docs[0]._id },
{
"$pull": {
"TRANSACTION": {
"DATE": docs[0]["LAST_TRANSACTION_DATE"]
}
}
}
)
python
def remove_last_transaction(self, coin):
    """Remove the most recent TRANSACTION entry of a coin document.

    An aggregation finds the greatest ``TRANSACTION.DATE`` of the document
    whose ``_id`` is ``str(coin)``; a ``$pull`` update then removes the
    array entries carrying that date. Nothing is updated when the
    aggregation returns no document.

    Args:
        coin: Coin identifier; matched as ``str(coin)`` against ``_id``.
    """
    self.collection = self.db.block_chain
    pipe = [
        {"$match": {"_id": str(coin)}},
        {"$unwind": "$TRANSACTION"},
        {
            "$group": {
                "_id": "$_id",
                "last_transaction_date": {"$max": "$TRANSACTION.DATE"},
            }
        },
    ]
    # run aggregate pipeline
    docs = list(self.collection.aggregate(pipeline=pipe))
    if not docs:
        # No matching document (or nothing left after $unwind): nothing to pull.
        return
    latest = docs[0]
    # run update; the key must match the field name produced by $group above
    self.collection.update_one(
        {"_id": latest["_id"]},
        {"$pull": {"TRANSACTION": {"DATE": latest["last_transaction_date"]}}},
    )
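Alternatively, you can run a single aggregate operation that will also update your collection using the $out pipeline stage, which writes the results of the pipeline to the same collection. Be aware that $out replaces the entire target collection with the pipeline output, so any document filtered out by an earlier stage such as $match is dropped from the collection: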
If the collection specified by the $out operation already
exists, then upon completion of the aggregation, the $out stage atomically replaces the existing collection with the new results collection. The $out operation does not
change any indexes that existed on the previous collection. If the
aggregation fails, the $out operation makes no changes to
the pre-existing collection.
For example, you could run this pipeline:
mongo shell
// WARNING: $out replaces the whole block_chain collection with the pipeline
// output, so only the document selected by $match survives this run.
db.block_chain.aggregate([
  { "$match": { "_id": coin_id } },
  { "$unwind": "$TRANSACTION" },
  // Oldest first, so $last below picks the most recent transaction.
  { "$sort": { "TRANSACTION.DATE": 1 } },
  {
    "$group": {
      "_id": "$_id",
      "LAST_TRANSACTION": { "$last": "$TRANSACTION" },
      "FORMAT_DATE": { "$first": "$FORMAT_DATE" },
      "NO_COIN": { "$first": "$NO_COIN" },
      "S_ID": { "$first": "$S_ID" },
      "S_PARA": { "$first": "$S_PARA" },
      "TRANSACTION": { "$push": "$TRANSACTION" }
    }
  },
  {
    "$project": {
      "FORMAT_DATE": 1,
      "NO_COIN": 1,
      "S_ID": 1,
      "S_PARA": 1,
      // $filter keeps the remaining entries in order; $setDifference treats
      // its inputs as sets (duplicates collapse, order unspecified).
      "TRANSACTION": {
        "$filter": {
          "input": "$TRANSACTION",
          "as": "t",
          "cond": { "$ne": ["$$t", "$LAST_TRANSACTION"] }
        }
      }
    }
  },
  { "$out": "block_chain" }
])
python
def remove_last_transaction(self, coin):
    """Rewrite a coin document without its most recent TRANSACTION entry.

    The pipeline unwinds the TRANSACTION array of the document whose
    ``_id`` is ``str(coin)``, sorts by ``TRANSACTION.DATE``, regroups the
    document, drops the entries equal to the last transaction and writes
    the result to ``block_chain`` with ``$out``.

    WARNING: ``$out`` replaces the whole ``block_chain`` collection with
    the pipeline output, so only the document selected by ``$match``
    survives this call.

    Args:
        coin: Coin identifier; matched as ``str(coin)`` against ``_id``.
    """
    pipeline = [
        {"$match": {"_id": str(coin)}},
        {"$unwind": "$TRANSACTION"},
        # Oldest first, so $last below picks the most recent transaction.
        {"$sort": {"TRANSACTION.DATE": 1}},
        {
            "$group": {
                "_id": "$_id",
                "LAST_TRANSACTION": {"$last": "$TRANSACTION"},
                "FORMAT_DATE": {"$first": "$FORMAT_DATE"},
                "NO_COIN": {"$first": "$NO_COIN"},
                "S_ID": {"$first": "$S_ID"},
                "S_PARA": {"$first": "$S_PARA"},
                "TRANSACTION": {"$push": "$TRANSACTION"},
            }
        },
        {
            "$project": {
                "FORMAT_DATE": 1,
                "NO_COIN": 1,
                "S_ID": 1,
                "S_PARA": 1,
                # $filter keeps the remaining entries in order; $setDifference
                # treats its inputs as sets (duplicates collapse, order
                # unspecified).
                "TRANSACTION": {
                    "$filter": {
                        "input": "$TRANSACTION",
                        "as": "t",
                        "cond": {"$ne": ["$$t", "$LAST_TRANSACTION"]},
                    }
                },
            }
        },
        {"$out": "block_chain"},
    ]
    self.db.block_chain.aggregate(pipeline)
Whilst this approach can be more efficient than the first, it requires knowledge of the existing fields first so in some cases the solution cannot be practical.

PyMongo group by multiple keys

With PyMongo, group by one key seems to be ok:
results = collection.group(key={"scan_status":0}, condition={'date': {'$gte': startdate}}, initial={"count": 0}, reduce=reducer)
results:
{u'count': 215339.0, u'scan_status': u'PENDING'} {u'count': 617263.0, u'scan_status': u'DONE'}
but when I try to do group by multiple keys I get an exception:
results = collection.group(key={"scan_status":0,"date":0}, condition={'date': {'$gte': startdate}}, initial={"count": 0}, reduce=reducer)
How can I do group by multiple fields correctly?
If you are trying to count over two keys then while it is possible using .group() your better option is via .aggregate().
This uses "native code operators" and not the JavaScript interpreted code as required by .group() to do the same basic "grouping" action as you are trying to achieve.
Particularly here is the $group pipeline operator:
# Count documents per (scan_status, date) pair for dates >= startdate.
result = collection.aggregate([
# Match the documents of interest (date on or after startdate)
{ "$match": { "date": { "$gte": startdate } } },
# Group the documents and "count" via $sum on the values
{ "$group": {
"_id": {
"scan_status": "$scan_status",
"date": "$date"
},
"count": { "$sum": 1 }
}}
])
In fact you probably want something that reduces the "date" into a distinct period. As in:
# Count documents per scan_status and calendar day for dates >= startdate.
result = collection.aggregate([
    # Match the documents of interest (date on or after startdate)
    {"$match": {"date": {"$gte": startdate}}},
    # Group the documents and "count" via $sum on the values
    {"$group": {
        "_id": {
            "scan_status": "$scan_status",
            # Split the date into parts so every timestamp of the same
            # day lands in the same group.
            "date": {
                "year": {"$year": "$date"},
                "month": {"$month": "$date"},
                "day": {"$dayOfMonth": "$date"},
            },
        },
        "count": {"$sum": 1},
    }},
])
Using the Date Aggregation Operators as shown here.
Or perhaps with basic "date math":
import datetime
from datetime import date
# Count documents per scan_status and day, with the day expressed as
# milliseconds since the epoch truncated to a multiple of one day.
result = collection.aggregate([
    # Match the documents of interest (date on or after startdate)
    {"$match": {"date": {"$gte": startdate}}},
    # Group the documents and "count" via $sum on the values
    # use "epoch" "1970-01-01" as a base to convert to integer.
    # The epoch must be a datetime.datetime: PyMongo cannot encode a
    # datetime.date, and fromtimestamp() would apply the local time zone.
    {"$group": {
        "_id": {
            "scan_status": "$scan_status",
            "date": {
                "$subtract": [
                    # milliseconds since the epoch ...
                    {"$subtract": ["$date", datetime.datetime(1970, 1, 1)]},
                    # ... minus the part that falls within the current day
                    {"$mod": [
                        {"$subtract": ["$date", datetime.datetime(1970, 1, 1)]},
                        1000 * 60 * 60 * 24,
                    ]},
                ]
            },
        },
        "count": {"$sum": 1},
    }},
])
Which will return integer values from "epoch" time instead of a composite value object.
But all of these options are better than .group() as they use native coded routines and perform their actions much faster than the JavaScript code you need to supply otherwise.

Categories