Is there a way to eliminate all duplicates from a collection? - python

I have a collection where the objects have a structure similar to
{'_id': ObjectId('5e691cb9e73282f624362221'),
'created_at': 'Tue Mar 10 09:23:54 +0000 2020',
'id': 1237308186757120001,
'id_str': '1237308186757120001',
'full_text': 'See you in July'}
I am struggling to keep only the objects that have a unique full text. Using distinct only gives me a list of the distinct full_text field values, whereas I want to keep only the objects in the collection that have unique full texts.

There is, the code should look like this:
# Sample input. A dict literal that repeats a key keeps only the last value
# written for that key, so this mapping already holds a single entry per key
# ("a" -> 5, "c" -> 8) before the loop below runs.
original = {"a": 1, "b": 2, "c": 3, "a": 5, "d": 4, "e": 5, "c": 8}

# New clean dictionary
unique = {}

# Go through the original dictionary's items and copy each key only the
# first time it is seen; keys already present in `unique` are skipped.
for key, value in original.items():
    if key not in unique:
        unique[key] = value

print(unique)
I hope this helps you!

There are 2 ways:
MongoDB way
We perform MongoDB aggregation where we group records by full_text, filter unique documents only and insert them into collection. (in the shell)
db.collection.aggregate([
{
$group: {
_id: "$full_text",
data: {
$push: "$$ROOT"
},
count: {
$sum: 1
}
}
},
{
$match: {
count: {
$eq: 1
}
}
},
{
$addFields: {
data: {
$arrayElemAt: [
"$data",
0
]
}
}
},
{
$replaceRoot: {
newRoot: "$data"
}
},
{
$out: "tmp"
}
])
When you run this query, it will create new collection with unique full_text values. You can drop old collection and rename this one.
You may also put your collection name into $out operator like this {$out:"collection"}, but there is no going back.
Python way
We perform MongoDB aggregation grouping by full_text field, filter duplicate documents and create single array with all _id to be removed. Once MongoDB returns results, we execute remove command for duplicate documents.
db.collection.aggregate([
{
$group: {
_id: "$full_text",
data: {
$push: "$_id"
},
count: {
$sum: 1
}
}
},
{
$match: {
count: {
$gt: 1
}
}
},
{
$group: {
_id: null,
data: {
$push: "$data"
}
}
},
{
$addFields: {
data: {
$reduce: {
input: "$data",
initialValue: [],
in: {
$concatArrays: [
"$$value",
"$$this"
]
}
}
}
}
}
])
MongoPlayground
Pseudocode
# Run the aggregation shown above; it groups everything under `_id: null`,
# so the result is either empty or a single document whose "data" field is
# the flat list of duplicate _id values.
data = list(collection.aggregate(...))
if data:
    # delete_many is the current PyMongo call for removing every document
    # that matches a filter (Collection.remove is the legacy spelling).
    collection.delete_many({'_id': {'$in': data[0]["data"]}})

Related

MongoDB: Update element in an array where the index of the element is saved in the document

I have the following document structure.
{
_id: ...,
unique_id: 1234,
config_no: 1,
configs: [
{
data: "qwertyuiop" // random string
},
{
data: "asdfghjkl" // random string
}
]
}
I want to update value of data from one of the configs. The index of the config that needs to be updated is available in the config_no key.
Is there any way to update the value without querying the document?
This is what I am currently doing
doc = db.collection.findOne({"unique_id": 1234})
config_no = doc.config_no
db.collection.updateOne(
{"unique_id": 1234},
{"$set": {"configs."+config_no+".data": "zxcvbnm"}} //"configs.1.data"
)
The following is something like what I would like to achieve.
db.collection.updateOne(
{"unique_id": 1234},
{"$set": {"configs.${config_no}.data": "zxcvbnm"}}
)
You can $unwind with includeArrayIndex option. Use the index to perform conditional update and $merge back into the collection.
db.collection.aggregate([
{
$match: {
unique_id: 1234
}
},
{
"$unwind": {
path: "$configs",
includeArrayIndex: "idx"
}
},
{
$set: {
"configs.data": {
"$cond": {
"if": {
$eq: [
"$config_no",
"$idx"
]
},
"then": "zxcvbnm",
"else": "$configs.data"
}
}
}
},
{
$group: {
_id: "$_id",
config_no: {
$first: "$config_no"
},
configs: {
$push: "$configs"
},
unique_id: {
$first: "$unique_id"
}
}
},
{
"$merge": {
"into": "collection",
"on": "_id",
"whenMatched": "merge"
}
}
])
Mongo Playground

Removing a list entry in a list in pyMongo

I have a database collection that has objects like this:
{
"_id": ObjectId("something"),
"name_lower": "total",
"name": "Total",
"mounts": [
[
"mount1",
"instance1"
],
[
"mount2",
"instance1"
],
[
"mount1",
"instance2"
],
[
"mount2",
"instance2"
]
]
}
Say I want to remove every mount that has the instance instance2, How would I go about doing that? I have been searching for quite a while.
You can do something like this
[
{
$unwind: "$mounts"
},
{
$match: {
"mounts": {
$ne: "instance2"
}
}
},
{
$group: {
_id: "$_id",
name: {
$first: "$name"
},
mounts: {
$push: "$mounts"
}
}
}
]
Working Mongo playground
This answer is based on @varman's answer but is more Pythonic and efficient.
The first stage should be a $match condition to filter out documents that don't need to be updated.
Since the mounts key consists of a nested array, we have to $unwind it, so that we can remove array elements that need to be removed.
We have to apply the $match condition again to filter out the element that has to be removed.
Finally, we have to $group the pipeline by the _id key, so that the documents that were split apart by $unwind in the previous stage are grouped back into a single document.
from pymongo import MongoClient

client = MongoClient("<URI-String>")
col = client["<DB-Name>"]["<Collection-Name>"]

# Rebuild every document's "mounts" list without the entries that mention
# "instance2", then write the rebuilt list back one document at a time.
pipeline = [
    {
        # NOTE(review): "mounts" is a list of lists, so this top-level `$ne`
        # is compared against the inner lists rather than their strings and
        # probably lets every document through -- confirm before relying on
        # it as a pre-filter.
        "$match": {
            "mounts": {"$ne": "instance2"}
        }
    },
    {
        # One output document per inner [mount, instance] pair.
        "$unwind": "$mounts"
    },
    {
        # After $unwind, "mounts" is a single inner list; drop the pairs
        # that contain "instance2".
        "$match": {
            "mounts": {"$ne": "instance2"}
        }
    },
    {
        # Collect the surviving pairs back into one list per document.
        # NOTE(review): a document whose pairs were all dropped by the stage
        # above produces no group and is therefore never updated below.
        "$group": {
            "_id": "$_id",
            "newMounts": {
                "$push": "$mounts"
            }
        }
    },
]

count = 0
for doc in col.aggregate(pipeline):
    col.update_one(
        {"_id": doc["_id"]},
        {"$set": {"mounts": doc["newMounts"]}},
    )
    count += 1
    # "\r" keeps the running counter on a single console line.
    print("\r", count, end="")

print("\n\nDone!!!")

MongoDB - Get SUM of values INSIDE of the array

I have JSON document recorded to MongoDB with structure like so:
[{ "SessionKey": "172e3b6b-509e-4ef3-950c-0c1dc5c83bab",
"Query": {"Date": "2020-03-04"},
"Flights": [
{"LegId":"13235",
"PricingOptions": [
{"Agents": [1963108],
"Price": 61763.64 },
{"Agents": [4035868],
"Price": 62395.83 }]},
{"LegId": "13236",
"PricingOptions": [{
"Agents": [2915951],
"Price": 37188.0}]}
...
The result I'm trying to get is "LegId":"sum_per_flight", in this case -> {'13235': (61763.64+62395.83), '13236': 37188.0} and then get flights with price < N
I've tried to run this pipeline for aggregation step (but it returns list of ALL prices - I don't know how to sum them up properly):
result = collection.aggregate([
{'$match': {'Query.Date': '2020-03-01'}},
{'$group': {'_id': {'Flight':'$Flights.LegId', 'Price':'$Flights.PricingOptions.Price'}}} ])
Also I've tried this pipeline, but it returns 0 for 'total_price_per_flight':
result = collection.aggregate({'$project': {
'Flights.LegId':1,
'total_price_per_flight': {'$sum': '$Flights.PricingOptions.Price'}
}})
You need to use $unwind to flatten the Flights array so that you can iterate over its elements individually.
With $reduce operator, we iterate PricingOptions array and sum Price fields (accumulate prices).
In the last step we return your documents to their original structure. Before that, you may apply the "get flights with price < N" filter.
db.collection.aggregate([
{
"$match": {
"Query.Date": "2020-03-04"
}
},
{
$unwind: "$Flights"
},
{
$addFields: {
"Flights.LegId": {
$arrayToObject: [
[
{
k: "$Flights.LegId",
v: {
$reduce: {
input: "$Flights.PricingOptions",
initialValue: 0,
in: {
$add: [
"$$value",
"$$this.Price"
]
}
}
}
}
]
]
}
}
},
{
$group: {
_id: "$_id",
SessionKey: {
$first: "$SessionKey"
},
Query: {
$first: "$Query"
},
Flights: {
$push: "$Flights"
}
}
}
])
MongoPlayground

How do I extract keys from a dictionary that has {"key":[{"A":"1"},{"B":"2"}]?

I have a python dictionary,
dict = {
"A": [{
"264": "0.1965"
}, {
"289": "0.1509"
}, {
"192": "0.1244"
}]
}
I have a collection in mongoDB that has,
{
"_id": ObjectId("5d5a7f474c55b68a873f9602"),
"A": [{
"264": "0.5700"
}, {
"175": "0.321"
}
}
{
"_id": ObjectId("5d5a7f474c55b68a873f9610"),
"B": [{
"152": "0.2826"
}, {
"012": "0.1234"
}
}
}
I want to see if the key "A" from dict is available in mongodb. If yes, I want to loop over the keys in the list i.e.
[{
"264": "0.19652049960139123"
}, {
"289": "0.1509138215380371"
}, {
"192": "0.12447470015715734"
}]
}
and check if 264 is available in mongodb and update the key value else append.
Expected output in mongodb:
{
"_id": ObjectId("5d5a7f474c55b68a873f9602"),
"A": [{
"264": "0.1965"
}, {
"175": "0.321"
}, {
"289": "0.1509"
}, {
"192": "0.1244"
}
}
{
"_id": ObjectId("5d5a7f474c55b68a873f9610"),
"B": [{
"152": "0.2826"
},{
"012": "0.1234"
}
}
The value for key 264 is updated. Kindly help.
Assuming you are looking for the python part and not the mongoDB, try:
# dict['A'] is a list of single-entry mappings (see the question), so walk
# the list first and then the items of each mapping; calling .items() on the
# list itself would raise AttributeError.
for entry in dict['A']:
    for k, v in entry.items():  # k is key, v is value
        process_entry(k, v)  # do what you want with the database
assuming your mongodb collection is called your_collection
# Fetch one document that has an "A" field at all; find_one returns None
# when nothing matches, hence the truthiness check.
data = your_collection.find_one({'A': {'$exists': 1}})
if data:
    # loop over the keys
    for item in data['A']:
        # check whether a certain key is available
        if 'some_key' not in item:
            do_something()  # update

How Iterate or remove MongoDb array list item using pymongo?

I want to iterate over the items of an array (the TRANSACTION list) stored in a MongoDB document and remove a specific item from that array using pymongo.
I created the Mongo collection described above using Python and pymongo. I want to iterate over the array items using pymongo and remove only the final item in the array.
Data insert query using Python pymongo
# added new method create block chain_structure
def addCoinWiseTransaction(self, senz, coin, format_date):
    """Record a transaction for ``coin`` in the ``block_chain`` collection.

    If a document with ``_id == str(coin)`` already exists, a new entry is
    pushed onto its ``TRANSACTION`` array. Otherwise a new root document is
    inserted whose first transaction names the miner and the receiver.

    Args:
        senz: Message object; ``senz.attributes`` is read for the "#..."
            fields used below and ``senz.sender`` for the default receiver.
        coin: Coin identifier; stored as ``str(coin)`` in ``_id``.
        format_date: Value stored unchanged in ``FORMAT_DATE``.

    Returns:
        The string ``'DONE'``.
    """
    self.collection = self.db.block_chain
    coin_id = str(coin)

    coinValexists = self.collection.count_documents({"_id": coin_id})
    print('coin exists : ', coinValexists)

    if coinValexists > 0:
        print('coin hash exists')
        newTransaction = {"$push": {"TRANSACTION": {
            "SENDER": senz.attributes["#SENDER"],
            "RECIVER": senz.attributes["#RECIVER"],
            "T_NO_COIN": 1,
            "DATE": datetime.datetime.utcnow(),
        }}}
        self.collection.update_one({"_id": coin_id}, newTransaction)
        return 'DONE'

    # New coin: the "#f" flag decides who is recorded as miner and receiver;
    # every other field of the root document is the same in both cases.
    flag = senz.attributes["#f"]
    print(flag)
    if flag == "ccb":
        print('new coin mined othir minner')
        miner = senz.attributes["#M_S_ID"]
        receiver = senz.attributes["#RECIVER"]
    else:
        print('new coin mined')
        miner = "M_1"
        receiver = senz.sender

    root = {
        "_id": coin_id,
        "S_ID": int(senz.attributes["#S_ID"]),
        "S_PARA": senz.attributes["#S_PARA"],
        "FORMAT_DATE": format_date,
        "NO_COIN": 1,
        "TRANSACTION": [{
            "MINER": miner,
            "RECIVER": receiver,
            "T_NO_COIN": 1,
            "DATE": datetime.datetime.utcnow(),
        }],
    }
    self.collection.insert_one(root)
    return 'DONE'
To remove the last entry, the general idea (as you have mentioned) is to iterate the array and grab the index of the last element as denoted by its DATE field, then update the collection by removing it using $pull. So the crucial piece of data you need for this to work is the DATE value and the document's _id.
One approach you could take is to first use the aggregation framework to get this data. With this, you can run a pipeline where the first step is filtering the documents in the collection by using the $match operator, which uses standard MongoDB queries.
The next stage after filtering the documents is to flatten the TRANSACTION array i.e. denormalise the documents in the list so that you can filter the final item i.e. get the last document by the DATE field. This is made possible with the $unwind operator, which for each input document, outputs n documents where n is the number of array elements and can be zero for an empty array.
After deconstructing the array, in order to get the last document, use the $group operator where you can regroup the flattened documents and in the process use the group accumulator operators to obtain
the last TRANSACTION date by using the $max operator applied to its embedded DATE field.
So in essence, run the following pipeline and use the results to update the collection. For example, you can run the following pipeline:
mongo shell
db.block_chain.aggregate([
{ "$match": { "_id": coin_id } },
{ "$unwind": "$TRANSACTION" },
{
"$group": {
"_id": "$_id",
"last_transaction_date": { "$max": "$TRANSACTION.DATE" }
}
}
])
You can then get the document with the update data from this aggregate operation using the toArray() method or the aggregate cursor and update your collection:
var docs = db.block_chain.aggregate([
{ "$match": { "_id": coin_id } },
{ "$unwind": "$TRANSACTION" },
{
"$group": {
"_id": "$_id",
"LAST_TRANSACTION_DATE": { "$max": "$TRANSACTION.DATE" }
}
}
]).toArray()
db.block_chain.updateOne(
{ "_id": docs[0]._id },
{
"$pull": {
"TRANSACTION": {
"DATE": docs[0]["LAST_TRANSACTION_DATE"]
}
}
}
)
python
def remove_last_transaction(self, coin):
    """Pull the most recent TRANSACTION entry from the coin's document.

    An aggregation finds the largest ``TRANSACTION.DATE`` for the document
    whose ``_id`` is ``str(coin)``; a follow-up ``update_one`` then pulls the
    array entries carrying that date.

    Args:
        coin: Coin identifier; matched as ``str(coin)`` against ``_id``.

    Returns:
        None. Nothing is updated when the aggregation yields no document
        (no such coin, or no TRANSACTION entries to unwind).
    """
    self.collection = self.db.block_chain
    pipe = [
        {"$match": {"_id": str(coin)}},
        {"$unwind": "$TRANSACTION"},
        {
            "$group": {
                "_id": "$_id",
                "last_transaction_date": {"$max": "$TRANSACTION.DATE"},
            }
        },
    ]

    # run aggregate pipeline
    docs = list(self.collection.aggregate(pipeline=pipe))
    if not docs:
        return

    # run update; the key must match the field name produced by $group above
    latest = docs[0]
    self.collection.update_one(
        {"_id": latest["_id"]},
        {
            "$pull": {
                "TRANSACTION": {
                    "DATE": latest["last_transaction_date"]
                }
            }
        },
    )
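Alternatively, you can run a single aggregate operation that also updates your collection by using the $out pipeline stage, which writes the results of the pipeline to the same collection. Be aware that $out replaces the entire target collection with the pipeline's output, so a pipeline that $matches a single _id leaves only that one document behind; on MongoDB 4.2+ a $merge stage writes back only the matched document instead. The documentation describes $out as follows: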
If the collection specified by the $out operation already
exists, then upon completion of the aggregation, the $out stage atomically replaces the existing collection with the new results collection. The $out operation does not
change any indexes that existed on the previous collection. If the
aggregation fails, the $out operation makes no changes to
the pre-existing collection.
For example, you could run this pipeline:
mongo shell
// Rebuild the coin's document without its most recent TRANSACTION entry and
// write it back into block_chain.
db.block_chain.aggregate([
    { "$match": { "_id": coin_id } },
    { "$unwind": "$TRANSACTION" },
    // Oldest first, so $last below picks the most recent transaction.
    { "$sort": { "TRANSACTION.DATE": 1 } },
    {
        "$group": {
            "_id": "$_id",
            "LAST_TRANSACTION": { "$last": "$TRANSACTION" },
            "FORMAT_DATE": { "$first": "$FORMAT_DATE" },
            "NO_COIN": { "$first": "$NO_COIN" },
            "S_ID": { "$first": "$S_ID" },
            "S_PARA": { "$first": "$S_PARA" },
            "TRANSACTION": { "$push": "$TRANSACTION" }
        }
    },
    {
        "$project": {
            "FORMAT_DATE": 1,
            "NO_COIN": 1,
            "S_ID": 1,
            "S_PARA": 1,
            // NOTE: $setDifference treats its inputs as sets, so duplicate
            // entries collapse and the output order is not guaranteed.
            "TRANSACTION": {
                "$setDifference": ["$TRANSACTION", ["$LAST_TRANSACTION"]]
            }
        }
    },
    // $merge (MongoDB 4.2+) replaces only the matched document. A trailing
    // $out would replace the whole block_chain collection with this single
    // pipeline result.
    {
        "$merge": {
            "into": "block_chain",
            "on": "_id",
            "whenMatched": "replace",
            "whenNotMatched": "discard"
        }
    }
])
python
def remove_last_transaction(self, coin):
    """Drop the most recent TRANSACTION entry of a coin in one aggregation.

    The pipeline rebuilds the document whose ``_id`` is ``str(coin)`` without
    its latest transaction (by ``TRANSACTION.DATE``) and writes the rebuilt
    document back into ``block_chain``.

    Args:
        coin: Coin identifier; matched as ``str(coin)`` against ``_id``.

    Returns:
        None.
    """
    self.db.block_chain.aggregate([
        {"$match": {"_id": str(coin)}},
        {"$unwind": "$TRANSACTION"},
        # Oldest first, so $last below picks the most recent transaction.
        {"$sort": {"TRANSACTION.DATE": 1}},
        {
            "$group": {
                "_id": "$_id",
                "LAST_TRANSACTION": {"$last": "$TRANSACTION"},
                "FORMAT_DATE": {"$first": "$FORMAT_DATE"},
                "NO_COIN": {"$first": "$NO_COIN"},
                "S_ID": {"$first": "$S_ID"},
                "S_PARA": {"$first": "$S_PARA"},
                "TRANSACTION": {"$push": "$TRANSACTION"},
            }
        },
        {
            "$project": {
                "FORMAT_DATE": 1,
                "NO_COIN": 1,
                "S_ID": 1,
                "S_PARA": 1,
                # NOTE: $setDifference treats its inputs as sets, so
                # duplicate entries collapse and order is not guaranteed.
                "TRANSACTION": {
                    "$setDifference": ["$TRANSACTION", ["$LAST_TRANSACTION"]]
                },
            }
        },
        # $merge (MongoDB 4.2+) replaces only the matched document. A
        # trailing $out would replace the whole block_chain collection with
        # this single pipeline result.
        {
            "$merge": {
                "into": "block_chain",
                "on": "_id",
                "whenMatched": "replace",
                "whenNotMatched": "discard",
            }
        },
    ])
Whilst this approach can be more efficient than the first, it requires knowledge of the existing fields first so in some cases the solution cannot be practical.

Categories