Various queries - MongoDB

Various queries - MongoDB - python

This is my table:
unicorns = {'name':'George',
'actions':[{'action':'jump', 'time':123123},
{'action':'run', 'time':345345},
...]}
How can I perform the following queries?
Grab the time of all actions of all unicorns where action=='jump' ?
Grab all actions of all unicorns where time is equal?
e.g. {'action':'jump', 'time':123} and {'action':'stomp', 'time':123}
Help would be amazing =)

For the second query, you need to use MapReduce, which can get a big hairy. This will work:
map = function() {
for (var i = 0, j = this.actions.length; i < j; i++) {
emit(this.actions[i].time, this.actions[i].action);
}
}
reduce = function(key, value_array) {
var array = [];
for (var i = 0, j = value_array.length; i < j; i++) {
if (value_array[i]['actions']) {
array = array.concat(value_array[i]['actions']);
} else {
array.push(value_array[i]);
}
}
return ({ actions: array });
}
res = db.test.mapReduce(map, reduce)
db[res.result].find()
This would return something like this, where the _id keys are your timestamps:
{ "_id" : 123, "value" : { "actions" : [ "jump" ] } }
{ "_id" : 125, "value" : { "actions" : [ "neigh", "canter" ] } }
{ "_id" : 127, "value" : { "actions" : [ "whinny" ] } }
Unfortunately, mongo doesn't currently support returning an array from a reduce function, thus necessitating the silly {actions: [...]} syntax.

Use dot-separated notation:
db.unicorns.find({'actions.action' : 'jump'})
Similarly for times:
db.unicorns.find({'actions.time' : 123})
Edit: if you want to group the results by time, you'll have to use MapReduce.

Related

Kotlin set Array as key for a HashMap

I'm doing a bit of Leetcode, and I'm facing this issue: Group Anagrams, I have a Python background and I can do the following:
res = defaultdic(list)
count = [0] * 26
res[tuple(count)].append(s)
as we can see we can set the tupled array as the key for the dictionary, I want to do the same thing in Kotlin, however, when creating this in Kotlin, I get a different object every time when adding this logic in a for loop.
fun groupAnagrams(strs: Array<String>): List<List<String>> {
val hashMap = hashMapOf<IntArray, ArrayList<String>>()
for (word in strs) {
val array = IntArray(26) { 0 }
for (char in word) {
val charInt = char - 'a'
array[charInt] += 1
}
if (hashMap.containsKey(array)) {
hashMap[array]!!.add(word)
} else {
hashMap[array] = ArrayList<String>().apply { add(word) }
}
}
return hashMap.values.toList()
}
Is this something can be done in Kotlin?

Equality for IntArray is checked based on its reference. You can use a List here in place of IntArray. Two Lists are equal if they contain the same elements.
Modified code will be like this:
fun groupAnagrams(strs: Array<String>): List<List<String>> {
val hashMap = hashMapOf<List<Int>, ArrayList<String>>()
for (word in strs) {
val array = List(26) { 0 }.toMutableList()
for (char in word) {
val charInt = char - 'a'
array[charInt] += 1
}
if (hashMap.containsKey(array)) {
hashMap[array]!!.add(word)
} else {
hashMap[array] = ArrayList<String>().apply { add(word) }
}
}
return hashMap.values.toList()
}

Avoiding the problem you run into (equality of arrays) by using String keys:
fun groupAnagramsWithHashing(strs: Array<String>): List<List<String>> {
val map = hashMapOf<String, MutableList<String>>()
MessageDigest.getInstance("SHA-1").also { sha ->
for (word in strs) {
word.toByteArray().sorted().forEach { sha.update(it) }
val key = sha.digest().joinToString()
map.computeIfAbsent(key) { mutableListOf() }.add(word)
}
}
return map.values.toList()
}
fun main() {
val input = arrayOf("eat", "tea", "tan", "ate", "nat", "bat")
groupAnagramsWithHashing(input).also { println(it) }
// [[eat, tea, ate], [bat], [tan, nat]]
}

Unable to replicate post_filter query in elasticsearch-dsl

The query I would like to replicate in DSL is as below:
GET /_search
{
"query":{
"bool":{
"must":[
{
"term":{
"destination":"singapore"
}
},
{
"terms":{
"tag_ids":[
"tag_luxury"
]
}
}
]
}
},
"aggs":{
"max_price":{
"max":{
"field":"price_range_from.SGD"
}
},
"min_price":{
"min":{
"field":"price_range_from.SGD"
}
}
},
"post_filter":{
"range":{
"price_range_from.SGD":{
"gte":0.0,
"lte":100.0
}
}
}
}
The above query
Matches terms - destination and tags_ids
Aggregates to result to find the max price from field price_range_from.SGD
Applies another post_filter to subset the result set within price limits
It works perfectly well in the Elastic/Kibana console.
I replicated the above query in elasticsearch-dsl as below:
es_query = []
es_query.append(Q("term", destination="singapore"))
es_query.append(Q("terms", tag_ids=["tag_luxury"]))
final_query = Q("bool", must=es_query)
es_conn = ElasticSearch.instance().get_client()
dsl_client = DSLSearch(using=es_conn, index=index).get_dsl_client()
dsl_client.query = final_query
dsl_client.aggs.metric("min_price", "min", field="price_range_from.SGD")
dsl_client.aggs.metric("max_price", "max", field="price_range_from.SGD")
q = Q("range", **{"price_range_from.SGD":{"gte": 0.0, "lte": 100.0}})
dsl_client.post_filter(q)
print(dsl_client.to_dict())
response = dsl_client.execute()
print(response.to_dict().get("hits", {}))
Although the aggregations are correct, products beyond the price range are also being returned. There is no error returned but it seems like the post_filter query is not applied.
I dived in the dsl_client object to see whether my query is being captured correctly. I see only the query and aggs but don't see the post_filter part in the object. The query when converted to a dictionary using dsl_client.to_dict() is as below -
{
"query":{
"bool":{
"must":[
{
"term":{
"destination":"singapore"
}
},
{
"terms":{
"tag_ids":[
"tag_luxury"
]
}
}
]
}
},
"aggs":{
"min_price":{
"min":{
"field":"price_range_from.SGD"
}
},
"max_price":{
"max":{
"field":"price_range_from.SGD"
}
}
}
}
Please help. Thanks!

You have to re-assign the dsl_client like:
dsl_client = dsl_client.post_filter(q)

MongoDB Python MongoEngine - Returning Document by filter of Embedded Documents Sum of Filtered property

I am using Python and MongoEngine to try and query the below Document in MongoDB.
I need a query to efficiently get the Documents only when they contain Embedded Documents 'Keywords' that match the following criteria:
Keywords Filtered where the Property 'SFR' is LTE '100000'
SUM the filtered keywords
Return the parent documents where SUM of the keywords matching the criteria is Greater than '9'
Example structure:
{
"_id" : ObjectId("5eae60e4055ef0e717f06a50"),
"registered_data" : ISODate("2020-05-03T16:12:51.999+0000"),
"UniqueName" : "SomeUniqueNameHere",
"keywords" : [
{
"keyword" : "carport",
"search_volume" : NumberInt(10532),
"sfr" : NumberInt(20127),
"percent_contribution" : 6.47,
"competing_product_count" : NumberInt(997),
"avg_review_count" : NumberInt(143),
"avg_review_score" : 4.05,
"avg_price" : 331.77,
"exact_ppc_bid" : 3.44,
"broad_ppc_bid" : 2.98,
"exact_hsa_bid" : 8.33,
"broad_hsa_bid" : 9.29
},
{
"keyword" : "party tent",
"search_volume" : NumberInt(6944),
"sfr" : NumberInt(35970),
"percent_contribution" : 4.27,
"competing_product_count" : NumberInt(2000),
"avg_review_count" : NumberInt(216),
"avg_review_score" : 3.72,
"avg_price" : 210.16,
"exact_ppc_bid" : 1.13,
"broad_ppc_bid" : 0.55,
"exact_hsa_bid" : 9.66,
"broad_hsa_bid" : 8.29
}
]
}
From the research I have been doing, I believe an Aggregate type query might do what I am attempting.
Unfortunately, being new to MongoDB / MongoEngine I am struggling to figure out how to structure the query and have failed in finding an example similar to what I am attempting to do (RED FLAG RIGHT????).
I did find an example of a aggregate but unsure how to structure my criteria in it, maybe something like this is getting close but does not work.
pipeline = [
{
"$lte": {
"$sum" : {
"keywords" : {
"$lte": {
"keyword": 100000
}
}
}: 9
}
}
]
data = product.objects().aggregate(pipeline)
Any guidance would be greatly appreciated.
Thanks,
Ben

you can try something like this
db.collection.aggregate([
{
$project: { // the first project to filter the keywords array
registered_data: 1,
UniqueName: 1,
keywords: {
$filter: {
input: "$keywords",
as: "item",
cond: {
$lte: [
"$$item.sfr",
100000
]
}
}
}
}
},
{
$project: { // the second project to get the length of the keywords array
registered_data: 1,
UniqueName: 1,
keywords: 1,
keywordsLength: {
$size: "$keywords"
}
}
},
{
$match: { // then do the match
keywordsLength: {
$gte: 9
}
}
}
])
you can test it here Mongo Playground
hope it helps
Note, I used sfr property only from the keywords array for simplicity

using $split for twice in mongodb using python

Let say in simple my document in mongodb is like this:
{'status' = {'tat': 'a, b <b>, c, d <d>' } }
I want to separate them and print it like
{bbced_name : 'a'},
{bbced_name : 'b'},
{bbced_name : 'c'},
{bbced_name : 'd'},
Therefore I try to split the data for twice. The first one is that to split the text with separator comma, then I split again with the separator < :
#the first split
project = { "$project" : { "bcced_name" : {
"$split" :
["$status.tat", ", "]
}
}
}
unwind = {"$unwind" : "$bcced_name"}
#the second split
project2= {"$project" : { "bbced_name2" : {
"$split" :
["$cced_name", "<"]
}
}
}
unwind2 = {"unwind" : "$bbced2"}
cur = collection.aggregate([project, unwind, project2, unwind2])
could I use split for twice in one pipeline? The first split is working well, but the second isn't.

You can below aggregation in 3.4.
$split to create a array of string values followed by $map to output a $substrCP value from start of the string to delimiter <.
Each substring end value is calculated by iterating the string using $range and $filter to output the location of the < string.
db.collection_name.aggregate(
[{"$project":
{"bcced_name":
{"$map":{
"input":{"$split":["$status.tat",", "]},
"as":"tat",
"in":{
"$cond":[
{"$eq":[{"$strLenCP":"$$tat"},1]},
"$$tat",
{
"$substrCP":[
"$$tat",
0,
{
"$arrayElemAt":[
{"$filter":{
"input":{"$range":[0,{"$strLenCP":"$$tat"},1]},
"as":"r",
"cond":{"$eq":[{"$substrCP":["$$tat","$$r",2]}," <"]}}
},
0]
}
]
}
]
}
}
}
}
},
{"$unwind": "$bcced_name"}
])
Update: (Use $indexOfCP)
db.collection_name.aggregate(
[{"$project":
{"bcced_name":
{"$map":{
"input":{"$split":["$status.tat",", "]},
"as":"tat",
"in":{
"$cond":[
{"$eq":[{"$strLenCP":"$$tat"},1]},
"$$tat",
{
"$substrCP":[
"$$tat",
0,
{ "$indexOfCP": [ "$$tat", " <" ] }
]
}
]
}
}
}
}
},
{"$unwind": "$bcced_name"}
])

{"$project":{
"bcced_name":
{"$map":{
"input":{"$split":["$status.tat",", "]},
"as":"tat",
"in":{
"$cond":[
{"$gt":[{"$indexOfCP":["$$tat","<"]},0]},
{"$arrayElemAt" : [{"$split":["$$tat", "<"]}, 0]},
"$$tat"
]
}
}
}
}
}

Reading large data from mongoDB in batches - Pymongo [duplicate]

I know that it is a bad practice to use skip in order to implement pagination, because when your data gets large skip starts to consume a lot of memory. One way to overcome this trouble is to use natural order by _id field:
//Page 1
db.users.find().limit(pageSize);
//Find the id of the last document in this page
last_id = ...
//Page 2
users = db.users.find({'_id'> last_id}). limit(10);
The problem is - I'm new to mongo and do not know what is the best way to get this very last_id

The concept you are talking about can be called "forward paging". A good reason for that is unlike using .skip() and .limit() modifiers this cannot be used to "go back" to a previous page or indeed "skip" to a specific page. At least not with a great deal of effort to store "seen" or "discovered" pages, so if that type of "links to page" paging is what you want, then you are best off sticking with the .skip() and .limit() approach, despite the performance drawbacks.
If it is a viable option to you to only "move forward", then here is the basic concept:
db.junk.find().limit(3)
{ "_id" : ObjectId("54c03f0c2f63310180151877"), "a" : 1, "b" : 1 }
{ "_id" : ObjectId("54c03f0c2f63310180151878"), "a" : 4, "b" : 4 }
{ "_id" : ObjectId("54c03f0c2f63310180151879"), "a" : 10, "b" : 10 }
Of course that's your first page with a limit of 3 items. Consider that now with code iterating the cursor:
var lastSeen = null;
var cursor = db.junk.find().limit(3);
while (cursor.hasNext()) {
var doc = cursor.next();
printjson(doc);
if (!cursor.hasNext())
lastSeen = doc._id;
}
So that iterates the cursor and does something, and when it is true that the last item in the cursor is reached you store the lastSeen value to the present _id:
ObjectId("54c03f0c2f63310180151879")
In your subsequent iterations you just feed that _id value which you keep ( in session or whatever ) to the query:
var cursor = db.junk.find({ "_id": { "$gt": lastSeen } }).limit(3);
while (cursor.hasNext()) {
var doc = cursor.next();
printjson(doc);
if (!cursor.hasNext())
lastSeen = doc._id;
}
{ "_id" : ObjectId("54c03f0c2f6331018015187a"), "a" : 1, "b" : 1 }
{ "_id" : ObjectId("54c03f0c2f6331018015187b"), "a" : 6, "b" : 6 }
{ "_id" : ObjectId("54c03f0c2f6331018015187c"), "a" : 7, "b" : 7 }
And the process repeats over and over until no more results can be obtained.
That's the basic process for a natural order such as _id. For something else it gets a bit more complex. Consider the following:
{ "_id": 4, "rank": 3 }
{ "_id": 8, "rank": 3 }
{ "_id": 1, "rank": 3 }
{ "_id": 3, "rank": 2 }
To split that into two pages sorted by rank then what you essentially need to know is what you have "already seen" and exclude those results. So looking at a first page:
var lastSeen = null;
var seenIds = [];
var cursor = db.junk.find().sort({ "rank": -1 }).limit(2);
while (cursor.hasNext()) {
var doc = cursor.next();
printjson(doc);
if ( lastSeen != null && doc.rank != lastSeen )
seenIds = [];
seenIds.push(doc._id);
if (!cursor.hasNext() || lastSeen == null)
lastSeen = doc.rank;
}
{ "_id": 4, "rank": 3 }
{ "_id": 8, "rank": 3 }
On the next iteration you want to be less or equal to the lastSeen "rank" score, but also excluding those already seen documents. You do this with the $nin operator:
var cursor = db.junk.find(
{ "_id": { "$nin": seenIds }, "rank": "$lte": lastSeen }
).sort({ "rank": -1 }).limit(2);
while (cursor.hasNext()) {
var doc = cursor.next();
printjson(doc);
if ( lastSeen != null && doc.rank != lastSeen )
seenIds = [];
seenIds.push(doc._id);
if (!cursor.hasNext() || lastSeen == null)
lastSeen = doc.rank;
}
{ "_id": 1, "rank": 3 }
{ "_id": 3, "rank": 2 }
How many "seenIds" you actually hold on to depends on how "granular" your results are where that value is likely to change. In this case you can check if the current "rank" score is not equal to the lastSeen value and discard the present seenIds content so it does not grow to much.
That's the basic concepts of "forward paging" for you to practice and learn.

The simplest way to implement pagination in MongoDB
// Pagination
const page = parseInt(req.query.page, 10) || 1;
const limit = parseInt(req.query.limit, 10) || 25;
const startIndex = (page - 1) * limit;
const endIndex = page * limit;
query = query.skip(startIndex).limit(limit);

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

Various queries - MongoDB - python

Use dot-separated notation: db.unicorns.find({'actions.action' : 'jump'}) Similarly for times: db.unicorns.find({'actions.time' : 123}) Edit: if you want to group the results by time, you'll have to use MapReduce.

Related

Kotlin set Array as key for a HashMap

Unable to replicate post_filter query in elasticsearch-dsl

MongoDB Python MongoEngine - Returning Document by filter of Embedded Documents Sum of Filtered property

using $split for twice in mongodb using python

Reading large data from mongoDB in batches - Pymongo [duplicate]

Categories

Resources