My Mongo collection Impressions has docs in the following format:
{
_uid: 10,
"impressions": [
{
"pos": 6,
"id": 123,
"service": "furniture"
},
{
"pos": 0,
"id": 128,
"service": "electronics"
},
{
"pos": 2,
"id": 127,
"service": "furniture"
},
{
"pos": 2,
"id": 125,
"service": "electronics"
},
{
"pos": 10,
"id": 124,
"service": "electronics"
}
]
},
{
_uid: 11,
"impressions": [
{
"pos": 1,
"id": 124,
"service": "furniture"
},
{
"pos": 10,
"id": 124,
"service": "electronics"
},
{
"pos": 1,
"id": 123,
"service": "furniture"
},
{
"pos": 21,
"id": 122,
"service": "furniture"
},
{
"pos": 3,
"id": 125,
"service": "electronics"
},
{
"pos": 10,
"id": 121,
"service": "electronics"
}
]
},
...
Each doc in the collection has an "impressions" key, which is an array of dictionaries. In each dictionary, "id" is the id of the entity, "service" is the service type, and "pos" is the position of the item in the search page results. My aim is to find the count of impressions for every "id" in each category.
So for the above data, for "service" == "furniture", I want this as my aggregation result:
[
{"id": 123,"impressions_count":2},
{"id": 127,"impressions_count":1},
{"id": 124,"impressions_count":1},
{"id": 122,"impressions_count":1}
]
I tried to aggregate on the "id" using map-reduce via the following function in a Python script:
def fetch_impressions():
    try:
        imp_collection = get_mongo_connection('Impressions')
        map = Code("""
            function(){
                for( x in this.impressions){
                    var flat_id = x['id'];
                    var service_type = x['service']
                    emit(parseInt(flat_id), 1);
                }
            };
        """)
        reduce = Code("""
            function(a, b){
                return Array.sum(b);
            };
        """)
        results = imp_collection.map_reduce(map, reduce, 'aggregation_result')
        return results
    except Exception as e:
        raise Exception(e)
But I'm getting the results as None, probably because of a faulty map function. I'm new to JavaScript and Mongo, kindly help!
You can use the aggregation framework:
import pymongo
conn = pymongo.MongoClient()
db = conn.test
col = db.collection
for doc in col.aggregate([
    {'$unwind': '$impressions'},
    {'$match': {'impressions.service': 'furniture'}},
    {'$group': {'_id': '$impressions.id', 'impressions_count': {'$sum': 1}}},
]):
    print(doc)
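For the sample data above, this prints (order may vary):
{'_id': 123, 'impressions_count': 2}
{'_id': 127, 'impressions_count': 1}
{'_id': 124, 'impressions_count': 1}
{'_id': 122, 'impressions_count': 1}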
Or, more efficiently, using the $map and $setDifference operators:
col.aggregate([
    {"$project": {
        "impressions": {
            "$setDifference": [
                {"$map": {
                    "input": "$impressions",
                    "as": "imp",
                    "in": {"$cond": {"if": {"$eq": ["$$imp.service", "furniture"]},
                                     "then": "$$imp.id",
                                     "else": 0}}
                }},
                [0]
            ]
        }
    }},
    {"$unwind": "$impressions"},
    {"$group": {"_id": "$impressions", "impressions_count": {"$sum": 1}}}
])
Which yields:
{'_id': 122.0, 'impressions_count': 1}
{'_id': 124.0, 'impressions_count': 1}
{'_id': 127.0, 'impressions_count': 1}
{'_id': 123.0, 'impressions_count': 2}
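As an aside, the map function in the question fails because JavaScript's for...in loops over array indices, not elements, so x['id'] is undefined and nothing is ever emitted. A corrected sketch of the map/reduce approach (the "furniture" filter is added here to match the expected output, and Code is assumed to come from bson.code):
from bson.code import Code

map = Code("""
    function() {
        this.impressions.forEach(function(imp) {
            if (imp.service == 'furniture') {
                emit(imp.id, 1);
            }
        });
    }
""")
reduce = Code("""
    function(key, values) {
        return Array.sum(values);
    }
""")
# map_reduce returns the output collection named in the third argument.
results = imp_collection.map_reduce(map, reduce, 'aggregation_result')
for doc in results.find():
    print(doc)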
I made a tool that lets you run MongoDB Map/Reduce in Python
https://mreduce.com
import random
import threading
import bson
import pymongo
import mreduce
mongo_client = pymongo.MongoClient("mongodb://your_mongodb_server")
def map_func(document):
    for impression in document["impressions"]:
        yield impression["id"], 1

def reduce_func(key, values):
    return sum(values)

worker_functions = {
    "exampleMap": map_func,
    "exampleReduce": reduce_func
}
api = mreduce.API(
    api_key="...",
    mongo_client=mongo_client
)
project_id = "..."
thread = threading.Thread(
    target=api.run,
    args=[project_id, worker_functions]
)
thread.start()
job = api.submit_job(
    projectId=project_id,
    mapFunctionName="exampleMap",
    reduceFunctionName="exampleReduce",
    inputDatabase="db",
    inputCollection="impressions",
    outputDatabase="db",
    outputCollection="impressions_results"
)
result = job.wait_for_result()
for key, value in result:
    print("Key: " + str(key) + ", Value: " + str(value))
I've never heard of or found an option for what I'm looking for, but maybe someone knows a way.
To collect the data from a JSON response I need to map it manually like this:
events = response['events']
for event in events:
tournament_name = event['tournament']['name']
tournament_slug = event['tournament']['slug']
tournament_category_name = event['tournament']['category']['name']
tournament_category_slug = event['tournament']['category']['slug']
tournament_category_sport_name = event['tournament']['category']['sport']['name']
tournament_category_sport_slug = event['tournament']['category']['sport']['slug']
tournament_category_sport_id = event['tournament']['category']['sport']['id']
The complete model is this:
{
"events": [
{
"tournament": {
"name": "Serie A",
"slug": "serie-a",
"category": {
"name": "Italy",
"slug": "italy",
"sport": {
"name": "Football",
"slug": "football",
"id": 1
},
"id": 31,
"flag": "italy",
"alpha2": "IT"
},
"uniqueTournament": {
"name": "Serie A",
"slug": "serie-a",
"category": {
"name": "Italy",
"slug": "italy",
"sport": {
"name": "Football",
"slug": "football",
"id": 1
},
"id": 31,
"flag": "italy",
"alpha2": "IT"
},
"userCount": 586563,
"id": 23,
"hasEventPlayerStatistics": true
},
"priority": 254,
"id": 33
},
"roundInfo": {
"round": 24
},
"customId": "Kdbsfeb",
"status": {
"code": 7,
"description": "2nd half",
"type": "inprogress"
},
"winnerCode": 0,
"homeTeam": {
"name": "Bologna",
"slug": "bologna",
"shortName": "Bologna",
"gender": "M",
"userCount": 39429,
"nameCode": "BOL",
"national": false,
"type": 0,
"id": 2685,
"subTeams": [
],
"teamColors": {
"primary": "#003366",
"secondary": "#cc0000",
"text": "#cc0000"
}
},
"awayTeam": {
"name": "Empoli",
"slug": "empoli",
"shortName": "Empoli",
"gender": "M",
"userCount": 31469,
"nameCode": "EMP",
"national": false,
"type": 0,
"id": 2705,
"subTeams": [
],
"teamColors": {
"primary": "#0d5696",
"secondary": "#ffffff",
"text": "#ffffff"
}
},
"homeScore": {
"current": 0,
"display": 0,
"period1": 0
},
"awayScore": {
"current": 0,
"display": 0,
"period1": 0
},
"coverage": 1,
"time": {
"initial": 2700,
"max": 5400,
"extra": 540,
"currentPeriodStartTimestamp": 1644159735
},
"changes": {
"changes": [
"status.code",
"status.description",
"time.currentPeriodStart"
],
"changeTimestamp": 1644159743
},
"hasGlobalHighlights": false,
"hasEventPlayerStatistics": true,
"hasEventPlayerHeatMap": true,
"id": 9645399,
"statusTime": {
"prefix": "",
"initial": 2700,
"max": 5400,
"timestamp": 1644159735,
"extra": 540
},
"startTimestamp": 1644156000,
"slug": "empoli-bologna",
"lastPeriod": "period2",
"finalResultOnly": false
}
]
}
In my example I am collecting 7 values, but there are 83 possible values to be collected.
In case I want to get all the value options that exist in this JSON, is there any way to generate this mapping sequence automatically and print it, so I can copy it into the code?
Doing it manually takes too long and is very tiring.
The printed output in the terminal would be something like:
tournament_name = event['tournament']['name']
tournament_slug = event['tournament']['slug']
...
...
...
And so on, until all 83 object paths with values to collect are delivered...
Then I could copy all the prints and paste them into my Python file to retrieve the values, or use any other way to make the work easier.
If the elements in the events array all have the same structure, this code works without errors.
def get_prints(recode: dict):
    for key in recode.keys():
        if type(recode[key]) == dict:
            for sub_print in get_prints(recode[key]):
                yield [key] + sub_print
        else:
            yield [key]

class Automater:
    def __init__(self, name: str):
        """
        Params:
            name: name of the json variable
        """
        self.name = name

    def get_print(self, *args):
        """
        Params:
            *args: json keys
        """
        return '_'.join(args) + ' = ' + self.name + ''.join([f"['{arg}']" for arg in args])
For example, this code:
dicts = {
    'tournament': {
        'name': "any name",
        'slug': 'something else',
        'sport': {
            'name': 'sport',
            'anotherdict': {
                'yes': True
            }
        }
    }
}
auto = Automater('event')
list_names = get_prints(dicts)
for name in list_names:
    print(auto.get_print(*name))
Gives this output:
tournament_name = event['tournament']['name']
tournament_slug = event['tournament']['slug']
tournament_sport_name = event['tournament']['sport']['name']
tournament_sport_anotherdict_yes = event['tournament']['sport']['anotherdict']['yes']
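Note that get_prints only recurses into dicts, so arrays such as "events" or "subTeams" in the full model are treated as leaves. If you also need to descend into lists, here is a minimal sketch (the get_paths name is my own):
def get_paths(node, path=()):
    # Yield the key path of every leaf in a nested dict/list structure.
    if isinstance(node, dict):
        for key, value in node.items():
            yield from get_paths(value, path + (key,))
    elif isinstance(node, list):
        for i, item in enumerate(node):
            yield from get_paths(item, path + (i,))
    else:
        yield path

# Usage: print an access expression for every leaf under "events".
for path in get_paths({'events': [{'tournament': {'name': 'Serie A'}}]}):
    print('event' + ''.join(f"[{key!r}]" for key in path))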
I have the following data structure:
[
{
"site_id": ObjectId("5e85b9d20498abd407e9a030"),
"status": "ERROR"
},
{
"site_id": ObjectId("5e85b9d20498abd407e9a120"),
"status": "ERROR"
},
{
"site_id": ObjectId("5e85b9d20498abd407e9a030"),
"status": "OK",
"risk_categories": [
{
"position": 1,
"category_id": 1414,
},
{
"position": 2,
"category_id": 1402,
},
{
"position": 3,
"category_id": 1392,
}
]
}
]
I want to make a query with pymongo like this:
collection.find_one(filter=filter)
where:
filter = {'$and': [{'$and': [{'site_id': ObjectId('5e85b9d20498abd407e9a030')}, {'status': 'OK'}]}, {'risk_categories': {'$elemMatch': {'$or': [{'position': {'$eq': 1}}, {'position': {'$eq': 2}}]}}}]}
However, it returns the entire object, not only the risk_categories values that I want.
What can I do in my filter to change that?
The following aggregation runs in the mongo shell:
db.collection.aggregate( [
{
$match: {
site_id: ObjectId('5e85b9d20498abd407e9a030'),
status: "OK"
}
},
{
$addFields: {
risk_categories: {
$filter: {
input: "$risk_categories",
as: "cat",
cond: {
$in: [ "$$cat.position", [ 1, 2 ] ] // this is equivalent to using the "$or"
}
}
}
}
},
] ).pretty()
The output:
{
"_id" : ObjectId("5e85c7b6724e461876467077"),
"site_id" : ObjectId("5e85b9d20498abd407e9a030"),
"status" : "OK",
"risk_categories" : [
{
"position" : 1,
"category_id" : 1414
},
{
"position" : 2,
"category_id" : 1402
}
]
}
Using PyMongo 3.9 and MongoDB 4.2, from the Python shell:
import pymongo
from pymongo import MongoClient
client = MongoClient()
db = client.test
collection = db.collection
import pprint
from bson.objectid import ObjectId
pipeline = [
{
'$match': {
'site_id': ObjectId('5e85b9d20498abd407e9a030'),
'status': 'OK'
}
},
{
'$addFields': {
'risk_categories': {
'$filter': {
'input': '$risk_categories',
'as': 'cat',
'cond': {
'$in': [ '$$cat.position', [ 1, 2 ] ]
}
}
}
}
},
]
pprint.pprint(list(collection.aggregate(pipeline)))
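For comparison, a plain find_one with an $elemMatch projection returns at most the first matching array element, which is why the $filter aggregation above is needed to get both positions; a sketch:
# Returns the document with risk_categories trimmed to the FIRST element
# matching the projection, so positions 1 and 2 can't both be returned.
doc = collection.find_one(
    {'site_id': ObjectId('5e85b9d20498abd407e9a030'), 'status': 'OK'},
    {'risk_categories': {'$elemMatch': {'position': {'$in': [1, 2]}}}}
)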
I use the phonetic analysis plugin for Elasticsearch to do some string matching, thanks to phonetic transformation.
My problem is: how do I get the phonetic transformation processed by Elasticsearch in the result of the query?
First, I create an index with a metaphone transformation:
request_body = {
'settings': {
'index': {
'analysis': {
'analyzer': {
'metaphone_analyzer': {
'tokenizer': 'standard',
'filter': [
'ascii_folding_filter', 'lowercase',
'metaphone_filter'
]
}
},
'filter': {
'metaphone_filter': {
'type': 'phonetic',
'encoder': 'metaphone',
'replace': False
},
'ascii_folding_filter': {
'type': 'asciifolding',
'preserve_original': True
}
}
}
}
},
'mappings': {
'person_name': {
'properties': {
'full_name': {
'type': 'text',
'fields': {
'metaphone_field': {
'type': 'string',
'analyzer': 'metaphone_analyzer'
}
}
}
}
}
}
}
res = es.indices.create(index="my_index", body=request_body)
Then, I add some data:
# Add some data
names = [{
"full_name": "John Doe"
}, {
"full_name": "Bob Alice"
}, {
"full_name": "Foo Bar"
}]
for name in names:
res = es.index(index="my_index",
doc_type='person_name',
body=name,
refresh=True)
And finally, I query a name:
es.search(index="my_index",
body={
"size": 5,
"query": {
"multi_match": {
"query": "Jon Doe",
"fields": "*_field"
}
}
})
Search returns:
{
'took': 1,
'timed_out': False,
'_shards': {
'total': 5,
'successful': 5,
'skipped': 0,
'failed': 0
},
'hits': {
'total': 1,
'max_score': 0.77749264,
'hits': [{
'_index': 'my_index',
'_type': 'person_name',
'_id': 'AWwYjl4Mqo63y_hLp5Yl',
'_score': 0.77749264,
'_source': {
'full_name': 'John Doe'
}
}]
}
}
In the search result I would like to get the phonetic transformation of the names in Elasticsearch (also of the query name, but that is less important) when I execute the search.
I know that I could use the explain API, but I would like to avoid a second request, and moreover the explain API seems a little "overkill" for what I want to achieve.
Thanks!
It doesn't look like an easy thing to implement in an Elasticsearch query, but you could try the analyze API and scripted fields with fielddata enabled, and term vectors might come in handy. Here's how.
Retrieve tokens from an arbitrary query
The analyze API is a great tool if you want to understand exactly how Elasticsearch tokenizes your query.
Using your mapping you could do, for example:
GET myindex/_analyze
{
"analyzer": "metaphone_analyzer",
"text": "John Doe"
}
And get something like this as a result:
{
"tokens": [
{
"token": "JN",
"start_offset": 0,
"end_offset": 4,
"type": "<ALPHANUM>",
"position": 0
},
{
"token": "john",
"start_offset": 0,
"end_offset": 4,
"type": "<ALPHANUM>",
"position": 0
},
{
"token": "T",
"start_offset": 5,
"end_offset": 8,
"type": "<ALPHANUM>",
"position": 1
},
{
"token": "doe",
"start_offset": 5,
"end_offset": 8,
"type": "<ALPHANUM>",
"position": 1
}
]
}
This is technically a different query, but still might be useful.
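The same call can be made from Python with the elasticsearch-py client; a sketch, assuming a client version that still accepts a request body:
from elasticsearch import Elasticsearch

es = Elasticsearch()
# Run the index's custom analyzer on an arbitrary string.
response = es.indices.analyze(
    index="my_index",
    body={"analyzer": "metaphone_analyzer", "text": "John Doe"}
)
print([token["token"] for token in response["tokens"]])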
Retrieve tokens from a field of a document
In theory, we could try to retrieve the very same tokens which the analyze API returned in the previous section from the documents matched by our query.
In practice, Elasticsearch will not store the tokens of a text field it has just analyzed: fielddata is disabled by default. We need to enable it:
PUT /myindex
{
"mappings": {
"person_name": {
"properties": {
"full_name": {
"fields": {
"metaphone_field": {
"type": "text",
"analyzer": "metaphone_analyzer",
"fielddata": true
}
},
"type": "text"
}
}
}
},
"settings": {
...
}
}
Now, we can use scripted fields to ask Elasticsearch to return those tokens.
The query might look like this:
POST myindex/_search
{
"script_fields": {
"my tokens": {
"script": {
"lang": "painless",
"source": "doc[params.field].values",
"params": {
"field": "full_name.metaphone_field"
}
}
}
}
}
And the response would look like this:
{
"hits": {
"total": 1,
"max_score": 1,
"hits": [
{
"_index": "myindex",
"_type": "person_name",
"_id": "123",
"_score": 1,
"fields": {
"my tokens": [
"JN",
"T",
"doe",
"john"
]
}
}
]
}
}
As you can see, these are the very same tokens (but in random order).
Can we also retrieve information about the location of these tokens in the document?
Retrieving tokens with their positions
Term vectors may help. To be able to use them we actually don't need fielddata enabled. We can look up the term vectors for a document:
GET myindex/person_name/123/_termvectors
{
"fields" : ["full_name.metaphone_field"],
"offsets" : true,
"positions" : true
}
This would return something like this:
{
"_index": "myindex",
"_type": "person_name",
"_id": "123",
"_version": 1,
"found": true,
"took": 1,
"term_vectors": {
"full_name.metaphone_field": {
"field_statistics": {
"sum_doc_freq": 4,
"doc_count": 1,
"sum_ttf": 4
},
"terms": {
"JN": {
"term_freq": 1,
"tokens": [
{
"position": 0,
"start_offset": 0,
"end_offset": 4
}
]
},
"T": {
"term_freq": 1,
"tokens": [
{
"position": 1,
"start_offset": 5,
"end_offset": 8
}
]
},
"doe": {
"term_freq": 1,
"tokens": [
{
"position": 1,
"start_offset": 5,
"end_offset": 8
}
]
},
"john": {
"term_freq": 1,
"tokens": [
{
"position": 0,
"start_offset": 0,
"end_offset": 4
}
]
}
}
}
}
}
This gives a way to get the tokens of a field of a document exactly as the analyzer produced them.
Unfortunately, to my knowledge, there is no way to combine these three queries into a single one. Also, fielddata should be used with caution since it uses a lot of memory.
Hope this helps!
I have a dictionary which contains multiple keys and values, and the values themselves contain key-value pairs. I am not getting how to create dynamic JSON using this dictionary in Python. Here's the dictionary:
image_dict = {"IMAGE_1":{"img0":"IMAGE_2","img1":"IMAGE_3","img2":"IMAGE_4"},"IMAGE_2":{"img0":"IMAGE_1", "img1" : "IMAGE_3"},"IMAGE_3":{"img0":"IMAGE_1", "img1":"IMAGE_2"},"IMAGE_4":{"img0":"IMAGE_1"}}
My expected result is like this:
{
"data": [
{
"image": {
"imageId": {
"id": "IMAGE_1"
},
"link": {
"target": {
"id": "IMAGE_2"
},
"target": {
"id": "IMAGE_3"
},
"target": {
"id": "IMAGE_4"
}
}
},
"updateData": "link"
},
{
"image": {
"imageId": {
"id": "IMAGE_2"
},
"link": {
"target": {
"id": "IMAGE_1"
},
"target": {
"id": "IMAGE_3"
}
}
},
"updateData": "link"
},
{
"image": {
"imageId": {
"id": "IMAGE_3"
},
"link": {
"target": {
"id": "IMAGE_1"
},
"target": {
"id": "IMAGE_2"
}
}
},
"updateData": "link"
} ,
{
"image": {
"imageId": {
"id": "IMAGE_4"
},
"link": {
"target": {
"id": "IMAGE_1"
}
}
},
"updateData": "link"
}
]
}
I tried to solve it but I didn't get the expected result.
result = {"data": []}
for k, v in sorted(image_dict.items()):
    for a in sorted(v.values()):
        result["data"].append({"image": {"imageId": {"id": k},
                                         "link": {"target": {"id": a}}},
                               "updateData": "link"})
print(json.dumps(result, indent=4))
In Python dictionaries you can't have two values with the same key, so you can't have multiple targets all called "target", but you can index them. Also, I don't know what this question has to do with dynamic objects, but here's the code I got working:
import re

dict_res = {}
ind = 0
for image in image_dict:
    lin_ind = 0
    sub_dict = {'image' + str(ind): {'imageId': {'id': image}, 'link': {}}}
    for sub in image_dict[image].values():
        sub_dict['image' + str(ind)]['link'].update({'target' + str(lin_ind): {'id': sub}})
        lin_ind += 1
    dict_res.update(sub_dict)
    ind += 1
dict_res = re.sub(r'target\d', 'target', re.sub(r'image\d', 'image', str(dict_res)))
print(dict_res)
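Since the regex trick produces a string rather than a dict (Python dicts can't hold duplicate "target" keys), an alternative that stays valid JSON is to make "link" a list of targets; a sketch close to the original attempt:
import json

result = {"data": []}
for image_id, links in sorted(image_dict.items()):
    result["data"].append({
        "image": {
            "imageId": {"id": image_id},
            # A list of targets instead of duplicate "target" keys.
            "link": [{"target": {"id": t}} for t in sorted(links.values())],
        },
        "updateData": "link",
    })
print(json.dumps(result, indent=4))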
I'm new to the Elasticsearch domain. I'm learning it and trying it out to check if it meets my needs.
Right now I'm learning aggregations in Elasticsearch, and I wrote the following Python script to ingest some time-series data into Elasticsearch.
Every 5 seconds I create a new message which will have:
Timestamp (ISO8601 format)
Counter
A random number between 0 and 100
For every new day, I create a new index with logs_Y-m-D as the index name.
I will index every message using the message Counter as the _id. The counter resets for every new index (every day).
import csv
import time
import random
from datetime import datetime
from elasticsearch import Elasticsearch
class ElasticSearchDB:
    def __init__(self):
        self.es = Elasticsearch()

    def run(self):
        print("Started: {}".format(datetime.now().isoformat()))
        print("<Ctrl + c> for exit!")
        with open("..\\out\\logs.csv", "w", newline='') as f:
            writer = csv.writer(f)
            counter = 0
            try:
                while True:
                    i_name = "logs_" + time.strftime("%Y-%m-%d")
                    if not self.es.indices.exists([i_name]):
                        self.es.indices.create(i_name, ignore=400)
                        print("New index created: {}".format(i_name))
                        counter = 0
                    message = {"counter": counter, "#timestamp": datetime.now().isoformat(), "value": random.randint(0, 100)}
                    # Write to file
                    writer.writerow(message.values())
                    # Write to elasticsearch index
                    self.es.index(index=i_name, doc_type="logs", id=counter, body=message)
                    # Waste some time
                    time.sleep(5)
                    counter += 1
            except KeyboardInterrupt:
                print("Stopped: {}".format(datetime.now().isoformat()))

test_es = ElasticSearchDB()
test_es.run()
I ran this script for 30 minutes. Next, using Sense, I query elasticsearch with following aggregation queries.
Query #1: Get all
Query #2: Aggregate logs from the last 1 hour and generate stats for them. This shows the right results.
Query #3: Aggregate logs from the last 1 minute and generate stats for them. The number of docs aggregated is the same as in the 1-hour aggregation; ideally, it should have aggregated only 12-13 logs.
Query #4: Aggregate logs from the last 15 seconds and generate stats for them. The number of docs aggregated is the same as in the 1-hour aggregation; ideally, it should have aggregated only 3-4 logs.
My Questions:
Why is Elasticsearch not able to understand the 1-minute and 15-second ranges?
I understand mappings, but I don't know how to write one, so I haven't written one. Is that what is causing this problem?
Please help!
Query #1: Get all
GET /_search
Output:
{
"took": 3,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 314,
"max_score": 1,
"hits": [
{
"_index": "logs_2016-11-03",
"_type": "logs",
"_id": "19",
"_score": 1,
"_source": {
"counter": 19,
"value": 62,
"#timestamp": "2016-11-03T07:40:35.981395"
}
},
{
"_index": "logs_2016-11-03",
"_type": "logs",
"_id": "22",
"_score": 1,
"_source": {
"counter": 22,
"value": 95,
"#timestamp": "2016-11-03T07:40:51.066395"
}
},
{
"_index": "logs_2016-11-03",
"_type": "logs",
"_id": "25",
"_score": 1,
"_source": {
"counter": 25,
"value": 18,
"#timestamp": "2016-11-03T07:41:06.140395"
}
},
{
"_index": "logs_2016-11-03",
"_type": "logs",
"_id": "26",
"_score": 1,
"_source": {
"counter": 26,
"value": 58,
"#timestamp": "2016-11-03T07:41:11.164395"
}
},
{
"_index": "logs_2016-11-03",
"_type": "logs",
"_id": "29",
"_score": 1,
"_source": {
"counter": 29,
"value": 73,
"#timestamp": "2016-11-03T07:41:26.214395"
}
},
{
"_index": "logs_2016-11-03",
"_type": "logs",
"_id": "41",
"_score": 1,
"_source": {
"counter": 41,
"value": 59,
"#timestamp": "2016-11-03T07:42:26.517395"
}
},
{
"_index": "logs_2016-11-03",
"_type": "logs",
"_id": "14",
"_score": 1,
"_source": {
"counter": 14,
"value": 9,
"#timestamp": "2016-11-03T07:40:10.857395"
}
},
{
"_index": "logs_2016-11-03",
"_type": "logs",
"_id": "40",
"_score": 1,
"_source": {
"counter": 40,
"value": 9,
"#timestamp": "2016-11-03T07:42:21.498395"
}
},
{
"_index": "logs_2016-11-03",
"_type": "logs",
"_id": "24",
"_score": 1,
"_source": {
"counter": 24,
"value": 41,
"#timestamp": "2016-11-03T07:41:01.115395"
}
},
{
"_index": "logs_2016-11-03",
"_type": "logs",
"_id": "0",
"_score": 1,
"_source": {
"counter": 0,
"value": 79,
"#timestamp": "2016-11-03T07:39:00.302395"
}
}
]
}
}
Query #2: Get stats from last 1 hour.
GET /logs_2016-11-03/logs/_search?search_type=count
{
"aggs": {
"time_range": {
"filter": {
"range": {
"#timestamp": {
"from": "now-1h"
}
}
},
"aggs": {
"just_stats": {
"stats": {
"field": "value"
}
}
}
}
}
}
Output:
{
"took": 5,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 366,
"max_score": 0,
"hits": []
},
"aggregations": {
"time_range": {
"doc_count": 366,
"just_stats": {
"count": 366,
"min": 0,
"max": 100,
"avg": 53.17213114754098,
"sum": 19461
}
}
}
}
I get 366 entries, which is correct.
Query #3: Get stats from last 1 minute.
GET /logs_2016-11-03/logs/_search?search_type=count
{
"aggs": {
"time_range": {
"filter": {
"range": {
"#timestamp": {
"from": "now-1m"
}
}
},
"aggs": {
"just_stats": {
"stats": {
"field": "value"
}
}
}
}
}
}
Output:
{
"took": 15,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 407,
"max_score": 0,
"hits": []
},
"aggregations": {
"time_range": {
"doc_count": 407,
"just_stats": {
"count": 407,
"min": 0,
"max": 100,
"avg": 53.152334152334156,
"sum": 21633
}
}
}
}
This is wrong; there can't be 407 entries in the last 1 minute. It should have been only 12-13 logs.
Query #4: Get stats from last 15 seconds.
GET /logs_2016-11-03/logs/_search?search_type=count
{
"aggs": {
"time_range": {
"filter": {
"range": {
"#timestamp": {
"from": "now-15s"
}
}
},
"aggs": {
"just_stats": {
"stats": {
"field": "value"
}
}
}
}
}
}
Output:
{
"took": 15,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 407,
"max_score": 0,
"hits": []
},
"aggregations": {
"time_range": {
"doc_count": 407,
"just_stats": {
"count": 407,
"min": 0,
"max": 100,
"avg": 53.152334152334156,
"sum": 21633
}
}
}
}
This is also wrong; there can't be 407 entries in the last 15 seconds. It should have been only 3-4 logs.
Your query is right, but ES stores dates in UTC, and hence you are getting everything back. From the documentation:
In JSON documents, dates are represented as strings. Elasticsearch
uses a set of preconfigured formats to recognize and parse these
strings into a long value representing milliseconds-since-the-epoch in
UTC.
You could use the pytz module and store dates in UTC in ES. Refer to this SO question.
You could also use the time_zone param in the range query; it is also better to aggregate on filtered results rather than fetch all the results and then filter them.
GET /logs_2016-11-03/logs/_search
{
"query": {
"bool": {
"filter": {
"range": {
"#timestamp": {
"gte": "2016-11-03T07:15:35", <----- You would need absolute value
"time_zone": "-01:00" <---- timezone setting
}
}
}
}
},
"aggs": {
"just_stats": {
"stats": {
"field": "value"
}
}
},
"size": 0
}
You would have to convert the desired time (now-1m, now-15s) to the format yyyy-MM-dd'T'HH:mm:ss for the time_zone param to work, as now is not affected by time_zone, so the best option is to convert dates to UTC before storing them.
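A minimal sketch of that storage-side fix, applied to the ingestion script above (counter and random come from the surrounding script): write the #timestamp field in UTC with an explicit offset so that relative ranges like now-1m and now-15s match as expected.
from datetime import datetime, timezone

# Elasticsearch interprets dates without an offset as UTC, so writing
# an explicit UTC timestamp makes now-1m / now-15s line up correctly.
message = {
    "counter": counter,
    "#timestamp": datetime.now(timezone.utc).isoformat(),
    "value": random.randint(0, 100),
}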