I'm new to Elasticsearch. I'm learning it and trying it out to check whether it meets my needs.
Right now I'm learning aggregations in Elasticsearch, and I wrote the following Python script to ingest some time-series data into Elasticsearch.
Every 5 seconds I create a new message which will have:
Timestamp (ISO8601 format)
Counter
A random number between 0 and 100
For every new day, I create a new index named logs_YYYY-MM-DD.
I index every message using its counter as the _id. The counter resets for every new index (every day).
import csv
import time
import random
from datetime import datetime
from elasticsearch import Elasticsearch
class ElasticSearchDB:
    def __init__(self):
        self.es = Elasticsearch()

    def run(self):
        print("Started: {}".format(datetime.now().isoformat()))
        print("<Ctrl + c> for exit!")
        with open("..\\out\\logs.csv", "w", newline='') as f:
            writer = csv.writer(f)
            counter = 0
            try:
                while True:
                    i_name = "logs_" + time.strftime("%Y-%m-%d")
                    if not self.es.indices.exists([i_name]):
                        self.es.indices.create(i_name, ignore=400)
                        print("New index created: {}".format(i_name))
                        counter = 0
                    message = {"counter": counter, "#timestamp": datetime.now().isoformat(), "value": random.randint(0, 100)}
                    # Write to file
                    writer.writerow(message.values())
                    # Write to elasticsearch index
                    self.es.index(index=i_name, doc_type="logs", id=counter, body=message)
                    # Waste some time
                    time.sleep(5)
                    counter += 1
            except KeyboardInterrupt:
                print("Stopped: {}".format(datetime.now().isoformat()))


test_es = ElasticSearchDB()
test_es.run()
I ran this script for 30 minutes. Then, using Sense, I queried Elasticsearch with the following aggregation queries.
Query #1: Get all
Query #2: Aggregate logs from the last 1 hour and generate stats for them. This shows the right results.
Query #3: Aggregate logs from the last 1 minute and generate stats for them. The number of docs aggregated is the same as in the 1-hour aggregation; ideally, it should have aggregated only 12-13 logs.
Query #4: Aggregate logs from the last 15 seconds and generate stats for them. The number of docs aggregated is the same as in the 1-hour aggregation; ideally, it should have aggregated only 3-4 logs.
My Questions:
Why is Elasticsearch not able to understand the 1-minute and 15-second ranges?
I understand mappings but I don't know how to write one, so I haven't written one. Is that what is causing this problem?
Please help!
Query #1: Get all
GET /_search
Output:
{
"took": 3,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 314,
"max_score": 1,
"hits": [
{
"_index": "logs_2016-11-03",
"_type": "logs",
"_id": "19",
"_score": 1,
"_source": {
"counter": 19,
"value": 62,
"#timestamp": "2016-11-03T07:40:35.981395"
}
},
{
"_index": "logs_2016-11-03",
"_type": "logs",
"_id": "22",
"_score": 1,
"_source": {
"counter": 22,
"value": 95,
"#timestamp": "2016-11-03T07:40:51.066395"
}
},
{
"_index": "logs_2016-11-03",
"_type": "logs",
"_id": "25",
"_score": 1,
"_source": {
"counter": 25,
"value": 18,
"#timestamp": "2016-11-03T07:41:06.140395"
}
},
{
"_index": "logs_2016-11-03",
"_type": "logs",
"_id": "26",
"_score": 1,
"_source": {
"counter": 26,
"value": 58,
"#timestamp": "2016-11-03T07:41:11.164395"
}
},
{
"_index": "logs_2016-11-03",
"_type": "logs",
"_id": "29",
"_score": 1,
"_source": {
"counter": 29,
"value": 73,
"#timestamp": "2016-11-03T07:41:26.214395"
}
},
{
"_index": "logs_2016-11-03",
"_type": "logs",
"_id": "41",
"_score": 1,
"_source": {
"counter": 41,
"value": 59,
"#timestamp": "2016-11-03T07:42:26.517395"
}
},
{
"_index": "logs_2016-11-03",
"_type": "logs",
"_id": "14",
"_score": 1,
"_source": {
"counter": 14,
"value": 9,
"#timestamp": "2016-11-03T07:40:10.857395"
}
},
{
"_index": "logs_2016-11-03",
"_type": "logs",
"_id": "40",
"_score": 1,
"_source": {
"counter": 40,
"value": 9,
"#timestamp": "2016-11-03T07:42:21.498395"
}
},
{
"_index": "logs_2016-11-03",
"_type": "logs",
"_id": "24",
"_score": 1,
"_source": {
"counter": 24,
"value": 41,
"#timestamp": "2016-11-03T07:41:01.115395"
}
},
{
"_index": "logs_2016-11-03",
"_type": "logs",
"_id": "0",
"_score": 1,
"_source": {
"counter": 0,
"value": 79,
"#timestamp": "2016-11-03T07:39:00.302395"
}
}
]
}
}
Query #2: Get stats from last 1 hour.
GET /logs_2016-11-03/logs/_search?search_type=count
{
"aggs": {
"time_range": {
"filter": {
"range": {
"#timestamp": {
"from": "now-1h"
}
}
},
"aggs": {
"just_stats": {
"stats": {
"field": "value"
}
}
}
}
}
}
Output:
{
"took": 5,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 366,
"max_score": 0,
"hits": []
},
"aggregations": {
"time_range": {
"doc_count": 366,
"just_stats": {
"count": 366,
"min": 0,
"max": 100,
"avg": 53.17213114754098,
"sum": 19461
}
}
}
}
I get 366 entries, which is correct.
Query #3: Get stats from last 1 minute.
GET /logs_2016-11-03/logs/_search?search_type=count
{
"aggs": {
"time_range": {
"filter": {
"range": {
"#timestamp": {
"from": "now-1m"
}
}
},
"aggs": {
"just_stats": {
"stats": {
"field": "value"
}
}
}
}
}
}
Output:
{
"took": 15,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 407,
"max_score": 0,
"hits": []
},
"aggregations": {
"time_range": {
"doc_count": 407,
"just_stats": {
"count": 407,
"min": 0,
"max": 100,
"avg": 53.152334152334156,
"sum": 21633
}
}
}
}
This is wrong; there can't be 407 entries in the last 1 minute. It should have been only 12-13 logs.
Query #4: Get stats from last 15 seconds.
GET /logs_2016-11-03/logs/_search?search_type=count
{
"aggs": {
"time_range": {
"filter": {
"range": {
"#timestamp": {
"from": "now-15s"
}
}
},
"aggs": {
"just_stats": {
"stats": {
"field": "value"
}
}
}
}
}
}
Output:
{
"took": 15,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 407,
"max_score": 0,
"hits": []
},
"aggregations": {
"time_range": {
"doc_count": 407,
"just_stats": {
"count": 407,
"min": 0,
"max": 100,
"avg": 53.152334152334156,
"sum": 21633
}
}
}
}
This is also wrong; there can't be 407 entries in the last 15 seconds. It should have been only 3-4 logs.
Your query is right, but ES stores dates in UTC, and hence you are getting everything back. From the documentation:
In JSON documents, dates are represented as strings. Elasticsearch
uses a set of preconfigured formats to recognize and parse these
strings into a long value representing milliseconds-since-the-epoch in
UTC.
You could use the pytz module and store dates in UTC in ES. Refer to this SO question.
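For instance, here is a minimal sketch (my own, not from the original post) of the indexing step rewritten to store a UTC timestamp; the index name and field names mirror the script above:

import random
from datetime import datetime, timezone
from elasticsearch import Elasticsearch

es = Elasticsearch()

# Build the message with a UTC timestamp so that "now-1m" / "now-15s"
# range queries line up with what is actually indexed.
message = {
    "counter": 0,
    "#timestamp": datetime.now(timezone.utc).isoformat(),
    "value": random.randint(0, 100),
}
es.index(index="logs_2016-11-03", doc_type="logs", id=0, body=message)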
You could also use the time_zone param in the range query. It is also better to aggregate on filtered results rather than getting all the results and then filtering them.
GET /logs_2016-11-03/logs/_search
{
"query": {
"bool": {
"filter": {
"range": {
"#timestamp": {
"gte": "2016-11-03T07:15:35", <----- You would need absolute value
"time_zone": "-01:00" <---- timezone setting
}
}
}
}
},
"aggs": {
"just_stats": {
"stats": {
"field": "value"
}
}
},
"size": 0
}
You would have to convert the desired time (now-1m, now-15s) to the format yyyy-MM-dd'T'HH:mm:ss for the time_zone param to work, as now is not affected by time_zone, so the best option is to convert dates to UTC and store them.
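For example, a small sketch (an assumption on my part, not part of the original answer) of computing those absolute UTC timestamps in Python before sending the query:

from datetime import datetime, timedelta, timezone

# Absolute UTC timestamps in the yyyy-MM-dd'T'HH:mm:ss format mentioned above.
now_utc = datetime.now(timezone.utc)
one_minute_ago = (now_utc - timedelta(minutes=1)).strftime("%Y-%m-%dT%H:%M:%S")
fifteen_seconds_ago = (now_utc - timedelta(seconds=15)).strftime("%Y-%m-%dT%H:%M:%S")
print(one_minute_ago, fifteen_seconds_ago)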
Related
I have the following schema for documents in my collection. Each document lists all the submissions made by a name.
- "_id": ObjectId
- "name": str
- "is_team": bool
- "submissions": List
- time: datetime
- score: float
Example:
{"name": "Intrinsic Nubs",
"is_team": true,
"submissions": [
{
"score": 61.77466359705439,
"time": {
"$date": {
"$numberLong": "1656009267652"
}
}
},
{
"score": 81.77466359705439,
"time": {
"$date": {
"$numberLong": "1656009267680"
}
}
}]}
I need to collect all documents whose is_team is true and, for each, get the name, the maximum score, and the time corresponding to that maximum score.
Example:
[{"name": "Intrinsic Nubs", "MaxScore": 81.77466359705439, "time":{ "$date": {"$numberLong": "1656009267680"}}}]
Here's another way to produce your desired output.
db.collection.aggregate([
{ // limit docs
"$match": {"is_team": true}
},
{ // set MaxScore
"$set": {"MaxScore": {"$max": "$submissions.score"}}
},
{ "$project": {
"_id": 0,
"name": 1,
"MaxScore": 1,
"time": {
// get time at MaxScore
"$arrayElemAt": [
"$submissions.time",
{"$indexOfArray": ["$submissions.score", "$MaxScore"]}
]
}
}
}
])
Try it on mongoplayground.net.
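If you are running this from Python, the same pipeline can be passed to PyMongo; here is a rough sketch, where the client, database, and collection names are placeholders of mine:

from pymongo import MongoClient

client = MongoClient()                 # assumes a local MongoDB instance
coll = client["test"]["collection"]    # hypothetical database/collection names

pipeline = [
    {"$match": {"is_team": True}},
    {"$set": {"MaxScore": {"$max": "$submissions.score"}}},
    {"$project": {
        "_id": 0,
        "name": 1,
        "MaxScore": 1,
        "time": {"$arrayElemAt": [
            "$submissions.time",
            {"$indexOfArray": ["$submissions.score", "$MaxScore"]},
        ]},
    }},
]
print(list(coll.aggregate(pipeline)))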
Query
keep documents with is_team=true
reduce to find the submission with the biggest score, and return it
you can $project further; I kept all fields here to see the change (a possible $project stage is sketched after the query)
Playmongo
aggregate(
[{"$match": {"is_team": {"$eq": true}}},
{"$set":
{"name": "$name",
"max-submision":
{"$reduce":
{"input": "$submissions",
"initialValue": {"score": 0},
"in":
{"$cond":
[{"$gt": ["$$this.score", "$$value.score"]}, "$$this",
"$$value"]}}}}}])
I'm trying to write a query to get the sum of a value per month for documents with a particular ID. To do this I'm trying:
query = {
"size": 0,
"aggs" : {
"articles_over_time" : {
"date_histogram" : {
"field" : "timestamp",
"interval" : "month"
}
},
"value": {
"sum": {
"field": "generatedTotal"
}
}
}
}
I want this query to give me the sum of generatedTotal per month, but it is giving me the sum of generatedTotal over all documents. How can I get the sum of generatedTotal per month for a particular generatorId?
Example of a document in the Elasticsearch index:
{'id': 0, 'timestamp': '2018-01-01', 'generatorId': '150', 'generatedTotal': 2166.8759558092734}
If you do it separately like that, it counts as two different aggregations. You first need to query for the specific generatorId that you want, then nest the second agg within the first one:
{
"size": 0,
"query": {
"term": {
"generatorId": "150"
}
},
"aggs": {
"articles_over_time": {
"date_histogram": {
"field": "timestamp",
"interval": "month"
},
"aggs": {
"monthlyGeneratedTotal": {
"sum": {
"field": "generatedTotal"
}
}
}
}
}
}
4 sample documents (1 with a different generatorId, which will not be counted in the aggregations):
{"timestamp": "2018-02-01", "generatedTotal": 3, "generatorId": "150"}
{"timestamp": "2018-01-01", "generatedTotal": 1, "generatorId": "150"}
{"timestamp": "2018-01-01", "generatedTotal": 2, "generatorId": "150"}
{"timestamp": "2018-01-01", "generatedTotal": 2, "generatorId": "160"}
Then you will have the aggregations as follows:
{
"aggregations": {
"articles_over_time": {
"buckets": [
{
"key_as_string": "2018-01-01T00:00:00.000Z",
"key": 1514764800000,
"doc_count": 2,
"monthlyGeneratedTotal": {
"value": 3.0
}
},
{
"key_as_string": "2018-02-01T00:00:00.000Z",
"key": 1517443200000,
"doc_count": 1,
"monthlyGeneratedTotal": {
"value": 3.0
}
}
]
}
}
}
I hope this answers your question.
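If you are sending this from Python, a small sketch with the elasticsearch-py client (the index name here is a placeholder of mine):

from elasticsearch import Elasticsearch

es = Elasticsearch()
body = {
    "size": 0,
    "query": {"term": {"generatorId": "150"}},
    "aggs": {
        "articles_over_time": {
            "date_histogram": {"field": "timestamp", "interval": "month"},
            "aggs": {"monthlyGeneratedTotal": {"sum": {"field": "generatedTotal"}}},
        }
    },
}
resp = es.search(index="generators", body=body)  # "generators" is a placeholder index name
for bucket in resp["aggregations"]["articles_over_time"]["buckets"]:
    print(bucket["key_as_string"], bucket["monthlyGeneratedTotal"]["value"])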
I'm trying to update data in MongoDB using Python/Flask, but I get an error. I have followed the documentation on how to implement it, but the error is still there. Could anyone help me fix this?
@app.route('/updateData', methods=['POST'])
def updateData():
    dataList = mongo.db.warehouse
    old_Data = {
        "name": "Pulo Gebang Warehouse"
    }
    new_Data = {
        '$push': {"racks": {"rack columns": {"rack objects": {"items": {'$each': [{"index": 4, "item": {"SKU": "HD 2179/3",
                  "arrivalDate": "2019-10-22",
                  "brand": "Philips",
                  "maxQty": 30,
                  "name": "Playstatus 10",
                  "qty": 10}}]}}}}}
    }
    dataList.update(old_Data, new_Data)
    return "Update Success!"
This is my Database
[
{
"floorRacks": [
{
"adjacentRackID": "A1",
"floorColumn": [
{
"floorObjects": [
{
"index": 0,
"item": {
"SKU": "HD 1179/3",
"arrivalDate": "2019-10-22",
"brand": "Philips",
"maxQty": 30,
"name": "Blender Super mewah",
"qty": 10
}
},
{
"index": 1,
"item": "null"
}
],
"index": 0
}
]
}
],
"name": "Pulo Gebang Warehouse",
"racks": [
{
"code": "A",
"rack_columns": [
{
"columnID": 0,
"rack_objects": [
{
"items": [
{
"index": 0,
"item": {
"SKU": "HD 1179/3",
"arrivalDate": "2019-10-22",
"brand": "Philips",
"maxQty": 30,
"name": "Blender Super mewah",
"qty": 10
}
},
{
"index": 1,
"item": {
"SKU": "HD 1179/3",
"arrivalDate": "2019-10-22",
"brand": "Philips",
"maxQty": 30,
"name": "Blender Super mewah",
"qty": 10
}
},
{
"index": 2,
"item": "null"
},
{
"index": 3,
"item": {
"SKU": "HD 1189/3",
"arrivalDate": "2019-10-22",
"brand": "Philips",
"maxQty": 35,
"name": "Blender Super mewah Eksklusif",
"qty": 10
}
}
],
"level": 0
}
]
}
]
}
]
}
]
And here is the full error message:
pymongo.errors.WriteError: The dollar ($) prefixed field '$each' in
'racks..rack columns.rack objects.items.$each' is not valid for
storage.
Try changing the query to below:
{
"$push": {
"racks.rack columns.rack objects.items": {
"$each": [
{
"index": 4,
"item": {
"SKU": "HD 2179/3",
"arrivalDate": "2019-10-22",
"brand": "Philips",
"maxQty": 30,
"name": "Playstatus 10",
"qty": 10
}
}
]
}
}
}
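One caveat to hedge here: in the stored document, racks, rack_columns, and rack_objects are all arrays, so a plain dotted path may still not reach the nested items array. A possible sketch (my assumption, not part of the original answer) using PyMongo's update_one with the all-positional $[] operator, reusing the mongo handle from the question and the field names from the stored document:

new_item = {
    "index": 4,
    "item": {
        "SKU": "HD 2179/3",
        "arrivalDate": "2019-10-22",
        "brand": "Philips",
        "maxQty": 30,
        "name": "Playstatus 10",
        "qty": 10,
    },
}

# $[] pushes into every element at each nested array level; use arrayFilters
# instead if only one specific rack/column/object should receive the item.
mongo.db.warehouse.update_one(
    {"name": "Pulo Gebang Warehouse"},
    {"$push": {"racks.$[].rack_columns.$[].rack_objects.$[].items": {"$each": [new_item]}}},
)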
In an Elasticsearch aggregation query, I need to get all the movies watched by users who watch the movie "Frozen". This is my result _source:
{
"_index": "user",
"_type": "user",
"_id": "ovUowmUBREWOv-CU-4RT",
"_version": 4,
"_score": 1,
"_source": {
"movies": [
"Angry birds 1",
"PINNOCCHIO",
"Frozen",
"Hotel Transylvania 3"
],
"user_id": 86
}
}
This is the query I'm using.
{
"query": {
"match": {
"movies": "Frozen"
}
},
"size": 0,
"aggregations": {
"movies_like_Frozen": {
"terms": {
"field": "movies",
"min_doc_count": 1
}
}
}
}
The result I got in the buckets is correct, but the movie names are split on whitespace, like this:
"buckets": [
{
"key": "3",
"doc_count": 2
},
{
"key": "hotel",
"doc_count": 2
},
{
"key": "transylvania",
"doc_count": 2
},
{
"key": "1",
"doc_count": 1
},
{
"key": "angry",
"doc_count": 1
},
{
"key": "birds",
"doc_count": 1
}
]
How can I get buckets with "Angry birds 1" and "Hotel Transylvania 3" as the result?
Please help.
In Elasticsearch 6.x, every text field is analyzed by default, so its values are split into tokens. To aggregate on the exact values, map the field with a keyword type (or a keyword sub-field) in the index mapping, then insert the documents into it.
In your case,
{
  "mappings": {
    "user": {
      "properties": {
        "movies": {
          "type": "text",
          "fields": {
            "keyword": {
              "type": "keyword"
            }
          }
        },
        "user_id": {
          "type": "long"
        }
      }
    }
  }
}
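With the keyword sub-field in place, the terms aggregation can target movies.keyword and should return whole titles. A minimal sketch from Python (the index name follows the question; adjust as needed):

from elasticsearch import Elasticsearch

es = Elasticsearch()
body = {
    "query": {"match": {"movies": "Frozen"}},
    "size": 0,
    "aggregations": {
        "movies_like_Frozen": {
            "terms": {"field": "movies.keyword", "min_doc_count": 1}
        }
    },
}
resp = es.search(index="user", body=body)
for bucket in resp["aggregations"]["movies_like_Frozen"]["buckets"]:
    print(bucket["key"], bucket["doc_count"])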
Hope it works.
I want to update the user's favorite products, and I can't do that with Python; is there any solution for that? This example is essentially the same as a user adding and removing tags.
This is the schema I have in the customers index:
"hits": {
"total": 1,
"max_score": 1,
"hits": [
{
"_index": "customers",
"_type": "customer",
"_id": "QOTzXMUcrnbsYyFKeouHnjlkjQB3",
"_score": 1,
"_source": {
"uid": "QOTzXMUcrnbsYyFKeouHnjlkjQB3",
"email": "george#gmail.com",
"favorites": [], < ---- this is the problem
"history": [],
"settings": {},
"purchases ": [],
"created": 1507892081201,
"updated": 1507892081201
}
}
]
}
As you can see, I want the favorites array to store the IDs of the products the user selected as favorites. I want to update this doc to look like this:
"hits": {
"total": 1,
"max_score": 1,
"hits": [
{
"_index": "customers",
"_type": "customer",
"_id": "QOTzXMUcrnbsYyFKeouHnjlkjQB3",
"_score": 1,
"_source": {
"uid": "QOTzXMUcrnbsYyFKeouHnjlkjQB3",
"email": "george#gmail.com",
"favorites": ['product_id_1', 'product_id_2', 'product_id_3'], < ---- this is the problem
"history": ['id_1', 'id_2', 'id_3'],
"settings": {},
"purchases ": [],
"created": 1507892081201,
"updated": 1507892081201
}
}
]
}
I have tried this code but it does not work:
fav_product = {
    "user_id": "user_1",
    "product_id": "favorite_product_1",
}


def add_favevorite_product(fav_product):
    ''' Add new favorite product '''
    user_id = fav_product['user_id']
    product_id = fav_product['product_id']
    print('Start new favorite product')
    timestamp = int(round(time.time() * 1000))
    doc = {
        'favorites': "ctx._source.tags.add(product_id)",
        'created': timestamp,
        'updated': timestamp
    }
    es.update(index="customers", doc_type='customer', id=user_id, body={"doc": doc})
    es.indices.refresh(index="customers")
    return jsonify({'message': 'customer_updated'}), 200
    # return jsonify(fav_product), 200
# end
I get this response from the server:
"hits": {
"total": 1,
"max_score": 1,
"hits": [
{
"_index": "customers",
"_type": "customer",
"_id": "QOTzXMUcrnbsYyFKeouHnjlkjQB3",
"_score": 1,
"_source": {
"uid": "QOTzXMUcrnbsYyFKeouHnjlkjQB3",
"email": "george#gmail.com",
"favorites": "ctx._source.favorites.add(AV8PsBG_oWUfB334-p5b)",
"history": [],
"settings": {},
"purchases ": [],
"created": 1507893703655,
"updated": 1507893703655
}
}
]
}
Try this:
def add_favevorite_product(fav_product):
    ''' Add new favorite product '''
    user_id = fav_product['user_id']
    product_id = fav_product['product_id']
    print('Start new favorite product')
    doc = {
        "script": {
            # Painless script: append the new product id to the favorites array
            "inline": "ctx._source.favorites.add(params.product_id)",
            "params": {
                "product_id": product_id
            }
        }
    }
    es.update(index="customers", doc_type='customer', id=user_id, body=doc)
    es.indices.refresh(index="customers")
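As a possible refinement (my addition, not part of the original answer), the Painless script can also guard against adding the same product id twice, reusing the variables from the function above:

# Hedged sketch: only append the product id if it is not already in favorites.
doc = {
    "script": {
        "inline": (
            "if (!ctx._source.favorites.contains(params.product_id)) { "
            "ctx._source.favorites.add(params.product_id) }"
        ),
        "params": {"product_id": product_id},
    }
}
es.update(index="customers", doc_type='customer', id=user_id, body=doc)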