Elasticsearch: Aggregation sometimes returns garbage values - Python

I'm relatively new to Elasticsearch. I'd be really grateful if anyone could help me with an issue in an aggregation query. I have the default setup of 5 shards on a single node. My index has the following mapping:
mapping = {
    "my_index": {
        "date_detection": False,
        "dynamic_templates": [{
            "string_fields": {
                "mapping": {
                    "type": "string",
                    "fields": {
                        "raw": {
                            "index": "not_analyzed",
                            "ignore_above": 256,
                            "type": "string"
                        }
                    }
                },
                "match_mapping_type": "string",
                "match": "*"
            }
        }]
    }
}
I have written the following aggregation query for my requirement:
records = es.search(
    index="my_index",
    doc_type="marksheet",
    body={
        "aggs": {
            "student_name": {
                "terms": {
                    "field": "name.raw",
                    "order": {"total_score": "desc"}
                },
                "aggs": {
                    "total_score": {
                        "sum": {"field": "score"}
                    }
                }
            }
        }
    }
)
This query works perfectly fine, just as I need it, most of the time. But sometimes, for reasons unknown, the same query returns absurdly large or absurdly small values, like 1.4e-322.
I haven't been able to find a proper reason why this happens on its own. I would really appreciate it if someone could help me out with this. Thank you!
UPDATE:
After running the following aggregation:
{"aggs":{"score_stats":{"stats":{"field":"score"}}}}
I get the results in the aggregation key as:
{u'score_stats': {u'count': 1186, u'max': 5e-323, u'sum': 4.5187e-320, u'avg': 4e-323, u'min': 2e-323}}
UPDATE 2:
After running the query as the one below:
curl -XGET "localhost:9200/my_index/marksheet/_search?_source=score&size=100&pretty&filter_path=hits.hits.score"
The hits key in the output is as follows:
"hits" : [ {
"_index" : "my_index",
"_type" : "marksheet",
"_id" : "AU61alGT0VlANyomm3HT",
"_score" : 1.0,
"_source":{"score":10}
}, {
"_index" : "my_index",
"_type" : "marksheet",
"_id" : "AU61alGV0VlANyomm3HV",
"_score" : 1.0,
"_source":{"score":10}
}, {
"_index" : "my_index",
"_type" : "marksheet",
"_id" : "AU61alGa0VlANyomm3Ha",
"_score" : 1.0,
"_source":{"score":8}
}, {
"_index" : "my_index",
"_type" : "marksheet",
"_id" : "AU61alGf0VlANyomm3Hh",
"_score" : 1.0,
"_source":{"score":8}
}, {
"_index" : "my_index",
"_type" : "marksheet",
"_id" : "AU61alGk0VlANyomm3Hn",
"_score" : 1.0,
"_source":{"score":6}
}, {
"_index" : "my_index",
"_type" : "marksheet",
"_id" : "AU61alGp0VlANyomm3Hu",
"_score" : 1.0,
"_source":{"score":10}
}, {
"_index" : "my_index",
"_type" : "marksheet",
"_id" : "AU61alGu0VlANyomm3H0",
"_score" : 1.0,
"_source":{"score":10}
}, {
"_index" : "my_index",
"_type" : "marksheet",
"_id" : "AU61alGz0VlANyomm3H7",
"_score" : 1.0,
"_source":{"score":10}
}, {
"_index" : "my_index",
"_type" : "marksheet",
"_id" : "AU61alHA0VlANyomm3IN",
"_score" : 1.0,
"_source":{"score":8}
}, {
"_index" : "my_index",
"_type" : "marksheet",
"_id" : "AU61alHD0VlANyomm3IR",
"_score" : 1.0,
"_source":{"score":10}
},
...
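One more thing I can check from Python is how "score" actually got mapped, since the stored _source values look fine while the aggregated values look like denormalized doubles. A small diagnostic sketch (the response shape assumes the 1.x/2.x mapping API; this is only a way to inspect, not a confirmed cause):
from elasticsearch import Elasticsearch

es = Elasticsearch()

# If dynamic mapping produced conflicting types for "score" (e.g. long in
# one type and string/double in another), sum/stats aggregations can return
# nonsense denormalized doubles like the ones above.
mapping = es.indices.get_mapping(index="my_index")
props = mapping["my_index"]["mappings"]["marksheet"]["properties"]
print(props.get("score"))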

Related

Group an Elasticsearch query by a similar field and fetch both documents together

I am new to Elasticsearch and trying to write queries.
I have an index where, among other fields, there are two fields Sno and request_sno.
I want a query where the document with a certain Sno is followed by the document whose request_sno exactly matches that Sno.
For example:
Sno:1, name:'a', address:'b',..., request_sno:''
Sno:2, name:'', address:'',...., request_sno:1
These two should come together, one row followed by the other.
At first I thought of group by, but I don't want aggregation.
Any help will be highly appreciated.
You can solve this use case with the collapse functionality of Elasticsearch.
To implement collapse, your documents need one shared field to collapse on. So let's create a field called collapse_id which holds the value of request_sno or, if request_sno is empty or null, the value of the sno field.
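How collapse_id gets populated is up to your indexing code; a minimal sketch in Python with the official client (the index name, client setup and fallback rule are assumptions based on the description above):
from elasticsearch import Elasticsearch

es = Elasticsearch()

docs = [
    {"sno": 1, "name": "a", "address": "b"},
    {"sno": 2, "name": "a", "address": "b", "request_sno": 1},
]

for doc in docs:
    # collapse_id = request_sno when present, otherwise fall back to sno
    doc["collapse_id"] = doc.get("request_sno") or doc["sno"]
    es.index(index="collapse", body=doc)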
So your final documents will look like this:
{
"_index" : "collapse",
"_type" : "_doc",
"_id" : "kNNEbH4Bb7CAaZKC_vAY",
"_score" : 1.0,
"_source" : {
"sno" : 1,
"name" : "a",
"address" : "b",
"collapse_id" : 1
}
},
{
"_index" : "collapse",
"_type" : "_doc",
"_id" : "kdNFbH4Bb7CAaZKC-PBM",
"_score" : 1.0,
"_source" : {
"sno" : 2,
"name" : "a",
"address" : "b",
"request_sno" : 1,
"collapse_id" : 1
}
}
You can use the query below to get the collapsed result:
POST collapse/_search
{
  "_source": false,
  "query": {
    "match_all": {}
  },
  "collapse": {
    "field": "collapse_id",
    "inner_hits": {
      "name": "sno_reqsno_match",
      "size": 10
    }
  }
}
Your result will look like this:
"hits" : {
"total" : {
"value" : 4,
"relation" : "eq"
},
"max_score" : null,
"hits" : [
{
"_index" : "collapse",
"_type" : "_doc",
"_id" : "kNNEbH4Bb7CAaZKC_vAY",
"_score" : 1.0,
"fields" : {
"collapse_id" : [
1
]
},
"inner_hits" : {
"sno_reqsno_match" : {
"hits" : {
"total" : {
"value" : 2,
"relation" : "eq"
},
"max_score" : 1.0,
"hits" : [
{
"_index" : "collapse",
"_type" : "_doc",
"_id" : "kNNEbH4Bb7CAaZKC_vAY",
"_score" : 1.0,
"_source" : {
"sno" : 1,
"name" : "a",
"address" : "b",
"collapse_id" : 1
}
},
{
"_index" : "collapse",
"_type" : "_doc",
"_id" : "kdNFbH4Bb7CAaZKC-PBM",
"_score" : 1.0,
"_source" : {
"sno" : 2,
"name" : "a",
"address" : "b",
"request_sno" : 1,
"collapse_id" : 1
}
}
]
}
}
}
},
{
"_index" : "collapse",
"_type" : "_doc",
"_id" : "ktNNbH4Bb7CAaZKC5PC8",
"_score" : 1.0,
"fields" : {
"collapse_id" : [
3
]
},
"inner_hits" : {
"sno_reqsno_match" : {
"hits" : {
"total" : {
"value" : 2,
"relation" : "eq"
},
"max_score" : 1.0,
"hits" : [
{
"_index" : "collapse",
"_type" : "_doc",
"_id" : "ktNNbH4Bb7CAaZKC5PC8",
"_score" : 1.0,
"_source" : {
"sno" : 3,
"name" : "a",
"address" : "b",
"collapse_id" : 3
}
},
{
"_index" : "collapse",
"_type" : "_doc",
"_id" : "k9NObH4Bb7CAaZKCGfAc",
"_score" : 1.0,
"_source" : {
"sno" : 4,
"name" : "a",
"address" : "b",
"request_sno" : 3,
"collapse_id" : 3
}
}
]
}
}
}
}
]
}
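If you want to run the same collapse query from Python, here is a minimal sketch with the official client (the client setup is an assumption):
from elasticsearch import Elasticsearch

es = Elasticsearch()

resp = es.search(
    index="collapse",
    body={
        "_source": False,
        "query": {"match_all": {}},
        "collapse": {
            "field": "collapse_id",
            "inner_hits": {"name": "sno_reqsno_match", "size": 10},
        },
    },
)

# Each top-level hit carries its paired documents in inner_hits
for hit in resp["hits"]["hits"]:
    pair = hit["inner_hits"]["sno_reqsno_match"]["hits"]["hits"]
    print([h["_source"] for h in pair])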

Write query DSL to find unique error messages from syslog data?

Is there a way to configure the Elasticsearch analyzer so that it is possible to get unique error messages in scenarios like these?
1. "...July 2020 23:00:00.674z... same message....."
2. Slight changes in the string:
message1: "....message_details.. (unknown error 20004)
message2: "....message_details.. (unknown error 278945)
OR
message1:"....a::::: message_details ...."
message2:"....a:f23ed:fff:ff:: message_details ...."
The above two messages are the same apart from the character difference.
Here is the query:
GET log_stash_2020.06.16/_search
{
  "query": {
    "bool": {
      "must": [
        {
          "match_phrase": {
            "message": "Error"
          }
        },
        {
          "match_phrase": {
            "type": "lab_id"
          }
        }
      ]
    }
  },
  "aggs": {
    "log_message": {
      "significant_text": {
        "field": "message",
        "filter_duplicate_text": "true"
      }
    }
  },
  "size": 1000
}
I have added the sample log data below:
{
"_index" : "logstash_2020.06.16",
"_type" : "doc",
"_id" : "################",
"_score" : 1.0,
"_source" : {
"logsource" : "router_id",
"timestamp" : "Jun 15 20:00:00",
"program" : "some_program",
"host" : "#############",
"priority" : "27",
"#timestamp" : "2020-06-16T00:00:01.020Z",
"type" : "lab_id",
"pid" : "####",
"message" : ": ############### send failed with error: ENOENT -- Item not found (No error: 0)",
"#version" : "1"
}
}
{
"_index" : "logstash_2020.06.16",
"_type" : "doc",
"_id" : "################",
"_score" : 1.0,
"_source" : {
"host" : "################",
"#timestamp" : "2020-06-16T00:00:02.274Z",
"type" : "####",
"tags" : [
"_grokparsefailure"
],
"message" : "################:Jun 15 20:00:18.908 EDT: mediasvr[2546]: %MEDIASVR-MEDIASVR-4-PARTITION_USAGE_ALERT : High disk usage alert : host ##### exceeded 100% \n",
"#version" : "1"
}
}
Is there a way to do it in Python? (In case Elasticsearch does not have the above-mentioned functionality.)
You can use the Elasticsearch Python client like so:
from elasticsearch import Elasticsearch
es = Elasticsearch(...)
resp = es.search(index="log_stash_2020.06.16", body={<dsl query>})
print(resp)
where <dsl query> is whatever query you want to run, like the one you gave in the question.
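Putting the two together, a minimal sketch that runs the significant_text query from the question through the client (index and field names are taken from the question; using size 0 is a small change so that only the aggregation comes back):
from elasticsearch import Elasticsearch

es = Elasticsearch()

query = {
    "query": {
        "bool": {
            "must": [
                {"match_phrase": {"message": "Error"}},
                {"match_phrase": {"type": "lab_id"}},
            ]
        }
    },
    "aggs": {
        "log_message": {
            "significant_text": {
                "field": "message",
                "filter_duplicate_text": True,
            }
        }
    },
    "size": 0,  # the question used 1000; 0 returns only the aggregation
}

resp = es.search(index="log_stash_2020.06.16", body=query)
for bucket in resp["aggregations"]["log_message"]["buckets"]:
    print(bucket["key"], bucket["doc_count"])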
<disclosure: I'm the maintainer of the Elasticsearch client and employed by Elastic>

Elasticsearch aggregation for each unit in Python

I want, for each unit (org_name_en), to add a new field with an aggregation over review_count: sum and count. The members field is of nested type. Below you can see a piece of the ES DB:
"hits" : [
{
"_index" : "some_index",
"_type" : "_doc",
"_id" : "some_id",
"_score" : 1.0,
"_source" : {
"target_date" : "some_data",
"org_name" : "abc",
"org_name_en" : "ABC",
[...]
"members" : [
{
[...]
"review_count" : 50.0,
},
{
[...]
"review_count" : 60.0,
},
The final result should look like this:
"hits" : [
{
[...]
"_source" : {
"target_date" : "filtered with some data",
"org_name" : "abc",
"org_name_en" : "ABC",
[...]
"members" : [
{
[...]
"review_count" : 50.0,
},
{
[...]
"review_count" : 60.0,
},
],
"review_total": {
"review_sum": 110,
"review_count: 2
},
[...next unit DEF, GHI, XYZ with the same aggregation...]
My try with elasticsearch-dsl in Python works only for a given unit, so I need to filter it out first:
qs.aggs.bucket('members', 'nested', path='members').metric('review_total', 'sum', field='members.review_count')
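For reference, a hedged sketch of getting the per-unit numbers as aggregation buckets rather than as a field inside each hit: a terms bucket on org_name_en with a nested sum and value_count inside (the .keyword sub-field, connection setup, and bucket size are assumptions):
from elasticsearch_dsl import Search, connections

# Assumes a locally running cluster; adjust the URL as needed
connections.create_connection(hosts=["http://localhost:9200"])

s = Search(index="some_index").extra(size=0)

# One bucket per unit; "org_name_en.keyword" assumes a keyword sub-field exists
units = s.aggs.bucket("per_unit", "terms", field="org_name_en.keyword", size=100)
members = units.bucket("members", "nested", path="members")
members.metric("review_sum", "sum", field="members.review_count")
members.metric("review_cnt", "value_count", field="members.review_count")

resp = s.execute()
for unit in resp.aggregations.per_unit.buckets:
    print(unit.key, unit.members.review_sum.value, unit.members.review_cnt.value)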

Matching / Mapping lists with elasticsearch

There is a list in MongoDB, e.g.:
db_name = "Test"
collection_name = "Map"
db.Map.findOne()
{
"_id" : ObjectId(...),
"Id" : "576",
"FirstName" : "xyz",
"LastName" : "abc",
"skills" : [
"C++",
"Java",
"Python",
"MongoDB",
]
}
There is a list in an Elasticsearch index (I am using Kibana to execute queries):
GET /user/_search
{
"took" : 31,
"timed_out" : false,
"_shards" : {
"total" : 5,
"successful" : 5,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : 7,
"max_score" : 1.0,
"hits" : [
{
"_index" : "customer",
"_type" : "_doc",
"_id" : "5",
"_score" : 1.0,
"_source" : {
"name" : "xyz abc"
"Age" : 21,
"skills" : [
"C++",
"Java",
"Python",
"MongoDB",
]
}
},
]
}
}
Can anyone help with the Elasticsearch query that will match both records based on skills?
I am using Python to write the code.
If a match is found, I am trying to get the first name and last name of that user:
First name : "xyz"
Last name : "abc"
Assuming you are indexing all the documents in Elasticsearch, and of these you want to match documents where skills contains both java and mongodb, the query will be:
{
  "query": {
    "bool": {
      "filter": [
        {
          "term": {
            "skills": "mongodb"
          }
        },
        {
          "term": {
            "skills": "java"
          }
        }
      ]
    }
  }
}
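In Python this could look like the sketch below (the index name comes from the sample output; splitting "name" on the first space into first/last name is an assumption based on the sample document):
from elasticsearch import Elasticsearch

es = Elasticsearch()

resp = es.search(
    index="customer",
    body={
        "query": {
            "bool": {
                "filter": [
                    {"term": {"skills": "mongodb"}},
                    {"term": {"skills": "java"}},
                ]
            }
        }
    },
)

for hit in resp["hits"]["hits"]:
    # The sample doc stores a single "name" field; split it into first/last
    first, _, last = hit["_source"]["name"].partition(" ")
    print("First name:", first)
    print("Last name:", last)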

pymongo get id for collection

I have this code:
def get_attribute_colour(colour_code):
    attribute_colour_meta = db.attributes.aggregate([
        {'$match': {'name.en-UK': 'Colour'}},
        {'$unwind': '$values'},
        {'$project': {'code': '$values.code', 'valueId': '$values._id'}},
        {'$match': {'code': colour_code}}
    ])
    return attribute_colour_meta['result']
that looks up a collection called attributes, which has the following structure:
> db.attributes.find({}).pretty();
{
"_id" : ObjectId("53b27bded901f26432996e00"),
"values" : [
{
"code" : "AQ",
"pmsCode" : "638c",
"name" : {
"en-UK" : "Aqua"
},
"tcxCode" : "16-4529 TCX",
"hexCode" : "#00aed8",
"images" : [
"AQ.jpg"
],
"_id" : ObjectId("53b27bded901f26432996d83")
},
{
"code" : "AQ",
"pmsCode" : "3115c",
"name" : {
"en-UK" : "Aqua"
},
"tcxCode" : "",
"hexCode" : "#00c4db",
"images" : [
"AQ.jpg"
],
"_id" : ObjectId("53b27bded901f26432996d84")
},
.....
}
],
"name" : {
"en-UK" : "Colour"
}
}
{
"_id" : ObjectId("53b27bded901f26432996e1b"),
"values" : [
{
"code" : 0,
"_id" : ObjectId("53b27bded901f26432996e01"),
"name" : {
"en-UK" : "0-3 MTHS"
}
},
.....
}
],
"name" : {
"en-UK" : "Size"
}
}
{
"_id" : ObjectId("53b27bded901f26432996e28"),
"values" : [
{
"Currency" : "GBP",
"_id" : ObjectId("53b27bded901f26432996e1c"),
"name" : {
"en-UK" : "Carton price list"
}
},
}
],
"name" : {
"en-UK" : "Price list"
}
}
>
Basically, there are 3 attributes (colour, size and price list), each of which has sub-documents called values.
In my get_attribute_colour function, how do I return the _id of the attribute within the results, so that I get something like:
{ attributeId: ObjectId("53b27bded901f26432996e00"),
valueId: ObjectId("53b27bded901f26432996d83") }
The result does return the _id:
[{u'code': u'AQ', u'_id': ObjectId('53b27bded901f26432996e00'), u'valueId': ObjectId('53b27bded901f26432996d83')}]
but I don't see where this is specified?
Any advice much appreciated.
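For what it's worth, _id is included by default in a $project stage unless you exclude it, which is why it shows up without being asked for. A hedged sketch that renames it explicitly (the pipeline is the one from the question; "attributeId" is just an illustrative name, and note that in modern PyMongo aggregate() returns a cursor rather than the {'result': [...]} dict):
from pymongo import MongoClient

db = MongoClient()["Test"]

def get_attribute_colour(colour_code):
    cursor = db.attributes.aggregate([
        {'$match': {'name.en-UK': 'Colour'}},
        {'$unwind': '$values'},
        # _id passes through $project implicitly unless excluded; rename it
        # explicitly so the output keys are unambiguous
        {'$project': {
            '_id': 0,
            'attributeId': '$_id',
            'code': '$values.code',
            'valueId': '$values._id',
        }},
        {'$match': {'code': colour_code}},
    ])
    return list(cursor)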
