Why am I getting a null value instead of an aggregated response? - python

I'm trying to perform a min aggregation inside a nested aggregation in Elasticsearch, but I'm still getting null values.
GET /my_index/_search
{
"query": {
"match": {
"FirstName": "Cheryl"
}
},
"aggs": {
"art": {
"nested": {
"path": "art"
},
"aggs": {
"min_price": {
"min": {
"field": "art.Income"
}
}
}
}
}
}
Mappings :
{
"mappings": {
"properties": {
"art": {
"type": "nested",
"properties": {
"FirstName": {
"type": "text"
},
"Price": {
"type": "integer"
}
}
}
}
}
}

Related

elasticsearch doesn't raise score when matching on n-gram inside should clause

I'm trying to search for an item in Elasticsearch, and raise the score when I find a SKU or something similar.
This is my index configuration:
{
"settings": {
"analysis": {
"analyzer": {
"ngram_analyzer": {
"tokenizer": "ngram_tokenizer"
}
},
"tokenizer": {
"ngram_tokenizer": {
"type": "ngram",
"min_gram": 2,
"max_gram": 3,
"token_chars": [
"letter",
"digit",
"symbol",
"whitespace",
"punctuation"
]
}
}
}
},
"mappings": {
"properties": {
"job_desc": {
"type": "text",
"analyzer": "ngram_analyzer"
},
"sku": {
"type": "text",
"fields": {
"keyword": {"type": "keyword", "ignore_above": 256}
}
},
"name": {
"type": "text",
"fields": {
"keyword": {"type": "keyword", "ignore_above": 256}
}
},
"unit_price": {
"type": "text",
"fields": {
"keyword": {"type": "keyword", "ignore_above": 256}
}
},
"unit_type": {
"type": "text",
"fields": {
"keyword": {"type": "keyword", "ignore_above": 256}
}
}
}
}
}
and this is my query to find the item:
{
"bool": {
"must": {
"multi_match": {
"query": "<name of item & possibly sku>",
"fields": [
"name",
"sku"
]
}
},
"should": {
"match": {
"job_desc": {
"query": "424241 (sku of item)",
"analyzer": "ngram_analyzer"
}
}
}
}
}
But for some reason, the "should" clause does not raise the score of the search.
What is the issue here?
Thanks in advance,
Yaniv Akiva

Transform a complex JSON object from a web API into multiple rows in a dataframe in Azure Databricks using PySpark?

I have a JSON file that is received from a REST API. An example of the return is like this:
{
"d": {
"results": [
{
"__metadata": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')",
"type": "EmpEmployment"
},
"personIdExternal": "60000033",
"userId": "60000033",
"hiringNotCompleted": false,
"isECRecord": true,
"lastModifiedDateTime": "/Date(1642917586000+0000)/",
"endDate": "/Date(1675123200000)/",
"createdDateTime": "/Date(1641473919000+0000)/",
"createdOn": "/Date(1641473919000)/",
"originalStartDate": "/Date(1501545600000)/",
"customDate1": "/Date(1501545600000)/",
"customString17": null,
"customString18": null,
"customString19": null,
"assignmentClass": "ST",
"lastModifiedBy": "This Dude",
"okToRehire": true,
"customString4": null,
"customString3": "3",
"customString2": null,
"assignmentIdExternal": "60000033",
"customString16": null,
"lastModifiedOn": "/Date(1642917586000)/",
"customString1": null,
"createdBy": "This Dudette",
"seniorityDate": "/Date(1501545600000)/",
"startDate": "/Date(1659398400000)/",
"customString16Nav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/customString16Nav"
}
},
"customString1Nav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/customString1Nav"
}
},
"customString18Nav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/customString18Nav"
}
},
"customString3Nav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/customString3Nav"
}
},
"paymentInformationNav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/paymentInformationNav"
}
},
"empJobRelationshipNav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/empJobRelationshipNav"
}
},
"personNav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/personNav"
}
},
"empWorkPermitNav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/empWorkPermitNav"
}
},
"photoNav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/photoNav"
}
},
"compInfoNav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/compInfoNav"
}
},
"userNav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/userNav"
}
},
"customString2Nav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/customString2Nav"
}
},
"customString19Nav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/customString19Nav"
}
},
"jobInfoNav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/jobInfoNav"
}
},
"wfRequestNav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/wfRequestNav"
}
},
"costDistributionNav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/costDistributionNav"
}
},
"empPayCompNonRecurringNav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/empPayCompNonRecurringNav"
}
}
},
{
"__metadata": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')",
"type": "EmpEmployment"
},
"personIdExternal": "100003",
"userId": "100003",
"hiringNotCompleted": false,
"isECRecord": true,
"lastModifiedDateTime": "/Date(1638051713000+0000)/",
"endDate": null,
"createdDateTime": "/Date(1638051713000+0000)/",
"createdOn": "/Date(1638051713000)/",
"originalStartDate": "/Date(1635724800000)/",
"customDate1": null,
"customString17": null,
"customString18": null,
"customString19": null,
"assignmentClass": "ST",
"lastModifiedBy": "This Dudette",
"okToRehire": null,
"customString4": null,
"customString3": null,
"customString2": null,
"assignmentIdExternal": "100003",
"customString16": null,
"lastModifiedOn": "/Date(1638051713000)/",
"customString1": null,
"createdBy": "This Dude",
"seniorityDate": "/Date(1635724800000)/",
"startDate": "/Date(1635724800000)/",
"customString16Nav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/customString16Nav"
}
},
"customString1Nav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/customString1Nav"
}
},
"customString18Nav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/customString18Nav"
}
},
"customString3Nav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/customString3Nav"
}
},
"paymentInformationNav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/paymentInformationNav"
}
},
"empJobRelationshipNav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/empJobRelationshipNav"
}
},
"personNav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/personNav"
}
},
"empWorkPermitNav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/empWorkPermitNav"
}
},
"photoNav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/photoNav"
}
},
"compInfoNav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/compInfoNav"
}
},
"userNav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/userNav"
}
},
"customString2Nav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/customString2Nav"
}
},
"customString19Nav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/customString19Nav"
}
},
"jobInfoNav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/jobInfoNav"
}
},
"wfRequestNav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/wfRequestNav"
}
},
"costDistributionNav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/costDistributionNav"
}
},
"empPayCompNonRecurringNav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/empPayCompNonRecurringNav"
}
}
}
]
}
}
I'm at present just looking to pull the userId & startDate from the JSON. I've tried using the explode command as shown in this example.
https://adatis.co.uk/parsing-nested-json-lists-in-databricks-using-python/
But all I seem to be able to do is put the entire result into one column in the dataframe, or if I use the following:
Some help in being pointed in the right direction would be great please. Am I better just building a custom schema and trying to parse the JSON into that?
All I'm looking to do is return the result as per the image, but with each userId and startDate on its own row, since they relate to each other.
Explode the results to get them into rows:
df = spark.read.json("./sample.json", multiLine=True)
df2 = df.withColumn('d', explode(col('d.results')))
df2.select(df2.d.userId, df2.d.startDate).show(10,False)
+--------+---------------------+
|d.userId|d.startDate |
+--------+---------------------+
|60000033|/Date(1659398400000)/|
|100003 |/Date(1635724800000)/|
+--------+---------------------+
You can add as many attributes as required, e.g.:
df.select(explode(col('d.results'))).\
selectExpr("col.userId","col.startDate","col.lastModifiedBy").\
show(10,False)
+--------+---------------------+--------------+
|userId |startDate |lastModifiedBy|
+--------+---------------------+--------------+
|60000033|/Date(1659398400000)/|This Dude |
|100003 |/Date(1635724800000)/|This Dudette |
+--------+---------------------+--------------+

Keyword searching AND Filtering in Elasticsearch

I have to search for keywords in one field and an exact match in a different field. I have tried something but it does not seem to work at all.
I tried giving the full article with the author exactly as I have stored it in AWS Elasticsearch, but it still won't retrieve anything.
query=json.dumps({
"query": {
"bool": {
"must": {
"match": {
"article": "man killed kim jones"
}
},
"filter": {
"term": {
"author": "Barbara Boyer"
}
}
}
}
})
response = requests.get(url-ES-domain/data/_search?",headers=headers,data=(query))
response.json()
Mapping details
{
"mappings": {
"article": {
"full_name": "article",
"mapping": {
"article": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
}
}
}
This is for the keyword in the article. Even if I give the full article as it is in the ES index, it still won't give any hits.
Try it like this:
Mappings
{
"settings": {
"analysis": {
"analyzer": {
"my_analyzer": {
"tokenizer": "my_tokenizer"
}
},
"tokenizer": {
"my_tokenizer": {
"type": "ngram",
"min_gram": 3,
"max_gram": 20,
"token_chars": [
"letter",
"digit"
]
}
},
"normalizer": {
"lc_normalizer": {
"type": "custom",
"filter": ["lowercase"]
}
}
},
"max_ngram_diff": 20
},
"mappings": {
"properties": {
"article": {
"type": "text",
"analyzer": "my_analyzer",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"author": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256,
"normalizer": "lc_normalizer"
}
}
},
"key": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"publication": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"title": {
"type": "text",
"analyzer": "my_analyzer",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
}
}
Query
{
"query": {
"bool": {
"must" : {
"multi_match" : {
"query": "man killed kim jones",
"fields": [ "article", "title" ]
}
},
"filter": {
"term": {
"author.keyword": "Maddie Hanna"
}
}
}
}
}
The above query returns matches, and it returns the document you have added to the index.
When you are searching for a multi-word match, I suggest you use the match_phrase query. By default, elasticsearch will create keyword mapping for the text fields.
Note: You can try these things using Kibana UI provided by the elastic team. It will save a lot of time.

ElasticSearch how to do a sub aggregation in a sum aggregation

Hello I have an index in ElasticSearch with:
Plant, Department, Date, Value
I am trying to do a query in elasticsearch
1) Group by Plant and Date in specific departments and sum Value:
es = Elasticsearch('elasticsearch:9200')
body = Dict({"query": {
"bool": {
"must_not": {
"match": {
"Department": "Indirect*"}}}},
"aggs": {
"group_code": {
"terms": {
"field": "Plant.keyword", "size":10000},
"aggs": {
"group_date": {
"terms": {
"field": "Date"},
"aggs": {
"group_value": {
"sum":{
"field": "Value"}}}}}}}})
2) Group by Plant and Range of Dates, and get avg and median:
es = Elasticsearch('elasticsearch:9200')
body = Dict(
{"query": {
"bool": {
"must_not": {
"match": {
"Department_Substrate": "Indirect*"}}}},
"aggs": {
"group_code": {
"terms": {
"field": "Plant.keyword",
"size": 10000},
"aggs": {
"group_date": {
"range": {
"field": "Date",
"ranges": datelist},
"aggs": {
"Median": {
"percentiles": {
"field": "Value",
"percents": [25]}},
"Mean": {
"avg": {
"field":
"Value}}}}}}}})
It works too, but in this case I didn't do the grouping by plant and date beforehand, so combining both I have something like:
body = Dict({"query": {
"bool": {
"must_not": {
"match": {
"Department_Substrate": "Indirect*"}}}},
"aggs": {
"group_code": {
"terms": {
"field": "Plant.keyword", "size":10000},
"aggs": {
"group_date": {
"terms": {
"field": "Date"},
"aggs": {
"group_value": {
"sum":{
"field": "Value"},
"aggs": {
"group_date": {
"range": {
"field": "Date",
"ranges": datelist},
"aggs": {
"Median": {
"percentiles": {
"field": "Value",
"percents": [25]}},
"Mean": {
"avg": {
"field":
"Value"}}}}}}}}}}}})
res = es.search(index=self.index, doc_type='test', body=body)
I have this:
TransportError: TransportError(500, 'aggregation_initialization_exception', 'Aggregator [group_value] of type [sum] cannot accept sub-aggregations')
So is there a way to do this?
In case it helps, my previous Python code was:
data = test[~test.Department.str.startswith('Indirect')]
group1 = data.groupby(['Plant', 'Date'])['Value'].sum()
group2 = pd.DataFrame(group1.reset_index()).groupby(['Plant', pd.Grouper(key='Date', freq='W')])['Value'].median()
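The error is clear: "Aggregator [group_value] of type [sum] cannot accept sub-aggregations".
A sum aggregation is a metrics aggregation that produces a single value per bucket, so you can't split its result any further with sub-aggregations.
So you should move the sum aggregation so that it is the innermost one.
For example: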
{
"query": {
"bool": {
"must_not": {
"match": {
"Department_Substrate": "Indirect*"
}
}
}
},
"aggs": {
"group_code": {
"terms": {
"field": "Plant.keyword",
"size": 10000
},
"aggs": {
"group_date": {
"terms": {
"field": "Date"
},
"aggs": {
"group_date": {
"range": {
"field": "Date",
"ranges": "sdf"
},
"aggs": {
"Median": {
"percentiles": {
"field": "Value",
"percents": [
25
]
}
},
"aggs": {
"group_value": {
"sum": {
"field": "Value"
}
}
}
}
}
}
}
}
}

Sum for Multiple Ranges on GroupBy Aggregations in Elasticsearch

The following mapping is aggregated on multiple levels on a field grouping documents using another field.
Mapping:
{
'predictions': {
'properties': {
'Company':{'type':'string'},
'TxnsId':{'type':'string'},
'Emp':{'type':'string'},
'Amount':{'type':'float'},
'Cash/online':{'type':'string'},
'items':{'type':'float'},
'timestamp':{'type':'date'}
}
}
}
My requirement is a bit complex. I need to:
For each Emp (Getting the distinct employees)
Check whether it is online or cashed transaction
Group by items with the ranges like 0-10,11-20,21-30....
Sum the Amount
Final Output is like:
>Emp-online-range-Amount
>a-online-(0-10)-1240$
>a-online-(21-30)-3543$
>b-online-(0-10)-2345$
>b-online-(11-20)-3456$
Something like this should do the job:
{
"size": 0,
"aggs": {
"by_emp": {
"terms": {
"field": "Emp"
},
"aggs": {
"cash_online": {
"filters": {
"filters": {
"cashed": {
"term": {
"Cash/online": "cached"
}
},
"online": {
"term": {
"Cash/online": "online"
}
}
}
},
"aggs": {
"ranges": {
"range": {
"field": "items",
"ranges": [
{
"from": 0,
"to": 11
},
{
"from": 11,
"to": 21
},
{
"from": 21,
"to": 31
}
]
},
"aggs": {
"total": {
"sum": {
"field": "Amount"
}
}
}
}
}
}
}
}
}
}

Categories