Build relation between elasticsearch aggregators - Nested groupings

Build relation between elasticsearch aggregators - Nested groupings - python

I need to create nested groupings between fields.
Let us consider the example given below,
Documents:
{
"keyword": "abc",
"country": "IN",
"state": "TN",
"city": "Chennai"
},
{
"keyword": "abc",
"country": "IN",
"state": "TN",
"city": "Trichy"
},
{
"keyword": "abc",
"country": "IN",
"state": "KL",
"city": "TVM"
},
{
"keyword": "abc",
"country": "US",
"state": "Cal",
"city": "California"
}
Required output(Something like this):
{
"country": "IN",
"TN": [
"Chennai",
"Trichy"
],
"KL": [
"TVM"
]
},
{
"country": "US",
"Cal": [
"California"
]
}
Query used:
{
"from": 0,
"size": 1,
"aggs": {
"country": {
"terms": {
"field": "country.keyword",
"size": 50000
}
},
"state": {
"terms": {
"field": "state.keyword",
"size": 50000
}
},
"city": {
"terms": {
"field": "city.keyword",
"size": 50000
}
}
},
"query": {
"query_string": {
"query": "(keyword:abc) "
}
}
}
For this query I got separate bucket as output for city , state and country.
But what I need is city should be grouped under state and state should be grouped under country.
Thanks in advance.

Following query with aggregation should work for you
{
"query": {
"query_string": {
"query": "(keyword:abc)"
}
},
"size": 0,
"aggs": {
"country_agg": {
"terms": {
"field": "country.keyword",
"size": 10
},
"aggs": {
"state_agg": {
"terms": {
"field": "state.keyword",
"size": 10
},
"aggs": {
"city_agg": {
"terms": {
"field": "city.keyword",
"size": 10
}
}
}
}
}
}
}
}

Related

Explode json without pandas

I have a JSON object:
{
"data": {
"geography": [
{
"id": "1",
"state": "USA",
"properties": [
{
"code": "CMD-01",
"value": "34"
},
{
"code": "CMD-02",
"value": "24"
}
]
},
{
"id": "2",
"state": "Canada",
"properties": [
{
"code": "CMD-04",
"value": "50"
},
{
"code": "CMD-05",
"value": "60"
}
]
}
]
}
}
I want to get the result as a new JSON, but without using pandas (and all those explode, flatten and normalize functions...). Is there any option to get this structure without using pandas or having an Out of memory issue?
The output should be:
{ "id": "1",
"state": "USA",
"code": "CMD-01",
"value": "34"
},
{ "id": "1",
"state": "USA",
"code": "CMD-02",
"value": "24",
},
{ "id": "2",
"state": "Canada",
"code": "CMD-04",
"value": "50"
},
{ "id": "2",
"state": "Canada",
"code": "CMD-05",
"value": "60"
},

You can simply loop over the list associated with "geography" and build new dictionaries that you will add to a newly created list:
dict_in = {
"data": {
"geography": [
{
"id": "1",
"state": "USA",
"properties": [
{
"code": "CMD-01",
"value": "34"
},
{
"code": "CMD-02",
"value": "24"
}
]
},
{
"id": "2",
"state": "Canada",
"properties": [
{
"code": "CMD-04",
"value": "50"
},
{
"code": "CMD-05",
"value": "60"
}
]
}
]
}
}
import json
rec_out = []
for obj in dict_in["data"]["geography"]:
for prop in obj["properties"]:
dict_out = {
"id": obj["id"],
"state": obj["state"]
}
dict_out.update(prop)
rec_out.append(dict_out)
print(json.dumps(rec_out, indent=4))
Output:
[
{
"id": "1",
"state": "USA",
"code": "CMD-01",
"value": "34"
},
{
"id": "1",
"state": "USA",
"code": "CMD-02",
"value": "24"
},
{
"id": "2",
"state": "Canada",
"code": "CMD-04",
"value": "50"
},
{
"id": "2",
"state": "Canada",
"code": "CMD-05",
"value": "60"
}
]

extract aliases from wikidata dump using python

I am trying to extract certain fields about wikidata items from the wikidata dump, but I have a problem with the aliases field for a certain language, my code is based on the code in the following URL how_to_use_a_wikidata_dump, I made my modification, but the aliases field returns empty value:
for record in wikidata(args.dumpfile):
print('i = '+str(i)+' item '+record['id']+' started!'+'\n')
item_id = pydash.get(record, 'id')
item_type = pydash.get(record, 'claims.P31[0].mainsnak.datavalue.value.id')
arabic_label = pydash.get(record, 'labels.ar.value')
english_label = pydash.get(record, 'labels.en.value')
arabic_aliases =pydash.get(record, 'aliases.ar.value')
english_aliases =pydash.get(record, 'aliases.en.value')
arabic_desc = pydash.get(record, 'descriptions.ar.value')
english_desc = pydash.get(record, 'descriptions.en.value')
main_category = pydash.get(record, 'claims.P910[0].mainsnak.datavalue.value.id')
arwiki = pydash.get(record, 'sitelinks.arwiki.title')
arwikiquote = pydash.get(record, 'sitelinks.arwikiquote.title')
enwiki = pydash.get(record, 'sitelinks.enwiki.title')
enwikiquote = pydash.get(record, 'sitelinks.enwiki
quote.title')
The JSON Format for the wikidata item can be found here:
JSON Format
Example JSON RECORD
{
"pageid": 186,
"ns": 0,
"title": "Q60",
"lastrevid": 199780882,
"modified": "2020-02-27T14:37:20Z",
"id": "Q60",
"type": "item",
"aliases": {
"en": [
{
"language": "en",
"value": "NYC"
},
{
"language": "en",
"value": "New York"
}
],
"fr": [
{
"language": "fr",
"value": "New York City"
},
{
"language": "fr",
"value": "NYC"
}
],
"zh-mo": [
{
"language": "zh-mo",
"value": "\u7d10\u7d04\u5e02"
}
]
},
"labels": {
"en": {
"language": "en",
"value": "New York City"
},
"ar": {
"language": "ar",
"value": "\u0645\u062f\u064a\u0646\u0629 \u0646\u064a\u0648 \u064a\u0648\u0631\u0643"
},
"fr": {
"language": "fr",
"value": "New York City"
},
"my": {
"language": "my",
"value": "\u1014\u101a\u1030\u1038\u101a\u1031\u102c\u1000\u103a\u1019\u103c\u102d\u102f\u1037"
},
"ps": {
"language": "ps",
"value": "\u0646\u064a\u0648\u064a\u0627\u0631\u06a9"
}
},
"descriptions": {
"en": {
"language": "en",
"value": "largest city in New York and the United States of America"
},
"it": {
"language": "it",
"value": "citt\u00e0 degli Stati Uniti d'America"
},
"pl": {
"language": "pl",
"value": "miasto w Stanach Zjednoczonych"
},
"ro": {
"language": "ro",
"value": "ora\u015ful cel mai mare din SUA"
}
},
"claims": {
"P1151": [
{
"id": "Q60$6f832804-4c3f-6185-38bd-ca00b8517765",
"mainsnak": {
"snaktype": "value",
"property": "P1151",
"datatype": "wikibase-item",
"datavalue": {
"value": {
"entity-type": "item",
"id": "Q6342720",
"numeric-id": 6342720
},
"type": "wikibase-entityid"
}
},
"type": "statement",
"rank": "normal"
}
],
"P625": [
{
"id": "q60$f00c56de-4bac-e259-b146-254897432868",
"mainsnak": {
"snaktype": "value",
"property": "P625",
"datatype": "globe-coordinate",
"datavalue": {
"value": {
"latitude": 40.67,
"longitude": -73.94,
"altitude": null,
"precision": 0.00027777777777778,
"globe": "http://www.wikidata.org/entity/Q2"
},
"type": "globecoordinate"
}
},
"type": "statement",
"rank": "normal",
"references": [
{
"hash": "7eb64cf9621d34c54fd4bd040ed4b61a88c4a1a0",
"snaks": {
"P143": [
{
"snaktype": "value",
"property": "P143",
"datatype": "wikibase-item",
"datavalue": {
"value": {
"entity-type": "item",
"id": "Q328",
"numeric-id": 328
},
"type": "wikibase-entityid"
}
}
]
},
"snaks-order": [
"P143"
]
}
]
}
],
"P150": [
{
"id": "Q60$bdddaa06-4e4b-f369-8954-2bb010aaa057",
"mainsnak": {
"snaktype": "value",
"property": "P150",
"datatype": "wikibase-item",
"datavalue": {
"value": {
"entity-type": "item",
"id": "Q11299",
"numeric-id": 11299
},
"type": "wikibase-entityid"
}
},
"type": "statement",
"rank": "normal"
},
{
"id": "Q60$0e484d5b-41a5-1594-7ae1-c3768c6206f6",
"mainsnak": {
"snaktype": "value",
"property": "P150",
"datatype": "wikibase-item",
"datavalue": {
"value": {
"entity-type": "item",
"id": "Q18419",
"numeric-id": 18419
},
"type": "wikibase-entityid"
}
},
"type": "statement",
"rank": "normal"
},
{
"id": "Q60$e5000a60-42fc-2aba-f16d-bade1d2e8a58",
"mainsnak": {
"snaktype": "value",
"property": "P150",
"datatype": "wikibase-item",
"datavalue": {
"value": {
"entity-type": "item",
"id": "Q18424",
"numeric-id": 18424
},
"type": "wikibase-entityid"
}
},
"type": "statement",
"rank": "normal"
},
{
"id": "Q60$4d90d6f4-4ab8-26bd-f2a5-4ac2a6eb48cd",
"mainsnak": {
"snaktype": "value",
"property": "P150",
"datatype": "wikibase-item",
"datavalue": {
"value": {
"entity-type": "item",
"id": "Q18426",
"numeric-id": 18426
},
"type": "wikibase-entityid"
}
},
"type": "statement",
"rank": "normal"
},
{
"id": "Q60$ede49e3c-44f6-75a3-eb74-6a89886e30c9",
"mainsnak": {
"snaktype": "value",
"property": "P150",
"datatype": "wikibase-item",
"datavalue": {
"value": {
"entity-type": "item",
"id": "Q18432",
"numeric-id": 18432
},
"type": "wikibase-entityid"
}
},
"type": "statement",
"rank": "normal"
}
],
"P6": [
{
"id": "Q60$5cc8fc79-4807-9800-dbea-fe9c20ab273b",
"mainsnak": {
"snaktype": "value",
"property": "P6",
"datatype": "wikibase-item",
"datavalue": {
"value": {
"entity-type": "item",
"id": "Q4911497",
"numeric-id": 4911497
},
"type": "wikibase-entityid"
}
},
"qualifiers": {
"P580": [
{
"hash": "c53f3ca845b789e543ed45e3e1ecd1dd950e30dc",
"snaktype": "value",
"property": "P580",
"datatype": "time",
"datavalue": {
"value": {
"time": "+00000002014-01-01T00:00:00Z",
"timezone": 0,
"before": 0,
"after": 0,
"precision": 11,
"calendarmodel": "http://www.wikidata.org/entity/Q1985727"
},
"type": "time"
}
}
]
},
"qualifiers-order": [
"P580"
],
"type": "statement",
"rank": "preferred"
},
{
"id": "q60$cad4e313-4b5e-e089-08b9-3b1c7998e762",
"mainsnak": {
"snaktype": "value",
"property": "P6",
"datatype": "wikibase-item",
"datavalue": {
"value": {
"entity-type": "item",
"id": "Q607",
"numeric-id": 607
},
"type": "wikibase-entityid"
}
},
"qualifiers": {
"P580": [
{
"hash": "47c515b79f80e24e03375b327f2ac85184765d5b",
"snaktype": "value",
"property": "P580",
"datatype": "time",
"datavalue": {
"value": {
"time": "+00000002002-01-01T00:00:00Z",
"timezone": 0,
"before": 0,
"after": 0,
"precision": 11,
"calendarmodel": "http://www.wikidata.org/entity/Q1985727"
},
"type": "time"
}
}
],
"P582": [
{
"hash": "1f463f78538c49ef6adf3a9b18e211af7195240a",
"snaktype": "value",
"property": "P582",
"datatype": "time",
"datavalue": {
"value": {
"time": "+00000002013-12-31T00:00:00Z",
"timezone": 0,
"before": 0,
"after": 0,
"precision": 11,
"calendarmodel": "http://www.wikidata.org/entity/Q1985727"
},
"type": "time"
}
}
]
},
"qualifiers-order": [
"P580",
"P582"
]
}
],
"P856": [
{
"id": "Q60$4e3e7a42-4ec4-b7c3-7570-b103eb2bc1ac",
"mainsnak": {
"snaktype": "value",
"property": "P856",
"datatype": "url",
"datavalue": {
"value": "http://nyc.gov/",
"type": "string"
}
},
"type": "statement",
"rank": "normal"
}
]
},
"sitelinks": {
"afwiki": {
"site": "afwiki",
"title": "New York Stad",
"badges": []
},
"dewiki": {
"site": "dewiki",
"title": "New York City",
"badges": [
"Q17437798"
]
},
"dewikinews": {
"site": "dewikinews",
"title": "Kategorie:New York",
"badges": []
},
"elwiki": {
"site": "elwiki",
"title": "\u039d\u03ad\u03b1 \u03a5\u03cc\u03c1\u03ba\u03b7",
"badges": []
},
"enwiki": {
"site": "enwiki",
"title": "New York City",
"badges": []
},
"zhwikivoyage": {
"site": "zhwikivoyage",
"title": "\u7d10\u7d04",
"badges": []
},
"zuwiki": {
"site": "zuwiki",
"title": "New York (idolobha)",
"badges": []
}
}
}
The result of this code is :
english_aliases =pydash.get(record, 'aliases.en')
print(type(arabic_aliases))
print(english_aliases)
<class 'list'>
[{'language': 'en', 'value': 'Kingdom of Belgium'}, {'language': 'en', 'value': 'BEL'}, {'language': 'en', 'value': 'be'}, {'language': 'en', 'value': '🇧🇪'}, {'language': 'en', 'value': 'BE'}]

The answer is :
english_aliases= set()
if pydash.has(record, 'aliases.en'):
for itm in pydash.get(record, 'aliases.en'):
english_aliases.add(itm['value'])

Why am I receiving an error when attempting to parse JSON object within for loop?

Everything with my script runs fine until I try to run it through a for loop. Specifically, when I attempt to index a specific array within the object. Before I get to the The script is intended to grab the delivery date for each tracking number in my list.
This is my script:
import requests
import json
TrackList = ['1Z3X756E0310496105','1ZX0373R0303581450','1ZX0373R0103574417']
url = 'https://onlinetools.ups.com/rest/Track'
para1 = '...beginning of JSON request string...'
para2 = '...end of JSON request string...'
for TrackNum in TrackList:
parameters = para1+TrackNum+para2
resp = requests.post(url = url, data = parameters, verify=False)
data = json.loads(resp.text)
DelDate = data['TrackResponse']['Shipment']['Package'][0]['Activity'][0]['Date']
print(DelDate)
JSON API Response (if needed):
{
"TrackResponse": {
"Response": {
"ResponseStatus": {
"Code": "1",
"Description": "Success"
},
"TransactionReference": {
"CustomerContext": "Analytics Inquiry"
}
},
"Shipment": {
"InquiryNumber": {
"Code": "01",
"Description": "ShipmentIdentificationNumber",
"Value": "1ZX0373R0103574417"
},
"Package": {
"Activity": [
{
"ActivityLocation": {
"Address": {
"City": "OKLAHOMA CITY",
"CountryCode": "US",
"PostalCode": "73128",
"StateProvinceCode": "OK"
},
"Code": "M3",
"Description": "Front Desk",
"SignedForByName": "CUMMINGS"
},
"Date": "20190520",
"Status": {
"Code": "9E",
"Description": "Delivered",
"Type": "D"
},
"Time": "091513"
},
{
"ActivityLocation": {
"Address": {
"City": "Oklahoma City",
"CountryCode": "US",
"StateProvinceCode": "OK"
},
"Description": "Front Desk"
},
"Date": "20190520",
"Status": {
"Code": "OT",
"Description": "Out For Delivery Today",
"Type": "I"
},
"Time": "085943"
},
{
"ActivityLocation": {
"Address": {
"City": "Oklahoma City",
"CountryCode": "US",
"StateProvinceCode": "OK"
},
"Description": "Front Desk"
},
"Date": "20190520",
"Status": {
"Code": "DS",
"Description": "Destination Scan",
"Type": "I"
},
"Time": "011819"
},
{
"ActivityLocation": {
"Address": {
"City": "Oklahoma City",
"CountryCode": "US",
"StateProvinceCode": "OK"
},
"Description": "Front Desk"
},
"Date": "20190519",
"Status": {
"Code": "AR",
"Description": "Arrival Scan",
"Type": "I"
},
"Time": "235100"
},
{
"ActivityLocation": {
"Address": {
"City": "DFW Airport",
"CountryCode": "US",
"StateProvinceCode": "TX"
},
"Description": "Front Desk"
},
"Date": "20190519",
"Status": {
"Code": "DP",
"Description": "Departure Scan",
"Type": "I"
},
"Time": "195500"
},
{
"ActivityLocation": {
"Address": {
"City": "DFW Airport",
"CountryCode": "US",
"StateProvinceCode": "TX"
},
"Description": "Front Desk"
},
"Date": "20190517",
"Status": {
"Code": "OR",
"Description": "Origin Scan",
"Type": "I"
},
"Time": "192938"
},
{
"ActivityLocation": {
"Address": {
"CountryCode": "US"
},
"Description": "Front Desk"
},
"Date": "20190517",
"Status": {
"Code": "MP",
"Description": "Order Processed: Ready for UPS",
"Type": "M"
},
"Time": "184621"
}
],
"PackageWeight": {
"UnitOfMeasurement": {
"Code": "LBS"
},
"Weight": "2.00"
},
"ReferenceNumber": [
{
"Code": "01",
"Value": "8472745558"
},
{
"Code": "01",
"Value": "5637807:1007379402:BN81-17077A:1"
},
{
"Code": "01",
"Value": "5637807"
}
],
"TrackingNumber": "1ZX0373R0103574417"
},
"PickupDate": "20190517",
"Service": {
"Code": "001",
"Description": "UPS Next Day Air"
},
"ShipmentAddress": [
{
"Address": {
"AddressLine": "S 600 ROYAL LN",
"City": "COPPELL",
"CountryCode": "US",
"PostalCode": "750193827",
"StateProvinceCode": "TX"
},
"Type": {
"Code": "01",
"Description": "Shipper Address"
}
},
{
"Address": {
"City": "OKLAHOMA CITY",
"CountryCode": "US",
"PostalCode": "73128",
"StateProvinceCode": "OK"
},
"Type": {
"Code": "02",
"Description": "ShipTo Address"
}
}
],
"ShipmentWeight": {
"UnitOfMeasurement": {
"Code": "LBS"
},
"Weight": "2.00"
},
"ShipperNumber": "X0373R"
}
}
}
Below is the error I receive:
Traceback (most recent call last):
File "/Users/***/Library/Preferences/PyCharmCE2019.1/scratches/UPS_API.py", line 15, in <module>
DelDate = data['TrackResponse']['Shipment']['Package'][0]['Activity'][0]['Date']
KeyError: 0

You're trying to index "Package" at index 0, but it's an object not an array. So you should be accessing ['Package']['Activity']

just take away the [0] because there is no [1] or [2]

issue in Elastic Search Term Aggregation

In elastic search aggregation query I need to get all the movies watched by the user who watches the movie "Frozen". This is how my Result source
{
"_index": "user",
"_type": "user",
"_id": "ovUowmUBREWOv-CU-4RT",
"_version": 4,
"_score": 1,
"_source": {
"movies": [
"Angry birds 1",
"PINNOCCHIO",
"Frozen",
"Hotel Transylvania 3"
],
"user_id": 86
}
}
This is the query I'm using.
{
"query": {
"match": {
"movies": "Frozen"
}
},
"size": 0,
"aggregations": {
"movies_like_Frozen": {
"terms": {
"field": "movies",
"min_doc_count": 1
}
}
}
}
The result I got in the bucket is correct, but the movie names are splits by white space like this
"buckets": [
{
"key": "3",
"doc_count": 2
},
{
"key": "hotel",
"doc_count": 2
},
{
"key": "transylvania",
"doc_count": 2
},
{
"key": "1",
"doc_count": 1
},
{
"key": "angry",
"doc_count": 1
},
{
"key": "birds",
"doc_count": 1
}
]
How can I get buckets with "Angry birds 1", "Hotel Transylvania 3" as result.
Please help.

In elasticsearch 6.x, every text field is analyzed implicitly. To override this, you need to create a mapping for text type fields as not_analyzed in an index, then insert documents in it.
In your case,
{
"mappings": {
"user": {
"properties": {
"movies": {
"type": "text",
"index": "not_analyzed",
"fields": {
"keyword": {
"type": "text",
"index": "not_analyzed"
}
}
},
"user_id": {
"type": "long"
}
}
}
}
}
Hope it works.

"object mapping [prices] can't be changed from nested to non-nested" on Bulk Python

I'm trying to insert a doc in ElasticSearch but every time i try to insert in python, its return me an error. But if i try to insert from Kibana or cUrl, its succeed.
I already tried the elasticserach-dsl but i've got the same error.
(Sorry for my bad english, i'm from brazil :D)
Error i've got:
elasticsearch.helpers.BulkIndexError: ((...)'status': 400, 'error': {'type':
'illegal_argument_exception', 'reason': "object mapping [prices] can't be changed from nested to non-nested"}}}])
My code:
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
doc = [{
"_index": "products",
"_type": "test_products",
"_source": {
[...]
"prices": {
"latest": {
"value": 89,
"when": 1502795602848
},
"old": [
{
"value": 0,
"when": 1502795602848
}
]
},
"sizes": [
{
"name": "P",
"available": True
},
{
"name": "M",
"available": True
}
],
"created": "2017-08-15T08:13:22.848284"
}
}]
bulk(self.es, doc, index="products")
My ES mapping:
{
"test_products": {
"mappings": {
"products": {
"properties": {
"approved": {
"type": "boolean"
},
"available": {
"type": "boolean"
},
"brand": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"buyClicks": {
"type": "integer"
},
"category": {
"type": "keyword"
},
"code": {
"type": "keyword"
},
"color": {
"type": "nested",
"properties": {
"name": {
"type": "keyword"
},
"value": {
"type": "keyword"
}
}
},
"created": {
"type": "date"
},
"description": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"gender": {
"type": "keyword"
},
"images": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"likes": {
"type": "integer"
},
"link": {
"type": "keyword"
},
"name": {
"type": "text",
"term_vector": "yes",
"analyzer": "nGram_analyzer",
"search_analyzer": "whitespace_analyzer"
},
"prices": {
"type": "nested",
"properties": {
"latest": {
"type": "nested",
"properties": {
"value": {
"type": "long"
},
"when": {
"type": "date",
"format": "dd-MM-yyyy||epoch_millis"
}
}
},
"old": {
"type": "nested",
"properties": {
"value": {
"type": "long"
},
"when": {
"type": "date",
"format": "dd-MM-yyyy||epoch_millis"
}
}
}
}
},
"redirectClicks": {
"type": "integer"
},
"sizes": {
"type": "nested",
"properties": {
"available": {
"type": "boolean"
},
"name": {
"type": "keyword"
},
"quantity": {
"type": "integer"
}
}
},
"slug": {
"type": "keyword"
},
"store": {
"type": "keyword"
},
"subCategories": {
"type": "nested",
"properties": {
"name": {
"type": "keyword"
},
"value": {
"type": "keyword"
}
}
},
"tags": {
"type": "text",
"fields": {
"raw": {
"type": "text",
"term_vector": "yes",
"analyzer": "nGram_analyzer",
"search_analyzer": "whitespace_analyzer"
}
}
},
"thumbnails": {
"type": "keyword"
}
}
}
}
}
}

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

Build relation between elasticsearch aggregators - Nested groupings - python

Related

Explode json without pandas

extract aliases from wikidata dump using python

Why am I receiving an error when attempting to parse JSON object within for loop?

issue in Elastic Search Term Aggregation

"object mapping [prices] can't be changed from nested to non-nested" on Bulk Python

Categories

Resources