extract aliases from wikidata dump using python - python

I am trying to extract certain fields about wikidata items from the wikidata dump, but I have a problem with the aliases field for a certain language, my code is based on the code in the following URL how_to_use_a_wikidata_dump, I made my modification, but the aliases field returns empty value:
for record in wikidata(args.dumpfile):
print('i = '+str(i)+' item '+record['id']+' started!'+'\n')
item_id = pydash.get(record, 'id')
item_type = pydash.get(record, 'claims.P31[0].mainsnak.datavalue.value.id')
arabic_label = pydash.get(record, 'labels.ar.value')
english_label = pydash.get(record, 'labels.en.value')
arabic_aliases =pydash.get(record, 'aliases.ar.value')
english_aliases =pydash.get(record, 'aliases.en.value')
arabic_desc = pydash.get(record, 'descriptions.ar.value')
english_desc = pydash.get(record, 'descriptions.en.value')
main_category = pydash.get(record, 'claims.P910[0].mainsnak.datavalue.value.id')
arwiki = pydash.get(record, 'sitelinks.arwiki.title')
arwikiquote = pydash.get(record, 'sitelinks.arwikiquote.title')
enwiki = pydash.get(record, 'sitelinks.enwiki.title')
enwikiquote = pydash.get(record, 'sitelinks.enwiki
quote.title')
The JSON Format for the wikidata item can be found here:
JSON Format
Example JSON RECORD
{
"pageid": 186,
"ns": 0,
"title": "Q60",
"lastrevid": 199780882,
"modified": "2020-02-27T14:37:20Z",
"id": "Q60",
"type": "item",
"aliases": {
"en": [
{
"language": "en",
"value": "NYC"
},
{
"language": "en",
"value": "New York"
}
],
"fr": [
{
"language": "fr",
"value": "New York City"
},
{
"language": "fr",
"value": "NYC"
}
],
"zh-mo": [
{
"language": "zh-mo",
"value": "\u7d10\u7d04\u5e02"
}
]
},
"labels": {
"en": {
"language": "en",
"value": "New York City"
},
"ar": {
"language": "ar",
"value": "\u0645\u062f\u064a\u0646\u0629 \u0646\u064a\u0648 \u064a\u0648\u0631\u0643"
},
"fr": {
"language": "fr",
"value": "New York City"
},
"my": {
"language": "my",
"value": "\u1014\u101a\u1030\u1038\u101a\u1031\u102c\u1000\u103a\u1019\u103c\u102d\u102f\u1037"
},
"ps": {
"language": "ps",
"value": "\u0646\u064a\u0648\u064a\u0627\u0631\u06a9"
}
},
"descriptions": {
"en": {
"language": "en",
"value": "largest city in New York and the United States of America"
},
"it": {
"language": "it",
"value": "citt\u00e0 degli Stati Uniti d'America"
},
"pl": {
"language": "pl",
"value": "miasto w Stanach Zjednoczonych"
},
"ro": {
"language": "ro",
"value": "ora\u015ful cel mai mare din SUA"
}
},
"claims": {
"P1151": [
{
"id": "Q60$6f832804-4c3f-6185-38bd-ca00b8517765",
"mainsnak": {
"snaktype": "value",
"property": "P1151",
"datatype": "wikibase-item",
"datavalue": {
"value": {
"entity-type": "item",
"id": "Q6342720",
"numeric-id": 6342720
},
"type": "wikibase-entityid"
}
},
"type": "statement",
"rank": "normal"
}
],
"P625": [
{
"id": "q60$f00c56de-4bac-e259-b146-254897432868",
"mainsnak": {
"snaktype": "value",
"property": "P625",
"datatype": "globe-coordinate",
"datavalue": {
"value": {
"latitude": 40.67,
"longitude": -73.94,
"altitude": null,
"precision": 0.00027777777777778,
"globe": "http://www.wikidata.org/entity/Q2"
},
"type": "globecoordinate"
}
},
"type": "statement",
"rank": "normal",
"references": [
{
"hash": "7eb64cf9621d34c54fd4bd040ed4b61a88c4a1a0",
"snaks": {
"P143": [
{
"snaktype": "value",
"property": "P143",
"datatype": "wikibase-item",
"datavalue": {
"value": {
"entity-type": "item",
"id": "Q328",
"numeric-id": 328
},
"type": "wikibase-entityid"
}
}
]
},
"snaks-order": [
"P143"
]
}
]
}
],
"P150": [
{
"id": "Q60$bdddaa06-4e4b-f369-8954-2bb010aaa057",
"mainsnak": {
"snaktype": "value",
"property": "P150",
"datatype": "wikibase-item",
"datavalue": {
"value": {
"entity-type": "item",
"id": "Q11299",
"numeric-id": 11299
},
"type": "wikibase-entityid"
}
},
"type": "statement",
"rank": "normal"
},
{
"id": "Q60$0e484d5b-41a5-1594-7ae1-c3768c6206f6",
"mainsnak": {
"snaktype": "value",
"property": "P150",
"datatype": "wikibase-item",
"datavalue": {
"value": {
"entity-type": "item",
"id": "Q18419",
"numeric-id": 18419
},
"type": "wikibase-entityid"
}
},
"type": "statement",
"rank": "normal"
},
{
"id": "Q60$e5000a60-42fc-2aba-f16d-bade1d2e8a58",
"mainsnak": {
"snaktype": "value",
"property": "P150",
"datatype": "wikibase-item",
"datavalue": {
"value": {
"entity-type": "item",
"id": "Q18424",
"numeric-id": 18424
},
"type": "wikibase-entityid"
}
},
"type": "statement",
"rank": "normal"
},
{
"id": "Q60$4d90d6f4-4ab8-26bd-f2a5-4ac2a6eb48cd",
"mainsnak": {
"snaktype": "value",
"property": "P150",
"datatype": "wikibase-item",
"datavalue": {
"value": {
"entity-type": "item",
"id": "Q18426",
"numeric-id": 18426
},
"type": "wikibase-entityid"
}
},
"type": "statement",
"rank": "normal"
},
{
"id": "Q60$ede49e3c-44f6-75a3-eb74-6a89886e30c9",
"mainsnak": {
"snaktype": "value",
"property": "P150",
"datatype": "wikibase-item",
"datavalue": {
"value": {
"entity-type": "item",
"id": "Q18432",
"numeric-id": 18432
},
"type": "wikibase-entityid"
}
},
"type": "statement",
"rank": "normal"
}
],
"P6": [
{
"id": "Q60$5cc8fc79-4807-9800-dbea-fe9c20ab273b",
"mainsnak": {
"snaktype": "value",
"property": "P6",
"datatype": "wikibase-item",
"datavalue": {
"value": {
"entity-type": "item",
"id": "Q4911497",
"numeric-id": 4911497
},
"type": "wikibase-entityid"
}
},
"qualifiers": {
"P580": [
{
"hash": "c53f3ca845b789e543ed45e3e1ecd1dd950e30dc",
"snaktype": "value",
"property": "P580",
"datatype": "time",
"datavalue": {
"value": {
"time": "+00000002014-01-01T00:00:00Z",
"timezone": 0,
"before": 0,
"after": 0,
"precision": 11,
"calendarmodel": "http://www.wikidata.org/entity/Q1985727"
},
"type": "time"
}
}
]
},
"qualifiers-order": [
"P580"
],
"type": "statement",
"rank": "preferred"
},
{
"id": "q60$cad4e313-4b5e-e089-08b9-3b1c7998e762",
"mainsnak": {
"snaktype": "value",
"property": "P6",
"datatype": "wikibase-item",
"datavalue": {
"value": {
"entity-type": "item",
"id": "Q607",
"numeric-id": 607
},
"type": "wikibase-entityid"
}
},
"qualifiers": {
"P580": [
{
"hash": "47c515b79f80e24e03375b327f2ac85184765d5b",
"snaktype": "value",
"property": "P580",
"datatype": "time",
"datavalue": {
"value": {
"time": "+00000002002-01-01T00:00:00Z",
"timezone": 0,
"before": 0,
"after": 0,
"precision": 11,
"calendarmodel": "http://www.wikidata.org/entity/Q1985727"
},
"type": "time"
}
}
],
"P582": [
{
"hash": "1f463f78538c49ef6adf3a9b18e211af7195240a",
"snaktype": "value",
"property": "P582",
"datatype": "time",
"datavalue": {
"value": {
"time": "+00000002013-12-31T00:00:00Z",
"timezone": 0,
"before": 0,
"after": 0,
"precision": 11,
"calendarmodel": "http://www.wikidata.org/entity/Q1985727"
},
"type": "time"
}
}
]
},
"qualifiers-order": [
"P580",
"P582"
]
}
],
"P856": [
{
"id": "Q60$4e3e7a42-4ec4-b7c3-7570-b103eb2bc1ac",
"mainsnak": {
"snaktype": "value",
"property": "P856",
"datatype": "url",
"datavalue": {
"value": "http://nyc.gov/",
"type": "string"
}
},
"type": "statement",
"rank": "normal"
}
]
},
"sitelinks": {
"afwiki": {
"site": "afwiki",
"title": "New York Stad",
"badges": []
},
"dewiki": {
"site": "dewiki",
"title": "New York City",
"badges": [
"Q17437798"
]
},
"dewikinews": {
"site": "dewikinews",
"title": "Kategorie:New York",
"badges": []
},
"elwiki": {
"site": "elwiki",
"title": "\u039d\u03ad\u03b1 \u03a5\u03cc\u03c1\u03ba\u03b7",
"badges": []
},
"enwiki": {
"site": "enwiki",
"title": "New York City",
"badges": []
},
"zhwikivoyage": {
"site": "zhwikivoyage",
"title": "\u7d10\u7d04",
"badges": []
},
"zuwiki": {
"site": "zuwiki",
"title": "New York (idolobha)",
"badges": []
}
}
}
The result of this code is :
english_aliases =pydash.get(record, 'aliases.en')
print(type(arabic_aliases))
print(english_aliases)
<class 'list'>
[{'language': 'en', 'value': 'Kingdom of Belgium'}, {'language': 'en', 'value': 'BEL'}, {'language': 'en', 'value': 'be'}, {'language': 'en', 'value': '🇧🇪'}, {'language': 'en', 'value': 'BE'}]

The answer is :
english_aliases= set()
if pydash.has(record, 'aliases.en'):
for itm in pydash.get(record, 'aliases.en'):
english_aliases.add(itm['value'])

Related

Explode json without pandas

I have a JSON object:
{
"data": {
"geography": [
{
"id": "1",
"state": "USA",
"properties": [
{
"code": "CMD-01",
"value": "34"
},
{
"code": "CMD-02",
"value": "24"
}
]
},
{
"id": "2",
"state": "Canada",
"properties": [
{
"code": "CMD-04",
"value": "50"
},
{
"code": "CMD-05",
"value": "60"
}
]
}
]
}
}
I want to get the result as a new JSON, but without using pandas (and all those explode, flatten and normalize functions...). Is there any option to get this structure without using pandas or having an Out of memory issue?
The output should be:
{ "id": "1",
"state": "USA",
"code": "CMD-01",
"value": "34"
},
{ "id": "1",
"state": "USA",
"code": "CMD-02",
"value": "24",
},
{ "id": "2",
"state": "Canada",
"code": "CMD-04",
"value": "50"
},
{ "id": "2",
"state": "Canada",
"code": "CMD-05",
"value": "60"
},
You can simply loop over the list associated with "geography" and build new dictionaries that you will add to a newly created list:
dict_in = {
"data": {
"geography": [
{
"id": "1",
"state": "USA",
"properties": [
{
"code": "CMD-01",
"value": "34"
},
{
"code": "CMD-02",
"value": "24"
}
]
},
{
"id": "2",
"state": "Canada",
"properties": [
{
"code": "CMD-04",
"value": "50"
},
{
"code": "CMD-05",
"value": "60"
}
]
}
]
}
}
import json
rec_out = []
for obj in dict_in["data"]["geography"]:
for prop in obj["properties"]:
dict_out = {
"id": obj["id"],
"state": obj["state"]
}
dict_out.update(prop)
rec_out.append(dict_out)
print(json.dumps(rec_out, indent=4))
Output:
[
{
"id": "1",
"state": "USA",
"code": "CMD-01",
"value": "34"
},
{
"id": "1",
"state": "USA",
"code": "CMD-02",
"value": "24"
},
{
"id": "2",
"state": "Canada",
"code": "CMD-04",
"value": "50"
},
{
"id": "2",
"state": "Canada",
"code": "CMD-05",
"value": "60"
}
]

Compare keys value on two list of dict

I'm trying to compare the contents of a dictionary from 2 lists of dict, I have data like this
data1 = [
{
"name": "MoreDependentsInd",
"description": "Form 1040-SP Indicator",
"data_type": "CheckboxType",
"xpath": "",
},
{
"name": "DependentFirstNm",
"description": "Form 1040-SR Indicator",
"data_type": "PersonNameControlType",
"xpath": "",
},
]
data2 = [
{
"id": 29,
"kind": "IN",
"container": 1,
"content": "null",
"uid": "IRS 1040-DependentDetail",
"title": "DependentDetail",
"display_name": "null",
"description": "null",
"extra_info": {
"kind": "null",
"include": {
"fields": [
{
"kind": "PersonFirstNameType",
"name": "DependentFirstNm",
"annotation": {
"documentation": {
"description": "Dependent First Name",
"line_number": "(1)",
}
},
"restriction": "null",
},
{
"kind": "PersonLastNameType",
"name": "DependentLastNm",
"annotation": {
"documentation": {
"description": "Dependent Last Name",
"line_number": "(1)",
}
},
"restriction": "null",
},
{
"kind": "PersonNameControlType",
"name": "DependentNameControlTxt",
"annotation": {
"documentation": {
"description": "Dependent Name Control",
"line_number": "(1)",
}
},
"restriction": "null",
},
{
"kind": "null",
"name": "IdentityProtectionPIN",
"annotation": {
"documentation": {
"description": "Dependent/Qualifying Child Identity Protection PIN",
"line_number": "(2)",
}
},
"restriction": {"rule": "null"},
},
{
"kind": "null",
"name": "DependentRelationshipCd",
"annotation": {
"documentation": {
"description": "Dependent Relationship Code",
"line_number": "(3)",
}
},
"restriction": {
"rule": {
"kind": "TextType",
"enumeration": [
"SON",
"DAUGHTER",
"STEPCHILD",
"FOSTER CHILD",
"BROTHER",
"SISTER",
"STEPBROTHER",
"STEPSISTER",
"HALF BROTHER",
"HALF SISTER",
"GRANDCHILD",
"NIECE",
"NEPHEW",
"PARENT",
"GRANDPARENT",
"AUNT",
"UNCLE",
"OTHER",
"NONE",
],
}
},
},
{
"kind": "PersonFirstNameType",
"name": "DependentFirstNm",
"annotation": {
"documentation": {
"description": "Dependent First Name",
"line_number": "(1)",
}
},
"restriction": "null",
},
{
"kind": "PersonLastNameType",
"name": "DependentLastNm",
"annotation": {
"documentation": {
"description": "Dependent Last Name",
"line_number": "(1)",
}
},
"restriction": "null",
},
{
"kind": "PersonNameControlType",
"name": "DependentNameControlTxt",
"annotation": {
"documentation": {
"description": "Dependent Name Control",
"line_number": "(1)",
}
},
"restriction": "null",
},
{
"kind": "null",
"name": "IdentityProtectionPIN",
"annotation": {
"documentation": {
"description": "Dependent/Qualifying Child Identity Protection PIN",
"line_number": "(2)",
}
},
"restriction": {"rule": "null"},
},
{
"kind": "null",
"name": "DependentRelationshipCd",
"annotation": {
"documentation": {
"description": "Dependent Relationship Code",
"line_number": "(3)",
}
},
"restriction": {
"rule": {
"kind": "TextType",
"enumeration": [
"SON",
"DAUGHTER",
"STEPCHILD",
"FOSTER CHILD",
"BROTHER",
"SISTER",
"STEPBROTHER",
"STEPSISTER",
"HALF BROTHER",
"HALF SISTER",
"GRANDCHILD",
"NIECE",
"NEPHEW",
"PARENT",
"GRANDPARENT",
"AUNT",
"UNCLE",
"OTHER",
"NONE",
],
}
},
},
{
"kind": "PersonFirstNameType",
"name": "DependentFirstNm",
"annotation": {
"documentation": {
"description": "Dependent First Name",
"line_number": "(1)",
}
},
"restriction": "null",
},
{
"kind": "PersonLastNameType",
"name": "DependentLastNm",
"annotation": {
"documentation": {
"description": "Dependent Last Name",
"line_number": "(1)",
}
},
"restriction": "null",
},
{
"kind": "PersonNameControlType",
"name": "DependentNameControlTxt",
"annotation": {
"documentation": {
"description": "Dependent Name Control",
"line_number": "(1)",
}
},
"restriction": "null",
},
{
"kind": "null",
"name": "IdentityProtectionPIN",
"annotation": {
"documentation": {
"description": "Dependent/Qualifying Child Identity Protection PIN",
"line_number": "(2)",
}
},
"restriction": {"rule": "null"},
},
{
"kind": "null",
"name": "DependentRelationshipCd",
"annotation": {
"documentation": {
"description": "Dependent Relationship Code",
"line_number": "(3)",
}
},
"restriction": {
"rule": {
"kind": "TextType",
"enumeration": [
"SON",
"DAUGHTER",
"STEPCHILD",
"FOSTER CHILD",
"BROTHER",
"SISTER",
"STEPBROTHER",
"STEPSISTER",
"HALF BROTHER",
"HALF SISTER",
"GRANDCHILD",
"NIECE",
"NEPHEW",
"PARENT",
"GRANDPARENT",
"AUNT",
"UNCLE",
"OTHER",
"NONE",
],
}
},
},
],
"reference": "null",
},
"annotation": "null",
"max_occurs": "100",
"min_occurs": "0",
"restriction": "null",
},
"instruction": "null",
"created_at": "2022-04-26T09:53:36.426118Z",
"updated_at": "2022-04-26T09:53:36.426203Z",
},
{
"id": 30,
"kind": "IN",
"container": 1,
"content": "null",
"uid": "IRS 1040-MoreDependentsInd",
"title": "MoreDependentsInd",
"display_name": "null",
"description": "null",
"extra_info": {
"kind": "CheckboxType",
"include": "null",
"annotation": {
"documentation": {
"description": "More Dependents Indicator",
"line_number": "null",
}
},
"max_occurs": "null",
"min_occurs": "0",
"restriction": "null",
},
"instruction": "null",
"created_at": "2022-04-26T09:53:36.427552Z",
"updated_at": "2022-04-26T09:53:36.427647Z",
},
]
for i in data1:
for j in data2:
if i["name"] == j["title"]:
print("success")
the result is a success, but in my code, there is a deficiency, if it processes a lot of data then the performance will feel slow, how to fix it? and what if the value such as the data type is in a key that we don't know is located?
for example, I search value data_type from data1 I compare it with CheckboxType
"extra_info": {
"kind": "CheckboxType",
"include": "null",
"annotation": {
"documentation": {
"description": "More Dependents Indicator",
"line_number": "null",
}
},
from data2 on index 1 data2[1] but the value CheckboxType is also exist in another key on data2 in another index
Thanks!
You can do the test in O(n) instead of O(n^2) by building sets of name and title (which is O(n)) and then taking their intersection (which is also O(n) now that they're sets):
>>> {i["name"] for i in data1} & {i["title"] for i in data2}
{'MoreDependentsInd'}
Non-empty sets are truthy so you can use this in a conditional if you don't care about what the intersection is:
>>> if {i["name"] for i in data1} & {i["title"] for i in data2}:
... print("success")
...
success
If you want to look at all string values without having to know that name and title are the intersecting keys, use another generator expression to put all string values into the two sets:
>>> {v for i in data1 for v in i.values() if isinstance(v, str)} & {v for i in data2 for v in i.values() if isinstance(v, str)}
{'MoreDependentsInd'}

Why am I receiving an error when attempting to parse JSON object within for loop?

Everything with my script runs fine until I try to run it through a for loop. Specifically, when I attempt to index a specific array within the object. Before I get to the The script is intended to grab the delivery date for each tracking number in my list.
This is my script:
import requests
import json
TrackList = ['1Z3X756E0310496105','1ZX0373R0303581450','1ZX0373R0103574417']
url = 'https://onlinetools.ups.com/rest/Track'
para1 = '...beginning of JSON request string...'
para2 = '...end of JSON request string...'
for TrackNum in TrackList:
parameters = para1+TrackNum+para2
resp = requests.post(url = url, data = parameters, verify=False)
data = json.loads(resp.text)
DelDate = data['TrackResponse']['Shipment']['Package'][0]['Activity'][0]['Date']
print(DelDate)
JSON API Response (if needed):
{
"TrackResponse": {
"Response": {
"ResponseStatus": {
"Code": "1",
"Description": "Success"
},
"TransactionReference": {
"CustomerContext": "Analytics Inquiry"
}
},
"Shipment": {
"InquiryNumber": {
"Code": "01",
"Description": "ShipmentIdentificationNumber",
"Value": "1ZX0373R0103574417"
},
"Package": {
"Activity": [
{
"ActivityLocation": {
"Address": {
"City": "OKLAHOMA CITY",
"CountryCode": "US",
"PostalCode": "73128",
"StateProvinceCode": "OK"
},
"Code": "M3",
"Description": "Front Desk",
"SignedForByName": "CUMMINGS"
},
"Date": "20190520",
"Status": {
"Code": "9E",
"Description": "Delivered",
"Type": "D"
},
"Time": "091513"
},
{
"ActivityLocation": {
"Address": {
"City": "Oklahoma City",
"CountryCode": "US",
"StateProvinceCode": "OK"
},
"Description": "Front Desk"
},
"Date": "20190520",
"Status": {
"Code": "OT",
"Description": "Out For Delivery Today",
"Type": "I"
},
"Time": "085943"
},
{
"ActivityLocation": {
"Address": {
"City": "Oklahoma City",
"CountryCode": "US",
"StateProvinceCode": "OK"
},
"Description": "Front Desk"
},
"Date": "20190520",
"Status": {
"Code": "DS",
"Description": "Destination Scan",
"Type": "I"
},
"Time": "011819"
},
{
"ActivityLocation": {
"Address": {
"City": "Oklahoma City",
"CountryCode": "US",
"StateProvinceCode": "OK"
},
"Description": "Front Desk"
},
"Date": "20190519",
"Status": {
"Code": "AR",
"Description": "Arrival Scan",
"Type": "I"
},
"Time": "235100"
},
{
"ActivityLocation": {
"Address": {
"City": "DFW Airport",
"CountryCode": "US",
"StateProvinceCode": "TX"
},
"Description": "Front Desk"
},
"Date": "20190519",
"Status": {
"Code": "DP",
"Description": "Departure Scan",
"Type": "I"
},
"Time": "195500"
},
{
"ActivityLocation": {
"Address": {
"City": "DFW Airport",
"CountryCode": "US",
"StateProvinceCode": "TX"
},
"Description": "Front Desk"
},
"Date": "20190517",
"Status": {
"Code": "OR",
"Description": "Origin Scan",
"Type": "I"
},
"Time": "192938"
},
{
"ActivityLocation": {
"Address": {
"CountryCode": "US"
},
"Description": "Front Desk"
},
"Date": "20190517",
"Status": {
"Code": "MP",
"Description": "Order Processed: Ready for UPS",
"Type": "M"
},
"Time": "184621"
}
],
"PackageWeight": {
"UnitOfMeasurement": {
"Code": "LBS"
},
"Weight": "2.00"
},
"ReferenceNumber": [
{
"Code": "01",
"Value": "8472745558"
},
{
"Code": "01",
"Value": "5637807:1007379402:BN81-17077A:1"
},
{
"Code": "01",
"Value": "5637807"
}
],
"TrackingNumber": "1ZX0373R0103574417"
},
"PickupDate": "20190517",
"Service": {
"Code": "001",
"Description": "UPS Next Day Air"
},
"ShipmentAddress": [
{
"Address": {
"AddressLine": "S 600 ROYAL LN",
"City": "COPPELL",
"CountryCode": "US",
"PostalCode": "750193827",
"StateProvinceCode": "TX"
},
"Type": {
"Code": "01",
"Description": "Shipper Address"
}
},
{
"Address": {
"City": "OKLAHOMA CITY",
"CountryCode": "US",
"PostalCode": "73128",
"StateProvinceCode": "OK"
},
"Type": {
"Code": "02",
"Description": "ShipTo Address"
}
}
],
"ShipmentWeight": {
"UnitOfMeasurement": {
"Code": "LBS"
},
"Weight": "2.00"
},
"ShipperNumber": "X0373R"
}
}
}
Below is the error I receive:
Traceback (most recent call last):
File "/Users/***/Library/Preferences/PyCharmCE2019.1/scratches/UPS_API.py", line 15, in <module>
DelDate = data['TrackResponse']['Shipment']['Package'][0]['Activity'][0]['Date']
KeyError: 0
You're trying to index "Package" at index 0, but it's an object not an array. So you should be accessing ['Package']['Activity']
just take away the [0] because there is no [1] or [2]

Build relation between elasticsearch aggregators - Nested groupings

I need to create nested groupings between fields.
Let us consider the example given below,
Documents:
{
"keyword": "abc",
"country": "IN",
"state": "TN",
"city": "Chennai"
},
{
"keyword": "abc",
"country": "IN",
"state": "TN",
"city": "Trichy"
},
{
"keyword": "abc",
"country": "IN",
"state": "KL",
"city": "TVM"
},
{
"keyword": "abc",
"country": "US",
"state": "Cal",
"city": "California"
}
Required output(Something like this):
{
"country": "IN",
"TN": [
"Chennai",
"Trichy"
],
"KL": [
"TVM"
]
},
{
"country": "US",
"Cal": [
"California"
]
}
Query used:
{
"from": 0,
"size": 1,
"aggs": {
"country": {
"terms": {
"field": "country.keyword",
"size": 50000
}
},
"state": {
"terms": {
"field": "state.keyword",
"size": 50000
}
},
"city": {
"terms": {
"field": "city.keyword",
"size": 50000
}
}
},
"query": {
"query_string": {
"query": "(keyword:abc) "
}
}
}
For this query I got separate bucket as output for city , state and country.
But what I need is city should be grouped under state and state should be grouped under country.
Thanks in advance.
Following query with aggregation should work for you
{
"query": {
"query_string": {
"query": "(keyword:abc)"
}
},
"size": 0,
"aggs": {
"country_agg": {
"terms": {
"field": "country.keyword",
"size": 10
},
"aggs": {
"state_agg": {
"terms": {
"field": "state.keyword",
"size": 10
},
"aggs": {
"city_agg": {
"terms": {
"field": "city.keyword",
"size": 10
}
}
}
}
}
}
}
}

"object mapping [prices] can't be changed from nested to non-nested" on Bulk Python

I'm trying to insert a doc in ElasticSearch but every time i try to insert in python, its return me an error. But if i try to insert from Kibana or cUrl, its succeed.
I already tried the elasticserach-dsl but i've got the same error.
(Sorry for my bad english, i'm from brazil :D)
Error i've got:
elasticsearch.helpers.BulkIndexError: ((...)'status': 400, 'error': {'type':
'illegal_argument_exception', 'reason': "object mapping [prices] can't be changed from nested to non-nested"}}}])
My code:
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
doc = [{
"_index": "products",
"_type": "test_products",
"_source": {
[...]
"prices": {
"latest": {
"value": 89,
"when": 1502795602848
},
"old": [
{
"value": 0,
"when": 1502795602848
}
]
},
"sizes": [
{
"name": "P",
"available": True
},
{
"name": "M",
"available": True
}
],
"created": "2017-08-15T08:13:22.848284"
}
}]
bulk(self.es, doc, index="products")
My ES mapping:
{
"test_products": {
"mappings": {
"products": {
"properties": {
"approved": {
"type": "boolean"
},
"available": {
"type": "boolean"
},
"brand": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"buyClicks": {
"type": "integer"
},
"category": {
"type": "keyword"
},
"code": {
"type": "keyword"
},
"color": {
"type": "nested",
"properties": {
"name": {
"type": "keyword"
},
"value": {
"type": "keyword"
}
}
},
"created": {
"type": "date"
},
"description": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"gender": {
"type": "keyword"
},
"images": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"likes": {
"type": "integer"
},
"link": {
"type": "keyword"
},
"name": {
"type": "text",
"term_vector": "yes",
"analyzer": "nGram_analyzer",
"search_analyzer": "whitespace_analyzer"
},
"prices": {
"type": "nested",
"properties": {
"latest": {
"type": "nested",
"properties": {
"value": {
"type": "long"
},
"when": {
"type": "date",
"format": "dd-MM-yyyy||epoch_millis"
}
}
},
"old": {
"type": "nested",
"properties": {
"value": {
"type": "long"
},
"when": {
"type": "date",
"format": "dd-MM-yyyy||epoch_millis"
}
}
}
}
},
"redirectClicks": {
"type": "integer"
},
"sizes": {
"type": "nested",
"properties": {
"available": {
"type": "boolean"
},
"name": {
"type": "keyword"
},
"quantity": {
"type": "integer"
}
}
},
"slug": {
"type": "keyword"
},
"store": {
"type": "keyword"
},
"subCategories": {
"type": "nested",
"properties": {
"name": {
"type": "keyword"
},
"value": {
"type": "keyword"
}
}
},
"tags": {
"type": "text",
"fields": {
"raw": {
"type": "text",
"term_vector": "yes",
"analyzer": "nGram_analyzer",
"search_analyzer": "whitespace_analyzer"
}
}
},
"thumbnails": {
"type": "keyword"
}
}
}
}
}
}

Categories