searching only digits in a mixed field (elasticsearch) - python

I have a field with phone numbers in one of two formats, XXX-XXX-XXXX or XXXXXXXXXX (it's a merged table).
I want to be able to search for XXXXXXXXXX and get results from both formats.
I tried using the decimal_digit filter, but it didn't work.
Here are the settings I have tried:
mapping = {
    'mappings': {
        DOC_TYPE: {
            'properties': {
                'first_name': {'type': 'text', 'analyzer': 'word_splitter'},
                'last_name': {'type': 'text', 'analyzer': 'word_splitter'},
                'email': {'type': 'text', 'analyzer': 'email'},
                'gender': {'type': 'text'},
                'ip_address': {'type': 'text'},
                'language': {'type': 'text'},
                'phone': {'type': 'text', 'analyzer': 'digits'},
                'id': {'type': 'long'}
            }
        }
    },
    'settings': {
        'analysis': {
            'analyzer': {
                'my_analyzer': {'type': 'whitespace'},
                'better': {'type': 'standard'},
                'word_splitter': {
                    'type': 'custom',
                    'tokenizer': 'nGram',
                    'min_gram': 5,
                    'max_gram': 5,
                    'filter': ['lowercase']
                },
                'email': {
                    'type': 'custom',
                    'tokenizer': 'uax_url_email'
                },
                'digits': {
                    'type': 'custom',
                    'tokenizer': 'whitespace',
                    'filter': ['decimal_digit']
                }
            }
        }
    }
}
Any ideas?

Use a char_filter to remove the hyphens before indexing. As a simple example:
Set up the custom analyzer and apply it to the phone field.
PUT my_index
{
  "settings": {
    "analysis": {
      "analyzer": {
        "phone_analyzer": {
          "tokenizer": "standard",
          "char_filter": ["phone_char_filter"]
        }
      },
      "char_filter": {
        "phone_char_filter": {
          "type": "mapping",
          "mappings": ["- => "]
        }
      }
    }
  },
  "mappings": {
    "_doc": {
      "properties": {
        "phone": {
          "type": "text",
          "analyzer": "phone_analyzer"
        }
      }
    }
  }
}
Add some docs:

POST my_index/_doc
{"phone": "123-456-7890"}

POST my_index/_doc
{"phone": "2345678901"}

Search in xxx-xxx-xxxx format:

GET my_index/_search
{
  "query": {
    "match": {
      "phone": "123-456-7890"
    }
  }
}

Search in xxxxxxxxxx format:

GET my_index/_search
{
  "query": {
    "match": {
      "phone": "1234567890"
    }
  }
}
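
Since the question is tagged python, the same index and queries can be driven through the elasticsearch-py client. Here is a minimal sketch under those assumptions (local client setup and the index name my_index are not from the original answer; the settings are the ones above):

from elasticsearch import Elasticsearch

es = Elasticsearch()  # assumes a local node on localhost:9200

# Same settings/mappings as the console example above: the char_filter
# strips hyphens at analysis time, so both input formats produce the
# same token and both are matched by a 10-digit query.
body = {
    "settings": {
        "analysis": {
            "analyzer": {
                "phone_analyzer": {
                    "tokenizer": "standard",
                    "char_filter": ["phone_char_filter"]
                }
            },
            "char_filter": {
                "phone_char_filter": {
                    "type": "mapping",
                    "mappings": ["- => "]
                }
            }
        }
    },
    "mappings": {
        "_doc": {
            "properties": {
                "phone": {"type": "text", "analyzer": "phone_analyzer"}
            }
        }
    }
}

es.indices.create(index="my_index", body=body)
es.index(index="my_index", doc_type="_doc", body={"phone": "123-456-7890"})
es.indices.refresh(index="my_index")
res = es.search(index="my_index",
                body={"query": {"match": {"phone": "1234567890"}}})
print(res["hits"]["total"])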


Pymongo Aggregate with multiple conditions: lookup, unwind, redact, cond, sort and limit

done_status = ['BAD_PU', 'TO_WH', 'RCVDPORT', 'RCVD', 'BAD_DEL', 'MISSFLT', 'OFFLOAD']
shipments = db.db_shipment.aggregate([
    {
        "$lookup": {
            "from": "db_shipment_status_history",
            "localField": "_id",
            "foreignField": "fk_shipment_id",
            "as": "shipment_status_history_collection"
        }
    },
    {"$unwind": "$shipment_status_history_collection"},
    {"$redact": {
        "$cond": {"$if": {"status_value": {"$in": done_status}}},
        "$then": "$$KEEP",
        "$else": "$$PRUNE"
    }},
    {"$sort": {"shipment_status_history_collection.rec_timestamp": -1}},
    {"$limit": 1},
    {"$project": {"pkey": "$pkey", "_code": "$_code"}}
])
error:
pymongo.errors.OperationFailure: An object representing an expression must have exactly one field: { $cond: { $if: { status_value: { $in: [ "BAD_PU", "TO_WH", "RCVDPORT", "RCVD", "BAD_DEL", "MISSFLT", "OFFLOAD" ] } } }, $else: "$$PRUNE", $then: "$$KEEP" }
How do I fix this error? I'm trying to attach the latest shipment status history to each shipment record, where the status value is in the given status list.
Update the $redact stage in your aggregation pipeline: if, then and else are part of the $cond operator's argument, not operators in themselves, so they are written without a $ prefix.
Also, the $in aggregation operator is passed a two-element array: the first item is checked for presence in the second, which must resolve to an array. A corrected stage is sketched below.
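For reference, a corrected $redact stage following those two rules might look like this (a sketch; it keeps the unwound history entries whose status is in done_status):

redact_stage = {
    "$redact": {
        "$cond": {
            # if/then/else are keys of $cond, not operators: no $ prefix
            "if": {"$in": ["$shipment_status_history_collection.status_value", done_status]},
            "then": "$$KEEP",
            "else": "$$PRUNE"
        }
    }
}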
Mongo 3.6
messenger_pipeline_status = (
    messenger_active_status['data']['pending']
    + messenger_active_status['data']['processing']
)
assigned_status = ['DEL_ASSIGNED', 'PU_ASSIGNED']
subpipeline = [
    {
        '$match': {
            '$expr': {
                '$and': [
                    {'$eq': ['$fk_shipment_id', '$$pkey']},
                    {'$eq': ['$fk_messenger_id', fk_user_id]},
                    {'$in': ['$status_value', assigned_status]}
                ]
            }
        }
    },
    {'$sort': {'rec_timestamp': -1}},
    {'$limit': 1},
    {'$project': {'fk_shipment_id': 1}}
]
pipeline = [
    {
        '$match': {
            'status_value': {'$in': messenger_pipeline_status},
            'is_deleted': False,
            'is_postponed': False,
            'is_active': True,
        }
    },
    {
        '$lookup': {
            'from': 'db_shipment_status_history',
            'let': {'pkey': '$pkey'},
            'pipeline': subpipeline,
            'as': 'shipment_status_history'
        }
    },
    {
        '$match': {
            'shipment_status_history': {'$ne': []}
        }
    },
    {'$unwind': '$shipment_status_history'},
    {
        '$project': {
            '_id': 1,
            'pkey': 1,
            '_code': 1,
            'date_created': 1,
            'sender_full_name': '$sender.full_name',
            'sender_raw_address': '$sender.raw_address',
            'sender_formatted_address': '$sender.formatted_address',
            'receiver_full_name': '$receiver.full_name',
            'receiver_raw_address': '$receiver.raw_address',
            'receiver_formatted_address': '$receiver.formatted_address',
            'status_name': 1,
            'team_value': 1,
            'cs_name': 1,
            'fk_messenger_id': '$shipment_status_history.fk_shipment_id'
        }
    }
]
result = db.db_shipment.aggregate(pipeline)
print(list(result))
[Edit] Mongo 3.2
The following aggregation pipeline produces similar results to the one above and is a valid query for Mongo 3.2.
messenger_pipeline_status = ['MISSFLT', 'OFFLOAD']
pipeline = [
    {
        '$match': {
            'status_value': {'$in': messenger_pipeline_status},
            'is_deleted': False,
            'is_postponed': False,
            'is_active': True,
        }
    },
    {
        '$lookup': {
            'from': 'db_shipment_status_history',
            'localField': 'pkey',
            'foreignField': 'fk_shipment_id',
            'as': 'shipment_status_history'
        }
    },
    {
        '$match': {
            'shipment_status_history': {'$ne': []}
        }
    },
    {
        '$project': {
            '_id': 1,
            'pkey': 1,
            '_code': 1,
            'date_created': 1,
            'sender_full_name': '$sender.full_name',
            'sender_raw_address': '$sender.raw_address',
            'sender_formatted_address': '$sender.formatted_address',
            'receiver_full_name': '$receiver.full_name',
            'receiver_raw_address': '$receiver.raw_address',
            'receiver_formatted_address': '$receiver.formatted_address',
            'status_name': 1,
            'team_value': 1,
            'cs_name': 1,
            'shipment_status_history': {
                '$filter': {
                    'input': '$shipment_status_history',
                    'as': 'shipment',
                    'cond': {
                        '$and': [
                            {'$eq': ['$$shipment.fk_shipment_id', fk_user_id]},
                            {'$in': ['$$shipment.status_value', assigned_status]},
                        ]
                    }
                }
            },
        }
    },
    {'$unwind': '$shipment_status_history'},
    {
        '$sort': {
            'shipment_status_history.rec_timestamp': -1,
        }
    },
    {
        '$group': {
            '_id': '$pkey',
            'doc': {'$first': '$$CURRENT'}
        }
    },
    {'$unwind': '$doc'},
    {   # last projection, I promise
        '$project': {
            '_id': '$doc._id',
            'pkey': '$doc.pkey',
            '_code': '$doc._code',
            'date_created': '$doc.date_created',
            'sender_full_name': '$doc.sender_full_name',
            'sender_raw_address': '$doc.sender_raw_address',
            'sender_formatted_address': '$doc.sender_formatted_address',
            'receiver_full_name': '$doc.receiver_full_name',
            'receiver_raw_address': '$doc.receiver_raw_address',
            'receiver_formatted_address': '$doc.receiver_formatted_address',
            'status_name': '$doc.status_name',
            'team_value': '$doc.team_value',
            'cs_name': '$doc.cs_name',
            'fk_messenger_id': '$doc.shipment_status_history.fk_shipment_id'
        }
    },
]
res = db.db_shipment.aggregate(pipeline)

Elastic Search: Mapper for [_all] conflicts with existing mapping in other types

I am trying to create an edge ngram index and map the document. I am using the Python Elasticsearch API to do this.
This is my index body.
body = {
    "settings": {
        "analysis": {
            "filter": {
                "ngram_filter": {
                    "type": "edge_ngram",
                    "min_gram": 3,
                    "max_gram": 20
                }
            },
            "analyzer": {
                "ngram_analyzer": {
                    "type": "custom",
                    "tokenizer": "standard",
                    "filter": ["lowercase", "ngram_filter"]
                }
            }
        }
    }
}
And this is my mapping of a document.
user_mapping = {
    "_all": {
        "type": "string",
        "analyzer": "ngram_analyzer",
        "search_analyzer": "standard"
    },
    "properties": {
        "id": {"type": "long", "include_in_all": False},
        "uuid": {"type": "string", "include_in_all": False},
        "email": {"type": "string", "include_in_all": True, "term_vector": "yes"},
        "screen_name": {"type": "string", "include_in_all": True, "term_vector": "yes"},
        "display_name": {"type": "string", "include_in_all": True, "term_vector": "yes"},
        "first_name": {"type": "string", "include_in_all": True, "term_vector": "yes"},
        "last_name": {"type": "string", "include_in_all": True, "term_vector": "yes"},
        "lat": {"type": "long", "include_in_all": False},
        "lon": {"type": "long", "include_in_all": False},
        "public_status": {"type": "string", "include_in_all": True, "term_vector": "yes"},
        "score": {"type": "long", "include_in_all": False},
        "groups": {"type": "string", "include_in_all": False},
        "created_at": {"type": "date", "include_in_all": False}
    }
}
When I try to apply this mapping, I get the exception below. I am not sure what is wrong with my mapping.
{u'status': 400, u'error': {u'root_cause': [{u'reason': u'Mapper for [_all] conflicts with existing mapping in other types:\n[mapper [_all] has different [analyzer], mapper [_all] is used by multiple types. Set update_all_types to true to update [search_analyzer] across all types., mapper [_all] is used by multiple types. Set update_all_types to true to update [search_quote_analyzer] across all types.]', u'type': u'illegal_argument_exception'}], u'type': u'illegal_argument_exception', u'reason': u'Mapper for [_all] conflicts with existing mapping in other types:\n[mapper [_all] has different [analyzer], mapper [_all] is used by multiple types. Set update_all_types to true to update [search_analyzer] across all types., mapper [_all] is used by multiple types. Set update_all_types to true to update [search_quote_analyzer] across all types.]'}}
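No fix is shown above, but the error text itself points the way: another type in the same index already maps _all with a different analyzer, and it suggests setting update_all_types to true. One way to sidestep the conflict entirely, if the index can be rebuilt, is to create a fresh index with the settings and the type mapping in a single request. A sketch with elasticsearch-py follows; the index name users and the doc type name user are assumptions, not from the question:

from elasticsearch import Elasticsearch

es = Elasticsearch()

# Put the analysis settings and the type mapping into one create call,
# so no pre-existing type can hold a conflicting _all analyzer.
body["mappings"] = {"user": user_mapping}
es.indices.delete(index="users", ignore=404)  # drop the old index if present
es.indices.create(index="users", body=body)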

Finding ElasticSearch records matching empty and null values

I have some elasticsearch records that are stored as either an empty string or a null value. I am trying to develop a query that will return these from the index. I came up with:
{
    'query': {
        'filtered': {
            'filter': {
                'bool': {
                    'should': [
                        {'term': {'field1': ''}},
                        {'missing': {'field': 'field1'}},
                    ],
                }
            }
        }
    }
}
This works as intended for my purpose and returns the correct row. However, if I try to search on more than a single field, the should clause ORs the clauses for both fields together. This is a problem, because I want an AND relationship between the fields:
{
    'query': {
        'filtered': {
            'filter': {
                'bool': {
                    'should': [
                        {'term': {'field1': ''}},
                        {'missing': {'field': 'field1'}},
                        # these fields should be AND but are OR
                        {'term': {'field2': ''}},
                        {'missing': {'field': 'field2'}},
                    ],
                }
            }
        }
    }
}
Is there any way I can do the above with a single filter, or AND the two filters together?
You could use the and filter for that purpose, and AND the two bool/should filters, like this:
{
  "query": {
    "filtered": {
      "filter": {
        "and": [
          {
            "bool": {
              "should": [
                {"term": {"field1": ""}},
                {"missing": {"field": "field1"}}
              ]
            }
          },
          {
            "bool": {
              "should": [
                {"term": {"field2": ""}},
                {"missing": {"field": "field2"}}
              ]
            }
          }
        ]
      }
    }
  }
}
Or you can also combine two or filters with a bool/must, like this:
{
  "query": {
    "filtered": {
      "filter": {
        "bool": {
          "must": [
            {
              "or": [
                {"term": {"field1": ""}},
                {"missing": {"field": "field1"}}
              ]
            },
            {
              "or": [
                {"term": {"field2": ""}},
                {"missing": {"field": "field2"}}
              ]
            }
          ]
        }
      }
    }
  }
}
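
For reference, the filtered query and the and, or and missing filters were deprecated in Elasticsearch 2.x and removed in 5.0. On current versions the same AND-of-ORs shape can be written, as a sketch in the Python dict style of the question, with bool/must wrapping two bool/should clauses, and must_not plus exists standing in for missing:

query = {
    'query': {
        'bool': {
            'must': [
                # field1 is empty OR field1 is missing
                {'bool': {'should': [
                    {'term': {'field1': ''}},
                    {'bool': {'must_not': {'exists': {'field': 'field1'}}}},
                ]}},
                # AND: field2 is empty OR field2 is missing
                {'bool': {'should': [
                    {'term': {'field2': ''}},
                    {'bool': {'must_not': {'exists': {'field': 'field2'}}}},
                ]}},
            ]
        }
    }
}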

str to dict in python, but maintain the sequence of json attributes

I've tried ast.literal_eval and json.loads, but neither maintains the sequence of JSON attributes when a string is provided. Please see the following example.
String before providing it to json.loads:
{
  "type": "array",
  "properties": {
    "name": {"type": "string"},
    "i": {"type": "integer"},
    "strList": {
      "type": "array",
      "items": {"type": "string"}
    },
    "strMap": {"type": "object"},
    "p2": {
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "name": {"type": "string"},
          "i": {"type": "integer"},
          "p3": {
            "type": "object",
            "properties": {
              "name": {"type": "string"},
              "i": {"type": "integer"},
              "p4": {
                "type": "object",
                "properties": {
                  "name": {"type": "string"},
                  "i": {"type": "integer"}
                }
              }
            }
          }
        }
      }
    },
    "p3": {
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "name": {"type": "string"},
          "i": {"type": "integer"},
          "p4": {
            "type": "object",
            "properties": {
              "name": {"type": "string"},
              "i": {"type": "integer"}
            }
          }
        }
      }
    },
    "b": {
      "type": "boolean",
      "required": true
    }
  },
  "classnames": {
    "rootNode": {"classname": "com.agent.Person"},
    "p2": {
      "classname": "com.agent.Person2",
      "p3": {
        "classname": "com.agent.Person3",
        "p4": {"classname": "com.agent.Person4"}
      }
    },
    "p3": {
      "classname": "com.agent.Person3",
      "p4": {"classname": "com.agent.Person4"}
    }
  }
}
The resulting dictionary after providing it to json.loads:
{
    'classnames': {
        'p2': {
            'classname': 'com.agent.Person2',
            'p3': {
                'classname': 'com.agent.Person3',
                'p4': {'classname': 'com.agent.Person4'}
            }
        },
        'p3': {
            'classname': 'com.agent.Person3',
            'p4': {'classname': 'com.agent.Person4'}
        },
        'rootNode': {'classname': 'com.agent.Person'}
    },
    'properties': {
        'b': {'required': True, 'type': 'boolean'},
        'i': {'type': 'integer'},
        'name': {'type': 'string'},
        'p2': {
            'items': {
                'properties': {
                    'i': {'type': 'integer'},
                    'name': {'type': 'string'},
                    'p3': {
                        'properties': {
                            'i': {'type': 'integer'},
                            'name': {'type': 'string'},
                            'p4': {
                                'properties': {
                                    'i': {'type': 'integer'},
                                    'name': {'type': 'string'}
                                },
                                'type': 'object'
                            }
                        },
                        'type': 'object'
                    }
                },
                'type': 'object'
            },
            'type': 'array'
        },
        'p3': {
            'items': {
                'properties': {
                    'i': {'type': 'integer'},
                    'name': {'type': 'string'},
                    'p4': {
                        'properties': {
                            'i': {'type': 'integer'},
                            'name': {'type': 'string'}
                        },
                        'type': 'object'
                    }
                },
                'type': 'object'
            },
            'type': 'array'
        },
        'strList': {
            'items': {'type': 'string'},
            'type': 'array'
        },
        'strMap': {'type': 'object'}
    },
    'type': 'array'
}
Can anyone please suggest an alternative, or something in Python, which keeps the sequence of attributes as they are and converts the string into a Python dictionary?
As tobias_k has said, Python dictionaries are unordered, so you'll lose any order information as soon as you load your data into one.
You can, however, load your JSON string into an OrderedDict:
from collections import OrderedDict
import json
json.loads(your_json_string, object_pairs_hook=OrderedDict)
This method is mentioned in the json module documentation.
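
As a quick check, the key order survives a round trip. This is a sketch; the short your_json_string below just stands in for the schema above. Note also that on Python 3.7+, plain dicts preserve insertion order, so json.loads alone is enough there:

from collections import OrderedDict
import json

# Stand-in for the schema string in the question (an assumption).
your_json_string = '{"type": "array", "properties": {"name": {"type": "string"}}}'

data = json.loads(your_json_string, object_pairs_hook=OrderedDict)
print(list(data.keys()))  # ['type', 'properties'] -- original order kept
print(json.dumps(data))   # serializes back in the same order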

ordering json in python mapping object

I am using elasticsearch, where the query is to be posted as JSON and should be in a standard order, or else the result will be wrong. The problem is that Python is changing my JSON ordering. My original JSON query is:
x = {
    "query": {
        "filtered": {
            "query": {
                "query_string": {"query": "*a*"}
            },
            "filter": {
                "and": {
                    "filters": [
                        {"term": {"city": "london"}},
                        {"term": {"industry.industry_not_analyed": "oil"}}
                    ]
                }
            }
        }
    },
    "facets": {
        "industry": {
            "terms": {"field": "industry.industry_not_analyed"}
        },
        "city": {
            "terms": {"field": "city.city_not_analyzed"}
        }
    }
}
But the resulting Python object is as follows:
{
    'query': {
        'filtered': {
            'filter': {
                'and': {
                    'filters': [
                        {'term': {'city': 'london'}},
                        {'term': {'industry.industry_not_analyed': 'oil'}}
                    ]
                }
            },
            'query': {
                'query_string': {'query': '*a*'}
            }
        }
    },
    'facets': {
        'city': {
            'terms': {'field': 'city.city_not_analyzed'}
        },
        'industry': {
            'terms': {'field': 'industry.industry_not_analyed'}
        }
    }
}
The result is ordered differently from what I need. How do I solve this?
Use OrderedDict() instead of {}. Note that you can't simply use OrderedDict(query=...) because that would create an unordered dict in the background. Use this code instead:
x = OrderedDict()
x['query'] = OrderedDict()
...
I suggest implementing a builder for this:
x = Query().filtered().query_string("*a*").and()....
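
A minimal sketch of the first few levels built that way (only the standard library is assumed); json.dumps serializes an OrderedDict in insertion order, so the body posted to elasticsearch keeps 'query' ahead of 'facets':

from collections import OrderedDict
import json

x = OrderedDict()
x['query'] = OrderedDict()
x['query']['filtered'] = OrderedDict()
# single-key leaves need no ordering, so plain dicts are fine here
x['query']['filtered']['query'] = {'query_string': {'query': '*a*'}}
x['query']['filtered']['filter'] = OrderedDict()  # build the and/filters part the same way
x['facets'] = OrderedDict()
x['facets']['industry'] = {'terms': {'field': 'industry.industry_not_analyed'}}
x['facets']['city'] = {'terms': {'field': 'city.city_not_analyzed'}}

print(json.dumps(x))  # keys appear exactly in the order they were inserted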
