How to bulk index in Elasticsearch using the Python API - python

I am trying to bulk insert a lot of documents into elastic search using the Python API.
import elasticsearch
from elasticsearch.helpers import bulk
from pymongo import MongoClient

es = elasticsearch.Elasticsearch()

def index_collection(db, collection, fields, host='localhost', port=27017):
    conn = MongoClient(host, port)
    coll = conn[db][collection]
    cursor = coll.find({}, fields=fields, timeout=False)
    print "Starting Bulk index of {} documents".format(cursor.count())

    def action_gen():
        """
        Generator to use for bulk inserts
        """
        for n, doc in enumerate(cursor):
            op_dict = {
                '_index': db.lower(),
                '_type': collection,
                '_id': int('0x' + str(doc['_id']), 16),
            }
            doc.pop('_id')
            op_dict['_source'] = doc
            yield op_dict

    res = bulk(es, action_gen(), stats_only=True)
    print res
The documents come from a MongoDB collection and I am using the function above to do the bulk indexing the way explained in the docs.
However, the bulk indexing just fills Elasticsearch with thousands of empty documents. Can anyone tell me what I am doing wrong?

I've never seen the bulk data put together that way, especially what you're doing with "_source". There may be a way to get that to work, I don't know off-hand, but when I tried it I got weird results.
If you look at the bulk API, ES expects a metadata document followed by the document to be indexed, so you need two entries in your bulk data list for each document. So maybe something like:
import elasticsearch
from pymongo import MongoClient

es = elasticsearch.Elasticsearch()

def index_collection(db, collection, fields, host='localhost', port=27017):
    conn = MongoClient(host, port)
    coll = conn[db][collection]
    cursor = coll.find({}, fields=fields, timeout=False)
    print "Starting Bulk index of {} documents".format(cursor.count())

    bulk_data = []
    for n, doc in enumerate(cursor):
        # metadata entry first (wrapped in the "index" action), then the document itself
        bulk_data.append({
            'index': {
                '_index': db.lower(),
                '_type': collection,
                '_id': int('0x' + str(doc['_id']), 16),
            }
        })
        bulk_data.append(doc)

    es.bulk(index=db.lower(), body=bulk_data, refresh=True)
I didn't try to run that code, though. Here is a script I know works, that you can play with, if it helps:
from elasticsearch import Elasticsearch

es_client = Elasticsearch(hosts=[{"host": "localhost", "port": 9200}])

index_name = "test_index"

if es_client.indices.exists(index_name):
    print("deleting '%s' index..." % (index_name))
    print(es_client.indices.delete(index=index_name, ignore=[400, 404]))

print("creating '%s' index..." % (index_name))
print(es_client.indices.create(index=index_name))

bulk_data = []
for i in range(4):
    # metadata entry followed by the document body
    bulk_data.append({
        "index": {
            "_index": index_name,
            "_type": 'doc',
            "_id": i
        }
    })
    bulk_data.append({"idx": i})

print("bulk indexing...")
res = es_client.bulk(index=index_name, body=bulk_data, refresh=True)
print(res)

print("results:")
for doc in es_client.search(index=index_name)['hits']['hits']:
    print(doc)

Related

How can I bulk upload JSON records to AWS OpenSearch index using a python client library?

I have a sufficiently large dataset that I would like to bulk index the JSON objects in AWS OpenSearch.
I cannot see how to achieve this using any of: boto3, awswrangler, opensearch-py, elasticsearch, elasticsearch-py.
Is there a way to do this without using a python request (PUT/POST) directly?
Note that this is not for: ElasticSearch, AWS ElasticSearch.
Many thanks!
I finally found a way to do it using opensearch-py, as follows.
First establish the client,
# First fetch credentials from environment defaults
# If you can get this far you probably know how to tailor them
# for your particular situation. Otherwise SO is a safe bet :)
import boto3

credentials = boto3.Session().get_credentials()
region = 'eu-west-2'  # for example

# Now set up the AWS 'Signer'
from opensearchpy import OpenSearch, RequestsHttpConnection, AWSV4SignerAuth
auth = AWSV4SignerAuth(credentials, region)

# And finally the OpenSearch client
host = f"...{region}.es.amazonaws.com"  # fill in your hostname (minus the https://) here
client = OpenSearch(
    hosts=[{'host': host, 'port': 443}],
    http_auth=auth,
    use_ssl=True,
    verify_certs=True,
    connection_class=RequestsHttpConnection
)
Phew! Let's create the data now:
# Spot the deliberate mistake(s) :D
document1 = {
    "title": "Moneyball",
    "director": "Bennett Miller",
    "year": "2011"
}

document2 = {
    "title": "Apollo 13",
    "director": "Richie Cunningham",
    "year": "1994"
}

data = [document1, document2]
TIP! Create the index if you need to -
my_index = 'my_index'

try:
    response = client.indices.create(my_index)
    print('\nCreating index:')
    print(response)
except Exception as e:
    # If, for example, my_index already exists, don't do much!
    print(e)
This is where things go a bit nutty. I hadn't realised that every single bulk action needs an, er, action e.g. "index", "search" etc. - so let's define that now
action = {
    "index": {
        "_index": my_index
    }
}
You can read all about the bulk REST API there.
The next quirk is that the OpenSearch bulk API requires Newline Delimited JSON (see https://www.ndjson.org), which is basically JSON serialized as strings and separated by newlines. Someone wrote on SO that this "bizarre" API looked like one designed by a data scientist - far from taking offence, I think that rocks. (I agree ndjson is weird though.)
Hideously, now let's build up the full JSON string, combining the data and actions. A helper fn is at hand!
import json

def payload_constructor(data, action):
    # "All my own work"
    action_string = json.dumps(action) + "\n"

    payload_string = ""
    for datum in data:
        payload_string += action_string
        this_line = json.dumps(datum) + "\n"
        payload_string += this_line
    return payload_string
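To make the NDJSON shape concrete, here is roughly what payload_constructor(data, action) produces for the two sample documents above - one action line followed by one document line per record:

{"index": {"_index": "my_index"}}
{"title": "Moneyball", "director": "Bennett Miller", "year": "2011"}
{"index": {"_index": "my_index"}}
{"title": "Apollo 13", "director": "Richie Cunningham", "year": "1994"}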
OK so now we can finally invoke the bulk API. I suppose you could mix in all sorts of actions (out of scope here) - go for it!
response = client.bulk(body=payload_constructor(data, action), index=my_index)
That's probably the most boring punchline ever but there you have it.
You can also (geddit) get .bulk() to use just index= and set the action to:
action={"index": {}}
Hey presto!
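As a rough sketch of that variant (reusing client, data, my_index and payload_constructor from above), the call and a basic check of the result would be something like:

# Sketch only: with an empty action, the index= argument supplies the target index.
response = client.bulk(body=payload_constructor(data, {"index": {}}), index=my_index)
print(response.get("errors"))          # False means every item was indexed OK
print(len(response.get("items", [])))  # one entry per document sent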
Now, choose your poison - the other solution looks crazily shorter and neater.
PS The well-hidden opensearch-py documentation on this is located here.
import awswrangler as wr

# Note: this is excerpted from a class, hence the self.* references.
conn = wr.opensearch.connect(
    host=self.hosts,  # URL
    port=443,
    username=self.username,
    password=self.password
)

def insert_index_data(data, index_name='stocks', delete_index_data=False):
    """ Bulk Create
    args: body [{doc1}{doc2}....]
    """
    if delete_index_data:
        index_name = 'symbol'
        self.delete_es_index(index_name)

    resp = wr.opensearch.index_documents(
        self.conn,
        documents=data,
        index=index_name
    )
    print(resp)
    return resp
I have used the code below to bulk insert records from Postgres into OpenSearch (ES 7.2).
import sqlalchemy as sa
from sqlalchemy import text
import pandas as pd
import numpy as np
from opensearchpy import OpenSearch
from opensearchpy.helpers import bulk
import json

engine = sa.create_engine('postgresql+psycopg2://postgres:postgres@127.0.0.1:5432/postgres')

host = 'search-xxxxxxxxxx.us-east-1.es.amazonaws.com'
port = 443
auth = ('username', 'password')  # For testing only. Don't store credentials in code.

# Create the client with SSL/TLS enabled, but hostname verification disabled.
client = OpenSearch(
    hosts=[{'host': host, 'port': port}],
    http_compress=True,
    http_auth=auth,
    use_ssl=True,
    verify_certs=True,
    ssl_assert_hostname=False,
    ssl_show_warn=False
)

with engine.connect() as connection:
    result = connection.execute(text("select * from account_1_study_1.stg_pred where domain='LB'"))
    records = []
    for row in result:
        record = dict(row)
        record.update(record['item_dataset'])
        del record['item_dataset']
        records.append(record)

df = pd.DataFrame(records)
#df['Date'] = df['Date'].astype(str)
df = df.fillna("null")
print(df.keys)

documents = df.to_dict(orient='records')
#bulk(es, documents, index='search-irl-poc-dump', raise_on_error=True)
#response = client.bulk(body=documents, index='sample-index')
bulk(client, documents, index='search-irl-poc-dump', raise_on_error=True, refresh=True)

Kinesis Firehose Lambda Transformation and Dynamic partition

The following data is generated with the faker library. I am trying to learn and implement
dynamic partitioning in Kinesis Firehose.
Sample payload Input
{
    "name": "Dr. Nancy Mcmillan",
    "phone_numbers": "8XXXXX",
    "city": "Priscillaport",
    "address": "908 Mitchell Views SXXXXXXXX 42564",
    "date": "1980-07-11",
    "customer_id": "3"
}
Sample Input code
def main():
    import boto3
    import json
    import random
    from faker import Faker

    AWS_ACCESS_KEY = "XXXXX"
    AWS_SECRET_KEY = "XXX"
    AWS_REGION_NAME = "us-east-1"

    for i in range(1, 13):
        faker = Faker()
        json_data = {
            "name": faker.name(),
            "phone_numbers": faker.phone_number(),
            "city": faker.city(),
            "address": faker.address(),
            "date": str(faker.date()),
            "customer_id": str(random.randint(1, 5))
        }
        print(json_data)

        # MyHasher is defined in the Lambda code below
        hasher = MyHasher(key=json_data)
        res = hasher.get()

        client = boto3.client(
            "kinesis",
            aws_access_key_id=AWS_ACCESS_KEY,
            aws_secret_access_key=AWS_SECRET_KEY,
            region_name=AWS_REGION_NAME,
        )

        response = client.put_record(
            StreamName='XXX',
            Data=json.dumps(json_data),
            PartitionKey='test',
        )
        print(response)
Here is the Lambda code, which works fine:
try:
    import json
    import boto3
    import base64
    from dateutil import parser
except Exception as e:
    pass


class MyHasher(object):
    def __init__(self, key):
        self.key = key

    def get(self):
        keys = str(self.key).encode("UTF-8")
        keys = base64.b64encode(keys)
        keys = keys.decode("UTF-8")
        return keys


def lambda_handler(event, context):
    print("Event")
    print(event)

    output = []
    for record in event["records"]:
        dat = base64.b64decode(record["data"])
        serialize_payload = json.loads(dat)
        print("serialize_payload", serialize_payload)

        json_new_line = str(serialize_payload) + "\n"

        hasherHelper = MyHasher(key=json_new_line)
        hash = hasherHelper.get()

        partition_keys = {"customer_id": serialize_payload.get("customer_id")}

        _ = {
            "recordId": record["recordId"],
            "result": "Ok",
            "data": hash,
            'metadata': {
                'partitionKeys': partition_keys
            }
        }
        print(_)
        output.append(_)

    print("*****************")
    print(output)

    return {"records": output}
Sample screenshots show it works fine.
Here are the settings on Firehose for dynamic partitioning.
For some reason, on AWS S3 I see an error folder and all my messages go into it.
I have successfully implemented the Lambda transformation and have made a video, which can be found below. I am currently stuck on the dynamic partitioning. I have tried reading several posts but that didn't help.
https://www.youtube.com/watch?v=6wot9Z93vAY&t=231s
Thank you again and looking forward to hearing from you guys
References
https://docs.aws.amazon.com/firehose/latest/dev/dynamic-partitioning.html
https://www.youtube.com/watch?v=HcOVAFn-KhM
https://www.youtube.com/watch?v=PoaKgHdJgCE
https://medium.com/@bv_subhash/kinesis-firehose-performs-partitioning-based-on-timestamps-and-creates-files-in-s3-but-they-would-13efd51f6d39
https://www.amazonaws.cn/en/new/2021/s3-analytics-dynamic-partitioning-kinesis-data-firehose/
There are two prefix options for dynamic partitioning: 1) partitionKeyFromQuery, 2) partitionKeyFromLambda. If you want Firehose to parse the record and extract the partition key itself, use the first option. If you want to provide the partition key after performing a transformation, use the second option.
As per your Firehose config, you are using a Lambda to provide the partition key (second option), but the prefix is written for the first option. To resolve this, either disable inline parsing and use the second option in the Firehose prefix, !{partitionKeyFromLambda:customer_id}/, or remove the Lambda transformation and keep inline parsing.
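For reference, a rough sketch of the two S3 prefix forms being contrasted (customer_id matches the partition key set in the Lambda above):

!{partitionKeyFromLambda:customer_id}/    (option 2: key supplied by the Lambda transformation)
!{partitionKeyFromQuery:customer_id}/     (option 1: key extracted by Firehose inline parsing)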

Dump bulk data in Elasticsearch using the Python API

I want to index the Shakespeare data in Elasticsearch using its Python API. I am getting the error below.
PUT http://localhost:9200/shakes/play/3 [status:400 request:0.098s]
{'error': {'root_cause': [{'type': 'mapper_parsing_exception', 'reason': 'failed to parse'}], 'type': 'mapper_parsing_exception', 'reason': 'failed to parse', 'caused_by': {'type': 'not_x_content_exception', 'reason': 'Compressor detection can only be called on some xcontent bytes or compressed xcontent bytes'}}, 'status': 400}
python script
from elasticsearch import Elasticsearch
from elasticsearch import TransportError
import json

data = []
for line in open('shakespeare.json', 'r'):
    data.append(json.loads(line))

es = Elasticsearch()
res = 0
cl = []

# filtering data which i need
for d in data:
    if res == 0:
        res = 1
        continue
    cl.append(data[res])
    res = 0

try:
    res = es.index(index="shakes", doc_type="play", id=3, body=cl)
    print(res)
except TransportError as e:
    print(e.info)
I also tried using json.dumps but still get the same error. But when I add just one element of the list to Elasticsearch, the code works.
You are not sending a bulk request to ES, but only a simple create request - please take a look here. This method works with a dict that represents a new doc, not with a list of docs. If you put an id on the create request, then you need to make this value dynamic, otherwise every doc will be overwritten with the id of the last doc indexed. If your JSON has one record per line, you should try this - please read here for the bulk documentation:
from elasticsearch import Elasticsearch, helpers

es = Elasticsearch()
op_list = []
with open(r"C:\ElasticSearch\shakespeare.json") as json_file:
    for record in json_file:
        op_list.append({
            '_op_type': 'index',
            '_index': 'shakes',
            '_type': 'play',
            '_source': record
        })
helpers.bulk(client=es, actions=op_list)
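If you do want to control the document ids yourself (the point above about making the id dynamic), one sketch is to derive an _id from the line number, assuming that is an acceptable id for your data:

op_list = []
with open(r"C:\ElasticSearch\shakespeare.json") as json_file:
    for i, record in enumerate(json_file):
        op_list.append({
            '_op_type': 'index',
            '_index': 'shakes',
            '_type': 'play',
            '_id': i,          # dynamic id, here just the line number
            '_source': record
        })
helpers.bulk(client=es, actions=op_list)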

How to write a MongoDB query in pymongo

The following query works in mongodb shell:
db.user.count( {$and: [ {"agent_id":{$exists:true}}, {"is_agent":{$ne:true}} ] } )
When I try it in Python I get a different answer. Here's the Python code:
import pymongo
from pymongo import MongoClient

client = MongoClient()  # connects to the default local MongoDB instance

def getCollection(client, dbname, collection):
    """Return a collection based on client, db and collection"""
    data_base = getattr(client, dbname)
    collObject = getattr(data_base, collection)
    return collObject

userColl = getCollection(client, "hkpr_restore", "user")

usersWithAgents = userColl.count( {"$and": [ {"agent_id": {"$exists": "true"}}, {"is_agent": {"$ne": "true"}} ] } )
print usersWithAgents
The results are about 11,000 for the mongo shell query and about 17,000 for the python script query.
You should use:
"$exists": True
and not
"$exists": "true"
The same for $ne.
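Applied to the question's query, the count call then becomes (a sketch reusing userColl from above):

usersWithAgents = userColl.count(
    {"$and": [
        {"agent_id": {"$exists": True}},   # Python booleans, not the string "true"
        {"is_agent": {"$ne": True}}
    ]}
)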

How to create request body for Python Elasticsearch mSearch

I'm trying to run a multi search request on the Elasticsearch Python client. I can run the singular search correctly but can't figure out how to format the request for a msearch. According to the documentation, the body of the request needs to be formatted as:
The request definitions (metadata-search request definition pairs), as
either a newline separated string, or a sequence of dicts to serialize
(one per row).
What's the best way to create this request body? I've been searching for examples but can't seem to find any.
If you follow the demo in the official doc (even though it's for the Bulk API), you will find out how to construct your request in Python with the Elasticsearch client:
Here is the newline separated string way:
import json

def msearch():
    es = get_es_instance()

    search_arr = []
    # req_head
    search_arr.append({'index': 'my_test_index', 'type': 'doc_type_1'})
    # req_body
    search_arr.append({"query": {"term": {"text": "bag"}}, 'from': 0, 'size': 2})
    # req_head
    search_arr.append({'index': 'my_test_index', 'type': 'doc_type_2'})
    # req_body
    search_arr.append({"query": {"match_all": {}}, 'from': 0, 'size': 2})

    request = ''
    for each in search_arr:
        request += '%s \n' % json.dumps(each)

    # as you can see, you just need to feed the <body> parameter,
    # and don't need to specify the <index> and <doc_type> as usual
    resp = es.msearch(body=request)
As you can see, the final request is constructed from several req_units, and each req_unit is built as shown below:
request_header (search controls such as the index name, optional mapping types, search types etc.)\n
request_body (which contains the query details for this request)\n
The "sequence of dicts to serialize" way is almost the same as the previous one, except that you don't need to convert it to a string:
def msearch():
    es = get_es_instance()

    request = []

    req_head = {'index': 'my_test_index', 'type': 'doc_type_1'}
    req_body = {
        'query': {'term': {'text': 'bag'}},
        'from': 0, 'size': 2}
    request.extend([req_head, req_body])

    req_head = {'index': 'my_test_index', 'type': 'doc_type_2'}
    req_body = {
        'query': {'range': {'price': {'gte': 100, 'lt': 300}}},
        'from': 0, 'size': 2}
    request.extend([req_head, req_body])

    resp = es.msearch(body=request)
Here is the structure it returns. Read more about msearch.
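Roughly, the body that comes back is a dict with one entry per sub-query under a responses key, each shaped like a normal search response (a sketch, not every field):

{
    "responses": [
        # one entry per query, in the same order as the request
        {"took": ..., "hits": {"total": ..., "hits": [...]}},
        {"took": ..., "hits": {"total": ..., "hits": [...]}}
    ]
}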
If you are using elasticsearch-dsl, you can use the class MultiSearch.
Example from the documentation:
from elasticsearch_dsl import MultiSearch, Search

ms = MultiSearch(index='blogs')

ms = ms.add(Search().filter('term', tags='python'))
ms = ms.add(Search().filter('term', tags='elasticsearch'))

responses = ms.execute()

for response in responses:
    print("Results for query %r." % response.search.query)
    for hit in response:
        print(hit.title)
Here is what I came up with. I am using the same document type and index so I optimized the code to run multiple queries with the same header:
from elasticsearch import Elasticsearch
from elasticsearch import exceptions as es_exceptions
import json
import logging
import time

RETRY_ATTEMPTS = 10
RECONNECT_SLEEP_SECS = 0.5

def msearch(es_conn, queries, index, doc_type, retries=0):
    """
    Es multi-search query
    :param queries: list of dict, es queries
    :param index: str, index to query against
    :param doc_type: str, defined doc type i.e. event
    :param retries: int, current retry attempt
    :return: list, found docs
    """
    search_header = json.dumps({'index': index, 'type': doc_type})
    request = ''
    for q in queries:
        # request head, body pairs
        request += '{}\n{}\n'.format(search_header, json.dumps(q))
    try:
        resp = es_conn.msearch(body=request, index=index)
        found = [r['hits']['hits'] for r in resp['responses']]
    except (es_exceptions.ConnectionTimeout, es_exceptions.ConnectionError,
            es_exceptions.TransportError):  # pragma: no cover
        logging.warning("msearch connection failed, retrying...")  # Retry on timeout
        if retries > RETRY_ATTEMPTS:  # pragma: no cover
            raise
        time.sleep(RECONNECT_SLEEP_SECS)
        found = msearch(es_conn=es_conn, queries=queries, index=index,
                        doc_type=doc_type, retries=retries + 1)
    except Exception as e:  # pragma: no cover
        logging.critical("msearch error {} on query {}".format(e, queries))
        raise
    return found

es_conn = Elasticsearch()
queries = []
queries.append(
    {"min_score": 2.0, "query": {"bool": {"should": [{"match": {"name.tokenized": {"query": "batman"}}}]}}}
)
queries.append(
    {"min_score": 1.0, "query": {"bool": {"should": [{"match": {"name.tokenized": {"query": "ironman"}}}]}}}
)
queries.append(
    {"track_scores": True, "min_score": 9.0, "query":
        {"bool": {"should": [{"match": {"name": {"query": "not-findable"}}}]}}}
)
q_results = msearch(es_conn, queries, index='pipeliner_current', doc_type='event')
This may be what some of you are looking for if you want to do multiple queries on the same index and doc type.
Got it! Here's what I did for anybody else...
from elasticsearch import Elasticsearch
import json

query_list = ""
query_count = 0
es = Elasticsearch("myurl")

# my_list and constructQuery come from the surrounding application code
for obj in my_list:
    query = constructQuery(obj)
    query_count += 1
    # empty header (the index is passed to msearch below), then the query body
    query_list += json.dumps({}) + "\n"
    query_list += json.dumps(query)
    if query_count <= 19:
        query_list += "\n"
    if query_count == 20:
        es.msearch(index="m_index", body=query_list)
I was getting screwed up by having to add the index twice. Even when using the Python client you still have to include the index part described in the original docs. Works now though!
