KeyError accessing inner_hits in elasticsearch_dsl Python client - python

I've encountered a problem when accessing inner_hits data using the Python elasticsearch_dsl client. Any attempt to use the embedded Response object within meta.inner_hits yields a KeyError on "_type" in the container object. The following code is completely self-contained so anyone should be able to reproduce the same result using Python 2.7 and elasticsearch_dsl 5.0.0:
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Index, Mapping, Nested, Search, Q
from elasticsearch_dsl.connections import connections
index = "test_index"
es_con = connections.create_connection(hosts=["localhost:9200"])
es_index = Index(index)
if es_index.exists():
es_index.delete()
es_index.create()
para_mapping = Mapping("paragraph")
para_mapping.field("sentences", Nested())
para_mapping.save(index)
test_paras = {}
for a in range(2):
test_paras[a] = {
"label": "Test Paragraph p{}".format(a),
"sentences": []
}
for b in range(2):
test_sent = {
"text": "Test Sentence p{}s{}".format(a, b),
}
test_paras[a]["sentences"].append(test_sent)
for idx, para in test_paras.iteritems():
para_id = "para_id_p{}".format(idx)
es_con.create(
index=index,
doc_type="paragraph",
id=para_id,
body=para
)
es_index.flush()
q_1 = Search(using=es_con).index(index).doc_type('paragraph')
q_2 = q_1 = q_1.query(
'nested', path='sentences', query=Q('term', **{"sentences.text": "p0s1"}), inner_hits={}
)
q_1 = q_1.execute()
# We got the expected paragraph
print "PARA_ID: ", q_1.hits[0].meta.id
# With all sentences
print "PARA[SENTENCES]: ", q_1.hits[0].sentences
# We can see inner_hits is included in para.meta
print "DIR PARA.META: ", dir(q_1.hits[0].meta)
# And it contains a "sentences" object
print "DIR PARA.META.INNER_HITS: ", dir(q_1.hits[0].meta.inner_hits)
# Of type elasticsearch_dsl.result.Response
print "TYPE PARA.META.INNER_HITS.SENTENCES:", type(q_1.hits[0].meta.inner_hits.sentences)
# That contains a "hits" object
print "DIR PARA.META.INNER_HITS.SENTENCES: ", dir(q_1.hits[0].meta.inner_hits.sentences)
# But every attempted action yields a KeyError: '_type' in result.AttrList()
try:
print q_1.hits[0].meta.inner_hits.sentences
except KeyError as e:
print "\nException:", type(e)
print "Message:", e
# Uncomment the following line to see full exception detail
# print dir(q.hits[0].meta.inner_hits.sentences.hits)
# The same query using the base-level API
es = Elasticsearch()
results = es.search(index=index, body=q_2.to_dict())
# This works just fine.
print "\nES RESULT:"
print results["hits"]["hits"][0]["inner_hits"]["sentences"]["hits"]["hits"][0]["_source"]["text"]
Is this a bug in the API?

This is a bug where we only account for inner_hits across parent/child relationship and not nested. I created an issue to track the fix: https://github.com/elastic/elasticsearch-dsl-py/issues/565
Thanks for the report!

Related

Elasticsearch helper returns generator class, need dictionary

I'm using the scan API to query elasticsearch and return an unlimited number of results (as opposed to the 10k limit). This returns a generator object, which I then need to turn into a dictionary (because the results will then be turned into a pandas data frame).
I attempted logs = {c.name:c.value for c in logs} but it returned an empty list. I know the results themselves came back because they printed when I attempted for i in logs: \\ print(i). How can I turn my results into a dictionary?
Code
import elasticsearch.helpers
from elasticsearch import Elasticsearch, TransportError, RequestsHttpConnection
def check_query(query, index):
ES_user = environ.get('ES_USER')
ES_pw = environ.get('ES_PW')
ES_cluster = environ.get('ES_CLUSTER')
es = Elasticsearch([ES_cluster],
connection_class=RequestsHttpConnection,
http_auth=(ES_user, ES_pw),
use_ssl=True, verify_certs=False)
try:
logs = elasticsearch.helpers.scan(es, query=query, index=index)
logs = {c.name:c.value for c in logs} # returns empty list
except Exception as e:
logs = 'Please contact the application supporter and provide them the information below! \n'
logs += 'Error: ' + str(e) + '\n'
logs += 'Query: ' + query
return logs

Elasticsearch for python - Calls not blocking correctly

I'm trying to write unittests for my own Elasticsearch client. It uses the client from elasticsearch-py.
Most of my tests are fine, but when running a test on my own search() function (which uses the search() function from Elasticsearch client) I get very random behaviour. This is the way my test is implemented:
def setUp(self) -> None:
self.es = ESClient(host="localhost")
self.es_acc = ESClient()
self.connection_res = (False, {})
self.t = self.es_acc.get_connection_status(self._callback)
self.t.join()
# Create test index and index some documents
self.es.create_index(self.TEST_INDEX)
names = ["Gregor", "Alice", "Per Svensson", "Mats Hermelin", "Mamma Mia"
, "Eva Dahlgren", "Per Morberg", "Maja Larsson", "Ola Salo", "Magrecievic Holagrostokovic"]
self.num_docs = len(names)
self.payload = []
random.seed(123)
for i, name in enumerate(names):
n = name.split(" ")
fname = n[0]
lname = n[1] if len(n) > 1 else n[0]
self.payload.append({"name": {"first": fname, "last": lname}, "age": random.randint(-100, 100),
"timestamp": datetime.utcnow() - timedelta(days=1 * i)})
self.es.upload(self.TEST_INDEX, self.payload, ids=list(range(len(names))))
def test_search(self):
# Test getting docs based on ids
ids = ["1", "4", "9"]
status, hits = self.es.search(self.TEST_INDEX, ids=ids) # Breakpoint
docs = hits["hits"]["hits"]
self.assertTrue(status, "Status not correct for search!")
returned_ids = [d["_id"] for d in docs]
names = [d["_source"]["name"] for d in docs]
self.assertListEqual(sorted(returned_ids), ids, "Returned ids from search not correct!")
self.assertListEqual(names, [self.payload[i]["name"] for i in [1, 4, 9]], "Returned source from search not correct!")
In setUp() I'm just uploading a few documents to test on, so there should always be 10 documents to test on. Below is an excerpt from my search() function.
if ids:
try:
q = Query().ids(ids).compile_and_get()
res = self.es.search(index=index, body=q)
print(res)
return True, res
except exceptions.ElasticsearchException as e:
self._handle_elastic_exceptions("search", e, index=index)
return False, {}
I've implemented Query. Anyway, when I just run the test, I ALMOST always get 0 hits. But if I debug the application, with a breakpoint in test_search() on the row where I make the call to search() and step, everything works fine. If I put it just one line below, I get 0 hits again. What is going on? Why is it not blocking correctly?
It seems like I found my solution!
I did not understand that setUp was called on every test method. This was actually not the problem however.
The problem is that for some tests, uploading documents simply took to much time (which was done in setUp) and so when the test started, the documents did not exist yet! Solution: add sleep(1) to the end of setUp.

Printing dictionary from inside a list puts one character on each line

Yes, yet another. I can't figure out what the issue is. I'm trying to iterate over a list that is a subsection of JSON output from an API call.
This is the section of JSON that I'm working with:
[
{
"created_at": "2017-02-22 17:20:29 UTC",
"description": "",
"id": 1,
"label": "FOO",
"name": "FOO",
"title": "FOO",
"updated_at": "2018-12-04 16:37:09 UTC"
}
]
The code that I'm running that retrieves this and displays it:
#!/usr/bin/python
import json
import sys
try:
import requests
except ImportError:
print "Please install the python-requests module."
sys.exit(-1)
SAT_API = 'https://satellite6.example.com/api/v2/'
USERNAME = "admin"
PASSWORD = "password"
SSL_VERIFY = False # Ignore SSL for now
def get_json(url):
# Performs a GET using the passed URL location
r = requests.get(url, auth=(USERNAME, PASSWORD), verify=SSL_VERIFY)
return r.json()
def get_results(url):
jsn = get_json(url)
if jsn.get('error'):
print "Error: " + jsn['error']['message']
else:
if jsn.get('results'):
return jsn['results']
elif 'results' not in jsn:
return jsn
else:
print "No results found"
return None
def display_all_results(url):
results = get_results(url)
if results:
return json.dumps(results, indent=4, sort_keys=True)
def main():
orgs = display_all_results(KATELLO_API + "organizations/")
for org in orgs:
print org
if __name__ == "__main__":
main()
I appear to be missing a concept because when I print org I get each character per line such as
[
{
"
c
r
e
a
t
e
d
_
a
t
"
It does this through to the final ]
I've also tried to print org['name'] which throws the TypeError: list indices must be integers, not str Python error. This makes me think that org is being seen as a list rather than a dictionary which I thought it would be due to the [{...}] format.
What concept am I missing?
EDIT: An explanation for why I'm not getting this: I'm working with a script in the Red Hat Satellite API Guide which I'm using to base another script on. I'm basically learning as I go.
display_all_results is returning a string since you are doing json.dumps in json.dumps(results, indent=4, sort_keys=True), which converts the dictionary to a string (you are getting that dictionary from r.json() in get_json function)
You then end up iterating over the characters of that string in main, and you see one character per line
Instead just return results from display_all_results and the code will work as intended
def display_all_results(url):
#results is already a dictionary, just return it
results = get_results(url)
if results:
return results
Orgs is a result of json.dump which produces a string. So instead of this code:
for org in orgs:
print(org)
replace it with simply:
#for org in orgs:
print(orgs)

Using Python elastisearch_dsl with nested objects

I want to try and use elasticsearch_dsl with python for the following
import elasticsearch
es_server = 'my_server_name'
es_port = '9200'
es_index_name = 'my_index_name'
es_connection = Elasticsearch([{'host': es_server, 'port': es_port}])
es_query = '{"query":{"bool":{"must":[{"term":{"data.party.fullName":"john do"}}],"must_not":[],"should":[]}},"from":0,"size":1,"sort":[],"facets":{}}'
my_results = es_connection.search(index=es_index_name, body=es_query)
print my_results
es_query ='{"query": {"nested" : {"filter" : {"term" : {"party.phoneList.phoneFullNumber" : "4081234567"}},"path" : "party.phoneList"}},"from" :0,"size" : 1}';
my_results = es_connection.search(index=es_index_name, body=es_query)
print my_results
I am able to get the 1st query but am not sure on the second one
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search, Q
client = Elasticsearch('my_server:9200')
s = Search(using=client, index = "my_index").query("term",fullName="john do ")
response = s.execute()
print response
Not sure how to do the query using DSL for the nested object party.phoneList.phoneFullNumber
New to ES and hence could not figure out how to do the nested objects.
I looked at https://github.com/elastic/elasticsearch-dsl-py/issues/28 and could not quite figure out.
Thanks !
Just use __ instead of . to get around python's limitations and the nested query:
s = Search(using=client, index = "my_index")
s = s.query("nested",
path="party.phoneList",
query=Q("term", party__phoneList__phoneFullNumber="4081234567")
)

Reading specific test steps from Quality Center with python

I am working with Quality Center via OTA COM library. I figured out how to connect to server, but I am lost in OTA documentation on how to work with it. What I need is to create a function which takes a test name as an input and returns number of steps in this test from QC.
For now I am this far in this question.
import win32com
from win32com.client import Dispatch
# import codecs #to store info in additional codacs
import re
import json
import getpass #for password
qcServer = "***"
qcUser = "***"
qcPassword = getpass.getpass('Password: ')
qcDomain = "***"
qcProject = "***"
td = win32com.client.Dispatch("TDApiOle80.TDConnection.1")
#Starting to connect
td.InitConnectionEx(qcServer)
td.Login(qcUser,qcPassword)
td.Connect(qcDomain, qcProject)
if td.Connected == True:
print "Connected to " + qcProject
else:
print "Connection failed"
#Path = "Subject\Regression\C.001_Band_tones"
mg=td.TreeManager
npath="Subject\Regression"
tsFolder = td.TestSetTreeManager.NodeByPath(npath)
print tsFolder
td.Disconnect
td.Logout
print "Disconnected from " + qcProject
Any help on descent python examples or tutorials will be highly appreciated. For now I found this and this, but they doesn't help.
Using the OTA API to get data from Quality Center normally means to get some element by path, create a factory and then use the factory to get search the object. In your case you need the TreeManager to get a folder in the Test Plan, then you need a TestFactory to get the test and finally you need the DesignStepFactory to get the steps. I'm no Python programmer but I hope you can get something out of this:
mg=td.TreeManager
npath="Subject\Test"
tsFolder = mg.NodeByPath(npath)
testFactory = tsFolder.TestFactory
testFilter = testFactory.Filter
testFilter["TS_NAME"] = "Some Test"
testList = testFactory.NewList(testFilter.Text)
test = testList.Item(1) # There should be only 1 item
print test.Name
stepFactory = test.DesignStepFactory
stepList = stepFactory.NewList("")
for step in stepList:
print step.StepName
It takes some time to get used to the QC OTA API documentation but I find it very helpful. Nearly all of my knowledge comes from the examples in the API documentation—for your problem there are examples like "Finding a unique test" or "Get a test object with name and path". Both examples are examples to the Test object. Even if the examples are in VB it should be no big thing to adapt them to Python.
I figured out the solution, if there is a better way to do this you are welcome to post it.
import win32com
from win32com.client import Dispatch
import getpass
def number_of_steps(name):
qcServer = "***"
qcUser = "***"
qcPassword = getpass.getpass('Password: ')
qcDomain = "***"
qcProject = "***"
td = win32com.client.Dispatch("TDApiOle80.TDConnection.1")
#Starting to connect
td.InitConnectionEx(qcServer)
td.Login(qcUser, qcPassword)
td.Connect(qcDomain, qcProject)
if td.Connected is True:
print "Connected to " + qcProject
else:
print "Connection failed"
mg = td.TreeManager # Tree manager
folder = mg.NodeByPath("Subject\Regression")
testList = folder.FindTests(name) # Make a list of tests matching name (partial match is accepted)
if testList is not None:
if len(testList) > 1:
print "There are multiple tests matching this name, please check input parameter\nTests matching"
for test in testList:
print test.name
td.Disconnect
td.Logout
return False
if len(testList) == 1:
print "In test %s there is %d steps" % (testList[0].Name, testList[0].DesStepsNum)
else:
print "There are no test with this test name in Quality Center"
td.Disconnect
td.Logout
return False
td.Disconnect
td.Logout
print "Disconnected from " + qcProject
return testList[0].DesStepsNum # Return number of steps for given test

Categories