POST request with \n-delimited JSON in Python

I'm trying to use the bulk API from Elasticsearch, and I see that it can be done with the following request, which is special because what is given as "data" is not a single valid JSON document, but JSON documents delimited by \n.
curl -XPOST 'localhost:9200/_bulk?pretty' -H 'Content-Type: application/json' -d '
{ "index" : { "_index" : "test", "_type" : "type1", "_id" : "1" } }
{ "field1" : "value1" }
{ "delete" : { "_index" : "test", "_type" : "type1", "_id" : "2" } }
{ "create" : { "_index" : "test", "_type" : "type1", "_id" : "3" } }
{ "field1" : "value3" }
{ "update" : {"_id" : "1", "_type" : "type1", "_index" : "test"} }
{ "doc" : {"field2" : "value2"} }
'
My question is: how can I perform such a request from Python? The Elasticsearch authors suggest not pretty-printing the JSON, but I'm not sure what that means (see https://www.elastic.co/guide/en/elasticsearch/reference/current/docs-bulk.html).
I know that this is a valid Python request:
import requests
import json
data = json.dumps({"field":"value"})
r = requests.post("localhost:9200/_bulk?pretty", data=data)
But what do I do if the JSON is \n-delimited?

What you have here is really a set of individual JSON documents joined together with newlines. So you could do something like this:
data = [
{ "index" : { "_index" : "test", "_type" : "type1", "_id" : "1" } },
{ "field1" : "value1" },
{ "delete" : { "_index" : "test", "_type" : "type1", "_id" : "2" }, },
{ "create" : { "_index" : "test", "_type" : "type1", "_id" : "3" }, },
{ "field1" : "value3" },
{ "update" : {"_id" : "1", "_type" : "type1", "_index" : "test"} },
{ "doc" : {"field2" : "value2"} }
]
data_to_post = '\n'.join(json.dumps(d) for d in data)
r = requests.post("localhost:9200/_bulk?pretty", data=data_to_post)
However, as pointed out in the comments, the Elasticsearch Python client is likely to be more useful.
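If you go that route, the official elasticsearch package has a bulk helper that builds the newline-delimited body for you. Here is a minimal sketch (not from the original answer) that assumes a cluster on localhost:9200 and mirrors the documents from the question:
from elasticsearch import Elasticsearch, helpers

es = Elasticsearch()  # defaults to localhost:9200

# Each action carries its metadata (_op_type, _index, _type, _id) plus the
# document fields; helpers.bulk serializes them into the \n-delimited format.
actions = [
    {"_op_type": "index",  "_index": "test", "_type": "type1", "_id": "1", "field1": "value1"},
    {"_op_type": "delete", "_index": "test", "_type": "type1", "_id": "2"},
    {"_op_type": "create", "_index": "test", "_type": "type1", "_id": "3", "field1": "value3"},
    {"_op_type": "update", "_index": "test", "_type": "type1", "_id": "1", "doc": {"field2": "value2"}},
]

success, errors = helpers.bulk(es, actions, raise_on_error=False)
print(success, errors)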

As a follow-up to Daniel's answer above, I had to append an additional '\n' to the end of data_to_post and add a Content-Type: application/x-ndjson header to get it to work on Elasticsearch 6.3.
data_to_post = '\n'.join(json.dumps(d) for d in data) + "\n"
headers = {"Content-Type": "application/x-ndjson"}
r = requests.post("http://localhost:9200/_bulk?pretty", data=data_to_post, headers=headers)
Otherwise, I would receive the error:
"The bulk request must be terminated by a newline [\\n]"

You can use the Python ndjson library to do it.
https://pypi.org/project/ndjson/
It contains JSONEncoder and JSONDecoder classes for easy use with other libraries, such as requests:
import ndjson
import requests
response = requests.get('https://example.com/api/data')
items = response.json(cls=ndjson.Decoder)
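For posting bulk data (the direction the question asks about), a rough sketch could encode a list of dicts with ndjson and send it with requests. This assumes the same local cluster and the example documents from the question:
import ndjson
import requests

records = [
    {"index": {"_index": "test", "_type": "type1", "_id": "1"}},
    {"field1": "value1"},
]

# ndjson.dumps joins the records with newlines; the bulk endpoint also
# requires a trailing newline, so append one if it is missing.
body = ndjson.dumps(records)
if not body.endswith("\n"):
    body += "\n"

r = requests.post(
    "http://localhost:9200/_bulk?pretty",
    data=body,
    headers={"Content-Type": "application/x-ndjson"},
)
print(r.json())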

Related

Different/partial json outputs when read from each line in a file vs read from list variable within the code

Being new to Python, I am unable to resolve the following issue.
Below is my Python code, which returns different JSON outputs when the lines are read from a file compared to when the list is passed as a variable within the code.
Code with lines read from a file, which produces partial/corrupted output:
import requests
import json
import re

repo_name = "repo"
with open('file_list_new.txt') as file:
    for line in file:
        url = "http://fqdn/repository/{0}/{1}?describe=json".format(repo_name, line)
        response = requests.get(url)
        json_data = response.text
        data = json.loads(json_data)
        print(data)
        for size in data['items']:
            if size['name'] == 'Payload':
                value_size = size['value']['Size']
                if value_size != -1:
                    print(value_size)
Content of file_list_new.txt:
mysql.odbc/5.1.14
mysql.odbc/5.1.11
Corrupted output:
{
"parameters" : {
"path" : "/mysql.odbc/5.1.14\n",
"nexusUrl" : "http://fqdn"
},
"items" : [ {
"name" : "Exception during handler processing",
"type" : "topic",
"value" : "Exception during handler processing"
}, {
"name" : "java.lang.IllegalArgumentException",
"type" : "table",
"value" : {
"Message" : "Illegal character in path at index 40: Packages(Id='mysql.odbc',Version='5.1.14\n')"
}
}, {
"name" : "java.net.URISyntaxException",
"type" : "table",
"value" : {
"Message" : "Illegal character in path at index 40: Packages(Id='mysql.odbc',Version='5.1.14\n')"
}
}, {
"name" : "Request",
"type" : "topic",
"value" : "Request"
}, {
"name" : "Details",
"type" : "table",
"value" : {
"Action" : "GET",
"path" : "/mysql.odbc/5.1.14\n"
}
}, {
"name" : "Parameters",
"type" : "table",
"value" : {
"describe" : "json"
}
}, {
"name" : "Headers",
"type" : "table",
"value" : {
"Accept" : "*/*",
"User-Agent" : "python-requests/2.27.1",
"Connection" : "keep-alive",
"Host" : "fqdn",
"Accept-Encoding" : "gzip, deflate"
}
}, {
"name" : "Attributes",
"type" : "table",
"value" : {
"org.apache.shiro.subject.support.DefaultSubjectContext.SESSION_CREATION_ENABLED" : false,
"Key[type=org.sonatype.nexus.security.SecurityFilter, annotation=[none]].FILTERED" : true,
"authcAntiCsrf.FILTERED" : true,
"nx-authc.FILTERED" : true,
"org.apache.shiro.web.servlet.ShiroHttpServletRequest_SESSION_ID_URL_REWRITING_ENABLED" : true,
"javax.servlet.include.servlet_path" : "/repository/repo/mysql.odbc/5.1.14%0A",
"nx-anonymous.FILTERED" : true,
"org.sonatype.nexus.security.anonymous.AnonymousFilter.originalSubject" : "org.apache.shiro.web.subject.support.WebDelegatingSubject#33c429ba",
"nx-apikey-authc.FILTERED" : true
}
}, {
"name" : "Payload",
"type" : "table",
"value" : {
"Content-Type" : "",
"Size" : -1
}
} ]
}
Code with the list defined in a variable within the code:
import requests
import json
import re

repo_name = "repo"
file_list = ["mysql.odbc/5.1.11", "mysql.odbc/5.1.14"]
for i in file_list:
    url = "http://fqdn/repository/{0}/{1}?describe=json".format(repo_name, i)
    response = requests.get(url)
    json_data = response.text
    data = json.loads(json_data)
    for size in data['items']:
        if size['name'] == 'Payload':
            value_size = size['value']['Size']
            if value_size != -1:
                print(value_size)
Expected output with the list passed within the code:
{
"parameters" : {
"path" : "/mysql.odbc/5.1.14",
"nexusUrl" : "http://fqdn"
},
"items" : [ {
"name" : "Request",
"type" : "topic",
"value" : "Request"
}, {
"name" : "Details",
"type" : "table",
"value" : {
"Action" : "GET",
"path" : "/mysql.odbc/5.1.14"
}
}, {
"name" : "Parameters",
"type" : "table",
"value" : {
"describe" : "json"
}
}, {
"name" : "Headers",
"type" : "table",
"value" : {
"Accept" : "*/*",
"User-Agent" : "python-requests/2.27.1",
"Connection" : "keep-alive",
"Host" : "fqdn",
"Accept-Encoding" : "gzip, deflate"
}
}, {
"name" : "Attributes",
"type" : "table",
"value" : {
"org.apache.shiro.subject.support.DefaultSubjectContext.SESSION_CREATION_ENABLED" : false,
"Key[type=org.sonatype.nexus.security.SecurityFilter, annotation=[none]].FILTERED" : true,
"authcAntiCsrf.FILTERED" : true,
"nx-authc.FILTERED" : true,
"org.apache.shiro.web.servlet.ShiroHttpServletRequest_SESSION_ID_URL_REWRITING_ENABLED" : true,
"javax.servlet.include.servlet_path" : "/repository/repo/mysql.odbc/5.1.14",
"nx-anonymous.FILTERED" : true,
"org.sonatype.nexus.security.anonymous.AnonymousFilter.originalSubject" : "org.apache.shiro.web.subject.support.WebDelegatingSubject#1433a6c9",
"nx-apikey-authc.FILTERED" : true
}
}, {
"name" : "Payload",
"type" : "table",
"value" : {
"Content-Type" : "",
"Size" : -1
}
}, {
"name" : "Response",
"type" : "topic",
"value" : "Response"
}, {
"name" : "Status",
"type" : "table",
"value" : {
"Code" : 200,
"Message" : ""
}
}, {
"name" : "Headers",
"type" : "table",
"value" : {
"ETag" : "\"df4f013db18103f1b9541cdcd6ba8632\"",
"Content-Disposition" : "attachment; filename=mysql.odbc.5.1.14.nupkg",
"Last-Modified" : "Tue, 13 Oct 2015 03:54:48 GMT"
}
}, {
"name" : "Attributes",
"type" : "table",
"value" : { }
}, {
"name" : "Payload",
"type" : "table",
"value" : {
"Content-Type" : "application/zip",
"Size" : 3369
}
} ]
}
I am not sure if I am doing something wrong or missing something simple.
Any help is much appreciated.
It looks like the newlines are being passed into the URL string:
"Message" : "Illegal character in path at index 40: Packages(Id='mysql.odbc',Version='5.1.14\n')"
You can do something like this to remove them
with open('file_list_new.txt') as file:
    for line in file:
        url = "http://fqdn/repository/{0}/{1}?describe=json".format(repo_name, line.strip())

Write queryDSL to find unique error messages from sys log data?

Is there a way to configure the elasticsearch analyzer so that it is possible to get unique error messages in different scenarios?
1."...July 2020 23:00:00.674z... same message....."
2. slight changes in the string :
message1: "....message_details.. (unknown error 20004)
message2: "....message_details.. (unknown error 278945)
OR
message1:"....a::::: message_details ...."
message2:"....a:f23ed:fff:ff:: message_details ...."
The above two messages are the same apart from the character difference.
Here is the query:
GET log_stash_2020.06.16/_search
{
  "query": {
    "bool": {
      "must": [
        { "match_phrase": { "message": "Error" } },
        { "match_phrase": { "type": "lab_id" } }
      ]
    }
  },
  "aggs": {
    "log_message": {
      "significant_text": {
        "field": "message",
        "filter_duplicate_text": "true"
      }
    }
  },
  "size": 1000
}
I have added sample log documents below:
{
"_index" : "logstash_2020.06.16",
"_type" : "doc",
"_id" : "################",
"_score" : 1.0,
"_source" : {
"logsource" : "router_id",
"timestamp" : "Jun 15 20:00:00",
"program" : "some_program",
"host" : "#############",
"priority" : "27",
"#timestamp" : "2020-06-16T00:00:01.020Z",
"type" : "lab_id",
"pid" : "####",
"message" : ": ############### send failed with error: ENOENT -- Item not found (No error: 0)",
"#version" : "1"
}
}
{
"_index" : "logstash_2020.06.16",
"_type" : "doc",
"_id" : "################",
"_score" : 1.0,
"_source" : {
"host" : "################",
"#timestamp" : "2020-06-16T00:00:02.274Z",
"type" : "####",
"tags" : [
"_grokparsefailure"
],
"message" : "################:Jun 15 20:00:18.908 EDT: mediasvr[2546]: %MEDIASVR-MEDIASVR-4-PARTITION_USAGE_ALERT : High disk usage alert : host ##### exceeded 100% \n",
"#version" : "1"
}
}
Is there a way to do it in Python (if Elasticsearch does not have the above-mentioned functionality)?
You can use the Elasticsearch Python client like so:
from elasticsearch import Elasticsearch
es = Elasticsearch(...)
resp = es.search(index="log_stash_2020.06.16", body={<dsl query>})
print(resp)
where {<dsl query>} is whatever query you want to run, like the one you gave in the question.
<disclosure: I'm the maintainer of the Elasticsearch client and employed by Elastic>

Celery Result type for ElasticSearch

I'm currently exploring Celery for my work and I'm trying to set up an Elasticsearch result backend. Is there any way to send the resulting value as a dictionary/JSON rather than as text, so that results are shown correctly in Elasticsearch and the nested type can be used?
Automatic mapping created by Celery:
{
"celery" : {
"mappings" : {
"backend" : {
"properties" : {
"#timestamp" : {
"type" : "date"
},
"result" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
}
}
}
}
}
}
}
I've tried to create my own mapping with a nested field, but it resulted in elasticsearch.exceptions.RequestError: RequestError(400, 'mapper_parsing_exception', 'object mapping for [result] tried to parse field [result] as object, but found a concrete value')
UPDATE
The result is already encoded as JSON, and inside the Elasticsearch wrapper the JSON string is saved inside a dictionary. Adding json.loads(result) as a quick fix actually helps.
After the quick fix, a new mapping appeared:
{
"celery" : {
"mappings" : {
"backend" : {
"properties" : {
"#timestamp" : {
"type" : "date"
},
"result" : {
"properties" : {
"date_done" : {
"type" : "date"
},
"result" : {
"type" : "long"
},
"status" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
}
},
"task_id" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
}
}
}
}
}
}
}
}
}
Updated Kibana view:
Is there any way to disable serialization of results in Celery?
I could submit a pull request that unpacks the JSON just for Elasticsearch, but it looks like a hack.
Since v4.0 the default result_serializer has been json, so you should have results in JSON format anyway. Maybe your configuration uses something else? In that case I suggest you remove that setting (if you use Celery >= 4.0) and you should get results in JSON format. I prefer msgpack, but then again I do not use Elasticsearch for Celery results...
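If the serializer has been overridden somewhere, it can be set back explicitly. A hedged sketch of the relevant configuration (the backend URL follows the documented elasticsearch://host:port/index/doc_type pattern; the host and index names here are placeholders):
from celery import Celery

app = Celery("tasks", backend="elasticsearch://localhost:9200/celery/backend")

# json is the default since Celery 4.0, but forcing it rules out a stray
# pickle/yaml setting elsewhere in the configuration.
app.conf.result_serializer = "json"
app.conf.task_serializer = "json"
app.conf.accept_content = ["json"]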

Elastic Search and AWS python

I am working on AWS Elasticsearch using Python. I have a JSON file with 3 fields ("cat1", "Cat2", "cat3"), and each row is separated by \n.
Example: cat1: food, cat2: wine, cat3: lunch, etc.
from requests_aws4auth import AWS4Auth
import boto3
import requests
payload = {
    "settings": {
        "number_of_shards": 10,
        "number_of_replicas": 5
    },
    "mappings": {
        "Categoryall": {
            "properties": {
                "cat1": { "type": "string" },
                "Cat2": { "type": "string" },
                "cat3": { "type": "string" }
            }
        }
    }
}
r = requests.put(url, auth=awsauth, json=payload)
I created the schema/mapping for the index as shown above, but I don't know how to populate the index.
I am thinking of looping over the JSON file and calling a POST request to insert each record, but I don't have a clear idea of how to proceed.
I want to create the index and bulk upload this file into it. Any suggestion would be appreciated.
Take a look at Elasticsearch Bulk API.
Basically, you need to create a bulk request body and post it to your "https://{elastic-endpoint}/_bulk" URL.
The following example shows a bulk request to insert 3 JSON records into an index called "my_index":
{ "index" : { "_index" : "my_index", "_type" : "_doc", "_id" : "1" } }
{ "cat1" : "food 1", "cat2": "wine 1", "cat3": "lunch 1" }
{ "index" : { "_index" : "my_index", "_type" : "_doc", "_id" : "2" } }
{ "cat1" : "food 2", "cat2": "wine 2", "cat3": "lunch 2" }
{ "index" : { "_index" : "my_index", "_type" : "_doc", "_id" : "3" } }
{ "cat1" : "food 3", "cat2": "wine 3", "cat3": "lunch 3" }
where each JSON record is represented by 2 JSON objects: an action/metadata line followed by the document itself.
So if you write your bulk request body into a file called post-data.txt, then you can post it from Python with something like this:
with open('post-data.txt', 'rb') as payload:
    r = requests.post('https://your-elastic-endpoint/_bulk', auth=awsauth,
                      data=payload)  # ... add more params as needed
Alternatively, you can try Python elasticsearch bulk helpers.
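As a sketch of that route (not part of the original answer): the client can sign requests with the same AWS4Auth credentials via RequestsHttpConnection, and helpers.bulk will build the newline-delimited body from a generator of actions. The host, credentials, file name and index name below are placeholders:
import json
from requests_aws4auth import AWS4Auth
from elasticsearch import Elasticsearch, RequestsHttpConnection, helpers

awsauth = AWS4Auth("ACCESS_KEY", "SECRET_KEY", "us-east-1", "es")  # placeholders

es = Elasticsearch(
    hosts=[{"host": "your-elastic-endpoint", "port": 443}],
    http_auth=awsauth,
    use_ssl=True,
    verify_certs=True,
    connection_class=RequestsHttpConnection,
)

def actions():
    # The source file is assumed to hold one JSON record per line.
    with open("categories.json") as f:
        for line in f:
            line = line.strip()
            if line:
                yield {"_index": "my_index", "_type": "_doc", "_source": json.loads(line)}

success, errors = helpers.bulk(es, actions(), raise_on_error=False)
print(success, errors)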

Python and Elasticsearch autocompletion

I am trying to work with Python Elasticsearch version 1.1.0, on the master branch. It seems it will create an index, but there are issues with retrieving autocomplete results when using a suggestion field.
Below are basic Python functions to create an index and then add a song to it; finally we query it through the curl request at the very bottom.
Unfortunately it fails with the error:
"reason" : "BroadcastShardOperationFailedException[[music][2] ]; nested: ElasticsearchException[failed to execute suggest]; nested: ElasticsearchException[Field [suggest] is not a completion suggest field]; "
} ]'
The functions I am using to create the index and add a song are below:
from elasticsearch import Elasticsearch

conn = Elasticsearch()

def mapping():
    return """{
    "song" : {
        "properties" : {
            "name" : { "type" : "string" },
            "suggest" : {
                "type" : "completion",
                "index_analyzer" : "simple",
                "search_analyzer" : "simple",
                "payloads" : true
            }
        }
    }
}"""

def createMapping():
    settings = mapping()
    conn.indices.create(index="music", body=settings)

def addSong():
    body = """{
    "name" : "Nevermind",
    "suggest" : {
        "input": [ "Nevermind", "Nirvana" ],
        "output": "Nirvana - Nevermind",
        "payload" : { "artistId" : 2321 },
        "weight" : 34
    }
}"""
    res = conn.index(body=body, index="music", doc_type="song", id=1)
Curl request:
curl -X POST 'localhost:9200/music/_suggest?pretty' -d '{
"song-suggest" : {
"text" : "n",
"completion" : {
"field" : "suggest"
}
}
}'
When you use the create index API, you have to wrap your mappings in a top-level "mappings" key:
def createMapping():
    settings = """{"mappings": %s}""" % mapping()
    conn.indices.create(index="music", body=settings)
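Equivalently, a sketch of the same fix using a plain Python dict instead of a string (assuming the same conn client and the mapping from the question):
def createMapping():
    settings = {
        "mappings": {
            "song": {
                "properties": {
                    "name": {"type": "string"},
                    "suggest": {
                        "type": "completion",
                        "index_analyzer": "simple",
                        "search_analyzer": "simple",
                        "payloads": True
                    }
                }
            }
        }
    }
    conn.indices.create(index="music", body=settings)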
