POST request with \n-delimited JSON in Python

I'm trying to use the bulk API from Elasticsearch, and I see that it can be done with the following request, which is special because what is given as "data" is not a single valid JSON document, but JSON documents delimited by \n.
curl -XPOST 'localhost:9200/_bulk?pretty' -H 'Content-Type: application/json' -d '
{ "index" : { "_index" : "test", "_type" : "type1", "_id" : "1" } }
{ "field1" : "value1" }
{ "delete" : { "_index" : "test", "_type" : "type1", "_id" : "2" } }
{ "create" : { "_index" : "test", "_type" : "type1", "_id" : "3" } }
{ "field1" : "value3" }
{ "update" : {"_id" : "1", "_type" : "type1", "_index" : "test"} }
{ "doc" : {"field2" : "value2"} }
'
My question is: how can I perform such a request from Python? The Elasticsearch authors suggest not pretty-printing the JSON, but I'm not sure what that means (see https://www.elastic.co/guide/en/elasticsearch/reference/current/docs-bulk.html).
I know that this is a valid Python request:
import requests
import json
data = json.dumps({"field":"value"})
r = requests.post("localhost:9200/_bulk?pretty", data=data)
But what do I do if the JSON is \n-delimited?

What you have here is really a set of individual JSON documents joined together with newlines. So you could do something like this:
data = [
{ "index" : { "_index" : "test", "_type" : "type1", "_id" : "1" } },
{ "field1" : "value1" },
{ "delete" : { "_index" : "test", "_type" : "type1", "_id" : "2" }, },
{ "create" : { "_index" : "test", "_type" : "type1", "_id" : "3" }, },
{ "field1" : "value3" },
{ "update" : {"_id" : "1", "_type" : "type1", "_index" : "test"} },
{ "doc" : {"field2" : "value2"} }
]
data_to_post = '\n'.join(json.dumps(d) for d in data)
r = requests.post("localhost:9200/_bulk?pretty", data=data_to_post)
However, as pointed out in the comments, the Elasticsearch Python client is likely to be more useful.
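If you go that route, the official elasticsearch package has a bulk helper that builds the newline-delimited body for you. Here is a minimal sketch (not from the original answer) that assumes a cluster on localhost:9200 and mirrors the documents from the question:
from elasticsearch import Elasticsearch, helpers

es = Elasticsearch()  # defaults to localhost:9200

# Each action carries its metadata (_op_type, _index, _type, _id) plus the
# document fields; helpers.bulk serializes them into the \n-delimited format.
actions = [
    {"_op_type": "index",  "_index": "test", "_type": "type1", "_id": "1", "field1": "value1"},
    {"_op_type": "delete", "_index": "test", "_type": "type1", "_id": "2"},
    {"_op_type": "create", "_index": "test", "_type": "type1", "_id": "3", "field1": "value3"},
    {"_op_type": "update", "_index": "test", "_type": "type1", "_id": "1", "doc": {"field2": "value2"}},
]

success, errors = helpers.bulk(es, actions, raise_on_error=False)
print(success, errors)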

As a follow-up to Daniel's answer above, I had to append an additional '\n' to the end of data_to_post and add a Content-Type: application/x-ndjson header to get it to work on Elasticsearch 6.3.
data_to_post = '\n'.join(json.dumps(d) for d in data) + "\n"
headers = {"Content-Type": "application/x-ndjson"}
r = requests.post("http://localhost:9200/_bulk?pretty", data=data_to_post, headers=headers)
Otherwise, I would receive the error:
"The bulk request must be terminated by a newline [\\n]"

You can use the Python ndjson library to do it.
https://pypi.org/project/ndjson/
It contains JSONEncoder and JSONDecoder classes for easy use with other libraries, such as requests:
import ndjson
import requests
response = requests.get('https://example.com/api/data')
items = response.json(cls=ndjson.Decoder)
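For posting bulk data (the direction the question asks about), a rough sketch could encode a list of dicts with ndjson and send it with requests. This assumes the same local cluster and the example documents from the question:
import ndjson
import requests

records = [
    {"index": {"_index": "test", "_type": "type1", "_id": "1"}},
    {"field1": "value1"},
]

# ndjson.dumps joins the records with newlines; the bulk endpoint also
# requires a trailing newline, so append one if it is missing.
body = ndjson.dumps(records)
if not body.endswith("\n"):
    body += "\n"

r = requests.post(
    "http://localhost:9200/_bulk?pretty",
    data=body,
    headers={"Content-Type": "application/x-ndjson"},
)
print(r.json())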

Related

Different/partial json outputs when read from each line in a file vs read from list variable within the code

Being new to Python, I am unable to resolve the following issue.
Below is my Python code, which returns different JSON outputs when the lines are read from a file compared to when the list is passed as a variable within the code.
Code with lines read from a file, which produces partial/corrupted output:
import requests
import json
import re

repo_name = "repo"
with open('file_list_new.txt') as file:
    for line in file:
        url = "http://fqdn/repository/{0}/{1}?describe=json".format(repo_name, line)
        response = requests.get(url)
        json_data = response.text
        data = json.loads(json_data)
        print(data)
        for size in data['items']:
            if size['name'] == 'Payload':
                value_size = size['value']['Size']
                if value_size != -1:
                    print(value_size)
Content of file_list_new.txt:
mysql.odbc/5.1.14
mysql.odbc/5.1.11
Corrupted output:
{
"parameters" : {
"path" : "/mysql.odbc/5.1.14\n",
"nexusUrl" : "http://fqdn"
},
"items" : [ {
"name" : "Exception during handler processing",
"type" : "topic",
"value" : "Exception during handler processing"
}, {
"name" : "java.lang.IllegalArgumentException",
"type" : "table",
"value" : {
"Message" : "Illegal character in path at index 40: Packages(Id='mysql.odbc',Version='5.1.14\n')"
}
}, {
"name" : "java.net.URISyntaxException",
"type" : "table",
"value" : {
"Message" : "Illegal character in path at index 40: Packages(Id='mysql.odbc',Version='5.1.14\n')"
}
}, {
"name" : "Request",
"type" : "topic",
"value" : "Request"
}, {
"name" : "Details",
"type" : "table",
"value" : {
"Action" : "GET",
"path" : "/mysql.odbc/5.1.14\n"
}
}, {
"name" : "Parameters",
"type" : "table",
"value" : {
"describe" : "json"
}
}, {
"name" : "Headers",
"type" : "table",
"value" : {
"Accept" : "*/*",
"User-Agent" : "python-requests/2.27.1",
"Connection" : "keep-alive",
"Host" : "fqdn",
"Accept-Encoding" : "gzip, deflate"
}
}, {
"name" : "Attributes",
"type" : "table",
"value" : {
"org.apache.shiro.subject.support.DefaultSubjectContext.SESSION_CREATION_ENABLED" : false,
"Key[type=org.sonatype.nexus.security.SecurityFilter, annotation=[none]].FILTERED" : true,
"authcAntiCsrf.FILTERED" : true,
"nx-authc.FILTERED" : true,
"org.apache.shiro.web.servlet.ShiroHttpServletRequest_SESSION_ID_URL_REWRITING_ENABLED" : true,
"javax.servlet.include.servlet_path" : "/repository/repo/mysql.odbc/5.1.14%0A",
"nx-anonymous.FILTERED" : true,
"org.sonatype.nexus.security.anonymous.AnonymousFilter.originalSubject" : "org.apache.shiro.web.subject.support.WebDelegatingSubject#33c429ba",
"nx-apikey-authc.FILTERED" : true
}
}, {
"name" : "Payload",
"type" : "table",
"value" : {
"Content-Type" : "",
"Size" : -1
}
} ]
}
Code with the list defined in a variable within the code:
import requests
import json
import re

repo_name = "repo"
file_list = ["mysql.odbc/5.1.11", "mysql.odbc/5.1.14"]
for i in file_list:
    url = "http://fqdn/repository/{0}/{1}?describe=json".format(repo_name, i)
    response = requests.get(url)
    json_data = response.text
    data = json.loads(json_data)
    for size in data['items']:
        if size['name'] == 'Payload':
            value_size = size['value']['Size']
            if value_size != -1:
                print(value_size)
Expected output with the list passed within the code:
{
"parameters" : {
"path" : "/mysql.odbc/5.1.14",
"nexusUrl" : "http://fqdn"
},
"items" : [ {
"name" : "Request",
"type" : "topic",
"value" : "Request"
}, {
"name" : "Details",
"type" : "table",
"value" : {
"Action" : "GET",
"path" : "/mysql.odbc/5.1.14"
}
}, {
"name" : "Parameters",
"type" : "table",
"value" : {
"describe" : "json"
}
}, {
"name" : "Headers",
"type" : "table",
"value" : {
"Accept" : "*/*",
"User-Agent" : "python-requests/2.27.1",
"Connection" : "keep-alive",
"Host" : "fqdn",
"Accept-Encoding" : "gzip, deflate"
}
}, {
"name" : "Attributes",
"type" : "table",
"value" : {
"org.apache.shiro.subject.support.DefaultSubjectContext.SESSION_CREATION_ENABLED" : false,
"Key[type=org.sonatype.nexus.security.SecurityFilter, annotation=[none]].FILTERED" : true,
"authcAntiCsrf.FILTERED" : true,
"nx-authc.FILTERED" : true,
"org.apache.shiro.web.servlet.ShiroHttpServletRequest_SESSION_ID_URL_REWRITING_ENABLED" : true,
"javax.servlet.include.servlet_path" : "/repository/repo/mysql.odbc/5.1.14",
"nx-anonymous.FILTERED" : true,
"org.sonatype.nexus.security.anonymous.AnonymousFilter.originalSubject" : "org.apache.shiro.web.subject.support.WebDelegatingSubject#1433a6c9",
"nx-apikey-authc.FILTERED" : true
}
}, {
"name" : "Payload",
"type" : "table",
"value" : {
"Content-Type" : "",
"Size" : -1
}
}, {
"name" : "Response",
"type" : "topic",
"value" : "Response"
}, {
"name" : "Status",
"type" : "table",
"value" : {
"Code" : 200,
"Message" : ""
}
}, {
"name" : "Headers",
"type" : "table",
"value" : {
"ETag" : "\"df4f013db18103f1b9541cdcd6ba8632\"",
"Content-Disposition" : "attachment; filename=mysql.odbc.5.1.14.nupkg",
"Last-Modified" : "Tue, 13 Oct 2015 03:54:48 GMT"
}
}, {
"name" : "Attributes",
"type" : "table",
"value" : { }
}, {
"name" : "Payload",
"type" : "table",
"value" : {
"Content-Type" : "application/zip",
"Size" : 3369
}
} ]
}
I am not sure if I am doing something wrong or missing something simple.
Any help is much appreciated.
It looks like the newlines are being passed into the URL string:
"Message" : "Illegal character in path at index 40: Packages(Id='mysql.odbc',Version='5.1.14\n')"
You can do something like this to remove them
with open('file_list_new.txt') as file:
    for line in file:
        url = "http://fqdn/repository/{0}/{1}?describe=json".format(repo_name, line.strip())

Write queryDSL to find unique error messages from sys log data?

Is there a way to configure the elasticsearch analyzer so that it is possible to get unique error messages in different scenarios?
1."...July 2020 23:00:00.674z... same message....."
2. slight changes in the string :
message1: "....message_details.. (unknown error 20004)
message2: "....message_details.. (unknown error 278945)
OR
message1:"....a::::: message_details ...."
message2:"....a:f23ed:fff:ff:: message_details ...."
The above two messages are the same apart from the character difference.
Here is the query:
GET log_stash_2020.06.16/_search
{
  "query": {
    "bool": {
      "must": [
        { "match_phrase": { "message": "Error" } },
        { "match_phrase": { "type": "lab_id" } }
      ]
    }
  },
  "aggs": {
    "log_message": {
      "significant_text": {
        "field": "message",
        "filter_duplicate_text": "true"
      }
    }
  },
  "size": 1000
}
I have added sample log documents below:
{
"_index" : "logstash_2020.06.16",
"_type" : "doc",
"_id" : "################",
"_score" : 1.0,
"_source" : {
"logsource" : "router_id",
"timestamp" : "Jun 15 20:00:00",
"program" : "some_program",
"host" : "#############",
"priority" : "27",
"#timestamp" : "2020-06-16T00:00:01.020Z",
"type" : "lab_id",
"pid" : "####",
"message" : ": ############### send failed with error: ENOENT -- Item not found (No error: 0)",
"#version" : "1"
}
}
{
"_index" : "logstash_2020.06.16",
"_type" : "doc",
"_id" : "################",
"_score" : 1.0,
"_source" : {
"host" : "################",
"#timestamp" : "2020-06-16T00:00:02.274Z",
"type" : "####",
"tags" : [
"_grokparsefailure"
],
"message" : "################:Jun 15 20:00:18.908 EDT: mediasvr[2546]: %MEDIASVR-MEDIASVR-4-PARTITION_USAGE_ALERT : High disk usage alert : host ##### exceeded 100% \n",
"#version" : "1"
}
}
Is there a way to do it in Python (if Elasticsearch does not have the above-mentioned functionality)?
You can use the Elasticsearch Python client like so:
from elasticsearch import Elasticsearch
es = Elasticsearch(...)
resp = es.search(index="log_stash_2020.06.16", body={<dsl query>})
print(resp)
where {<dsl query>} is whatever query you want to run, like the one you gave in the question.
<disclosure: I'm the maintainer of the Elasticsearch client and employed by Elastic>

Celery Result type for ElasticSearch

I'm currently exploring Celery for my work and I'm trying to set up an Elasticsearch result backend. Is there any way to send the resulting value as a dictionary/JSON rather than as text, so that results are shown correctly in Elasticsearch and the nested type can be used?
Automatic mapping created by Celery:
{
"celery" : {
"mappings" : {
"backend" : {
"properties" : {
"#timestamp" : {
"type" : "date"
},
"result" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
}
}
}
}
}
}
}
I've tried to create my own mapping with a nested field, but it resulted in elasticsearch.exceptions.RequestError: RequestError(400, 'mapper_parsing_exception', 'object mapping for [result] tried to parse field [result] as object, but found a concrete value')
UPDATE
The result is already encoded as JSON, and inside the Elasticsearch wrapper the JSON string is saved inside a dictionary. Adding json.loads(result) as a quick fix actually helps.
After the quick fix, a new mapping appeared:
{
"celery" : {
"mappings" : {
"backend" : {
"properties" : {
"#timestamp" : {
"type" : "date"
},
"result" : {
"properties" : {
"date_done" : {
"type" : "date"
},
"result" : {
"type" : "long"
},
"status" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
}
},
"task_id" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
}
}
}
}
}
}
}
}
}
Updated Kibana view:
Is there any way to disable serialization of results in Celery?
I could submit a pull request that unpacks the JSON just for Elasticsearch, but it looks like a hack.
Since v4.0 the default result_serializer has been json, so you should have results in JSON format anyway. Maybe your configuration uses something else? In that case I suggest you remove that setting (if you use Celery >= 4.0) and you should get results in JSON format. I prefer msgpack, but then again I do not use Elasticsearch for Celery results...
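If the serializer has been overridden somewhere, it can be set back explicitly. A hedged sketch of the relevant configuration (the backend URL follows the documented elasticsearch://host:port/index/doc_type pattern; the host and index names here are placeholders):
from celery import Celery

app = Celery("tasks", backend="elasticsearch://localhost:9200/celery/backend")

# json is the default since Celery 4.0, but forcing it rules out a stray
# pickle/yaml setting elsewhere in the configuration.
app.conf.result_serializer = "json"
app.conf.task_serializer = "json"
app.conf.accept_content = ["json"]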

Elastic Search and AWS python

I am working on AWS Elasticsearch using Python. I have a JSON file with 3 fields ("cat1", "Cat2", "cat3"), and each row is separated by \n.
Example: cat1: food, cat2: wine, cat3: lunch, etc.
from requests_aws4auth import AWS4Auth
import boto3
import requests
payload = {
    "settings": {
        "number_of_shards": 10,
        "number_of_replicas": 5
    },
    "mappings": {
        "Categoryall": {
            "properties": {
                "cat1": { "type": "string" },
                "Cat2": { "type": "string" },
                "cat3": { "type": "string" }
            }
        }
    }
}
r = requests.put(url, auth=awsauth, json=payload)
I created the schema/mapping for the index as shown above, but I don't know how to populate the index.
I am thinking of looping over the JSON file and calling a POST request to insert each record, but I don't have a clear idea of how to proceed.
I want to create the index and bulk upload this file into it. Any suggestion would be appreciated.
Take a look at Elasticsearch Bulk API.
Basically, you need to create a bulk request body and post it to your "https://{elastic-endpoint}/_bulk" URL.
The following example shows a bulk request to insert 3 JSON records into an index called "my_index":
{ "index" : { "_index" : "my_index", "_type" : "_doc", "_id" : "1" } }
{ "cat1" : "food 1", "cat2": "wine 1", "cat3": "lunch 1" }
{ "index" : { "_index" : "my_index", "_type" : "_doc", "_id" : "2" } }
{ "cat1" : "food 2", "cat2": "wine 2", "cat3": "lunch 2" }
{ "index" : { "_index" : "my_index", "_type" : "_doc", "_id" : "3" } }
{ "cat1" : "food 3", "cat2": "wine 3", "cat3": "lunch 3" }
where each JSON record is represented by 2 JSON objects: an action/metadata line followed by the document itself.
So if you write your bulk request body into a file called post-data.txt, then you can post it from Python with something like this:
with open('post-data.txt', 'rb') as payload:
    r = requests.post('https://your-elastic-endpoint/_bulk', auth=awsauth,
                      data=payload)  # ... add more params as needed
Alternatively, you can try Python elasticsearch bulk helpers.
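As a sketch of that route (not part of the original answer): the client can sign requests with the same AWS4Auth credentials via RequestsHttpConnection, and helpers.bulk will build the newline-delimited body from a generator of actions. The host, credentials, file name and index name below are placeholders:
import json
from requests_aws4auth import AWS4Auth
from elasticsearch import Elasticsearch, RequestsHttpConnection, helpers

awsauth = AWS4Auth("ACCESS_KEY", "SECRET_KEY", "us-east-1", "es")  # placeholders

es = Elasticsearch(
    hosts=[{"host": "your-elastic-endpoint", "port": 443}],
    http_auth=awsauth,
    use_ssl=True,
    verify_certs=True,
    connection_class=RequestsHttpConnection,
)

def actions():
    # The source file is assumed to hold one JSON record per line.
    with open("categories.json") as f:
        for line in f:
            line = line.strip()
            if line:
                yield {"_index": "my_index", "_type": "_doc", "_source": json.loads(line)}

success, errors = helpers.bulk(es, actions(), raise_on_error=False)
print(success, errors)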

Python and Elasticsearch autocompletion

I am trying to work with Python Elasticsearch version 1.1.0, on the master branch. It seems it will create an index, but there are issues with retrieving autocomplete results when using a suggestion field.
Below are basic Python functions to create an index and then add a song to it; finally we query it through the curl request at the very bottom.
Unfortunately it fails with the error:
"reason" : "BroadcastShardOperationFailedException[[music][2] ]; nested: ElasticsearchException[failed to execute suggest]; nested: ElasticsearchException[Field [suggest] is not a completion suggest field]; "
} ]'
The functions I am using to create the index and add a song are below:
from elasticsearch import Elasticsearch

conn = Elasticsearch()

def mapping():
    return """{
    "song" : {
        "properties" : {
            "name" : { "type" : "string" },
            "suggest" : {
                "type" : "completion",
                "index_analyzer" : "simple",
                "search_analyzer" : "simple",
                "payloads" : true
            }
        }
    }
}"""

def createMapping():
    settings = mapping()
    conn.indices.create(index="music", body=settings)

def addSong():
    body = """{
    "name" : "Nevermind",
    "suggest" : {
        "input": [ "Nevermind", "Nirvana" ],
        "output": "Nirvana - Nevermind",
        "payload" : { "artistId" : 2321 },
        "weight" : 34
    }
}"""
    res = conn.index(body=body, index="music", doc_type="song", id=1)
Curl request:
curl -X POST 'localhost:9200/music/_suggest?pretty' -d '{
"song-suggest" : {
"text" : "n",
"completion" : {
"field" : "suggest"
}
}
}'
When you use the create index API, you have to wrap your mappings in a top-level "mappings" key:
def createMapping():
    settings = """{"mappings": %s}""" % mapping()
    conn.indices.create(index="music", body=settings)
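Equivalently, a sketch of the same fix using a plain Python dict instead of a string (assuming the same conn client and the mapping from the question):
def createMapping():
    settings = {
        "mappings": {
            "song": {
                "properties": {
                    "name": {"type": "string"},
                    "suggest": {
                        "type": "completion",
                        "index_analyzer": "simple",
                        "search_analyzer": "simple",
                        "payloads": True
                    }
                }
            }
        }
    }
    conn.indices.create(index="music", body=settings)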
