Unable to see whole PDF content after indexing a PDF file in ES - Python

Below is my code to index a PDF URL in Elasticsearch:
import base64

import requests
from elasticsearch import Elasticsearch

es = Elasticsearch()

# Create the ingest pipeline that extracts text from the base64 attachment
body = {
    "description": "Extract attachment information",
    "processors": [
        {
            "attachment": {
                "field": "data"
            }
        }
    ]
}
es.index(index='_ingest', doc_type='pipeline', id='attachment', body=body)

# Download the PDF and base64-encode it for the attachment processor
url = 'https://pubs.vmware.com/nsx-63/topic/com.vmware.ICbase/PDF/nsx_63_cross_vc_install.pdf'
response = requests.get(url)
data = base64.b64encode(response.content).decode('ascii')

# Index the document through the pipeline, then fetch it back without the raw data
result2 = es.index(index='my_index', doc_type='my_type', pipeline='attachment',
                   body={'data': data})
doc = es.get(index='my_index', doc_type='my_type', id=result2['_id'],
             _source_exclude=['data'])
print(doc['_source']['attachment']['content'])
The last line prints the contents of the PDF only up to page 63 out of 126.
Do I need to change a setting somewhere? I already tried increasing the console output limit; that didn't help.
Please provide pointers on this.

By default, the attachment processor extracts at most 100,000 characters.
You can change this limit in the pipeline definition by setting indexed_chars.
See https://www.elastic.co/guide/en/elasticsearch/plugins/current/using-ingest-attachment.html
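For example, the pipeline from the question could be redefined with the limit raised; per the linked docs, an indexed_chars of -1 removes the limit entirely (a sketch reusing the question's es client and pipeline-creation call):

body = {
    "description": "Extract attachment information",
    "processors": [
        {
            "attachment": {
                "field": "data",
                "indexed_chars": -1  # -1 = no limit; any positive integer sets a new cap
            }
        }
    ]
}
es.index(index='_ingest', doc_type='pipeline', id='attachment', body=body)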


Gravity form API with python

The documentation of the API is here, and I am trying to implement this example in Python:
//retrieve entries created on a specific day (use the date_created field)
//this example returns entries created on September 10, 2019
https://localhost/wp-json/gf/v2/entries?search={"field_filters": [{"key":"date_created","value":"09/10/2019","operator":"is"}]}
But when I try to do the same in Python with the following code, I get an error:

import json

import oauthlib
from requests_oauthlib import OAuth1Session

consumer_key = ""
client_secret = ""
session = OAuth1Session(consumer_key, client_secret=client_secret,
                        signature_type=oauthlib.oauth1.SIGNATURE_TYPE_QUERY)

url = 'https://localhost/wp-json/gf/v2/entries?search={"field_filters": [{"key":"date_created","value":"09/01/2023","operator":"is"}]}'
r = session.get(url)
print(r.content)
The error message is :
ValueError: Error trying to decode a non urlencoded string. Found invalid characters: {']', '['} in the string: 'search=%7B%22field_filters%22:%20[%7B%22key%22:%22date_created%22,%22value%22:%2209/01/2023%22,%22operator%22:%22is%22%7D]%7D'. Please ensure the request/response body is x-www-form-urlencoded.
One solution is to parameterize the URL:

import requests
import json

url = 'https://localhost/wp-json/gf/v2/entries'
params = {
    "search": {"field_filters": [{"key": "date_created", "value": "09/01/2023", "operator": "is"}]}
}
headers = {'Content-type': 'application/json'}
response = session.get(url, params=params, headers=headers)
print(response.json())
But the retrieved entries are not filtered by the specified date.
The official documentation gives the date in the format "09/01/2023", but in my dataset the format is "2023-01-10 19:16:59".
Do I have to transform the format? I tried converting the date:
date_created = "09/01/2023"
date_created = datetime.strptime(date_created, "%d/%m/%Y").strftime("%Y-%m-%d %H:%M:%S")
What alternative solutions can I test?
What if you use the urllib.parse.urlencode function? Your code would then look like:

import json
import urllib.parse

import oauthlib
from requests_oauthlib import OAuth1Session

consumer_key = ""
client_secret = ""
session = OAuth1Session(consumer_key, client_secret=client_secret,
                        signature_type=oauthlib.oauth1.SIGNATURE_TYPE_QUERY)

params = {
    "search": {"field_filters": [{"key": "date_created", "value": "09/01/2023", "operator": "is"}]}
}
encoded_params = urllib.parse.urlencode(params)
url = f'https://localhost/wp-json/gf/v2/entries?{encoded_params}'
r = session.get(url)
print(r.content)

Hope that helps.
I had the same problem and found a solution with this code:

import json
import urllib.parse

params = {
    'search': json.dumps({
        'field_filters': [
            {'key': 'date_created', 'value': '2023-01-01', 'operator': 'is'}
        ],
        'mode': 'all'
    })
}
encoded_params = urllib.parse.urlencode(params, quote_via=urllib.parse.quote)
url = 'http://localhost/depot_git/wp-json/gf/v2/forms/1/entries?' + encoded_params + '&paging[page_size]=999999999'  # page size forced manually

I'm not really sure why this works, as I'm an absolute beginner with Python, but I found that you need double quotes (") in the URL instead of single quotes ('), so the solution by William Castrillon wasn't enough.
As for the date format, Gravity Forms seems to understand DD/MM/YYYY. It doesn't need a time either.
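Putting the two answers together, a minimal end-to-end sketch (assuming the same OAuth1Session credentials as in the question; json.dumps produces the double-quoted JSON the API expects, and quote_via=urllib.parse.quote percent-encodes the brackets that tripped up oauthlib's query-string parser):

import json
import urllib.parse

import oauthlib
from requests_oauthlib import OAuth1Session

consumer_key = ""
client_secret = ""
session = OAuth1Session(consumer_key, client_secret=client_secret,
                        signature_type=oauthlib.oauth1.SIGNATURE_TYPE_QUERY)

# Serialize the search filter as real JSON (double quotes), then percent-encode it
search = json.dumps({
    'field_filters': [
        {'key': 'date_created', 'value': '2023-01-01', 'operator': 'is'}
    ],
    'mode': 'all'
})
encoded = urllib.parse.urlencode({'search': search}, quote_via=urllib.parse.quote)

url = f'https://localhost/wp-json/gf/v2/entries?{encoded}'
r = session.get(url)
print(r.json())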

How to print JSON info with Python?

I have a JSON feed (url = http://open.data.amsterdam.nl/ivv/parkeren/locaties.json) and I want to print every 'title', 'adres', and 'postcode'. How can I do that?
I want to print them one under the other, like this:
title
adres
postcode
title
adres
postcode
I hope you can help me with this.
import requests

url = "http://open.data.amsterdam.nl/ivv/parkeren/locaties.json"
search = requests.get(url).json()
print(search['title'])
print(search['adres'])
print(search['postcode'])
Using print(json.dumps(search, indent=4)) you can see that the structure is:
{
    "parkeerlocaties": [
        {
            "parkeerlocatie": {
                "title": "Fietsenstalling Tolhuisplein",
                "Locatie": "{\"type\":\"Point\",\"coordinates\":[4.9032801,52.3824545]}",
                ...
            }
        },
        {
            "parkeerlocatie": {
                "title": "Fietsenstalling Paradiso",
                "Locatie": "{\"type\":\"Point\",\"coordinates\":[4.8833735,52.3621851]}",
                ...
            }
        },
So to access the inner properties, you need to follow the JSON path:

import requests

url = 'http://open.data.amsterdam.nl/ivv/parkeren/locaties.json'
search = requests.get(url).json()

# Each list item wraps the actual record under the 'parkeerlocatie' key
for parkeerlocatie in search["parkeerlocaties"]:
    content = parkeerlocatie['parkeerlocatie']
    print(content['title'])
    print(content['adres'])
    print(content['postcode'])
    print()

How to match an exact word in a json soup?

I am parsing Patient Metadata scraped from a URL, and I am trying to access the 'PatientID' field. However, there is also an 'OtherPatientIDs' field, which my search also grabs.
I have tried looking into regular expressions, but I am unclear on how to match an EXACT string or how to incorporate one into my code.
So at the moment, I have:

response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")
PatientID = "PatientID"
lines = soup.decode('utf8').split("\n")
for line in lines:
    if "PatientID" in line:
        PatientID = line.split(':')[1].split('\"')[1].split('\"')[0]
        print(PatientID)

This successfully finds the values of both the PatientID AND OtherPatientIDs fields. How do I specify that I only want the PatientID field?
EDIT:
I was asked to give an example of what I get with response.text; it's of the form:

{
    "ID" : "shqowihdojcoughwoeh",
    "LastUpdate" : "20190507",
    "MainTags" : {
        "OtherPatientIDs" : "0304992098",
        "PatientBirthDate" : "29/04/1803",
        "PatientID" : "92879837",
        "PatientName" : "LASTNAME^FIRSTNAME"
    },
    "Type" : "Patient"
}
Why not use the json library instead?
import json
import requests
response = requests.get(url)
data = json.loads(response.text)
print(data['MainTags']['PatientID'])
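If you do want plain string matching instead, anchoring on the quoted key name distinguishes the two fields, because "OtherPatientIDs" does not contain the substring "PatientID" once the opening quote is included. A regular-expression sketch under that assumption, reusing response.text from above:

import re

# The leading quote ensures we match the key "PatientID" exactly,
# not the tail of "OtherPatientIDs".
match = re.search(r'"PatientID"\s*:\s*"([^"]*)"', response.text)
if match:
    print(match.group(1))  # 92879837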

python gspread updating multiple cells from response body

I am using this Python script to take a response from the Progresso API:
http://docs.progresso.apiary.io/#reference/behaviour/behaviour-events-collection/get-behaviour-events
from urllib2 import Request, urlopen
import smtplib
import gspread
from oauth2client.service_account import ServiceAccountCredentials

headers = {
    'Authorization': 'Bearer [CURRENT_TOKEN]'
}
request = Request('https://private-anon-ae5edf57e7-progresso.apiary-mock.com/BMEvents/?Behaviour=new',
                  headers=headers)
response_body = urlopen(request).read()

scope = ['https://spreadsheets.google.com/feeds']
credentials = ServiceAccountCredentials.from_json_keyfile_name('ProgressoAPI-2f6ecaa6635c.json', scope)
gc = gspread.authorize(credentials)

wks = gc.open("Progresso Test").sheet1
wks.clear()
cell_list = wks.range('A1:H20')
for cell in cell_list:
    cell.value = response_body
wks.update_cells(cell_list)
I know that cell.value = response_body is wrong, and I don't know how to get it right - I am stuck.
The response appears in every cell like this:
"{
""BehaviourEntryId"": 13798177,
""LearnerId"": 245277,
""LearnerCode"": ""2009-0080"",
""RegGroup"": ""U6-RWE"",
""Behaviour"": ""Negative"",
""IncidentDate"": ""2017-02-07"",
""Subject"": ""BE"",
""Location"": ""CLS"",
""Published"": ""Yes"",
""Creator"": ""DhDr"",
""Editor"": null,
""Assignee"": ""DiRo"",
""Status"": ""Completed"",
""Details"": [
{
""Category"": ""CL"",
""Type"": ""CLatt"",
""Severity"": ""S2"",
""point"": 0
},
{
""Category"": ""CL"",
""Type"": ""CLBEH"",
""Severity"": ""S2"",
""point"": 2
}
],
""Comments"": [
{
""BehaviourEntryCommentId"": 5648278,
""Confidential"": true,
""Comment"": ""Asked to go to the toilet and went to the one furthest away just to waste time.""
},
{
""BehaviourEntryCommentId"": 5648279,
""Confidential"": false,
""Comment"": ""Spat gum out on floor""
},
{
""BehaviourEntryCommentId"": 5648280,
""Confidential"": false,
""Comment"": ""Was rude to memeber of Staff""
}
],
""Actions"": [
""HTO"",
""ISO""
]
}"
How do I split the text across the cell range the way I want and bulk update it?
If you mean something like two columns, with one cell holding "BehaviourEntryId" and the cell next to it holding 13798177, you can try something like this:
import json

response = json.loads(response_body)  # decode the JSON response string, returns a dict
response_pairs = list(response.items())  # note: items() is a method and needs parentheses

for i in range(1, len(response_pairs) + 1):  # iterate over the pairs, not the raw string
    current_key, current_value = response_pairs[i - 1]
    wks.update_acell('A{}'.format(i), current_key)
    wks.update_acell('B{}'.format(i), str(current_value))  # stringify nested lists/dicts
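Note that update_acell issues one API request per cell, which is slow for many pairs. A batched variant using the same wks.range/update_cells pattern as in the question (a sketch; nested values such as Details are stringified for simplicity):

import json

response = json.loads(response_body)
pairs = list(response.items())

# Fetch one block of cells (rows 1..N, columns A and B), fill it in memory,
# then push everything back in a single update_cells call.
cell_list = wks.range(1, 1, len(pairs), 2)
for i, (key, value) in enumerate(pairs):
    cell_list[2 * i].value = key             # column A: the JSON key
    cell_list[2 * i + 1].value = str(value)  # column B: the value, stringified
wks.update_cells(cell_list)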

Read a JSON from a URL using Python 3

I need help reading a JSON document from a URL, which contains the JSON below:
{
    "totalItems": 2,
    "#href": "/classes/dsxplan:Program",
    "#id": "dsxplan:Program",
    "#mask": "dsplan:MVMask.WorkPackage.Complex",
    "#type": "Collection",
    "#code": 200,
    "#context": {
        "dsxplan": "xplan",
        "dsplan": "plan",
        "dspol": "pol",
        "image": {
            "#id": "dspol:image",
            "#type": "#id"
        },
        "dskern": "kern"
    },
    "member": [
        {
            "dsplan:actualType": {
                "#href": "/resources/dsxplan:Program",
                "#id": "dsxplan:Program",
                "#mask": "dskern:Mask.Default",
                "image": "iconProgram.png"
            },
            "dskern:owner": {
                "#href": "/resources/dskern:Person.Creator",
                "#id": "dskern:Person.Creator",
                "#mask": "dskern:MVMask.Person.Complex",
                "dsplan:actualType": {
                    "#href": "/resources/foaf:Person",
                    "#id": "foaf:Person",
                    "#mask": "dskern:Mask.Default"
                }
            },
            "dspol:modificationDate": "2017-09-08T17:54:36.786Z",
            "#href": "/resources/dsxplan:DSLCProgram.R-399",
            "#id": "dsxplan:DSLCProgram.R-399",
            "#mask": "dsplan:MVMask.WorkPackage.Complex",
            "#etag": "7412df19-1dde-4245-b40b-5dd86dbbe3f1"
        },
        {
            "dsplan:actualType": {
                "#href": "/resources/dsxplan:Program",
                "#id": "dsxplan:Program",
                "#mask": "dskern:Mask.Default",
                "image": "iconProgram.png"
            },
            "dskern:owner": {
                "#href": "/resources/dskern:Person.Creator",
                "#id": "dskern:Person.Creator",
                "#mask": "dskern:MVMask.Person.Complex",
                "dsplan:actualType": {
                    "#href": "/resources/foaf:Person",
                    "#id": "foaf:Person",
                    "#mask": "dskern:Mask.Default"
                }
            },
            "dspol:modificationDate": "2017-09-08T17:54:36.786Z",
            "#href": "/resources/dsxplan:xComModel2017program.R-394",
            "#id": "dsxplan:xComModel2017program.R-394",
            "#mask": "dsplan:MVMask.WorkPackage.Complex",
            "#etag": "7412df19-1dde-4245-b40b-5dd86dbbe3f1"
        }
    ]
}
I just need to read this JSON from the link provided. I tried the code below:

import json
import urllib.request

request = urllib.request.Request("https://dummy_link")
response = urllib.request.urlopen(request)
input = response.read().decode('utf-8')
json.loads(input)
This code throws the following error:
"JSONDecodeError: Expecting value: line 9 column 1 (char 12)"
Could you please help me get this right? I really appreciate the help!
You could use the Requests library, which is simpler than urllib.
For instance:

import requests

r = requests.get('https://dummy_link')
obj = r.json()
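If the JSONDecodeError persists, the server may be returning something other than the JSON you expect (an HTML error page, for example). A small check before parsing (a sketch using standard requests calls):

import requests

r = requests.get('https://dummy_link')
r.raise_for_status()                   # fail loudly on HTTP errors instead of parsing an error page
print(r.headers.get('Content-Type'))   # confirm the server actually returned JSON
obj = r.json()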
EDIT
If you want to use urllib, you can do it as below:

import urllib.request
import json

with urllib.request.urlopen("https://dummy_link") as f:
    content = f.read()

obj = json.loads(content)

There is no need to convert the binary content to a unicode string.
There is an urllib howto in the official documentation.
