Load data from MongoDB to Elasticsearch through Python

I have some JSON data loaded in MongoDB, e.g. doc1 = {"id": 1, "name": "x1"}, doc2 = {"id": 2, "name": "x2"}, doc3 = {"id": 3, "name": "x3"}. Now I want to import this data from MongoDB into Elasticsearch. I wrote this piece of code:
from collections import deque
from pymongo import MongoClient
from elasticsearch import Elasticsearch
from elasticsearch.helpers import parallel_bulk
from tqdm import tqdm

mgclient = MongoClient()
db = mgclient['light-test']
col = db['test']

es1 = Elasticsearch()
print("Connected", es1.info())
es1.indices.create(index='light-test', ignore=400)

# Pull from mongo and dump into ES using bulk API
actions = []
for data in tqdm(col.find(), total=col.count()):
    data.pop('_id')
    action = {
        "_index": 'light-test',
        "_type": 'test',
        "_source": data
    }
    actions.append(action)
    print("complete")

    # Dump x number of objects at a time
    if len(actions) >= 100:
        deque(parallel_bulk(es1, actions), maxlen=0)
        actions = []
print("done")

a = es1.search(index='light-test', body={
    'query': {
        'match_all': {}
    }
})
print(a)
The problem is in the query result: the hits array comes back empty, whereas it should have returned the indexed documents.
How can I import the data from MongoDB into Elasticsearch?
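One likely cause, judging from the code above: the bulk flush only runs once 100 actions have accumulated, so with just a handful of documents nothing is ever sent to Elasticsearch, and the search can also run before the index has refreshed. A minimal sketch of flushing whatever is left and forcing a refresh before searching (using elasticsearch-py's helpers; the index name follows the question):

from collections import deque
from elasticsearch.helpers import parallel_bulk

# ... after the loop that builds `actions` ...
if actions:
    # flush the final batch even if it is smaller than 100 actions
    deque(parallel_bulk(es1, actions), maxlen=0)
    actions = []

# make the newly indexed documents visible to search immediately
es1.indices.refresh(index='light-test')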

from flask import Flask, jsonify
from pymongo import MongoClient
from elasticsearch import Elasticsearch
from tqdm import tqdm
import ssl

app = Flask(__name__)

MONGO_URL = '...'
mgclient = MongoClient(MONGO_URL, ssl=True, ssl_cert_reqs=ssl.CERT_NONE)
db = mgclient['light']
col = db['task']

doc1 = {...}
doc2 = {...}
doc3 = {...}
post_id = col.insert_many([doc1, doc2, doc3])
print(col.count())

es1 = Elasticsearch(...)
ESinfo = es1.info()

# Pull from mongo and dump into ES using the bulk API
actions = []
for data in tqdm(col.find(), total=col.count()):
    data.pop('_id')
    action = {
        "index": {
            "_index": 'light',
            "_type": 'task',
        }
    }
    actions.append(action)
    actions.append(data)

#delete = es1.indices.delete(index='light')
request_body = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    }
}
es1.indices.create(index='light', body=request_body, ignore=400)
res = es1.bulk(index='light', body=actions, refresh=True)

result = col.find()
names = []
for obj in col.find():
    name = obj['name']
    names.append(name)
print(names)

@app.route('/query')
def Query():
    a = es1.search(index='light', body={
        'query': {
            'match': {
                'name': '...',
            }
        }
    })
    return jsonify(query=a)

if __name__ == "__main__":
    app.run(host='0.0.0.0', port=1024)
This has helped. thank you :)

Related

AttributeError: Could not locate column in row for column 'to_dict'

I use Flask with the DataTables.js plugin via Ajax. I am getting the following error. It looks like it is thrown at the end of my view (see the code at the bottom). I am not able to locate the cause of the issue; any thoughts?
Error: (thrown from view shown below in code)
'data': [it.to_dict() for it in query],
AttributeError: Could not locate column in row for column 'to_dict'
This is my model:
class DiskSpaceView(Database.db.Model, UserMixin):
    __tablename__ = 'view_space'
    __table_args__ = {"schema": "config"}

    column_not_exist_in_db = Database.db.Column(Database.db.Integer, primary_key=True)  # added just for the sake of this error, don't add in the DB
    object_type = Database.db.Column(Database.db.String)
    schemaname = Database.db.Column(Database.db.String)
    tablename = Database.db.Column(Database.db.String)
    indexname = Database.db.Column(Database.db.String)
    object_size_bytes = Database.db.Column(Database.db.String)
    object_size = Database.db.Column(Database.db.String)

    def to_dict(self):
        return {
            'column_not_exist_in_db': random.getrandbits(64),
            'object_type': self.object_type,
            'schemaname': self.schemaname,
            'tablename': self.tablename,
            'indexname': self.indexname,
            'object_size_bytes': self.object_size_bytes,
            'object_size': self.object_size
        }
This is my Ajax call:
$(document).ready(function () {
    var table = $("#dataTbDiskSpace").DataTable({
        scrollY: "600px",
        scrollCollapse: true,
        paging: true,
        ajax: "/api/datadiskspace",
        serverSide: true,
        pageLength: 200,
        success: function (ajax) {
            console.log("test.js >> DataTable ajax >>>" + JSON.stringify(ajax));
        },
        columns: [
            { data: "object_type" },
            { data: "schemaname" },
            { data: "tablename" },
            { data: "indexname" },
            { data: "object_size_bytes" },
            { data: "object_size" }
        ],
        columnDefs: [
            {
                defaultContent: "-",
                targets: "_all",
            },
        ],
    });
});
This is my view:
@userviews.route('/api/datadiskspace')
def datadiskspace():
    query = DiskSpaceView.query.with_entities(
        DiskSpaceView.object_type, DiskSpaceView.schemaname, DiskSpaceView.tablename,
        DiskSpaceView.indexname, DiskSpaceView.object_size_bytes, DiskSpaceView.object_size)
    print(query)

    # search filter
    search = request.args.get('search[value]')
    if search:
        query = query.filter(DatabasePostgres.db.or_(
            DiskSpaceView.tablename.like(f'%{search}%'),
            DiskSpaceView.indexname.like(f'%{search}%')
        ))
    total_filtered = query.count()

    # sorting
    order = []
    i = 0
    while True:
        col_index = request.args.get(f'order[{i}][column]')
        if col_index is None:
            break
        col_name = request.args.get(f'columns[{col_index}][data]')
        if col_name not in ['object_type', 'schemaname', 'tablename', 'indexname', 'object_size_bytes', 'object_size']:
            col_name = 'schemaname'
        descending = request.args.get(f'order[{i}][dir]') == 'desc'
        col = getattr(DiskSpaceView, col_name)
        if descending:
            col = col.desc()
        order.append(col)
        i += 1
    if order:
        query = query.order_by(*order)

    # pagination
    start = request.args.get('start', type=int)
    length = request.args.get('length', type=int)
    query = query.offset(start).limit(length)

    # debug
    counter = 1
    for i in query:
        print(i)
        print(counter)
        counter += 1

    # response
    return {
        'data': [it.to_dict() for it in query],
        'recordsFiltered': total_filtered,
        'recordsTotal': DiskSpaceView.query.count(),
        'draw': request.args.get('draw', type=int)
    }
EDIT:
I think it may have something to do with pagination: if I set, for example, pageLength: 200 in my Ajax call, then when I print in my view (see the debug block) it prints 200 rows. When I change to pageLength: 50, my print loop prints 50 rows. I don't understand what the cause is, but this seems pretty interesting.
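A hedged note on the likely cause: with_entities() makes the query return plain row tuples rather than DiskSpaceView instances, so the objects in the result have no to_dict() method; pagination only changes how many rows the comprehension touches before it fails. A sketch of two possible fixes (names follow the question, untested against the original app):

# Option 1: query full model instances so to_dict() exists again
query = DiskSpaceView.query  # instead of .with_entities(...)
# ...apply the same filter/order/offset/limit logic, then:
data = [it.to_dict() for it in query]

# Option 2: keep with_entities() and build each dict from the row tuple
columns = ['object_type', 'schemaname', 'tablename',
           'indexname', 'object_size_bytes', 'object_size']
data = [dict(zip(columns, row)) for row in query]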

TypeError when trying to set a DB value

I'm trying to get my Flask server to update to a Mongo Atlas DB on request. When a request is passed with the required arguments, it will post to a Discord webhook then attempt to update DB values.
Example: Going to https://(redacted for privacy)/(redacted for privacy)?iid=123&username=wow&price=22 with JUST the webhook code (doesn't touch DB) will do this:
It posts to the Discord webhook and gives a success message (outputs as 200 in the console).
But when I enable the DB code, the same link throws a 500 error. Anything I can do? The console output is:
balance = mycol.find_one({"UserID": uid})['balance']
TypeError: 'NoneType' object is not subscriptable
I don't understand why this won't work. Here's the code that works, but only posts to the webhook (commented stuff is DB related):
#balance = mycol.find_one({"UserID": uid})['balance']
#res = mycol.find_one({"UserID": uid})
#newvalues = { "$set": { "UserID": uid, "balance": int(balance) + int(cashback_amt)} }
#mycol.update_one(res, newvalues)

img_url = f"https://www.roblox.com/bust-thumbnail/image?userId={uid}&width=420&height=420&format=png"
data = {
    "content": "**New purchase!**",
    "username": "robate"
}
data["embeds"] = [
    {
        "description": f"**ItemID**: `{item_id}`\n**Item Price**: `{price}`\n**Item Cashback**: `{cashback_amt}`",
        "title": "New purchase made",
        "color": 65311,
        "thumbnail": {"url": img_url}
    }
]
result = requests.post(purclog, json=data)
return jsonify({"res": True})
And the non-working code with the DB calls included:
balance = mycol.find_one({"UserID": uid})['balance']
res = mycol.find_one({"UserID": uid})
newvalues = {"$set": {"UserID": uid, "balance": int(balance) + int(cashback_amt)}}
mycol.update_one(res, newvalues)

img_url = f"https://www.roblox.com/bust-thumbnail/image?userId={uid}&width=420&height=420&format=png"
data = {
    "content": "**New purchase!**",
    "username": "robate"
}
data["embeds"] = [
    {
        "description": f"**ItemID**: `{item_id}`\n**Item Price**: `{price}`\n**Item Cashback**: `{cashback_amt}`",
        "title": "New purchase made",
        "color": 65311,
        "thumbnail": {"url": img_url}
    }
]
result = requests.post(purclog, json=data)
return jsonify({"res": True})
Any help is greatly appreciated. Have a good day!
EDIT: The UserID is defined here:
try:
    uid = requests.get(f"https://api.roblox.com/users/get-by-username?username={username}").json()['Id']
except:
    return
balance = mycol.find_one({"UserID": uid})['balance']
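The error means find_one() returned None, i.e. no document matched {"UserID": uid} (for example because the stored UserID has a different type, or the user has no record yet). A defensive sketch (field names follow the question; creating a missing record is an assumption about the desired behaviour):

doc = mycol.find_one({"UserID": uid})
if doc is None:
    # no matching document: create one instead of subscripting None
    mycol.insert_one({"UserID": uid, "balance": int(cashback_amt)})
else:
    new_balance = int(doc["balance"]) + int(cashback_amt)
    mycol.update_one({"UserID": uid}, {"$set": {"balance": new_balance}})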

Python: Fetching PostgreSQL results and loading them into JSON

I'm trying to get details about some cameras from PostgreSQL and need to put them all into one JSON response, but I can't see how it should be done, because for row in self.Data: processes one row at a time. How can I add them all to a single JSON dump?
I imagine the JSON dump looking like this:
{
    "status": "ok",
    "total_cameras": 3,
    "cameras": [{
        "camera_id": 1,
        "camera_name": "hikvision 1",
        "camera_ip": "42.51.56.0"
    },
    {
        "camera_id": 2,
        "camera_name": "hikvision 2",
        "camera_ip": "42.51.56.5"
    },
    {
        "camera_id": 3,
        "camera_name": "hikvision 3",
        "camera_ip": "2.1.58.5"
    }]
}
The code I use to get the information from PostgreSQL:
if not self.Data:
    self.RES = {'status': 'nocameras'}
    return web.Response(text=json.dumps(self.RES), status=403)
else:
    self.rows = self.cursor.rowcount
    for row in self.Data:
        if self.rows > 1:
            # Authorizing objects
            print(row)
            self.Camera_ID = row[0]
            self.Camera_Name = row[1]
            self.Camera_LAT = row[3]
            self.Camera_LOG = row[4]
            self.Camera_IP = row[2]
            self.Camera_Last_Updated = row[6]
            self.Camera_Street = row[5]
            self.Camera_API_key = row[7]
            print(self.Camera_ID, self.Camera_Name)
        else:
            self.RES = {'status': 'row_error'}
            return web.Response(text=json.dumps(self.RES), status=500)
I would first use the returned rows to build a list of dictionaries:
cameras = [
    dict(
        camera_id=c_id,
        camera_name=c_name,
        camera_ip=c_ip
    )
    # column order follows the question's rows: ID, name, IP are row[0..2]
    for c_id, c_name, c_ip, *_ in self.Data
]
And then create the final JSON object:
json.dumps({
    'status': 'ok',
    'total_cameras': len(cameras),
    'cameras': cameras
})
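To return it from the handler in the same style as the question's aiohttp responses, a sketch:

return web.Response(
    text=json.dumps({
        'status': 'ok',
        'total_cameras': len(cameras),
        'cameras': cameras
    }),
    status=200,
    content_type='application/json'
)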

Iterate through a list of dictionaries and save duplicate data

I would like to iterate through a list of dictionaries and save the values of certain keys (in my case "consumerKey" and "consumerSecret") into another dictionary as many times as they are present.
Problem: I'm able to iterate through the list, but my code is not saving the second consumer key and consumer secret; instead it is saving the first consumer key and consumer secret twice.
Input:
{
    "accessType": "",
    "apiProducts": [],
    "appFamily": "default",
    "appId": "ac56c8b2-6ac1-4971-a1d3-4bf97893c067",
    "attributes": [
        {
            "name": "DisplayName",
            "value": "quotaapp"
        },
        {
            "name": "Notes",
            "value": ""
        }
    ],
    "callbackUrl": "",
    "createdAt": 1549274952045,
    "createdBy": "suraj.pai.airody@sap.com",
    "credentials": [
        {
            "apiProducts": [
                {
                    "apiproduct": "apiprod",
                    "status": "approved"
                }
            ],
            "attributes": [],
            "consumerKey": "xyz",
            "consumerSecret": "abc",
            "expiresAt": -1,
            "issuedAt": 1549274952051,
            "scopes": [],
            "status": "approved"
        },
        {
            "apiProducts": [
                {
                    "apiproduct": "ouathTest-Product",
                    "status": "approved"
                }
            ],
            "attributes": [],
            "consumerKey": "pqr",
            "consumerSecret": "wmn",
            "expiresAt": -1,
            "issuedAt": 1554802431452,
            "scopes": [],
            "status": "approved"
        }
    ],
    "developerId": "xyz",
    "lastModifiedAt": 1554802431662,
    "lastModifiedBy": "suraj.pai.airody@sap.com",
    "name": "quotaapp",
    "scopes": [],
    "status": "approved"
}
Code:
import requests
import json
from requests.auth import HTTPBasicAuth
import csv

def get_v2details():
    a = 'orgID1'
    b = 'appID1'
    c = 'ConKey1'
    d = 'ConSecret1'
    e = 'appName1'
    org_lst = []
    some_dict = {}
    con_blst = []  # variable to collect the per-app dictionaries

    n = int(input("Enter number of orgs from Landscape 1: "))
    for i in range(0, n):
        ele = str(input())
        org_lst.append(ele)
    cmp_orglst = list(org_lst)
    print(cmp_orglst)

    for j in cmp_orglst:
        url = "https://canarydevmgmtsrv.dmzmo.sap.corp/v1/o/" + str(j) + "/apps/"
        headers = {'Content-Type': 'application/json'}
        response = requests.get(url, auth=HTTPBasicAuth('xyz', 'xyz'), headers=headers, verify=False)
        app_data = json.loads(response.text)
        print(app_data)

        for k in app_data:
            url1 = "https://canarydevmgmtsrv.dmzmo.sap.corp/v1/o/" + str(j) + "/apps/" + str(k)
            headers = {'Content-Type': 'application/json'}
            response1 = requests.get(url1, auth=HTTPBasicAuth('xyz', 'xyz'), headers=headers, verify=False)
            consumer_data = json.loads(response1.text)
            print(" Consumer Data is ", consumer_data)

            for l in range(len(consumer_data['credentials'])):
                some_dict[a] = str(j)
                some_dict[b] = consumer_data['appId']
                some_dict[e] = consumer_data['name']
                some_dict[c] = consumer_data['credentials'][0]['consumerKey']
                some_dict[d] = consumer_data['credentials'][0]['consumerSecret']
                print(some_dict)  # Print dictionary of each app ID
                con_blst.append(some_dict.copy())

    print(con_blst)

    csv_columns = ['orgID1', 'appName1', 'appID1', 'ConKey1', 'ConSecret1']
    csv_file = "Names1.csv"
    try:
        with open(csv_file, 'w', newline='') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=csv_columns)
            writer.writeheader()
            for data in con_blst:
                writer.writerow(data)
    except IOError:
        print("I/O error")
Expected result:
orgID1 appName1 appID1 ConKey1 ConSecret1
VALIDATE quotaapp 4bf97893c067 xyz abc
VALIDATE quotaapp 4bf97893c067 pqr wmn
Actual result:
orgID1 appName1 appID1 ConKey1 ConSecret1
VALIDATE quotaapp 4bf97893c067 xyz abc
VALIDATE quotaapp 4bf97893c067 xyz abc
It seems you just made a small error.
for l in range(len(consumer_data['credentials'])):
    some_dict[a] = str(j)
    some_dict[b] = consumer_data['appId']
    some_dict[e] = consumer_data['name']
    some_dict[c] = consumer_data['credentials'][0]['consumerKey']  # this line
    some_dict[d] = consumer_data['credentials'][0]['consumerSecret']  # and this line
    print(some_dict)  # Print dictionary of each app ID
    con_blst.append(some_dict.copy())
Should be
for l in range(len(consumer_data['credentials'])):
    some_dict[a] = str(j)
    some_dict[b] = consumer_data['appId']
    some_dict[e] = consumer_data['name']
    some_dict[c] = consumer_data['credentials'][l]['consumerKey']  # Here
    some_dict[d] = consumer_data['credentials'][l]['consumerSecret']  # Here
    print(some_dict)  # Print dictionary of each app ID
    con_blst.append(some_dict.copy())
You weren't looping through consumer_data['credentials']; you were just storing consumer_data['credentials'][0] twice.
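As a usage note, iterating directly over the list avoids the index bookkeeping entirely (a sketch reusing the question's variable names):

for cred in consumer_data['credentials']:
    con_blst.append({
        a: str(j),
        b: consumer_data['appId'],
        e: consumer_data['name'],
        c: cred['consumerKey'],
        d: cred['consumerSecret'],
    })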

Elasticsearch not returning data with a large page size

Size of data to get: approx. 20,000 documents.
Issue: I am searching Elasticsearch-indexed data using the command below in Python, but I am not getting any results back.
from pyelasticsearch import ElasticSearch

es_repo = ElasticSearch(settings.ES_INDEX_URL)
search_results = es_repo.search(
    query, index=advertiser_name, es_from=_from, size=_size)
If I give a size less than or equal to 10,000 it works fine, but not with 20,000.
Please help me find an optimal solution to this.
PS: Digging deeper into ES, I found this error message:
Result window is too large, from + size must be less than or equal to: [10000] but was [19999]. See the scrolling API for a more efficient way to request large data sets.
For real-time use, the best solution is the search_after query. You only need a date field and another field that uniquely identifies a doc; an _id or _uid field is enough.
Try something like this. In my example I want to extract all the documents that belong to a single user; the user field has a keyword datatype:
from elasticsearch import Elasticsearch

es = Elasticsearch()
es_index = "your_index_name"
documento = "your_doc_type"
user = "Francesco Totti"

body2 = {
    "query": {
        "term": {"user": user}
    }
}
res = es.count(index=es_index, doc_type=documento, body=body2)
size = res['count']

body = {
    "size": 10,
    "query": {
        "term": {"user": user}
    },
    "sort": [
        {"date": "asc"},
        {"_uid": "desc"}
    ]
}
result = es.search(index=es_index, doc_type=documento, body=body)
bookmark = [result['hits']['hits'][-1]['sort'][0], str(result['hits']['hits'][-1]['sort'][1])]

body1 = {
    "size": 10,
    "query": {
        "term": {"user": user}
    },
    "search_after": bookmark,
    "sort": [
        {"date": "asc"},
        {"_uid": "desc"}
    ]
}

while len(result['hits']['hits']) < size:
    res = es.search(index=es_index, doc_type=documento, body=body1)
    for el in res['hits']['hits']:
        result['hits']['hits'].append(el)
    bookmark = [res['hits']['hits'][-1]['sort'][0], str(result['hits']['hits'][-1]['sort'][1])]
    body1 = {
        "size": 10,
        "query": {
            "term": {"user": user}
        },
        "search_after": bookmark,
        "sort": [
            {"date": "asc"},
            {"_uid": "desc"}
        ]
    }
Then you will find all the docs appended to the result variable.
If you would like to use a scroll query instead:
from elasticsearch import Elasticsearch, helpers

es = Elasticsearch()
es_index = "your_index_name"
documento = "your_doc_type"
user = "Francesco Totti"

body = {
    "query": {
        "term": {"user": user}
    }
}

res = helpers.scan(
    client=es,
    scroll='2m',
    query=body,
    index=es_index)

for i in res:
    print(i)
Probably it's an Elasticsearch constraint: the index.max_result_window index setting, which defaults to 10,000.
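If you really do need from + size paging beyond 10,000, the limit can be raised per index, though that increases memory pressure and scroll or search_after is usually preferable. A sketch (the index name is a placeholder):

from elasticsearch import Elasticsearch

es = Elasticsearch()
# raise the from + size window for one index
es.indices.put_settings(
    index="your_index_name",
    body={"index": {"max_result_window": 20000}}
)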
