how to write a mongodb query in pymongo - python

The following query works in mongodb shell:
db.user.count( {$and: [ {"agent_id":{$exists:true}}, {"is_agent":{$ne:true}} ] } )
When I try it in Python I get a different answer. Here's the Python code:
import pymongo
from pymongo import MongoClient

client = MongoClient()  # connect to the local MongoDB instance

def getCollection(client, dbname, collection):
    """Return a collection based on client, db and collection"""
    data_base = getattr(client, dbname)
    collObject = getattr(data_base, collection)
    return collObject

userColl = getCollection(client, "hkpr_restore", "user")
usersWithAgents = userColl.count( {"$and": [ {"agent_id": {"$exists": "true"}}, {"is_agent": {"$ne": "true"}} ] } )
print usersWithAgents
The results are about 11,000 for the mongo shell query and about 17,000 for the Python script query.

You should use:
"$exists": True
and not
"$exists": "true"
The same goes for $ne.
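For example, the count call from the question would become (a minimal sketch, assuming the same client and collection setup):
usersWithAgents = userColl.count(
    {"$and": [
        {"agent_id": {"$exists": True}},   # Python boolean, not the string "true"
        {"is_agent": {"$ne": True}}
    ]}
)
With that change the Python count should match the mongo shell result.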


How can I bulk upload JSON records to AWS OpenSearch index using a python client library?

I have a sufficiently large dataset that I would like to bulk index the JSON objects in AWS OpenSearch.
I cannot see how to achieve this using any of: boto3, awswrangler, opensearch-py, elasticsearch, elasticsearch-py.
Is there a way to do this without using a python request (PUT/POST) directly?
Note that this is not for: ElasticSearch, AWS ElasticSearch.
Many thanks!
I finally found a way to do it using opensearch-py, as follows.
First, establish the client:
# First fetch credentials from environment defaults
# If you can get this far you probably know how to tailor them
# for your particular situation. Otherwise SO is a safe bet :)
import boto3
from opensearchpy import OpenSearch, RequestsHttpConnection, AWSV4SignerAuth

credentials = boto3.Session().get_credentials()
region = 'eu-west-2' # for example

# Now set up the AWS 'Signer'
auth = AWSV4SignerAuth(credentials, region)

# And finally the OpenSearch client
host = f"...{region}.es.amazonaws.com" # fill in your hostname (minus the https://) here
client = OpenSearch(
    hosts = [{'host': host, 'port': 443}],
    http_auth = auth,
    use_ssl = True,
    verify_certs = True,
    connection_class = RequestsHttpConnection
)
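A quick sanity check can save time here: if the host and signing are right, client.info() should come back with cluster metadata.
# Optional: verify the connection before indexing anything
print(client.info())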
Phew! Let's create the data now:
# Spot the deliberate mistake(s) :D
document1 = {
    "title": "Moneyball",
    "director": "Bennett Miller",
    "year": "2011"
}
document2 = {
    "title": "Apollo 13",
    "director": "Richie Cunningham",
    "year": "1994"
}
data = [document1, document2]
TIP! Create the index if you need to -
my_index = 'my_index'
try:
    response = client.indices.create(my_index)
    print('\nCreating index:')
    print(response)
except Exception as e:
    # If, for example, my_index already exists, don't do much!
    print(e)
This is where things go a bit nutty. I hadn't realised that every single bulk operation needs an, er, action, e.g. "index", "create", "delete", etc. - so let's define that now:
action = {
    "index": {
        "_index": my_index
    }
}
You can read all about the actions in the bulk REST API documentation.
The next quirk is that the OpenSearch bulk API requires Newline Delimited JSON (see https://www.ndjson.org), which is basically JSON serialized as strings and separated by newlines. Someone wrote on SO that this "bizarre" API looked like one designed by a data scientist - far from taking offence, I think that rocks. (I agree ndjson is weird though.)
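Concretely, for the two documents above the bulk body ends up looking like this (one JSON object per line, newline-terminated):
{"index": {"_index": "my_index"}}
{"title": "Moneyball", "director": "Bennett Miller", "year": "2011"}
{"index": {"_index": "my_index"}}
{"title": "Apollo 13", "director": "Richie Cunningham", "year": "1994"}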
Hideously, now let's build up the full JSON string, combining the data and actions. A helper fn is at hand!
import json

def payload_constructor(data, action):
    # "All my own work"
    action_string = json.dumps(action) + "\n"
    payload_string = ""
    for datum in data:
        payload_string += action_string
        this_line = json.dumps(datum) + "\n"
        payload_string += this_line
    return payload_string
OK so now we can finally invoke the bulk API. I suppose you could mix in all sorts of actions (out of scope here) - go for it!
response=client.bulk(body=payload_constructor(data,action),index=my_index)
That's probably the most boring punchline ever but there you have it.
You can also get (geddit) .bulk() to take the index via index= alone, and set the action to:
action={"index": {}}
Hey presto!
Now, choose your poison - the other solution looks crazily shorter and neater.
PS The well-hidden opensearch-py documentation on this is located here.
import awswrangler as wr

# (These snippets are lifted from a class, hence the self references.)
conn = wr.opensearch.connect(
    host=self.hosts, # URL
    port=443,
    username=self.username,
    password=self.password
)

def insert_index_data(data, index_name='stocks', delete_index_data=False):
    """ Bulk Create
    args: body [{doc1}{doc2}....]
    """
    if delete_index_data:
        index_name = 'symbol'
        self.delete_es_index(index_name)

    resp = wr.opensearch.index_documents(
        self.conn,
        documents=data,
        index=index_name
    )
    print(resp)
    return resp
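For reference, a minimal standalone sketch of the same awswrangler approach (the host, credentials, documents and index name below are placeholders, not values from the answer above):
import awswrangler as wr

# Placeholder endpoint and credentials - substitute your own
client = wr.opensearch.connect(
    host='my-domain.eu-west-2.es.amazonaws.com',
    port=443,
    username='my_user',
    password='my_password'
)

documents = [
    {"symbol": "ABC", "price": 10.5},
    {"symbol": "XYZ", "price": 20.0}
]

# Bulk index the documents into the 'stocks' index
response = wr.opensearch.index_documents(client, documents=documents, index='stocks')
print(response)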
I have used the code below to bulk insert records from Postgres into OpenSearch (ES 7.2).
import sqlalchemy as sa
from sqlalchemy import text
import pandas as pd
import numpy as np
from opensearchpy import OpenSearch
from opensearchpy.helpers import bulk
import json
engine = sa.create_engine('postgresql+psycopg2://postgres:postgres@127.0.0.1:5432/postgres')
host = 'search-xxxxxxxxxx.us-east-1.es.amazonaws.com'
port = 443
auth = ('username', 'password') # For testing only. Don't store credentials in code.
# Create the client with SSL/TLS enabled, but hostname verification disabled.
client = OpenSearch(
hosts = [{'host': host, 'port': port}],
http_compress = True,
http_auth = auth,
use_ssl = True,
verify_certs = True,
ssl_assert_hostname = False,
ssl_show_warn = False
)
with engine.connect() as connection:
    result = connection.execute(text("select * from account_1_study_1.stg_pred where domain='LB'"))
    records = []
    for row in result:
        record = dict(row)
        record.update(record['item_dataset'])
        del record['item_dataset']
        records.append(record)

df = pd.DataFrame(records)
#df['Date'] = df['Date'].astype(str)
df = df.fillna("null")
print(df.keys())
documents = df.to_dict(orient='records')

#bulk(es, documents, index='search-irl-poc-dump', raise_on_error=True)
#response=client.bulk(body=documents,index='sample-index')
bulk(client, documents, index='search-irl-poc-dump', raise_on_error=True, refresh=True)
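Note that helpers.bulk here receives plain dicts, which it indexes as document sources into the index given by index=. If you need to control the document IDs, the helpers API also accepts explicit action dicts, roughly along these lines (a sketch, with sequential IDs for illustration):
actions = [
    {"_index": "search-irl-poc-dump", "_id": i, "_source": doc}
    for i, doc in enumerate(documents)
]
bulk(client, actions, raise_on_error=True, refresh=True)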

Mock awswrangler for unit testing

As there is no moto support for awswrangler I am stuck here and don't know how to mock it.
I am trying to unit test my Lambda code, which runs an Athena query using awswrangler.
import awswrangler as wr
import boto3

def athena_query(dbtable, contact_id, athena_output, session):
    query = """
        SELECT
            *
        FROM
            :dbtable;
        WHERE
            contactid=:contactid;
    """
    output = wr.athena.read_sql_query(
        query,
        params = {
            "contactid": f"'{contact_id}'",
            "dbtable": f"{dbtable}"
        },
        s3_output = athena_output,
        boto3_session = session
    )
    results = output.head().loc[0]
    return results

response = athena_query("table_name", "123", "s3://bucket", boto3.session.Session())
I referenced an awswrangler GitHub issue, and while trying some of the tests provided in that link it hits the AWS service instead of running locally.
Here is an example implementation for this function using moto and pytest.
First I would correct your function according to the parameters awswrangler requires in its current version (2.16.1).
import awswrangler as wr
import boto3

def athena_query(database, dbtable, contact_id, athena_output, session):
    query = """
        SELECT
            *
        FROM
            :dbtable;
        WHERE
            contactid=:contactid;
    """
    output = wr.athena.read_sql_query(
        query,
        database,
        params = {
            "contactid": f"'{contact_id}'",
            "dbtable": f"{dbtable}"
        },
        s3_output = athena_output,
        boto3_session = session
    )
    results = output.head().loc[0]
    return results
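With the extra database argument, the call from the question then becomes, for example (the database name here is a placeholder):
response = athena_query("my_database", "table_name", "123", "s3://bucket", boto3.session.Session())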
Then, in a test/conftest.py file, I would declare the necessary mocked objects:
import os

import boto3
import moto
import pytest

TEST_BUCKET_NAME = "my_bucket"
REGION = "us-east-1"
DATABASE_NAME = "test_db"
TABLE_NAME = "test_table"
TABLE_DDL = f"""CREATE EXTERNAL TABLE IF NOT EXISTS
    {DATABASE_NAME}.{TABLE_NAME} (
      a string,
      b string,
      contactid string
    ) ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
    WITH SERDEPROPERTIES (
      'separatorChar' = ',',
      'quoteChar' = '\"',
      'escapeChar' = '\\'
    )
    STORED AS TEXTFILE
    LOCATION 's3://{TEST_BUCKET_NAME}/input/';"""

@pytest.fixture
def aws_credentials():
    """Mocked AWS Credentials for moto."""
    os.environ["AWS_ACCESS_KEY_ID"] = "testing"
    os.environ["AWS_SECRET_ACCESS_KEY"] = "testing"
    os.environ["AWS_SECURITY_TOKEN"] = "testing"
    os.environ["AWS_SESSION_TOKEN"] = "testing"

@pytest.fixture
def s3_client(aws_credentials):
    with moto.mock_s3():
        conn = boto3.client("s3", region_name=REGION)
        yield conn

@pytest.fixture
def athena_client(aws_credentials):
    with moto.mock_athena():
        conn = boto3.client("athena", region_name=REGION)
        yield conn

@pytest.fixture
def s3_bucket(s3_client):
    s3_client.create_bucket(
        Bucket=TEST_BUCKET_NAME,
        CreateBucketConfiguration={
            'LocationConstraint': 'eu-west-1'
        }
    )
    yield boto3.resource('s3').Bucket(TEST_BUCKET_NAME)

@pytest.fixture
def athena_table(athena_client, s3_bucket):
    # create database
    _ = athena_client.start_query_execution(
        QueryString=f"create database {DATABASE_NAME}",
        ResultConfiguration={"OutputLocation": f"s3://{TEST_BUCKET_NAME}/queries/"}
    )
    # create table
    _ = athena_client.start_query_execution(
        QueryString=TABLE_DDL,
        ResultConfiguration={"OutputLocation": f"s3://{TEST_BUCKET_NAME}/queries/"}
    )
Then I would define a test of the function in a separate test/athena_test.py file. This uses mocker to mock the awswrangler response to the query, but you could do more advanced testing using the mock objects created in the conftest.py file:
import boto3
import pandas as pd

from conftest import TEST_BUCKET_NAME, DATABASE_NAME, TABLE_NAME
# import your function to test here

def test_athena_query(s3_bucket, athena_table, mocker):
    def mock_response(*args, **kwargs):
        return pd.DataFrame.from_dict({"a": [1, 2], "b": [3, 4], "contactid": [123, 123]})

    # mocking - patch read_sql_query on the awswrangler package itself
    mock_wr_call = mocker.patch('awswrangler.athena.read_sql_query')
    mock_wr_call.side_effect = mock_response

    response = athena_query(DATABASE_NAME, TABLE_NAME, "123", f"s3://{TEST_BUCKET_NAME}/queries/", boto3.session.Session())
    assert response.shape[0] == 2
Resources:
https://aws-data-wrangler.readthedocs.io/en/stable/stubs/awswrangler.athena.read_sql_query.html

How to write location, reference and timestamp data types on Firestore with Python's SDK?

Documentation goes as far as explaining how to write the following data types:
data = {
u'stringExample': u'Hello, World!',
u'booleanExample': True,
u'numberExample': 3.14159265,
u'dateExample': datetime.datetime.now(),
u'arrayExample': [5, True, u'hello'],
u'nullExample': None,
u'objectExample': {
u'a': 5,
u'b': True
}
}
I am migrating a MongoDB collection into Firestore.
This is my code so far:
import firebase_admin
from firebase_admin import credentials
from firebase_admin import firestore

def initialize_firestore():
    cred = credentials.Certificate('admin.json')
    app = firebase_admin.initialize_app(cred)
    db = firestore.client()
    return db

db = initialize_firestore()
db.collection('My_Collection').document('desired_iD').set(Document_to_write)
I would like to write a document with reference and location fields plus a custom date.
After getting the response (res) and initializing the dictionary (Document_to_write):
Document_to_write['date_type'] = datetime.datetime.strptime(res['created'], "date format in string")
Document_to_write['reference_type'] = db.document(u'Collection_Name/' + res['collection_id'])
Document_to_write['location_type'] = firestore.GeoPoint(res['latitude'], res['longitude'])
Then save the document with the write method of your choice, for example set().
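Putting it together, a minimal sketch might look like the following (the res values, date format, collection name and document ID are placeholders, and db is the client returned by initialize_firestore above):
import datetime
from firebase_admin import firestore

# Hypothetical record pulled from the MongoDB collection being migrated
res = {
    'created': '2015-06-01 12:30:00',
    'collection_id': 'abc123',
    'latitude': 40.4168,
    'longitude': -3.7038
}

Document_to_write = {
    'date_type': datetime.datetime.strptime(res['created'], '%Y-%m-%d %H:%M:%S'),
    'reference_type': db.document(u'Collection_Name/' + res['collection_id']),
    'location_type': firestore.GeoPoint(res['latitude'], res['longitude'])
}

db.collection('My_Collection').document('desired_iD').set(Document_to_write)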

how to convert an encoded url into normal form using python

I just used the Google API to search via Python, using the script below
import urllib
import json as m_json
query = raw_input ( 'Query: ' )
query = urllib.urlencode ( { 'q' : query } )
response = urllib.urlopen ( 'http://ajax.googleapis.com/ajax/services/search/web?v=1.0&' + query ).read()
json = m_json.loads ( response )
results = json [ 'responseData' ] [ 'results' ]
for result in results:
    url = result['url'] # was URL in the original and that threw a name error exception
    print ( url )
and after that I got the result below:
Query: inurl:"readnews.php?id="
http://www.idmantv.az/readnews.php%3Fid%3D14999
http://www.kanda.com/readnews.php%3Fid%3D9
http://www.dcfever.com/news/readnews.php%3Fid%3D12573
http://www.thegrower.org/readnews.php%3Fid%3D6c0p5n0e8i6b
but I want this URL in normal form, like
http://www.idmantv.az/readnews.php?id=14999
How to do that with python?
Use urllib.unquote or urllib.unquote_plus to decode a %-encoded string:
>>> urllib.unquote('http://www.idmantv.az/readnews.php%3Fid%3D14999')
'http://www.idmantv.az/readnews.php?id=14999'
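On Python 3 the same function lives under urllib.parse (using the URL from the question):
>>> from urllib.parse import unquote
>>> unquote('http://www.idmantv.az/readnews.php%3Fid%3D14999')
'http://www.idmantv.az/readnews.php?id=14999'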

How to Bulk index in Elastic Search using the Python API

I am trying to bulk insert a lot of documents into Elasticsearch using the Python API.
import elasticsearch
from elasticsearch.helpers import bulk
from pymongo import MongoClient

es = elasticsearch.Elasticsearch()

def index_collection(db, collection, fields, host='localhost', port=27017):
    conn = MongoClient(host, port)
    coll = conn[db][collection]
    cursor = coll.find({}, fields=fields, timeout=False)
    print "Starting Bulk index of {} documents".format(cursor.count())

    def action_gen():
        """
        Generator to use for bulk inserts
        """
        for n, doc in enumerate(cursor):
            op_dict = {
                '_index': db.lower(),
                '_type': collection,
                '_id': int('0x' + str(doc['_id']), 16),
            }
            doc.pop('_id')
            op_dict['_source'] = doc
            yield op_dict

    res = bulk(es, action_gen(), stats_only=True)
    print res
The documents come from a MongoDB collection, and I am using the function above to do the bulk indexing in the way explained in the docs.
However, the bulk indexing just fills Elasticsearch with thousands of empty documents. Can anyone tell me what I am doing wrong?
I've never seen the bulk data put together that way, especially what you're doing with "_source". There may be a way to get that to work, I don't know off-hand, but when I tried it I got weird results.
If you look at the bulk api, ES is expecting a meta-data document, then the document to be indexed. So you need two entries in your bulk data list for each document. So maybe something like:
import elasticsearch
from pymongo import MongoClient

es = elasticsearch.Elasticsearch()

def index_collection(db, collection, fields, host='localhost', port=27017):
    conn = MongoClient(host, port)
    coll = conn[db][collection]
    cursor = coll.find({}, fields=fields, timeout=False)
    print "Starting Bulk index of {} documents".format(cursor.count())

    index_name = db.lower()
    bulk_data = []
    for n, doc in enumerate(cursor):
        doc_id = int('0x' + str(doc['_id']), 16)
        doc.pop('_id')  # the raw ObjectId is not JSON serializable
        bulk_data.append({
            'index': {
                '_index': index_name,
                '_type': collection,
                '_id': doc_id,
            }
        })
        bulk_data.append(doc)
    es.bulk(index=index_name, body=bulk_data, refresh=True)
I didn't try to run that code, though. Here is a script I know works, that you can play with, if it helps:
from elasticsearch import Elasticsearch

es_client = Elasticsearch(hosts = [{ "host" : "localhost", "port" : 9200 }])

index_name = "test_index"

if es_client.indices.exists(index_name):
    print("deleting '%s' index..." % (index_name))
    print(es_client.indices.delete(index = index_name, ignore=[400, 404]))

print("creating '%s' index..." % (index_name))
print(es_client.indices.create(index = index_name))

bulk_data = []
for i in range(4):
    bulk_data.append({
        "index": {
            "_index": index_name,
            "_type": 'doc',
            "_id": i
        }
    })
    bulk_data.append({ "idx": i })

print("bulk indexing...")
res = es_client.bulk(index=index_name, body=bulk_data, refresh=True)
print(res)

print("results:")
for doc in es_client.search(index=index_name)['hits']['hits']:
    print(doc)
