No matter what I try I keep hitting the "MongoClient opened before fork" warning when using multiprocessing against a mongoengine database. The standard advice seems to be to connect to the db only from within the child processes. I think what I'm doing should be functionally equivalent, because I close the database connection before using multiprocessing, yet I still hit the problem.
Related questions, either without a minimal example or with inapplicable solutions, are here, here, and, specifically for the Flask/Celery case, here.
Minimal example to reproduce the problem:
from mongoengine import connect, Document, StringField, ListField, ReferenceField
from pathos.multiprocessing import ProcessingPool

class Base(Document):
    key = StringField(primary_key=True)
    name = StringField()
    parent = ReferenceField('Parent', required=True)

class Parent(Document):
    key = StringField(primary_key=True)
    name = StringField()
    bases = ListField(ReferenceField('Base'))

def remove_base(key):
    db = connect('mydb')
    mongo_b = Base.objects().get(key=key)
    mongo_b.parent.update(pull__bases=mongo_b)
    mongo_b.delete()

### setup
db = connect('mydb', connect=False)
Base(key='b1', name='test', parent='p1').save()
Base(key='b2', name='test', parent='p1').save()
Base(key='b3', name='test2', parent='p1').save()
p = Parent(key='p1', name='parent').save()
p.update(add_to_set__bases='b1')
p.update(add_to_set__bases='b2')
p.update(add_to_set__bases='b3')

### find objects we want to delete
my_base_objects = Base.objects(name='test')
keys = [b.key for b in my_base_objects]
del my_base_objects

# close db to avoid problems?!
db.close()
del db

# parallel map removing base objects and references from the db
# warning generated here
pp = ProcessingPool(2)
pp.map(remove_base, keys)
OK, so I figured it out. Mongoengine caches database connections in several places; if you manually clear them, the issue is resolved. Adding the following import
from mongoengine import connection
and then adding the following to the '# close db' section:
connection._connections = {}
connection._connection_settings = {}
connection._dbs = {}
Base._collection = None
Parent._collection = None
appears to solve the issue.
Complete code:
from mongoengine import connect, Document, StringField, ListField, ReferenceField, connection
from pathos.multiprocessing import ProcessingPool

class Base(Document):
    key = StringField(primary_key=True)
    name = StringField()
    parent = ReferenceField('Parent', required=True)

class Parent(Document):
    key = StringField(primary_key=True)
    name = StringField()
    bases = ListField(ReferenceField('Base'))

def remove_base(key):
    db = connect('mydb', connect=False)
    mongo_b = Base.objects().get(key=key)
    mongo_b.parent.update(pull__bases=mongo_b)
    mongo_b.delete()

def setup():
    Base(key='b1', name='test', parent='p1').save()
    Base(key='b2', name='test', parent='p1').save()
    Base(key='b3', name='test2', parent='p1').save()
    p = Parent(key='p1', name='parent').save()
    p.update(add_to_set__bases='b1')
    p.update(add_to_set__bases='b2')
    p.update(add_to_set__bases='b3')

db = connect('mydb', connect=False)
setup()

### find objects we want to delete
my_base_objects = Base.objects(name='test')
keys = [b.key for b in my_base_objects]
del my_base_objects

### close db to avoid problems?!
db.close()
db = None
connection._connections = {}
connection._connection_settings = {}
connection._dbs = {}
Base._collection = None
Parent._collection = None

### parallel map removing base objects from the db
pp = ProcessingPool(2)
pp.map(remove_base, keys)
This was improved recently: as of MongoEngine >= 0.18.0, the methods disconnect() and disconnect_all() should be used to disconnect one or all existing connections, respectively (see the 0.18.0 changelog).
See the official documentation.
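For illustration, a minimal sketch of the 0.18.0+ approach applied to the scenario above (same database name; the exact placement of the reconnect is up to you):

from mongoengine import connect, disconnect

# parent process: connect, do the setup/query work, then disconnect before forking
connect('mydb')
# ... find the keys to delete ...
disconnect()  # drops the 'default' alias; disconnect_all() drops every registered alias

# each child process then calls connect('mydb') itself before touching any documents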
I want to use the ORM from the DataStax Python driver for Cassandra (create a class holding the data and have a table created from it automatically, without writing too much CQL).
I've deployed a Cassandra server on localhost via Docker and tried to do exactly what their manual describes:
from cassandra.cluster import Cluster
from cassandra.cqlengine import columns
from cassandra.cqlengine.models import Model
from cassandra.cqlengine.management import sync_table, create_keyspace_simple

class Person(Model):
    __keyspace__ = 'mega_keyspace'
    __table_name__ = 'person'
    id = columns.UUID(primary_key=True)
    first_name = columns.Text()
    last_name = columns.Text()

if __name__ == "__main__":
    cluster = Cluster()
    session = cluster.connect()
    # create_keyspace_simple("mega_keyspace", 2)
    session.execute("CREATE KEYSPACE IF NOT EXISTS mega_keyspace WITH REPLICATION = "
                    "{ 'class' : 'SimpleStrategy', 'replication_factor' : 2 };")  # keyspace is created okay...
    sync_table(Person)  # And here's where the error appears!
But alas, sync_table(...) gives me an error:
cassandra.cqlengine.CQLEngineException: Connection name '<object object at 0x7fbd95322ab0>' doesn't exist in the registry.
How can I fix it?
You can probably solve the issue by adding the following import to your code:

from cassandra.cqlengine.connection import set_default_connection, register_connection

and then adding these lines in the main function, after the session is defined:

register_connection(str(session), session=session)
set_default_connection(str(session))
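Put together with the model from the question, a hedged sketch of the whole fix (names unchanged from above):

from cassandra.cluster import Cluster
from cassandra.cqlengine.connection import register_connection, set_default_connection
from cassandra.cqlengine.management import sync_table

if __name__ == "__main__":
    cluster = Cluster()
    session = cluster.connect()
    # register the raw driver session with cqlengine and make it the default connection
    register_connection(str(session), session=session)
    set_default_connection(str(session))
    sync_table(Person)  # Person as defined in the question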
I have a Django app which creates collections in MongoDB automatically. But when I tried to integrate delete functionality, the collections that should be dropped by it are not deleted, while the document deletes work fine. This method is called from another file, with all parameters.
An interesting thing to note: when I tried to drop the collections manually via the Python shell, it worked. But the app won't delete the collections that are not required anymore.
import pymongo
from .databaseconnection import retrndb  # credentials from another file; all admin rights are given

mydb = retrndb()

class Delete():
    def DeleteData(postid, name):
        PostID = postid
        tbl = name + 'Database'
        liketbl = PostID + 'Likes'
        likecol = mydb[liketbl]
        pcol = mydb[tbl]
        col = mydb['fpost']
        post = {"post_id": PostID}
        ppost = {"PostID": PostID}
        result1 = mydb.commentcol.drop()  # this doesn't work
        result2 = mydb.likecol.drop()     # this doesn't work
        print(result1, '\n', result2)     # returns None for both
        try:
            col.delete_one(post)      # this works
            pcol.delete_one(ppost)    # this works
            return False
        except Exception as e:
            return e
Any solutions? I have been trying to solve this for a week.
Should I change the database engine, since Django doesn't support NoSQL natively? I have already written custom scripts that do all the CRUD through pymongo.
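Note that in pymongo, attribute access such as mydb.commentcol refers to a collection literally named 'commentcol', not to a Python variable, which would explain why the two drops above silently do nothing useful (drop() also always returns None). A minimal sketch of what was presumably intended:

# drop the collections the variables actually point to
likecol.drop()  # drops the collection named PostID + 'Likes'
pcol.drop()     # drops the collection named name + 'Database'

# or drop by the computed name
mydb.drop_collection(liketbl)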
Using Python mongoengine, I am trying to create databases and add documents to different databases. Here's how I am trying to do it:
from mongoengine import *

class MongoDataAccessObject():
    # method to connect to the database and initialize the tables etc.
    def __init__(self, my_env, helperObj):
        print "initializing db for the environment ", my_env
        self.con = None
        self.dbName = my_env
        self.helper_obj = helperObj
        try:
            self.con = connect(db=self.dbName)
        except Exception as e:
            print e

    def addRecord(self, document_object):
        document_object.save()
Now, I pass the names of the different databases I want created while creating objects of the above class, and add documents like this:
for my_env in list_of_envs:
    dao = MongoDataAccessObject(my_env, helper_object)
    dao.addRecord(myDocument)
Now there are 2 questions here:
1. For some reason all my documents keep getting added to the same DB (the first one passed when creating a MongoDataAccessObject). I would have assumed that since I create a new object every time, passing a different db name each time, a new connection would be made to the newly passed db, and documents would be added to the db currently connected to.
2. To verify whether I am actually connected to the DB in question, I could not find anything like a get_database_name() method on the connection object. Is there a way to verify that I am connected to the DB whose name I passed?
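On the second question, one way to check (a small sketch, assuming the default alias): mongoengine's get_db() returns the underlying pymongo Database object, whose name attribute tells you which database the alias is actually bound to.

from mongoengine.connection import get_db

db = get_db()   # or get_db('some_alias') for a named connection
print(db.name)  # the database this connection is actually bound to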
OK, did some more research and found this:
https://github.com/MongoEngine/mongoengine/issues/605
Tried it out like this in IPython:
from mongoengine import *
import datetime

class Page(Document):
    title = StringField(max_length=200, required=True)
    date_modified = DateTimeField(default=datetime.datetime.now)

def switch(model, db):
    model._meta['db_alias'] = db
    # must set _collection to None so it is re-evaluated
    model._collection = None
    return model

register_connection('default', name='testing')
register_connection('mycon', name='db1')

page = Page(title="Test Page")
page = switch(page, 'mycon')
page.save()
This works and creates a db named db1 and stores the document there.
Now I do this again:
register_connection('mycon2', name='db2')
page = Page(title="Test Page")
page = switch(page, 'mycon2')
page.save()
Contrary to my expectation, this time db2 was not created (checked from both the mongo client and Robomongo); however, the document was saved successfully. Where exactly did the document get saved, then?
So, to figure that out, I repeated the above exercise with a small change:
register_connection('mycon2', name='db2')
page = Page(title="Test Page")
page = switch(page, 'mycon2')
x = page.save()
# did a dir(x) and found that there is _get_db, so tried it out as below
x._get_db()
and the output was:
Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, read_preference=Primary()), u'db2')
which I guess means that the document got saved in a database named db2. But where on earth is this db2? Why can't I see it through either the mongo client or Robomongo?
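One way to check where it went from Python itself (a quick sketch; list_database_names() needs a reasonably recent pymongo):

from pymongo import MongoClient

client = MongoClient('localhost', 27017)
print(client.list_database_names())  # db2 should show up here once a document has been written to it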
Finally, the only way I could find to achieve this was through the switch_db context manager provided by MongoEngine, which is well documented here: http://docs.mongoengine.org/guide/connecting.html#switch-database
The way I did it in my code is something like this:
On the first call to the db, a default db needs to be created, with the alias 'default'. Only then can other aliases be registered; otherwise Mongoengine throws an error saying that no default db was found.
So, when the very first db object is created, a False flag is sent to the __init__ of MongoDataAccessObject.
To do this, the MongoDataAccessObject was changed to something like this:
class MongoDataAccessObject():
    # method to connect to the database and initialize the tables etc.
    def __init__(self, my_env, helperObj, is_default_db_set):
        print "initializing db for the environment ", my_env
        self.con = None
        self.dbName = my_env
        self.helper_obj = helperObj
        self.db_alias = my_env
        self.is_default_db_set = is_default_db_set
        try:
            # all of this with is_default_db_set and register_connection() is needed because
            # MongoEngine does not provide a simple way of changing dbs or switching db
            # connections. The only way to do it is through the switch_db() context manager
            # (which is used in addRecord() below).
            if not self.is_default_db_set:
                self.con = connect(db=self.dbName, alias='default')
            else:
                # register_connection(alias_name, db_name)
                register_connection(self.db_alias, self.dbName)
        except Exception as e:
            print e
And the addRecord() was also modified:

    # requires: from mongoengine.context_managers import switch_db
    def addRecord(self, document_class):
        # switch_db() takes a Document class and yields the class bound to self.db_alias
        with switch_db(document_class, self.db_alias) as model_class:
            document_object = model_class()
            document_object.save()
And this part above:
for my_env in list_of_envs:
    dao = MongoDataAccessObject(my_env, helper_object)
    dao.addRecord(myDocument)
was also modified as:
for my_env in list_of_envs:
    dao = MongoDataAccessObject(my_env, helper_object, mongo_default_db_flag)
    dao.addRecord(myDocument)
And this seemed to do the job for me.
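For reference, a minimal self-contained sketch of the switch_db pattern used above (the aliases, database names, and model here are made up for illustration):

from mongoengine import connect, register_connection, Document, StringField
from mongoengine.context_managers import switch_db

connect('db_default', alias='default')         # the mandatory default connection
register_connection('env_a', name='db_env_a')  # a second database under its own alias

class Record(Document):
    name = StringField()

with switch_db(Record, 'env_a') as RecordEnvA:
    RecordEnvA(name='stored in db_env_a').save()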
I am using an sqlite database as my application file, through sqlalchemy. I have a separate configuration file.
There are some classes whose information I persist in my application file that I would like to replicate in my configuration file. The idea is that I would load it from one source or the other depending on availability.
I saw the mention below in the documentation, but I think it does not directly apply, since the secondary mapping would not persist the information. Also, the notion of which one is the primary is blurry: both databases would carry the same information, though maybe not the same version of it.
http://sqlalchemy.readthedocs.org/en/rel_1_0/orm/nonstandard_mappings.html#multiple-mappers-for-one-class
I will try to make it clearer with an example:
I have a class A which represents a multi-field user input. I save this in my application file.
A class B, also in my application file, is composed of an instance of class A.
The same instance of class A may compose several suitable instances of class B. These are all stored in my application file.
My problem is that in another session, with a brand-new configuration file, I might want to reuse that class A instance. I cannot keep it only in the application file, because if it gets updated, the change is relevant across all application files that use it.
On the other hand, it cannot live only in the configuration file, since a user might share his application file with another user, and the latter might not have a suitable configuration and would have to recreate it manually.
I need to have it in both places, be able to choose at runtime which database will be the source, and have all changes persist in both databases at once.
Can it be done in sqlalchemy+sqlite? Is it a good idea? Are there classic solutions for this?
EDIT:
I think I am describing something that looks like a cache, which sqlalchemy does not do. Does any other approach come to mind?
Does sqlalchemy allow me to map an instance to a database upon instance creation? That would allow two instances of the same class to be mapped against different databases. I could then listen for an update event from sqlalchemy and issue the same SQL against the other database. I also do not know how to do this.
Another option: map my class against a union query. Sqlalchemy might allow it, as it does for arbitrary selects, BUT then there is the persistence issue.
Another option: add a layer to the engine so that it connects to two databases simultaneously, issuing the same commands to both for reading and writing. I could deal with the duplicated results.
I came up with the mixin below. It does not handle expunge or rollback, as I do not use those in my application, nor do I know how to deal with them.
It looks like it is working. I will proceed to extend it to handle collections.
import os

from sqlalchemy import Column, Float, String, Enum, Integer, event
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import orm
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

class ReplicateMixin:
    @classmethod
    def get_or_create(cls, prime_session, sessoes=None, **kwargs):
        if sessoes is None:
            sessoes = []
        if not isinstance(sessoes, list):
            sessoes = [sessoes]
        # the prime session is passed separately just to make explicit
        # that it might receive different treatment
        sessoes = [prime_session] + sessoes

        replicas = []
        for sessao in sessoes:  # gets a result or creates a new instance from each database
            instance = sessao.query(cls).filter_by(**kwargs).first()
            if instance is None:
                instance = cls(**kwargs)
                setattr(instance, "__new", True)
                sessao.add(instance)
            instance.sessao = sessao
            replicas.append(instance)

        fittest = cls.__select_fittest(replicas)  # selects the instance whose data will prevail
        prime = replicas.pop(0)  # instance from the session we will be issuing commits to; the others must simply follow
        cls.__copy_data(fittest, prime, ReplicateMixin.__get_primary_keys(prime))
        setattr(prime, "__replicas", replicas)  # the object carries references to its copies
        return prime

    @staticmethod
    def __select_fittest(instances):
        """This method should contain the logic for choosing the instance that has
        the most relevant information. It may be overridden by child classes."""
        if getattr(instances[0], "__new", False):
            return instances[1]
        else:
            return instances[0]

    @staticmethod
    def __copy_data(source, dest, primary_keys=None):
        primary_keys = [] if primary_keys is None else primary_keys
        for prop in orm.class_mapper(type(source)).iterate_properties:
            if (isinstance(prop, orm.ColumnProperty)
                    and prop.key not in primary_keys):
                setattr(dest, prop.key, getattr(source, prop.key))

    @staticmethod
    def __replicate(mapper, connection, original_obj):
        replicants = getattr(original_obj, "__replicas", [])  # if it IS a replicant it will not have a __replicas attribute
        primary_keys = ReplicateMixin.__get_primary_keys(original_obj)
        for objeto in replicants:
            ReplicateMixin.__copy_data(original_obj, objeto, primary_keys)
            objeto.sessao.commit()

    @staticmethod
    def __replicate_del(mapper, connection, original_obj):
        replicants = getattr(original_obj, "__replicas", [])  # if it IS a replicant it will not have a __replicas attribute
        for objeto in replicants:
            if objeto in objeto.sessao.new:
                objeto.sessao.expunge(objeto)
            else:
                objeto.sessao.delete(objeto)
            objeto.sessao.commit()

    @staticmethod
    def __get_primary_keys(mapped_object):
        return [key.name for key in orm.class_mapper(type(mapped_object)).primary_key]

    @classmethod
    def __declare_last__(cls):
        """Binds certain events to functions."""
        event.listen(cls, "before_insert", cls.__replicate)
        event.listen(cls, "before_update", cls.__replicate)
        event.listen(cls, "before_delete", cls.__replicate_del)
        # FIXME: might not play well with rollback
Example:
DeclarativeBase = declarative_base()

class Datum(ReplicateMixin, DeclarativeBase):
    __tablename__ = "xUnitTestData"
    Key = Column(Integer, primary_key=True)
    Value = Column(Float)
    nome = Column(String(10))

    def __repr__(self):
        return "{}; {}; {}".format(self.Key, self.Value, self.nome)

end_local = os.path.join(os.path.expanduser("~"), "Desktop", "local.bd")
end_remoto = os.path.join(os.path.expanduser("~"), "Desktop", "remoto.bd")

src_engine = create_engine('sqlite:///' + end_local, echo=False)
dst_engine = create_engine('sqlite:///' + end_remoto, echo=False)

DeclarativeBase.metadata.create_all(src_engine)
DeclarativeBase.metadata.create_all(dst_engine)

SessionSRC = sessionmaker(bind=src_engine)
SessionDST = sessionmaker(bind=dst_engine)

session1 = SessionSRC()
session2 = SessionDST()

item = Datum.get_or_create(session1, session2, Value=0.5, nome="terceiro")
item.Value = item.Value / 2
print(item)

session1.delete(item)
session1.commit()
session1.close()
My app connects to multiple databases using a technique similar to this. It works as long as I don't try to access different databases in the same request. Having looked back at that script, I see they have written a comment to this effect:
SQLAlchemy integration for CherryPy,
such that you can access multiple databases,
but only one of these databases per request or thread.
My app now requires me to fetch data from Database A and Database B. Is it possible to do this in a single request?
Please see below for sources and examples:
Working Example 1:

from model import meta

my_object_instance = meta.main_session().query(MyObject).filter(
    MyObject.id == 1
).one()

Working Example 2:

from model import meta

my_user = meta.user_session().query(User).filter(
    User.id == 1
).one()

Error Example:

from model import meta

my_object_instance = meta.main_session().query(MyObject).filter(
    MyObject.id == 1
).one()
my_user = meta.user_session().query(User).filter(
    User.id == 1
).one()
This errors with:
(sqlalchemy.exc.ProgrammingError) (1146, "Table 'main_db.user' doesn't exist")
Sources:
# meta.py
import cherrypy
import sqlalchemy
from sqlalchemy import MetaData
from sqlalchemy.orm import scoped_session, sessionmaker
from sqlalchemy.ext.declarative import declarative_base

# Return an Engine
def create_engine(defaultschema = True, schema = "", **kwargs):
    # A blank DB is the same as no DB, so to specify a non-schema-specific connection just override with defaultschema = False
    connectionString = 'mysql://%s:%s@%s/%s?charset=utf8' % (
        store['application'].config['main']['database-server-config-username'],
        store['application'].config['main']['database-server-config-password'],
        store['application'].config['main']['database-server-config-host'],
        store['application'].config['main']['database-server-config-defaultschema'] if defaultschema else schema
    )
    # Create engine object. We pass **kwargs through so this call can be extended
    return sqlalchemy.create_engine(connectionString, echo=True, pool_recycle=10, echo_pool=True, encoding='utf-8', **kwargs)

# Engines
main_engine = create_engine()
user_engine = None

# Sessions
_main_session = None
_user_session = None

# Metadata
main_metadata = MetaData()
main_metadata.bind = main_engine
user_metadata = MetaData()

# No idea what bases are/do but nothing works without them
main_base = declarative_base(metadata = main_metadata)
user_base = declarative_base(metadata = user_metadata)

# An easy collection of user database connections
engines = {}

# Each thread gets a session based on this object
GlobalSession = scoped_session(sessionmaker(autoflush=True, autocommit=False, expire_on_commit=False))

def main_session():
    _main_session = cherrypy.request.main_dbsession
    _main_session.configure(bind=main_engine)
    return _main_session

def user_session():
    _user_session = cherrypy.request.user_dbsession
    _user_session.configure(bind = get_user_engine())
    return _user_session

def get_user_engine():
    # Get dburi from the user's instance
    dburi = cherrypy.session['auth']['user'].instance.database
    # Store this engine for future use
    if dburi in engines:
        engine = engines.get(dburi)
    else:
        engine = engines[dburi] = create_engine(defaultschema = False, schema = dburi)
    # Return Engine
    return engine

def get_user_metadata():
    user_metadata.bind = get_user_engine()
    return user_metadata

# open a new session for the life of the request
def open_dbsession():
    cherrypy.request.user_dbsession = cherrypy.thread_data.scoped_session_class
    cherrypy.request.main_dbsession = cherrypy.thread_data.scoped_session_class
    return

# close the session for this request
def close_dbsession():
    if hasattr(cherrypy.request, "user_dbsession"):
        try:
            cherrypy.request.user_dbsession.flush()
            cherrypy.request.user_dbsession.remove()
            del cherrypy.request.user_dbsession
        except:
            pass
    if hasattr(cherrypy.request, "main_dbsession"):
        try:
            cherrypy.request.main_dbsession.flush()
            cherrypy.request.main_dbsession.remove()
            del cherrypy.request.main_dbsession
        except:
            pass
    return

# initialize the session factory class for the selected thread
def connect(thread_index):
    cherrypy.thread_data.scoped_session_class = scoped_session(sessionmaker(autoflush=True, autocommit=False))
    return

# add the hooks to cherrypy
cherrypy.tools.dbsession_open = cherrypy.Tool('on_start_resource', open_dbsession)
cherrypy.tools.dbsession_close = cherrypy.Tool('on_end_resource', close_dbsession)
cherrypy.engine.subscribe('start_thread', connect)
You could also choose an ORM that is designed from the ground up for multiple databases, like Dejavu.
Take a look at this:
http://pythonhosted.org/Flask-SQLAlchemy/binds.html
Basically, it suggests that you use a bind parameter for each connection. That said, this seems to be a bit of a hack.
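For illustration, a short sketch of the binds approach (this assumes a Flask app, so it will not map directly onto the CherryPy setup above):

from flask import Flask
from flask_sqlalchemy import SQLAlchemy

app = Flask(__name__)
app.config['SQLALCHEMY_DATABASE_URI'] = 'sqlite:///main.db'
app.config['SQLALCHEMY_BINDS'] = {'users': 'sqlite:///users.db'}
db = SQLAlchemy(app)

class MyObject(db.Model):  # stored in the default database
    id = db.Column(db.Integer, primary_key=True)

class User(db.Model):      # routed to the 'users' database via its bind key
    __bind_key__ = 'users'
    id = db.Column(db.Integer, primary_key=True)

# a single request can then query both databases through the same db.session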
This question has a lot more detail in the answer:
With sqlalchemy how to dynamically bind to database engine on a per-request basis
That said, neither this question nor the one referenced is particularly new, and sqlalchemy has probably moved on since then.
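One plausible cause, looking at the meta.py above (untested): open_dbsession() assigns the same scoped_session object to both cherrypy.request.main_dbsession and cherrypy.request.user_dbsession, so main_session() and user_session() reconfigure one shared factory, and whichever engine the request touched first stays bound for the rest of the request; that would produce exactly the "Table 'main_db.user' doesn't exist" error. A sketch of a possible fix using two independent factories per thread:

# two independent scoped_session factories, one per database
def connect(thread_index):
    cherrypy.thread_data.main_session_class = scoped_session(
        sessionmaker(bind=main_engine, autoflush=True, autocommit=False))
    cherrypy.thread_data.user_session_class = scoped_session(
        sessionmaker(autoflush=True, autocommit=False))  # bound in user_session() per request

def open_dbsession():
    cherrypy.request.main_dbsession = cherrypy.thread_data.main_session_class
    cherrypy.request.user_dbsession = cherrypy.thread_data.user_session_class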