sqlalchemy mixin: after_create not firing in child - python

I am working on an ORM-style version of the pq library (a PostgreSQL-powered Python queue system) where users can have their own queue model. It also adds features such as bulk insert/get, asynchronous support and more (if all goes well I'll be able to publish it).
I am having difficulties creating a trigger (I use a PostgreSQL notification system) automatically after table creation. I want to keep usage as simple as possible, so this would be much better than adding an extra classmethod for creating the trigger.
This is similar to the answer in this post, however I cannot use that solution because I need to pass a connection (both for escaping SQL identifiers, which depends on the connection's dialect, and for checking whether objects already exist).
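To illustrate what I mean by needing a connection, here is a minimal sketch (not part of my actual code, SQLAlchemy 1.4 style) of the two things I use it for, relying on the dialect's identifier preparer and runtime inspection:

from sqlalchemy import inspect

def quote_identifier(connection, name):
    # the dialect's identifier preparer escapes/quotes identifiers for the
    # connected backend (e.g. double quotes for PostgreSQL)
    return connection.dialect.identifier_preparer.quote(name)

def table_exists(connection, name, schema='public'):
    # checking whether an object already exists also needs a live connection
    return inspect(connection).has_table(name, schema=schema)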
Here is my attempt at it, based on the post I mentioned earlier. I apologize for the long code, but I figured I had to include a bit of context.
Base model
from sqlalchemy import (BIGINT, Column, func, Index, nullslast,
                        nullsfirst, SMALLINT, TIMESTAMP)
from sqlalchemy.orm import declared_attr, declarative_mixin
from sqlalchemy.event import listens_for


# this is the function that returns the base model
def postgres_queue_base(schema: str = 'public', tz_aware: bool = True,
                        use_trigger: bool = True) -> 'PostgresQueueBase':

    @declarative_mixin  # this is only for MyPy, it does not modify anything
    class PostgresQueueBase:
        __tablename__ = 'queue'

        @declared_attr
        def __table_args__(cls):
            return (Index(nullsfirst(cls.schedule_at), nullslast(cls.dequeued_at),
                          postgresql_where=(cls.dequeued_at == None)),
                    {'schema': schema})

        id = Column('id', BIGINT, primary_key=True)
        internal_mapping = Column('internal_mapping', BIGINT, nullable=False)
        enqueued_at = Column('enqueued_at', TIMESTAMP(timezone=tz_aware), nullable=False, server_default=func.now())
        dequeued_at = Column('dequeued_at', TIMESTAMP(timezone=tz_aware))
        expected_at = Column(TIMESTAMP(timezone=tz_aware))
        schedule_at = Column(TIMESTAMP(timezone=tz_aware))
        status = Column(SMALLINT, index=True)

    @listens_for(PostgresQueueBase, "instrument_class", propagate=True)
    def instrument_class(mapper, class_):
        print('EVENT INSTRUMENT CLASS')
        if use_trigger and mapper.local_table is not None:
            trigger_for_table(table=mapper.local_table)

    def trigger_for_table(table):
        print('Registering after_create event')

        @listens_for(table, "after_create")
        def create_trigger(table, connection):
            print('AFTER CREATE EVENT')
            # code that creates triggers and logs that (here I'll just print
            # something and put pseudo code in a comment)
            # trig = PostgresQueueTrigger(schema=get_schema_from_model(table),
            #                             table_name=table.name, connection=connection)
            # trig.add_trigger()
            print('Creating notify function public.notify_job')
            # unique trigger name using hash of schema.table_name
            # (avoids problems with long names and special chars)
            print('Creating trigger trigger_job_5d69fc3870b446d0a1f56a793b799ae3')

    return PostgresQueueBase
When I try the base model
from sqlalchemy import Column, create_engine, INTEGER, TEXT
from sqlalchemy.orm import declarative_base

# IMPORTANT: inherit both a declarative base AND the postgres queue base
Base = declarative_base()
PostgresQueueBase = postgres_queue_base(schema='public')

# create custom queue model
class MyQueue(Base, PostgresQueueBase):
    # optional custom table name (by default it is "queue")
    __tablename__ = 'demo_queue'
    # custom columns
    operation = Column(TEXT)
    project_id = Column(INTEGER)

# create table in database
# change connection string accordingly!
engine = create_engine('postgresql://username:password@localhost:5432/postgres')
Base.metadata.create_all(bind=engine)
EVENT INSTRUMENT CLASS
Registering after_create event
I cannot see "AFTER CREATE EVENT" printed out 😟. How do I get the "after_create" event to be fired?
Thanks in advance for your help 👍!

Sorry, I finally figured it out... The table already existed, so the events were never firing. Also, the code above has some errors in the events (I could not test them since they were never executed), and the composite index in __table_args__ somehow gets the name " NULLS FIRST". I used a hash to get a better name and to avoid problems with character limits or escaping. The corrected code is below, followed by a quick way to verify that the event actually fires.
import hashlib
from sqlalchemy import (BIGINT, Column, func, Index, nullslast,
                        nullsfirst, SMALLINT, TIMESTAMP)
from sqlalchemy.orm import declared_attr, declarative_mixin
from sqlalchemy.event import listens_for


# this is the function that returns the base model
def postgres_queue_base(schema: str = 'public', tz_aware: bool = True,
                        use_trigger: bool = True) -> 'PostgresQueueBase':

    @declarative_mixin  # this is only for MyPy, it does not modify anything
    class PostgresQueueBase:
        __tablename__ = 'queue'

        @declared_attr
        def __table_args__(cls):
            # to prevent any problems such as escaping, SQL injection or limit
            # of characters I'll just md5 the table name for the index
            md5 = hashlib.md5(cls.__tablename__.encode('utf-8')).hexdigest()
            return (Index(f'queue_prio_ix_{md5}', nullsfirst(cls.schedule_at), nullslast(cls.dequeued_at),
                          postgresql_where=(cls.dequeued_at == None)),
                    {'schema': schema})

        id = Column('id', BIGINT, primary_key=True)
        internal_mapping = Column('internal_mapping', BIGINT, nullable=False)
        enqueued_at = Column('enqueued_at', TIMESTAMP(timezone=tz_aware), nullable=False, server_default=func.now())
        dequeued_at = Column('dequeued_at', TIMESTAMP(timezone=tz_aware))
        expected_at = Column(TIMESTAMP(timezone=tz_aware))
        schedule_at = Column(TIMESTAMP(timezone=tz_aware))
        status = Column(SMALLINT, index=True)

    if use_trigger:
        @listens_for(PostgresQueueBase, "instrument_class", propagate=True)
        def class_instrument(mapper, class_):
            if mapper.local_table is not None:
                create_trigger_event(table=mapper.local_table)

        def create_trigger_event(table):
            @listens_for(table, "after_create")
            def create_trigger(target, connection, **kw):
                print('Create trigger')

    return PostgresQueueBase
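And to verify that the event actually fires: make sure the table does not already exist before calling create_all() (that was my actual problem). create_all() silently skips tables that already exist, and skipped tables never emit "after_create". A quick sketch using the MyQueue model and engine from the question:

# drop the pre-existing table so that "after_create" runs again
Base.metadata.drop_all(bind=engine, tables=[MyQueue.__table__])
Base.metadata.create_all(bind=engine)  # now prints 'Create trigger'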

Related

SQLAlchemy strategy: ORM + Core for classes with large amounts of data

Apparently use of ORM and Core in tandem is possible, but I haven't been able to find any solid explanation of a strategy for this.
Here's the use case class:
class DataHolder(Base):
    __tablename__ = 'data_holder'
    id = Column(Integer, primary_key=True)
    dataset_id = Column(Integer, ForeignKey('data_set.id'))
    name = Column(String)

    _dataset_table = Table('data_set', Base.metadata,
        Column('id', Integer, primary_key=True),
    )
    _datarows_table = Table('data_rows', Base.metadata,
        Column('id', Integer, primary_key=True),
        Column('dataset_id', None, ForeignKey('data_set.id')),
        Column('row', Integer),
        Column('col_0', Integer),
        Column('col_1', Integer),
        Column('col_2', Integer),
    )

    def __init__(self, name=None, data=None):
        self.name = name
        self.data = data

    def _pack_info(self):
        # Return __class__ and other info needed for packing.
        pass

    def _unpack_info(self):
        # Return info needed for unpacking.
        pass
name should be persisted via the ORM. data, which would be a large NumPy array (or similar type), should be persisted via the Core.
There is a go-between table 'data_set' that exists for the purpose of a many-to-one relationship between DataHolder and the data. This allows data sets to exist independently within some library. (The sole purpose of this table is to generate IDs for new data sets.)
Actual persistence would be accomplished through a class that implements some listeners, such as the following.
class PersistenceManager:
    def __init__(self):
        self.init_db()
        self.init_listeners()

    def init_db(self):
        engine = create_engine('sqlite:///path/to/database.db')
        self.sa_engine = engine
        self.sa_sessionmaker = sessionmaker(bind=engine)
        Base.metadata.create_all(engine)

    def init_listeners(self):
        @event.listens_for(Session, 'transient_to_pending')
        def pack_data(session, instance):
            try:
                pack_info = instance._pack_info()
                # Use Core to execute INSERT for bulky data.
            except AttributeError:
                pass

        @event.listens_for(Session, 'loaded_as_persistent')
        def unpack_data(session, instance):
            try:
                unpack_info = instance._unpack_info()
                # Use Core to execute SELECT for bulky data.
            except AttributeError:
                pass

    def persist(self, obj):
        session.add(obj)

    def load(self, class_, spec):
        obj = session.query(class_).filter_by(**spec).all()[-1]
        return obj

    def session_scope(self):
        session = self.sa_sessionmaker()
        try:
            yield session
            session.commit()
        except:
            session.rollback()
            raise
        finally:
            session.close()
The idea is that whenever a DataHolder is persisted, its data is also persisted at the same (or nearly the same) time.
Listening for 'transient_to_pending' (for "packing") and 'loaded_as_persistent' (for "unpacking") events will work for simple saving and loading. However, it seems care should be taken to also listen for the 'pending_to_transient' event. In the case of a rollback, the data added via Core will not be pulled back out of the database in the same way the ORM-related data will.
Is there another, better way to manipulate this behavior besides listening for 'pending_to_transient'? This could cause problems in the case where two different DataHolders reference the same data set: one DataHolder could rollback, removing the data set from the database so that the other DataHolder can no longer use it.
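(One possibility, sketched here only as an assumption and not part of the original question: if the Core INSERTs inside the listener are executed on the session's own connection, they join the ORM transaction, so a rollback undoes both the ORM and the Core writes without extra 'pending_to_transient' bookkeeping. build_rows() below is hypothetical.)

from sqlalchemy import event
from sqlalchemy.orm import Session

@event.listens_for(Session, 'transient_to_pending')
def pack_data(session, instance):
    if not hasattr(instance, '_pack_info'):
        return
    # hypothetical helper: turn instance.data into a list of row dicts
    rows = build_rows(instance)
    # executing on the session's own connection keeps the Core INSERT inside
    # the same transaction, so session.rollback() undoes it with the ORM work
    session.connection().execute(DataHolder._datarows_table.insert(), rows)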

Python SQLAlchemy access huge DB data without creating models

I am using Flask, Python and SQLAlchemy to connect to a huge DB where a lot of stats are saved. I need to create some useful insights from these stats, so I only need to read/get the data and never modify it.
The issue I have now is the following:
Before I can access a table I need to replicate it in my models file. For example, I see the table Login_Data in the DB, so I go into my models and recreate the exact same table.
class Login_Data(Base):
    __tablename__ = 'login_data'
    id = Column(Integer, primary_key=True)
    date = Column(Date, nullable=False)
    new_users = Column(Integer, nullable=True)

    def __init__(self, date=None, new_users=None):
        self.date = date
        self.new_users = new_users

    def get(self, id):
        if self.id == id:
            return self
        else:
            return None

    def __repr__(self):
        return '<%s(%r, %r, %r)>' % (self.__class__.__name__, self.id, self.date, self.new_users)
I do this because otherwise I can't query it using:
some_data = Login_Data.query.limit(10)
But this feels unnecessary; there must be a better way. What's the point in recreating the models if they are already defined? What should I use here:
some_data = [SOMETHING HERE SO I DONT NEED TO RECREATE THE TABLE].query.limit(10)
Simple question but I have not found a solution yet.
Thanks to Tryph for pointing me to the right sources.
To access the data of an existing DB with SQLAlchemy you need to use automap. In the configuration file where you load/declare your DB, use automap_base(). After that you can create your models and use the table names that already exist in the DB without specifying everything yourself:
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine
import stats_config
Base = automap_base()
engine = create_engine(stats_config.DB_URI, convert_unicode=True)
# reflect the tables
Base.prepare(engine, reflect=True)
# mapped classes are now created with names by default
# matching that of the table name.
LoginData = Base.classes.login_data
db_session = Session(engine)
Once this is done you can use all the usual SQLAlchemy query functions, for example:
some_data = db_session.query(LoginData).limit(10)
You may be interested in reflection and automap.
Unfortunately, since I have never used either of those features, I am not able to tell you more about them. I just know that they allow you to use the database schema without explicitly declaring it in Python.
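For completeness, here is what plain table reflection (the non-automap option mentioned above) might look like; a small sketch in SQLAlchemy 1.4 style, reusing the login_data table and the stats_config module from the accepted answer:

from sqlalchemy import MetaData, Table, create_engine, select
import stats_config

engine = create_engine(stats_config.DB_URI)
metadata = MetaData()

# load the column definitions of the existing table from the database
login_data = Table('login_data', metadata, autoload_with=engine)

with engine.connect() as conn:
    some_data = conn.execute(select(login_data).limit(10)).fetchall()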

Should I use a single class to put SQLAlchemy table specifications and my business logic?

I have a class Contract to represent my contracts:
.../mypackage/Contract.py
class Contract:
    # setter and getters.

    def isValid(self, contract_number=None):
        # code
        pass

    def cancelTheContract(self, contract_number=None):
        # code
        pass
And my SQLAlchemy Contract class:
.../mypackage/orm.py
from sqlalchemy import create_engine
from sqlalchemy import MetaData
from sqlalchemy import Column, Float, ForeignKey, Integer, String, Table
from sqlalchemy.orm import *
from sqlalchemy.ext.declarative import declarative_base

Base = declarative_base()
db = create_engine('mysql://myuser:mypasswd@localhost/mydatabase')
contracts = Table('contracts', MetaData(bind=None))

class Connection:
    def connect(self):
        Session = sessionmaker(bind=db)
        session = Session()
        return session

class Contract(Base):
    __tablename__ = 'contracts'
    id = Column(Integer, primary_key=True)
    type = Column(String)
    price = Column(Float)
So...
Would it be OK to merge both Contract classes into a single one?
If not, I have to instantiate one class specific to the database table and another specific to the business logic, so whenever I have to deal with database data, manipulate it and put it back, I have to deal with two objects that are basically the same thing.
Well... I guess I'm missing some important concept here.
What should I read to understand better about my question implications?
Thanks!
Gio
Yes. The philosophy of ORMs is to map physical tables to business entity objects, so it is best practice to combine your two classes. The SQLAlchemy attributes manage the persistent fields of your entity, and you can encapsulate all the business logic in the same class, following standard object-oriented modeling techniques.
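A rough sketch of what the merged class could look like (the validation rule is made up purely for illustration):

from sqlalchemy import Column, Float, Integer, String
from sqlalchemy.ext.declarative import declarative_base

Base = declarative_base()

class Contract(Base):
    __tablename__ = 'contracts'

    # persistence: fields managed by SQLAlchemy
    id = Column(Integer, primary_key=True)
    type = Column(String(50))
    price = Column(Float)

    # business logic: plain methods on the same class
    def isValid(self):
        # illustrative rule only
        return self.price is not None and self.price >= 0

    def cancelTheContract(self, session):
        # e.g. remove the contract through the current session
        session.delete(self)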

Instantiating object automatically adds to SQLAlchemy Session. Why?

From my understanding of SQLAlchemy, in order to add a model to a session, I need to call session.add(obj). However, for some reason, in my code, SQLAlchemy seems to do this automatically.
Why is it doing this, and how can I stop it? Am I approaching session in the correct way?
example
>>> from database import Session as db
>>> import clients
>>> from instances import Instance
>>> from uuid import uuid4
>>> len(db.query(Instance).all())
>>> 0 # Note, no instances in database/session
>>> i = Instance(str(uuid4()), clients.get_by_code('AAA001'), [str(uuid4())])
>>> len(db.query(Instance).all())
>>> 1 # Why?? I never called db.add(i)!
database.py
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker, scoped_session
from sqlalchemy.ext.declarative import declarative_base
import config

Base = declarative_base()

class Database():
    def __init__(self):
        db_url = 'postgresql://{:s}:{:s}@{:s}:{}/{:s}'.format(
            config.database['user'],
            config.database['password'],
            config.database['host'],
            config.database['port'],
            config.database['dbname']
        )
        self.engine = create_engine(db_url)
        session_factory = sessionmaker(bind=self.engine)
        self.session = scoped_session(session_factory)

Database = Database()
Session = Database.session
instance.py
from sqlalchemy import Column, Text, ForeignKey
from sqlalchemy.orm import relationship
from sqlalchemy.dialects.postgresql import UUID, ARRAY
import database
Base = database.Base
class Instance(Base):
    __tablename__ = 'instances'
    uuid = Column(UUID, primary_key=True)
    client_code = Column(
        Text, ForeignKey('clients.code', ondelete='CASCADE'), nullable=False)
    mac_addresses = Column(ARRAY(Text, as_tuple=True),
                           primary_key=True)
    client = relationship("Client", back_populates="instances")

    def __init__(self, uuid, client, mac_addresses):
        self.uuid = uuid
        self.client = client
        self.mac_addresses = tuple(mac_addresses)
client.py
from sqlalchemy import Column, Text
from sqlalchemy.orm import relationship
import database
from database import Session as db
Base = database.Base
class Client(Base):
    __tablename__ = 'clients'
    code = Column(Text, primary_key=True)
    name = Column(Text)
    instances = relationship("Instance", back_populates='client')

    def __init__(self, code, name=None):
        self.code = code
        self.name = name

def get_by_code(code):
    client = db.query(Client).filter(Client.code == code).first()
    return client
When you create a SQLAlchemy object and link it directly to another SQLAlchemy object, both objects end up in the session.
The reason is that SQLAlchemy needs to make sure you can query these objects.
Take, for example, a user with addresses.
If you create a user in code, with an address, both the user and the address end up in the session, because the address is linked to the user and SQLAlchemy needs to make sure you can query all addresses of a user using user.addresses.all().
In that case all (possibly) existing addresses need to be fetched, as well as the new address you just added. For that purpose the newly added address needs to be saved in the database.
To prevent this from happening (for example if you only need objects to just calculate with), you can link the objects with their IDs/Foreign Keys:
address.user_id = user.user_id
However, if you do this, you won't be able to access the SQLAlchemy properties anymore. So user.addresses or address.user will no longer yield results.
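To make the difference concrete, a small sketch with a hypothetical User/Address pair (the names are not from the question):

# linked through the relationship: the save-update cascade pulls the related
# Address into the session together with the User you add
address = Address(email='a@example.com')
address.user = user           # populates user.addresses via the backref
session.add(user)             # flushing the user also flushes the address

# linked by foreign key value only: nothing is cascaded into the session,
# but the ORM attributes (user.addresses / address.user) are not populated
other = Address(email='b@example.com')
other.user_id = user.id       # plain column assignment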
The reverse is also true; I asked a question myself a while back why linking two objects by ID will not result in SQLAlchemy linking these objects in the ORM:
relevant stackoverflow question
another description of this behavior

Can an ORM column trigger a session flush in SQLAlchemy?

Question
Can a property access trigger a session flush in SQLAlchemy? My expectation would be for, e.g., queries attached to an object via column_property() or @hybrid_property to cause a session autoflush, in the same way that queries made through session.Query() do. That does not seem to be the case.
In the simple example below, an Account contains an Entry collection. It also provides a "balance" property, constructed with column_property(), that exposes a select-sum query. New entries only appear in an account's balance if session.flush() is called explicitly.
This behavior seems suboptimal: users of the Account class need to sprinkle flush() calls throughout their code based on knowing the internals of the balance implementation. If the implementation changes (e.g., if "balance" was previously a plain Python @property), bugs can be introduced even though the Account interface is essentially identical. Is there an alternative?
Complete Example
import sys
import sqlalchemy as sa
import sqlalchemy.sql
import sqlalchemy.orm
import sqlalchemy.ext.declarative

Base = sa.ext.declarative.declarative_base()

class Entry(Base):
    __tablename__ = "entries"
    id = sa.Column(sa.Integer, primary_key=True)
    value = sa.Column(sa.Numeric, primary_key=True)
    account_id = sa.Column(sa.Integer, sa.ForeignKey("accounts.id"))
    account = sa.orm.relationship("Account", backref="entries")

class Account(Base):
    __tablename__ = "accounts"
    id = sa.Column(sa.Integer, primary_key=True)
    balance = sa.orm.column_property(
        sa.sql.select([sa.sql.func.sum(Entry.value)])
        .where(Entry.account_id == id)
    )
def example(database_url):
    # connect to the database and prepare the schema
    engine = sa.create_engine(database_url)
    session = sa.orm.sessionmaker(bind=engine)()
    Base.metadata.create_all(bind=engine)
    # add an entry to an account
    account = Account()
    account.entries.append(Entry(value=42))
    session.add(account)
    # and look for that entry in the balance
    print "account.balance:", account.balance
    assert account.balance == 42

if __name__ == "__main__":
    example(sys.argv[1])
Observed Output
$ python sa_column_property_example.py postgres:///za_test
account.balance: None
Traceback (most recent call last):
File "sa_column_property_example.py", line 46, in <module>
example(sys.argv[1])
File "sa_column_property_example.py", line 43, in example
assert account.balance == 42
AssertionError
Preferred Output
I'd like to see "account.balance: 42", without adding an explicit call to session.flush().
A column_property is only evaluated at query time, that is, when you say query(Account), as well as when the attribute is expired, that is, if you said session.expire(account, ['balance']).
To have an attribute invoke a query every time, we use a @property (with some small mods here for the script to work with SQLite):
import sys
import sqlalchemy as sa
import sqlalchemy.sql
import sqlalchemy.orm
import sqlalchemy.ext.declarative

Base = sa.ext.declarative.declarative_base()

class Entry(Base):
    __tablename__ = "entries"
    id = sa.Column(sa.Integer, primary_key=True)
    value = sa.Column(sa.Numeric)
    account_id = sa.Column(sa.Integer, sa.ForeignKey("accounts.id"))
    account = sa.orm.relationship("Account", backref="entries")

class Account(Base):
    __tablename__ = "accounts"
    id = sa.Column(sa.Integer, primary_key=True)

    @property
    def balance(self):
        return sqlalchemy.orm.object_session(self).query(
            sa.sql.func.sum(Entry.value)
        ).filter(Entry.account_id == self.id).scalar()
def example(database_url):
    # connect to the database and prepare the schema
    engine = sa.create_engine(database_url, echo=True)
    session = sa.orm.sessionmaker(bind=engine)()
    Base.metadata.create_all(bind=engine)
    # add an entry to an account
    account = Account()
    account.entries.append(Entry(value=42))
    session.add(account)
    # and look for that entry in the balance
    print "account.balance:", account.balance
    assert account.balance == 42

if __name__ == "__main__":
    example("sqlite://")
Note that "flushing" itself is generally not something we have to worry about; the autoflush feature will ensure flush is called each time query() goes to the database to get results, so it's really ensuring that a query occurs which is what we're going for.
Another approach to this issue is to use hybrids. I'd recommend reading the overview of all three methods at SQL Expressions as Mapped Attributes, which lists out the tradeoffs of each approach.
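For reference, a hybrid version of balance could look roughly like this (a sketch only, written against SQLAlchemy 1.4-style select() and building on the Entry/Account models above, in place of the column_property version):

from sqlalchemy import Column, Integer, func, select
from sqlalchemy.ext.hybrid import hybrid_property
from sqlalchemy.orm import object_session

class Account(Base):
    __tablename__ = "accounts"
    id = Column(Integer, primary_key=True)

    @hybrid_property
    def balance(self):
        # instance access: issues a query, so autoflush makes pending entries visible
        return object_session(self).query(
            func.sum(Entry.value)
        ).filter(Entry.account_id == self.id).scalar()

    @balance.expression
    def balance(cls):
        # class-level access: usable inside queries,
        # e.g. session.query(Account).filter(Account.balance > 0)
        return (select(func.sum(Entry.value))
                .where(Entry.account_id == cls.id)
                .scalar_subquery())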
