SQLAlchemy threaded session execute does not commit changes - python

I have the following simple setup
from concurrent.futures import ThreadPoolExecutor
from functools import partial
from sqlalchemy.orm import sessionmaker, scoped_session
def do_query():
    engine = ...
    session_factory = sessionmaker(bind=engine, autocommit=False, autoflush=False)
    ThreadedSession = scoped_session(session_factory)

    f = partial(
        _query_function,
        session=ThreadedSession,
    )

    queries = [...]
    with ThreadPoolExecutor(max_workers=num_threads) as pool:
        pool.map(f, queries)

    ThreadedSession.commit()


def _query_function(query, session):
    s = session()
    s.execute(query)
    return
I pass the queries to a ThreadPoolExecutor and have each thread use the shared session factory, as described in https://docs.sqlalchemy.org/en/13/orm/contextual.html#contextual-thread-local-sessions. However, the changes are not committed this way. Why?
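A likely explanation: scoped_session is thread-local, so each worker thread gets its own Session, and the ThreadedSession.commit() issued from the main thread only touches the main thread's session, which never executed anything. A minimal sketch of one way around this, assuming the queries are independent of each other, is to commit (and remove) the session inside each worker:

def _query_function(query, session):
    s = session()           # thread-local Session for this worker
    try:
        s.execute(query)
        s.commit()          # commit in the same thread that did the work
    except Exception:
        s.rollback()
        raise
    finally:
        session.remove()    # discard this thread's session from the registry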

Related

python async sqlalchemy session get tables

How can I get a list of table objects with an asynchronous session?
I tried many options but never found the right one.
This is how I get the session object itself:
from sqlalchemy.ext.asyncio import AsyncSession, create_async_engine
from sqlalchemy.orm import sessionmaker
from sqlalchemy import MetaData
from sqlalchemy import inspect
from sqlalchemy.future import select
from utils import *
import asyncio
engine = create_async_engine(config.database_url).execution_options(autocommit=True)
async_session = sessionmaker(
    engine, expire_on_commit=False, class_=AsyncSession
)

async def execute(*args):
    return await (async_session()).execute(*args)

async def tables():
    session = async_session()
    # print(await session.run_sync(inspect(engine).get_table_names))

asyncio.run(tables())
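For reference, one pattern that works in SQLAlchemy 1.4+ is to run the synchronous inspection API through run_sync() on an async connection. A minimal sketch, with the engine URL as a placeholder:

import asyncio

from sqlalchemy import inspect
from sqlalchemy.ext.asyncio import create_async_engine

engine = create_async_engine("postgresql+asyncpg://user:pass@localhost/db")  # placeholder URL

async def get_table_names():
    async with engine.connect() as conn:
        # inspect() needs a sync connection, so hand the work to run_sync()
        return await conn.run_sync(lambda sync_conn: inspect(sync_conn).get_table_names())

print(asyncio.run(get_table_names()))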

Is sqlalchemy sessionmaker thread safe?

Instead of:
from sqlalchemy import create_engine
from sqlalchemy.orm import Session
# an Engine, which the Session will use for connection
# resources
engine = create_engine('sqlite:///...')
# create session and add objects
with Session(engine) as session:
    session.add(some_object)
    session.add(some_other_object)
    session.commit()
I create a sessionmaker (following the example in the documentation, see below):
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
# an Engine, which the Session will use for connection
# resources, typically in module scope
engine = create_engine('postgresql://scott:tiger@localhost/')
# a sessionmaker(), also in the same scope as the engine
Session = sessionmaker(engine)
# we can now construct a Session() without needing to pass the
# engine each time
with Session() as session:
    session.add(some_object)
    session.add(some_other_object)
    session.commit()
Can I use the sessions from the sessionmaker in different threads (spawning multiple sessions at the same time)? In other words, is sessionmaker a thread-safe object? If yes, can multiple sessions exist and read/write the same tables at the same time?
Furthermore, what is the advantage of using scoped_session - is it related to the problem of multiple sessions (one per thread)?
# set up a scoped_session
from sqlalchemy.orm import scoped_session
from sqlalchemy.orm import sessionmaker
session_factory = sessionmaker(bind=some_engine)
Session = scoped_session(session_factory)
# now all calls to Session() will create a thread-local session
some_session = Session()
# you can now use some_session to run multiple queries, etc.
# remember to close it when you're finished!
Session.remove()
Session objects are not thread-safe, but the sessionmaker factory is. What I recommend is using sessionmaker instead of sharing a single Session: it yields a new Session object every time you need one, so you don't keep a database connection sitting idle. I'd use the approach below.
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker, Session
DB_ENGINE = create_engine('sqlite:///...')
DB_SES_MAKER = sessionmaker(bind=DB_ENGINE)
def get_db():
    db = DB_SES_MAKER()
    try:
        yield db
    finally:
        db.close()
Then call get_db whenever needed:
db = next(get_db())
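As a rough illustration of the thread-safety point (the engine URL is a placeholder): the sessionmaker factory can be shared across threads as long as each thread creates, uses, and closes its own Session:

from concurrent.futures import ThreadPoolExecutor

from sqlalchemy import create_engine, text
from sqlalchemy.orm import sessionmaker

engine = create_engine("sqlite:///example.db")  # placeholder URL
SessionFactory = sessionmaker(bind=engine)      # the factory itself is safe to share

def work(i):
    # each thread opens and closes its own Session; Sessions are never shared
    with SessionFactory() as session:
        session.execute(text("SELECT 1"))
        session.commit()

with ThreadPoolExecutor(max_workers=4) as pool:
    list(pool.map(work, range(10)))

scoped_session adds a thread-local registry on top of this, so code that simply calls Session() anywhere within a thread keeps getting that thread's session back until Session.remove() is called.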

How to use PostgreSQL test database in async FastAPI tests?

I'm working on an async FastAPI project and I want to connect to the database during tests. Coming from Django, my instinct was to create pytest fixtures that take care of creating/dropping the test database. However, I couldn't find much documentation on how to do this. The most complete instructions I could find were in this tutorial, but they don't work for me because they are all synchronous. I'm somewhat new to async development so I'm having trouble adapting the code to work async. This is what I have so far:
import pytest
from sqlalchemy.ext.asyncio import create_async_engine, session
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
from sqlalchemy_utils import database_exists, create_database
from fastapi.testclient import TestClient

from app.core.db import get_session
from app.main import app

Base = declarative_base()


@pytest.fixture(scope="session")
def db_engine():
    default_db = (
        "postgresql+asyncpg://postgres:postgres@postgres:5432/postgres"
    )
    test_db = "postgresql+asyncpg://postgres:postgres@postgres:5432/test"

    engine = create_async_engine(default_db)
    if not database_exists(test_db):  # <- Getting error on this line
        create_database(test_db)

    Base.metadata.create_all(bind=engine)
    yield engine


@pytest.fixture(scope="function")
def db(db_engine):
    connection = db_engine.connect()

    # begin a non-ORM transaction
    connection.begin()

    # bind an individual Session to the connection
    Session = sessionmaker(bind=connection)
    db = Session()
    # db = Session(db_engine)

    yield db

    db.rollback()
    connection.close()


@pytest.fixture(scope="function")
def client(db):
    app.dependency_overrides[get_session] = lambda: db
    PREFIX = "/api/v1/my-endpoint"
    with TestClient(PREFIX, app) as c:
        yield c
And this is the error I'm getting:
E sqlalchemy.exc.MissingGreenlet: greenlet_spawn has not been called; can't call await_() here. Was IO attempted in an unexpected place? (Background on this error at: https://sqlalche.me/e/14/xd2s)
/usr/local/lib/python3.9/site-packages/sqlalchemy/util/_concurrency_py3k.py:67: MissingGreenlet
Any idea what I have to do to fix it?
You're trying to use a sync Session with an async engine. Try using:
from sqlalchemy.ext.asyncio import AsyncSession
Session = sessionmaker(bind=connection, class_=AsyncSession)
https://docs.sqlalchemy.org/en/14/orm/extensions/asyncio.html
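For context, here is a rough sketch of fully async engine/session fixtures with pytest-asyncio. The URL and Base are placeholders, sqlalchemy_utils' database_exists/create_database are synchronous so test-database creation is left out, and fixture/event-loop scoping may need adjusting for your pytest-asyncio version:

import pytest_asyncio
from sqlalchemy.ext.asyncio import AsyncSession, create_async_engine
from sqlalchemy.orm import declarative_base, sessionmaker

Base = declarative_base()  # placeholder metadata

@pytest_asyncio.fixture
async def db_engine():
    engine = create_async_engine("postgresql+asyncpg://postgres:postgres@postgres:5432/test")
    async with engine.begin() as conn:
        # create_all is synchronous, so run it through the async connection
        await conn.run_sync(Base.metadata.create_all)
    yield engine
    await engine.dispose()

@pytest_asyncio.fixture
async def db(db_engine):
    async with db_engine.connect() as connection:
        transaction = await connection.begin()
        Session = sessionmaker(bind=connection, class_=AsyncSession, expire_on_commit=False)
        async with Session() as session:
            yield session
        await transaction.rollback()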

Handling scoped_session across multiple modules with SQLAlchemy

I'm a newbie with SQLAlchemy and I'm working on a complex ETL process, so I wrote the simplified code below:
module1.py

class Foo:
    def foo_method(self):
        # doing stuff with database

module2.py

class Bar:
    def bar_method(self):
        # doing stuff with database

main_script.py

from module1 import Foo
from module2 import Bar

def run():
    with Pool(processes=num_workers) as pool:
        responses = [pool.apply_async(some_func, (param,)) for param in params]
        for response in responses:
            response.get()

def some_func(param):
    engine = create_engine(connection_string, echo=True)
    Session = scoped_session(sessionmaker(bind=engine))
    session = Session()

    # Start doing some stuff with database
    foo = Foo()
    foo.foo_method()

    bar = Bar()
    bar.bar_method()

So I have a Pool of worker processes. When I call main_script.run(), each worker creates a database session inside some_func. My question is: how can I use the same session for each worker in module1 and module2 without passing the session as a parameter to each method? Should I add the following lines in each module/file?
engine = create_engine(connection_string, echo=True)
Session = scoped_session(sessionmaker(bind=engine))
session = Session()
scoped_session should be created at the module level. For your project structure, that probably means having a separate module to house the engine and session:
db.py

from sqlalchemy import create_engine
from sqlalchemy.orm import scoped_session, sessionmaker

engine = create_engine(connection_string, echo=True)
Session = scoped_session(sessionmaker(bind=engine))

module1.py

from db import Session

class Foo:
    def foo_method(self):
        session = Session()
        session.query(...)...
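One follow-up worth noting: since each worker pulls its session from the shared scoped_session registry, it should also clean up when its task is done. A sketch of how some_func might end, assuming the db module above (the commit/rollback placement is illustrative, not taken from the original answer):

main_script.py

from db import Session
from module1 import Foo
from module2 import Bar

def some_func(param):
    # Foo and Bar call Session() themselves and receive this worker's session
    try:
        Foo().foo_method()
        Bar().bar_method()
        Session.commit()      # proxies to the current worker's session
    except Exception:
        Session.rollback()
        raise
    finally:
        Session.remove()      # drop this worker's session from the registry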

SQLAlchemy ThreadPoolExecutor "Too many clients"

I wrote a script with this sort of logic in order to insert many records into a PostgreSQL table as they are generated.
#!/usr/bin/env python3
import asyncio
from concurrent.futures import ProcessPoolExecutor as pool
from functools import partial

import sqlalchemy as sa
from sqlalchemy.ext.declarative import declarative_base

metadata = sa.MetaData(schema='stackoverflow')
Base = declarative_base(metadata=metadata)


class Example(Base):
    __tablename__ = 'example'

    pk = sa.Column(sa.Integer, primary_key=True)
    text = sa.Column(sa.Text)


sa.event.listen(Base.metadata, 'before_create',
                sa.DDL('CREATE SCHEMA IF NOT EXISTS stackoverflow'))

engine = sa.create_engine(
    'postgresql+psycopg2://postgres:password@localhost:5432/stackoverflow'
)

Base.metadata.create_all(engine)

session = sa.orm.sessionmaker(bind=engine, autocommit=True)()


def task(value):
    engine.dispose()
    with session.begin():
        session.add(Example(text=value))


async def infinite_task(loop):
    spawn_task = partial(loop.run_in_executor, None, task)
    while True:
        await asyncio.wait([spawn_task(value) for value in range(10000)])


def main():
    loop = asyncio.get_event_loop()
    with pool() as executor:
        loop.set_default_executor(executor)
        asyncio.ensure_future(infinite_task(loop))
        loop.run_forever()
    loop.close()


if __name__ == '__main__':
    main()
This code works just fine, creating a pool of as many processes as I have CPU cores, and happily chugging along forever. I wanted to see how threads would compare to processes, but I could not get a working example. Here are the changes I made:
from concurrent.futures import ThreadPoolExecutor as pool

session_maker = sa.orm.sessionmaker(bind=engine, autocommit=True)
Session = sa.orm.scoped_session(session_maker)


def task(value):
    engine.dispose()

    # create new session per thread
    session = Session()
    with session.begin():
        session.add(Example(text=value))
    # remove session once the work is done
    Session.remove()
This version runs for a while before a flood of "too many clients" exceptions:
sqlalchemy.exc.OperationalError: (psycopg2.OperationalError) FATAL: sorry, too many clients already
What am I missing?
It turns out that the problem is engine.dispose(), which, in the words of Mike Bayer (zzzeek) "is leaving PG connections lying open to be garbage collected."
Source: https://groups.google.com/forum/#!topic/sqlalchemy/zhjCBNebnDY
So the updated task function looks like this:
def task(value):
    # create new session per thread
    session = Session()
    with session.begin():
        session.add(Example(text=value))
    # remove the thread-local session once the work is done
    Session.remove()
It looks like you're opening a lot of new connections without closing them. Try adding engine.dispose() at the end:
from concurrent.futures import ThreadPoolExecutor as pool

session_maker = sa.orm.sessionmaker(bind=engine, autocommit=True)
Session = sa.orm.scoped_session(session_maker)


def task(value):
    engine.dispose()

    # create new session per thread
    session = Session()
    with session.begin():
        session.add(Example(text=value))
    # remove session once the work is done
    Session.remove()

    engine.dispose()
Keep in mind the cost of opening a new connection: ideally you would have one connection per process/thread. I'm not sure how ThreadPoolExecutor handles this, but connections are probably not being closed when a thread finishes its work.
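Another knob worth mentioning: the "too many clients" error can also be avoided by capping the engine's connection pool so the total number of connections across threads stays below PostgreSQL's max_connections. A sketch, with the numbers as placeholders to tune for your server:

engine = sa.create_engine(
    'postgresql+psycopg2://postgres:password@localhost:5432/stackoverflow',
    pool_size=5,        # steady-state connections kept in the pool
    max_overflow=5,     # extra connections allowed under load
    pool_timeout=30,    # seconds to wait for a free connection before erroring
)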
