Many-to-many intersection in sqlalchemy - python

I have a Character class with a .tags attribute; the .tags attribute is a list of Tag objects in a many-to-many relationship. I'm trying to write a query that will find all pairs of characters that don't have the same name but have at least one tag in common; how would I go about doing this?

You go about this as follows:
think of an SQL query which will give you the desired result
create a corresponding SA query
The SQL query (with WITH clause on SQL Server for the sake of test data) is as below (obviously your table and column names might be different):
-- Test data is built inline with CTEs. String literals use single quotes:
-- double quotes delimit identifiers in standard SQL and on SQL Server.
WITH t_character (id, name)
AS (      SELECT 1, 'ch-1'
    UNION SELECT 2, 'ch-2'
    UNION SELECT 3, 'ch-3'
    UNION SELECT 4, 'ch-4'
)
, t_tag (id, name)
AS (      SELECT 1, 'tag-1'
    UNION SELECT 2, 'tag-2'
    UNION SELECT 3, 'tag-3'
)
, t_character_tag (character_id, tag_id)
AS (      SELECT 1, 1
    UNION SELECT 2, 1
    UNION SELECT 2, 2
    UNION SELECT 3, 1
    UNION SELECT 3, 2
    UNION SELECT 3, 3
    UNION SELECT 4, 3
)
-- Expected pairs: (1, 2), (1, 3), (2, 3) and (3, 4).
-- The pair (2, 3) is produced twice (tags 1 and 2 are both shared) before
-- DISTINCT collapses it.
SELECT DISTINCT -- will filter out duplicates
       c1.id, c2.id
FROM   t_character c1
INNER JOIN t_character c2
        ON c1.id < c2.id -- all pairs without duplicates
INNER JOIN t_character_tag r1
        ON r1.character_id = c1.id
INNER JOIN t_character_tag r2
        ON r2.character_id = c2.id
WHERE  r1.tag_id = r2.tag_id
ORDER BY c1.id, c2.id
The complete sample code with the query you need is below:
from sqlalchemy import create_engine, Column, Integer, String, ForeignKey, Table
from sqlalchemy.orm import relationship, scoped_session, sessionmaker, aliased
from sqlalchemy.ext.declarative import declarative_base
# Configure test database for SA
engine = create_engine("sqlite:///:memory:", echo=False)
session = scoped_session(sessionmaker(bind=engine, autoflush=False))
class Base(object):
    """Helper base class that sets attributes from keyword arguments.

    Also provides a convenient default ``__repr__``; be aware that loaded
    relationships are printed too, which might result in loading relations.
    """

    def __init__(self, **kwargs):
        """Store every keyword argument as an attribute of the same name."""
        for key, value in kwargs.items():
            setattr(self, key, value)

    def __repr__(self):
        """Return ``<ClassName(name=value, ...)>`` with names in sorted order."""
        # Entries whose key starts with one of these prefixes are left out.
        hidden = ("_sa_", "_backref_")
        shown = [k for k in sorted(self.__dict__) if not k.startswith(hidden)]
        body = ", ".join("%s=%r" % (k, self.__dict__[k]) for k in shown)
        return "<%s(%s)>" % (self.__class__.__name__, body)
Base = declarative_base(cls=Base)
t_character_tag = Table(
"t_character_tag", Base.metadata,
Column("character_id", Integer, ForeignKey("t_character.id")),
Column("tag_id", Integer, ForeignKey("t_tag.id"))
)
class Character(Base):
__tablename__ = u"t_character"
id = Column(Integer, primary_key=True)
name = Column(String)
tags = relationship("Tag", secondary=t_character_tag, backref="characters")
class Tag(Base):
__tablename__ = u"t_tag"
id = Column(Integer, primary_key=True)
name = Column(String)
# create db schema
Base.metadata.create_all(engine)
# 0. create test data
ch1 = Character(id=1, name="ch-1")
ch2 = Character(id=2, name="ch-2")
ch3 = Character(id=3, name="ch-3")
ch4 = Character(id=4, name="ch-4")
ta1 = Tag(id=1, name="tag-1")
ta2 = Tag(id=2, name="tag-2")
ta3 = Tag(id=3, name="tag-3")
ch1.tags.append(ta1)
ch2.tags.append(ta1); ch2.tags.append(ta2);
ch3.tags.append(ta1); ch3.tags.append(ta2); ch3.tags.append(ta3);
ch4.tags.append(ta3)
session.add_all((ch1, ch2, ch3, ch4,))
session.commit()
# 1. some data checks
session.expunge_all()
assert len(session.query(Character).all()) == 4
assert session.query(Tag).get(2).name == "tag-2"
assert len(session.query(Character).get(3).tags) == 3
# 2. create a final query (THE ANSWER TO THE QUESTION):
session.expunge_all()
# Two aliases per entity: characters are joined to themselves, and each side
# of the pair is joined to its own copy of the tag table.
t_c1 = aliased(Character)
t_c2 = aliased(Character)
t_t1 = aliased(Tag)
t_t2 = aliased(Tag)
q = (
    session.query(t_c1, t_c2)
    # "<" yields each unordered pair once and never pairs a row with itself
    .join(t_c2, t_c1.id < t_c2.id)
    .join(t_t1, t_c1.tags)
    .join(t_t2, t_c2.tags)
    .filter(t_t1.id == t_t2.id)  # at least one tag in common
    # if character names are unique, this filter can be dropped
    .filter(t_c1.name != t_c2.name)
    .order_by(t_c1.id, t_c2.id)
    .distinct()  # a pair sharing several tags would otherwise be repeated
)
res = q.all()
assert len(res) == 4
for _r in res:
    print(_r)

Related

Create a hybrid_property to return the value of a previous record

I've got as far as this when trying to create a hybrid_property to return the value of the previous record:
from datetime import date
from sqlalchemy import Column, Integer, Date, select, create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.ext.hybrid import hybrid_property
from sqlalchemy.orm import sessionmaker
Base = declarative_base()
class A(Base):
    """Dated record; ``prev_value`` is meant to expose the previous record's value."""

    __tablename__ = "a"
    id_ = Column(Integer, primary_key=True)
    record_date = Column(Date)
    example_value = Column(Integer)

    @hybrid_property
    def prev_value(self):
        # Instance-level access is not implemented yet: always returns None.
        return

    @prev_value.expression
    def prev_value(cls):
        # NOTE: this is the version the question asks about. Nothing ties the
        # sub-select to the row being queried, so it simply picks the value
        # of the row with the latest record_date.
        stmt = select(A.example_value)
        stmt = stmt.order_by(A.record_date.desc())
        stmt = stmt.limit(1)
        stmt = stmt.label("prev_value")
        return stmt
engine = create_engine("sqlite:///:memory:")
Base.metadata.drop_all(engine)
Base.metadata.create_all(engine)
Session = sessionmaker(bind=engine)
session = Session()
session.add(A(record_date=date(2022, 5 ,1), example_value=1))
session.add(A(record_date=date(2022, 5 ,2), example_value=2))
session.add(A(record_date=date(2022, 5 ,3), example_value=3))
session.commit()
prev_value = session.execute(select(A.prev_value).where(A.id_ == 3)).scalar()
print(prev_value)
This is currently returning None when it should return 2.
What should I put in the in-Python method and the expression variant?
SOLUTION:
@prev_value.expression
def prev_value(cls):
    """SQL expression: ``example_value`` of the row with the closest earlier ``record_date``."""
    from sqlalchemy.orm import aliased  # not among the example's imports

    # A separate alias is needed so the sub-select can compare its own rows
    # against the row of the enclosing query.
    A1 = aliased(A, name="a_prev")
    stmt = select(A1.example_value)
    stmt = stmt.filter(A1.record_date < cls.record_date)
    stmt = stmt.order_by(A1.record_date.desc())
    stmt = stmt.limit(1)  # keep only the nearest earlier record
    stmt = stmt.label("prev_value")
    return stmt
Explanation
When I run the code from your question verbatim (using sqlite), the result I get is actually 3 and not None as you indicated. In fact, the same value of 3 is returned for every row when running the query session.execute(select(A, A.prev_value)) # (1):
(<A [1] (example_value = 1, id_ = 1, record_date = datetime.date(2022, 5, 1))>, 3)
(<A [2] (example_value = 2, id_ = 2, record_date = datetime.date(2022, 5, 2))>, 3)
(<A [3] (example_value = 3, id_ = 3, record_date = datetime.date(2022, 5, 3))>, 3)
Why do I get 3 for your sample code?
I think this is because the sub-query does not have any condition linking it to the requested row. Assuming the previous value should be previous "by record_date", the link to add to the query should be:
stmt = stmt.filter(A.record_date < cls.record_date)
Running it, however, will now generate None for all the results. Let's look at the generated SQL and the reason why no rows are found:
SELECT a.id_,
a.record_date,
a.example_value,
(SELECT a.example_value
FROM a
WHERE a.record_date < a.record_date # >>> the ISSUE is here: always FALSE
ORDER BY a.record_date DESC
LIMIT 1) AS prev_value
FROM a
The problem is that the main query and the sub-query are pointing to the same table/view.
Solve the subquery:
In order to solve it, we just need to explicitly create an alias for the table used in the sub-query, and the problem is solved:
@prev_value.expression
def prev_value(cls):
    """SQL expression: ``example_value`` of the row with the closest earlier ``record_date``."""
    from sqlalchemy.orm import aliased  # not among the example's imports

    # Aliasing gives the sub-select its own name ("a_prev") for the table, so
    # the comparison below relates two different rows instead of a row to itself.
    A1 = aliased(A, name="a_prev")
    stmt = select(A1.example_value)
    stmt = stmt.filter(A1.record_date < cls.record_date)
    stmt = stmt.order_by(A1.record_date.desc())
    stmt = stmt.limit(1)  # keep only the nearest earlier record
    stmt = stmt.label("prev_value")
    return stmt
and the same query (1) produces the following result:
(<A [1] (example_value = 1, id_ = 1, record_date = datetime.date(2022, 5, 1))>, None)
(<A [2] (example_value = 2, id_ = 2, record_date = datetime.date(2022, 5, 2))>, 1)
(<A [3] (example_value = 3, id_ = 3, record_date = datetime.date(2022, 5, 3))>, 2)
based on the following generated SQL:
SELECT a.id_,
a.record_date,
a.example_value,
(SELECT a_prev.example_value
FROM a AS a_prev
WHERE a_prev.record_date < a.record_date
ORDER BY a_prev.record_date DESC
LIMIT 1) AS prev_value
FROM a

How can I write a SQLAlchemy query that will return all descendants of a node in a graph?

I am working on an application where my database objects often have multiple parents and multiple children, and would like to create a SQLAlchemy query that will return all descendants of an object.
Realizing that I am basically trying to store a graph in a SQL database, I found that setting up a self-referential many-to-many schema got me most of the way there, but I am having trouble writing the query to return all descendants of a node. I tried to follow SQLA's recursive CTE example, which looks like the right approach, but have been running into problems getting it to work. I think my situation is different from the example because in my case, queries to Node.child (and Node.parent) return instrumented lists and not ORM objects.
In any case, the code below will set up a simple directed acyclic disconnected graph that looks like this (where the direction is inferred to be from the higher row to the lower one):
a b c
\ / \ |
d e f
|\ /
g h
|
i
And what I'm looking for is some help writing a query that will give me all descendants of a node.
get_descendants(d) should return g, h, i
get_descendants(b) should return d, e, g, h, i
Example code:
from sqlalchemy.orm import aliased
from sqlalchemy import Column, ForeignKey, Integer, Table, Text
from sqlalchemy import create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import relationship
from sqlalchemy.orm import sessionmaker
engine = create_engine('sqlite:///:memory:', echo=True)
Session = sessionmaker(bind=engine)
session = Session()
Base = declarative_base()
association_table = Table('association_table', Base.metadata,
Column('parent_id', Integer, ForeignKey('node.id'), primary_key=True),
Column('child_id', Integer, ForeignKey('node.id'), primary_key=True))
class Node(Base):
__tablename__ = 'node'
id = Column(Integer, primary_key=True)
property_1 = Column(Text)
property_2 = Column(Integer)
# http://docs.sqlalchemy.org/en/latest/orm/join_conditions.html#self-referential-many-to-many-relationship
child = relationship('Node',
secondary=association_table,
primaryjoin=id==association_table.c.parent_id,
secondaryjoin=id==association_table.c.child_id,
backref='parent'
)
Base.metadata.create_all(engine)
a = Node(property_1='a', property_2=1)
b = Node(property_1='b', property_2=2)
c = Node(property_1='c', property_2=3)
d = Node(property_1='d', property_2=4)
e = Node(property_1='e', property_2=5)
f = Node(property_1='f', property_2=6)
g = Node(property_1='g', property_2=7)
h = Node(property_1='h', property_2=8)
i = Node(property_1='i', property_2=9)
session.add_all([a, b, c, d, e, f, g, h, i])
a.child.append(d)
b.child.append(d)
d.child.append(g)
d.child.append(h)
g.child.append(i)
b.child.append(e)
e.child.append(h)
c.child.append(f)
session.commit()
session.close()
Solution
The following, surprisingly simple, self-referential many-to-many recursive CTE query will return the desired results for finding all descendants of b:
# Anchor member: the starting node itself.
anchor = session.query(Node).filter(Node.id == b.id)
descendants = anchor.cte(name="descendants", recursive=True)

# Recursive member: every node that has an already-collected row as a parent.
nodealias = aliased(Node)
children = session.query(nodealias).join(descendants, nodealias.parent)
descendants = descendants.union(children)
Testing with
for item in session.query(descendants):
print(item.property_1, item.property_2)
Yields:
b 2
d 4
e 5
g 7
h 8
i 9
Which is the correct list of b and all of its descendants.
Full working example code
This example adds a convenient function to the Node class for returning all descendants of an object, while also computing the path from itself to all of its descendants:
from sqlalchemy.orm import aliased
from sqlalchemy import Column, ForeignKey, Integer, Table, Text
from sqlalchemy import create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import relationship
from sqlalchemy.orm import sessionmaker
engine = create_engine('sqlite://', echo=True)
Session = sessionmaker(bind=engine)
session = Session()
Base = declarative_base()
association_table = Table('association_table', Base.metadata,
Column('parent_id', Integer, ForeignKey('node.id'), primary_key=True),
Column('child_id', Integer, ForeignKey('node.id'), primary_key=True))
class Node(Base):
    """Graph node; edges are rows of ``association_table`` (parent -> child)."""

    __tablename__ = 'node'
    id = Column(Integer, primary_key=True)
    property_1 = Column(Text)
    property_2 = Column(Integer)
    # http://docs.sqlalchemy.org/en/latest/orm/join_conditions.html#self-referential-many-to-many-relationship
    child = relationship(
        'Node',
        secondary=association_table,
        primaryjoin=id == association_table.c.parent_id,
        secondaryjoin=id == association_table.c.child_id,
        backref='parent',
    )

    def descendant_nodes(self):
        """Return ``(property_1, path)`` rows for every descendant of this node.

        ``path`` is the '/'-joined chain of ``property_1`` values from this
        node down to the descendant; a descendant reachable along several
        routes appears once per route. Runs on the module-level ``session``.
        """
        nodealias = aliased(Node)
        # Anchor member: direct children, with the path seeded from self.
        descendants = (
            session.query(
                Node.id,
                Node.property_1,
                (self.property_1 + '/' + Node.property_1).label('path'),
            )
            .filter(Node.parent.contains(self))
            .cte(recursive=True)
        )
        # Recursive member: children of the rows collected so far, each
        # extending the path of the row it was reached from.
        descendants = descendants.union(
            session.query(
                nodealias.id,
                nodealias.property_1,
                (descendants.c.path + '/' + nodealias.property_1).label('path'),
            ).join(descendants, nodealias.parent)
        )
        return session.query(descendants.c.property_1, descendants.c.path).all()
Base.metadata.create_all(engine)
a = Node(property_1='a', property_2=1)
b = Node(property_1='b', property_2=2)
c = Node(property_1='c', property_2=3)
d = Node(property_1='d', property_2=4)
e = Node(property_1='e', property_2=5)
f = Node(property_1='f', property_2=6)
g = Node(property_1='g', property_2=7)
h = Node(property_1='h', property_2=8)
i = Node(property_1='i', property_2=9)
session.add_all([a, b, c, d, e, f, g, h, i])
a.child.append(d)
b.child.append(d)
d.child.append(g)
d.child.append(h)
g.child.append(i)
b.child.append(e)
e.child.append(h)
c.child.append(f)
e.child.append(i)
session.commit()
for item in b.descendant_nodes():
print(item)
session.close()
"""
Graph should be setup like this:
a b c
\ / \ |
d e f
|\ /|
g h |
+---+
i
"""
Output:
('d', 'b/d')
('e', 'b/e')
('g', 'b/d/g')
('h', 'b/d/h')
('h', 'b/e/h')
('i', 'b/e/i')
('i', 'b/d/g/i')
Comments
Reviewing the SQL Alchemy documentation for self-referential queries was helpful
The problem with my first few attempts was that I was trying to use SQL Alchemy common relationship operators such as any(), contains(), and has() instead of a self-referential join operation
I also found a helpful SO entry for doing what I wanted in raw SQL.

Multiple joins in a SELECT

I have three tables. These are joined by ForeignKey constraints so sqlalchemy knows how to join them.
I want to select the columns from all three tables:
select([a.c.x, b.c.x, c.c.x], a.c.a.between(10,20), [join(a, c), join(a, b)])
This generates the broken SQL:
SELECT a.x, b.x, c.x
FROM
a JOIN b ON a.b_id == b.id,
a JOIN c ON a.c_id == c.id
WHERE
a.a BETWEEN 10 AND 20;
As can be seen, the table a is in the FROM clause twice!
How can you join three tables in a select() statement using sqlalchemy?
The short answer is
select([a.c.x, b.c.x, c.c.x]).\
select_from(a.join(b).join(c)).\
where(between(a.c.a, 5, 15))
And if someone wants to try it out, here's the whole thing.
import sqlalchemy
from sqlalchemy import Table, Column, Integer, String, Sequence,\
ForeignKey, select, between
meta = sqlalchemy.MetaData()
url = 'sqlite:///:memory:'
engine = sqlalchemy.create_engine(url)
a = Table(
'a', meta,
Column('id', Integer, Sequence('a_id_seq'), primary_key=True),
Column('age', Integer),
Column('name', String(20))
)
b = Table(
'b', meta,
Column('a_id', Integer, ForeignKey("a.id")),
Column('value', String(20))
)
c = Table(
'c', meta,
Column('a_id', Integer, ForeignKey("a.id")),
Column('title', String(20))
)
# Create tables
meta.create_all(engine)
# Fill with dummy data
def add_data(age, name, value, title):
    """Insert one row into ``a`` plus one dependent row each into ``b`` and ``c``.

    The primary key generated for the new ``a`` row is used as ``a_id`` of
    the two dependent rows.
    """
    res = engine.execute(a.insert().values({a.c.age: age, a.c.name: name}))
    a_id = res.inserted_primary_key[0]
    engine.execute(b.insert().values({b.c.a_id: a_id, b.c.value: value}))
    engine.execute(c.insert().values({c.c.a_id: a_id, c.c.title: title}))
add_data(12, 'Foo', 'Bar', 'Baz')
add_data(17, '111', '222', '333')
q = select([a.c.name, b.c.value, c.c.title]).\
select_from(a.join(b).join(c)).\
where(between(a.c.age, 5, 15))
print(str(q))
# SELECT a.name, b.value, c.title
# FROM a JOIN b ON a.id = b.a_id JOIN c ON a.id = c.a_id
# WHERE a.age BETWEEN :age_1 AND :age_2
res = engine.execute(q)
for row in res.fetchall():
print(row)
# ('Foo', 'Bar', 'Baz')
Updated answer, thx for the comment Will!
give the below a go.
SELECT a.x, b.x, c.x
FROM *TABLENAME* a
JOIN *TABLENAME* b
ON a.id = b.id
JOIN *TABLENAME* c
ON a.id = c.id
WHERE
a.a BETWEEN 10 AND 20

How can I write an SQLAlchemy Query with a Join and an Aggregate?

I have a table that has 3 columns: type, content and time (an integer). For each 'type', I want to select the entry with the greatest (most recent) 'time' integer and the corresponding data. How can I do this using SQLAlchemy and Python? I could do this using SQL by performing:
select
c.type,
c.time,
b.data
from
parts as b
inner join
(select
a.type,
max(a.time) as time
from parts as a
group by a.type) as c
on
b.type = c.type and
b.time = c.time
But how can I accomplish this in SQLAlchemy?
The table mapping:
class Structure(Base):
__tablename__ = 'structure'
id = Column(Integer, primary_key=True)
type = Column(Text)
content = Column(Text)
time = Column(Integer)
def __init__(self, type, content):
self.type = type
self.content = content
self.time = time.time()
def serialise(self):
return {"type" : self.type,
"content" : self.content};
The attempted query:
max = func.max(Structure.time).alias("time")
c = DBSession.query(max)\
.add_columns(Structure.type, Structure.time)\
.group_by(Structure.type)\
.subquery()
c.alias("c")
b = DBSession.query(Structure.content)\
.add_columns(c.c.type, c.c.time)\
.join(c, Structure.type == c.c.type)
Gives me:
sqlalchemy.exc.OperationalError: (OperationalError) near "(": syntax
error u'SELECT structure.content AS structure_content, anon_1.type AS
anon_1_type, anon_1.t ime AS anon_1_time \nFROM structure JOIN (SELECT
time.max_1 AS max_1, structure.type AS type, structure.time AS time
\nFROM max(structure.time) AS time, structu re GROUP BY
structure.type) AS anon_1 ON structure.type = anon_1.type' ()
I'm essentially stabbing in the dark, so any help would be appreciated.
Try the code below using sub-query:
# Sub-query: the greatest "time" for each type.
subq = (
    session.query(
        Structure.type,
        func.max(Structure.time).label("max_time"),
    )
    .group_by(Structure.type)
    .subquery()
)
# Join back to the table to fetch the full row that carries that maximum.
qry = session.query(Structure).join(
    subq,
    and_(Structure.type == subq.c.type, Structure.time == subq.c.max_time),
)
print(qry)
producing SQL:
SELECT structure.id AS structure_id, structure.type AS structure_type, structure.content AS structure_content, structure.time AS structure_time
FROM structure
JOIN (SELECT structure.type AS type, max(structure.time) AS max_time
FROM structure GROUP BY structure.type) AS anon_1
ON structure.type = anon_1.type
AND structure.time = anon_1.max_time

How to get rows which match a list of 3-tuples conditions with SQLAlchemy

Having a list of 3-tuples :
[(a, b, c), (d, e, f)]
I want to retrieve all the rows from a table where 3 columns match the tuples. For this example, the query WHERE clause could be something like this:
(column_X = a AND column_Y = b AND column_Z = c)
OR (column_X = d AND column_Y = e AND column_Z = f)
How can I create such a query using SQLAlchemy? In my case the list of 3-tuples will contain hundreds of elements, and I'm looking for the most scalable solution.
Thanks for your help,
Easiest way would be using SQLAlchemy-provided tuple_ function:
from sqlalchemy import tuple_
session.query(Foo).filter(tuple_(Foo.a, Foo.b, Foo.c).in_(items))
This works with PostgreSQL, but breaks with SQLite. Not sure about other database engines.
Fortunately there's a workaround that should work on all databases.
Start by mapping out all the items with the and_ expression:
# c1, c2 and c3 stand for the three columns being matched; and_() takes
# comparison expressions as positional arguments, not keyword arguments.
conditions = (and_(c1 == x, c2 == y, c3 == z) for (x, y, z) in items)
And then create an or_ filter that encloses all the conditions:
q.filter(or_(*conditions))
Here's a simple example:
#!/usr/bin/env python
from sqlalchemy import create_engine
from sqlalchemy import Column, Integer
from sqlalchemy.sql import and_, or_
from sqlalchemy.orm import sessionmaker
from sqlalchemy.ext.declarative import declarative_base
engine = create_engine('sqlite:///')
session = sessionmaker(bind=engine)()
Base = declarative_base()
class Foo(Base):
__tablename__ = 'foo'
id = Column(Integer, primary_key=True)
a = Column(Integer)
b = Column(Integer)
c = Column(Integer)
def __init__(self, a, b, c):
self.a = a
self.b = b
self.c = c
def __repr__(self):
return '(%d %d %d)' % (self.a, self.b, self.c)
Base.metadata.create_all(engine)
session.add_all([Foo(1, 2, 3), Foo(3, 2, 1), Foo(3, 3, 3), Foo(1, 3, 4)])
session.commit()
items = ((1, 2, 3), (3, 3, 3))
# One AND-group per tuple; OR-ing the groups matches a row equal to any tuple.
conditions = (and_(Foo.a == x, Foo.b == y, Foo.c == z) for (x, y, z) in items)
q = session.query(Foo)
print(q.all())  # every row, before filtering
q = q.filter(or_(*conditions))
print(q)  # the generated SQL
print(q.all())  # only the rows matching one of the tuples
Which outputs:
$ python test.py
[(1 2 3), (3 2 1), (3 3 3), (1 3 4)]
SELECT foo.id AS foo_id, foo.a AS foo_a, foo.b AS foo_b, foo.c AS foo_c
FROM foo
WHERE foo.a = :a_1 AND foo.b = :b_1 AND foo.c = :c_1 OR foo.a = :a_2 AND foo.b = :b_2 AND foo.c = :c_2
[(1 2 3), (3 3 3)]
A less conventional approach that I suspect would scale well would be to create a temporary table of all your tuples and then join on that:
import sqlalchemy
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import Column, Integer, Table
from sqlalchemy.orm import sessionmaker
Base = declarative_base()
engine = sqlalchemy.create_engine('sqlite:///:memory:')
Session = sessionmaker(bind=engine)
session = Session()
class Triple(Base):
__tablename__ = 'triple'
id = Column(Integer(), primary_key=True)
x = Column(Integer())
y = Column(Integer())
z = Column(Integer())
ws_table = Table('where_sets', Base.metadata,
Column('x', Integer()),
Column('y', Integer()),
Column('z', Integer()),
prefixes = ['temporary']
)
Base.metadata.create_all(engine)
...
where_sets = [(1, 2, 3), (3, 2, 1), (1, 1, 1)]
ws_table.create(engine, checkfirst=True)
session.execute(ws_table.insert(), [dict(zip('xyz', s)) for s in where_sets])
matches = session.query(Triple).join(ws_table, (Triple.x==ws_table.c.x) & (Triple.y==ws_table.c.y) & (Triple.z==ws_table.c.z)).all()
which executes SQL like this:
INSERT INTO triple (x, y, z) VALUES (?, ?, ?)
(1, 2, 3)
INSERT INTO triple (x, y, z) VALUES (?, ?, ?)
(3, 1, 2)
INSERT INTO triple (x, y, z) VALUES (?, ?, ?)
(1, 1, 1)
SELECT triple.id AS triple_id, triple.x AS triple_x, triple.y AS triple_y, triple.z AS triple_z
FROM triple JOIN where_sets ON triple.x = where_sets.x AND triple.y = where_sets.y AND triple.z = where_sets.z
Would anyone consider creating an extra key in the original table ?
i.e. create a new column with "1"-"2"-"3" instead of another table and check for the uniqueness.

Categories