Query with join doesn't select related object - python

I am trying to join the table STUDENT to STUDY_PROGRAM; STUDENT to STUDY_PROGRAM is a one-to-many relationship. A query on a simple natural join didn't give the expected result. Debugging shows that the query result doesn't have a 'program' attribute.
(Pdb) print mystudents[0].program
*** AttributeError: 'Student' object has no attribute 'program'
def students():
    mystudentinfo = mydb.session.query(Student).join(StudyProgram)
    return render_template('administration/students.html', studentinfo = mystudentinfo)
class Student(mydb.Model):
    __tablename__ = 'STUDENT'
    study_no = mydb.Column(mydb.String(20), primary_key = True)
    std_first_name = mydb.Column(mydb.String(64))
    std_last_name = mydb.Column(mydb.String(64))
    std_birthdate = mydb.Column(mydb.Date())
    std_email = mydb.Column(mydb.String(62))
    std_password = mydb.Column(mydb.String())
    study_programs = mydb.relationship('StudyProgram', backref='student')
    project_apps = mydb.relationship('ProjectApp', backref='student')

class StudyProgram(mydb.Model):
    __tablename__ = 'STUDY_PROGRAM'
    study_no = mydb.Column(mydb.String(20), mydb.ForeignKey('STUDENT.study_no'), primary_key = True)
    program = mydb.Column(mydb.String(100), primary_key = True)
    degree_type = mydb.Column(mydb.String(8), primary_key = True)
    reg_date = mydb.Column(mydb.Date())
    status = mydb.Column(mydb.String(20))
    earned_ECTs = mydb.Column(mydb.Numeric(4, 1))
    reg_ECTs = mydb.Column(mydb.Numeric(3, 1))
    tot_ECTs = mydb.Column(mydb.Numeric(4, 1))
    graduation_date = mydb.Column(mydb.Date())

The query doesn't select anything from StudyProgram because SQLAlchemy treats joins separately from what gets selected: join() only affects the FROM clause, while the entities returned are those named in the query() call.
The loading strategy for the relationship can be changed using the options() call on the query. Since you are not doing any filtering on StudyProgram, you can omit the join and set the joinedload option instead.
students = mydb.session.query(Student).options(mydb.joinedload('study_programs'))
Now the study_programs relationship will be loaded during the main query, rather than as a separate query. If you do need to join for filtering, you can use the contains_eager option instead.
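For instance, a minimal contains_eager sketch, assuming the same models as above (the status filter value is hypothetical, just to give the join something to filter on):
from sqlalchemy.orm import contains_eager

mystudents = (mydb.session.query(Student)
              .join(Student.study_programs)
              .filter(StudyProgram.status == 'enrolled')  # hypothetical filter
              .options(contains_eager(Student.study_programs))
              .all())
Here the explicit join is reused to populate the relationship, so no extra queries are issued when study_programs is accessed later.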
To access the programs for each student, use the relationship. For example:
for s in students:
    print(s.std_first_name)
    for p in s.study_programs:
        print(p.program)
    print()
If you used joinedload this will not issue any queries except the first one to get the students. If you did not, the default behavior of a relationship is to issue a SELECT when it is accessed, so you will incur one query per student.
The reason for your specific error is that you named the relationship attribute study_programs, not program.

Related

sqlalchemy join to a table via two foreign keys to that same table (ambiguous column error)

I am trying to do a join on a table that has two foreign keys to the same table. Namely, sourceToOutputRelation points twice to Entries, as shown in the code. Also, Entries have tags. I am trying to do a join so that I get every sourceToOutputRelation that has all the given tags (via Entries). I am just trying to understand the join (the filtering works, I think). Here is the code I have for the join and filter:
'''
tags is a list of strings that are supposed to match the Tags.tag strings
'''
from sqlalchemy.orm import aliased

q = SourceToOutputRelation.query.\
    join(Entries.source_entries, Entries.output_entries).\
    join(original_tag_registration).\
    join(Tags).\
    filter(Tags.tag == tags[0])
print(q.all())
Here are my model definitions:
class SourceToOutputRelation(alchemyDB.Model):
    __tablename__ = 'sourceToOutputRel'
    id = alchemyDB.Column(alchemyDB.Integer, primary_key = True)
    source_article = alchemyDB.Column(alchemyDB.Integer, alchemyDB.ForeignKey('entries.id'))
    output_article = alchemyDB.Column(alchemyDB.Integer, alchemyDB.ForeignKey('entries.id'))

class Entries(alchemyDB.Model):
    __tablename__ = 'entries'
    id = alchemyDB.Column(alchemyDB.Integer, primary_key = True)
    tags = alchemyDB.relationship('Tags',
                                  secondary = original_tag_registration,
                                  backref = alchemyDB.backref('relevant_entries', lazy = 'dynamic'),
                                  lazy = 'dynamic')
    source_entries = alchemyDB.relationship('SourceToOutputRelation',
                                            primaryjoin = "SourceToOutputRelation.output_article==Entries.id",
                                            foreign_keys = [SourceToOutputRelation.output_article],
                                            backref = alchemyDB.backref('output', lazy = 'joined'),
                                            lazy = 'dynamic',
                                            cascade = 'all, delete-orphan')
    output_entries = alchemyDB.relationship('SourceToOutputRelation',
                                            primaryjoin = "SourceToOutputRelation.source_article==Entries.id",
                                            foreign_keys = [SourceToOutputRelation.source_article],
                                            backref = alchemyDB.backref('source', lazy = 'joined'),
                                            lazy = 'dynamic',
                                            cascade = 'all, delete-orphan')

original_tag_registration = alchemyDB.Table('original_tag_registration',
    alchemyDB.Column('tag_id', alchemyDB.Integer, alchemyDB.ForeignKey('tagTable.id')),
    alchemyDB.Column('entry_id', alchemyDB.Integer, alchemyDB.ForeignKey('entries.id'))
)

class Tags(alchemyDB.Model):
    '''
    a table to hold unique tags
    '''
    __tablename__ = 'tagTable'
    id = alchemyDB.Column(alchemyDB.Integer, primary_key = True)
    tag = alchemyDB.Column(alchemyDB.String(64), unique = True)
    entries_with_this_tag = alchemyDB.relationship('Entries',
                                                   secondary = original_tag_registration,
                                                   backref = alchemyDB.backref('tag', lazy = 'dynamic'),
                                                   lazy = 'dynamic')
I get this error :
OperationalError: (OperationalError) ambiguous column name:
sourceToOutputRel.id u'SELECT "sourceToOutputRel".id AS
"sourceToOutputRel_id", "sourceToOutputRel".source_article AS
"sourceToOutputRel_source_article", "sourceToOutputRel".output_article
AS "sourceToOutputRel_output_article",
"sourceToOutputRel".needs_processing AS
"sourceToOutputRel_needs_processing",
"sourceToOutputRel".number_of_votes AS
"sourceToOutputRel_number_of_votes", "sourceToOutputRel".date_related
AS "sourceToOutputRel_date_related",
"sourceToOutputRel".confirmed_relationship_type AS
"sourceToOutputRel_confirmed_relationship_type", entries_1.id AS
entries_1_id, entries_1.title AS entries_1_title, entries_1.text AS
entries_1_text, entries_1.body_html AS entries_1_body_html,
entries_1.user_id AS entries_1_user_id, entries_1.date_posted AS
entries_1_date_posted, entries_2.id AS entries_2_id, entries_2.title
AS entries_2_title, entries_2.text AS entries_2_text,
entries_2.body_html AS entries_2_body_html, entries_2.user_id AS
entries_2_user_id, entries_2.date_posted AS entries_2_date_posted
\nFROM entries JOIN "sourceToOutputRel" ON
"sourceToOutputRel".output_article = entries.id JOIN
"sourceToOutputRel" ON "sourceToOutputRel".source_article = entries.id
JOIN original_tag_registration ON entries.id =
original_tag_registration.entry_id JOIN "tagTable" ON "tagTable".id =
original_tag_registration.tag_id LEFT OUTER JOIN entries AS entries_1
ON "sourceToOutputRel".output_article = entries_1.id LEFT OUTER JOIN
entries AS entries_2 ON "sourceToOutputRel".source_article =
entries_2.id \nWHERE "tagTable".tag = ?' (u'brods',)
Look at the docs, in the paragraph "Joins to a Target with an ON Clause":
a_alias = aliased(Address)

q = session.query(User).\
    join(User.addresses).\
    join(a_alias, User.addresses).\
    filter(Address.email_address=='ed@foo.com').\
    filter(a_alias.email_address=='ed@bar.com')
There are multiple joins on one table, so the second join needs an alias. You already import the aliased function. Try this code:
'''
tags is a list of strings that are supposed to match the Tags.tag strings
'''
from sqlalchemy.orm import aliased

entry_alias = aliased(Entries)

q = SourceToOutputRelation.query.\
    join(Entries.source_entries).\
    join(entry_alias, Entries.output_entries).\
    join(original_tag_registration).\
    join(Tags).\
    filter(Tags.tag == tags[0])
print(q.all())

SQLAlchemy: How to keep the record ID and its relationship record in sync for new records (pre-commit)?

When creating new records, I'd expect that foreign key fields and their relationship object would stay in sync (if I change one, the other would change to reflect it), but this doesn't seem to be the case. Is this possible to do?
Given the following:
Base = declarative_base()

class User(Base):
    __tablename__ = 'user'
    id = Column(Integer, primary_key=True)
    name = Column(String)
    fullname = Column(String)
    password = Column(String)
    equipment = relationship('Equipment', backref='user')

class Equipment(Base):
    __tablename__ = 'equipment'
    id = Column(Integer, primary_key=True)
    user_id = Column(Integer, ForeignKey('user.id'), nullable=False)
    name = Column(String)

engine = create_engine('sqlite:///:memory:', echo=True)
Base.metadata.create_all(engine)
session = sessionmaker(bind=engine)
conn = session()

conn.add_all([
    User(name='bill', fullname='Bill W.', password='rlrrlrll'),      # id=1
    User(name='tony', fullname='Tony I.', password='EADGBe'),        # id=2
    User(name='ozzy', fullname='Ozzy O.', password='durrrr'),        # id=3
    User(name='geezer', fullname='Terence B.', password='password'), # id=4
])
I can create related records in either of the two ways:
guitar = Equipment(
    user = conn.query(User).filter(User.name == 'tony').one(),
    name = 'Gibson SG')
drums = Equipment(
    user_id = 1,
    name = 'Ludwigs')
Following these lines I'd expect guitar.user_id to be 2, and drums.user to be the 'bill' object, but in both cases they're None. After I conn.add()/conn.commit() then it starts working a little more like I'd expect (both complementary fields return non-None values).
Is there any way for this to work pre-commit? I'd like to be able to construct new records either way (by ID or by object), and in library functions be able to reliably access the ID or object.
You can do this by flushing:
conn.add(guitar)
conn.add(drums)
conn.flush()
Flushing emits the INSERT queries but does not COMMIT, meaning you can ROLLBACK later if you need to.
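A minimal sketch of what that looks like with the objects from the question (the printed values assume the ids noted in the add_all comments above):
conn.add_all([guitar, drums])
conn.flush()            # INSERTs are emitted and keys populated, but nothing is committed

print(guitar.user_id)   # 2, filled in from the related 'tony' object during flush
print(drums.user.name)  # 'bill', lazy-loaded from user_id=1 now that the row is persistent

conn.rollback()         # still possible, since no COMMIT has happened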

Sqlalchemy ID field isn't populated when relationship with another table is set up

I'm trying to set up Sqlalchemy and am running into problems with setting up relationships between tables. Most likely it's misunderstanding on my part.
A table is set up like so. The important line is the one with two asterisks on either side, setting up the relationship to table "jobs."
class Clocktime(Base):
    """Table for clockin/clockout values

    ForeignKeys exist for Job and Employee
    many to one -> employee
    many to one -> job
    """
    __tablename__ = "clocktimes"
    id = Column(Integer, primary_key=True)
    time_in = Column(DateTime)
    time_out = Column(DateTime)
    employee_id = Column(Integer, ForeignKey('employees.id'))
    **job_id = Column(Integer, ForeignKey('jobs.id'))**
    # employee = many to one relationship with Employee
    # job = many to one relationship with Job

    @property
    def timeworked(self):
        return self.time_out - self.time_in

    def __str__(self):
        formatter = "Employee: {employee.name}, "\
                    "Job: {job.abbr}, "\
                    "Start: {self.time_in}, "\
                    "End: {self.time_out}, "\
                    "Hours Worked: {self.timeworked}, "\
                    "ID# {self.id}"
        return formatter.format(employee=self.employee, job=self.job, self=self)
Now, the jobs table follows. Check the asterisked line:
class Job(Base):
    """Table for jobs

    one to many -> clocktimes
    note that rate is cents/hr"""
    __tablename__ = "jobs"
    id = Column(Integer, primary_key=True)
    name = Column(String(50))
    abbr = Column(String(16))
    rate = Column(Integer)  # cents/hr
    **clocktimes = relationship('Clocktime', backref='job', order_by=id)**

    def __str__(self):
        formatter = "Name: {name:<50} {abbr:>23}\n" \
                    "Rate: ${rate:<7.2f}/hr {id:>62}"
        return formatter.format(name=self.name,
                                abbr="Abbr: " + str(self.abbr),
                                rate=self.rate/100.0,
                                id="ID# " + str(self.id))
When a user starts a new task, the following code is executed in order to write the relevant data to tables jobs and clocktimes:
new_task_job = [Job(abbr=abbrev, name=project_name, rate=p_rate),
                Clocktime(time_in=datetime.datetime.now())]
for i in new_task_job:
    session.add(i)
session.commit()
start_time = datetime.datetime.now()
status = 1
Then, when the user takes a break...
new_break = Clocktime(time_out=datetime.datetime.now())
session.add(new_break)
session.commit()
If you look in the screenshot, the job_id field isn't being populated. Shouldn't it be populated with the primary key (id) from the jobs table, per
job_id = Column(Integer, ForeignKey('jobs.id'))
or am I missing something? I'm assuming that I'm to write code to do that, but I don't want to break anything that Sqlalchemy is trying to do in the backend. This should be a one job to many clocktimes, since a person can spend several days per task.
Checking out the docs, it looks like you've set up a collection of ClockTime objects on Job called clocktimes, and a .job attribute on ClockTime that will refer to the parent Job object.
The expected behaviour is:
c1 = ClockTime()
j1 = Job()

>>> j1.clocktimes
[]
>>> print c1.job
None

When you populate j1.clocktimes with an object, you should also see c1.job get a non-None value.
j1.clocktimes.append(c1)

>>> j1.clocktimes
[an instance of `ClockTime`]
>>> c1.job
an instance of `Job`
Do you find that behaviour? I don't see in your code where you populate clocktimes, so the population of job is never triggered.
I think you are expecting the addition of ForeignKey to the column definition to do something it doesn't do. The ForeignKey constraint you put on job_id simply means that its values are constrained to be among the values in the id column of the jobs table. Check here for more details.
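A hedged sketch of the fix, reusing the names from the question (abbrev, project_name, p_rate): create the Clocktime through the relationship, so SQLAlchemy fills in job_id on flush.
new_job = Job(abbr=abbrev, name=project_name, rate=p_rate)
clock_in = Clocktime(time_in=datetime.datetime.now(), job=new_job)  # set via the 'job' backref
# equivalently: new_job.clocktimes.append(clock_in)

session.add(new_job)   # clock_in is saved too, through the relationship cascade
session.commit()       # clock_in.job_id is now populated from new_job.id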

sqlalchemy: union query few columns from multiple tables with condition

I'm trying to adapt part of a MySQLdb application to SQLAlchemy using the declarative base. I'm only beginning with SQLAlchemy.
The legacy tables are defined something like:
student: id_number*, semester*, stateid, condition, ...
choice: id_number*, semester*, choice_id, school, program, ...
We have 3 tables for each of them (student_tmp, student_year, student_summer, choice_tmp, choice_year, choice_summer), so each pair (_tmp, _year, _summer) contains information for a specific moment.
select *
from `student_tmp`
inner join `choice_tmp` using (`id_number`, `semester`)
My problem is that the information important to me is the equivalent of the following select:
SELECT t.*
FROM (
    (
        SELECT st.*, ct.*
        FROM `student_tmp` AS st
        INNER JOIN `choice_tmp` AS ct USING (`id_number`, `semester`)
        WHERE (ct.`choice_id` = IF(RIGHT(ct.`semester`, 1)='1', '3', '4'))
          AND (st.`condition` = 'A')
    ) UNION (
        SELECT sy.*, cy.*
        FROM `student_year` AS sy
        INNER JOIN `choice_year` AS cy USING (`id_number`, `semester`)
        WHERE (cy.`choice_id` = 4)
          AND (sy.`condition` = 'A')
    ) UNION (
        SELECT ss.*, cs.*
        FROM `student_summer` AS ss
        INNER JOIN `choice_summer` AS cs USING (`id_number`, `semester`)
        WHERE (cs.`choice_id` = 3)
          AND (ss.`condition` = 'A')
    )
) AS t
* is used to shorten the select; I'm actually only querying about 7 of the 50 available columns.
This information is used in many flavors: "Do I have new students? Do I still have all students from a given date? Which students subscribed after the given date?" etc. The result of this select statement is to be saved in another database.
Would it be possible for me to achieve this with a single view-like class? The information is read-only, so I don't need to be able to modify/create/delete. Or do I have to declare a class for each table (ending up with 6 classes) and remember to filter every time I query?
Thanks for pointers.
EDIT: I don't have modification access to the database (I cannot create a view). Both databases may not be on the same server (so I cannot create a view on my second DB).
My concern is to avoid the full table scan before filtering on condition and choice_id.
EDIT 2: I've set up declarative classes like this:
class BaseStudent(object):
id_number = sqlalchemy.Column(sqlalchemy.String(7), primary_key=True)
semester = sqlalchemy.Column(sqlalchemy.String(5), primary_key=True)
unique_id_number = sqlalchemy.Column(sqlalchemy.String(7))
stateid = sqlalchemy.Column(sqlalchemy.String(12))
condition = sqlalchemy.Column(sqlalchemy.String(3))
class Student(BaseStudent, Base):
__tablename__ = 'student'
choices = orm.relationship('Choice', backref='student')
#class StudentYear(BaseStudent, Base):...
#class StudentSummer(BaseStudent, Base):...
class BaseChoice(object):
id_number = sqlalchemy.Column(sqlalchemy.String(7), primary_key=True)
semester = sqlalchemy.Column(sqlalchemy.String(5), primary_key=True)
choice_id = sqlalchemy.Column(sqlalchemy.String(1))
school = sqlalchemy.Column(sqlalchemy.String(2))
program = sqlalchemy.Column(sqlalchemy.String(5))
class Choice(BaseChoice, Base):
__tablename__ = 'choice'
__table_args__ = (
sqlalchemy.ForeignKeyConstraint(['id_number', 'semester',],
[Student.id_number, Student.semester,]),
)
#class ChoiceYear(BaseChoice, Base): ...
#class ChoiceSummer(BaseChoice, Base): ...
Now, the query that gives me correct SQL for one set of tables is:
q = session.query(StudentYear, ChoiceYear) \
    .select_from(StudentYear) \
    .join(ChoiceYear) \
    .filter(StudentYear.condition == 'A') \
    .filter(ChoiceYear.choice_id == '4')
but it throws an exception...
"Could not locate column in row for column '%s'" % key)
sqlalchemy.exc.NoSuchColumnError: "Could not locate column in row for column '*'"
How do I use that query to create myself a class I can use?
If you can create this view on the database, then you simply map the view as if it was a table. See Reflecting Views.
# DB VIEW
CREATE VIEW my_view AS -- #todo: your select statements here

# SA
my_view = Table('my_view', metadata, autoload=True)

# define view object
class ViewObject(object):
    def __repr__(self):
        return "ViewObject %s" % str((self.id_number, self.semester,))

# map the view to the object
view_mapper = mapper(ViewObject, my_view)

# query the view
q = session.query(ViewObject)
for _ in q:
    print _
If you cannot create a VIEW on the database level, you could create a selectable and map the ViewObject to it. The code below should give you the idea:
student_tmp = Table('student_tmp', metadata, autoload=True)
choice_tmp = Table('choice_tmp', metadata, autoload=True)

# your SELECT part with the columns you need
qry = select([student_tmp.c.id_number, student_tmp.c.semester,
              student_tmp.c.stateid, choice_tmp.c.school])

# your INNER JOIN condition
qry = qry.where(student_tmp.c.id_number == choice_tmp.c.id_number).where(student_tmp.c.semester == choice_tmp.c.semester)

# other WHERE clauses
qry = qry.where(student_tmp.c.condition == 'A')
You can create 3 queries like this, then combine them with union_all and use the resulting query in the mapper:
view_mapper = mapper(ViewObject, my_combined_qry)
In both cases you have to ensure that a primary key is properly defined on the view, and you might need to override the autoloaded view and specify the primary key explicitly (see the link above). Otherwise you will either receive an error or might not get proper results from the query.
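A sketch of that combination, assuming qry_tmp, qry_year and qry_summer are the three selectables built as above (the alias name and the primary-key choice are assumptions):
from sqlalchemy import union_all
from sqlalchemy.orm import mapper

my_combined_qry = union_all(qry_tmp, qry_year, qry_summer).alias('students_union')

view_mapper = mapper(ViewObject, my_combined_qry,
                     primary_key=[my_combined_qry.c.id_number,
                                  my_combined_qry.c.semester])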
Answer to EDIT-2:
qry = (session.query(StudentYear, ChoiceYear).
       select_from(StudentYear).
       join(ChoiceYear).
       filter(StudentYear.condition == 'A').
       filter(ChoiceYear.choice_id == '4')
       )
The result will be tuple pairs: (Student, Choice).
But if you want to create a new mapped class for the query, then you can create a selectable as the sample above:
student_tmp = StudentTmp.__table__
choice_tmp = ChoiceTmp.__table__
.... (see sample code above)
This is to show what I ended up doing; any comments welcome.
class JoinedYear(Base):
    __table__ = sqlalchemy.select(
        [
            StudentYear.id_number,
            StudentYear.semester,
            StudentYear.stateid,
            ChoiceYear.school,
            ChoiceYear.program,
        ],
        from_obj=StudentYear.__table__.join(ChoiceYear.__table__),
    ) \
        .where(StudentYear.condition == 'A') \
        .where(ChoiceYear.choice_id == '4') \
        .alias('YearView')
and I will elaborate from there...
Thanks @van

How to do an upsert with SqlAlchemy?

I have a record that I want to exist in the database if it is not there, and if it is there already (primary key exists) I want the fields to be updated to the current state. This is often called an upsert.
The following incomplete code snippet demonstrates what will work, but it seems excessively clunky (especially if there were a lot more columns). What is the better/best way?
Base = declarative_base()
class Template(Base):
__tablename__ = 'templates'
id = Column(Integer, primary_key = True)
name = Column(String(80), unique = True, index = True)
template = Column(String(80), unique = True)
description = Column(String(200))
def __init__(self, Name, Template, Desc):
self.name = Name
self.template = Template
self.description = Desc
def UpsertDefaultTemplate():
sess = Session()
desired_default = Template("default", "AABBCC", "This is the default template")
try:
q = sess.query(Template).filter_by(name = desiredDefault.name)
existing_default = q.one()
except sqlalchemy.orm.exc.NoResultFound:
#default does not exist yet, so add it...
sess.add(desired_default)
else:
#default already exists. Make sure the values are what we want...
assert isinstance(existing_default, Template)
existing_default.name = desired_default.name
existing_default.template = desired_default.template
existing_default.description = desired_default.description
sess.flush()
Is there a better or less verbose way of doing this? Something like this would be great:
sess.upsert_this(desired_default, unique_key = "name")
although the unique_key kwarg is obviously unnecessary (the ORM should be able to figure this out easily), I added it just because SQLAlchemy tends to only work with the primary key. For example, I've been looking at whether Session.merge would be applicable, but it works only on the primary key, which in this case is an autoincrementing id and not terribly useful for this purpose.
A sample use case for this is simply when starting up a server application that may have upgraded its default expected data, i.e. there are no concurrency concerns for this upsert.
SQLAlchemy supports ON CONFLICT (in the PostgreSQL dialect) with two methods, on_conflict_do_update() and on_conflict_do_nothing().
Copying from the documentation:
from sqlalchemy.dialects.postgresql import insert

stmt = insert(my_table).values(user_email='a@b.com', data='inserted data')
stmt = stmt.on_conflict_do_update(
    index_elements=[my_table.c.user_email],
    index_where=my_table.c.user_email.like('%@gmail.com'),
    set_=dict(data=stmt.excluded.data)
)
conn.execute(stmt)
SQLAlchemy does have a "save-or-update" behavior, which in recent versions has been built into session.add, but previously was the separate session.save_or_update call. This is not an "upsert", but it may be good enough for your needs.
It is good that you are asking about a class with multiple unique keys; I believe this is precisely the reason there is no single correct way to do this. The primary key is also a unique key. If there were no unique constraints, only the primary key, it would be a simple enough problem: if nothing with the given ID exists, or if ID is None, create a new record; else update all other fields in the existing record with that primary key.
However, when there are additional unique constraints, there are logical issues with that simple approach. If you want to "upsert" an object, and the primary key of your object matches an existing record, but another unique column matches a different record, then what do you do? Similarly, if the primary key matches no existing record, but another unique column does match an existing record, then what? There may be a correct answer for your particular situation, but in general I would argue there is no single correct answer.
That would be the reason there is no built in "upsert" operation. The application must define what this means in each particular case.
Nowadays, SQLAlchemy provides two helpful functions, on_conflict_do_nothing and on_conflict_do_update. Those functions are useful but require you to switch from the ORM interface to the lower-level one, SQLAlchemy Core.
Although those two functions make upserting using SQLAlchemy's syntax not that difficult, they are far from providing a complete out-of-the-box solution to upserting.
My common use case is to upsert a big chunk of rows in a single SQL query/session execution. I usually encounter two problems with upserting: the higher-level ORM functionalities we've gotten used to are missing (you cannot use ORM objects but instead have to provide ForeignKeys at the time of insertion), and rows within the same batch can collide with each other on a unique constraint.
I'm using the following function I wrote to handle both of those issues:
from sqlalchemy import UniqueConstraint, inspect
from sqlalchemy.dialects import postgresql

def upsert(session, model, rows):
    table = model.__table__
    stmt = postgresql.insert(table)
    primary_keys = [key.name for key in inspect(table).primary_key]
    update_dict = {c.name: c for c in stmt.excluded if not c.primary_key}

    if not update_dict:
        raise ValueError("insert_or_update resulted in an empty update_dict")

    stmt = stmt.on_conflict_do_update(index_elements=primary_keys,
                                      set_=update_dict)

    seen = set()
    foreign_keys = {col.name: list(col.foreign_keys)[0].column
                    for col in table.columns if col.foreign_keys}
    unique_constraints = [c for c in table.constraints if isinstance(c, UniqueConstraint)]

    def handle_foreignkeys_constraints(row):
        # swap related ORM objects (keyed by the referenced table's name)
        # for their primary-key values
        for c_name, c_value in foreign_keys.items():
            foreign_obj = row.pop(c_value.table.name, None)
            row[c_name] = getattr(foreign_obj, c_value.name) if foreign_obj else None

        # skip rows that collide with an earlier row in this batch
        # on a unique constraint
        for const in unique_constraints:
            unique = tuple([const] + [row[col.name] for col in const.columns])
            if unique in seen:
                return None
            seen.add(unique)

        return row

    rows = list(filter(None, (handle_foreignkeys_constraints(row) for row in rows)))
    session.execute(stmt, rows)
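Hypothetical usage of that helper (the Article model, its columns, and some_user are made-up names; rows are plain dicts keyed by column name, with related ORM objects keyed by the referenced table's name):
rows = [
    {"id": 1, "title": "first article", "user": some_user},   # "user" is popped and
    {"id": 2, "title": "second article", "user": some_user},  # replaced by its primary key
]
upsert(session, Article, rows)
session.commit()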
I use a "look before you leap" approach:
# first get the object from the database if it exists
# we're guaranteed to only get one or zero results
# because we're filtering by primary key
switch_command = session.query(Switch_Command).\
    filter(Switch_Command.switch_id == switch.id).\
    filter(Switch_Command.command_id == command.id).first()

# If we didn't get anything, make one
if not switch_command:
    switch_command = Switch_Command(switch_id=switch.id, command_id=command.id)

# update the stuff we care about
switch_command.output = 'Hooray!'
switch_command.lastseen = datetime.datetime.utcnow()
session.add(switch_command)

# This will generate either an INSERT or UPDATE
# depending on whether we have a new object or not
session.commit()
The advantage is that this is db-neutral and I think it's clear to read. The disadvantage is that there's a potential race condition in a scenario like the following:
we query the db for a switch_command and don't find one
we create a switch_command
another process or thread creates a switch_command with the same primary key as ours
we try to commit our switch_command
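One hedged way to close that race is to catch the resulting IntegrityError and retry as an update (a sketch reusing the names from the snippet above):
from sqlalchemy.exc import IntegrityError

try:
    session.commit()
except IntegrityError:
    # someone else inserted the same primary key between our query and our commit
    session.rollback()
    switch_command = session.query(Switch_Command).\
        filter(Switch_Command.switch_id == switch.id).\
        filter(Switch_Command.command_id == command.id).one()
    switch_command.output = 'Hooray!'
    switch_command.lastseen = datetime.datetime.utcnow()
    session.commit()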
There are multiple answers and here comes yet another answer (YAA). Other answers are not that readable due to the metaprogramming involved. Here is an example that:
Uses SQLAlchemy ORM
Shows how to create a row if there are zero rows using on_conflict_do_nothing
Shows how to update the existing row (if any) without creating a new row using on_conflict_do_update
Uses the table primary key as the constraint
A longer example is in the original question this code is related to.
import datetime

import sqlalchemy as sa
import sqlalchemy.orm as orm
from sqlalchemy import text
from sqlalchemy.dialects.postgresql import insert
from sqlalchemy.orm import Session

class PairState(Base):
    __tablename__ = "pair_state"

    # This table has 1-to-1 relationship with Pair
    pair_id = sa.Column(sa.ForeignKey("pair.id"), nullable=False, primary_key=True, unique=True)
    pair = orm.relationship(Pair,
                            backref=orm.backref("pair_state",
                                                lazy="dynamic",
                                                cascade="all, delete-orphan",
                                                single_parent=True, ), )

    # First raw event in data stream
    first_event_at = sa.Column(sa.TIMESTAMP(timezone=True), nullable=False, server_default=text("TO_TIMESTAMP(0)"))

    # Last raw event in data stream
    last_event_at = sa.Column(sa.TIMESTAMP(timezone=True), nullable=False, server_default=text("TO_TIMESTAMP(0)"))

    # The last hypertable entry added
    last_interval_at = sa.Column(sa.TIMESTAMP(timezone=True), nullable=False, server_default=text("TO_TIMESTAMP(0)"))

    @staticmethod
    def create_first_event_if_not_exist(dbsession: Session, pair_id: int, ts: datetime.datetime):
        """Sets the first event value if not exist yet."""
        dbsession.execute(
            insert(PairState).
            values(pair_id=pair_id, first_event_at=ts).
            on_conflict_do_nothing()
        )

    @staticmethod
    def update_last_event(dbsession: Session, pair_id: int, ts: datetime.datetime):
        """Replaces the column last_event_at for a named pair."""
        # Based on the original example of https://stackoverflow.com/a/49917004/315168
        dbsession.execute(
            insert(PairState).
            values(pair_id=pair_id, last_event_at=ts).
            on_conflict_do_update(constraint=PairState.__table__.primary_key, set_={"last_event_at": ts})
        )

    @staticmethod
    def update_last_interval(dbsession: Session, pair_id: int, ts: datetime.datetime):
        """Replaces the column last_interval_at for a named pair."""
        dbsession.execute(
            insert(PairState).
            values(pair_id=pair_id, last_interval_at=ts).
            on_conflict_do_update(constraint=PairState.__table__.primary_key, set_={"last_interval_at": ts})
        )
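Hypothetical usage (the pair id and timestamp are made up; dbsession is an open Session):
import datetime

now = datetime.datetime.now(datetime.timezone.utc)
PairState.create_first_event_if_not_exist(dbsession, pair_id=1, ts=now)
PairState.update_last_event(dbsession, pair_id=1, ts=now)
dbsession.commit()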
The below works fine for me with a Redshift database and will also work for a combined primary key constraint.
SOURCE: this
Only a few modifications are required to create the SQLAlchemy engine in the function start_engine().
import os

from sqlalchemy import Column, Integer, Date, MetaData
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.dialects.postgresql import insert
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from sqlalchemy.dialects import postgresql

Base = declarative_base()

def start_engine():
    engine = create_engine(os.getenv('SQLALCHEMY_URI',
                                     'postgresql://localhost:5432/upsert'))
    connect = engine.connect()
    meta = MetaData(bind=engine)
    meta.reflect(bind=engine)
    return engine
class DigitalSpend(Base):
    __tablename__ = 'digital_spend'
    report_date = Column(Date, nullable=False)
    day = Column(Date, nullable=False, primary_key=True)
    impressions = Column(Integer)
    conversions = Column(Integer)

    def __repr__(self):
        return str([getattr(self, c.name, None) for c in self.__table__.c])
def compile_query(query):
    compiler = query.compile if not hasattr(query, 'statement') else query.statement.compile
    return compiler(dialect=postgresql.dialect())
def upsert(session, model, rows, as_of_date_col='report_date', no_update_cols=[]):
    table = model.__table__

    stmt = insert(table).values(rows)

    update_cols = [c.name for c in table.c
                   if c not in list(table.primary_key.columns)
                   and c.name not in no_update_cols]

    on_conflict_stmt = stmt.on_conflict_do_update(
        index_elements=table.primary_key.columns,
        set_={k: getattr(stmt.excluded, k) for k in update_cols},
        index_where=(getattr(model, as_of_date_col) < getattr(stmt.excluded, as_of_date_col))
    )

    print(compile_query(on_conflict_stmt))
    session.execute(on_conflict_stmt)

session = start_engine()
upsert(session, DigitalSpend, initial_rows, no_update_cols=['conversions'])
This allows access to the underlying models based on string names:
def get_class_by_tablename(tablename):
    """Return class reference mapped to table.

    https://stackoverflow.com/questions/11668355/sqlalchemy-get-model-from-table-name-this-may-imply-appending-some-function-to

    :param tablename: String with name of table.
    :return: Class reference or None.
    """
    for c in Base._decl_class_registry.values():
        if hasattr(c, '__tablename__') and c.__tablename__ == tablename:
            return c

sqla_tbl = get_class_by_tablename(table_name)

def handle_upsert(record_dict, table):
    """
    handles updates when there are primary key conflicts
    """
    try:
        self.active_session().add(table(**record_dict))
    except:
        # Here we'll assume the error is caused by an integrity error
        # We do this because the error classes are passed from the
        # underlying package (pyodbc / sqlite) SQLAlchemy doesn't mask
        # them with its own code - this should be updated to have
        # explicit error handling for each new db engine
        # <update>add explicit error handling for each db engine</update>
        self.active_session().rollback()

        # Query for the conflicting class, use update method to change values based on dict
        c_tbl_primary_keys = [i.name for i in table.__table__.primary_key]  # List of primary key col names
        c_tbl_cols = dict(sqla_tbl.__table__.columns)  # String:Col Object crosswalk

        c_query_dict = {k: record_dict[k] for k in c_tbl_primary_keys if k in record_dict}  # sub-dict from data of primary key:values
        c_oo_query_dict = {c_tbl_cols[k]: v for (k, v) in c_query_dict.items()}  # col-object:query value for primary key cols

        c_target_record = self.active_session().query(sqla_tbl).filter(*[k == v for (k, v) in c_oo_query_dict.items()]).first()

        # apply new data values to the existing record
        for k, v in record_dict.items():
            setattr(c_target_record, k, v)
This works for me with sqlite3 and postgres, although it might fail with combined primary key constraints and will most likely fail with additional unique constraints.
try:
    t = self._meta.tables[data['table']]
except KeyError:
    self._log.error('table "%s" unknown', data['table'])
    return

try:
    q = insert(t, values=data['values'])
    self._log.debug(q)
    self._db.execute(q)
except IntegrityError:
    self._log.warning('integrity error')
    where_clause = [c.__eq__(data['values'][c.name]) for c in t.c if c.primary_key]
    update_dict = {c.name: data['values'][c.name] for c in t.c if not c.primary_key}
    q = update(t, values=update_dict).where(*where_clause)
    self._log.debug(q)
    self._db.execute(q)
except Exception as e:
    self._log.error('%s: %s', t.name, e)
As we had problems with generated default ids and references, which led to ForeignKeyViolation errors like
update or delete on table "..." violates foreign key constraint
Key (id)=(...) is still referenced from table "...".
we had to exclude the id from the update dict, as otherwise it would always be regenerated as a new default value.
In addition, the method returns the created/updated entity.
from sqlalchemy import select
from sqlalchemy.dialects.postgresql import insert  # Important to use the postgresql insert

def upsert(session, data, key_columns, model):
    stmt = insert(model).values(data)

    # Important to exclude the ID for update!
    exclude_for_update = [model.id.name, *key_columns]
    update_dict = {c.name: c for c in stmt.excluded if c.name not in exclude_for_update}

    stmt = stmt.on_conflict_do_update(
        index_elements=key_columns,
        set_=update_dict
    ).returning(model)

    orm_stmt = (
        select(model)
        .from_statement(stmt)
        .execution_options(populate_existing=True)
    )
    return session.execute(orm_stmt).scalar()
Example:
class UpsertUser(Base):
    __tablename__ = 'upsert_user'

    id = Column(Id, primary_key=True, default=uuid.uuid4)
    name: str = Column(sa.String, nullable=False)
    user_sid: str = Column(sa.String, nullable=False, unique=True)
    house_admin = relationship('UpsertHouse', back_populates='admin', uselist=False)

class UpsertHouse(Base):
    __tablename__ = 'upsert_house'

    id = Column(Id, primary_key=True, default=uuid.uuid4)
    admin_id: Id = Column(Id, ForeignKey('upsert_user.id'), nullable=False)
    admin: UpsertUser = relationship('UpsertUser', back_populates='house_admin', uselist=False)

# Usage
upserted_user = upsert(session, updated_user, [UpsertUser.user_sid.name], UpsertUser)
Note: only tested on PostgreSQL, but a similar approach could work for other DBs that support an equivalent of ON CONFLICT, e.g. MySQL's ON DUPLICATE KEY UPDATE (via that dialect's own insert construct).
In the case of sqlite, the sqlite_on_conflict='REPLACE' option can be used when defining a UniqueConstraint, and sqlite_on_conflict_unique for a unique constraint on a single column. Then session.add will work in a way just like upsert. See the official documentation.
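A minimal sketch of that option (the Word model is hypothetical; the conflict clause is rendered into the CREATE TABLE statement):
from sqlalchemy import Column, Integer, String, UniqueConstraint

class Word(Base):
    __tablename__ = 'words'
    id = Column(Integer, primary_key=True)
    word = Column(String, nullable=False)
    __table_args__ = (
        # emits UNIQUE (word) ON CONFLICT REPLACE in the CREATE TABLE
        UniqueConstraint('word', sqlite_on_conflict='REPLACE'),
    )

# single-column shorthand:
# word = Column(String, unique=True, sqlite_on_conflict_unique='REPLACE')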
I use the following code for upserts. Before using it, you should add primary keys to the table in the database.
from sqlalchemy import create_engine
from sqlalchemy import MetaData, Table
from sqlalchemy.inspection import inspect
from sqlalchemy.engine.reflection import Inspector
from sqlalchemy.dialects.postgresql import insert

def upsert(df, engine, table_name, schema=None, chunk_size=1000):
    metadata = MetaData(schema=schema)
    metadata.bind = engine

    table = Table(table_name, metadata, schema=schema, autoload=True)

    # only use common columns between df and table.
    table_columns = {column.name for column in table.columns}
    df_columns = set(df.columns)
    intersection_columns = table_columns.intersection(df_columns)
    df1 = df[list(intersection_columns)]
    records = df1.to_dict('records')

    # get list of fields making up primary key
    primary_keys = [key.name for key in inspect(table).primary_key]

    with engine.connect() as conn:
        chunks = [records[i:i + chunk_size] for i in range(0, len(records), chunk_size)]
        for chunk in chunks:
            stmt = insert(table).values(chunk)
            update_dict = {c.name: c for c in stmt.excluded if not c.primary_key}
            s = stmt.on_conflict_do_update(
                index_elements=primary_keys,
                set_=update_dict)
            conn.execute(s)
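Hypothetical usage (connection string, table name, and data are made up):
import pandas as pd
from sqlalchemy import create_engine

engine = create_engine('postgresql://localhost:5432/upsert')
df = pd.DataFrame([
    {'day': '2020-01-01', 'report_date': '2020-01-02', 'impressions': 10},
])
upsert(df, engine, 'digital_spend')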
