SQLAlchemy unique check on postgresql insert scalability - python

Can someone please help me understand what I am doing wrong?
All of the below works as required, but I'm running into scalability issues:
On the first run I loaded ~70,000 rows into a blank table in ~2-3 s
On the 2nd run I loaded ~80,000 rows into the same table in ~5 min
On the 3rd run I loaded ~50,000 rows into the same table in ~30 min
On the 4th run I loaded ~120,000 rows into the same table in ~1 hr
On the 5th run I loaded ~100,000 rows into the same table in ~2 hr
Each time I run the code, I see a steady ~600 KB/s of traffic between the client and the db until the run finishes
So as you can see, the hash check across all those columns does not seem to scale well at all
What is my code trying to accomplish?
I need to add daily stock data into a Postgres database. The data is updated at the source only once a day, and the API response looks like this:
{'instrument_token': '210011653',
 'exchange_token': '820358',
 'tradingsymbol': 'COLG17MAY1020.00PE',
 'name': '',
 'last_price': 0.0,
 'expiry': '2017-05-25',
 'strike': 1020.0,
 'tick_size': 0.05,
 'lot_size': 700,
 'instrument_type': 'PE',
 'segment': 'BFO-OPT',
 'exchange': 'BFO'}
The items in the response and the row count change every day
On a given day, I can fetch between 50,000 and 120,000 rows in a single response (i.e. approx. 20-30 MB of CSV data). Repeating the request returns the same data for a given day.
So the core problem is: I want to avoid adding the same row to the db twice in case the data is fetched multiple times on the same day.
What have I tried so far?
I'm a db newbie; my thought process was to autoincrement an id and add a data_date column, so my schema looks like this:
CREATE TABLE IF NOT EXISTS instruments (
    id bigserial,
    data_date date NOT NULL,
    instrument_token integer NOT NULL,
    exchange_token integer NOT NULL,
    tradingsymbol varchar(40) NOT NULL,
    name varchar(40) NOT NULL,
    last_price numeric(15,2) NOT NULL,
    expiry date,
    strike numeric(15,2),
    tick_size numeric,
    lot_size integer,
    instrument_type varchar(10),
    segment varchar(20),
    exchange varchar(10),
    PRIMARY KEY(id)
);
I've built a class like so -
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker, mapper, relationship
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import Column, Integer, Numeric, String, MetaData, Table, ForeignKey, DateTime, union
from sqlalchemy.engine.url import URL
engine = create_engine('postgresql://blah')
Base = declarative_base(engine)
def _unique(session, cls, hashfunc, queryfunc, constructor, arg, kw):
    cache = getattr(session, '_unique_cache', None)
    if cache is None:
        session._unique_cache = cache = {}
    key = (cls, hashfunc(*arg, **kw))
    if key in cache:
        return cache[key]
    else:
        with session.no_autoflush:
            q = session.query(cls)
            q = queryfunc(q, *arg, **kw)
            obj = q.first()
            if not obj:
                obj = constructor(*arg, **kw)
                session.add(obj)
        cache[key] = obj
        return obj
class UniqueMixin(object):
    @classmethod
    def unique_hash(cls, *arg, **kw):
        raise NotImplementedError()

    @classmethod
    def unique_filter(cls, query, *arg, **kw):
        raise NotImplementedError()

    @classmethod
    def as_unique(cls, session, *arg, **kw):
        return _unique(
            session,
            cls,
            cls.unique_hash,
            cls.unique_filter,
            cls,
            arg, kw
        )
class Instrument(UniqueMixin, Base):
    __tablename__ = 'instruments'
    __table_args__ = {'autoload': True}
    __table__ = Table('instruments', Base.metadata,
                      Column('id', Integer, primary_key=True),
                      Column('data_date', String),
                      Column('instrument_token', Integer),
                      Column('exchange_token', Integer),
                      Column('tradingsymbol', String),
                      Column('name', String),
                      Column('last_price', Numeric),
                      Column('expiry', Integer),
                      Column('strike', Numeric),
                      Column('tick_size', Numeric),
                      Column('lot_size', Integer),
                      Column('instrument_type', String),
                      Column('segment', String),
                      Column('exchange', String))

    @classmethod
    def unique_hash(cls, data_date, instrument_token, exchange_token, tradingsymbol, name, last_price, expiry, strike, tick_size, lot_size, instrument_type, segment, exchange):
        return data_date, instrument_token, exchange_token, tradingsymbol, name, last_price, expiry, strike, tick_size, lot_size, instrument_type, segment, exchange

    @classmethod
    def unique_filter(cls, query, data_date, instrument_token, exchange_token, tradingsymbol, name, last_price, expiry, strike, tick_size, lot_size, instrument_type, segment, exchange):
        return query.filter(Instrument.data_date == data_date,
                            Instrument.instrument_token == instrument_token,
                            Instrument.exchange_token == exchange_token,
                            Instrument.tradingsymbol == tradingsymbol,
                            Instrument.name == name,
                            Instrument.last_price == last_price,
                            Instrument.expiry == expiry,
                            Instrument.strike == strike,
                            Instrument.tick_size == tick_size,
                            Instrument.lot_size == lot_size,
                            Instrument.instrument_type == instrument_type,
                            Instrument.segment == segment,
                            Instrument.exchange == exchange)

    def __init__(self, data_date, instrument_token, exchange_token, tradingsymbol, name, last_price, expiry, strike, tick_size, lot_size, instrument_type, segment, exchange):
        self.data_date = data_date
        self.instrument_token = instrument_token
        self.exchange_token = exchange_token
        self.tradingsymbol = tradingsymbol
        self.name = name
        self.last_price = last_price
        self.expiry = expiry
        self.strike = strike
        self.tick_size = tick_size
        self.lot_size = lot_size
        self.instrument_type = instrument_type
        self.segment = segment
        self.exchange = exchange

    def __repr__(self):
        return "<Instruments - '%s': '%s' - '%s' - '%s' - '%s' - '%s' - '%s' - '%s' - '%s' - '%s' - '%s' - '%s' - '%s' - '%s'>" % (
            self.id,
            self.data_date,
            self.instrument_token,
            self.exchange_token,
            self.tradingsymbol,
            self.name,
            self.last_price,
            self.expiry,
            self.strike,
            self.tick_size,
            self.lot_size,
            self.instrument_type,
            self.segment,
            self.exchange
        )
The code to insert the data looks like this -
for instrument in response:
    # print(instrument)
    if instrument['expiry'] == '':
        instrument['expiry'] = null()
    market_instrument = Instrument.as_unique(self.session,
                                             data_date=datetime.date.today().isoformat(),
                                             instrument_token=instrument['instrument_token'],
                                             exchange_token=instrument['exchange_token'],
                                             tradingsymbol=instrument['tradingsymbol'],
                                             name=instrument['name'],
                                             last_price=instrument['last_price'],
                                             expiry=instrument['expiry'],
                                             strike=instrument['strike'],
                                             tick_size=instrument['tick_size'],
                                             lot_size=instrument['lot_size'],
                                             instrument_type=instrument['instrument_type'],
                                             segment=instrument['segment'],
                                             exchange=instrument['exchange'],
                                             )
    self.session.add(market_instrument)
    self.session.commit()
Options I am considering
What do you think is best?
Option 1
No longer use as_unique()
Create one more data_update_date table (data_date (primary), status (boolean)) which is updated at the end of a successful daily insert
Check data_update_date for today's date, and skip the add for the entire block if it already exists (a minimal sketch of this check is shown right below)
However, this option does not help me learn whether there is another mistake in my as_unique functions that needs to be corrected
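The check I have in mind would look something like this (just a sketch; the DataUpdateDate model name and defaults are my assumptions, not existing code):
import datetime
from sqlalchemy import Column, Date, Boolean

class DataUpdateDate(Base):  # hypothetical bookkeeping table for Option 1
    __tablename__ = 'data_update_date'
    data_date = Column(Date, primary_key=True)
    status = Column(Boolean, nullable=False, default=False)

def already_loaded_today(session):
    """Return True if a successful load has already been recorded for today."""
    return session.query(DataUpdateDate).filter(
        DataUpdateDate.data_date == datetime.date.today(),
        DataUpdateDate.status.is_(True),
    ).first() is not None

# Usage: skip the whole insert block when already_loaded_today(session) is True,
# and add/flip today's DataUpdateDate row only after the block commits cleanly.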
Option 2
Set up a new db with PoWA and profile it
Find and fix bottlenecks
I am using the official Postgres Docker image; I ran into a dead end extending the Debian base with hypopg and the other required extensions
It looks like CentOS will be much simpler, so I'm creating a new Dockerfile to do this
However, since I'm a total newbie with PostgreSQL and SQLAlchemy, I also need your opinion on whether my code has some obvious issues
Option 3
Hash only a few columns
I could hash just the first 3, excluding id
However, I don't know how to do this
Simply reducing the parameters of the hash classmethods makes their signatures disagree with what the class constructor expects, so the insert fails (one possible restructuring is sketched right after this option)
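Roughly, the idea would be to keep the full constructor but let the two classmethods accept and ignore the extra kwargs, so only the first few columns participate in the uniqueness check (an untested sketch against the UniqueMixin recipe above):
# Replace only the two classmethods on the Instrument class shown earlier
@classmethod
def unique_hash(cls, data_date, instrument_token, exchange_token, **kw):
    # remaining kwargs are accepted so the call from as_unique(...) still works
    return data_date, instrument_token, exchange_token

@classmethod
def unique_filter(cls, query, data_date, instrument_token, exchange_token, **kw):
    return query.filter(
        cls.data_date == data_date,
        cls.instrument_token == instrument_token,
        cls.exchange_token == exchange_token,
    )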
Option 4
I'm not married to either PostgreSQL or SQLAlchemy
Should I be using a non-ORM method instead?
Or should I be using something other than a relational db to store this kind of data?
I'm running this on an m2.large instance on AWS, which should have the right kind of performance, so maybe I am using the wrong method to store the data
If this is the situation during insert, multiple threads doing technical analysis will simply be unusable...
Should I be using something like Hadoop instead?
An obvious drawback of this option is yet another learning curve, this time to scale with Hadoop...

I ran some db profiling on the bulk insert operation
The cache hit ratio is 100%
I do not see any disk I/O
Sorry, I can't post more than 2 links right now, so I can't show you the charts for the hit ratio and disk hits; you'll just have to take my word for it :)
The as_unique approach is extremely inefficient: it hits the db with a huge number of queries (essentially one per row). If anything, I guess this served as a good benchmark for this server build + config, and it leaves me very satisfied with its performance for cache-friendly workloads
As pointed out by hints from various responses, the bottleneck lies in the schema as well as in the way the insert is implemented in the code
I fixed the problem like this -
1. Add a multi-column unique index
CREATE UNIQUE INDEX market_daily_uq_idx ON instruments (
    data_date,
    instrument_token,
    exchange_token,
    tradingsymbol,
    instrument_type,
    segment,
    exchange
);
2. Use .on_conflict_do_nothing()
from sqlalchemy.dialects.postgresql import insert  # the PostgreSQL-specific insert() provides on_conflict_do_nothing()

statement = insert(Instrument).values(
    data_date=datetime.date.today().isoformat(),
    instrument_token=instrument['instrument_token'],
    exchange_token=instrument['exchange_token'],
    tradingsymbol=instrument['tradingsymbol'],
    name=instrument['name'],
    last_price=instrument['last_price'],
    expiry=instrument['expiry'],
    strike=instrument['strike'],
    tick_size=instrument['tick_size'],
    lot_size=instrument['lot_size'],
    instrument_type=instrument['instrument_type'],
    segment=instrument['segment'],
    exchange=instrument['exchange'],
).on_conflict_do_nothing()
self.session.execute(statement)
self.session.commit()
This works very well & things are much faster now, thereby solving the core issue
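If I ever need to squeeze out more speed, a further refinement I'm considering (just a sketch, untested; it assumes the API keys keep matching the column names, as they do above) is to build one multi-row insert per response instead of one statement per row:
import datetime
from sqlalchemy.dialects.postgresql import insert

rows = []
for instrument in response:
    if instrument['expiry'] == '':
        instrument['expiry'] = None
    rows.append({'data_date': datetime.date.today().isoformat(), **instrument})

# one INSERT ... VALUES (...), (...), ... ON CONFLICT DO NOTHING per response
statement = insert(Instrument).values(rows).on_conflict_do_nothing()
self.session.execute(statement)
self.session.commit()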
Thank you all very much for all the help, hints & advice!

Related

SQLalchemy custom String primary_key sequence

For the life of me, I cannot think of a simple way to accomplish this without querying the database whenever a new record is created, but this is what I'm trying to do with sqlalchemy+postgresql:
I would like to have a primary key of a given table follow this format:
YYWW0001, YYWW0002, etc., so that I see values like 20010001, 20010002, where the last four digits are incremented only within the given week of the year, then reset when a new week or year begins.
I'm at the limit of my knowledge here so any help is greatly appreciated!
In the meantime, I am looking into sqlalchemy.schema.Sequence.
Another thing I can think to try is creating a table that has, let's say, 10,000 records that just have a plain Integer primary key and the actual ID I want, then find some sort of 'next' method to pull from that table when my Core object is constructed. This seems less than ideal in my mind, since I would still need to ensure that the date portion of the id in the table is correct and current. I think a dynamic approach would best suit my needs, if there is one.
So far my naive implementation looks like this:
import os
import datetime

from sqlalchemy import create_engine, Column, String
from sqlalchemy.exc import OperationalError
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker

BASE = declarative_base()

_name = os.environ.get('HW_QC_USER_ID', None)
_pass = os.environ.get('HW_QC_USER_PASS', None)
_ip = os.environ.get('HW_QC_SERVER_IP', None)
_db_name = 'mock'

try:
    print('Creating engine')
    engine = create_engine(
        f'postgresql://{_name}:{_pass}@{_ip}/{_db_name}',
        echo=False
    )
except OperationalError as _e:
    print('An error has occurred when connecting to the database')
    print(f'postgresql://{_name}:{_pass}@{_ip}/{_db_name}')
    print(_e)

class Core(BASE):
    """
    This class describes a master table.
    """
    __tablename__ = 'cores'
    udi = Column(String(11), primary_key=True, unique=True)  # <-- how do I get this to be the format described?
    _date_code = Column(
        String(4),
        default=datetime.datetime.now().strftime("%y%U")
    )

BASE.metadata.create_all(engine)
session = sessionmaker(bind=engine)()

date_code_now = datetime.datetime.now().strftime("%y%U")
cores_from_this_week = session.query(Core).filter(
    Core._date_code == date_code_now
).all()
num_cores_existing = len(cores_from_this_week)

new_core = Core(
    udi=f'FRA{date_code_now}{num_cores_existing+1:04}'
)
session.add(new_core)
session.commit()
session.close()
engine.dispose()
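One race-safe way to get the "next number within the week" behaviour is a small counter table bumped with UPDATE ... RETURNING, so concurrent writers serialise on the row lock. This is only a sketch of that idea (weekly_counters and next_udi are made-up names, not part of the question, and it reuses BASE/session from the snippet above):
import datetime
from sqlalchemy import Column, Integer, String, text

class WeeklyCounter(BASE):  # hypothetical helper table
    __tablename__ = 'weekly_counters'
    date_code = Column(String(4), primary_key=True)
    last_value = Column(Integer, nullable=False, default=0)

def next_udi(session, prefix='FRA'):
    """Atomically allocate the next YYWWnnnn id for the current week."""
    date_code = datetime.datetime.now().strftime("%y%U")
    # make sure a counter row exists for this week, then bump it atomically
    session.execute(
        text("INSERT INTO weekly_counters (date_code, last_value) "
             "VALUES (:dc, 0) ON CONFLICT (date_code) DO NOTHING"),
        {'dc': date_code},
    )
    next_value = session.execute(
        text("UPDATE weekly_counters SET last_value = last_value + 1 "
             "WHERE date_code = :dc RETURNING last_value"),
        {'dc': date_code},
    ).scalar()
    return f'{prefix}{date_code}{next_value:04}'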

fast insert (on conflict) many rows to postgres-DB with python

I want to write messages from a websocket to a postgres-DB running on a Raspberry Pi.
The average message rate from the websocket is about 30 messages/second, but during peaks it reaches up to 250 messages/second.
I implemented a Python program to receive the messages and write them to the database with the SQLAlchemy ORM. After each message I first check whether the same primary key already exists and then do an update or an insert, and afterwards I always commit, so it gets very slow. I can write at most 30 messages/second to the database. At peak times this is a problem.
So I tested several approaches to speed things up.
This is my best approach:
I first build all the single queries (with psycopg2), then join them together and send the complete query string to the database to execute at once --> this speeds things up to 580 messages/second.
Create the table for Testdata:
CREATE TABLE transactions (
id int NOT NULL PRIMARY KEY,
name varchar(255),
description varchar(255),
country_name varchar(255),
city_name varchar(255),
cost varchar(255),
currency varchar(255),
created_at DATE,
billing_type varchar(255),
language varchar(255),
operating_system varchar(255)
);
example copied from https://medium.com/technology-nineleaps/mysql-sqlalchemy-performance-b123584eb833
Python test script:
import random
import time

from faker import Faker
import psycopg2
from psycopg2.extensions import AsIs

"""psycopg2"""
psycopg2_conn = {'host': '192.168.176.101',
                 'dbname': 'test',
                 'user': 'blabla',
                 'password': 'blabla'}
connection_psycopg2 = psycopg2.connect(**psycopg2_conn)

myFactory = Faker()

def random_data():
    billing_type_list = ['cheque', 'cash', 'credit', 'debit', 'e-wallet']
    language = ['English', 'Bengali', 'Kannada']
    operating_system = 'linux'
    random_dic = {}
    for i in range(0, 300):
        id = int(i)
        name = myFactory.name()
        description = myFactory.text()
        country_name = myFactory.country()
        city_name = myFactory.city()
        cost = str(myFactory.random_digit_not_null())
        currency = myFactory.currency_code()
        created_at = myFactory.date_time_between(start_date="-30y", end_date="now", tzinfo=None)
        billing_type = random.choice(billing_type_list)
        language = random.choice(language)
        operating_system = operating_system
        random_dic[id] = {}
        for xname in ['id', 'description', 'country_name', 'city_name', 'cost', 'currency',
                      'created_at', 'billing_type', 'language', 'operating_system']:
            random_dic[id][xname] = locals()[xname]
        print(id)
    return random_dic

def single_insert_on_conflict_psycopg2(idic, icur):
    cur = icur
    columns = idic.keys()
    columns_with_excludephrase = ['EXCLUDED.{}'.format(column) for column in columns]
    values = [idic[column] for column in columns]
    insert_statement = """
    insert into transactions (%s) values %s
    ON CONFLICT ON CONSTRAINT transactions_pkey
    DO UPDATE SET (%s) = (%s)
    """
    # insert_statement = 'insert into transactions (%s) values %s'
    print(','.join(columns))
    print(','.join(columns_with_excludephrase))
    print(tuple(values))
    xquery = cur.mogrify(insert_statement, (
        AsIs(','.join(columns)),
        tuple(values),
        AsIs(','.join(columns)),
        AsIs(','.join(columns_with_excludephrase))
    ))
    print(xquery)
    return xquery

def complete_run_psycopg2(random_dic):
    querylist = []
    starttime = time.time()
    cur = connection_psycopg2.cursor()
    for key in random_dic:
        print(key)
        query = single_insert_on_conflict_psycopg2(idic=random_dic[key], icur=cur)
        querylist.append(query.decode("utf-8"))
    complete_query = ';'.join(tuple(querylist))
    cur.execute(complete_query)
    connection_psycopg2.commit()
    cur.close()
    endtime = time.time()
    xduration = endtime - starttime
    write_sec = len(random_dic) / xduration
    print('complete Duration:{}'.format(xduration))
    print('writes per second:{}'.format(write_sec))
    return write_sec

def main():
    random_dic = random_data()
    complete_run_psycopg2(random_dic)
    return

if __name__ == '__main__':
    main()
Now my question: is this a proper approach? Are there any hints I didn’t consider?
First, you cannot pass column names as query parameters like that. I would use .format to inject the column names, and then use %s placeholders for the values.
SQL = 'INSERT INTO transactions ({}) VALUES (%s,%s,%s,%s,%s,%s)'.format(','.join(columns))
db.Pcursor().execute(SQL, (value1, value2, value3))
Second, you will get better speed if you use async processes.
Fortunately for you, I wrote a gevent async library for psycopg2 you can use. It makes the process far easier; it is async, threaded and pooled.
Python Postgres psycopg2 ThreadedConnectionPool exhausted
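For completeness, another batching approach that is often used here (my sketch, not from the answer above; it assumes the transactions table from the question and psycopg2's execute_values helper):
from psycopg2.extras import execute_values

def bulk_upsert(conn, rows):
    """rows: list of dicts keyed by the transactions columns."""
    columns = ['id', 'name', 'description', 'country_name', 'city_name', 'cost',
               'currency', 'created_at', 'billing_type', 'language', 'operating_system']
    values = [[row[c] for c in columns] for row in rows]
    sql = (
        "INSERT INTO transactions ({cols}) VALUES %s "
        "ON CONFLICT (id) DO UPDATE SET ({cols}) = ({excl})"
    ).format(cols=','.join(columns),
             excl=','.join('EXCLUDED.' + c for c in columns))
    with conn.cursor() as cur:
        # execute_values expands the single %s placeholder into many (...) tuples
        execute_values(cur, sql, values, page_size=1000)
    conn.commit()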

SQLAlchemy temporary table with Declarative Base

I need a temporary table in my programme. I have seen that this can be achieved with the "mapper" syntax in this way:
t = Table(
    't', metadata,
    Column('id', Integer, primary_key=True),
    # ...
    prefixes=['TEMPORARY'],
)
Seen here
But my whole code is using the declarative base, as far as I understand it, and I would like to stick to it. There is the possibility of using a hybrid approach, but if possible I'd rather avoid it.
This is a simplified version of how my declarative class looks like:
import sqlalchemy as alc

class Tempo(Base):
    """
    Class for temporary table used to process data coming from xlsx
    :param Base: Declarative Base
    """
    # TODO: make it completely temporary
    __tablename__ = 'tempo'

    drw = alc.Column(alc.String)
    date = alc.Column(alc.Date)
    check_number = alc.Column(alc.Integer)
Thanks in advance!
EDITED WITH THE NEW PROBLEMS:
Now the class looks like this:
import sqlalchemy as alc

class Tempo(Base):
    """
    Class for temporary table used to process data coming from xlsx
    :param Base: Declarative Base
    """
    # TODO: make it completely temporary
    __tablename__ = 'tempo'
    __table_args__ = {'prefixes': ['TEMPORARY']}

    drw = alc.Column(alc.String)
    date = alc.Column(alc.Date)
    check_number = alc.Column(alc.Integer)
And when I try to insert data in this table, I get the following error message:
sqlalchemy.exc.OperationalError: (OperationalError) no such table:
tempo u'INSERT INTO tempo (...) VALUES (?, ?, ?, ?, ?, ?, ?, ?)' (....)
It seems the table doesn't exist just by declaring it. I have seen something like create_all() that might be the solution for this (it's funny how new ideas come up while explaining the problem thoroughly).
Then again, thank you very much!
Is it possible to use __table_args__? See https://docs.sqlalchemy.org/en/14/orm/declarative_tables.html#orm-declarative-table-configuration
class Tempo(Base):
    """
    Class for temporary table used to process data coming from xlsx
    :param Base: Declarative Base
    """
    # TODO: make it completely temporary
    __tablename__ = 'tempo'
    __table_args__ = {'prefixes': ['TEMPORARY']}

    drw = alc.Column(alc.String)
    date = alc.Column(alc.Date)
    check_number = alc.Column(alc.Integer)
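One extra note (mine, not part of the original answer): a TEMPORARY table is private to the connection that creates it, so it still has to be created explicitly, and on the same connection the session uses, before inserting. Something like this sketch:
# create the temporary table on the connection the session is currently using;
# creating it on a separate engine/connection would leave it invisible here
Tempo.__table__.create(bind=session.connection())

# subsequent inserts/queries must run on that same session/connection
session.execute(Tempo.__table__.insert(), [{'drw': 'A', 'check_number': 1}])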
Old question, but if anyone out there wants to create a temp table from an existing declarative table model on the fly rather than having it always be a part of your model/code, you can try the following approach. Copying __table_args__ is a little tricky since it can have multiple formats and any Index objects need to be recreated so they aren't associated with the old table.
import time

import sqlalchemy as sa
from sqlalchemy.schema import CreateTable

def copy_table_args(model, **kwargs):
    """Try to copy existing __table_args__, override params with kwargs"""
    table_args = model.__table_args__

    if isinstance(table_args, tuple):
        new_args = []
        for arg in table_args:
            if isinstance(arg, dict):
                table_args_dict = arg.copy()
                table_args_dict.update(**kwargs)
                new_args.append(table_args_dict)
            elif isinstance(arg, sa.Index):
                index = sa.Index(
                    arg.name,
                    *[col for col in arg.columns.keys()],
                    unique=arg.unique,
                    **arg.kwargs,
                )
                new_args.append(index)
            else:
                # TODO: need to handle Constraints
                raise Exception(f"Unhandled table arg: {arg}")
        table_args = tuple(new_args)
    elif isinstance(table_args, dict):
        table_args = {
            k: (v.copy() if hasattr(v, "copy") else v) for k, v in table_args.items()
        }
        table_args.update(**kwargs)
    else:
        raise Exception(f"Unexpected __table_args__ type: {table_args}")

    return table_args

def copy_table_from_model(conn, model, **kwargs):
    model_name = model.__name__ + "Tmp"
    table_name = model.__table__.name + "_" + str(time.time()).replace(".", "_")
    table_args = copy_table_args(model, extend_existing=True, **kwargs)

    args = {c.name: c.copy() for c in model.__table__.c}
    args["__tablename__"] = table_name
    args["__table_args__"] = table_args

    copy_model = type(model_name, model.__bases__, args)
    print(str(CreateTable(copy_model.__table__)))
    copy_model.__table__.create(conn)
    return copy_model

def temp_table_from_model(conn, model, **kwargs):
    return copy_table_from_model(conn, model, prefixes=["TEMPORARY"])
Note: I haven't added logic to handle copying Constraints, and this is lightly tested against MySQL. Also note that if you do this with non-temporary tables and auto-named indexes (i.e. Column(..., index=True)) then this may not play nice with alembic.

SqlAlchemy: array of Postgresql custom types

So in my postgres DB I have the following custom type:
create type my_pg_type as (
sting_id varchar(32),
time_diff interval,
multiplier integer
);
To further complicate things, this is being used as an array:
alter table my_table add column my_keys my_pg_type [];
I'd like to map this with SQLAlchemy (0.6.4) !!
(apologies for elixir)
from sqlalchemy.dialects.postgresql import ARRAY
from sqlalchemy.types import Enum
from elixir import Entity, Field
class MyTable(Entity):
    # -- snip --
    my_keys = Field(ARRAY(Enum))
I know 'Enum' is incorrect in the above.
For an example of a value coming back from the database for that array column, I've shown below the value in ARRAY.result_processor(self, dialect, coltype):
class ARRAY(sqltypes.MutableType, sqltypes.Concatenable, sqltypes.TypeEngine):
    # -- snip --
    def result_processor(self, dialect, coltype):
        item_proc = self.item_type.result_processor(dialect, coltype)
        if item_proc:
            def convert_item(item):
                if isinstance(item, list):
                    return [convert_item(child) for child in item]
                else:
                    return item_proc(item)
        else:
            def convert_item(item):
                if isinstance(item, list):
                    return [convert_item(child) for child in item]
                else:
                    return item

        def process(value):
            if value is None:
                return value
            """
            # sample value:
            >>> value
            '{"(key_1,07:23:00,0)","(key_2,01:00:00,20)"}'
            """
            return [convert_item(item) for item in value]
        return process
So the above process function incorrectly splits the string, assuming it's already a list.
So far, I've successfully subclassed ARRAY to properly split the string, and instead of Enum, I've tried to write my own type (implementing Unicode) to recreate the (string, timedelta, integer) tuple, but have run into a lot of difficulties, specifically the proper conversion of the interval to the Python timedelta.
I'm posting here in case I'm missing an obvious precedent way of doing this?
UPDATE See the recipe at the bottom for a workaround
I worked up some example code to see what psycopg2 is doing here, and this is well within their realm - psycopg2 is not interpreting the value as an array at all. psycopg2 needs to be able to parse out the ARRAY when it comes back as SQLA's ARRAY type assumes at least that much has been done. You can of course hack around SQLAlchemy's ARRAY, which here would mean basically not using it at all in favor of something that parses out this particular string value psycopg2 is giving us back.
But what's also happening here is that we aren't even getting at psycopg2's mechanics for converting timedeltas either, something SQLAlchemy normally doesn't have to worry about. In this case I feel like the facilities of the DBAPI are being under-utilized and psycopg2 is a very capable DBAPI.
So I'd advise you work with psycopg2's custom type mechanics over at http://initd.org/psycopg/docs/extensions.html#database-types-casting-functions.
If you want to mail their mailing list, here's a test case:
import psycopg2

conn = psycopg2.connect(host="localhost", database="test", user="scott", password="tiger")
cursor = conn.cursor()
cursor.execute("""
    create type my_pg_type as (
        string_id varchar(32),
        time_diff interval,
        multiplier integer
    )
""")
cursor.execute("""
    CREATE TABLE my_table (
        data my_pg_type[]
    )
""")
cursor.execute("insert into my_table (data) "
               "values (CAST(%(data)s AS my_pg_type[]))",
               {'data': [("xyz", "'1 day 01:00:00'", 5), ("pqr", "'1 day 01:00:00'", 5)]})
cursor.execute("SELECT * from my_table")
row = cursor.fetchone()
assert isinstance(row[0], (tuple, list)), repr(row[0])
PG's type registration supports global registration. You can also register the types on a per-connection basis within SQLAlchemy using the pool listener in 0.6 or connect event in 0.7 and further.
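To illustrate that per-connection registration (my sketch, not part of the original answer; it assumes an existing engine object and psycopg2's register_composite helper):
from psycopg2.extras import register_composite
from sqlalchemy import event

@event.listens_for(engine, "connect")
def _register_my_pg_type(dbapi_connection, connection_record):
    # teach psycopg2 to parse my_pg_type (and my_pg_type[]) into named tuples
    register_composite('my_pg_type', dbapi_connection, globally=False)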
UPDATE - due to https://bitbucket.org/zzzeek/sqlalchemy/issue/3467/array-of-enums-does-not-allow-assigning I'm probably going to recommend people use this workaround type for now, until psycopg2 adds more built-in support for this:
import re

import sqlalchemy as sa
from sqlalchemy.dialects.postgresql import ARRAY

class ArrayOfEnum(ARRAY):
    def bind_expression(self, bindvalue):
        return sa.cast(bindvalue, self)

    def result_processor(self, dialect, coltype):
        super_rp = super(ArrayOfEnum, self).result_processor(dialect, coltype)

        def handle_raw_string(value):
            inner = re.match(r"^{(.*)}$", value).group(1)
            return inner.split(",")

        def process(value):
            return super_rp(handle_raw_string(value))
        return process
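Usage of the workaround type might look roughly like this (my sketch; the model, enum values and Base are made up / assumed from context):
from sqlalchemy.dialects.postgresql import ENUM

class Example(Base):  # hypothetical model
    __tablename__ = 'example'
    id = sa.Column(sa.Integer, primary_key=True)
    tags = sa.Column(ArrayOfEnum(ENUM('red', 'green', 'blue', name='color_enum')))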
Check out the sqlalchemy_utils documentation:
CompositeType provides means to interact with
`PostgreSQL composite types`_. Currently this type features:
* Easy attribute access to composite type fields
* Supports SQLAlchemy TypeDecorator types
* Ability to include composite types as part of PostgreSQL arrays
* Type creation and dropping
Usage:
from collections import OrderedDict

import sqlalchemy as sa
from sqlalchemy_utils import CompositeType, CurrencyType

class Account(Base):
    __tablename__ = 'account'
    id = sa.Column(sa.Integer, primary_key=True)
    balance = sa.Column(
        CompositeType(
            'money_type',
            [
                sa.Column('currency', CurrencyType),
                sa.Column('amount', sa.Integer)
            ]
        )
    )
Array Of Composites:
from sqlalchemy_utils import CompositeArray

class Account(Base):
    __tablename__ = 'account'
    id = sa.Column(sa.Integer, primary_key=True)
    balances = sa.Column(
        CompositeArray(
            CompositeType(
                'money_type',
                [
                    sa.Column('currency', CurrencyType),
                    sa.Column('amount', sa.Integer)
                ]
            )
        )
    )

How to do an upsert with SqlAlchemy?

I have a record that I want to exist in the database if it is not there, and if it is there already (primary key exists) I want the fields to be updated to the current state. This is often called an upsert.
The following incomplete code snippet demonstrates what will work, but it seems excessively clunky (especially if there were a lot more columns). What is the better/best way?
Base = declarative_base()

class Template(Base):
    __tablename__ = 'templates'
    id = Column(Integer, primary_key=True)
    name = Column(String(80), unique=True, index=True)
    template = Column(String(80), unique=True)
    description = Column(String(200))

    def __init__(self, Name, Template, Desc):
        self.name = Name
        self.template = Template
        self.description = Desc

def UpsertDefaultTemplate():
    sess = Session()
    desired_default = Template("default", "AABBCC", "This is the default template")
    try:
        q = sess.query(Template).filter_by(name=desired_default.name)
        existing_default = q.one()
    except sqlalchemy.orm.exc.NoResultFound:
        # default does not exist yet, so add it...
        sess.add(desired_default)
    else:
        # default already exists. Make sure the values are what we want...
        assert isinstance(existing_default, Template)
        existing_default.name = desired_default.name
        existing_default.template = desired_default.template
        existing_default.description = desired_default.description
    sess.flush()
Is there a better or less verbose way of doing this? Something like this would be great:
sess.upsert_this(desired_default, unique_key = "name")
although the unique_key kwarg is obviously unnecessary (the ORM should be able to figure this out easily), I added it just because SQLAlchemy tends to work only with the primary key. E.g.: I've been looking at whether Session.merge would be applicable, but it works only on the primary key, which in this case is an autoincrementing id and not terribly useful for this purpose.
A sample use case for this is simply when starting up a server application that may have upgraded its default expected data. ie: no concurrency concerns for this upsert.
SQLAlchemy supports ON CONFLICT with two methods on_conflict_do_update() and on_conflict_do_nothing().
Copying from the documentation:
from sqlalchemy.dialects.postgresql import insert

stmt = insert(my_table).values(user_email='a@b.com', data='inserted data')
stmt = stmt.on_conflict_do_update(
    index_elements=[my_table.c.user_email],
    index_where=my_table.c.user_email.like('%@gmail.com'),
    set_=dict(data=stmt.excluded.data)
)
conn.execute(stmt)
SQLAlchemy does have a "save-or-update" behavior, which in recent versions has been built into session.add, but previously was the separate session.save_or_update call. This is not an "upsert", but it may be good enough for your needs.
It is good that you are asking about a class with multiple unique keys; I believe this is precisely the reason there is no single correct way to do this. The primary key is also a unique key. If there were no unique constraints, only the primary key, it would be a simple enough problem: if nothing with the given ID exists, or if ID is None, create a new record; else update all other fields in the existing record with that primary key.
However, when there are additional unique constraints, there are logical issues with that simple approach. If you want to "upsert" an object, and the primary key of your object matches an existing record, but another unique column matches a different record, then what do you do? Similarly, if the primary key matches no existing record, but another unique column does match an existing record, then what? There may be a correct answer for your particular situation, but in general I would argue there is no single correct answer.
That would be the reason there is no built in "upsert" operation. The application must define what this means in each particular case.
Nowadays, SQLAlchemy provides two helpful functions, on_conflict_do_nothing and on_conflict_do_update. Those functions are useful but require you to switch from the ORM interface to the lower-level one - SQLAlchemy Core.
Although those two functions make upserting using SQLAlchemy's syntax not that difficult, these functions are far from providing a complete out-of-the-box solution to upserting.
My common use case is to upsert a big chunk of rows in a single SQL query/session execution. I usually encounter two problems with upserting:
For example, higher level ORM functionalities we've gotten used to are missing. You cannot use ORM objects but instead have to provide ForeignKeys at the time of insertion.
I'm using this following function I wrote to handle both of those issues:
def upsert(session, model, rows):
    table = model.__table__
    stmt = postgresql.insert(table)
    primary_keys = [key.name for key in inspect(table).primary_key]
    update_dict = {c.name: c for c in stmt.excluded if not c.primary_key}

    if not update_dict:
        raise ValueError("insert_or_update resulted in an empty update_dict")

    stmt = stmt.on_conflict_do_update(index_elements=primary_keys,
                                      set_=update_dict)

    seen = set()
    foreign_keys = {col.name: list(col.foreign_keys)[0].column for col in table.columns if col.foreign_keys}
    unique_constraints = [c for c in table.constraints if isinstance(c, UniqueConstraint)]

    def handle_foreignkeys_constraints(row):
        for c_name, c_value in foreign_keys.items():
            foreign_obj = row.pop(c_value.table.name, None)
            row[c_name] = getattr(foreign_obj, c_value.name) if foreign_obj else None

        for const in unique_constraints:
            unique = tuple([const, ] + [row[col.name] for col in const.columns])
            if unique in seen:
                return None
            seen.add(unique)

        return row

    rows = list(filter(None, (handle_foreignkeys_constraints(row) for row in rows)))
    session.execute(stmt, rows)
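A hypothetical call (my sketch; the Account model and its columns are made up) would pass plain dicts, one per row:
rows = [
    {"id": 1, "name": "alice", "balance": 10},
    {"id": 2, "name": "bob", "balance": 20},
]
upsert(session, Account, rows)  # conflicting ids are updated, new ids are inserted
session.commit()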
I use a "look before you leap" approach:
# first get the object from the database if it exists
# we're guaranteed to only get one or zero results
# because we're filtering by primary key
switch_command = session.query(Switch_Command).\
    filter(Switch_Command.switch_id == switch.id).\
    filter(Switch_Command.command_id == command.id).first()

# If we didn't get anything, make one
if not switch_command:
    switch_command = Switch_Command(switch_id=switch.id, command_id=command.id)

# update the stuff we care about
switch_command.output = 'Hooray!'
switch_command.lastseen = datetime.datetime.utcnow()
session.add(switch_command)

# This will generate either an INSERT or UPDATE
# depending on whether we have a new object or not
session.commit()
The advantage is that this is db-neutral and I think it's clear to read. The disadvantage is that there's a potential race condition in a scenario like the following:
we query the db for a switch_command and don't find one
we create a switch_command
another process or thread creates a switch_command with the same primary key as ours
we try to commit our switch_command
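One common way to close that race (my addition, not part of the original answer) is to attempt the insert inside a SAVEPOINT and fall back to the existing row on conflict, sketched roughly like this:
from sqlalchemy.exc import IntegrityError

try:
    # begin_nested() emits a SAVEPOINT, so only this insert is rolled back on conflict
    with session.begin_nested():
        switch_command = Switch_Command(switch_id=switch.id, command_id=command.id)
        session.add(switch_command)
except IntegrityError:
    # someone else won the race; load the row they created instead
    switch_command = session.query(Switch_Command).\
        filter(Switch_Command.switch_id == switch.id).\
        filter(Switch_Command.command_id == command.id).first()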
There are multiple answers and here comes yet another answer (YAA). Other answers are not that readable due to the metaprogramming involved. Here is an example that
Uses SQLAlchemy ORM
Shows how to create a row if there are zero rows using on_conflict_do_nothing
Shows how to update the existing row (if any) without creating a new row using on_conflict_do_update
Uses the table primary key as the constraint
A longer example is in the original question this code is related to.
import datetime

import sqlalchemy as sa
import sqlalchemy.orm as orm
from sqlalchemy import text
from sqlalchemy.dialects.postgresql import insert
from sqlalchemy.orm import Session

class PairState(Base):
    __tablename__ = "pair_state"

    # This table has a 1-to-1 relationship with Pair
    pair_id = sa.Column(sa.ForeignKey("pair.id"), nullable=False, primary_key=True, unique=True)
    pair = orm.relationship(Pair,
                            backref=orm.backref("pair_state",
                                                lazy="dynamic",
                                                cascade="all, delete-orphan",
                                                single_parent=True, ), )

    # First raw event in data stream
    first_event_at = sa.Column(sa.TIMESTAMP(timezone=True), nullable=False, server_default=text("TO_TIMESTAMP(0)"))

    # Last raw event in data stream
    last_event_at = sa.Column(sa.TIMESTAMP(timezone=True), nullable=False, server_default=text("TO_TIMESTAMP(0)"))

    # The last hypertable entry added
    last_interval_at = sa.Column(sa.TIMESTAMP(timezone=True), nullable=False, server_default=text("TO_TIMESTAMP(0)"))

    @staticmethod
    def create_first_event_if_not_exist(dbsession: Session, pair_id: int, ts: datetime.datetime):
        """Sets the first event value if it does not exist yet."""
        dbsession.execute(
            insert(PairState).
            values(pair_id=pair_id, first_event_at=ts).
            on_conflict_do_nothing()
        )

    @staticmethod
    def update_last_event(dbsession: Session, pair_id: int, ts: datetime.datetime):
        """Replaces the column last_event_at for a named pair."""
        # Based on the original example of https://stackoverflow.com/a/49917004/315168
        dbsession.execute(
            insert(PairState).
            values(pair_id=pair_id, last_event_at=ts).
            on_conflict_do_update(constraint=PairState.__table__.primary_key, set_={"last_event_at": ts})
        )

    @staticmethod
    def update_last_interval(dbsession: Session, pair_id: int, ts: datetime.datetime):
        """Replaces the column last_interval_at for a named pair."""
        dbsession.execute(
            insert(PairState).
            values(pair_id=pair_id, last_interval_at=ts).
            on_conflict_do_update(constraint=PairState.__table__.primary_key, set_={"last_interval_at": ts})
        )
The below works fine for me with a Redshift database and will also work for a combined primary key constraint.
SOURCE: this
Just a few modifications are required for creating the SQLAlchemy engine in the function start_engine().
import os

from sqlalchemy import Column, Integer, Date, MetaData
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.dialects.postgresql import insert
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from sqlalchemy.dialects import postgresql

Base = declarative_base()

def start_engine():
    engine = create_engine(os.getenv('SQLALCHEMY_URI',
                                     'postgresql://localhost:5432/upsert'))
    connect = engine.connect()
    meta = MetaData(bind=engine)
    meta.reflect(bind=engine)
    return engine

class DigitalSpend(Base):
    __tablename__ = 'digital_spend'
    report_date = Column(Date, nullable=False)
    day = Column(Date, nullable=False, primary_key=True)
    impressions = Column(Integer)
    conversions = Column(Integer)

    def __repr__(self):
        return str([getattr(self, c.name, None) for c in self.__table__.c])

def compile_query(query):
    compiler = query.compile if not hasattr(query, 'statement') else query.statement.compile
    return compiler(dialect=postgresql.dialect())

def upsert(session, model, rows, as_of_date_col='report_date', no_update_cols=[]):
    table = model.__table__

    stmt = insert(table).values(rows)

    update_cols = [c.name for c in table.c
                   if c not in list(table.primary_key.columns)
                   and c.name not in no_update_cols]

    on_conflict_stmt = stmt.on_conflict_do_update(
        index_elements=table.primary_key.columns,
        set_={k: getattr(stmt.excluded, k) for k in update_cols},
        index_where=(getattr(model, as_of_date_col) < getattr(stmt.excluded, as_of_date_col))
    )

    print(compile_query(on_conflict_stmt))
    session.execute(on_conflict_stmt)

session = start_engine()

upsert(session, DigitalSpend, initial_rows, no_update_cols=['conversions'])
This allows access to the underlying models based on string names
def get_class_by_tablename(tablename):
    """Return class reference mapped to table.
    https://stackoverflow.com/questions/11668355/sqlalchemy-get-model-from-table-name-this-may-imply-appending-some-function-to
    :param tablename: String with name of table.
    :return: Class reference or None.
    """
    for c in Base._decl_class_registry.values():
        if hasattr(c, '__tablename__') and c.__tablename__ == tablename:
            return c

sqla_tbl = get_class_by_tablename(table_name)

def handle_upsert(record_dict, table):
    """
    handles updates when there are primary key conflicts
    """
    try:
        self.active_session().add(table(**record_dict))
    except:
        # Here we'll assume the error is caused by an integrity error
        # We do this because the error classes are passed from the
        # underlying package (pyodbc / sqlite) SQLAlchemy doesn't mask
        # them with its own code - this should be updated to have
        # explicit error handling for each new db engine
        # <update>add explicit error handling for each db engine</update>
        active_session.rollback()
        # Query for the conflicting class, use the update method to change values based on the dict
        c_tbl_primary_keys = [i.name for i in table.__table__.primary_key]  # List of primary key col names
        c_tbl_cols = dict(sqla_tbl.__table__.columns)  # String:Col Object crosswalk
        c_query_dict = {k: record_dict[k] for k in c_tbl_primary_keys if k in record_dict}  # sub-dict from data of primary key:values
        c_oo_query_dict = {c_tbl_cols[k]: v for (k, v) in c_query_dict.items()}  # col-object:query value for primary key cols
        c_target_record = session.query(sqla_tbl).filter(*[k == v for (k, v) in c_oo_query_dict.items()]).first()
        # apply new data values to the existing record
        for k, v in record_dict.items():
            setattr(c_target_record, k, v)
This works for me with sqlite3 and postgres, although it might fail with combined primary key constraints and will most likely fail with additional unique constraints.
try:
    t = self._meta.tables[data['table']]
except KeyError:
    self._log.error('table "%s" unknown', data['table'])
    return

try:
    q = insert(t, values=data['values'])
    self._log.debug(q)
    self._db.execute(q)
except IntegrityError:
    self._log.warning('integrity error')
    where_clause = [c.__eq__(data['values'][c.name]) for c in t.c if c.primary_key]
    update_dict = {c.name: data['values'][c.name] for c in t.c if not c.primary_key}
    q = update(t, values=update_dict).where(*where_clause)
    self._log.debug(q)
    self._db.execute(q)
except Exception as e:
    self._log.error('%s: %s', t.name, e)
As we had problems with generated default ids and references, which lead to ForeignKeyViolation errors like
update or delete on table "..." violates foreign key constraint
Key (id)=(...) is still referenced from table "...".
we had to exclude the id from the update dict, as otherwise it would always be regenerated as a new default value.
In addition, the method returns the created/updated entity.
from sqlalchemy import select
from sqlalchemy.dialects.postgresql import insert  # Important to use the postgresql insert

def upsert(session, data, key_columns, model):
    stmt = insert(model).values(data)

    # Important to exclude the ID for update!
    exclude_for_update = [model.id.name, *key_columns]
    update_dict = {c.name: c for c in stmt.excluded if c.name not in exclude_for_update}

    stmt = stmt.on_conflict_do_update(
        index_elements=key_columns,
        set_=update_dict
    ).returning(model)

    orm_stmt = (
        select(model)
        .from_statement(stmt)
        .execution_options(populate_existing=True)
    )
    return session.execute(orm_stmt).scalar()
Example:
class UpsertUser(Base):
    __tablename__ = 'upsert_user'

    id = Column(Id, primary_key=True, default=uuid.uuid4)
    name: str = Column(sa.String, nullable=False)
    user_sid: str = Column(sa.String, nullable=False, unique=True)
    house_admin = relationship('UpsertHouse', back_populates='admin', uselist=False)

class UpsertHouse(Base):
    __tablename__ = 'upsert_house'

    id = Column(Id, primary_key=True, default=uuid.uuid4)
    admin_id: Id = Column(Id, ForeignKey('upsert_user.id'), nullable=False)
    admin: UpsertUser = relationship('UpsertUser', back_populates='house_admin', uselist=False)

# Usage
upserted_user = upsert(session, updated_user, [UpsertUser.user_sid.name], UpsertUser)
Note: only tested on PostgreSQL, but it could also work for other DBs which support ON DUPLICATE KEY UPDATE, e.g. MySQL.
In case of sqlite, the sqlite_on_conflict='REPLACE' option can be used when defining a UniqueConstraint, and sqlite_on_conflict_unique for unique constraint on a single column. Then session.add will work in a way just like upsert. See the official documentation.
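For illustration, a declarative model using that option might look like this (a sketch; the Word model is made up, and the option only affects the SQLite DDL):
import sqlalchemy as sa
from sqlalchemy.ext.declarative import declarative_base

Base = declarative_base()

class Word(Base):  # hypothetical example model
    __tablename__ = 'words'

    id = sa.Column(sa.Integer, primary_key=True)
    word = sa.Column(sa.String, nullable=False)

    # renders "UNIQUE (word) ON CONFLICT REPLACE" in the SQLite CREATE TABLE,
    # so session.add() of a duplicate replaces the existing row instead of failing
    __table_args__ = (sa.UniqueConstraint('word', sqlite_on_conflict='REPLACE'),)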
I use this code for upsert
Before using this code, you should add primary keys to the table in the database.
from sqlalchemy import create_engine
from sqlalchemy import MetaData, Table
from sqlalchemy.inspection import inspect
from sqlalchemy.engine.reflection import Inspector
from sqlalchemy.dialects.postgresql import insert

def upsert(df, engine, table_name, schema=None, chunk_size=1000):
    metadata = MetaData(schema=schema)
    metadata.bind = engine

    table = Table(table_name, metadata, schema=schema, autoload=True)

    # only use common columns between df and table.
    table_columns = {column.name for column in table.columns}
    df_columns = set(df.columns)
    intersection_columns = table_columns.intersection(df_columns)
    df1 = df[list(intersection_columns)]
    records = df1.to_dict('records')

    # get list of fields making up primary key
    primary_keys = [key.name for key in inspect(table).primary_key]

    with engine.connect() as conn:
        chunks = [records[i:i + chunk_size] for i in range(0, len(records), chunk_size)]
        for chunk in chunks:
            stmt = insert(table).values(chunk)
            update_dict = {c.name: c for c in stmt.excluded if not c.primary_key}
            s = stmt.on_conflict_do_update(
                index_elements=primary_keys,
                set_=update_dict)
            conn.execute(s)
