Fast insert (on conflict) of many rows into a Postgres DB with Python

I want to write messages from a websocket to a postgres-DB running on a Raspberry Pi.
The websocket delivers about 30 messages/second on average, but during peaks it reaches up to 250 messages/second.
I implemented a Python program that receives the messages and writes them to the database with the SQLAlchemy ORM. For each message I first check whether the primary key already exists and then do an update or an insert, and afterwards I always commit, so it gets very slow: I can write at most 30 messages/second to the database. During peak times this is a problem.
So I tested several approaches to speed things up.
This is my best approach:
I first build all the single queries (with psycopg2), then join them together and send the complete query string to the database to execute at once. That speeds things up to about 580 messages/second.
Create the table for the test data:
CREATE TABLE transactions (
    id int NOT NULL PRIMARY KEY,
    name varchar(255),
    description varchar(255),
    country_name varchar(255),
    city_name varchar(255),
    cost varchar(255),
    currency varchar(255),
    created_at DATE,
    billing_type varchar(255),
    language varchar(255),
    operating_system varchar(255)
);
Example copied from https://medium.com/technology-nineleaps/mysql-sqlalchemy-performance-b123584eb833
Python test script:
import random
import time

from faker import Faker
import psycopg2
from psycopg2.extensions import AsIs

"""psycopg2"""
# connection details for the test database
psycopg2_conn = {'host': '192.168.176.101',
                 'dbname': 'test',
                 'user': 'blabla',
                 'password': 'blabla'}
connection_psycopg2 = psycopg2.connect(**psycopg2_conn)

myFactory = Faker()


def random_data():
    # build 300 rows of fake data, keyed by id
    billing_type_list = ['cheque', 'cash', 'credit', 'debit', 'e-wallet']
    language_list = ['English', 'Bengali', 'Kannada']
    operating_system = 'linux'
    random_dic = {}
    for i in range(0, 300):
        id = int(i)
        name = myFactory.name()
        description = myFactory.text()
        country_name = myFactory.country()
        city_name = myFactory.city()
        cost = str(myFactory.random_digit_not_null())
        currency = myFactory.currency_code()
        created_at = myFactory.date_time_between(start_date="-30y", end_date="now", tzinfo=None)
        billing_type = random.choice(billing_type_list)
        language = random.choice(language_list)
        random_dic[id] = {}
        for xname in ['id', 'name', 'description', 'country_name', 'city_name', 'cost', 'currency',
                      'created_at', 'billing_type', 'language', 'operating_system']:
            random_dic[id][xname] = locals()[xname]
        print(id)
    return random_dic
def single_insert_on_conflict_psycopg2(idic, icur):
    # Build one INSERT ... ON CONFLICT DO UPDATE statement for a single row.
    cur = icur
    columns = idic.keys()
    columns_with_excludephrase = ['EXCLUDED.{}'.format(column) for column in columns]
    values = [idic[column] for column in columns]
    insert_statement = """
    insert into transactions (%s) values %s
    ON CONFLICT ON CONSTRAINT transactions_pkey
    DO UPDATE SET (%s) = (%s)
    """
    # insert_statement = 'insert into transactions (%s) values %s'
    print(','.join(columns))
    print(','.join(columns_with_excludephrase))
    print(tuple(values))
    xquery = cur.mogrify(insert_statement, (
        AsIs(','.join(columns)),
        tuple(values),
        AsIs(','.join(columns)),
        AsIs(','.join(columns_with_excludephrase))
    ))
    print(xquery)
    return xquery
def complete_run_psycopg2(random_dic):
    querylist = []
    starttime = time.time()
    cur = connection_psycopg2.cursor()
    for key in random_dic:
        print(key)
        query = single_insert_on_conflict_psycopg2(idic=random_dic[key],
                                                   icur=cur)
        querylist.append(query.decode("utf-8"))
    complete_query = ';'.join(querylist)
    cur.execute(complete_query)
    connection_psycopg2.commit()
    cur.close()
    endtime = time.time()
    xduration = endtime - starttime
    write_sec = len(random_dic) / xduration
    print('complete Duration:{}'.format(xduration))
    print('writes per second:{}'.format(write_sec))
    return write_sec


def main():
    random_dic = random_data()
    complete_run_psycopg2(random_dic)
    return


if __name__ == '__main__':
    main()
Now my question: is this a proper approach? Or are there any pitfalls I haven't considered?

First, you cannot insert column names as parameters like that. I would use .format to inject the column names, and then use %s placeholders for the values.
SQL = 'INSERT INTO transactions ({}) VALUES (%s,%s,%s,%s,%s,%s)'.format(','.join(columns))
db.Pcursor().execute(SQL, value1, value2, value3)
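For the bulk path itself, psycopg2's execute_values helper batches many rows into a single statement; a minimal sketch against the transactions table from the question (column list taken from the schema above, conflict target assumed to be the id primary key):
from psycopg2.extras import execute_values

def upsert_rows(conn, rows):
    # rows: iterable of dicts keyed like the columns of the transactions table
    cols = ['id', 'name', 'description', 'country_name', 'city_name', 'cost',
            'currency', 'created_at', 'billing_type', 'language', 'operating_system']
    sql = (
        'INSERT INTO transactions ({cols}) VALUES %s '
        'ON CONFLICT (id) DO UPDATE SET ({cols}) = ({excluded})'
    ).format(
        cols=','.join(cols),
        excluded=','.join('EXCLUDED.' + c for c in cols),
    )
    values = [[row[c] for c in cols] for row in rows]
    with conn.cursor() as cur:
        # execute_values expands the single %s into one multi-row VALUES list
        execute_values(cur, sql, values, page_size=500)
    conn.commit()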
Second, you will get better speed if you use async processing.
Fortunately for you, I wrote a gevent async library for psycopg2 that you can use. It makes the process far easier; it is async, threaded, and pooled.
Python Postgres psycopg2 ThreadedConnectionPool exhausted

Related

Slow SQLite update with Python

I have an SQLite database that I've built, and it gets both added to and updated on a weekly basis. The issue I have is that the update seems to take a very long time (roughly 2 hours without the transaction table). I'm hoping there is a faster way to do this. What the script does is read from a CSV and update the database line by line in a loop.
An example data entry would be:
JohnDoe123 018238e1f5092c66d896906bfbcf9abf5abe978975a8852eb3a78871e16b4268
The code that I use is:
# updates reported table
def update_sha(conn, sha, ID, op):
    sql_update_reported = 'UPDATE reported SET sha = ? WHERE ID = ? AND operator = ?'
    sql_update_blocked = 'UPDATE blocked SET sha = ? WHERE ID = ? AND operator = ?'
    sql_update_trans = 'UPDATE transactions SET sha = ? WHERE ID = ? AND operator = ?'
    data = (sha, ID, op)
    cur = conn.cursor()
    cur.execute(sql_update_reported, data)
    cur.execute(sql_update_blocked, data)
    cur.execute(sql_update_trans, data)
    conn.commit()
def Count(conn):
    # Creates a dataframe with the Excel sheet information and ensures the
    # IDs are read as strings
    df = pd.DataFrame()
    df = pd.read_excel("Count.xlsx", engine='openpyxl', converters={'ID': str})
    # Runs through the DataFrame once for reported
    for i in df.index:
        ID = df['ID'][i]
        Sha = df['Sha'][i]
        op = df['op'][i]
        print(i)
        with conn:
            update_sha(conn, Sha, ID, op)


if __name__ == '__main__':
    conn = create_connection(database)
    print("Updating Now..")
    Count(conn)
    conn.close()
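One common way to cut the per-row round trips and commits in a loop like this is to collect all the (sha, ID, op) tuples first and send them with executemany inside a single transaction; a minimal sketch, assuming the same three tables and an sqlite3 connection as above:
def update_all(conn, rows):
    # rows: list of (sha, ID, op) tuples built from the dataframe
    sql_update_reported = 'UPDATE reported SET sha = ? WHERE ID = ? AND operator = ?'
    sql_update_blocked = 'UPDATE blocked SET sha = ? WHERE ID = ? AND operator = ?'
    sql_update_trans = 'UPDATE transactions SET sha = ? WHERE ID = ? AND operator = ?'
    cur = conn.cursor()
    with conn:  # one transaction, one commit for the whole batch
        cur.executemany(sql_update_reported, rows)
        cur.executemany(sql_update_blocked, rows)
        cur.executemany(sql_update_trans, rows)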

Rows inserted in mysql table after application started not picked by the application

I have a use case in which I have to read rows having status = 0 from MySQL.
Table schema:
CREATE TABLE IF NOT EXISTS in_out_analytics(
    id INT AUTO_INCREMENT PRIMARY KEY,
    file_name VARCHAR(255),
    start_time BIGINT,
    end_time BIGINT,
    duration INT,
    in_count INT,
    out_count INT,
    status INT
)
I am using the code below to read data from MySQL.
persistance.py
import mysql
import mysql.connector

import conf


class DatabaseManager(object):
    # class-level variable storing the db connection
    connection = None

    def __init__(self):
        self.ip = conf.db_ip
        self.user_name = conf.db_user
        self.password = conf.db_password
        self.db_name = conf.db_name
        # Initialize database only one time in application
        if not DatabaseManager.connection:
            self.connect()
        self.cursor = DatabaseManager.connection.cursor()
        self.create_schema()

    def connect(self):
        try:
            DatabaseManager.connection = mysql.connector.connect(
                host=self.ip,
                database=self.db_name,
                user=self.user_name,
                password=self.password
            )
            print(f"Successfully connected to { self.ip } ")
        except mysql.connector.Error as e:
            print(str(e))

    def create_schema(self):
        # Create database
        # sql = f"CREATE DATABASE { self.db_name} IF NOT EXIST"
        # self.cursor.execute(sql)
        # Create table
        sql = """
        CREATE TABLE IF NOT EXISTS in_out_analytics(
            id INT AUTO_INCREMENT PRIMARY KEY,
            file_name VARCHAR(255),
            start_time BIGINT,
            end_time BIGINT,
            duration INT,
            in_count INT,
            out_count INT,
            status INT
        )"""
        self.cursor.execute(sql)

    def read_unprocessed_rows(self):
        sql = "SELECT id, start_time, end_time FROM in_out_analytics WHERE status=0;"
        self.cursor.execute(sql)
        result_set = self.cursor.fetchall()
        rows = []
        for row in result_set:
            id = row[0]
            start_time = row[1]
            end_time = row[2]
            details = {
                'id': id,
                'start_time': start_time,
                'end_time': end_time
            }
            rows.append(details)
        return rows
test.py
import time

from persistance import DatabaseManager

if __name__ == "__main__":
    # Rows which are inserted after application is started do not get processed if
    # 'DatabaseManager' is defined here
    # dm = DatabaseManager()
    while True:
        # Rows which are inserted after application is started do get processed if
        # 'DatabaseManager' is defined here
        dm = DatabaseManager()
        unprocessed_rows = dm.read_unprocessed_rows()
        print(f"unprocessed_rows: { unprocessed_rows }")
        time.sleep(2)
Problem:
The problem is that when I define the database object dm = DatabaseManager() above the while loop, new rows inserted after the application has started do not get processed, whereas if I define dm = DatabaseManager() inside the while loop, rows inserted after the application has started do get processed.
What is the problem with the above code?
Ideally, we should create only one DatabaseManager object, since this class creates a connection to MySQL, and a single connection to the database should be the ideal case.
Making an assumption here, as I cannot test it myself.
tl;dr: Add DatabaseManager.connection.commit() to your read_unprocessed_rows
When you execute your SELECT statement, a transaction is created implicitly, using the default isolation level REPEATABLE READ. That creates a snapshot of the database at that point in time and all consecutive reads in that transaction will read from the snapshot established during the first read. The effects of different isolation levels are described here. To refresh the snapshot in REPEATABLE READ, you can commit your current transaction before executing the next statement.
So, when you instantiate your DatabaseManager inside your loop, each SELECT starts a new transaction on a new connection and hence has a fresh snapshot every time. When instantiating your DatabaseManager outside the loop, the transaction created by the first SELECT keeps the same snapshot for all consecutive SELECTs, and updates from outside that transaction remain invisible.
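A minimal sketch of that change applied to read_unprocessed_rows from persistance.py above (only the commit line is new):
    def read_unprocessed_rows(self):
        sql = "SELECT id, start_time, end_time FROM in_out_analytics WHERE status=0;"
        self.cursor.execute(sql)
        result_set = self.cursor.fetchall()
        # End the implicit REPEATABLE READ transaction so the next SELECT
        # sees a fresh snapshot instead of the one taken on the first read.
        DatabaseManager.connection.commit()
        rows = []
        for row in result_set:
            rows.append({'id': row[0], 'start_time': row[1], 'end_time': row[2]})
        return rows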

SQLAlchemy unique check on postgresql insert scalability

Can someone please help me understand what I am doing wrong?
All of the below works as required, but I'm running into scalability issues -
On the first run I loaded ~70,000 rows into a blank table in ~2-3 s
On the 2nd run I loaded ~80,000 rows into the same table in ~5 min
On the 3rd run I loaded ~50,000 rows into the same table in ~30 min
On the 4th run I loaded ~120,000 rows into the same table in ~1 hr
On the 5th run I loaded ~100,000 rows into the same table in ~2 hr
Each time I run the code, I see a steady ~600 KB/s of traffic between the client & the db while this activity finishes.
So as you can see, the hash check across all those columns does not seem to scale well at all.
What is my code trying to accomplish?
I need to add daily stock data into a Postgres database. The data is updated at the source only once a day & the API response looks like this -
{'instrument_token': '210011653',
 'exchange_token': '820358',
 'tradingsymbol': 'COLG17MAY1020.00PE',
 'name': '',
 'last_price': 0.0,
 'expiry': '2017-05-25',
 'strike': 1020.0,
 'tick_size': 0.05,
 'lot_size': 700,
 'instrument_type': 'PE',
 'segment': 'BFO-OPT',
 'exchange': 'BFO'}
The items in the response & the row count change every day.
On a given day, I can fetch between 50,000 and 120,000 rows in a single response (i.e. approx 20-30 MB of CSV data). Re-sending the request fetches the same data for a given day.
So the core problem is: I want to avoid adding the same row twice to the db in case the data is fetched multiple times on the same day.
What have I tried so far -
I'm a db newbie; my thought process was to autoincrement an id & add a data_date column, so my schema looks like this -
CREATE TABLE IF NOT EXISTS instruments (
    id bigserial,
    data_date date NOT NULL,
    instrument_token integer NOT NULL,
    exchange_token integer NOT NULL,
    tradingsymbol varchar(40) NOT NULL,
    name varchar(40) NOT NULL,
    last_price numeric(15,2) NOT NULL,
    expiry date,
    strike numeric(15,2),
    tick_size numeric,
    lot_size integer,
    instrument_type varchar(10),
    segment varchar(20),
    exchange varchar(10),
    PRIMARY KEY(id)
);
I've built a class like so -
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker, mapper, relationship
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import Column, Integer, Numeric, String, MetaData, Table, ForeignKey, DateTime, union
from sqlalchemy.engine.url import URL

engine = create_engine('postgresql://blah')
Base = declarative_base(engine)


def _unique(session, cls, hashfunc, queryfunc, constructor, arg, kw):
    cache = getattr(session, '_unique_cache', None)
    if cache is None:
        session._unique_cache = cache = {}
    key = (cls, hashfunc(*arg, **kw))
    if key in cache:
        return cache[key]
    else:
        with session.no_autoflush:
            q = session.query(cls)
            q = queryfunc(q, *arg, **kw)
            obj = q.first()
            if not obj:
                obj = constructor(*arg, **kw)
                session.add(obj)
        cache[key] = obj
        return obj


class UniqueMixin(object):
    @classmethod
    def unique_hash(cls, *arg, **kw):
        raise NotImplementedError()

    @classmethod
    def unique_filter(cls, query, *arg, **kw):
        raise NotImplementedError()

    @classmethod
    def as_unique(cls, session, *arg, **kw):
        return _unique(
            session,
            cls,
            cls.unique_hash,
            cls.unique_filter,
            cls,
            arg, kw
        )
class Instrument(UniqueMixin, Base):
    __tablename__ = 'instruments'
    __table_args__ = {'autoload': True}
    __table__ = Table('instruments', Base.metadata,
                      Column('id', Integer, primary_key=True),
                      Column('data_date', String),
                      Column('instrument_token', Integer),
                      Column('exchange_token', Integer),
                      Column('tradingsymbol', String),
                      Column('name', String),
                      Column('last_price', Numeric),
                      Column('expiry', Integer),
                      Column('strike', Numeric),
                      Column('tick_size', Numeric),
                      Column('lot_size', Integer),
                      Column('instrument_type', String),
                      Column('segment', String),
                      Column('exchange', String))

    @classmethod
    def unique_hash(cls, data_date, instrument_token, exchange_token, tradingsymbol, name, last_price, expiry, strike, tick_size, lot_size, instrument_type, segment, exchange):
        return data_date, instrument_token, exchange_token, tradingsymbol, name, last_price, expiry, strike, tick_size, lot_size, instrument_type, segment, exchange

    @classmethod
    def unique_filter(cls, query, data_date, instrument_token, exchange_token, tradingsymbol, name, last_price, expiry, strike, tick_size, lot_size, instrument_type, segment, exchange):
        return query.filter(Instrument.data_date == data_date, Instrument.instrument_token == instrument_token, Instrument.exchange_token == exchange_token, Instrument.tradingsymbol == tradingsymbol, Instrument.name == name, Instrument.last_price == last_price, Instrument.expiry == expiry, Instrument.strike == strike, Instrument.tick_size == tick_size, Instrument.lot_size == lot_size, Instrument.instrument_type == instrument_type, Instrument.segment == segment, Instrument.exchange == exchange)

    def __init__(self, data_date, instrument_token, exchange_token, tradingsymbol, name, last_price, expiry, strike, tick_size, lot_size, instrument_type, segment, exchange):
        self.data_date = data_date
        self.instrument_token = instrument_token
        self.exchange_token = exchange_token
        self.tradingsymbol = tradingsymbol
        self.name = name
        self.last_price = last_price
        self.expiry = expiry
        self.strike = strike
        self.tick_size = tick_size
        self.lot_size = lot_size
        self.instrument_type = instrument_type
        self.segment = segment
        self.exchange = exchange

    def __repr__(self):
        return "<Instruments - '%s': '%s' - '%s' - '%s' - '%s' - '%s' - '%s' - '%s' - '%s' - '%s' - '%s' - '%s' - '%s' - '%s'>" % (
            self.id,
            self.data_date,
            self.instrument_token,
            self.exchange_token,
            self.tradingsymbol,
            self.name,
            self.last_price,
            self.expiry,
            self.strike,
            self.tick_size,
            self.lot_size,
            self.instrument_type,
            self.segment,
            self.exchange
        )
The code to insert the data looks like this -
for instrument in response:
    # print(instrument)
    if instrument['expiry'] == '':
        instrument['expiry'] = null()
    market_instrument = Instrument.as_unique(
        self.session,
        data_date=datetime.date.today().isoformat(),
        instrument_token=instrument['instrument_token'],
        exchange_token=instrument['exchange_token'],
        tradingsymbol=instrument['tradingsymbol'],
        name=instrument['name'],
        last_price=instrument['last_price'],
        expiry=instrument['expiry'],
        strike=instrument['strike'],
        tick_size=instrument['tick_size'],
        lot_size=instrument['lot_size'],
        instrument_type=instrument['instrument_type'],
        segment=instrument['segment'],
        exchange=instrument['exchange'],
    )
    self.session.add(market_instrument)
self.session.commit()
Options I am considering
What do you think is best?
Option 1
No longer use as_unique()
Create one more data_update_date table (data_date (primary), status (boolean)) which is updated at the end of a successful daily insert
Check data_update_date for today's date, & skip the add for the entire block if it exists
However, this option does not help me learn whether there was another mistake in my as_unique functions that needs to be corrected
Option 2
Set up a new db with powa & profile
Find & fix bottlenecks
I am using the official postgres docker image; I ran into a dead end extending the Debian base with hypopg & other required extensions
It looks like CentOS will be much simpler, so I'm creating a new dockerfile to do this
However, since I'm a total newbie with postgresql & sqlalchemy, I also need your opinion on whether my code has some obvious issues
Option 3
Hash only a few columns
I could hash just the first 3, excluding id
However, I don't know how to do this
Just reducing the hash classdef parameters causes the number of parameters to be less than defined in the class, so the insert fails
Option 4
I'm not married to either postgresql or sqlalchemy
Should I be using a non-ORM method instead?
Or should I be using something other than a db to store this kind of data?
I'm running this on an m2.large instance on AWS, which should have the right kind of performance, so maybe I am using the wrong method to store the data
If this is the situation during insert, multiple threads doing technical analysis will simply be unusable...
Should I be using something like Hadoop instead?
Also, an obvious drawback of this option is another learning curve to scale with Hadoop...
I ran some db profiling on the bulk insert operation.
The cache hit ratio is 100%.
I do not see any disk IO.
Sorry, I can't post more than 2 links right now, so I can't show you the charts for the hit ratio & disk hits; you'll just have to take my word for it :)
The as_unique method basically works in an extremely inefficient way which hits the db with a crazy large number of queries. If anything, I guess this just served as a good benchmark for this server build+config, and it leaves me very satisfied with its performance for cache-friendly workloads.
As pointed out by hints from various responses, the bottleneck lies in the schema as well as in the way the insert is implemented in the code.
I fixed the problem like this -
1. Add a multi-column unique index
CREATE UNIQUE INDEX market_daily_uq_idx ON instruments (
    data_date,
    instrument_token,
    exchange_token,
    tradingsymbol,
    instrument_type,
    segment,
    exchange
);
2. Use .on_conflict_do_nothing()
# on_conflict_do_nothing() is available on the Postgres dialect's insert construct
from sqlalchemy.dialects.postgresql import insert

statement = insert(Instrument).values(
    data_date=datetime.date.today().isoformat(),
    instrument_token=instrument['instrument_token'],
    exchange_token=instrument['exchange_token'],
    tradingsymbol=instrument['tradingsymbol'],
    name=instrument['name'],
    last_price=instrument['last_price'],
    expiry=instrument['expiry'],
    strike=instrument['strike'],
    tick_size=instrument['tick_size'],
    lot_size=instrument['lot_size'],
    instrument_type=instrument['instrument_type'],
    segment=instrument['segment'],
    exchange=instrument['exchange'],
).on_conflict_do_nothing()
self.session.execute(statement)
self.session.commit()
This works very well & things are much faster now, thereby solving the core issue
Thank you all very much for all the help, hints & advice!
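A possible further refinement, sketched here as an assumption (not measured above): skip the per-row .values() call, build the statement once, and pass all row dicts to a single execute so SQLAlchemy sends them in one executemany-style round with a single commit:
from sqlalchemy.dialects.postgresql import insert

stmt = insert(Instrument).on_conflict_do_nothing()
rows = [
    {
        'data_date': datetime.date.today().isoformat(),
        'instrument_token': instrument['instrument_token'],
        'exchange_token': instrument['exchange_token'],
        'tradingsymbol': instrument['tradingsymbol'],
        'name': instrument['name'],
        'last_price': instrument['last_price'],
        # '' becomes NULL, like the null() check in the original loop
        'expiry': instrument['expiry'] or None,
        'strike': instrument['strike'],
        'tick_size': instrument['tick_size'],
        'lot_size': instrument['lot_size'],
        'instrument_type': instrument['instrument_type'],
        'segment': instrument['segment'],
        'exchange': instrument['exchange'],
    }
    for instrument in response
]
self.session.execute(stmt, rows)  # one executemany round instead of one execute per row
self.session.commit()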

Sqlalchemy bulk update in MySQL works very slow

I'm using SQLAlchemy 1.0.0 and want to make some UPDATE ONLY queries (update if the primary key matches, else do nothing) in batch.
I've made some experiments and found that bulk update looks much slower than bulk insert or bulk upsert.
Could you please help me point out why it works so slowly, or is there any alternative way/idea to make a BULK UPDATE (not BULK UPSERT) with SQLAlchemy?
Below is the table in MySQL:
CREATE TABLE `test` (
    `id` int(11) unsigned NOT NULL,
    `value` int(11) DEFAULT NULL,
    PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
And the test code:
from sqlalchemy import create_engine, text
import time

driver = 'mysql'
host = 'host'
user = 'user'
password = 'password'
database = 'database'
url = "{}://{}:{}@{}/{}?charset=utf8".format(driver, user, password, host, database)

engine = create_engine(url)
engine.connect()

engine.execute('TRUNCATE TABLE test')

num_of_rows = 1000

rows = []
for i in xrange(0, num_of_rows):
    rows.append({'id': i, 'value': i})

print '--------- test insert --------------'
sql = '''
    INSERT INTO test (id, value)
    VALUES (:id, :value)
'''
start = time.time()
engine.execute(text(sql), rows)
end = time.time()
print 'Cost {} seconds'.format(end - start)

print '--------- test upsert --------------'
for r in rows:
    r['value'] = r['id'] + 1

sql = '''
    INSERT INTO test (id, value)
    VALUES (:id, :value)
    ON DUPLICATE KEY UPDATE value = VALUES(value)
'''
start = time.time()
engine.execute(text(sql), rows)
end = time.time()
print 'Cost {} seconds'.format(end - start)

print '--------- test update --------------'
for r in rows:
    r['value'] = r['id'] * 10

sql = '''
    UPDATE test
    SET value = :value
    WHERE id = :id
'''
start = time.time()
engine.execute(text(sql), rows)
end = time.time()
print 'Cost {} seconds'.format(end - start)
The output when num_of_rows = 100:
--------- test insert --------------
Cost 0.568960905075 seconds
--------- test upsert --------------
Cost 0.569655895233 seconds
--------- test update --------------
Cost 20.0891299248 seconds
The output when num_of_rows = 1000:
--------- test insert --------------
Cost 0.807548999786 seconds
--------- test upsert --------------
Cost 0.584554195404 seconds
--------- test update --------------
Cost 206.199367046 seconds
The network latency to the database server is around 500 ms.
It looks like the bulk update sends and executes each query one by one, not in a batch?
Thanks in advance.
You can speed up bulk update operations with a trick, even if the database server (as in your case) has very high latency. Instead of updating your table directly, you use a stage table to insert your new data very fast, then do one join-update into the destination table. This also has the advantage that you reduce the number of statements you have to send to the database quite dramatically.
How does this work with UPDATEs?
Say you have a table entries and you have new data coming in all the time, but you only want to update those which have already been stored. You create a copy of your destination-table entries_stage with only the relevant fields in it:
entries = Table('entries', metadata,
    Column('id', Integer, autoincrement=True, primary_key=True),
    Column('value', Unicode(64), nullable=False),
)
entries_stage = Table('entries_stage', metadata,
    Column('id', Integer, autoincrement=False, unique=True),
    Column('value', Unicode(64), nullable=False),
)
Then you insert your data with a bulk-insert. This can be sped up even further if you use MySQL's multiple value insert syntax, which isn't natively supported by SQLAlchemy, but can be built without much difficulty.
INSERT INTO entries_stage (`id`, `value`)
VALUES
    (1, 'string1'), (2, 'string2'), (3, 'string3'), ...;
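One way to build that multi-row VALUES statement from Python while keeping everything as bound parameters (a sketch, assuming rows is a non-empty list of {'id': ..., 'value': ...} dicts and text is imported from sqlalchemy as in the question's script):
def bulk_insert_stage(engine, rows):
    # one (:id_N, :value_N) group per row, all values sent as bound parameters
    placeholders = ', '.join(
        '(:id_{0}, :value_{0})'.format(i) for i in range(len(rows))
    )
    params = {}
    for i, row in enumerate(rows):
        params['id_{}'.format(i)] = row['id']
        params['value_{}'.format(i)] = row['value']
    sql = 'INSERT INTO entries_stage (id, value) VALUES ' + placeholders
    engine.execute(text(sql), **params)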
In the end, you update the values of the destination-table with the values from the stage-table like this:
UPDATE entries e
JOIN entries_stage es ON e.id = es.id
SET e.value = es.value
WHERE e.value != es.value;
Then you're done.
What about inserts?
This also works to speed up inserts, of course. As you already have the data in the stage table, all you need to do is issue an INSERT INTO ... SELECT statement with the data which is not in the destination table yet.
INSERT INTO entries (id, value)
SELECT es.id, es.value
FROM entries_stage es
LEFT JOIN entries e ON e.id = es.id
WHERE e.id IS NULL;
The nice thing about this is that you don't have to use INSERT IGNORE, REPLACE or ON DUPLICATE KEY UPDATE, which will increment your primary key even when they end up doing nothing.
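Tying the three steps together through SQLAlchemy (a rough sketch; table names as defined above, rows as a list of {'id': ..., 'value': ...} dicts, engine and text as in the question):
def sync_entries(engine, rows):
    # 1. Refill the stage table with one bulk insert (executemany here;
    #    the multi-value variant above is faster still).
    engine.execute('TRUNCATE TABLE entries_stage')
    engine.execute(
        text('INSERT INTO entries_stage (id, value) VALUES (:id, :value)'),
        rows,
    )
    # 2. One join-update for all rows whose value changed.
    engine.execute(
        'UPDATE entries e '
        'JOIN entries_stage es ON e.id = es.id '
        'SET e.value = es.value '
        'WHERE e.value != es.value'
    )
    # 3. One insert-select for rows not yet present in the destination table.
    engine.execute(
        'INSERT INTO entries (id, value) '
        'SELECT es.id, es.value FROM entries_stage es '
        'LEFT JOIN entries e ON e.id = es.id '
        'WHERE e.id IS NULL'
    )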

Getting the id of the last record inserted for Postgresql SERIAL KEY with Python

I am using SQLAlchemy without the ORM, i.e. using hand-crafted SQL statements to interact directly with the backend database. I am using PostgreSQL as my backend database (psycopg2 as the DB driver) in this instance - I don't know if that affects the answer.
I have statements like this; for brevity, assume that conn is a valid connection to the database:
conn.execute("INSERT INTO user (name, country_id) VALUES ('Homer', 123)")
Assume also that the user table consists of the columns (id [SERIAL PRIMARY KEY], name, country_id)
How may I obtain the id of the new user, ideally, without hitting the database again?
You might be able to use the RETURNING clause of the INSERT statement like this:
result = conn.execute("INSERT INTO user (name, country_id) VALUES ('Homer', 123) RETURNING *")
If you only want the resulting id:
result = conn.execute("INSERT INTO user (name, country_id) VALUES ('Homer', 123) RETURNING id")
[new_id] = result.fetchone()
Use lastrowid
result = conn.execute("INSERT INTO user (name, country_id) VALUES ('Homer', 123)")
result.lastrowid
Current SQLAlchemy documentation suggests
result.inserted_primary_key should work!
Python + SQLAlchemy
After commit, you get the primary key column id (autoincremented) updated in your object.
db.session.add(new_usr)
db.session.commit()  # will insert the new_usr data into the database AND retrieve the id
idd = new_usr.usrID  # usrID is the autoincremented primary_key column
return jsonify(idd), 201  # usrID = 12, correct id from table User in the database
This question has been asked many times on Stack Overflow and no answer I have seen is comprehensive. Googling 'sqlalchemy insert get id of new row' brings up a lot of them.
There are three levels to SQLAlchemy.
Top: the ORM.
Middle: Database abstraction (DBA) with Table classes etc.
Bottom: SQL using the text function.
To an OO programmer the ORM level looks natural, but to a database programmer it looks ugly and the ORM gets in the way. The DBA layer is an OK compromise. The SQL layer looks natural to database programmers and would look alien to an OO-only programmer.
Each level has its own syntax, similar but different enough to be frustrating. On top of this there is almost too much documentation online, which makes it very hard to find the answer.
I will describe how to get the inserted id AT THE SQL LAYER for the RDBMSs I use.
Table: user(user_id integer primary autoincrement key, user_name string)
conn: a Connection obtained within SQLAlchemy to the DBMS you are using.
SQLite
======
insstmt = text(
    '''INSERT INTO user (user_name)
       VALUES (:usernm) ''')
# Execute within a transaction (optional)
txn = conn.begin()
result = conn.execute(insstmt, usernm='Jane Doe')
# The id!
recid = result.lastrowid
txn.commit()

MS SQL Server
=============
insstmt = text(
    '''INSERT INTO user (user_name)
       OUTPUT inserted.record_id
       VALUES (:usernm) ''')
txn = conn.begin()
result = conn.execute(insstmt, usernm='Jane Doe')
# The id!
recid = result.fetchone()[0]
txn.commit()

MariaDB/MySQL
=============
insstmt = text(
    '''INSERT INTO user (user_name)
       VALUES (:usernm) ''')
txn = conn.begin()
result = conn.execute(insstmt, usernm='Jane Doe')
# The id!
recid = conn.execute(text('SELECT LAST_INSERT_ID()')).fetchone()[0]
txn.commit()

Postgres
========
insstmt = text(
    '''INSERT INTO user (user_name)
       VALUES (:usernm)
       RETURNING user_id ''')
txn = conn.begin()
result = conn.execute(insstmt, usernm='Jane Doe')
# The id!
recid = result.fetchone()[0]
txn.commit()
result.inserted_primary_key
Worked for me. The only thing to note is that this returns a list that contains the last inserted id.
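For example (a sketch, assuming user_table is a SQLAlchemy Core Table for the user table from the question):
result = conn.execute(user_table.insert().values(name='Homer', country_id=123))
[new_id] = result.inserted_primary_key  # a one-element sequence holding the new id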
Make sure you use fetchrow/fetch to receive the returning object
insert_stmt = user.insert().values(name="homer", country_id="123").returning(user.c.id)
row_id = await conn.fetchrow(insert_stmt)
For Postgres inserts from Python code, it is simple to use the RETURNING keyword with the col_id (the name of the column whose last inserted row id you want) at the end of the insert statement.
Syntax -
from sqlalchemy import create_engine

conn_string = "postgresql://USERNAME:PSWD@HOSTNAME/DATABASE_NAME"
db = create_engine(conn_string)
conn = db.connect()

INSERT INTO emp_table (col_id, Name, Age)
VALUES (3, 'xyz', 30) RETURNING col_id;
or
(if the col_id column is auto-incremented)
insert_sql = "INSERT INTO emp_table (Name, Age) VALUES ('xyz', 30) RETURNING col_id;"
result = conn.execute(insert_sql)
[last_row_id] = result.fetchone()
print(last_row_id)
# output = 3
