Looking for a better strategy for an SQLAlchemy bulk upsert - python

I have a Flask application with a RESTful API. One of the API calls is a 'mass upsert' call with a JSON payload. I am struggling with performance.
The first thing I tried was to use merge_result on a Query object, because, according to the documentation:
This is an optimized method which will merge all mapped instances, preserving the structure of the result rows and unmapped columns with less method overhead than that of calling Session.merge() explicitly for each value.
This was the initial code:
class AdminApiUpdateTasks(Resource):
    """Bulk task creation / update endpoint"""

    def put(self, slug):
        taskdata = json.loads(request.data)
        existing = db.session.query(Task).filter_by(challenge_slug=slug)
        existing.merge_result(
            [task_from_json(slug, **task) for task in taskdata])
        db.session.commit()
        return {}, 200
A request to that endpoint with ~5000 records, all of them already existing in the database, takes more than 11m to return:
real 11m36.459s
user 0m3.660s
sys 0m0.391s
As this would be a fairly typical use case, I started looking into alternatives to improve performance. Against my better judgement, I tried merging each individual record into the session:
class AdminApiUpdateTasks(Resource):
    """Bulk task creation / update endpoint"""

    def put(self, slug):
        # Get the posted data
        taskdata = json.loads(request.data)
        for task in taskdata:
            db.session.merge(task_from_json(slug, **task))
        db.session.commit()
        return {}, 200
To my surprise, this turned out to be more than twice as fast:
real 4m33.945s
user 0m3.608s
sys 0m0.258s
I have two questions:
Why is the second strategy using merge faster than the supposedly optimized first one that uses merge_result?
What other strategies should I pursue to optimize this more, if any?

This is an old question, but I hope this answer can still help people.
I used the same idea as this example set by SQLAlchemy, but I added benchmarking for UPSERT operations (update the record if it already exists, otherwise insert it). The results from a PostgreSQL 11 database are below:
Tests to run: test_customer_individual_orm_select, test_customer_batched_orm_select, test_customer_batched_orm_select_add_all, test_customer_batched_orm_merge_result
test_customer_individual_orm_select : UPSERT statements via individual checks on whether objects exist and add new objects individually (10000 iterations); total time 9.359603 sec
test_customer_batched_orm_select : UPSERT statements via batched checks on whether objects exist and add new objects individually (10000 iterations); total time 1.553555 sec
test_customer_batched_orm_select_add_all : UPSERT statements via batched checks on whether objects exist and add new objects in bulk (10000 iterations); total time 1.358680 sec
test_customer_batched_orm_merge_result : UPSERT statements using batched merge_results (10000 iterations); total time 7.191284 sec
As you can see, merge_result is far from the most efficient option. I'd suggest checking in batches whether the records already exist and should be updated. Hope this helps!
"""
This series of tests illustrates different ways to UPSERT
or INSERT ON CONFLICT UPDATE a large number of rows in bulk.
"""
from sqlalchemy import Column
from sqlalchemy import create_engine
from sqlalchemy import Integer
from sqlalchemy import String
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import Session
from profiler import Profiler
Base = declarative_base()
engine = None
class Customer(Base):
__tablename__ = "customer"
id = Column(Integer, primary_key=True)
name = Column(String(255))
description = Column(String(255))
Profiler.init("bulk_upserts", num=100000)
#Profiler.setup
def setup_database(dburl, echo, num):
global engine
engine = create_engine(dburl, echo=echo)
Base.metadata.drop_all(engine)
Base.metadata.create_all(engine)
s = Session(engine)
for chunk in range(0, num, 10000):
# Insert half of the customers we want to merge
s.bulk_insert_mappings(
Customer,
[
{
"id": i,
"name": "customer name %d" % i,
"description": "customer description %d" % i,
}
for i in range(chunk, chunk + 10000, 2)
],
)
s.commit()
#Profiler.profile
def test_customer_individual_orm_select(n):
"""
UPSERT statements via individual checks on whether objects exist
and add new objects individually
"""
session = Session(bind=engine)
for i in range(0, n):
customer = session.query(Customer).get(i)
if customer:
customer.description += "updated"
else:
session.add(Customer(
id=i,
name=f"customer name {i}",
description=f"customer description {i} new"
))
session.flush()
session.commit()
#Profiler.profile
def test_customer_batched_orm_select(n):
"""
UPSERT statements via batched checks on whether objects exist
and add new objects individually
"""
session = Session(bind=engine)
for chunk in range(0, n, 1000):
customers = {
c.id: c for c in
session.query(Customer)\
.filter(Customer.id.between(chunk, chunk + 1000))
}
for i in range(chunk, chunk + 1000):
if i in customers:
customers[i].description += "updated"
else:
session.add(Customer(
id=i,
name=f"customer name {i}",
description=f"customer description {i} new"
))
session.flush()
session.commit()
#Profiler.profile
def test_customer_batched_orm_select_add_all(n):
"""
UPSERT statements via batched checks on whether objects exist
and add new objects in bulk
"""
session = Session(bind=engine)
for chunk in range(0, n, 1000):
customers = {
c.id: c for c in
session.query(Customer)\
.filter(Customer.id.between(chunk, chunk + 1000))
}
to_add = []
for i in range(chunk, chunk + 1000):
if i in customers:
customers[i].description += "updated"
else:
to_add.append({
"id": i,
"name": "customer name %d" % i,
"description": "customer description %d new" % i,
})
if to_add:
session.bulk_insert_mappings(
Customer,
to_add
)
to_add = []
session.flush()
session.commit()
#Profiler.profile
def test_customer_batched_orm_merge_result(n):
"UPSERT statements using batched merge_results"
session = Session(bind=engine)
for chunk in range(0, n, 1000):
customers = session.query(Customer)\
.filter(Customer.id.between(chunk, chunk + 1000))
customers.merge_result(
Customer(
id=i,
name=f"customer name {i}",
description=f"customer description {i} new"
) for i in range(chunk, chunk + 1000)
)
session.flush()
session.commit()
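For comparison, on PostgreSQL 9.5+ the existence check can be pushed into the database entirely with INSERT ... ON CONFLICT DO UPDATE. A minimal sketch against the same Customer model (this is not part of the benchmarks above, and it assumes the rows arrive as plain dicts):

from sqlalchemy.dialects.postgresql import insert as pg_insert

def upsert_customers(session, rows):
    # rows: list of dicts with "id", "name" and "description" keys
    stmt = pg_insert(Customer.__table__).values(rows)
    stmt = stmt.on_conflict_do_update(
        index_elements=["id"],
        set_={
            "name": stmt.excluded.name,
            "description": stmt.excluded.description,
        },
    )
    session.execute(stmt)
    session.commit()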

I suspect this query was contributing to the slowness of your first approach:
existing = db.session.query(Task).filter_by(challenge_slug=slug)
Also you should probably change this:
existing.merge_result(
    [task_from_json(slug, **task) for task in taskdata])
To:
existing.merge_result(
    (task_from_json(slug, **task) for task in taskdata))
That should save you some memory and time, since the full list won't be built in memory before it is handed to the merge_result method.

Related

How to fetch data from azure documentdb faster

I'm trying to implement this example:
https://github.com/Azure/azure-documentdb-python/blob/master/samples/DatabaseManagement/Program.py
to fetch data from Azure DocumentDB and do some visualization. However, I would like to use a query on the line where it says #error here instead.
def read_database(client, id):
    print('3. Read a database by id')
    try:
        db = next((data for data in client.ReadDatabases() if data['id'] == database_id))
        coll = next((coll for coll in client.ReadCollections(db['_self']) if coll['id'] == database_collection))
        return list(itertools.islice(client.ReadDocuments(coll['_self']), 0, 100, 1))
    except errors.DocumentDBError as e:
        if e.status_code == 404:
            print('A Database with id \'{0}\' does not exist'.format(id))
        else:
            raise errors.HTTPFailure(e.status_code)
The fetching is really slow when I want to get more than 10k items. How can I improve this?
Thanks!
You can't query documents directly through the database entity.
The parameters of the ReadDocuments() method used in your code should be the collection link and, optionally, the feed options.
def ReadDocuments(self, collection_link, feed_options=None):
    """Reads all documents in a collection.

    :Parameters:
        - `collection_link`: str, the link to the document collection.
        - `feed_options`: dict

    :Returns:
        query_iterable.QueryIterable
    """
    if feed_options is None:
        feed_options = {}

    return self.QueryDocuments(collection_link, None, feed_options)
So, you could modify your code as below:
# Initialize the Python DocumentDB client
client = document_client.DocumentClient(config['ENDPOINT'], {'masterKey': config['MASTERKEY']})

db = "db"
coll = "coll"

try:
    database_link = 'dbs/' + db
    database = client.ReadDatabase(database_link)
    collection_link = 'dbs/' + db + "/colls/" + coll
    collection = client.ReadCollection(collection_link)
    # options = {}
    # options['enableCrossPartitionQuery'] = True
    # options['partitionKey'] = 'jay'
    docs = client.ReadDocuments(collection_link)
    print(list(docs))
except errors.DocumentDBError as e:
    if e.status_code == 404:
        print('A Database with id \'{0}\' does not exist'.format(id))
    else:
        raise errors.HTTPFailure(e.status_code)
If you want to query a partition of your collection, add the snippet of code that is commented out in the code above:
options = {}
options['enableCrossPartitionQuery'] = True
options['partitionKey'] = 'jay'
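If you only need a subset of the documents, it is usually faster to push a filter into a query instead of reading everything with ReadDocuments(). A rough sketch, assuming a hypothetical category field on your documents:

query = {
    'query': 'SELECT * FROM c WHERE c.category = @category',
    'parameters': [{'name': '@category', 'value': 'some-value'}]
}
options = {'enableCrossPartitionQuery': True, 'maxItemCount': 1000}
docs = list(client.QueryDocuments(collection_link, query, options))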
It seems that your issue is focused on Azure Cosmos DB query performance.
You could refer to the following points to improve query performance.
Partitioning
You could set partition keys in your database and query with a filter clause on a single partition key, so that queries have lower latency and consume fewer RUs.
Throughput
You could provision higher throughput so that Azure Cosmos DB can serve more requests per unit of time. Of course, this leads to higher costs.
Indexing Policy
The use of indexing paths can offer improved performance and lower latency.
For more details, I recommend that you refer to the official performance documentation.
Hope it helps you.

How to do batch insertion on Neo4J with Python

I have code that inserts many nodes and relationships:
from neo4jrestclient.client import GraphDatabase
from neo4jrestclient import client
import psycopg2

db = GraphDatabase("http://127.0.0.1:7474", username="neo4j", password="1234")
conn = psycopg2.connect("\
    dbname='bdTrmmTest'\
    user='postgres'\
    host='127.0.0.1'\
    password='1234'\
")

inicio = 0
while (inicio <= 4429640):
    c = conn.cursor()
    c.execute("SELECT p.latitude, p.longitude, h.precipitacaoh, h.datah, h.horah FROM pontos AS p, historico AS h WHERE p.gid = h.gidgeo_fk LIMIT 1640 OFFSET %d" % (inicio))
    sensorlatlong = db.labels.create("LaLo")
    sensorprecip = db.labels.create("Precipitacao")
    sensordata = db.labels.create("Data")
    sensorhora = db.labels.create("Hora")
    records = c.fetchall()
    for i in records:
        s2 = db.nodes.create(precipitacao=i[2])
        sensorprecip.add(s2)
        s5 = db.nodes.create(horah=i[4])
        sensorhora.add(s5)
        s5.relationships.create("REGISTROU", s2)
        q = 'MATCH (s:LaLo) WHERE s.latitude = "%s" AND s.longitude = "%s" RETURN s' % (str(i[0]), str(i[1]))
        results = db.query(q, returns=(client.Node))
        q2 = 'MATCH (s:LaLo)-->(d:Data)-->(h:Hora)-->(p:Precipitacao) WHERE s.latitude = "%s" AND s.longitude = "%s" AND d.datah = "%s" RETURN d' % (str(i[0]), str(i[1]), str(i[3]))
        results1 = db.query(q2, returns=(client.Node))
        if (len(results) > 0):
            n = results[0].pop()
            if (len(results1) > 0):
                n1 = results1[0].pop()
                n1.relationships.create("AS", s5)
            else:
                s4 = db.nodes.create(datah=i[3])
                sensordata.add(s4)
                n.relationships.create("EM", s4)
                s4.relationships.create("AS", s5)
        else:
            s3 = db.nodes.create(latitude=i[0], longitude=i[1])
            sensorlatlong.add(s3)
            if (len(results1) > 0):
                n1 = results1[0].pop()
                n1.relationships.create("AS", s5)
            else:
                s4 = db.nodes.create(datah=i[3])
                sensordata.add(s4)
                s3.relationships.create("EM", s4)
                s4.relationships.create("AS", s5)
    inicio = inicio + 1640
But it takes many days to insert. How can I batch the inserts in this code to decrease the insertion time? I read this post http://jexp.de/blog/2012/10/parallel-batch-inserter-with-neo4j/ but it is in Java.
I haven't used Neo4j from Python, but I'm pretty sure the client works the same way as in other languages, which means your code generates a lot of distinct HTTP requests, manipulating the low-level node and relationship endpoints. That means lots of latency.
It also generates lots of distinct queries, because it does string replacement instead of using parameterized queries, and Neo4j will have to parse each and every one of them.
You'd be much better off with a small number of parameterized Cypher queries, or even just one.
If I've read the documentation for neo4jrestclient correctly, I think it would look something like this:
c.execute("SELECT p.latitude, p.longitude, h.precipitacaoh, h.datah, h.horah FROM pontos AS p, historico AS h WHERE p.gid = h.gidgeo_fk LIMIT 1640 OFFSET %d"%(inicio))
records = c.fetchall()
q = """
MERGE (lalo:LaLo {latitude: {latitude}, longitude: {longitude}})
WITH lalo
MERGE (lalo)-[:EM]->(data:Data {datah: {datah}})
WITH data
CREATE (data)-[:AS]->(hora:Hora {horah: {horah}})
CREATE (hora)-[:REGISTROU]->(:Precipitacao {precipitacao: {precipitacao}})
"""
for i in records:
params = {
"latitude": str(i[0]),
"longitude": str(i[1]),
"precipitacao": i[2],
"datah": i[3],
"horah": i[4],
}
db.query(q=q, params=params)
Of course, it will run faster if you have indices, so you'd need to create those first (at least the first 2), for example right before the loop, or outside of the process:
CREATE INDEX ON :LaLo(latitude)
CREATE INDEX ON :LaLo(longitude)
CREATE INDEX ON :Data(datah)
The last thing you could do to speed things up is use transactions, so writes happen in batches (a consolidated sketch follows the steps below).
Open a transaction:
tx = db.transaction(for_query=True)
Append (for example) up to a thousand queries (or fewer if you reach the end of the rows):
params = ...  # built as shown above
tx.append(q=q, params=params)
Commit the transaction:
tx.execute()
Repeat until you've run out of rows from the SQL database.
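Putting the steps above together, a rough sketch of the batched loop might look like this (it reuses the q and params construction shown earlier; the batch size of 1000 is just an example):

BATCH_SIZE = 1000

tx = db.transaction(for_query=True)
pending = 0
for i in records:
    params = {
        "latitude": str(i[0]),
        "longitude": str(i[1]),
        "precipitacao": i[2],
        "datah": i[3],
        "horah": i[4],
    }
    tx.append(q=q, params=params)
    pending += 1
    if pending >= BATCH_SIZE:
        tx.execute()  # send this batch of queries to Neo4j
        tx = db.transaction(for_query=True)
        pending = 0

if pending:
    tx.execute()  # send the final, partial batch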

Efficient way to query in a for loop in Google App Engine?

In the GAE documentation, it states:
Because each get() or put() operation invokes a separate remote
procedure call (RPC), issuing many such calls inside a loop is an
inefficient way to process a collection of entities or keys at once.
Who knows how many other inefficiencies I have in my code, so I'd like to minimize as much as I can. Currently, I do have a for loop where each iteration has a separate query. Let's say I have a User, and a user has friends. I want to get the latest updates for every friend of the user. So what I have is an array of that user's friends:
for friend_dic in friends:
    email = friend_dic['email']
    lastUpdated = friend_dic['lastUpdated']
    userKey = Key('User', email)
    query = ndb.gql('SELECT * FROM StatusUpdates WHERE ANCESTOR IS :1 AND modifiedDate > :2', userKey, lastUpdated)
    qit = query.iter()
    while (yield qit.has_next_async()):
        status = qit.next()
        status_list.append(status.to_dict())
raise ndb.Return(status_list)
Is there a more efficient way to do this, maybe somehow batch all these into one single query?
Try looking at NDB's map function: https://developers.google.com/appengine/docs/python/ndb/queryclass#Query_map_async
Example (assuming you keep your friend relationships in a separate model, for this example I assumed a Relationships model):
@ndb.tasklet
def callback(entity):
    # entity is one Relationships record; the field names below are assumed
    email = entity.email
    lastUpdated = entity.lastUpdated
    userKey = Key('User', email)
    query = ndb.gql('SELECT * FROM StatusUpdates WHERE ANCESTOR IS :1 AND modifiedDate > :2', userKey, lastUpdated)
    status_updates = yield query.fetch_async()
    raise ndb.Return(status_updates)

qry = ndb.gql("SELECT * FROM Relationships WHERE friend_to = :1", user.key)
updates = yield qry.map_async(callback)
# updates will now be a list of status updates
Update:
With a better understanding of your data model:
queries = []
status_list = []

for friend_dic in friends:
    email = friend_dic['email']
    lastUpdated = friend_dic['lastUpdated']
    userKey = Key('User', email)
    queries.append(ndb.gql('SELECT * FROM StatusUpdates WHERE ANCESTOR IS :1 AND modifiedDate > :2', userKey, lastUpdated).fetch_async())

for query in queries:
    statuses = yield query
    status_list.extend([x.to_dict() for x in statuses])

raise ndb.Return(status_list)
You could perform those queries concurrently using the ndb async methods:
from google.appengine.ext import ndb


class Bar(ndb.Model):
    pass


class Foo(ndb.Model):
    pass


bars = ndb.put_multi([Bar() for i in range(10)])
ndb.put_multi([Foo(parent=bar) for bar in bars])

futures = [Foo.query(ancestor=bar).fetch_async(10) for bar in bars]
for f in futures:
    print(f.get_result())
This launches 10 concurrent Datastore query RPCs, and the overall latency depends only on the slowest one rather than on the sum of all latencies.
Also see the official ndb documentation for more detail on how to use the async APIs with ndb.

SQLAlchemy ON DUPLICATE KEY UPDATE

Is there an elegant way to do an INSERT ... ON DUPLICATE KEY UPDATE in SQLAlchemy? I mean something with a syntax similar to inserter.insert().execute(list_of_dictionaries) ?
ON DUPLICATE KEY UPDATE post version-1.2 for MySQL
This functionality is now built into SQLAlchemy for MySQL only. somada141's answer below has the best solution:
https://stackoverflow.com/a/48373874/319066
ON DUPLICATE KEY UPDATE in the SQL statement
If you want the generated SQL to actually include ON DUPLICATE KEY UPDATE, the simplest way involves using the @compiles decorator.
The code (linked from a good thread on the subject on reddit) for an example can be found on github:
from sqlalchemy.ext.compiler import compiles
from sqlalchemy.sql.expression import Insert

@compiles(Insert)
def append_string(insert, compiler, **kw):
    s = compiler.visit_insert(insert, **kw)
    if 'append_string' in insert.kwargs:
        return s + " " + insert.kwargs['append_string']
    return s

my_connection.execute(my_table.insert(append_string='ON DUPLICATE KEY UPDATE foo=foo'), my_values)
But note that in this approach, you have to manually create the append_string. You could probably change the append_string function so that it automatically changes the insert string into an insert with 'ON DUPLICATE KEY UPDATE' string, but I'm not going to do that here due to laziness.
ON DUPLICATE KEY UPDATE functionality within the ORM
SQLAlchemy does not provide an interface to ON DUPLICATE KEY UPDATE or MERGE or any other similar functionality in its ORM layer. Nevertheless, it has the session.merge() function that can replicate the functionality only if the key in question is a primary key.
session.merge(ModelObject) first checks if a row with the same primary key value exists by sending a SELECT query (or by looking it up locally). If it does, it sets a flag somewhere indicating that ModelObject is in the database already, and that SQLAlchemy should use an UPDATE query. Note that merge is quite a bit more complicated than this, but it replicates the functionality well with primary keys.
But what if you want ON DUPLICATE KEY UPDATE functionality with a non-primary key (for example, another unique key)? Unfortunately, SQLAlchemy doesn't have any such function. Instead, you have to create something that resembles Django's get_or_create(). Another StackOverflow answer covers it, and I'll just paste a modified, working version of it here for convenience.
from sqlalchemy.sql.expression import ClauseElement

def get_or_create(session, model, defaults=None, **kwargs):
    instance = session.query(model).filter_by(**kwargs).first()
    if instance:
        return instance
    else:
        params = dict((k, v) for k, v in kwargs.items() if not isinstance(v, ClauseElement))
        if defaults:
            params.update(defaults)
        instance = model(**params)
        return instance
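A hypothetical call site, assuming a User model with a unique (but non-primary-key) email column; note that the helper above does not add a newly created instance to the session itself:

user = get_or_create(session, User, defaults={"name": "Alice"}, email="alice@example.com")
if user not in session:
    session.add(user)
session.commit()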
I should mention that ever since the v1.2 release, SQLAlchemy 'core' has a built-in solution to the above, which can be seen here (copied snippet below):
from sqlalchemy.dialects.mysql import insert

insert_stmt = insert(my_table).values(
    id='some_existing_id',
    data='inserted value')

on_duplicate_key_stmt = insert_stmt.on_duplicate_key_update(
    data=insert_stmt.inserted.data,
    status='U'
)

conn.execute(on_duplicate_key_stmt)
Based on phsource's answer, and for the specific use-case of using MySQL and completely overriding the data for the same key without performing a DELETE statement, one can use the following @compiles-decorated insert expression:
from sqlalchemy.ext.compiler import compiles
from sqlalchemy.sql.expression import Insert

@compiles(Insert)
def append_string(insert, compiler, **kw):
    s = compiler.visit_insert(insert, **kw)
    if insert.kwargs.get('on_duplicate_key_update'):
        fields = s[s.find("(") + 1:s.find(")")].replace(" ", "").split(",")
        generated_directive = ["{0}=VALUES({0})".format(field) for field in fields]
        return s + " ON DUPLICATE KEY UPDATE " + ",".join(generated_directive)
    return s
It depends on what you want. If you want to replace existing rows, pass OR REPLACE in prefixes (the example below uses OR IGNORE):
def bulk_insert(self, objects, table):
    # table: your table class; objects is a list of dictionaries [{col1: val1, col2: val2}]
    for counter, row in enumerate(objects):
        inserter = table.__table__.insert(prefixes=['OR IGNORE'], values=row)
        try:
            self.db.execute(inserter)
        except Exception as e:
            print(e)
        if counter % 100 == 0:
            self.db.commit()
    self.db.commit()
The commit interval can be adjusted to speed this up or slow it down.
My way
import typing
from datetime import datetime

from sqlalchemy.dialects import mysql


class MyRepository:
    def model(self):
        return MySqlAlchemyModel

    def upsert(self, data: typing.List[typing.Dict]):
        if not data:
            return

        model = self.model()
        if hasattr(model, 'created_at'):
            for item in data:
                item['created_at'] = datetime.now()

        stmt = mysql.insert(getattr(model, '__table__')).values(data)

        for_update = []
        for k, v in data[0].items():
            for_update.append(k)

        dup = {k: getattr(stmt.inserted, k) for k in for_update}
        stmt = stmt.on_duplicate_key_update(**dup)

        self.db.session.execute(stmt)
        self.db.session.commit()
Usage:
myrepo.upsert([
    {
        "field11": "value11",
        "field21": "value21",
        "field31": "value31",
    },
    {
        "field12": "value12",
        "field22": "value22",
        "field32": "value32",
    },
])
The other answers have this covered, but I figured I'd reference another good example for MySQL that I found in this gist. It also includes the use of LAST_INSERT_ID, which may be useful depending on your InnoDB auto-increment settings and whether your table has a unique key. I'm lifting the code here for easy reference, but please give the author a star if you find it useful.
from app import db
from sqlalchemy import func
from sqlalchemy.dialects.mysql import insert


def upsert(model, insert_dict):
    """model can be a db.Model or a table(); insert_dict should contain a primary or unique key."""
    inserted = insert(model).values(**insert_dict)
    upserted = inserted.on_duplicate_key_update(
        id=func.LAST_INSERT_ID(model.id),
        **{k: inserted.inserted[k] for k, v in insert_dict.items()}
    )
    res = db.engine.execute(upserted)
    return res.lastrowid
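For illustration, a hypothetical call, assuming a model with an auto-increment id plus a unique key on an email column (MyModel and its columns are placeholders):

row_id = upsert(MyModel, {"email": "alice@example.com", "name": "Alice"})
print(row_id)  # id of the inserted or updated row, recovered via LAST_INSERT_ID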
ORM
Use an upsert function based on on_duplicate_key_update:
from sqlalchemy.dialects.mysql import insert
from sqlalchemy.orm import Session


class Model():
    __input_data__ = dict()

    def __init__(self, **kwargs) -> None:
        self.__input_data__ = kwargs
        self.session = Session(engine)

    def save(self):
        self.session.add(self)
        self.session.commit()

    def upsert(self, *, ignore_keys=[]):
        column_keys = self.__table__.columns.keys()
        update_data = dict()
        for key in self.__input_data__.keys():
            if key not in column_keys:
                continue
            else:
                update_data[key] = self.__input_data__[key]

        insert_stmt = insert(self.__table__).values(**update_data)

        all_ignore_keys = ['id']
        if isinstance(ignore_keys, list):
            all_ignore_keys = [*all_ignore_keys, *ignore_keys]
        else:
            all_ignore_keys.append(ignore_keys)

        update_columns = dict()
        for key in self.__input_data__.keys():
            if key not in column_keys or key in all_ignore_keys:
                continue
            else:
                update_columns[key] = insert_stmt.inserted[key]

        on_duplicate_key_stmt = insert_stmt.on_duplicate_key_update(
            **update_columns
        )
        # self.session.add(self)
        self.session.execute(on_duplicate_key_stmt)
        self.session.commit()


class ManagerAssoc(ORM_Base, Model):
    def __init__(self, **kwargs):
        self.id = idWorker.get_id()
        column_keys = self.__table__.columns.keys()
        update_data = dict()
        for key in kwargs.keys():
            if key not in column_keys:
                continue
            else:
                update_data[key] = kwargs[key]
        ORM_Base.__init__(self, **update_data)
        Model.__init__(self, **kwargs, id=self.id)

    ....

# you can call it as follows:
manager_assoc.upsert()
manager.upsert(ignore_keys=['manager_id'])
Got a simpler solution:
from sqlalchemy.ext.compiler import compiles
from sqlalchemy.sql.expression import Insert

@compiles(Insert)
def replace_string(insert, compiler, **kw):
    s = compiler.visit_insert(insert, **kw)
    s = s.replace("INSERT INTO", "REPLACE INTO")
    return s

my_connection.execute(my_table.insert(replace_string=""), my_values)
I just used plain SQL, as in:
insert_stmt = "REPLACE INTO tablename (column1, column2) VALUES (:column_1_bind, :column_2_bind)"
session.execute(insert_stmt, data)
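For illustration, data is just a mapping of the bind names used above (the values here are made up); passing a list of such mappings makes the statement execute once per row:

data = {"column_1_bind": "value1", "column_2_bind": "value2"}
session.execute(insert_stmt, data)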
Update Feb 2023: SQLAlchemy version 2 was recently released and supports on_duplicate_key_update in the MySQL dialect. Many many thanks to Federico Caselli of the SQLAlchemy project who helped me develop sample code in a discussion at https://github.com/sqlalchemy/sqlalchemy/discussions/9328
Please see https://stackoverflow.com/a/75538576/1630244
If it's ok to post the same answer twice (?) here is my small self-contained code example:
import sqlalchemy as db
import sqlalchemy.dialects.mysql as mysql
from sqlalchemy import delete, select, String
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column


class Base(DeclarativeBase):
    pass


class User(Base):
    __tablename__ = "foo"
    id: Mapped[int] = mapped_column(primary_key=True)
    name: Mapped[str] = mapped_column(String(30))


engine = db.create_engine('mysql+mysqlconnector://USER-NAME-HERE:PASS-WORD-HERE@localhost/SCHEMA-NAME-HERE')
conn = engine.connect()

# setup step 0 - ensure the table exists
Base().metadata.create_all(bind=engine)

# setup step 1 - clean out rows with id 1..5
del_stmt = delete(User).where(User.id.in_([1, 2, 3, 4, 5]))
conn.execute(del_stmt)
conn.commit()
sel_stmt = select(User)
users = list(conn.execute(sel_stmt))
print(f'Table size after cleanout: {len(users)}')

# setup step 2 - insert 4 rows
ins_stmt = mysql.insert(User).values(
    [
        {"id": 1, "name": "x"},
        {"id": 2, "name": "y"},
        {"id": 3, "name": "w"},
        {"id": 4, "name": "z"},
    ]
)
conn.execute(ins_stmt)
conn.commit()
users = list(conn.execute(sel_stmt))
print(f'Table size after insert: {len(users)}')

# demonstrate upsert
ups_stmt = mysql.insert(User).values(
    [
        {"id": 1, "name": "xx"},
        {"id": 2, "name": "yy"},
        {"id": 3, "name": "ww"},
        {"id": 5, "name": "new"},
    ]
)
ups_stmt = ups_stmt.on_duplicate_key_update(name=ups_stmt.inserted.name)
# if you want to see the compiled result
# x = ups_stmt.compile(dialect=mysql.dialect())
# print(x.string, x.construct_params())
conn.execute(ups_stmt)
conn.commit()
users = list(conn.execute(sel_stmt))
print(f'Table size after upsert: {len(users)}')
As none of these solutions seem all that elegant, a brute-force way is to query to see whether the row exists; if it does, delete the row and then insert, otherwise just insert. There is obviously some overhead involved, but it does not rely on modifying the raw SQL and it works for non-ORM usage too.
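A minimal sketch of that brute-force approach, assuming a mapped Item class keyed by id (the names are placeholders); the delete and the insert are only atomic because they share one transaction that is committed at the end:

def crude_upsert(session, item_id, values):
    # delete any existing row with this key, then insert the replacement
    session.query(Item).filter(Item.id == item_id).delete()
    session.add(Item(id=item_id, **values))
    session.commit()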

What is an efficient way of inserting thousands of records into an SQLite table using Django?

I have to insert 8000+ records into a SQLite database using Django's ORM. This operation needs to be run as a cronjob about once per minute.
At the moment I'm using a for loop to iterate through all the items and then insert them one by one.
Example:
for item in items:
    entry = Entry(a1=item.a1, a2=item.a2)
    entry.save()
What is an efficient way of doing this?
Edit: A little comparison between the two insertion methods.
Without commit_manually decorator (11245 records):
[nox@noxdevel marinetraffic]$ time python manage.py insrec
real 1m50.288s
user 0m6.710s
sys 0m23.445s
Using commit_manually decorator (11245 records):
[nox@noxdevel marinetraffic]$ time python manage.py insrec
real 0m18.464s
user 0m5.433s
sys 0m10.163s
Note: The test script also does some other operations besides inserting into the database (downloads a ZIP file, extracts an XML file from the ZIP archive, parses the XML file) so the time needed for execution does not necessarily represent the time needed to insert the records.
You want to check out django.db.transaction.commit_manually.
http://docs.djangoproject.com/en/dev/topics/db/transactions/#django-db-transaction-commit-manually
So it would be something like:
from django.db import transaction

@transaction.commit_manually
def viewfunc(request):
    ...
    for item in items:
        entry = Entry(a1=item.a1, a2=item.a2)
        entry.save()
    transaction.commit()
This will only commit once, instead of at each save().
In django 1.3 context managers were introduced.
So now you can use transaction.commit_on_success() in a similar way:
from django.db import transaction

def viewfunc(request):
    ...
    with transaction.commit_on_success():
        for item in items:
            entry = Entry(a1=item.a1, a2=item.a2)
            entry.save()
In django 1.4, bulk_create was added, allowing you to create lists of your model objects and then commit them all at once.
NOTE: the model's save() method will not be called when using bulk_create.
>>> Entry.objects.bulk_create([
... Entry(headline="Django 1.0 Released"),
... Entry(headline="Django 1.1 Announced"),
... Entry(headline="Breaking: Django is awesome")
... ])
In django 1.6, transaction.atomic was introduced, intended to replace the now-legacy functions commit_on_success and commit_manually.
from the django documentation on atomic:
atomic is usable both as a decorator:
from django.db import transaction

@transaction.atomic
def viewfunc(request):
    # This code executes inside a transaction.
    do_stuff()
and as a context manager:
from django.db import transaction

def viewfunc(request):
    # This code executes in autocommit mode (Django's default).
    do_stuff()

    with transaction.atomic():
        # This code executes inside a transaction.
        do_more_stuff()
Bulk creation is available in Django 1.4:
https://django.readthedocs.io/en/1.4/ref/models/querysets.html#bulk-create
Have a look at this. It's meant for use out-of-the-box with MySQL only, but there are pointers on what to do for other databases.
You might be better off bulk-loading the items - prepare a file and use a bulk load tool. This will be vastly more efficient than 8000 individual inserts.
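If you want to stay inside Django rather than use an external bulk-load tool, a rough alternative sketch is to bypass the ORM and hand all rows to the database driver in a single executemany() call; the table and column names below are placeholders:

from django.db import connection, transaction

rows = [(item.a1, item.a2) for item in items]
with transaction.atomic():
    with connection.cursor() as cursor:
        cursor.executemany(
            "INSERT INTO myapp_entry (a1, a2) VALUES (%s, %s)",
            rows,
        )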
To answer the question specifically with regard to SQLite, as asked: while I have just now confirmed that bulk_create does provide a tremendous speedup, there is a limitation with SQLite: "The default is to create all objects in one batch, except for SQLite where the default is such that at maximum 999 variables per query is used."
The quoted text is from the docs; A-IV provided a link.
What I have to add is that this djangosnippets entry by alpar also seems to work for me. It's a little wrapper that breaks the big batch you want to process into smaller batches, managing the 999-variable limit (a minimal sketch of the idea follows).
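A minimal sketch of that chunking idea (newer Django versions can also do this directly with bulk_create(objs, batch_size=...)):

def chunked_bulk_create(model, objs, chunk_size=500):
    # stay well under SQLite's 999-variable-per-query limit
    for start in range(0, len(objs), chunk_size):
        model.objects.bulk_create(objs[start:start + chunk_size])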
You should check out DSE. I wrote DSE to solve these kinds of problems (massive inserts or updates). Using the Django ORM for this is a dead end; you have to do it in plain SQL, and DSE takes care of much of that for you.
Thomas
I recommend using plain SQL (not the ORM); you can insert multiple rows with a single INSERT:
insert into A select from B;
The SELECT FROM B portion of your SQL can be as complicated as you want, as long as the results match the columns in table A and there are no constraint conflicts.
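For example, run from Django with placeholder table and column names:

from django.db import connection

with connection.cursor() as cursor:
    # copy rows from a staging table into the target table in one statement
    cursor.execute("INSERT INTO myapp_a (col1, col2) SELECT col1, col2 FROM myapp_b")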
def order(request):
    if request.method == "GET":
        # get the values from the html page
        cust_name = request.GET.get('cust_name', '')
        cust_cont = request.GET.get('cust_cont', '')
        pincode = request.GET.get('pincode', '')
        city_name = request.GET.get('city_name', '')
        state = request.GET.get('state', '')
        contry = request.GET.get('contry', '')
        gender = request.GET.get('gender', '')
        paid_amt = request.GET.get('paid_amt', '')
        due_amt = request.GET.get('due_amt', '')
        order_date = request.GET.get('order_date', '')
        prod_name = request.GET.getlist('prod_name[]', '')
        prod_qty = request.GET.getlist('prod_qty[]', '')
        prod_price = request.GET.getlist('prod_price[]', '')

        # insert customer information into the customer table
        try:
            # insert data into the Customer table
            cust_tab = Customer(customer_name=cust_name, customer_contact=cust_cont, gender=gender, city_name=city_name, pincode=pincode, state_name=state, contry_name=contry)
            cust_tab.save()

            # retrieve the id from the Customer table
            custo_id = Customer.objects.values_list('customer_id').last()  # returns a tuple from the QuerySet
            custo_id = int(custo_id[0])  # convert the tuple to an int

            # insert data into the Orders table
            order_tab = Orders(order_date=order_date, paid_amt=paid_amt, due_amt=due_amt, customer_id=custo_id)
            order_tab.save()

            # insert data into the Products table,
            # inserting multiple rows one at a time from Django using a while loop
            i = 0
            while i < len(prod_name):
                p_n = prod_name[i]
                p_q = prod_qty[i]
                p_p = prod_price[i]
                # only insert the row when all values are non-empty
                if p_n != "" and p_q != "" and p_p != "":
                    prod_tab = Products(product_name=p_n, product_qty=p_q, product_price=p_p, customer_id=custo_id)
                    prod_tab.save()
                i = i + 1

            return HttpResponse('Your Record Has been Saved')
        except Exception as e:
            return HttpResponse(e)

    return render(request, 'invoice_system/order.html')
