SQLAlchemy + MySQL/MariaDB: bulk upsert with composite keys - Python

Using SQLAlchemy and a MariaDB backend, I need to bulk upsert data. Using this answer I was able to make it work for a model with a single primary key. However, I can't make it work with composite keys.
The key part of the code is this one:
# for single pk
primary_key = [key.name for key in inspect(model).primary_key][0]
# get all entries to be updated
for each in DBSession.query(model).filter(getattr(model, primary_key).in_(entries.keys())).all():
    entry = entries.pop(str(getattr(each, primary_key)))
I tried to change it to make it work with composite keys:
primary_keys = tuple([key.name for key in inspect(model).primary_key])
# get all entries to be updated
for each in DBSession.query(model).filter(and_(*[getattr(model, col).in_(entries.keys()) for col in primary_keys])).all():
    print("This is never printed :(")
I guess this DBSession.query(model).filter(and_(*[getattr(model, col).in_(entries.keys()) for col in primary_keys])).all() doesn't work as intended.
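The keys of entries are tuples such as ("user1", "account1"), so checking each scalar column's in_() against those tuples can never match. One way around this (a sketch under that assumption, not a tested drop-in) is to compare the whole composite key at once with sqlalchemy.tuple_, which MySQL/MariaDB render as a row-value IN clause:

from sqlalchemy import tuple_

primary_keys = tuple(key.name for key in inspect(model).primary_key)
key_columns = [getattr(model, col) for col in primary_keys]
# renders roughly as: (users.user, users.account) IN (('user1', 'account1'), ('user1', 'account2'))
for each in DBSession.query(model).filter(tuple_(*key_columns).in_(entries.keys())).all():
    entry = entries.pop(tuple(str(getattr(each, col)) for col in primary_keys))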
For reference, here is a fully working snippet:
from sqlalchemy import Column, create_engine, and_, inspect, tuple_
from sqlalchemy.types import String
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import scoped_session, sessionmaker
DBSession = scoped_session(sessionmaker())
Base = declarative_base()
class Accounts(Base):
    __tablename__ = 'accounts'

    account = Column(String(50), primary_key=True)
    comment = Column(String(50))
class Users(Base):
    __tablename__ = 'users'

    user = Column(String(50), primary_key=True)
    account = Column(String(50), primary_key=True)
    comment = Column(String(50))
accounts_data = {
    "account1": {"account": "account1", "comment": "test"},
    "account2": {"account": "account2", "comment": None},
}
users_data = {
    ("user1", "account1"): {"user": "user1", "account": "account1", "comment": ""},
    ("user1", "account2"): {"user": "user1", "account": "account2", "comment": ""},
}
def upsert_data_single_pk(entries, model):
    primary_key = [key.name for key in inspect(model).primary_key][0]
    entries_to_update = []
    entries_to_insert = []
    # get all entries to be updated
    for each in DBSession.query(model).filter(getattr(model, primary_key).in_(entries.keys())).all():
        entry = entries.pop(str(getattr(each, primary_key)))
        entries_to_update.append(entry)
    # get all entries to be inserted
    for entry in entries.values():
        entries_to_insert.append(entry)
    DBSession.bulk_insert_mappings(model, entries_to_insert)
    DBSession.bulk_update_mappings(model, entries_to_update)
    DBSession.commit()
def upsert_data_multiple_pk(entries, model):
    primary_keys = tuple([key.name for key in inspect(model).primary_key])
    entries_to_update = []
    entries_to_insert = []
    # get all entries to be updated
    for each in DBSession.query(model).filter(and_(*[getattr(model, col).in_(entries.keys()) for col in primary_keys])).all():
        # Print the composite primary key value by concatenating the values of the individual columns
        print('-'.join([str(getattr(each, col)) for col in primary_keys]))
    # get all entries to be inserted
    for entry in entries.values():
        entries_to_insert.append(entry)
    DBSession.bulk_insert_mappings(model, entries_to_insert)
    DBSession.bulk_update_mappings(model, entries_to_update)
    DBSession.commit()
db_connection_uri = "mysql+pymysql://XXXX:XXXX@XXXX:XXXX/XXXX?charset=utf8mb4"
engine = create_engine(db_connection_uri, echo=False)
DBSession.remove()
DBSession.configure(bind=engine, autoflush=False, expire_on_commit=False)
#Base.metadata.drop_all(engine, checkfirst=True)
Base.metadata.create_all(bind=engine)
#upsert_data_single_pk(accounts_data, Accounts)
upsert_data_multiple_pk(users_data, Users)

I wrote a different function to do what I needed:
def upsert(self, model: Type[Base], data: List[Dict[str, Any]]) -> None:
    """Upsert a record into the database.

    If the record already exists, it will be updated. If it does not exist, it will be inserted.

    Parameters:
        model: The SQLAlchemy model representing the table.
        data: The data to be inserted or updated, as a list of dictionaries.
    """
    if not data:
        logger.info("No data to insert")
        return None
    logger.info(f"{len(data)} rows to insert/update to {model.__table__}")
    insert_stmt = insert(model.__table__).values(data)
    primary_keys = ModelTools.get_primary_keys(model)
    to_update = {
        k: getattr(insert_stmt.inserted, k)
        for k in data[0].keys()
        if k not in primary_keys
    }
    on_conflict_stmt = insert_stmt.on_duplicate_key_update(**to_update)
    self.engine.execute(on_conflict_stmt)
It is probably not the most time-efficient approach, but it works as intended, so for now I'm keeping it.
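For context: the insert used above has to be the MySQL/MariaDB dialect construct (sqlalchemy.dialects.mysql.insert), since on_duplicate_key_update and the inserted accessor only exist on that dialect's Insert. ModelTools.get_primary_keys is the author's own helper; a minimal standalone equivalent might look like this sketch:

from sqlalchemy import inspect
from sqlalchemy.dialects.mysql import insert

def get_primary_keys(model):
    # Names of the primary-key columns of a declarative model
    return {key.name for key in inspect(model).primary_key}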

Related

How can I parse a response with SQLAlchemy from multiple rows & columns into one field with a list of dictionaries?

My current output looks like:
item    attribute_id    attribute_value    attribute_name
A       zone            A                  zone_position
A       type            simple             type_item
A       status          active             state
Desired output is:
item    attributes
A       [{"attribute_id": "zone", "attribute_value": "A", "attribute_name": "zone_position"}, {"attribute_id": "type", "attribute_value": "simple", "attribute_name": "type_item"}, {"attribute_id": "status", "attribute_value": "active", "attribute_name": "state"}]
If that would be hard to do with SQLAlchemy, how could it be done from a JSON format?
Thanks in advance!
Something like this should work. We can use itertools.groupby to group results by the value of item, and tools from the operator module to abstract finding and extracting values.
import itertools
import operator
import pprint

import sqlalchemy as sa
from sqlalchemy import orm

Base = orm.declarative_base()

# Assuming this model structure
class MyModel(Base):
    __tablename__ = 't74781694'

    id = sa.Column(sa.Integer, primary_key=True)
    item = sa.Column(sa.String)
    attribute_id = sa.Column(sa.String)
    attribute_value = sa.Column(sa.String)
    attribute_name = sa.Column(sa.String)

engine = sa.create_engine('sqlite://', echo=True, future=True)
Base.metadata.create_all(engine)
Session = orm.sessionmaker(engine, future=True)

# The attributes that we want to group.
keys = ['attribute_id', 'attribute_value', 'attribute_name']

# ORM solution (using model entities).
attrgetter = operator.attrgetter(*keys)
with Session() as s:
    instances = s.scalars(sa.select(MyModel))
    data = [
        (k, [dict(zip(keys, attrgetter(g))) for g in grouped])
        for k, grouped in itertools.groupby(instances, key=lambda m: m.item)
    ]
pprint.pprint(data)

# Hybrid core/ORM solution (using session and table).
# Pure core would entail using the engine instead of the session.
keygetter = operator.itemgetter('item')
itemgetter = operator.itemgetter(*keys)
with Session() as s:
    tbl = MyModel.__table__
    rows = s.execute(sa.select(tbl)).mappings()
    data = [
        (k, [dict(zip(keys, itemgetter(g))) for g in grouped])
        for k, grouped in itertools.groupby(rows, key=keygetter)
    ]
pprint.pprint(data)
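To the "from a JSON format" part of the question: the same grouping works on a plain list of dicts parsed from JSON, as long as the rows are sorted by item first, because itertools.groupby only groups consecutive entries. A sketch, assuming raw_json holds a JSON array of row objects with the same fields as the table:

import itertools
import json
import operator

rows = json.loads(raw_json)  # assumed input: '[{"item": "A", "attribute_id": "zone", ...}, ...]'
keys = ['attribute_id', 'attribute_value', 'attribute_name']
keygetter = operator.itemgetter('item')

rows.sort(key=keygetter)  # groupby only merges consecutive rows, so sort by item first
data = [
    (item, [{k: row[k] for k in keys} for row in grouped])
    for item, grouped in itertools.groupby(rows, key=keygetter)
]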

Get count of inserted and updated records in sqlalchemy's upsert

I have working code that upserts several records with SQLAlchemy:
from sqlalchemy.dialects.postgresql import insert
from sqlalchemy import BigInteger
from flask_sqlalchemy import SQLAlchemy as db

class PetModel(db.Model):
    __tablename__ = "pets"

    id = db.Column(BigInteger, primary_key=True)
    data = db.Column(db.String(64), nullable=False, unique=True)

    def as_dict(self):
        return {
            "id": getattr(self, "id"),
            "data": getattr(self, "data"),
        }

pets = [PetModel(id=1, data="Dog"), PetModel(id=2, data="Cat")]

insert_statement = insert(PetModel).values([_.as_dict() for _ in pets])
upsert_statement = insert_statement.on_conflict_do_update(
    constraint="pet_pkey",
    set_={"data": insert_statement.excluded.data},
)
ans = db.session.execute(upsert_statement)
I have tried to return all rows by adding returning(PetModel.__table__) to insert_statement, but I can't split the result of the [_ for _ in ans] statement into updated and inserted records. I don't want to add a special field to the database.
I know that ans.rowcount returns the sum of updated and inserted records.
How could I get separately the amount of updated and inserted records using sqlalchemy?
As Ilja Everilä said, one option is to use the xmax hack: add a column with this value to the RETURNING clause, as described here:
from sqlalchemy.dialects.postgresql import insert
...
insert_statement = insert(PetModel).returning(
    sqlalchemy.column("xmax") == 0
).values([_.as_dict() for _ in pets])
upsert_statement = insert_statement.on_conflict_do_update(
    constraint="pet_pkey",
    set_={"data": insert_statement.excluded.data},
)
ans = db.session.execute(upsert_statement)
created = sum([_._row[0] for _ in ans])
updated = len(pets) - created
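Assuming SQLAlchemy 1.4+, labelling the xmax expression avoids reaching into the private Row internals; a variant sketch of the same idea:

import sqlalchemy

insert_statement = insert(PetModel).values(
    [_.as_dict() for _ in pets]
).returning((sqlalchemy.column("xmax") == 0).label("inserted"))
upsert_statement = insert_statement.on_conflict_do_update(
    constraint="pet_pkey",
    set_={"data": insert_statement.excluded.data},
)
rows = db.session.execute(upsert_statement).all()
created = sum(1 for row in rows if row.inserted)  # True marks a freshly inserted row
updated = len(rows) - created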

SQLAlchemy - pass a dynamic tablename to query function?

I have a simple polling script that polls entries based on new IDs in an MSSQL table. I'm using SQLAlchemy's ORM to create a table class and then query that table. I want to be able to add more tables "dynamically" without coding it directly into the method.
My polling function:
def poll_db():
    query = db.query(
        Transactions.ID).order_by(Transactions.ID.desc()).limit(1)

    # Continually poll for new images to classify
    max_id_query = query
    last_max_id = max_id_query.scalar()
    while True:
        max_id = max_id_query.scalar()
        if max_id > last_max_id:
            print(
                f"New row(s) found. "
                f"Processing ids {last_max_id + 1} through {max_id}"
            )

            # Insert ML model
            id_query = db.query(Transactions).filter(
                Transactions.ID > last_max_id)
            df_from_query = pd.read_sql_query(
                id_query.statement, db.bind, index_col='ID')
            print(f"New query was made")
            last_max_id = max_id
        time.sleep(5)
My table model:
import sqlalchemy as db
from sqlalchemy import Boolean, Column, ForeignKey, Integer, String, Text
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import defer, relationship, query
from database import SessionLocal, engine

insp = db.inspect(engine)
db_list = insp.get_schema_names()

Base = declarative_base(cls=BaseModel)

class Transactions(Base):
    __tablename__ = 'simulation_data'

    sender_account = db.Column('sender_account', db.BigInteger)
    recipient_account = db.Column('recipient_account', db.String)
    sender_name = db.Column('sender_name', db.String)
    recipient_name = db.Column('recipient_name', db.String)
    date = db.Column('date', db.DateTime)
    text = db.Column('text', db.String)
    amount = db.Column('amount', db.Float)
    currency = db.Column('currency', db.String)
    transaction_type = db.Column('transaction_type', db.String)
    fraud = db.Column('fraud', db.BigInteger)
    swift_bic = db.Column('swift_bic', db.String)
    recipient_country = db.Column('recipient_country', db.String)
    internal_external = db.Column('internal_external', db.String)
    ID = Column('ID', db.BigInteger, primary_key=True)
QUESTION
How can I pass the table class name "dynamically", along the lines of poll_db(tablename) where tablename='Transactions', instead of writing similar queries for multiple tables, such as:
query = db.query(Transactions.ID).order_by(Transactions.ID.desc()).limit(1)
query2 = db.query(Transactions2.ID).order_by(Transactions2.ID.desc()).limit(1)
query3 = db.query(Transactions3.ID).order_by(Transactions3.ID.desc()).limit(1)
The tables will have identical structure, but different data.
I can't give you a full example right now (will edit later) but here's one hacky way to do it (the documentation will probably be a better place to check):
def dynamic_table(tablename):
    for class_name, cls in Base._decl_class_registry.items():
        # the registry also holds a module-marker entry, so guard the attribute lookup
        if getattr(cls, "__tablename__", None) == tablename:
            return cls

Transactions2 = dynamic_table("simulation_data")
assert Transactions2 is Transactions
The returned class is the model you want. Keep in mind that Base can only access the tables that have been subclassed already so if you have them in other modules you need to import them first so they are registered as Base's subclasses.
For selecting columns, something like this should work:
def dynamic_table_with_columns(tablename, *columns):
    cls = dynamic_table(tablename)
    subset = []
    for col_name in columns:
        # getattr with a default avoids an AttributeError for unknown column names
        column = getattr(cls, col_name, None)
        if column is not None:
            subset.append(column)
    # in case no columns were given
    if not subset:
        return db.query(cls)
    return db.query(*subset)
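One caveat: Base._decl_class_registry was removed in SQLAlchemy 1.4, so on newer versions the same lookup would go through the declarative registry instead. A hedged sketch:

def dynamic_table_14(tablename):
    # SQLAlchemy 1.4+: iterate the mappers known to the declarative registry
    for mapper in Base.registry.mappers:
        if mapper.class_.__tablename__ == tablename:
            return mapper.class_
    return None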

Validating SQLAlchemy Fields

I have a dictionary that gets created from a programmatic process and that looks like
{'field1': 3, 'field2': 'TEST'}
I feed this dictionary into the model as its fields (for example: Model(**dict))
I want to run a series of unit tests that determine whether the fields are of valid data type.
How do I validate that these data types are valid for my database (MySQL) without having to do an insertion and rollback? That would introduce flakiness into my tests, as I would be interacting with an actual database, correct?
I do not have much experience with sqlalchemy but if you use data-types in Columns of your models, won't that work?
This link might help you: http://docs.sqlalchemy.org/en/rel_0_9/core/type_basics.html
Here's a rudimentary way to do what you asked
class Sample_Table(Base):
    __tablename__ = 'Sample_Table'
    __table_args__ = {'sqlite_autoincrement': True}

    id = Column(Integer, primary_key=True, nullable=False)
    col1 = Column(Integer)
    col2 = Column(Integer)

    def __init__(self, **kwargs):
        for k, v in kwargs.items():
            col_type = str(self.__table__.c[k].type)
            try:
                if str(type(v).__name__) in col_type.lower():
                    setattr(self, k, v)
                else:
                    raise Exception("BAD COLUMN TYPE FOR COL " + k)
            except ValueError as e:
                print(e)
If you try to use the above to insert a record with a column type that is different than what you specified, it will throw an error, i.e. it will not perform an insertion and rollback.
To prove that this works, try the following full-working code:
from sqlalchemy import Column, Integer
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

Base = declarative_base()

class Sample_Table(Base):
    __tablename__ = 'Sample_Table'
    __table_args__ = {'sqlite_autoincrement': True}

    id = Column(Integer, primary_key=True, nullable=False)
    col1 = Column(Integer)
    col2 = Column(Integer)

    def __init__(self, **kwargs):
        for k, v in kwargs.items():
            col_type = str(self.__table__.c[k].type)
            try:
                if str(type(v).__name__) in col_type.lower():
                    setattr(self, k, v)
                else:
                    raise Exception("BAD COLUMN TYPE FOR COL " + k)
            except ValueError as e:
                print(e)

engine = create_engine('sqlite:///')
session = sessionmaker()
session.configure(bind=engine)
s = session()
Base.metadata.create_all(engine)

data = {"col1": 1, "col2": 2}
record = Sample_Table(**data)
s.add(record)  # works
s.commit()

data = {"col1": 1, "col2": "2"}
record = Sample_Table(**data)
s.add(record)  # doesn't work!
s.commit()
s.close()
(Even though I used SQLite, it will work the same way for a MySQL database.)
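As an alternative to overriding __init__, SQLAlchemy's built-in validates() decorator hooks into attribute assignment and can do the same per-column check without touching the constructor. A sketch reusing the Base, Column and Integer from the snippet above (not the answer's original code):

from sqlalchemy.orm import validates

class Validated_Table(Base):
    __tablename__ = 'Validated_Table'

    id = Column(Integer, primary_key=True, nullable=False)
    col1 = Column(Integer)
    col2 = Column(Integer)

    @validates('col1', 'col2')
    def check_integer(self, key, value):
        # Reject anything that is not an int before it ever reaches the database
        if not isinstance(value, int):
            raise TypeError("BAD COLUMN TYPE FOR COL " + key)
        return value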

Dynamic Datasets and SQLAlchemy

I am refactoring some old SQLite3 SQL statements in Python into SQLAlchemy. In our framework, we have the following SQL statements that take in a dict with certain known keys and potentially any number of unexpected keys and values (depending on what information was provided).
import sqlite3
import sys

def dict_factory(cursor, row):
    d = {}
    for idx, col in enumerate(cursor.description):
        d[col[0]] = row[idx]
    return d

def Create_DB(db):
    # Delete the database
    from os import remove
    remove(db)

    # Recreate it and format it as needed
    with sqlite3.connect(db) as conn:
        conn.row_factory = dict_factory
        conn.text_factory = str
        cursor = conn.cursor()
        cursor.execute("CREATE TABLE [Listings] ([ID] INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL UNIQUE, [timestamp] REAL NOT NULL DEFAULT(( datetime ( 'now' , 'localtime' ) )), [make] VARCHAR, [model] VARCHAR, [year] INTEGER);")

def Add_Record(db, data):
    with sqlite3.connect(db) as conn:
        conn.row_factory = dict_factory
        conn.text_factory = str
        cursor = conn.cursor()

        # get column names already in table
        cursor.execute("SELECT * FROM 'Listings'")
        col_names = list(map(lambda x: x[0], cursor.description))

        # check if column doesn't exist in table, then add it
        for i in data.keys():
            if i not in col_names:
                cursor.execute("ALTER TABLE 'Listings' ADD COLUMN '{col}' {type}".format(col=i, type='INT' if type(data[i]) is int else 'VARCHAR'))

        # Insert record into table
        cursor.execute("INSERT INTO Listings({cols}) VALUES({vals});".format(cols=str(data.keys()).strip('[]'),
                                                                             vals=str([data[i] for i in data]).strip('[]')))

# Database filename
db = 'test.db'
Create_DB(db)

data = {'make': 'Chevy',
        'model': 'Corvette',
        'year': 1964,
        'price': 50000,
        'color': 'blue',
        'doors': 2}
Add_Record(db, data)

data = {'make': 'Chevy',
        'model': 'Camaro',
        'year': 1967,
        'price': 62500,
        'condition': 'excellent'}
Add_Record(db, data)
This level of dynamism is necessary because there's no way we can know what additional information will be provided, but, regardless, it's important that we store all information provided to us. This has never been a problem in our framework, as we've never expected an unwieldy number of columns in our tables.
While the above code works, it's obviously not a clean implementation, which is why I'm trying to refactor it into SQLAlchemy's cleaner, more robust ORM paradigm. I started going through SQLAlchemy's official tutorials and various examples and have arrived at the following code:
from sqlalchemy import Column, String, Integer
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

Base = declarative_base()

class Listing(Base):
    __tablename__ = 'Listings'

    id = Column(Integer, primary_key=True)
    make = Column(String)
    model = Column(String)
    year = Column(Integer)

engine = create_engine('sqlite:///')
session = sessionmaker()
session.configure(bind=engine)
Base.metadata.create_all(engine)

data = {'make': 'Chevy',
        'model': 'Corvette',
        'year': 1964}

record = Listing(**data)

s = session()
s.add(record)
s.commit()
s.close()
and it works beautifully with that data dict. Now, when I add a new keyword, such as
data = {'make': 'Chevy',
        'model': 'Corvette',
        'year': 1964,
        'price': 50000}
I get a TypeError: 'price' is an invalid keyword argument for Listing error. To try and solve the issue, I modified the class to be dynamic, too:
class Listing(Base):
    __tablename__ = 'Listings'

    id = Column(Integer, primary_key=True)
    make = Column(String)
    model = Column(String)
    year = Column(Integer)

    def __checker__(self, data):
        for i in data.keys():
            if i not in [a for a in dir(self) if not a.startswith('__')]:
                if type(i) is int:
                    setattr(self, i, Column(Integer))
                else:
                    setattr(self, i, Column(String))
            else:
                self[i] = data[i]
But I quickly realized this would not work at all, for several reasons (e.g. the class was already initialized, the data dict cannot be fed into the class without reinitializing it, and it's a hack more than anything). The more I think about it, the less obvious the solution using SQLAlchemy seems to me. So, my main question is: how do I implement this level of dynamism using SQLAlchemy?
I've researched a bit to see if anyone has had a similar issue. The closest I've found was Dynamic Class Creation in SQLAlchemy, but it only talks about the constant attributes ("tablename" et al.). I believe the unanswered https://stackoverflow.com/questions/29105206/sqlalchemy-dynamic-attribute-change may be asking the same question. While Python is not my forte, I consider myself a highly skilled programmer (C++ and JavaScript are my strongest languages) in the context of scientific/engineering applications, so I may not be hitting the correct Python-specific keywords in my searches.
I welcome any and all help.
class Listing(Base):
    __tablename__ = 'Listings'

    id = Column(Integer, primary_key=True)
    make = Column(String)
    model = Column(String)
    year = Column(Integer)

    def __init__(self, **kwargs):
        for k, v in kwargs.items():
            if hasattr(self, k):
                setattr(self, k, v)
            else:
                engine.execute("ALTER TABLE %s ADD COLUMN %s" % (self.__tablename__, k))
                setattr(self.__class__, k, Column(k, String))
                setattr(self, k, v)
might work ... maybe ... I am not entirely sure; I did not test it
a better solution would be to use a relational table
from sqlalchemy import ForeignKey
from sqlalchemy.orm import relationship

class Attribs(Base):
    __tablename__ = 'Attribs'

    id = Column(Integer, primary_key=True)
    listing_id = Column(Integer, ForeignKey("Listings.id"))
    name = Column(String)
    val = Column(String)

class Listing(Base):
    __tablename__ = 'Listings'

    id = Column(Integer, primary_key=True)
    attributes = relationship("Attribs", backref="listing")

    def __init__(self, **kwargs):
        for k, v in kwargs.items():
            self.attributes.append(Attribs(name=k, val=v))

    def __str__(self):
        return "\n".join(["A LISTING", ] + ["%s:%s" % (a.name, a.val) for a in self.attributes])
another solution would be to store json
import json

class Listing(Base):
    __tablename__ = 'Listings'

    id = Column(Integer, primary_key=True)
    data = Column(String)

    def __init__(self, **kwargs):
        self.data = json.dumps(kwargs)
        self.data_dict = kwargs
the best solution would be to use a NoSQL key/value store (maybe even just a simple JSON file? or perhaps shelve? or even pickle, I guess)
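If the JSON route is appealing, modern SQLAlchemy (1.1+) also ships a JSON column type, which keeps the extra attributes as structured data rather than an opaque string on backends that support it (SQLite, MySQL, PostgreSQL). A hedged sketch, not the answer's code:

from sqlalchemy import Column, Integer, String, JSON

class FlexibleListing(Base):
    __tablename__ = 'FlexibleListings'

    id = Column(Integer, primary_key=True)
    make = Column(String)
    model = Column(String)
    year = Column(Integer)
    extra = Column(JSON)  # everything we did not anticipate goes here

    def __init__(self, **kwargs):
        # split the known columns from the arbitrary leftovers
        known = {k: kwargs.pop(k) for k in ('make', 'model', 'year') if k in kwargs}
        super().__init__(**known, extra=kwargs)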
