I'm trying to mirror a PostgreSQL + PostGIS database that I defined with SQLAlchemy to a SQLite (SpatiaLite) file database. The session.merge() method appears to work for adding the instances queried from the first session to the other session, but it does not scale to the nearly a million rows I need to copy. The example below copies data from one in-memory SQLite database to another for the sake of easy reproducibility. I'm looking for an approach (potentially completely different from what I'm doing now) to efficiently move all the data from one database to the other.
from sqlalchemy import create_engine
from sqlalchemy import Table, Column, Integer, ForeignKey, String
from sqlalchemy.orm import declarative_base, sessionmaker
from sqlalchemy.orm import relationship, joinedload

engine_0 = create_engine('sqlite:///:memory:', echo=True)
engine_1 = create_engine('sqlite:///:memory:', echo=True)

Base = declarative_base()
Session0 = sessionmaker(bind=engine_0)
Session1 = sessionmaker(bind=engine_1)

# Define ORM models
association_table = Table('association', Base.metadata,
    Column('parent_id', ForeignKey('parent.id'), primary_key=True),
    Column('child_id', ForeignKey('child.id'), primary_key=True)
)

class Parent(Base):
    __tablename__ = 'parent'
    id = Column(Integer, primary_key=True)
    name = Column(String)
    children = relationship(
        "Child",
        secondary=association_table,
        back_populates="parents")

class Child(Base):
    __tablename__ = 'child'
    id = Column(Integer, primary_key=True)
    name = Column(String)
    parents = relationship(
        "Parent",
        secondary=association_table,
        back_populates="children")

# Create schema
Base.metadata.create_all(engine_0)
Base.metadata.create_all(engine_1)

# Create some example instances
# Children
bart = Child(name='Bart')
lisa = Child(name='Lisa')
maggie = Child(name='Maggie')
milhouse = Child(name='Milhouse')

# Parents
homer = Parent(name='Homer',
               children=[bart, lisa, maggie])
marge = Parent(name='Marge',
               children=[bart, lisa, maggie])
flanders = Parent(name='Ned')
kirk = Parent(name='Kirk', children=[milhouse])

# Insert data into first database
session_0 = Session0()
session_0.add_all([homer, marge, flanders, kirk])
session_0.commit()

# Query the data and insert it into the second database
all_obj = session_0.query(Parent).options(joinedload('*')).all()
session_0.expunge_all()

session_1 = Session1()
for obj in all_obj:
    session_1.merge(obj)
session_1.commit()

# Make sure that 4 instances of Child are present in the second database
print(session_1.query(Child).all())
One alternative approach I have tried (unsuccessfully) is to make the parent objects transient using the sqlalchemy.orm.make_transient() function and to use session.add_all() instead of session.merge() to insert the objects into the second session. However, this does not propagate to the relationships: only the Parent objects are made transient.
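For reference, this is roughly what that attempt looked like (a sketch using the models and sessions above); make_transient() does not cascade to the Child objects reached through the relationship:
from sqlalchemy.orm import make_transient

all_obj = session_0.query(Parent).options(joinedload('*')).all()
for obj in all_obj:
    make_transient(obj)  # only the Parent instance itself becomes transient

session_1 = Session1()
session_1.add_all(all_obj)
session_1.commit()  # the related Child objects are still detached and are not inserted as new rows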
Related
I am new to sqlalchemy. I can create database tables by declarative mapping like this:
engine = create_engine("--engine works---")
Base = declarative_base()
class Customer(Base):
__tablename__ = 'customer'
customer_id = Column(Integer, primary_key=True)
name = Column(String(30))
email = Column(String(30))
invoices = relationship(
'Invoice',
order_by="Invoice.inv_id",
back_populates='customer',
cascade="all, delete, delete-orphan"
)
class Invoice(Base):
__tablename__ = 'invoice'
inv_id = Column(Integer, primary_key=True)
name = Column(String(30))
created = Column(Date)
customer_id = Column(ForeignKey('customer.customer_id'))
customer = relationship('Customer', back_populates='invoices')
Base.metadata.create_all(engine)
This is fine. I added some data into both the customer and invoice tables.
So far so good. Next, I tried out automap_base on this existing database like this:
from sqlalchemy import create_engine, select, text
from sqlalchemy.orm import declarative_base, sessionmaker
from sqlalchemy.ext.automap import automap_base

engine = create_engine('--engine works---')
Base = automap_base()

# reflect
Base.prepare(engine, reflect=True)
Customer = Base.classes.customer
Invoice = Base.classes.invoice

Session = sessionmaker(bind=engine, future=True)
session = Session()

# query invoice
stmt = select(Customer, Invoice).join(Customer.invoices).order_by(Customer.customer_id, Invoice.inv_id)
res = session.execute(stmt)
for c in res:
    print(c.customer_id)
When I ran the code, I got:
AttributeError: type object 'customer' has no attribute 'invoices'
What did I miss so that the relationship is set up on Customer (the one side) and Invoice (the many side), letting me query customers with their invoices attribute and invoices with their customer attribute? Thanks for any help.
By default, automap will create the relation in the parent by appending "_collection" to the lower-cased class name, so the name will be Customer.invoice_collection.
While answering this, I found that the join would raise an AttributeError on Customer.invoice_collection unless I performed a query on Customer beforehand, for example
import sqlalchemy as sa
session.execute(sa.select(Customer).where(False))
I'm not sure why that happens; however, you don't necessarily need the join, as you can iterate over Customer.invoice_collection directly, or join against the invoice table:
stmt = sa.select(Customer, Invoice).join(Invoice)
res = session.execute(stmt)
for c, i in res:
    print(c.customer_id, i.inv_id)
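If you would rather keep the attribute name invoices, automap's prepare() accepts a name_for_collection_relationship callable to control the naming. A minimal sketch, assuming the same reflected schema as above (the naive pluralization helper is just an example):
from sqlalchemy.ext.automap import automap_base

def name_collection(base, local_cls, referred_cls, constraint):
    # naive pluralization: 'invoice' -> 'invoices'
    return referred_cls.__name__.lower() + 's'

Base = automap_base()
Base.prepare(engine, reflect=True, name_for_collection_relationship=name_collection)
Customer = Base.classes.customer
Invoice = Base.classes.invoice
# Customer.invoices and Invoice.customer are now available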
I created the following ORM:
from sqlalchemy import Column, Integer, String, DateTime, Boolean, UniqueConstraint
from sqlalchemy.dialects.postgresql import JSONB
from sqlalchemy.ext.declarative import declarative_base

Base = declarative_base()

class TableA(Base):
    __tablename__ = 'table_a'
    id = Column(Integer, primary_key=True, nullable=False)
    identifier = Column(String(320))
    internal_id = Column(Integer)
    type = Column(String(32))
    time = Column(DateTime(timezone=True))
    success = Column(Boolean())
    parameters = Column(JSONB())

class TableB(Base):
    __tablename__ = 'table_b'
    __table_args__ = (UniqueConstraint('generate_action',
                                       'print_action',
                                       name='my_action_key'),)
    id = Column(Integer, primary_key=True, autoincrement=True, nullable=False)
    generate_action = Column(Integer)
    print_action = Column(Integer)
    coupon_code = Column(String(300))
    number_of_rebought_items = Column(Integer)
    seconds_between_rebuy = Column(Integer)
I'm trying to figure out how to convert the following raw SQL view to ORM syntax with sqlalchemy.
CREATE VIEW my_view AS
SELECT tb.id as table_b_id,
       tb.coupon_code as coupon_code,
       tb.number_of_rebought_items as number_of_rebought_items,
       ta.id as table_a_action_id,
       ta.time as time,
       ta.parameters as parameters
FROM table_b tb
LEFT JOIN table_a ta ON
    ta.id = tb.generate_action;
Couldn't find any good examples out there of how to do it with the ORM.
So far, my solution is to just run raw SQL to create this view.
Can anyone point me in the right direction, or give an example of how to create views with the SQLAlchemy ORM?
Is it possible to create the views with metadata.create_all()?
The library sqlalchemy-utils now includes functionality for creating views, and it associates the view with SQLAlchemy's metadata so that it is possible to create the view using Base.metadata.create_all.
Example:
# installation: pip install sqlalchemy-utils
from sqlalchemy_utils import create_view
from sqlalchemy import select, func

# engine Base & Table declaration elided for brevity

stmt = select([
    TableB.id.label('table_b_id'),
    TableB.coupon_code,
    TableB.number_of_rebought_items,
    TableA.id.label('table_a_action_id'),
    TableA.time,
    TableA.parameters
]).select_from(TableB.__table__.outerjoin(TableA, TableB.generate_action == TableA.id))

# attaches the view to the metadata using the select statement
view = create_view('my_view', stmt, Base.metadata)

# provides an ORM interface to the view
class MyView(Base):
    __table__ = view

# will create all tables & views defined with ``create_view``
Base.metadata.create_all()

# At this point running the following yields 0, as expected,
# indicating that the view has been constructed on the server
engine.execute(select([func.count('*')], from_obj=MyView)).scalar()
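Once mapped, the view can also be queried through an ORM session like any other model. A short usage sketch, assuming the elided engine from above:
from sqlalchemy.orm import sessionmaker

Session = sessionmaker(bind=engine)
session = Session()

# filter on the view's labelled columns just like table columns
for row in session.query(MyView).filter(MyView.coupon_code.isnot(None)):
    print(row.table_b_id, row.coupon_code)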
I have two models:
class Person(Model):
    id = Column(Integer, primary_key=True)
    name = Column(String)
    skills = relationship('Skill')

class Skill(Model):
    id = Column(Integer, primary_key=True)
    skill = Column(String)
    person_id = Column(Integer, ForeignKey('person.id'))
At the beginning, for example:
jack = Person(name='jack')
jack.skills = [Skill(skill=s) for s in ['python', 'ruby']]
jack.save()
Then, one day, jack lost his skill 'ruby' but earned 'swift',
so his skills are ['python', 'swift'].
My current way of doing this update is (a rough sketch follows the steps):
look for the existing skills, which gives me old = ['python', 'ruby']
get the new list, new = ['python', 'swift']
turn old and new into set(old) and set(new)
unchanged = old.intersection(new), i.e. the skills that do not change
I add every skill in set(new) - unchanged
I delete every skill in set(old) - unchanged
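A rough sketch of that manual diff, assuming an active session and that each Skill row is identified by its skill string:
old = {s.skill: s for s in jack.skills}   # existing Skill rows keyed by name
new = {'python', 'swift'}                 # the desired skill names

for name in set(old) - new:               # skills that were lost
    session.delete(old[name])

for name in new - set(old):               # skills that were earned
    jack.skills.append(Skill(skill=name))

session.commit()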
Is there an easier way to do this?
Use collection_class=set on the relationship to treat it as a set instead of a list.
Here's a working example of how to relate people with skills. This is a many-to-many relationship: instead of each skill being related to one person_id, each skill can be related to many people through the person_skill table. The relationship collection is a set, and Skill has a __hash__ method so that skills with the same name hash the same.
from sqlalchemy import create_engine, Column, Integer, String, ForeignKey, Table
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker, relationship

engine = create_engine('sqlite:///:memory:', echo=True)
Base = declarative_base(bind=engine)
Session = sessionmaker(bind=engine)
session = Session()

class Person(Base):
    __tablename__ = 'person'
    id = Column(Integer, primary_key=True)
    name = Column(String, nullable=False, unique=True)
    # many-to-many relation, as a set
    skills = relationship('Skill', 'person_skill', collection_class=set)

class Skill(Base):
    __tablename__ = 'skill'
    id = Column(Integer, primary_key=True)
    name = Column(String, nullable=False, unique=True)

    def __hash__(self):
        # so that the set collection will handle duplicate entries
        return hash((self.__class__, self.name))

# many-to-many table, relate a person to a skill
person_skill = Table(
    'person_skill', Base.metadata,
    Column('person_id', Integer, ForeignKey(Person.id), primary_key=True),
    Column('skill_id', Integer, ForeignKey(Skill.id), primary_key=True)
)

# create the tables
Base.metadata.create_all()

# populate some skills and people
s1 = Skill(name='python')
s2 = Skill(name='sqlalchemy')
s3 = Skill(name='questions')
s4 = Skill(name='ruby')
p1 = Person(name='davidism', skills={s1, s2, s4})
p2 = Person(name='user2653947', skills={s3})
session.add_all([p1, p2])
session.commit()

# change some skills on people
p1.skills.discard(s4)
p2.skills.add(s2)
session.commit()
This is not a complete solution. You could for instance plug in the unique object pattern demonstrated in this answer to make sure the skills you create are never duplicated.
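For illustration only, a much-simplified get-or-create helper in that spirit (the name unique_skill is made up here, and unlike the full recipe it queries the database each time):
def unique_skill(session, name):
    # reuse an existing Skill with this name, or create and add a new one
    skill = session.query(Skill).filter_by(name=name).one_or_none()
    if skill is None:
        skill = Skill(name=name)
        session.add(skill)
    return skill

# adding 'python' again reuses the existing row instead of violating the unique constraint
p2.skills.add(unique_skill(session, 'python'))
session.commit()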
I have a question about sqlalchemy and DB normalization.
I have a table called Accounts, and two kinds of persons: Natural_Persons and Legal_Persons.
I need to relate an account to just one person at a time.
For example, the account ID 4 is related with the Natural_Person ID 5.
But... how can I know, when I query that information, whether the ID 5 in the account record refers to a Natural Person or a Legal one?
The simplest solution (for me at the moment) is to add a new field to the Accounts table called person_type, and to use, for example, a char to differentiate them.
So now I have a record in the accounts table with the following data:
account_id = 4
person_id = 5
person_type = N
But now I want to use the DB with sqlalchemy.
If I load an account record through an Account class instance, then accessing the "person" attribute should check the person_type field and return an instance of the NaturalPerson class or the LegalPerson class, according to the case!
Something like:
acc = Account(4)
acc.person
"""
if person_type == "L", person returns a LegalPerson instance
but otherwise ...
"""
Single table inheritance is what you are looking for:
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker, relationship
from sqlalchemy import create_engine, Column, Integer, ForeignKey, String

Base = declarative_base()

class Account(Base):
    __tablename__ = 'account'
    id = Column(Integer, primary_key=True)
    person_id = Column(Integer, ForeignKey('person.id'))
    person = relationship("Person")

class Person(Base):
    __tablename__ = 'person'
    id = Column(Integer, primary_key=True)
    name = Column(String(50))
    type = Column(String(20))

    __mapper_args__ = {
        'polymorphic_on': type,
        'polymorphic_identity': 'base'
    }

class NaturalPerson(Person):
    __mapper_args__ = {
        'polymorphic_identity': 'natural'
    }

class LegalPerson(Person):
    __mapper_args__ = {
        'polymorphic_identity': 'legal'
    }

engine = create_engine('sqlite:///:memory:', echo=True)
Base.metadata.create_all(engine)

Session = sessionmaker(bind=engine)
session = Session()

a = Account()
np = NaturalPerson()
a.person = np
session.add(a)

a = session.query(Account).first()
print(type(a.person))
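As a quick follow-up sketch: querying the base class gives back instances of the matching subclass, while querying a subclass filters on the discriminator column automatically:
people = session.query(Person).all()           # [<NaturalPerson ...>]
naturals = session.query(NaturalPerson).all()  # same row, matched on type == 'natural'
legals = session.query(LegalPerson).all()      # [] -- no rows with type == 'legal' yet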
I wish to create a mapped attribute of an object which is populated from another table.
Using the SQLAlchemy documentation example, I wish to make a user_name field exist on the Address class such that it can be both easily queried and easily accessed (without a second round trip to the database).
For example, I wish to be able to query and filter by user_name: Address.query.filter(Address.user_name == 'wcdolphin').first()
I also want to access the user_name attribute of all Address objects without a performance penalty, and have it properly persist writes, as would be expected of a column of the table.
class User(Base):
    __tablename__ = 'users'
    id = Column(Integer, primary_key=True)
    name = Column(String(50))
    addresses = relation("Address", backref="user")

class Address(Base):
    __tablename__ = 'addresses'
    id = Column(Integer, primary_key=True)
    email = Column(String(50))
    user_name = Column(Integer, ForeignKey('users.name'))  # This line is wrong
How do I do this?
I found the documentation relatively difficult to understand, as it did not seem to conform to most examples, especially the Flask-SQLAlchemy examples.
You can do this with a join on the query object; there is no need to specify this attribute directly. So your model would look like:
from sqlalchemy import create_engine, Column, Integer, String, ForeignKey
from sqlalchemy.orm import sessionmaker, relation
from sqlalchemy.ext.declarative import declarative_base

Base = declarative_base()
engine = create_engine('sqlite:///')
Session = sessionmaker(bind=engine)

class User(Base):
    __tablename__ = 'users'
    id = Column(Integer, primary_key=True)
    name = Column(String(50))
    addresses = relation("Address", backref="user")

class Address(Base):
    __tablename__ = 'addresses'
    id = Column(Integer, primary_key=True)
    email = Column(String(50))
    user_id = Column(Integer, ForeignKey("users.id"))

Base.metadata.create_all(engine)
A query for addresses, filtered by the user name, looks like:
>>> session = Session()
>>> session.add(Address(user=User(name='test')))
>>> session.query(Address).join(User).filter(User.name == 'test').first()
<__main__.Address object at 0x02DB3730>
Edit: As you can directly access the user from an address object, there is no need to reference such an attribute directly on the Address class:
>>> a = session.query(Address).join(User).filter(User.name == 'test').first()
>>> a.user.name
'test'
If you truly want Address to have a SQL enabled version of "User.name" without the need to join explicitly, you need to use a correlated subquery. This will work in all cases but tends to be inefficient on the database side (particularly with MySQL), so there is possibly a performance penalty on the SQL side versus using a regular JOIN. Running some EXPLAIN tests may help to analyze how much of an effect there may be.
Another example of a correlated column_property() is at http://docs.sqlalchemy.org/en/latest/orm/mapped_sql_expr.html#using-column-property.
For the "set" event, a correlated subquery represents a read-only attribute, but an event can be used to intercept changes and apply them to the parent User row. Two approaches to this are presented below, one using regular identity map mechanics, which will incur a load of the User row if not already present, the other which emits a direct UPDATE to the row:
from sqlalchemy import *
from sqlalchemy.orm import *
from sqlalchemy.ext.declarative import declarative_base

Base = declarative_base()

class User(Base):
    __tablename__ = 'users'
    id = Column(Integer, primary_key=True)
    name = Column(String(50))
    addresses = relation("Address", backref="user")

class Address(Base):
    __tablename__ = 'addresses'
    id = Column(Integer, primary_key=True)
    user_id = Column(Integer, ForeignKey('users.id'))
    email = Column(String(50))

Address.user_name = column_property(select([User.name]).where(User.id == Address.user_id))

from sqlalchemy import event

@event.listens_for(Address.user_name, "set")
def _set_address_user_name(target, value, oldvalue, initiator):
    # use ORM identity map + flush
    target.user.name = value

    # use direct UPDATE
    # object_session(target).query(User).with_parent(target).update({'name': value})

e = create_engine("sqlite://", echo=True)
Base.metadata.create_all(e)

s = Session(e)
s.add_all([
    User(name='u1', addresses=[Address(email='e1'), Address(email='e2')])
])
s.commit()

a1 = s.query(Address).filter(Address.user_name == "u1").first()
assert a1.user_name == "u1"

a1.user_name = 'u2'
s.commit()

a1 = s.query(Address).filter(Address.user_name == "u2").first()
assert a1.user_name == "u2"