Combining SQLAlchemy yield_per and group_by - python

I have a SQLAlchemy database table spanning 24 hours, with up to 1,000,000 rows per hour. An example table is below.
from sqlalchemy import Column, Integer, String, create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
from random import choice
import pandas as pd

Base = declarative_base()

class WebsiteData(Base):
    __tablename__ = 'hourly_website_table'
    id = Column(Integer, primary_key=True)
    user = Column(String(600), index=True)
    website = Column(String(600))
    time_secs = Column(Integer, index=True)

class DataBaseManager:
    def __init__(self, db_loc='sqlite:////home/test/database.db'):
        self.engine = create_engine(db_loc, echo=False)
        self.table = WebsiteData

    def get_session(self):
        Session = sessionmaker(bind=self.engine)
        session = Session()
        Base.metadata.create_all(self.engine)
        return session

    def get_db_info(self):
        session = self.get_session()
        rows = session.query(self.table).count()
        session.close()
        return rows

    def df_to_hourly_db(self, table_name, df, time_secs):
        conn = self.engine.raw_connection()
        df['hour'] = time_secs
        query = ("INSERT OR REPLACE INTO %s (user, website, time_secs) "
                 "VALUES (?, ?, ?)" % table_name)
        conn.executemany(query, df[['user', 'website', 'hour']].to_records(index=False))
        conn.commit()
        conn.close()

def create_df(time_secs=0, users=10000, rows_per_user=100):
    user_arr = [("u%d" % i) for i in range(users)] * rows_per_user
    web_arr = [("www.website_%d" % (time_secs + i)) for i in range(rows_per_user * users)]
    return pd.DataFrame({'user': user_arr, 'website': web_arr})

DBM = DataBaseManager()
for hour in range(24):
    time_secs = (60 * 24 * 3600) + (hour * 3600)
    df = create_df(time_secs=time_secs, rows_per_user=choice(range(100)))
    DBM.df_to_hourly_db('hourly_website_table', df, time_secs)
The number of rows per hour is variable. In order to avoid having to load the entire table into memory at once, I would like to perform a group_by(table.time_secs) on the data and then stream each group sequentially. Is it possible to somehow combine SQLAlchemy's group_by and yield_per methods to achieve this? I know yield_per allows you to yield a set number of rows at a time, but is it possible to yield a different number of rows per iteration? If not, is there any other way of doing something similar?
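One possible way to get close to this, as a rough untested sketch against the model above (the batch size of 10000 is an arbitrary assumption): fetch the distinct time_secs values first, then stream each hour's rows with yield_per, so only one batch is held in memory at a time.

session = DataBaseManager().get_session()
# Hypothetical sketch: one query per hourly group, streamed in fixed-size batches
hours = [h for (h,) in session.query(WebsiteData.time_secs)
                              .distinct()
                              .order_by(WebsiteData.time_secs)]
for hour in hours:
    group = (session.query(WebsiteData)
             .filter(WebsiteData.time_secs == hour)
             .yield_per(10000))
    for row in group:
        pass  # process one hour's worth of rows here

This sidesteps group_by entirely; each group still varies in size, but yield_per keeps the per-batch memory bounded.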

Related

pandas: iterate over dataframe, do SQL query for each row

I have a dataframe and a 5 million row local Postgres database. For each row of the dataframe, I want to add a column that is the result of a query against the Postgres database.
This is what I have right now:
for index, row in df_tf.iterrows():
    row = dict(row)
    company_number = row['National ID']
    q = 'select name from companies where company_number=%s'
    cursor.execute(q, [company_number])
    results = cursor.fetchall()
    if len(results):
        row['name'] = results[0][0]
        writer.writerow(row)
    else:
        row['name'] = ''
        writer.writerow(row)
So I'm iterating over the rows and writing the results to a local CSV.
Is there a way I could do this more neatly, and keep the results in a local dataframe?
I know I could load the Postgres data into pandas and join directly, but it's rather large and slow, so I would prefer to use a Postgres query.
One way to do it is with sqlalchemy's declarative_base.
Rough code:
from sqlalchemy import create_engine, Column, String, Integer  # noqa
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import scoped_session
from sqlalchemy.orm import sessionmaker

base = declarative_base()
engine = create_engine("...")  # some stuff
session = scoped_session(sessionmaker(bind=engine))()

class Companies(base):
    __tablename__ = 'companies'
    name = Column(String)
    company_number = Column(Integer, primary_key=True)  # mapped classes need a primary key
    ...
    # other stuff

    @classmethod
    def get_by_company_number(cls, company_number):
        query = session.query(cls).filter(cls.company_number == company_number)
        if query.count() == 0:
            return ''
        else:
            return query.first().name

df_tf['name'] = df_tf['National ID'].apply(Companies.get_by_company_number)
df_tf.to_csv('filename.csv')
I think my first look would be something like (untested):
import pandas
import psycopg2
import csv
import contextlib

def get_company_name(cursor, company_number):
    query = 'SELECT name FROM companies WHERE company_number=%s;'
    cursor.execute(query, [company_number])
    results = cursor.fetchone()
    return results[0] if results else ''

df_tf = pandas.DataFrame("...")

with contextlib.ExitStack() as ctx:
    connection = ctx.enter_context(psycopg2.connect("..."))
    cursor = ctx.enter_context(connection.cursor())
    file_out = ctx.enter_context(open("results.csv", "w"))
    writer = csv.DictWriter(file_out, fieldnames=["National ID", "Name"])
    writer.writeheader()
    for _, row in df_tf.iterrows():
        row = dict(row)
        row['Name'] = get_company_name(cursor, row['National ID'])
        writer.writerow(row)
Depending on the data in the dataframe, it might be worth it to cache results from get_company_name(). I imagine there are better answers, but this is what I would try out of the gate.
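A minimal sketch of that caching idea, assuming National IDs repeat and reusing the get_company_name() helper above (the module-level cursor is just for illustration):

import functools

@functools.lru_cache(maxsize=None)
def get_company_name_cached(company_number):
    # identical company numbers only hit the database once
    return get_company_name(cursor, company_number)

# then, inside the loop:
# row['Name'] = get_company_name_cached(row['National ID'])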

SQLAlchemy - pass a dynamic tablename to query function?

I have a simple polling script that polls entries based on new IDs in an MSSQL table. I'm using SQLAlchemy's ORM to create a table class and then query that table. I want to be able to add more tables "dynamically" without coding it directly into the method.
My polling function:
def poll_db():
    query = db.query(
        Transactions.ID).order_by(Transactions.ID.desc()).limit(1)
    # Continually poll for new images to classify
    max_id_query = query
    last_max_id = max_id_query.scalar()
    while True:
        max_id = max_id_query.scalar()
        if max_id > last_max_id:
            print(
                f"New row(s) found. "
                f"Processing ids {last_max_id + 1} through {max_id}"
            )
            # Insert ML model
            id_query = db.query(Transactions).filter(
                Transactions.ID > last_max_id)
            df_from_query = pd.read_sql_query(
                id_query.statement, db.bind, index_col='ID')
            print(f"New query was made")
            last_max_id = max_id
        time.sleep(5)
My table model:
import sqlalchemy as db
from sqlalchemy import Boolean, Column, ForeignKey, Integer, String, Text
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import defer, relationship, query
from database import SessionLocal, engine

insp = db.inspect(engine)
db_list = insp.get_schema_names()

Base = declarative_base(cls=BaseModel)

class Transactions(Base):
    __tablename__ = 'simulation_data'
    sender_account = db.Column('sender_account', db.BigInteger)
    recipient_account = db.Column('recipient_account', db.String)
    sender_name = db.Column('sender_name', db.String)
    recipient_name = db.Column('recipient_name', db.String)
    date = db.Column('date', db.DateTime)
    text = db.Column('text', db.String)
    amount = db.Column('amount', db.Float)
    currency = db.Column('currency', db.String)
    transaction_type = db.Column('transaction_type', db.String)
    fraud = db.Column('fraud', db.BigInteger)
    swift_bic = db.Column('swift_bic', db.String)
    recipient_country = db.Column('recipient_country', db.String)
    internal_external = db.Column('internal_external', db.String)
    ID = Column('ID', db.BigInteger, primary_key=True)
QUESTION
How can I pass the table class name "dynamically", along the lines of poll_db(tablename) where tablename='Transactions', instead of writing similar queries for multiple tables, such as:
query = db.query(Transactions.ID).order_by(Transactions.ID.desc()).limit(1)
query2 = db.query(Transactions2.ID).order_by(Transactions2.ID.desc()).limit(1)
query3 = db.query(Transactions3.ID).order_by(Transactions3.ID.desc()).limit(1)
The tables will have identical structure, but different data.
I can't give you a full example right now (will edit later) but here's one hacky way to do it (the documentation will probably be a better place to check):
def dynamic_table(tablename):
    for class_name, cls in Base._decl_class_registry.items():
        # getattr guards against registry entries that are not mapped classes
        if getattr(cls, '__tablename__', None) == tablename:
            return cls

Transactions2 = dynamic_table("simulation_data")
assert Transactions2 is Transactions
The returned class is the model you want. Keep in mind that Base can only access the tables that have been subclassed already so if you have them in other modules you need to import them first so they are registered as Base's subclasses.
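With that helper, the polling function from the question could take the table name as an argument; a hedged sketch reusing the code above:

def poll_db(tablename):
    table = dynamic_table(tablename)
    max_id_query = db.query(table.ID).order_by(table.ID.desc()).limit(1)
    last_max_id = max_id_query.scalar()
    # ... continue with the same while loop as before, using `table`
    # wherever Transactions was referenced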
For selecting columns, something like this should work:
def dynamic_table_with_columns(tablename, *columns):
    cls = dynamic_table(tablename)
    subset = []
    for col_name in columns:
        # getattr with a default avoids an AttributeError for unknown names,
        # and the explicit None check avoids evaluating a column in boolean context
        column = getattr(cls, col_name, None)
        if column is not None:
            subset.append(column)
    # in case no columns were given
    if not subset:
        return db.query(cls)
    return db.query(*subset)
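Usage would then look something like this (the column name is taken from the model above):

# query only the ID column of the simulation_data table
id_rows = dynamic_table_with_columns('simulation_data', 'ID').all()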

Autoflush error and filter_by() query giving unexpected result

My goal is to read data off an Excel sheet and create a database on a SQL server. I am trying to write some sample code using SQLAlchemy, and I am new to it. The code that I have so far is:
import time
from sqlalchemy import create_engine, Column, Integer, Date, String, Table, MetaData, table
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker

engine = create_engine('sqlite:///:memory:', echo=False)
Base = declarative_base()

class blc(Base):
    __tablename__ = 'BLC_Databse'
    date = Column(String, primary_key=True)
    RES = Column(String)
    BTTLCOLUMN = Column(String)
    CS_HR = Column(Integer)

Base.metadata.create_all(engine)

sample = blc(date=time.strftime("%m/%d/%y"), RES='BDY_21', BTTLCOLUMN='2075', CS_HR=563)

Session = sessionmaker(bind=engine)
session = Session()

sample2 = blc(date=time.strftime("%m/%d/%y"), RES='BDY_21', BTTLCOLUMN='2076', CS_HR=375)
session.add(sample2)
session.commit()

with session.no_autoflush:
    result = session.query(blc).filter_by(RES='BDY_21').first()
    print(result)
When I perform the filter query (which I assume is similar to a WHERE clause in SQL), it gives <__main__.blc object at 0x00705770> as the result.
Eventually, I plan to run the insert in a loop that reads data from an Excel sheet.
Result is an object that references the class blc. To get the desired column, I had to do result.ColName.
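For example, with the model above (a small illustration, not from the original post):

result = session.query(blc).filter_by(RES='BDY_21').first()
print(result.RES, result.BTTLCOLUMN, result.CS_HR)  # prints the column values, not the object repr

Defining a __repr__ method on blc would also make print(result) show something readable instead of the default object address.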

Using window functions to LIMIT a query with SqlAlchemy on Postgres

I'm trying to write the following sql query with sqlalchemy ORM:
SELECT * FROM
(SELECT *, row_number() OVER(w)
FROM (select distinct on (grandma_id, author_id) * from contents) as c
WINDOW w AS (PARTITION BY grandma_id ORDER BY RANDOM())) AS v1
WHERE row_number <= 4;
This is what I've done so far:
s = Session()
unique_users_contents = (s.query(Content).distinct(Content.grandma_id,
Content.author_id)
.subquery())
windowed_contents = (s.query(Content,
func.row_number()
.over(partition_by=Content.grandma_id,
order_by=func.random()))
.select_from(unique_users_contents)).subquery()
contents = (s.query(Content).select_from(windowed_contents)
.filter(row_number >= 4)) ## how can I reference the row_number() value?
result = contents
for content in result:
print "%s\t%s\t%s" % (content.id, content.grandma_id,
content.author_id)
As you can see it's pretty much modeled, but I have no idea how to reference the row_number() result of the subquery from the outer query's WHERE clause. I tried something like windowed_contents.c.row_number and adding a label() call on the window function, but it's not working, and I couldn't find any similar example in the official docs or on Stack Overflow.
How can this be accomplished? And also, could you suggest a better way to do this query?
windowed_contents.c.row_number against a label() is how you'd do it, and it works for me (note that the select_entity_from() method is new in SQLA 0.8.2 and will be needed here in 0.9 vs. select_from()):
from sqlalchemy import *
from sqlalchemy.orm import *
from sqlalchemy.ext.declarative import declarative_base

Base = declarative_base()

class Content(Base):
    __tablename__ = 'contents'
    grandma_id = Column(Integer, primary_key=True)
    author_id = Column(Integer, primary_key=True)

s = Session()

unique_users_contents = s.query(Content).distinct(
    Content.grandma_id, Content.author_id).\
    subquery('c')

q = s.query(
    Content,
    func.row_number().over(
        partition_by=Content.grandma_id,
        order_by=func.random()).label("row_number")
).select_entity_from(unique_users_contents).subquery()

q = s.query(Content).select_entity_from(q).filter(q.c.row_number <= 4)

print(q)
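print(q) only shows the generated SQL; to actually fetch rows you would iterate the query as in the question, for example:

for content in q:
    print(content.grandma_id, content.author_id)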

Converting SQL commands to Python's ORM

How would you convert the following code to a Python ORM such as SQLAlchemy?
#1 Putting data to Pg
import os, pg, sys, re, psycopg2
#conn = psycopg2.connect("dbname='tkk' host='localhost' port='5432' user='noa' password='123'")
conn = psycopg2.connect("dbname=tk user=naa password=123")
cur = conn.cursor()
cur.execute("""INSERT INTO courses (course_nro)
VALUES ( %(course_nro)s )""", dict(course_nro='abcd'))
conn.commit()
#2 Fetching
cur.execute("SELECT * FROM courses")
print(cur.fetchall())
Examples of the two commands in SQLAlchemy:
insert
sqlalchemy.sql.expression.insert(table, values=None, inline=False, **kwargs)
select
sqlalchemy.sql.expression.select(columns=None, whereclause=None, from_obj=[], **kwargs)
After the initial declarations, you can do something like this:
o = Course(course_nro='abcd')
session.add(o)
session.commit()
and
print(session.query(Course).all())
The declarations could look something like this:
from sqlalchemy import *
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker

# create an engine, and a base class
engine = create_engine('postgresql://naa:123@localhost/tk')
DeclarativeBase = declarative_base(bind=engine)
metadata = DeclarativeBase.metadata

# create a session
Session = sessionmaker(bind=engine)
session = Session()

# declare the models
class Course(DeclarativeBase):
    __tablename__ = 'courses'
    course_nro = Column('course_nro', CHAR(12), primary_key=True)  # mapped classes need a primary key
This declarative method is just one way of using sqlalchemy.
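Since the question also cites the expression-language insert() and select(), here is a rough Core (non-ORM) equivalent, assuming the same connection details as above:

from sqlalchemy import create_engine, MetaData, Table, Column, CHAR

engine = create_engine('postgresql://naa:123@localhost/tk')
metadata = MetaData()
courses = Table('courses', metadata, Column('course_nro', CHAR(12), primary_key=True))

with engine.begin() as conn:
    # insert
    conn.execute(courses.insert().values(course_nro='abcd'))
    # select
    print(conn.execute(courses.select()).fetchall())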
Even though this is old, more examples can't hurt, right? I thought I'd demonstrate how to do this with PyORMish.
from pyormish import Model

class Course(Model):
    _TABLE_NAME = 'courses'
    _PRIMARY_FIELD = 'id'  # or whatever your primary field is
    _SELECT_FIELDS = ('id', 'course_nro')
    _COMMIT_FIELDS = ('course_nro',)

Model.db_config = dict(
    DB_TYPE='postgres',
    DB_CONN_STRING='postgre://naa:123#localhost/tk'
)
To create:
new_course = Course().create(course_nro='abcd')
To select:
# return the first row WHERE course_nro='abcd'
new_course = Course().get_by_fields(course_nro='abcd')
