pandas: iterate over dataframe, do SQL query for each row - python

I have a dataframe and a 5 million row local Postgres database. In each row of the dataframe, I want to add a column that is the result of a query against the Postgres database.
This is what I have right now:
for index, row in df_tf.iterrows():
    row = dict(row)
    company_number = row['National ID']
    q = 'select name from companies where company_number=%s'
    cursor.execute(q, [company_number])
    results = cursor.fetchall()
    if len(results):
        row['name'] = results[0][0]
        writer.writerow(row)
    else:
        row['name'] = ''
        writer.writerow(row)
So I'm iterating over the rows and writing the results to a local CSV.
Is there a way I could do this more neatly, and keep the results in a local dataframe?
I know I could load the Postgres data into pandas and join directly, but it's rather large and slow, so I would prefer to use a Postgres query.
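(As a side note, not from the answers below: since only the IDs that actually appear in the dataframe are needed, one rough, untested sketch is to fetch just those rows in a single query and merge locally. It assumes the National ID values have the same type as company_number in Postgres.)
import pandas as pd

# Fetch only the companies whose numbers appear in the dataframe.
ids = df_tf['National ID'].unique().tolist()
q = 'SELECT company_number, name FROM companies WHERE company_number = ANY(%s)'
cursor.execute(q, [ids])  # psycopg2 adapts a Python list to a Postgres array
names = pd.DataFrame(cursor.fetchall(), columns=['company_number', 'name'])

# Left-join back onto the original dataframe; companies with no match get ''.
df_out = df_tf.merge(names, left_on='National ID',
                     right_on='company_number', how='left')
df_out['name'] = df_out['name'].fillna('')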

One way to do it is with SQLAlchemy's declarative_base.
Rough code:
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import create_engine, Column, String, Integer  # noqa
from sqlalchemy.orm import scoped_session
from sqlalchemy.orm import sessionmaker

base = declarative_base()
engine = create_engine("...")  # connection URL goes here
session = scoped_session(sessionmaker(bind=engine))()

class Companies(base):
    __tablename__ = 'companies'
    name = Column(String)
    company_number = Column(Integer, primary_key=True)  # declarative models need a primary key
    # ... other columns / other stuff

    @classmethod
    def get_by_company_number(cls, company_number):
        query = session.query(cls).filter(cls.company_number == company_number)
        if query.count() == 0:
            return ''
        else:
            return query.first().name

df_tf['name'] = df_tf['National ID'].apply(Companies.get_by_company_number)
df_tf.to_csv('filename.csv')
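A small variant of that lookup (just a sketch reusing the Companies model and session defined above; get_company_name is a hypothetical helper name) avoids the extra round trip caused by query.count() followed by query.first():
def get_company_name(company_number):
    # .first() returns None when there is no match, so no separate count() query is needed.
    match = session.query(Companies).filter(
        Companies.company_number == company_number).first()
    return match.name if match is not None else ''

df_tf['name'] = df_tf['National ID'].apply(get_company_name)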

I think my first look would be something like (untested):
import pandas
import psycopg2
import csv
import contextlib

def get_company_name(cursor, company_number):
    query = 'SELECT name FROM companies WHERE company_number=%s;'
    cursor.execute(query, [company_number])
    results = cursor.fetchone()
    return results[0] if results else ''

df_tf = pandas.DataFrame("...")

with contextlib.ExitStack() as ctx:
    connection = ctx.enter_context(psycopg2.connect("..."))
    cursor = ctx.enter_context(connection.cursor())
    file_out = ctx.enter_context(open("results.csv", "w"))

    writer = csv.DictWriter(file_out, fieldnames=["National ID", "Name"])
    writer.writeheader()

    for _, row in df_tf.iterrows():
        row = dict(row)
        row['Name'] = get_company_name(cursor, row['National ID'])
        writer.writerow(row)
Depending on the data in the dataframe, it might be worth it to cache results from get_company_name(). I imagine there are better answers, but this is what I would try out of the gate.
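For example, if the same National ID occurs on many rows, a minimal caching sketch (untested, reusing the get_company_name() helper and cursor from the snippet above) could look like:
import functools

@functools.lru_cache(maxsize=None)
def get_company_name_cached(company_number):
    # Repeated company numbers are answered from the cache
    # instead of hitting Postgres again.
    return get_company_name(cursor, company_number)
The loop would then call get_company_name_cached(row['National ID']) instead.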

How to loop through queries and add the results to a single dataframe?

I have a Flask application that currently makes a connection to a remote server and then a particular DB. I have a function that takes the user's date range input and then queries data based upon the given dates.
from flask import make_response, redirect, render_template, session, url_for, Blueprint, request
import pandas as pd
import pymssql
from core.database import ConnectionFactory
from core.query import *

dat_a = Blueprint('dat_a', __name__)
ButtonPressed = 0

@dat_a.route('/download', methods=['POST', 'GET'])
def download():
    if session.get('logged_in') != True:
        return redirect(url_for('views.index'))
    else:
        if request.method == 'POST':
            return render_template('data.html')
        else:
            Start = request.args.get('Start Date')
            End = request.args.get('End Date')
            connection = pymssql.connect(ConnectionFactory.SQL_HOST, ConnectionFactory.SQL_USER_xxx,
                                         ConnectionFactory.SQL_PW_xxx, ConnectionFactory.SQL_DB_xxx)
            cur = connection.cursor()
            query_1 = """
                select some column from table
                from xx
                where date BETWEEN (%s) AND (%s)
                """
            query_2 = """
                select some column from table
                """
            results = []
            q_list = [query_1, query_2]
            for query in q_list:
                cur.execute(query, (Start, End))
                results.append(cur)
                print(results)
            columns = [d[0] for d in cur.description]
            data = pd.DataFrame(cur.fetchall(), columns=columns)
            print(data)
            resp = make_response(data.to_csv())
            resp.headers["Content-Disposition"] = "attachment; filename=Data.csv"
            resp.headers["Content-Type"] = "text/csv"
            return resp
The problem I'm having is getting my for loop to store the data from query_1 into a list and then move it to my dataframe. Currently data from query_2 will be stored into the list and then pushed to the dataframe, but that's inaccurate as it's just fetching random data at that point.
results = []
q_list = [query_1, query_2]
for query in q_list:
    cur.execute(query, (Start, End))
    results.append(cur)
    print(results)
columns = [d[0] for d in cur.description]
data = pd.DataFrame(cur.fetchall(), columns=columns)
I have tried a nested for loop that calls each query separately, with its own connection and dataframe, but that didn't change the results either. Is there a more efficient way to go about this with pandas?
You might be looking for something like this...
cur = connection.cursor()

results = []
for query, args in [
    ("select some_column from xx where date BETWEEN (%s) AND (%s)", (Start, End)),
    ("select some_column from yy", ()),
]:
    cur.execute(query, args)
    results.extend(cur.fetchall())
    # Will be overwritten on each iteration, but that's fine as long as
    # the columns are the same for each query
    columns = [d[0] for d in cur.description]

data = pd.DataFrame(results, columns=columns)
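If the two queries do not return the same columns, a variation (just a sketch, same assumptions as above) is to build one dataframe per query and concatenate them afterwards:
frames = []
for query, args in [
    ("select some_column from xx where date BETWEEN (%s) AND (%s)", (Start, End)),
    ("select some_column from yy", ()),
]:
    cur.execute(query, args)
    columns = [d[0] for d in cur.description]
    frames.append(pd.DataFrame(cur.fetchall(), columns=columns))

# Stack the per-query results; columns present in only one query become NaN.
data = pd.concat(frames, ignore_index=True)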

Open database files (.db) using python

I have a database file (.db) in SQLite3 format and I was attempting to open it to look at the data inside. Below is my attempt at the code using Python.
import sqlite3
# Create a SQL connection to our SQLite database
con = sqlite3.connect(dbfile)
cur = con.cursor()
# The result of a "cursor.execute" can be iterated over by row
for row in cur.execute("SELECT * FROM "):
print(row)
# Be sure to close the connection
con.close()
For the line ("SELECT * FROM "), I understand that you have to put the name of the table after the word "FROM"; however, since I can't even open the file in the first place, I have no idea what name to put. How can I write the code so that I can open the database file and read its contents?
So, you analyzed it correctly: after the FROM you have to put in the table name. You can find the table names like this:
SELECT name FROM sqlite_master WHERE type = 'table'
In code this looks like this:
# loading in modules
import sqlite3
# creating file path
dbfile = '/home/niklas/Desktop/Stuff/StockData-IBM.db'
# Create a SQL connection to our SQLite database
con = sqlite3.connect(dbfile)
# creating cursor
cur = con.cursor()
# reading all table names
table_list = [a for a in cur.execute("SELECT name FROM sqlite_master WHERE type = 'table'")]
# here is you table list
print(table_list)
# Be sure to close the connection
con.close()
That worked very well for me. Your code for reading the data is already right; just paste in the table names.
If you want to see the data as a pandas dataframe for visual analysis, the approach below could also be used.
import pandas as pd
import sqlite3

try:
    conn = sqlite3.connect("file.db")
except Exception as e:
    print(e)

# Now, in order to read into a pandas dataframe, we need to know the table name
cursor = conn.cursor()
cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
print(f"Table Name : {cursor.fetchall()}")

df = pd.read_sql_query('SELECT * FROM Table_Name', conn)
conn.close()
The same kind of access also works with SQLAlchemy table reflection; the snippet below reflects an existing users table and queries it (shown here inside a Flask app):
from flask import Flask
app = Flask(__name__)

from sqlalchemy import create_engine, select, MetaData, Table
from sqlalchemy.sql import and_, or_

# note: for a local SQLite file the URL is simply 'sqlite:///path/to/databasename.db'
engine = create_engine('sqlite://username:password@host/databasename')

class UserModel():
    def __init__(self):
        try:
            self.meta = MetaData()
            self.users = Table("users", self.meta, autoload=True, autoload_with=engine)
        except Exception as e:
            print(e)

    def get(self):
        stmt = select([self.users.c.name, self.users.c.email, self.users.c.password])
        print(stmt)
        result = engine.execute(stmt)
        temp = [dict(r) for r in result] if result else None
        print(temp)
        return temp

Combining SQLAlchemy yield_per and group_by

I have a SQLAlchemy database table spanning 24 hours, with up to 1,000,000 rows per hour. Example table below.
from sqlalchemy import Column, Integer, String, create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
from random import choice
import pandas as pd

Base = declarative_base()

class WebsiteData(Base):
    __tablename__ = 'hourly_website_table'
    id = Column(Integer, primary_key=True)
    user = Column(String(600), index=True)
    website = Column(String(600))
    time_secs = Column(Integer, index=True)

class DataBaseManager:
    def __init__(self, db_loc='sqlite:////home/test/database.db'):
        self.engine = create_engine(db_loc, echo=False)
        self.table = WebsiteData

    def get_session(self):
        Session = sessionmaker(bind=self.engine)
        session = Session()
        Base.metadata.create_all(self.engine)
        return session

    def get_db_info(self):
        session = self.get_session()
        rows = session.query(self.table).count()
        session.close()
        return rows

    def df_to_hourly_db(self, table_name, df, time_secs):
        conn = self.engine.raw_connection()
        df['hour'] = time_secs
        query = ("INSERT OR REPLACE INTO %s (user,website,time_secs) VALUES (?,?,?)"
                 % table_name)
        conn.executemany(query, df[['user', 'website', 'hour']].to_records(index=False))
        conn.commit()
        conn.close()

def create_df(time_secs=0, users=10000, rows_per_user=100):
    user_arr = [("u%d" % i) for i in range(users)] * rows_per_user
    web_arr = [("www.website_%d" % (time_secs + i)) for i in range(rows_per_user * users)]
    return pd.DataFrame({'user': user_arr, 'website': web_arr})

DBM = DataBaseManager()
for hour in range(24):
    time_secs = (60 * 24 * 3600) + (hour * 3600)
    df = create_df(time_secs=time_secs, rows_per_user=choice(range(100)))
    DBM.df_to_hourly_db('hourly_website_table', df, time_secs)
The number of rows per hour is variable. In order to avoid having to load the entire table into memory at once, I would like to perform a group_by(table.time_secs) on the data and then stream each group sequentially. Is it possible to somehow combine SQLAlchemy's group_by and yield_per methods to achieve this? I know yield_per allows you to yield a set number of rows at a time, but is it possible to yield a different number of rows per iteration? If not, is there any other way of doing something similar?
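(One possible direction, untested and not a confirmed answer: instead of combining group_by with yield_per, first fetch the distinct time_secs values and then stream each hour's rows separately, using yield_per only to bound memory within an hour. This is a sketch against the WebsiteData model above.)
session = DBM.get_session()

# Hours actually present in the table.
hours = [h for (h,) in session.query(WebsiteData.time_secs).distinct()]

for hour in hours:
    hourly_rows = (
        session.query(WebsiteData)
        .filter(WebsiteData.time_secs == hour)
        .yield_per(10000)  # keeps only a window of rows in memory per hour
    )
    for row in hourly_rows:
        pass  # process one row at a time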

SQLAlchemy - How can I extract a table from an sqlite file?

I want to extract table information from an sqlite file.
I could list all the table names following this page, and tried to extract the table information using the query method on the session instance, but I got the following error.
sqlalchemy.exc.OperationalError: (sqlite3.OperationalError) no such column: ComponentSizes [SQL: 'SELECT ComponentSizes']
Does anyone know how I should revise the following code in order to extract a table by specifying the table name?
from sqlalchemy import create_engine, inspect
from sqlalchemy.orm import sessionmaker

class read():
    def __init__(self, path):
        engine = create_engine("sqlite:///" + path)
        inspector = inspect(engine)
        for table_name in inspector.get_table_names():
            for column in inspector.get_columns(table_name):
                # print("Column: %s" % column['name'])
                print(table_name + " : " + column['name'])
        Session = sessionmaker(bind=engine)
        self.session = Session()

    def getTable(self, name):
        table = self.session.query(name).all()
        return table

if __name__ == '__main__':
    test = read(sqlFile)
    test.getTable('ComponentSizes')
The error you are getting is suggestive of what is going wrong. Your code translates into the SQL SELECT ComponentSizes, which is incomplete. It's not clear what your end goal is. If you want to extract the contents of a table into CSV, you could do this:
import sqlite3
import csv

con = sqlite3.connect('mydatabase.db')
outfile = open('mydump.csv', 'w', newline='')
outcsv = csv.writer(outfile)

cursor = con.execute('select * from ComponentSizes')

# dump column titles (optional)
outcsv.writerow(x[0] for x in cursor.description)
# dump rows
outcsv.writerows(cursor.fetchall())

outfile.close()
Otherwise, if you want the contents of the table in a pandas dataframe for further analysis, you could do this:
import sqlite3
import pandas as pd
# Create your connection.
cnx = sqlite3.connect('file.db')
df = pd.read_sql_query("SELECT * FROM ComponentSizes", cnx)
Hope it helps. Happy coding!

Insert and update with core SQLAlchemy

I have a database that I don't have metadata or orm classes for (the database already exists).
I managed to get the select stuff working by:
from sqlalchemy.sql.expression import ColumnClause
from sqlalchemy.sql import table, column, select, update, insert
from sqlalchemy.ext.declarative import *
from sqlalchemy.orm import sessionmaker
from sqlalchemy import create_engine
import pyodbc
db = create_engine('mssql+pyodbc://pytest')
Session = sessionmaker(bind=db)
session = Session()
list = []
list.append (column("field1"))
list.append (column("field2"))
list.append (column("field3"))
s = select(list)
s.append_from('table')
s.append_whereclause("field1 = 'abc'")
s = s.limit(10)
result = session.execute(s)
out = result.fetchall()
print(out)
So far so good.
The only way I can get an update/insert working is by executing a raw query like:
session.execute(<Some sql>)
I would like to make it so I can make a class out of that like:
u = Update("table")
u.Set("file1","some value")
u.Where(<some condition>)
session.execute(u)
Tried (this is just one of the approaches I tried):
i = insert("table")
v = i.values([{"name":"name1"}, {"name":"name2"}])
u = update("table")
u = u.values({"name": "test1"})
I can't get that to execute on:
session.execute(i)
or
session.execute(u)
Any suggestion how to construct an insert or update without writing ORM models?
As you can see from the SQLAlchemy Overview documentation, SQLAlchemy is built with two layers: ORM and Core. Currently you are using only some constructs of the Core and building everything manually.
To use Core, you should let SQLAlchemy know some meta information about your database so it can operate on it. Assuming you have a table mytable with columns field1, field2, field3 and a defined primary key, the code below should perform all the tasks you need:
from sqlalchemy import MetaData, Table
from sqlalchemy.sql import select, update, insert

# define meta information
# ('engine' is the create_engine(...) object, called 'db' in the question)
metadata = MetaData(bind=engine)
mytable = Table('mytable', metadata, autoload=True)

# select
s = mytable.select()  # or:
#s = select([mytable])  # or (if only certain columns):
#s = select([mytable.c.field1, mytable.c.field2, mytable.c.field3])
s = s.where(mytable.c.field1 == 'abc')
result = session.execute(s)
out = result.fetchall()
print(out)

# insert
i = insert(mytable)
i = i.values({"field1": "value1", "field2": "value2"})
session.execute(i)

# update
u = update(mytable)
u = u.values({"field3": "new_value"})
u = u.where(mytable.c.id == 33)
session.execute(u)

# commit, so the insert and update actually persist
session.commit()
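Since the goal is to avoid ORM models entirely, the same statements can also be executed without a Session, directly on a connection (a sketch, assuming the SQLAlchemy 1.x API used above):
with engine.connect() as conn:
    conn.execute(i)                   # insert
    conn.execute(u)                   # update
    out = conn.execute(s).fetchall()  # select
    print(out)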
