How to serialize the complex query (peewee) - python

I am using peewee as my ORM, and my goal is to serialize the result of a complex query which also contains subqueries:
# Alias of the usage table so it can be aggregated independently of the
# un-aliased RecordDailyMachineUsage used further down.
machine_usage_alias = RecordDailyMachineUsage.alias()
# Per machine_id, the most recent usage date (exposed as 'max_date').
subquery = (
machine_usage_alias.select(
machine_usage_alias.machine_id,
fn.MAX(machine_usage_alias.date).alias('max_date'),
)
.group_by(machine_usage_alias.machine_id)
.alias('machine_usage_subquery')
)
# Usage rows whose (machine_id, date) match the per-machine latest date above.
record_subquery = RecordDailyMachineUsage.select(
RecordDailyMachineUsage.machine_id, RecordDailyMachineUsage.usage
).join(
subquery,
on=(
(RecordDailyMachineUsage.machine_id == subquery.c.machine_id)
& (RecordDailyMachineUsage.date == subquery.c.max_date)
),
)
# One row per machine: its own columns, the usage value from record_subquery
# and the distinct tag / project names concatenated by GROUP_CONCAT.
query = (
Machine.select(
Machine.id, # 0
Machine.name,
Machine.location,
Machine.arch,
Machine.platform,
Machine.machine_version,
Machine.status,
record_subquery.c.usage.alias('usage'),
fn.GROUP_CONCAT(Tag.name.distinct()).alias('tags_list'),
fn.GROUP_CONCAT(Project.full_name.distinct()).alias('projects_list'),
) # 10
.join(MachineTag)
.join(Tag)
# Return the join context to Machine before joining the project tables.
.switch(Machine)
.join(MachineProject)
.join(Project)
# LEFT OUTER: the usage subquery is joined without filtering out machines
# that have no matching usage row.
.join(
record_subquery,
JOIN.LEFT_OUTER,
on=(Machine.id == record_subquery.c.machine_id),
)
.where((Machine.id != 0) & (Machine.is_alive == 1))
.group_by(Machine.id)
)
I've tried to use the method model_to_dict:
jsonify({'rows': [model_to_dict(c) for c in query]})
But this way gives me the columns and values from the Machine model only. My aim is to include all the columns from the select query.

It turned out that I had to use the dicts method of the query and jsonify the result.
# Alias of the usage table used only to find each machine's latest date.
machine_usage_alias = RecordDailyMachineUsage.alias()
# machine_id -> MAX(date), exposed as 'max_date'.
subquery = (
machine_usage_alias.select(
machine_usage_alias.machine_id,
fn.MAX(machine_usage_alias.date).alias('max_date'),
)
.group_by(machine_usage_alias.machine_id)
.alias('machine_usage_subquery')
)
# Usage rows restricted to each machine's latest date.
record_subquery = RecordDailyMachineUsage.select(
RecordDailyMachineUsage.machine_id, RecordDailyMachineUsage.usage
).join(
subquery,
on=(
(RecordDailyMachineUsage.machine_id == subquery.c.machine_id)
& (RecordDailyMachineUsage.date == subquery.c.max_date)
),
)
# Same select as in the question; the only change is the trailing .dicts().
query = (
Machine.select(
Machine.id, # 0
Machine.name,
Machine.location,
Machine.arch,
Machine.platform,
Machine.machine_version,
Machine.status,
record_subquery.c.usage.alias('usage'),
fn.GROUP_CONCAT(Tag.name.distinct()).alias('tags_list'),
fn.GROUP_CONCAT(Project.full_name.distinct()).alias('projects_list'),
) # 10
.join(MachineTag)
.join(Tag)
.switch(Machine)
.join(MachineProject)
.join(Project)
.join(
record_subquery,
JOIN.LEFT_OUTER,
on=(Machine.id == record_subquery.c.machine_id),
)
.where((Machine.id != 0) & (Machine.is_alive == 1))
.group_by(Machine.id)
# .dicts() is what makes the aliased columns ('usage', 'tags_list',
# 'projects_list') show up in the serialized rows, per the answer text.
).dicts()
# Materialize the rows into a list so jsonify can serialize them.
return jsonify({'rows': [c for c in query]})

Related

pd.read_sql - Unsupported format character error (0x27)

As above, I'm trying to use pd.read_sql to query our mysql database, and getting an error for double/single quotes.
When I remove the % operators from the LIKE clause (lines 84-87) the query runs, but these are needed. I know I need to format the strings but I don't know how within such a big query.
Here's the query:
-- Report of offers whose current lowest monthly payment is at least 15 percent
-- below the previous week's lowest payment, one row per cap_id.
SELECT
s.offer_id,
s.cap_id,
vi.make,
vi.model,
vi.derivative,
i.vehicle_orders,
s.lowest_offer,
CASE
WHEN f.previous_avg = f.previous_low THEN "n/a"
ELSE FORMAT(f.previous_avg, 2)
END as previous_avg,
f.previous_low,
CASE
WHEN ( ( (s.lowest_offer - f.previous_avg) / f.previous_avg) * 100) = ( ( (s.lowest_offer - f.previous_low) / f.previous_low) * 100) THEN "n/a"
ELSE CONCAT(FORMAT( ( ( (s.lowest_offer - f.previous_avg) / f.previous_avg) * 100), 2), "%")
END as diff_avg,
CONCAT(FORMAT( ( ( (s.lowest_offer - f.previous_low) / f.previous_low) * 100), 2), "%") as diff_low,
s.broker,
CASE
WHEN s.in_stock = '1' THEN "In Stock"
ELSE "Factory Order"
END as in_stock,
CASE
WHEN s.special IS NOT NULL THEN "Already in Specials"
ELSE "n/a"
END as special
FROM
-- s: lowest monthly payment per cap_id among offers created or modified
-- within the last day, for the fixed deposit/term/mileage/finance profile.
( SELECT o.id as offer_id,
o.cap_id as cap_id,
MIN(o.monthly_payment) as lowest_offer,
b.name as broker,
o.stock as in_stock,
so.id as special
FROM
offers o
INNER JOIN brands b ON ( o.brand_id = b.id )
LEFT JOIN special_offers so ON ( so.cap_id = o.cap_id )
WHERE
( o.date_modified >= DATE_ADD(NOW(), INTERVAL -1 DAY) OR o.date_created >= DATE_ADD(NOW(), INTERVAL -1 DAY) )
AND o.deposit_value = 9
AND o.term = 48
AND o.annual_mileage = 8000
AND o.finance_type = 'P'
AND o.monthly_payment > 100
GROUP BY
o.cap_id
ORDER BY
special DESC) s
INNER JOIN
-- f: average and lowest monthly payment per cap_id for offers modified
-- between one week ago and one day ago, same profile as above.
( SELECT o.cap_id as cap_id,
AVG(o.monthly_payment) as previous_avg,
MIN(o.monthly_payment) as previous_low
FROM
offers o
WHERE
o.date_modified < DATE_ADD(NOW(), INTERVAL -1 DAY)
AND o.date_modified >= DATE_ADD(NOW(), INTERVAL -1 WEEK)
AND o.deposit_value = 9
AND o.term = 48
AND o.annual_mileage = 8000
AND o.finance_type = 'P'
AND o.monthly_payment > 100
GROUP BY
o.cap_id ) f ON ( s.cap_id = f.cap_id )
LEFT JOIN
-- i: order counts per vehicle over the last month; cap_id is cut out of the
-- serialized offer text and test-like names/emails are excluded.
( SELECT a.cap_id as cap_id,
v.manufacturer as make,
v.model as model,
v.derivative as derivative,
COUNT(*) as vehicle_orders
FROM
( SELECT o.id,
o.name as name,
o.email as email,
o.date_created as date,
SUBSTRING_INDEX(SUBSTRING(offer_serialized, LOCATE("capId", offer_serialized) +12, 10), '"', 1) as cap_id
FROM moneyshake.orders o
-- The LIKE wildcards below are the percent signs that the driver misreads
-- as format characters when the statement is sent as a plain string.
WHERE o.name NOT LIKE 'test%'
AND o.email NOT LIKE 'jawor%'
AND o.email NOT LIKE 'test%'
AND o.email NOT LIKE '%moneyshake%'
AND o.phone IS NOT NULL
AND o.date_created > DATE_ADD(NOW(), INTERVAL -1 MONTH)
) a JOIN moneyshake.vehicles_view v ON a.cap_id = v.id
GROUP BY
v.manufacturer,
v.model,
v.derivative,
a.cap_id) i ON ( f.cap_id = i.cap_id )
INNER JOIN
-- vi: make/model/derivative lookup keyed by vehicle id.
( SELECT v.id as id,
v.manufacturer as make,
v.model as model,
v.derivative as derivative
FROM moneyshake.vehicles_view v
GROUP BY v.id ) vi ON s.cap_id = vi.id
WHERE
( ( s.lowest_offer - f.previous_low ) / f.previous_low) * 100 <= -15
GROUP BY
s.cap_id
Thanks!
That error occurs when the DBAPI layer (e.g., mysqlclient) natively uses the "format" paramstyle and the percent sign (%) is misinterpreted as a format character instead of a LIKE wildcard.
The fix is to wrap the SQL statement in a SQLAlchemy text() object. For example, this will fail:
import pandas as pd
import sqlalchemy as sa

# In a SQLAlchemy database URL the credentials are separated from the host
# by "@" (user:password@host:port/database).
engine = sa.create_engine("mysql+mysqldb://scott:tiger@localhost:3307/mydb")
sql = """\
SELECT * FROM million_rows
WHERE varchar_col LIKE 'record00000%'
ORDER BY id
"""
# Passing the plain string is the failing case: the "%" of the LIKE pattern
# reaches the driver as a format character.
df = pd.read_sql_query(sql, engine)
but simply changing the read_sql_query() call to
df = pd.read_sql_query(sa.text(sql), engine)
will work.

Problems with group_by SqlAlchemy

I'm having a problem with SqlAlchemy and a group_by clause. See the SqlAlchemy query below.
I've got a SqlAlchemy query that includes a group_by clause and it's raising an exception, '(cx_Oracle.DatabaseError) ORA-00979: not a GROUP BY expression'. However, when I get the SQL generated by the SqlAlchemy query and run that manually, the query works fine.
I'm not sure how to figure out what's wrong with the group_by clause. How can I debug this problem and figure out what I can do to fix it?
# create shorthand aliases
b = db.aliased(Batch)
bs = db.aliased(BatchingStatus)
bp = db.aliased(BatchPress)
bst = db.aliased(BatchState)
bit = db.aliased(BatchItem)
# NOTE: this alias shadows the builtin bin() for the rest of the module.
bin = db.aliased(BatchInput)
bpri = db.aliased(BatchPriority)
lcu = db.aliased(LCUser)
s = db.aliased(SubBatch)
w = db.aliased(WorkType)
ptw = db.aliased(LCProductToWorkType)
ctp = db.aliased(LCCategoryToProduct)
c = db.aliased(LCCategory)
# for correlated subquery
# Product names of active category-to-product rows whose category path
# (lower-cased) equals `category`; used below in bs.product.in_(subq).
subq = (
db.session.query(ctp.product_name)
.join(c, c.category_id == ctp.category_id)
.filter(func.lower(c.category_path) == category)
.filter(ctp.active == 1)
)
# start of problem query
q = db.session.query(
b.batch_signature.label('batch_signature'),
b.batch_num,
b.created_date.label('created_date'),
bst.code.label('batch_state'),
# Aggregate 1 of 2: earliest promise date among the grouped inputs.
func.min(bin.promise_date).label('due_out'),
bs.job_status,
bp.press_id.label('press_id'),
bp.description.label('press_description'),
bp.code.label('press_code'),
bp.active.label('press_active'),
# Aggregate 2 of 2: comma-separated item ids, ordered by item id.
func.listagg(bin.item_id, ',').within_group(bin.item_id).label('subbatches'),
bs.item_count.label('item_count'),
bs.product.label('product'),
bpri.code.label('priority'),
ptw.display_format.label('product_display_format'),
c.display_name.label('category_display_name'),
lcu.coalesce_first_name,
lcu.coalesce_last_name,
lcu.coalesce_email,
) \
.join(bs, (bs.batch_signature == b.batch_signature) & (bs.press_id == b.press_id)) \
.join(bp, bp.press_id == b.press_id) \
.join(bst, bst.state_id == b.state_id) \
.join(bit, bit.batch_id == b.batch_id) \
.join(bin, bin.batch_input_id == bit.batch_input_id) \
.join(bpri, bpri.priority_id == bin.priority_id) \
.join(lcu, lcu.username == bs.actor) \
.join(s, s.subbatchno == func.to_char(bin.item_id)) \
.join(w, w.worktypeenum == s.worktypeenum) \
.join(ptw, ptw.worktypeenum == w.worktypeenum) \
.join(ctp, ctp.category_to_product_id == ptw.category_to_product_id) \
.join(c, c.category_id == ctp.category_id) \
.filter(bs.product.in_(subq)) \
.filter(b.state_id <= 200) \
.group_by(
# Every non-aggregated column of the select list is repeated here.
# NOTE(review): ORA-00979 while the printed SQL runs fine by hand may mean
# the SELECT list and GROUP BY are rendered with different bind parameters
# (e.g. inside the hybrid coalesce_* expressions) - assumption, confirm
# against the compiled statement and its parameters.
b.batch_signature,
b.batch_num,
b.created_date,
bst.code,
bs.job_status,
bp.press_id,
bp.description,
bp.code,
bp.active,
bs.item_count,
bs.product,
bpri.code,
ptw.display_format,
c.display_name,
lcu.coalesce_first_name,
lcu.coalesce_last_name,
lcu.coalesce_email,
) \
.order_by('batch_signature', 'batch_num', 'created_date')
# Execute the query; any failure is only printed, so `retval` stays unbound
# when the database raises.
try:
    retval = q.all()
except Exception as e:
    print(e)
The above doesn't show the models, some of which have @hybrid_property/@<name>.expression methods, like the lcu.coalesce_first_name columns, which are an attempt to hide the func.coalesce code that I thought was causing the group_by problems.

How to make a subquery in Sqlalchemy using not in with two fields?

I need to filter using NOT IN, but on two fields at once.
# Base query: the four entities plus the sbprecobruto() SQL function result typed as Float.
q = db_session.query(Necessidade, WFLeilao, BidHeader, BidItemPrice, func.sbprecobruto(BidItemPrice.bid_number,BidItemPrice.line_number, Necessidade.identportal, type_=Float))
q = q.join(WFLeilao, and_(Necessidade.numeroportal == WFLeilao.leilao, Necessidade.numeroitemportal == WFLeilao.itemleilao))
# NOTE: this ON clause refers to BidItemPrice, which is only joined on the next line.
q = q.join(BidHeader, and_(BidHeader.bid_number == BidItemPrice.bid_number))
q = q.join(BidItemPrice, and_(BidItemPrice.auction_header_id == WFLeilao.leilao, BidItemPrice.auction_line_number == WFLeilao.itemleilao, BidItemPrice.bid_number == WFLeilao.lance, BidItemPrice.line_number == WFLeilao.itemlance))
# Two-column subquery: the (auction, item) pairs that must be excluded.
subquery = db_session.query(ItfRetornoPedido.num_leilao_superbuy, ItfRetornoPedido.num_item_leilao_superbuy).filter_by(status_comprador=1).filter_by(acao='I').filter_by(empresa='NK').subquery()
# The attempt that does not work: a plain Python tuple has no .in_() method,
# and `Wfleilao` differs in case from the `WFLeilao` name used everywhere else.
q = q.filter(~(WFLeilao.leilao,Wfleilao.itemleilao).in_(subquery))
In Oracle this is possible; a similar example:
-- Row-value NOT IN: a row is kept only when its (leilao, itemleilao) pair is
-- not returned by the subquery.
Select *
from table_a
where (leilao, itemleilao) not in
(Select num_leilao_superbuy, num_item_leilao_superbuy
from table_b
where empresa = 'NK')
Is it possible?
I found a solution using tuple_
q = q.filter(~tuple_(WFLeilao.leilao, WFLeilao.itemleilao).in_(subquery))
you can chain the query:
# NOTE(review): this tests each column on its own, which is not equivalent to
# the pairwise "(leilao, itemleilao) NOT IN (...)" check, and `subquery` as
# built in the question selects two columns - prefer the tuple_() form.
q = q.filter(~(WFLeilao.leilao.in_(subquery))) \
    .filter(~(WFLeilao.itemleilao.in_(subquery)))

Group by column to get array results in Postgresql

I have a table called moviegenre which looks like:
moviegenre:
- movie (FK movie.id)
- genre (FK genre.id)
I have a query (ORM generated) which returns all movie.imdb and genre.id's which have genre.id's in common with a given movie.imdb_id.
-- (imdb_id, genre_id) pairs, restricted to genres 2 and 10, for every ingested
-- movie other than 'tt0169547' that has at least one of those genres.
SELECT "movie"."imdb_id",
"moviegenre"."genre_id"
FROM "moviegenre"
INNER JOIN "movie"
ON ( "moviegenre"."movie_id" = "movie"."id" )
WHERE ( "movie"."imdb_id" IN (SELECT U0."imdb_id"
FROM "movie" U0
INNER JOIN "moviegenre" U1
ON ( U0."id" = U1."movie_id" )
WHERE ( U0."last_ingested_on" IS NOT NULL
-- The exclusion below appears twice in the generated SQL.
AND NOT ( U0."imdb_id" IN
( 'tt0169547' ) )
AND NOT ( U0."imdb_id" IN
( 'tt0169547' ) )
AND U1."genre_id" IN ( 2, 10 ) ))
AND "moviegenre"."genre_id" IN ( 2, 10 ) )
The problem is that I'll get results in the format:
[
('imdbid22', 'genreid1'),
('imdbid22', 'genreid2'),
('imdbid44', 'genreid1'),
('imdbid55', 'genreid8'),
]
Is there a way, within the query itself, to group all of the genre ids into a list under each movie.imdb_id? I'd like to do the grouping in the query.
Currently doing it in my web app code (Python) which is extremely slow when 50k+ rows are returned.
[
('imdbid22', ['genreid1', 'genreid2']),
('imdbid44', 'genreid1'),
('imdbid55', 'genreid8'),
]
thanks in advance!
edit:
here's the python code which runs against the current results
results_list = []
for item in movies_and_genres:
genres_in_common = len(set([
i['genre__id'] for i in movies_and_genres
if i['movie__imdb_id'] == item['movie__imdb_id']
]))
imdb_id = item['movie__imdb_id']
if genres_in_common >= min_in_comon:
result_item = {
'movie.imdb_id': imdb_id,
'count': genres_in_common
}
if result_item not in results_list:
results_list.append(result_item)
return results_list
-- One row per imdb_id with all of its matching genre ids collected into an array.
select m.imdb_id, array_agg(g.genre_id) as genre_id
from
moviegenre g
inner join
movie m on g.movie_id = m.id
where
m.last_ingested_on is not null
and not m.imdb_id in ('tt0169547')
and not m.imdb_id in ('tt0169547')
and g.genre_id in (2, 10)
group by m.imdb_id
array_agg will create an array of all the genre_ids of a certain imdb_id:
http://www.postgresql.org/docs/current/interactive/functions-aggregate.html#FUNCTIONS-AGGREGATE-TABLE
I hope python code will be fast enough:
movielist = [
    ('imdbid22', 'genreid1'),
    ('imdbid22', 'genreid2'),
    ('imdbid44', 'genreid1'),
    ('imdbid55', 'genreid8'),
]
# Group genre ids under their movie id. setdefault() creates the list the
# first time a movie is seen, so every value is a list and no lookup can
# raise KeyError.
genres_by_movie = {}
for imdb_id, genre_id in movielist:
    genres_by_movie.setdefault(imdb_id, []).append(genre_id)
print(genres_by_movie)
Output:
{'imdbid44': ['genreid1'], 'imdbid55': ['genreid8'], 'imdbid22': ['genreid1', 'genreid2']}
If you just need movie name, count:
Change the select list and add a GROUP BY in the original query as shown below and you will get the answer directly; you don't need the Python code:
SELECT "movie"."imdb_id", count("moviegenre"."genre_id")
group by "movie"."imdb_id"

How to store data like Freebase does?

I admit that this is basically a duplicate question of Use freebase data on local server? but I need more detailed answers than have already been given there
I've fallen absolutely in love with Freebase. What I want now is to essentially create a very simple Freebase clone for storing content that may not belong on Freebase itself but can be described using the Freebase schema. Essentially what I want is a simple and elegant way to store data like Freebase itself does and be able to easily use that data in a Python (CherryPy) web application.
Chapter 2 of the MQL reference guide states:
The database that underlies Metaweb is fundamentally different than the relational databases that you may be familiar with. Relational databases store data in the form of tables, but the Metaweb database stores data as a graph of nodes and relationships between those nodes.
Which I guess means that I should be using either a triplestore or a graph database such as Neo4j? Does anybody here have any experience with using one of those from a Python environment?
(What I've actually tried so far is to create a relational database schema which would be able to easily store Freebase topics, but I'm having issues with configuring the mappings in SQLAlchemy).
Things I'm looking into
http://gen5.info/q/2009/02/25/putting-freebase-in-a-star-schema/
http://librdf.org/
UPDATE [28/12/2011]:
I found an article on the Freebase blog that describes the proprietary tuple store / database Freebase themselves use (graphd): http://blog.freebase.com/2008/04/09/a-brief-tour-of-graphd/
This is what worked for me. It allows you to load all of a Freebase dump in a standard MySQL installation on less than 100GB of disk. The key is understanding the data layout in a dump and then transforming it (optimizing it for space and speed).
Freebase notions you should understand before you attempt to use this (all taken from the documentation):
Topic - anything of type '/common/topic', pay attention to the different types of ids you may encounter in Freebase - 'id', 'mid', 'guid', 'webid', etc.
Domain
Type - 'is a' relationship
Properties - 'has a' relationship
Schema
Namespace
Key - human readable in the '/en' namespace
Some other important Freebase specifics:
the query editor is your friend
understand the 'source', 'property', 'destination' and 'value' notions described here
everything has a mid, even things like '/', '/m', '/en', '/lang', '/m/0bnqs_5', etc.; Test using the query editor: [{'id':'/','mid':null}]​
you don't know what any entity (i.e. row) in the data dump is, you have to get to its types to do that (for instance how do I know '/m/0cwtm' is a human);
every entity has at least one type (but usually many more)
every entity has at least one id/key (but usually many more)
the ontology (i.e. metadata) is embedded in the same dump and the same format as the data (not the case with other distributions like DBPedia, etc.)
the 'destination' column in the dump is the confusing one, it may contain a mid or a key (see how the transforms below deal with this)
the domains, types, properties are namespace levels at the same time (whoever came up with this is a genius IMHO);
understand what is a Topic and what is not a Topic (absolutely crucial), for example this entity '/m/03lmb2f' of type '/film/performance' is NOT a Topic (I choose to think of these as what Blank Nodes in RDF are although this may not be philosophically accurate), while '/m/04y78wb' of type '/film/director' (among others) is;
Transforms
(see the Python code at the bottom)
TRANSFORM 1 (from shell, split links from namespaces ignoring notable_for and non /lang/en text):
python parse.py freebase.tsv #end up with freebase_links.tsv and freebase_ns.tsv
TRANSFORM 2 (from Python console, split freebase_ns.tsv on freebase_ns_types.tsv, freebase_ns_props.tsv plus 15 others which we ignore for now)
import e
e.split_external_keys( 'freebase_ns.tsv' )
TRANSFORM 3 (from Python console, convert property and destination to mids)
import e
ns = e.get_namespaced_data( 'freebase_ns_types.tsv' )
e.replace_property_and_destination_with_mid( 'freebase_links.tsv', ns ) #produces freebase_links_pdmids.tsv
e.replace_property_with_mid( 'freebase_ns_props.tsv', ns ) #produces freebase_ns_props_pmids.tsv
TRANSFORM 4 (from MySQL console, load freebase_links_pdmids.tsv, freebase_ns_props_pmids.tsv and the types file in DB):
-- Three tables with the same four columns (source, property, destination,
-- value); only the column widths differ between them.
CREATE TABLE links(
source VARCHAR(20),
property VARCHAR(20),
destination VARCHAR(20),
value VARCHAR(1)
) ENGINE=MyISAM CHARACTER SET utf8;
CREATE TABLE ns(
source VARCHAR(20),
property VARCHAR(20),
destination VARCHAR(40),
value VARCHAR(255)
) ENGINE=MyISAM CHARACTER SET utf8;
CREATE TABLE types(
source VARCHAR(20),
property VARCHAR(40),
destination VARCHAR(40),
value VARCHAR(40)
) ENGINE=MyISAM CHARACTER SET utf8;
-- Bulk-load the tab-separated files, one per table.
LOAD DATA LOCAL INFILE "/data/freebase_links_pdmids.tsv" INTO TABLE links FIELDS TERMINATED BY '\t' LINES TERMINATED BY '\n';
LOAD DATA LOCAL INFILE "/data/freebase_ns_props_pmids.tsv" INTO TABLE ns FIELDS TERMINATED BY '\t' LINES TERMINATED BY '\n';
LOAD DATA LOCAL INFILE "/data/freebase_ns_base_plus_types.tsv" INTO TABLE types FIELDS TERMINATED BY '\t' LINES TERMINATED BY '\n';
-- Indexes are created after loading.
CREATE INDEX links_source ON links (source) USING BTREE;
CREATE INDEX ns_source ON ns (source) USING BTREE;
CREATE INDEX ns_value ON ns (value) USING BTREE;
CREATE INDEX types_source ON types (source) USING BTREE;
CREATE INDEX types_destination_value ON types (destination, value) USING BTREE;
Code
Save this as e.py:
import sys
#returns a dict to be used by mid(...), replace_property_and_destination_with_mid(...) bellow
def get_namespaced_data( file_name ):
    """Build a {(column 3, column 4): column 1} lookup from a tab-separated file.

    Lines with fewer than four columns are reported with 'Skip...' and
    ignored. When the same (column 3, column 4) pair occurs more than once
    the last occurrence wins.
    """
    result = {}
    with open( file_name ) as f:  # closed even if a line fails to parse
        for line in f:
            # Strip only the line terminator; slicing off the last character
            # would eat real data when the final line has no newline.
            elements = line.rstrip('\n').split('\t')
            if len( elements ) < 4:
                print('Skip...')
                continue
            result[(elements[2], elements[3])] = elements[0]
    return result
#runs out of memory
def load_links( file_name ):
    """Load a links file into {source: {property: [destination, ...]}}.

    The first three tab-separated columns of every line are used. The whole
    structure is kept in memory. The current number of distinct sources is
    printed whenever it is a multiple of one million (including zero).
    """
    result = {}
    with open( file_name ) as f:  # closed even if a line fails to parse
        for line in f:
            if len( result ) % 1000000 == 0:
                print(len( result ))
            # Strip only the newline so an unterminated last line keeps its
            # final character.
            elements = line.rstrip('\n').split('\t')
            src, prop, dest = elements[0], elements[1], elements[2]
            result.setdefault( src, {} ).setdefault( prop, [] ).append( dest )
    return result
#same as load_links but for the namespaced data
def load_ns( file_name ):
    """Load a namespace file into {source: {property: [value, ...]}}.

    Columns one, two and four of every tab-separated line are used (the
    third column is ignored). The current number of distinct sources is
    printed whenever it is a multiple of one million (including zero).
    """
    result = {}
    with open( file_name ) as f:  # closed even if a line fails to parse
        for line in f:
            if len( result ) % 1000000 == 0:
                print(len( result ))
            # Strip only the newline so an unterminated last line keeps the
            # final character of its value column.
            elements = line.rstrip('\n').split('\t')
            src, prop, value = elements[0], elements[1], elements[3]
            result.setdefault( src, {} ).setdefault( prop, [] ).append( value )
    return result
def links_in_set( file_name ):
    """Return the set of distinct first-column values of a tab-separated file."""
    result = set()
    with open( file_name ) as f:  # closed even if reading fails
        for line in f:
            # Strip only the newline; an unterminated last line must not lose
            # its final character.
            result.add( line.rstrip('\n').split('\t')[0] )
    return result
def mid( key, ns ):
    """Resolve a key such as '/type/object' to its mid using the `ns` lookup.

    `ns` maps (namespace, last path segment) pairs to mids. Returns False for
    an empty key, for a key without any '/' (which is also printed) and for a
    key missing from `ns`. A key whose second path segment is 'm' is returned
    unchanged. '/' is looked up as '/boot/root_namespace'.
    """
    if key == '':
        return False
    if key == '/':
        key = '/boot/root_namespace'
    if '/' not in key: #cover the case of something which doesn't start with '/'
        print(key)
        return False
    if key.split('/')[1] == 'm': #already a mid
        return key
    # Everything before the last '/' is the namespace, the rest is the key.
    namespace, _, leaf = key.rpartition('/')
    return ns.get( (namespace, leaf), False )
def replace_property_and_destination_with_mid( file_name, ns ):
    """Rewrite columns two and three of a TSV file as mids.

    Lines where both the property and the destination resolve through mid()
    are written to '<name>_pdmids.tsv'; all other lines are dropped and the
    number of dropped lines is printed at the end.
    """
    # Everything before the first '.' of the given name is the output stem.
    fn = file_name.split('.')[0]
    counter = 0
    # Both handles are closed on exit, so the output is flushed to disk.
    with open( file_name ) as f, open(fn+'_pdmids'+'.tsv', 'w') as f_out_mids:
        for line in f:
            elements = line[:-1].split('\t')
            md = mid( elements[1], ns )
            dest = mid( elements[2], ns )
            if md and dest:
                elements[1] = md
                elements[2] = dest
                f_out_mids.write( '\t'.join(elements)+'\n' )
            else:
                counter += 1
    print('Skipped: ' + str( counter ))
def replace_property_with_mid( file_name, ns ):
    """Rewrite column two of a TSV file as a mid.

    Lines whose property resolves through mid() are written to
    '<name>_pmids.tsv'; lines that do not resolve are silently dropped.
    """
    # Everything before the first '.' of the given name is the output stem.
    fn = file_name.split('.')[0]
    # Both handles are closed on exit, so the output is flushed to disk.
    with open( file_name ) as f, open(fn+'_pmids'+'.tsv', 'w') as f_out_mids:
        for line in f:
            elements = line[:-1].split('\t')
            md = mid( elements[1], ns )
            if md:
                elements[1] = md
                f_out_mids.write( '\t'.join(elements)+'\n' )
#cPickle
#ns=e.get_namespaced_data('freebase_2.tsv')
#import cPickle
#cPickle.dump( ns, open('ttt.dump','wb'), protocol=2 )
#ns=cPickle.load( open('ttt.dump','rb') )
#fn='/m/0'
#n=fn.split('/')[2]
#dir = n[:-1]
def is_mid( value ):
    """Return True when the second '/'-separated segment of `value` is 'm'."""
    segments = value.split('/')
    # A single segment means there is no '/' at all, so it cannot be a mid.
    return len(segments) > 1 and segments[1] == 'm'
def check_if_property_or_destination_are_mid( file_name ):
    """Print every line of a TSV file whose second column is already a mid.

    Despite the name only the property column is checked; the destination
    check is disabled.
    """
    with open( file_name ) as f:  # closed even if a line fails to parse
        for line in f:
            elements = line[:-1].split('\t')
            #if is_mid( elements[1] ) or is_mid( elements[2] ):
            if is_mid( elements[1] ):
                print(line)
#
# Output-file suffixes created by split_external_keys(), one file per suffix.
_SPLIT_SUFFIXES = (
    'extkeys', 'intkeys', 'props', 'types', 'm', 'src', 'usr', 'base', 'blg',
    'bus', 'soft', 'uri', 'quot', 'frb', 'tag', 'guid', 'dtwrld',
)
# First destination segment -> suffix, whatever the depth of the path.
_SPLIT_ANY_DEPTH = {'m': 'm', 'en': 'intkeys', 'user': 'usr', 'uri': 'uri'}
# First destination segment -> suffix, only when more segments follow it.
_SPLIT_DEEP_ONLY = {
    'wikipedia': 'extkeys', 'authority': 'extkeys', 'source': 'src',
    'biology': 'blg', 'business': 'bus', 'soft': 'soft',
    'quotationsbook': 'quot', 'freebase': 'frb', 'tag': 'tag',
    'guid': 'guid', 'dataworld': 'dtwrld',
}

def _external_key_bucket( elements ):
    """Return the output suffix for one split line (property, destination)."""
    destination = elements[2]
    parts = destination.split('/')
    is_key = elements[1] == '/type/object/key'
    if len(parts) == 1: #the blank destination elements - '', plus the root domain ones
        return 'types' if is_key else 'props'
    if destination == '/lang/en':
        return 'props'
    head = parts[1]
    if head in _SPLIT_ANY_DEPTH:
        return _SPLIT_ANY_DEPTH[head]
    if len( parts ) > 2:
        if head == 'base':
            return 'types' if is_key else 'base'
        if head in _SPLIT_DEEP_ONLY:
            return _SPLIT_DEEP_ONLY[head]
    # Anything unrecognised is treated as type data.
    return 'types'

def split_external_keys( file_name ):
    """Split a namespace TSV file into one file per destination namespace.

    Each input line is copied unchanged into exactly one
    '<name>_<suffix>.tsv' file chosen from its second (property) and third
    (destination) columns. All output files are created even when they end
    up empty.
    """
    # Everything before the first '.' of the given name is the output stem.
    fn = file_name.split('.')[0]
    outputs = {}
    try:
        for suffix in _SPLIT_SUFFIXES:
            outputs[suffix] = open(fn + '_' + suffix + '.tsv', 'w')
        with open( file_name ) as f:
            for line in f:
                elements = line.rstrip('\n').split('\t')
                outputs[ _external_key_bucket( elements ) ].write( line )
    finally:
        # Close (and thereby flush) every file that was opened, even on error.
        for out in outputs.values():
            out.close()
Save this as parse.py:
import sys
def parse_freebase_quadruple_tsv_file( file_name ):
    """Split a four-column Freebase dump into link and namespace files.

    Lines with a destination but an empty value go to '<name>_links.tsv'.
    Other lines go to '<name>_ns.tsv' unless their destination is a '/lang/'
    entry other than '/lang/en'. Lines with fewer than four columns are
    reported with 'Skip...', and '/notable_for' properties are ignored.
    """
    # Everything before the first '.' of the given name is the output stem.
    fn = file_name.split('.')[0]
    # All three handles are closed on exit, so the outputs are flushed.
    with open( file_name ) as f, \
            open(fn+'_links'+'.tsv', 'w') as f_out_links, \
            open(fn+'_ns' +'.tsv', 'w') as f_out_ns:
        for line in f:
            # Strip only the newline: slicing off the last character would
            # empty a one-character value on an unterminated final line and
            # misroute it as a link.
            elements = line.rstrip('\n').split('\t')
            if len( elements ) < 4:
                print('Skip...')
                continue
            #cases described here http://wiki.freebase.com/wiki/Data_dumps
            if elements[1].endswith('/notable_for'): #ignore notable_for, it has JSON in it
                continue
            elif elements[2] and not elements[3]: #case 1, linked
                f_out_links.write( line )
            elif not (elements[2].startswith('/lang/') and elements[2] != '/lang/en'): #ignore languages other than English
                f_out_ns.write( line )
# Only run when executed as a script, so importing this module has no side
# effects.
if __name__ == '__main__':
    if not sys.argv[1:]:
        print('Pass a list of .tsv filenames')
    for file_name in sys.argv[1:]:
        parse_freebase_quadruple_tsv_file( file_name )
Notes:
Depending on the machine the index creation may take anywhere from a few to 12+ hours (consider the amount of data you are dealing with though).
To be able to traverse the data in both directions you need an index on links.destination as well which I found to be expensive timewise and never finished.
Many other optimizations are possible here. For example the 'types' table is small enough to be loaded in memory in a Python dict (see e.get_namespaced_data( 'freebase_ns_types.tsv' ))
And the standard disclaimer here. It has been a few months since I did this. I believe it is mostly correct but I do apologize if my notes missed something. Unfortunately the project I needed it for fell through the cracks but hope this helps someone else. If something isn't clear drop a comment here.
My 2 cents...
I use a little bit of Java code to convert the Freebase data dump into RDF: https://github.com/castagna/freebase2rdf
I use Apache Jena's TDB store to load the RDF data and Fuseki to serve the data via SPARQL protocol over HTTP.
See also:
http://markmail.org/thread/mq6ylzdes6n7sc5o
http://markmail.org/thread/jegtn6vn7kb62zof
SPARQL is the query language to query RDF, it allows to write SQL-alike queries. Most RDF databases implement SPARQL interfaces. Moreover, Freebase allows you to export data in RDF so you could potentially use that data directly in an RDF database and query it with SPARQL.
I would have a look at this tutorial to get a better sense of SPARQL.
If you are going to handle a big dataset, like freebase, I would use 4store together with any of the Python clients. 4store exposes SPARQL via HTTP, you can make HTTP requests to assert, remove and query data. It also handles resultsets in JSON, and this is really handy with Python. I have used this infrastructure in several projects, not with CherryPy but with Django, but I guess that this difference doesn't really matter.
Good news for Freebase dump users: Freebase now offers an RDF dump: http://wiki.freebase.com/wiki/Data_dumps . It is in Turtle format, so it is very convenient to use with any graph database designed for RDF.
My suggestion is also 4store: http://4store.org/ . it is simple and easy to use.
You could use http request to do the SPARQL operation.
One tricky thing in my project is that the "." used in the Freebase dump (to represent a shortened URL) is not recognized by 4store. So I add brackets "<>" to all the columns that contain "." and deal with the shortened URL myself.
Have a look at https://cayley.io. I believe it is written by the same author and uses same principles as graphd, the backend of Freebase, before Google killed it.
Regarding the data, you probably will want to run something like this to cleanup the Freebase DB dumps or use datahub.
And this is the extra code for my other answer. The meat is in edb.py. Run from Python console and follow the examples. Or use the web2py controller and run in your browser.
Save this as edb.py:
import MySQLdb
import sys
# Single module-wide connection used by fetch_one()/fetch_all(); it is opened
# as soon as this module is imported.
connection = MySQLdb.connect (host = "localhost",
user = "root",
passwd = "x",
db = "y")
# Module-level cursor; fetch_one()/fetch_all() open their own cursors instead.
cursor = connection.cursor()
# Running count of statements executed through fetch_one()/fetch_all().
query_counter = 0
# When true, fetch_one()/fetch_all() print each SQL string before executing it.
print_queries = False
# Appended as "LIMIT <limit>" to every query sent through fetch_one()/fetch_all().
limit = 1000
def fetch_one( query ):
    """Run `query` and return the first column of its first row, or None.

    ' LIMIT <limit>' is appended to the query, the query is printed when
    `print_queries` is set, and `query_counter` is incremented after a
    successful execute.
    """
    global query_counter
    query = query + ' LIMIT ' + str(limit)
    if print_queries:
        print(query)
    cursor = connection.cursor()
    try:
        cursor.execute( query )
        query_counter += 1
        row = cursor.fetchone()
    finally:
        cursor.close()  # release the cursor even when execute() raises
    if row:
        return row[0]
    return None
def fetch_all( query ):
    """Run `query` and return all of its rows.

    ' LIMIT <limit>' is appended to the query, the query is printed when
    `print_queries` is set, and `query_counter` is incremented after a
    successful execute.
    """
    global query_counter
    query = query + ' LIMIT ' + str(limit)
    if print_queries:
        print(query)
    cursor = connection.cursor()
    try:
        cursor.execute( query )
        query_counter += 1
        return cursor.fetchall()
    finally:
        cursor.close()  # release the cursor even when execute() raises
def _flatten( list_of_lists ):
import itertools
return list(itertools.chain(*list_of_lists))
#Example: e._search_by_name('steve martin')
def _search_by_name( name, operator = '=' ):
    """Find topics whose name or alias matches `name`.

    `operator` is the SQL comparison placed between ns.value and the quoted
    name; for anything other than '=' a trailing '%' is appended to the name.
    Returns (typed, ranked): `typed` maps each matching source to its list of
    types, `ranked` lists the same sources ordered by number of types,
    highest first. Both are empty when `name` is empty or only whitespace.
    """
    typed, ranked = {}, []
    if name:
        name = name.strip()
    if not name:
        return ( typed, ranked )
    filler = '' if operator == '=' else '%'
    #to filter meaningful stuff for every mid returned order by the number of types they have
    #search for value text if prop. is
    #select * from ns where value = 'the king' and (property = '/m/01gr' or property = '/m/06b');
    name_mid = _mid( '/type/object/name' )
    alias_mid = _mid( '/common/topic/alias' )
    # NOTE(review): `name` and `operator` are interpolated into the SQL text
    # unescaped; escape or parameterize them before exposing this to
    # untrusted input.
    query = "select ns.source from ns where ns.value %s '%s%s' and ns.property in ('%s', '%s')" % ( operator, name, filler, name_mid, alias_mid )
    for row in fetch_all( query ):
        typed[ row[0] ] = _types( row[0] )
    # sorted() is stable, so sources with the same number of types keep the
    # iteration order of `typed`.
    ranked = sorted( typed, key = lambda source: len( typed[source] ), reverse = True )
    return (typed, ranked)
#Example: e._children('') <---will get the top level domains
# e._children('/film') <---get all types from the domain
# e._children('/film/film') <---get all properties for the type
def _children( parent, expand = False, raw = False ):
    """List the entries of the types table whose destination is `parent`.

    With `raw` set the sources of the matching rows are returned. Otherwise
    each row's value is returned prefixed with '/', and additionally with
    `parent` when `expand` is set.
    """
    query = "select t.source, t.value from types t where t.destination = '%s'" % (parent)
    res = fetch_all( query )
    if raw:
        return [ row[0] for row in res ]
    prefix = parent if expand else ''
    # Reuse the rows already fetched instead of running the query again.
    return [ prefix + '/' + row[1] for row in res ]
#Example: e._parent('/film/film/songs')
def _parent( child ): # '/people/marriage/to' -> '/people/marriage'
#if not isinstance( child, str ): return None # what kind of safety mechanisms do we need here?
return '/'.join(child.split('/')[:-1])
#Example: e._domains()
def _domains():
    """Return the children of the empty parent (the top-level domains)."""
    top_level = _children('')
    return top_level
#Example: e._top_level_types()
def _top_level_types():
    """Return the children of the '/type' domain."""
    type_children = _children('/type')
    return type_children
#TODO get all primitive types
#Example: e._mid('/type/object')
# e._mid('/authority/imdb/name/nm0000188')
def _mid( key ):
if key == '':
return None
elif key == '/':
key = '/boot/root_namespace'
parts = key.split('/')
if parts[1] == 'm': #already a mid
return key
namespace = '/'.join(parts[:-1])
key = parts[-1]
return fetch_one( "select source from types t where t.destination = '%s' and t.value = '%s'" % (namespace, key) )
#Example: e._key('/type')
def _key( mid ):
if isinstance( mid, str):
res = _keys( mid )
if not res:
return None
rt = [ r for r in res if r.startswith( '/type' ) ]
if rt:
return rt[0]
else:
return res[0]
elif isinstance( mid, list ) or isinstance( mid, tuple ):
res = [ _key( e ) for e in mid ]
return [ r for r in res if r is not None ]
else:
return None
def _keys( mid ):
    """Return every key of ``mid`` as 'destination/value' strings.

    The rows come from the types table where ``source`` equals ``mid``.
    """
    # check for '/type/object/key' as well?
    query = "select t.destination, t.value from types t where t.source = '%s'" % mid
    keys = []
    for row in fetch_all( query ):
        keys.append( row[0] + '/' + row[1] )
    return keys
#Example: e._types('/m/0p_47')
def _types( mid ):
    """Return the destinations of the '/type/object/type' links leaving ``mid``."""
    type_prop = _mid( '/type/object/type' )
    query = "select l.destination from links l where l.source = '%s' and l.property = '%s'" % (mid, type_prop)
    destinations = []
    for row in fetch_all( query ):
        destinations.append( row[0] )
    return destinations
#Example: e._props_n('/m/0p_47') <---Named immediate properties (like name, etc.)
def _props_n( mid ): #the same property can be set more than once per topic!
    """Return the distinct ``property`` values of the ns rows whose source is ``mid``.

    Duplicates are removed through a set, so the order of the returned
    list is not meaningful.
    """
    query = "select ns.property from ns where ns.source = '%s'" % (mid)
    rows = fetch_all( query )
    distinct = set( row[0] for row in rows )
    return list( distinct )
#Example: e._props_l('/m/0p_47') <---All remote properties, some are named, some are anonymous
def _props_l( mid ): #the same property can be set more than once per topic!
    """Group the links leaving ``mid`` by property.

    Returns a dict mapping each link property to the list of its
    destinations. Links whose property is '/type/object/type' are left out.
    """
    type_prop = _mid( '/type/object/type' ) #exclude types, they have tons of instance links
    query = "select l.property, l.destination from links l where l.source = '%s' and property <> '%s'" % (mid, type_prop)
    grouped = {}
    for row in fetch_all( query ):
        grouped.setdefault( row[0], [] ).append( row[1] )
    return grouped
#Example: e._props_ln('/m/0p_47') <---All remote named properties
def _props_ln( mid ): #named properties
    """Return the link properties of ``mid`` that lead to a common topic.

    Only the first destination of each property is inspected: the property
    is kept when that destination has the '/common/topic' type.
    """
    linked = _props_l( mid )
    common_topic = _mid( '/common/topic' )
    return [ prop for prop in linked
             if common_topic in _types( linked[prop][0] ) ]
#Example: e._props_la('/m/0p_47') <---All remote anonymous properties, these actually belong to the children!
#instead of has type /common/topic we used to check if it has name
def _props_la( mid, raw = True ): #anonymous properties (blank nodes in RDF?)
    """Collect the children of the types reached through anonymous links of ``mid``.

    A link property is treated as anonymous when its first destination does
    not have the '/common/topic' type. For each such property the keys of
    that destination's types are resolved and the children of the first key
    are collected, unless '/type/type' is among the keys.

    mid -- the topic whose links are inspected.
    raw -- passed through to _children().

    Returns the collected lists after passing them through _flatten().
    """
    result = []
    ps = _props_l( mid )
    common_topic = _mid( '/common/topic' )
    for p in ps:
        ts = _types( ps[p][0] )
        if common_topic in ts: #it is a common topic, nothing to do
            continue
        # Reuse the types fetched above rather than querying them a second time.
        type_keys = _key( ts )
        if type_keys and '/type/type' not in type_keys: #FIXME: hack not to go into types, could be done better
            result.append( _children( type_keys[0], expand=True, raw=raw ) ) #get the first, is this correct?
    return _flatten( result ) #it is a list of lists
#FIXME: try to get '/film/actor/film' -> '/type/property/expected_type' -> '/film/performance' -> properties/children
#instead of checking whether something has a name
#Example: e._get_n('/m/0p_47', e._props_n('/m/0p_47')[0])['/lang/en'] <---These come with a namespace
def _get_n( mid, prop ): #the same property can be set more than once per topic!
    """Return every ns ``value`` stored for ``mid`` under the property ``prop``.

    ``prop`` is resolved with _mid() before it is used in the query.
    """
    prop_mid = _mid( prop )
    query = "select ns.value from ns where ns.source = '%s' and ns.property = '%s'" % (mid, prop_mid)
    values = []
    for row in fetch_all( query ):
        values.append( row[0] )
    return values
#Example: e._get_l('/m/0p_47', e._props_l('/m/0p_47')[0]) <---returns a list of mids corresponding to that prop.
# e._name(e._get_l('/m/0p_47', '/film/writer/film'))
def _get_l( mid, prop ): #the same property can be set more than once per topic!
    """Return the destinations of the links leaving ``mid`` with property ``prop``.

    ``prop`` is resolved with _mid() before it is used in the query.
    """
    prop_mid = _mid( prop )
    query = "select l.destination from links l where l.source = '%s' and l.property = '%s'" % (mid, prop_mid)
    destinations = []
    for row in fetch_all( query ):
        destinations.append( row[0] )
    return destinations
#Example: e._name(e._get_ln('/m/0p_47', e._props_ln('/m/0p_47')[0]))
def _get_ln( mid, p ): #just alias for _get_l, keeping for consistency
    """Return _get_l( mid, p ); exists only for naming symmetry with _get_la."""
    destinations = _get_l( mid, p )
    return destinations
#Example: e._name(e._get_la('/m/0p_47', '/film/performance/film'))
def _get_la( mid, prop ):
    """Follow ``prop`` from the intermediate nodes linked to ``mid``.

    For every link property of ``mid``, the first destination's types are
    compared with the parent key of ``prop`` (for '/film/performance/film'
    that is '/film/performance'). When they match, ``prop`` is followed
    from each destination of that property.

    Returns the gathered destination lists after passing them through
    _flatten().
    """
    result = []
    # The type expected for the intermediate nodes depends only on ``prop``.
    # Resolve it at most once, and only when a property with destinations
    # is actually found, instead of on every loop iteration.
    expected_type = None
    expected_resolved = False
    for p in _props_l( mid ):
        es = _get_l( mid, p ) #get the destinations
        if not es:
            continue
        ts = set( _types( es[0] ) )
        if not expected_resolved:
            expected_type = _mid( _parent( _key( _mid( prop ) ) ) )
            expected_resolved = True
        if expected_type in ts:
            for e in es:
                result.append( _get_l( e, prop ) )
    return _flatten( result )
#How do we determine properties with multiple values vs those with singular (i.e. place of birth)?
#is this in the ontology?
#Ans: yes, /type/property/unique
#Example: e._all_names_ln('/m/0p_47') <---gets all of object's remote named properties
def _all_names_ln( mid ):
    """Map the key of every remote named property of ``mid`` to the names of its destinations."""
    names_by_key = {}
    for prop in _props_ln( mid ):
        names = _name( _get_ln( mid, prop ) )
        names_by_key[ _key( prop ) ] = names
    return names_by_key
#Example: e._all_names_la('/m/0p_47') <---gets all of object's remote anonymous properties
def _all_names_la( mid ): #TODO: prevent loops, run e.all_names_la('/m/0p_47')
    """Map the key of every remote anonymous property of ``mid`` to the names reached through it."""
    names_by_key = {}
    for prop in _props_la( mid ):
        names = _name( _get_la( mid, prop ) )
        names_by_key[ _key( prop ) ] = names
    return names_by_key
#FIXME: _all_names_la is going into destinations which are types and have a ton of instance links...
#Example: e._name('/m/0p_47') <---the name of a topic
#
def _name( mid ):
if isinstance( mid, str ):
nm = _mid( '/type/object/name' )
return _get_n( mid, nm )
elif isinstance( mid, list ) or isinstance( mid, tuple ) or isinstance( mid, set ):
return [ _name( e ) for e in mid ]
else:
return None
#for internal use only
def _get_linked( mid ):
    """Return the set of destinations linked from ``mid``, ignoring '/type/object/type' links."""
    type_prop = _mid( '/type/object/type' ) #exclude types, they have tons of instance links
    query = "select destination from links where source = '%s' and property <> '%s' " % ( mid, type_prop )
    rows = fetch_all( query )
    return set( row[0] for row in rows )
#for internal use only
def _get_connections_internal( entity1, target, path, all_paths, depth, max_depth):
import copy
if depth > max_depth:
return
if True:
print
print str(entity1) + ', ' + str(target)
print str( path )
print str( all_paths )
print depth
path.append( entity1 )
linked1 = _get_linked( entity1 )
if target in linked1 or entity1 == target:
path.append( target )
all_paths.append( path )
#print str( path )
return
for l1 in linked1:
if l1 in path:
continue
_get_connections_internal( l1,
target,
copy.copy( path ),
all_paths,
depth+1,
max_depth )
#Example: e._name(e._get_connections('/m/0p_47', '/m/0cwtm')) <---find path in the graph between the two entities
def _get_connections( entity1, target ):
    """Return the paths found from ``entity1`` to ``target``.

    The search starts at depth 0 with a maximum depth of 2.
    """
    found_paths = []
    start_path = []
    _get_connections_internal( entity1, target, start_path, found_paths, 0, 2 )
    return found_paths
#for internal use only
def _get_connections_internal2( entity1, entity2, path1, path2, all_paths, depth, max_depth, level ):
import copy
if depth > max_depth:
return
if level < 0: level = 0
path1.append( entity1 )
path2.append( entity2 )
if entity1 == entity2 and level == 0:
all_paths.append( ( path1, path2 ) ) #no need to append entity1 or entity2 to the paths
return
linked1 = _get_linked( entity1 )
if entity2 in linked1 and entity2 not in path1 and level == 0:
path1.append( entity2 )
all_paths.append( ( path1, path2 ) )
return
linked2 = _get_linked( entity2 )
if entity1 in linked2 and entity1 not in path2 and level == 0:
path2.append( entity1 )
all_paths.append( ( path1, path2 ) )
return
inters = linked1.intersection( linked2 )
inters = inters.difference( set( path1 ) )
inters = inters.difference( set( path2 ) )
if inters and level == 0:
for e in inters: #these are many paths, have to clone
p1 = copy.copy( path1 )
p1.append( e )
p2 = copy.copy( path2 )
p2.append( e )
all_paths.append( ( p1,p2 ) )
return
for l1 in linked1:
if l1 in path1 or l1 in path2:
continue
for l2 in linked2:
if l2 in path1 or l2 in path2:
continue
_get_connections_internal2( l1, l2,
copy.copy( path1 ), copy.copy( path2 ),
all_paths,
depth+1,
max_depth,
level - 1 )
#Example: e._name(e._get_connections2('/m/0p_47', '/m/0cwtm')) <---returns two meeting paths starting from both entities
# e._name(e._get_connections2('/m/0p_47', '/m/0cwtm', level=1)) <---search deeper
# e._name(e._get_connections2('/m/0p_47', '/m/0cwtm', level=2)) <---even deeper
def _get_connections2( entity1, entity2, level = 0 ):
    """Return pairs of paths, one starting from each entity, that meet.

    The search starts at depth 0 with a maximum depth of 15; ``level`` is
    handed to the internal search unchanged.
    """
    found_pairs = []
    start1, start2 = [], []
    _get_connections_internal2( entity1, entity2, start1, start2, found_pairs, 0, 15, level )
    return found_pairs
And here is a sample web2py controller (just copy edb.py into the web2py models directory):
# -*- coding: utf-8 -*-
def mid_to_url( mid ):
    """Return the third '/'-separated piece of ``mid`` ('/m/0p_47' -> '0p_47')."""
    pieces = mid.split( '/' )
    return pieces[2]
def index():
    """Render the search form and a table of the topics matching ``term``.

    Each result row links the topic name to the 'result' action and lists
    the keys of the topic's types, separated by '<br/>'.
    """
    term = request.vars.term
    form = FORM(
        TABLE(
            TR( INPUT( _name='term', _value=term ) ),
            TR( INPUT( _type='submit', _value='Search' ) ) ),
        _method='get' )
    typed, ranked = _search_by_name( term )
    rows = []
    for mid in ranked:
        resolved = [ _key( t ) for t in typed[mid] ]
        keys = [ k for k in resolved if k ] #drop types without a key
        link = A( _name( mid ), _href = URL( 'result', args = [ mid_to_url( mid ) ] ) )
        type_cell = TD( XML( '<br/>'.join( keys ) ) )
        rows.append( TR( TD( link ), type_cell ) )
    result = TABLE( *rows )
    return { 'form': form, 'result': result }
def result():
    """Render the breadcrumb path and the data for the node addressed by request.args.

    Arguments at even positions are shown by name (topics), those at odd
    positions by key (properties). The last argument decides what is
    rendered: the objects of a property, or the properties of a topic.
    """
    args = request.args
    if not args:
        return { 'path': '', 'data': '' }

    # Breadcrumb: one link cell plus one arrow cell per argument.
    crumbs = []
    for pos, arg in enumerate( args ):
        if pos % 2:
            arrow_url = URL( 'static', 'images/blue_arr.png' )
            display_name = _key( '/m/' + arg ) #it's a property
        else:
            arrow_url = URL( 'static', 'images/red_arr.png' )
            display_name = _name( '/m/' + arg ) #it's a topic
        crumbs.append( TD( A( display_name, _href=URL( args = args[0:pos+1] ) ) ) )
        crumbs.append( TD( IMG( _src = arrow_url ) ) )
    path = TABLE( *crumbs )

    def link_table( props ):
        # One row per property, linking one step deeper into the graph.
        return TABLE( *[ TR( TD( A( _key( p ),
                                    _href = URL( args = args + [ mid_to_url( p ) ] ) ) ) )
                         for p in props ] )

    elems = [ '/m/' + a for a in args ]
    current = elems[-1]
    if _mid( '/type/property' ) in _types( current ): #we are rendering a property
        owner = elems[-2]
        objects = _get_ln( owner, current )
        if not objects: #there should be a better way to see if this is anonymous
            objects = _get_la( owner, current )
        object_rows = [ TR( TD( A( _name( o ), _href = URL( args = args + [ mid_to_url( o ) ] ) ) ) )
                        for o in objects ]
        data = TABLE( *object_rows )
    else: #we are rendering a topic
        direct_props = TABLE( *[ TR( TD( _key( p ) ), TD( ', '.join( _get_n( current, p ) ) ) )
                                 for p in _props_n( current ) ] )
        linked_named_props = link_table( _props_ln( current ) )
        linked_anon_props = link_table( _props_la( current ) )
        header = TR( TH( 'Linked named data:' ), TH( 'Linked anonymous data:' ), TH( 'Direct data:' ) )
        body = TR( TD( linked_named_props ), TD( linked_anon_props ), TD( direct_props ) )
        data = TABLE( header, body )
    return { 'path': path, 'data': data }

Categories