I have a table called moviegenre which looks like:
moviegenre:
- movie (FK movie.id)
- genre (FK genre.id)
I have an ORM-generated query which returns all movie.imdb_id and genre.id pairs that have genre.ids in common with a given movie.imdb_id.
SELECT "movie"."imdb_id",
"moviegenre"."genre_id"
FROM "moviegenre"
INNER JOIN "movie"
ON ( "moviegenre"."movie_id" = "movie"."id" )
WHERE ( "movie"."imdb_id" IN (SELECT U0."imdb_id"
FROM "movie" U0
INNER JOIN "moviegenre" U1
ON ( U0."id" = U1."movie_id" )
WHERE ( U0."last_ingested_on" IS NOT NULL
AND NOT ( U0."imdb_id" IN
( 'tt0169547' ) )
AND NOT ( U0."imdb_id" IN
( 'tt0169547' ) )
AND U1."genre_id" IN ( 2, 10 ) ))
AND "moviegenre"."genre_id" IN ( 2, 10 ) )
The problem is that I'll get results in the format:
[
('imdbid22', 'genreid1'),
('imdbid22', 'genreid2'),
('imdbid44', 'genreid1'),
('imdbid55', 'genreid8'),
]
Is there a way, within the query itself, to group all of the genre ids into a list under each movie.imdb_id? I'd like to do the grouping in the query.
Currently I'm doing it in my web app code (Python), which is extremely slow when 50k+ rows are returned.
[
('imdbid22', ['genreid1', 'genreid2']),
('imdbid44', 'genreid1'),
('imdbid55', 'genreid8'),
]
Thanks in advance!
Edit:
Here's the Python code which runs against the current results:
results_list = []
for item in movies_and_genres:
    genres_in_common = len(set([
        i['genre__id'] for i in movies_and_genres
        if i['movie__imdb_id'] == item['movie__imdb_id']
    ]))
    imdb_id = item['movie__imdb_id']
    if genres_in_common >= min_in_comon:
        result_item = {
            'movie.imdb_id': imdb_id,
            'count': genres_in_common
        }
        if result_item not in results_list:
            results_list.append(result_item)
return results_list
select m.imdb_id, array_agg(g.genre_id) as genre_id
from
moviegenre g
inner join
movie m on g.movie_id = m.id
where
m.last_ingested_on is not null
and not m.imdb_id in ('tt0169547')
and not m.imdb_id in ('tt0169547')
and g.genre_id in (2, 10)
group by m.imdb_id
array_agg will create an array of all the genre_ids of a certain imdb_id:
http://www.postgresql.org/docs/current/interactive/functions-aggregate.html#FUNCTIONS-AGGREGATE-TABLE
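Since the query is ORM-generated (it looks like Django), the same grouping can also be expressed in the ORM with ArrayAgg. This is only a sketch; the Movie model and the moviegenre reverse-relation name below are guesses at the schema:

# Hypothetical models assumed: a Movie model with imdb_id and last_ingested_on
# fields, and a reverse relation named "moviegenre" to the join table.
from django.contrib.postgres.aggregates import ArrayAgg

rows = (
    Movie.objects
    .filter(last_ingested_on__isnull=False,
            moviegenre__genre_id__in=[2, 10])
    .exclude(imdb_id='tt0169547')
    .values('imdb_id')
    .annotate(genre_ids=ArrayAgg('moviegenre__genre_id'))
)
# -> [{'imdb_id': 'imdbid22', 'genre_ids': [2, 10]}, ...]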
I hope this Python code will be fast enough:
movielist = [
    ('imdbid22', 'genreid1'),
    ('imdbid22', 'genreid2'),
    ('imdbid44', 'genreid1'),
    ('imdbid55', 'genreid8'),
]
genres_by_movie = {}
for imdb_id, genre_id in movielist:
    if imdb_id not in genres_by_movie:
        genres_by_movie[imdb_id] = [genre_id]        # start a list for a new movie
    else:
        genres_by_movie[imdb_id].append(genre_id)    # append in place; it returns None
print(genres_by_movie)
Output:
{'imdbid22': ['genreid1', 'genreid2'], 'imdbid44': ['genreid1'], 'imdbid55': ['genreid8']}
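For what it's worth, a more compact sketch of the same grouping with the standard library's collections.defaultdict:

from collections import defaultdict

grouped = defaultdict(list)
for imdb_id, genre_id in movielist:
    grouped[imdb_id].append(genre_id)   # missing keys start as empty lists
print(dict(grouped))
# {'imdbid22': ['genreid1', 'genreid2'], 'imdbid44': ['genreid1'], 'imdbid55': ['genreid8']}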
If you just need (movie, count) pairs, change the original query like this and you'll get the answer without any Python code:
SELECT "movie"."imdb_id", count("moviegenre"."genre_id")
group by "movie"."imdb_id"
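Put together with the question's filters, the full count query could look like the sketch below; the HAVING clause stands in for the min_in_comon threshold from the Python code, and the connection string and threshold value are placeholders (psycopg2 assumed):

import psycopg2

conn = psycopg2.connect("dbname=mydb")  # hypothetical connection string
sql = """
    SELECT m.imdb_id, COUNT(g.genre_id) AS genres_in_common
    FROM moviegenre g
    JOIN movie m ON g.movie_id = m.id
    WHERE m.last_ingested_on IS NOT NULL
      AND m.imdb_id NOT IN ('tt0169547')
      AND g.genre_id IN %(genres)s
    GROUP BY m.imdb_id
    HAVING COUNT(g.genre_id) >= %(min_in_common)s
"""
with conn.cursor() as cur:
    # psycopg2 adapts Python tuples to SQL value lists for IN (...)
    cur.execute(sql, {'genres': (2, 10), 'min_in_common': 2})
    rows = cur.fetchall()  # e.g. [('imdbid22', 2), ...]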
Related
I am using peewee as my ORM and my goal is to serialize the result of a complex query which also contains subqueries:
machine_usage_alias = RecordDailyMachineUsage.alias()
subquery = (
    machine_usage_alias.select(
        machine_usage_alias.machine_id,
        fn.MAX(machine_usage_alias.date).alias('max_date'),
    )
    .group_by(machine_usage_alias.machine_id)
    .alias('machine_usage_subquery')
)
record_subquery = RecordDailyMachineUsage.select(
    RecordDailyMachineUsage.machine_id, RecordDailyMachineUsage.usage
).join(
    subquery,
    on=(
        (RecordDailyMachineUsage.machine_id == subquery.c.machine_id)
        & (RecordDailyMachineUsage.date == subquery.c.max_date)
    ),
)
query = (
    Machine.select(
        Machine.id,  # 0
        Machine.name,
        Machine.location,
        Machine.arch,
        Machine.platform,
        Machine.machine_version,
        Machine.status,
        record_subquery.c.usage.alias('usage'),
        fn.GROUP_CONCAT(Tag.name.distinct()).alias('tags_list'),
        fn.GROUP_CONCAT(Project.full_name.distinct()).alias('projects_list'),
    )  # 10
    .join(MachineTag)
    .join(Tag)
    .switch(Machine)
    .join(MachineProject)
    .join(Project)
    .join(
        record_subquery,
        JOIN.LEFT_OUTER,
        on=(Machine.id == record_subquery.c.machine_id),
    )
    .where((Machine.id != 0) & (Machine.is_alive == 1))
    .group_by(Machine.id)
)
I've tried to use the model_to_dict method:
jsonify({'rows': [model_to_dict(c) for c in query]})
But this way gives me the columns and values from the Machine model only. My aim is to include all the columns from the select query.
It turned out that I had to use the dicts method of the query and jsonify the result.
machine_usage_alias = RecordDailyMachineUsage.alias()
subquery = (
    machine_usage_alias.select(
        machine_usage_alias.machine_id,
        fn.MAX(machine_usage_alias.date).alias('max_date'),
    )
    .group_by(machine_usage_alias.machine_id)
    .alias('machine_usage_subquery')
)
record_subquery = RecordDailyMachineUsage.select(
    RecordDailyMachineUsage.machine_id, RecordDailyMachineUsage.usage
).join(
    subquery,
    on=(
        (RecordDailyMachineUsage.machine_id == subquery.c.machine_id)
        & (RecordDailyMachineUsage.date == subquery.c.max_date)
    ),
)
query = (
    Machine.select(
        Machine.id,  # 0
        Machine.name,
        Machine.location,
        Machine.arch,
        Machine.platform,
        Machine.machine_version,
        Machine.status,
        record_subquery.c.usage.alias('usage'),
        fn.GROUP_CONCAT(Tag.name.distinct()).alias('tags_list'),
        fn.GROUP_CONCAT(Project.full_name.distinct()).alias('projects_list'),
    )  # 10
    .join(MachineTag)
    .join(Tag)
    .switch(Machine)
    .join(MachineProject)
    .join(Project)
    .join(
        record_subquery,
        JOIN.LEFT_OUTER,
        on=(Machine.id == record_subquery.c.machine_id),
    )
    .where((Machine.id != 0) & (Machine.is_alive == 1))
    .group_by(Machine.id)
).dicts()
return jsonify({'rows': [c for c in query]})
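To see the difference on a toy model, here is a self-contained sketch using an in-memory SQLite database (this Machine is a stand-in, not the real model). model_to_dict() only walks the model's own fields, while .dicts() returns one plain dict per row, including aliased extra columns:

from peewee import SqliteDatabase, Model, CharField, IntegerField
from playhouse.shortcuts import model_to_dict

db = SqliteDatabase(':memory:')

class Machine(Model):
    name = CharField()
    cpus = IntegerField()

    class Meta:
        database = db

db.create_tables([Machine])
Machine.create(name='build-01', cpus=8)

# Select the model plus a computed, aliased column.
query = Machine.select(Machine, (Machine.cpus * 2).alias('threads'))

print(model_to_dict(query.first()))  # {'id': 1, 'name': 'build-01', 'cpus': 8} -- no 'threads'
print(list(query.dicts()))           # [{'id': 1, 'name': 'build-01', 'cpus': 8, 'threads': 16}]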
I'm trying to iterate through 4 columns in a CSV that each contain a different number of sale ids.
I make a pandas dataframe and convert each column to a list.
If a column has more sale ids than the following column, it gives me an error:
Message: no such element: Unable to locate element: {"method":"xpath","selector":"/html/body/form[1]/div/select/option[@value=nan]"}
However, if all columns have the same number of ids each, the code works fine.
def get_report_data(self):
    current_date = helpers.currentDate
    data = pd.read_csv(r'C:\Users\rford\Desktop\sale_ids.csv')
    everyone_ids = data['Everyone'].tolist()
    dd_ids = data['Daily Deal'].tolist()
    targeted_ids = data['Targeted'].tolist()
    push_ids = data['Push Notification'].tolist()
    acq_ids = data['Acquisition'].tolist()
    for form_code, sales_type, idlist in (
            (1, "Everyone", everyone_ids),
            (1, "Daily Deal", dd_ids),
            (2, "Targeted", targeted_ids),
            (2, "Push Notification", push_ids),
            (2, "Acquisition", acq_ids)):
        print('Gathering {} Sale Information'.format(sales_type))
        for sale_id in idlist:
            results = []
            helpers.WebDriverWait(helpers.driver, 10)
            helpers.driver.find_element_by_xpath(
                '/html/body/form[{}]/div/select/option[@value={}]'.format(form_code, sale_id)
            ).click()
The built-in function any might be useful in conjunction with each list's pop method:
def get_report_data(self):
    current_date = helpers.currentDate
    data = pd.read_csv(r'C:\Users\rford\Desktop\sale_ids.csv')
    everyone_ids = data['Everyone'].tolist()
    dd_ids = data['Daily Deal'].tolist()
    targeted_ids = data['Targeted'].tolist()
    push_ids = data['Push Notification'].tolist()
    acq_ids = data['Acquisition'].tolist()
    for form_code, sales_type, idlist in (
            (1, "Everyone", everyone_ids),
            (1, "Daily Deal", dd_ids),
            (2, "Targeted", targeted_ids),
            (2, "Push Notification", push_ids),
            (2, "Acquisition", acq_ids)):
        print('Gathering {} Sale Information'.format(sales_type))
        while any(idlist):
            results = []
            helpers.WebDriverWait(helpers.driver, 10)
            helpers.driver.find_element_by_xpath(
                '/html/body/form[{}]/div/select/option[@value={}]'.format(
                    form_code, idlist.pop(0)
                )
            ).click()
Turns out pandas was reading some cells of the CSV as float.
The fix ended up being to use .fillna(0) on my dataframe, then turn each column into a list and make the values integers with .astype(int):
df = pd.read_csv(r'C:\Users\rford\Desktop\sale_ids.csv')
data = df.fillna(0)
everyone_ids = data['Everyone'].astype(int).tolist()
dd_ids = data['Daily Deal'].astype(int).tolist()
targeted_ids = data['Targeted'].astype(int).tolist()
push_ids = data['Push Notification'].astype(int).tolist()
acq_ids = data['Acquisition'].astype(int).tolist()
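A possible refinement (my assumption, not part of the accepted fix): dropping the padding NaNs per column instead of zero-filling avoids feeding a bogus sale id of 0 into the loop:

df = pd.read_csv(r'C:\Users\rford\Desktop\sale_ids.csv')
# Shorter columns lose only their NaN padding, so no placeholder ids remain.
everyone_ids = df['Everyone'].dropna().astype(int).tolist()
dd_ids = df['Daily Deal'].dropna().astype(int).tolist()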
As above, I'm trying to use pd.read_sql to query our MySQL database and I'm getting an error about double/single quotes.
When I remove the % operators from the LIKE clauses (lines 84-87) the query runs, but they are needed. I know I need to format the strings, but I don't know how within such a big query.
Here's the query:
SELECT
s.offer_id,
s.cap_id,
vi.make,
vi.model,
vi.derivative,
i.vehicle_orders,
s.lowest_offer,
CASE
WHEN f.previous_avg = f.previous_low THEN "n/a"
ELSE FORMAT(f.previous_avg, 2)
END as previous_avg,
f.previous_low,
CASE
WHEN ( ( (s.lowest_offer - f.previous_avg) / f.previous_avg) * 100) = ( ( (s.lowest_offer - f.previous_low) / f.previous_low) * 100) THEN "n/a"
ELSE CONCAT(FORMAT( ( ( (s.lowest_offer - f.previous_avg) / f.previous_avg) * 100), 2), "%")
END as diff_avg,
CONCAT(FORMAT( ( ( (s.lowest_offer - f.previous_low) / f.previous_low) * 100), 2), "%") as diff_low,
s.broker,
CASE
WHEN s.in_stock = '1' THEN "In Stock"
ELSE "Factory Order"
END as in_stock,
CASE
WHEN s.special IS NOT NULL THEN "Already in Specials"
ELSE "n/a"
END as special
FROM
( SELECT o.id as offer_id,
o.cap_id as cap_id,
MIN(o.monthly_payment) as lowest_offer,
b.name as broker,
o.stock as in_stock,
so.id as special
FROM
offers o
INNER JOIN brands b ON ( o.brand_id = b.id )
LEFT JOIN special_offers so ON ( so.cap_id = o.cap_id )
WHERE
( o.date_modified >= DATE_ADD(NOW(), INTERVAL -1 DAY) OR o.date_created >= DATE_ADD(NOW(), INTERVAL -1 DAY) )
AND o.deposit_value = 9
AND o.term = 48
AND o.annual_mileage = 8000
AND o.finance_type = 'P'
AND o.monthly_payment > 100
GROUP BY
o.cap_id
ORDER BY
special DESC) s
INNER JOIN
( SELECT o.cap_id as cap_id,
AVG(o.monthly_payment) as previous_avg,
MIN(o.monthly_payment) as previous_low
FROM
offers o
WHERE
o.date_modified < DATE_ADD(NOW(), INTERVAL -1 DAY)
AND o.date_modified >= DATE_ADD(NOW(), INTERVAL -1 WEEK)
AND o.deposit_value = 9
AND o.term = 48
AND o.annual_mileage = 8000
AND o.finance_type = 'P'
AND o.monthly_payment > 100
GROUP BY
o.cap_id ) f ON ( s.cap_id = f.cap_id )
LEFT JOIN
( SELECT a.cap_id as cap_id,
v.manufacturer as make,
v.model as model,
v.derivative as derivative,
COUNT(*) as vehicle_orders
FROM
( SELECT o.id,
o.name as name,
o.email as email,
o.date_created as date,
SUBSTRING_INDEX(SUBSTRING(offer_serialized, LOCATE("capId", offer_serialized) +12, 10), '"', 1) as cap_id
FROM moneyshake.orders o
WHERE o.name NOT LIKE 'test%'
AND o.email NOT LIKE 'jawor%'
AND o.email NOT LIKE 'test%'
AND o.email NOT LIKE '%moneyshake%'
AND o.phone IS NOT NULL
AND o.date_created > DATE_ADD(NOW(), INTERVAL -1 MONTH)
) a JOIN moneyshake.vehicles_view v ON a.cap_id = v.id
GROUP BY
v.manufacturer,
v.model,
v.derivative,
a.cap_id) i ON ( f.cap_id = i.cap_id )
INNER JOIN
( SELECT v.id as id,
v.manufacturer as make,
v.model as model,
v.derivative as derivative
FROM moneyshake.vehicles_view v
GROUP BY v.id ) vi ON s.cap_id = vi.id
WHERE
( ( s.lowest_offer - f.previous_low ) / f.previous_low) * 100 <= -15
GROUP BY
s.cap_id
Thanks!
That error occurs when the DBAPI layer (e.g., mysqlclient) natively uses the "format" paramstyle, so the percent sign (%) is misinterpreted as a format character instead of a LIKE wildcard.
The fix is to wrap the SQL statement in a SQLAlchemy text() object. For example, this will fail:
import pandas as pd
import sqlalchemy as sa
engine = sa.create_engine("mysql+mysqldb://scott:tiger@localhost:3307/mydb")
sql = """\
SELECT * FROM million_rows
WHERE varchar_col LIKE 'record00000%'
ORDER BY id
"""
df = pd.read_sql_query(sql, engine)
but simply changing the read_sql_query() call to
df = pd.read_sql_query(sa.text(sql), engine)
will work.
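Depending on the DBAPI and SQLAlchemy versions in play, another workaround sometimes suggested (an assumption here, not verified for every driver) is to escape the literal percent signs as %% so the "format" paramstyle leaves them alone:

sql = """\
SELECT * FROM million_rows
WHERE varchar_col LIKE 'record00000%%'
ORDER BY id
"""
df = pd.read_sql_query(sql, engine)  # same hypothetical engine/table as above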
After some processing I get the following array:
users = array([u'5451709866311680', u'4660301072957440', u'6370791394377728',
u'5121933955825664', u'4778500988862464', u'5841867648270336',
u'4751430816628736', u'4869137213947904', u'5152642703556608',
u'6531810976595968', u'4824167228637184', u'6058117842337792',
u'5969360933879808', u'4764494160986112', u'5443041280131072',
u'4846257587617792', u'5409371420884992', u'6197117949313024',
u'6643644022915072', u'5060273861820416'], dtype=object)
And then I would like to query these users in another table in BigQuery, but I'm having issues.
query = """
SELECT *
FROM games
WHERE user_id IN %users
"""
segment = pd.io.gbq.read_gbq(query, project_id='shared', dialect='standard')
Does anyone know how to proceed?
Thank you.
Probably you are having issues in your query and not in pandas. In order for this query to work, you'd have to do something like:
query = """
SELECT *
FROM crozzles.games
WHERE user_id IN UNNEST(['user1', 'user2', 'user3'])
"""
If you do not UNNEST your array then BigQuery cannot look for its inner values.
One thing you could do then is something like:
query = """
SELECT *
FROM crozzles.games
WHERE user_id IN UNNEST(%s)
""" % list(map(str, users))
Should result in:
query = """SELECT *
FROM crozzles.games
WHERE user_id IN UNNEST(['5451709866311680', '4660301072957440', '6370791394377728', '5121933955825664', '4778500988862464', '5841867648270336', '4751430816628736', '4869137213947904', '5152642703556608', '6531810976595968', '4824167228637184', '6058117842337792', '5969360933879808', '4764494160986112', '5443041280131072', '4846257587617792', '5409371420884992', '6197117949313024', '6643644022915072', '5060273861820416'])
"""
Here is one possibility using the open dataset bigquery-public-data.github_repos:
from numpy import array
import pandas as pd
PROJECT_ID = 'choose-your-project-id'
input_array = array(['JavaScript', 'Python', 'R'], dtype=object)
query = """
SELECT lang.name, COUNT(*) AS count
FROM `bigquery-public-data.github_repos.languages`, UNNEST(language) AS lang
WHERE lang.name IN UNNEST(@lang_names)
GROUP BY 1
ORDER BY 2 DESC;
"""
query_config = {
    'query': {
        'parameterMode': 'NAMED',
        'queryParameters': [
            {
                'name': 'lang_names',
                'parameterType': {'type': 'ARRAY',
                                  'arrayType': {'type': 'STRING'}},
                'parameterValue': {'arrayValues': [{'value': i} for i in input_array]}
            }
        ]
    }
}
result = pd.io.gbq.read_gbq(query, project_id=PROJECT_ID, dialect='standard',
                            configuration=query_config)
print(result.to_string())
Now this results in:
name count
0 JavaScript 1109499
1 Python 551257
2 R 29572
References:
https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs/query#QueryRequest
https://cloud.google.com/bigquery/docs/reference/rest/v2/QueryParameter
I want to add some records to another table (model) without duplicating them.
I created a function that checks the table data and returns specific values to add to another table.
Here is my code:
def lol_hah(self, cr, uid, ids, context=None):
    noobs_data = []
    cr.execute("select DISTINCT ON (subject_id) subject_id from fci_attendance_line")
    noobs1 = cr.dictfetchall()
    ages = [li['subject_id'] for li in noobs1]
    print(ages)
    for k in ages:
        cr.execute(
            "select DISTINCT ON (student_id) student_id, count(present) AS Number_of_Absenece, present, subject_id as subject_name, s.name AS Student_Name, s.standard_id, s.group_id from fci_attendance_line, fci_student s where subject_id=%d and present=False and s.id=student_id group by student_id, s.sit_number, present, s.name, s.standard_id, s.group_id, subject_id" % k)
        noobs = cr.dictfetchall()
        cr.execute(
            "select DISTINCT ON (student_id) student_id, count(present) AS Number_of_Absenece, present, subject_id as subject_name, s.name AS Student_Name, s.standard_id, s.group_id from fci_attendance_line, fci_student s where subject_id=%d and present=False and s.id=student_id group by student_id, s.sit_number, present, s.name, s.standard_id, s.group_id, subject_id" % k)
        noobs_details = cr.dictfetchall()
        for details_ids in noobs_details:
            for data in noobs:
                details_ids[data['student_id']] = (str(data['number_of_absenece']) + str(data['student_id'])
                                                   + str(data['standard_id']) + str(data['group_id'])
                                                   + str(data['subject_name']))
            noobs_data.append(details_ids)
    print(noobs_data)
    subo_obj = self.pool.get('fci.attendance.subjects')
    count = 0
    for name in noobs_data:
        count = count + 1
        student_ids = self.search(cr, uid, [('student_id.id', '=', int(name['student_id']))])
        if student_ids and int(name['number_of_absenece']) >= 3:
            subo_obj.create(cr, uid, {'student_id': int(name['student_id']),
                                      'number_of_absence': int(name['number_of_absenece']),
                                      'subject_id': int(name['subject_name']),
                                      'standard_id': int(name['standard_id']),
                                      'standard_group': int(name['group_id'])})
    print('Number of times LOL : ', count)
    return True
My function works perfectly, but when I add another value to my table and run it again, the records get duplicated. I want to just update the existing data if it exists. I tried to change my function like this, but it didn't work:
def lol_hah(self, cr, uid, ids, context=None):
    noobs_data = []
    cr.execute("select DISTINCT ON (subject_id) subject_id from fci_attendance_line")
    noobs1 = cr.dictfetchall()
    ages = [li['subject_id'] for li in noobs1]
    print(ages)
    for k in ages:
        cr.execute(
            "select DISTINCT ON (student_id) student_id, count(present) AS Number_of_Absenece, present, subject_id as subject_name, s.name AS Student_Name, s.standard_id, s.group_id from fci_attendance_line, fci_student s where subject_id=%d and present=False and s.id=student_id group by student_id, s.sit_number, present, s.name, s.standard_id, s.group_id, subject_id" % k)
        noobs = cr.dictfetchall()
        cr.execute(
            "select DISTINCT ON (student_id) student_id, count(present) AS Number_of_Absenece, present, subject_id as subject_name, s.name AS Student_Name, s.standard_id, s.group_id from fci_attendance_line, fci_student s where subject_id=%d and present=False and s.id=student_id group by student_id, s.sit_number, present, s.name, s.standard_id, s.group_id, subject_id" % k)
        noobs_details = cr.dictfetchall()
        for details_ids in noobs_details:
            for data in noobs:
                details_ids[data['student_id']] = (str(data['number_of_absenece']) + str(data['student_id'])
                                                   + str(data['standard_id']) + str(data['group_id'])
                                                   + str(data['subject_name']))
            noobs_data.append(details_ids)
    print(noobs_data)
    subo_obj = self.pool.get('fci.attendance.subjects')
    count = 0
    for name in noobs_data:
        count = count + 1
        student_ids = self.search(cr, uid, [('student_id.id', '=', int(name['student_id']))])
        if student_ids and int(name['number_of_absenece']) >= 3:
            ds_ids = subo_obj.search(cr, uid, [('student_id.id', '=', int(name['student_id']))])
            print('Here is ids found', ds_ids)
            if ds_ids != []:
                subo_obj.write(cr, uid, ds_ids, {'number_of_absence': int(name['number_of_absenece'])}, context=context)
            else:
                subo_obj.create(cr, uid, {'student_id': int(name['student_id']),
                                          'number_of_absence': int(name['number_of_absenece']),
                                          'subject_id': int(name['subject_name']),
                                          'standard_id': int(name['standard_id']),
                                          'standard_group': int(name['group_id'])})
    print('Number of times LOL : ', count)
    return True
I hope you got what I mean :)
Do you mean you're trying to merge two lists but want only one unique instance of each item?
If this is the case, you could add all of the data to the list and then run something like noobs_data_trimmed = list(set(noobs_data)).
Making a list into a set will obliterate exact duplicates within the set. Then you can turn it back into a list for easier processing.
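One caveat (my addition, not part of the answer above): set() needs hashable items, and noobs_data holds dicts, so list(set(noobs_data)) would raise a TypeError. A sketch that dedupes the dicts by a hashable fingerprint instead:

seen = set()
unique_noobs = []
for rec in noobs_data:
    key = frozenset(rec.items())  # dicts aren't hashable, but their items can be
    if key not in seen:
        seen.add(key)
        unique_noobs.append(rec)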