Question: I want to do an operation similar to ARRAY_AGG(STRUCT(table)) in Beam for Python.
Background:
Similar to this thread, I'm running a Beam pipeline in Python. I have two tables, one with IDs and a sum:

ID  total
1   10
2   15
3   5
And one breakdown table where each row is:
table1_id  item_name  item_price
1          a          2
1          b          8
2          c          5
2          d          5
2          e          5
3          f          7
I want the output in BigQuery to look like:
id  total  item.item_name  item.item_price
1   10     a               2
           b               8
2   15     c               5
           d               5
           e               5
3   5      f               7
In BQ this is solvable by doing an ARRAY_AGG(STRUCT(line_items)) and grouping by table1_id, which can then be joined on table1. Is there a smart way to do so in Beam with Python?
(Assuming it's something with GroupBy, but I haven't been able to get it working.)
I propose a full implementation of your solution as a unit test:
from typing import List, Dict, Tuple, Any

import apache_beam as beam
from apache_beam import Create
from apache_beam.pvalue import AsList
from apache_beam.testing.test_pipeline import TestPipeline


def test_pipeline():
    with TestPipeline() as p:
        ids = [
            {'ID': 1, 'total': 10},
            {'ID': 2, 'total': 15},
            {'ID': 3, 'total': 5}
        ]

        items = [
            {'table1_id': 1, 'item_name': 'a', 'item_price': 2},
            {'table1_id': 1, 'item_name': 'b', 'item_price': 8},
            {'table1_id': 2, 'item_name': 'c', 'item_price': 5},
            {'table1_id': 2, 'item_name': 'd', 'item_price': 5},
            {'table1_id': 2, 'item_name': 'e', 'item_price': 5},
            {'table1_id': 3, 'item_name': 'f', 'item_price': 7}
        ]

        ids_side_inputs = p | 'Side input IDs' >> Create(ids)

        result = (p
                  | 'Input items' >> Create(items)
                  # Group items by their foreign key, like GROUP BY table1_id.
                  | beam.GroupBy(lambda i: i['table1_id'])
                  # Attach the matching total via the ids side input.
                  | beam.Map(to_item_tuple_with_total, ids=AsList(ids_side_inputs))
                  # Build the final dict with the aggregated item list.
                  | beam.Map(to_item_result))

        result | "Print outputs" >> beam.Map(print)


def to_item_tuple_with_total(item_tuple: Tuple[int, Any], ids: List[Dict]) -> Tuple[Dict, List[Dict]]:
    table_id = item_tuple[0]
    total = next(id_element for id_element in ids if id_element['ID'] == table_id)['total']
    return {'id': table_id, 'total': total}, item_tuple[1]


def to_item_result(item_tuple: Tuple[Dict, Any]) -> Dict:
    item_key = item_tuple[0]
    return {'id': item_key['id'], 'total': item_key['total'], 'item': item_tuple[1]}
The result is:
{
    'id': 1,
    'total': 10,
    'item': [
        {'table1_id': 1, 'item_name': 'a', 'item_price': 2},
        {'table1_id': 1, 'item_name': 'b', 'item_price': 8}
    ]
}
{
    'id': 2,
    'total': 15,
    'item': [
        {'table1_id': 2, 'item_name': 'c', 'item_price': 5},
        {'table1_id': 2, 'item_name': 'd', 'item_price': 5},
        {'table1_id': 2, 'item_name': 'e', 'item_price': 5}
    ]
}
{
    'id': 3,
    'total': 5,
    'item': [
        {'table1_id': 3, 'item_name': 'f', 'item_price': 7}
    ]
}
Some explanations:
I simulated the items input PCollection from BigQuery.
I simulated the ids side input PCollection from BigQuery.
I added a GroupBy on table1_id to the items PCollection.
I added a Map with the IDs side input list to link each total to its items.
The last Map returns a Dict with the expected fields, ready to be saved to BigQuery.
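If the ids table is too large to fit comfortably in a side input, a CoGroupByKey-based join is an alternative. Below is a minimal sketch (not from the original answer) that reuses the ids and items lists above and assumes every table1_id has exactly one matching ID row:

import apache_beam as beam

with beam.Pipeline() as p:
    # Key both collections by the join key.
    totals_kv = (p | 'Create ids' >> beam.Create(ids)
                   | 'Key by ID' >> beam.Map(lambda r: (r['ID'], r['total'])))
    items_kv = (p | 'Create items' >> beam.Create(items)
                  | 'Key by table1_id' >> beam.Map(lambda r: (r['table1_id'], r)))

    # CoGroupByKey yields (key, {'totals': [...], 'items': [...]}) per key.
    ({'totals': totals_kv, 'items': items_kv}
     | 'Join' >> beam.CoGroupByKey()
     | 'To result dict' >> beam.Map(lambda kv: {
           'id': kv[0],
           'total': list(kv[1]['totals'])[0],  # assumption: exactly one total per ID
           'item': list(kv[1]['items'])})
     | 'Print' >> beam.Map(print))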
Related
I have a pandas dataframe and would like to cast it to a dict.
   item  settings_id  mat_id  order
0  a-1   b1           32      1
1  a-1   x1           12      2
2  a-1   y4           3       3
3  a-2   k1           0       1
4  a-2   3a           2       2
5  a-2   x1           94      3
6  b-1   y4           32      1
7  b-1   b1           9       2
to
{'roots': [{'item': 'a-1',
'settings': [{'settings_id': 'b1', 'mat_id': 32, 'order': 1},
{'settings_id': 'x1', 'mat_id': 12, 'order': 2},
{'settings_id': 'y4', 'mat_id': 3, 'order': 3}]},
{'item': 'a-2',
'settings': [{'settings_id': 'k1', 'mat_id': 0, 'order': 1},
{'settings_id': '3a', 'mat_id': 2, 'order': 2},
{'settings_id': 'x1', 'mat_id': 94, 'order': 3}]},
{'item': 'b-1',
'settings': [{'settings_id': 'y4', 'mat_id': 32, 'order': 1},
{'settings_id': 'b1', 'mat_id': 9, 'order': 2}]}]}
In the pandas documentation there is the method to_dict, but I couldn't get it to produce the structure I wanted. Therefore I came up with using dataclasses for that.
However, I was wondering if there is a more convenient way?
from typing import List
from typing import Any
from dataclasses import dataclass, asdict
import pandas as pd


@dataclass
class Setting:
    settings_id: str
    mat_id: int
    order: int

    @staticmethod
    def from_dict(obj: Any) -> 'Setting':
        _settings_id = str(obj.get("settings_id"))
        _mat_id = int(obj.get("mat_id"))
        _order = int(obj.get("order"))
        return Setting(_settings_id, _mat_id, _order)


@dataclass
class ItemData:
    item: str
    settings: List[Setting]

    @staticmethod
    def from_dict(obj: Any) -> 'ItemData':
        _item = str(obj.get("item"))
        _settings = [Setting.from_dict(y) for y in obj.get("settings")]
        return ItemData(_item, _settings)


@dataclass
class Root:
    roots: List[ItemData]

    @staticmethod
    def from_dict(obj: Any) -> 'Root':
        _roots = [ItemData.from_dict(y) for y in obj.get("ItemData")]
        return Root(_roots)


df = pd.DataFrame({"item": ["a-1", "a-1", "a-1", "a-2", "a-2", "a-2", "b-1", "b-1"],
                   "settings_id": ["b1", "x1", "y4", "k1", "3a", "x1", "y4", "b1"],
                   "mat_id": [32, 12, 3, 0, 2, 94, 32, 9],
                   "order": [1, 2, 3, 1, 2, 3, 1, 2]
                   })

itemsData = []
items = df["item"].unique()
for item in items:
    element = df[df["item"] == item]
    settings = []
    for index, row in element.iterrows():
        setting = Setting(row["settings_id"], row["mat_id"], row["order"])
        settings.append(setting)
    itemsData.append(ItemData(item, settings))

r = Root(itemsData)
asdict(r)
Thank you in advance
You can use to_dict with the kwarg orient="records" while looping over df.groupby("item"):
from pprint import pprint

rec = []
for item, sub_df in df.groupby("item"):
    rec.append({
        "item": item,
        "settings": sub_df.drop(columns="item").to_dict(orient="records")
    })
pprint(rec)
Output:
[{'item': 'a-1',
'settings': [{'mat_id': 32, 'order': 1, 'settings_id': 'b1'},
{'mat_id': 12, 'order': 2, 'settings_id': 'x1'},
{'mat_id': 3, 'order': 3, 'settings_id': 'y4'}]},
{'item': 'a-2',
'settings': [{'mat_id': 0, 'order': 1, 'settings_id': 'k1'},
{'mat_id': 2, 'order': 2, 'settings_id': '3a'},
{'mat_id': 94, 'order': 3, 'settings_id': 'x1'}]},
{'item': 'b-1',
'settings': [{'mat_id': 32, 'order': 1, 'settings_id': 'y4'},
{'mat_id': 9, 'order': 2, 'settings_id': 'b1'}]}]
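If you need the exact {'roots': [...]} shape from your example, wrap the list afterwards with {'roots': rec}.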
I need to get the count of groups which share the same 'id' and 'name'.
Input:
myd = {
    "Items": [
        {"id": 1, "name": "ABC", "value": 666},
        {"id": 1, "name": "ABC", "value": 89},
        {"id": 2, "name": "DEF", "value": 111},
        {"id": 3, "name": "GHI", "value": 111}
    ]
}
Expected output:
The count of {'id':1, 'name': 'ABC' } is 2
The count of {'id':2, 'name': 'DEF' } is 1
The count of {'id':3, 'name': 'GHI' } is 1
We can get the total length with len(myd), and for a single key with len(myd['id']).
How do I get the count for the combination of id and name?
You can use collections.OrderedDict and set the ('id', 'name') tuple as the key. This way, the OrderedDict automatically groups the dictionaries with the same 'id' and 'name' values in order:
from collections import OrderedDict

myd = {'Items': [
    {'id': 1, 'name': 'ABC', 'value': 666},
    {'id': 1, 'name': 'ABC', 'value': 89},
    {'id': 2, 'name': 'DEF', 'value': 111},
    {'id': 3, 'name': 'GHI', 'value': 111}]
}

od = OrderedDict()
for d in myd['Items']:
    # Collect values in a list so rows with duplicate values are still counted.
    od.setdefault((d['id'], d['name']), []).append(d['value'])

for ks, v in od.items():
    print("The count of {{'id': {}, 'name': {}}} is {}".format(ks[0], ks[1], len(v)))
Output:
The count of {'id': 1, 'name': ABC} is 2
The count of {'id': 2, 'name': DEF} is 1
The count of {'id': 3, 'name': GHI} is 1
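As a shorter alternative (not in the original answers), collections.Counter keyed on the same (id, name) tuple counts the rows directly; a minimal sketch:

from collections import Counter

# Count rows per (id, name) pair in a single pass.
counts = Counter((d['id'], d['name']) for d in myd['Items'])
for (i, name), n in counts.items():
    print(f"The count of {{'id': {i}, 'name': '{name}'}} is {n}")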
This is a good candidate for groupby and itemgetter usage:
from itertools import groupby
from operator import itemgetter

myd = {'Items': [
    {'id': 1, 'name': 'ABC', 'value': 666},
    {'id': 1, 'name': 'ABC', 'value': 89},
    {'id': 2, 'name': 'DEF', 'value': 111},
    {'id': 3, 'name': 'GHI', 'value': 111}]
}

# groupby only groups consecutive elements, so sort by the same key first.
grouper = itemgetter('id', 'name')
for i, v in groupby(sorted(myd['Items'], key=grouper), key=grouper):
    print(f"the count for {dict(id=i[0], name=i[1])} is {len(list(v))}")
I have a dataframe like below. Each topic has several sub-topics.
df = pd.DataFrame({'topic': ['A', 'A', 'A', 'B', 'B'],
                   'sub-topic': ['A1', 'A2', 'A3', 'B1', 'B3'],
                   'value': [2, 12, 44, 21, 1]})
  topic sub-topic  value
0     A        A1      2
1     A        A2     12
2     A        A3     44
3     B        B1     21
4     B        B3      1
I need to convert it to JSON format like below. Within the first layer, for example topic A, the value is the sum of all its sub-topics.
{'A': {
    'value': 58,
    'children': {
        'A1': {'value': 2},
        'A2': {'value': 12},
        'A3': {'value': 44}
    }
 },
 'B': {
    'value': 22,
    'children': {
        'B1': {'value': 21},
        'B3': {'value': 1}
    }
 }
}
Does anyone know how I can convert the data to this specific JSON? I have no clue how I should approach it. Thanks a lot in advance.
Use a custom function in GroupBy.apply, then use Series.to_dict or Series.to_json:
def f(x):
    d = {'value': x['value'].sum(),
         'children': x.set_index('sub-topic')[['value']].to_dict('index')}
    return d

# for a dictionary
out = df.groupby('topic').apply(f).to_dict()
# for JSON
# out = df.groupby('topic').apply(f).to_json()
print(out)
{
    'A': {
        'value': 58,
        'children': {
            'A1': {'value': 2},
            'A2': {'value': 12},
            'A3': {'value': 44}
        }
    },
    'B': {
        'value': 22,
        'children': {
            'B1': {'value': 21},
            'B3': {'value': 1}
        }
    }
}
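The same result can also be built without GroupBy.apply by iterating the groups directly; a minimal sketch of that variant (not from the original answer):

# Build the nested dict per topic group.
out = {
    topic: {
        'value': int(g['value'].sum()),
        'children': g.set_index('sub-topic')[['value']].to_dict('index')
    }
    for topic, g in df.groupby('topic')
}
print(out)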
I have a json file with a deeply nested recursive structure:
{"children": [{
    "val": x,
    "data": y,
    "children": [{
        "val": x,
        "data": y,
        "children": [{
            ...
        }]
    }]
}]}
Using pandas json_normalize as follows:
json_normalize(data=self.data["children"], record_path="children")
gives a dataframe where the first level is flattened but the deeper levels remain nested JSON within the dataframe.
How can I flatten my dataframe such that the entire JSON tree is unpacked and flattened?
Provided your JSON is well formatted and has the same structure at all levels, you can extract all the data by passing a list of keys to json_normalize, one per level.
from pandas import json_normalize

json = {'children': [{
    'val': 1,
    'data': 2,
    'children': [{
        'val': 3,
        'data': 4,
        'children': [{'val': 4, 'data': 5}],
    }],
}, {
    'val': 6,
    'data': 7,
    'children': [{
        'val': 8,
        'data': 9,
        'children': [{'val': 10, 'data': 11}],
    }]
}]}

# record_path=['children'] * i descends i levels, printing one DataFrame per depth.
for i in range(1, 4):
    print(json_normalize(data=json, record_path=['children'] * i))
This gives the following output, which you can recursively merge into a single DataFrame if you wish.
                                            children  data  val
0  [{'val': 3, 'data': 4, 'children': [{'val': 4,...     2    1
1  [{'val': 8, 'data': 9, 'children': [{'val': 10...     7    6
                     children  data  val
0     [{'val': 4, 'data': 5}]     4    3
1  [{'val': 10, 'data': 11}]      9    8
   data  val
0     5    4
1    11   10
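If you want everything in one DataFrame regardless of depth, a small recursive helper is one option; a minimal sketch (not from the original answer), assuming every node has 'val' and 'data' plus an optional 'children' list, reusing the json variable above:

import pandas as pd

def collect(node, depth=1, rows=None):
    # Record this node, then recurse into its children.
    if rows is None:
        rows = []
    rows.append({'val': node['val'], 'data': node['data'], 'depth': depth})
    for child in node.get('children', []):
        collect(child, depth + 1, rows)
    return rows

rows = []
for top in json['children']:
    rows.extend(collect(top))
print(pd.DataFrame(rows))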
I've got a pandas dataset with a column that's a comma-separated string, e.g. 1,2,3,10:
data = [
{ 'id': 1, 'score': 9, 'topics': '11,22,30' },
{ 'id': 2, 'score': 7, 'topics': '11,18,30' },
{ 'id': 3, 'score': 6, 'topics': '1,12,30' },
{ 'id': 4, 'score': 4, 'topics': '1,18,30' }
]
df = pd.DataFrame(data)
I'd like to get a count and a mean score for each value in topics. So:
topic_id,count,mean
1,2,5
11,2,8
12,1,6
et cetera. How can I do this?
I've got as far as:
df['topic_ids'] = df.topics.str.split()
But now I guess I want to explode topic_ids out, so there's a column for each unique value in the entire set of values...?
Unnest, then groupby and agg:
import numpy as np
import pandas as pd

df.topics = df.topics.str.split(',')
New_df = pd.DataFrame({
    'topics': np.concatenate(df.topics.values),
    'id': df.id.repeat(df.topics.apply(len)),
    'score': df.score.repeat(df.topics.apply(len))
})
New_df.groupby('topics').score.agg(['count', 'mean'])
Output:
        count  mean
topics
1           2   5.0
11          2   8.0
12          1   6.0
18          2   5.5
22          1   9.0
30          4   6.5
def mean1(x):
    return np.array(x).astype(int).mean()

df.topics.str.split(',', expand=False).agg([mean1, len])
Output:
       mean1  len
0  21.000000    3
1  19.666667    3
2  14.333333    3
3  16.333333    3
This is one way. Reindex & stack, then groupby & agg.
import pandas as pd

data = [
    {'id': 1, 'score': 9, 'topics': '11,22,30'},
    {'id': 2, 'score': 7, 'topics': '11,18,30'},
    {'id': 3, 'score': 6, 'topics': '1,12,30'},
    {'id': 4, 'score': 4, 'topics': '1,18,30'}
]
df = pd.DataFrame(data)
df.topics = df.topics.str.split(',')

# Spread the topic lists into columns, stack them into rows, then tidy up.
df2 = (pd.DataFrame(df.topics.tolist(), index=[df.id, df.score])
         .stack()
         .reset_index(name='topics')
         .drop(columns='level_2'))

df2.groupby('topics').score.agg(['count', 'mean']).reset_index()
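On pandas 0.25+, DataFrame.explode shortens the unnest step considerably; a minimal sketch (not from the original answers), starting again from the raw df with string topics:

# Split the comma-separated strings into lists, then emit one row per topic.
exploded = df.assign(topics=df.topics.str.split(',')).explode('topics')
print(exploded.groupby('topics').score.agg(['count', 'mean']).reset_index())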