I have a list of dictionaries:
data = [{"average": 2, "day": "2022-01-01", "name": "joe", "employee_id": 1},
{"average": 3, "day": "2022-01-02", "name": "joe", "employee_id": 1},
{"average": 9, "day": "2022-01-03", "name": "joe", "employee_id": 1},
{"sum": 13, "day": "2022-01-01", "name": "joe", "employee_id": 1},
{"sum": 15, "day": "2022-01-02", "name": "joe", "employee_id": 1},
{"sum": 0, "day": "2022-01-03", "name": "joe", "employee_id": 1},
{"average": 1, "day": "2022-01-01", "name": "bob", "employee_id": 2},
{"average": 3, "day": "2022-01-02", "name": "bob", "employee_id": 2},
{"sum": 9, "day": "2022-01-01", "name": "bob", "employee_id": 2},
{"sum": 8, "day": "2022-01-02", "name": "bob", "employee_id": 2}]
I want my output as:
output = [{"name": "joe", "employee_id": 1, "day": "2022-01-01", "average": 2, "sum": 13},
{"name": "joe", "employee_id": 1, "day": "2022-01-02", "average": 3, "sum": 15},
{"name": "joe", "employee_id": 1, "day": "2022-01-03", "average": 9, "sum": 0},
{"name": "bob", "employee_id": 2, "day": "2022-01-01", "average": 1, "sum": 9},
{"name": "bob", "employee_id": 2, "day": "2022-01-02", "average": 3, "sum": 8}]
The goal is that the output values are put together by day, name, and employee_id.
I've tried:
output = {}
for item in data:
    if item["day"] not in output:
        output[item["day"]] = item
    else:
        output[item["day"]].update(item)
print(list(output.values()))
This works in getting the "average", "sum", and "day" together, but because it keys only on the day, it ends up not including all of the employees and their IDs.
Any help is appreciated.
Use collections.defaultdict(dict), taking the 'day' and 'name' of each dictionary together as the key:
>>> from collections import defaultdict
>>> defdict = defaultdict(dict)
>>> for mp in data:
... defdict[mp['day'], mp['name']].update(mp)
...
>>> keys = ('name', 'employee_id', 'day', 'average', 'sum')
>>> [{k: mp[k] for k in keys} for mp in defdict.values()]
[{'name': 'joe', 'employee_id': 1, 'day': '2022-01-01', 'average': 2, 'sum': 13},
{'name': 'joe', 'employee_id': 1, 'day': '2022-01-02', 'average': 3, 'sum': 15},
{'name': 'joe', 'employee_id': 1, 'day': '2022-01-03', 'average': 9, 'sum': 0},
{'name': 'bob', 'employee_id': 2, 'day': '2022-01-01', 'average': 1, 'sum': 9},
{'name': 'bob', 'employee_id': 2, 'day': '2022-01-02', 'average': 3, 'sum': 8}]
For roughly 1.5 million records, this solution still outperforms pandas (the plain for loop has finished before pandas has even converted the data into a DataFrame). The benchmark below assumes random, string, datetime, and pandas as pd are already imported:
In [451]: random.seed(0)
...: names = [''.join(random.choices(string.ascii_lowercase, k=random.randrange(3, 7))) for _ in range(10000)]
...: dates = [str(datetime.date(2022, i, j)) for i in range(7, 10) for j in range(1, 31)]
...: keys = ['sum', 'average']
...:
...: data = [{k: random.randrange(10), 'day': date, 'name': name, 'employee_id': i}
...: for i, name in enumerate(names, 1)
...: for date in sorted(random.sample(dates, random.randrange(60, 90)))
...: for k in keys]
...:
In [452]: len(data)
Out[452]: 1492286
In [453]: %%timeit
...: defdict = defaultdict(dict)
...: for mp in data:
...: defdict[mp['day'], mp['name']].update(mp)
...: keys = ('name', 'employee_id', 'day', 'average', 'sum')
...: [{k: mp[k] for k in keys} for mp in defdict.values()]
...:
...:
926 ms ± 6.38 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
In [454]: %%timeit
...: df = pd.DataFrame(data)
...: pd.merge(df.loc[df['average'].notna()][[ 'name','day','employee_id','average']],
...: df.loc[df['sum'].notna()][['name','day','employee_id','sum']],
...: how='outer'
...: ).to_dict(orient= 'records')
...:
...:
3.58 s ± 19.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
In [455]: %timeit pd.DataFrame(data)
1.26 s ± 17.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Here is one way to do it:
# filter with loc where average is not null and where sum is not null, as two separate frames
# merge the two DataFrames
# finally, output as a list of records
import pandas as pd

df = pd.DataFrame(data)
pd.merge(df.loc[df['average'].notna()][['name', 'day', 'employee_id', 'average']],
         df.loc[df['sum'].notna()][['name', 'day', 'employee_id', 'sum']],
         how='outer'
         ).to_dict(orient='records')
[{'name': 'joe',
'day': '2022-01-01',
'employee_id': 1,
'average': 2.0,
'sum': 13.0},
{'name': 'joe',
'day': '2022-01-02',
'employee_id': 1,
'average': 3.0,
'sum': 15.0},
{'name': 'joe',
'day': '2022-01-03',
'employee_id': 1,
'average': 9.0,
'sum': 0.0},
{'name': 'bob',
'day': '2022-01-01',
'employee_id': 2,
'average': 1.0,
'sum': 9.0},
{'name': 'bob',
'day': '2022-01-02',
'employee_id': 2,
'average': 3.0,
'sum': 8.0}]
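A side note on the output above: the ints come back as floats because the 'average' and 'sum' columns contain NaN in the combined DataFrame (each row of data carries only one of the two metrics). If integer output matters, one hedged follow-up, reusing the DataFrame df built from data above and assuming every (name, day, employee_id) group really has both values, is to cast after the merge:
merged = pd.merge(df.loc[df['average'].notna()][['name', 'day', 'employee_id', 'average']],
                  df.loc[df['sum'].notna()][['name', 'day', 'employee_id', 'sum']],
                  how='outer')
# astype(int) would raise if any value were actually missing (NaN) after the outer merge
merged[['average', 'sum']] = merged[['average', 'sum']].astype(int)
print(merged.to_dict(orient='records'))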
From the description, the combination of "day", "name", and "employee_id" acts as a unique key to which the other two fields should be added. Each incoming dictionary contains these, so we can use them as a key into a new dictionary, but we first need to convert them into something hashable, such as a JSON string produced with sorted keys so that equal combinations always serialize the same way.
from json import dumps
data = [{"average": 2, "day": "2022-01-01", "employee_id": 1, "name": "joe"},
{"average": 3, "day": "2022-01-02", "name": "joe", "employee_id": 1},
{"average": 9, "day": "2022-01-03", "name": "joe", "employee_id": 1},
{"sum": 13, "day": "2022-01-01", "name": "joe", "employee_id": 1},
{"sum": 15, "day": "2022-01-02", "name": "joe", "employee_id": 1},
{"sum": 0, "day": "2022-01-03", "name": "joe", "employee_id": 1},
{"average": 1, "day": "2022-01-01", "name": "bob", "employee_id": 2},
{"average": 3, "day": "2022-01-02", "name": "bob", "employee_id": 2},
{"sum": 9, "day": "2022-01-01", "name": "bob", "employee_id": 2},
{"sum": 8, "day": "2022-01-02", "name": "bob", "employee_id": 2}]
flattened_employee_summaries = dict()
for employee_summary in data:
    key = employee_summary.copy()
    if "average" in key:
        del key["average"]
        if dumps(key, sort_keys=True) not in flattened_employee_summaries:
            flattened_employee_summaries[dumps(key, sort_keys=True)] = employee_summary.copy()
        else:
            flattened_employee_summaries[dumps(key, sort_keys=True)]["average"] = employee_summary["average"]
    if "sum" in key:
        del key["sum"]
        if dumps(key, sort_keys=True) not in flattened_employee_summaries:
            flattened_employee_summaries[dumps(key, sort_keys=True)] = employee_summary.copy()
        else:
            flattened_employee_summaries[dumps(key, sort_keys=True)]["sum"] = employee_summary["sum"]

flattened_employee_summaries = [summary for summary in flattened_employee_summaries.values()]
print(f'{flattened_employee_summaries=}')
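A small note on the hashable-key step: a plain tuple of the three identifying fields is also hashable, so the JSON round-trip can be skipped entirely. A minimal sketch of that variant (which is essentially what the defaultdict answer above does):
flattened = {}
for employee_summary in data:
    key = (employee_summary["day"], employee_summary["name"], employee_summary["employee_id"])
    # create the merged record on first sight, then let later rows update it in place
    flattened.setdefault(key, {}).update(employee_summary)
print(list(flattened.values()))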
This has already been answered, and I suspect mine is just the long way of repeating Mechanic Pig's solution, which I'd recommend. For all of the solutions, I believe we are assuming there is only one average record per employee/day.
employees = dict()
for data_row in data:
    if data_row['employee_id'] not in employees:
        employees[data_row['employee_id']] = {data_row['day']: {'name': data_row.get('name', 0),
                                                                'average': data_row.get('average', 0),
                                                                'sum': data_row.get('sum', 0)
                                                                }
                                              }
    else:
        data_row_day = data_row['day']
        if data_row['day'] not in employees[data_row['employee_id']]:
            employees[data_row['employee_id']][data_row_day] = {'name': data_row.get('name', 0),
                                                                'average': data_row.get('average', 0),
                                                                'sum': data_row.get('sum', 0)
                                                                }
        else:
            current_sum = employees[data_row['employee_id']][data_row_day].get('sum', 0)
            employees[data_row['employee_id']][data_row_day].update({'sum': current_sum + data_row.get('sum', 0)})

employee_output = list()
for employee_id, employee_dates in employees.items():
    for employee_date, employee_details in employee_dates.items():
        employee_output.append({"name": employee_details['name'],
                                "employee_id": employee_id,
                                "day": employee_date,
                                "average": employee_details['average'],
                                "sum": employee_details['sum'],
                                })
employee_output would then contain:
[{'name': 'joe',
'employee_id': 1,
'day': '2022-01-01',
'average': 2,
'sum': 13},
{'name': 'joe',
'employee_id': 1,
'day': '2022-01-02',
'average': 3,
'sum': 15},
{'name': 'joe',
'employee_id': 1,
'day': '2022-01-03',
'average': 9,
'sum': 0},
{'name': 'bob',
'employee_id': 2,
'day': '2022-01-01',
'average': 1,
'sum': 9},
{'name': 'bob',
'employee_id': 2,
'day': '2022-01-02',
'average': 3,
'sum': 8}]
Related
Assume I have this:
[
{"name": "bob", "total": 1},
{"name": "alice", "total": 5},
{"name": "eve", "total": 2},
{"name": "bob", "total": 3},
{"name": "alice", "total": 2},
{"name": "alice", "total": 2},
]
I want to transform this list into :
[
{"name": "bob", "total": 4},
{"name": "alice", "total": 9},
{"name": "eve", "total": 2}
]
For now, for every item of the first list I walk through the whole second list to check whether the key already exists.
How can I achieve this with lower complexity?
from collections import defaultdict
a = [
{"name": "bob", "total": 1},
{"name": "alice", "total": 5},
{"name": "eve", "total": 2},
{"name": "bob", "total": 3},
{"name": "alice", "total": 2},
{"name": "alice", "total": 2},
]
# accumulate the total for each name
freq = defaultdict(lambda: 0)
for d in a:
    freq[d['name']] += d['total']

# build the result list
a = list()
for key, val in freq.items():
    a.append({'name': key, 'total': val})
print(a)
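A closely related variant, in case it helps: collections.Counter does the same accumulation. records below stands for the original input list, since the code above reuses the name a for its output:
from collections import Counter

records = [
    {"name": "bob", "total": 1},
    {"name": "alice", "total": 5},
    {"name": "eve", "total": 2},
    {"name": "bob", "total": 3},
    {"name": "alice", "total": 2},
    {"name": "alice", "total": 2},
]

totals = Counter()
for d in records:
    totals[d["name"]] += d["total"]

result = [{"name": name, "total": total} for name, total in totals.items()]
print(result)  # [{'name': 'bob', 'total': 4}, {'name': 'alice', 'total': 9}, {'name': 'eve', 'total': 2}]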
If you only have two pieces of information (name and total), I would suggest changing your schema a bit. Instead of a list of dictionaries, use a single dictionary where the keys are names and the values are totals:
>>> values = [
... {"name": "bob", "total": 1},
... {"name": "alice", "total": 5},
... {"name": "eve", "total": 2},
... {"name": "bob", "total": 3},
... {"name": "alice", "total": 2},
... {"name": "alice", "total": 2},
... ]
>>> from collections import defaultdict
>>> totals_by_name = defaultdict(int)
>>> for value in values:
... totals_by_name[value["name"]] += value["total"]
...
>>> totals_by_name
defaultdict(<class 'int'>, {'bob': 4, 'alice': 9, 'eve': 2})
This can work even if you have more pieces of data that you want to look up by name (replace the integer value with a nested dictionary that stores the total as well as other data).
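For example, a minimal sketch of that nested shape, reusing values from the session above (the count field here is purely illustrative):
from collections import defaultdict

# each name maps to a small record instead of a bare total
by_name = defaultdict(lambda: {"total": 0, "count": 0})
for value in values:
    by_name[value["name"]]["total"] += value["total"]
    by_name[value["name"]]["count"] += 1  # illustrative extra piece of data

# dict(by_name) -> {'bob': {'total': 4, 'count': 2}, 'alice': {'total': 9, 'count': 3}, 'eve': {'total': 2, 'count': 1}}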
You can use groupby from the itertools module:
from itertools import groupby
from operator import itemgetter
# itemgetter(foo) is roughly equivalent to lambda x: x[foo]
get_name = itemgetter('name')
get_total = itemgetter('total')
lst = [
{"name": "bob", "total": 1},
{"name": "alice", "total": 5},
{"name": "eve", "total": 2},
{"name": "bob", "total": 3},
{"name": "alice", "total": 2},
{"name": "alice", "total": 2},
]
grouped = groupby(sorted(lst, key=get_name), get_name)
new_list = [{'name': k, 'total': sum(get_total(x) for x in v)} for k, v in grouped]
groupby will produce a new sequence that collects the dicts from the original list into subsequences, based on a common value of the 'name' attribute. Iterating over that lets you extract all the total values to sum up for use in a new list of dict values.
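For the sample list above, new_list should come out as follows (names appear in sorted order because of the sorted() call):
print(new_list)
# [{'name': 'alice', 'total': 9}, {'name': 'bob', 'total': 4}, {'name': 'eve', 'total': 2}]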
Let's say,
your_data = [
{"name": "bob", "total": 1},
{"name": "alice", "total": 5},
{"name": "eve", "total": 2},
{"name": "bob", "total": 3},
{"name": "alice", "total": 2},
{"name": "alice", "total": 2},
]
You can simply use pandas to receive the desired output.
import pandas as pd
df = pd.DataFrame(your_data)
df = df.groupby(by='name', as_index=False)['total'].sum()
result = df.to_dict(orient = 'records')
OUTPUT: [{'name': 'alice', 'total': 9}, {'name': 'bob', 'total': 4}, {'name': 'eve', 'total': 2}]
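Note that groupby sorts the group keys by default, which is why alice comes before bob here. If the original order matters, sort=False should preserve it (a small tweak of the same code):
df = pd.DataFrame(your_data)
result = df.groupby(by='name', as_index=False, sort=False)['total'].sum().to_dict(orient='records')
print(result)
# [{'name': 'bob', 'total': 4}, {'name': 'alice', 'total': 9}, {'name': 'eve', 'total': 2}]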
I have a pandas dataframe that has information about a user with multiple orders and within each order there are multiple items purchases. An example of the dataframe format:
user_id | order_num | item_id | item_desc
      1 |         1 |       1 | red
      1 |         1 |       2 | blue
      1 |         1 |       3 | green
I want to convert it to JSONb Object in a column so that I can query it in postgresql.
Currently I am using the following code:
j = (reg_test.groupby(['user_id', 'order_num'], as_index=False)
             .apply(lambda x: x[['item_id', 'item_desc']].to_dict('r'))
             .reset_index()
             .rename(columns={0: 'New-Data'})
             .to_json(orient='records'))
This is the result I am getting:
'''
[
{
"New-Data": [
{
"item_id": "1",
"item_desc": "red",
},
{
"item_id": "2",
"item_desc": "blue",
},
{
"item_id": "3",
"item_desc": "green",
}
],
"order_number": "1",
"user_id": "1"
}
]
'''
While that is valid JSON, I want the result to look like this:
'''
[
{
"New-Data": [{
"1":
{
"item_id": "1",
"item_desc": "red",
},
"2": {
"item_id": "2",
"item_desc": "blue",
},
"3":
{
"item_id": "3",
"item_desc": "green",
}
}
],
"order_number": "1",
"user_id": "1"
}
]
'''
As an alternative to @rpanai's solution, I moved the processing into vanilla Python.
Convert the dataframe to a dict:
M = df.to_dict("records")
Create the dicts for the items:
items = [
{key: value
for key, value in entry.items()
if key not in ("user_id", "order_num")}
for entry in M
]
item_details = [{str(num + 1): entry}
for num, entry
in enumerate(items)]
print(item_details)
[{'1': {'item_id': 1, 'item_desc': 'red'}},
{'2': {'item_id': 2, 'item_desc': 'blue'}},
{'3': {'item_id': 3, 'item_desc': 'green'}}]
Initialize the dict and add the remaining data:
d = dict()
d['New-Data'] = item_details
d['order_number'] = M[0]['order_num']
d['user_id'] = M[0]['user_id']
wrapper = [d]
print(wrapper)
[{'New-Data': [{'1': {'item_id': 1, 'item_desc': 'red'}},
{'2': {'item_id': 2, 'item_desc': 'blue'}},
{'3': {'item_id': 3, 'item_desc': 'green'}}],
'order_number': 1,
'user_id': 1}]
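Note that this hard-codes a single user/order (it pulls order_num and user_id from M[0]). If the frame could hold several orders, a hedged generalization of the same idea is to group the flat records first (df is assumed to be the question's DataFrame):
from itertools import groupby
from operator import itemgetter

M = df.to_dict("records")
group_key = itemgetter("user_id", "order_num")

wrapper = []
for (user_id, order_num), rows in groupby(sorted(M, key=group_key), key=group_key):
    # strip the grouping columns from each row, keeping only the item fields
    items = [{k: v for k, v in row.items() if k not in ("user_id", "order_num")}
             for row in rows]
    wrapper.append({
        "New-Data": [{str(num + 1): entry} for num, entry in enumerate(items)],
        "order_number": order_num,
        "user_id": user_id,
    })
print(wrapper)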
Have you considered using a custom function?
import pandas as pd
df = pd.DataFrame({'user_id': {0: 1, 1: 1, 2: 1},
'order_num': {0: 1, 1: 1, 2: 1},
'item_id': {0: 1, 1: 2, 2: 3},
'item_desc': {0: 'red', 1: 'blue', 2: 'green'}})
out = df.groupby(['user_id', 'order_num'])[["item_id", "item_desc"]]\
.apply(lambda x: x.to_dict("records"))\
.apply(lambda x: [{str(l["item_id"]):l for l in x}])\
.reset_index(name="New-Data")\
.to_dict("records")
where out returns
[{'user_id': 1,
'order_num': 1,
'New-Data': [{'1': {'item_id': 1, 'item_desc': 'red'},
'2': {'item_id': 2, 'item_desc': 'blue'},
'3': {'item_id': 3, 'item_desc': 'green'}}]}]
I have two JSON strings as sample:
json_1 = [
{
"breadth": 48.04,
"vessel_id": 1,
"vessel_name": "SHIP-01",
"vessel_type": "Crude Oil Tanker",
"year_built": 2012
},
{
"breadth": 42,
"vessel_id": 2,
"vessel_name": "SHIP-02",
"vessel_type": "Crude Oil Tanker",
"year_built": 2016
}
]
json_2 = [
{
"Ballast_miles": 43575.8,
"Ballast_miles_pct": 36.1,
"org_id": 1,
"port_days": 383.5,
"sea_days": 414.9,
"total_days": 798.4,
"vessel_id": 1
},
{
"Ballast_miles": 21642.7,
"Ballast_miles_pct": 29.8,
"org_id": 1,
"port_days": 325.7,
"sea_days": 259.8,
"total_days": 585.5,
"vessel_id": 2
}
]
I want to combine these two JSON lists based on vessel_id.
My output format should look like:
[{ vesselId: 1,
json1:{},
json2:{}
},
{ vesselId: 2,
json1:{},
json2:{}
}]
What I've tried so far is:
data = {'First_Json': json_1, 'Second_Json': json_2}
json.dumps(data)
But this just merges the two lists wholesale without matching on vessel_id.
Something like this?
json_1 = [{ "breadth": 48.04, "vessel_id": 1, "vessel_name": "SHIP-01", "vessel_type": "Crude Oil Tanker", "year_built": 2012 }, { "breadth": 42, "vessel_id": 2, "vessel_name": "SHIP-02", "vessel_type": "Crude Oil Tanker", "year_built": 2016 }]
json_2 = [{ "Ballast_miles": 43575.8, "Ballast_miles_pct": 36.1, "org_id": 1, "port_days": 383.5, "sea_days": 414.9, "total_days": 798.4, "vessel_id": 1 }, { "Ballast_miles": 21642.7, "Ballast_miles_pct": 29.8, "org_id": 1, "port_days": 325.7, "sea_days": 259.8, "total_days": 585.5, "vessel_id": 2 }]
from collections import defaultdict
result = defaultdict(dict)
for item in json_1:
result[item['vessel_id']]['json_1'] = item
for item in json_2:
result[item['vessel_id']]['json_2'] = item
[{"vessel_id" : k,
"json1" : v['json_1'],
"json2" : v['json_2']}
for k,v in result.items()]
Output:
[{'json1': {'breadth': 48.04,
'vessel_id': 1,
'vessel_name': 'SHIP-01',
'vessel_type': 'Crude Oil Tanker',
'year_built': 2012},
'json2': {'Ballast_miles': 43575.8,
'Ballast_miles_pct': 36.1,
'org_id': 1,
'port_days': 383.5,
'sea_days': 414.9,
'total_days': 798.4,
'vessel_id': 1},
'vessel_id': 1},
{'json1': {'breadth': 42,
'vessel_id': 2,
'vessel_name': 'SHIP-02',
'vessel_type': 'Crude Oil Tanker',
'year_built': 2016},
'json2': {'Ballast_miles': 21642.7,
'Ballast_miles_pct': 29.8,
'org_id': 1,
'port_days': 325.7,
'sea_days': 259.8,
'total_days': 585.5,
'vessel_id': 2},
'vessel_id': 2}]
If you want to remove the redundant vessel_id, try using a for loop with a del statement on each nested dict.
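A minimal sketch of that cleanup (note it mutates the nested dicts, which are the original json_1/json_2 items, in place):
combined = [{"vessel_id": k, "json1": v['json_1'], "json2": v['json_2']}
            for k, v in result.items()]
for entry in combined:
    # drop the duplicated vessel_id from each nested dict
    del entry["json1"]["vessel_id"]
    del entry["json2"]["vessel_id"]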
I have two lists containing dictionaries:
List1 = [{"Value": "Value1", "Start": 7.11, "End": 8},
{"Value": "Value2", "Start": 16.45, "End": 20}]
List2 = [{"From":7.11, "To": 8, "Result": 0},
{"From":16.45, "To": 20 "Result": 1}
]
I need to produce a list by correlating these lists, so the result will be:
Result = [{"Value": "Value1", "Start": 7.11, "End": 8, "Result": 0},
          {"Value": "Value2", "Start": 16.45, "End": 20, "Result": 1}]
This almost seems like a simple table join in SQL.
How would I do it in Python?
Thanks!
You can use dictionary unpacking inside a list comprehension:
List1 = [{"Value": "Value1", "Start": 7.11, "End": 8},
{"Value": "Value2", "Start": 16.45, "End": 20}]
List2 = [{"From":7.11, "To": 8, "Result": 0},
{"From":16.45, "To": 20, "Result": 1}
]
new_list = [{**a, **{'Result':b['Result']}} for a, b in zip(List1, List2)]
Output:
[{'Value': 'Value1', 'Start': 7.11, 'End': 8, 'Result': 0}, {'Value': 'Value2', 'Start': 16.45, 'End': 20, 'Result': 1}]
Since dictionary unpacking (**) inside dict literals is only available in Python 3.5+, you can use dict.items in Python 2:
new_list = [dict(a.items()+[('Result', b['Result'])]) for a, b in zip(List1, List2)]
Output:
[{'Start': 7.11, 'End': 8, 'Result': 0, 'Value': 'Value1'}, {'Start': 16.45, 'End': 20, 'Result': 1, 'Value': 'Value2'}]
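One caveat with zip: it assumes the two lists line up row for row. If that ordering is not guaranteed, a hedged alternative is to key List2 by its (From, To) pair and look each row up:
# build a lookup from (From, To) to Result, then join List1 against it
lookup = {(d["From"], d["To"]): d["Result"] for d in List2}
new_list = [{**a, "Result": lookup[(a["Start"], a["End"])]} for a in List1]
print(new_list)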
I have the following list of dictionaries:
dict1 = [{"id": 1, "name": "tamara", "age":23},
{"id": 1, "name": "mia", "age":14},
{"id": 1, "name": "teo", "age":33},
{"id": 2, "name": "maya", "age":30}}
I would like to create a new list of dictionaries from the existing one: if the same "id": 1 appears three times in dict1, don't repeat it in the list; instead, nest those dicts inside a single entry:
dict2 = [{"id": 1, newkey: [{"name": "tamara", "age":23},
{"name":"mia", "age":14},
{"name": "teo", "age":33}]},
{"id": 2, "name": "maya", "age":30}}
This is what I want to achieve. Any suggestions on how to do it?
You can use itertools.groupby:
import itertools
dict1 = [{"id": 1, "name": "tamara", "age":23}, {"id": 1, "name": "mia", "age":14}, {"id": 1, "name": "teo", "age":33}, {"id": 2, "name": "maya", "age":30}]
new_d = [[a, list(b)] for a, b in itertools.groupby(sorted(dict1, key=lambda x:x['id']), key=lambda x:x['id'])]
dict2 = [{'id':a, 'new_key':[{c:d for c, d in i.items() if c != 'id'} for i in b]} for a, b in new_d]
Output:
[{'new_key': [{'age': 23, 'name': 'tamara'}, {'age': 14, 'name': 'mia'}, {'age': 33, 'name': 'teo'}], 'id': 1}, {'new_key': [{'age': 30, 'name': 'maya'}], 'id': 2}]
Use itertools.groupby
>>> from operator import itemgetter
>>> from itertools import groupby
>>> dict1 = [{"id": 1, "name": "tamara", "age":23}, {"id": 1, "name": "mia", "age":14}, {"id": 1, "name": "teo", "age":33}, {"id": 2, "name": "maya", "age":30}]
>>> [{'id': k, 'new_key':[{k2:v2} for d in list(v) for k2,v2 in d.items() if k2!='id']} for k,v in groupby(dict1, itemgetter('id'))]
# [{'new_key': [{'age': 23}, {'name': 'tamara'}, {'age': 14}, {'name': 'mia'}, {'age': 33}, {'name': 'teo'}], 'id': 1}, {'new_key': [{'age': 30}, {'name': 'maya'}], 'id': 2}]
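One caveat for this last version: itertools.groupby only groups consecutive items, and unlike the previous answer it does not sort first, so it only works here because dict1 is already ordered by id. Otherwise, sort beforehand:
from operator import itemgetter

dict1 = sorted(dict1, key=itemgetter('id'))  # needed before groupby if ids are interleaved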