Pandas dataframe to dict using dataclass - python

I have a pandas dataframe and would like to cast it to a dict.
item settings_id mat_id order
0 a-1 b1 32 1
1 a-1 x1 12 2
2 a-1 y4 3 3
3 a-2 k1 0 1
4 a-2 3a 2 2
5 a-2 x1 94 3
6 b-1 y4 32 1
7 b-1 b1 9 2
to
{'roots': [{'item': 'a-1',
'settings': [{'settings_id': 'b1', 'mat_id': 32, 'order': 1},
{'settings_id': 'x1', 'mat_id': 12, 'order': 2},
{'settings_id': 'y4', 'mat_id': 3, 'order': 3}]},
{'item': 'a-2',
'settings': [{'settings_id': 'k1', 'mat_id': 0, 'order': 1},
{'settings_id': '3a', 'mat_id': 2, 'order': 2},
{'settings_id': 'x1', 'mat_id': 94, 'order': 3}]},
{'item': 'b-1',
'settings': [{'settings_id': 'y4', 'mat_id': 32, 'order': 1},
{'settings_id': 'b1', 'mat_id': 9, 'order': 2}]}]}
In the pandas documentation, there exists the method to_dict. But I couldn't get it running in the way I wanted. Therefore I came up with using dataclasses for that.
However, I was wondering if there is a more convenient way?
from typing import List
from typing import Any
from dataclasses import dataclass, asdict
import pandas as pd
@dataclass
class Setting:
    """A single settings entry belonging to an item."""

    settings_id: str
    mat_id: int
    order: int

    @staticmethod
    def from_dict(obj: Any) -> 'Setting':
        """Build a Setting from a mapping.

        Reads the keys "settings_id", "mat_id" and "order" via ``obj.get``
        and coerces them with ``str``/``int``; a missing numeric key therefore
        raises ``TypeError`` from ``int(None)``.
        """
        _settings_id = str(obj.get("settings_id"))
        _mat_id = int(obj.get("mat_id"))
        _order = int(obj.get("order"))
        # Return this class (the original referenced an undefined name).
        return Setting(_settings_id, _mat_id, _order)
@dataclass
class ItemData:
    """An item together with its list of settings."""

    item: str
    # Forward reference as a string so the class body does not need
    # ``Setting`` to be defined at class-creation time.
    settings: List['Setting']

    @staticmethod
    def from_dict(obj: Any) -> 'ItemData':
        """Build an ItemData from a mapping with keys "item" and "settings".

        "item" is coerced with ``str``; every element of "settings" is passed
        to ``Setting.from_dict``.
        """
        _item = str(obj.get("item"))
        _settings = [Setting.from_dict(y) for y in obj.get("settings")]
        return ItemData(_item, _settings)
@dataclass
class Root:
    """Top-level container holding all items under the "roots" field."""

    # Forward reference as a string so the class body does not need
    # ``ItemData`` to be defined at class-creation time.
    roots: List['ItemData']

    @staticmethod
    def from_dict(obj: Any) -> 'Root':
        """Build a Root from a mapping.

        The list is read from the "roots" key, which is the field name that
        ``asdict`` emits for this class. The key "ItemData" that the code
        previously read is still accepted as a fallback.
        """
        raw_items = obj.get("roots")
        if raw_items is None:
            raw_items = obj.get("ItemData")
        _roots = [ItemData.from_dict(y) for y in raw_items]
        return Root(_roots)
df = pd.DataFrame({
    "item": ["a-1", "a-1", "a-1", "a-2", "a-2", "a-2", "b-1", "b-1"],
    "settings_id": ["b1", "x1", "y4", "k1", "3a", "x1", "y4", "b1"],
    "mat_id": [32, 12, 3, 0, 2, 94, 32, 9],
    "order": [1, 2, 3, 1, 2, 3, 1, 2],
})

# Build one ItemData per distinct item, in order of first appearance.
itemsData = []
items = df["item"].unique()
for item in items:
    # Rows belonging to the current item.
    element = df[df["item"] == item]
    settings = [
        Setting(row["settings_id"], row["mat_id"], row["order"])
        for index, row in element.iterrows()
    ]
    itemsData.append(ItemData(item, settings))

r = Root(itemsData)
# Nested dict form: {'roots': [{'item': ..., 'settings': [...]}, ...]}
asdict(r)
Thank you in advance

You can use to_dict with kwarg orient="records" while looping over df.groupby("item"):
from pprint import pprint

# One record per group: the group key plus the remaining columns as row dicts.
rec = [
    {
        "item": item,
        "settings": sub_df.drop(columns="item").to_dict(orient="records"),
    }
    for item, sub_df in df.groupby("item")
]
pprint(rec)
Output:
[{'item': 'a-1',
'settings': [{'mat_id': 32, 'order': 1, 'settings_id': 'b1'},
{'mat_id': 12, 'order': 2, 'settings_id': 'x1'},
{'mat_id': 3, 'order': 3, 'settings_id': 'y4'}]},
{'item': 'a-2',
'settings': [{'mat_id': 0, 'order': 1, 'settings_id': 'k1'},
{'mat_id': 2, 'order': 2, 'settings_id': '3a'},
{'mat_id': 94, 'order': 3, 'settings_id': 'x1'}]},
{'item': 'b-1',
'settings': [{'mat_id': 32, 'order': 1, 'settings_id': 'y4'},
{'mat_id': 9, 'order': 2, 'settings_id': 'b1'}]}]

Related

How to convert a list of nested dictionaries (includes tuples) as a dataframe

I have a piece of code which generates a list of nested dictionaries like below:
[{'cb': ({'Name': 'A', 'ID': 1, 'num': 50},
{'Name': 'A', 'ID': 2, 'num': 68}),
'final_value': 118},
{'cb': ({'Name': 'A', 'ID': 1, 'num': 50},
{'Name': 'A', 'ID': 4, 'num': 67}),
'final_value': 117},
{'cb': ({'Name': 'A', 'ID': 1, 'num': 50},
{'Name': 'A', 'ID': 6, 'num': 67}),
'final_value': 117}]
I want to convert the dictionary into a dataframe like below
How can I do it using Python?
I have tried the below piece of code
merge_values = [{'cb': ({'Name': 'A', 'ID': 1, 'num': 50},
{'Name': 'A', 'ID': 2, 'num': 68}),
'final_value': 118},
{'cb': ({'Name': 'A', 'ID': 1, 'num': 50},
{'Name': 'A', 'ID': 4, 'num': 67}),
'final_value': 117},
{'cb': ({'Name': 'A', 'ID': 1, 'num': 50},
{'Name': 'A', 'ID': 6, 'num': 67}),
'final_value': 117}]
test = pd.DataFrame()
i = 0
for match in merge_values:
for d in match:
final_cfr = d['final_value']
comb = d['cb']
i = i+1
z = pd.DataFrame()
for t in comb:
dct = {k:[v] for k,v in t.items()}
x = pd.DataFrame(dct)
x['merge_id'] = i
x['Final_Value'] = final_value
test = pd.concat([test, x])
The problem with this piece of code is it adds the rows one below another. I need the elements of the tuple next to each other.
You will need to clean your data by creating a new dict with the structure that you want, like this:
import pandas as pd
dirty_data = [{'cb': ({'Name': 'A', 'ID': 1, 'num': 50},
{'Name': 'A', 'ID': 2, 'num': 68}),
'final_value': 118},
{'cb': ({'Name': 'A', 'ID': 1, 'num': 50},
{'Name': 'A', 'ID': 4, 'num': 67}),
'final_value': 117},
{'cb': ({'Name': 'A', 'ID': 1, 'num': 50},
{'Name': 'A', 'ID': 6, 'num': 67}),
'final_value': 117}]
def clean_data(dirty_data: list) -> dict:
    """Flatten records of the form {'cb': (first, second), 'final_value': v}.

    Args:
        dirty_data: iterable of records; each record's "cb" entry holds at
            least two dicts with the keys "Name", "ID" and "num".

    Returns:
        A column-oriented dict with the keys "Name", "ID", "num" (from the
        first "cb" element), "M_ID", "M_num" (from the second element) and
        "Final" (the record's "final_value"), each mapping to a list with one
        entry per record.

    Raises:
        KeyError / IndexError: if a record lacks one of the fields above.
    """
    columns = {"Name": [], "ID": [], "num": [], "M_ID": [], "M_num": [], "Final": []}
    for record in dirty_data:
        first = record["cb"][0]
        second = record["cb"][1]
        columns["Name"].append(first["Name"])
        columns["ID"].append(first["ID"])
        columns["num"].append(first["num"])
        columns["M_ID"].append(second["ID"])
        columns["M_num"].append(second["num"])
        columns["Final"].append(record["final_value"])
    return columns
df = pd.DataFrame(clean_data(dirty_data))
df
You could try to read the data into a dataframe as is and then restructure it until you get the desired result, but in this case, it doesn't seem practical.
Instead, I'd flatten the input into a list of lists to pass to pd.DataFrame. Here is a relatively concise way to do that with your sample data:
from operator import itemgetter
import pandas as pd
data = [
    {'cb': ({'Name': 'A', 'ID': 1, 'num': 50},
            {'Name': 'A', 'ID': 2, 'num': 68}),
     'final_value': 118},
    {'cb': ({'Name': 'A', 'ID': 1, 'num': 50},
            {'Name': 'A', 'ID': 4, 'num': 67}),
     'final_value': 117},
    {'cb': ({'Name': 'A', 'ID': 1, 'num': 50},
            {'Name': 'A', 'ID': 6, 'num': 67}),
     'final_value': 117},
]
keys = ['Name', 'ID', 'num', 'M_Name', 'M_ID', 'M_num', 'final_value']

# Pulls ('Name', 'ID', 'num') out of one 'cb' element.
pick_fields = itemgetter(*keys[:3])

# One flat row per record, e.g. ['A', 1, 50, 'A', 2, 68, 118].
flattened = [
    [value for item in row['cb'] for value in pick_fields(item)] + [row['final_value']]
    for row in data
]

df = pd.DataFrame(flattened, columns=keys)
# M_Name only repeats Name, so it is dropped.
df = df.drop(columns='M_Name')
itemgetter(*keys[:3])(item) is the same as [item[k] for k in keys[:3]]. On flattening lists of lists with list (or generator) comprehensions, see How do I make a flat list out of a list of lists?.
Result:
Name ID num M_ID M_num final_value
0 A 1 50 2 68 118
1 A 1 50 4 67 117
2 A 1 50 6 67 117

dictionary from df with columns within a key

I have a df such as follows:
data = [['a', 10, 1], ['b', 15,12], ['c', 14,12]]
df = pd.DataFrame(data, columns = ['Name', 'x', 'y'])
Name x y
0 a 10 1
1 b 15 12
2 c 14 12
Now I want to convert it to a dict where x and y are nested under a key called total:
so the final dict would be like this
{
'Name': 'a',
"total": {
"x": 308,
"y": 229
},
}
I know i can use df.to_dict('records') to get this dict:
{
'Name': 'a',
"x": 308,
"y": 229
}
Any tips?
You could try
my_dict = [{'Name': row['Name'], 'total': {'x': row['x'], 'y': row['y']}} for row in df.to_dict('records')]
Result:
[{'Name': 'a', 'total': {'x': 10, 'y': 1}}, {'Name': 'b', 'total': {'x': 15, 'y': 12}}, {'Name': 'c', 'total': {'x': 14, 'y': 12}}]
Or, if you wish to convert all columns except 'Name' into 'total', and provided that there are no repetitions in 'Name':
df.set_index('Name', inplace=True)
result = [{'Name': name, 'total': total} for name, total in df.to_dict('index').items()]
With the same result as before.

Compare two list of dictionaries in python

I have two lists of dictionaries named as category and sub_category.
category = [{'cat_id':1,'total':300,'from':250},{'cat_id':2,'total':100,'from':150}]
sub_category = [{'id':1,'cat_id':1,'charge':30},{'id':2,'cat_id':1,'charge':20},{'id':3,'cat_id':2,'charge':30}]
I want to change the value for charge to 0 in sub_category if the value of total >= from in category where cat_id's are equal.
Expected result is :
sub_category = [{'id':1,'cat_id':1,'charge':0},{'id':2,'cat_id':1,'charge':0},{'id':3,'cat_id':2,'charge':30}]
I managed to get the result by using this
for sub in sub_category:
for cat in category:
if cat['cat_id'] == sub['cat_id']:
if cat['total'] >= cat['from']:
sub['charge']=0
But I want to know the better way of doing this. Any help would be highly appreciated.
This is one approach. Change category to a dict for easy lookup.
Ex:
category = [{'cat_id':1,'total':300,'from':250},{'cat_id':2,'total':100,'from':150}]
sub_category = [{'id':1,'cat_id':1,'charge':30},{'id':2,'cat_id':1,'charge':20},{'id':3,'cat_id':2,'charge':30}]

# Index the categories by cat_id for constant-time lookup. A separate mapping
# is built (rather than pop()-ing the key) so the category dicts stay intact.
category_by_id = {cat['cat_id']: cat for cat in category}

for sub in sub_category:
    cat = category_by_id.get(sub['cat_id'])
    # Zero the charge only when the matching category reached its threshold.
    if cat is not None and cat['total'] >= cat['from']:
        sub['charge'] = 0

print(sub_category)
Output:
[{'cat_id': 1, 'charge': 0, 'id': 1},
{'cat_id': 1, 'charge': 0, 'id': 2},
{'cat_id': 2, 'charge': 30, 'id': 3}]
Try this:
I think my approach may not be suitable in some cases. I like to use list comprehensions, so have a look.
category = [{'cat_id':1,'total':300,'from':250},{'cat_id':2,'total':100,'from':150}]
sub_category = [{'id':1,'cat_id':1,'charge':30},{'id':2,'cat_id':1,'charge':20},{'id':3,'cat_id':2,'charge':30}]

# cat_ids of the categories whose total has reached the 'from' threshold.
waived = {cat['cat_id'] for cat in category if cat['total'] >= cat['from']}

# One output element per sub-category; matching entries are copied with the
# charge set to the integer 0, so the comprehension has no side effects.
sub_category = [
    {**sub_cat, 'charge': 0} if sub_cat['cat_id'] in waived else sub_cat
    for sub_cat in sub_category
]
print(sub_category)
# Result: [{'id': 1, 'cat_id': 1, 'charge': 0}, {'id': 2, 'cat_id': 1, 'charge': 0}, {'id': 3, 'cat_id': 2, 'charge': 30}]
You can solve your problem using this approach:
target_categories = set([elem.get('cat_id') for elem in category if elem.get('total', 0) >= elem.get('from', 0)])
if None in target_categories:
target_categories.remove(None) # if there's no cat_id in one of the categories we will get None in target_categories. Remove it.
for elem in sub_category:
if elem.get('cat_id') in target_categories:
elem.update({'charge': 0})
Time comparison with another approach:
import numpy as np
size = 5000000
np.random.seed()
cat_ids = np.random.randint(50, size=(size,))
totals = np.random.randint(500, size=(size,))
froms = np.random.randint(500, size=(size,))
category = [{'cat_id': cat_id, 'total': total, 'from': from_} for cat_id, total, from_ in zip(cat_ids, totals, froms)]
sub_category = [{'id': 1, 'cat_id': np.random.randint(50), 'charge': np.random.randint(100)} for i in range(size)]
%%time
target_categories = set([elem.get('cat_id') for elem in category if elem.get('total', 0) >= elem.get('from', 0)])
if None in target_categories:
target_categories.remove(None) # if there's no cat_id in one of the categories we will get None in target_categories. Remove it.
for elem in sub_category:
if elem.get('cat_id') in target_categories:
elem.update({'charge': 0})
# Wall time: 3.47 s
%%time
category = {i.pop('cat_id'): i for i in category}
for i in sub_category:
if i['cat_id'] in category:
if category[i['cat_id']]['total'] >= category[i['cat_id']]['from']:
i['charge'] = 0
# Wall time: 5.73 s
Solution:
# Input
category = [{'cat_id':1,'total':300,'from':250},{'cat_id':2,'total':100,'from':150}]
sub_category = [{'id':1,'cat_id':1,'charge':30},{'id':2,'cat_id':1,'charge':20},{'id':3,'cat_id':2,'charge':30}]
# Main code
# Collect the qualifying cat_ids once, instead of rebuilding the list for
# every sub-category; a set also makes each membership test constant-time.
waived_ids = {i["cat_id"] for i in category if i["total"] >= i["from"]}
for k in sub_category:
    if k["cat_id"] in waived_ids:
        k["charge"] = 0
print (sub_category)
# Output
[{'id': 1, 'cat_id': 1, 'charge': 0}, {'id': 2, 'cat_id': 1, 'charge': 0}, {'id': 3, 'cat_id': 2, 'charge': 30}]

Combining multiple lists of dictionaries

I have several lists of dictionaries, where each dictionary contains a unique id value that is common among all lists. I'd like to combine them into a single list of dicts, where each dict is joined on that id value.
list1 = [{'id': 1, 'value': 20}, {'id': 2, 'value': 21}]
list2 = [{'id': 1, 'sum': 10}, {'id': 2, 'sum': 11}]
list3 = [{'id': 1, 'total': 30}, {'id': 2, 'total': 32}]
desired_output = [{'id': 1, 'value': 20, 'sum': 10, 'total': 30}, {'id': 2, 'value': 21, 'sum': 11, 'total': 32}]
I tried doing something like the answer found at https://stackoverflow.com/a/42018660/7564393, but I'm getting very confused since I have more than 2 lists. Should I try using a defaultdict approach? More importantly, I am NOT always going to know the other values, only that the id value is present in all dicts.
You can use itertools.groupby():
from itertools import groupby
list1 = [{'id': 1, 'value': 20}, {'id': 2, 'value': 21}]
list2 = [{'id': 1, 'sum': 10}, {'id': 2, 'sum': 11}]
list3 = [{'id': 1, 'total': 30}, {'id': 2, 'total': 32}]


def _by_id(entry):
    """Key function shared by the sort and the grouping."""
    return entry['id']


desired_output = []
# groupby only merges adjacent items, so the combined list is sorted by the
# same key first.
for _, values in groupby(sorted([*list1, *list2, *list3], key=_by_id), key=_by_id):
    temp = {}
    for d in values:
        temp.update(d)
    desired_output.append(temp)
Result:
[{'id': 1, 'value': 20, 'sum': 10, 'total': 30}, {'id': 2, 'value': 21, 'sum': 11, 'total': 32}]
list1 = [{'id': 1, 'value': 20}, {'id': 2, 'value': 21}]
list2 = [{'id': 1, 'sum': 10}, {'id': 2, 'sum': 11}]
list3 = [{'id': 1, 'total': 30}, {'id': 2, 'total': 32}]
# combine all lists
d = {}  # id -> merged dict
for l in [list1, list2, list3]:
    for list_d in l:
        if 'id' not in list_d:
            continue
        # Merge into a fresh dict per id so the input dicts are never
        # aliased or modified.
        d.setdefault(list_d['id'], {}).update(list_d)
# dicts with same id are grouped together since id is used as key
res = list(d.values())
print(res)
You can first build a dict of dicts, then turn it into a list:
from itertools import chain
from collections import defaultdict
list1 = [{'id': 1, 'value': 20}, {'id': 2, 'value': 21}]
list2 = [{'id': 1, 'sum': 10}, {'id': 2, 'sum': 11}]
list3 = [{'id': 1, 'total': 30}, {'id': 2, 'total': 32}]

# id -> merged dict; defaultdict creates the empty dict on first access.
dict_out = defaultdict(dict)
for d in chain(list1, list2, list3):
    dict_out[d['id']].update(d)

out = list(dict_out.values())
print(out)
# [{'id': 1, 'value': 20, 'sum': 10, 'total': 30}, {'id': 2, 'value': 21, 'sum': 11, 'total': 32}]
itertools.chain allows you to iterate on all the dicts contained in the 3 lists. We build a dict dict_out having the id as key, and the corresponding dict being built as value. This way, we can easily update the already built part with the small dict of our current iteration.
Here, I have presented a functional approach without using itertools (which is excellent in rapid development work).
This solution works for any number of lists, since the function takes a variable number of arguments, and it also lets the user specify the type of the returned output (list/dict).
By default it returns a list, as you want; if you pass as_list=False it returns a dictionary instead.
I preferred a dictionary to solve this because it's fast and lookups are cheap.
Just have a look at the below get_packed_list() function.
get_packed_list()
def get_packed_list(*dicts_lists, as_list=True):
    """Merge dictionaries from several lists by their "id" value.

    Args:
        *dicts_lists: any number of iterables of dictionaries; every
            dictionary must contain an "id" key.
        as_list: when true (default) return the merged dictionaries as a
            list; otherwise return a dict mapping each id to its merged
            dictionary.

    Returns:
        The merged dictionaries, in order of first appearance of each id.
        Each merged dictionary starts with the "id" key. For keys that occur
        more than once under the same id, the last value seen wins.

    Raises:
        KeyError: if a dictionary has no "id" key.

    The input dictionaries are only read, never modified.
    """
    output = {}
    for dicts_list in dicts_lists:
        for dictionary in dicts_list:
            _id = dictionary["id"]  # id() is a built-in, hence _id
            # Create the entry on first sight of this id, then merge.
            output.setdefault(_id, {"id": _id}).update(dictionary)
    if as_list:
        return list(output.values())
    return output  # dictionary
Test
list1 = [{'id': 1, 'value': 20}, {'id': 2, 'value': 21}]
list2 = [{'id': 1, 'sum': 10}, {'id': 2, 'sum': 11}]
list3 = [{'id': 1, 'total': 30}, {'id': 2, 'total': 32}]
output = get_packed_list(list1, list2, list3)
print(output)
# [{'id': 1, 'value': 20, 'sum': 10, 'total': 30}, {'id': 2, 'value': 21, 'sum': 11, 'total': 32}]
output = get_packed_list(list1, list2, list3, as_list=False)
print(output)
# {1: {'id': 1, 'value': 20, 'sum': 10, 'total': 30}, 2: {'id': 2, 'value': 21, 'sum': 11, 'total': 32}}
list1 = [{'id': 1, 'value': 20}, {'id': 2, 'value': 21}]
list2 = [{'id': 1, 'sum': 10}, {'id': 2, 'sum': 11}]
list3 = [{'id': 1, 'total': 30}, {'id': 2, 'total': 32}]
print(list1+list2+list3)
list1 = [{'id': 1, 'value': 20}, {'id': 2, 'value': 21}]
list2 = [{'id': 1, 'sum': 10}, {'id': 2, 'sum': 11}]
list3 = [{'id': 1, 'total': 30}, {'id': 2, 'total': 32}]

# Merge the dicts position by position. This relies on all three lists
# listing the ids in the same order; zip stops at the shortest list.
result = [{**a, **b, **c} for a, b, c in zip(list1, list2, list3)]
print(result)
output : [{'id': 1, 'value': 20, 'sum': 10, 'total': 30}, {'id': 2, 'value': 21, 'sum': 11, 'total': 32}]

List of dictionaries - stack one value of dictionary

I have trouble in adding one value of dictionary when conditions met, For example I have this list of dictionaries:
[{'plu': 1, 'price': 150, 'quantity': 2, 'stock': 5},
{'plu': 2, 'price': 150, 'quantity': 7, 'stock': 10},
{'plu': 1, 'price': 150, 'quantity': 6, 'stock': 5},
{'plu': 1, 'price': 200, 'quantity': 4, 'stock': 5},
{'plu': 2, 'price': 150, 'quantity': 3, 'stock': 10}
]
Then output should look like this:
[{'plu': 1, 'price': 150, 'quantity': 8, 'stock': 5},
{'plu': 1, 'price': 200, 'quantity': 4, 'stock': 5},
{'plu': 2, 'price': 150, 'quantity': 10, 'stock': 10}
]
Quantity should be added only if plu and price are the same, it should ignore key:values other than that (ex. stock). What is the most efficient way to do that?
#edit
I tried:
import itertools as it
keyfunc = lambda x: x['plu']
groups = it.groupby(sorted(new_data, key=keyfunc), keyfunc)
x = [{'plu': k, 'quantity': sum(x['quantity'] for x in g)} for k, g in groups]
But it works only on plu and then I get only quantity value when making html table in django, other are empty
You need to sort/groupby the combined key, not just one key. Easiest/most efficient way to do this is with operator.itemgetter. To preserve an arbitrary stock value, you'll need to use the group twice, so you'll need to convert it to a sequence:
from operator import itemgetter
keyfunc = itemgetter('plu', 'price')
# Unpack key and listify g so it can be reused
groups = ((plu, price, list(g))
for (plu, price), g in it.groupby(sorted(new_data, key=keyfunc), keyfunc))
x = [{'plu': plu, 'price': price, 'stock': g[0]['stock'],
'quantity': sum(x['quantity'] for x in g)}
for plu, price, g in groups]
Alternatively, if stock is guaranteed to be the same for each unique plu/price pair, you can include it in the key to simplify matters, so you don't need to listify the groups:
# Group on (plu, price, stock) so stock can be read straight from the key.
keyfunc = itemgetter('plu', 'price', 'stock')
# groupby only merges adjacent items, hence the sort by the same key.
groups = it.groupby(sorted(new_data, key=keyfunc), keyfunc)
x = [{'plu': plu, 'price': price, 'stock': stock,
      'quantity': sum(row['quantity'] for row in g)}
     for (plu, price, stock), g in groups]
Optionally, you could create getquantity = itemgetter('quantity') at top level (like the keyfunc) and change sum(x['quantity'] for x in g) to sum(map(getquantity, g)) which pushes work to the C layer in CPython, and can be faster if your groups are large.
The other approach is to avoid sorting entirely using collections.Counter (or collections.defaultdict(int), though Counter makes the intent more clear here):
from collections import Counter
grouped = Counter()
for plu, price, stock, quantity in map(itemgetter('plu', 'price', 'stock', 'quantity'), new_data):
grouped[plu, price, stock] += quantity
then convert back to your preferred form with:
x = [{'plu': plu, 'price': price, 'stock': stock, 'quantity': quantity}
for (plu, price, stock), quantity in grouped.items()]
This should be faster for large inputs, since it replaces O(n log n) sorting work with O(n) dict operations (which are roughly O(1) cost).
Using pandas will make this a trivial problem:
import pandas as pd
data = [{'plu': 1, 'price': 150, 'quantity': 2, 'stock': 5},
{'plu': 2, 'price': 150, 'quantity': 7, 'stock': 10},
{'plu': 1, 'price': 150, 'quantity': 6, 'stock': 5},
{'plu': 1, 'price': 200, 'quantity': 4, 'stock': 5},
{'plu': 2, 'price': 150, 'quantity': 3, 'stock': 10}]
df = pd.DataFrame.from_records(data)
# df
#
# plu price quantity stock
# 0 1 150 2 5
# 1 2 150 7 10
# 2 1 150 6 5
# 3 1 200 4 5
# 4 2 150 3 10
new_df = df.groupby(['plu','price','stock'], as_index=False).sum()
new_df = new_df[['plu','price','quantity','stock']] # Optional: reorder the columns
# new_df
#
# plu price quantity stock
# 0 1 150 8 5
# 1 1 200 4 5
# 2 2 150 10 10
And finally, if you want to, port it back to dict (though I would argue pandas give you a lot more functionality to handle the data elements):
# Convert the aggregated frame (new_df) back to a list of row dicts.
new_data = new_df.to_dict(orient='records')
# new_data
#
# [{'plu': 1, 'price': 150, 'quantity': 8, 'stock': 5},
# {'plu': 1, 'price': 200, 'quantity': 4, 'stock': 5},
# {'plu': 2, 'price': 150, 'quantity': 10, 'stock': 10}]

Categories