I have a list of dicts as follows:
[{"server":"8.8.8.8",
"domains":[{"google.com":[{"time":15, "serial":14}, {"time":78, "serial":14}]},
{"intuit.com":[{"time":20, "serial":23}, {"time":91, "serial":18}]}
]
},
{"server":"8.8.4.4",
"domains":[{"google.com":[{"time":19, "serial":45}, {"time":92, "serial":76}]},
{"intuit.com":[{"time":45, "serial":89}, {"time":93, "serial":74}]}
]
},
{"server":"206.67.222.222",
"domains":[{"google.com":[{"time":98, "serial":76}, {"time":64, "serial":54}]},
{"intuit.com":[{"time":43, "serial":21}, {"time":65, "serial":59}]}
]
}]
How would I go about creating a structure where, for each domain, I select only the dict with the max serial number, breaking ties on serial by the max time, so that I am left with the following:
[{"server":"8.8.8.8",
"domains":[{"google.com":{"time":78, "serial":14}},
{"intuit.com":{"time":20, "serial":23}}
]
},
{"server":"8.8.4.4",
"domains":[{"google.com":{"time":92, "serial":76}},
{"intuit.com":{"time":45, "serial":89}}
]
},
{"server":"206.67.222.222",
"domains":[{"google.com":{"time":98, "serial":76}},
{"intuit.com":{"time":65, "serial":59}}
]
}]
A solution using the built-in max() function:
import json

# l is your initial list of dicts
for item in l:
    for d in item['domains']:
        for k, v in d.items():
            # the max `serial` wins; ties on `serial` are broken by the max `time`
            d[k] = max(v, key=lambda o: (o['serial'], o['time']))

# `json.dumps` used for pretty printing of nested dicts
print(json.dumps(l, indent=4))
The output:
[
{
"server": "8.8.8.8",
"domains": [
{
"google.com": {
"serial": 14,
"time": 78
}
},
{
"intuit.com": {
"serial": 23,
"time": 20
}
}
]
},
{
"server": "8.8.4.4",
"domains": [
{
"google.com": {
"serial": 76,
"time": 92
}
},
{
"intuit.com": {
"serial": 89,
"time": 45
}
}
]
},
{
"server": "206.67.222.222",
"domains": [
{
"google.com": {
"serial": 76,
"time": 98
}
},
{
"intuit.com": {
"serial": 59,
"time": 65
}
}
]
}
]
Try this (d is your dict):
for item in d:
    for i in item["domains"]:
        for k, v in i.items():
            # sort by (serial, time) and keep the last (largest) pair
            c = sorted([(j["time"], j["serial"]) for j in v], key=lambda x: (x[1], x[0]))
            i[k] = {"time": c[-1][0], "serial": c[-1][1]}
print(d)
You can sort the time/serial list for each domain according to your requirement and take the first element. Let the variable data be your input list:
from functools import cmp_to_key  # sorted(..., cmp=...) only exists in Python 2

def domain_sorter(d):
    def compare(x, y):
        k = y['serial'] - x['serial']
        j = y['time'] - x['time']
        return k if k != 0 else j
    return sorted(d, key=cmp_to_key(compare))

def filter_domain(domain):
    for k, v in domain.items():
        return {
            k: domain_sorter(v)[0]
        }

print([{
    "server": e['server'],
    "domains": [filter_domain(domain) for domain in e['domains']]
} for e in data])
Related
Suppose I have a table represented in JSON as a list of dicts, where the keys of each item are the same:
J = [
{
"symbol": "ETHBTC",
"name": "Ethereum",
:
},
{
"symbol": "LTC",
"name": "LiteCoin"
:
},
And suppose I require efficient lookup, e.g. symbols['ETHBTC']['name']
I can transform with symbols = { item['symbol']: item for item in J }, producing:
{
"ETHBTC": {
"symbol": "ETHBTC",
"name": "Ethereum",
:
},
"LTCBTC": {
"symbol": "LTCBTC",
"name": "LiteCoin",
:
},
(Ideally I would also remove the now redundant symbol field).
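A sketch of the same transform that also drops the redundant field in one pass (assuming every item carries a "symbol" key):
symbols = {
    item['symbol']: {k: v for k, v in item.items() if k != 'symbol'}
    for item in J
}
Lookups such as symbols['ETHBTC']['name'] then work directly on the slimmer dicts.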
However, what if each item itself contains a "table-as-list-of-dicts"?
Here's a fuller minimal example (I've removed lines not pertinent to the problem):
J = {
"symbols": [
{
"symbol":"ETHBTC",
"filters":[
{
"filterType":"PRICE_FILTER",
"minPrice":"0.00000100",
},
{
"filterType":"PERCENT_PRICE",
"multiplierUp":"5",
},
],
},
{
"symbol":"LTCBTC",
"filters":[
{
"filterType":"PRICE_FILTER",
"minPrice":"0.00000100",
},
{
"filterType":"PERCENT_PRICE",
"multiplierUp":"5",
},
],
}
]
}
So the challenge is to transform this structure into:
J = {
"symbols": {
"ETHBTC": {
"filters": {
"PRICE_FILTER": {
"minPrice": "0.00000100",
:
}
I can write a flatten function:
def flatten(L: list, key) -> dict:
    def remove_key_from(D):
        del D[key]
        return D
    # note: D[key] (the comprehension key) must be evaluated before remove_key_from(D)
    # deletes it; the key-before-value evaluation order is guaranteed in Python 3.8+
    return {D[key]: remove_key_from(D) for D in L}
Then I can flatten the outer list and loop through each key/val in the resulting dict, flattening val['filters']:
J['symbols'] = flatten(J['symbols'], key="symbol")
for symbol, D in J['symbols'].items():
    D['filters'] = flatten(D['filters'], key="filterType")
Is it possible to improve upon this using glom (or otherwise)?
Initial transform has no performance constraint, but I require efficient lookup.
I don't know if you'd call it Pythonic, but you could make your function more generic by using recursion and dropping the key argument. Since you already assume that your lists contain dictionaries, you can take advantage of Python's dynamic typing and accept any kind of input:
from pprint import pprint
def flatten_rec(I) -> dict:
    if isinstance(I, dict):
        I = {k: flatten_rec(v) for k, v in I.items()}
    elif isinstance(I, list):
        # use the first value of each dict as the new key and flatten the remaining items
        I = {list(D.values())[0]: {k: flatten_rec(v) for k, v in list(D.items())[1:]}
             for D in I}
    return I

pprint(flatten_rec(J))
Output:
{'symbols': {'ETHBTC': {'filters': {'PERCENT_PRICE': {'multiplierUp': '5'},
'PRICE_FILTER': {'minPrice': '0.00000100'}}},
'LTCBTC': {'filters': {'PERCENT_PRICE': {'multiplierUp': '5'},
'PRICE_FILTER': {'minPrice': '0.00000100'}}}}}
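With the flattened structure, the efficient lookups the question asks for become plain key accesses, for example:
flat = flatten_rec(J)
print(flat['symbols']['ETHBTC']['filters']['PRICE_FILTER']['minPrice'])  # '0.00000100'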
Since you have different transformation rules for different keys, you can keep a list of the key names to "group" on:
t = ['symbol', 'filterType']

def transform(d):
    if (m := {a: b for a, b in d.items() if a in t}):
        # a grouping key is present: use its value as the new key and transform the rest
        return {[*m.values()][0]: transform({a: b for a, b in d.items() if a not in m})}
    return {a: b if not isinstance(b, list)
            else {x: y for j in b for x, y in transform(j).items()}
            for a, b in d.items()}
import json
print(json.dumps(transform(J), indent=4))
{
"symbols": {
"ETHBTC": {
"filters": {
"PRICE_FILTER": {
"minPrice": "0.00000100"
},
"PERCENT_PRICE": {
"multiplierUp": "5"
}
}
},
"LTCBTC": {
"filters": {
"PRICE_FILTER": {
"minPrice": "0.00000100"
},
"PERCENT_PRICE": {
"multiplierUp": "5"
}
}
}
}
}
I have two nested dictionaries that share some keys and have a similar structure, and I want to merge them into a third dictionary in a specific way. The first is a dictionary of defaults: its values are used when a key is absent from the second dictionary. The second dictionary has some keys that match the defaults and some that don't exist there. In either case I want the second dictionary to overwrite the default value or add the new key in the third dictionary. See the (shortened) example below:
default:
{"model_name": "null",
"description": "null",
"frequency": "d",
"tasks": [
{
"target": "elastic",
"metrics": "null",
"model_type": "null",
"alert": {
"type": "pagerduty",
"threshold": 5,
"service_id" : "P94CEA6"
}
}
]
}
second dict
{"model_name": "dqs_cie_registration_09",
"description": "test cie registration",
"tasks": [
{
"source": "elastic",
"metrics": [
"indid_unique_cnt", "zs"
],
"model_type": "Deep_Dive",
"elastic_config": "config",
"read_object": "dqs_rtfs_d_*",
"watcher": "cie_watch_zs_3d.json",
"target_write_index": "dqs_target_write_index"
}
]
}
I'd like to merge them so that the result is:
{"model_name": "dqs_cie_registration_09",
"description": "test cie registration",
"frequency": "d",
"tasks": [
{
"target": "elastic",
"source": "elastic",
"metrics": ["indid_unique_cnt", "zs"],
"model_type": "Deep_Dive",
"elastic_config": "config",
"read_object": "dqs_rtfs_d_*",
"watcher": "cie_watch_zs_3d.json",
"target_write_index": "dqs_target_write_index",
"alert": {
"type": "pagerduty",
"threshold": 5,
"service_id" : "P94CEA6"
}
}
]
}
The third dict merges the second dict onto the first. I haven't really gotten anywhere, but I feel there is a really easy way to implement this that I just don't remember.
The following merge routine produces the desired result:
import copy    # to provide deepcopy
import pprint  # pretty printing

def merge(a, b):
    """Merges b into a (to preserve a, make a deepcopy prior to calling merge)."""
    if isinstance(a, dict) and isinstance(b, dict):
        # Dictionaries
        for k, v in b.items():
            if k in a:
                # Conditionally overwrite "null" placeholders with values from b
                if isinstance(a[k], str):
                    if a[k] == "null":
                        a[k] = copy.deepcopy(v)
                else:
                    merge(a[k], v)
            else:
                # Add keys from b
                a[k] = copy.deepcopy(v)
    elif isinstance(a, list) and isinstance(b, list):
        # Lists
        if len(a) == len(b):
            for i, item in enumerate(b):
                if isinstance(a[i], str) and isinstance(item, str):
                    # replace "null" placeholders in a with the value from b
                    if a[i] == "null":
                        a[i] = item
                else:
                    merge(a[i], item)
Usage
d1 = {"model_name": "null",
"description": "null",
"frequency": "d",
"tasks": [
{
"target": "elastic",
"metrics": "null",
"model_type": "null",
"alert": {
"type": "pagerduty",
"threshold": 5,
"service_id" : "P94CEA6"
}
}
]
}
d2 = {"model_name": "dqs_cie_registration_09",
"description": "test cie registration",
"tasks": [
{
"source": "elastic",
"metrics": [
"indid_unique_cnt", "zs"
],
"model_type": "Deep_Dive",
"elastic_config": "config",
"read_object": "dqs_rtfs_d_*",
"watcher": "cie_watch_zs_3d.json",
"target_write_index": "dqs_target_write_index"
}
]
}
merge(d1, d2) # to preserve d1 create a deepcopy prior to merge (i.e. temp = copy.deepcopy(d1))
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(d1)
Output
{ 'description': 'test cie registration',
'frequency': 'd',
'model_name': 'dqs_cie_registration_09',
'tasks': [ { 'alert': { 'service_id': 'P94CEA6',
'threshold': 5,
'type': 'pagerduty'},
'elastic_config': 'config',
'metrics': ['indid_unique_cnt', 'zs'],
'model_type': 'Deep_Dive',
'read_object': 'dqs_rtfs_d_*',
'source': 'elastic',
'target': 'elastic',
'target_write_index': 'dqs_target_write_index',
'watcher': 'cie_watch_zs_3d.json'}
]
}
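If you need to keep the defaults dictionary intact, a small variation of the usage above (following the deepcopy note in the merge docstring):
temp = copy.deepcopy(d1)
merge(temp, d2)
# d1 still holds the untouched defaults; temp holds the merged result
pp.pprint(temp)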
Every time I take 500 records from file1 and join them against file2, which contains more than 100,000 records, it takes two minutes!
import json

with open(file1, 'r') as f1, open(file2, 'r') as f2:
    a = json.load(f1)
    b = json.load(f2)
    list_a = []
    for i in range(len(a)):
        for n in range(len(b)):
            if b[n]["id"] == a[i]["id"]:
                list_a.append(dict(b[n], **a[i]))
    with open(result, 'w') as f3:
        json.dump(list_a, f3, sort_keys=True, ensure_ascii=False)
File1:
[{ "id":"1", "name":"Tom" },
{ "id":"2", "name":"Jim" },
{ "id":"3", "name":"Bob" },
{ "id":"4", "name":"Jeny" },
{ "id":"5", "name":"Lara" },
{ "id":"6", "name":"Lin" },
{ "id":"7", "name":"Kim" },
{ "id":"8", "name":"Jack" },
{ "id":"9", "name":"Tony" }]
File 2:
[ { "id":"1", "Details":[ { "label":"jcc", "hooby":"Swimming" }, { "label":"hkt", "hooby":"Basketball" }, ] },
{ "id":"2", "Details":[ { "label":"NTC", "hooby":"Games" } ] } ]
Result:
[ { "id":"1", "name":"Tom", "Details":[ { "label":"jcc", "hooby":"Swimming" }, { "label":"hkt", "hooby":"Basketball" }, ] },
{ "id":"2", "name":"Jim", "Details":[ { "label":"NTC", "hooby":"Games" } ] } ]
Your code runs in O(N*M) time (where N == len(a) and M == len(b)), which is too slow for such big files. You can make it run in O(N + M) time by creating a mapping for a's ids first and using it to find matching b's ids, e.g.:
import json

with open('file1') as f1, open('file2') as f2, open('file3', 'w') as f3:
    a = json.load(f1)
    b = json.load(f2)
    aid = {d['id']: d for d in a}
    list_a = [{k: v for d in (b_dict, aid[b_dict['id']]) for k, v in d.items()}
              for b_dict in b if b_dict['id'] in aid]
    json.dump(list_a, f3, sort_keys=True, ensure_ascii=False)
If you want the code to be compatible with Python 2.x, you can use a dictionary comprehension to merge dictionaries (as illustrated above). In Python 3.5+ you can simply use unpacking, e.g. {**d1, **d2}.
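For reference, a minimal Python 3.5+ sketch of the same join using unpacking, reusing the aid mapping built above:
list_a = [{**b_dict, **aid[b_dict['id']]}
          for b_dict in b if b_dict['id'] in aid]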
I don't have the experience to know whether this would speed things up; the solution provided above by Eugene Yarmash seems more reliable. I also don't have the big files to test the speed, but you can try it and see whether using collections speeds up the iteration. I'd actually be curious myself whether it changes anything:
File1 = [ { "id":"1", "name":"Tom" }, { "id":"2", "name":"Jim" }, { "id":"3", "name":"Bob" }, { "id":"4", "name":"Jeny" }, { "id":"5", "name":"Lara" }, { "id":"6", "name":"Lin" }, { "id":"7", "name":"Kim" }, { "id":"8", "name":"Jack" }, { "id":"9", "name":"Tony" } ]
File2 = [ { "id":"1", "Details":[ { "label":"jcc", "hooby":"Swimming" }, { "label":"hkt", "hooby":"Basketball" }, ] }, { "id":"2", "Details":[ { "label":"NTC", "hooby":"Games" } ] } ]
from collections import defaultdict

d = defaultdict(dict)
for l in (File1, File2):
    for elem in l:
        d[elem['id']].update(elem)
Result = dict(d)
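Note that Result is keyed by id and also keeps the File1 entries that have no match in File2. A sketch of filtering it back down to the joined list shown in the question (assuming a matched record carries both "name" and "Details"):
joined = [v for v in Result.values() if 'name' in v and 'Details' in v]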
I have a list of paths:
paths = [
"root/child1/file1",
"root/child1/file2",
"root/child2/file1"
]
And I want to parse it with Python into a dict (or list of dicts) that looks like:
{
"text": "root",
"children": [
{
"text": "child1",
"children": [
{
"text": "file1",
"children": []
},
{
"text": "file2",
"children": []
}
]
},
{
"text": "child2",
"children": [
{
"text": "file2",
"children": []
}
]
}
]
}
I tried to write a recursive function, but with no success. Example:
def path2dict(path, depth):
    d = {}
    text = path.split('/')[0]
    d['text'] = text
    depth = depth + 1
    d['children'] = [path2dict(p, depth) for p in path.split('/')[depth:]]
    return d
paths = [
"root/child1/file1",
"root/child1/file2",
"root/child2/file1"
]
depth = 0
for path in paths:
    d = path2dict(path, depth)
    print(d)
Sorry for not building on your existing solution, but here is another approach:
def stage1(paths):
    result = {}
    for path in paths:
        split = path.split('/')
        current = result
        for part in split:
            current.setdefault(part, {})
            current = current[part]
    return result

def stage2(dct):
    return [
        {
            'text': key,
            'children': stage2(value)
        }
        for key, value in dct.items()
    ]
after_stage1 = stage1(paths)
# after_stage1 is
# {
# 'root': {
# 'child1': {
# 'file1': {},
# 'file2': {}
# },
# 'child2': {
# 'file1': {}
# }
# }
# }
after_stage2 = stage2(after_stage1)
# after_stage2 contains exactly what you need
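To inspect the final structure, the two stages can be chained and pretty-printed, for example:
import json
print(json.dumps(stage2(stage1(paths)), indent=4))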
You can use itertools.groupby:
from itertools import groupby
import json

d = ['root/child1/file1', 'root/child1/file2', 'root/child2/file1']

def create_paths(paths):
    _vals = [[a, [c for _, *c in b]]
             for a, b in groupby(sorted(paths, key=lambda x: x[0]), key=lambda x: x[0])]
    return [{'text': a, 'children': [] if not b[0] else create_paths(b)} for a, b in _vals]

print(json.dumps(create_paths([i.split('/') for i in d]), indent=4))
Output:
[
{
"text": "root",
"children": [
{
"text": "child1",
"children": [
{
"text": "file1",
"children": []
},
{
"text": "file2",
"children": []
}
]
},
{
"text": "child2",
"children": [
{
"text": "file1",
"children": []
}
]
}
]
}
]
I have a JSON file (test.json) with the data below, around 10,000 records. I need to convert the value fields from string to float and write the result to a new file (test1.json). How can I do this in Python?
{
"name":"test001",
"cat":"test",
"loc":"x loc",
"ings":[
{
"name":"rrrrrr",
"value":"13.0"
},
{
"name":"hhhh",
"value":"18.0"
}
],
"nums":[
{
"name":"kkkk",
"value":"82.05"
},
{
"name":"uuuuu",
"value":"53.55"
}
]
},
{
"name":"test002",
"cat":"test1",
"loc":"y loc",
"ings":[
{
"name":"trtrtr",
"value":"11.0"
},
{
"name":"wewew",
"value":"19.0"
}
],
"nums":[
{
"name":"iuyt",
"value":"122.05"
},
{
"name":"oiui",
"value":"15.5"
}
]
}
The resulting JSON file (test1.json) should look like the below:
{
"name":"test001",
"cat":"test",
"loc":"x loc",
"ings":[
{
"name":"rrrrrr",
"value":13.0
},
{
"name":"hhhh",
"value":18.0
}
],
"nums":[
{
"name":"kkkk",
"value":82.05
},
{
"name":"uuuuu",
"value":53.55
}
]
},
{
"name":"test002",
"cat":"test1",
"loc":"y loc",
"ings":[
{
"name":"trtrtr",
"value":11.0
},
{
"name":"wewew",
"value":19.0
}
],
"nums":[
{
"name":"iuyt",
"value":122.05
},
{
"name":"oiui",
"value":15.5
}
]
}
You can provide an object_hook to json.loads, which lets you modify any object (dict) found within the JSON:
import json
json_data = """
[{
"name":"test001",
"cat":"test",
"loc":"x loc",
"ings":[
{
"name":"rrrrrr",
"value":"13.0"
},
{
"name":"hhhh",
"value":"18.0"
}
],
"nums":[
{
"name":"kkkk",
"value":"82.05"
},
{
"name":"uuuuu",
"value":"53.55"
}
]
},
{
"name":"test002",
"cat":"test1",
"loc":"y loc",
"ings":[
{
"name":"trtrtr",
"value":"11.0"
},
{
"name":"wewew",
"value":"19.0"
}
],
"nums":[
{
"name":"iuyt",
"value":"122.05"
},
{
"name":"oiui",
"value":"15.5"
}
]
}]
"""
def as_float(obj):
    """Checks each dict passed to this function if it contains the key "value"

    Args:
        obj (dict): The object to decode

    Returns:
        dict: The new dictionary with changes if necessary
    """
    if "value" in obj:
        obj["value"] = float(obj["value"])
    return obj

if __name__ == '__main__':
    l = json.loads(json_data, object_hook=as_float)
    print(json.dumps(l, indent=4))
This results in what you want:
[
{
"loc": "x loc",
"ings": [
{
"name": "rrrrrr",
"value": 13.0
},
{
"name": "hhhh",
"value": 18.0
}
],
"name": "test001",
"nums": [
{
"name": "kkkk",
"value": 82.05
},
{
"name": "uuuuu",
"value": 53.55
}
],
"cat": "test"
},
{
"loc": "y loc",
"ings": [
{
"name": "trtrtr",
"value": 11.0
},
{
"name": "wewew",
"value": 19.0
}
],
"name": "test002",
"nums": [
{
"name": "iuyt",
"value": 122.05
},
{
"name": "oiui",
"value": 15.5
}
],
"cat": "test1"
}
]
To write to a file instead:
with open("out.json", "w+") as out:
json.dump(l, out, indent=4)
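Since the question reads from test.json and writes to test1.json, here is a minimal end-to-end sketch, reusing the as_float hook defined above (assuming test.json holds a valid JSON array like the one wrapped above):
import json

with open("test.json") as src, open("test1.json", "w") as dst:
    data = json.load(src, object_hook=as_float)
    json.dump(data, dst, indent=4)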
You would need to recursively traverse the data and convert anything that looks like a float to a float:
def fix_floats(data):
    if isinstance(data, list):
        iterator = enumerate(data)
    elif isinstance(data, dict):
        iterator = data.items()
    else:
        raise TypeError("can only traverse list or dict")
    for i, value in iterator:
        if isinstance(value, (list, dict)):
            fix_floats(value)
        elif isinstance(value, str):
            try:
                data[i] = float(value)
            except ValueError:
                pass
It should do the trick:
my_data = [
{ "name" : "rrrrrr",
"value" : "13.0" },
{ "name" : "hhhh",
"value" : "18.0" },
]
fix_floats(my_data)
>>> my_data
[{'name': 'rrrrrr', 'value': 13.0}, {'name': 'hhhh', 'value': 18.0}]
If you have a flat dict of key/value pairs whose values are either alphabetic or numeric strings, you can iterate over it and check each value with str.isnumeric():
dict = { 'a':'100', 'b':'200', 'c':'300', 'd':'four_hundred', 'e':'500' }
dict_parse = {k: int(v) if v.isnumeric() else v for k, v in dict.items()}
>>> dict_parse
{ 'a': 100, 'b': 200, 'c': 300, 'd':'four_hundred', 'e':500}
When dealing with float numbers, amend the if statement to strip out the decimal point first; you can apply the same principle to negative numbers:
dict = { 'a':'10.0', 'b':'20.12', 'c':'300.3', 'd':'four_hundred', 'e':'500' }
dict_parse = {k: float(v) if v.replace(".", "").isnumeric() else v for k, v in dict.items()}
>>> dict_parse
{ 'a': 10.0, 'b': 20.12, 'c': 300.3, 'd': 'four_hundred', 'e': 500.0}
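A sketch extending the same idea to negative numbers, using made-up data; it removes at most one minus sign and one decimal point before the isnumeric() check:
d = { 'a': '-10.5', 'b': '20', 'c': 'minus' }
parsed = {k: float(v) if v.replace('-', '', 1).replace('.', '', 1).isnumeric() else v
          for k, v in d.items()}
>>> parsed
{'a': -10.5, 'b': 20.0, 'c': 'minus'}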