I have several hundred thousand endpoint URLs that I want to generate stats for. For example I have:
/a/b/c
/a/b/d
/a/c/d
/b/c/d
/b/d/e
/a/b/c
/b/c/d
I want to create a dictionary that looks like this
{
'a': {
'b': {
'c': {
'_count': 2
},
'd': {
'_count': 1
}
},
'c': {
'd': {
'_count': 1
}
}
},
'b': {
'c': {
'd': {
'_count': 2
}
},
'd': {
'e': {
'_count': 1
}
}
}
}
Any clever ways to do this?
EDIT
I should mention that the paths are not always 3 parts. There might be
/a/b/c/d/e/f/g/h... etc, etc.
If the paths all look like in your example, this would work:
# Tally full paths in a nested dict: intermediate components become nested
# dicts and the last component of each path holds an integer hit count.
counts = {}
for p in paths:
    segments = p.split('/')
    leaf = segments[-1]
    node = counts
    # segments[0] (the text before the first '/') is skipped on purpose.
    for name in segments[1:-1]:
        node = node.setdefault(name, {})
    node[leaf] = node.get(leaf, 0) + 1
This uses dictionary methods like setdefault() and get() to avoid having to write a lot of if-statements.
Note that this will not work if a path that has subdirectories can also appear on its own. Then it's not clear if the corresponding part of counts should contain a number or another dictionary. In this case it would probably be best to store both a count and a dict for each node, using a tuple or a custom class.
The basic algorithm stays the same:
class Stats(object):
    """One node of the path tree: its own hit count plus its child nodes."""

    def __init__(self):
        # Child name -> Stats; every instance gets its own fresh dict.
        self.subdirs = {}
        # Number of paths that end exactly at this node.
        self.count = 0
# Walk every path from the root, creating Stats nodes on demand, and bump
# the counter of the node where the path ends.
counts = Stats()
for p in paths:
    node = counts
    # [1:] drops the text before the leading '/'.
    for name in p.split('/')[1:]:
        node = node.subdirs.setdefault(name, Stats())
    node.count += 1
With some pretty printing you get:
def printstats(stats, indent=''):
    """Print a Stats-like tree, one level of indentation per depth.

    ``stats`` must expose ``count`` and a ``subdirs`` mapping of
    name -> child node. Uses the print() function so the code also runs on
    Python 3 (the original used the Python 2 print statement).
    """
    print(indent + str(stats.count) + ' times')
    for d, s in stats.subdirs.items():
        print(indent + d + ':')
        printstats(s, indent + ' ')
>>> printstats(counts)
0 times
a:
0 times
c:
0 times
d:
1 times
b:
0 times
c:
2 times
d:
1 times
...
EDIT:
I've amended my code to fit your last comment, above (no complex data structure now).
def dictizeString(string, dictionary):
    """Count one occurrence of the '/'-separated path ``string``.

    Intermediate components become nested dicts inside ``dictionary``; the
    last component maps to an integer counter. Leading and repeated slashes
    are skipped. ``dictionary`` is modified in place; nothing is returned.

    Raises TypeError/AttributeError if the same name is used both as a leaf
    and as a directory (that is invalid input for this structure).
    """
    node = dictionary
    rest = string
    while True:
        rest = rest.lstrip('/')
        head, sep, rest = rest.partition('/')
        if not sep:
            break  # head is the final component
        node = node.setdefault(head, {})
    # dict.has_key() no longer exists in Python 3; get() covers both cases.
    node[head] = node.get(head, 0) + 1
Each leaf key stores its frequency as an integer; every intermediate key stores a nested dictionary.
Test case
>>> d = {}
>>> dictizeString('/a/b/c/d', d)
>>> dictizeString('/a/b/c/d', d)
>>> dictizeString('/a/b/c/d', d)
>>> dictizeString('/a/b/c/d', d)
>>> dictizeString('/a/b/e', d)
>>> dictizeString('/c', d)
>>> d
{'a': {'b': {'c': {'d': 4}, 'e': 1}}, 'c': 1}
Old result, but still near the top in google, so I'll update: You could use dpath-python for this.
$ easy_install dpath
>>> result = {}
>>> for path in my_list_of_paths:
>>> ... dpath.util.set(result, path, SOME_VALUE)
... and that's it. I don't understand the math you're using to precompute those values on the terminus (1, 2, etc), but you could precalculate it and use a dictionary of path-to-value instead of a bare list
>>> x = {'path/name': 0, 'other/path/name': 1}
>>> for (path, value) in x.iteritems():
>>> ... dpath.util.set(result, path, value)
Something like that would work.
Here's my attempt:
class Result(object):
    """Tree node with a hit count; indexing creates missing children."""

    def __init__(self):
        self._sub_results = {}
        self.count = 0

    def __getitem__(self, key):
        # Look the child up first; build and remember it only when absent.
        try:
            return self._sub_results[key]
        except KeyError:
            child = self._sub_results[key] = Result()
            return child

    def __repr__(self):
        return str(self)

    def __str__(self):
        return "(%s, %s)" % (self.count, self._sub_results)
def process_paths(paths):
    """Build a Result tree from ``paths`` and return its root.

    The first character of every path is dropped before splitting on "/",
    and only the node where a path ends has its count incremented.
    """
    root = Result()
    for path in paths:
        node = root
        for name in path[1:].split("/"):
            node = node[name]
        node.count += 1
    return root
I've wrapped up some of the logic into the Result class to try and make the algorithm itself a little clearer.
Based on the answers, I wrote a general function for setting a dictionary value along a path:
def dictPath(path, dictionary, val, sep="/"):
    """Set a value in a nested dictionary.

    ``path`` is split on ``sep``; every component but the last becomes (or
    reuses) a nested dict, and the last component is assigned ``val``.
    Leading and repeated separators are skipped. ``dictionary`` is modified
    in place.

    Raises ValueError for an empty ``sep`` (the original looped forever).
    """
    if not sep:
        raise ValueError("sep must be a non-empty string")
    node = dictionary
    while True:
        # Strip whole separators: the original removed a single character,
        # which corrupted keys whenever sep was longer than one character.
        while path.startswith(sep):
            path = path[len(sep):]
        head, found, path = path.partition(sep)
        if not found:
            break
        node = node.setdefault(head, {})
    node[head] = val
Related
I am trying to populate a Python dict from a list of paths (the purpose is to create a ttk.Treeview):
paths = ["\\hihi", "\\hihi\\hoho\\huhu", "\\hihi\\hoho\\haha", "\\haha", "\\huhu"]
and I want to create this dictionary (JSON-serialized here):
{
"haha": {},
"hihi": {
"hoho": {
"huhu": 0
},
"huhu": {
"haha": 0
}
},
"huhu": {}
}
What is the best way to do this? I tried with a for loop (recursive loop?), with a dict comprehension and with dpath but I have no valid result.
The beginning of my code:
split = paths.split("\\")
del split[0]
dict = {}
?
Thank you very much in advance
You could use defaultdict for this:
def make_empty_default_dict():
    """Return a defaultdict whose missing keys are themselves such dicts."""
    # Imported here because the snippet never imports defaultdict itself.
    from collections import defaultdict
    return defaultdict(make_empty_default_dict)
Define how you add a path:
def add_path(pth, dct):
    """Index ``dct`` by each element of ``pth`` in turn; return the last node.

    With an auto-creating mapping this materialises the whole path. An empty
    ``pth`` returns ``dct`` itself.
    """
    node = dct
    for key in pth:
        node = node[key]
    return node
Then populate your default dict with keys:
d = make_empty_default_dict()
for path in paths:
    # Do not rebind d to add_path()'s return value: that is the deepest node,
    # so the root of the tree would be lost after the first path.
    # Empty components (e.g. the one before a leading backslash) are skipped
    # so that '' never becomes a key.
    add_path([part for part in path.split("\\") if part], d)
I have an alternative to the recursive solution. For each path:
put a cursor at the root of the target dict
search sequence: move the cursor forth until you find a 0 or a missing part of the path
build sequence: add an empty dict and move the cursor on that dict until you hit the last part.
the last part needs a special handling for the 0.
Here's the code:
def build_paths(paths, d=None):
    """Insert backslash-separated ``paths`` into the nested dict ``d``.

    Intermediate components map to dicts and the final component of each
    path maps to 0. The text before the first backslash is discarded.

    Args:
        paths: iterable of strings such as "\\hihi\\hoho".
        d: dict to extend in place; a fresh dict is created when omitted
           (the original shared one default dict across all calls).

    Returns:
        The dict that was filled.
    """
    if d is None:
        d = {}
    for path in paths:
        parts = path.split("\\")[1:]  # remove the part before \
        if not parts:
            continue  # nothing after the first backslash (or no backslash)
        cursor = d
        for part in parts[:-1]:
            child = cursor.get(part)
            if not child:  # missing, 0 or empty: (re)start a branch here
                child = cursor[part] = {}
            cursor = child
        # Close with a 0, but never clobber a subtree that an earlier,
        # longer path already created under the same name.
        if not cursor.get(parts[-1]):
            cursor[parts[-1]] = 0
    return d
It's faster than the recursive version of @xtofl, but not that fast. With timeit:
iterative: 6.169872568580601
recursive: 17.209112331781498
You can use recursion with itertools.groupby:
import itertools
paths = ["\\hihi", "\\hihi\\hoho\\huhu", "\\hihi\\hoho\\haha", "\\haha", "\\huhu"]
new_paths = [list(filter(None, i.split('\\'))) for i in paths]
def update_output(f):
    """Decorator that post-processes the tree returned by ``f``.

    Falsy values at the top level are replaced by empty dicts; falsy values
    deeper down are kept as they are; non-empty subtrees are processed
    recursively.
    """
    import functools

    def final(tree, level=0):
        out = {}
        for name, child in tree.items():
            if child:
                out[name] = final(child, level + 1)
            elif level == 0:
                out[name] = {}
            else:
                out[name] = child
        return out

    @functools.wraps(f)
    def wrapper(_d):
        return final(f(_d))
    return wrapper


@update_output
def full_directory(data):
    """Turn a list of split paths into a nested dict; leaves are 0.

    ``data`` is a list of component lists, e.g. [['hihi', 'hoho'], ['haha']].
    """
    def files(rows):
        tree = {}
        # groupby only merges adjacent rows, hence the sort on the same key.
        ordered = sorted((row for row in rows if row), key=lambda row: row[0])
        for name, group in itertools.groupby(ordered, key=lambda row: row[0]):
            # Keep what follows ``name`` in every longer path. The original
            # returned 0 for any group of size one, cutting off a lone
            # multi-component path.
            tails = [row[1:] for row in group if len(row) > 1]
            tree[name] = files(tails) if tails else 0
        return tree
    return files(data)
print(full_directory(new_paths))
Output:
{'haha': {}, 'hihi': {'hoho': {'haha': 0, 'huhu': 0}}, 'huhu': {}}
I found this : http://connor-johnson.com/2015/02/28/generate-a-tree-structure-in-python/
It works very well! So the code :
def add(t, path):
    """Index into ``t`` once per element of ``path`` (returns None).

    With an auto-creating mapping such as a recursive defaultdict, the
    lookups themselves create the nested nodes.
    """
    current = t
    for node in path:
        current = current[node]
# The snippet uses defaultdict and json without importing them.
from collections import defaultdict
import json


def Tree():
    """Return a defaultdict whose missing keys are new Trees."""
    return defaultdict(Tree)


def dicts(t):
    """Recursively convert a Tree into plain nested dicts."""
    return {k: dicts(t[k]) for k in t}


t = Tree()
paths = ["\\hihi", "\\hihi\\hoho\\huhu", "\\hihi\\hoho\\haha", "\\haha", "\\huhu"]
for path in paths:
    split = path.split("\\")
    del split[0]  # drop the empty string before the leading backslash
    # One call inserts the whole path; the original repeated the same call
    # once per component.
    add(t, split)
print(json.dumps(dicts(t), indent=4))
Given that you have an empty dictionary
data = {}
I have a path and a value
path = "root.sub.item"
value = 12
How could I recursively add objects that do not exist?
def add_value(path, value):
    """Question's attempt: ensure each '.'-separated part is a key of ``data``.

    NOTE(review): every part is created at the top level of the module-level
    ``data`` dict and ``value`` is never stored. That is the behaviour the
    question asks about, so it is preserved here.
    """
    for part in path.split('.'):
        data.setdefault(part, {})
The expected output for this would be:
data = {
'root':{
'sub':{
'item': 12
}
}
}
Could somebody help out with this or point me in the right direction?
I'm using Python 3.6.
You can use another kind of solution, such as a recursive defaultdict, as in this answer.
A quick and stupid example of how it can be used:
from collections import defaultdict
def func(rdict, path, value):
    """Store ``value`` in ``rdict`` under the '.'-separated ``path``.

    Every component except the last is indexed in turn (an auto-creating
    mapping such as a recursive defaultdict builds them on demand); the last
    component receives ``value``.
    """
    items = path.split('.')
    d = rdict
    # Start from the root: the original began at rdict[items[0]], so a
    # single-component path 'x' was written to rdict['x']['x'].
    for item in items[:-1]:
        d = d[item]
    d[items[-1]] = value
# Factory for a defaultdict whose missing keys are further nested_dicts.
nested_dict = lambda: defaultdict(nested_dict)
result = nested_dict()
# Two paths that share the 'root' prefix.
func(result, 'root.sub.item', 12)
func(result, 'root.moon.value', 1)
assert result['root']['sub']['item'] == 12
assert result['root']['moon']['value'] == 1
# Reading a missing key creates an empty nested defaultdict, which is != 0;
# note that this lookup therefore also inserts the 'noop' key.
assert result['root']['moon']['noop'] != 0
You're almost there, you just need to keep track of how far you are into the tree structure, and a way to know when you're on the last element of the path:
def add_value(path, value):
    """Store ``value`` in the module-level ``data`` dict under a dotted path.

    Missing intermediate dicts are created; existing ones are reused. The
    final path component is assigned ``value``.
    """
    *parents, leaf = path.split('.')
    node = data
    for part in parents:
        # setdefault returns the existing child or inserts a new dict.
        node = node.setdefault(part, {})
    node[leaf] = value
You can try Raymond Hettinger's recipe:
source: https://twitter.com/raymondh/status/343823801278140417
from collections import defaultdict
# Self-referential factory: every missing key yields another infinity_dict.
infinity_dict=lambda:defaultdict(infinity_dict)
d=infinity_dict()
# The intermediate 'root' and 'sub' levels are created by the lookups.
d['root']['sub']['item'] = 12
I've been working on this for too long and need some help.
I'm trying to create a dictionary using faker. If it were only that simple.
Initially the dictionary is flat. A key and item. If the first letter of the key is 'B' or 'M' it will then turn that string, into a dictionary with 5 keys and keep doing that until it finds none starting with either of those two letters. I know, there's no recursion happening now. That's why I need help. I'm trying to figure out how to properly recurse rather than hard code the depth.
Starting Dictionary:
{
"Marcia": "https://www.skinner.biz/categories/tags/terms.htm",
"Nicholas": "https://scott-tran.com/",
"Christopher": "https://www.ellis.com/",
"Paul": "https://lopez.com/index/",
"Jennifer": "https://www.sosa.com/wp-content/main/login.php"
}
Marcia should expand to this...
Example:
"Marcia": {
"Alexander": "http://hicks.net/home.html",
"Barry": {
"Jared": "https://www.parker-robinson.com/faq.html",
"Eddie": "https://www.smith-thomas.com/",
"Ryan": "https://www.phillips.org/homepage/",
"Mary": {
"Alex": "http://www.perry.com/tags/explore/post.htm",
"Joseph": "https://www.hansen.com/main/list/list/index/",
"Alicia": "https://www.tran.biz/wp-content/explore/posts/",
"Anna": "http://lee-mclaughlin.biz/search/login/",
"Kevin": "https://blake.net/main/index/"
}
"Evan": "http://carroll.com/homepage.html"
}
"Sharon": "https://www.watson.org/categories/app/login/",
"Hayley": "https://www.parks.com/",
"William": "https://www.wyatt-ware.com/"
}
My code is more manual than dynamic in that I must explicitly know how many levels deep the dictionary goes rather than dynamically figuring it out.
Here's what I have that works to a depth of 2 levels, but I want it to find any key starting with 'B' or 'M' and act on it.
import json
from build_a_dictionary import add_dic
from faker import Faker
dic = add_dic(10)
dic1 = {}
dic2 = {}
def build_dic(dic_len):
    """Question's two-level attempt; fills and returns the global ``dic2``.

    A first dict from add_dic() is printed as JSON. Each of its keys then
    gets a fresh add_dic() dict in ``dic2``, and inside those, any key that
    contains the character 'B' or 'M' anywhere is replaced by yet another
    add_dic() dict.
    """
    fake = Faker()  # constructed as in the original; not used below
    dic1 = add_dic(dic_len)
    print(json.dumps(dic1, indent=4))
    for k in dic1:
        children = dic2[k] = add_dic(dic_len)
        for key in children.keys():
            # NOTE(review): this inspects every character of the key, not
            # just the first one.
            for f in key:
                if f in ('B', 'M'):
                    children[key] = add_dic(dic_len)
    return dic2
Here is the code from add_dic() I wrote:
import string, time
from faker import Faker #had to install with pip
fake = Faker()
dic = {}
dics = {}
key = ""
def add_dic(x):
    """Return a dict of ``x`` distinct fake first names mapped to fake URIs.

    Uses the module-level ``fake`` generator. Requests above 690 are
    refused: a message is printed and the process exits via sys.exit().
    """
    import sys  # the original called sys.exit() without importing sys

    if x > 690:
        print("Please select a value under 690")
        sys.exit()
    dic = {}
    # Keep drawing until x distinct names are collected. The original gave
    # up after x duplicate hits and could return fewer entries than asked.
    while len(dic) < x:
        key = fake.first_name()
        if key not in dic:
            dic[key] = fake.uri()
    return dic
You're just doing it wrong, if you want it to be recursive, write the function as a recursive function. It's essentially a custom (recursive) map function for a dictionary. As for your expected dictionary, I'm not sure how you'd ever get Faker to deterministically give you that same output every time. It's random...
Note: There is nothing "dynamic" about this, it's just a recursive map function.
from faker import Faker
import pprint
pp = pprint.PrettyPrinter(indent=4)
fake = Faker()
def map_val(key, val):
    """Expand ``val`` into a nested dict when ``key`` starts with 'M' or 'B'.

    Such keys are replaced by a dict built from five freshly generated
    (name, uri) pairs taken from the module-level ``fake``, and every
    generated pair is expanded by the same rule. Any other key (including
    an empty one) returns ``val`` unchanged.
    """
    # startswith() is safe on an empty key, where key[0] would raise.
    if key.startswith(('M', 'B')):
        names = [(fake.first_name(), fake.uri()) for _ in range(5)]
        return {k: map_val(k, v) for k, v in names}
    return val
#uncomment below to generate 5 initial names
#names = [(fake.first_name(), fake.uri()) for i in range(5)]
#initial_dict = {k : v for k,v in names}
initial_dict = {
"Marcia": "https://www.skinner.biz/categories/tags/terms.htm",
"Nicholas": "https://scott-tran.com/",
"Christopher": "https://www.ellis.com/",
"Paul": "https://lopez.com/index/",
"Jennifer": "https://www.sosa.com/wp-content/main/login.php"
}
dict_2 = {k : map_val(k,v) for k,v in initial_dict.items()}
pp.pprint(dict_2)
Output:
rpg711$ python nested_dicts.py
{ 'Christopher': 'https://www.ellis.com/',
'Jennifer': 'https://www.sosa.com/wp-content/main/login.php',
'Marcia': { 'Chelsea': 'http://francis.org/category.jsp',
'Heather': 'http://www.rodgers.com/privacy.jsp',
'Jaime': 'https://bates-molina.com/register/',
'John': 'http://www.doyle.com/author.htm',
'Kimberly': 'https://www.harris.org/homepage/'},
'Nicholas': 'https://scott-tran.com/',
'Paul': 'https://lopez.com/index/'
}
Thank you all for your help. I've managed to figure it out.
It now builds a dynamic dictionary or dynamic json for whatever need.
import sys, json
from faker import Faker
fake = Faker()
def build_dic(dic_len, dic):
    """Replace the value of every key starting with 'B' or 'M' by a new dict.

    Args:
        dic_len: accepted for compatibility with existing callers; unused.
        dic: a dict, or a list/tuple of (key, value) pairs that is first
             converted to a dict. Any other type is returned untouched.

    Returns:
        The (possibly converted) dict, modified in place. Each replacement
        dict is built from five (name, uri) pairs from the module-level
        ``fake``.
    """
    if isinstance(dic, (list, tuple)):
        dic = dict(dic)
    if isinstance(dic, dict):
        # A single pass is enough: the original repeated it len(dic) times
        # and regenerated every replacement on each repetition.
        for k in list(dic):
            if k.startswith(('B', 'M')):
                dic[k] = dict((fake.first_name(), fake.uri()) for _ in range(5))
    return dic
def walk(dic):
    """Apply build_dic(5, ...) to every nested dict in ``dic``, depth-first.

    Each dict value is expanded by build_dic and then walked itself, so the
    dicts that build_dic inserts are visited as well. Returns ``dic``.
    """
    for item in dic.values():
        if isinstance(item, dict):
            build_dic(5, item)
            walk(item)
    return dic
a = build_dic(10, ([(fake.first_name(), fake.uri()) for i in range(10)]))
walk(a)
print(json.dumps(a, indent=4))
Recursion is when a function calls itself; when designing a recursive function, it's important to have an exit condition in mind (i.e. when will the recursion stop).
Let's consider a contrived example to increment a number until it reaches a certain value:
def increment_until_equal_to_or_greater_than_value(item, target):
    """Recursively add 1 to ``item`` until it is no longer below ``target``.

    Prints one progress line per call and returns the final value. The
    original dropped the recursive call's result, so every call that
    recursed returned None.
    """
    print('item is', item, end=' ')
    if item < target:
        # Recursive case: try again with the next value.
        print('incrementing')
        return increment_until_equal_to_or_greater_than_value(item + 1, target)
    # Exit condition: item has reached (or started at/above) target.
    print('returning')
    return item
increment_until_equal_to_or_greater_than_value(1, 10)
And the output
item is 1 incrementing
item is 2 incrementing
item is 3 incrementing
item is 4 incrementing
item is 5 incrementing
item is 6 incrementing
item is 7 incrementing
item is 8 incrementing
item is 9 incrementing
item is 10 returning
You can see we've defined our recursive part in the if statement and the exit condition in the else.
I've put together a snippet that shows a recursive function on a nested data structure.
It doesn't solve your exact issue, but you can learn by dissecting it and adapting it to your use case.
# our recursive method
def deep_do_something_if_string(source, something):
    """Call ``something`` on every non-container value nested in ``source``.

    Dict values and list/tuple/set items are descended into recursively;
    anything else is passed to ``something``. When ``source`` itself is such
    a value the result of ``something(source)`` is returned; for containers
    the function returns None.
    """
    # if source is a dict, iterate through its values
    if isinstance(source, dict):
        # values() works on both Python 2 and 3; itervalues() is 2-only.
        for v in source.values():
            # call this method on the value
            deep_do_something_if_string(v, something)
    # if source is a list, tuple or set, iterate through its items
    elif isinstance(source, (list, tuple, set)):
        for v in source:
            deep_do_something_if_string(v, something)
    # otherwise do something with the value
    else:
        return something(source)
    return None
# a test something to do with the value
def print_it_out(value):
    """Print ``value`` (print() call form, valid on Python 2 and 3)."""
    print(value)
# an example data structure
some_dict = {
'a': 'value a',
'b': [
{
'c': 'value c',
'd': 'value d',
},
],
'e': {
'f': 'value f',
'g': {
'h': {
'i': {
'j': 'value j'
}
}
}
}
}
deep_do_something_if_string(some_dict, print_it_out)
And the output
value a
value c
value d
value j
value f
I have a text file abc.txt:
abc/pqr/lmn/xyz:pass
abc/pqr/lmn/bcd:pass
I need to parse these statements and output should be in nested dictionary as below:
{'abc':{'pqr':{'lmn':{'xyz':{'pass':1},'bcd':{'pass':1}}}}}
where 1 is 'pass' count.
I'm able to do as much as this:
import re
# Question's attempt, kept as asked.
d = {}
p = re.compile('[a-zA-z]+')
for line in open('abc.txt'):
    words = p.findall(line)
    for key in words:
        # NOTE(review): the literal string 'key' is used here, not the loop
        # variable, so d only ever gets the single entry {'key': {}}.
        d['key'] = {}
Check out the setdefault method on dictionaries.
d = {}
# Each setdefault returns the existing child dict or inserts a new one, so
# the chain walks/creates pqr -> lmn -> <leaf> before 'pass' is assigned.
d.setdefault('pqr', {}).setdefault('lmn', {}).setdefault('xyz', {})['pass'] = 1
# The second line reuses the 'pqr' and 'lmn' dicts created above.
d.setdefault('pqr', {}).setdefault('lmn', {}).setdefault('bcd', {})['pass'] = 1
d
gives
{'pqr': {'lmn': {'bcd': {'pass': 1}, 'xyz': {'pass': 1}}}}
Here's an updated version of my answer in which leaves of the tree data-structure are now different from those in rest of it. Instead of the tree being strictly a dict-of-nested-dicts, the "leaves" on each branch are now instances of a different subclass of dict named collections.Counter which are useful for counting the number of times each of their keys occur. I did this because of your response to my question about what should happen if the last part of each line was something other than ":pass" (which was "we have to put new count for that key").
Nested dictionaries are often called Tree data-structures and can be defined recursively — the root is a dictionary as are the branches. The following uses a dict subclass instead of a plain dict because it makes constructing them easier since you don't need to special case the creation of the first branch of next level down (except I still do when adding the "leaves" because they are a different subclass, collections.Counter).
from collections import Counter
from functools import reduce
import re
# (Optional) trick to make Counter subclass print like a regular dict.
class Counter(Counter):
    """Counter subclass whose repr looks like that of a plain dict."""

    def __repr__(self):
        return repr(dict(self))
# Borrowed from answer # https://stackoverflow.com/a/19829714/355230
class Tree(dict):
    """dict that creates, stores and returns a new Tree for missing keys."""

    def __missing__(self, key):
        # type(self)() keeps subclasses producing children of their own type.
        self[key] = child = type(self)()
        return child
# Utility functions based on answer # https://stackoverflow.com/a/14692747/355230
def nested_dict_get(nested_dict, keys):
    """Return the value reached by indexing ``nested_dict`` with each key.

    An empty ``keys`` returns ``nested_dict`` itself.
    """
    node = nested_dict
    for k in keys:
        node = node[k]
    return node
def nested_dict_set(nested_dict, keys, value):
    """Assign ``value`` at the position described by ``keys``.

    All keys but the last are resolved with nested_dict_get(); the last key
    is set on the mapping found there.
    """
    parent = nested_dict_get(nested_dict, keys[:-1])
    parent[keys[-1]] = value
def nested_dict_update_count(nested_dict, keys):
    """Count one occurrence of ``keys[-1]`` under the path ``keys[:-1]``.

    If the node at that path is empty (falsy) it is replaced by a new
    Counter holding the last key; otherwise that node is updated.
    """
    path, label = keys[:-1], keys[-1]
    existing = nested_dict_get(nested_dict, path)
    if not existing:
        # Create a new Counter.
        nested_dict_set(nested_dict, path, Counter([label]))
    else:
        # Update existing Counter.
        existing.update([label])
d = Tree()
# 'A-Z', not 'A-z': the original range also matched the characters between
# 'Z' and 'a' in ASCII ([ \ ] ^ _ `).
pat = re.compile(r'[a-zA-Z]+')
with open('abc.txt') as file:
    for line in file:
        words = pat.findall(line.rstrip())
        # A usable line needs at least one path component plus a status;
        # blank or one-word lines would otherwise raise.
        if len(words) < 2:
            continue
        nested_dict_update_count(d, words)
print(d)  # Prints like a regular dict.
To test the leaf-counting capabilities of the revised code, I used the following test file which includes the same line twice, once ending again with :pass and another ending in :fail.
Expanded abc.txt test file:
abc/pqr/lmn/xyz:pass
abc/pqr/lmn/bcd:pass
abc/pqr/lmn/xyz:fail
abc/pqr/lmn/xyz:pass
Output:
{'abc': {'pqr': {'lmn': {'bcd': {'pass': 1}, 'xyz': {'fail': 1, 'pass': 2}}}}}
If i understand your question:
sources = ["abc/pqr/lmn/xyz:pass", "abc/pqr/lmn/bcd:pass", "abc/pqr/lmn/xyz:pass"]
def prepare_source(source):
    """Split "a/b/c:status" into (['a', 'b', 'c'], 'status').

    Raises ValueError unless ``source`` contains exactly one ':'.
    """
    path, value = source.split(':')
    return path.split('/'), value
def add_key(elements, value):
    """Wrap ``{value: 1}`` in one nested dict per entry of ``elements``.

    add_key(['a', 'b'], 'pass') gives {'a': {'b': {'pass': 1}}}.
    Raises IndexError when ``elements`` is empty.
    """
    # Build from the innermost level outwards.
    result = {elements[-1]: {value: 1}}
    for name in reversed(elements[:-1]):
        result = {name: result}
    return result
# base merge function get from here:
# http://stackoverflow.com/questions/7204805/dictionaries-of-dictionaries-merge
def merge(a, b, path=None):
    """Merge ``b`` into ``a`` in place and return ``a``.

    Keys only in ``b`` are copied; dict/dict pairs are merged recursively;
    int/int pairs are added together; any other clash raises Exception with
    the dotted path of the conflicting key. ``path`` carries the keys
    walked so far and is only used for that message.
    """
    if path is None:
        path = []
    for key in b:
        if key not in a:
            a[key] = b[key]
            continue
        if isinstance(a[key], dict) and isinstance(b[key], dict):
            merge(a[key], b[key], path + [str(key)])
        elif isinstance(a[key], int) and isinstance(b[key], int):
            a[key] += b[key]
        else:
            raise Exception('Conflict at %s' % '.'.join(path + [str(key)]))
    return a
# Fold every source line into one nested result, summing the counters of
# repeated paths.
result = dict()
for source in sources:
    result = merge(result, add_key(*prepare_source(source)))
print(result)  # print() call form: valid on Python 2 and 3
Output will be:
{'abc': {'pqr': {'lmn': {'bcd': {'pass': 1}, 'xyz': {'pass': 2}}}}}
I am looking to create a simple nested "lookup" mechanism in python, and wanted to make sure there wasn't already something somewhere hidden in the vast libraries in python that doesn't already do this before creating it.
I am looking to take a dict that is formatted something like this
my_dict = {
"root": {
"secondary": {
"user1": {
"name": "jim",
"age": 24
},
"user2": {
"name": "fred",
"age": 25
}
}
}
}
and I am trying to have a way to access the data by using a dotted notation that would be something similar to
root.secondary.user2
and return that resulting dict back as a response. I am thinking that there must be something that does this and I could write one without much difficulty but I want to make sure I am not recreating something I might be missing from the documentation. Thanks
There's nothing in the standard library for this purpose, but it is rather easy to code this yourself:
>>> key = "root.secondary.user2"
>>> reduce(dict.get, key.split("."), my_dict)
{'age': 25, 'name': 'fred'}
This exploits the fact that the look-up for the key k in the dictionary d can be written as dict.get(d, k). Applying this iteratively using reduce() leads to the desired result.
Edit: For completeness three functions to get, set or delete dictionary keys using this method:
def get_key(my_dict, key):
    """Return the value stored under the dotted ``key`` in ``my_dict``.

    A missing final component yields None (dict.get semantics); a missing
    intermediate component raises TypeError.
    """
    # reduce is not a builtin on Python 3.
    from functools import reduce
    return reduce(dict.get, key.split("."), my_dict)
def set_key(my_dict, key, value):
    """Assign ``value`` under the dotted ``key`` inside ``my_dict``.

    All intermediate dicts must already exist; otherwise TypeError is
    raised.
    """
    # reduce is not a builtin on Python 3.
    from functools import reduce
    parts = key.split(".")
    parent = reduce(dict.get, parts[:-1], my_dict)
    parent[parts[-1]] = value
def del_key(my_dict, key):
    """Delete the entry under the dotted ``key`` from ``my_dict``.

    Raises KeyError when the final component is missing and TypeError when
    an intermediate component is missing.
    """
    # reduce is not a builtin on Python 3.
    from functools import reduce
    parts = key.split(".")
    parent = reduce(dict.get, parts[:-1], my_dict)
    del parent[parts[-1]]
You can have that. You can subclass dict, add the key lookup (and even retain the name dict) by using code similar to the one below. The {...} form however will still use the builtin dict class (now called orig_dict), so you have to enclose it, like so: Dict({...}). This implementation recursively converts dictionaries to the new form, so you don't have to use the method above for any dictionary entries that are plain dictionaries themselves.
orig_dict = dict


class Dict(orig_dict):
    """dict whose keys can also be read, set and deleted as attributes.

    Plain dict values are converted to Dict recursively, both at
    construction time and on item assignment, so chained access such as
    ``d.a.b`` works. Real attributes and dict methods take precedence over
    keys of the same name.
    """

    def __init__(self, *args, **kwargs):
        super(Dict, self).__init__(*args, **kwargs)
        # items() replaces the Python 2-only iteritems(); the snapshot keeps
        # the loop independent of the assignments made inside it.
        for k, v in list(self.items()):
            if type(v) is orig_dict:
                super(Dict, self).__setitem__(k, Dict(v))

    def __getattribute__(self, k):
        # Normal attribute lookup first; fall back to the key of that name.
        try:
            return super(Dict, self).__getattribute__(k)
        except AttributeError:
            return self[k]

    def __setattr__(self, k, v):
        # Existing keys are updated; anything else becomes a real attribute.
        if k in self:
            self.__setitem__(k, v)
        else:
            return super(Dict, self).__setattr__(k, v)

    def __delattr__(self, k):
        try:
            self.__delitem__(k)
        except KeyError:
            super(Dict, self).__delattr__(k)

    def __setitem__(self, k, v):
        toconvert = type(v) is orig_dict
        super(Dict, self).__setitem__(k, Dict(v) if toconvert else v)
# dict = Dict <-- you can even do this but I advise against it
# testing:
# Three ways to build nested Dicts: keyword arguments, a plain nested dict,
# and a mix of both.
b = Dict(a=1, b=Dict(c=2, d=3))
c = Dict({'a': 1, 'b': {'c': 2, 'd': 3}})
d = Dict(a=1, b={'c': 2, 'd': {'e': 3, 'f': {'g': 4}}})
# Attribute-style reads, writes and deletes on existing keys.
b.a = b.b
b.b = 1
d.b.d.f.g = 40
del d.b.d.e
d.b.c += d.b.d.f.g
c.b.c += c.a
del c.a
# print() call form: valid on Python 2 and 3.
print(b)
print(c)
print(d)
Recursion still works.
def walk_into(dict, key):
    """Resolve all but the last component of a dotted ``key``.

    Returns ``(container, remaining_key)`` so that the caller can read or
    assign ``container[remaining_key]``. Descent stops as soon as nothing
    follows the next '.', and the still-unconsumed key text is returned.
    """
    node = dict
    remaining = key
    while True:
        head, _, tail = remaining.partition('.')
        if not tail:
            return node, remaining
        node = node[head]
        remaining = tail
d, k = walk_into( my_dict, "root.secondary.user2" )
d[k] can be used for getting or putting a new value.
I have a pretty complete implementation for this and some other stuff here. Repository here, trict.util combined with the __get__ method in trict.trict might have the stuff you need if you don't feel like installing it. Also it actually is in conda-forge even though the README might say otherwise if I haven't gotten around to updating it before you're reading this.