I have a function which creates data
from faker import Faker
import pandas as pd
import random
def create_rows_faker(num=1, name_col = True, address_col = True, email_col = False):
    """Build `num` rows of fake data; the boolean flags toggle which columns appear.

    Returns a list of dicts, one per row, each holding fresh Faker values.
    """
    # NOTE(review): relies on a module-level `fake = Faker()` instance that is
    # not created in this snippet — confirm it exists before calling.
    output = []
    for x in range(num):
        out = {}
        if name_col:
            out["name"] = fake.name()
        if address_col:
            out["address"] = fake.address()
        if email_col:
            out["email"] = fake.email()
        output.append(out)
    return output
but I want to remove the multiple if statements inside the for loop. What is the best method to improve this?
You can use **kwargs and a dictionary:
def create_rows_faker(num=1, **kwargs):
    """Build `num` fake rows; enable columns by keyword, e.g. name=True, email=True.

    Fix: the original built one {name/address/email: value} dict up front, so
    every row shared the same fake values. Generating inside the loop gives
    each row fresh values, matching the behavior of the original if-based code.
    """
    output = []
    for _ in range(num):
        # getattr resolves each enabled keyword (name/address/email) to the
        # matching Faker method and calls it per row.
        row = {col: getattr(fake, col)() for col, wanted in kwargs.items() if wanted}
        output.append(row)
    return output
create_rows_faker(num=1, name=True, address=True, email=True)
Instead of taking the columns as separate arguments, use a list of column names. You can then loop over this list, and fill in out with the corresponding fakes, using getattr() to call the methods dynamically.
from copy import deepcopy
def create_rows_faker(num=1, columns=("name", "address", "email")):
    """Build `num` fake rows; `columns` names the Faker methods to call per row.

    Fix: the original `def create_rows_faker(num=1, columns)` is a SyntaxError
    (non-default parameter after a default one). Giving `columns` a sensible
    default keeps the parameter order and makes the definition valid.
    """
    output = []
    for _ in range(num):
        # getattr() resolves each column name to the matching Faker method.
        out = {col: getattr(fake, col)() for col in columns}
        output.append(out)
    return output
I'm not sure if this really is going to be any faster, because copying dictionaries takes at least as much time as doing the if statements, but you can create the dictionary once and then copy it into your output as needed.
def create_mock_rows(num: int = 1,
                     name_col: bool = True,
                     address_col: bool = True,
                     email_col: bool = True) -> list:
    """Generate one fake row and replicate it `num` times.

    Disabled columns are marked None in the template and filtered out, so every
    returned row contains only the enabled columns (all rows share the same
    fake values by design — the template is built once).
    """
    template = {
        "name": fake.name() if name_col else None,
        "address": fake.address() if address_col else None,
        "email": fake.email() if email_col else None,
    }
    enabled = {col: val for col, val in template.items() if val is not None}
    return [dict(enabled) for _ in range(num)]
Another option is to leverage **kwargs:
def create_mock_rows(num: int = 1, **kwargs) -> list:
    # NOTE(review): each kwarg appears to map an output column name to a Faker
    # method name, e.g. create_mock_rows(2, name="name", home="address") —
    # confirm intended usage. getattr raises AttributeError if the value is not
    # a Faker provider method, and all `num` rows share per-row fresh values.
    return [{k: getattr(fake, v)() for k, v in kwargs.items()} for _ in range(num)]
I admit I don't love this, though, because kwargs could be anything, and there is some chance of this just failing or giving you a weird result if improperly called.
Almost every language provides a switch statement:
switch(col.key) {
case "name":
col.val = fake.name();
break;
case "address":
col.val = fake.address();
break;
...
default:
throw new InvalidDataException();
As of version 3.10, Python added something similar: PEP 634 – Structural Pattern Matching.
match subject:
case <pattern_1>:
<action_1>
case <pattern_2>:
<action_2>
case <pattern_3>:
<action_3>
case _:
<action_wildcard>
Related
I am trying to populate a python dict from a list of paths (the purpose is to create a ttk.treview) :
paths = ["\\hihi", "\\hihi\\hoho\\huhu", "\\hihi\\hoho\\haha", "\\haha", "\\huhu"]
and I want to create this dictionary (JSON-serialized here):
{
"haha": {},
"hihi": {
"hoho": {
"huhu": 0
},
"huhu": {
"haha": 0
}
},
"huhu": {}
}
What is the best way to do this? I tried with a for loop (recursive loop?), with a dict comprehension and with dpath but I have no valid result.
The beginning of my code:
split = paths.split("\\")
del split[0]
dict = {}
?
Thank you very much in advance
You could use defaultdict for this:
def make_empty_default_dict():
    """Return an autovivifying dict: any missing key creates another level."""
    nested = defaultdict(make_empty_default_dict)
    return nested
Define how you add a path:
def add_path(pth, dct):
    """Descend `dct` along the parts of `pth`, returning the node reached.

    With a recursive defaultdict, missing levels are created on access.
    """
    node = dct
    for part in pth:
        node = node[part]
    return node
Then populate your default dict with keys:
d = make_empty_default_dict()
for path in paths:
    # Fix: don't rebind d — add_path returns the *leaf* node it reached, so the
    # original `d = add_path(...)` threw away the tree root after the first path.
    # NOTE(review): path.split("\\") starts with an empty segment for a leading
    # backslash, so the tree gains a "" root key — confirm whether that's intended.
    add_path(path.split("\\"), d)
I have an alternative to the recursive solution. For each path:
put a cursor at the root of the target dict
search sequence: move the cursor forth until you find a 0 or a missing part of the path
build sequence: add an empty dict and move the cursor on that dict until you hit the last part.
the last part needs a special handling for the 0.
Here's the code:
def build_paths(paths, d=None):
    """Build a nested dict from backslash-separated paths; leaves become 0.

    Fix: the original used a mutable default argument (`d={}`), so results
    leaked from one call into the next. `d` now defaults to a fresh dict.
    """
    if d is None:
        d = {}
    for path in paths:
        parts = path.split("\\")[1:]  # drop the empty segment before the leading backslash
        cursor = d
        search = True
        for part in parts[:-1]:
            if search:
                if part not in cursor or not cursor[part]:  # missing, or a 0 leaf
                    cursor[part] = {}  # replace with a fresh dict
                    search = False  # from here on we are building new levels
            else:
                cursor[part] = {}
            cursor = cursor[part]  # advance one level deeper in the dict
        cursor[parts[-1]] = 0  # close with a 0 leaf
    return d
It's faster than the recursive version by @xtofl, but not that fast. With timeit:
iterative: 6.169872568580601
recursive: 17.209112331781498
You can use recursion with itertools.groupby:
import itertools
paths = ["\\hihi", "\\hihi\\hoho\\huhu", "\\hihi\\hoho\\haha", "\\haha", "\\huhu"]
new_paths = [list(filter(None, i.split('\\'))) for i in paths]
def update_output(f):
    """Decorator: convert top-level 0 leaves of the result into empty dicts."""
    def wrapper(_d):
        result = f(_d)
        # At level 0 a 0-leaf becomes {}; deeper levels keep 0; dicts recurse.
        final = lambda x, level=0: {a: {} if not level and not b else b if not b else final(b, level + 1) for a, b in x.items()}
        return final(result)
    return wrapper

# Fix: the decorator marker was mangled to a comment ("#update_output"), which
# left the decorator unused and made the documented output unreachable.
@update_output
def full_directory(data):
    """Build the nested tree from pre-split path lists using itertools.groupby."""
    def files(d):
        return {a: (lambda x: 0 if len(x) == 1 else files([i[1:] for i in filter(lambda y: len(y) != 1 or y[0] != a, x)]))(list(b)) for a, b in itertools.groupby(sorted(d, key=lambda x: x[0]), key=lambda x: x[0])}
    return files(data)
print(full_directory(new_paths))
Output:
{'haha': {}, 'hihi': {'hoho': {'haha': 0, 'huhu': 0}}, 'huhu': {}}
I found this : http://connor-johnson.com/2015/02/28/generate-a-tree-structure-in-python/
It works very well! So the code :
def add(t, path):
    """Walk `path` down tree `t`; the defaultdict creates missing nodes."""
    for node in path:
        t = t[node]

# A Tree is an autovivifying nested defaultdict.
Tree = lambda: defaultdict(Tree)
t = Tree()
paths = ["\\hihi", "\\hihi\\hoho\\huhu", "\\hihi\\hoho\\haha", "\\haha", "\\huhu"]
for path in paths:
    split = path.split("\\")
    del split[0]  # drop the empty segment before the leading backslash
    # Fix: add the whole path once — the original called add(t, split) inside
    # `for elt in split`, redundantly re-adding the same path len(split) times.
    add(t, split)

dicts = lambda t: { k:dicts(t[k]) for k in t }
print(json.dumps(dicts(t), indent=4))
Given that you have an empty dictionary
data = {}
I have a path and a value
path = "root.sub.item"
value = 12
How could I recursively add objects that do not exist?
def add_value(path, value):
    """(Question code) Intended to insert `value` into the global `data` at dotted `path`."""
    # NOTE(review): incomplete as posted — it only creates top-level keys in
    # `data`, never descends a level, and never assigns `value`. See the
    # corrected versions in the answers below.
    for part in path.split('.'):
        if not part in data:
            data[part] = {}
The expected output for this would be:
data = {
'root':{
'sub':{
'item': 12
}
}
}
Could somebody help out with this or point me in the right direction?
I'm using Python 3.6.
You can use some another kind of solution like recursive defaultdict, as in this answer.
A quick and stupid example about how it can used:
from collections import defaultdict
def func(rdict, path, value):
    """Set `value` at the dotted `path` inside the recursive-defaultdict `rdict`."""
    parts = path.split('.')
    node = rdict[parts[0]]  # enter the top level first
    for segment in parts[1:-1]:
        node = node[segment]  # intermediate levels (autovivified by defaultdict)
    node[parts[-1]] = value
nested_dict = lambda: defaultdict(nested_dict)
result = nested_dict()
func(result, 'root.sub.item', 12)
func(result, 'root.moon.value', 1)
assert result['root']['sub']['item'] == 12
assert result['root']['moon']['value'] == 1
assert result['root']['moon']['noop'] != 0
You're almost there, you just need to keep track of how far you are into the tree structure, and a way to know when you're on the last element of the path:
def add_value(path, value):
    """Create nested dicts in the global `data` along dotted `path`; set the leaf to `value`."""
    node = data
    parts = path.split('.')
    # Walk every segment except the last, creating missing levels as we go.
    for part in parts[:-1]:
        if part not in node:
            node[part] = {}
        node = node[part]
    # The final segment holds the actual value.
    node[parts[-1]] = value
you can try Raymond Hettinger recipe :
source: https://twitter.com/raymondh/status/343823801278140417
from collections import defaultdict

def infinity_dict():
    """An autovivifying dict: every missing key creates another infinity_dict level."""
    # PEP 8 fix: use a def instead of assigning a lambda to a name.
    return defaultdict(infinity_dict)

d = infinity_dict()
d['root']['sub']['item'] = 12
I've been working on this for too long and need some help.
I'm trying to create a dictionary using faker. If it were only that simple.
Initially the dictionary is flat. A key and item. If the first letter of the key is 'B' or 'M' it will then turn that string, into a dictionary with 5 keys and keep doing that until it finds none starting with either of those two letters. I know, there's no recursion happening now. That's why I need help. I'm trying to figure out how to properly recurse rather than hard code the depth.
Starting Dictionary:
{
"Marcia": "https://www.skinner.biz/categories/tags/terms.htm",
"Nicholas": "https://scott-tran.com/",
"Christopher": "https://www.ellis.com/",
"Paul": "https://lopez.com/index/",
"Jennifer": "https://www.sosa.com/wp-content/main/login.php"
}
Marcia should expand to this...
Example:
"Marcia": {
"Alexander": "http://hicks.net/home.html",
"Barry": {
"Jared": "https://www.parker-robinson.com/faq.html",
"Eddie": "https://www.smith-thomas.com/",
"Ryan": "https://www.phillips.org/homepage/",
"Mary": {
"Alex": "http://www.perry.com/tags/explore/post.htm",
"Joseph": "https://www.hansen.com/main/list/list/index/",
"Alicia": "https://www.tran.biz/wp-content/explore/posts/",
"Anna": "http://lee-mclaughlin.biz/search/login/",
"Kevin": "https://blake.net/main/index/"
}
"Evan": "http://carroll.com/homepage.html"
}
"Sharon": "https://www.watson.org/categories/app/login/",
"Hayley": "https://www.parks.com/",
"William": "https://www.wyatt-ware.com/"
}
My code is more manual than dynamic in that I must explicitly know now many levels deep the dictionary goes rather than dynamically figuring it out.
Here's what I have that works to the depth of 2 levels but I want to to find any key starting with 'B' or 'M' and acting on it.
import json
from build_a_dictionary import add_dic
from faker import Faker
dic = add_dic(10)
dic1 = {}
dic2 = {}
def build_dic(dic_len):
    """(Question code) Build a two-level dict of fake data, expanding keys starting with 'B'/'M'."""
    dic1 = {}
    fake = Faker()
    # NOTE(review): dic1 was just bound to {} above, so this check is always true.
    if len(dic1) == 0:
        dic1 = add_dic(dic_len)
    print(json.dumps(dic1, indent=4))
    for k, v in dic1.items():
        # Mutates the module-level dic2 — confirm that's intended.
        dic2[k] = add_dic(dic_len)
        for key in dic2[k].keys():
            # NOTE(review): this iterates every *character* of the key, so the
            # 'B'/'M' test is not limited to the first letter and may replace
            # the same entry multiple times.
            for f in key:
                if f == 'B' or f == 'M':
                    dic2[k][key] = add_dic(dic_len)
    return dic2
Here is the code from add_dic() I wrote:
import string, time
from faker import Faker #had to install with pip
fake = Faker()
dic = {}
dics = {}
key = ""
def add_dic(x):
    """Return a dict of `x` unique fake first names mapped to fake URIs."""
    dic = {}
    if x > 690:
        # Faker's English first-name pool is finite; a larger request would
        # loop forever waiting for new unique names.
        import sys  # local import fix: `sys` was used but never imported
        print("Please select a value under 690")
        sys.exit()
    while len(dic) < x:
        key = fake.first_name()
        if key in dic:
            # Fix: skip the duplicate and retry. The original `break` abandoned
            # the loop on the first duplicate, which (combined with a redundant
            # outer `for` loop) could return fewer than x entries.
            continue
        dic[key] = fake.uri()
    return dic
You're just doing it wrong, if you want it to be recursive, write the function as a recursive function. It's essentially a custom (recursive) map function for a dictionary. As for your expected dictionary, I'm not sure how you'd ever get Faker to deterministically give you that same output every time. It's random...
Note: There is nothing "dynamic" about this, it's just a recursive map function.
from faker import Faker
import pprint
pp = pprint.PrettyPrinter(indent=4)
fake = Faker()
def map_val(key, val):
    """Recursively expand entries whose key starts with 'M' or 'B' into 5 fresh fake entries."""
    if key[0] != 'M' and key[0] != 'B':
        return val  # ordinary entry: keep the value as-is
    children = [(fake.first_name(), fake.uri()) for _ in range(5)]
    return {child: map_val(child, uri) for child, uri in children}
#uncomment below to generate 5 initial names
#names = [(fake.first_name(), fake.uri()) for i in range(5)]
#initial_dict = {k : v for k,v in names}
initial_dict = {
"Marcia": "https://www.skinner.biz/categories/tags/terms.htm",
"Nicholas": "https://scott-tran.com/",
"Christopher": "https://www.ellis.com/",
"Paul": "https://lopez.com/index/",
"Jennifer": "https://www.sosa.com/wp-content/main/login.php"
}
dict_2 = {k : map_val(k,v) for k,v in initial_dict.items()}
pp.pprint(dict_2)
Output:
rpg711$ python nested_dicts.py
{ 'Christopher': 'https://www.ellis.com/',
'Jennifer': 'https://www.sosa.com/wp-content/main/login.php',
'Marcia': { 'Chelsea': 'http://francis.org/category.jsp',
'Heather': 'http://www.rodgers.com/privacy.jsp',
'Jaime': 'https://bates-molina.com/register/',
'John': 'http://www.doyle.com/author.htm',
'Kimberly': 'https://www.harris.org/homepage/'},
'Nicholas': 'https://scott-tran.com/',
'Paul': 'https://lopez.com/index/'
}
Thank you all for your help. I've managed to figure it out.
It now builds a dynamic dictionary or dynamic json for whatever need.
import sys, json
from faker import Faker
fake = Faker()
def build_dic(dic_len, dic):
    """Replace the value of every key starting with 'B' or 'M' by a dict of 5 fresh fake entries.

    Accepts a dict or a list/tuple of (key, value) pairs; returns the dict
    (mutated in place when a dict was passed).
    """
    if isinstance(dic, (list, tuple)):
        dic = dict(dic)
    if isinstance(dic, dict):
        # Single pass fix: the original wrapped this in
        # `for counter in range(len(dic))`, redoing the (randomized)
        # replacement len(dic) times for no benefit.
        # NOTE(review): dic_len is unused — the original hard-codes 5 entries
        # per expansion; kept for interface compatibility.
        for k in list(dic):
            if k[0] == 'B' or k[0] == 'M':
                dic[k] = dict((fake.first_name(), fake.uri()) for i in range(5))
    return dic
def walk(dic):
    """Depth-first pass: expand any nested dict values in place, then recurse into them."""
    for value in dic.values():
        if isinstance(value, dict):
            build_dic(5, value)
            walk(value)
    return dic
a = build_dic(10, ([(fake.first_name(), fake.uri()) for i in range(10)]))
walk(a)
print(json.dumps(a, indent=4))
Recursion is when a function calls itself; when designing a recursive function, it's important to have an exit condition in mind (i.e. when will the recursion stop).
Let's consider a contrived example to increment a number until it reaches a certain value:
def increment_until_equal_to_or_greater_than_value(item, target):
    """Recursively increment `item` until it reaches `target`; return the final value."""
    # Py3 fix: print statements converted to print() calls with identical output.
    print('item is', item, end=' ')
    if item < target:
        print('incrementing')
        item += 1
        # Fix: propagate the recursive result — the original dropped it, so the
        # top-level call always returned None.
        return increment_until_equal_to_or_greater_than_value(item, target)
    else:
        print('returning')
        return item
increment_until_equal_to_or_greater_than_value(1, 10)
And the output
item is 1 incrementing
item is 2 incrementing
item is 3 incrementing
item is 4 incrementing
item is 5 incrementing
item is 6 incrementing
item is 7 incrementing
item is 8 incrementing
item is 9 incrementing
item is 10 returning
You can see we've defined our recursive part in the if statement and the exit condition in the else.
I've put together a snippet that shows a recursive function on a nested data structure.
It doesn't solve exactly your issue, this way you can learn by dissecting it and making it fit for your use case.
# our recursive method
def deep_do_something_if_string(source, something):
    """Recursively walk nested dicts/lists/tuples/sets and apply `something` to each leaf value."""
    # if source is a dict, iterate through its values
    if isinstance(source, dict):
        # Py3 fix: .itervalues() is Python 2 only; .values() behaves the same here.
        for v in source.values():
            deep_do_something_if_string(v, something)
    # if source is a list, tuple or set, iterate through its items
    elif isinstance(source, (list, tuple, set)):
        for v in source:
            deep_do_something_if_string(v, something)
    # otherwise do something with the leaf value
    else:
        return something(source)
# a test something to do with the value
def print_it_out(value):
    """Example callback: print the leaf value (Py3 fix: print statement -> function)."""
    print(value)
# an example data structure
some_dict = {
'a': 'value a',
'b': [
{
'c': 'value c',
'd': 'value d',
},
],
'e': {
'f': 'value f',
'g': {
'h': {
'i': {
'j': 'value j'
}
}
}
}
}
deep_do_something_if_string(some_dict, print_it_out)
And the output
value a
value c
value d
value j
value f
I have a list of objects that need to be unpacked to a dictionary efficiently. There are more than 2,000,000 objects in the list. The operation takes more than 1.5 hours complete. I would like to know if this can be done more efficiently.
The objects in the list is based on this class.
class ResObj:
    """Container for one result: `loc` is the destination index, `res` the values dict."""
    def __init__(self, index, result):
        self.loc = index ### This is the location, where the values should go in the final result dictionary
        self.res = result ### This is a dictionary that has values for this location.
        # NOTE(review): the two lines below immediately overwrite the
        # constructor arguments with fixed demo values — presumably just for
        # the self-contained example; remove in real use.
        self.loc = 2
        self.res = {'value1':5.4, 'value2':2.3, 'valuen':{'sub_value1':4.5, 'sub_value2':3.4, 'sub_value3':7.6}}
Currently I use this method to perform this operation.
def make_final_result(list_of_results):
    """(Question code) Scatter ResObj items into a dict of numpy arrays keyed by variable name."""
    no_sub_result_variables = ['value1', 'value2']
    sub_result_variables = ['valuen']
    # NOTE(review): 'sub_value3' appears twice — probably meant 'sub_value2'.
    sub_value_variables = ['sub_value1', 'sub_value3', 'sub_value3']
    final_result = {}
    num_of_results = len(list_of_results)
    # Pre-allocate one numpy vector per (sub)variable.
    for var in no_sub_result_variables:
        final_result[var] = numpy.zeros(num_of_results)
    for var in sub_result_variables:
        final_result[var] = {sub_var:numpy.zeros(num_of_results) for sub_var in sub_value_variables}
    # Scatter each object's values into its location index.
    for obj in list_of_results:
        i = obj.loc
        result = obj.res
        for var in no_sub_result_variables:
            final_result[var][i] = result[var]
        for var in sub_result_variables:
            for name in sub_value_variables:
                try:
                    final_result[var][name][i] = result[var][name]
                except KeyError as e:
                    ##TODO Add a debug check
                    pass
    # NOTE(review): final_result is built but never returned as posted.
I have tried using multiprocessing.Manager().dict and Manager().Array() to use parallelism for this, however, I could only get 2 processes to work (even though, I manually set the processes to # of CPUs = 24).
Can you please help me to use a faster method to improve the performance.
Thank you.
Having nested numpy arrays doesn't seem the best way to structure your data. You can use numpy's structured arrays to create a more intuitive data structure.
import numpy as np
# example values
values = [
{
"v1": 0,
"v2": 1,
"vs": {
"x": 2,
"y": 3,
"z": 4,
}
},
{
"v1": 5,
"v2": 6,
"vs": {
"x": 7,
"y": 8,
"z": 9,
}
}
]
def value_to_record(value):
    """Take a dictionary and convert it to the tuple layout of the structured dtype."""
    nested = value["vs"]
    vs_tuple = (nested["x"], nested["y"], nested["z"])
    return value["v1"], value["v2"], vs_tuple
# define what a record looks like -- f8 is an 8-byte float
dtype = [
("v1", "f8"),
("v2", "f8"),
("vs", [
("x", "f8"),
("y", "f8"),
("z", "f8")
])
]
# create actual array
arr = np.fromiter(map(value_to_record, values), dtype=dtype, count=len(values))
# access individual record
print(arr[0]) # prints (0.0, 1.0, (2.0, 3.0, 4.0))
# access specific value
assert arr[0]['vs']['x'] == 2
# access all values of a specific field
print(arr['v2']) # prints [ 1. 6.]
assert arr['v2'].sum() == 7
Using this way of generating the data created a 2,000,000 long array in 2 seconds on my machine.
To make it work for your ResObj objects then sort them by the loc attribute, and then pass the res attribute to the value_to_record function.
You can distribute the work among processes by key names.
Here I create a pool of workers and pass to them var and optional subvar names.
The huge dataset is shared with workers using cheap fork.
Unpacker.unpack picks the specified vars from ResObj and returns them as an np.array
The main loop in make_final_result combines the arrays in final_result.
Py2:
from collections import defaultdict
from multiprocessing import Process, Pool
import numpy as np
class ResObj(object):
    """One result: `loc` is the destination index, `res` the values to scatter."""
    def __init__(self, index=None, result=None):
        self.loc = index ### This is the location, where the values should go in the final result dictionary
        self.res = result ### This is a dictionary that has values for this location.
        # NOTE(review): constructor args are immediately overwritten with fixed
        # demo values below — presumably for the self-contained example only.
        self.loc = 2
        self.res = {'value1':5.4, 'value2':2.3, 'valuen':{'sub_value1':4.5, 'sub_value2':3.4, 'sub_value3':7.6}}
class Unpacker(object):
    """Worker-side helper: holds the shared result list and extracts one column.

    Fix: the @classmethod decorators were mangled into '#classmethod' comments,
    which left these as plain instance methods and broke the Pool initializer.
    """
    @classmethod
    def cls_init(cls, list_of_results):
        # Called by Pool(initializer=...) in each worker after fork; the big
        # list is shared cheaply via copy-on-write.
        cls.list_of_results = list_of_results

    @classmethod
    def unpack(cls, var, name):
        """Return (var, name, numpy array of values for that key across all results)."""
        list_of_results = cls.list_of_results
        result = np.zeros(len(list_of_results))
        if name is None:
            # Top-level variable: res[var] is a scalar.
            for i, it in enumerate(list_of_results):
                result[i] = it.res[var]
        else:
            # Nested variable: res[var] is a dict keyed by sub-name.
            for i, it in enumerate(list_of_results):
                result[i] = it.res[var][name]
        return var, name, result
#Pool.map doesn't accept instancemethods so the use of a wrapper
def Unpacker_unpack(var_name):
    """Module-level wrapper so Pool.imap can pickle the callable."""
    # Py3 fix: the original used Py2-only tuple-parameter syntax
    # `def Unpacker_unpack((var, name),)`, a SyntaxError on Python 3.
    var, name = var_name
    return Unpacker.unpack(var, name)
def make_final_result(list_of_results):
    """Fan per-key extraction out to a worker pool and merge the returned arrays."""
    no_sub_result_variables = ['value1', 'value2']
    sub_result_variables = ['valuen']
    # NOTE(review): 'sub_value3' is listed twice — probably meant 'sub_value2'.
    sub_value_variables = ['sub_value1', 'sub_value3', 'sub_value3']
    # Workers inherit list_of_results via the initializer (cheap on fork).
    pool = Pool(initializer=Unpacker.cls_init, initargs=(list_of_results, ))
    final_result = defaultdict(dict)
    def key_generator():
        # Yield one (var, sub-name) task per output column; None means no sub-key.
        for var in no_sub_result_variables:
            yield var, None
        for var in sub_result_variables:
            for name in sub_value_variables:
                yield var, name
    # Merge each worker's (var, name, array) back into the result mapping.
    for var, name, result in pool.imap(Unpacker_unpack, key_generator()):
        if name is None:
            final_result[var] = result
        else:
            final_result[var][name] = result
    return final_result
if __name__ == '__main__':
    # Py3 fix: print() call and range() (the original used the Py2 `print`
    # statement and xrange); output is unchanged.
    print(make_final_result([ResObj() for x in range(10)]))
Ensure that you are not on Windows. It lacks fork and multiprocessing will have to pipe entire dataset to each of 24 worker processes.
Hope this will help.
Remove some indentation to make your loops non-nested:
for obj in list_of_results:
i = obj.loc
result = obj.res
for var in no_sub_result_variables:
final_result[var][i] = result[var]
for var in sub_result_variables:
for name in sub_value_variables:
try:
final_result[var][name][i] = result[var][name]
except KeyError as e:
##TODO Add a debug check
pass
I have a JSON object such that:
zt123
zt3653
zt777 ..etc.
I tried the following but think I am over-complicating this. Is there a simplified way?
def extract(dict_in, dict_out):
    """(Question code, Python 2) Flatten all string leaves of a nested dict into dict_out."""
    for key, value in dict_in.iteritems():  # NOTE(review): Py2-only; use .items() on Py3
        if isinstance(value, dict): # If value itself is dictionary
            extract(value, dict_out)
        elif isinstance(value, unicode):  # NOTE(review): Py2-only type; use str on Py3
            # Write to dict_out
            dict_out[key] = value
    return dict_out
The chosen answer on this StackOverFlow question may be of service to you:
What is the best (idiomatic) way to check the type of a Python variable?
these will always be nested in -->interfaces-->interface-->zt
If it's in a fixed position just call this position:
hosts1_xxxxxxx = {
    "line": {},
    "interfaces": {
        "interface": {
            "zt123": {},
            "zt456": {},
        },
    },
}
# Fix: the lookup key was misspelled "interace", which raises KeyError.
zts = list(hosts1_xxxxxxx["interfaces"]["interface"].keys())
print(zts)
# ["zt123", "zt456"]
Here's a general way of doing this (For any depth in the dict)-
# This function takes the dict and required prefix
# This function takes the dict and required prefix
def extract(d, prefix, res=None):
    """Return every key, at any depth of `d`, that starts with `prefix`."""
    if not res:
        res = []
    for key, val in d.items():  # Py3 fix: .iteritems() is Python 2 only
        if key.startswith(prefix):
            res.append(key)
        if type(val) == dict:
            # Pass a copy so recursion keeps the original accumulator intact.
            res = extract(val, prefix, res[:])
    return res

# Assume this to be a sample dictionary -
d = {"zt1": "1", "zt2":{"zt3":{"zt4":"2"}}}
res = extract(d, "zt")
print(res)  # Py3 fix: print is a function
# Outputs-
['zt1', 'zt2', 'zt3', 'zt4']
This basically iterates each and every key and uses the startswith function to find out if the key starts with zt