performance of calculations on large flattened dictionary with implied hierarchy - python

Given a dictionary structured like this:
{'guy1_arm_param1':23.0, 'guy1_arm_param2_low':2.0, 'guy1_arm_param2_high':3.0, 'guy1_arm_param3':20.0,
'guy1_leg_param1':40.0, 'guy1_leg_param2_low':2.0, 'guy1_leg_param2_high':3.0, 'guy1_leg_param3':20.0,
'guy2_arm_param1':23.0, 'guy2_arm_param2_low':2.0, 'guy2_arm_param2_high':3.0, 'guy2_arm_param3':20.0,
'guy2_leg_param1':40.0, 'guy2_leg_param2_low':2.0, 'guy2_leg_param2_high':3.0, 'guy2_leg_param3':20.0,
'another_guy_param1':3.0}
What is the most efficient way to write a function that goes through and pulls out the parameters for a given 'guy' and makes a calculation with them?
For example:
import numpy

def full_guy_function(given_dict, guy):
    d = unflatten(given_dict)
    guy_functions = list()
    guy_dict = {}
    for body_part in d[guy].keys():
        param1 = d[guy][body_part]['param1']
        param3 = d[guy][body_part]['param3']
        for k, v in d[guy][body_part]['param2'].iteritems():
            guy_functions.append(get_function_for_part(param1, v, param3))
    full_guy_function = sum(guy_functions)
    return full_guy_function

def get_function_for_part(param1, param2, param3):
    x = [x for x in range(0, 100)]
    x = numpy.array(x)
    return param3**(x*param1/param2)

# http://stackoverflow.com/questions/6037503/python-unflatten-dict
def unflatten(dictionary):
    resultDict = dict()
    for key, value in dictionary.iteritems():
        parts = key.split('_')
        d = resultDict
        for part in parts[:-1]:
            if part not in d:
                d[part] = dict()
            d = d[part]
        d[parts[-1]] = value
    return resultDict
I feel like looping through and building other dictionaries like this is horribly inefficient (this is a big main dictionary, and this function will be called every couple of milliseconds).
I tried doing this with objects, which was much easier to understand, but hierarchical objects cannot be read from processes other than the ones using and writing to them. So I am stuck building a massive flattened dictionary like this and calculating the results on both sides of the processes.
If this type of operation has to be done every couple of milliseconds on large sets of data, is it better to do with a database?
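For what it's worth, one pattern that avoids rebuilding nested dictionaries on every call (a sketch, not from any posted answer): if the key names don't change between calls, the guy/body-part grouping can be precomputed once, so each call is only dictionary lookups plus the numpy math. The helper names build_index and full_guy_function_fast below are hypothetical:
from collections import defaultdict
import numpy

def build_index(flat_keys):
    # One-time pass: guy -> body part -> {'param1': key, 'param3': key, 'param2': [keys]}
    # (hypothetical helper, assuming the guyN_part_paramN[_low/_high] naming from the question)
    index = defaultdict(lambda: defaultdict(lambda: {'param2': []}))
    for key in flat_keys:
        parts = key.split('_')
        if len(parts) == 3:                     # e.g. 'guy1_arm_param1'
            guy, body_part, param = parts
            index[guy][body_part][param] = key
        elif len(parts) == 4:                   # e.g. 'guy1_arm_param2_low'
            guy, body_part, param, _ = parts
            index[guy][body_part][param].append(key)
    return index

def full_guy_function_fast(flat_dict, guy, index):
    # Per call: plain lookups into the flat dict plus the numpy math; no nested dicts are rebuilt.
    x = numpy.arange(100)
    total = numpy.zeros(100)
    for keys in index[guy].values():
        param1 = flat_dict[keys['param1']]
        param3 = flat_dict[keys['param3']]
        for p2_key in keys['param2']:
            total += param3 ** (x * param1 / flat_dict[p2_key])
    return total
You would build the index once (e.g. index = build_index(given_dict.keys())) and reuse it for every call; whether that beats a database depends mainly on how the dictionary is shared between processes.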

Better late than never...
I suggest using python-benedict; it is open source on GitHub (I am the author).
Installation: pip install python-benedict
Just test what the unflattened dict will look like:
from benedict import benedict

data = benedict({
    'guy1_arm_param1':23.0, 'guy1_arm_param2_low':2.0, 'guy1_arm_param2_high':3.0, 'guy1_arm_param3':20.0,
    'guy1_leg_param1':40.0, 'guy1_leg_param2_low':2.0, 'guy1_leg_param2_high':3.0, 'guy1_leg_param3':20.0,
    'guy2_arm_param1':23.0, 'guy2_arm_param2_low':2.0, 'guy2_arm_param2_high':3.0, 'guy2_arm_param3':20.0,
    'guy2_leg_param1':40.0, 'guy2_leg_param2_low':2.0, 'guy2_leg_param2_high':3.0, 'guy2_leg_param3':20.0,
    'another_guy_param1':3.0,
})
data_unflatten = data.unflatten()
print(data_unflatten.dump())
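For reference, with benedict's default '_' key separator the unflattened structure should come out roughly like this (illustrative, not captured output):
expected = {
    'guy1': {
        'arm': {'param1': 23.0, 'param2': {'low': 2.0, 'high': 3.0}, 'param3': 20.0},
        'leg': {'param1': 40.0, 'param2': {'low': 2.0, 'high': 3.0}, 'param3': 20.0},
    },
    'guy2': {
        'arm': {'param1': 23.0, 'param2': {'low': 2.0, 'high': 3.0}, 'param3': 20.0},
        'leg': {'param1': 40.0, 'param2': {'low': 2.0, 'high': 3.0}, 'param3': 20.0},
    },
    'another': {'guy': {'param1': 3.0}},
}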
Your code would then become:
from benedict import benedict
import numpy

def full_guy_function(given_dict, guy):
    b = benedict(given_dict)
    d = b.unflatten()
    guy_functions = []
    for body_part, part_params in d[guy].items():
        param1 = part_params['param1']
        param3 = part_params['param3']
        for k, v in part_params['param2'].items():
            guy_functions.append(get_function_for_part(param1, v, param3))
    return sum(guy_functions)

def get_function_for_part(param1, param2, param3):
    x = numpy.array(range(0, 100))
    return param3**(x*param1/param2)

Related

Automatically pass variables names to list

I have a long chain of code for a portion of a script. For example:
B1_2013 = images.select('0_B1')
B2_2013 = images.select('0_B2')
B3_2013 = images.select('0_B3')
B4_2013 = images.select('0_B4')
B5_2013 = images.select('0_B5')
B6_2013 = images.select('0_B6')
B7_2013 = images.select('0_B7')
B8_2013 = images.select('0_B8')
B9_2013 = images.select('0_B9')
B10_2013 = images.select('0_B10')
B11_2013 = images.select('0_B11')
B1_2014 = images.select('1_B1')
B2_2014 = images.select('1_B2')
B3_2014 = images.select('1_B3')
B4_2014 = images.select('1_B4')
B5_2014 = images.select('1_B5')
B6_2014 = images.select('1_B6')
B7_2014 = images.select('1_B7')
B8_2014 = images.select('1_B8')
B9_2014 = images.select('1_B9')
B10_2014 = images.select('1_B10')
B11_2014 = images.select('1_B11')
and so on ...
B11_2020 = images.select('7_B11')
Ultimately, from these lines of code, I need to generate a list of variables (my_variables) that I can pass off to a function. For example:
my_variables = [B1_2013, B2_2013, B3_2013, B4_2013, B5_2013, B6_2013, B7_2013, B8_2013, B9_2013, B10_2013, B11_2013, \
                B1_2014, B2_2014, B3_2014, B4_2014, B5_2014, B6_2014, B7_2014, B8_2014, B9_2014, B10_2014, B11_2014]
Is there a more efficient approach that avoids hand-writing hundreds of lines like B1_2013 = images.select('0_B1'), so that I can automatically generate the list of variables in the second code example (e.g. my_variables = [B1_2013, ...])?
Just make a list using a loop or list comprehension.
my_images = [images.select(f'{i}_B{j}') for i in range(8) for j in range(1, 12)]
In this use case it is better to use a dict to store the "variables"; it is not good practice to dynamically create variable names. Below is an example using itertools.product that builds a dict with the desired output:
from itertools import product

images = {f'B{i}_{y + 2013}': images.select(f'{y}_B{i}')
          for y, i in product(range(8), range(1, 12))}
Result:
{'B1_2013': '0_B1',
 'B2_2013': '0_B2',
 'B3_2013': '0_B3',
 'B4_2013': '0_B4',
 'B5_2013': '0_B5',
 'B6_2013': '0_B6',
 'B7_2013': '0_B7',
 'B8_2013': '0_B8',
 'B9_2013': '0_B9',
 'B10_2013': '0_B10',
 'B11_2013': '0_B11',
 'B1_2014': '1_B1',
 'B2_2014': '1_B2',
 'B3_2014': '1_B3',
 ...
}
Then to access the desired "variable" use:
>>> images['B3_2014']
'1_B3'
@Barmar's answer is correct. You can extend his answer if you want to index the variables by key, as follows:
my_images = {f'{i}_B{j}': images.select(f'{i}_B{j}') for i in range(8) for j in range(1, 12)}
This is called a dictionary comprehension.
Or a dictionary comprehension keyed by band and year:
my_images = {f'B{j}_{2013 + i}': images.select(f'{i}_B{j}') for i in range(8) for j in range(1, 12)}
Refer to them with:
my_images['B1_2013']
my_images['B2_2013']
...

SQL Dictionary Appending for Large Datasets in a Loop in cx_Oracle

I am trying to append dictionaries together and then use "from_dict" to get the final returned data from cx_Oracle, as I heard that is more efficient than appending each returned row from SQL. However, my loop still takes a very long time (the full loop returns a VERY large dataset; each iteration gets data for an I.D., which returns ~12,000 rows per I.D., and there are over 700 I.D.s in the loop). How do I take advantage of "from_dict" so this speeds up? I don't think the code as written is the most efficient way to do this. Any suggestions? Thanks.
Is there a more efficient way? Using concat and not append?
for iteration, c in enumerate(l, start = 1):
    total = len(l)
    data['SP_ID'] = c
    data['BEGIN_DATE'] = BEGIN_DATE
    print("Getting consumption data for service point I.D.:", c, " ---->", iteration, "of", total)
    cursor.arraysize = 1000000
    cursor.prefetchrows = 2
    cursor.execute(sql, data)
    cursor.rowfactory = lambda *args: dict(zip([d[0] for d in cursor.description], args))
    df_row = cursor.fetchall()
    if len(df_row) == 0:
        pass
    else:
        # Here is where I combine dictionaries, but this only covers one dataset pulled from SQL.
        # I want to combine all the dictionaries from each loop iteration to increase efficiency.
        a = {k: [d[k] for d in df_row] for k in df_row[0]}
        AMI_data = pd.DataFrame.from_dict(a)
        #AMI.append(AMI_data)
        #final_AMI_data = pd.concat(AMI)
        # final_data.dropna(inplace = True)
# UPDATED
final_AMI_data = pd.DataFrame()
for iteration, c in enumerate(l, start = 1):
    total = len(l)
    data['SP_ID'] = c
    data['BEGIN_DATE'] = BEGIN_DATE
    print("Getting consumption data for service point I.D.:", c, " ---->", iteration, "of", total)
    cursor.arraysize = 1000000
    cursor.prefetchrows = 2
    cursor.execute(sql, data)
    cursor.rowfactory = lambda *args: dict(zip([d[0] for d in cursor.description], args))
    df_row = cursor.fetchall()
    if len(df_row) == 0:
        pass
    else:
        AMI_data = pd.DataFrame.from_records(df_row)
        final_AMI_data.append(AMI_data, ignore_index = False)
        # final_data.dropna(inplace = True)
You shouldn't need to re-create your dictionary if you've already set a dict-style cursor row factory. (Btw, see this answer for how to make a better one.)
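For reference, a minimal row factory along those lines might look like this (a sketch, assuming the cursor has just executed the query; the linked answer has a fuller version):
# Build the column-name list once, then reuse it for every fetched row.
columns = [col[0] for col in cursor.description]
cursor.rowfactory = lambda *args: dict(zip(columns, args))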
Assuming your df_rows looks like this after fetching all rows, with 'X' and 'Y' being example column names for the query-result:
[{'X': 'xval1', 'Y': 'yval1'},
{'X': 'xval2', 'Y': 'yval2'},
{'X': 'xval3', 'Y': 'yval3'}]
1. Then use .from_records() to create your dataframe:
pd.DataFrame.from_records(df_rows)
Output:
X Y
0 xval1 yval1
1 xval2 yval2
2 xval3 yval3
That way, you don't need to restructure your results to use with from_dict().
2. And if you want to keep adding each group of 12,000 results to the same DataFrame, use DataFrame.append() with ignore_index=True to keep adding each new group of results to the existing dataframe.
It's better to just append into your dataframe instead of building a bigger and bigger dictionary just to finally create one df.
In case it wasn't clear, remove these two lines in your else:
a = {k: [d[k] for d in df_row] for k in df_row[0]}
AMI_data = pd.DataFrame.from_dict(a)
and replace them with just:
AMI_data = pd.DataFrame.from_records(df_row)
# and then add it to your final frame (append returns a new DataFrame, so assign the result):
final_AMI_data = final_AMI_data.append(AMI_data, ignore_index=True)
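Since the question also mentions concat: an often faster pattern is to collect the per-I.D. frames in a list and concatenate once after the loop. A rough sketch, reusing the same loop variables as above (l, data, sql, cursor, BEGIN_DATE), not a drop-in tested version:
frames = []
for c in l:
    data['SP_ID'] = c
    data['BEGIN_DATE'] = BEGIN_DATE
    cursor.execute(sql, data)
    # build the column list once per query, then reuse it for every row
    columns = [col[0] for col in cursor.description]
    cursor.rowfactory = lambda *args: dict(zip(columns, args))
    rows = cursor.fetchall()
    if rows:
        frames.append(pd.DataFrame.from_records(rows))

# a single concat at the end avoids repeatedly copying the growing DataFrame
final_AMI_data = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()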

Why data = copy.deepcopy(G) matters in Karger min cut algorithm?

Here is the code to compute the min cut, posted at "karger min cut algorithm in python 2.7". Without data = copy.deepcopy(G), it does a poor job of finding the min cut. Can anybody explain why? Thanks.
import random, copy

data = open("***.txt","r")
G = {}
for line in data:
    lst = [int(s) for s in line.split()]
    G[lst[0]] = lst[1:]

def choose_random_key(G):
    v1 = random.choice(list(G.keys()))
    v2 = random.choice(list(G[v1]))
    return v1, v2

def karger(G):
    length = []
    while len(G) > 2:
        v1, v2 = choose_random_key(G)
        G[v1].extend(G[v2])
        for x in G[v2]:
            G[x].remove(v2)
            G[x].append(v1)
        while v1 in G[v1]:
            G[v1].remove(v1)
        del G[v2]
    for key in G.keys():
        length.append(len(G[key]))
    return length[0]

def operation(n):
    i = 0
    count = 10000
    while i < n:
        data = copy.deepcopy(G)
        min_cut = karger(data)
        if min_cut < count:
            count = min_cut
        i = i + 1
    return count

print(operation(100))
With data = G, no duplicate of the dictionary is made, so karger(data) works directly on the original G. Because karger mutates data, it also mutates G, and every subsequent karger(data) call then starts from an already-contracted graph with different values instead of the original one.
Remove the deepcopy and add print(data) before karger(data) and you will see different values in data on every iteration.
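A tiny illustration of the difference between aliasing and deep-copying a nested structure (not from the original post):
import copy

G = {1: [2, 3], 2: [1], 3: [1]}

alias = G                       # same object, nothing is copied
alias[1].append(99)
print(G[1])                     # [2, 3, 99] -- mutating the alias changed the original

independent = copy.deepcopy(G)  # fully independent copy, nested lists included
independent[2].append(42)
print(G[2])                     # [1] -- the original is untouched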

Re-order numpy array based on where its associated ids are positioned in the `master_order` array

I am looking for a function that makes a new array of values based on ordered_ids, where the arrays have a length of one million.
Input:
>>> ids=array(["WYOMING01","TEXAS01","TEXAS02",...])
>>> values=array([12,20,30,...])
>>> ordered_ids=array(["TEXAS01","TEXAS02","ALABAMA01",...])
Output:
ordered [ 20 , 30 , nan , ...]
Closing Summary
@Dietrich's use of a dictionary in a list comprehension is about 10x faster than using a numpy index search (numpy.where). I compared the times of the three approaches in my answer below.
You could try:
import numpy as np

def order_array(ids, values, master_order_ids):
    n = len(master_order_ids)
    idx = np.searchsorted(master_order_ids, ids)
    ordered_values = np.zeros(n)
    ordered_values[idx < n] = values[idx < n]
    print "ordered", ordered_values
    return ordered_values
searchsorted gives you the indices at which you would insert ids into master_order_ids to keep the array ordered. Then you just drop those (idx, values) that are out of the range of master_order_ids.
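For intuition, a quick standalone demonstration of what np.searchsorted returns (illustrative, not from the original answer):
import numpy as np

master = np.array(["ALABAMA01", "TEXAS01", "TEXAS02"])   # must already be sorted
ids = np.array(["TEXAS01", "WYOMING01"])
print(np.searchsorted(master, ids))                      # [1 3] -- 3 is past the end, i.e. not found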
You could try using a dict() to associate the strings with your numbers. It simplifies the code considerably:
import numpy as np

def order_bydict(ids, values, master_order_ids):
    """ Using a dict to order ``master_order_ids`` """
    dd = dict([(k, v) for k, v in zip(ids, values)])            # create the dict
    ordered_values = [dd.get(m, 0) for m in master_order_ids]   # get() returns 0 if key not found
    return np.asarray(ordered_values)                           # return a numpy array instead of a list
The speedwise improvement is hard to predict without testing longer arrays (with your example it was 25% faster based on %timeit).
import numpy
from numpy import copy, random, arange
import time
# SETUP
N=10**4
ids = arange(0,N).astype(str)
values = arange(0,N)
numpy.random.shuffle(ids)
numpy.random.shuffle(values)
ordered_ids=arange(0,N).astype(str)
ordered_values = numpy.empty((N,1))
ordered_values[:] = numpy.NAN
# METHOD 1
start = time.clock()
for i in range(len(values)):ordered_values[ordered_ids==ids[i]]=values[i]
print "not using dictionary:", time.clock() - start
# METHOD 2
start = time.clock()
d = dict(zip(ids, values))
for k, v in d.iteritems(): ordered_values[ordered_ids==k] = v
print "using dictionary:", time.clock() - start
# METHOD 3: @Dietrich's approach in the answer above
start = time.clock()
dd = dict(zip(ids, values))
ordered_values = [dd.get(m, 0) for m in ordered_ids]
print "using dictionary with list comprehension:", time.clock() - start
Results
not using dictionary: 1.320237 # Method 1
using dictionary: 1.327119 # Method 2
using dictionary with list comprehension: 0.013287 # @Dietrich
The following solution using the numpy_indexed package (disclaimer: I am its author) is purely vectorized, and likely to be much more efficient than the solutions posted thus far:
import numpy_indexed as npi
idx = npi.indices(ids, ordered_ids, missing='mask')
new_values = values[idx]
new_values[idx.mask] = -1 # or cast to float and set to nan, but you get the idea...

fetching values from dictionary. Range match on keys

I created a dictionary myDict holding 10 million entries in the following form. Each entry in the dictionary represents {(id, age): code}:
>>> myDict = {('1039', '68.0864'): '42731,42781,V4501',
              ('1039', '68.1704'): '4770,4778,V071',
              ('0845', '60.4476'): '2724,27800,4019',
              ('0983', '63.3936'): '41401,4168,4240,V1582,V7281'
              }
A constant ageOffset is defined with value = 0.1
Given an (id,age) tuple, how can I fetch all values from myDict which have key (id, X) where:
age <= X <= age+ageOffset
I need to perform this fetch operation 20 billion times.
Examples:
1.
myTup = ('1039', '68.0')
the answer is: '42731,42781,V4501'
2.
myTup = ('0845', '60.0')
Ans : No value returned
Edit:
Can I create a sub-dictionary based on a partial match on the first element of the key? I mean, if the first element of the tuple key matches, create a sub-dictionary; according to my data, this won't hold more than a couple of hundred entries. Then perform a linear range search, comparing the second element of the tuple key, to find the corresponding values.
To do this operation 20 billion(!) times, you will have to preprocess your data a bit.
First, I would group by id:
def preprocess(data):
    from collections import defaultdict # Python 2.5+ only
    preprocessed = defaultdict(list)
    # group by id
    for (id, age), value in data.iteritems():
        preprocessed[id].append((float(age), value))
    # sort lists for binary search, see edit
    for key, value in preprocessed.iteritems():
        value.sort()
    return preprocessed
Result should look like this:
>>> preprocess(myDict)
defaultdict(<type 'list'>, {
    '0845': [(60.4476, '2724,27800,4019')],
    '0983': [(63.3936, '41401,4168,4240,V1582,V7281')],
    '1039': [(68.0864, '42731,42781,V4501'), (68.1704, '4770,4778,V071')]})
If relatively few items share the same id, thus resulting in short lists, you might get away with filtering the list.
def lookup(data, id, age, age_offset=0.1):
    if id in data:
        return [value for x, value in data[id] if age <= x <= age+age_offset]
    else:
        return None

>>> lookup(preprocessed, '1039', 68.0) # Note that I use floats for age
['42731,42781,V4501']
However, if many items share the same id, you will have to traverse long lists, making the lookup relatively slow. In this case, you will have to apply further optimizations.
Edit: as suggested by @Andrey Petrov
from bisect import bisect_left
from itertools import islice, takewhile

def optimized_lookup(data, id, age, age_offset=0.1):
    if id in data:
        l = data[id]
        # bisect against a 1-tuple so the comparison uses the float ages in the (age, value) tuples
        idx = bisect_left(l, (age,))
        return [v for a, v in takewhile(lambda item: item[0] <= age + age_offset,
                                        islice(l, idx, None))]
    else:
        return None
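A quick check against the sample data from the question, using the preprocess() function above (illustrative, assuming the tuple-key bisect shown here):
preprocessed = preprocess(myDict)
print(optimized_lookup(preprocessed, '1039', 68.0))   # ['42731,42781,V4501']
print(optimized_lookup(preprocessed, '0845', 60.0))   # []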
Here's a way to do it in numpy, and though I haven't tested it I'm pretty confident it will be vastly faster than looping over the dictionary. I replaced the dictionary structure with a Numpy record array, and used np.where to locate the rows where they match the parameters you gave.
import numpy as np

myDict = {('1039', '68.0864'): '42731,42781,V4501',
          ('1039', '68.1704'): '4770,4778,V071',
          ('0845', '60.4476'): '2724,27800,4019',
          ('0983', '63.3936'): '41401,4168,4240,V1582,V7281'
          }

records = []
for k, v in myDict.iteritems():
    records.append([k[0], float(k[1]), v])

myArr = np.rec.fromrecords(records, formats='S10, f4, S100',
                           names="ID, Age, Code")

def findInMyArray(arr, requestedID, requestedAge, tolerance=0.1):
    # keep rows whose age falls in [requestedAge, requestedAge + tolerance] for the requested ID
    diff = arr["Age"] - requestedAge
    idx = np.where((diff >= 0) & (diff <= tolerance) & (arr["ID"] == requestedID))
    return idx

idx = findInMyArray(myArr, "1039", 68.0, tolerance=0.1)
print "The index found is: ", idx
print "The values are: ", myArr["Code"][idx[0]]
def getr(t):
    id = float(t[0])
    age = float(t[1])
    os = 0.1
    rs = []
    correct_id = fixed[id]
    for k in correct_id.keys():
        if (k > age and k <= age + os):
            rs.append(correct_id.get(k))
    return rs

ct = {('1039', '68.0864'): '42731,42781,V4501',
      ('1039', '68.1704'): '4770,4778,V071',
      ('0845', '60.4476'): '2724,27800,4019',
      ('0983', '63.3936'): '41401,4168,4240,V1582,V7281' }

fixed = {}
for k in ct:
    if not(float(k[0]) in fixed):
        fixed[float(k[0])] = {}
    fixed[float(k[0])][float(k[1])] = ct[k]

print "1"
myTup = ('1039', '68.0')
assert(getr(myTup) == ['42731,42781,V4501'])
#the answer is: '42731,42781,V4501'

print "2"
myTup = ('0845', '60.0')
assert(getr(myTup) == [])
#Ans : No value returned
