How to speed up nested loop and add condition? - python

I am trying to speed up my nested loop; it currently takes 15 minutes for 100k customers.
I am also having trouble adding an additional condition that multiplies only states (A, B, C) by the lookup2 value, and multiplies everything else by 1.
import numpy as np
import pandas as pd

customer_data = pd.DataFrame({"cust_id": [1, 2, 3, 4, 5, 6, 7, 8],
                              "state": ['B', 'E', 'D', 'A', 'B', 'E', 'C', 'A'],
                              "cust_amt": [1000, 300, 500, 200, 400, 600, 200, 300],
                              "year": [3, 3, 4, 3, 4, 2, 2, 4],
                              "group": [10, 25, 30, 40, 55, 60, 70, 85]})
state_list = ['A', 'B', 'C', 'D', 'E']
# All lookups should be dataframes with the year and/or group and the value, like these:
lookup1 = pd.DataFrame({'year': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                        'lim %': 0.1})
lookup2 = pd.concat([pd.DataFrame({'group': g, 'lookup_val': 0.1, 'year': range(1, 11)})
                     for g in customer_data['group'].unique()]).explode('year')
multi_data = np.arange(250).reshape(10, 5, 5)
lookups = [lookup1, lookup2]
# Preprocessing.
# Transform the state to categorical codes to use them as array indices.
customer_data['state'] = pd.Categorical(customer_data['state'],
                                        categories=state_list,
                                        ordered=True).codes
# Set index on lookups.
for i in range(len(lookups)):
    if 'group' in lookups[i].columns:
        lookups[i] = lookups[i].set_index(['year', 'group'])
    else:
        lookups[i] = lookups[i].set_index(['year'])
Calculation:
results = {}
for customer, state, amount, start, group in customer_data.itertuples(name=None, index=False):
    for year in range(start, len(multi_data) + 1):
        if year == start:
            results[customer] = [[amount * multi_data[year - 1, state, :]]]
        else:
            results[customer].append([results[customer][-1][-1] @ multi_data[year - 1]])
        for lookup in lookups:
            if isinstance(lookup.index, pd.MultiIndex):
                value = lookup.loc[(year, group)].iat[0]
            else:
                value = lookup.loc[year].iat[0]
            results[customer][-1].append(value * results[customer][-1][-1])
Example of expected output:
{1: [[array([55000, 56000, 57000, 58000, 59000]),
      array([5500., 5600., 5700., 5800., 5900.]),
      array([550., 560., 570., 580., 590.])], ...
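A sketch of one way to add the missing state condition (my own suggestion; it assumes the categorical preprocessing above, under which codes 0, 1, 2 correspond to states 'A', 'B', 'C', and that the MultiIndex lookup is lookup2) is to force the lookup2 multiplier to 1 for the other states inside the lookups loop:
for lookup in lookups:
    if isinstance(lookup.index, pd.MultiIndex):
        # lookup2: only states 'A', 'B', 'C' (codes 0, 1, 2) use the looked-up value;
        # other states multiply by 1, i.e. carry the previous row forward unchanged.
        value = lookup.loc[(year, group)].iat[0] if state <= 2 else 1.0
    else:
        value = lookup.loc[year].iat[0]
    results[customer][-1].append(value * results[customer][-1][-1])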

You could use multiprocessing if you have more than one CPU.
import multiprocessing as mp

def get_customer_data(data_tuple) -> dict:
    results = {}
    customer, state, amount, start, group = data_tuple
    for year in range(start, len(multi_data) + 1):
        if year == start:
            results[customer] = [[amount * multi_data[year - 1, state, :]]]
        else:
            results[customer].append([results[customer][-1][-1] @ multi_data[year - 1]])
        for lookup in lookups:
            if isinstance(lookup.index, pd.MultiIndex):
                value = lookup.loc[(year, group)].iat[0]
            else:
                value = lookup.loc[year].iat[0]
            results[customer][-1].append(value * results[customer][-1][-1])
    return results

p = mp.Pool(mp.cpu_count())
# Pool.map() takes a function and an iterable like a list or generator.
results_list = p.map(get_customer_data, list(customer_data.itertuples(name=None, index=False)))
# results_list is a list of dicts.
results_dict = {k: v for x in results_list for k, v in x.items()}
p.close()
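One portability caveat (my addition): on platforms that spawn worker processes instead of forking (Windows, and macOS by default), the Pool setup must run under a __main__ guard, or each worker will re-execute the module's top level. A minimal sketch:
if __name__ == '__main__':
    with mp.Pool(mp.cpu_count()) as p:
        results_list = p.map(get_customer_data,
                             list(customer_data.itertuples(name=None, index=False)))
    results_dict = {k: v for x in results_list for k, v in x.items()}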

Glad to see you posting this! As promised, my thoughts:
Pandas works with columns very well. What you need to do is remove the loops as much as possible; in your case I would get rid of the main customer loop and keep the year and lookups loops.
To do this, forget about the results dict for now. You want to do the calculations directly on the DataFrame. For example, your first calculation would become something like:
customer_data['meaningful_column_name'] = customer_data['cust_amt'] * multi_data[customer_data['year'] - 1, customer_data['state'], :]
For your lookups loop you just have to be aware that the if statement will be looking at entire columns.
Finally, as it seems you want your data as a list of arrays, you will need some formatting to extract the data from the DataFrame structure.
I hope that makes some sense
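To make the suggestion concrete, here is a rough vectorized sketch of just the first-year calculation (my own illustration; it assumes the categorical preprocessing above has already run). NumPy's fancy indexing selects one 5-element row per customer in a single call, with no Python loop:
years = customer_data['year'].to_numpy()
states = customer_data['state'].to_numpy()
amounts = customer_data['cust_amt'].to_numpy()
# Shape (n_customers, 5): each customer's starting row, computed in one shot.
first_year = amounts[:, None] * multi_data[years - 1, states, :]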

Related

How to create a dict from a list where values are elements of the list and keys are a function of those elements in python?

I have a list of caller_address elements. For each of these addresses I can get a caller_function, a function containing that caller_address. In a single function there may be more than 1 address.
So if I have a list of caller_address elements:
caller_addresses = [1, 2, 3, 4, 5, 6, 7, 8]
For each of them I can get a function:
caller_functions = [getFunctionContaining(addr) for addr in caller_addresses]
print(caller_functions)
# prints (example): ['func1', 'func1', 'func2', 'func2', 'func2', 'func2', 'func3', 'func3']
In the result I need to get a dict where keys are the functions and values are lists of the addresses those functions contain. In my example it must be:
{'func1': [1, 2], 'func2': [3, 4, 5, 6], 'func3': [7, 8]}
# Means 'func1' contains addresses 1 and 2, 'func2' contains 3, 4, 5 and 6, ...
It would be great if there was a function like:
result = to_dict(lambda addr: getFunctionContaining(addr), caller_addresses)
to get the same result.
Where the first argument is the function for the keys and the second argument is the list of values. Is there such a function in the standard library in Python?
I could implement it with a for loop and dict[getFunctionContaining(addr)].append(addr), but I'm looking for a more pythonic way to do this.
Thanks!
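There is no single built-in that does exactly this, but collections.defaultdict gets close; a minimal sketch (my own illustration):
from collections import defaultdict

def group_by(key, values):
    out = defaultdict(list)
    for v in values:
        out[key(v)].append(v)
    return dict(out)

# group_by(getFunctionContaining, caller_addresses)
# -> {'func1': [1, 2], 'func2': [3, 4, 5, 6], 'func3': [7, 8]}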
Found a solution using itertools.groupby.
On this benchmark it also runs slightly faster than the loop-based solution.
import itertools
import time

def f(v):
    if v < 5:
        return 1
    if v < 7:
        return 2
    return 3

def to_dict(key, list_):
    out = {}
    for el in list_:
        out.setdefault(key(el), []).append(el)
    return out

def to_dict2(key, list_):
    return {k: list(v) for k, v in itertools.groupby(list_, key)}

lst = [1, 2, 3, 4, 5, 6, 7, 8] * 10**4
COUNT = 1000

def timeit(to_dict_f):
    elapsed_sum = 0
    for _ in range(COUNT):
        elapsed_sum -= time.time()
        to_dict_f(f, lst)
        elapsed_sum += time.time()
    return elapsed_sum / COUNT

print('Average time: ', timeit(to_dict), timeit(to_dict2))
Results:
Average time: 0.014930561065673828 0.01346096110343933
to_dict2 (itertools.groupby) on average takes less time than to_dict (loop)
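One caveat worth adding: itertools.groupby only groups consecutive elements, so on input that is not already sorted by the key function, the dict comprehension silently keeps just the last run of each key. A safe variant (a sketch, at the cost of an extra O(n log n) sort):
def to_dict2_sorted(key, list_):
    return {k: list(v) for k, v in itertools.groupby(sorted(list_, key=key), key)}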

How shuffle and then unshuffle a bytearray, given a key as seed

I am trying to create an encryption system, and for that I will be taking a bytearray and using
random.Random(seed).shuffle(bytearray)
to encrypt the information.
I'm having trouble reversing this process for decryption. I tried something like this (it didn't work):
random.Random(1/seed).shuffle(encryptedbytearray)
Is there any way to do this?
Shuffle a sorted range, so that we can match the shuffled indices to the unshuffled indices.
x = list(range(len(s)))
random.Random(seed).shuffle(x)
For a seed of 12345, this produces [14, 15, 12, 3, 24, 16, 7, 22, 10, 2, 19, 4, 20, 17, 1, 21, 5, 25, 18, 8, 6, 11, 9, 0, 23, 13]. This indicates that the value in the 0th index in the shuffled list is actually in the 14th index of the unshuffled list, the 1st index is actually the 15th unshuffled, etc.
Then match each shuffled index to the shuffled value, and then sort (based on the index) back into their unshuffled positions.
unshuffled = bytearray(c for i, c in sorted(zip(x, s)))
print(unshuffled)
Full example:
import random
# setup
s = bytearray(b"abcdefghijklmnopqrstuvxwyz")
seed = 12345
random.Random(seed).shuffle(s)
# shuffle a sorted range, so that we can match the shuffled indices to the unshuffled indices
x = list(range(len(s)))
random.Random(seed).shuffle(x)
# match each shuffled index to the shuffled value, and then sort (based on the index) back into their unshuffled positions
unshuffled = bytearray(c for i, c in sorted(zip(x, s)))
print(unshuffled)
The process detailed above should work for any shuffled sequence (e.g. lists), not just bytearrays.
Here is a more detailed explanation of this process on crypto.se with a lot more math involved.
You need to use the same seed to shuffle indexes so that you can backtrack the original positions. (enumerate will allow you to avoid sorting that mapping)
import random

def encrypt(decrypted, seed=4):
    encrypted = decrypted.copy()
    random.Random(seed).shuffle(encrypted)
    return encrypted

def decrypt(encrypted, seed=4):
    decrypted = encrypted.copy()
    indexes = list(range(len(encrypted)))
    random.Random(seed).shuffle(indexes)
    for e, d in enumerate(indexes):
        decrypted[d] = encrypted[e]
    return decrypted
Sample run (using a list of characters, but it will work for bytearrays or any other mutable sequence):
clearText = list('ABCDE')
encryptedText = encrypt(clearText)
print(encryptedText)
['D', 'E', 'A', 'C', 'B']
decryptedText = decrypt(encryptedText)
print(decryptedText)
['A', 'B', 'C', 'D', 'E']
If you want the functions to work in place directly on the array (instead of returning a value), you can write them like this:
def encrypt(data, seed=4):
    random.Random(seed).shuffle(data)

def decrypt(data, seed=4):
    before = data.copy()
    indexes = list(range(len(data)))
    random.Random(seed).shuffle(indexes)
    for e, d in enumerate(indexes):
        data[d] = before[e]
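A quick usage sketch of the in-place variants (my own example, reusing the seed-4 shuffle from the sample run above):
data = list('ABCDE')
encrypt(data)
print(data)   # ['D', 'E', 'A', 'C', 'B']
decrypt(data)
print(data)   # ['A', 'B', 'C', 'D', 'E']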

What is the fastest way to convert a dictionary frequency to list in Python?

I have dictionary frequency as follows:
freq = {'a': 1, 'b': 2, 'c': 3}
It simply means that I have one 'a', two 'b's, and three 'c's.
I would like to convert it into a complete list:
lst = ['a', 'b', 'b', 'c', 'c', 'c']
What is the fastest way (time-efficient) or most compact way (space-efficient) to do so?
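For reference, the standard library can do this directly: collections.Counter treats the dict as a multiset, and its elements() method expands each key by its count.
from collections import Counter

freq = {'a': 1, 'b': 2, 'c': 3}
lst = list(Counter(freq).elements())
print(lst)  # ['a', 'b', 'b', 'c', 'c', 'c']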
Yes, but only if the items are (or can be represented as) integers, and if the count of distinct items is sufficiently close to the difference between the smallest and largest item. In that case you can use bucket sort, resulting in O(n) time complexity, where n is the difference between the smallest and the largest item. This is more efficient than comparison sorts, which average O(n log n).
In the case of List = [1, 4, 5, 2, 6, 7, 9, 3], it is indeed more efficient to use bucket sort when it is known that 1 is the smallest item and 9 is the largest item, since only 8 is missing from the range. The following example uses collections.Counter to account for the possibility of duplicates in the input list:
from collections import Counter
counts = Counter(List)
print(list(Counter({i: counts[i] for i in range(1, 10)}).elements()))
This outputs:
[1, 2, 3, 4, 5, 6, 7, 9]
Let's break this into two O(N) passes: one to catalog the numbers, and one to create the sorted list. I updated the variable names; List is an especially bad choice, given the built-in type list. I also added 10 to each value, so you can see how the low-end offset works.
coll = [11, 14, 15, 12, 16, 17, 19, 13]
last = 19
first = 11
offset = first
size = last - first + 1
# Recognize all values in a dense "array".
need = [False] * size
for item in coll:
    need[item - offset] = True
# Iterate again in numerical order; for each True value, add that item to the new list.
sorted_list = [idx + offset for idx, needed_flag in enumerate(need) if needed_flag]
print(sorted_list)
OUTPUT:
[11, 12, 13, 14, 15, 16, 17, 19]
The most compact way I usually use is a list comprehension that expands each key by its count:
freq = {'a': 1, 'b': 2, 'c': 3}
lst = [k for k, count in freq.items() for _ in range(count)]
Space complexity - O(n)
Time complexity - O(n)
where n is the length of the resulting list.

Python 2D list to dictionary

I have a 2-dimensional list and have to take two columns from it and place the values from each column as key:value pairs.
Example:
table = [[15, 29, 6, 2],
         [16, 9, 8, 0],
         [7, 27, 16, 0]]

def averages(table, col, by):
    columns = tuple(table[i][col] for i in range(len(table)))  # Place the col column into a tuple so it can be placed into a dictionary
    groupby = tuple(table[i][by] for i in range(len(table)))   # Place the groupby column into a tuple so it can be placed into a dictionary
    avgdict = {}
    avgdict[groupby] = [columns]
    print(avgdict)

averages(table, 1, 3)
Output is:
{(2, 0, 0): [(29, 9, 27)]}
I am trying to get the output to equal:
{0:36, 2:29}
So essentially the two keys of 0 have their values added.
I'm having a hard time understanding how to pair each key with its values
and then add the values together when the keys are equal.
Edit: I'm only using Python Standard library, and not implementing numpy for this problem.
You can create an empty dictionary and then iterate through every element of groupby. If the element already exists as a key in the dictionary, add the corresponding element of columns to its value; otherwise, add the element of groupby as a key with the corresponding element of columns as its value. The implementation is as follows:
table = [[15, 29, 6, 2],
         [16, 9, 8, 0],
         [7, 27, 16, 0]]

def averages(table, col, by):
    columns = tuple(table[i][col] for i in range(len(table)))
    groupby = tuple(table[i][by] for i in range(len(table)))
    avgdict = {}
    for x in range(len(groupby)):
        key = groupby[x]
        if key in avgdict:
            avgdict[key] += columns[x]
        else:
            avgdict[key] = columns[x]
    print(avgdict)

averages(table, 1, 3)
Otherwise, if you want to keep your initial avgdict, then you can change the averages() function to:
def averages(table, col, by):
    columns = tuple(table[i][col] for i in range(len(table)))
    groupby = tuple(table[i][by] for i in range(len(table)))
    avgdict = {}
    avgdict[groupby] = [columns]
    newdict = {}
    for key in avgdict:
        for x in range(len(key)):
            if key[x] in newdict:
                newdict[key[x]] += avgdict[key][0][x]
            else:
                newdict[key[x]] = avgdict[key][0][x]
    print(newdict)
It took me a minute to figure out what you were trying to accomplish, because your function and variable names reference averages but your output is a sum.
Based on your output, it seems you're trying to aggregate row values in a given column by a group in another column.
Here's a recommended solution (which could likely be reduced to a one-liner via a dict comprehension). This loops through the unique (using set) values b in your group-by column, creates a dictionary key (agg_dict[b]) for the group being processed, and sums all rows in the given column (col) whose group matches (tbl[i][by] == b).
table = [[15, 29, 6, 2],
         [16, 9, 8, 0],
         [7, 27, 16, 0]]

def aggregate(tbl, col, by):
    agg_dict = {}
    for b in set(tbl[i][by] for i in range(len(tbl))):
        agg_dict[b] = sum(tbl[i][col] for i in range(len(tbl)) if tbl[i][by] == b)
    print(agg_dict)

aggregate(table, 1, 3)
You can also try the following answer. It doesn't use numpy, and is based on the use of sets to find the unique elements in groupby.
table = [[15, 29, 6, 2],
         [16, 9, 8, 0],
         [7, 27, 16, 0]]

def averages(table, col, by):
    columns = tuple(table[i][col] for i in range(len(table)))  # Place the col column into a tuple
    groupby = tuple(table[i][by] for i in range(len(table)))   # Place the groupby column into a tuple
    # groupby_unq: tuple of the unique entries in groupby.
    groupby_unq = tuple(set(groupby))
    # avg: list of running sums, one per unique group.
    avg = [0] * len(groupby_unq)
    for i in range(len(groupby)):
        for j in range(len(groupby_unq)):
            if groupby[i] == groupby_unq[j]:
                avg[j] += columns[i]
    avgdict = dict((groupby_unq[i], avg[i]) for i in range(len(avg)))
    return avgdict

result = averages(table, 1, 3)
print(result)
{0: 36, 2: 29}
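Since the question's title mentions averages while the expected output is a sum, here is a hedged sketch of the true-average variant, standard library only (names are my own):
def group_averages(table, col, by):
    sums, counts = {}, {}
    for row in table:
        sums[row[by]] = sums.get(row[by], 0) + row[col]
        counts[row[by]] = counts.get(row[by], 0) + 1
    return {k: sums[k] / counts[k] for k in sums}

print(group_averages(table, 1, 3))  # {2: 29.0, 0: 18.0}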

Need to create a list of sets, from a list of sets whose members may be connected

I'm dealing with polygonal data in real time here, but the problem's quite simple.
I have a huge list containing thousands of sets of polygon indices (integers) and I need to simplify the list as fast as possible into a list of sets of connected indices.
i.e. any sets containing integers that are also in another set become one set in the result. I've read several possible solutions involving sets & graphs etc. All I'm after is a final list of sets which had any degree of commonality.
I'm dealing with lots of data here, but for simplicities sake here's some sample data:
setA = set([0,1,2])
setB = set([6,7,8,9])
setC = set([4,5,6])
setD = set([3,4,5,0])
setE = set([10,11,12])
setF = set([11,13,14,15])
setG = set([16,17,18,19])
listOfSets = [setA,setB,setC,setD,setE,setF,setG]
In this case I'm after a list with a result like this, although ordering is irrelevant:
connectedFacesListOfSets = [ set([0,1,2,3,4,5,6,7,8,9]), set([10,11,12,13,14,15]), set([16,17,18,19])]
I've looked for similar solutions, but the one with the highest votes gave incorrect results on my large test data.
Merge lists that share common elements
It's hard to tell the performance without a sufficiently large set, but here is some basic code to start from:
while True:
    merged_one = False
    supersets = [listOfSets[0]]
    for s in listOfSets[1:]:
        in_super_set = False
        for ss in supersets:
            if s & ss:
                ss |= s
                merged_one = True
                in_super_set = True
                break
        if not in_super_set:
            supersets.append(s)
    print(supersets)
    if not merged_one:
        break
    listOfSets = supersets
This works in 3 iterations on the provided data. And the output is as follows:
[{0, 1, 2, 3, 4, 5}, {4, 5, 6, 7, 8, 9}, {10, 11, 12, 13, 14, 15}, {16, 17, 18, 19}]
[{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, {10, 11, 12, 13, 14, 15}, {16, 17, 18, 19}]
[{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, {10, 11, 12, 13, 14, 15}, {16, 17, 18, 19}]
This is a union find problem.
Though I haven't used it, this Python code looks good to me.
http://code.activestate.com/recipes/577225-union-find/
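For completeness, a minimal union-find sketch applied to this problem (my own illustration, not the recipe from the link):
def merge_connected(list_of_sets):
    parent = {}

    def find(x):
        # Walk to the root, halving the path as we go.
        while parent[x] != x:
            parent[x] = parent[parent[x]]
            x = parent[x]
        return x

    # Union every element of a set with that set's first element.
    for s in list_of_sets:
        items = iter(s)
        first = next(items, None)
        if first is None:
            continue
        parent.setdefault(first, first)
        for x in items:
            parent.setdefault(x, x)
            ra, rb = find(first), find(x)
            if ra != rb:
                parent[ra] = rb

    # Collect elements by their root.
    groups = {}
    for x in parent:
        groups.setdefault(find(x), set()).add(x)
    return list(groups.values())

# merge_connected(listOfSets)
# -> [{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, {10, 11, 12, 13, 14, 15}, {16, 17, 18, 19}]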
Forgive the messed up caps (autocorrect...) — cleaned up, the idea is:
import copy

# the results container
connected = set()
sets = ...  # some list of sets
# Convert the sets to frozensets (which are hashable and can be added to sets themselves).
sets = list(map(frozenset, sets))
for s1 in sets:
    res = copy.copy(s1)
    for s2 in sets:
        if s1 & s2:
            res = res | s2
    connected.add(res)
So.. I think I got it. It's a mess but I got it. Here's what I did:
def connected_valid(li):
    for i, l in enumerate(li):
        for j, k in enumerate(li):
            if i != j and contains(l, k):
                return False
    return True

def contains(set1, set2):
    for s in set1:
        if s in set2:
            return True
    return False

def combine(set1, set2):
    set2 |= set1
    return set2

def connect_sets(li):
    while not connected_valid(li):
        s1 = li.pop(0)
        s2 = li[0]
        if contains(s1, s2):
            li[0] = combine(s1, s2)
        else:
            li.append(s1)
    return li
Then in the main function you'd do something like this:
setA = set([0,1,2])
setB = set([6,7,8,9])
setC = set([4,5,6])
setD = set([3,4,5,0])
setE = set([10,11,12])
setF = set([11,13,14,15])
setG = set([16,17,18,19])
connected_sets = connect_sets([setA,setB,setC,setD,setE,setF,setG,])
After running it, I got the following output:
print(connected_sets)
[{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, {10, 11, 12, 13, 14, 15}, {16, 17, 18, 19}]
Hope that's what you're looking for.
EDIT: Added code to randomly generate sets:
from random import sample

# Creates a list of 4000 sets, each with a random number (up to 19) of values ranging from 0 to 19999.
sets = []
for x in range(4000):
    rand_num = sample(range(20), 1)[0]
    tmp_set_li = sample(range(20000), rand_num)
    sets.append(set(tmp_set_li))
The last 3 lines can be condensed into one if you really wanted to.
I tried to do something different: this algorithm loops once for each set and once for each element:
# Our test sets
setA = set([0, 1, 2])
setB = set([6, 7, 8, 9])
setC = set([4, 5, 6])
setD = set([3, 4, 5, 0])
setE = set([10, 11, 12])
setF = set([11, 13, 14, 15])
setG = set([16, 17, 18, 19])
list_of_sets = [setA, setB, setC, setD, setE, setF, setG]

# We will use a map to store our new merged sets.
# This map will work as a reference abstraction, so it will
# map set ids to the set or to another set id.
# This map may have an indirection level greater than 1.
merged_sets = {}

# We will also use a map between indexes and set ids.
index_to_id = {}

# Given a set id, returns an equivalent set id that refers directly
# to a set in the merged_sets map.
def resolve_id(id):
    if not isinstance(id, int):
        return None
    while isinstance(merged_sets[id], int):
        id = merged_sets[id]
    return id

# Points the informed set to the destination id.
def link_id(id_source, id_destination):
    point_to = merged_sets[id_source]
    merged_sets[id_source] = id_destination
    if isinstance(point_to, int):
        link_id(point_to, id_destination)

empty_set_found = False
# For each set:
for current_set_id, current_set in enumerate(list_of_sets):
    if len(current_set) == 0 and empty_set_found:
        continue
    if len(current_set) == 0:
        empty_set_found = True
    # Create a set id for the set and place it on the merged sets map.
    merged_sets[current_set_id] = current_set
    # For each index in the current set:
    possibly_merged_current_set = current_set
    for index in current_set:
        # See if the index is free, i.e., has not been assigned to any set id.
        if index not in index_to_id:
            # If it is free, then assign the set id to the index
            index_to_id[index] = current_set_id
            # ... and then go to the next index.
        else:
            # If it is not free, then we may need to merge the sets.
            # Find out to which set we need to merge the current one,
            # ... dereferencing if necessary.
            id_to_merge = resolve_id(index_to_id[index])
            # First we check to see if the assignment is to the current set or not.
            if id_to_merge == resolve_id(merged_sets[current_set_id]):
                continue
            # Merge the current set into the one found.
            print('Merging %d with %d' % (current_set_id, id_to_merge))
            merged_sets[id_to_merge] |= possibly_merged_current_set
            possibly_merged_current_set = merged_sets[id_to_merge]
            # Map the current set id to the set id of the merged set.
            link_id(current_set_id, id_to_merge)

# Return all the sets in the merged sets map (ignore the references).
print([x for x in merged_sets.values() if not isinstance(x, int)])
It prints:
Merging 2 with 1
Merging 3 with 0
Merging 3 with 1
Merging 5 with 4
[{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, {10, 11, 12, 13, 14, 15}, {16, 17, 18, 19}]
