Faster implementation for shuffling buckets in a dataset? (Python)

Is there a faster way to implement this? Each row has about 1024 buckets, and it's not as fast as I'd like. I need to generate quite a lot of data, but it takes a few hours to complete as it is, and it's quite the bottleneck at this point. Any suggestions or ideas for how to optimize it would be greatly appreciated!
Edit: Apologies for not having a minimal working example before; it's posted now. Optimizations that work on Python 2.7 would be very much appreciated.
import math
import numpy as np
import copy
import random

def number_to_move(n):
    # Multiplicative random loop; returns a random count offset by -n.
    l = math.exp(-n)
    k = 0
    p = 1.0
    while p > l:
        k += 1
        p *= random.random()
    return k - n

def createShuffledDataset(input_data, shuffle_indexes_dict, shuffle_quantity):
    shuffled = []
    for key in shuffle_indexes_dict:
        for values in shuffle_indexes_dict[key]:
            temp_holder = copy.copy(input_data[values[0] - 40: values[1]])  # may need to increase 100 padding
            for line in temp_holder:
                buckets = range(1, 1022)
                for bucket in buckets:
                    bucket_value = line[bucket]
                    proposed_number = number_to_move(bucket_value)
                    # never move more than the bucket currently holds
                    moving_amount = abs(proposed_number) if bucket_value - abs(proposed_number) >= 0 else bucket_value
                    line[bucket] -= moving_amount
                    # shift the amount into the neighbouring bucket
                    if proposed_number > 0:
                        line[bucket + 1] += moving_amount
                    else:
                        line[bucket - 1] += moving_amount
            shuffled.extend(temp_holder)
    return np.array(shuffled)

example_data = np.ones((100, 1024))
shuffle_indexes = {"Ranges to Shuffle 1": [[10, 50], [53, 72]]}
shuffle_quantity = 1150
shuffled_data = createShuffledDataset(example_data, shuffle_indexes, shuffle_quantity)

Some minor things you might try:
Merge the key and values lookups into a single call:
for key, values in shuffle_indexes_dict.iteritems():
Use xrange rather than range.
The bucket values seem to be integers - try caching the results of number_to_move:
_cache = {}

def number_to_move(n):
    v = _cache.get(n, None)
    if v is not None:
        return v
    l = math.exp(-n)
    k = 0
    p = 1.0
    while p > l:
        k += 1
        p *= random.random()
    v = k - n
    _cache[n] = v
    return v
If the ranges in shuffle_indexes_dict are exclusive (non-overlapping), then you can probably avoid copying values from input_data. Otherwise I'd say you're out of luck.
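If that applies, here is a minimal sketch of the no-copy idea (my own illustration, assuming the ranges never overlap and that mutating input_data in place is acceptable); slicing a numpy array returns a view, so the per-range copy.copy can simply be dropped:

def createShuffledDatasetNoCopy(input_data, shuffle_indexes_dict):
    # Hypothetical variant of createShuffledDataset that works through views
    # of input_data instead of copies. Only valid if the ranges never overlap
    # and the caller does not need the original rows afterwards.
    shuffled = []
    for key, ranges in shuffle_indexes_dict.iteritems():
        for start, end in ranges:
            block = input_data[start - 40: end]  # a view into input_data, no copy made
            for line in block:
                for bucket in xrange(1, 1022):
                    bucket_value = line[bucket]
                    proposed_number = number_to_move(bucket_value)
                    moving_amount = abs(proposed_number) if bucket_value - abs(proposed_number) >= 0 else bucket_value
                    line[bucket] -= moving_amount
                    if proposed_number > 0:
                        line[bucket + 1] += moving_amount
                    else:
                        line[bucket - 1] += moving_amount
            shuffled.extend(block)
    return np.array(shuffled)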

Related

How can I get rid of this stack overflow problem in python for Kosaraju's algorithm?

I'm a new programmer, and I'm working through Stanford's algorithms course on edX.
One of the programming assignments is to find strongly connected components using Kosaraju's algorithm, on a graph with ~1,000,000 vertices. My implementation is the most basic translation of the textbook's pseudocode into Python, and it works fine on smaller graphs, so I think it's correct.
With Python's default recursion limit of 1000, it hits the limit.
I've tried sys.setrecursionlimit(1000000), but that doesn't help; instead I get "Process finished with exit code -1073741571", which is a stack overflow.
I found these two pages for increasing stack size limits, but am not sure how to use either of them: Set stack size, ulimit command.
Another relevant bit of information is that Python doesn't optimize tail recursion, but I'm not sure that applies to my code as it currently stands.
import copy

# Build the graph and its reverse; each vertex's list starts with a 0/1 "explored" flag.
G = {}
for i in range(1, 875715):
    G[i] = [0]
for l in open('SCC.txt'):
    items = list(map(int, l.split()))
    G[items[0]].append(items[1])

Grev = {}
for i in range(1, 875715):
    Grev[i] = [0]
for l in open('SCC.txt'):
    items = list(map(int, l.split()))
    Grev[items[1]].append(items[0])

def TopoSort(G):
    for v in G:
        if G[v][0] == 0:
            DFStopo(G, v)

def DFStopo(G, s):
    G[s][0] = 1  # mark s as explored
    global orderedVertexList
    for v in G[s][1:]:
        if G[v][0] == 0:
            DFStopo(G, v)
    orderedVertexList.insert(0, s)

def Kosaraju(G, Grev):
    TopoSort(Grev)
    global numSCC
    for v in orderedVertexList:
        if G[v][0] == 0:
            numSCC = numSCC + 1
            DFSSCC(G, v)

def DFSSCC(G, s):
    G[s][0] = 1  # mark s as explored
    global SCC
    SCC.append(numSCC)
    for v in G[s][1:]:
        if G[v][0] == 0:
            DFSSCC(G, v)

numSCC = 0
orderedVertexList = []
SCC = []
Kosaraju(copy.deepcopy(G), copy.deepcopy(Grev))
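As a minimal sketch of the stack-size workaround the question links to (my own illustration, not part of the original post): raise the interpreter's recursion limit and run the recursive search in a worker thread that is given a much larger stack.

import sys
import threading
import copy

sys.setrecursionlimit(10 ** 6)            # allow deep Python-level recursion
threading.stack_size(128 * 1024 * 1024)   # request a 128 MB stack for threads created after this call

def run():
    # assumes G, Grev and Kosaraju from the code above are already defined
    Kosaraju(copy.deepcopy(G), copy.deepcopy(Grev))

worker = threading.Thread(target=run)
worker.start()
worker.join()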

Python find most common value in array

import numpy as np
x = ([1,2,3,3])
y = ([1,2,3])
z = ([6,6,1,2,9,9])
(only positive values)
In each array, I need to return the most common value or, if several values come up the same number of times, the minimum among them.
This is a home assignment and I can't use anything but numpy.
outputs:
f(x) = 3,
f(y) = 1,
f(z) = 6
For a numpy-exclusive solution, something like this will work:
occurrences = np.bincount(x)
print(np.argmax(occurrences))
The above-mentioned method won't work if there is a negative number in the list. To account for such an occurrence, use np.unique instead:
not_required, counts = np.unique(x, return_counts=True)
print(not_required[np.argmax(counts)])
It's called a mode function. See https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.mode.html
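For illustration only (scipy falls outside the assignment's numpy-only constraint), the linked function can be used like this:

from scipy.stats import mode

x = [1, 2, 3, 3]
result = mode(x)
# result.mode is the most common value (ties resolve to the smallest value),
# result.count is how often it occurs; depending on the scipy version these
# are scalars or length-1 arrays.
print(result.mode, result.count)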
Without numpy
n_dict = {}
for k in x:
    try:
        n_dict[k] += 1
    except KeyError:
        n_dict[k] = 1

# invert the count dict: count -> list of values with that count
rev_n_dict = {}
for k in n_dict:
    if n_dict[k] not in rev_n_dict:
        rev_n_dict[n_dict[k]] = [k]
    else:
        rev_n_dict[n_dict[k]].append(k)

local_max = 0
for k in rev_n_dict:
    if k > local_max:
        local_max = k

if len(rev_n_dict[local_max]) > 0:
    print(min(rev_n_dict[local_max]))
else:
    print(rev_n_dict[local_max])
To add to the previous results, you could use a collections.Counter object:
from collections import Counter

my_array = [3, 24, 543, 3, 1, 6, 7, 8, ...., 223213, 13213]
my_counter = Counter(my_array)
most_common_value = my_counter.most_common(1)[0][0]
It is quite simple but certainly not pretty. I have used variable names that should be self-explanatory, along with comments. Feel free to ask if anything is unclear.

import numpy as np

x = ([6, 6, 1, 2, 9, 9])

def tester(x):
    not_required, counts = np.unique(x, return_counts=True)
    x = np.array(x)
    if (x >= 0).all():
        highest_occurance = [not_required[np.argmax(counts)]]
        number_of_counts = np.max(counts)
    else:
        highest_occurance = not_required[np.argmax(counts)]
        number_of_counts = np.max(counts)
    return highest_occurance, number_of_counts

most_abundant, first_test_counts = tester(x)
new_x = [vals for vals in x if vals not in most_abundant]
second_most_abundant, second_test_counts = tester(new_x)
if second_test_counts == first_test_counts:
    print("At least two elements have the same number of counts:", most_abundant, "and", second_most_abundant, "have %s" % first_test_counts, "occurrences")
else:
    print("%s occurs for the max of %s times" % (most_abundant, first_test_counts))
We can also loop over the counts to check whether more than two elements share the same occurrence count, instead of using an if/else that only handles the case of two elements.
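A numpy-only sketch of that idea (my own illustration, assuming ties should resolve to the smallest value):

import numpy as np

def most_common_min(arr):
    # Most common value; on a tie in counts, the smallest value wins.
    values, counts = np.unique(arr, return_counts=True)
    tied = values[counts == counts.max()]  # every value that reaches the maximal count
    return tied.min()

print(most_common_min([1, 2, 3, 3]))        # 3
print(most_common_min([1, 2, 3]))           # 1
print(most_common_min([6, 6, 1, 2, 9, 9]))  # 6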

How do you use a list as an index argument for numpy ndarrays?

So I have a problem that might be super duper simple.
I have these numpy ndarrays that I allocated and want to assign values to them via indices returned as lists. It might be easier if I showed you some example code. The questionable code I have is at the bottom, and in my testing (before actually taking this to scale) I keep getting syntax errors :'(
EDIT: edited to make it easier to troubleshoot and put some example code at the bottom.
import numpy as np

def do_stuff(index, mask):
    # this is where the calculations are made
    magic = sum(mask)
    return index, magic

def foo(full_index, comparison_dims, *xargs):
    # I have this function executed in Parallel since I'm using a machine with 36 nodes per core, and can access up to 16 cores for each script #blessed
    # figure out how many dimensions there are, and how big they are
    parent_dims = []
    parent_diffs = []
    for j in xargs:
        parent_dims += [len(j)]
        parent_diffs += [j[1] - j[0]]  # this is used to find a mask
    index = []  # this is where the individual dimension indices will be stored
    dim_n = 0
    # loop through the dimensions
    while dim_n < len(parent_dims):
        dim_index = full_index % parent_dims[dim_n]
        index += [dim_index]
        if dim_n == 0:
            mask = (comparison_dims[dim_n] > xargs[dim_n][dim_index] - parent_diffs[dim_n] / 2) * \
                   (comparison_dims[dim_n] <= xargs[dim_n][dim_index] + parent_diffs[dim_n] / 2)
        else:
            mask *= (comparison_dims[dim_n] > xargs[dim_n][dim_index] - parent_diffs[dim_n] / 2) * \
                    (comparison_dims[dim_n] <= xargs[dim_n][dim_index] + parent_diffs[dim_n] / 2)
        full_index //= parent_dims[dim_n]
        dim_n += 1
    return do_stuff(index, mask)

def bar(comparison_dims, *xargs):
    if len(xargs) == comparison_dims.shape[0]:
        pass
    elif len(comparison_dims.shape) == 2:
        pass
    else:
        raise ValueError("silly person, you failed")
    from joblib import Parallel, delayed
    dims = []
    for j in xargs:
        dims += [len(j)]
    myArray = np.empty(tuple(dims))
    results = Parallel(n_jobs=1)(
        delayed(foo)(
            index, comparison_dims, *xargs)
        for index in range(np.prod(dims))
    )
    # LOOK HERE, HELP HERE!
    for index_list, result in results:
        # I thought this would work, but oh golly was I wrong; index_list here is a list of ints, and result is a value
        # for example index, result = [0,3,7], 45.4
        # so in execution, that would yield: myArray[0,3,7] = 45.4
        # instead it yields SyntaxError because I don't know what I'm doing XD
        myArray[*index_list] = result
    return myArray
Any ideas how I can make that work? What do I need to do?
I'm not the sharpest tool in the shed, but I think with your help we might be able to figure this out!
A quick example to troubleshoot this problem would be:
compareDims = np.array([np.random.rand(1000), np.random.rand(1000)])
dim0 = np.arange(0,1,1./20)
dim1 = np.arange(0,1,1./30)
myArray = bar(compareDims, dim0, dim1)
To index a numpy array with an arbitrary list of multidimensional indices, you actually need to use a tuple:
for index_list, result in results:
    myArray[tuple(index_list)] = result
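As a standalone sketch (the array shape and values here are made up for illustration):

import numpy as np

myArray = np.zeros((5, 10, 20))
index_list = [0, 3, 7]
myArray[tuple(index_list)] = 45.4   # equivalent to myArray[0, 3, 7] = 45.4
print(myArray[0, 3, 7])             # 45.4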

Speeding up a numpy correlation program using the fact that the lists are sorted

I am currently using Python and numpy to calculate correlations between two lists, data_0 and data_1. Each list contains respectively the sorted times t0 and t1.
I want to find all the events where 0 < t1 - t0 < t_max.

for time_0 in np.nditer(data_0):
    delta_time = np.subtract(data_1, np.full(data_1.size, time_0))
    delta_time = delta_time[delta_time >= 0]
    delta_time = delta_time[delta_time < time_max]

Doing so, since the lists are sorted, I am selecting a subarray of data_1 of the form data_1[index_min: index_max], so in fact I just need to find two indexes to get what I want.
What's interesting is that when I move on to the next time_0, since data_0 is also sorted, I only need to find the new index_min / index_max such that new_index_min >= index_min and new_index_max >= index_max. That means I don't need to scan all of data_1 again from scratch.
I have implemented such a solution without the numpy methods (just with while loops) and it gives me the same results as before, but it is not as fast as before (15 times longer!).
I think that, since this normally requires fewer computations, there should be a way to make it faster using numpy methods, but I don't know how to do it.
Does anyone have an idea?
I am not sure I'm being super clear, so if you have any questions, do not hesitate.
Thank you in advance,
Paul
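Since both arrays are sorted, one vectorized way to get those two indexes for every time_0 at once is np.searchsorted. A minimal sketch (my own illustration, with made-up data standing in for the real time lists):

import numpy as np

data_0 = np.sort(np.random.uniform(0, 100, 1000))   # stand-in for the sorted t0 list
data_1 = np.sort(np.random.uniform(0, 100, 1000))   # stand-in for the sorted t1 list
time_max = 0.5

index_min = np.searchsorted(data_1, data_0, side='left')             # first t1 with t1 - t0 >= 0
index_max = np.searchsorted(data_1, data_0 + time_max, side='left')  # first t1 with t1 - t0 >= time_max
for i, time_0 in enumerate(data_0):
    delta_time = data_1[index_min[i]:index_max[i]] - time_0          # all t1 with 0 <= t1 - t0 < time_max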
Here is a vectorized approach using argsort. It uses a strategy similar to your avoid-full-scan idea:
import numpy as np

def find_gt(ref, data, incl=True):
    out = np.empty(len(ref) + len(data) + 1, int)
    total = (data, ref) if incl else (ref, data)
    out[1:] = np.argsort(np.concatenate(total), kind='mergesort')
    out[0] = -1
    split = (out < len(data)) if incl else (out >= len(ref))
    if incl:
        out[~split] -= len(data)
    split[0] = False
    return np.maximum.accumulate(np.where(split, -1, out))[split] + 1

def find_intervals(ref, data, span, incl=(True, True)):
    index_min = find_gt(ref, data, incl[0])
    index_max = len(ref) - find_gt(-ref[::-1], -span - data[::-1], incl[1])[::-1]
    return index_min, index_max

ref = np.sort(np.random.randint(0, 20000, (10000,)))
data = np.sort(np.random.randint(0, 20000, (10000,)))
span = 2
idmn, idmx = find_intervals(ref, data, span, (True, True))

print('checking')
for d, mn, mx in zip(data, idmn, idmx):
    assert mn == len(ref) or ref[mn] >= d
    assert mn == 0 or ref[mn - 1] < d
    assert mx == len(ref) or ref[mx] > d + span
    assert mx == 0 or ref[mx - 1] <= d + span
print('ok')
It works by:
- indirectly sorting both sets together,
- finding, for each time in one set, the preceding time in the other (this is done using maximum.accumulate),
- applying the preceding steps twice; the second time, the times in one set are shifted by span.

Comparing large vectors in Python

I have two large vectors (~133000 values each) of different lengths. Each is sorted from small to large values. I want to find values that are similar within a given tolerance. This is my solution, but it is very slow. Is there a way to speed this up?
import numpy as np

for lv in range(np.size(vector1)):
    for lv_2 in range(np.size(vector2)):
        if np.abs(vector1[lv] - vector2[lv_2]) < .02:
            print(vector1[lv], vector2[lv_2], lv, lv_2)
            break
Your algorithm is far from optimal; you compare far too many values. Assume you are at a certain position in vector1 and the current value in vector2 is already more than 0.02 bigger. Why would you compare against the rest of vector2?
Start with something like
pos1 = 0
pos2 = 0
Now compare the values at those positions in your vectors. If the difference is too big, move the position of the smaller one forward and check again. Continue until you reach the end of one vector.
I haven't tested it, but the following should work. The idea is to exploit the fact that the vectors are sorted:

lv_1, lv_2 = 0, 0
while lv_1 < len(vector1) and lv_2 < len(vector2):
    if np.abs(vector1[lv_1] - vector2[lv_2]) < .02:
        print(vector1[lv_1], vector2[lv_2], lv_1, lv_2)
        lv_1 += 1
        lv_2 += 1
    elif vector1[lv_1] < vector2[lv_2]:
        lv_1 += 1
    else:
        lv_2 += 1
The following code gives a nice increase in performance that depends upon how dense the numbers are. Using a set of 1000 random numbers, sampled uniformly between 0 and 100, it runs about 30 times faster than your implementation.
pos1_start = 0
for i in range(np.size(vector1)):
    for j in range(pos1_start, np.size(vector2)):
        if np.abs(vector1[i] - vector2[j]) < .02:
            results1 += [(vector1[i], vector2[j], i, j)]
        else:
            if vector2[j] < vector1[i]:
                pos1_start += 1
            else:
                break
The timing:
time new method: 0.112464904785
time old method: 3.59720897675
Which is produced by the following script:
import random
import numpy as np
import time

# initialize the vectors to be compared
vector1 = [random.uniform(0, 40) for i in range(1000)]
vector2 = [random.uniform(0, 40) for i in range(1000)]
vector1.sort()
vector2.sort()

# the arrays that will contain the results for the first method
results1 = []
# the arrays that will contain the results for the second method
results2 = []

pos1_start = 0
t_start = time.time()
for i in range(np.size(vector1)):
    for j in range(pos1_start, np.size(vector2)):
        if np.abs(vector1[i] - vector2[j]) < .02:
            results1 += [(vector1[i], vector2[j], i, j)]
        else:
            if vector2[j] < vector1[i]:
                pos1_start += 1
            else:
                break
t1 = time.time() - t_start
print "time new method:", t1

t = time.time()
for lv1 in range(np.size(vector1)):
    for lv2 in range(np.size(vector2)):
        if np.abs(vector1[lv1] - vector2[lv2]) < .02:
            results2 += [(vector1[lv1], vector2[lv2], lv1, lv2)]
t2 = time.time() - t
print "time old method:", t2

# sort the results
results1.sort()
results2.sort()
print np.allclose(results1, results2)
