Summing a list up to a specific threshold with a trace of zeros - Python

Is there a reasonable way to get the following done in a "fast compilation way"?
I am trying to sum a list of numbers up to a specific threshold and replace the preceding values with 0.
I'm looking for the fastest way (the list has 18 million records).
For the given example the threshold is 1.
Input:
[0.2, 0.4, 0.2, 0.2, 0.1, 1.2, 3.2 ,0.2, 0.1, 0.4, 0.5, 0.1]
Output:
[0.0, 0.0, 0.0, 1.0, 0.0, 1.3, 3.2 ,0.0, 0.0, 0.0, 1.2, 0.1]

A faster approach than appending each interim value to the final list:
lst = [0.2, 0.4, 0.2, 0.2, 0.1, 1.2, 3.2, 0.2, 0.1, 0.4, 0.5, 0.1]
res = [0] * len(lst)  # initial zeros
L_size, t = len(lst), 0
for i, n in enumerate(lst):
    t += n
    if t >= 1 or i == L_size - 1:
        res[i] = t
        t = 0
print(res)
[0, 0, 0, 1.0, 0, 1.3, 3.2, 0, 0, 0, 1.2000000000000002, 0.1]

A list comprehension:
s = 0.0
res = [
    0.0 if (s := s + x if s < 1.0 else x) < 1.0 else s
    for x in lst
]
res[-1] = s
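The same running totals can also be produced with itertools.accumulate; this is only a sketch for comparison and is not part of the benchmark below:
from itertools import accumulate

lst = [0.2, 0.4, 0.2, 0.2, 0.1, 1.2, 3.2, 0.2, 0.1, 0.4, 0.5, 0.1]

# Running totals that restart once the previous total has reached the threshold.
totals = list(accumulate(lst, lambda t, x: x if t >= 1.0 else t + x))
res = [t if t >= 1.0 else 0.0 for t in totals]
res[-1] = totals[-1]  # keep the final partial sum, as above
print(res)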
Benchmark with ~1.8 million values, times multiplied by 10 to estimate for your 18 million:
1.74 ± 0.06 seconds Kelly
3.32 ± 0.10 seconds Roman
3.56 ± 0.10 seconds Roman_Andrej
5.17 ± 0.07 seconds mozway
Benchmark code (Attempt This Online!):
from timeit import timeit
from statistics import mean, stdev

def mozway(l):
    total = 0
    out = []
    for i, n in enumerate(l):
        new_total = total + n
        if new_total >= 1 or i + 1 == len(l):
            out.append(new_total)
            total = 0
        else:
            out.append(0)
            total = new_total
    return out

def Roman(lst):
    res = [0] * len(lst)
    L_size, t = len(lst), 0
    for i, n in enumerate(lst):
        t += n
        if t >= 1 or i == L_size - 1:
            res[i] = t
            t = 0
    return res

def Roman_Andrej(lst):
    L_size, t = len(lst), 0
    for i, n in enumerate(lst):
        t += n
        if t >= 1 or i == L_size - 1:
            lst[i] = t
            t = 0
        else:
            lst[i] = 0
    return lst

def Kelly(lst):
    s = 0.0
    res = [
        0.0 if (s := s + x if s < 1.0 else x) < 1.0 else s
        for x in lst
    ]
    res[-1] = s
    return res

funcs = mozway, Roman, Roman_Andrej, Kelly

lst = [0.2, 0.4, 0.2, 0.2, 0.1, 1.2, 3.2, 0.2, 0.1, 0.4, 0.5, 0.1]
exp = [0.0, 0.0, 0.0, 1.0, 0.0, 1.3, 3.2, 0.0, 0.0, 0.0, 1.2, 0.1]
for f in funcs:
    res = [round(x, 6) for x in f(lst[:])]
    print(res == exp)

times = {f: [] for f in funcs}

def stats(f):
    ts = sorted(times[f])[:5]
    return f'{mean(ts):4.2f} ± {stdev(ts):4.2f} seconds '

lst *= 1800000 // len(lst)
for _ in range(10):
    for f in funcs:
        copy = lst[:]
        t = timeit(lambda: f(copy), number=1) * 10
        times[f].append(t)

for f in sorted(funcs, key=stats):
    print(stats(f), f.__name__)
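If pure Python is still too slow for the full 18 million records, the same loop compiles almost unchanged with numba, in the spirit of the numba answers in the related thread further below. This is only a sketch (the function name is made up here, and it was not part of the benchmark above):
import numpy as np
from numba import njit

@njit
def threshold_sum_nb(values, threshold):
    # Same reset-at-threshold logic as Roman's loop, compiled by numba.
    n = len(values)
    res = np.zeros(n)
    t = 0.0
    for i in range(n):
        t += values[i]
        if t >= threshold or i == n - 1:
            res[i] = t
            t = 0.0
    return res

arr = np.array([0.2, 0.4, 0.2, 0.2, 0.1, 1.2, 3.2, 0.2, 0.1, 0.4, 0.5, 0.1])
print(threshold_sum_nb(arr, 1.0))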

Not sure what you mean by "fast compilation way":
l = [0.2, 0.4, 0.2, 0.2, 0.1, 1.2, 3.2, 0.2, 0.1, 0.4, 0.5, 0.1]
total = 0
out = []
for i, n in enumerate(l):
    new_total = total + n
    if new_total >= 1 or i + 1 == len(l):
        out.append(new_total)
        total = 0
    else:
        out.append(0)
        total = new_total
Output:
[0, 0, 0, 1.0, 0, 1.3, 3.2, 0, 0, 0, 1.2000000000000002, 0.1]
Running time for 18K values:
8.16 ms ± 296 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
precision
If you need precise floating point operations you might want to use Decimal:
from decimal import Decimal
from math import isclose

total = 0
out = []
for i, n in enumerate(map(Decimal, l)):
    new_total = total + n
    if new_total >= 1 or isclose(new_total, 1) or i + 1 == len(l):
        out.append(float(new_total))
        total = 0
    else:
        out.append(0)
        total = new_total
Output:
[0, 0, 0, 1.0, 0, 1.3, 3.2, 0, 0, 0, 1.2, 0.1]
Running time for 18K values:
49.5 ms ± 2.93 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)

Related

Numpy cumsum with boundary conditions to the sum [duplicate]

I have a numpy/pandas list of values:
a = np.random.randint(-100, 100, 10000)
b = a/100
I want to apply a custom cumsum function, but I haven't found a way to do it without loops. The custom function sets an upper limit of 1 and lower limit of -1 for the cumsum values, if the "add" to sum is beyond these limits the "add" becomes 0.
In the case that sum is between the limits of -1 and 1 but the "added" value would break beyond the limits, the "added" becomes the remainder to -1 or 1.
Here is the loop version:
def cumsum_with_limits(values):
    cumsum_values = []
    sum = 0
    for i in values:
        if sum + i <= 1 and sum + i >= -1:
            sum += i
            cumsum_values.append(sum)
        elif sum + i >= 1:
            d = 1 - sum  # Remainder to 1
            sum += d
            cumsum_values.append(sum)
        elif sum + i <= -1:
            d = -1 - sum  # Remainder to -1
            sum += d
            cumsum_values.append(sum)
    return cumsum_values
Is there any way to vectorize this? I need to run this function on large datasets and performance is my current issue. Appreciate any help!
Update: Fixed the code a bit, and a little clarification for the outputs:
Using np.random.seed(0), the first 6 values are:
b = [0.72, -0.53, 0.17, 0.92, -0.33, 0.95]
Expected output:
o = [0.72, 0.19, 0.36, 1, 0.67, 1]
Loops aren't necessarily undesirable. If performance is an issue, consider numba. There's a ~330x improvement without materially changing your logic:
import numpy as np
from numba import njit

np.random.seed(0)
a = np.random.randint(-100, 100, 10000)
b = a / 100

@njit
def cumsum_with_limits_nb(values):
    n = len(values)
    res = np.empty(n)
    sum_val = 0
    for i in range(n):
        x = values[i]
        if (sum_val + x <= 1) and (sum_val + x >= -1):
            res[i] = x
            sum_val += x
        elif sum_val + x >= 1:
            d = 1 - sum_val  # Remainder to 1
            res[i] = d
            sum_val += d
        elif sum_val + x <= -1:
            d = -1 - sum_val  # Remainder to -1
            res[i] = d
            sum_val += d
    return res
assert np.isclose(cumsum_with_limits(b), cumsum_with_limits_nb(b)).all()
If you don't mind sacrificing some performance, you can rewrite this loop more succinctly:
@njit
def cumsum_with_limits_nb2(values):
    n = len(values)
    res = np.empty(n)
    sum_val = 0
    for i in range(n):
        x = values[i]
        next_sum = sum_val + x
        if np.abs(next_sum) >= 1:
            x = np.sign(next_sum) - sum_val
        res[i] = x
        sum_val += x
    return res
With similar performance to nb2, here's an alternative (thanks to @jdehesa):
@njit
def cumsum_with_limits_nb3(values):
    n = len(values)
    res = np.empty(n)
    sum_val = 0
    for i in range(n):
        x = min(max(sum_val + values[i], -1), 1) - sum_val
        res[i] = x
        sum_val += x
    return res
Performance comparisons:
assert np.isclose(cumsum_with_limits(b), cumsum_with_limits_nb(b)).all()
assert np.isclose(cumsum_with_limits(b), cumsum_with_limits_nb2(b)).all()
assert np.isclose(cumsum_with_limits(b), cumsum_with_limits_nb3(b)).all()
%timeit cumsum_with_limits(b) # 12.5 ms per loop
%timeit cumsum_with_limits_nb(b) # 40.9 µs per loop
%timeit cumsum_with_limits_nb2(b) # 54.7 µs per loop
%timeit cumsum_with_limits_nb3(b) # 54 µs per loop
Start with a regular cumsum:
b = ...
s = np.cumsum(b)
Find the first clip point:
i = np.argmax((s[0:] > 1) | (s[0:] < -1))
Adjust everything that follows:
s[i:] += (np.sign(s[i]) - s[i])
Rinse and repeat. This still requires a loop, but only over the adjustment points, which is generally expected to be a much smaller number than the total array size.
b = ...
s = np.cumsum(b)
while True:
    i = np.argmax((s[0:] > 1) | (s[0:] < -1))
    if np.abs(s[i]) <= 1:
        break
    s[i:] += (np.sign(s[i]) - s[i])
I still haven't found a way to completely pre-compute the adjustment points up front, so I would have to guess that the numba solution will be faster than this, even if you compiled this with numba.
Starting with np.random.seed(0), your original example has 3090 adjustment points, which is approximately 1/3 of the array. Unfortunately, with all the temporary arrays and extra sums, that makes the algorithmic complexity of my solution tend to O(n²). This is completely unacceptable.
I thought I had already answered the generic question of "cumulative sum with bounds" in the past, but I can't find it.
This solution also uses numba and is a bit more general (custom bounds) and concise than the ones given by @jpp.
It operates on the OP's problem (10K values, bounds at -1, 1) in 40 µs.
import numpy as np
from numba import njit

@njit
def cumsum_clip(a, xmin=-np.inf, xmax=np.inf):
    res = np.empty_like(a)
    c = 0
    for i in range(len(a)):
        c = min(max(c + a[i], xmin), xmax)
        res[i] = c
    return res
Example
np.random.seed(0)
x = np.random.randint(-100, 100, 10_000) / 100
>>> x[:6]
array([ 0.72, -0.53, 0.17, 0.92, -0.33, 0.95])
>>> cumsum_clip(x, -1, 1)[:6]
array([0.72, 0.19, 0.36, 1. , 0.67, 1. ])
%timeit cumsum_clip(x, -1, 1)
39.3 µs ± 31 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
Note: you can specify other bounds, e.g.:
>>> cumsum_clip(x, 0, 1)[:10]
array([0.72, 0.19, 0.36, 1. , 0.67, 1. , 1. , 0.09, 0. , 0. ])
Or omit one of the bounds (for example here specifying only an upper bound):
>>> cumsum_clip(x, xmax=1)[:10]
array([ 0.72, 0.19, 0.36, 1. , 0.67, 1. , 1. , 0.09, -0.7 , -1.34])
Of course, it preserves the original dtype:
np.random.seed(0)
x = np.random.randint(-10, 10, 10)
>>> cumsum_clip(x, 0, 10)
array([ 2, 7, 0, 0, 0, 0, 0, 9, 10, 4])
>>> cumsum_clip(x, 0, 10).dtype
dtype('int64')

How to use np.cumsum to speed up mean average precision calculation?

I have,
scores = np.array([[0.9, 0.8, 0.6, 0.5, 0.4], [0.5, 0.4, 0.31, 0.21, 0.4 ]])
labels = np.array([[1, 0, 1, 1, 0], [0, 0, 0, 1, 1]])
I want to calculate MAP at K (mean average precision at K), for which I wrote the algorithm below:
k = 3
mean_ap = 0
n = len(scores)
for i in range(n):
    cum = ap = 0.0
    idx = np.argsort(-scores[i])
    used_label = labels[i][idx][:k]
    m = sum(labels[i])
    for j, label in enumerate(used_label):
        cum += label
        ap += cum * label / (j + 1)
    mean_ap += ap / min(m, k)
val = mean_ap / n
It basically computes the following (this is mean_ap before the final division by n):
(1 + 0 + 2/3) / 3 + (0 + 0 + 1/3) / 2
Any suggestions on how I could use np.cumsum to speed up my algorithm?
Or is it already optimized, with no room for improvement here?
Thanks in advance.
Hope this can help you (I tried to avoid for loops):
k = 3
n = len(scores)
m = labels.sum(axis=1)
idx = np.argsort(-scores)
used_label = labels[:, idx][np.arange(0, n), np.arange(0, n), :k]
val = (np.cumsum(used_label, axis=1) * used_label
       / np.arange(1, k + 1)
       / np.min([m, np.repeat(k, n)], axis=0).reshape(-1, 1)).sum(axis=1).sum() / n
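As a quick sanity check (a sketch that simply re-runs both versions shown above on the example arrays), the vectorized value should match the loop result:
import numpy as np

scores = np.array([[0.9, 0.8, 0.6, 0.5, 0.4], [0.5, 0.4, 0.31, 0.21, 0.4]])
labels = np.array([[1, 0, 1, 1, 0], [0, 0, 0, 1, 1]])
k = 3
n = len(scores)

# Loop version from the question.
mean_ap = 0
for i in range(n):
    cum = ap = 0.0
    idx = np.argsort(-scores[i])
    used_label = labels[i][idx][:k]
    m = sum(labels[i])
    for j, label in enumerate(used_label):
        cum += label
        ap += cum * label / (j + 1)
    mean_ap += ap / min(m, k)
val_loop = mean_ap / n

# Vectorized version from the answer.
m = labels.sum(axis=1)
idx = np.argsort(-scores)
used_label = labels[:, idx][np.arange(0, n), np.arange(0, n), :k]
val_vec = (np.cumsum(used_label, axis=1) * used_label
           / np.arange(1, k + 1)
           / np.min([m, np.repeat(k, n)], axis=0).reshape(-1, 1)).sum(axis=1).sum() / n

assert np.isclose(val_loop, val_vec)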

convert/map python dictionary to a list, by using the key as the list index

I have a python dictionary as follows:
dict = {4:0.65,8:1.23,3:0.43}
I would like to convert this to a python list by using the key as the index to the list. The desired converted result would be:
listLength = 10
plist = [0,0,0,0.43,0.65,0,0,0,1.23,0]
I know how to do this using a loop, but that is not Pythonic and it is not fast. What is the most Pythonic way to do it without using a loop?
I especially need the best possible performance.
Since you tagged pandas, here is a solution using reindex:
pd.Series(d).reindex(range(10),fill_value=0).tolist()
Out[369]: [0.0, 0.0, 0.0, 0.43, 0.65, 0.0, 0.0, 0.0, 1.23, 0.0]
Using numpy and numpy indexing is going to be the most performant solution:
out = np.zeros(10)
out[list(d.keys())] = list(d.values())
array([0. , 0. , 0. , 0.43, 0.65, 0. , 0. , 0. , 1.23, 0. ])
Performance since you asked:
k = np.random.randint(1, 100000, 10000)
v = np.random.rand(10000)
d = dict(zip(k, v))
In [119]: %%timeit
...: out = np.zeros(100000)
...: out[list(d.keys())] = list(d.values())
...:
...:
1.86 ms ± 13.9 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
In [120]: %timeit [d.get(i, 0) for i in range(100000)]
17.4 ms ± 231 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
In [121]: %timeit pd.Series(d).reindex(range(100000),fill_value=0).tolist()
9.77 ms ± 148 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
For larger data sets you can gain some speed using np.fromiter directly on the key and value iterators instead of creating lists first.
Create test case
>>> d = dict(zip(np.random.randint(1, 10, 1_000_000).cumsum(), np.arange(1_000_000.)))
>>> out = np.zeros(10_000_000)
Define fromiter method
>>> def use_iter():
... k, v = (np.fromiter(w, dtype=t, count=len(d)) for w, t in [(d.keys(), int), (d.values(), float)])
... out[k] = v
... return out
and list method for reference
>>> def use_list():
... out[list(d.keys())] = list(d.values())
... return out
and time them
>>> timeit(use_iter, number=100)
4.2583943260106025
>>> timeit(use_list, number=100)
17.10310926999955
Also, check correctness
>>> np.all(use_list() == use_iter())
True
Avoid shadowing the built-in dict. Use some other name instead.
dict_ = {4: 0.65, 8: 1.23, 3: 0.43}
length = max(dict_) + 1  # Get number of entries needed
list_ = [0] * length     # Initialize a list of zeroes
for i in dict_:
    list_[i] = dict_[i]
Using list comprehension
lst = [d[i] if i in d else 0 for i in range(10)]
print(lst)
# [0, 0, 0, 0.43, 0.65, 0, 0, 0, 1.23, 0]
Expanded:
lst = []
for i in range(10):
    if i in d:
        lst.append(d[i])
    else:
        lst.append(0)
You could do something like this:
list_length = 10
d = {4: 0.65, 8: 1.23, 3: 0.43}
plist = [d.get(i, 0) for i in range(list_length)]
print(plist)
Output
[0, 0, 0, 0.43, 0.65, 0, 0, 0, 1.23, 0]
Note: Don't use the name dict for your own variables, you will shadow the built-in name dict.
You can just iterate over the dictionary and place the values into a list. The check below makes sure each key is within the specified list length.
length = 10  # desired list length
list = [0] * length
for key, val in d.items():
    if key < length:
        list[key] = val
If you want the list to be as big as the max key, do the following instead:
maxKey = max(d, key=int)
list = [0] * (maxKey + 1)  # one slot per index up to and including maxKey
for key, val in d.items():
    list[key] = val
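For reference, a quick usage check with the question's data (a sketch of the first snippet above, using list_ to avoid shadowing the built-in list):
d = {4: 0.65, 8: 1.23, 3: 0.43}
length = 10
list_ = [0] * length
for key, val in d.items():
    if key < length:
        list_[key] = val
print(list_)  # [0, 0, 0, 0.43, 0.65, 0, 0, 0, 1.23, 0]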

Python : How to change from a like impulse list to a like saw tooth list

I want to change the list from an impulse-like list to a sawtooth-like list.
Please tell me how to write it in an effective way.
# input list
list_a = [0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0]
# output list
list_b = [0.0, 0.0, 1.0, 0.9, 0.8, 0.7, 1.0, 0.9, 0.8, 1.0, 0.9]
Normal code
I want to know a more effective way to write the code below,
e.g. a list comprehension.
list_b = []
for i, val in enumerate(list_a):
    if i == 0:
        list_b.append(val)
    elif val == 1:
        list_b.append(float(val))
    elif list_b[-1] >= 0.1:
        list_b.append(list_b[-1] - 0.1)
    else:
        # do not let list_b go below zero
        list_b.append(0.0)
You can try this:
import itertools

list_a = [0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0]

new_data = list(map(lambda x: x[-1],
                    [(a, list(b)) for a, b in itertools.groupby(list_a, key=lambda x: x == 1)]))
final_data = map(float, list(itertools.chain.from_iterable(
    [list(map(lambda x: 0.0 if x < 0 else x, [1 - (0.1 * c) for c in range(1, len(a) + 1)]))
     if a[0] == 0 and new_data[i - 1][0] == 1 and i > 0
     else a
     for i, a in enumerate(new_data)])))
Output:
[0.0, 0.0, 1.0, 0.9, 0.8, 0.7, 1.0, 0.9, 0.8, 1.0, 0.9]
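A shorter formulation (a sketch, not from the answer above) carries the previous value along with itertools.accumulate; apart from the usual floating-point representation noise (e.g. 0.7000000000000001 instead of 0.7, which the question's loop produces as well) it yields the same sawtooth:
from itertools import accumulate

list_a = [0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0]

# Reset to 1.0 on each impulse, otherwise decay the previous value by 0.1 (not below 0).
list_b = list(accumulate(map(float, list_a),
                         lambda prev, x: 1.0 if x == 1 else max(prev - 0.1, 0.0)))
print(list_b)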

python multiply list elements in a changing manner

I want to decay the elements of a list such that every 5 elements the values are halved. For example, a list of ones with length 15 will become:
[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
[1,1,1,1,1,0.5,0.5,0.5,0.5,0.5,0.25,0.25,0.25,0.25,0.25]
I tried list comprehensions and a basic for loop, but I couldn't construct the logic behind it.
Is this what you're looking for?
>>> x = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
>>> r = [v*2**(-(i//5)) for i, v in enumerate(x)]
>>> r
[1, 1, 1, 1, 1, 0.5, 0.5, 0.5, 0.5, 0.5, 0.25, 0.25, 0.25, 0.25, 0.25]
>>>
Think simple.
value = 1
result = []
for i in range(3):
    for j in range(5):
        result.append(value)
    else:
        value /= 2
print(result)
# [1, 1, 1, 1, 1, 0.5, 0.5, 0.5, 0.5, 0.5, 0.25, 0.25, 0.25, 0.25, 0.25]
All the other answers are great; I would like to add a stretched-out solution for this.
start_range = 0
end_range = 5
num = 1
x = [1 for _ in range(10)]
res = []
while start_range <= len(x):
    for item in x[start_range:end_range]:
        res.append(item * num)
    start_range = end_range
    end_range = start_range + 5
    num /= float(2)
print(res)
# output: [1, 1, 1, 1, 1, 0.5, 0.5, 0.5, 0.5, 0.5]
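For long lists, the same index-based decay used in the comprehension answer above vectorizes naturally with numpy (a sketch, assuming numpy is acceptable for this task):
import numpy as np

x = [1] * 15
decayed = np.array(x) * 2.0 ** -(np.arange(len(x)) // 5)
print(decayed.tolist())
# [1.0, 1.0, 1.0, 1.0, 1.0, 0.5, 0.5, 0.5, 0.5, 0.5, 0.25, 0.25, 0.25, 0.25, 0.25]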
