Vectorized equivalent of dict.get - python

I'm looking for the functionality that operates like such
lookup_dict = {5:1.0, 12:2.0, 39:2.0...}
# this is the missing magic:
lookup = vectorized_dict(lookup_dict)
x = numpy.array([5.0, 59.39, 39.49...])
xbins = numpy.trunc(x).astype(numpy.int_)
y = lookup.get(xbins, 0.0)
# the idea is that we get this as the postcondition:
for (result, input) in zip(y, xbins):
assert(result==lookup_dict.get(input, 0.0))
Is there some flavor of sparse array in numpy (or scipy) that gets at this kind of functionality?
The full context is that I'm binning some samples of a 1-D feature.

As far as I know, numpy does not support different data types in the same array structures but you can achieve a similar result if you are willing to separate keys from values and maintain the keys (and corresponding values) in sorted order:
import numpy as np
keys = np.array([5,12,39])
values = np.array([1.0, 2.0, 2.0])
valueOf5 = values[keys.searchsorted(5)] # 2.0
k = np.array([5,5,12,39,12])
values[keys.searchsorted(k)] # array([1., 1., 2., 2., 2.])
This may not be as efficient as a hashing key but it does support the propagation of indirections from arrays with any number of dimensions.
note that this assumes your keys are always present in the keys array. If not, rather than an error, you could be getting the value from the next key up.

Using np.select to create boolean masks over the array, ([xbins == k for k in lookup_dict]), the values from the dict (lookup_dict.values()), and a default value of 0:
y = np.select(
[xbins == k for k in lookup_dict],
lookup_dict.values(),
0.0
)
# In [17]: y
# Out[17]: array([1., 0., 2.])
This assumes that the dictionary is sorted, I'm not sure what the behaviour would be below python 3.6.
OR overkill with pandas:
import pandas as pd
s = pd.Series(xbins)
s = s.map(lookup_dict).fillna(0)

Another approach is to use searchsorted to search a numpy array which has the integer 'keys' and returns the initially loaded value in the range n <= x < n+1. This may be useful to somebody asking the a similar question in the future.
import numpy as np
class NpIntDict:
""" Class to simulate a python dict get for a numpy array. """
def __init__( self, dict_in, default = np.nan ):
""" dict_in: a dictionary with integer keys.
default: the value to be returned for keys not in the dictionary.
defaults to np.nan
default must be consistent with the dtype of values
"""
# Create list of dict items sorted by key.
list_in = sorted([ item for item in dict_in.items() ])
# Create three empty lists.
key_list = []
val_list = []
is_def_mask = []
for key, value in list_in:
key = int(key)
if not key in key_list: # key not yet in key list
# Update the three lists for key as default.
key_list.append( key )
val_list.append( default )
is_def_mask.append( True )
# Update the lists for key+1. With searchsorted this gives the required results.
key_list.append( key + 1 )
val_list.append( value )
is_def_mask.append( False )
# Add the key > max(key) to the val and is_def_mask lists.
val_list.append( default )
is_def_mask.append( True )
self.keys = np.array( key_list, dtype = np.int )
self.values = np.array( val_list )
self.default_mask = np.array( is_def_mask )
def set_default( self, default = 0 ):
""" Set the default to a new default value. Using self.default_mask.
Changes the default value for all future self.get(arr).
"""
self.values[ self.default_mask ] = default
def get( self, arr, default = None ):
""" Returns an array looking up the values in `arr` in the dict.
default can be used to change the default value returned for this get only.
"""
if default is None:
values = self.values
else:
values= self.values.copy()
values[ self.default_mask ] = default
return values[ np.searchsorted( self.keys, arr, side = 'right' ) ]
# side = 'right' to ensure key[ix] <= x < key[ix+1]
# side = 'left' would mean key[ix] < x <= key[ix+1]
This could be simplified if there's no requirement to change the default returned after the NpIntDict is created.
To test it.
d = { 2: 5.1, 3: 10.2, 5: 47.1, 8: -6}
# x <2 Return default
# 2 <= x <3 return 5.1
# 3 <= x < 4 return 10.2
# 4 <= x < 5 return default
# 5 <= x < 6 return 47.1
# 6 <= x < 8 return default
# 8 <= x < 9 return -6.
# 9 <= x return default
test = NpIntDict( d, default = 0.0 )
arr = np.arange( 0., 100. ).reshape(10,10)/10
print( arr )
"""
[[0. 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9]
[1. 1.1 1.2 1.3 1.4 1.5 1.6 1.7 1.8 1.9]
[2. 2.1 2.2 2.3 2.4 2.5 2.6 2.7 2.8 2.9]
[3. 3.1 3.2 3.3 3.4 3.5 3.6 3.7 3.8 3.9]
[4. 4.1 4.2 4.3 4.4 4.5 4.6 4.7 4.8 4.9]
[5. 5.1 5.2 5.3 5.4 5.5 5.6 5.7 5.8 5.9]
[6. 6.1 6.2 6.3 6.4 6.5 6.6 6.7 6.8 6.9]
[7. 7.1 7.2 7.3 7.4 7.5 7.6 7.7 7.8 7.9]
[8. 8.1 8.2 8.3 8.4 8.5 8.6 8.7 8.8 8.9]
[9. 9.1 9.2 9.3 9.4 9.5 9.6 9.7 9.8 9.9]]
"""
print( test.get( arr ) )
"""
[[ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ]
[ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ]
[ 5.1 5.1 5.1 5.1 5.1 5.1 5.1 5.1 5.1 5.1]
[10.2 10.2 10.2 10.2 10.2 10.2 10.2 10.2 10.2 10.2]
[ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ]
[47.1 47.1 47.1 47.1 47.1 47.1 47.1 47.1 47.1 47.1]
[ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ]
[ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ]
[-6. -6. -6. -6. -6. -6. -6. -6. -6. -6. ]
[ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ]]
"""
This could be amended to raise an exception if any of the arr elements aren't in the key list. For me returning a default would be more useful.

Related

How do I mask only the output (labelled data). I don't have any problem in input data

I have so many Nan values in my output data and I padded those values with zeros. Please don't suggest me to delete Nan or impute with any other no. I want model to skip those nan positions.
example:
x = np.arange(0.5, 30)
x.shape = [10, 3]
x = [[ 0.5 1.5 2.5]
[ 3.5 4.5 5.5]
[ 6.5 7.5 8.5]
[ 9.5 10.5 11.5]
[12.5 13.5 14.5]
[15.5 16.5 17.5]
[18.5 19.5 20.5]
[21.5 22.5 23.5]
[24.5 25.5 26.5]
[27.5 28.5 29.5]]
y = np.arange(2, 10, 0.8)
y.shape = [10, 1]
y[4, 0] = 0.0
y[6, 0] = 0.0
y[7, 0] = 0.0
y = [[2. ]
[2.8]
[3.6]
[4.4]
[0. ]
[6. ]
[0. ]
[0. ]
[8.4]
[9.2]]
I expect keras deep learning model to predict zeros for 5th, 7th and 8th row as similar to the padded value in 'y'.

How to extract a DataFrame to obtain a nested array?

I have a sample DataFrame as below:
First column consists of 2 years, for each year, 2 track exist and each track includes pairs of longitude and latitude coordinated. How can I extract every track for each year separately to obtain an array of tracks with lat and long?
df = pd.DataFrame(
{'year':[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1],
'track_number':[0,0,0,0,1,1,1,1,0,0,0,0,1,1,1,1],
'lat': [11.7,11.8,11.9,11.9,12.0,12.1,12.2,12.2,12.3,12.3,12.4,12.5,12.6,12.6,12.7,12.8],
'long':[-83.68,-83.69,-83.70,-83.71,-83.71,-83.73,-83.74,-83.75,-83.76,-83.77,-83.78,-83.79,-83.80,-83.81,-83.82,-83.83]})
You can groupby year and then extract a numpy.array from the created dataframes with .to_numpy().
>>> years = []
>>> for _, df2 in df.groupby(["year"]):
years.append(df2.to_numpy()[:, 1:])
>>> years[0]
array([[ 0. , 11.7 , -83.68],
[ 0. , 11.8 , -83.69],
[ 0. , 11.9 , -83.7 ],
[ 0. , 11.9 , -83.71],
[ 1. , 12. , -83.71],
[ 1. , 12.1 , -83.73],
[ 1. , 12.2 , -83.74],
[ 1. , 12.2 , -83.75]])
>>> years[1]
array([[ 0. , 12.3 , -83.76],
[ 0. , 12.3 , -83.77],
[ 0. , 12.4 , -83.78],
[ 0. , 12.5 , -83.79],
[ 1. , 12.6 , -83.8 ],
[ 1. , 12.6 , -83.81],
[ 1. , 12.7 , -83.82],
[ 1. , 12.8 , -83.83]])
Where years[0] would have the desired information for the year 0. And so on. Inside the array, the positions of the original dataframe are preserved. That is, the first element is the track; the second, the latitude, and the third, the longitude.
If you wish to do the same for the track, i.e, have an array of only latitude and longitude, you can groupby(["year", "track_number"]) as well.

Finding shortest sublist with sum greater than 50

I have a list and I want to find shortest sublist with sum greater than 50.
For example my list is
[8.4 , 10.3 , 12.9 , 8.2 , 13.7 , 11.2 , 11.3 ,10.4 , 4.2 , 3.3 , 4.0 , 2.1]
and I want to find shortest sublist so that its sum is more than 50.
Output Should be like [12.9 , 13.7 , 11.2 , 11.3, 10.4]
this is way bad solution (in term of not doing all graph serach and find optimum values ), but solution is correct
lis =[8.4 , 10.3 , 12.9 , 8.2 , 13.7 , 11.2 , 11.3 ,10.4 , 4.2 , 3.3 , 4.0 , 2.1]
from collections import defaultdict
dic = defaultdict(list)
for i in range(len(lis)):
dic[lis[i]]+=[i]
tmp_lis = lis.copy()
tmp_lis.sort(reverse=True)
res =[]
for i in tmp_lis:
if sum(res)>50 :
break
else:
res.append(i)
res1 = [(i,dic[i]) for i in res]
res1.sort(key=lambda x:x[1])
solution =[i[0] for i in res1]
output
[12.9, 13.7, 11.2, 11.3, 10.4]
O(n) solution for list of positive numbers
Provided your list cannot contain negative numbers, then there is a linear solution using two-pointers traversal.
Track the sum between both pointers. Increment the right pointer whenever the sum is below 50 and increment the left one otherwise.
This provides a sequence of pointers within which you will find the ones with minimal distance. It suffices to use min to get the smallest interval out of those.
Due to the behaviour of min, this will return the left-most sublist with minimal length if more than one solution exists.
Code
def intervals_generator(lst, bound):
i, j = 0, 0
sum_ = 0
while True:
try:
if sum_ <= bound:
sum_ += lst[j]
j += 1
else:
yield i, j
sum_ -= lst[i]
i += 1
except IndexError:
break
def smallest_sub_list(lst, bound):
i, j = min(intervals_generator(lst, bound), key=lambda x: x[1] - x[0])
return lst[i:j]
Examples
lst = [8.4 , 10.3 , 12.9 , 8.2 , 13.7 , 11.2 , 11.3 ,10.4 , 4.2 , 3.3 , 4.0 , 2.1]
print(smallest_sub_list(lst, 50)) # [8.4, 10.3, 12.9, 8.2, 13.7]
lst = [0, 10, 45, 55]
print(smallest_sub_list(lst, 50)) # [55]
Solution for general list of numbers
If the list can contain negative numbers then the above will not work and I believe there exists no solution more efficient than to iterate over all possible sublists.
Sort it in descending order and sum the first elements until you hit +50.0.
myList = [8.4 , 10.3 , 12.9 , 8.2 , 13.7 , 11.2 , 11.3 ,10.4 , 4.2 , 3.3 , 4.0 , 2.1]
mySublist = []
for i in sorted(myList, reverse=True):
mySublist.append(i)
if sum(mySublist) > 50.0:
break
print mySublist # [13.7, 12.9, 11.3, 11.2, 10.4]
Considering that what you want is the smallest sublist in size, and not the smallest in sum value.
If you are searching for any shortest sublist, this can be a solution (maybe to be optimized):
lst = [8.4 , 10.3 , 12.9 , 8.2 , 13.7 , 11.2 , 11.3 , 10.4 , 4.2 , 3.3 , 4.0 , 2.1]
def find_sub(lst, limit=50):
for l in range(1, len(lst)+1):
for i in range(len(lst)-l+1):
sub = lst[i:i+l]
if sum(sub) > limit:
return sub
>>> print(find_sub(lst))
Output:
[8.4, 10.3, 12.9, 8.2, 13.7]

Meshgrid of z values that match x and y meshgrid values

Edit: Original question was flawed but I am leaving it here for reasons of transparency.
Original:
I have some x, y, z data where x and y are coordinates of a 2D grid and z is a scalar value corresponding to (x, y).
>>> import numpy as np
>>> # Dummy example data
>>> x = np.arange(0.0, 5.0, 0.5)
>>> y = np.arange(1.0, 2.0, 0.1)
>>> z = np.sin(x)**2 + np.cos(y)**2
>>> print "x = ", x, "\n", "y = ", y, "\n", "z = ", z
x = [ 0. 0.5 1. 1.5 2. 2.5 3. 3.5 4. 4.5]
y = [ 1. 1.1 1.2 1.3 1.4 1.5 1.6 1.7 1.8 1.9]
z = [ 0.29192658 0.43559829 0.83937656 1.06655187 0.85571064 0.36317266
0.02076747 0.13964978 0.62437081 1.06008127]
Using xx, yy = np.meshgrid(x, y) I can get two grids containing x and y values corresponding to each grid position.
>>> xx, yy = np.meshgrid(x, y)
>>> print xx
[[ 0. 0.5 1. 1.5 2. 2.5 3. 3.5 4. 4.5]
[ 0. 0.5 1. 1.5 2. 2.5 3. 3.5 4. 4.5]
[ 0. 0.5 1. 1.5 2. 2.5 3. 3.5 4. 4.5]
[ 0. 0.5 1. 1.5 2. 2.5 3. 3.5 4. 4.5]
[ 0. 0.5 1. 1.5 2. 2.5 3. 3.5 4. 4.5]
[ 0. 0.5 1. 1.5 2. 2.5 3. 3.5 4. 4.5]
[ 0. 0.5 1. 1.5 2. 2.5 3. 3.5 4. 4.5]
[ 0. 0.5 1. 1.5 2. 2.5 3. 3.5 4. 4.5]
[ 0. 0.5 1. 1.5 2. 2.5 3. 3.5 4. 4.5]
[ 0. 0.5 1. 1.5 2. 2.5 3. 3.5 4. 4.5]]
>>> print yy
[[ 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. ]
[ 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1]
[ 1.2 1.2 1.2 1.2 1.2 1.2 1.2 1.2 1.2 1.2]
[ 1.3 1.3 1.3 1.3 1.3 1.3 1.3 1.3 1.3 1.3]
[ 1.4 1.4 1.4 1.4 1.4 1.4 1.4 1.4 1.4 1.4]
[ 1.5 1.5 1.5 1.5 1.5 1.5 1.5 1.5 1.5 1.5]
[ 1.6 1.6 1.6 1.6 1.6 1.6 1.6 1.6 1.6 1.6]
[ 1.7 1.7 1.7 1.7 1.7 1.7 1.7 1.7 1.7 1.7]
[ 1.8 1.8 1.8 1.8 1.8 1.8 1.8 1.8 1.8 1.8]
[ 1.9 1.9 1.9 1.9 1.9 1.9 1.9 1.9 1.9 1.9]]
Now I want an array of the same shape for z, where the grid values correspond to the matching x and y values in the original data! But I cannot find an elegant, built-in solution where I do not need to re-grid the data, and I think I am missing some understanding of how I should approach it.
I have tried following this solution (with my real data, not this simple example data, but it should have the same result) but my final grid was not fully populated.
Please help!
Corrected question:
As was pointed out by commenters, my original dummy data was unsuitable for the question I am asking. Here is an improved version of the question:
I have some x, y, z data where x and y are coordinates of a 2D grid and z is a scalar value corresponding to (x, y). The data is read from a text file "data.txt":
#x y z
1.4 0.2 1.93164166734
1.4 0.3 1.88377897779
1.4 0.4 1.81946452501
1.6 0.2 1.9596778849
1.6 0.3 1.91181519535
1.6 0.4 1.84750074257
1.8 0.2 1.90890970517
1.8 0.3 1.86104701562
1.8 0.4 1.79673256284
2.0 0.2 1.78735230743
2.0 0.3 1.73948961789
2.0 0.4 1.67517516511
Loading the text:
>>> import numpy as np
>>> inFile = 'C:\data.txt'
>>> x, y, z = np.loadtxt(inFile, unpack=True, usecols=(0, 1, 2), comments='#', dtype=float)
>>> print x
[ 1.4 1.4 1.4 1.6 1.6 1.6 1.8 1.8 1.8 2. 2. 2. ]
>>> print y
[ 0.2 0.3 0.4 0.2 0.3 0.4 0.2 0.3 0.4 0.2 0.3 0.4]
>>> print z
[ 1.93164167 1.88377898 1.81946453 1.95967788 1.9118152 1.84750074
1.90890971 1.86104702 1.79673256 1.78735231 1.73948962 1.67517517]
Using xx, yy= np.meshgrid(np.unique(x), np.unique(y)) I can get two grids containing x and y values corresponding to each grid position.
>>> xx, yy= np.meshgrid(np.unique(x), np.unique(y))
>>> print xx
[[ 1.4 1.6 1.8 2. ]
[ 1.4 1.6 1.8 2. ]
[ 1.4 1.6 1.8 2. ]]
>>> print yy
[[ 0.2 0.2 0.2 0.2]
[ 0.3 0.3 0.3 0.3]
[ 0.4 0.4 0.4 0.4]]
Now each corresponding cell position in both xx and yy correspond to one of the original grid point locations.
I simply need an equivalent array where the grid values correspond to the matching z values in the original data!
"""e.g.
[[ 1.93164166734 1.9596778849 1.90890970517 1.78735230743]
[ 1.88377897779 1.91181519535 1.86104701562 1.73948961789]
[ 1.81946452501 1.84750074257 1.79673256284 1.67517516511]]"""
But I cannot find an elegant, built-in solution where I do not need to re-grid the data, and I think I am missing some understanding of how I should approach it. For example, using xx, yy, zz = np.meshgrid(x, y, z) returns three 3D arrays that I don't think I can use.
Please help!
Edit:
I managed to make this example work thanks to the solution from Jaime: Fill 2D numpy array from three 1D numpy arrays
>>> x_vals, x_idx = np.unique(x, return_inverse=True)
>>> y_vals, y_idx = np.unique(y, return_inverse=True)
>>> vals_array = np.empty(x_vals.shape + y_vals.shape)
>>> vals_array.fill(np.nan) # or whatever your desired missing data flag is
>>> vals_array[x_idx, y_idx] = z
>>> zz = vals_array.T
>>> print zz
But the code (with real input data) that led me on this path was still failing. I found the problem now. I have been using scipy.ndimage.zoom to resample my gridded data to a higher resolution before generating zz.
>>> import scipy.ndimage
>>> zoom = 2
>>> x = scipy.ndimage.zoom(x, zoom)
>>> y = scipy.ndimage.zoom(y, zoom)
>>> z = scipy.ndimage.zoom(z, zoom)
This produced an array containing many nan entries:
array([[ nan, nan, nan, ..., nan, nan, nan],
[ nan, nan, nan, ..., nan, nan, nan],
[ nan, nan, nan, ..., nan, nan, nan],
...,
[ nan, nan, nan, ..., nan, nan, nan],
[ nan, nan, nan, ..., nan, nan, nan],
[ nan, nan, nan, ..., nan, nan, nan]])
When I skip the zoom stage, the correct array is produced:
array([[-22365.93400183, -22092.31794674, -22074.21420168, ...,
-14513.89091599, -12311.97437017, -12088.07062786],
[-29264.34039242, -28775.79743097, -29021.31886353, ...,
-21354.6799064 , -21150.76555669, -21046.41225097],
[-39792.93758344, -39253.50249278, -38859.2562673 , ...,
-24253.36838785, -25714.71895023, -29237.74277727],
...,
[ 44829.24733543, 44779.37084337, 44770.32987311, ...,
21041.42652441, 20777.00408692, 20512.58162671],
[ 44067.26616067, 44054.5398901 , 44007.62587598, ...,
21415.90416488, 21151.48168444, 20887.05918082],
[ 43265.35371973, 43332.5983711 , 43332.21743471, ...,
21780.32283309, 21529.39770759, 21278.47255848]])

Rolling median in python

I have some stock data based on daily close values. I need to be able to insert these values into a python list and get a median for the last 30 closes. Is there a python library that does this?
In pure Python, having your data in a Python list a, you could do
median = sum(sorted(a[-30:])[14:16]) / 2.0
(This assumes a has at least 30 items.)
Using the NumPy package, you could use
median = numpy.median(a[-30:])
Have you considered pandas? It is based on numpy and can automatically associate timestamps with your data, and discards any unknown dates as long as you fill it with numpy.nan. It also offers some rather powerful graphing via matplotlib.
Basically it was designed for financial analysis in python.
isn't the median just the middle value in a sorted range?
so, assuming your list is stock_data:
last_thirty = stock_data[-30:]
median = sorted(last_thirty)[15]
Now you just need to get the off-by-one errors found and fixed and also handle the case of stock_data being less than 30 elements...
let us try that here a bit:
def rolling_median(data, window):
if len(data) < window:
subject = data[:]
else:
subject = data[-30:]
return sorted(subject)[len(subject)/2]
#found this helpful:
list=[10,20,30,40,50]
med=[]
j=0
for x in list:
sub_set=list[0:j+1]
median = np.median(sub_set)
med.append(median)
j+=1
print(med)
Here is a much faster method with w*|x| space complexity.
def moving_median(x, w):
shifted = np.zeros((len(x)+w-1, w))
shifted[:,:] = np.nan
for idx in range(w-1):
shifted[idx:-w+idx+1, idx] = x
shifted[idx+1:, idx+1] = x
# print(shifted)
medians = np.median(shifted, axis=1)
for idx in range(w-1):
medians[idx] = np.median(shifted[idx, :idx+1])
medians[-idx-1] = np.median(shifted[-idx-1, -idx-1:])
return medians[(w-1)//2:-(w-1)//2]
moving_median(np.arange(10), 4)
# Output
array([0.5, 1. , 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 8. ])
The output has the same length as the input vector.
Rows with less than one entry will be ignored and with half of them nans (happens only for an even window-width), only the first option will be returned. Here is the shifted_matrix from above with the respective median values:
[[ 0. nan nan nan] -> -
[ 1. 0. nan nan] -> 0.5
[ 2. 1. 0. nan] -> 1.0
[ 3. 2. 1. 0.] -> 1.5
[ 4. 3. 2. 1.] -> 2.5
[ 5. 4. 3. 2.] -> 3.5
[ 6. 5. 4. 3.] -> 4.5
[ 7. 6. 5. 4.] -> 5.5
[ 8. 7. 6. 5.] -> 6.5
[ 9. 8. 7. 6.] -> 7.5
[nan 9. 8. 7.] -> 8.0
[nan nan 9. 8.] -> -
[nan nan nan 9.]]-> -
The behaviour can be changed by adapting the final slice medians[(w-1)//2:-(w-1)//2].
Benchmark:
%%timeit
moving_median(np.arange(1000), 4)
# 267 µs ± 759 ns per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
Alternative approach: (the results will be shifted)
def moving_median_list(x, w):
medians = np.zeros(len(x))
for j in range(len(x)):
medians[j] = np.median(x[j:j+w])
return medians
%%timeit
moving_median_list(np.arange(1000), 4)
# 15.7 ms ± 115 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
Both algorithms have a linear time complexity.
Therefore, the function moving_median will be the faster option.

Categories