Python Series Sum Resample - python

Given a list [5,2,4,5,1,2,4,5], how can I do a sum resample like pandas' df.resample().sum() without the hassle of creating a DatetimeIndex?
inp = [5,2,4,5,1,2,4,5]
out = resample(inp, 2, how='sum')
>> [7, 9, 3, 9]
Note: This is because df.resample().sum() only accepts a datetime-like index. I have spent some time googling this topic but found nothing. Sorry if the same question already exists.
Edit:
A manual solution might look like this
import numpy as np
def resample_sum(inp, window):
    """Sum consecutive, non-overlapping windows of a 1-D sequence.

    Parameters
    ----------
    inp : array-like
        Values to resample; flattened to one dimension before summing.
    window : int
        Number of consecutive values per output element (must be >= 1).

    Returns
    -------
    numpy.ndarray
        One sum per window. When ``len(inp)`` is not a multiple of
        ``window`` the last element is the sum of the shorter trailing
        window instead of the call failing on the reshape.

    Raises
    ------
    ValueError
        If ``window`` is smaller than 1.
    """
    if window < 1:
        raise ValueError("window must be a positive integer")
    values = np.asarray(inp).ravel()
    if values.size == 0:
        # Nothing to sum; hand back the (empty) array unchanged.
        return values
    # reduceat sums values[start:next_start] for each start offset, so the
    # final window is simply whatever is left over.
    starts = np.arange(0, values.size, window)
    return np.add.reduceat(values, starts)

def resample(inp_list, window_size, how='sum'):
    """Aggregate consecutive, non-overlapping windows of a list.

    Parameters
    ----------
    inp_list : sequence
        Values to resample; must support ``len`` and slicing.
    window_size : int
        Number of consecutive values per window (must be >= 1). A shorter
        trailing window is aggregated as-is.
    how : str
        Aggregation to apply to each window: 'sum' (default), 'min',
        'max' or 'mean'.

    Returns
    -------
    list
        One aggregated value per window.

    Raises
    ------
    NotImplementedError
        If ``how`` is not a supported aggregation. This is checked before
        any work is done, so it is raised for empty input as well.
    ValueError
        If ``window_size`` is smaller than 1.
    """
    reducers = {
        'sum': sum,
        'min': min,
        'max': max,
        'mean': lambda window: float(sum(window)) / len(window),
    }
    if how not in reducers:
        raise NotImplementedError("unsupported how: {!r}".format(how))
    if window_size < 1:
        raise ValueError("window_size must be a positive integer")
    reduce_window = reducers[how]
    return [reduce_window(inp_list[start:start + window_size])
            for start in range(0, len(inp_list), window_size)]
inp = [5,2,4,5,1,2,4,5]
out = resample(inp, 2, how='sum')
#[7, 9, 3, 9]
Edit 1: A vectorized numpy array solution which will have better performance for a huge array. The idea is to reshape the array into a 2-d array where the rows of the new array are the values that should be summed together.
import numpy as np
def resample(inp_array, window_size, how='sum'):
    """Vectorised window aggregation of an array.

    The input is flattened, padded with zeros up to a multiple of
    ``window_size`` and reshaped so that each row holds one window; the
    rows are then reduced.

    Parameters
    ----------
    inp_array : array-like
        Values to resample.
    window_size : int
        Number of consecutive values per window.
    how : str
        Aggregation to apply; only 'sum' is implemented.

    Returns
    -------
    numpy.ndarray
        One sum per window. The padding uses the input's own dtype, so
        integer input gives integer sums whether or not padding was needed.

    Raises
    ------
    NotImplementedError
        If ``how`` is anything other than 'sum'.
    """
    if how != 'sum':
        raise NotImplementedError  # replace this with other how's you want
    flat = np.asarray(inp_array).ravel()
    # Number of zeros needed to make the length a multiple of window_size.
    pad = (window_size - (flat.size % window_size)) % window_size
    if pad > 0:
        # Zeros of the same dtype: float zeros would silently turn integer
        # input into float output.
        flat = np.concatenate([flat, np.zeros(pad, dtype=flat.dtype)])
    # One row per window, window_size columns; sum across the columns.
    windows = flat.reshape((flat.size // window_size, window_size))
    return windows.sum(axis=1)
inp = [5,2,4,5,1,2,4,5]
out = resample(inp, 2, how='sum')
#[7, 9, 3, 9]
Edit 2:
The closest thing to this I found in a popular library is skimage.measure.block_reduce.
You can treat your data as a 1-dimensional image, pass a block size, and pass the np.sum function.

Related

Compare 2 numpy arrays get positional difference

So i have two arrays as shown in the example below:
import os
import numpy as np
# tiA holds 20 time points; tiB holds 18 of them (0.3 and 0.9, scaled by
# 1000, do not appear in tiB).
tiA = np.array([0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1,1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8,1.9,2]) * 1000
tiB = np.array([0.1,0.2,0.4,0.5,0.6,0.7,0.8,1,1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8,1.9,2]) * 1000
# Compares the arrays position by position: an index of tiB is collected
# whenever its value differs from the value at the same index of tiA.
res = [idx for idx, elem in enumerate(tiB)
if elem != tiA[idx]]
print(res)
It gives me an answer of [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]. However i wanted to get position 3 and position 9 or [2,8] as an answer because 0.3 and 0.9 is missing from tiB compared to tiA. Also how can i use this answer to select a 4d array. So my array is sized arrayA=(128x128x5xtiA). However i want my new array to be sized arrayB=(128x128x5xtiB) selected from tiA. So basically arrayB will be missing [2,8] of the 4th dimension as shown in my example as compared to arrayA. My problem is most of the time there can be multiple differences (1,2,3 missing) between tiA and tiB. Thank you for all your help.
Kevin
Your code was already a good start, but you need to check if the element is not in the other list and not if the elements at the same position are equal.
import numpy as np
tiA = np.array([
    0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1,
    1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2,
]) * 1000
tiB = np.array([
    0.1, 0.2, 0.4, 0.5, 0.6, 0.7, 0.8, 1,
    1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2,
]) * 1000
# Positions in tiA whose value does not occur anywhere in tiB.
res = np.flatnonzero(~np.isin(tiA, tiB)).tolist()
print(res)
For the second part you should be able to use np.delete. You just need to define the indices (res) and the correct axis (should be 3 in your case then).
# Drop the entries listed in res along the 4th dimension (axis index 3).
arrayB = np.delete(arrayA, res, axis=3)

Pandas assignment using nested loops leading to memory error

I am using pandas and trying to do an assignment using a nested loops. I iterate over a dataframe and then run a distance function if it meets a certain criteria. I am faced with two problems:
SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
Memory Error. It doesn't work on large datasets. I end up having to terminate the process.
How should I change my solution to ensure it can scale with a larger dataset of 60,000 rows?
# NOTE(review): indentation was lost when this snippet was extracted, so the
# nesting described in these comments is presumed from statement order.
# Outer pass over every row of df.
for i, row in df.iterrows():
# listy is the smallest distance seen so far (0 doubles as "none yet");
# school is the [name, id] pair recorded for that distance.
listy = 0
school = []
if row['LS_Type'] == 'Primary (1-4)':
a = row['Northing']
b = row['Easting']
# LS_ID is assigned here but not read again within this snippet.
LS_ID = row['LS_ID']
# Inner pass over the whole frame again, looking at 'Primary (1-8)' rows.
for j, row2 in df.iterrows():
if row2['LS_Type'] == 'Primary (1-8)':
# distance() is defined outside this snippet.
dist_km = distance(a,b, df.Northing[j], df.Easting[j])
if (listy == 0):
listy = dist_km
school.append([df.LS_Name[j], df.LS_ID[j]])
else:
if dist_km < listy:
listy = dist_km
school[0] = [df.LS_Name[j], int(df.LS_ID[j])]
# NOTE(review): chained indexing (df[col][i] = value) is presumably what
# triggers the SettingWithCopyWarning quoted in the question.
df['dist_up_prim'][i] = listy
df["closest_up_prim"][i] = school[0]
else:
df['dist_up_prim'][i] = 0
The double for loop is what's killing you here. See if you can break it up into two separate apply steps.
Here is a toy example of using df.apply() and partial to do a nested for loop:
import math
import pandas as pd
from functools import partial
df = pd.DataFrame.from_dict({'A': [1, 2, 3, 4, 5, 6, 7, 8],
                             'B': [1, 2, 3, 4, 5, 6, 7, 8]})


def myOtherFunc(row):
    """Inner step: return B * A for rows with A <= 4, otherwise None."""
    if row['A'] <= 4:
        return row['B'] * row['A']
    return None


def myFunc(the_df, row):
    """Outer step, applied to each row of the frame.

    For rows with A <= 2 the inner step is applied to every row of
    ``the_df`` and the mean of its results is returned; every other row
    yields NaN.
    """
    if row['A'] <= 2:
        other_B = the_df.apply(myOtherFunc, axis=1)
        return other_B.mean()
    # math.nan instead of pd.np.NaN: the pd.np alias is no longer
    # available in current pandas releases.
    return math.nan


# Bind the frame as the first argument so that df.apply only has to supply
# the row.
apply_myFunc_on_df = partial(myFunc, df)
df.apply(apply_myFunc_on_df, axis=1)
You can rewrite your code in this form, which will be much faster.

Remove values from numpy array closer to each other

Actually i want to remove the elements from numpy array which are closer to each other.For example i have array [1,2,10,11,18,19] then I need code that can give output like [1,10,18] because 2 is closer to 1 and so on.
The following is an additional solution using numpy functionality, more precisely np.ediff1d, which computes the differences between consecutive elements of a given array. The code uses the value of the th variable as the threshold.
a = np.array([1, 2, 10, 11, 18, 19])
th = 1
# A gap of at most th means the right-hand neighbour is "close" to its
# predecessor; those neighbours are the ones to drop.
close_to_prev = np.flatnonzero(np.diff(a) <= th) + 1
b = np.delete(a, close_to_prev)  # [1, 10, 18]
Here is simple function to find the first values of series of consecutives values in a 1D numpy array.
import numpy as np
def find_consec(a, step=1):
    """Return the first value of every run of close values in a 1-D array.

    A new run starts at the first element and at every element that lies
    more than ``step`` above its predecessor.

    Parameters
    ----------
    a : array-like
        One-dimensional values, expected in ascending order.
    step : number
        Largest gap between neighbours that still counts as the same run.

    Returns
    -------
    numpy.ndarray
        The run-starting values, in their original order. Empty input
        gives an empty array and a single value is returned as its own run.
    """
    a = np.asarray(a)
    if a.size == 0:
        return np.array([])
    # The first element always opens a run; after that a run opens wherever
    # the gap to the previous element exceeds step.
    starts_run = np.concatenate(([True], np.diff(a) > step))
    return a[starts_run]
a = np.array([1,2,10,11,18,19])
find_consec(a) # [1, 10, 18]
Welcome to Stack Overflow. Below is code that answers your question:
def closer(arr, cozy):
    """Keep only values that are more than ``cozy`` above the last kept one.

    Parameters
    ----------
    arr : sequence
        Values to filter, expected in ascending order.
    cozy : number
        Minimum distance a value must have from the previously kept value.

    Returns
    -------
    list
        The kept values. The list is also printed, as before. Every
        element, including the last one, is considered, and empty input
        gives an empty list.
    """
    result = []
    for value in arr:
        # The first value is always kept; later ones only when far enough
        # from the most recently kept value.
        if not result or value - result[-1] > cozy:
            result.append(value)
    print(result)
    return result
Example:
a = [6,10,7,20,21,16,14,3,2]
a.sort()
closer(a,1)
output : [2, 6, 10, 14, 16, 20]
closer(a,3)
Output: [2, 6, 10, 14, 20]

How to use np.where() to create a new array of specific rows?

I have an array (msaarr) of 1700 values, ranging from approximately 0 to 150. I know that 894 of these values should be less than 2, and I wish to create a new array containing only these values.
So far, I have attempted this code:
Combined = np.zeros(shape=(894,8))
for i in range(len(Spitzer)): #len(Spitzer) = 1700
index = np.where(msaarr <= 2)
Combined[:,0] = msaarr[index]
The reason there are eight columns is because I have more data associated with each value in msaarr that I also want to display. msaarr was created using several lines of code, which is why I haven't mentioned them here, but it is an array with shape (1700,1) with type float64.
The problem I'm having is that if I print msaarr[index], then I get an array of shape (893,), but when I attempt to assign this as my zeroth column, I get the error
ValueError: could not broadcast input array from shape (1699) into shape (894)
I also attempted
Combined[:,0] = np.extract(msaarr <= 2, msaarr)
Which gave the same error.
I thought at first this might just be some confusion with Python's zero-indexing, so I tried changing the shape to 893, and also tried to assign to a different column Combined[:,1], but I have the same error every time.
Alternatively, when I try:
Combined[:,1][i] = msaarr[index][i]
I get the error:
IndexError: index 894 is out of bounds for axis 0 with size 894
What am I doing wrong?
EDIT:
A friend pointed out that I might not be calling index correctly because it is a tuple, and so his suggestion was this:
index = np.where(msaarr < 2)
Combined[:,0] = msaarr[index[0][:]]
But I am still getting this error:
ValueError: could not broadcast input array from shape (893,1) into shape (893)
How can my shape be (893) and not (893, 1)?
Also, I did check, and len(index[0][:]) = 893, and len(msaarr[index[0][:]]) = 893.
The full code as of last attempts is:
import numpy as np
from astropy.io import ascii
from astropy.io import fits
# Load the two catalogues that are matched against each other (the paths
# are specific to the author's machine).
targets = fits.getdata('/Users/vcolt/Dropbox/ATLAS source matches/OzDES.fits')
Spitzer = ascii.read(r'/Users/vcolt/Desktop/Catalogue/cdfs_spitzer.csv', header_start=0, data_start=1)
## Find minimum separations, indexed.
# Each of these is allocated as a 2-D column of shape (len(Spitzer), 1).
RADiffArr = np.zeros(shape=(len(Spitzer),1))
DecDiffArr = np.zeros(shape=(len(Spitzer),1))
msaarr = np.zeros(shape=(len(Spitzer),1))
Combined= np.zeros(shape=(893,8))
# NOTE(review): indentation was lost in extraction; the lines below are
# presumably the loop body, and the question states the result is the same
# whether the np.where line sits inside or outside the loop.
for i in range(len(Spitzer)):
x = Spitzer["RA_IR"][i]
y = Spitzer["DEC_IR"][i]
# Separation of this source from every target, with the RA difference
# scaled by cos(y).
sep = abs(np.sqrt(((x - targets["RA"])*np.cos(np.array(y)))**2 + (y - targets["DEC"])**2))
# Smallest separation, ignoring NaNs. The factor 3600 presumably converts
# degrees to arcseconds (confirm the catalogue units).
minsep = np.nanmin(sep)
minseparc = minsep*3600
msaarr[i] = minseparc
# All positions in sep equal to the minimum; only the first is used below.
min_positions = [j for j, p in enumerate(sep) if p == minsep]
x2 = targets["RA"][min_positions][0]
RADiff = x*3600 - x2*3600
RADiffArr[i] = RADiff
y2 = targets["DEC"][min_positions][0]
DecDiff = y*3600 - y2*3600
DecDiffArr[i] = DecDiff
# np.where returns a tuple of index arrays, one per axis of msaarr.
index = np.where(msaarr < 2)
# Python 2 print statement.
print msaarr[index].shape
# msaarr is 2-D, so selecting rows with index[0] keeps the trailing axis of
# length 1; the right-hand side is therefore a column, not a flat vector.
Combined[:,0] = msaarr[index[0][:]]
I get the same error whether index = np.where(msaarr < 2) is in or out of the loop.
Take a look at using numpy.take in combination with numpy.where.
# np.where returns a tuple with one index array per dimension. Passing the
# whole tuple to np.take would index the flattened array with all of them,
# so use only the row indices and select along axis 0.
inds = np.where(msaarr <= 2)
new_msaarr = np.take(msaarr, inds[0], axis=0)
If it is a multi-dimensional array, you can also add the axis keyword to take slices along that axis.
I think loop is not at the right place. np.where() will return an array of index of elements which matches the condition you have specified.
This should suffice
Index = np.where(msaarr <= 2)
Since Index holds the indices of the matching elements, you need to loop over these indices and fill the values into Combined[:, 0].
Also I want to point out one thing. You have said that there will be 894 values less than 2 but in the code you are using less than and equal to 2.
np.where(condition) will return a tuple of arrays containing the indexes of elements that verify your condition.
To get an array of the elements verifying your condition use:
new_array = msaarr[msaarr <= 2]
>>> x = np.random.randint(0, 10, (4, 4))
>>> x
array([[1, 6, 8, 4],
[0, 6, 6, 5],
[9, 6, 4, 4],
[9, 6, 8, 6]])
>>> x[x>2]
array([6, 8, 4, 6, 6, 5, 9, 6, 4, 4, 9, 6, 8, 6])

Numpy array conditional matching

I need to match two very large Numpy arrays (one is 20000 rows, another about 100000 rows) and I am trying to build a script to do it efficiently. Simply looping over the arrays is incredibly slow; can someone suggest a better way? Here is what I am trying to do: array datesSecondDict and array pwfs2Dates contain datetime values. I need to take each datetime value from array pwfs2Dates (the smaller array) and see if there is a datetime value like that (plus or minus 5 minutes) in array datesSecondDict (there might be more than one). If there is one (or more), I populate a new array (of the same size as array pwfs2Dates) with the value (one of the values) from array valsSecondDict (which is just the array with the numerical values corresponding to datesSecondDict). Here is a solution by @unutbu and @joaquin that worked for me (thanks guys!):
import time
import datetime as dt
import numpy as np
def combineArs(dict1, dict2):
    """Combine data from 2 dictionaries into an array.

    dict1 contains the primary data (e.g. seeing parameter). Each
    timestamp in dict1 is compared against the timestamps in dict2 to see
    whether dict2 has a record within plus/minus 5 minutes of it.

    * If yes: the corresponding parameter value from dict2 is appended
      (when several records match, the first one that occurs is used).
    * If no: 0 is appended.

    Both timestamp sequences are expected to be sorted in ascending order;
    the comparison is a single forward pass over each.

    Parameters
    ----------
    dict1 : dict
        Must provide ``dict1['pwfs2:dc:seeing']['datetimes']``.
    dict2 : dict
        Must provide ``dict2['ws:seeFwhm']['datetime']`` and
        ``dict2['ws:seeFwhm']['values']``.

    Returns
    -------
    numpy.ndarray
        One entry per primary timestamp, including primary timestamps
        that lie after the last secondary one (those get 0). An empty
        primary list gives an empty array.
    """
    # Specify the keys to use
    pwfs2Key = 'pwfs2:dc:seeing'
    dimmKey = 'ws:seeFwhm'
    # NOTE(review): the primary dict is read with key 'datetimes' while the
    # second one uses 'datetime'; confirm this matches the real data.
    datesPrimDict = list(dict1[pwfs2Key]['datetimes'])
    # Split the second dictionary into lists
    datesSecondDict = dict2[dimmKey]['datetime']
    valsSecondDict = dict2[dimmKey]['values']
    # Define time window
    fiveMins = dt.timedelta(minutes=5)
    total = len(datesPrimDict)
    data = []
    pos = 0  # index of the next primary timestamp without an entry yet
    for i, nextDateSecondDict in enumerate(datesSecondDict):
        # Primary timestamps before the window: no match, append zero.
        while pos < total and datesPrimDict[pos] < nextDateSecondDict - fiveMins:
            data.append(0)
            pos += 1
        # Primary timestamps inside the window: append the value of the
        # second dict.
        while pos < total and datesPrimDict[pos] < nextDateSecondDict + fiveMins:
            data.append(valsSecondDict[i])
            pos += 1
        if pos == total:
            break
    # Whatever is left lies after every secondary window: no match.
    data.extend([0] * (total - pos))
    return np.array(data)
Thanks,
Aina.
Are the array dates sorted ?
If yes, you can speed up your comparisons by breaking from the inner
loop comparison once its dates are bigger than the date given by the
outer loop. In this way you will made a one-pass comparison instead of
looping dimVals items len(pwfs2Vals) times
If no, maybe you should transform the current pwfs2Dates array to, for example,
an array of pairs [(date, array_index),...] and then you can sort by
date all your arrays to make the one-pass comparison indicated above and at the
same time to be able to get the original indexes needed to set data[i]
for example if the arrays were already sorted (I use lists here, not sure you need arrays for that):
(Edited: now using an iterator so as not to loop over pwfs2Dates from the beginning on each step):
# One pass over both sorted sequences: pdates advances through pwfs2Dates
# while the for loop advances through the dimm dates and values.
pdates = iter(enumerate(pwfs2Dates))
# next(iterator) works on both Python 2.6+ and Python 3; the iterator's
# .next() method exists on Python 2 only.
i, datei = next(pdates)
# NOTE: StopIteration is not caught here, so it propagates if pwfs2Dates is
# used up before the loop finishes.
for datej, valuej in zip(dimmDates, dimVals):
    # Skip dates that fall before the current five-minute window.
    while datei < datej - fiveMinutes:
        i, datei = next(pdates)
    # Dates inside the window receive the current value.
    while datei < datej + fiveMinutes:
        data[i] = valuej
        i, datei = next(pdates)
Otherwise, if they were not ordered and you created the sorted, indexed lists like this:
# Pair every date with its original position, then order the pairs by date
# (ties fall back to the position).
pwfs2Dates = sorted((date, idx) for idx, date in enumerate(pwfs2Dates))
dimmDates = sorted((date, idx) for idx, date in enumerate(dimmDates))
the code would be:
(Edited: now using an iterator so as not to loop over pwfs2Dates from the beginning on each step):
# One pass over the sorted (date, original_index) pairs of both sequences.
pdates = iter(pwfs2Dates)
# next(iterator) works on both Python 2.6+ and Python 3; the iterator's
# .next() method exists on Python 2 only.
datei, i = next(pdates)
# NOTE: StopIteration is not caught here, so it propagates if pwfs2Dates is
# used up before the loop finishes.
for datej, j in dimmDates:
    # Skip dates that fall before the current five-minute window.
    while datei < datej - fiveMinutes:
        datei, i = next(pdates)
    # Dates inside the window receive the value stored at the original
    # position j of the matching dimm date.
    while datei < datej + fiveMinutes:
        data[i] = dimVals[j]
        datei, i = next(pdates)
great!
..
Note that dimVals:
dimVals = np.array(dict1[dimmKey]['values'])
is not used in your code and can be eliminated.
Note that your code gets greatly simplified by looping through the
array itself instead of using xrange
Edit: The answer from unutbu addresses some weak parts in the code above.
I list them here for completeness:
Use of next: next(iterator) is preferred to iterator.next().
iterator.next() is an exception to a conventional naming rule that
has been fixed in py3k by renaming this method to
iterator.__next__().
Check for the end of the iterator with a try/except. After all the
items in the iterator are finished, the next call to next()
produces a StopIteration exception. Use try/except to gracefully
break out of the loop when that happens. For the specific case of the
OP's question this is not an issue, because the two arrays are the same
size, so the for loop finishes at the same time as the iterator and no
exception is raised. However, there could be cases where dict1 and dict2
are not the same size, and in that case there is the possibility of an
exception being raised.
The question is: which is better, to use try/except or to prepare the arrays
before looping by truncating them to the length of the shorter one?
Building on joaquin's idea:
import datetime as dt
import itertools
def combineArs(dict1, dict2, delta = dt.timedelta(minutes = 5)):
    """Match each date in dict2 against the time-stamped values in dict1.

    Both date sequences are expected to be sorted in ascending order; the
    matching is a single forward pass over each.

    Parameters
    ----------
    dict1 : dict
        Provides ``'datetime'`` (the marks) and ``'values'`` (one value
        per mark).
    dict2 : dict
        Provides ``'datetime'``, the dates to look up.
    delta : datetime.timedelta
        Half-width of the matching window around each mark.

    Returns
    -------
    list
        One entry per date in dict2: the value of the first mark whose
        window contains the date, or 0 when no mark matches. Dates after
        the last mark's window also get 0, and an empty dict2 date list
        gives an empty result.
    """
    marks = dict1['datetime']
    values = dict1['values']
    pdates = list(dict2['datetime'])
    total = len(pdates)
    data = []
    pos = 0  # index of the next dict2 date without an entry yet
    # zip replaces itertools.izip, which exists on Python 2 only.
    for datej, val in zip(marks, values):
        # Dates before this mark's window have no match.
        while pos < total and pdates[pos] < datej - delta:
            data.append(0)
            pos += 1
        # Dates inside the window take this mark's value.
        while pos < total and pdates[pos] < datej + delta:
            data.append(val)
            pos += 1
        if pos == total:
            break
    # Dates beyond the last window have no match either.
    data.extend([0] * (total - pos))
    return data
# Sample marks: five time-stamped values on 2011-12-19, 12:00 to 12:40.
dict1 = {
    'ws:seeFwhm': {
        'datetime': [dt.datetime(2011, 12, 19, 12, minute, 0)
                     for minute in (0, 1, 20, 22, 40)],
        'values': [1, 2, 3, 4, 5],
    },
}
# Sample dates to look up, ten minutes apart starting at 12:09.
dict2 = {
    'pwfs2:dc:seeing': {
        'datetime': [dt.datetime(2011, 12, 19, 12, minute)
                     for minute in (9, 19, 29, 39)],
    },
}

if __name__ == '__main__':
    dimmKey = 'ws:seeFwhm'
    pwfs2Key = 'pwfs2:dc:seeing'
    print(combineArs(dict1[dimmKey], dict2[pwfs2Key]))
yields
[0, 3, 0, 5]
I think you can do it with one fewer loops:
import datetime
import numpy
# Test data
# Create an array of dates spaced at 1 minute intervals
m = range(1, 21)
n = datetime.datetime.now()
a = numpy.array([n + datetime.timedelta(minutes=i) for i in m])
# A smaller array with three of those dates
m = [5, 10, 15]
b = numpy.array([n + datetime.timedelta(minutes=i) for i in m])
# End of test data


def date_range(date_array, single_date, delta):
    """Return the dates within ``delta`` minutes of ``single_date``.

    Parameters
    ----------
    date_array : numpy.ndarray
        Array of datetime objects to search.
    single_date : datetime.datetime
        Centre of the window.
    delta : number
        Half-width of the window in minutes; both bounds are exclusive.

    Returns
    -------
    numpy.ndarray
        The entries of ``date_array`` strictly between
        ``single_date - delta`` and ``single_date + delta``.
    """
    window = datetime.timedelta(minutes=delta)
    plus = single_date + window
    minus = single_date - window
    # & is the element-wise "and" for boolean masks.
    return date_array[(date_array > minus) & (date_array < plus)]


dates = [date_range(a, i, 5) for i in b]
# The per-date matches can differ in length, so join them with concatenate
# rather than stacking them into a rectangular array.
if dates:
    all_matches = numpy.unique(numpy.concatenate(dates))
else:
    all_matches = numpy.array([], dtype=object)
There is surely a better way to gather and merge the matches, but you get the idea... You could also use numpy.argwhere((a < plus) * (a > minus)) to return the index instead of the date and use the index to grab the whole row and place it into your new array.

Categories