I am building lists based on some conditions. This is what my code looks like:
def time_price_pair(a, b):
    if 32400 <= a < 32940:
        a_list = []
        a_list.append(b)
    elif 32940 <= a < 33480:
        b_list = []
        b_list.append(b)
    elif 33480 <= a < 34020:
        c_list = []
        c_list.append(b)
    ......
    ......
    ......
    elif 52920 <= a < 53460:
        some_list = []
        some_list.append(b)
Each boundary increases by 540, like [32400, 32940, 33480, 34020, 34560, 35100, 35640, 36180, 36720, 37260, 37800, 38340, 38880, 39420, ..., 53460], and the list names don't matter.
I would use a dict to store these lists of values, and use some arithmetic to work out which list each value belongs in:
from collections import defaultdict

lists = defaultdict(list)

def time_price_pair(a, b):
    if 32400 <= a < 53460:
        i = (a - 32400) // 540   # integer division so the bin key is an int
        lists[i].append(b)
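For example, with a few made-up time/price values just to show where things land (these numbers are illustrative, not from the question):

time_price_pair(32500, 101.5)    # first bin  -> lists[0]
time_price_pair(33000, 99.0)     # second bin -> lists[1]
time_price_pair(32700, 100.25)   # first bin again
print(lists[0])   # [101.5, 100.25]
print(lists[1])   # [99.0]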
You can just use a for loop with an incrementing variable i and keep updating the bounds. Something like this:
def time_price_pair(a, b):
    lo = 32400                         # lower bound of the first bin (avoids shadowing the built-ins min/max)
    inc = 540                          # bin width
    n_bins = (53460 - 32400) // 540    # 39 bins, from the ranges in the question
    for i in range(n_bins):
        if lo + inc * i <= a < lo + inc * (i + 1):
            a_list = [b]               # here you would pick or create the list for bin i and append b
            break
It looks like a simple high-level pandas function pd.cut would suit your purpose very well.
import pandas as pd
import numpy as np
# simulate your data
# ==================================
np.random.seed(0)
a = np.random.randint(32400, 53439, size=1000000)
b = np.random.randn(1000000)
# put them in dataframe
df = pd.DataFrame(dict(a=a, b=b))
print(df)
a b
0 35132 -0.4605
1 43199 -0.9469
2 42245 0.2580
3 52048 -0.7309
4 45523 -0.4334
5 41625 2.0155
6 53157 -1.4712
7 46516 -0.1715
8 47335 -0.6594
9 47830 -1.0391
... ... ...
999990 39754 0.8771
999991 34779 0.7030
999992 37836 0.5409
999993 44330 -0.6747
999994 41078 -1.1368
999995 38752 1.6121
999996 42155 -0.1139
999997 49018 -0.1737
999998 45848 -1.2640
999999 50669 -0.4367
# processing
# ===================================
rng = np.arange(32400, 53461, 540)
# your custom labels
labels = np.arange(1, len(rng))
# use pd.cut()
%time df['cat'] = pd.cut(df.a, bins=rng, right=False, labels=labels)
CPU times: user 52.5 ms, sys: 16 µs, total: 52.5 ms
Wall time: 51.6 ms
print(df)
a b cat
0 35132 -0.4605 6
1 43199 -0.9469 20
2 42245 0.2580 19
3 52048 -0.7309 37
4 45523 -0.4334 25
5 41625 2.0155 18
6 53157 -1.4712 39
7 46516 -0.1715 27
8 47335 -0.6594 28
9 47830 -1.0391 29
... ... ... ..
999990 39754 0.8771 14
999991 34779 0.7030 5
999992 37836 0.5409 11
999993 44330 -0.6747 23
999994 41078 -1.1368 17
999995 38752 1.6121 12
999996 42155 -0.1139 19
999997 49018 -0.1737 31
999998 45848 -1.2640 25
999999 50669 -0.4367 34
[1000000 rows x 3 columns]
# groupby
grouped = df.groupby('cat')['b']
# access to a particular group using your user_defined key
grouped.get_group(1).values
array([ 0.4525, -0.7226, -0.981 , ..., 0.0985, -1.4286, -0.2257])
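If you want each bin's prices back as plain Python lists (like the a_list, b_list, ... in the question), you can materialize the groups into a dict keyed by the bin label; a small sketch on top of the grouped object above:

price_lists = {label: grp.tolist() for label, grp in grouped}
print(price_lists[1][:5])   # first few b values that fell into bin 1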
A dictionary could be used to hold all of the used time range bins as follows:
import collections

time_prices = [(32401, 20), (32402, 30), (32939, 42), (32940, 10), (32941, 15), (40000, 123), (40100, 234)]

dPrices = collections.OrderedDict()
for atime, aprice in time_prices:
    abin = 32400 + ((atime - 32400) // 540) * 540   # For bins as times
    #abin = (atime - 32400) // 540 + 1               # For bins starting from 1
    dPrices.setdefault(abin, []).append(aprice)

# Display results
for atime, prices in dPrices.items():
    print atime, prices
This would give you the following output:
32400 [20, 30, 42]
32940 [10, 15]
39960 [123, 234]
Or individually as:
print dPrices[32400]
[20, 30, 42]
Tested using Python 2.7
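On Python 3 only the print calls need parentheses, and a defaultdict can stand in for the setdefault call (a small untested sketch with the same logic; on Python 3.7+ plain dicts keep insertion order just like the OrderedDict above):

from collections import defaultdict

dPrices = defaultdict(list)
for atime, aprice in time_prices:
    abin = 32400 + ((atime - 32400) // 540) * 540
    dPrices[abin].append(aprice)

for atime, prices in dPrices.items():
    print(atime, prices)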
I have two DataFrames and want to create a third one. I am trying to write code that does the following:
If A_pd["from"] and A_pd["To"] are within the range of B_pd["from"] and B_pd["To"], then add A_pd["from"], A_pd["To"], and B_pd["Value"] to the C_pd DataFrame.
If A_pd["from"] is within the range of B_pd["from"] and B_pd["To"], and A_pd["To"] is within the range of B_pd["from"] and B_pd["To"] of the next row, then I want to split the range from A_pd["from"] to A_pd["To"] into two ranges, (A_pd["from"], B_pd["To"]) and (B_pd["To"], A_pd["To"]), with the corresponding B_pd["Value"].
I created the following code:
import pandas as pd

A_pd = {'from': [0, 20, 80, 180, 250],
        'To':   [20, 50, 120, 210, 300]}
A_pd = pd.DataFrame(A_pd)
B_pd = {'from': [0, 20, 100, 200],
        'To':   [20, 100, 200, 300],
        'Value': [20, 17, 15, 12]}
B_pd = pd.DataFrame(B_pd)

for i in range(len(A_pd)):
    numberOfIntrupt = 0
    for j in range(len(B_pd)):
        if A_pd["from"].values[i] >= B_pd["from"].values[j] and A_pd["from"].values[i] > B_pd["To"].values[j]:
            numberOfIntrupt += 1

cols = ['C_from', 'C_To', 'C_value']
C_dp = pd.DataFrame(columns=cols, index=range(len(A_pd) + numberOfIntrupt))

for i in range(len(A_pd)):
    for j in range(len(B_pd)):
        a = A_pd["from"].values[i]
        b = A_pd["To"].values[i]
        c_eval = B_pd["Value"].values[j]
        range_s = B_pd["from"].values[j]
        range_f = B_pd["To"].values[j]
        if a >= range_s and a <= range_f and b >= range_s and b <= range_f:
            C_dp['C_from'].loc[i] = a
            C_dp['C_To'].loc[i] = b
            C_dp['C_value'].loc[i] = c_eval
        elif a >= range_s and b > range_f:
            C_dp['C_from'].loc[i] = a
            C_dp['C_To'].loc[i] = range_f
            C_dp['C_value'].loc[i] = c_eval
            C_dp['C_from'].loc[i+1] = range_f
            C_dp['C_To'].loc[i+1] = b
            C_dp['C_value'].loc[i+1] = B_pd["Value"].values[j+1]

print(C_dp)
The current result is C_dp:
C_from C_To C_value
0 0 20 20
1 20 50 17
2 80 100 17
3 180 200 15
4 250 300 12
5 200 300 12
6 NaN NaN NaN
7 NaN NaN NaN
The expected result should be:
C_from C_To C_value
0 0 20 20
1 20 50 17
2 80 100 17
3 100 120 15
4 180 200 15
5 200 210 12
6 250 300 12
Thank you a lot for your support.
I'm sure there is a better way to do this without loops, but this will help your logic flow.
import pandas as pd

A_pd = {'from': [0, 20, 80, 180, 250],
        'To':   [20, 50, 120, 210, 300]}
A_pd = pd.DataFrame(A_pd)
B_pd = {'from': [0, 20, 100, 200],
        'To':   [20, 100, 200, 300],
        'Value': [20, 17, 15, 12]}
B_pd = pd.DataFrame(B_pd)

cols = ['C_from', 'C_To', 'C_value']
C_dp = pd.DataFrame(columns=cols)

spillover = False
for i in range(len(A_pd)):
    for j in range(len(B_pd)):
        a_from = A_pd["from"].values[i]
        a_to = A_pd["To"].values[i]
        b_from = B_pd["from"].values[j]
        b_to = B_pd["To"].values[j]
        b_value = B_pd['Value'].values[j]
        if (a_from >= b_to):
            # a_from outside b range
            continue  # next b
        elif (a_from >= b_from):
            # a_from within b range
            if a_to <= b_to:
                C_dp = C_dp.append({"C_from": a_from, "C_To": a_to, "C_value": b_value}, ignore_index=True)
                break  # next a
            else:
                C_dp = C_dp.append({"C_from": a_from, "C_To": b_to, "C_value": b_value}, ignore_index=True)
                if j < len(B_pd):
                    spillover = True
                continue
        if spillover:
            if a_to <= b_to:
                C_dp = C_dp.append({"C_from": b_from, "C_To": a_to, "C_value": b_value}, ignore_index=True)
                spillover = False
                break
            else:
                C_dp = C_dp.append({"C_from": b_from, "C_To": b_to, "C_value": b_value}, ignore_index=True)
                spillover = True
                continue

print(C_dp)
Output
C_from C_To C_value
0 0 20 20
1 20 50 17
2 80 100 17
3 100 120 15
4 180 200 15
5 200 210 12
6 250 300 12
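Note that DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so on a recent pandas this loop needs a small adjustment: collect plain dicts in a list and build the frame once at the end. Roughly (a sketch, keeping the loop logic above unchanged):

rows = []
# inside the loops, replace every
#     C_dp = C_dp.append({...}, ignore_index=True)
# with
#     rows.append({...})
# and after the loops build the frame once:
C_dp = pd.DataFrame(rows, columns=cols)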
I have a dataframe like so:
Class price demand
1 22 8
1 60 7
3 32 14
2 72 9
4 45 20
5 42 25
What I'd like to do is group classes 1-3 in one category and classes 4-5 in one category. Then I'd like to get the sum of price for each category and the sum of demand for each category. I'd like to also get the mean. The result should look something like this:
Class TotalPrice TotalDemand AveragePrice AverageDemand
P 186 38 46.5 9.5
E 87 45 43.5 22.5
Where P is classes 1-3 and E is classes 4-5. How can I group by categories in pandas? Is there a way to do this?
In [8]: df.groupby(np.where(df['Class'].isin([1, 2, 3]), 'P', 'E'))[['price', 'demand']].agg(['sum', 'mean'])
Out[8]:
price demand
sum mean sum mean
E 87 43.5 45 22.5
P 186 46.5 38 9.5
You can create a dictionary that defines your groups.
mapping = {**dict.fromkeys([1, 2, 3], 'P'), **dict.fromkeys([4, 5], 'E')}
Then if you pass a dictionary or callable to a groupby it automatically gets mapped onto the index. So, let's set the index to Class
d = df.set_index('Class').groupby(mapping).agg(['sum', 'mean']).sort_index(axis=1, level=1)
Finally, we do some tweaking to get column names the way you specified.
rename_dict = {'sum': 'Total', 'mean': 'Average'}
d.columns = d.columns.map(lambda c: f"{rename_dict[c[1]]}{c[0].title()}")
d.rename_axis('Class').reset_index()
Class TotalPrice TotalDemand AveragePrice AverageDemand
0 E 87 45 43.5 22.5
1 P 186 38 46.5 9.5
In general, you can form arbitrary bins to group your data using pd.cut, specifying the right bin edges:
import pandas as pd
pd.cut(df.Class, bins=[0, 3, 5], labels=['P', 'E'])
#0 P
#1 P
#2 P
#3 P
#4 E
#5 E
df2 = (df.groupby(pd.cut(df.Class, bins=[0, 3, 5], labels=['P', 'E']))[['demand', 'price']]
         .agg(['sum', 'mean']).reset_index())
# Get rid of the multi-level columns
df2.columns = [f'{i}_{j}' if j != '' else f'{i}' for i,j in df2.columns]
Output:
Class demand_sum demand_mean price_sum price_mean
0 P 38 9.5 186 46.5
1 E 45 22.5 87 43.5
Any help is greatly appreciated!! I have been trying to solve this for the last few days....
I have two arrays:
import pandas as pd
OldDataSet = {
'id': [20,30,40,50,60,70]
,'OdoLength': [26.12,43.12,46.81,56.23,111.07,166.38]}
NewDataSet = {
'id': [3000,4000,5000,6000,7000,8000]
,'OdoLength': [25.03,42.12,45.74,46,110.05,165.41]}
df1= pd.DataFrame(OldDataSet)
df2 = pd.DataFrame(NewDataSet)
OldDataSetArray = df1.as_matrix()
NewDataSetArray = df2.as_matrix()
The result that I am trying to get is:
Array 1 and Array 2 matched by closest difference, based on the leftover numbers from Array 2
20 26.12 3000 25.03
30 43.12 4000 42.12
40 46.81 6000 46
50 56.23 7000 110.05
60 111.07 8000 165.41
70 166.38 0 0
Starting at Array 1, ID 20, find the nearest value, which in this case is the first number in Array 2, ID 3000 (26.12 - 25.03), so ID 20 gets matched to 3000.
Where it gets tricky is that if a value in Array 2 is not the closest, it is skipped. For example, ID 40 (value 46.81) is compared to 45.74 and 46, and the smallest difference is 0.81, from 46 (ID 6000). So ID 40 --> ID 6000. ID 5000 in Array 2 is now skipped for any future comparisons. So when comparing Array 1 ID 50, it is compared to the next available number in Array 2, 110.05; Array 1 ID 50 is matched to Array 2 ID 7000.
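For reference, the matching rule described above can be written as a short greedy walk over both arrays. This is only an illustrative sketch built on the OldDataSet/NewDataSet dicts from the snippet above, not the code in the UPDATE below:

import pandas as pd

old = list(zip(OldDataSet['id'], OldDataSet['OdoLength']))
new = list(zip(NewDataSet['id'], NewDataSet['OdoLength']))

pairs = []
j = 0
for old_id, old_len in old:
    if j >= len(new):
        pairs.append((old_id, old_len, 0, 0))    # nothing left in Array 2 to match
        continue
    # advance while the next unused candidate in Array 2 is closer; skipped rows are never reused
    while j + 1 < len(new) and abs(new[j + 1][1] - old_len) < abs(new[j][1] - old_len):
        j += 1
    new_id, new_len = new[j]
    pairs.append((old_id, old_len, new_id, new_len))
    j += 1

matched = pd.DataFrame(pairs, columns=['old_id', 'old_odo', 'new_id', 'new_odo'])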
UPDATE
So here's the code that I have tried, and it works. Yes, it is not the greatest, so if someone has another suggestion please let me know.
import pandas as pd
import operator

OldDataSet = {
    'id': [20, 30, 40, 50, 60, 70],
    'OdoLength': [26.12, 43.12, 46.81, 56.23, 111.07, 166.38]}
NewDataSet = {
    'id': [3000, 4000, 5000, 6000, 7000, 8000],
    'OdoLength': [25.03, 42.12, 45.74, 46, 110.05, 165.41]}
df1 = pd.DataFrame(OldDataSet)
df2 = pd.DataFrame(NewDataSet)
OldDataSetArray = df1.as_matrix()
NewDataSetArray = df2.as_matrix()

newPos = 1
CurrentNumber = 0
OldArrayLen = len(OldDataSetArray) - 1
NewArrayLen = len(NewDataSetArray) - 1
numberResults = []

for oldPos in range(len(OldDataSetArray)):
    PreviousNumber = abs(OldDataSetArray[oldPos, 0] - NewDataSetArray[oldPos, 0])
    while newPos <= len(NewDataSetArray) - 1:
        CurrentNumber = abs(OldDataSetArray[oldPos, 0] - NewDataSetArray[newPos, 0])
        # if it is the last row for the inner array, then match the next available
        # in Array 1 to that last record
        if newPos == NewArrayLen and oldPos < newPos and oldPos + 1 <= OldArrayLen:
            numberResults.append([OldDataSetArray[oldPos + 1, 1], NewDataSetArray[newPos, 1], OldDataSetArray[oldPos + 1, 0], NewDataSetArray[newPos, 0]])
        if PreviousNumber < CurrentNumber:
            numberResults.append([OldDataSetArray[oldPos, 1], NewDataSetArray[newPos - 1, 1], OldDataSetArray[oldPos, 0], NewDataSetArray[newPos - 1, 0]])
            newPos += 1
            break
        elif PreviousNumber > CurrentNumber:
            PreviousNumber = CurrentNumber
            newPos += 1

# sort by array one values
numberResults = sorted(numberResults, key=operator.itemgetter(0))
numberResultsDf = pd.DataFrame(numberResults)
You can use NumPy broadcasting to build a distance matrix:
import numpy

a = numpy.array([26.12, 43.12, 46.81, 56.23, 111.07, 166.38])
b = numpy.array([25.03, 42.12, 45.74, 46, 110.05, 165.41])

numpy.abs(a[:, None] - b[None, :])
# array([[ 1.09, 16. , 19.62, 19.88, 83.93, 139.29],
# [ 18.09, 1. , 2.62, 2.88, 66.93, 122.29],
# [ 21.78, 4.69, 1.07, 0.81, 63.24, 118.6 ],
# [ 31.2 , 14.11, 10.49, 10.23, 53.82, 109.18],
# [ 86.04, 68.95, 65.33, 65.07, 1.02, 54.34],
# [ 141.35, 124.26, 120.64, 120.38, 56.33, 0.97]])
From that matrix you can then find the closest elements using argmin, either row- or column-wise (depending on whether you want to search in a or b).
numpy.argmin(numpy.abs(a[:, None] - b[None, :]), axis=1)
# array([0, 1, 3, 3, 4, 5])
Compute all the differences, and use np.argmin to look up the closest.
import numpy as np

a, b = np.random.rand(2, 10)
all_differences = np.abs(np.subtract.outer(a, b))
ia = all_differences.argmin(axis=1)
for i in range(10):
    print(i, a[i], ia[i], b[ia[i]])
0 0.231603891949 8 0.21177584152
1 0.27810475456 7 0.302647382888
2 0.582133214953 2 0.548920922033
3 0.892858042793 1 0.872622982632
4 0.67293347218 6 0.677971552011
5 0.985227546492 1 0.872622982632
6 0.82431697833 5 0.83765895237
7 0.426992114791 4 0.451084369838
8 0.181147161752 8 0.21177584152
9 0.631139744522 3 0.653554586691
EDIT
With dataframes and indexes:
import pandas as pd

va, vb = np.random.rand(2, 10)
na, nb = np.random.randint(0, 100, (2, 10))
dfa = pd.DataFrame({'id': na, 'odo': va})
dfb = pd.DataFrame({'id': nb, 'odo': vb})
all_differences = np.abs(np.subtract.outer(dfa.odo, dfb.odo))
ia = all_differences.argmin(axis=1)
dfc = dfa.merge(dfb.loc[ia].reset_index(drop=True),
                left_index=True, right_index=True)
Input :
In [337]: dfa
Out[337]:
id odo
0 72 0.426457
1 12 0.315997
2 96 0.623164
3 9 0.821498
4 72 0.071237
5 5 0.730634
6 45 0.963051
7 14 0.603289
8 5 0.401737
9 63 0.976644
In [338]: dfb
Out[338]:
id odo
0 95 0.333215
1 7 0.023957
2 61 0.021944
3 57 0.660894
4 22 0.666716
5 6 0.234920
6 83 0.642148
7 64 0.509589
8 98 0.660273
9 19 0.658639
Output :
In [339]: dfc
Out[339]:
id_x odo_x id_y odo_y
0 72 0.426457 64 0.509589
1 12 0.315997 95 0.333215
2 96 0.623164 83 0.642148
3 9 0.821498 22 0.666716
4 72 0.071237 7 0.023957
5 5 0.730634 22 0.666716
6 45 0.963051 22 0.666716
7 14 0.603289 83 0.642148
8 5 0.401737 95 0.333215
9 63 0.976644 22 0.666716
I am currently trying to process an experimental time-series dataset which has missing values. I would like to calculate the sliding windowed mean of this dataset along time, while handling NaN values. The correct way for me to do it is to compute, inside each window, the sum of the finite elements and divide it by their number. This nonlinearity forces me to use non-convolutional methods, so I have a severe time bottleneck in this part of the process. As a code example of what I am trying to accomplish, I present the following:
import numpy as np

# Construct sample data
n = 50
n_miss = 20
win_size = 3
data = np.random.random(50)
data[np.random.randint(0, n-1, n_miss)] = None

# Compute mean
result = np.zeros(data.size)
for count in range(data.size):
    part_data = data[max(count - (win_size - 1) / 2, 0): min(count + (win_size + 1) / 2, data.size)]
    mask = np.isfinite(part_data)
    if np.sum(mask) != 0:
        result[count] = np.sum(part_data[mask]) / np.sum(mask)
    else:
        result[count] = None

print 'Input:\t', data
print 'Output:\t', result
with output:
Input: [ 0.47431791 0.17620835 0.78495647 0.79894688 0.58334064 0.38068788
0.87829696 nan 0.71589171 nan 0.70359557 0.76113969
0.13694387 0.32126573 0.22730891 nan 0.35057169 nan
0.89251851 0.56226354 0.040117 nan 0.37249799 0.77625334
nan nan nan nan 0.63227417 0.92781944
0.99416471 0.81850753 0.35004997 nan 0.80743783 0.60828597
nan 0.01410721 nan nan 0.6976317 nan
0.03875394 0.60924066 0.22998065 nan 0.34476729 0.38090961
nan 0.2021964 ]
Output: [ 0.32526313 0.47849424 0.5867039 0.72241466 0.58765847 0.61410849
0.62949242 0.79709433 0.71589171 0.70974364 0.73236763 0.53389305
0.40644977 0.22850617 0.27428732 0.2889403 0.35057169 0.6215451
0.72739103 0.49829968 0.30119027 0.20630749 0.57437567 0.57437567
0.77625334 nan nan 0.63227417 0.7800468 0.85141944
0.91349722 0.7209074 0.58427875 0.5787439 0.7078619 0.7078619
0.31119659 0.01410721 0.01410721 0.6976317 0.6976317 0.36819282
0.3239973 0.29265842 0.41961066 0.28737397 0.36283845 0.36283845
0.29155301 0.2021964 ]
Can this result be produced by numpy operations, without using a for loop?
You can do that using the rolling function of Pandas:
import numpy as np
import pandas as pd
#Construct sample data
n = 50
n_miss = 20
win_size = 3
data = np.random.random(n)
data[np.random.randint(0, n-1, n_miss)] = None
windowed_mean = pd.Series(data).rolling(window=win_size, min_periods=1).mean()
print(pd.DataFrame({'Data': data, 'Windowed mean': windowed_mean}) )
Output:
Data Windowed mean
0 0.589376 0.589376
1 0.639173 0.614274
2 0.343534 0.524027
3 0.250329 0.411012
4 0.911952 0.501938
5 NaN 0.581141
6 0.224964 0.568458
7 NaN 0.224964
8 0.508419 0.366692
9 0.215418 0.361918
10 NaN 0.361918
11 0.638118 0.426768
12 0.587478 0.612798
13 0.097037 0.440878
14 0.688689 0.457735
15 0.858593 0.548107
16 0.408903 0.652062
17 0.448993 0.572163
18 NaN 0.428948
19 0.877453 0.663223
20 NaN 0.877453
21 NaN 0.877453
22 0.021798 0.021798
23 0.482054 0.251926
24 0.092387 0.198746
25 0.251766 0.275402
26 0.093854 0.146002
27 NaN 0.172810
28 NaN 0.093854
29 NaN NaN
30 0.965669 0.965669
31 0.695999 0.830834
32 NaN 0.830834
33 NaN 0.695999
34 NaN NaN
35 0.613727 0.613727
36 0.837533 0.725630
37 NaN 0.725630
38 0.782295 0.809914
39 NaN 0.782295
40 0.777429 0.779862
41 0.401355 0.589392
42 0.491709 0.556831
43 0.127813 0.340292
44 0.781625 0.467049
45 0.960466 0.623301
46 0.637618 0.793236
47 0.651264 0.749782
48 0.154911 0.481264
49 0.159145 0.321773
Here's a convolution based approach using np.convolve -
mask = np.isnan(data)
K = np.ones(win_size,dtype=int)
out = np.convolve(np.where(mask,0,data), K)/np.convolve(~mask,K)
Please note that this would have one extra element on either side.
If you are working with 2D data, we can use Scipy's 2D convolution.
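A rough 2D sketch of the same sum/count trick with scipy.signal.convolve2d, assuming a 2D array data2d with NaNs and a square win_size x win_size window (data2d is a placeholder name, not from the question):

from scipy.signal import convolve2d

mask2d = np.isnan(data2d)
K2 = np.ones((win_size, win_size), dtype=int)
sums = convolve2d(np.where(mask2d, 0, data2d), K2, mode='same')
counts = convolve2d((~mask2d).astype(int), K2, mode='same')
out2d = sums / counts   # NaN (with a divide warning) wherever a window has no finite values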
Approaches -

def original_app(data, win_size):
    # Compute mean
    result = np.zeros(data.size)
    for count in range(data.size):
        part_data = data[max(count - (win_size - 1) / 2, 0): \
                         min(count + (win_size + 1) / 2, data.size)]
        mask = np.isfinite(part_data)
        if np.sum(mask) != 0:
            result[count] = np.sum(part_data[mask]) / np.sum(mask)
        else:
            result[count] = None
    return result

def numpy_app(data, win_size):
    mask = np.isnan(data)
    K = np.ones(win_size, dtype=int)
    out = np.convolve(np.where(mask, 0, data), K) / np.convolve(~mask, K)
    return out[1:-1]   # Slice out the one-extra elems on sides
Sample run -
In [118]: #Construct sample data
...: n = 50
...: n_miss = 20
...: win_size = 3
...: data= np.random.random(50)
...: data[np.random.randint(0,n-1, n_miss)] = np.nan
...:
In [119]: original_app(data, win_size = 3)
Out[119]:
array([ 0.88356487, 0.86829731, 0.85249541, 0.83776219, nan,
nan, 0.61054015, 0.63111926, 0.63111926, 0.65169837,
0.1857301 , 0.58335324, 0.42088104, 0.5384565 , 0.31027752,
0.40768907, 0.3478563 , 0.34089655, 0.55462903, 0.71784816,
0.93195716, nan, 0.41635575, 0.52211653, 0.65053379,
0.76762282, 0.72888574, 0.35250449, 0.35250449, 0.14500637,
0.06997668, 0.22582318, 0.18621848, 0.36320784, 0.19926647,
0.24506199, 0.09983572, 0.47595439, 0.79792941, 0.5982114 ,
0.42389375, 0.28944089, 0.36246113, 0.48088139, 0.71105449,
0.60234163, 0.40012839, 0.45100475, 0.41768466, 0.41768466])
In [120]: numpy_app(data, win_size = 3)
__main__:36: RuntimeWarning: invalid value encountered in divide
Out[120]:
array([ 0.88356487, 0.86829731, 0.85249541, 0.83776219, nan,
nan, 0.61054015, 0.63111926, 0.63111926, 0.65169837,
0.1857301 , 0.58335324, 0.42088104, 0.5384565 , 0.31027752,
0.40768907, 0.3478563 , 0.34089655, 0.55462903, 0.71784816,
0.93195716, nan, 0.41635575, 0.52211653, 0.65053379,
0.76762282, 0.72888574, 0.35250449, 0.35250449, 0.14500637,
0.06997668, 0.22582318, 0.18621848, 0.36320784, 0.19926647,
0.24506199, 0.09983572, 0.47595439, 0.79792941, 0.5982114 ,
0.42389375, 0.28944089, 0.36246113, 0.48088139, 0.71105449,
0.60234163, 0.40012839, 0.45100475, 0.41768466, 0.41768466])
Runtime test -
In [122]: #Construct sample data
...: n = 50000
...: n_miss = 20000
...: win_size = 3
...: data= np.random.random(n)
...: data[np.random.randint(0,n-1, n_miss)] = np.nan
...:
In [123]: %timeit original_app(data, win_size = 3)
1 loops, best of 3: 1.51 s per loop
In [124]: %timeit numpy_app(data, win_size = 3)
1000 loops, best of 3: 1.09 ms per loop
In [125]: import pandas as pd
# #jdehesa's pandas solution
In [126]: %timeit pd.Series(data).rolling(window=3, min_periods=1).mean()
100 loops, best of 3: 3.34 ms per loop
With the following code snippet
import pandas as pd
train = pd.read_csv('train.csv', parse_dates=['dates'])
print(train['dates'])
I load and inspect the data.
My question is: how can I standardize/normalize train['dates'] so that all the elements lie between -1 and 1 (linearly or Gaussian)?
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import time

def convert_to_timestamp(x):
    """Convert date objects to UNIX timestamps (seconds)"""
    return time.mktime(x.timetuple())

def normalize(df):
    """Normalize the DF using min/max"""
    scaler = MinMaxScaler(feature_range=(-1, 1))
    # scikit-learn expects a 2-D input, hence df[['dates']] rather than df['dates']
    dates_scaled = scaler.fit_transform(df[['dates']])
    return dates_scaled

if __name__ == '__main__':
    # Create a random series of dates
    df = pd.DataFrame({
        'dates':
            ['1980-01-01', '1980-02-02', '1980-03-02', '1980-01-21',
             '1981-01-21', '1991-02-21', '1991-03-23']
    })

    # Convert to date objects
    df['dates'] = pd.to_datetime(df['dates'])

    # Now df has date objects; convert them to UNIX timestamps
    df['dates'] = df['dates'].apply(convert_to_timestamp)

    # Call normalization function
    df = normalize(df)
Sample:
Date objects that we convert using convert_to_timestamp
dates
0 1980-01-01
1 1980-02-02
2 1980-03-02
3 1980-01-21
4 1981-01-21
5 1991-02-21
6 1991-03-23
UNIX timestamps that we can normalize using a MinMaxScaler from sklearn
dates
0 315507600
1 318272400
2 320778000
3 317235600
4 348858000
5 667069200
6 669661200
Normalized to (-1, 1), the final result
[-1. -0.98438644 -0.97023664 -0.99024152 -0.81166138 0.98536228
1. ]
A solution with Pandas:
df = pd.DataFrame({
    'A':
        ['1980-01-01', '1980-02-02', '1980-03-02', '1980-01-21',
         '1981-01-21', '1991-02-21', '1991-03-23']})
df['A'] = pd.to_datetime(df['A']).astype('int64')

max_a = df.A.max()
min_a = df.A.min()
min_norm = -1
max_norm = 1

df['NORMA'] = (df.A - min_a) * (max_norm - min_norm) / (max_a - min_a) + min_norm
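If you later need to map a normalized value back to a date, the same linear formula can be inverted; a small sketch on top of the frame above:

# invert the min-max scaling, then reinterpret the int64 nanoseconds as datetimes
a_back = (df['NORMA'] - min_norm) * (max_a - min_a) / (max_norm - min_norm) + min_a
df['A_back'] = pd.to_datetime(a_back.round().astype('int64'))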
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

df = pd.DataFrame(np.random.randint(1, 100, (1000, 2)).astype(np.float64), columns=['A', 'B'])
A B
0 87 95
1 15 12
2 85 88
3 33 61
4 33 29
5 33 91
6 67 19
7 68 20
8 79 18
9 29 93
.. .. ..
990 70 84
991 37 24
992 91 12
993 92 13
994 4 64
995 32 98
996 97 62
997 38 40
998 12 56
999 48 8
[1000 rows x 2 columns]
# specify your desired range (-1, 1)
scaler = MinMaxScaler(feature_range=(-1, 1))
scaled = scaler.fit_transform(df.values)
print(scaled)
[[ 0.7551 0.9184]
[-0.7143 -0.7755]
[ 0.7143 0.7755]
...,
[-0.2449 -0.2041]
[-0.7755 0.1224]
[-0.0408 -0.8571]]
df[['A', 'B']] = scaled
Out[30]:
A B
0 0.7551 0.9184
1 -0.7143 -0.7755
2 0.7143 0.7755
3 -0.3469 0.2245
4 -0.3469 -0.4286
5 -0.3469 0.8367
6 0.3469 -0.6327
7 0.3673 -0.6122
8 0.5918 -0.6531
9 -0.4286 0.8776
.. ... ...
990 0.4082 0.6939
991 -0.2653 -0.5306
992 0.8367 -0.7755
993 0.8571 -0.7551
994 -0.9388 0.2857
995 -0.3673 0.9796
996 0.9592 0.2449
997 -0.2449 -0.2041
998 -0.7755 0.1224
999 -0.0408 -0.8571
[1000 rows x 2 columns]
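The scaler keeps the fitted minimum and maximum, so the original values can be recovered later if needed:

# round-trip: map the scaled values back to the original range
restored = scaler.inverse_transform(scaled)
print(restored[:3])
# should match the first rows of the original data, e.g. [87., 95.], [15., 12.], [85., 88.], up to floating-point rounding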