I would like to use QuantLib within Python, mainly to price interest rate instruments (derivatives down the track) within a portfolio context. The main requirement is to pass daily yield curves to the system to price on successive days (let's ignore system performance issues for now). My question is: have I structured the example below correctly to do this? My understanding is that I would need at least one curve object per day, with the necessary linking, etc. I have used pandas to attempt this. Guidance would be appreciated.
import QuantLib as ql
import math
import pandas as pd
import datetime as dt
# MARKET PARAMETERS
calendar = ql.SouthAfrica()
business_convention = ql.Unadjusted
day_count = ql.Actual365Fixed()
interpolation = ql.Linear()
compounding = ql.Compounded
compoundingFrequency = ql.Quarterly
def perdelta(start, end, delta):
    date_list = []
    curr = start
    while curr < end:
        date_list.append(curr)
        curr += delta
    return date_list

def to_datetime(d):
    return dt.datetime(d.year(), d.month(), d.dayOfMonth())

def format_rate(r):
    return '{0:.4f}'.format(r.rate()*100.00)
# QuantLib requires dates to be passed as its own Date objects
dicPeriod={'DAY':ql.Days,'WEEK':ql.Weeks,'MONTH':ql.Months,'YEAR':ql.Years}
issueDate = ql.Date(19,8,2014)
maturityDate = ql.Date(19,8,2016)
#Bond Schedule
schedule = ql.Schedule(issueDate, maturityDate,
                       ql.Period(ql.Quarterly), ql.TARGET(), ql.Following, ql.Following,
                       ql.DateGeneration.Forward, False)
fixing_days = 0
face_amount = 100.0
def price_floater(myqlvalDate, jindex, jibarTermStructure, discount_curve):
    bond = ql.FloatingRateBond(settlementDays=0,
                               faceAmount=100,
                               schedule=schedule,
                               index=jindex,
                               paymentDayCounter=ql.Actual365Fixed(),
                               spreads=[0.02])
    bondengine = ql.DiscountingBondEngine(ql.YieldTermStructureHandle(discount_curve))
    bond.setPricingEngine(bondengine)
    ql.Settings.instance().evaluationDate = myqlvalDate
    return [bond.NPV(), bond.cleanPrice()]
start_date=dt.datetime(2014,8,19)
end_date=dt.datetime(2015,8,19)
all_dates=perdelta(start_date,end_date,dt.timedelta(days=1))
dtes=[];fixings=[]
for d in all_dates:
    if calendar.isBusinessDay(ql.Date(d.day, d.month, d.year)):
        dtes.append(ql.Date(d.day, d.month, d.year))
        fixings.append(0.1)
df_ad=pd.DataFrame(all_dates,columns=['valDate'])
df_ad['qlvalDate']=df_ad.valDate.map(lambda x:ql.DateParser.parseISO(x.strftime('%Y-%m-%d')))
df_ad['jibarTermStructure'] = df_ad.qlvalDate.map(lambda x:ql.RelinkableYieldTermStructureHandle())
df_ad['discountStructure'] = df_ad.qlvalDate.map(lambda x:ql.RelinkableYieldTermStructureHandle())
df_ad['jindex'] = df_ad.jibarTermStructure.map(lambda x: ql.Jibar(ql.Period(3,ql.Months),x))
df_ad.jindex.map(lambda x:x.addFixings(dtes, fixings))
df_ad['flatCurve'] = df_ad.apply(lambda r: ql.FlatForward(r['qlvalDate'],0.1,ql.Actual365Fixed(),compounding,compoundingFrequency),axis=1)
df_ad.apply(lambda x:x['jibarTermStructure'].linkTo(x['flatCurve']),axis=1)
df_ad.apply(lambda x:x['discountStructure'].linkTo(x['flatCurve']),axis=1)
df_ad['discount_curve']= df_ad.apply(lambda x:ql.ZeroSpreadedTermStructure(x['discountStructure'],ql.QuoteHandle(ql.SimpleQuote(math.log(1+0.02)))),axis=1)
df_ad['all_in_price']=df_ad.apply(lambda r:price_floater(r['qlvalDate'],r['jindex'],r['jibarTermStructure'],r['discount_curve'])[0],axis=1)
df_ad['clean_price']=df_ad.apply(lambda r:price_floater(r['qlvalDate'],r['jindex'],r['jibarTermStructure'],r['discount_curve'])[1],axis=1)
df_plt=df_ad[['valDate','all_in_price','clean_price']]
df_plt=df_plt.set_index('valDate')
from matplotlib import ticker
def func(x, pos):
    s = str(x)
    ind = s.index('.')
    return s[:ind] + '.' + s[ind+1:]
ax=df_plt.plot()
ax.yaxis.set_major_formatter(ticker.FuncFormatter(func))
Thanks to Luigi Ballabio, I have reworked the example above to incorporate QuantLib's design principles and avoid unnecessary object creation.
Now the static data is truly static and only the market data varies (I hope).
I now understand better how the live objects listen for changes in linked variables.
Static data is the following:
- bondengine
- bond
- structure handles
- historical Jibar index
Market data will be the only varying component:
- daily swap curve
- market spread over the swap curve
The reworked example is below:
import QuantLib as ql
import math
import pandas as pd
import datetime as dt
import numpy as np
# MARKET PARAMETERS
calendar = ql.SouthAfrica()
business_convention = ql.Unadjusted
day_count = ql.Actual365Fixed()
interpolation = ql.Linear()
compounding = ql.Compounded
compoundingFrequency = ql.Quarterly
def perdelta(start, end, delta):
    date_list = []
    curr = start
    while curr < end:
        date_list.append(curr)
        curr += delta
    return date_list

def to_datetime(d):
    return dt.datetime(d.year(), d.month(), d.dayOfMonth())

def format_rate(r):
    return '{0:.4f}'.format(r.rate()*100.00)
# QuantLib requires dates to be passed as its own Date objects
dicPeriod={'DAY':ql.Days,'WEEK':ql.Weeks,'MONTH':ql.Months,'YEAR':ql.Years}
issueDate = ql.Date(19,8,2014)
maturityDate = ql.Date(19,8,2016)
#Bond Schedule
schedule = ql.Schedule(issueDate, maturityDate,
                       ql.Period(ql.Quarterly), ql.TARGET(), ql.Following, ql.Following,
                       ql.DateGeneration.Forward, False)
fixing_days = 0
face_amount = 100.0
start_date=dt.datetime(2014,8,19)
end_date=dt.datetime(2015,8,19)
all_dates=perdelta(start_date,end_date,dt.timedelta(days=1))
dtes=[];fixings=[]
for d in all_dates:
    if calendar.isBusinessDay(ql.Date(d.day, d.month, d.year)):
        dtes.append(ql.Date(d.day, d.month, d.year))
        fixings.append(0.1)
jibarTermStructure = ql.RelinkableYieldTermStructureHandle()
jindex = ql.Jibar(ql.Period(3,ql.Months), jibarTermStructure)
jindex.addFixings(dtes, fixings)
discountStructure = ql.RelinkableYieldTermStructureHandle()
bond = ql.FloatingRateBond(settlementDays=0,
                           faceAmount=100,
                           schedule=schedule,
                           index=jindex,
                           paymentDayCounter=ql.Actual365Fixed(),
                           spreads=[0.02])
bondengine = ql.DiscountingBondEngine(discountStructure)
bond.setPricingEngine(bondengine)
spread = ql.SimpleQuote(0.0)
discount_curve = ql.ZeroSpreadedTermStructure(jibarTermStructure,ql.QuoteHandle(spread))
discountStructure.linkTo(discount_curve)
# ...here is the pricing function...
def price_floater(myqlvalDate, jibar_curve, credit_spread):
    credit_spread = math.log(1.0 + credit_spread)
    ql.Settings.instance().evaluationDate = myqlvalDate
    jibarTermStructure.linkTo(jibar_curve)
    spread.setValue(credit_spread)
    return pd.Series({'NPV': bond.NPV(), 'cleanPrice': bond.cleanPrice()})
# ...and here are the remaining varying parts:
df_ad=pd.DataFrame(all_dates,columns=['valDate'])
df_ad['qlvalDate']=df_ad.valDate.map(lambda x:ql.DateParser.parseISO(x.strftime('%Y-%m-%d')))
df_ad['jibar_curve'] = df_ad.apply(lambda r: ql.FlatForward(r['qlvalDate'],0.1,ql.Actual365Fixed(),compounding,compoundingFrequency),axis=1)
df_ad['spread']=np.random.uniform(0.015, 0.025, size=len(df_ad)) # market spread
# price_floater returns a Series, so apply() yields a DataFrame with 'NPV' and 'cleanPrice' columns
results = df_ad.apply(lambda r: price_floater(r['qlvalDate'], r['jibar_curve'], r['spread']), axis=1)
df_ad['all_in_price'] = results['NPV']
df_ad['clean_price'] = results['cleanPrice']
# plot result
df_plt=df_ad[['valDate','all_in_price','clean_price']]
df_plt=df_plt.set_index('valDate')
from matplotlib import ticker
def func(x, pos):  # formatter function takes tick label and tick position
    s = str(x)
    ind = s.index('.')
    return s[:ind] + '.' + s[ind+1:]
ax=df_plt.plot()
ax.yaxis.set_major_formatter(ticker.FuncFormatter(func))
Your solution would work, but creating a bond per day kind of goes against the grain of the library. You can create the bond and the JIBAR index just once, and just change the evaluation date and the corresponding curves; the bond will detect the changes and recalculate.
In the general case, this would be something like:
# here are the parts that stay the same...
jibarTermStructure = ql.RelinkableYieldTermStructureHandle()
jindex = ql.Jibar(ql.Period(3,ql.Months), jibarTermStructure)
jindex.addFixings(dtes, fixings)
discountStructure = ql.RelinkableYieldTermStructureHandle()
bond = ql.FloatingRateBond(settlementDays=0,
                           faceAmount=100,
                           schedule=schedule,
                           index=jindex,
                           paymentDayCounter=ql.Actual365Fixed(),
                           spreads=[0.02])
bondengine = ql.DiscountingBondEngine(discountStructure)
bond.setPricingEngine(bondengine)
# ...here is the pricing function...
def price_floater(myqlvalDate, jibar_curve, discount_curve):
    ql.Settings.instance().evaluationDate = myqlvalDate
    jibarTermStructure.linkTo(jibar_curve)
    discountStructure.linkTo(discount_curve)
    return [bond.NPV(), bond.cleanPrice()]
# ...and here are the remaining varying parts:
df_ad=pd.DataFrame(all_dates,columns=['valDate'])
df_ad['qlvalDate']=df_ad.valDate.map(lambda x:ql.DateParser.parseISO(x.strftime('%Y-%m-%d')))
df_ad['flatCurve'] = df_ad.apply(lambda r: ql.FlatForward(r['qlvalDate'],0.1,ql.Actual365Fixed(),compounding,compoundingFrequency),axis=1)
df_ad['discount_curve']= df_ad.apply(lambda x:ql.ZeroSpreadedTermStructure(jibarTermStructure,ql.QuoteHandle(ql.SimpleQuote(math.log(1+0.02)))),axis=1)
df_ad['all_in_price']=df_ad.apply(lambda r:price_floater(r['qlvalDate'],r['flatCurve'],r['discount_curve'])[0],axis=1)
df_ad['clean_price']=df_ad.apply(lambda r:price_floater(r['qlvalDate'],r['flatCurve'],r['discount_curve'])[1],axis=1)
df_plt=df_ad[['valDate','all_in_price','clean_price']]
df_plt=df_plt.set_index('valDate')
Now, even in the most general case, the above can be optimized: you're calling price_floater twice per date, so you're doing twice the work. I'm not familiar with pandas, but I'd guess you can make a single call and set df_ad['all_in_price'] and df_ad['clean_price'] with a single assignment.
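As an illustration only (not part of the original answer), a minimal pandas sketch of that single call, assuming price_floater returns the pair [NPV, cleanPrice] as above; result_type='expand' is the pandas option that splits the returned list into two columns:
# one evaluation per date; unpack NPV and clean price in a single assignment
prices = df_ad.apply(lambda r: price_floater(r['qlvalDate'], r['flatCurve'], r['discount_curve']),
                     axis=1, result_type='expand')
df_ad[['all_in_price', 'clean_price']] = prices.to_numpy()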
Moreover, there might be ways to simplify the code even further depending on your use cases. The discount curve might be instantiated once and the spread changed during pricing:
# in the "only once" part:
spread = ql.SimpleQuote()
discount_curve = ql.ZeroSpreadedTermStructure(jibarTermStructure,ql.QuoteHandle(spread))
discountStructure.linkTo(discount_curve)
# pricing:
def price_floater(myqlvalDate, jibar_curve, credit_spread):
    ql.Settings.instance().evaluationDate = myqlvalDate
    jibarTermStructure.linkTo(jibar_curve)
    spread.setValue(credit_spread)
    return [bond.NPV(), bond.cleanPrice()]
and in the varying part, you'll just have an array of credit spreads instead of an array of discount curves.
If the curves are all flat, you can do the same by taking advantage of another feature: if you initialize a curve with a number of days and a calendar instead of a date, its reference date will move with the evaluation date (if the number of days is 0, it will be the evaluation date; if it's 1, it will be the next business day, and so on).
# only once:
risk_free = ql.SimpleQuote()
jibar_curve = ql.FlatForward(0,calendar,ql.QuoteHandle(risk_free),ql.Actual365Fixed(),compounding,compoundingFrequency)
jibarTermStructure.linkTo(jibar_curve)
# pricing:
def price_floater(myqlvalDate, risk_free_rate, credit_spread):
    ql.Settings.instance().evaluationDate = myqlvalDate
    risk_free.setValue(risk_free_rate)  # risk_free is a SimpleQuote, so we set its value
    spread.setValue(credit_spread)
    return [bond.NPV(), bond.cleanPrice()]
and in the varying part, you'll replace the array of jibar curves with a simple array of rates.
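As a rough illustration of that varying part (my own sketch, not the original answer's code: column names follow the earlier DataFrame, and the flat 10% rate and random spreads are placeholders standing in for real market data):
# the varying inputs are now just numbers: one risk-free rate and one credit spread per date
df_ad = pd.DataFrame(all_dates, columns=['valDate'])
df_ad['qlvalDate'] = df_ad.valDate.map(lambda x: ql.DateParser.parseISO(x.strftime('%Y-%m-%d')))
df_ad['risk_free_rate'] = 0.1                                        # placeholder flat rate
df_ad['spread'] = np.random.uniform(0.015, 0.025, size=len(df_ad))   # placeholder credit spread
results = df_ad.apply(lambda r: price_floater(r['qlvalDate'], r['risk_free_rate'], r['spread']),
                      axis=1, result_type='expand')
df_ad[['all_in_price', 'clean_price']] = results.to_numpy()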
The above should give you the same result as your code, but it will instantiate far fewer objects and thus probably save memory and improve performance.
One final warning: neither my code nor yours will work if pandas' map evaluates the results in parallel; you'd end up trying to set the global evaluation date to several values simultaneously, and that wouldn't go well.
Related
I am streaming live price data using the IB API, and I want to put it in a dataframe for analysis. My data consists of a price being live streamed with no timestamp.
I think I need to create new rows using row numbers that are automatically added, and have the prices inserted in the price column.
I have tried defining the dataframe and telling the price where to go as follows:
def tick_df(self, reqId, contract):
    # this stores the price dataframe by creating an empty dataframe
    # and setting the index to the time column
    self.bardata[reqId] = pd.DataFrame(columns=['index', 'price'])
    self.reqMktData(reqId, contract, "", False, False, [])
    self.bardata[reqId].index = [x for x in range(1, len(self.bardata[reqId].values) + 1)]
    return self.bardata[reqId]

def tickPrice(self, reqId, tickType, price, attrib):  # this function prints the price
    if tickType == 2 and reqId == 102:
        self.bardata[reqId].loc[self.bardata[reqId].index] = price
I have been using a methodology similar to the one here (https://github.com/PythonForForex/Interactive-brokers-python-api-guide/blob/master/GOOG_five_percent.py). However, as I am only streaming a price, I am unable to use the timestamp to create new rows.
I don't know if this is what you need. In a loop, I generate a random price and append it to a data frame.
import numpy as np
import pandas as pd
_price = 1.1300  # first price in the series
_std = 0.0005    # volatility (standard deviation)
df = pd.DataFrame(columns=['price'])
for i in range(1000):
    _wn = np.random.normal(loc=0, scale=_std, size=1)  # random white noise
    _price = _price + _wn[0]                           # random price
    # note: DataFrame.append was removed in pandas 2.0; use pd.concat on newer versions
    df = df.append({'price': _price}, ignore_index=True)
df
I work with FOREX time series and I can't conceive of a time series without time, so just in case you have the same 'problem', I'm including a version with a timestamp:
import numpy as np
import pandas as pd
from datetime import datetime
_price = 1.1300  # first price in the series
_std = 0.0005    # volatility (standard deviation)
df = pd.DataFrame(columns=['price', 'time'])
for i in range(1000):
    _wn = np.random.normal(loc=0, scale=_std, size=1)  # random white noise
    _price = _price + _wn[0]                           # random price
    _time = datetime.now()
    df = df.append({'price': _price, 'time': _time}, ignore_index=True)
df
Please let me know if this is what you needed.
I have two functions:
First (z_score), to compute rolling z-score values for a given df column
Second (z_score_cum), to compute a cumulative z-score without forward-looking bias
# rolling z_score
def z_score(df, window):
    val_column = df.columns[0]
    col_mean = df[val_column].rolling(window=window).mean()
    col_std = df[val_column].rolling(window=window).std()
    df['zscore' + '_' + str(window) + 'D'] = (df[val_column] - col_mean) / col_std
    return df
# cumulative z_score
def z_score_cum(data_frame):
    # calculating length of original data frame to standardize
    len_ = len(data_frame)
    # storing column name & making a copy of data frame
    val_column = data_frame.columns[0]
    data_frame_standardized_final = data_frame.copy()
    # calculating statistics
    data_frame_standardized_final['mean_past'] = [np.mean(data_frame_standardized_final[val_column][0:lv+1]) for lv in range(0, len_)]
    data_frame_standardized_final['std_past'] = [np.std(data_frame_standardized_final[val_column][0:lv+1]) for lv in range(0, len_)]
    data_frame_standardized_final['z_score_cum'] = (data_frame_standardized_final[val_column] - data_frame_standardized_final['mean_past']) / data_frame_standardized_final['std_past']
    return data_frame_standardized_final[['z_score_cum']]
I would like to combine these two into one z-score function so that, whether or not I pass a time window as a parameter, it computes the rolling z-scores and additionally adds one column with the cumulative z-score. Currently, I create a list of time windows (here in days), pass them in a loop while calling the function, and join the additional column separately, which I don't think is the optimal way of processing.
d_list = [n * 21 for n in range(1, 13)]
df_zscore = df.copy()
for i in d_list:
    df_zscore = z_score(df_zscore, i)
df_zscore_cum = z_score_cum(df)
df_z_scores = pd.concat([df_zscore, df_zscore_cum], axis=1)
Eventually, I made it this way:
def calculate_z_scores(self, list_of_windows, freq_flag='D'):
    """
    Calculates rolling z-scores and cumulative z-scores based on a given list
    of time windows.

    Parameters
    ----------
    list_of_windows : list
        a list of time windows.
    freq_flag : string
        frequency flag. The default is 'D' (daily).

    Returns
    -------
    data frame
        a data frame with the calculated rolling & cumulative z-scores.
    """
    z_scores_data_frame = self.original_data_frame.copy()
    # get column with values (1st column)
    val_column = z_scores_data_frame.columns[0]
    len_ = len(z_scores_data_frame)
    # calculating statistics for the cumulative z-score
    z_scores_data_frame['mean_past'] = [np.mean(z_scores_data_frame[val_column][0:lv+1]) for lv in range(0, len_)]
    z_scores_data_frame['std_past'] = [np.std(z_scores_data_frame[val_column][0:lv+1]) for lv in range(0, len_)]
    z_scores_data_frame['zscore_cum'] = (z_scores_data_frame[val_column] - z_scores_data_frame['mean_past']) / z_scores_data_frame['std_past']
    # taking care of rolling z-scores
    for i in list_of_windows:
        col_mean = z_scores_data_frame[val_column].rolling(window=i).mean()
        col_std = z_scores_data_frame[val_column].rolling(window=i).std()
        z_scores_data_frame['zscore' + '_' + str(i) + freq_flag] = (z_scores_data_frame[val_column] - col_mean) / col_std
    cols_to_leave = [c for c in z_scores_data_frame.columns if 'zscore' in c]
    self.z_scores_data_frame = z_scores_data_frame[cols_to_leave]
    return self.z_scores_data_frame
Just a sidenote: this is a method of my class, but with minor modifications it could be used as a standalone function.
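For illustration, here is a rough sketch of such a standalone version (my own adaptation, not the author's code; it also swaps the list-comprehension statistics for pandas' expanding(), which should match since np.std is a population standard deviation, i.e. ddof=0):
def calculate_z_scores(data_frame, list_of_windows, freq_flag='D'):
    # same logic as the method above, but operating on an explicit data frame
    out = data_frame.copy()
    val_column = out.columns[0]
    expanding = out[val_column].expanding()
    out['zscore_cum'] = (out[val_column] - expanding.mean()) / expanding.std(ddof=0)
    for i in list_of_windows:
        rolling = out[val_column].rolling(window=i)
        out['zscore_' + str(i) + freq_flag] = (out[val_column] - rolling.mean()) / rolling.std()
    return out[[c for c in out.columns if 'zscore' in c]]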
I have some data to which I'm trying to apply multiprocessing.Pool, as I have a machine with 16 processors available.
Here I generate some pseudo data:
import numpy as np
import pandas as pd
from datetime import datetime, timedelta

y = pd.Series(np.random.randint(400, high=600, size=1250))
date_today = datetime.now()
x = pd.date_range(date_today, date_today + timedelta(1250), freq='D')
data = pd.DataFrame(columns=['Date', 'Price'])
data['Date'] = x
data['Price'] = y
d = {name: group for name, group in data.groupby(np.arange(len(data)) // (len(data)))}
What I want is to apply the pool to the parameters in the for loop, using one process per constant:
parameters = range(300, 550, 50)
portfolio = pd.DataFrame(columns=['Parameter', 'Date', 'Price', 'Calculation'])
for key, value in sorted(d.items()):
    for constante in parameters:
        print('Constante:', constante)
        # HERE I WANT TO USE MP.POOL()
In the code I'm using some sort of shifting window to perform calculations on; this is the simplest version of the code. I want to assign a process per constant in parameters while writing to a DataFrame. How does one achieve this?
You'll want to use multiprocessing.Pool.map a bit like this, though you'll probably have to adjust it to your needs...
from functools import partial
from multiprocessing import Pool

def pool_map_fn(value=None, constante=None, i=None):
    s = {'val': value[i:i + constante]}
    window = pd.concat([s['val']['Date'], s['val']['Price']], axis=1)
    window['Price'] = pd.to_numeric(window['Price'], errors='coerce').fillna(0)
    calc = window['Price'].mean()
    date_variable = window['Date'].iloc[-1]
    price_var = window['Price'].iloc[-1]
    if price_var < calc:
        print('Parameter', constante, 'Lower than average', date_variable, price_var, calc)
        # return the record instead of appending to a global DataFrame:
        # each worker process has its own copy of globals, so an append there
        # would never reach the parent process
        return {'Parameter': constante,
                'Date': date_variable,
                'Price': price_var,
                'Calculation': calc}
    if price_var > calc:
        print('Parameter', constante, 'Higher than average', date_variable, price_var, calc)

parameters = range(300, 550, 50)
portfolio = pd.DataFrame(columns=['Parameter', 'Date', 'Price', 'Calculation'])
for key, value in sorted(d.items()):
    for constante in parameters:
        with Pool() as pool:
            results = pool.map(partial(pool_map_fn, value=value, constante=constante),
                               range(len(value) - constante + 1))
Note: this is untested but should work; if you get errors, try to resolve them, as the concept should be sound.
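One caveat worth adding (my own, not part of the original answer): worker processes can't mutate the parent's portfolio DataFrame, so the records returned by pool.map have to be collected in the parent process. A minimal, untested addition right after the pool.map call inside the inner loop, assuming pool_map_fn returns a dict for qualifying windows and None otherwise:
# keep only the windows that produced a record and append them to the portfolio
rows = [r for r in results if r is not None]
if rows:
    portfolio = pd.concat([portfolio, pd.DataFrame(rows)], ignore_index=True)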
I've got the following code that takes historical prices for a single asset and pre-calculated forecasts, and computes how you would have fared if you had really invested your money according to the forecast. In financial parlance, it's a back-test.
The main problem is that it's very slow, and I'm not sure what the right strategy is for improving it. I need to run this thousands of times, so an order-of-magnitude speedup is required.
Where should I begin looking?
class accountCurve():
    def __init__(self, forecasts, prices):
        self.curve = pd.DataFrame(columns=['Capital', 'Holding', 'Cash', 'Trade', 'Position'], dtype=float)
        forecasts.dropna(inplace=True)
        self.curve['Forecast'] = forecasts
        self.curve['Price'] = prices
        self.curve.loc[self.curve.index[0], ['Capital', 'Holding', 'Cash', 'Trade', 'Position']] = [10000, 0, 10000, 0, 0]
        for date, forecast in forecasts.iteritems():
            x = self.curve.loc[date]
            previous = self.curve.shift(1).loc[date]
            if previous.isnull()['Cash']==False:
                x['Cash'] = previous['Cash'] - previous['Trade'] * x['Price']
                x['Position'] = previous['Position'] + previous['Trade']
                x['Holding'] = x['Position'] * x['Price']
                x['Capital'] = x['Cash'] + x['Holding']
            x['Trade'] = np.fix(x['Capital'] / x['Price'] * x['Forecast'] / 20) - x['Position']
Edit:
Datasets as requested:
Prices:
import quandl
corn = quandl.get('CHRIS/CME_C2')
prices = corn['Open']
Forecasts:
def ewmac(d):
    columns = pd.Series([2, 4, 8, 16, 32, 64])
    g = lambda x: d.ewm(span=x, min_periods=x*4).mean() - d.ewm(span=x*4, min_periods=x*4).mean()
    f = columns.apply(g).transpose()
    f = f * 10 / f.abs().mean()
    f.columns = columns
    return f.clip(-20, 20)
forecasts=ewmac(prices)
I would suggest using a numpy array instead of a data frame inside the for loop. It usually gives a significant speed boost.
So the code may look like this:
class accountCurve():
    def __init__(self, forecasts, prices):
        self.curve = pd.DataFrame(columns=['Capital', 'Holding', 'Cash', 'Trade', 'Position'], dtype=float)
        # forecasts.dropna(inplace=True)
        self.curve['Forecast'] = forecasts.dropna()
        self.curve['Price'] = prices
        # helper np.array:
        self.arr = np.array(self.curve)
        self.arr[0, :5] = [10000, 0, 10000, 0, 0]
        for i in range(1, self.arr.shape[0]):
            this = self.arr[i]
            prev = self.arr[i-1]
            cash = prev[2] - prev[3] * this[6]
            position = ...
            holding = ...
            capital = ...
            trade = ...
            this[:5] = [capital, holding, cash, trade, position]
        # back to data frame:
        self.curve[['Capital', 'Holding', 'Cash', 'Trade', 'Position']] = self.arr[:, :5]
        # or maybe this would be faster:
        # self.curve[:] = self.arr
I don't quite understand the significance of the line if previous.isnull()['Cash']==False:. It looks as if previous['Cash'] was never null, except maybe for the first row - but you set the first row earlier.
Also, you may consider executing forecasts.dropna(inplace=True) outside of the class. If it's originally a data frame, you'll run it once instead of repeating it for every column. (Do I understand correctly that you input single columns of forecasts into the class?)
The next step I'd recommend is using a line profiler to see where your code spends most of its time and trying to optimize those bottlenecks. If you use IPython, you can try running %prun or %lprun. For example,
%lprun -f accountCurve.__init__ A = accountCurve(...)
will produce stats for every line in your __init__.
I have a dataframe of 600,000 x/y points with date-time information, along with another field, 'status', containing extra descriptive information.
My objective is, for each record:
sum the 'status' column over records that are within a certain spatio-temporal buffer
the specific buffer is within t - 8 hours and < 100 meters
Currently I have the data in a pandas data frame.
I could loop through the rows and, for each record, subset the dates of interest, then calculate distances and restrict the selection further. However, that would still be quite slow with so many records.
This takes 4.4 hours to run.
I can see that I could create a 3-dimensional k-d tree with x, y, and date as epoch time. However, I am not certain how to restrict the distances properly when combining dates and geographic distances.
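For what it's worth, here is a hedged sketch of a middle ground that sidesteps the 3-D tree question: build a 2-D k-d tree on the projected coordinates (assumed to be in metres, as in the test data below), query the 100 m neighbours in one shot with scipy's cKDTree, and apply the 8-hour window as a plain mask on those candidates. The function name is my own, and the "not the same user" filter is omitted for brevity:
import numpy as np
import pandas as pd
from scipy.spatial import cKDTree

def status_in_buffer(df, radius=100, hours_before=8):
    coords = df[['long', 'lat']].to_numpy()
    tree = cKDTree(coords)
    neighbours = tree.query_ball_point(coords, r=radius)  # list of index lists, one per record

    times = df['date'].to_numpy()
    status = df['status'].to_numpy()
    window = np.timedelta64(hours_before, 'h')

    out = np.zeros(len(df), dtype=int)
    for i, idx in enumerate(neighbours):
        idx = np.asarray(idx)
        idx = idx[idx != i]  # exclude the record itself
        if idx.size == 0:
            continue
        mask = (times[idx] >= times[i] - window) & (times[idx] <= times[i])
        out[i] = status[idx[mask]].sum()
    return pd.Series(out, index=df.index)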
Here is some reproducible code for you guys to test on:
Import
import numpy.random as npr
import numpy as np
import pandas as pd
from pandas import DataFrame, date_range
from datetime import datetime, timedelta
Create data
np.random.seed(111)
Function to generate test data
def CreateDataSet(Number=1):
    Output = []
    for i in range(Number):
        # Create a date range with hour frequency
        date = date_range(start='10/1/2012', end='10/31/2012', freq='H')
        # Create long lat data
        laty = npr.normal(4815862, 5000, size=len(date))
        longx = npr.normal(687993, 5000, size=len(date))
        # status of interest
        status = [0, 1]
        # Make a random list of statuses
        random_status = [status[npr.randint(low=0, high=len(status))] for i in range(len(date))]
        # user pool
        user = ['sally', 'derik', 'james', 'bob', 'ryan', 'chris']
        # Make a random list of users
        random_user = [user[npr.randint(low=0, high=len(user))] for i in range(len(date))]
        Output.extend(zip(random_user, random_status, date, longx, laty))
    return pd.DataFrame(Output, columns=['user', 'status', 'date', 'long', 'lat'])
#Create data
data = CreateDataSet(3)
len(data)
#some time deltas
before = timedelta(hours = 8)
after = timedelta(minutes = 1)
Function to speed up
def work(df):
    output = []
    # loop through the data indexes
    for i in range(0, len(df)):
        l = []
        # first we filter the data by date to have a smaller list to compute distances for
        # create a mask to query all dates between the range for date i
        date_mask = (df['date'] >= df['date'].iloc[i] - before) & (df['date'] <= df['date'].iloc[i] + after)
        # create a mask to query all users who are not user i (themselves)
        user_mask = df['user'] != df['user'].iloc[i]
        # apply masks
        dists_to_check = df[date_mask & user_mask]
        # for point i, create the coordinate to calculate distances from
        a = np.array((df['long'].iloc[i], df['lat'].iloc[i]))
        # create an array of distances to check on the masked data
        b = np.array((dists_to_check['long'].values, dists_to_check['lat'].values))
        # for j in the date-queried data
        for j in range(1, len(dists_to_check)):
            # compute the euclidean distance between point a and each point of b (the date-masked data)
            x = np.linalg.norm(a - np.array((b[0][j], b[1][j])))
            # if the distance is within our range of interest, append the index to a list
            if x <= 100:
                l.append(j)
            else:
                pass
        try:
            # use the list of desired indexes 'l' to query a final subset of the data
            data = dists_to_check.iloc[l]
            # summarize the column of interest, then append to the output list
            output.append(data['status'].sum())
        except IndexError:
            output.append(0)
            # print("There were no data to add")
    return pd.DataFrame(output)
Run code and time it
start = datetime.now()
out = work(data)
print(datetime.now() - start)
Is there a way to do this query in a vectorized way? Or should I be chasing another technique?
<3
Here is what at least somewhat solves my problem. Since the loop can operate on different parts of the data independently, parallelization makes sense here.
Using IPython...
from IPython.parallel import Client
cli = Client()
cli.ids
dview = cli[:]
with dview.sync_imports():
    import numpy as np
    import os
    from datetime import timedelta
    import pandas as pd

# We also need to add the time deltas and the output list into the function as
# local variables, as well as add the IPython.parallel decorator
@dview.parallel(block=True)
def work(df):
    before = timedelta(hours=8)
    after = timedelta(minutes=1)
    output = []
    # ...the rest of the body is the same as the original work() above
Final time: 1:17:54.910206, about 1/4 of the original time.
I would still be very interested for anyone to suggest small speed improvements within the body of the function.