I'm trying to figure out how to speed up calls to scipy.optimize.minimize.
minimize()
is called thousands of times. I run it in parallel using a ProcessPoolExecutor. bootstrap()
is the parent function.
def optimize_weights(forecasts, prices, instrument):
    """Find the forecast weights that maximize the Sharpe ratio.

    Runs an SLSQP optimization over one weight per forecast column,
    constrained to [0, 1] per weight and to sum to 1 overall.

    Parameters
    ----------
    forecasts : DataFrame-like, one column per forecast signal.
    prices : price series aligned with the forecasts.
    instrument : object exposing `slippage` and `per_contract_cost`
        (consumed by accountCurve).

    Returns
    -------
    The optimal weight vector (`result.x` from scipy.optimize.minimize).
    """
    n_forecasts = forecasts.shape[1]
    initial_guess = [1.0 / n_forecasts] * n_forecasts
    weight_limits = [(0.0, 1.0)] * n_forecasts
    # Equality constraint: the weights must sum to exactly 1.
    budget_constraint = {'type': 'eq', 'fun': lambda w: 1 - sum(w)}

    def negative_sharpe(w, forecasts, prices, instrument):
        # Combine the forecasts with the candidate weights, then rescale
        # to a std of 10 and clip to the [-20, 20] forecast range.
        combined = (w * forecasts).mean(axis=1)
        combined = combined * 10 / combined.std()
        combined = combined.clip(-20, 20)
        curve = accountCurve(combined, prices,
                             slippage=instrument.slippage,
                             per_contract_cost=instrument.per_contract_cost)
        # Negate: minimize() searches for a minimum, we want max Sharpe.
        return -curve.sharpe()

    result = minimize(negative_sharpe, initial_guess,
                      (forecasts, prices, instrument),
                      bounds=weight_limits,
                      method='SLSQP',
                      tol=0.0001,
                      constraints=budget_constraint,
                      options={'disp': False, 'eps': 1e0})
    return result.x
def mp_optimize_weights(samples, prices, instrument):
    """Optimize forecast weights for each sample in parallel processes.

    Parameters
    ----------
    samples : iterable of forecast DataFrames, one optimization per sample.
    prices, instrument : forwarded to optimize_weights for every sample.

    Returns
    -------
    list of weight vectors, one per sample, in input order.
    """
    with ProcessPoolExecutor() as executor:
        # Materialize the lazy map iterator while the pool is still open:
        # this guarantees all work completes inside the `with` block and
        # surfaces any worker exception here instead of lazily at the
        # caller's iteration site. Callers already wrapped the result in
        # list(), so returning a list is backward-compatible.
        return list(executor.map(
            partial(optimize_weights, prices=prices, instrument=instrument),
            samples))
def bootstrap(instrument, parallel_process=True):
    """Bootstrap per-year forecast weights by Monte-Carlo sampling.

    For each year of available history, draws 100 random contiguous
    forecast samples (each ~10% of the history to date), optimizes the
    forecast weights on each sample, and records the mean weights.

    Parameters
    ----------
    instrument : object exposing `forecasts()` and `prices()`
        (project type — assumed to return pandas objects; verify).
    parallel_process : bool, default True
        Use mp_optimize_weights (process pool) instead of a serial map.

    Returns
    -------
    DataFrame of mean weights, indexed by year, one column per forecast.
    """
    print("Parallel Process: ", parallel_process)
    forecasts = instrument.forecasts().dropna()
    prices = instrument.prices().reset_index('Contract', drop=True)
    prices = prices[forecasts.index]
    years = sorted(set(prices.index.year))
    result = {}
    for year in years:
        # BUG FIX: np.int was deprecated in NumPy 1.20 and removed in
        # 1.24 — use the builtin int instead.
        sample_length = int(prices[:str(year)].size / 10)
        if sample_length <= 50:
            # BUG FIX: the original computed samples/weights only when
            # sample_length > 50 but still read `weights` afterwards,
            # raising NameError on the first year or silently reusing the
            # previous year's weights. Skip years with too little history.
            print(year, 'skipped: insufficient history')
            continue
        # Sample start dates are drawn so each window fits before the
        # end of the data available up to `year`.
        end_of_sample_selection_space = (
            prices[:str(year)].tail(1).index[0]
            - pd.Timedelta(days=sample_length))
        sample_dates = pd.to_datetime(
            np.random.choice(prices[:end_of_sample_selection_space].index, 100))
        samples = [forecasts.loc[date:date + pd.Timedelta(days=sample_length)]
                   for date in sample_dates]
        if parallel_process is True:
            weights = pd.DataFrame(list(mp_optimize_weights(
                samples, prices[:str(year)], instrument=instrument)))
        else:
            weights = pd.DataFrame(list(map(
                partial(optimize_weights, prices=prices[:str(year)],
                        instrument=instrument),
                samples)))
        if len(weights) < 2:
            print('Weights error')
            break
        result[year] = weights.mean()
        print(year, sample_length)
    output = pd.DataFrame.from_dict(result).transpose()
    output.columns = forecasts.columns
    pl.plot(output)
    display.clear_output(wait=True)
    display.display(pl.gcf())
    return output
On my data, this takes around 45 minutes to run.
I'd like to know:
- Is my approach to parallel processing correct? Should I be using threads instead of processes?
- Can I reconfigure minimize to finish faster? This is bootstrapping, which is a monte-carlo based sampling method, may not require such an accurate result.
- Anything else I can do to speed it up?
In an ideal world, I'd like to speed it up by an order of magnitude.
Aucun commentaire:
Enregistrer un commentaire