Coverage for jetgp/full_gddegp/optimizer.py: 79%

515 statements  

coverage.py v7.10.7, created at 2026-04-03 15:37 -0500

import numpy as np
from numpy.linalg import cholesky, solve
from jetgp.full_gddegp import gddegp_utils as utils
import jetgp.utils as gen_utils
from scipy.linalg import cho_solve, cho_factor
from jetgp.hyperparameter_optimizers import OPTIMIZERS
from line_profiler import profile
from jetgp.utils import matern_kernel_grad_builder
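# NOTE (assumption about the dependency, not part of the original source): in
# recent line_profiler releases, `profile` imported this way is a pass-through
# decorator unless profiling is enabled (e.g. under kernprof or with
# LINE_PROFILE=1), so leaving it on hot methods should cost nothing in normal runs.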


class Optimizer:
    """
    Optimizer class to perform hyperparameter tuning for derivative-enhanced
    Gaussian Process models by minimizing the negative log marginal likelihood (NLL).

    Parameters
    ----------
    model : object
        An instance of a model (e.g., ddegp) containing the necessary training data
        and kernel configuration.
    """

    def __init__(self, model):
        self.model = model
        # Lazily built kernel evaluation plan (see _ensure_kernel_plan).
        self._kernel_plan = None
        self._kernel_plan_n_bases = None
        # Reusable scratch buffers, keyed by shape/size so they are only
        # reallocated when the problem dimensions change.
        self._deriv_buf = None
        self._deriv_buf_shape = None
        self._deriv_factors = None
        self._deriv_factors_key = None
        self._K_buf = None
        self._dK_buf = None
        self._kernel_buf_size = None
        self._W_proj_buf = None
        self._W_proj_shape = None

    def _get_deriv_buf(self, phi, n_bases, order):
        """Return a reusable (ndir, n, m) buffer for expanded OTI derivatives."""
        from math import comb
        # Number of derivative directions up to `order` over `n_bases` bases.
        ndir = comb(n_bases + order, order)
        shape = (ndir, phi.shape[0], phi.shape[1])
        if self._deriv_buf is None or self._deriv_buf_shape != shape:
            self._deriv_buf = np.zeros(shape, dtype=np.float64)
            self._deriv_buf_shape = shape
        return self._deriv_buf

    def _expand_derivs(self, phi, n_bases, deriv_order):
        """Expand OTI derivatives, using the fast struct path if available."""
        if hasattr(phi, 'get_all_derivs_fast'):
            buf = self._get_deriv_buf(phi, n_bases, deriv_order)
            factors = self._get_deriv_factors(n_bases, deriv_order)
            return phi.get_all_derivs_fast(factors, buf)
        return phi.get_all_derivs(n_bases, deriv_order)

    @staticmethod
    def _enum_factors(max_basis, ordi):
        """Yield the repetition factor (product of factorials of repeat counts)
        for each non-decreasing basis tuple of length `ordi`."""
        from math import factorial
        from collections import Counter
        if ordi == 1:
            for _ in range(max_basis):
                yield 1.0
            return
        for last in range(1, max_basis + 1):
            if ordi == 2:
                for i in range(1, last + 1):
                    counts = Counter((i, last))
                    f = 1
                    for c in counts.values():
                        f *= factorial(c)
                    yield float(f)
            else:
                for _, prefix_counts in Optimizer._enum_factors_with_counts(last, ordi - 1):
                    counts = dict(prefix_counts)
                    counts[last] = counts.get(last, 0) + 1
                    f = 1
                    for c in counts.values():
                        f *= factorial(c)
                    yield float(f)

    @staticmethod
    def _enum_factors_with_counts(max_basis, ordi):
        """Like _enum_factors, but also yield the {basis: count} dict per tuple."""
        from math import factorial
        if ordi == 1:
            for i in range(1, max_basis + 1):
                yield 1.0, {i: 1}
            return
        for last in range(1, max_basis + 1):
            for _, prefix_counts in Optimizer._enum_factors_with_counts(last, ordi - 1):
                counts = dict(prefix_counts)
                counts[last] = counts.get(last, 0) + 1
                f = 1
                for c in counts.values():
                    f *= factorial(c)
                yield float(f), counts

    def _get_deriv_factors(self, n_bases, order):
        """Cache and return the flattened repetition factors for all orders up to `order`."""
        key = (n_bases, order)
        if self._deriv_factors is not None and self._deriv_factors_key == key:
            return self._deriv_factors
        factors = [1.0]
        for ordi in range(1, order + 1):
            factors.extend(self._enum_factors(n_bases, ordi))
        self._deriv_factors = np.array(factors, dtype=np.float64)
        self._deriv_factors_key = key
        return self._deriv_factors
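    # Worked example (n_bases=2, order=2): order 0 contributes [1.0]; order 1
    # contributes [1.0, 1.0] for the tuples (1) and (2); order 2 contributes
    # [2.0, 1.0, 2.0] for the tuples (1,1), (1,2), (2,2), since a basis repeated
    # k times contributes a factor of k!. The cached array is therefore
    # [1.0, 1.0, 1.0, 2.0, 1.0, 2.0], whose length matches comb(2 + 2, 2) = 6,
    # the ndir used by _get_deriv_buf.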

    def _ensure_kernel_plan(self, n_bases):
        """Lazily precompute the kernel plan (once per n_bases)."""
        if self._kernel_plan is not None and self._kernel_plan_n_bases == n_bases:
            return
        if not hasattr(utils, 'precompute_kernel_plan'):
            self._kernel_plan = None
            return
        self._kernel_plan = utils.precompute_kernel_plan(
            self.model.n_order, n_bases,
            self.model.flattened_der_indices,
            None,  # GDDEGP uses even/odd bases, not powers
            self.model.derivative_locations,
        )
        self._kernel_plan_n_bases = n_bases
        # Buffer sizes depend on the plan, so invalidate them together.
        self._K_buf = None
        self._dK_buf = None
        self._kernel_buf_size = None

    def _ensure_kernel_bufs(self, n_rows_func):
        """Pre-allocate reusable K and dK buffers (avoids repeated malloc)."""
        if self._kernel_plan is None:
            return
        total = n_rows_func + self._kernel_plan['n_pts_with_derivs']
        if self._kernel_buf_size != total:
            self._K_buf = np.empty((total, total))
            self._dK_buf = np.empty((total, total))
            self._kernel_buf_size = total
        if 'row_offsets_abs' not in self._kernel_plan:
            self._kernel_plan['row_offsets_abs'] = self._kernel_plan['row_offsets'] + n_rows_func
            self._kernel_plan['col_offsets_abs'] = self._kernel_plan['col_offsets'] + n_rows_func

    def _build_K(self, phi_exp, phi, n_bases):
        """Build the kernel matrix, using the fast path if available."""
        self._ensure_kernel_plan(n_bases)
        if self._kernel_plan is not None:
            base_shape = phi.shape
            self._ensure_kernel_bufs(base_shape[0])
            phi_3d = phi_exp.reshape(phi_exp.shape[0], base_shape[0], base_shape[1])
            return utils.rbf_kernel_fast(phi_3d, self._kernel_plan, out=self._K_buf)
        return utils.rbf_kernel(
            phi, phi_exp, self.model.n_order, n_bases,
            self.model.flattened_der_indices,
            index=self.model.derivative_locations,
        )

    @profile
    def negative_log_marginal_likelihood(self, x0):
        """
        Compute the negative log marginal likelihood (NLL) of the model.

        NLL = 0.5 * y^T K^-1 y + 0.5 * log|K| + 0.5 * N * log(2π)

        Parameters
        ----------
        x0 : ndarray
            Vector of log-scaled hyperparameters (length scales and noise).

        Returns
        -------
        float
            Value of the negative log marginal likelihood.
        """
        ell = x0[:-1]
        sigma_n = x0[-1]
        diffs = self.model.differences_by_dim
        phi = self.model.kernel_func(diffs, ell)
        if self.model.n_order == 0:
            n_bases = 0
            phi_exp = phi.real[np.newaxis, :, :]
        else:
            active = phi.get_active_bases()
            n_bases = active[-1] if active else self.model.n_bases

            # Extract ALL derivative components
            deriv_order = 2 * self.model.n_order
            phi_exp = self._expand_derivs(phi, n_bases, deriv_order)
        K = self._build_K(phi_exp, phi, n_bases)
        K += ((10 ** sigma_n) ** 2) * np.eye(len(K))  # noise variance on the diagonal
        # Constant sigma_data**2 offset added to every entry, matching
        # nll_grad and nll_and_grad below.
        K += self.model.sigma_data**2

        try:
            L, low = cho_factor(K)
            alpha = cho_solve((L, low), self.model.y_train)

            data_fit = 0.5 * np.dot(self.model.y_train, alpha)
            # sum(log(diag(L))) equals 0.5 * log|K|, since |K| = prod(diag(L))**2.
            complexity = np.sum(np.log(np.diag(L)))
            N = len(self.model.y_train)
            const = 0.5 * N * np.log(2 * np.pi)
            return data_fit + complexity + const
        except Exception:
            # Factorization failed (K not positive definite at these
            # hyperparameters); return a large penalty value.
            return 1e6
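    # Sanity check of the Cholesky log-determinant identity used above
    # (pure NumPy, independent of jetgp):
    #     A = np.array([[4.0, 1.0], [1.0, 3.0]])
    #     C = np.linalg.cholesky(A)
    #     assert np.isclose(2 * np.sum(np.log(np.diag(C))),
    #                       np.log(np.linalg.det(A)))  # both equal log(11)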

    def nll_wrapper(self, x0):
        """
        Wrapper around the NLL for use as an optimizer objective.

        Parameters
        ----------
        x0 : ndarray
            Hyperparameter vector.

        Returns
        -------
        float
            NLL evaluated at x0.
        """
        return self.negative_log_marginal_likelihood(x0)
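    # Gradient identity used by nll_grad and nll_and_grad (standard GP result):
    #     dNLL/dtheta = 0.5 * tr[(K^{-1} - alpha alpha^T) (dK/dtheta)],
    # with alpha = K^{-1} y, combined with the chain rule for the
    # log10-scaled parameters, d(10^x)/dx = ln(10) * 10^x — hence the
    # recurring ln10 factors in the kernel-specific blocks below.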

    def nll_grad(self, x0):
        """Analytic gradient of the NLL w.r.t. log10-scaled hyperparameters."""
        ln10 = np.log(10.0)

        kernel = self.model.kernel
        kernel_type = self.model.kernel_type
        D = len(self.model.differences_by_dim)
        sigma_n_sq = (10.0 ** x0[-1]) ** 2
        diffs = self.model.differences_by_dim
        oti = self.model.kernel_factory.oti

        phi = self.model.kernel_func(diffs, x0[:-1])
        if self.model.n_order == 0:
            n_bases = 0
            phi_exp = phi.real[np.newaxis, :, :]
        else:
            active = phi.get_active_bases()
            n_bases = active[-1] if active else self.model.n_bases
            deriv_order = 2 * self.model.n_order
            phi_exp = self._expand_derivs(phi, n_bases, deriv_order)

        K = self._build_K(phi_exp, phi, n_bases)
        K.flat[::K.shape[0] + 1] += sigma_n_sq  # add noise variance to the diagonal in place
        K += self.model.sigma_data ** 2

        try:
            L, low = cho_factor(K)
            alpha_v = cho_solve((L, low), self.model.y_train)
            N = len(self.model.y_train)
            K_inv = cho_solve((L, low), np.eye(N))
            # W = K^{-1} - alpha alpha^T, so each gradient entry is 0.5 * tr(W dK).
            W = K_inv - np.outer(alpha_v, alpha_v)
        except Exception:
            # Factorization failed; return a zero gradient so the caller can recover.
            return np.zeros(len(x0))

        grad = np.zeros(len(x0))
        use_fast = self._kernel_plan is not None
        base_shape = phi.shape

        W_proj = None
        # Guard on n_order > 0 (as in nll_and_grad): deriv_order is only
        # defined on the derivative path, and the projection is only
        # meaningful when derivatives are present.
        if use_fast and self.model.n_order > 0:
            from math import comb
            ndir = comb(n_bases + deriv_order, deriv_order)
            proj_shape = (ndir, base_shape[0], base_shape[1])
            if self._W_proj_buf is None or self._W_proj_shape != proj_shape:
                self._W_proj_buf = np.empty(proj_shape)
                self._W_proj_shape = proj_shape
            W_proj = self._W_proj_buf
            plan = self._kernel_plan
            row_off = plan.get('row_offsets_abs', plan['row_offsets'] + base_shape[0])
            col_off = plan.get('col_offsets_abs', plan['col_offsets'] + base_shape[1])
            utils._project_W_to_phi_space(
                W, W_proj, base_shape[0], base_shape[1],
                plan['fd_flat_indices'], plan['df_flat_indices'],
                plan['dd_flat_indices'],
                plan['idx_flat'], plan['idx_offsets'], plan['index_sizes'],
                plan['n_deriv_types'], row_off, col_off,
            )

        def _gc(dphi):
            if self.model.n_order == 0:
                dphi_exp = dphi.real[np.newaxis, :, :]
            else:
                dphi_exp = self._expand_derivs(dphi, n_bases, deriv_order)
            if W_proj is not None:
                dphi_3d = dphi_exp.reshape(W_proj.shape)
                return 0.5 * np.vdot(W_proj, dphi_3d)
            elif use_fast:
                dphi_3d = dphi_exp.reshape(dphi_exp.shape[0], base_shape[0], base_shape[1])
                dK = utils.rbf_kernel_fast(dphi_3d, self._kernel_plan, out=self._dK_buf)
                return 0.5 * np.vdot(W, dK)
            else:
                dK = utils.rbf_kernel(
                    dphi, dphi_exp, self.model.n_order, n_bases,
                    self.model.flattened_der_indices,
                    index=self.model.derivative_locations,
                )
                return 0.5 * np.vdot(W, dK)

        grad[-2] = _gc(oti.mul(2.0 * ln10, phi))  # signal-variance term
        grad[-1] = ln10 * sigma_n_sq * np.trace(W)  # noise term

        if kernel == 'SE':
            if kernel_type == 'anisotropic':
                ell = 10.0 ** x0[:D]
                if hasattr(phi, 'fused_scale_sq_mul'):
                    dphi_buf = oti.zeros(phi.shape)
                    for d in range(D):
                        dphi_buf.fused_scale_sq_mul(diffs[d], phi, -ln10 * ell[d] ** 2)
                        grad[d] = _gc(dphi_buf)
                else:
                    for d in range(D):
                        grad[d] = _gc(oti.mul(-ln10 * ell[d] ** 2,
                                              oti.mul(oti.mul(diffs[d], diffs[d]), phi)))
            else:
                ell = 10.0 ** float(x0[0])
                if hasattr(phi, 'fused_sum_sq'):
                    sum_sq = oti.zeros(phi.shape)
                    sum_sq.fused_sum_sq(diffs)
                else:
                    sum_sq = oti.mul(diffs[0], diffs[0])
                    for d in range(1, D):
                        sum_sq = oti.sum(sum_sq, oti.mul(diffs[d], diffs[d]))
                grad[0] = _gc(oti.mul(-ln10 * ell ** 2, oti.mul(sum_sq, phi)))

        elif kernel == 'RQ':
            if kernel_type == 'anisotropic':
                ell = 10.0 ** x0[:D]
                alpha_rq = 10.0 ** float(x0[D])
                alpha_idx = D
            else:
                ell = np.full(D, 10.0 ** float(x0[0]))
                # NOTE: the isotropic path parameterizes alpha as exp(x) rather
                # than 10**x, hence the missing ln10 in alpha_factor below.
                alpha_rq = np.exp(float(x0[1]))
                alpha_idx = 1
            if hasattr(phi, 'fused_sqdist'):
                r2 = oti.zeros(phi.shape)
                ell_sq = np.ascontiguousarray(ell ** 2, dtype=np.float64)
                r2.fused_sqdist(diffs, ell_sq)
            else:
                r2 = oti.mul(ell[0], diffs[0])
                r2 = oti.mul(r2, r2)
                for d in range(1, D):
                    td = oti.mul(ell[d], diffs[d])
                    r2 = oti.sum(r2, oti.mul(td, td))
            base = oti.sum(1.0, oti.mul(r2, 1.0 / (2.0 * alpha_rq)))
            inv_base = oti.pow(base, -1)
            phi_over_base = oti.mul(phi, inv_base)
            if kernel_type == 'anisotropic':
                if hasattr(phi, 'fused_scale_sq_mul'):
                    dphi_buf = oti.zeros(phi.shape)
                    for d in range(D):
                        dphi_buf.fused_scale_sq_mul(diffs[d], phi_over_base, -ln10 * ell[d] ** 2)
                        grad[d] = _gc(dphi_buf)
                else:
                    for d in range(D):
                        grad[d] = _gc(oti.mul(-ln10 * ell[d] ** 2,
                                              oti.mul(oti.mul(diffs[d], diffs[d]), phi_over_base)))
            else:
                if hasattr(phi, 'fused_sum_sq'):
                    sum_sq = oti.zeros(phi.shape)
                    sum_sq.fused_sum_sq(diffs)
                else:
                    sum_sq = oti.mul(diffs[0], diffs[0])
                    for d in range(1, D):
                        sum_sq = oti.sum(sum_sq, oti.mul(diffs[d], diffs[d]))
                grad[0] = _gc(oti.mul(-ln10 * ell[0] ** 2, oti.mul(sum_sq, phi_over_base)))
            log_base = oti.log(base)
            term = oti.sub(oti.sub(1.0, inv_base), log_base)
            alpha_factor = ln10 * alpha_rq if kernel_type == 'anisotropic' else alpha_rq
            grad[alpha_idx] = _gc(oti.mul(alpha_factor, oti.mul(phi, term)))

        elif kernel == 'SineExp':
            if kernel_type == 'anisotropic':
                ell = 10.0 ** x0[:D]
                p = 10.0 ** x0[D:2 * D]
                pip = np.pi / p
                p_start = D
            else:
                ell = np.full(D, 10.0 ** float(x0[0]))
                pip = np.full(D, np.pi / 10.0 ** float(x0[1]))
                p_start = 1
            sin_d = [oti.sin(oti.mul(pip[d], diffs[d])) for d in range(D)]
            cos_d = [oti.cos(oti.mul(pip[d], diffs[d])) for d in range(D)]
            if kernel_type == 'anisotropic':
                if hasattr(phi, 'fused_scale_sq_mul'):
                    dphi_buf = oti.zeros(phi.shape)
                    for d in range(D):
                        dphi_buf.fused_scale_sq_mul(sin_d[d], phi, -4.0 * ln10 * ell[d] ** 2)
                        grad[d] = _gc(dphi_buf)
                else:
                    for d in range(D):
                        grad[d] = _gc(oti.mul(-4.0 * ln10 * ell[d] ** 2,
                                              oti.mul(oti.mul(sin_d[d], sin_d[d]), phi)))
                for d in range(D):
                    sc = oti.mul(sin_d[d], oti.mul(cos_d[d], diffs[d]))
                    grad[p_start + d] = _gc(oti.mul(4.0 * ln10 * ell[d] ** 2 * pip[d],
                                                    oti.mul(sc, phi)))
            else:
                if hasattr(phi, 'fused_sum_sq'):
                    ss = oti.zeros(phi.shape)
                    ss.fused_sum_sq(sin_d)
                else:
                    ss = oti.mul(sin_d[0], sin_d[0])
                    for d in range(1, D):
                        ss = oti.sum(ss, oti.mul(sin_d[d], sin_d[d]))
                grad[0] = _gc(oti.mul(-4.0 * ln10 * ell[0] ** 2, oti.mul(ss, phi)))
                scd = oti.mul(sin_d[0], oti.mul(cos_d[0], diffs[0]))
                for d in range(1, D):
                    scd = oti.sum(scd, oti.mul(sin_d[d], oti.mul(cos_d[d], diffs[d])))
                grad[p_start] = _gc(oti.mul(4.0 * ln10 * ell[0] ** 2 * pip[0],
                                            oti.mul(scd, phi)))

        elif kernel == 'Matern':
            kf = self.model.kernel_factory
            if not hasattr(kf, '_matern_grad_prebuild'):
                kf._matern_grad_prebuild = matern_kernel_grad_builder(getattr(kf, "nu", 1.5), oti_module=oti)
            ell = (10.0 ** x0[:D] if kernel_type == 'anisotropic'
                   else np.full(D, 10.0 ** float(x0[0])))
            sigma_f_sq = (10.0 ** float(x0[-2])) ** 2
            _eps = 1e-10  # keeps r away from zero so 1/r stays finite
            if hasattr(phi, 'fused_sqdist'):
                r2 = oti.zeros(phi.shape)
                ell_sq = np.ascontiguousarray(ell ** 2, dtype=np.float64)
                r2.fused_sqdist(diffs, ell_sq)
            else:
                r2 = oti.mul(ell[0], diffs[0])
                r2 = oti.mul(r2, r2)
                for d in range(1, D):
                    td = oti.mul(ell[d], diffs[d])
                    r2 = oti.sum(r2, oti.mul(td, td))
            r_oti = oti.sqrt(oti.sum(r2, _eps ** 2))
            f_prime_r = kf._matern_grad_prebuild(r_oti)
            inv_r = oti.pow(r_oti, -1)
            base_matern = oti.mul(sigma_f_sq, oti.mul(f_prime_r, inv_r))
            if kernel_type == 'anisotropic':
                if hasattr(phi, 'fused_scale_sq_mul'):
                    dphi_buf = oti.zeros(phi.shape)
                    for d in range(D):
                        dphi_buf.fused_scale_sq_mul(diffs[d], base_matern, ln10 * ell[d] ** 2)
                        grad[d] = _gc(dphi_buf)
                else:
                    for d in range(D):
                        d_sq = oti.mul(diffs[d], diffs[d])
                        dphi_d = oti.mul(ln10 * ell[d] ** 2, oti.mul(d_sq, base_matern))
                        grad[d] = _gc(dphi_d)
            else:
                if hasattr(phi, 'fused_sum_sq'):
                    sum_dsq = oti.zeros(phi.shape)
                    sum_dsq.fused_sum_sq(diffs)
                else:
                    sum_dsq = oti.mul(diffs[0], diffs[0])
                    for d in range(1, D):
                        sum_dsq = oti.sum(sum_dsq, oti.mul(diffs[d], diffs[d]))
                dphi_e = oti.mul(ln10 * ell[0] ** 2, oti.mul(sum_dsq, base_matern))
                grad[0] = _gc(dphi_e)

        return grad
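    # NOTE: nll_and_grad below intentionally repeats the kernel-specific logic
    # of nll_grad, but shares a single Cholesky factorization between the NLL
    # value and the gradient, so optimizers that want both pay for the kernel
    # build and factorization only once per evaluation.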

    def nll_and_grad(self, x0):
        """Compute the NLL and its gradient in a single pass, sharing one Cholesky."""
        ln10 = np.log(10.0)

        kernel = self.model.kernel
        kernel_type = self.model.kernel_type
        D = len(self.model.differences_by_dim)
        sigma_n_sq = (10.0 ** x0[-1]) ** 2
        diffs = self.model.differences_by_dim
        oti = self.model.kernel_factory.oti

        # --- shared kernel computation (done ONCE) ---
        phi = self.model.kernel_func(diffs, x0[:-1])
        if self.model.n_order == 0:
            n_bases = 0
            phi_exp = phi.real[np.newaxis, :, :]
        else:
            active = phi.get_active_bases()
            n_bases = active[-1] if active else self.model.n_bases
            deriv_order = 2 * self.model.n_order
            phi_exp = self._expand_derivs(phi, n_bases, deriv_order)

        K = self._build_K(phi_exp, phi, n_bases)
        K.flat[::K.shape[0] + 1] += sigma_n_sq
        K += self.model.sigma_data ** 2

        try:
            L, low = cho_factor(K)
            alpha_v = cho_solve((L, low), self.model.y_train)
            N = len(self.model.y_train)

            # NLL
            nll = (0.5 * np.dot(self.model.y_train, alpha_v)
                   + np.sum(np.log(np.diag(L)))
                   + 0.5 * N * np.log(2 * np.pi))

            # W matrix for the gradient (reuses the same Cholesky)
            K_inv = cho_solve((L, low), np.eye(N))
            W = K_inv - np.outer(alpha_v, alpha_v)
        except Exception:
            return 1e6, np.zeros(len(x0))

        # --- gradient from W (no second kernel build / Cholesky) ---
        grad = np.zeros(len(x0))
        use_fast = self._kernel_plan is not None
        base_shape = phi.shape

        W_proj = None
        if use_fast and self.model.n_order > 0:
            from math import comb
            ndir = comb(n_bases + deriv_order, deriv_order)
            proj_shape = (ndir, base_shape[0], base_shape[1])
            if self._W_proj_buf is None or self._W_proj_shape != proj_shape:
                self._W_proj_buf = np.empty(proj_shape)
                self._W_proj_shape = proj_shape
            W_proj = self._W_proj_buf
            plan = self._kernel_plan
            row_off = plan.get('row_offsets_abs', plan['row_offsets'] + base_shape[0])
            col_off = plan.get('col_offsets_abs', plan['col_offsets'] + base_shape[1])
            utils._project_W_to_phi_space(
                W, W_proj, base_shape[0], base_shape[1],
                plan['fd_flat_indices'], plan['df_flat_indices'],
                plan['dd_flat_indices'],
                plan['idx_flat'], plan['idx_offsets'], plan['index_sizes'],
                plan['n_deriv_types'], row_off, col_off,
            )

        def _gc(dphi):
            if self.model.n_order == 0:
                dphi_exp = dphi.real[np.newaxis, :, :]
            else:
                dphi_exp = self._expand_derivs(dphi, n_bases, deriv_order)
            if W_proj is not None:
                dphi_3d = dphi_exp.reshape(W_proj.shape)
                return 0.5 * np.vdot(W_proj, dphi_3d)
            elif use_fast:
                dphi_3d = dphi_exp.reshape(dphi_exp.shape[0], base_shape[0], base_shape[1])
                dK = utils.rbf_kernel_fast(dphi_3d, self._kernel_plan, out=self._dK_buf)
                return 0.5 * np.vdot(W, dK)
            else:
                dK = utils.rbf_kernel(
                    dphi, dphi_exp, self.model.n_order, n_bases,
                    self.model.flattened_der_indices,
                    index=self.model.derivative_locations,
                )
                return 0.5 * np.vdot(W, dK)

        grad[-2] = _gc(oti.mul(2.0 * ln10, phi))
        grad[-1] = ln10 * sigma_n_sq * np.trace(W)

        if kernel == 'SE':
            if kernel_type == 'anisotropic':
                ell = 10.0 ** x0[:D]
                if hasattr(phi, 'fused_scale_sq_mul'):
                    dphi_buf = oti.zeros(phi.shape)
                    for d in range(D):
                        dphi_buf.fused_scale_sq_mul(diffs[d], phi, -ln10 * ell[d] ** 2)
                        grad[d] = _gc(dphi_buf)
                else:
                    for d in range(D):
                        grad[d] = _gc(oti.mul(-ln10 * ell[d] ** 2,
                                              oti.mul(oti.mul(diffs[d], diffs[d]), phi)))
            else:
                ell = 10.0 ** float(x0[0])
                if hasattr(phi, 'fused_sum_sq'):
                    sum_sq = oti.zeros(phi.shape)
                    sum_sq.fused_sum_sq(diffs)
                else:
                    sum_sq = oti.mul(diffs[0], diffs[0])
                    for d in range(1, D):
                        sum_sq = oti.sum(sum_sq, oti.mul(diffs[d], diffs[d]))
                grad[0] = _gc(oti.mul(-ln10 * ell ** 2, oti.mul(sum_sq, phi)))

        elif kernel == 'RQ':
            if kernel_type == 'anisotropic':
                ell = 10.0 ** x0[:D]
                alpha_rq = 10.0 ** float(x0[D])
                alpha_idx = D
            else:
                ell = np.full(D, 10.0 ** float(x0[0]))
                alpha_rq = np.exp(float(x0[1]))
                alpha_idx = 1
            if hasattr(phi, 'fused_sqdist'):
                r2 = oti.zeros(phi.shape)
                ell_sq = np.ascontiguousarray(ell ** 2, dtype=np.float64)
                r2.fused_sqdist(diffs, ell_sq)
            else:
                r2 = oti.mul(ell[0], diffs[0])
                r2 = oti.mul(r2, r2)
                for d in range(1, D):
                    td = oti.mul(ell[d], diffs[d])
                    r2 = oti.sum(r2, oti.mul(td, td))
            base = oti.sum(1.0, oti.mul(r2, 1.0 / (2.0 * alpha_rq)))
            inv_base = oti.pow(base, -1)
            phi_over_base = oti.mul(phi, inv_base)
            if kernel_type == 'anisotropic':
                if hasattr(phi, 'fused_scale_sq_mul'):
                    dphi_buf = oti.zeros(phi.shape)
                    for d in range(D):
                        dphi_buf.fused_scale_sq_mul(diffs[d], phi_over_base, -ln10 * ell[d] ** 2)
                        grad[d] = _gc(dphi_buf)
                else:
                    for d in range(D):
                        grad[d] = _gc(oti.mul(-ln10 * ell[d] ** 2,
                                              oti.mul(oti.mul(diffs[d], diffs[d]), phi_over_base)))
            else:
                if hasattr(phi, 'fused_sum_sq'):
                    sum_sq = oti.zeros(phi.shape)
                    sum_sq.fused_sum_sq(diffs)
                else:
                    sum_sq = oti.mul(diffs[0], diffs[0])
                    for d in range(1, D):
                        sum_sq = oti.sum(sum_sq, oti.mul(diffs[d], diffs[d]))
                grad[0] = _gc(oti.mul(-ln10 * ell[0] ** 2, oti.mul(sum_sq, phi_over_base)))
            log_base = oti.log(base)
            term = oti.sub(oti.sub(1.0, inv_base), log_base)
            alpha_factor = ln10 * alpha_rq if kernel_type == 'anisotropic' else alpha_rq
            grad[alpha_idx] = _gc(oti.mul(alpha_factor, oti.mul(phi, term)))

        elif kernel == 'SineExp':
            if kernel_type == 'anisotropic':
                ell = 10.0 ** x0[:D]
                p = 10.0 ** x0[D:2 * D]
                pip = np.pi / p
                p_start = D
            else:
                ell = np.full(D, 10.0 ** float(x0[0]))
                pip = np.full(D, np.pi / 10.0 ** float(x0[1]))
                p_start = 1
            sin_d = [oti.sin(oti.mul(pip[d], diffs[d])) for d in range(D)]
            cos_d = [oti.cos(oti.mul(pip[d], diffs[d])) for d in range(D)]
            if kernel_type == 'anisotropic':
                if hasattr(phi, 'fused_scale_sq_mul'):
                    dphi_buf = oti.zeros(phi.shape)
                    for d in range(D):
                        dphi_buf.fused_scale_sq_mul(sin_d[d], phi, -4.0 * ln10 * ell[d] ** 2)
                        grad[d] = _gc(dphi_buf)
                else:
                    for d in range(D):
                        grad[d] = _gc(oti.mul(-4.0 * ln10 * ell[d] ** 2,
                                              oti.mul(oti.mul(sin_d[d], sin_d[d]), phi)))
                for d in range(D):
                    sc = oti.mul(sin_d[d], oti.mul(cos_d[d], diffs[d]))
                    grad[p_start + d] = _gc(oti.mul(4.0 * ln10 * ell[d] ** 2 * pip[d],
                                                    oti.mul(sc, phi)))
            else:
                if hasattr(phi, 'fused_sum_sq'):
                    ss = oti.zeros(phi.shape)
                    ss.fused_sum_sq(sin_d)
                else:
                    ss = oti.mul(sin_d[0], sin_d[0])
                    for d in range(1, D):
                        ss = oti.sum(ss, oti.mul(sin_d[d], sin_d[d]))
                grad[0] = _gc(oti.mul(-4.0 * ln10 * ell[0] ** 2, oti.mul(ss, phi)))
                scd = oti.mul(sin_d[0], oti.mul(cos_d[0], diffs[0]))
                for d in range(1, D):
                    scd = oti.sum(scd, oti.mul(sin_d[d], oti.mul(cos_d[d], diffs[d])))
                grad[p_start] = _gc(oti.mul(4.0 * ln10 * ell[0] ** 2 * pip[0],
                                            oti.mul(scd, phi)))

        elif kernel == 'Matern':
            kf = self.model.kernel_factory
            if not hasattr(kf, '_matern_grad_prebuild'):
                kf._matern_grad_prebuild = matern_kernel_grad_builder(getattr(kf, "nu", 1.5), oti_module=oti)
            ell = (10.0 ** x0[:D] if kernel_type == 'anisotropic'
                   else np.full(D, 10.0 ** float(x0[0])))
            sigma_f_sq = (10.0 ** float(x0[-2])) ** 2
            _eps = 1e-10
            if hasattr(phi, 'fused_sqdist'):
                r2 = oti.zeros(phi.shape)
                ell_sq = np.ascontiguousarray(ell ** 2, dtype=np.float64)
                r2.fused_sqdist(diffs, ell_sq)
            else:
                r2 = oti.mul(ell[0], diffs[0])
                r2 = oti.mul(r2, r2)
                for d in range(1, D):
                    td = oti.mul(ell[d], diffs[d])
                    r2 = oti.sum(r2, oti.mul(td, td))
            r_oti = oti.sqrt(oti.sum(r2, _eps ** 2))
            f_prime_r = kf._matern_grad_prebuild(r_oti)
            inv_r = oti.pow(r_oti, -1)
            base_matern = oti.mul(sigma_f_sq, oti.mul(f_prime_r, inv_r))
            if kernel_type == 'anisotropic':
                if hasattr(phi, 'fused_scale_sq_mul'):
                    dphi_buf = oti.zeros(phi.shape)
                    for d in range(D):
                        dphi_buf.fused_scale_sq_mul(diffs[d], base_matern, ln10 * ell[d] ** 2)
                        grad[d] = _gc(dphi_buf)
                else:
                    for d in range(D):
                        d_sq = oti.mul(diffs[d], diffs[d])
                        dphi_d = oti.mul(ln10 * ell[d] ** 2, oti.mul(d_sq, base_matern))
                        grad[d] = _gc(dphi_d)
            else:
                if hasattr(phi, 'fused_sum_sq'):
                    sum_dsq = oti.zeros(phi.shape)
                    sum_dsq.fused_sum_sq(diffs)
                else:
                    sum_dsq = oti.mul(diffs[0], diffs[0])
                    for d in range(1, D):
                        sum_dsq = oti.sum(sum_dsq, oti.mul(diffs[d], diffs[d]))
                dphi_e = oti.mul(ln10 * ell[0] ** 2, oti.mul(sum_dsq, base_matern))
                grad[0] = _gc(dphi_e)

        return float(nll), grad

    def optimize_hyperparameters(self, optimizer="pso", **kwargs):
        """
        Optimize the model hyperparameters by minimizing the NLL.

        Parameters
        ----------
        optimizer : str or callable, default="pso"
            Name of a registered optimizer (a key of OPTIMIZERS), such as
            "pso", "lbfgs", or "jade", or a callable with the signature
            optimizer_fn(func, lb, ub, **kwargs) -> (best_x, best_val).
        **kwargs
            Forwarded to the chosen optimizer (e.g., swarm size, iteration
            limits, or verbosity for PSO).

        Returns
        -------
        best_x : ndarray
            The optimal set of hyperparameters found.
        """
        if isinstance(optimizer, str):
            if optimizer not in OPTIMIZERS:
                raise ValueError(
                    f"Unknown optimizer '{optimizer}'. Available: {list(OPTIMIZERS.keys())}"
                )
            optimizer_fn = OPTIMIZERS[optimizer]
        else:
            optimizer_fn = optimizer  # allow passing a callable directly

        bounds = self.model.bounds
        lb = [b[0] for b in bounds]
        ub = [b[1] for b in bounds]

        # Hand gradient-capable optimizers the fused NLL+gradient callback,
        # unless the caller supplied their own.
        if optimizer in ('lbfgs', 'jade', 'pso') and 'func_and_grad' not in kwargs and 'grad_func' not in kwargs:
            kwargs['func_and_grad'] = self.nll_and_grad

        best_x, best_val = optimizer_fn(self.nll_wrapper, lb, ub, **kwargs)

        self.model.opt_x0 = best_x
        self.model.opt_nll = best_val

        return best_x
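
# Typical usage (sketch; constructing `model` depends on the surrounding jetgp
# API and is assumed here, not shown in this module):
#     model = ...  # a ddegp/gddegp model instance with training data attached
#     opt = Optimizer(model)
#     best_x = opt.optimize_hyperparameters(optimizer="lbfgs")
#     print(opt.nll_wrapper(best_x))  # NLL at the optimum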