Coverage for jetgp/full_ddegp/optimizer.py: 69%

503 statements  

« prev     ^ index     » next       coverage.py v7.10.7, created at 2026-04-03 15:37 -0500

1import numpy as np 

2 

3from scipy.linalg import cho_solve, cho_factor 

4from jetgp.full_ddegp import ddegp_utils as utils 

5from line_profiler import profile 

6import jetgp.utils as gen_utils 

7from jetgp.hyperparameter_optimizers import OPTIMIZERS 

8from jetgp.utils import matern_kernel_grad_builder 

9 

class Optimizer:
    """
    Optimizer class to perform hyperparameter tuning for derivative-enhanced Gaussian Process models
    by minimizing the negative log marginal likelihood (NLL).

    Parameters
    ----------
    model : object
        An instance of a model (e.g., ddegp) containing the necessary training data
        and kernel configuration.
    """

    def __init__(self, model):
        self.model = model
        # Fast-path kernel "plan" built lazily by _ensure_kernel_plan,
        # cached per n_bases.
        self._kernel_plan = None
        # Scratch buffer for OTI derivative expansion, plus the shape it
        # was last allocated for (reallocated only on shape change).
        self._deriv_buf = None
        self._deriv_buf_shape = None
        # Cached derivative multiplicity factors and their (n_bases, order)
        # cache key (see _get_deriv_factors).
        self._deriv_factors = None
        self._deriv_factors_key = None
        # Reusable kernel matrices K and dK and the size they were
        # allocated at (see _ensure_kernel_bufs).
        self._K_buf = None
        self._dK_buf = None
        self._kernel_buf_size = None
        # Reusable buffer for W projected into phi space (gradient fast
        # path in nll_grad / nll_and_grad), plus its shape.
        self._W_proj_buf = None
        self._W_proj_shape = None

34 

35 def _get_deriv_buf(self, phi, n_bases, order): 

36 from math import comb 

37 ndir = comb(n_bases + order, order) 

38 shape = (ndir, phi.shape[0], phi.shape[1]) 

39 if self._deriv_buf is None or self._deriv_buf_shape != shape: 

40 self._deriv_buf = np.zeros(shape, dtype=np.float64) 

41 self._deriv_buf_shape = shape 

42 return self._deriv_buf 

43 

44 def _expand_derivs(self, phi, n_bases, deriv_order): 

45 """Expand OTI derivatives, using fast struct path if available.""" 

46 if hasattr(phi, 'get_all_derivs_fast'): 

47 buf = self._get_deriv_buf(phi, n_bases, deriv_order) 

48 factors = self._get_deriv_factors(n_bases, deriv_order) 

49 return phi.get_all_derivs_fast(factors, buf) 

50 return phi.get_all_derivs(n_bases, deriv_order) 

51 

52 @staticmethod 

53 def _enum_factors(max_basis, ordi): 

54 from math import factorial 

55 from collections import Counter 

56 if ordi == 1: 

57 for _ in range(max_basis): 

58 yield 1.0 

59 return 

60 for last in range(1, max_basis + 1): 

61 if ordi == 2: 

62 for i in range(1, last + 1): 

63 counts = Counter((i, last)) 

64 f = 1 

65 for c in counts.values(): 

66 f *= factorial(c) 

67 yield float(f) 

68 else: 

69 for _, prefix_counts in Optimizer._enum_factors_with_counts(last, ordi - 1): 

70 counts = dict(prefix_counts) 

71 counts[last] = counts.get(last, 0) + 1 

72 f = 1 

73 for c in counts.values(): 

74 f *= factorial(c) 

75 yield float(f) 

76 

77 @staticmethod 

78 def _enum_factors_with_counts(max_basis, ordi): 

79 from math import factorial 

80 from collections import Counter 

81 if ordi == 1: 

82 for i in range(1, max_basis + 1): 

83 yield 1.0, {i: 1} 

84 return 

85 for last in range(1, max_basis + 1): 

86 for _, prefix_counts in Optimizer._enum_factors_with_counts(last, ordi - 1): 

87 counts = dict(prefix_counts) 

88 counts[last] = counts.get(last, 0) + 1 

89 f = 1 

90 for c in counts.values(): 

91 f *= factorial(c) 

92 yield float(f), counts 

93 

94 def _get_deriv_factors(self, n_bases, order): 

95 key = (n_bases, order) 

96 if self._deriv_factors is not None and self._deriv_factors_key == key: 

97 return self._deriv_factors 

98 factors = [1.0] 

99 for ordi in range(1, order + 1): 

100 factors.extend(self._enum_factors(n_bases, ordi)) 

101 self._deriv_factors = np.array(factors, dtype=np.float64) 

102 self._deriv_factors_key = key 

103 return self._deriv_factors 

104 

    def _ensure_kernel_plan(self, n_bases):
        """Lazily precompute kernel plan (once per n_bases)."""
        # Reuse the cached plan while the basis count is unchanged.
        # _kernel_plan_n_bases is only ever assigned after a successful
        # build below, so the short-circuit on `is not None` keeps this
        # read safe even though __init__ never initializes it.
        if self._kernel_plan is not None and self._kernel_plan_n_bases == n_bases:
            return
        if not hasattr(utils, 'precompute_kernel_plan'):
            # Fast path unavailable in this build of ddegp_utils;
            # _build_K will fall back to the generic kernel assembly.
            self._kernel_plan = None
            return
        self._kernel_plan = utils.precompute_kernel_plan(
            self.model.n_order, n_bases,
            self.model.flattened_der_indices,
            self.model.powers,
            self.model.derivative_locations,
        )
        self._kernel_plan_n_bases = n_bases
        # The K/dK buffers were sized for the previous plan; drop them so
        # _ensure_kernel_bufs reallocates at the new size.
        self._K_buf = None
        self._dK_buf = None
        self._kernel_buf_size = None

122 

    def _ensure_kernel_bufs(self, n_rows_func):
        """Pre-allocate reusable K and dK buffers (avoids repeated malloc)."""
        if self._kernel_plan is None:
            return
        # Full kernel size = function-value rows + derivative-augmented rows.
        total = n_rows_func + self._kernel_plan['n_pts_with_derivs']
        if self._kernel_buf_size != total:
            self._K_buf = np.empty((total, total))
            self._dK_buf = np.empty((total, total))
            self._kernel_buf_size = total
        # Cache absolute row/column offsets on the plan so the gradient
        # path can reuse them without recomputing.
        # NOTE(review): these are computed once per plan from the current
        # n_rows_func; if the same plan were ever reused with a different
        # n_rows_func the cached offsets would be stale — confirm
        # n_rows_func is constant for a given model.
        if 'row_offsets_abs' not in self._kernel_plan:
            self._kernel_plan['row_offsets_abs'] = self._kernel_plan['row_offsets'] + n_rows_func
            self._kernel_plan['col_offsets_abs'] = self._kernel_plan['col_offsets'] + n_rows_func

135 

136 def _build_K(self, phi_exp, phi, n_bases): 

137 """Build kernel matrix using fast path if available.""" 

138 self._ensure_kernel_plan(n_bases) 

139 if self._kernel_plan is not None: 

140 base_shape = phi.shape 

141 self._ensure_kernel_bufs(base_shape[0]) 

142 phi_3d = phi_exp.reshape(phi_exp.shape[0], base_shape[0], base_shape[1]) 

143 return utils.rbf_kernel_fast(phi_3d, self._kernel_plan, out=self._K_buf) 

144 return utils.rbf_kernel( 

145 phi, phi_exp, self.model.n_order, n_bases, 

146 self.model.flattened_der_indices, self.model.powers, 

147 self.model.derivative_locations, 

148 ) 

149 

150 @profile 

151 def negative_log_marginal_likelihood(self, x0): 

152 """ 

153 Compute the negative log marginal likelihood (NLL) of the model. 

154 

155 NLL = 0.5 * y^T K^-1 y + 0.5 * log|K| + 0.5 * N * log(2π) 

156 

157 Parameters 

158 ---------- 

159 x0 : ndarray 

160 Vector of log-scaled hyperparameters (length scales and noise). 

161 

162 Returns 

163 ------- 

164 float 

165 Value of the negative log marginal likelihood. 

166 """ 

167 ell = x0[:-1] 

168 sigma_n = x0[-1] 

169 llhood = 0 

170 diffs = self.model.differences_by_dim 

171 phi = self.model.kernel_func(diffs, ell) 

172 n_bases = phi.get_active_bases()[-1] 

173 

174 # Extract ALL derivative components 

175 deriv_order = 2 * self.model.n_order 

176 phi_exp = self._expand_derivs(phi, n_bases, deriv_order) 

177 K = self._build_K(phi_exp, phi, n_bases) 

178 K += ((10 ** sigma_n) ** 2) * np.eye(len(K)) 

179 K += self.model.sigma_data**2 

180 

181 try: 

182 L, low = cho_factor(K) 

183 alpha = cho_solve( 

184 (L, low), 

185 self.model.y_train 

186 ) 

187 

188 data_fit = 0.5 * np.dot(self.model.y_train, alpha) 

189 log_det_K = np.sum(np.log(np.diag(L))) 

190 complexity = log_det_K 

191 N = len(self.model.y_train) 

192 const = 0.5 * N * np.log(2 * np.pi) 

193 return data_fit + complexity + const 

194 except Exception: 

195 return 1e6 

196 

197 def nll_wrapper(self, x0): 

198 """ 

199 Wrapper function to compute NLL for optimizer. 

200 

201 Parameters 

202 ---------- 

203 x0 : ndarray 

204 Hyperparameter vector. 

205 

206 Returns 

207 ------- 

208 float 

209 NLL evaluated at x0. 

210 """ 

211 return self.negative_log_marginal_likelihood(x0) 

212 

213 def nll_grad(self, x0): 

214 """Analytic gradient of the NLL w.r.t. log10-scaled hyperparameters.""" 

215 ln10 = np.log(10.0) 

216 

217 kernel = self.model.kernel 

218 kernel_type = self.model.kernel_type 

219 D = len(self.model.differences_by_dim) 

220 sigma_n_sq = (10.0 ** x0[-1]) ** 2 

221 diffs = self.model.differences_by_dim 

222 oti = self.model.kernel_factory.oti 

223 

224 phi = self.model.kernel_func(diffs, x0[:-1]) 

225 n_bases = phi.get_active_bases()[-1] 

226 deriv_order = 2 * self.model.n_order 

227 phi_exp = self._expand_derivs(phi, n_bases, deriv_order) 

228 

229 K = self._build_K(phi_exp, phi, n_bases) 

230 K.flat[::K.shape[0] + 1] += sigma_n_sq 

231 K += self.model.sigma_data ** 2 

232 

233 try: 

234 L, low = cho_factor(K) 

235 alpha_v = cho_solve((L, low), self.model.y_train) 

236 N = len(self.model.y_train) 

237 K_inv = cho_solve((L, low), np.eye(N)) 

238 W = K_inv - np.outer(alpha_v, alpha_v) 

239 except Exception: 

240 return np.zeros(len(x0)) 

241 

242 grad = np.zeros(len(x0)) 

243 use_fast = self._kernel_plan is not None 

244 base_shape = phi.shape 

245 

246 W_proj = None 

247 if use_fast: 

248 from math import comb 

249 ndir = comb(n_bases + deriv_order, deriv_order) 

250 proj_shape = (ndir, base_shape[0], base_shape[1]) 

251 if self._W_proj_buf is None or self._W_proj_shape != proj_shape: 

252 self._W_proj_buf = np.empty(proj_shape) 

253 self._W_proj_shape = proj_shape 

254 W_proj = self._W_proj_buf 

255 plan = self._kernel_plan 

256 row_off = plan.get('row_offsets_abs', plan['row_offsets'] + base_shape[0]) 

257 col_off = plan.get('col_offsets_abs', plan['col_offsets'] + base_shape[1]) 

258 utils._project_W_to_phi_space( 

259 W, W_proj, base_shape[0], base_shape[1], 

260 plan['fd_flat_indices'], plan['df_flat_indices'], 

261 plan['dd_flat_indices'], 

262 plan['idx_flat'], plan['idx_offsets'], plan['index_sizes'], 

263 plan['signs'], plan['n_deriv_types'], row_off, col_off, 

264 ) 

265 

266 def _gc(dphi): 

267 dphi_exp = self._expand_derivs(dphi, n_bases, deriv_order) 

268 if W_proj is not None: 

269 dphi_3d = dphi_exp.reshape(W_proj.shape) 

270 return 0.5 * np.vdot(W_proj, dphi_3d) 

271 elif use_fast: 

272 dphi_3d = dphi_exp.reshape(dphi_exp.shape[0], base_shape[0], base_shape[1]) 

273 dK = utils.rbf_kernel_fast(dphi_3d, self._kernel_plan, out=self._dK_buf) 

274 return 0.5 * np.vdot(W, dK) 

275 else: 

276 dK = utils.rbf_kernel( 

277 dphi, dphi_exp, self.model.n_order, n_bases, 

278 self.model.flattened_der_indices, self.model.powers, 

279 self.model.derivative_locations, 

280 ) 

281 return 0.5 * np.vdot(W, dK) 

282 

283 grad[-2] = _gc(oti.mul(2.0 * ln10, phi)) 

284 grad[-1] = ln10 * sigma_n_sq * np.trace(W) 

285 

286 if kernel == 'SE': 

287 if kernel_type == 'anisotropic': 

288 ell = 10.0 ** x0[:D] 

289 if hasattr(phi, 'fused_scale_sq_mul'): 

290 dphi_buf = oti.zeros(phi.shape) 

291 for d in range(D): 

292 dphi_buf.fused_scale_sq_mul(diffs[d], phi, -ln10 * ell[d] ** 2) 

293 grad[d] = _gc(dphi_buf) 

294 else: 

295 for d in range(D): 

296 grad[d] = _gc(oti.mul(-ln10 * ell[d] ** 2, 

297 oti.mul(oti.mul(diffs[d], diffs[d]), phi))) 

298 else: 

299 ell = 10.0 ** float(x0[0]) 

300 if hasattr(phi, 'fused_sum_sq'): 

301 sum_sq = oti.zeros(phi.shape) 

302 sum_sq.fused_sum_sq(diffs) 

303 else: 

304 sum_sq = oti.mul(diffs[0], diffs[0]) 

305 for d in range(1, D): 

306 sum_sq = oti.sum(sum_sq, oti.mul(diffs[d], diffs[d])) 

307 grad[0] = _gc(oti.mul(-ln10 * ell ** 2, oti.mul(sum_sq, phi))) 

308 

309 elif kernel == 'RQ': 

310 if kernel_type == 'anisotropic': 

311 ell = 10.0 ** x0[:D]; alpha_rq = 10.0 ** float(x0[D]); alpha_idx = D 

312 else: 

313 ell = np.full(D, 10.0 ** float(x0[0])) 

314 alpha_rq = np.exp(float(x0[1])); alpha_idx = 1 

315 if hasattr(phi, 'fused_sqdist'): 

316 r2 = oti.zeros(phi.shape) 

317 ell_sq = np.ascontiguousarray(ell ** 2, dtype=np.float64) 

318 r2.fused_sqdist(diffs, ell_sq) 

319 else: 

320 r2 = oti.mul(ell[0], diffs[0]); r2 = oti.mul(r2, r2) 

321 for d in range(1, D): 

322 td = oti.mul(ell[d], diffs[d]); r2 = oti.sum(r2, oti.mul(td, td)) 

323 base = oti.sum(1.0, oti.mul(r2, 1.0 / (2.0 * alpha_rq))) 

324 inv_base = oti.pow(base, -1) 

325 phi_over_base = oti.mul(phi, inv_base) 

326 if kernel_type == 'anisotropic': 

327 if hasattr(phi, 'fused_scale_sq_mul'): 

328 dphi_buf = oti.zeros(phi.shape) 

329 for d in range(D): 

330 dphi_buf.fused_scale_sq_mul(diffs[d], phi_over_base, -ln10 * ell[d] ** 2) 

331 grad[d] = _gc(dphi_buf) 

332 else: 

333 for d in range(D): 

334 grad[d] = _gc(oti.mul(-ln10 * ell[d] ** 2, 

335 oti.mul(oti.mul(diffs[d], diffs[d]), phi_over_base))) 

336 else: 

337 if hasattr(phi, 'fused_sum_sq'): 

338 sum_sq = oti.zeros(phi.shape) 

339 sum_sq.fused_sum_sq(diffs) 

340 else: 

341 sum_sq = oti.mul(diffs[0], diffs[0]) 

342 for d in range(1, D): 

343 sum_sq = oti.sum(sum_sq, oti.mul(diffs[d], diffs[d])) 

344 grad[0] = _gc(oti.mul(-ln10 * ell[0] ** 2, oti.mul(sum_sq, phi_over_base))) 

345 log_base = oti.log(base) 

346 term = oti.sub(oti.sub(1.0, inv_base), log_base) 

347 alpha_factor = ln10 * alpha_rq if kernel_type == 'anisotropic' else alpha_rq 

348 grad[alpha_idx] = _gc(oti.mul(alpha_factor, oti.mul(phi, term))) 

349 

350 elif kernel == 'SineExp': 

351 if kernel_type == 'anisotropic': 

352 ell = 10.0 ** x0[:D]; p = 10.0 ** x0[D:2*D] 

353 pip = np.pi / p; p_start = D 

354 else: 

355 ell = np.full(D, 10.0 ** float(x0[0])) 

356 pip = np.full(D, np.pi / 10.0 ** float(x0[1])); p_start = 1 

357 sin_d = [oti.sin(oti.mul(pip[d], diffs[d])) for d in range(D)] 

358 cos_d = [oti.cos(oti.mul(pip[d], diffs[d])) for d in range(D)] 

359 if kernel_type == 'anisotropic': 

360 if hasattr(phi, 'fused_scale_sq_mul'): 

361 dphi_buf = oti.zeros(phi.shape) 

362 for d in range(D): 

363 dphi_buf.fused_scale_sq_mul(sin_d[d], phi, -4.0 * ln10 * ell[d] ** 2) 

364 grad[d] = _gc(dphi_buf) 

365 else: 

366 for d in range(D): 

367 grad[d] = _gc(oti.mul(-4.0 * ln10 * ell[d] ** 2, 

368 oti.mul(oti.mul(sin_d[d], sin_d[d]), phi))) 

369 for d in range(D): 

370 sc = oti.mul(sin_d[d], oti.mul(cos_d[d], diffs[d])) 

371 grad[p_start + d] = _gc(oti.mul(4.0 * ln10 * ell[d] ** 2 * pip[d], 

372 oti.mul(sc, phi))) 

373 else: 

374 if hasattr(phi, 'fused_sum_sq'): 

375 ss = oti.zeros(phi.shape) 

376 ss.fused_sum_sq(sin_d) 

377 else: 

378 ss = oti.mul(sin_d[0], sin_d[0]) 

379 for d in range(1, D): 

380 ss = oti.sum(ss, oti.mul(sin_d[d], sin_d[d])) 

381 grad[0] = _gc(oti.mul(-4.0 * ln10 * ell[0] ** 2, oti.mul(ss, phi))) 

382 scd = oti.mul(sin_d[0], oti.mul(cos_d[0], diffs[0])) 

383 for d in range(1, D): 

384 scd = oti.sum(scd, oti.mul(sin_d[d], oti.mul(cos_d[d], diffs[d]))) 

385 grad[p_start] = _gc(oti.mul(4.0 * ln10 * ell[0] ** 2 * pip[0], 

386 oti.mul(scd, phi))) 

387 

388 elif kernel == 'Matern': 

389 kf = self.model.kernel_factory 

390 if not hasattr(kf, '_matern_grad_prebuild'): 

391 kf._matern_grad_prebuild = matern_kernel_grad_builder(getattr(kf, "nu", 1.5), oti_module=oti) 

392 ell = (10.0 ** x0[:D] if kernel_type == 'anisotropic' 

393 else np.full(D, 10.0 ** float(x0[0]))) 

394 sigma_f_sq = (10.0 ** float(x0[-2])) ** 2 

395 _eps = 1e-10 

396 if hasattr(phi, 'fused_sqdist'): 

397 r2 = oti.zeros(phi.shape) 

398 ell_sq = np.ascontiguousarray(ell ** 2, dtype=np.float64) 

399 r2.fused_sqdist(diffs, ell_sq) 

400 else: 

401 r2 = oti.mul(ell[0], diffs[0]); r2 = oti.mul(r2, r2) 

402 for d in range(1, D): 

403 td = oti.mul(ell[d], diffs[d]); r2 = oti.sum(r2, oti.mul(td, td)) 

404 r_oti = oti.sqrt(oti.sum(r2, _eps ** 2)) 

405 f_prime_r = kf._matern_grad_prebuild(r_oti) 

406 inv_r = oti.pow(r_oti, -1) 

407 base_matern = oti.mul(sigma_f_sq, oti.mul(f_prime_r, inv_r)) 

408 if kernel_type == 'anisotropic': 

409 if hasattr(phi, 'fused_scale_sq_mul'): 

410 dphi_buf = oti.zeros(phi.shape) 

411 for d in range(D): 

412 dphi_buf.fused_scale_sq_mul(diffs[d], base_matern, ln10 * ell[d] ** 2) 

413 grad[d] = _gc(dphi_buf) 

414 else: 

415 for d in range(D): 

416 d_sq = oti.mul(diffs[d], diffs[d]) 

417 dphi_d = oti.mul(ln10 * ell[d] ** 2, oti.mul(d_sq, base_matern)) 

418 grad[d] = _gc(dphi_d) 

419 else: 

420 if hasattr(phi, 'fused_sum_sq'): 

421 sum_dsq = oti.zeros(phi.shape) 

422 sum_dsq.fused_sum_sq(diffs) 

423 else: 

424 sum_dsq = oti.mul(diffs[0], diffs[0]) 

425 for d in range(1, D): 

426 sum_dsq = oti.sum(sum_dsq, oti.mul(diffs[d], diffs[d])) 

427 dphi_e = oti.mul(ln10 * ell[0] ** 2, oti.mul(sum_dsq, base_matern)) 

428 grad[0] = _gc(dphi_e) 

429 

430 return grad 

431 

    def nll_and_grad(self, x0):
        """Compute NLL and its gradient in a single pass, sharing one Cholesky.

        Builds the kernel once, factorizes it once, and derives both the
        negative log marginal likelihood and its analytic gradient with
        respect to the log10-scaled hyperparameters from that single
        factorization. Each gradient component is
        0.5 * tr((K^-1 - alpha alpha^T) dK/dtheta), evaluated through the
        local helper ``_gc`` with W = K^-1 - alpha alpha^T.

        Parameters
        ----------
        x0 : ndarray
            Log10-scaled hyperparameters. Layout depends on the kernel:
            length scales first, kernel-specific extras next, with the
            signal amplitude at ``x0[-2]`` and the noise level at
            ``x0[-1]``.

        Returns
        -------
        tuple of (float, ndarray)
            The NLL value and its gradient. On Cholesky failure returns
            ``(1e6, zeros)`` so optimizers back away from this region.
        """
        ln10 = np.log(10.0)  # chain-rule factor: d/d(log10 h) = h * ln10 * d/dh

        kernel = self.model.kernel
        kernel_type = self.model.kernel_type
        D = len(self.model.differences_by_dim)
        sigma_n_sq = (10.0 ** x0[-1]) ** 2
        diffs = self.model.differences_by_dim
        oti = self.model.kernel_factory.oti

        # --- shared kernel computation (done ONCE) ---
        phi = self.model.kernel_func(diffs, x0[:-1])
        if self.model.n_order == 0:
            # Derivative-free model: the kernel is just the real part of phi.
            n_bases = 0
            phi_exp = phi.real[np.newaxis, :, :]
        else:
            n_bases = phi.get_active_bases()[-1]
            deriv_order = 2 * self.model.n_order
            phi_exp = self._expand_derivs(phi, n_bases, deriv_order)

        K = self._build_K(phi_exp, phi, n_bases)
        # Noise variance added in place on the diagonal; then a constant
        # offset applied to every entry of K.
        K.flat[::K.shape[0] + 1] += sigma_n_sq
        K += self.model.sigma_data ** 2

        try:
            L, low = cho_factor(K)
            alpha_v = cho_solve((L, low), self.model.y_train)
            N = len(self.model.y_train)

            # NLL = 0.5*y^T K^-1 y + 0.5*log|K| + 0.5*N*log(2*pi), where
            # 0.5*log|K| = sum(log(diag(L))) since |K| = prod(diag(L))^2.
            nll = (0.5 * np.dot(self.model.y_train, alpha_v)
                   + np.sum(np.log(np.diag(L)))
                   + 0.5 * N * np.log(2 * np.pi))

            # W matrix for gradient (reuse same Cholesky)
            K_inv = cho_solve((L, low), np.eye(N))
            W = K_inv - np.outer(alpha_v, alpha_v)
        except Exception:
            # Non-PSD K: penalty value, zero gradient.
            return 1e6, np.zeros(len(x0))

        # --- gradient from W (no second kernel build / Cholesky) ---
        grad = np.zeros(len(x0))
        use_fast = self._kernel_plan is not None
        base_shape = phi.shape
        deriv_order_gc = 2 * self.model.n_order

        # Fast path: project W into phi space once, so each hyperparameter's
        # trace term reduces to a single vdot against the expanded dphi.
        W_proj = None
        if use_fast and self.model.n_order > 0:
            from math import comb
            ndir = comb(n_bases + deriv_order_gc, deriv_order_gc)
            proj_shape = (ndir, base_shape[0], base_shape[1])
            if self._W_proj_buf is None or self._W_proj_shape != proj_shape:
                self._W_proj_buf = np.empty(proj_shape)
                self._W_proj_shape = proj_shape
            W_proj = self._W_proj_buf
            plan = self._kernel_plan
            # Prefer the absolute offsets cached by _ensure_kernel_bufs;
            # fall back to recomputing them from the base shape.
            row_off = plan.get('row_offsets_abs', plan['row_offsets'] + base_shape[0])
            col_off = plan.get('col_offsets_abs', plan['col_offsets'] + base_shape[1])
            utils._project_W_to_phi_space(
                W, W_proj, base_shape[0], base_shape[1],
                plan['fd_flat_indices'], plan['df_flat_indices'],
                plan['dd_flat_indices'],
                plan['idx_flat'], plan['idx_offsets'], plan['index_sizes'],
                plan['signs'], plan['n_deriv_types'], row_off, col_off,
            )

        def _gc(dphi):
            # Evaluate 0.5 * tr(W dK) for the kernel perturbation dphi,
            # using the cheapest available path (projected W, fast kernel
            # build into the reusable dK buffer, or the generic assembly).
            if self.model.n_order == 0:
                dphi_exp = dphi.real[np.newaxis, :, :]
            else:
                dphi_exp = self._expand_derivs(dphi, n_bases, deriv_order_gc)
            if W_proj is not None:
                dphi_3d = dphi_exp.reshape(W_proj.shape)
                return 0.5 * np.vdot(W_proj, dphi_3d)
            elif use_fast:
                dphi_3d = dphi_exp.reshape(dphi_exp.shape[0], base_shape[0], base_shape[1])
                dK = utils.rbf_kernel_fast(dphi_3d, self._kernel_plan, out=self._dK_buf)
                return 0.5 * np.vdot(W, dK)
            else:
                dK = utils.rbf_kernel(
                    dphi, dphi_exp, self.model.n_order, n_bases,
                    self.model.flattened_der_indices, self.model.powers,
                    self.model.derivative_locations,
                )
                return 0.5 * np.vdot(W, dK)

        # Signal amplitude (x0[-2]): dK/d(log10 sf) = 2*ln10 * K_f.
        grad[-2] = _gc(oti.mul(2.0 * ln10, phi))
        # Noise (x0[-1]): dK is diagonal (2*ln10*sn^2 * I), so the trace
        # term collapses to ln10 * sn^2 * tr(W).
        grad[-1] = ln10 * sigma_n_sq * np.trace(W)

        if kernel == 'SE':
            if kernel_type == 'anisotropic':
                ell = 10.0 ** x0[:D]
                if hasattr(phi, 'fused_scale_sq_mul'):
                    # Fused path: dphi_buf <- (-ln10*ell_d^2) * diffs_d^2 * phi.
                    dphi_buf = oti.zeros(phi.shape)
                    for d in range(D):
                        dphi_buf.fused_scale_sq_mul(diffs[d], phi, -ln10 * ell[d] ** 2)
                        grad[d] = _gc(dphi_buf)
                else:
                    for d in range(D):
                        grad[d] = _gc(oti.mul(-ln10 * ell[d] ** 2,
                                              oti.mul(oti.mul(diffs[d], diffs[d]), phi)))
            else:
                ell = 10.0 ** float(x0[0])
                if hasattr(phi, 'fused_sum_sq'):
                    sum_sq = oti.zeros(phi.shape)
                    sum_sq.fused_sum_sq(diffs)
                else:
                    sum_sq = oti.mul(diffs[0], diffs[0])
                    for d in range(1, D):
                        sum_sq = oti.sum(sum_sq, oti.mul(diffs[d], diffs[d]))
                grad[0] = _gc(oti.mul(-ln10 * ell ** 2, oti.mul(sum_sq, phi)))

        elif kernel == 'RQ':
            if kernel_type == 'anisotropic':
                ell = 10.0 ** x0[:D]; alpha_rq = 10.0 ** float(x0[D]); alpha_idx = D
            else:
                ell = np.full(D, 10.0 ** float(x0[0]))
                # NOTE(review): isotropic alpha uses exp() while the
                # anisotropic branch uses 10**; confirm this asymmetry
                # matches the parameterization/bounds used upstream.
                alpha_rq = np.exp(float(x0[1])); alpha_idx = 1
            if hasattr(phi, 'fused_sqdist'):
                r2 = oti.zeros(phi.shape)
                ell_sq = np.ascontiguousarray(ell ** 2, dtype=np.float64)
                r2.fused_sqdist(diffs, ell_sq)
            else:
                r2 = oti.mul(ell[0], diffs[0]); r2 = oti.mul(r2, r2)
                for d in range(1, D):
                    td = oti.mul(ell[d], diffs[d]); r2 = oti.sum(r2, oti.mul(td, td))
            base = oti.sum(1.0, oti.mul(r2, 1.0 / (2.0 * alpha_rq)))
            inv_base = oti.pow(base, -1)
            phi_over_base = oti.mul(phi, inv_base)
            if kernel_type == 'anisotropic':
                if hasattr(phi, 'fused_scale_sq_mul'):
                    dphi_buf = oti.zeros(phi.shape)
                    for d in range(D):
                        dphi_buf.fused_scale_sq_mul(diffs[d], phi_over_base, -ln10 * ell[d] ** 2)
                        grad[d] = _gc(dphi_buf)
                else:
                    for d in range(D):
                        grad[d] = _gc(oti.mul(-ln10 * ell[d] ** 2,
                                              oti.mul(oti.mul(diffs[d], diffs[d]), phi_over_base)))
            else:
                if hasattr(phi, 'fused_sum_sq'):
                    sum_sq = oti.zeros(phi.shape)
                    sum_sq.fused_sum_sq(diffs)
                else:
                    sum_sq = oti.mul(diffs[0], diffs[0])
                    for d in range(1, D):
                        sum_sq = oti.sum(sum_sq, oti.mul(diffs[d], diffs[d]))
                grad[0] = _gc(oti.mul(-ln10 * ell[0] ** 2, oti.mul(sum_sq, phi_over_base)))
            # Gradient w.r.t. the RQ shape parameter alpha.
            log_base = oti.log(base)
            term = oti.sub(oti.sub(1.0, inv_base), log_base)
            alpha_factor = ln10 * alpha_rq if kernel_type == 'anisotropic' else alpha_rq
            grad[alpha_idx] = _gc(oti.mul(alpha_factor, oti.mul(phi, term)))

        elif kernel == 'SineExp':
            if kernel_type == 'anisotropic':
                ell = 10.0 ** x0[:D]; p = 10.0 ** x0[D:2*D]
                pip = np.pi / p; p_start = D
            else:
                ell = np.full(D, 10.0 ** float(x0[0]))
                pip = np.full(D, np.pi / 10.0 ** float(x0[1])); p_start = 1
            sin_d = [oti.sin(oti.mul(pip[d], diffs[d])) for d in range(D)]
            cos_d = [oti.cos(oti.mul(pip[d], diffs[d])) for d in range(D)]
            if kernel_type == 'anisotropic':
                # Length-scale gradients.
                if hasattr(phi, 'fused_scale_sq_mul'):
                    dphi_buf = oti.zeros(phi.shape)
                    for d in range(D):
                        dphi_buf.fused_scale_sq_mul(sin_d[d], phi, -4.0 * ln10 * ell[d] ** 2)
                        grad[d] = _gc(dphi_buf)
                else:
                    for d in range(D):
                        grad[d] = _gc(oti.mul(-4.0 * ln10 * ell[d] ** 2,
                                              oti.mul(oti.mul(sin_d[d], sin_d[d]), phi)))
                # Period gradients.
                for d in range(D):
                    sc = oti.mul(sin_d[d], oti.mul(cos_d[d], diffs[d]))
                    grad[p_start + d] = _gc(oti.mul(4.0 * ln10 * ell[d] ** 2 * pip[d],
                                                    oti.mul(sc, phi)))
            else:
                if hasattr(phi, 'fused_sum_sq'):
                    ss = oti.zeros(phi.shape)
                    ss.fused_sum_sq(sin_d)
                else:
                    ss = oti.mul(sin_d[0], sin_d[0])
                    for d in range(1, D):
                        ss = oti.sum(ss, oti.mul(sin_d[d], sin_d[d]))
                grad[0] = _gc(oti.mul(-4.0 * ln10 * ell[0] ** 2, oti.mul(ss, phi)))
                scd = oti.mul(sin_d[0], oti.mul(cos_d[0], diffs[0]))
                for d in range(1, D):
                    scd = oti.sum(scd, oti.mul(sin_d[d], oti.mul(cos_d[d], diffs[d])))
                grad[p_start] = _gc(oti.mul(4.0 * ln10 * ell[0] ** 2 * pip[0],
                                            oti.mul(scd, phi)))

        elif kernel == 'Matern':
            kf = self.model.kernel_factory
            if not hasattr(kf, '_matern_grad_prebuild'):
                # Build (and cache on the factory) the OTI-aware f'(r) for
                # this nu once; reused across optimizer iterations.
                kf._matern_grad_prebuild = matern_kernel_grad_builder(getattr(kf, "nu", 1.5), oti_module=oti)
            ell = (10.0 ** x0[:D] if kernel_type == 'anisotropic'
                   else np.full(D, 10.0 ** float(x0[0])))
            sigma_f_sq = (10.0 ** float(x0[-2])) ** 2
            _eps = 1e-10  # keeps r strictly positive so 1/r stays finite
            if hasattr(phi, 'fused_sqdist'):
                r2 = oti.zeros(phi.shape)
                ell_sq = np.ascontiguousarray(ell ** 2, dtype=np.float64)
                r2.fused_sqdist(diffs, ell_sq)
            else:
                r2 = oti.mul(ell[0], diffs[0]); r2 = oti.mul(r2, r2)
                for d in range(1, D):
                    td = oti.mul(ell[d], diffs[d]); r2 = oti.sum(r2, oti.mul(td, td))
            r_oti = oti.sqrt(oti.sum(r2, _eps ** 2))
            f_prime_r = kf._matern_grad_prebuild(r_oti)
            inv_r = oti.pow(r_oti, -1)
            base_matern = oti.mul(sigma_f_sq, oti.mul(f_prime_r, inv_r))
            if kernel_type == 'anisotropic':
                if hasattr(phi, 'fused_scale_sq_mul'):
                    dphi_buf = oti.zeros(phi.shape)
                    for d in range(D):
                        dphi_buf.fused_scale_sq_mul(diffs[d], base_matern, ln10 * ell[d] ** 2)
                        grad[d] = _gc(dphi_buf)
                else:
                    for d in range(D):
                        d_sq = oti.mul(diffs[d], diffs[d])
                        dphi_d = oti.mul(ln10 * ell[d] ** 2, oti.mul(d_sq, base_matern))
                        grad[d] = _gc(dphi_d)
            else:
                if hasattr(phi, 'fused_sum_sq'):
                    sum_dsq = oti.zeros(phi.shape)
                    sum_dsq.fused_sum_sq(diffs)
                else:
                    sum_dsq = oti.mul(diffs[0], diffs[0])
                    for d in range(1, D):
                        sum_dsq = oti.sum(sum_dsq, oti.mul(diffs[d], diffs[d]))
                dphi_e = oti.mul(ln10 * ell[0] ** 2, oti.mul(sum_dsq, base_matern))
                grad[0] = _gc(dphi_e)

        return float(nll), grad

667 

668 def optimize_hyperparameters( self, 

669 optimizer="pso", 

670 **kwargs): 

671 """ 

672 Optimize the DEGP model hyperparameters using Particle Swarm Optimization (PSO). 

673 

674 Parameters: 

675 ---------- 

676 n_restart_optimizer : int, default=20 

677 Maximum number of iterations for PSO. 

678 swarm_size : int, default=20 

679 Number of particles in the swarm. 

680 verbose : bool, default=True 

681 Controls verbosity of PSO output. 

682 

683 Returns: 

684 ------- 

685 best_x : ndarray 

686 The optimal set of hyperparameters found. 

687 """ 

688 

689 if isinstance(optimizer, str): 

690 if optimizer not in OPTIMIZERS: 

691 raise ValueError( 

692 f"Unknown optimizer '{optimizer}'. Available: {list(OPTIMIZERS.keys())}" 

693 ) 

694 optimizer_fn = OPTIMIZERS[optimizer] 

695 else: 

696 optimizer_fn = optimizer # allow passing a callable directly 

697 

698 bounds = self.model.bounds 

699 lb = [b[0] for b in bounds] 

700 ub = [b[1] for b in bounds] 

701 

702 if optimizer in ('lbfgs', 'jade', 'pso') and 'func_and_grad' not in kwargs and 'grad_func' not in kwargs: 

703 kwargs['func_and_grad'] = self.nll_and_grad 

704 

705 best_x, best_val = optimizer_fn(self.nll_wrapper, lb, ub, **kwargs) 

706 

707 self.model.opt_x0 = best_x 

708 self.model.opt_nll = best_val 

709 

710 

711 return best_x 

712