Coverage for jetgp/full_degp/optimizer.py: 84%

421 statements  

coverage.py v7.10.7, created at 2026-04-03 15:37 -0500

import numpy as np
from scipy.linalg import cho_solve, cho_factor
from jetgp.full_degp import degp_utils as utils
from line_profiler import profile
import jetgp.utils as gen_utils
from jetgp.hyperparameter_optimizers import OPTIMIZERS
from jetgp.utils import matern_kernel_grad_builder

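# Typical usage (illustrative sketch; `model` is e.g. a ddegp instance that
# carries the training data and kernel configuration accessed below):
#
#     opt = Optimizer(model)
#     best_x = opt.optimize_hyperparameters(optimizer="pso")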
class Optimizer:
    """
    Optimizer class to perform hyperparameter tuning for derivative-enhanced
    Gaussian Process models by minimizing the negative log marginal
    likelihood (NLL).

    Parameters
    ----------
    model : object
        An instance of a model (e.g., ddegp) containing the necessary
        training data and kernel configuration.
    """

    def __init__(self, model):
        self.model = model
        self._kernel_plan = None
        self._kernel_plan_n_bases = None  # set alongside _kernel_plan in _ensure_kernel_plan
        self._deriv_buf = None
        self._deriv_buf_shape = None
        self._deriv_factors = None
        self._deriv_factors_key = None
        self._K_buf = None
        self._dK_buf = None
        self._kernel_buf_size = None
        self._W_proj_buf = None
        self._W_proj_shape = None

    def _get_deriv_buf(self, phi, n_bases, order):
        """Return a pre-allocated buffer for get_all_derivs, reusing it if the shape matches."""
        from math import comb
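        # The number of derivative directions up to `order` over `n_bases`
        # bases is the multiset count C(n_bases + order, order); e.g.
        # n_bases=3, order=2 gives comb(5, 2) = 10 slices in the buffer.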
        ndir = comb(n_bases + order, order)
        shape = (ndir, phi.shape[0], phi.shape[1])
        if self._deriv_buf is None or self._deriv_buf_shape != shape:
            self._deriv_buf = np.zeros(shape, dtype=np.float64)
            self._deriv_buf_shape = shape
        return self._deriv_buf

    def _expand_derivs(self, phi, n_bases, deriv_order):
        """Expand OTI derivatives, using the fast struct path if available."""
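        # Duck-typed fast path: OTI structs implementing get_all_derivs_fast
        # accept the precomputed factorial factors and a reusable output buffer.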
        if hasattr(phi, 'get_all_derivs_fast'):
            buf = self._get_deriv_buf(phi, n_bases, deriv_order)
            factors = self._get_deriv_factors(n_bases, deriv_order)
            return phi.get_all_derivs_fast(factors, buf)
        return phi.get_all_derivs(n_bases, deriv_order)

    @staticmethod
    def _enum_factors(max_basis, ordi):
        """Enumerate derivative factors in struct memory order for a given order.

        Yields the factorial factor prod(count_b!) for each multi-index of
        the given order, enumerated in the same order as the OTI struct layout
        (last-index-major: for last=1..max_basis, recurse prefix with max=last).
        """
        from math import factorial
        from collections import Counter
        if ordi == 1:
            for _ in range(max_basis):
                yield 1.0
            return
        for last in range(1, max_basis + 1):
            if ordi == 2:
                for i in range(1, last + 1):
                    counts = Counter((i, last))
                    f = 1
                    for c in counts.values():
                        f *= factorial(c)
                    yield float(f)
            else:
                for _, prefix_counts in Optimizer._enum_factors_with_counts(last, ordi - 1):
                    counts = dict(prefix_counts)
                    counts[last] = counts.get(last, 0) + 1
                    f = 1
                    for c in counts.values():
                        f *= factorial(c)
                    yield float(f)

    @staticmethod
    def _enum_factors_with_counts(max_basis, ordi):
        """Enumerate (factor, counts_dict) pairs in struct order."""
        from math import factorial
        if ordi == 1:
            for i in range(1, max_basis + 1):
                yield 1.0, {i: 1}
            return
        for last in range(1, max_basis + 1):
            for _, prefix_counts in Optimizer._enum_factors_with_counts(last, ordi - 1):
                counts = dict(prefix_counts)
                counts[last] = counts.get(last, 0) + 1
                f = 1
                for c in counts.values():
                    f *= factorial(c)
                yield float(f), counts

    def _get_deriv_factors(self, n_bases, order):
        """Return cached precomputed derivative factorial factors."""
        key = (n_bases, order)
        if self._deriv_factors is not None and self._deriv_factors_key == key:
            return self._deriv_factors
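        # Worked example for n_bases=2, order=2 (struct memory order):
        #   order 0: [1.0]            (real part)
        #   order 1: [1.0, 1.0]       (d/db1, d/db2)
        #   order 2: [2.0, 1.0, 2.0]  (b1*b1 -> 2!, b1*b2 -> 1!*1!, b2*b2 -> 2!)
        # for comb(2 + 2, 2) = 6 factors in total.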
        factors = [1.0]  # order 0: real part
        for ordi in range(1, order + 1):
            factors.extend(self._enum_factors(n_bases, ordi))
        self._deriv_factors = np.array(factors, dtype=np.float64)
        self._deriv_factors_key = key
        return self._deriv_factors

    def _ensure_kernel_plan(self, n_bases):
        """Lazily precompute the kernel plan (once per n_bases)."""
        if self._kernel_plan is not None and self._kernel_plan_n_bases == n_bases:
            return
        if not hasattr(utils, 'precompute_kernel_plan'):
            self._kernel_plan = None
            return
        self._kernel_plan = utils.precompute_kernel_plan(
            self.model.n_order, n_bases,
            self.model.flattened_der_indices,
            self.model.powers,
            self.model.derivative_locations,
        )
        self._kernel_plan_n_bases = n_bases
        # Reset kernel buffers when the plan changes
        self._K_buf = None
        self._dK_buf = None
        self._kernel_buf_size = None

    def _ensure_kernel_bufs(self, n_rows_func):
        """Pre-allocate reusable K and dK buffers (avoids repeated malloc)."""
        if self._kernel_plan is None:
            return
        total = n_rows_func + self._kernel_plan['n_pts_with_derivs']
        if self._kernel_buf_size != total:
            self._K_buf = np.empty((total, total))
            self._dK_buf = np.empty((total, total))
            self._kernel_buf_size = total
        # Cache absolute offsets in the plan so rbf_kernel_fast doesn't recompute them
        if 'row_offsets_abs' not in self._kernel_plan:
            self._kernel_plan['row_offsets_abs'] = self._kernel_plan['row_offsets'] + n_rows_func
            self._kernel_plan['col_offsets_abs'] = self._kernel_plan['col_offsets'] + n_rows_func

    @profile
    def negative_log_marginal_likelihood(self, x0):
        """
        Compute the negative log marginal likelihood (NLL) of the model.

        NLL = 0.5 * y^T K^-1 y + 0.5 * log|K| + 0.5 * N * log(2π)

        Parameters
        ----------
        x0 : ndarray
            Vector of log-scaled hyperparameters (length scales and noise).

        Returns
        -------
        float
            Value of the negative log marginal likelihood.
        """

        ell = x0[:-1]
        sigma_n = x0[-1]
        diffs = self.model.differences_by_dim
        phi = self.model.kernel_func(diffs, ell)
        if self.model.n_order == 0:
            n_bases = 0
            phi_exp = phi.real[np.newaxis, :, :]
        else:
            n_bases = phi.get_active_bases()[-1]
            deriv_order = 2 * self.model.n_order
            phi_exp = self._expand_derivs(phi, n_bases, deriv_order)
        self._ensure_kernel_plan(n_bases)
        if self._kernel_plan is not None:
            base_shape = phi.shape
            self._ensure_kernel_bufs(base_shape[0])
            phi_3d = phi_exp.reshape(phi_exp.shape[0], base_shape[0], base_shape[1])
            K = utils.rbf_kernel_fast(phi_3d, self._kernel_plan, out=self._K_buf)
        else:
            K = utils.rbf_kernel(
                phi, phi_exp,
                self.model.n_order, n_bases,
                self.model.flattened_der_indices, self.model.powers,
                index=self.model.derivative_locations,
            )
        noise_var = (10 ** sigma_n) ** 2
        K.flat[::K.shape[0] + 1] += noise_var
        K += self.model.sigma_data**2

        # Debug: check kernel matrix sparsity
        # near_zero = np.sum(np.abs(K) < 1e-10)
        # total = K.size
        # sparsity = near_zero / total
        # if sparsity > 0.5:
        #     print(f"  WARNING: K is {sparsity*100:.1f}% sparse | ell={10**np.array(ell)} | sigma_n={10**sigma_n:.2e} | cond={np.linalg.cond(K.real):.2e}")
        #     input('i')

        try:
            L, low = cho_factor(K)
            alpha = cho_solve((L, low), self.model.y_train)

            data_fit = 0.5 * np.dot(self.model.y_train, alpha)
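            # With K = L·Lᵀ, log|K| = 2·Σ log(diag(L)), so summing log(diag(L))
            # once already carries the 0.5·log|K| factor from the NLL formula.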
            log_det_K = np.sum(np.log(np.diag(L)))
            complexity = log_det_K
            N = len(self.model.y_train)
            const = 0.5 * N * np.log(2 * np.pi)
            return data_fit + complexity + const
        except Exception:
            return 1e6

    def nll_wrapper(self, x0):
        """
        Wrapper function to compute the NLL for the optimizer.

        Parameters
        ----------
        x0 : ndarray
            Hyperparameter vector.

        Returns
        -------
        float
            NLL evaluated at x0.
        """
        return self.negative_log_marginal_likelihood(x0)

    def _compute_grad(self, x0, W, phi, n_bases, oti, diffs):
        """
        Compute the NLL gradient given pre-factorised W = K^{-1} - α α^T.

        Factoring this out allows nll_grad and nll_and_grad to share the
        expensive Cholesky decomposition instead of each rebuilding it.
        """
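        # Standard GP gradient identity (Rasmussen & Williams, eq. 5.9):
        #   dNLL/dθ = 0.5 · tr((K^{-1} - α αᵀ) dK/dθ) = 0.5 · <W, dK/dθ>,
        # so every hyperparameter gradient below reduces to one inner product.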
        ln10 = np.log(10.0)
        kernel = self.model.kernel
        kernel_type = self.model.kernel_type
        D = len(diffs)
        sigma_n_sq = (10.0 ** x0[-1]) ** 2

        grad = np.zeros(len(x0))
        use_fast = self._kernel_plan is not None
        base_shape = (W.shape[0] - self._kernel_plan['n_pts_with_derivs'],) * 2 if use_fast else None

        deriv_order = 2 * self.model.n_order

        # Precompute W projected into phi_exp space to avoid assembling
        # the full dK matrix for each hyperparameter dimension.
        W_proj = None
        if use_fast and self.model.n_order > 0:
            from math import comb
            ndir = comb(n_bases + deriv_order, deriv_order)
            proj_shape = (ndir, base_shape[0], base_shape[1])
            if self._W_proj_buf is None or self._W_proj_shape != proj_shape:
                self._W_proj_buf = np.empty(proj_shape)
                self._W_proj_shape = proj_shape
            W_proj = self._W_proj_buf

            plan = self._kernel_plan
            row_off = plan.get('row_offsets_abs', plan['row_offsets'] + base_shape[0])
            col_off = plan.get('col_offsets_abs', plan['col_offsets'] + base_shape[1])

            utils._project_W_to_phi_space(
                W, W_proj, base_shape[0], base_shape[1],
                plan['fd_flat_indices'], plan['df_flat_indices'],
                plan['dd_flat_indices'],
                plan['idx_flat'], plan['idx_offsets'], plan['index_sizes'],
                plan['signs'], plan['n_deriv_types'], row_off, col_off,
            )
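        # _gc(dphi) is the shared reduction used by every branch below: expand
        # the OTI derivative dphi, then contract it with W (or with the
        # pre-projected W_proj, which skips building dK) to get 0.5 · tr(W · dK).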
        def _gc(dphi):
            if self.model.n_order == 0:
                dphi_exp = dphi.real[np.newaxis, :, :]
            else:
                dphi_exp = self._expand_derivs(dphi, n_bases, deriv_order)
            if W_proj is not None:
                dphi_3d = dphi_exp.reshape(W_proj.shape)
                return 0.5 * np.vdot(W_proj, dphi_3d)
            elif use_fast:
                dphi_3d = dphi_exp.reshape(dphi_exp.shape[0], base_shape[0], base_shape[1])
                dK = utils.rbf_kernel_fast(dphi_3d, self._kernel_plan, out=self._dK_buf)
                return 0.5 * np.vdot(W, dK)
            else:
                dK = utils.rbf_kernel(
                    dphi, dphi_exp,
                    self.model.n_order, n_bases,
                    self.model.flattened_der_indices, self.model.powers,
                    index=self.model.derivative_locations,
                )
                return 0.5 * np.vdot(W, dK)

        # ── signal variance (common: d phi/d log_sf = 2*ln10 * phi) ──────
        grad[-2] = _gc(oti.mul(2.0 * ln10, phi))

        # ── noise variance (common: dK/d log_sn = diag(2*ln10*σ_n²)) ────
        grad[-1] = ln10 * sigma_n_sq * np.trace(W)
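        # (dK/d log_sn only touches the diagonal, so
        #  0.5 · tr(W · 2·ln10·σ_n² · I) = ln10 · σ_n² · tr(W), as above.)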

        # ── kernel-specific hyperparameter gradients ──────────────────────

        if kernel == 'SE':
            # phi = sf² * exp(-0.5 * Σ_d ell_d² * diff_d²)
            # d phi/d log_ell_d = -ln10 * ell_d² * diff_d² * phi
            if kernel_type == 'anisotropic':
                ell = 10.0 ** x0[:D]
                if hasattr(phi, 'fused_scale_sq_mul'):
                    dphi_buf = oti.zeros(phi.shape)
                    for d in range(D):
                        dphi_buf.fused_scale_sq_mul(diffs[d], phi, -ln10 * ell[d] ** 2)
                        grad[d] = _gc(dphi_buf)
                else:
                    for d in range(D):
                        d_sq = oti.mul(diffs[d], diffs[d])
                        dphi_d = oti.mul(-ln10 * ell[d] ** 2, oti.mul(d_sq, phi))
                        grad[d] = _gc(dphi_d)
            else:  # isotropic: single ell
                ell = 10.0 ** float(x0[0])
                if hasattr(phi, 'fused_sum_sq'):
                    sum_sq = oti.zeros(phi.shape)
                    sum_sq.fused_sum_sq(diffs)
                else:
                    sum_sq = oti.mul(diffs[0], diffs[0])
                    for d in range(1, D):
                        sum_sq = oti.sum(sum_sq, oti.mul(diffs[d], diffs[d]))
                grad[0] = _gc(oti.mul(-ln10 * ell ** 2, oti.mul(sum_sq, phi)))

        elif kernel == 'RQ':
            # phi = sf² * (1 + r²/(2α))^(-α), r² = Σ_d (ell_d * diff_d)²
            # d phi/d log_ell_d = -ln10 * ell_d² * diff_d² * phi / base
            # d phi/d log_α = ln10 * α * phi * [-log(base) + (1 - 1/base)]
            if kernel_type == 'anisotropic':
                ell = 10.0 ** x0[:D]
                alpha_rq = 10.0 ** float(x0[D])
                alpha_idx = D
            else:
                ell_val = 10.0 ** float(x0[0])
                ell = np.full(D, ell_val)
                alpha_rq = np.exp(float(x0[1]))  # iso uses exp(x), not 10^x
                alpha_idx = 1

            # Recompute r² and base in OTI
            if hasattr(phi, 'fused_sqdist'):
                r2 = oti.zeros(phi.shape)
                ell_sq = np.ascontiguousarray(ell ** 2, dtype=np.float64)
                r2.fused_sqdist(diffs, ell_sq)
            else:
                r2 = oti.mul(ell[0], diffs[0])
                r2 = oti.mul(r2, r2)
                for d in range(1, D):
                    td = oti.mul(ell[d], diffs[d])
                    r2 = oti.sum(r2, oti.mul(td, td))
            base = oti.sum(1.0, oti.mul(r2, 1.0 / (2.0 * alpha_rq)))
            inv_base = oti.pow(base, -1)
            phi_over_base = oti.mul(phi, inv_base)

            if kernel_type == 'anisotropic':
                if hasattr(phi, 'fused_scale_sq_mul'):
                    dphi_buf = oti.zeros(phi.shape)
                    for d in range(D):
                        dphi_buf.fused_scale_sq_mul(diffs[d], phi_over_base, -ln10 * ell[d] ** 2)
                        grad[d] = _gc(dphi_buf)
                else:
                    for d in range(D):
                        d_sq = oti.mul(diffs[d], diffs[d])
                        dphi_d = oti.mul(-ln10 * ell[d] ** 2, oti.mul(d_sq, phi_over_base))
                        grad[d] = _gc(dphi_d)
            else:
                if hasattr(phi, 'fused_sum_sq'):
                    sum_sq = oti.zeros(phi.shape)
                    sum_sq.fused_sum_sq(diffs)
                else:
                    sum_sq = oti.mul(diffs[0], diffs[0])
                    for d in range(1, D):
                        sum_sq = oti.sum(sum_sq, oti.mul(diffs[d], diffs[d]))
                grad[0] = _gc(oti.mul(-ln10 * ell[0] ** 2, oti.mul(sum_sq, phi_over_base)))

            # alpha gradient: phi * α_factor * [-log(base) + (1 - 1/base)]
            #   aniso: alpha = 10^x → d alpha/dx = ln10 * alpha
            #   iso:   alpha = exp(x) → d alpha/dx = alpha
            log_base = oti.log(base)
            term = oti.sub(oti.sub(1.0, inv_base), log_base)
            alpha_factor = ln10 * alpha_rq if kernel_type == 'anisotropic' else alpha_rq
            grad[alpha_idx] = _gc(oti.mul(alpha_factor, oti.mul(phi, term)))

        elif kernel == 'SineExp':
            # phi = sf² * exp(-2 * Σ_d (ell_d * sin(π/p_d * diff_d))²)
            # d phi/d log_ell_d = -4*ln10 * ell_d² * sin_d² * phi
            # d phi/d log_p_d = 4*ln10 * ell_d² * (π/p_d) * sin_d * cos_d * diff_d * phi
            if kernel_type == 'anisotropic':
                ell = 10.0 ** x0[:D]
                p = 10.0 ** x0[D:2 * D]
                pip = np.pi / p  # π/p_d per dimension
                p_start = D  # index of first log_p in x0
            else:
                ell_val = 10.0 ** float(x0[0])
                p_val = 10.0 ** float(x0[1])
                pip_val = np.pi / p_val
                ell = np.full(D, ell_val)
                pip = np.full(D, pip_val)
                p_start = 1

            # Precompute sin and cos for each dimension
            sin_d = []
            cos_d = []
            for d in range(D):
                arg = oti.mul(pip[d], diffs[d])
                sin_d.append(oti.sin(arg))
                cos_d.append(oti.cos(arg))

            # Length-scale gradients: d phi/d log_ell_d = -4*ln10 * ell_d² * sin_d² * phi
            if kernel_type == 'anisotropic':
                if hasattr(phi, 'fused_scale_sq_mul'):
                    dphi_buf = oti.zeros(phi.shape)
                    for d in range(D):
                        dphi_buf.fused_scale_sq_mul(sin_d[d], phi, -4.0 * ln10 * ell[d] ** 2)
                        grad[d] = _gc(dphi_buf)
                else:
                    for d in range(D):
                        sin_sq = oti.mul(sin_d[d], sin_d[d])
                        grad[d] = _gc(oti.mul(-4.0 * ln10 * ell[d] ** 2,
                                              oti.mul(sin_sq, phi)))
            else:
                if hasattr(phi, 'fused_sum_sq'):
                    sum_sin_sq = oti.zeros(phi.shape)
                    sum_sin_sq.fused_sum_sq(sin_d)
                else:
                    sum_sin_sq = oti.mul(sin_d[0], sin_d[0])
                    for d in range(1, D):
                        sum_sin_sq = oti.sum(sum_sin_sq, oti.mul(sin_d[d], sin_d[d]))
                grad[0] = _gc(oti.mul(-4.0 * ln10 * ell[0] ** 2,
                                      oti.mul(sum_sin_sq, phi)))

            # Period gradients
            if kernel_type == 'anisotropic':
                for d in range(D):
                    # d phi/d log_p_d = 4*ln10*ell_d²*(π/p_d)*sin_d*cos_d*diff_d * phi
                    sc_diff = oti.mul(sin_d[d], oti.mul(cos_d[d], diffs[d]))
                    scale = 4.0 * ln10 * ell[d] ** 2 * pip[d]
                    grad[p_start + d] = _gc(oti.mul(scale, oti.mul(sc_diff, phi)))
            else:
                # d phi/d log_p = 4*ln10*ell²*(π/p) * Σ_d(sin_d*cos_d*diff_d) * phi
                sum_scd = oti.mul(sin_d[0], oti.mul(cos_d[0], diffs[0]))
                for d in range(1, D):
                    sum_scd = oti.sum(sum_scd,
                                      oti.mul(sin_d[d], oti.mul(cos_d[d], diffs[d])))
                scale = 4.0 * ln10 * ell[0] ** 2 * pip[0]
                grad[p_start] = _gc(oti.mul(scale, oti.mul(sum_scd, phi)))

        elif kernel == 'Matern':
            # phi = sf² * f(r), r = sqrt(Σ_d (ell_d*(diff_d+ε))²)
            # d phi/d log_ell_d = sf² * f'(r) * ln10 * ell_d² * (diff_d+ε)² / r
            kf = self.model.kernel_factory

            # Build/cache the Matern derivative function
            if not hasattr(kf, '_matern_grad_prebuild'):
                kf._matern_grad_prebuild = matern_kernel_grad_builder(
                    kf.nu, oti_module=oti)

            if kernel_type == 'anisotropic':
                ell = 10.0 ** x0[:D]
            else:
                ell = np.full(D, 10.0 ** float(x0[0]))

            sigma_f_sq = (10.0 ** float(x0[-2])) ** 2
            _eps = 1e-10  # regularise r, not each diff (matches kernel_funcs.py)

            # Recompute r in OTI (matches matern_kernel_anisotropic/isotropic)
            if hasattr(phi, 'fused_sqdist'):
                r2 = oti.zeros(phi.shape)
                ell_sq = np.ascontiguousarray(ell ** 2, dtype=np.float64)
                r2.fused_sqdist(diffs, ell_sq)
            else:
                r2 = oti.mul(ell[0], diffs[0])
                r2 = oti.mul(r2, r2)
                for d in range(1, D):
                    td = oti.mul(ell[d], diffs[d])
                    r2 = oti.sum(r2, oti.mul(td, td))
            r_oti = oti.sqrt(oti.sum(r2, _eps ** 2))
            f_prime_r = kf._matern_grad_prebuild(r_oti)  # df/dr (OTI)
            inv_r = oti.pow(r_oti, -1)

            # Precompute base = sigma_f² * f'(r) * 1/r for length-scale gradients:
            #   grad[d] = _gc(base * ln10 * ell_d² * diff_d²)
            base_matern = oti.mul(sigma_f_sq, oti.mul(f_prime_r, inv_r))
            if kernel_type == 'anisotropic':
                if hasattr(phi, 'fused_scale_sq_mul'):
                    dphi_buf = oti.zeros(phi.shape)
                    for d in range(D):
                        dphi_buf.fused_scale_sq_mul(diffs[d], base_matern, ln10 * ell[d] ** 2)
                        grad[d] = _gc(dphi_buf)
                else:
                    for d in range(D):
                        d_sq = oti.mul(diffs[d], diffs[d])
                        dphi_d = oti.mul(ln10 * ell[d] ** 2, oti.mul(d_sq, base_matern))
                        grad[d] = _gc(dphi_d)
            else:
                ell_val = ell[0]
                if hasattr(phi, 'fused_sum_sq'):
                    sum_dsq = oti.zeros(phi.shape)
                    sum_dsq.fused_sum_sq(diffs)
                else:
                    sum_dsq = oti.mul(diffs[0], diffs[0])
                    for d in range(1, D):
                        sum_dsq = oti.sum(sum_dsq, oti.mul(diffs[d], diffs[d]))
                dphi_e = oti.mul(ln10 * ell_val ** 2, oti.mul(sum_dsq, base_matern))
                grad[0] = _gc(dphi_e)

        elif kernel == 'SI':
            # phi = sf² * Π_d (1 + ell_d * B(diff_d))
            # d phi/d log_ell_d = ln10 * ell_d * B(diff_d) / (1 + ell_d*B(diff_d)) * phi
            kf = self.model.kernel_factory
            si_prebuild = kf.SI_kernel_prebuild

            if kernel_type == 'anisotropic':
                ell = 10.0 ** x0[:D]
            else:
                ell = np.full(D, 10.0 ** float(x0[0]))

            # Precompute SI values and factor terms for each dimension
            si_vals = [si_prebuild(diffs[d]) for d in range(D)]
            term_vals = [oti.sum(1.0, oti.mul(ell[d], si_vals[d])) for d in range(D)]

            if kernel_type == 'anisotropic':
                for d in range(D):
                    phi_over_term = oti.div(phi, term_vals[d])
                    dphi_d = oti.mul(ln10 * ell[d],
                                     oti.mul(si_vals[d], phi_over_term))
                    grad[d] = _gc(dphi_d)
            else:
                ell_val = ell[0]
                # d phi/d log_ell = ln10 * ell * Σ_d [B(diff_d)/(1+ell*B(diff_d))] * phi
                acc = oti.mul(si_vals[0], oti.div(phi, term_vals[0]))
                for d in range(1, D):
                    acc = oti.sum(acc, oti.mul(si_vals[d],
                                               oti.div(phi, term_vals[d])))
                grad[0] = _gc(oti.mul(ln10 * ell_val, acc))

        return grad

    def nll_grad(self, x0):
        """Analytic gradient of the NLL (separate Cholesky from nll_wrapper)."""
        diffs = self.model.differences_by_dim
        oti = self.model.kernel_factory.oti
        sigma_n_sq = (10.0 ** x0[-1]) ** 2

        phi = self.model.kernel_func(diffs, x0[:-1])
        if self.model.n_order == 0:
            n_bases = 0
            phi_exp = phi.real[np.newaxis, :, :]
        else:
            n_bases = phi.get_active_bases()[-1]
            deriv_order = 2 * self.model.n_order
            phi_exp = self._expand_derivs(phi, n_bases, deriv_order)

        self._ensure_kernel_plan(n_bases)
        if self._kernel_plan is not None:
            base_shape = phi.shape
            self._ensure_kernel_bufs(base_shape[0])
            phi_3d = phi_exp.reshape(phi_exp.shape[0], base_shape[0], base_shape[1])
            K = utils.rbf_kernel_fast(phi_3d, self._kernel_plan, out=self._K_buf)
        else:
            K = utils.rbf_kernel(
                phi, phi_exp, self.model.n_order, n_bases,
                self.model.flattened_der_indices, self.model.powers,
                index=self.model.derivative_locations,
            )
        K.flat[::K.shape[0] + 1] += sigma_n_sq
        K += self.model.sigma_data ** 2

        try:
            L, low = cho_factor(K)
            alpha_v = cho_solve((L, low), self.model.y_train)
            N = len(self.model.y_train)
            K_inv = cho_solve((L, low), np.eye(N))
            W = K_inv - np.outer(alpha_v, alpha_v)
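            # Forming K⁻¹ explicitly is O(N³), but W is reused for every
            # hyperparameter direction in _compute_grad, so the cost is paid once.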
        except Exception:
            return np.zeros(len(x0))

        return self._compute_grad(x0, W, phi, n_bases, oti, diffs)

    def nll_and_grad(self, x0):
        """
        Compute the NLL and its gradient in a single pass, sharing one Cholesky.

        Returns
        -------
        nll : float
        grad : ndarray
        """
        diffs = self.model.differences_by_dim
        oti = self.model.kernel_factory.oti
        sigma_n_sq = (10.0 ** x0[-1]) ** 2

        phi = self.model.kernel_func(diffs, x0[:-1])
        if self.model.n_order == 0:
            n_bases = 0
            phi_exp = phi.real[np.newaxis, :, :]
        else:
            n_bases = phi.get_active_bases()[-1]
            deriv_order = 2 * self.model.n_order
            phi_exp = self._expand_derivs(phi, n_bases, deriv_order)

        self._ensure_kernel_plan(n_bases)
        if self._kernel_plan is not None:
            base_shape = phi.shape
            self._ensure_kernel_bufs(base_shape[0])
            phi_3d = phi_exp.reshape(phi_exp.shape[0], base_shape[0], base_shape[1])
            K = utils.rbf_kernel_fast(phi_3d, self._kernel_plan, out=self._K_buf)
        else:
            K = utils.rbf_kernel(
                phi, phi_exp, self.model.n_order, n_bases,
                self.model.flattened_der_indices, self.model.powers,
                index=self.model.derivative_locations,
            )
        K.flat[::K.shape[0] + 1] += sigma_n_sq
        K += self.model.sigma_data ** 2

        try:
            L, low = cho_factor(K)
            alpha_v = cho_solve((L, low), self.model.y_train)
            N = len(self.model.y_train)

            nll = (0.5 * np.dot(self.model.y_train, alpha_v)
                   + np.sum(np.log(np.diag(L)))
                   + 0.5 * N * np.log(2 * np.pi))

            K_inv = cho_solve((L, low), np.eye(N))
            W = K_inv - np.outer(alpha_v, alpha_v)
        except Exception:
            return 1e6, np.zeros(len(x0))

        grad = self._compute_grad(x0, W, phi, n_bases, oti, diffs)
        return float(nll), grad

    def optimize_hyperparameters(self, optimizer="pso", **kwargs):
        """
        Optimize the DEGP model hyperparameters using the selected optimizer
        (Particle Swarm Optimization by default).

        Parameters
        ----------
        optimizer : str or callable, default="pso"
            Name of a registered optimizer in OPTIMIZERS, or a callable
            with the same signature.
        **kwargs
            Forwarded to the chosen optimizer. For PSO these include:
            n_restart_optimizer : int, default=20
                Maximum number of iterations.
            swarm_size : int, default=20
                Number of particles in the swarm.
            verbose : bool, default=True
                Controls verbosity of the PSO output.

        Returns
        -------
        best_x : ndarray
            The optimal set of hyperparameters found.
        """
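        # Illustrative call (kwargs depend on the optimizer selected):
        #     best_x = opt.optimize_hyperparameters(optimizer="pso", swarm_size=20)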
        if isinstance(optimizer, str):
            if optimizer not in OPTIMIZERS:
                raise ValueError(
                    f"Unknown optimizer '{optimizer}'. Available: {list(OPTIMIZERS.keys())}"
                )
            optimizer_fn = OPTIMIZERS[optimizer]
        else:
            optimizer_fn = optimizer  # allow passing a callable directly

        bounds = self.model.bounds
        lb = [b[0] for b in bounds]
        ub = [b[1] for b in bounds]

        # Inject nll_and_grad (single Cholesky per step) for all gradient-aware optimizers.
        if (optimizer in ('lbfgs', 'jade', 'pso')
                and 'func_and_grad' not in kwargs and 'grad_func' not in kwargs):
            kwargs['func_and_grad'] = self.nll_and_grad

        best_x, best_val = optimizer_fn(self.nll_wrapper, lb, ub, **kwargs)

        self.model.opt_x0 = best_x
        self.model.opt_nll = best_val

        return best_x