Coverage for jetgp/full_degp/optimizer.py: 84% (421 statements)
import numpy as np
from scipy.linalg import cho_solve, cho_factor
from jetgp.full_degp import degp_utils as utils
from line_profiler import profile
import jetgp.utils as gen_utils
from jetgp.hyperparameter_optimizers import OPTIMIZERS
from jetgp.utils import matern_kernel_grad_builder

class Optimizer:
    """
    Optimizer class that performs hyperparameter tuning for derivative-enhanced
    Gaussian Process models by minimizing the negative log marginal likelihood
    (NLL).

    Parameters
    ----------
    model : object
        An instance of a model (e.g., ddegp) containing the necessary
        training data and kernel configuration.
    """

    def __init__(self, model):
        self.model = model
        self._kernel_plan = None
        self._kernel_plan_n_bases = None  # tracks which n_bases the plan was built for
        self._deriv_buf = None
        self._deriv_buf_shape = None
        self._deriv_factors = None
        self._deriv_factors_key = None
        self._K_buf = None
        self._dK_buf = None
        self._kernel_buf_size = None
        self._W_proj_buf = None
        self._W_proj_shape = None

    def _get_deriv_buf(self, phi, n_bases, order):
        """Return a pre-allocated buffer for get_all_derivs, reusing it if the shape matches."""
        from math import comb
        ndir = comb(n_bases + order, order)
        shape = (ndir, phi.shape[0], phi.shape[1])
        if self._deriv_buf is None or self._deriv_buf_shape != shape:
            self._deriv_buf = np.zeros(shape, dtype=np.float64)
            self._deriv_buf_shape = shape
        return self._deriv_buf

    def _expand_derivs(self, phi, n_bases, deriv_order):
        """Expand OTI derivatives, using the fast struct path if available."""
        if hasattr(phi, 'get_all_derivs_fast'):
            buf = self._get_deriv_buf(phi, n_bases, deriv_order)
            factors = self._get_deriv_factors(n_bases, deriv_order)
            return phi.get_all_derivs_fast(factors, buf)
        return phi.get_all_derivs(n_bases, deriv_order)

    @staticmethod
    def _enum_factors(max_basis, ordi):
        """Enumerate derivative factors in struct memory order for a given order.

        Yields the factorial factor prod(count_b!) for each multi-index of
        the given order, enumerated in the same order as the OTI struct layout
        (last-index-major: for last = 1..max_basis, recurse on the prefix with
        max = last).
        """
        from math import factorial
        from collections import Counter
        if ordi == 1:
            for _ in range(max_basis):
                yield 1.0
            return
        for last in range(1, max_basis + 1):
            if ordi == 2:
                for i in range(1, last + 1):
                    counts = Counter((i, last))
                    f = 1
                    for c in counts.values():
                        f *= factorial(c)
                    yield float(f)
            else:
                for _, prefix_counts in Optimizer._enum_factors_with_counts(last, ordi - 1):
                    counts = dict(prefix_counts)
                    counts[last] = counts.get(last, 0) + 1
                    f = 1
                    for c in counts.values():
                        f *= factorial(c)
                    yield float(f)

    @staticmethod
    def _enum_factors_with_counts(max_basis, ordi):
        """Enumerate (factor, counts_dict) pairs in struct order."""
        from math import factorial
        if ordi == 1:
            for i in range(1, max_basis + 1):
                yield 1.0, {i: 1}
            return
        for last in range(1, max_basis + 1):
            for _, prefix_counts in Optimizer._enum_factors_with_counts(last, ordi - 1):
                counts = dict(prefix_counts)
                counts[last] = counts.get(last, 0) + 1
                f = 1
                for c in counts.values():
                    f *= factorial(c)
                yield float(f), counts

    def _get_deriv_factors(self, n_bases, order):
        """Return cached precomputed derivative factorial factors."""
        key = (n_bases, order)
        if self._deriv_factors is not None and self._deriv_factors_key == key:
            return self._deriv_factors
        factors = [1.0]  # order 0: real part
        for ordi in range(1, order + 1):
            factors.extend(self._enum_factors(n_bases, ordi))
        self._deriv_factors = np.array(factors, dtype=np.float64)
        self._deriv_factors_key = key
        return self._deriv_factors
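
    # Worked example (small illustrative case): _get_deriv_factors(n_bases=2, order=2)
    # returns [1., 1., 1., 2., 1., 2.]:
    #   order 0 -> [1.]          (real part)
    #   order 1 -> [1., 1.]      (multi-indices (1,), (2,))
    #   order 2 -> [2., 1., 2.]  (multi-indices (1,1), (1,2), (2,2); factor is
    #                             prod(count_b!), e.g. (1,1) -> 2! = 2)
    # The total length, 6, equals comb(n_bases + order, order) = comb(4, 2).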

    def _ensure_kernel_plan(self, n_bases):
        """Lazily precompute the kernel plan (once per n_bases)."""
        if self._kernel_plan is not None and self._kernel_plan_n_bases == n_bases:
            return
        if not hasattr(utils, 'precompute_kernel_plan'):
            self._kernel_plan = None
            return
        self._kernel_plan = utils.precompute_kernel_plan(
            self.model.n_order, n_bases,
            self.model.flattened_der_indices,
            self.model.powers,
            self.model.derivative_locations,
        )
        self._kernel_plan_n_bases = n_bases
        # Reset kernel buffers when the plan changes
        self._K_buf = None
        self._dK_buf = None
        self._kernel_buf_size = None

    def _ensure_kernel_bufs(self, n_rows_func):
        """Pre-allocate reusable K and dK buffers (avoids repeated malloc)."""
        if self._kernel_plan is None:
            return
        total = n_rows_func + self._kernel_plan['n_pts_with_derivs']
        if self._kernel_buf_size != total:
            self._K_buf = np.empty((total, total))
            self._dK_buf = np.empty((total, total))
            self._kernel_buf_size = total
        # Cache absolute offsets in the plan so rbf_kernel_fast doesn't recompute them
        if 'row_offsets_abs' not in self._kernel_plan:
            self._kernel_plan['row_offsets_abs'] = self._kernel_plan['row_offsets'] + n_rows_func
            self._kernel_plan['col_offsets_abs'] = self._kernel_plan['col_offsets'] + n_rows_func

    @profile
    def negative_log_marginal_likelihood(self, x0):
        """
        Compute the negative log marginal likelihood (NLL) of the model.

        NLL = 0.5 * y^T K^-1 y + 0.5 * log|K| + 0.5 * N * log(2π)

        Parameters
        ----------
        x0 : ndarray
            Vector of log-scaled hyperparameters (length scales and noise).

        Returns
        -------
        float
            Value of the negative log marginal likelihood.
        """
        ell = x0[:-1]
        sigma_n = x0[-1]
        diffs = self.model.differences_by_dim
        phi = self.model.kernel_func(diffs, ell)
        if self.model.n_order == 0:
            n_bases = 0
            phi_exp = phi.real[np.newaxis, :, :]
        else:
            n_bases = phi.get_active_bases()[-1]
            deriv_order = 2 * self.model.n_order
            phi_exp = self._expand_derivs(phi, n_bases, deriv_order)
        self._ensure_kernel_plan(n_bases)
        if self._kernel_plan is not None:
            base_shape = phi.shape
            self._ensure_kernel_bufs(base_shape[0])
            phi_3d = phi_exp.reshape(phi_exp.shape[0], base_shape[0], base_shape[1])
            K = utils.rbf_kernel_fast(phi_3d, self._kernel_plan, out=self._K_buf)
        else:
            K = utils.rbf_kernel(
                phi, phi_exp,
                self.model.n_order, n_bases,
                self.model.flattened_der_indices, self.model.powers,
                index=self.model.derivative_locations,
            )
        noise_var = (10 ** sigma_n) ** 2
        K.flat[::K.shape[0] + 1] += noise_var
        K += self.model.sigma_data ** 2

        # Debug: check kernel matrix sparsity
        # near_zero = np.sum(np.abs(K) < 1e-10)
        # total = K.size
        # sparsity = near_zero / total
        # if sparsity > 0.5:
        #     print(f"  WARNING: K is {sparsity*100:.1f}% sparse | ell={10**np.array(ell)} | sigma_n={10**sigma_n:.2e} | cond={np.linalg.cond(K.real):.2e}")
        #     input('i')
        try:
            L, low = cho_factor(K)
            alpha = cho_solve((L, low), self.model.y_train)

            data_fit = 0.5 * np.dot(self.model.y_train, alpha)
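            # Note: sum(log(diag(L))) already equals 0.5*log|K|, since cho_factor
            # yields a triangular factor with |K| = prod(diag(L))².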
            log_det_K = np.sum(np.log(np.diag(L)))
            complexity = log_det_K
            N = len(self.model.y_train)
            const = 0.5 * N * np.log(2 * np.pi)
            return data_fit + complexity + const
        except Exception:
            return 1e6

    def nll_wrapper(self, x0):
        """
        Wrapper function to compute the NLL for an optimizer.

        Parameters
        ----------
        x0 : ndarray
            Hyperparameter vector.

        Returns
        -------
        float
            NLL evaluated at x0.
        """
        return self.negative_log_marginal_likelihood(x0)

    def _compute_grad(self, x0, W, phi, n_bases, oti, diffs):
        """
        Compute the NLL gradient given the pre-factorised W = K^{-1} - α α^T.

        Factoring this out lets nll_grad and nll_and_grad share the expensive
        Cholesky decomposition instead of each rebuilding it.
        """
        ln10 = np.log(10.0)
        kernel = self.model.kernel
        kernel_type = self.model.kernel_type
        D = len(diffs)
        sigma_n_sq = (10.0 ** x0[-1]) ** 2

        grad = np.zeros(len(x0))
        use_fast = self._kernel_plan is not None
        base_shape = (W.shape[0] - self._kernel_plan['n_pts_with_derivs'],) * 2 if use_fast else None

        deriv_order = 2 * self.model.n_order

        # Precompute W projected into phi_exp space to avoid assembling
        # the full dK matrix for each hyperparameter dimension.
        W_proj = None
        if use_fast and self.model.n_order > 0:
            from math import comb
            ndir = comb(n_bases + deriv_order, deriv_order)
            proj_shape = (ndir, base_shape[0], base_shape[1])
            if self._W_proj_buf is None or self._W_proj_shape != proj_shape:
                self._W_proj_buf = np.empty(proj_shape)
                self._W_proj_shape = proj_shape
            W_proj = self._W_proj_buf

            plan = self._kernel_plan
            row_off = plan.get('row_offsets_abs', plan['row_offsets'] + base_shape[0])
            col_off = plan.get('col_offsets_abs', plan['col_offsets'] + base_shape[1])

            utils._project_W_to_phi_space(
                W, W_proj, base_shape[0], base_shape[1],
                plan['fd_flat_indices'], plan['df_flat_indices'],
                plan['dd_flat_indices'],
                plan['idx_flat'], plan['idx_offsets'], plan['index_sizes'],
                plan['signs'], plan['n_deriv_types'], row_off, col_off,
            )

        def _gc(dphi):
            if self.model.n_order == 0:
                dphi_exp = dphi.real[np.newaxis, :, :]
            else:
                dphi_exp = self._expand_derivs(dphi, n_bases, deriv_order)
            if W_proj is not None:
                dphi_3d = dphi_exp.reshape(W_proj.shape)
                return 0.5 * np.vdot(W_proj, dphi_3d)
            elif use_fast:
                dphi_3d = dphi_exp.reshape(dphi_exp.shape[0], base_shape[0], base_shape[1])
                dK = utils.rbf_kernel_fast(dphi_3d, self._kernel_plan, out=self._dK_buf)
                return 0.5 * np.vdot(W, dK)
            else:
                dK = utils.rbf_kernel(
                    dphi, dphi_exp,
                    self.model.n_order, n_bases,
                    self.model.flattened_der_indices, self.model.powers,
                    index=self.model.derivative_locations,
                )
                return 0.5 * np.vdot(W, dK)

        # ── signal variance (common: d phi/d log_sf = 2*ln10 * phi) ──────
        grad[-2] = _gc(oti.mul(2.0 * ln10, phi))

        # ── noise variance (common: dK/d log_sn = diag(2*ln10*σ_n²)) ─────
        grad[-1] = ln10 * sigma_n_sq * np.trace(W)
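        # (Since noise_var = 10^(2*x_noise), dK/dx_noise = 2*ln10*σ_n² * I, so
        #  0.5 * tr(W · dK) reduces to ln10 * σ_n² * tr(W).)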

        # ── kernel-specific hyperparameter gradients ──────────────────────
        if kernel == 'SE':
            # phi = sf² * exp(-0.5 * Σ_d ell_d² * diff_d²)
            # d phi/d log_ell_d = -ln10 * ell_d² * diff_d² * phi
            if kernel_type == 'anisotropic':
                ell = 10.0 ** x0[:D]
                if hasattr(phi, 'fused_scale_sq_mul'):
                    dphi_buf = oti.zeros(phi.shape)
                    for d in range(D):
                        dphi_buf.fused_scale_sq_mul(diffs[d], phi, -ln10 * ell[d] ** 2)
                        grad[d] = _gc(dphi_buf)
                else:
                    for d in range(D):
                        d_sq = oti.mul(diffs[d], diffs[d])
                        dphi_d = oti.mul(-ln10 * ell[d] ** 2, oti.mul(d_sq, phi))
                        grad[d] = _gc(dphi_d)
            else:  # isotropic: single ell
                ell = 10.0 ** float(x0[0])
                if hasattr(phi, 'fused_sum_sq'):
                    sum_sq = oti.zeros(phi.shape)
                    sum_sq.fused_sum_sq(diffs)
                else:
                    sum_sq = oti.mul(diffs[0], diffs[0])
                    for d in range(1, D):
                        sum_sq = oti.sum(sum_sq, oti.mul(diffs[d], diffs[d]))
                grad[0] = _gc(oti.mul(-ln10 * ell ** 2, oti.mul(sum_sq, phi)))

        elif kernel == 'RQ':
            # phi = sf² * (1 + r²/(2α))^(-α),  r² = Σ_d (ell_d * diff_d)²
            # d phi/d log_ell_d = -ln10 * ell_d² * diff_d² * phi / base
            # d phi/d log_α = ln10 * α * phi * [-log(base) + (1 - 1/base)]
            if kernel_type == 'anisotropic':
                ell = 10.0 ** x0[:D]
                alpha_rq = 10.0 ** float(x0[D])
                alpha_idx = D
            else:
                ell_val = 10.0 ** float(x0[0])
                ell = np.full(D, ell_val)
                alpha_rq = np.exp(float(x0[1]))  # iso uses exp(x), not 10^x
                alpha_idx = 1

            # Recompute r² and base in OTI
            if hasattr(phi, 'fused_sqdist'):
                r2 = oti.zeros(phi.shape)
                ell_sq = np.ascontiguousarray(ell ** 2, dtype=np.float64)
                r2.fused_sqdist(diffs, ell_sq)
            else:
                r2 = oti.mul(ell[0], diffs[0])
                r2 = oti.mul(r2, r2)
                for d in range(1, D):
                    td = oti.mul(ell[d], diffs[d])
                    r2 = oti.sum(r2, oti.mul(td, td))
            base = oti.sum(1.0, oti.mul(r2, 1.0 / (2.0 * alpha_rq)))
            inv_base = oti.pow(base, -1)
            phi_over_base = oti.mul(phi, inv_base)

            if kernel_type == 'anisotropic':
                if hasattr(phi, 'fused_scale_sq_mul'):
                    dphi_buf = oti.zeros(phi.shape)
                    for d in range(D):
                        dphi_buf.fused_scale_sq_mul(diffs[d], phi_over_base, -ln10 * ell[d] ** 2)
                        grad[d] = _gc(dphi_buf)
                else:
                    for d in range(D):
                        d_sq = oti.mul(diffs[d], diffs[d])
                        dphi_d = oti.mul(-ln10 * ell[d] ** 2, oti.mul(d_sq, phi_over_base))
                        grad[d] = _gc(dphi_d)
            else:
                if hasattr(phi, 'fused_sum_sq'):
                    sum_sq = oti.zeros(phi.shape)
                    sum_sq.fused_sum_sq(diffs)
                else:
                    sum_sq = oti.mul(diffs[0], diffs[0])
                    for d in range(1, D):
                        sum_sq = oti.sum(sum_sq, oti.mul(diffs[d], diffs[d]))
                grad[0] = _gc(oti.mul(-ln10 * ell[0] ** 2, oti.mul(sum_sq, phi_over_base)))

            # alpha gradient: phi * α_factor * [-log(base) + (1 - 1/base)]
            # aniso: alpha = 10^x  → d alpha/dx = ln10 * alpha
            # iso:   alpha = exp(x) → d alpha/dx = alpha
            log_base = oti.log(base)
            term = oti.sub(oti.sub(1.0, inv_base), log_base)
            alpha_factor = ln10 * alpha_rq if kernel_type == 'anisotropic' else alpha_rq
            grad[alpha_idx] = _gc(oti.mul(alpha_factor, oti.mul(phi, term)))
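            # (Derivation note: d/dα[-α·log(base)] = -log(base) + (base-1)/base,
            #  since r²/(2α) = base - 1, and (base-1)/base = 1 - 1/base.)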

        elif kernel == 'SineExp':
            # phi = sf² * exp(-2 * Σ_d (ell_d * sin(π/p_d * diff_d))²)
            # d phi/d log_ell_d = -4*ln10 * ell_d² * sin_d² * phi
            # d phi/d log_p_d = 4*ln10 * ell_d² * (π/p_d) * sin_d * cos_d * diff_d * phi
            if kernel_type == 'anisotropic':
                ell = 10.0 ** x0[:D]
                p = 10.0 ** x0[D:2 * D]
                pip = np.pi / p  # π/p_d per dimension
                p_start = D  # index of first log_p in x0
            else:
                ell_val = 10.0 ** float(x0[0])
                p_val = 10.0 ** float(x0[1])
                pip_val = np.pi / p_val
                ell = np.full(D, ell_val)
                pip = np.full(D, pip_val)
                p_start = 1

            # Precompute sin and cos for each dimension
            sin_d = []
            cos_d = []
            for d in range(D):
                arg = oti.mul(pip[d], diffs[d])
                sin_d.append(oti.sin(arg))
                cos_d.append(oti.cos(arg))

            # Length-scale gradients: d phi/d log_ell_d = -4*ln10 * ell_d² * sin_d² * phi
            if kernel_type == 'anisotropic':
                if hasattr(phi, 'fused_scale_sq_mul'):
                    dphi_buf = oti.zeros(phi.shape)
                    for d in range(D):
                        dphi_buf.fused_scale_sq_mul(sin_d[d], phi, -4.0 * ln10 * ell[d] ** 2)
                        grad[d] = _gc(dphi_buf)
                else:
                    for d in range(D):
                        sin_sq = oti.mul(sin_d[d], sin_d[d])
                        grad[d] = _gc(oti.mul(-4.0 * ln10 * ell[d] ** 2,
                                              oti.mul(sin_sq, phi)))
            else:
                if hasattr(phi, 'fused_sum_sq'):
                    sum_sin_sq = oti.zeros(phi.shape)
                    sum_sin_sq.fused_sum_sq(sin_d)
                else:
                    sum_sin_sq = oti.mul(sin_d[0], sin_d[0])
                    for d in range(1, D):
                        sum_sin_sq = oti.sum(sum_sin_sq, oti.mul(sin_d[d], sin_d[d]))
                grad[0] = _gc(oti.mul(-4.0 * ln10 * ell[0] ** 2,
                                      oti.mul(sum_sin_sq, phi)))

            # Period gradients
            if kernel_type == 'anisotropic':
                for d in range(D):
                    # d phi/d log_p_d = 4*ln10*ell_d²*(π/p_d)*sin_d*cos_d*diff_d * phi
                    sc_diff = oti.mul(sin_d[d], oti.mul(cos_d[d], diffs[d]))
                    scale = 4.0 * ln10 * ell[d] ** 2 * pip[d]
                    grad[p_start + d] = _gc(oti.mul(scale, oti.mul(sc_diff, phi)))
            else:
                # d phi/d log_p = 4*ln10*ell²*(π/p) * Σ_d(sin_d*cos_d*diff_d) * phi
                sum_scd = oti.mul(sin_d[0], oti.mul(cos_d[0], diffs[0]))
                for d in range(1, D):
                    sum_scd = oti.sum(sum_scd,
                                      oti.mul(sin_d[d], oti.mul(cos_d[d], diffs[d])))
                scale = 4.0 * ln10 * ell[0] ** 2 * pip[0]
                grad[p_start] = _gc(oti.mul(scale, oti.mul(sum_scd, phi)))

        elif kernel == 'Matern':
            # phi = sf² * f(r),  r = sqrt(Σ_d (ell_d*(diff_d+ε))²)
            # d phi/d log_ell_d = sf² * f'(r) * ln10 * ell_d² * (diff_d+ε)² / r
            kf = self.model.kernel_factory

            # Build/cache the Matern derivative function
            if not hasattr(kf, '_matern_grad_prebuild'):
                kf._matern_grad_prebuild = matern_kernel_grad_builder(
                    kf.nu, oti_module=oti)

            if kernel_type == 'anisotropic':
                ell = 10.0 ** x0[:D]
            else:
                ell = np.full(D, 10.0 ** float(x0[0]))

            sigma_f_sq = (10.0 ** float(x0[-2])) ** 2
            _eps = 1e-10  # regularise r, not each diff (matches kernel_funcs.py)

            # Recompute r in OTI (matches matern_kernel_anisotropic/isotropic)
            if hasattr(phi, 'fused_sqdist'):
                r2 = oti.zeros(phi.shape)
                ell_sq = np.ascontiguousarray(ell ** 2, dtype=np.float64)
                r2.fused_sqdist(diffs, ell_sq)
            else:
                r2 = oti.mul(ell[0], diffs[0])
                r2 = oti.mul(r2, r2)
                for d in range(1, D):
                    td = oti.mul(ell[d], diffs[d])
                    r2 = oti.sum(r2, oti.mul(td, td))
            r_oti = oti.sqrt(oti.sum(r2, _eps ** 2))
            f_prime_r = kf._matern_grad_prebuild(r_oti)  # df/dr (OTI)
            inv_r = oti.pow(r_oti, -1)

            # Precompute base = sigma_f² * f'(r) * 1/r for length-scale gradients
            # grad[d] = _gc(base * ln10 * ell_d² * diff_d²)
            base_matern = oti.mul(sigma_f_sq, oti.mul(f_prime_r, inv_r))
            if kernel_type == 'anisotropic':
                if hasattr(phi, 'fused_scale_sq_mul'):
                    dphi_buf = oti.zeros(phi.shape)
                    for d in range(D):
                        dphi_buf.fused_scale_sq_mul(diffs[d], base_matern, ln10 * ell[d] ** 2)
                        grad[d] = _gc(dphi_buf)
                else:
                    for d in range(D):
                        d_sq = oti.mul(diffs[d], diffs[d])
                        dphi_d = oti.mul(ln10 * ell[d] ** 2, oti.mul(d_sq, base_matern))
                        grad[d] = _gc(dphi_d)
            else:
                ell_val = ell[0]
                if hasattr(phi, 'fused_sum_sq'):
                    sum_dsq = oti.zeros(phi.shape)
                    sum_dsq.fused_sum_sq(diffs)
                else:
                    sum_dsq = oti.mul(diffs[0], diffs[0])
                    for d in range(1, D):
                        sum_dsq = oti.sum(sum_dsq, oti.mul(diffs[d], diffs[d]))
                dphi_e = oti.mul(ln10 * ell_val ** 2, oti.mul(sum_dsq, base_matern))
                grad[0] = _gc(dphi_e)

        elif kernel == 'SI':
            # phi = sf² * Π_d (1 + ell_d * B(diff_d))
            # d phi/d log_ell_d = ln10 * ell_d * B(diff_d) / (1 + ell_d*B(diff_d)) * phi
            kf = self.model.kernel_factory
            si_prebuild = kf.SI_kernel_prebuild

            if kernel_type == 'anisotropic':
                ell = 10.0 ** x0[:D]
            else:
                ell = np.full(D, 10.0 ** float(x0[0]))

            # Precompute SI values and factor terms for each dimension
            si_vals = [si_prebuild(diffs[d]) for d in range(D)]
            term_vals = [oti.sum(1.0, oti.mul(ell[d], si_vals[d])) for d in range(D)]

            if kernel_type == 'anisotropic':
                for d in range(D):
                    phi_over_term = oti.div(phi, term_vals[d])
                    dphi_d = oti.mul(ln10 * ell[d],
                                     oti.mul(si_vals[d], phi_over_term))
                    grad[d] = _gc(dphi_d)
            else:
                ell_val = ell[0]
                # d phi/d log_ell = ln10 * ell * Σ_d [B(diff_d)/(1+ell*B(diff_d))] * phi
                acc = oti.mul(si_vals[0], oti.div(phi, term_vals[0]))
                for d in range(1, D):
                    acc = oti.sum(acc, oti.mul(si_vals[d],
                                               oti.div(phi, term_vals[d])))
                grad[0] = _gc(oti.mul(ln10 * ell_val, acc))

        return grad

    def nll_grad(self, x0):
        """Analytic gradient of the NLL (separate Cholesky from nll_wrapper)."""
        diffs = self.model.differences_by_dim
        oti = self.model.kernel_factory.oti
        sigma_n_sq = (10.0 ** x0[-1]) ** 2

        phi = self.model.kernel_func(diffs, x0[:-1])
        if self.model.n_order == 0:
            n_bases = 0
            phi_exp = phi.real[np.newaxis, :, :]
        else:
            n_bases = phi.get_active_bases()[-1]
            deriv_order = 2 * self.model.n_order
            phi_exp = self._expand_derivs(phi, n_bases, deriv_order)

        self._ensure_kernel_plan(n_bases)
        if self._kernel_plan is not None:
            base_shape = phi.shape
            self._ensure_kernel_bufs(base_shape[0])
            phi_3d = phi_exp.reshape(phi_exp.shape[0], base_shape[0], base_shape[1])
            K = utils.rbf_kernel_fast(phi_3d, self._kernel_plan, out=self._K_buf)
        else:
            K = utils.rbf_kernel(
                phi, phi_exp, self.model.n_order, n_bases,
                self.model.flattened_der_indices, self.model.powers,
                index=self.model.derivative_locations,
            )
        K.flat[::K.shape[0] + 1] += sigma_n_sq
        K += self.model.sigma_data ** 2

        try:
            L, low = cho_factor(K)
            alpha_v = cho_solve((L, low), self.model.y_train)
            N = len(self.model.y_train)
            K_inv = cho_solve((L, low), np.eye(N))
            W = K_inv - np.outer(alpha_v, alpha_v)
        except Exception:
            return np.zeros(len(x0))

        return self._compute_grad(x0, W, phi, n_bases, oti, diffs)

    def nll_and_grad(self, x0):
        """
        Compute the NLL and its gradient in a single pass, sharing one Cholesky.

        Returns
        -------
        nll : float
        grad : ndarray
        """
        diffs = self.model.differences_by_dim
        oti = self.model.kernel_factory.oti
        sigma_n_sq = (10.0 ** x0[-1]) ** 2

        phi = self.model.kernel_func(diffs, x0[:-1])
        if self.model.n_order == 0:
            n_bases = 0
            phi_exp = phi.real[np.newaxis, :, :]
        else:
            n_bases = phi.get_active_bases()[-1]
            deriv_order = 2 * self.model.n_order
            phi_exp = self._expand_derivs(phi, n_bases, deriv_order)

        self._ensure_kernel_plan(n_bases)
        if self._kernel_plan is not None:
            base_shape = phi.shape
            self._ensure_kernel_bufs(base_shape[0])
            phi_3d = phi_exp.reshape(phi_exp.shape[0], base_shape[0], base_shape[1])
            K = utils.rbf_kernel_fast(phi_3d, self._kernel_plan, out=self._K_buf)
        else:
            K = utils.rbf_kernel(
                phi, phi_exp, self.model.n_order, n_bases,
                self.model.flattened_der_indices, self.model.powers,
                index=self.model.derivative_locations,
            )
        K.flat[::K.shape[0] + 1] += sigma_n_sq
        K += self.model.sigma_data ** 2

        try:
            L, low = cho_factor(K)
            alpha_v = cho_solve((L, low), self.model.y_train)
            N = len(self.model.y_train)

            nll = (0.5 * np.dot(self.model.y_train, alpha_v)
                   + np.sum(np.log(np.diag(L)))
                   + 0.5 * N * np.log(2 * np.pi))

            K_inv = cho_solve((L, low), np.eye(N))
            W = K_inv - np.outer(alpha_v, alpha_v)
        except Exception:
            return 1e6, np.zeros(len(x0))

        grad = self._compute_grad(x0, W, phi, n_bases, oti, diffs)
        return float(nll), grad

    def optimize_hyperparameters(self, optimizer="pso", **kwargs):
        """
        Optimize the DEGP model hyperparameters.

        Parameters
        ----------
        optimizer : str or callable, default="pso"
            Name of a registered optimizer in OPTIMIZERS (e.g., "pso",
            "lbfgs", "jade"), or a callable with the same signature.
        **kwargs
            Extra keyword arguments forwarded to the optimizer (e.g.,
            swarm size, iteration limits, verbosity).

        Returns
        -------
        best_x : ndarray
            The optimal set of hyperparameters found.
        """
        if isinstance(optimizer, str):
            if optimizer not in OPTIMIZERS:
                raise ValueError(
                    f"Unknown optimizer '{optimizer}'. Available: {list(OPTIMIZERS.keys())}"
                )
            optimizer_fn = OPTIMIZERS[optimizer]
        else:
            optimizer_fn = optimizer  # allow passing a callable directly

        bounds = self.model.bounds
        lb = [b[0] for b in bounds]
        ub = [b[1] for b in bounds]

        # Inject nll_and_grad (single Cholesky per step) for all gradient-aware optimizers.
        if optimizer in ('lbfgs', 'jade', 'pso') and 'func_and_grad' not in kwargs and 'grad_func' not in kwargs:
            kwargs['func_and_grad'] = self.nll_and_grad

        best_x, best_val = optimizer_fn(self.nll_wrapper, lb, ub, **kwargs)

        self.model.opt_x0 = best_x
        self.model.opt_nll = best_val

        return best_x
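

# Minimal usage sketch (hypothetical: `model` stands in for a DEGP model object
# exposing the attributes this class reads, e.g. differences_by_dim, y_train,
# kernel_func, kernel_factory, and bounds):
#
#     opt = Optimizer(model)
#     best_x = opt.optimize_hyperparameters(optimizer="lbfgs")
#     nll, grad = opt.nll_and_grad(best_x)  # one Cholesky for value + gradient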