Coverage for jetgp/full_ddegp/optimizer.py: 69%
503 statements
« prev ^ index » next — coverage.py v7.10.7, created at 2026-04-03 15:37 -0500
1import numpy as np
3from scipy.linalg import cho_solve, cho_factor
4from jetgp.full_ddegp import ddegp_utils as utils
5from line_profiler import profile
6import jetgp.utils as gen_utils
7from jetgp.hyperparameter_optimizers import OPTIMIZERS
8from jetgp.utils import matern_kernel_grad_builder
10class Optimizer:
11 """
12 Optimizer class to perform hyperparameter tuning for derivative-enhanced Gaussian Process models
13 by minimizing the negative log marginal likelihood (NLL).
15 Parameters
16 ----------
17 model : object
18 An instance of a model (e.g., ddegp) containing the necessary training data
19 and kernel configuration.
20 """
22 def __init__(self, model):
23 self.model = model
24 self._kernel_plan = None
25 self._deriv_buf = None
26 self._deriv_buf_shape = None
27 self._deriv_factors = None
28 self._deriv_factors_key = None
29 self._K_buf = None
30 self._dK_buf = None
31 self._kernel_buf_size = None
32 self._W_proj_buf = None
33 self._W_proj_shape = None
35 def _get_deriv_buf(self, phi, n_bases, order):
36 from math import comb
37 ndir = comb(n_bases + order, order)
38 shape = (ndir, phi.shape[0], phi.shape[1])
39 if self._deriv_buf is None or self._deriv_buf_shape != shape:
40 self._deriv_buf = np.zeros(shape, dtype=np.float64)
41 self._deriv_buf_shape = shape
42 return self._deriv_buf
44 def _expand_derivs(self, phi, n_bases, deriv_order):
45 """Expand OTI derivatives, using fast struct path if available."""
46 if hasattr(phi, 'get_all_derivs_fast'):
47 buf = self._get_deriv_buf(phi, n_bases, deriv_order)
48 factors = self._get_deriv_factors(n_bases, deriv_order)
49 return phi.get_all_derivs_fast(factors, buf)
50 return phi.get_all_derivs(n_bases, deriv_order)
52 @staticmethod
53 def _enum_factors(max_basis, ordi):
54 from math import factorial
55 from collections import Counter
56 if ordi == 1:
57 for _ in range(max_basis):
58 yield 1.0
59 return
60 for last in range(1, max_basis + 1):
61 if ordi == 2:
62 for i in range(1, last + 1):
63 counts = Counter((i, last))
64 f = 1
65 for c in counts.values():
66 f *= factorial(c)
67 yield float(f)
68 else:
69 for _, prefix_counts in Optimizer._enum_factors_with_counts(last, ordi - 1):
70 counts = dict(prefix_counts)
71 counts[last] = counts.get(last, 0) + 1
72 f = 1
73 for c in counts.values():
74 f *= factorial(c)
75 yield float(f)
77 @staticmethod
78 def _enum_factors_with_counts(max_basis, ordi):
79 from math import factorial
80 from collections import Counter
81 if ordi == 1:
82 for i in range(1, max_basis + 1):
83 yield 1.0, {i: 1}
84 return
85 for last in range(1, max_basis + 1):
86 for _, prefix_counts in Optimizer._enum_factors_with_counts(last, ordi - 1):
87 counts = dict(prefix_counts)
88 counts[last] = counts.get(last, 0) + 1
89 f = 1
90 for c in counts.values():
91 f *= factorial(c)
92 yield float(f), counts
94 def _get_deriv_factors(self, n_bases, order):
95 key = (n_bases, order)
96 if self._deriv_factors is not None and self._deriv_factors_key == key:
97 return self._deriv_factors
98 factors = [1.0]
99 for ordi in range(1, order + 1):
100 factors.extend(self._enum_factors(n_bases, ordi))
101 self._deriv_factors = np.array(factors, dtype=np.float64)
102 self._deriv_factors_key = key
103 return self._deriv_factors
    def _ensure_kernel_plan(self, n_bases):
        """Lazily precompute kernel plan (once per n_bases).

        Falls back to ``self._kernel_plan = None`` (generic kernel path)
        when the utils module does not provide ``precompute_kernel_plan``.
        Building a new plan invalidates the K/dK buffers, which were sized
        for the previous plan.
        """
        # Already built for this n_bases: nothing to do.
        if self._kernel_plan is not None and self._kernel_plan_n_bases == n_bases:
            return
        if not hasattr(utils, 'precompute_kernel_plan'):
            # Fast path unavailable; callers will use utils.rbf_kernel.
            self._kernel_plan = None
            return
        self._kernel_plan = utils.precompute_kernel_plan(
            self.model.n_order, n_bases,
            self.model.flattened_der_indices,
            self.model.powers,
            self.model.derivative_locations,
        )
        self._kernel_plan_n_bases = n_bases
        # Drop buffers tied to the old plan's dimensions.
        self._K_buf = None
        self._dK_buf = None
        self._kernel_buf_size = None
123 def _ensure_kernel_bufs(self, n_rows_func):
124 """Pre-allocate reusable K and dK buffers (avoids repeated malloc)."""
125 if self._kernel_plan is None:
126 return
127 total = n_rows_func + self._kernel_plan['n_pts_with_derivs']
128 if self._kernel_buf_size != total:
129 self._K_buf = np.empty((total, total))
130 self._dK_buf = np.empty((total, total))
131 self._kernel_buf_size = total
132 if 'row_offsets_abs' not in self._kernel_plan:
133 self._kernel_plan['row_offsets_abs'] = self._kernel_plan['row_offsets'] + n_rows_func
134 self._kernel_plan['col_offsets_abs'] = self._kernel_plan['col_offsets'] + n_rows_func
136 def _build_K(self, phi_exp, phi, n_bases):
137 """Build kernel matrix using fast path if available."""
138 self._ensure_kernel_plan(n_bases)
139 if self._kernel_plan is not None:
140 base_shape = phi.shape
141 self._ensure_kernel_bufs(base_shape[0])
142 phi_3d = phi_exp.reshape(phi_exp.shape[0], base_shape[0], base_shape[1])
143 return utils.rbf_kernel_fast(phi_3d, self._kernel_plan, out=self._K_buf)
144 return utils.rbf_kernel(
145 phi, phi_exp, self.model.n_order, n_bases,
146 self.model.flattened_der_indices, self.model.powers,
147 self.model.derivative_locations,
148 )
150 @profile
151 def negative_log_marginal_likelihood(self, x0):
152 """
153 Compute the negative log marginal likelihood (NLL) of the model.
155 NLL = 0.5 * y^T K^-1 y + 0.5 * log|K| + 0.5 * N * log(2π)
157 Parameters
158 ----------
159 x0 : ndarray
160 Vector of log-scaled hyperparameters (length scales and noise).
162 Returns
163 -------
164 float
165 Value of the negative log marginal likelihood.
166 """
167 ell = x0[:-1]
168 sigma_n = x0[-1]
169 llhood = 0
170 diffs = self.model.differences_by_dim
171 phi = self.model.kernel_func(diffs, ell)
172 n_bases = phi.get_active_bases()[-1]
174 # Extract ALL derivative components
175 deriv_order = 2 * self.model.n_order
176 phi_exp = self._expand_derivs(phi, n_bases, deriv_order)
177 K = self._build_K(phi_exp, phi, n_bases)
178 K += ((10 ** sigma_n) ** 2) * np.eye(len(K))
179 K += self.model.sigma_data**2
181 try:
182 L, low = cho_factor(K)
183 alpha = cho_solve(
184 (L, low),
185 self.model.y_train
186 )
188 data_fit = 0.5 * np.dot(self.model.y_train, alpha)
189 log_det_K = np.sum(np.log(np.diag(L)))
190 complexity = log_det_K
191 N = len(self.model.y_train)
192 const = 0.5 * N * np.log(2 * np.pi)
193 return data_fit + complexity + const
194 except Exception:
195 return 1e6
197 def nll_wrapper(self, x0):
198 """
199 Wrapper function to compute NLL for optimizer.
201 Parameters
202 ----------
203 x0 : ndarray
204 Hyperparameter vector.
206 Returns
207 -------
208 float
209 NLL evaluated at x0.
210 """
211 return self.negative_log_marginal_likelihood(x0)
    def nll_grad(self, x0):
        """Analytic gradient of the NLL w.r.t. log10-scaled hyperparameters.

        Uses dNLL/dtheta = 0.5 * tr(W * dK/dtheta) with
        W = K^-1 - alpha alpha^T, alpha = K^-1 y.

        Parameters
        ----------
        x0 : ndarray
            Hyperparameter vector: kernel parameters first, then signal
            amplitude at ``x0[-2]`` and noise level at ``x0[-1]`` (log10-
            scaled; the isotropic RQ alpha is e-scaled — see below).

        Returns
        -------
        ndarray
            Gradient with the same length as ``x0``; all zeros when the
            Cholesky factorization of K fails.

        NOTE(review): the gradient section here duplicates ``nll_and_grad``;
        keep the two in sync when editing either.
        """
        ln10 = np.log(10.0)

        kernel = self.model.kernel
        kernel_type = self.model.kernel_type
        D = len(self.model.differences_by_dim)
        sigma_n_sq = (10.0 ** x0[-1]) ** 2
        diffs = self.model.differences_by_dim
        oti = self.model.kernel_factory.oti

        phi = self.model.kernel_func(diffs, x0[:-1])
        n_bases = phi.get_active_bases()[-1]
        deriv_order = 2 * self.model.n_order
        phi_exp = self._expand_derivs(phi, n_bases, deriv_order)

        # Kernel matrix: noise on the diagonal (in place) plus a constant
        # sigma_data**2 offset on every entry.
        K = self._build_K(phi_exp, phi, n_bases)
        K.flat[::K.shape[0] + 1] += sigma_n_sq
        K += self.model.sigma_data ** 2

        try:
            L, low = cho_factor(K)
            alpha_v = cho_solve((L, low), self.model.y_train)
            N = len(self.model.y_train)
            K_inv = cho_solve((L, low), np.eye(N))
            W = K_inv - np.outer(alpha_v, alpha_v)
        except Exception:
            # K not PD at this point: null gradient lets the optimizer
            # continue instead of crashing.
            return np.zeros(len(x0))

        grad = np.zeros(len(x0))
        use_fast = self._kernel_plan is not None
        base_shape = phi.shape

        # Fast path: project W once into phi-derivative space so each
        # hyperparameter gradient reduces to a single vdot (no dK build).
        W_proj = None
        if use_fast:
            from math import comb
            ndir = comb(n_bases + deriv_order, deriv_order)
            proj_shape = (ndir, base_shape[0], base_shape[1])
            if self._W_proj_buf is None or self._W_proj_shape != proj_shape:
                self._W_proj_buf = np.empty(proj_shape)
                self._W_proj_shape = proj_shape
            W_proj = self._W_proj_buf
            plan = self._kernel_plan
            row_off = plan.get('row_offsets_abs', plan['row_offsets'] + base_shape[0])
            col_off = plan.get('col_offsets_abs', plan['col_offsets'] + base_shape[1])
            utils._project_W_to_phi_space(
                W, W_proj, base_shape[0], base_shape[1],
                plan['fd_flat_indices'], plan['df_flat_indices'],
                plan['dd_flat_indices'],
                plan['idx_flat'], plan['idx_offsets'], plan['index_sizes'],
                plan['signs'], plan['n_deriv_types'], row_off, col_off,
            )

        def _gc(dphi):
            # Gradient contribution 0.5 * tr(W dK) for perturbation dphi.
            dphi_exp = self._expand_derivs(dphi, n_bases, deriv_order)
            if W_proj is not None:
                dphi_3d = dphi_exp.reshape(W_proj.shape)
                return 0.5 * np.vdot(W_proj, dphi_3d)
            elif use_fast:
                dphi_3d = dphi_exp.reshape(dphi_exp.shape[0], base_shape[0], base_shape[1])
                dK = utils.rbf_kernel_fast(dphi_3d, self._kernel_plan, out=self._dK_buf)
                return 0.5 * np.vdot(W, dK)
            else:
                dK = utils.rbf_kernel(
                    dphi, dphi_exp, self.model.n_order, n_bases,
                    self.model.flattened_der_indices, self.model.powers,
                    self.model.derivative_locations,
                )
                return 0.5 * np.vdot(W, dK)

        # Signal amplitude: dK/dlog10(sigma_f) = 2*ln10*K_f (assumes phi
        # already carries the sigma_f**2 factor — TODO confirm).
        grad[-2] = _gc(oti.mul(2.0 * ln10, phi))
        # Noise: 0.5 * tr(W * 2*ln10*sigma_n^2*I) = ln10*sigma_n^2*tr(W).
        grad[-1] = ln10 * sigma_n_sq * np.trace(W)

        if kernel == 'SE':
            # --- squared-exponential length-scale gradients ---
            if kernel_type == 'anisotropic':
                ell = 10.0 ** x0[:D]
                if hasattr(phi, 'fused_scale_sq_mul'):
                    dphi_buf = oti.zeros(phi.shape)
                    for d in range(D):
                        dphi_buf.fused_scale_sq_mul(diffs[d], phi, -ln10 * ell[d] ** 2)
                        grad[d] = _gc(dphi_buf)
                else:
                    for d in range(D):
                        grad[d] = _gc(oti.mul(-ln10 * ell[d] ** 2,
                                              oti.mul(oti.mul(diffs[d], diffs[d]), phi)))
            else:
                ell = 10.0 ** float(x0[0])
                if hasattr(phi, 'fused_sum_sq'):
                    sum_sq = oti.zeros(phi.shape)
                    sum_sq.fused_sum_sq(diffs)
                else:
                    sum_sq = oti.mul(diffs[0], diffs[0])
                    for d in range(1, D):
                        sum_sq = oti.sum(sum_sq, oti.mul(diffs[d], diffs[d]))
                grad[0] = _gc(oti.mul(-ln10 * ell ** 2, oti.mul(sum_sq, phi)))

        elif kernel == 'RQ':
            # --- rational-quadratic gradients ---
            if kernel_type == 'anisotropic':
                ell = 10.0 ** x0[:D]; alpha_rq = 10.0 ** float(x0[D]); alpha_idx = D
            else:
                ell = np.full(D, 10.0 ** float(x0[0]))
                # NOTE: isotropic alpha is parameterized as e**x0[1] (not
                # 10**x0[1]); alpha_factor below matches that choice.
                alpha_rq = np.exp(float(x0[1])); alpha_idx = 1
            if hasattr(phi, 'fused_sqdist'):
                r2 = oti.zeros(phi.shape)
                ell_sq = np.ascontiguousarray(ell ** 2, dtype=np.float64)
                r2.fused_sqdist(diffs, ell_sq)
            else:
                r2 = oti.mul(ell[0], diffs[0]); r2 = oti.mul(r2, r2)
                for d in range(1, D):
                    td = oti.mul(ell[d], diffs[d]); r2 = oti.sum(r2, oti.mul(td, td))
            base = oti.sum(1.0, oti.mul(r2, 1.0 / (2.0 * alpha_rq)))
            inv_base = oti.pow(base, -1)
            phi_over_base = oti.mul(phi, inv_base)
            if kernel_type == 'anisotropic':
                if hasattr(phi, 'fused_scale_sq_mul'):
                    dphi_buf = oti.zeros(phi.shape)
                    for d in range(D):
                        dphi_buf.fused_scale_sq_mul(diffs[d], phi_over_base, -ln10 * ell[d] ** 2)
                        grad[d] = _gc(dphi_buf)
                else:
                    for d in range(D):
                        grad[d] = _gc(oti.mul(-ln10 * ell[d] ** 2,
                                              oti.mul(oti.mul(diffs[d], diffs[d]), phi_over_base)))
            else:
                if hasattr(phi, 'fused_sum_sq'):
                    sum_sq = oti.zeros(phi.shape)
                    sum_sq.fused_sum_sq(diffs)
                else:
                    sum_sq = oti.mul(diffs[0], diffs[0])
                    for d in range(1, D):
                        sum_sq = oti.sum(sum_sq, oti.mul(diffs[d], diffs[d]))
                grad[0] = _gc(oti.mul(-ln10 * ell[0] ** 2, oti.mul(sum_sq, phi_over_base)))
            # Gradient w.r.t. alpha: phi * (1 - 1/base - log(base)).
            log_base = oti.log(base)
            term = oti.sub(oti.sub(1.0, inv_base), log_base)
            alpha_factor = ln10 * alpha_rq if kernel_type == 'anisotropic' else alpha_rq
            grad[alpha_idx] = _gc(oti.mul(alpha_factor, oti.mul(phi, term)))

        elif kernel == 'SineExp':
            # --- exp-sine-squared (periodic) gradients ---
            if kernel_type == 'anisotropic':
                ell = 10.0 ** x0[:D]; p = 10.0 ** x0[D:2*D]
                pip = np.pi / p; p_start = D
            else:
                ell = np.full(D, 10.0 ** float(x0[0]))
                pip = np.full(D, np.pi / 10.0 ** float(x0[1])); p_start = 1
            sin_d = [oti.sin(oti.mul(pip[d], diffs[d])) for d in range(D)]
            cos_d = [oti.cos(oti.mul(pip[d], diffs[d])) for d in range(D)]
            if kernel_type == 'anisotropic':
                if hasattr(phi, 'fused_scale_sq_mul'):
                    dphi_buf = oti.zeros(phi.shape)
                    for d in range(D):
                        dphi_buf.fused_scale_sq_mul(sin_d[d], phi, -4.0 * ln10 * ell[d] ** 2)
                        grad[d] = _gc(dphi_buf)
                else:
                    for d in range(D):
                        grad[d] = _gc(oti.mul(-4.0 * ln10 * ell[d] ** 2,
                                              oti.mul(oti.mul(sin_d[d], sin_d[d]), phi)))
                # Period gradients use sin*cos*diff per dimension.
                for d in range(D):
                    sc = oti.mul(sin_d[d], oti.mul(cos_d[d], diffs[d]))
                    grad[p_start + d] = _gc(oti.mul(4.0 * ln10 * ell[d] ** 2 * pip[d],
                                                    oti.mul(sc, phi)))
            else:
                if hasattr(phi, 'fused_sum_sq'):
                    ss = oti.zeros(phi.shape)
                    ss.fused_sum_sq(sin_d)
                else:
                    ss = oti.mul(sin_d[0], sin_d[0])
                    for d in range(1, D):
                        ss = oti.sum(ss, oti.mul(sin_d[d], sin_d[d]))
                grad[0] = _gc(oti.mul(-4.0 * ln10 * ell[0] ** 2, oti.mul(ss, phi)))
                scd = oti.mul(sin_d[0], oti.mul(cos_d[0], diffs[0]))
                for d in range(1, D):
                    scd = oti.sum(scd, oti.mul(sin_d[d], oti.mul(cos_d[d], diffs[d])))
                grad[p_start] = _gc(oti.mul(4.0 * ln10 * ell[0] ** 2 * pip[0],
                                            oti.mul(scd, phi)))

        elif kernel == 'Matern':
            # --- Matern gradients via the f'(r)/r chain rule ---
            kf = self.model.kernel_factory
            if not hasattr(kf, '_matern_grad_prebuild'):
                kf._matern_grad_prebuild = matern_kernel_grad_builder(getattr(kf, "nu", 1.5), oti_module=oti)
            ell = (10.0 ** x0[:D] if kernel_type == 'anisotropic'
                   else np.full(D, 10.0 ** float(x0[0])))
            sigma_f_sq = (10.0 ** float(x0[-2])) ** 2
            _eps = 1e-10
            if hasattr(phi, 'fused_sqdist'):
                r2 = oti.zeros(phi.shape)
                ell_sq = np.ascontiguousarray(ell ** 2, dtype=np.float64)
                r2.fused_sqdist(diffs, ell_sq)
            else:
                r2 = oti.mul(ell[0], diffs[0]); r2 = oti.mul(r2, r2)
                for d in range(1, D):
                    td = oti.mul(ell[d], diffs[d]); r2 = oti.sum(r2, oti.mul(td, td))
            # _eps regularizes r -> 0 so inv_r stays finite on the diagonal.
            r_oti = oti.sqrt(oti.sum(r2, _eps ** 2))
            f_prime_r = kf._matern_grad_prebuild(r_oti)
            inv_r = oti.pow(r_oti, -1)
            base_matern = oti.mul(sigma_f_sq, oti.mul(f_prime_r, inv_r))
            if kernel_type == 'anisotropic':
                if hasattr(phi, 'fused_scale_sq_mul'):
                    dphi_buf = oti.zeros(phi.shape)
                    for d in range(D):
                        dphi_buf.fused_scale_sq_mul(diffs[d], base_matern, ln10 * ell[d] ** 2)
                        grad[d] = _gc(dphi_buf)
                else:
                    for d in range(D):
                        d_sq = oti.mul(diffs[d], diffs[d])
                        dphi_d = oti.mul(ln10 * ell[d] ** 2, oti.mul(d_sq, base_matern))
                        grad[d] = _gc(dphi_d)
            else:
                if hasattr(phi, 'fused_sum_sq'):
                    sum_dsq = oti.zeros(phi.shape)
                    sum_dsq.fused_sum_sq(diffs)
                else:
                    sum_dsq = oti.mul(diffs[0], diffs[0])
                    for d in range(1, D):
                        sum_dsq = oti.sum(sum_dsq, oti.mul(diffs[d], diffs[d]))
                dphi_e = oti.mul(ln10 * ell[0] ** 2, oti.mul(sum_dsq, base_matern))
                grad[0] = _gc(dphi_e)

        return grad
    def nll_and_grad(self, x0):
        """Compute NLL and its gradient in a single pass, sharing one Cholesky.

        Parameters
        ----------
        x0 : ndarray
            Hyperparameter vector: kernel parameters first, then signal
            amplitude at ``x0[-2]`` and noise level at ``x0[-1]`` (log10-
            scaled; the isotropic RQ alpha is e-scaled — see below).

        Returns
        -------
        (float, ndarray)
            The NLL and its gradient; ``(1e6, zeros)`` when the Cholesky
            factorization fails.

        NOTE(review): the gradient section duplicates ``nll_grad``; keep
        the two in sync when editing either.
        """
        ln10 = np.log(10.0)

        kernel = self.model.kernel
        kernel_type = self.model.kernel_type
        D = len(self.model.differences_by_dim)
        sigma_n_sq = (10.0 ** x0[-1]) ** 2
        diffs = self.model.differences_by_dim
        oti = self.model.kernel_factory.oti

        # --- shared kernel computation (done ONCE) ---
        phi = self.model.kernel_func(diffs, x0[:-1])
        if self.model.n_order == 0:
            # No derivative enhancement: only the function-value block.
            n_bases = 0
            phi_exp = phi.real[np.newaxis, :, :]
        else:
            n_bases = phi.get_active_bases()[-1]
            deriv_order = 2 * self.model.n_order
            phi_exp = self._expand_derivs(phi, n_bases, deriv_order)

        # Noise on the diagonal (in place) + constant sigma_data**2 offset.
        K = self._build_K(phi_exp, phi, n_bases)
        K.flat[::K.shape[0] + 1] += sigma_n_sq
        K += self.model.sigma_data ** 2

        try:
            L, low = cho_factor(K)
            alpha_v = cho_solve((L, low), self.model.y_train)
            N = len(self.model.y_train)

            # NLL (0.5*log|K| = sum(log diag(L)) since log|K| = 2*sum(...)).
            nll = (0.5 * np.dot(self.model.y_train, alpha_v)
                   + np.sum(np.log(np.diag(L)))
                   + 0.5 * N * np.log(2 * np.pi))

            # W matrix for gradient (reuse same Cholesky)
            K_inv = cho_solve((L, low), np.eye(N))
            W = K_inv - np.outer(alpha_v, alpha_v)
        except Exception:
            # K not PD: penalty value and null gradient.
            return 1e6, np.zeros(len(x0))

        # --- gradient from W (no second kernel build / Cholesky) ---
        grad = np.zeros(len(x0))
        use_fast = self._kernel_plan is not None
        base_shape = phi.shape
        deriv_order_gc = 2 * self.model.n_order

        # Fast path: project W once into phi-derivative space so each
        # hyperparameter gradient reduces to a single vdot (no dK build).
        W_proj = None
        if use_fast and self.model.n_order > 0:
            from math import comb
            ndir = comb(n_bases + deriv_order_gc, deriv_order_gc)
            proj_shape = (ndir, base_shape[0], base_shape[1])
            if self._W_proj_buf is None or self._W_proj_shape != proj_shape:
                self._W_proj_buf = np.empty(proj_shape)
                self._W_proj_shape = proj_shape
            W_proj = self._W_proj_buf
            plan = self._kernel_plan
            row_off = plan.get('row_offsets_abs', plan['row_offsets'] + base_shape[0])
            col_off = plan.get('col_offsets_abs', plan['col_offsets'] + base_shape[1])
            utils._project_W_to_phi_space(
                W, W_proj, base_shape[0], base_shape[1],
                plan['fd_flat_indices'], plan['df_flat_indices'],
                plan['dd_flat_indices'],
                plan['idx_flat'], plan['idx_offsets'], plan['index_sizes'],
                plan['signs'], plan['n_deriv_types'], row_off, col_off,
            )

        def _gc(dphi):
            # Gradient contribution 0.5 * tr(W dK) for perturbation dphi.
            if self.model.n_order == 0:
                dphi_exp = dphi.real[np.newaxis, :, :]
            else:
                dphi_exp = self._expand_derivs(dphi, n_bases, deriv_order_gc)
            if W_proj is not None:
                dphi_3d = dphi_exp.reshape(W_proj.shape)
                return 0.5 * np.vdot(W_proj, dphi_3d)
            elif use_fast:
                dphi_3d = dphi_exp.reshape(dphi_exp.shape[0], base_shape[0], base_shape[1])
                dK = utils.rbf_kernel_fast(dphi_3d, self._kernel_plan, out=self._dK_buf)
                return 0.5 * np.vdot(W, dK)
            else:
                dK = utils.rbf_kernel(
                    dphi, dphi_exp, self.model.n_order, n_bases,
                    self.model.flattened_der_indices, self.model.powers,
                    self.model.derivative_locations,
                )
                return 0.5 * np.vdot(W, dK)

        # Signal amplitude: dK/dlog10(sigma_f) = 2*ln10*K_f (assumes phi
        # already carries the sigma_f**2 factor — TODO confirm).
        grad[-2] = _gc(oti.mul(2.0 * ln10, phi))
        # Noise: 0.5 * tr(W * 2*ln10*sigma_n^2*I) = ln10*sigma_n^2*tr(W).
        grad[-1] = ln10 * sigma_n_sq * np.trace(W)

        if kernel == 'SE':
            # --- squared-exponential length-scale gradients ---
            if kernel_type == 'anisotropic':
                ell = 10.0 ** x0[:D]
                if hasattr(phi, 'fused_scale_sq_mul'):
                    dphi_buf = oti.zeros(phi.shape)
                    for d in range(D):
                        dphi_buf.fused_scale_sq_mul(diffs[d], phi, -ln10 * ell[d] ** 2)
                        grad[d] = _gc(dphi_buf)
                else:
                    for d in range(D):
                        grad[d] = _gc(oti.mul(-ln10 * ell[d] ** 2,
                                              oti.mul(oti.mul(diffs[d], diffs[d]), phi)))
            else:
                ell = 10.0 ** float(x0[0])
                if hasattr(phi, 'fused_sum_sq'):
                    sum_sq = oti.zeros(phi.shape)
                    sum_sq.fused_sum_sq(diffs)
                else:
                    sum_sq = oti.mul(diffs[0], diffs[0])
                    for d in range(1, D):
                        sum_sq = oti.sum(sum_sq, oti.mul(diffs[d], diffs[d]))
                grad[0] = _gc(oti.mul(-ln10 * ell ** 2, oti.mul(sum_sq, phi)))

        elif kernel == 'RQ':
            # --- rational-quadratic gradients ---
            if kernel_type == 'anisotropic':
                ell = 10.0 ** x0[:D]; alpha_rq = 10.0 ** float(x0[D]); alpha_idx = D
            else:
                ell = np.full(D, 10.0 ** float(x0[0]))
                # NOTE: isotropic alpha is parameterized as e**x0[1] (not
                # 10**x0[1]); alpha_factor below matches that choice.
                alpha_rq = np.exp(float(x0[1])); alpha_idx = 1
            if hasattr(phi, 'fused_sqdist'):
                r2 = oti.zeros(phi.shape)
                ell_sq = np.ascontiguousarray(ell ** 2, dtype=np.float64)
                r2.fused_sqdist(diffs, ell_sq)
            else:
                r2 = oti.mul(ell[0], diffs[0]); r2 = oti.mul(r2, r2)
                for d in range(1, D):
                    td = oti.mul(ell[d], diffs[d]); r2 = oti.sum(r2, oti.mul(td, td))
            base = oti.sum(1.0, oti.mul(r2, 1.0 / (2.0 * alpha_rq)))
            inv_base = oti.pow(base, -1)
            phi_over_base = oti.mul(phi, inv_base)
            if kernel_type == 'anisotropic':
                if hasattr(phi, 'fused_scale_sq_mul'):
                    dphi_buf = oti.zeros(phi.shape)
                    for d in range(D):
                        dphi_buf.fused_scale_sq_mul(diffs[d], phi_over_base, -ln10 * ell[d] ** 2)
                        grad[d] = _gc(dphi_buf)
                else:
                    for d in range(D):
                        grad[d] = _gc(oti.mul(-ln10 * ell[d] ** 2,
                                              oti.mul(oti.mul(diffs[d], diffs[d]), phi_over_base)))
            else:
                if hasattr(phi, 'fused_sum_sq'):
                    sum_sq = oti.zeros(phi.shape)
                    sum_sq.fused_sum_sq(diffs)
                else:
                    sum_sq = oti.mul(diffs[0], diffs[0])
                    for d in range(1, D):
                        sum_sq = oti.sum(sum_sq, oti.mul(diffs[d], diffs[d]))
                grad[0] = _gc(oti.mul(-ln10 * ell[0] ** 2, oti.mul(sum_sq, phi_over_base)))
            # Gradient w.r.t. alpha: phi * (1 - 1/base - log(base)).
            log_base = oti.log(base)
            term = oti.sub(oti.sub(1.0, inv_base), log_base)
            alpha_factor = ln10 * alpha_rq if kernel_type == 'anisotropic' else alpha_rq
            grad[alpha_idx] = _gc(oti.mul(alpha_factor, oti.mul(phi, term)))

        elif kernel == 'SineExp':
            # --- exp-sine-squared (periodic) gradients ---
            if kernel_type == 'anisotropic':
                ell = 10.0 ** x0[:D]; p = 10.0 ** x0[D:2*D]
                pip = np.pi / p; p_start = D
            else:
                ell = np.full(D, 10.0 ** float(x0[0]))
                pip = np.full(D, np.pi / 10.0 ** float(x0[1])); p_start = 1
            sin_d = [oti.sin(oti.mul(pip[d], diffs[d])) for d in range(D)]
            cos_d = [oti.cos(oti.mul(pip[d], diffs[d])) for d in range(D)]
            if kernel_type == 'anisotropic':
                if hasattr(phi, 'fused_scale_sq_mul'):
                    dphi_buf = oti.zeros(phi.shape)
                    for d in range(D):
                        dphi_buf.fused_scale_sq_mul(sin_d[d], phi, -4.0 * ln10 * ell[d] ** 2)
                        grad[d] = _gc(dphi_buf)
                else:
                    for d in range(D):
                        grad[d] = _gc(oti.mul(-4.0 * ln10 * ell[d] ** 2,
                                              oti.mul(oti.mul(sin_d[d], sin_d[d]), phi)))
                # Period gradients use sin*cos*diff per dimension.
                for d in range(D):
                    sc = oti.mul(sin_d[d], oti.mul(cos_d[d], diffs[d]))
                    grad[p_start + d] = _gc(oti.mul(4.0 * ln10 * ell[d] ** 2 * pip[d],
                                                    oti.mul(sc, phi)))
            else:
                if hasattr(phi, 'fused_sum_sq'):
                    ss = oti.zeros(phi.shape)
                    ss.fused_sum_sq(sin_d)
                else:
                    ss = oti.mul(sin_d[0], sin_d[0])
                    for d in range(1, D):
                        ss = oti.sum(ss, oti.mul(sin_d[d], sin_d[d]))
                grad[0] = _gc(oti.mul(-4.0 * ln10 * ell[0] ** 2, oti.mul(ss, phi)))
                scd = oti.mul(sin_d[0], oti.mul(cos_d[0], diffs[0]))
                for d in range(1, D):
                    scd = oti.sum(scd, oti.mul(sin_d[d], oti.mul(cos_d[d], diffs[d])))
                grad[p_start] = _gc(oti.mul(4.0 * ln10 * ell[0] ** 2 * pip[0],
                                            oti.mul(scd, phi)))

        elif kernel == 'Matern':
            # --- Matern gradients via the f'(r)/r chain rule ---
            kf = self.model.kernel_factory
            if not hasattr(kf, '_matern_grad_prebuild'):
                kf._matern_grad_prebuild = matern_kernel_grad_builder(getattr(kf, "nu", 1.5), oti_module=oti)
            ell = (10.0 ** x0[:D] if kernel_type == 'anisotropic'
                   else np.full(D, 10.0 ** float(x0[0])))
            sigma_f_sq = (10.0 ** float(x0[-2])) ** 2
            _eps = 1e-10
            if hasattr(phi, 'fused_sqdist'):
                r2 = oti.zeros(phi.shape)
                ell_sq = np.ascontiguousarray(ell ** 2, dtype=np.float64)
                r2.fused_sqdist(diffs, ell_sq)
            else:
                r2 = oti.mul(ell[0], diffs[0]); r2 = oti.mul(r2, r2)
                for d in range(1, D):
                    td = oti.mul(ell[d], diffs[d]); r2 = oti.sum(r2, oti.mul(td, td))
            # _eps regularizes r -> 0 so inv_r stays finite on the diagonal.
            r_oti = oti.sqrt(oti.sum(r2, _eps ** 2))
            f_prime_r = kf._matern_grad_prebuild(r_oti)
            inv_r = oti.pow(r_oti, -1)
            base_matern = oti.mul(sigma_f_sq, oti.mul(f_prime_r, inv_r))
            if kernel_type == 'anisotropic':
                if hasattr(phi, 'fused_scale_sq_mul'):
                    dphi_buf = oti.zeros(phi.shape)
                    for d in range(D):
                        dphi_buf.fused_scale_sq_mul(diffs[d], base_matern, ln10 * ell[d] ** 2)
                        grad[d] = _gc(dphi_buf)
                else:
                    for d in range(D):
                        d_sq = oti.mul(diffs[d], diffs[d])
                        dphi_d = oti.mul(ln10 * ell[d] ** 2, oti.mul(d_sq, base_matern))
                        grad[d] = _gc(dphi_d)
            else:
                if hasattr(phi, 'fused_sum_sq'):
                    sum_dsq = oti.zeros(phi.shape)
                    sum_dsq.fused_sum_sq(diffs)
                else:
                    sum_dsq = oti.mul(diffs[0], diffs[0])
                    for d in range(1, D):
                        sum_dsq = oti.sum(sum_dsq, oti.mul(diffs[d], diffs[d]))
                dphi_e = oti.mul(ln10 * ell[0] ** 2, oti.mul(sum_dsq, base_matern))
                grad[0] = _gc(dphi_e)

        return float(nll), grad
668 def optimize_hyperparameters( self,
669 optimizer="pso",
670 **kwargs):
671 """
672 Optimize the DEGP model hyperparameters using Particle Swarm Optimization (PSO).
674 Parameters:
675 ----------
676 n_restart_optimizer : int, default=20
677 Maximum number of iterations for PSO.
678 swarm_size : int, default=20
679 Number of particles in the swarm.
680 verbose : bool, default=True
681 Controls verbosity of PSO output.
683 Returns:
684 -------
685 best_x : ndarray
686 The optimal set of hyperparameters found.
687 """
689 if isinstance(optimizer, str):
690 if optimizer not in OPTIMIZERS:
691 raise ValueError(
692 f"Unknown optimizer '{optimizer}'. Available: {list(OPTIMIZERS.keys())}"
693 )
694 optimizer_fn = OPTIMIZERS[optimizer]
695 else:
696 optimizer_fn = optimizer # allow passing a callable directly
698 bounds = self.model.bounds
699 lb = [b[0] for b in bounds]
700 ub = [b[1] for b in bounds]
702 if optimizer in ('lbfgs', 'jade', 'pso') and 'func_and_grad' not in kwargs and 'grad_func' not in kwargs:
703 kwargs['func_and_grad'] = self.nll_and_grad
705 best_x, best_val = optimizer_fn(self.nll_wrapper, lb, ub, **kwargs)
707 self.model.opt_x0 = best_x
708 self.model.opt_nll = best_val
711 return best_x