Coverage for jetgp/full_gddegp/optimizer.py: 79%
515 statements
import numpy as np
from numpy.linalg import cholesky, solve
from jetgp.full_gddegp import gddegp_utils as utils
import jetgp.utils as gen_utils
from scipy.linalg import cho_solve, cho_factor
from jetgp.hyperparameter_optimizers import OPTIMIZERS
from line_profiler import profile
from jetgp.utils import matern_kernel_grad_builder


class Optimizer:
    """
    Optimizer class to perform hyperparameter tuning for derivative-enhanced
    Gaussian Process models by minimizing the negative log marginal
    likelihood (NLL).

    Parameters
    ----------
    model : object
        An instance of a model (e.g., ddegp) containing the necessary
        training data and kernel configuration.
    """

    def __init__(self, model):
        self.model = model
        self._kernel_plan = None
        self._kernel_plan_n_bases = None  # which n_bases the plan was built for
        self._deriv_buf = None
        self._deriv_buf_shape = None
        self._deriv_factors = None
        self._deriv_factors_key = None
        self._K_buf = None
        self._dK_buf = None
        self._kernel_buf_size = None
        self._W_proj_buf = None
        self._W_proj_shape = None
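
    # Buffer roles: _deriv_buf is reused by _expand_derivs, _deriv_factors is
    # cached by _get_deriv_factors, _K_buf/_dK_buf hold the kernel matrices
    # built in _build_K, and _W_proj_buf holds the W projection used by the
    # gradient paths.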
    def _get_deriv_buf(self, phi, n_bases, order):
        from math import comb

        # Number of derivative directions: all multi-indices of total degree
        # <= order over n_bases bases, i.e. comb(n_bases + order, order).
        ndir = comb(n_bases + order, order)
        shape = (ndir, phi.shape[0], phi.shape[1])
        if self._deriv_buf is None or self._deriv_buf_shape != shape:
            self._deriv_buf = np.zeros(shape, dtype=np.float64)
            self._deriv_buf_shape = shape
        return self._deriv_buf

    def _expand_derivs(self, phi, n_bases, deriv_order):
        """Expand OTI derivatives, using the fast struct path if available."""
        if hasattr(phi, 'get_all_derivs_fast'):
            buf = self._get_deriv_buf(phi, n_bases, deriv_order)
            factors = self._get_deriv_factors(n_bases, deriv_order)
            return phi.get_all_derivs_fast(factors, buf)
        return phi.get_all_derivs(n_bases, deriv_order)
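
    # Both branches above return the stacked derivative components as an
    # array of shape (ndir, n, m), with ndir = comb(n_bases + order, order);
    # this is inferred from the buffer shape in _get_deriv_buf and the
    # reshape calls in _build_K and the gradient helpers below.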

    @staticmethod
    def _enum_factors(max_basis, ordi):
        from math import factorial
        from collections import Counter
        if ordi == 1:
            for _ in range(max_basis):
                yield 1.0
            return
        for last in range(1, max_basis + 1):
            if ordi == 2:
                for i in range(1, last + 1):
                    counts = Counter((i, last))
                    f = 1
                    for c in counts.values():
                        f *= factorial(c)
                    yield float(f)
            else:
                for _, prefix_counts in Optimizer._enum_factors_with_counts(last, ordi - 1):
                    counts = dict(prefix_counts)
                    counts[last] = counts.get(last, 0) + 1
                    f = 1
                    for c in counts.values():
                        f *= factorial(c)
                    yield float(f)

    @staticmethod
    def _enum_factors_with_counts(max_basis, ordi):
        from math import factorial
        from collections import Counter
        if ordi == 1:
            for i in range(1, max_basis + 1):
                yield 1.0, {i: 1}
            return
        for last in range(1, max_basis + 1):
            for _, prefix_counts in Optimizer._enum_factors_with_counts(last, ordi - 1):
                counts = dict(prefix_counts)
                counts[last] = counts.get(last, 0) + 1
                f = 1
                for c in counts.values():
                    f *= factorial(c)
                yield float(f), counts

    def _get_deriv_factors(self, n_bases, order):
        key = (n_bases, order)
        if self._deriv_factors is not None and self._deriv_factors_key == key:
            return self._deriv_factors
        factors = [1.0]
        for ordi in range(1, order + 1):
            factors.extend(self._enum_factors(n_bases, ordi))
        self._deriv_factors = np.array(factors, dtype=np.float64)
        self._deriv_factors_key = key
        return self._deriv_factors
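
    # Worked example: for n_bases=2, order=2 the factor table has
    # comb(2 + 2, 2) = 6 entries:
    #     [1.0,  1.0, 1.0,  2.0, 1.0, 2.0]
    # Entry 0 is the zeroth-order term; entries 1-2 are the first-order
    # directions (always 1.0); entries 3-5 correspond to the second-order
    # index pairs (1,1), (1,2), (2,2), whose factors are the products of
    # factorials of the repeated-basis multiplicities: 2!, 1!*1!, 2!.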

    def _ensure_kernel_plan(self, n_bases):
        """Lazily precompute the kernel plan (once per n_bases)."""
        if self._kernel_plan is not None and self._kernel_plan_n_bases == n_bases:
            return
        if not hasattr(utils, 'precompute_kernel_plan'):
            self._kernel_plan = None
            return
        self._kernel_plan = utils.precompute_kernel_plan(
            self.model.n_order, n_bases,
            self.model.flattened_der_indices,
            None,  # GDDEGP uses even/odd bases, not powers
            self.model.derivative_locations,
        )
        self._kernel_plan_n_bases = n_bases
        self._K_buf = None
        self._dK_buf = None
        self._kernel_buf_size = None

    def _ensure_kernel_bufs(self, n_rows_func):
        """Pre-allocate reusable K and dK buffers (avoids repeated malloc)."""
        if self._kernel_plan is None:
            return
        total = n_rows_func + self._kernel_plan['n_pts_with_derivs']
        if self._kernel_buf_size != total:
            self._K_buf = np.empty((total, total))
            self._dK_buf = np.empty((total, total))
            self._kernel_buf_size = total
        # Cache absolute row/column offsets so the n_rows_func shift is not
        # re-added on every kernel build.
        if 'row_offsets_abs' not in self._kernel_plan:
            self._kernel_plan['row_offsets_abs'] = self._kernel_plan['row_offsets'] + n_rows_func
            self._kernel_plan['col_offsets_abs'] = self._kernel_plan['col_offsets'] + n_rows_func

    def _build_K(self, phi_exp, phi, n_bases):
        """Build the kernel matrix, using the fast path if available."""
        self._ensure_kernel_plan(n_bases)
        if self._kernel_plan is not None:
            base_shape = phi.shape
            self._ensure_kernel_bufs(base_shape[0])
            phi_3d = phi_exp.reshape(phi_exp.shape[0], base_shape[0], base_shape[1])
            return utils.rbf_kernel_fast(phi_3d, self._kernel_plan, out=self._K_buf)
        return utils.rbf_kernel(
            phi, phi_exp, self.model.n_order, n_bases,
            self.model.flattened_der_indices,
            index=self.model.derivative_locations,
        )

    @profile
    def negative_log_marginal_likelihood(self, x0):
        """
        Compute the negative log marginal likelihood (NLL) of the model.

        NLL = 0.5 * y^T K^-1 y + 0.5 * log|K| + 0.5 * N * log(2π)

        Parameters
        ----------
        x0 : ndarray
            Vector of log-scaled hyperparameters (length scales and noise).

        Returns
        -------
        float
            Value of the negative log marginal likelihood.
        """
        ell = x0[:-1]
        sigma_n = x0[-1]
        diffs = self.model.differences_by_dim
        phi = self.model.kernel_func(diffs, ell)
        if self.model.n_order == 0:
            n_bases = 0
            phi_exp = phi.real
            phi_exp = phi_exp[np.newaxis, :, :]
        else:
            active = phi.get_active_bases()
            n_bases = active[-1] if active else self.model.n_bases

            # Extract ALL derivative components
            deriv_order = 2 * self.model.n_order
            phi_exp = self._expand_derivs(phi, n_bases, deriv_order)
        K = self._build_K(phi_exp, phi, n_bases)
        K += ((10 ** sigma_n) ** 2) * np.eye(len(K))
        K += self.model.sigma_data ** 2

        try:
            L, low = cho_factor(K)
            alpha = cho_solve(
                (L, low),
                self.model.y_train
            )

            data_fit = 0.5 * np.dot(self.model.y_train, alpha)
            # With K = L L^T, log|K| = 2 * sum(log(diag(L))), so this sum is
            # already the 0.5 * log|K| complexity term.
            complexity = np.sum(np.log(np.diag(L)))
            N = len(self.model.y_train)
            const = 0.5 * N * np.log(2 * np.pi)
            return data_fit + complexity + const
        except Exception:
            # cho_factor raises LinAlgError when K is not positive definite;
            # a large penalty steers the optimizer away from this region.
            return 1e6

    def nll_wrapper(self, x0):
        """
        Wrapper function to compute the NLL for the optimizer.

        Parameters
        ----------
        x0 : ndarray
            Hyperparameter vector.

        Returns
        -------
        float
            NLL evaluated at x0.
        """
        return self.negative_log_marginal_likelihood(x0)

    def nll_grad(self, x0):
        """Analytic gradient of the NLL w.r.t. log10-scaled hyperparameters."""
        ln10 = np.log(10.0)
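        # Chain rule for log10-scaled parameters: with theta = 10**x,
        # d(theta)/dx = ln(10) * theta, which is where the ln10 factors in
        # every gradient term below come from.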

        kernel = self.model.kernel
        kernel_type = self.model.kernel_type
        D = len(self.model.differences_by_dim)
        sigma_n_sq = (10.0 ** x0[-1]) ** 2
        diffs = self.model.differences_by_dim
        oti = self.model.kernel_factory.oti

        phi = self.model.kernel_func(diffs, x0[:-1])
        if self.model.n_order == 0:
            n_bases = 0
            phi_exp = phi.real[np.newaxis, :, :]
        else:
            active = phi.get_active_bases()
            n_bases = active[-1] if active else self.model.n_bases
            deriv_order = 2 * self.model.n_order
            phi_exp = self._expand_derivs(phi, n_bases, deriv_order)

        K = self._build_K(phi_exp, phi, n_bases)
        K.flat[::K.shape[0] + 1] += sigma_n_sq
        K += self.model.sigma_data ** 2

        try:
            L, low = cho_factor(K)
            alpha_v = cho_solve((L, low), self.model.y_train)
            N = len(self.model.y_train)
            K_inv = cho_solve((L, low), np.eye(N))
            W = K_inv - np.outer(alpha_v, alpha_v)
        except Exception:
            return np.zeros(len(x0))
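
        # Standard GP gradient identity (Rasmussen & Williams, GPML, ch. 5):
        #     d(NLL)/d(theta) = 0.5 * tr(W @ dK/d(theta)),
        #     W = K^-1 - alpha alpha^T,  alpha = K^-1 y.
        # Each _gc(dphi) call below evaluates that trace for one
        # hyperparameter, building dK from the OTI perturbation dphi.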
        grad = np.zeros(len(x0))
        use_fast = self._kernel_plan is not None
        base_shape = phi.shape

        W_proj = None
        # Guard on n_order > 0: deriv_order is only defined on the derivative
        # path above (this matches the equivalent check in nll_and_grad).
        if use_fast and self.model.n_order > 0:
            from math import comb
            ndir = comb(n_bases + deriv_order, deriv_order)
            proj_shape = (ndir, base_shape[0], base_shape[1])
            if self._W_proj_buf is None or self._W_proj_shape != proj_shape:
                self._W_proj_buf = np.empty(proj_shape)
                self._W_proj_shape = proj_shape
            W_proj = self._W_proj_buf
            plan = self._kernel_plan
            row_off = plan.get('row_offsets_abs', plan['row_offsets'] + base_shape[0])
            col_off = plan.get('col_offsets_abs', plan['col_offsets'] + base_shape[1])
            utils._project_W_to_phi_space(
                W, W_proj, base_shape[0], base_shape[1],
                plan['fd_flat_indices'], plan['df_flat_indices'],
                plan['dd_flat_indices'],
                plan['idx_flat'], plan['idx_offsets'], plan['index_sizes'],
                plan['n_deriv_types'], row_off, col_off,
            )

        def _gc(dphi):
            if self.model.n_order == 0:
                dphi_exp = dphi.real[np.newaxis, :, :]
            else:
                dphi_exp = self._expand_derivs(dphi, n_bases, deriv_order)
            if W_proj is not None:
                dphi_3d = dphi_exp.reshape(W_proj.shape)
                return 0.5 * np.vdot(W_proj, dphi_3d)
            elif use_fast:
                dphi_3d = dphi_exp.reshape(dphi_exp.shape[0], base_shape[0], base_shape[1])
                dK = utils.rbf_kernel_fast(dphi_3d, self._kernel_plan, out=self._dK_buf)
                return 0.5 * np.vdot(W, dK)
            else:
                dK = utils.rbf_kernel(
                    dphi, dphi_exp, self.model.n_order, n_bases,
                    self.model.flattened_der_indices,
                    index=self.model.derivative_locations,
                )
                return 0.5 * np.vdot(W, dK)

        # d/d(log10 sigma_f): the kernel part of K scales with sigma_f^2,
        # so dphi = 2 * ln10 * phi.
        grad[-2] = _gc(oti.mul(2.0 * ln10, phi))
        # d/d(log10 sigma_n): dK = 2*ln10*sigma_n^2 * I, so the trace term
        # reduces to ln10 * sigma_n^2 * tr(W).
        grad[-1] = ln10 * sigma_n_sq * np.trace(W)

        if kernel == 'SE':
            if kernel_type == 'anisotropic':
                ell = 10.0 ** x0[:D]
                if hasattr(phi, 'fused_scale_sq_mul'):
                    dphi_buf = oti.zeros(phi.shape)
                    for d in range(D):
                        dphi_buf.fused_scale_sq_mul(diffs[d], phi, -ln10 * ell[d] ** 2)
                        grad[d] = _gc(dphi_buf)
                else:
                    for d in range(D):
                        grad[d] = _gc(oti.mul(-ln10 * ell[d] ** 2,
                                              oti.mul(oti.mul(diffs[d], diffs[d]), phi)))
            else:
                ell = 10.0 ** float(x0[0])
                if hasattr(phi, 'fused_sum_sq'):
                    sum_sq = oti.zeros(phi.shape)
                    sum_sq.fused_sum_sq(diffs)
                else:
                    sum_sq = oti.mul(diffs[0], diffs[0])
                    for d in range(1, D):
                        sum_sq = oti.sum(sum_sq, oti.mul(diffs[d], diffs[d]))
                grad[0] = _gc(oti.mul(-ln10 * ell ** 2, oti.mul(sum_sq, phi)))

        elif kernel == 'RQ':
            if kernel_type == 'anisotropic':
                ell = 10.0 ** x0[:D]
                alpha_rq = 10.0 ** float(x0[D])
                alpha_idx = D
            else:
                ell = np.full(D, 10.0 ** float(x0[0]))
                # Note: the isotropic alpha is e-based (exp), not log10-based,
                # which is why alpha_factor below omits the ln10 factor.
                alpha_rq = np.exp(float(x0[1]))
                alpha_idx = 1
            if hasattr(phi, 'fused_sqdist'):
                r2 = oti.zeros(phi.shape)
                ell_sq = np.ascontiguousarray(ell ** 2, dtype=np.float64)
                r2.fused_sqdist(diffs, ell_sq)
            else:
                r2 = oti.mul(ell[0], diffs[0])
                r2 = oti.mul(r2, r2)
                for d in range(1, D):
                    td = oti.mul(ell[d], diffs[d])
                    r2 = oti.sum(r2, oti.mul(td, td))
            base = oti.sum(1.0, oti.mul(r2, 1.0 / (2.0 * alpha_rq)))
            inv_base = oti.pow(base, -1)
            phi_over_base = oti.mul(phi, inv_base)
            if kernel_type == 'anisotropic':
                if hasattr(phi, 'fused_scale_sq_mul'):
                    dphi_buf = oti.zeros(phi.shape)
                    for d in range(D):
                        dphi_buf.fused_scale_sq_mul(diffs[d], phi_over_base, -ln10 * ell[d] ** 2)
                        grad[d] = _gc(dphi_buf)
                else:
                    for d in range(D):
                        grad[d] = _gc(oti.mul(-ln10 * ell[d] ** 2,
                                              oti.mul(oti.mul(diffs[d], diffs[d]), phi_over_base)))
            else:
                if hasattr(phi, 'fused_sum_sq'):
                    sum_sq = oti.zeros(phi.shape)
                    sum_sq.fused_sum_sq(diffs)
                else:
                    sum_sq = oti.mul(diffs[0], diffs[0])
                    for d in range(1, D):
                        sum_sq = oti.sum(sum_sq, oti.mul(diffs[d], diffs[d]))
                grad[0] = _gc(oti.mul(-ln10 * ell[0] ** 2, oti.mul(sum_sq, phi_over_base)))
            log_base = oti.log(base)
            term = oti.sub(oti.sub(1.0, inv_base), log_base)
            alpha_factor = ln10 * alpha_rq if kernel_type == 'anisotropic' else alpha_rq
            grad[alpha_idx] = _gc(oti.mul(alpha_factor, oti.mul(phi, term)))

        elif kernel == 'SineExp':
            if kernel_type == 'anisotropic':
                ell = 10.0 ** x0[:D]
                p = 10.0 ** x0[D:2 * D]
                pip = np.pi / p
                p_start = D
            else:
                ell = np.full(D, 10.0 ** float(x0[0]))
                pip = np.full(D, np.pi / 10.0 ** float(x0[1]))
                p_start = 1
            sin_d = [oti.sin(oti.mul(pip[d], diffs[d])) for d in range(D)]
            cos_d = [oti.cos(oti.mul(pip[d], diffs[d])) for d in range(D)]
            if kernel_type == 'anisotropic':
                if hasattr(phi, 'fused_scale_sq_mul'):
                    dphi_buf = oti.zeros(phi.shape)
                    for d in range(D):
                        dphi_buf.fused_scale_sq_mul(sin_d[d], phi, -4.0 * ln10 * ell[d] ** 2)
                        grad[d] = _gc(dphi_buf)
                else:
                    for d in range(D):
                        grad[d] = _gc(oti.mul(-4.0 * ln10 * ell[d] ** 2,
                                              oti.mul(oti.mul(sin_d[d], sin_d[d]), phi)))
                for d in range(D):
                    sc = oti.mul(sin_d[d], oti.mul(cos_d[d], diffs[d]))
                    grad[p_start + d] = _gc(oti.mul(4.0 * ln10 * ell[d] ** 2 * pip[d],
                                                    oti.mul(sc, phi)))
            else:
                if hasattr(phi, 'fused_sum_sq'):
                    ss = oti.zeros(phi.shape)
                    ss.fused_sum_sq(sin_d)
                else:
                    ss = oti.mul(sin_d[0], sin_d[0])
                    for d in range(1, D):
                        ss = oti.sum(ss, oti.mul(sin_d[d], sin_d[d]))
                grad[0] = _gc(oti.mul(-4.0 * ln10 * ell[0] ** 2, oti.mul(ss, phi)))
                scd = oti.mul(sin_d[0], oti.mul(cos_d[0], diffs[0]))
                for d in range(1, D):
                    scd = oti.sum(scd, oti.mul(sin_d[d], oti.mul(cos_d[d], diffs[d])))
                grad[p_start] = _gc(oti.mul(4.0 * ln10 * ell[0] ** 2 * pip[0],
                                            oti.mul(scd, phi)))

        elif kernel == 'Matern':
            kf = self.model.kernel_factory
            if not hasattr(kf, '_matern_grad_prebuild'):
                kf._matern_grad_prebuild = matern_kernel_grad_builder(
                    getattr(kf, "nu", 1.5), oti_module=oti)
            ell = (10.0 ** x0[:D] if kernel_type == 'anisotropic'
                   else np.full(D, 10.0 ** float(x0[0])))
            sigma_f_sq = (10.0 ** float(x0[-2])) ** 2
            _eps = 1e-10
            if hasattr(phi, 'fused_sqdist'):
                r2 = oti.zeros(phi.shape)
                ell_sq = np.ascontiguousarray(ell ** 2, dtype=np.float64)
                r2.fused_sqdist(diffs, ell_sq)
            else:
                r2 = oti.mul(ell[0], diffs[0])
                r2 = oti.mul(r2, r2)
                for d in range(1, D):
                    td = oti.mul(ell[d], diffs[d])
                    r2 = oti.sum(r2, oti.mul(td, td))
            # Regularize r = sqrt(r2 + eps^2) so 1/r stays finite at r2 = 0.
            r_oti = oti.sqrt(oti.sum(r2, _eps ** 2))
            f_prime_r = kf._matern_grad_prebuild(r_oti)
            inv_r = oti.pow(r_oti, -1)
            base_matern = oti.mul(sigma_f_sq, oti.mul(f_prime_r, inv_r))
            if kernel_type == 'anisotropic':
                if hasattr(phi, 'fused_scale_sq_mul'):
                    dphi_buf = oti.zeros(phi.shape)
                    for d in range(D):
                        dphi_buf.fused_scale_sq_mul(diffs[d], base_matern, ln10 * ell[d] ** 2)
                        grad[d] = _gc(dphi_buf)
                else:
                    for d in range(D):
                        d_sq = oti.mul(diffs[d], diffs[d])
                        dphi_d = oti.mul(ln10 * ell[d] ** 2, oti.mul(d_sq, base_matern))
                        grad[d] = _gc(dphi_d)
            else:
                if hasattr(phi, 'fused_sum_sq'):
                    sum_dsq = oti.zeros(phi.shape)
                    sum_dsq.fused_sum_sq(diffs)
                else:
                    sum_dsq = oti.mul(diffs[0], diffs[0])
                    for d in range(1, D):
                        sum_dsq = oti.sum(sum_dsq, oti.mul(diffs[d], diffs[d]))
                dphi_e = oti.mul(ln10 * ell[0] ** 2, oti.mul(sum_dsq, base_matern))
                grad[0] = _gc(dphi_e)

        return grad

    def nll_and_grad(self, x0):
        """Compute NLL and its gradient in a single pass, sharing one Cholesky."""
        ln10 = np.log(10.0)

        kernel = self.model.kernel
        kernel_type = self.model.kernel_type
        D = len(self.model.differences_by_dim)
        sigma_n_sq = (10.0 ** x0[-1]) ** 2
        diffs = self.model.differences_by_dim
        oti = self.model.kernel_factory.oti

        # --- shared kernel computation (done ONCE) ---
        phi = self.model.kernel_func(diffs, x0[:-1])
        if self.model.n_order == 0:
            n_bases = 0
            phi_exp = phi.real[np.newaxis, :, :]
        else:
            active = phi.get_active_bases()
            n_bases = active[-1] if active else self.model.n_bases
            deriv_order = 2 * self.model.n_order
            phi_exp = self._expand_derivs(phi, n_bases, deriv_order)

        K = self._build_K(phi_exp, phi, n_bases)
        K.flat[::K.shape[0] + 1] += sigma_n_sq
        K += self.model.sigma_data ** 2

        try:
            L, low = cho_factor(K)
            alpha_v = cho_solve((L, low), self.model.y_train)
            N = len(self.model.y_train)

            # NLL
            nll = (0.5 * np.dot(self.model.y_train, alpha_v)
                   + np.sum(np.log(np.diag(L)))
                   + 0.5 * N * np.log(2 * np.pi))

            # W matrix for gradient (reuse same Cholesky)
            K_inv = cho_solve((L, low), np.eye(N))
            W = K_inv - np.outer(alpha_v, alpha_v)
        except Exception:
            return 1e6, np.zeros(len(x0))

        # --- gradient from W (no second kernel build / Cholesky) ---
        grad = np.zeros(len(x0))
        use_fast = self._kernel_plan is not None
        base_shape = phi.shape

        W_proj = None
        if use_fast and self.model.n_order > 0:
            from math import comb
            ndir = comb(n_bases + deriv_order, deriv_order)
            proj_shape = (ndir, base_shape[0], base_shape[1])
            if self._W_proj_buf is None or self._W_proj_shape != proj_shape:
                self._W_proj_buf = np.empty(proj_shape)
                self._W_proj_shape = proj_shape
            W_proj = self._W_proj_buf
            plan = self._kernel_plan
            row_off = plan.get('row_offsets_abs', plan['row_offsets'] + base_shape[0])
            col_off = plan.get('col_offsets_abs', plan['col_offsets'] + base_shape[1])
            utils._project_W_to_phi_space(
                W, W_proj, base_shape[0], base_shape[1],
                plan['fd_flat_indices'], plan['df_flat_indices'],
                plan['dd_flat_indices'],
                plan['idx_flat'], plan['idx_offsets'], plan['index_sizes'],
                plan['n_deriv_types'], row_off, col_off,
            )

        def _gc(dphi):
            if self.model.n_order == 0:
                dphi_exp = dphi.real[np.newaxis, :, :]
            else:
                dphi_exp = self._expand_derivs(dphi, n_bases, deriv_order)
            if W_proj is not None:
                dphi_3d = dphi_exp.reshape(W_proj.shape)
                return 0.5 * np.vdot(W_proj, dphi_3d)
            elif use_fast:
                dphi_3d = dphi_exp.reshape(dphi_exp.shape[0], base_shape[0], base_shape[1])
                dK = utils.rbf_kernel_fast(dphi_3d, self._kernel_plan, out=self._dK_buf)
                return 0.5 * np.vdot(W, dK)
            else:
                dK = utils.rbf_kernel(
                    dphi, dphi_exp, self.model.n_order, n_bases,
                    self.model.flattened_der_indices,
                    index=self.model.derivative_locations,
                )
                return 0.5 * np.vdot(W, dK)

        # Per-hyperparameter terms mirror nll_grad; see the derivations there.
        grad[-2] = _gc(oti.mul(2.0 * ln10, phi))
        grad[-1] = ln10 * sigma_n_sq * np.trace(W)

        if kernel == 'SE':
            if kernel_type == 'anisotropic':
                ell = 10.0 ** x0[:D]
                if hasattr(phi, 'fused_scale_sq_mul'):
                    dphi_buf = oti.zeros(phi.shape)
                    for d in range(D):
                        dphi_buf.fused_scale_sq_mul(diffs[d], phi, -ln10 * ell[d] ** 2)
                        grad[d] = _gc(dphi_buf)
                else:
                    for d in range(D):
                        grad[d] = _gc(oti.mul(-ln10 * ell[d] ** 2,
                                              oti.mul(oti.mul(diffs[d], diffs[d]), phi)))
            else:
                ell = 10.0 ** float(x0[0])
                if hasattr(phi, 'fused_sum_sq'):
                    sum_sq = oti.zeros(phi.shape)
                    sum_sq.fused_sum_sq(diffs)
                else:
                    sum_sq = oti.mul(diffs[0], diffs[0])
                    for d in range(1, D):
                        sum_sq = oti.sum(sum_sq, oti.mul(diffs[d], diffs[d]))
                grad[0] = _gc(oti.mul(-ln10 * ell ** 2, oti.mul(sum_sq, phi)))

        elif kernel == 'RQ':
            if kernel_type == 'anisotropic':
                ell = 10.0 ** x0[:D]
                alpha_rq = 10.0 ** float(x0[D])
                alpha_idx = D
            else:
                ell = np.full(D, 10.0 ** float(x0[0]))
                alpha_rq = np.exp(float(x0[1]))
                alpha_idx = 1
            if hasattr(phi, 'fused_sqdist'):
                r2 = oti.zeros(phi.shape)
                ell_sq = np.ascontiguousarray(ell ** 2, dtype=np.float64)
                r2.fused_sqdist(diffs, ell_sq)
            else:
                r2 = oti.mul(ell[0], diffs[0])
                r2 = oti.mul(r2, r2)
                for d in range(1, D):
                    td = oti.mul(ell[d], diffs[d])
                    r2 = oti.sum(r2, oti.mul(td, td))
            base = oti.sum(1.0, oti.mul(r2, 1.0 / (2.0 * alpha_rq)))
            inv_base = oti.pow(base, -1)
            phi_over_base = oti.mul(phi, inv_base)
            if kernel_type == 'anisotropic':
                if hasattr(phi, 'fused_scale_sq_mul'):
                    dphi_buf = oti.zeros(phi.shape)
                    for d in range(D):
                        dphi_buf.fused_scale_sq_mul(diffs[d], phi_over_base, -ln10 * ell[d] ** 2)
                        grad[d] = _gc(dphi_buf)
                else:
                    for d in range(D):
                        grad[d] = _gc(oti.mul(-ln10 * ell[d] ** 2,
                                              oti.mul(oti.mul(diffs[d], diffs[d]), phi_over_base)))
            else:
                if hasattr(phi, 'fused_sum_sq'):
                    sum_sq = oti.zeros(phi.shape)
                    sum_sq.fused_sum_sq(diffs)
                else:
                    sum_sq = oti.mul(diffs[0], diffs[0])
                    for d in range(1, D):
                        sum_sq = oti.sum(sum_sq, oti.mul(diffs[d], diffs[d]))
                grad[0] = _gc(oti.mul(-ln10 * ell[0] ** 2, oti.mul(sum_sq, phi_over_base)))
            log_base = oti.log(base)
            term = oti.sub(oti.sub(1.0, inv_base), log_base)
            alpha_factor = ln10 * alpha_rq if kernel_type == 'anisotropic' else alpha_rq
            grad[alpha_idx] = _gc(oti.mul(alpha_factor, oti.mul(phi, term)))

        elif kernel == 'SineExp':
            if kernel_type == 'anisotropic':
                ell = 10.0 ** x0[:D]
                p = 10.0 ** x0[D:2 * D]
                pip = np.pi / p
                p_start = D
            else:
                ell = np.full(D, 10.0 ** float(x0[0]))
                pip = np.full(D, np.pi / 10.0 ** float(x0[1]))
                p_start = 1
            sin_d = [oti.sin(oti.mul(pip[d], diffs[d])) for d in range(D)]
            cos_d = [oti.cos(oti.mul(pip[d], diffs[d])) for d in range(D)]
            if kernel_type == 'anisotropic':
                if hasattr(phi, 'fused_scale_sq_mul'):
                    dphi_buf = oti.zeros(phi.shape)
                    for d in range(D):
                        dphi_buf.fused_scale_sq_mul(sin_d[d], phi, -4.0 * ln10 * ell[d] ** 2)
                        grad[d] = _gc(dphi_buf)
                else:
                    for d in range(D):
                        grad[d] = _gc(oti.mul(-4.0 * ln10 * ell[d] ** 2,
                                              oti.mul(oti.mul(sin_d[d], sin_d[d]), phi)))
                for d in range(D):
                    sc = oti.mul(sin_d[d], oti.mul(cos_d[d], diffs[d]))
                    grad[p_start + d] = _gc(oti.mul(4.0 * ln10 * ell[d] ** 2 * pip[d],
                                                    oti.mul(sc, phi)))
            else:
                if hasattr(phi, 'fused_sum_sq'):
                    ss = oti.zeros(phi.shape)
                    ss.fused_sum_sq(sin_d)
                else:
                    ss = oti.mul(sin_d[0], sin_d[0])
                    for d in range(1, D):
                        ss = oti.sum(ss, oti.mul(sin_d[d], sin_d[d]))
                grad[0] = _gc(oti.mul(-4.0 * ln10 * ell[0] ** 2, oti.mul(ss, phi)))
                scd = oti.mul(sin_d[0], oti.mul(cos_d[0], diffs[0]))
                for d in range(1, D):
                    scd = oti.sum(scd, oti.mul(sin_d[d], oti.mul(cos_d[d], diffs[d])))
                grad[p_start] = _gc(oti.mul(4.0 * ln10 * ell[0] ** 2 * pip[0],
                                            oti.mul(scd, phi)))

        elif kernel == 'Matern':
            kf = self.model.kernel_factory
            if not hasattr(kf, '_matern_grad_prebuild'):
                kf._matern_grad_prebuild = matern_kernel_grad_builder(
                    getattr(kf, "nu", 1.5), oti_module=oti)
            ell = (10.0 ** x0[:D] if kernel_type == 'anisotropic'
                   else np.full(D, 10.0 ** float(x0[0])))
            sigma_f_sq = (10.0 ** float(x0[-2])) ** 2
            _eps = 1e-10
            if hasattr(phi, 'fused_sqdist'):
                r2 = oti.zeros(phi.shape)
                ell_sq = np.ascontiguousarray(ell ** 2, dtype=np.float64)
                r2.fused_sqdist(diffs, ell_sq)
            else:
                r2 = oti.mul(ell[0], diffs[0])
                r2 = oti.mul(r2, r2)
                for d in range(1, D):
                    td = oti.mul(ell[d], diffs[d])
                    r2 = oti.sum(r2, oti.mul(td, td))
            r_oti = oti.sqrt(oti.sum(r2, _eps ** 2))
            f_prime_r = kf._matern_grad_prebuild(r_oti)
            inv_r = oti.pow(r_oti, -1)
            base_matern = oti.mul(sigma_f_sq, oti.mul(f_prime_r, inv_r))
            if kernel_type == 'anisotropic':
                if hasattr(phi, 'fused_scale_sq_mul'):
                    dphi_buf = oti.zeros(phi.shape)
                    for d in range(D):
                        dphi_buf.fused_scale_sq_mul(diffs[d], base_matern, ln10 * ell[d] ** 2)
                        grad[d] = _gc(dphi_buf)
                else:
                    for d in range(D):
                        d_sq = oti.mul(diffs[d], diffs[d])
                        dphi_d = oti.mul(ln10 * ell[d] ** 2, oti.mul(d_sq, base_matern))
                        grad[d] = _gc(dphi_d)
            else:
                if hasattr(phi, 'fused_sum_sq'):
                    sum_dsq = oti.zeros(phi.shape)
                    sum_dsq.fused_sum_sq(diffs)
                else:
                    sum_dsq = oti.mul(diffs[0], diffs[0])
                    for d in range(1, D):
                        sum_dsq = oti.sum(sum_dsq, oti.mul(diffs[d], diffs[d]))
                dphi_e = oti.mul(ln10 * ell[0] ** 2, oti.mul(sum_dsq, base_matern))
                grad[0] = _gc(dphi_e)

        return float(nll), grad

    def optimize_hyperparameters(self, optimizer="pso", **kwargs):
        """
        Optimize the model hyperparameters by minimizing the NLL.

        Parameters
        ----------
        optimizer : str or callable, default="pso"
            Name of a registered optimizer in OPTIMIZERS (e.g., "pso",
            "lbfgs", "jade"), or a callable with the same interface.
        **kwargs
            Extra keyword arguments forwarded to the chosen optimizer
            (e.g., swarm size or verbosity for PSO).

        Returns
        -------
        best_x : ndarray
            The optimal set of hyperparameters found.
        """

        if isinstance(optimizer, str):
            if optimizer not in OPTIMIZERS:
                raise ValueError(
                    f"Unknown optimizer '{optimizer}'. Available: {list(OPTIMIZERS.keys())}"
                )
            optimizer_fn = OPTIMIZERS[optimizer]
        else:
            optimizer_fn = optimizer  # allow passing a callable directly

        bounds = self.model.bounds
        lb = [b[0] for b in bounds]
        ub = [b[1] for b in bounds]

        if optimizer in ('lbfgs', 'jade', 'pso') and 'func_and_grad' not in kwargs and 'grad_func' not in kwargs:
            kwargs['func_and_grad'] = self.nll_and_grad

        best_x, best_val = optimizer_fn(self.nll_wrapper, lb, ub, **kwargs)

        self.model.opt_x0 = best_x
        self.model.opt_nll = best_val

        return best_x
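

# Minimal usage sketch (hedged: model construction lives outside this module,
# so the model setup below is a hypothetical placeholder):
#
#     from jetgp.full_gddegp.optimizer import Optimizer
#
#     model = ...                      # a configured ddegp-style model
#     opt = Optimizer(model)
#     best_x = opt.optimize_hyperparameters(optimizer="lbfgs")
#     # best_x holds the log10-scaled hyperparameters; the corresponding NLL
#     # is stored on model.opt_nll by optimize_hyperparameters.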