diff --git a/benches/steepest_descent.rs b/benches/steepest_descent.rs
index 5f73837..9f66762 100644
--- a/benches/steepest_descent.rs
+++ b/benches/steepest_descent.rs
@@ -6,18 +6,18 @@ use test::Bencher;
 use tuutal::{descent, s, Array1, DescentParameter};

 fn rosenbrock_nd() -> (
-    impl Fn(&Array1) -> f32,
-    impl Fn(&Array1) -> Array1,
+    impl Fn(&Array1, &()) -> f32,
+    impl Fn(&Array1, &()) -> Array1,
 ) {
     // Rosenbrock function and its gradient function for arrays of size >= 3.
-    let f = |x: &Array1| {
+    let f = |x: &Array1, _: &()| {
         let xi = x.slice(s![0..x.len() - 1]);
         let xi_plus1 = x.slice(s![1..]);
         let term = 100. * (xi_plus1.to_owned() - xi.map(|x| x.powi(2))).map(|x| x.powi(2))
             + (1. - xi.to_owned()).map(|x| x.powi(2));
         term.sum()
     };
-    let gradf = |x: &Array1| {
+    let gradf = |x: &Array1, _: &()| {
         let n = x.len();
         let first_deriv = |i: usize| -400. * x[i] * (x[i + 1] - x[i].powi(2)) - 2. * (1. - x[i]);
         let last_deriv = |i: usize| 200. * (x[i] - x[i - 1].powi(2));
@@ -42,7 +42,7 @@ fn armijo_bench(bench: &mut Bencher) {
     let x0 = Array1::from_vec(vec![0_f32; LENGTH]);
     let params = DescentParameter::new_armijo(0.01, 0.01);
     bench.iter(|| {
-        let _solution = descent(&f, &gradf, &x0, params, 1e-6, Some(1000));
+        let _solution = descent(&f, &gradf, &x0, params, 1e-6, Some(1000), (), ());
     });
 }

@@ -53,7 +53,7 @@ fn powell_wolfe_bench(bench: &mut Bencher) {
     let x0 = Array1::from_vec(vec![0_f32; LENGTH]);
     let params = DescentParameter::new_powell_wolfe(0.01, 0.1);
     bench.iter(|| {
-        let _solution = descent(&f, &gradf, &x0, params, 1e-6, Some(1000));
+        let _solution = descent(&f, &gradf, &x0, params, 1e-6, Some(1000), (), ());
     });
 }

@@ -64,7 +64,7 @@ fn adagrad_bench(bench: &mut Bencher) {
     let x0 = Array1::from_vec(vec![0_f32; LENGTH]);
     let params = DescentParameter::new_adagrad(0.1, 0.0001);
     bench.iter(|| {
-        let _solution = descent(&f, &gradf, &x0, params, 1e-6, Some(1000));
+        let _solution = descent(&f, &gradf, &x0, params, 1e-6, Some(1000), (), ());
     });
 }

@@ -75,6 +75,6 @@ fn adadelta_bench(bench: &mut Bencher) {
     let x0 = Array1::from_vec(vec![0_f32; LENGTH]);
     let params = DescentParameter::new_adadelta(0.1, 0.0001);
     bench.iter(|| {
-        let _solution = descent(&f, &gradf, &x0, params, 1e-6, Some(1000));
+        let _solution = descent(&f, &gradf, &x0, params, 1e-6, Some(1000), (), ());
     });
 }
diff --git a/src/first_order.rs b/src/first_order.rs
index 18bea74..8d1c028 100644
--- a/src/first_order.rs
+++ b/src/first_order.rs
@@ -167,8 +167,8 @@ where
 /// ```
 /// use tuutal::{array, descent, DescentParameter, Array1};
 /// // Example from python scipy.optimize.minimize_scalar
-/// let f = |x: &Array1| (x[0] - 2.) * x[0] * (x[0] + 2.).powi(2);
-/// let gradf = |x: &Array1| array![2. * (x[0] + 2.) * (2. * x[0].powi(2) - x[0] - 1.)];
+/// let f = |x: &Array1, _: &()| (x[0] - 2.) * x[0] * (x[0] + 2.).powi(2);
+/// let gradf = |x: &Array1, _: &()| array![2. * (x[0] + 2.) * (2. * x[0].powi(2) - x[0] - 1.)];
 /// let x0 = &array![-1.];
 ///
 /// let (x_star, f_star) = descent(
@@ -178,6 +178,8 @@ where
 ///     DescentParameter::new_armijo(1e-2, 0.25),
 ///     1e-3,
 ///     Some(10),
+///     (),
+///     ()
 /// ).unwrap();
 /// assert!((-2. - x_star[0]).abs() < 1e-10);
 ///
@@ -188,44 +190,48 @@ where
 ///     DescentParameter::new_powell_wolfe(1e-2, 0.9),
 ///     1e-3,
 ///     Some(10),
+///     (),
+///     ()
 /// ).unwrap();
 /// assert!((-2. - x_star[0]).abs() < 1e-10);
 ///
 /// let x0 = &array![-0.5];
-/// let (x_star, f_star) = descent(f, gradf, &x0, Default::default(), 1e-3, Some(10)).unwrap();
+/// let (x_star, f_star) = descent(f, gradf, &x0, Default::default(), 1e-3, Some(10), (), ()).unwrap();
 /// assert!((-0.5 - x_star[0]).abs() < 1e-10);
 ///
 /// let x0 = &array![0.];
-/// let (x_star, f_star) = descent(f, gradf, &x0, Default::default(), 1e-3, Some(10)).unwrap();
+/// let (x_star, f_star) = descent(f, gradf, &x0, Default::default(), 1e-3, Some(10), (), ()).unwrap();
 /// assert!((1. - x_star[0]).abs() < 1e-10);
 ///
 /// // It also takes multivariate objective functions
 /// let f =
-///     |arr: &Array1| 100. * (arr[1] - arr[0].powi(2)).powi(2) + (1. - arr[0]).powi(2);
-/// let gradf = |arr: &Array1| {
+///     |arr: &Array1, _: &()| 100. * (arr[1] - arr[0].powi(2)).powi(2) + (1. - arr[0]).powi(2);
+/// let gradf = |arr: &Array1, _: &()| {
 ///     array![
 ///         -400. * arr[0] * (arr[1] - arr[0].powi(2)) - 2. * (1. - arr[0]),
 ///         200. * (arr[1] - arr[0].powi(2))
 ///     ]
 /// };
 /// let x = array![1f32, -0.5f32];
-/// let (x_star, f_star) = descent(f, gradf, &x, Default::default(), 1e-3, Some(10000)).unwrap();
+/// let (x_star, f_star) = descent(f, gradf, &x, Default::default(), 1e-3, Some(10000), (), ()).unwrap();
 /// assert!((x_star[0] - 1.).abs() <= 1e-2);
 /// assert!((x_star[1] - 1.).abs() <= 1e-2);
 /// ```
-pub fn descent(
+pub fn descent(
     f: F,
     gradf: G,
     x0: &X,
     params: DescentParameter,
     gtol: X::Elem,
     maxiter: Option,
+    farg: Farg,
+    garg: Garg,
 ) -> Result<(X, X::Elem), TuutalError>
 where
     X: Vector + Clone + VecDot,
     for<'a> &'a X: Add + Mul<&'a X, Output = X> + Mul,
-    F: Fn(&X) -> X::Elem,
-    G: Fn(&X) -> X,
+    F: Fn(&X, &Farg) -> X::Elem,
+    G: Fn(&X, &Garg) -> X,
 {
     match params {
         DescentParameter::Armijo { gamma, beta } => Armijo::new(
@@ -237,6 +243,8 @@ where
                 beta,
                 epsilon: gtol,
             },
+            farg,
+            garg,
         )
         .optimize(maxiter),
         DescentParameter::PowellWolfe { gamma, beta } => PowellWolfe::new(
@@ -248,6 +256,8 @@ where
                 beta,
                 epsilon: gtol,
             },
+            farg,
+            garg,
         )
         .optimize(maxiter),
         DescentParameter::AdaDelta { gamma, beta } => AdaDelta::new(
@@ -259,6 +269,8 @@ where
                 beta,
                 epsilon: gtol,
             },
+            farg,
+            garg,
         )
         .optimize(maxiter),
         DescentParameter::AdaGrad { gamma, beta } => AdaGrad::new(
@@ -270,6 +282,8 @@ where
                 beta,
                 epsilon: gtol,
             },
+            farg,
+            garg,
         )
         .optimize(maxiter),
     }
diff --git a/src/first_order/adaptive_descent/adadelta.rs b/src/first_order/adaptive_descent/adadelta.rs
index e2305d6..5a863a1 100644
--- a/src/first_order/adaptive_descent/adadelta.rs
+++ b/src/first_order/adaptive_descent/adadelta.rs
@@ -34,12 +34,13 @@ descent_rule!(
 impl_optimizer_descent!(AdaDelta, X, AdaDeltaHyperParameter, AdaDeltaAccumulators);

-impl AdaDelta, AdaDeltaAccumulators>
+impl
+    AdaDelta, AdaDeltaAccumulators, Farg, Garg>
 where
     X: Vector,
     for<'b> &'b X: Add + Mul<&'b X, Output = X>,
-    F: Fn(&X) -> X::Elem,
-    G: Fn(&X) -> X,
+    F: Fn(&X, &Farg) -> X::Elem,
+    G: Fn(&X, &Garg) -> X,
 {
     pub(crate) fn step(&mut self) {
         let (gamma, beta) = (self.hyper_params.gamma, self.hyper_params.beta);
diff --git a/src/first_order/adaptive_descent/adagrad.rs b/src/first_order/adaptive_descent/adagrad.rs
index d16cddc..419b87c 100644
--- a/src/first_order/adaptive_descent/adagrad.rs
+++ b/src/first_order/adaptive_descent/adagrad.rs
@@ -30,12 +30,13 @@ descent_rule!(
 );
 impl_optimizer_descent!(AdaGrad, X, AdaGradHyperParameter, AdaGradAccumulators);

-impl AdaGrad, AdaGradAccumulators>
+impl
+    AdaGrad, AdaGradAccumulators, Farg, Garg>
 where
     X: Vector,
     for<'b> &'b X: Add + Mul<&'b X, Output = X>,
-    F: Fn(&X) -> X::Elem,
-    G: Fn(&X) -> X,
+    F: Fn(&X, &Farg) -> X::Elem,
+    G: Fn(&X, &Garg) -> X,
 {
     pub(crate) fn step(&mut self) {
         let (gamma, beta) = (self.hyper_params.gamma, self.hyper_params.beta);
diff --git a/src/first_order/macros.rs b/src/first_order/macros.rs
index 69eb211..5e34f70 100644
--- a/src/first_order/macros.rs
+++ b/src/first_order/macros.rs
@@ -2,9 +2,11 @@ macro_rules! descent_rule {
     ($rule:ident, $step:ty, $sigma:expr, $hp:ident, $accum:ty, $accumnew:expr) => {
         #[derive(Debug)]
         #[allow(dead_code)]
-        pub struct $rule
+        pub struct $rule
         where
             X: Vector,
+            F: Fn(&X, &Farg) -> X::Elem,
+            G: Fn(&X, &Garg) -> X,
         {
             f: F, // objective function
             gradf: G, // gradient of the objective function
@@ -15,47 +17,38 @@ macro_rules! descent_rule {
             counter: Counter, // [nb of iterations, number of f calls, nb of gradf calls]
             stop_metrics: X::Elem, // metrics used to stop the algorithm,
             accumulators: A, // accumulators during corresponding algorithms.
+            farg: Farg,
+            garg: Garg,
         }

-        impl $rule, $accum>
+        impl $rule, $accum, Farg, Garg>
         where
             X: Vector,
+            F: Fn(&X, &Farg) -> X::Elem,
+            G: Fn(&X, &Garg) -> X,
         {
-            #[allow(dead_code)]
-            pub(crate) fn func(&self, x: &X) -> X::Elem
-            where
-                F: Fn(&X) -> X::Elem,
-            {
-                let f = &self.f;
-                f(x)
-            }
-            pub(crate) fn grad(&self, x: &X) -> X
-            where
-                G: Fn(&X) -> X,
-            {
-                let g = &self.gradf;
-                g(x)
-            }
-            pub(crate) fn stop(&self) -> bool {
-                self.stop_metrics <= self.hyper_params.epsilon.powi(2)
-            }
-
-            pub fn new(f: F, gradf: G, x: X, hyper_params: $hp) -> Self
-            where
-                X: Vector,
-                G: Fn(&X) -> X,
-            {
-                let neg_gradfx = -gradf(&x);
+            /// New algorithm
+            pub fn new(
+                f: F,
+                gradf: G,
+                x: X,
+                hyper_params: $hp,
+                farg: Farg,
+                garg: Garg,
+            ) -> Self {
+                let neg_gradfx = -gradf(&x, &garg);
                 let mut optimizer = Self {
                     f,
                     gradf,
                     x,
-                    neg_gradfx: neg_gradfx,
+                    neg_gradfx,
                     sigma: $sigma,
                     hyper_params,
                     counter: Counter::new(),
                     stop_metrics: X::Elem::infinity(),
                     accumulators: $accumnew,
+                    farg,
+                    garg,
                 };
                 optimizer.counter.gcalls += 1;
                 // Not needed when broadcasting is allowed ?
@@ -68,18 +61,31 @@ macro_rules! descent_rule {
                 // }
                 optimizer
             }
+            #[allow(dead_code)]
+            pub(crate) fn func(&self, x: &X) -> X::Elem {
+                let f = &self.f;
+                f(x, &self.farg)
+            }
+            pub(crate) fn grad(&self, x: &X) -> X {
+                let g = &self.gradf;
+                g(x, &self.garg)
+            }
+            pub(crate) fn stop(&self) -> bool {
+                self.stop_metrics <= self.hyper_params.epsilon.powi(2)
+            }
         }
     };
 }

 macro_rules! impl_optimizer_descent {
     ($rule:ident, $step:ty, $hp:ident, $accum:ty) => {
-        impl core::iter::Iterator for $rule, $accum>
+        impl core::iter::Iterator
+            for $rule, $accum, Farg, Garg>
         where
             X: Vector + VecDot + Clone,
             for<'b> &'b X: Add + Mul<&'b X, Output = X>,
-            F: Fn(&X) -> X::Elem,
-            G: Fn(&X) -> X,
+            F: Fn(&X, &Farg) -> X::Elem,
+            G: Fn(&X, &Garg) -> X,
         {
             type Item = X::Elem;
             fn next(&mut self) -> Option {
@@ -95,12 +101,13 @@ macro_rules! impl_optimizer_descent {
                 }
             }
         }
-        impl Optimizer for $rule, $accum>
+        impl Optimizer
+            for $rule, $accum, Farg, Garg>
 where
             X: Vector + VecDot + Clone,
             for<'b> &'b X: Add + Mul<&'b X, Output = X>,
-            F: Fn(&X) -> X::Elem,
-            G: Fn(&X) -> X,
+            F: Fn(&X, &Farg) -> X::Elem,
+            G: Fn(&X, &Garg) -> X,
         {
             type Iterate = X;
             type Intermediate = $step;
diff --git a/src/first_order/steepest_descent/armijo.rs b/src/first_order/steepest_descent/armijo.rs
index c70908c..b0b8cf1 100644
--- a/src/first_order/steepest_descent/armijo.rs
+++ b/src/first_order/steepest_descent/armijo.rs
@@ -6,7 +6,6 @@ use crate::{
     Counter, Optimizer,
 };
 use num_traits::{Float, One, Zero};
-
 /// Hyperparameters used to compute step sizes in Armijo rule.
 #[derive(Debug)]
 pub struct ArmijoHyperParameter {
@@ -25,12 +24,13 @@ descent_rule!(
 );
 impl_optimizer_descent!(Armijo, [::Elem; 1], ArmijoHyperParameter, ());

-impl Armijo, ()>
+impl
+    Armijo, (), Farg, Garg>
 where
     X: Vector + VecDot,
     for<'b> &'b X: Add,
-    F: Fn(&X) -> X::Elem,
-    G: Fn(&X) -> X,
+    F: Fn(&X, &Farg) -> X::Elem,
+    G: Fn(&X, &Garg) -> X,
 {
     pub(crate) fn step(&mut self) {
         let mut sigma = X::Elem::one();
diff --git a/src/first_order/steepest_descent/powell_wolfe.rs b/src/first_order/steepest_descent/powell_wolfe.rs
index 438e3f7..7891a6b 100644
--- a/src/first_order/steepest_descent/powell_wolfe.rs
+++ b/src/first_order/steepest_descent/powell_wolfe.rs
@@ -24,12 +24,13 @@ descent_rule!(
 );
 impl_optimizer_descent!(PowellWolfe, [X::Elem; 1], PowellWolfeHyperParameter, ());

-impl PowellWolfe, ()>
+impl
+    PowellWolfe, (), Farg, Garg>
 where
     X: Vector + VecDot + Add,
     for<'b> &'b X: Add,
-    F: Fn(&X) -> X::Elem,
-    G: Fn(&X) -> X,
+    F: Fn(&X, &Farg) -> X::Elem,
+    G: Fn(&X, &Garg) -> X,
 {
     pub(crate) fn step(&mut self) {
         let mut sigma_minus = X::Elem::one();
diff --git a/src/first_order/steepest_descent/unit_test.rs b/src/first_order/steepest_descent/unit_test.rs
index 2e87676..50e4181 100644
--- a/src/first_order/steepest_descent/unit_test.rs
+++ b/src/first_order/steepest_descent/unit_test.rs
@@ -3,10 +3,14 @@ mod tests {
     use crate::{descent, DescentParameter};
     use crate::{l2_diff, Array1, TuutalError};

-    fn rosenbrock_2d() -> (fn(&Array1) -> f32, fn(&Array1) -> Array1) {
-        let f =
-            |arr: &Array1| 100. * (arr[1] - arr[0].powi(2)).powi(2) + (1. - arr[0]).powi(2);
-        let gradf = |arr: &Array1| {
+    fn rosenbrock_2d() -> (
+        fn(&Array1, &()) -> f32,
+        fn(&Array1, &()) -> Array1,
+    ) {
+        let f = |arr: &Array1, _farg: &()| {
+            100. * (arr[1] - arr[0].powi(2)).powi(2) + (1. - arr[0]).powi(2)
+        };
+        let gradf = |arr: &Array1, _garg: &()| {
             Array1::from_iter([
                 -400. * arr[0] * (arr[1] - arr[0].powi(2)) - 2. * (1. - arr[0]),
                 200. * (arr[1] - arr[0].powi(2)),
@@ -20,7 +24,7 @@ mod tests {
         let armijo = DescentParameter::new_armijo(0.01, 0.5);
         let (f, gradf) = rosenbrock_2d();
         let x = Array1::from_iter([1f32, -0.5f32]);
-        let (opt, _f_star) = descent(f, gradf, &x, armijo, 1e-4, Some(10000)).unwrap();
+        let (opt, _f_star) = descent(f, gradf, &x, armijo, 1e-4, Some(10000), (), ()).unwrap();
         let expected = Array1::from_iter([1., 1.]);
         assert!(l2_diff(&opt, &expected) < 1e-3);
     }
@@ -30,7 +34,7 @@ mod tests {
         let powolf = DescentParameter::new_powell_wolfe(0.0001, 0.9);
         let (f, gradf) = rosenbrock_2d();
         let x = Array1::from_iter([1f32, -0.5f32]);
-        let (opt, _f_star) = descent(f, gradf, &x, powolf, 1e-4, Some(10000)).unwrap();
+        let (opt, _f_star) = descent(f, gradf, &x, powolf, 1e-4, Some(10000), (), ()).unwrap();
         let expected = Array1::from_iter([1., 1.]);
         assert!(l2_diff(&opt, &expected) < 1e-3);
     }
@@ -43,7 +47,7 @@ mod tests {
         };
         let (f, gradf) = rosenbrock_2d();
         let x = Array1::from_iter([1f32, -0.5f32]);
-        let opt = descent(f, gradf, &x, adagrad, 1e-4, Some(10000)).unwrap_err();
+        let opt = descent(f, gradf, &x, adagrad, 1e-4, Some(10000), (), ()).unwrap_err();
         let expected = Array1::from_iter([1., 1.]);
         // Slow convergence rate for this problem
         match opt {
@@ -63,7 +67,7 @@ mod tests {
         };
         let (f, gradf) = rosenbrock_2d();
         let x = Array1::from_iter([1f32, -0.5f32]);
-        let opt = descent(f, gradf, &x, adadelta, 1e-4, Some(10000)).unwrap_err();
+        let opt = descent(f, gradf, &x, adadelta, 1e-4, Some(10000), (), ()).unwrap_err();
         let expected = Array1::from_iter([1., 1.]);
         // println!("{:?}", opt);
         // Slow convergence rate for this problem
@@ -79,12 +83,12 @@ mod tests {
     #[test]
     fn test_rosenbrock_3d() {
         let powolf = DescentParameter::new_powell_wolfe(0.0001, 0.9);
-        let f = |x: &Array1| {
+        let f = |x: &Array1, _: &()| {
             100. * ((x[1] - x[0].powi(2)).powi(2) + (x[2] - x[1].powi(2)).powi(2))
                 + (1. - x[0]).powi(2)
                 + (1. - x[1]).powi(2)
         };
-        let gradf = |x: &Array1| {
+        let gradf = |x: &Array1, _: &()| {
             2. * Array1::from_iter([
                 200. * x[0] * (x[0].powi(2) - x[1]) + (x[0] - 1.),
                 100. * (x[1] - x[0].powi(2) + 2. * x[1] * (x[1].powi(2) - x[2])) + (x[1] - 1.),
@@ -92,7 +96,7 @@ mod tests {
             ])
         };
         let x = Array1::from_iter([10f32, -15., -100.]);
-        let (opt, _f_star) = descent(f, gradf, &x, powolf, 1e-4, Some(10000)).unwrap();
+        let (opt, _f_star) = descent(f, gradf, &x, powolf, 1e-4, Some(10000), (), ()).unwrap();
         let expected = Array1::from_iter([1., 1., 1.]);
         assert!(l2_diff(&opt, &expected) < 1e-3);
     }
diff --git a/tests/quickckeck.rs b/tests/quickckeck.rs
index d5a08db..5e8c421 100644
--- a/tests/quickckeck.rs
+++ b/tests/quickckeck.rs
@@ -23,14 +23,14 @@ quickcheck! {
             return true;
         }
         let eye = Array1::ones(arr.shape()[0]);
-        let f = |x: &Array1| 0.5 * x.dot(x).powi(2) + eye.dot(x) + 1.;
-        let gradf = |x: &Array1| 2. * x * x.dot(x) + eye.clone();
-        let mut iterates = Armijo::new(f, gradf, arr.to_owned(), ArmijoHyperParameter{gamma:0.01, beta: 0.5, epsilon: 1e-3});
+        let f = |x: &Array1, _: &()| 0.5 * x.dot(x).powi(2) + eye.dot(x) + 1.;
+        let gradf = |x: &Array1, _: &()| 2. * x * x.dot(x) + eye.clone();
+        let mut iterates = Armijo::new(f, gradf, arr.to_owned(), ArmijoHyperParameter{gamma:0.01, beta: 0.5, epsilon: 1e-3}, (), ());
         let mut x_prev = arr.to_owned();
         let mut iter = 1;
         while let Some(_) = iterates.next() {
             let x = iterates.iterate();
-            assert!(f(&x) < f(&x_prev) + f32::EPSILON);
+            assert!(f(&x, &()) < f(&x_prev, &()) + f32::EPSILON);
             x_prev = x;
             iter += 1;
             if iter > 10 {
@@ -56,19 +56,20 @@ quickcheck! {
         // To avoid arbitrarily large or missing numbers.
             return true;
         }
-        let f = |x: &Array1| 0.5 * x.dot(x).powi(2) + 1.;
-        let gradf = |x: &Array1| 2. * x * x.dot(x) ;
+        let f = |x: &Array1, c: &(f32, f32)| c.0 * x.dot(x).powi(2) + c.1;
+        let gradf = |x: &Array1, _:&()| 2. * x * x.dot(x) ;
         let gamma = 0.001;
         let beta = 0.9;
-        let mut iterates = PowellWolfe::new(f, gradf, arr.to_owned(), PowellWolfeHyperParameter{gamma, beta, epsilon: 1e-3});
+        let cons = (0.5, 1.);
+        let mut iterates = PowellWolfe::new(f, gradf, arr.to_owned(), PowellWolfeHyperParameter{gamma, beta, epsilon: 1e-3}, cons, ());
         let mut x_prev = arr.to_owned();
         let mut iter = 1;
         while let Some(_) = iterates.next() {
             let x_next = iterates.iterate();
-            let neg_gradfx_prev = -gradf(&x_prev);
+            let neg_gradfx_prev = -gradf(&x_prev, &());
             let gradfx_d = neg_gradfx_prev.dot(&neg_gradfx_prev);
             let step_size = iterates.intermediate()[0];
-            assert!(f(&x_next) <= f(&x_prev) - step_size * gamma * gradfx_d);
+            assert!(f(&x_next, &cons) <= f(&x_prev, &cons) - step_size * gamma * gradfx_d);
             x_prev = x_next;
             iter += 1;
             if iter > 10 {
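Note (not part of the patch): a minimal sketch of how the new `farg`/`garg` context arguments could carry problem data instead of closure captures, assuming the updated `descent` signature from this diff and that `Array1` is generic over the element type (written `Array1<f32>` here); the scaled-Rosenbrock constants and variable names are only illustrative.

```rust
use tuutal::{array, descent, Array1, DescentParameter};

fn main() {
    // Scaled 2-D Rosenbrock: the scale factors travel through the new
    // `farg`/`garg` arguments, so the closures capture nothing.
    let f = |x: &Array1<f32>, c: &(f32, f32)| {
        c.0 * (x[1] - x[0].powi(2)).powi(2) + c.1 * (1. - x[0]).powi(2)
    };
    let gradf = |x: &Array1<f32>, c: &(f32, f32)| {
        array![
            -4. * c.0 * x[0] * (x[1] - x[0].powi(2)) - 2. * c.1 * (1. - x[0]),
            2. * c.0 * (x[1] - x[0].powi(2))
        ]
    };
    let scales = (100., 1.);
    let x0 = array![1f32, -0.5f32];
    let (x_star, _f_star) = descent(
        f,
        gradf,
        &x0,
        DescentParameter::default(),
        1e-3,
        Some(10000),
        scales, // farg: passed by reference to `f` on every evaluation
        scales, // garg: passed by reference to `gradf` on every evaluation
    )
    .unwrap();
    println!("minimizer: {:?}", x_star);
}
```

As the quickcheck test above shows, `Farg` and `Garg` are independent types, so the objective and the gradient can receive different context values (for example `(0.5, 1.)` for `f` and `()` for `gradf`).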