package optimize

import (
	"math"

	"gonum.org/v1/gonum/floats"
)

const (
	iterationRestartFactor = 6
	angleRestartThreshold  = -0.9
)

// CGVariant calculates the scaling parameter, β, used for updating the
// conjugate direction in the nonlinear conjugate gradient (CG) method.
type CGVariant interface {
	// Init is called at the first iteration and provides a way to initialize
	// any internal state.
	Init(loc *Location)
	// Beta returns the value of the scaling parameter that is computed
	// according to the particular variant of the CG method.
	Beta(grad, gradPrev, dirPrev []float64) float64
}
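
// As an illustration of how the CGVariant interface can be satisfied, the
// following hypothetical variant computes β_k by Fletcher's conjugate descent
// formula
//	β_k = |∇f_{k+1}|^2 / (-d_k·∇f_k).
// It is a sketch for exposition only and is not one of the variants provided
// by this package.
type conjugateDescent struct{}

func (conjugateDescent) Init(_ *Location) {}

func (conjugateDescent) Beta(grad, gradPrev, dirPrev []float64) float64 {
	norm := floats.Norm(grad, 2)
	// d_k is a descent direction, so -d_k·∇f_k is positive.
	return norm * norm / -floats.Dot(dirPrev, gradPrev)
}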

// CG implements the nonlinear conjugate gradient method for solving nonlinear
// unconstrained optimization problems. It is a line search method that
// generates the search directions d_k according to the formula
//	d_{k+1} = -∇f_{k+1} + β_k*d_k,   d_0 = -∇f_0.
// Variants of the conjugate gradient method differ in the choice of the
// parameter β_k. The conjugate gradient method usually requires fewer function
// evaluations than the gradient descent method and no matrix storage, but
// L-BFGS is usually more efficient.
//
// CG implements a restart strategy that takes the steepest descent direction
// (i.e., d_{k+1} = -∇f_{k+1}) whenever any of the following conditions holds:
//
//  - A certain number of iterations has elapsed without a restart. This
//    number can be controlled via IterationRestartFactor; if it is 0, a
//    reasonable default based on the problem dimension is used.
//  - The angle between the gradients at two consecutive iterations ∇f_k and
//    ∇f_{k+1} is too large.
//  - The direction d_{k+1} is not a descent direction.
//  - β_k returned from CGVariant.Beta is equal to zero.
//
// The line search for CG must yield step sizes that satisfy the strong Wolfe
// conditions at every iteration; otherwise the generated search direction
// might fail to be a descent direction. The line search should be more
// stringent than those used for Newton-like methods, which can be achieved by
// setting the gradient constant in the strong Wolfe conditions to a small
// value.
//
// See also William Hager, Hongchao Zhang, A survey of nonlinear conjugate
// gradient methods. Pacific Journal of Optimization, 2 (2006), pp. 35-58, and
// references therein.
type CG struct {
	// Linesearcher must satisfy the strong Wolfe conditions at every
	// iteration. If Linesearcher == nil, an appropriate default is chosen.
	Linesearcher Linesearcher
	// Variant implements the particular CG formula for computing β_k.
	// If Variant is nil, an appropriate default is chosen.
	Variant CGVariant
	// InitialStep estimates the initial line search step size, because the CG
	// method does not generate well-scaled search directions.
	// If InitialStep is nil, an appropriate default is chosen.
	InitialStep StepSizer

	// IterationRestartFactor determines the frequency of restarts based on
	// the problem dimension. The negative gradient direction is taken
	// whenever ceil(IterationRestartFactor*(problem dimension)) iterations
	// have elapsed without a restart. For medium and large-scale problems
	// IterationRestartFactor should be set to 1; for low-dimensional problems
	// a larger value should be chosen. Note that if the ceil function returns
	// 1, CG will be identical to gradient descent.
	// If IterationRestartFactor is 0, it will be set to 6.
	// CG will panic if IterationRestartFactor is negative.
	IterationRestartFactor float64
	// AngleRestartThreshold sets the threshold angle for restart. The method
	// is restarted if the cosine of the angle between two consecutive
	// gradients is smaller than or equal to AngleRestartThreshold, that is,
	// if
	//	∇f_k·∇f_{k+1} / (|∇f_k| |∇f_{k+1}|) <= AngleRestartThreshold.
	// A value of AngleRestartThreshold closer to -1 (successive gradients in
	// exact opposite directions) will tend to reduce the number of restarts.
	// If AngleRestartThreshold is 0, it will be set to -0.9.
	// CG will panic if AngleRestartThreshold is not in the interval [-1, 0].
	AngleRestartThreshold float64

	ls *LinesearchMethod

	restartAfter    int
	iterFromRestart int

	dirPrev      []float64
	gradPrev     []float64
	gradPrevNorm float64
}

func (cg *CG) Init(loc *Location) (Operation, error) {
	if cg.IterationRestartFactor < 0 {
		panic("cg: IterationRestartFactor is negative")
	}
	if cg.AngleRestartThreshold < -1 || cg.AngleRestartThreshold > 0 {
		panic("cg: AngleRestartThreshold not in [-1, 0]")
	}

	if cg.Linesearcher == nil {
		cg.Linesearcher = &MoreThuente{CurvatureFactor: 0.1}
	}
	if cg.Variant == nil {
		cg.Variant = &HestenesStiefel{}
	}
	if cg.InitialStep == nil {
		cg.InitialStep = &FirstOrderStepSize{}
	}

	if cg.IterationRestartFactor == 0 {
		cg.IterationRestartFactor = iterationRestartFactor
	}
	if cg.AngleRestartThreshold == 0 {
		cg.AngleRestartThreshold = angleRestartThreshold
	}

	if cg.ls == nil {
		cg.ls = &LinesearchMethod{}
	}
	cg.ls.Linesearcher = cg.Linesearcher
	cg.ls.NextDirectioner = cg

	return cg.ls.Init(loc)
}

func (cg *CG) Iterate(loc *Location) (Operation, error) {
	return cg.ls.Iterate(loc)
}

func (cg *CG) InitDirection(loc *Location, dir []float64) (stepSize float64) {
	dim := len(loc.X)

	cg.restartAfter = int(math.Ceil(cg.IterationRestartFactor * float64(dim)))
	cg.iterFromRestart = 0

	// The initial direction is always the negative gradient.
	copy(dir, loc.Gradient)
	floats.Scale(-1, dir)

	cg.dirPrev = resize(cg.dirPrev, dim)
	copy(cg.dirPrev, dir)
	cg.gradPrev = resize(cg.gradPrev, dim)
	copy(cg.gradPrev, loc.Gradient)
	cg.gradPrevNorm = floats.Norm(loc.Gradient, 2)

	cg.Variant.Init(loc)
	return cg.InitialStep.Init(loc, dir)
}

func (cg *CG) NextDirection(loc *Location, dir []float64) (stepSize float64) {
	copy(dir, loc.Gradient)
	floats.Scale(-1, dir)

	cg.iterFromRestart++
	var restart bool
	if cg.iterFromRestart == cg.restartAfter {
		// Restart because too many iterations have been taken without a restart.
		restart = true
	}

	gDot := floats.Dot(loc.Gradient, cg.gradPrev)
	gNorm := floats.Norm(loc.Gradient, 2)
	if gDot <= cg.AngleRestartThreshold*gNorm*cg.gradPrevNorm {
		// Restart because the angle between the last two gradients is too large.
		restart = true
	}

	// Compute the scaling factor β_k even when restarting, because cg.Variant
	// may keep internal state that needs to be updated at every iteration.
	beta := cg.Variant.Beta(loc.Gradient, cg.gradPrev, cg.dirPrev)
	if beta == 0 {
		// β_k == 0 means that the steepest descent direction will be taken, so
		// indicate that the method is in fact being restarted.
		restart = true
	}
	if !restart {
		// The method is not being restarted, so update the descent direction.
		floats.AddScaled(dir, beta, cg.dirPrev)
		if floats.Dot(loc.Gradient, dir) >= 0 {
			// Restart because the new direction is not a descent direction.
			restart = true
			copy(dir, loc.Gradient)
			floats.Scale(-1, dir)
		}
	}

	// Get the initial line search step size from the StepSizer even if the
	// method was restarted, because StepSizers need to see every iteration.
	stepSize = cg.InitialStep.StepSize(loc, dir)
	if restart {
		// The method was restarted and since the steepest descent direction
		// is not related to the previous direction, discard the estimated
		// step size from cg.InitialStep and use a step size of 1 instead.
		stepSize = 1
		// Reset to 0 the counter of iterations taken since the last restart.
		cg.iterFromRestart = 0
	}

	copy(cg.gradPrev, loc.Gradient)
	copy(cg.dirPrev, dir)
	cg.gradPrevNorm = gNorm
	return stepSize
}

func (*CG) Needs() struct {
	Gradient bool
	Hessian  bool
} {
	return struct {
		Gradient bool
		Hessian  bool
	}{true, false}
}
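
// A minimal sketch of how CG might be configured and run through the
// package's Minimize entry point. It is illustrative rather than canonical:
// the Problem and Minimize signatures assumed here may differ across package
// versions.
//
//	problem := Problem{
//		Func: func(x []float64) float64 { return x[0]*x[0] + 10*x[1]*x[1] },
//		Grad: func(grad, x []float64) {
//			grad[0] = 2 * x[0]
//			grad[1] = 20 * x[1]
//		},
//	}
//	method := &CG{Variant: &PolakRibierePolyak{}}
//	result, err := Minimize(problem, []float64{1, 1}, nil, method)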

// FletcherReeves implements the Fletcher-Reeves variant of the CG method that
// computes the scaling parameter β_k according to the formula
//	β_k = |∇f_{k+1}|^2 / |∇f_k|^2.
type FletcherReeves struct {
	prevNorm float64
}

func (fr *FletcherReeves) Init(loc *Location) {
	fr.prevNorm = floats.Norm(loc.Gradient, 2)
}

func (fr *FletcherReeves) Beta(grad, _, _ []float64) (beta float64) {
	norm := floats.Norm(grad, 2)
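	// As a worked check of the formula: ∇f_k = (3, 4) and ∇f_{k+1} = (1, 2)
	// give β_k = (1²+2²)/(3²+4²) = 5/25 = 0.2.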
	beta = (norm / fr.prevNorm) * (norm / fr.prevNorm)
	fr.prevNorm = norm
	return beta
}

// PolakRibierePolyak implements the Polak-Ribiere-Polyak variant of the CG
// method that computes the scaling parameter β_k according to the formula
//	β_k = max(0, ∇f_{k+1}·y_k / |∇f_k|^2),
// where y_k = ∇f_{k+1} - ∇f_k.
type PolakRibierePolyak struct {
	prevNorm float64
}

func (pr *PolakRibierePolyak) Init(loc *Location) {
	pr.prevNorm = floats.Norm(loc.Gradient, 2)
}

func (pr *PolakRibierePolyak) Beta(grad, gradPrev, _ []float64) (beta float64) {
	norm := floats.Norm(grad, 2)
	dot := floats.Dot(grad, gradPrev)
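	// The numerator expands as
	//	∇f_{k+1}·y_k = ∇f_{k+1}·(∇f_{k+1} - ∇f_k) = |∇f_{k+1}|^2 - ∇f_{k+1}·∇f_k,
	// which is computed below from norm and dot.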
	beta = (norm*norm - dot) / (pr.prevNorm * pr.prevNorm)
	pr.prevNorm = norm
	return math.Max(0, beta)
}

// HestenesStiefel implements the Hestenes-Stiefel variant of the CG method
// that computes the scaling parameter β_k according to the formula
//	β_k = max(0, ∇f_{k+1}·y_k / d_k·y_k),
// where y_k = ∇f_{k+1} - ∇f_k.
type HestenesStiefel struct {
	y []float64
}

func (hs *HestenesStiefel) Init(loc *Location) {
	hs.y = resize(hs.y, len(loc.Gradient))
}

func (hs *HestenesStiefel) Beta(grad, gradPrev, dirPrev []float64) (beta float64) {
	floats.SubTo(hs.y, grad, gradPrev)
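	// When the line search satisfies the (strong) Wolfe conditions,
	// d_k·∇f_{k+1} >= c2*d_k·∇f_k with c2 < 1 and d_k·∇f_k < 0, so the
	// denominator d_k·y_k = d_k·∇f_{k+1} - d_k·∇f_k >= (c2-1)*d_k·∇f_k > 0.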
	beta = floats.Dot(grad, hs.y) / floats.Dot(dirPrev, hs.y)
	return math.Max(0, beta)
}

// DaiYuan implements the Dai-Yuan variant of the CG method that computes the
// scaling parameter β_k according to the formula
//	β_k = |∇f_{k+1}|^2 / d_k·y_k,
// where y_k = ∇f_{k+1} - ∇f_k.
type DaiYuan struct {
	y []float64
}

func (dy *DaiYuan) Init(loc *Location) {
	dy.y = resize(dy.y, len(loc.Gradient))
}

func (dy *DaiYuan) Beta(grad, gradPrev, dirPrev []float64) (beta float64) {
	floats.SubTo(dy.y, grad, gradPrev)
	norm := floats.Norm(grad, 2)
	return norm * norm / floats.Dot(dirPrev, dy.y)
}

// HagerZhang implements the Hager-Zhang variant of the CG method that
// computes the scaling parameter β_k according to the formula
//	β_k = (y_k - 2 d_k |y_k|^2/(d_k·y_k))·∇f_{k+1} / (d_k·y_k),
// where y_k = ∇f_{k+1} - ∇f_k.
type HagerZhang struct {
	y []float64
}

func (hz *HagerZhang) Init(loc *Location) {
	hz.y = resize(hz.y, len(loc.Gradient))
}

func (hz *HagerZhang) Beta(grad, gradPrev, dirPrev []float64) (beta float64) {
	floats.SubTo(hz.y, grad, gradPrev)
	dirDotY := floats.Dot(dirPrev, hz.y)
	gDotY := floats.Dot(grad, hz.y)
	gDotDir := floats.Dot(grad, dirPrev)
	yNorm := floats.Norm(hz.y, 2)
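	// As a worked check of the formula: d_k = (1, 0), ∇f_k = (-1, 0) and
	// ∇f_{k+1} = (1, 1) give y_k = (2, 1), d_k·y_k = 2, ∇f_{k+1}·y_k = 3,
	// ∇f_{k+1}·d_k = 1 and |y_k|^2 = 5, so β_k = (3 - 2·1·5/2)/2 = -1. Note
	// that unlike HestenesStiefel and PolakRibierePolyak, the result is not
	// clamped at zero.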
	return (gDotY - 2*gDotDir*yNorm*yNorm/dirDotY) / dirDotY
}