// Copyright ©2017 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package optimize

import (
	"math"
	"sort"

	"golang.org/x/exp/rand"

	"gonum.org/v1/gonum/floats"
	"gonum.org/v1/gonum/mat"
	"gonum.org/v1/gonum/stat/distmv"
)

// TODO(btracey): If we ever implement the traditional CMA-ES algorithm, provide
// the base explanation there, and modify this description to just
// describe the differences.

// CmaEsChol implements the covariance matrix adaptation evolution strategy (CMA-ES)
// based on the Cholesky decomposition. The full algorithm is described in
//
//	Krause, Oswin, Dídac Rodríguez Arbonès, and Christian Igel. "CMA-ES with
//	optimal covariance update and storage complexity." Advances in Neural
//	Information Processing Systems. 2016.
//	https://papers.nips.cc/paper/6457-cma-es-with-optimal-covariance-update-and-storage-complexity.pdf
//
// CMA-ES is a global optimization method that progressively adapts a population
// of samples. CMA-ES combines techniques from local optimization with global
// optimization. Specifically, the CMA-ES algorithm uses an initial multivariate
// normal distribution to generate a population of input locations. The input locations
// with the lowest function values are used to update the parameters of the normal
// distribution, a new set of input locations is generated, and this procedure
// is iterated until convergence. The initial sampling distribution will have
// a mean specified by the initial x location, and a covariance specified by
// the InitCholesky field.
//
// As the normal distribution is progressively updated according to the best samples,
// the mean of the distribution can move in a gradient-descent-like fashion,
// with the covariance shrinking as the distribution concentrates.
// It is recommended that the algorithm be run multiple times (with different
// initial x locations) to have a better chance of finding the global minimum.
//
// The CMA-ES-Chol algorithm differs from the standard CMA-ES algorithm in that
// it directly updates the Cholesky decomposition of the normal distribution.
// This changes the runtime from O(dimension^3) to O(dimension^2*population).
// The evolution of the multivariate normal will be similar to the baseline
// CMA-ES algorithm, but the covariance update equation is not identical.
//
// For more information about the CMA-ES algorithm, see
//
//	https://en.wikipedia.org/wiki/CMA-ES
//	https://arxiv.org/pdf/1604.00772.pdf
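//
// A minimal usage sketch with this package's Minimize driver (the sphere
// objective below is purely illustrative):
//
//	problem := optimize.Problem{
//		Func: func(x []float64) float64 {
//			return x[0]*x[0] + x[1]*x[1]
//		},
//	}
//	initX := []float64{5, 5}
//	result, err := optimize.Minimize(problem, initX, nil, &optimize.CmaEsChol{})
//	if err != nil {
//		log.Fatal(err)
//	}
//	fmt.Println("f =", result.F, "at x =", result.X)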
type CmaEsChol struct {
	// InitStepSize sets the initial size of the covariance matrix adaptation.
	// If InitStepSize is 0, a default value of 0.3 is used. InitStepSize cannot
	// be negative, or CmaEsChol will panic.
	InitStepSize float64
	// Population sets the population size for the algorithm. If Population is
	// 0, a default value of 4 + math.Floor(3*math.Log(float64(dim))) is used.
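	// For example, dim = 10 gives a default population of
	// 4 + floor(3*ln(10)) = 10.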
	// Population cannot be negative or CmaEsChol will panic.
	Population int
	// InitCholesky specifies the Cholesky decomposition of the covariance
	// matrix for the initial sampling distribution. If InitCholesky is nil,
	// a default value of I is used. If it is non-nil, then
	// InitCholesky.SymmetricDim() must equal the problem dimension.
	InitCholesky *mat.Cholesky
	// StopLogDet sets the threshold for stopping the optimization if the
	// distribution becomes too peaked. The log determinant is a measure of the
	// (log) "volume" of the normal distribution, and when it is too small
	// the samples are almost the same. If the log determinant of the covariance
	// matrix becomes less than StopLogDet, the optimization run is concluded.
	// If StopLogDet is 0, a default value of dim*log(1e-16) is used.
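	// For example, dim = 2 gives a default threshold of 2*ln(1e-16) ≈ -73.7.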
	// If StopLogDet is NaN, the stopping criterion is not used, though
	// this can cause numeric instabilities in the algorithm.
	StopLogDet float64
	// ForgetBest, when true, does not track the best overall function value found,
	// instead returning the new best sample in each iteration. If ForgetBest
	// is false, then the minimum value returned will be the lowest across all
	// iterations, regardless of when that sample was generated.
	ForgetBest bool
	// Src allows a random number generator to be supplied for generating samples.
	// If Src is nil the generator in golang.org/x/exp/rand is used.
	Src rand.Source

	// Fixed algorithm parameters.
	dim                 int
	pop                 int
	weights             []float64
	muEff               float64
	cc, cs, c1, cmu, ds float64
	eChi                float64

	// Function data.
	xs *mat.Dense
	fs []float64

	// Adaptive algorithm parameters.
	invSigma float64 // inverse of the sigma parameter
	pc, ps   []float64
	mean     []float64
	chol     mat.Cholesky

	// Overall best.
	bestX []float64
	bestF float64

	// Synchronization.
	sentIdx     int
	receivedIdx int
	operation   chan<- Task
	updateErr   error
}

var (
	_ Statuser = (*CmaEsChol)(nil)
	_ Method   = (*CmaEsChol)(nil)
)

func (cma *CmaEsChol) methodConverged() Status {
	sd := cma.StopLogDet
	switch {
	case math.IsNaN(sd):
		return NotTerminated
	case sd == 0:
		sd = float64(cma.dim) * -36.8413614879 // ln(1e-16)
	}
	if cma.chol.LogDet() < sd {
		return MethodConverge
	}
	return NotTerminated
}

// Status returns the status of the method.
func (cma *CmaEsChol) Status() (Status, error) {
	if cma.updateErr != nil {
		return Failure, cma.updateErr
	}
	return cma.methodConverged(), nil
}

func (*CmaEsChol) Uses(has Available) (uses Available, err error) {
	return has.function()
}

func (cma *CmaEsChol) Init(dim, tasks int) int {
	if dim <= 0 {
		panic(nonpositiveDimension)
	}
	if tasks < 0 {
		panic(negativeTasks)
	}

	// Set fixed algorithm parameters.
	// Parameter values are from https://arxiv.org/pdf/1604.00772.pdf .
	cma.dim = dim
	cma.pop = cma.Population
	n := float64(dim)
	if cma.pop == 0 {
		cma.pop = 4 + int(3*math.Log(n)) // Note the implicit floor.
	} else if cma.pop < 0 {
		panic("cma-es-chol: negative population size")
	}
	mu := cma.pop / 2
	cma.weights = resize(cma.weights, mu)
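	// The raw recombination weights are w_i = ln(mu+0.5) - ln(i+1); they are
	// normalized below to sum to 1.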
	for i := range cma.weights {
		v := math.Log(float64(mu)+0.5) - math.Log(float64(i)+1)
		cma.weights[i] = v
	}
	floats.Scale(1/floats.Sum(cma.weights), cma.weights)
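	// muEff is the variance effective selection mass, 1/sum(w_i^2).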
	cma.muEff = 0
	for _, v := range cma.weights {
		cma.muEff += v * v
	}
	cma.muEff = 1 / cma.muEff

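	// cc and cs set the time scales of the covariance and step-size evolution
	// paths, c1 and cmu are the learning rates of the rank-one and rank-mu
	// covariance updates, and ds damps the step-size adaptation.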
	cma.cc = (4 + cma.muEff/n) / (n + 4 + 2*cma.muEff/n)
	cma.cs = (cma.muEff + 2) / (n + cma.muEff + 5)
	cma.c1 = 2 / ((n+1.3)*(n+1.3) + cma.muEff)
	cma.cmu = math.Min(1-cma.c1, 2*(cma.muEff-2+1/cma.muEff)/((n+2)*(n+2)+cma.muEff))
	cma.ds = 1 + 2*math.Max(0, math.Sqrt((cma.muEff-1)/(n+1))-1) + cma.cs
	// E[chi] is taken from https://en.wikipedia.org/wiki/CMA-ES (there
	// listed as E[||N(0,1)||]).
	cma.eChi = math.Sqrt(n) * (1 - 1.0/(4*n) + 1/(21*n*n))

	// Allocate memory for function data.
	cma.xs = mat.NewDense(cma.pop, dim, nil)
	cma.fs = resize(cma.fs, cma.pop)

	// Allocate and initialize adaptive parameters.
	cma.invSigma = 1 / cma.InitStepSize
	if cma.InitStepSize == 0 {
		cma.invSigma = 10.0 / 3
	} else if cma.InitStepSize < 0 {
		panic("cma-es-chol: negative initial step size")
	}
	cma.pc = resize(cma.pc, dim)
	for i := range cma.pc {
		cma.pc[i] = 0
	}
	cma.ps = resize(cma.ps, dim)
	for i := range cma.ps {
		cma.ps[i] = 0
	}
	cma.mean = resize(cma.mean, dim) // mean location initialized at the start of Run

	if cma.InitCholesky != nil {
		if cma.InitCholesky.SymmetricDim() != dim {
			panic("cma-es-chol: incorrect InitCholesky size")
		}
		cma.chol.Clone(cma.InitCholesky)
	} else {
		// Set the initial Cholesky to I.
		b := mat.NewDiagDense(dim, nil)
		for i := 0; i < dim; i++ {
			b.SetDiag(i, 1)
		}
		var chol mat.Cholesky
		ok := chol.Factorize(b)
		if !ok {
			panic("cma-es-chol: bad Cholesky factorization; this should not happen")
		}
		cma.chol = chol
	}

	cma.bestX = resize(cma.bestX, dim)
	cma.bestF = math.Inf(1)

	cma.sentIdx = 0
	cma.receivedIdx = 0
	cma.operation = nil
	cma.updateErr = nil
	t := min(tasks, cma.pop)
	return t
}

func (cma *CmaEsChol) sendInitTasks(tasks []Task) {
	for i, task := range tasks {
		cma.sendTask(i, task)
	}
	cma.sentIdx = len(tasks)
}

// sendTask generates a sample and sends the task. It does not update
// cma.sentIdx.
func (cma *CmaEsChol) sendTask(idx int, task Task) {
	task.ID = idx
	task.Op = FuncEvaluation
	distmv.NormalRand(cma.xs.RawRowView(idx), cma.mean, &cma.chol, cma.Src)
	copy(task.X, cma.xs.RawRowView(idx))
	cma.operation <- task
}

// bestIdx returns the index of the lowest function value in cma.fs. It
// returns -1 if all of the values are NaN.
func (cma *CmaEsChol) bestIdx() int {
	best := -1
	bestVal := math.Inf(1)
	for i, v := range cma.fs {
		if math.IsNaN(v) {
			continue
		}
		// Use <= rather than < so that an index is still chosen when every
		// value is +Inf.
		if v <= bestVal {
			best = i
			bestVal = v
		}
	}
	return best
}

// findBestAndUpdateTask finds the best task in the current list, updates the
// new best overall, and then stores the best location into task.
func (cma *CmaEsChol) findBestAndUpdateTask(task Task) Task {
	// Find and update the best location.
	// Don't use the floats package because there may be NaN values.
	best := cma.bestIdx()
	bestF := math.NaN()
	bestX := cma.xs.RawRowView(0)
	if best != -1 {
		bestF = cma.fs[best]
		bestX = cma.xs.RawRowView(best)
	}
	if cma.ForgetBest {
		task.F = bestF
		copy(task.X, bestX)
	} else {
		if bestF < cma.bestF {
			cma.bestF = bestF
			copy(cma.bestX, bestX)
		}
		task.F = cma.bestF
		copy(task.X, cma.bestX)
	}
	return task
}

func (cma *CmaEsChol) Run(operations chan<- Task, results <-chan Task, tasks []Task) {
	copy(cma.mean, tasks[0].X)
	cma.operation = operations
	// Send the initial tasks. We know there are at most as many tasks as elements
	// of the population.
	cma.sendInitTasks(tasks)

Loop:
	for {
		result := <-results
		switch result.Op {
		default:
			panic("unknown operation")
		case PostIteration:
			break Loop
		case MajorIteration:
			// The parameters have just been updated and the MajorIteration
			// sent. Now we can send a new group of tasks.
			cma.sendInitTasks(tasks)
		case FuncEvaluation:
			cma.receivedIdx++
			cma.fs[result.ID] = result.F
			switch {
			case cma.sentIdx < cma.pop:
				// There are still tasks to evaluate. Send the next.
				cma.sendTask(cma.sentIdx, result)
				cma.sentIdx++
			case cma.receivedIdx < cma.pop:
				// All the tasks have been sent, but not all of them have been received.
				// Need to wait until all are back.
				continue Loop
			default:
				// All of the evaluations have been received.
				if cma.receivedIdx != cma.pop {
					panic("bad logic")
				}
				cma.receivedIdx = 0
				cma.sentIdx = 0

				task := cma.findBestAndUpdateTask(result)
				// Update the parameters and send a MajorIteration or a convergence.
				err := cma.update()
				// Kill the existing data.
				for i := range cma.fs {
					cma.fs[i] = math.NaN()
					cma.xs.Set(i, 0, math.NaN())
				}
				switch {
				case err != nil:
					cma.updateErr = err
					task.Op = MethodDone
				case cma.methodConverged() != NotTerminated:
					task.Op = MethodDone
				default:
					task.Op = MajorIteration
					task.ID = -1
				}
				operations <- task
			}
		}
	}

	// Been told to stop. Clean up by collecting the outstanding results and
	// recording any remaining function evaluations so that the best value
	// seen can still be reported.
	for task := range results {
		switch task.Op {
		case MajorIteration:
		case FuncEvaluation:
			cma.fs[task.ID] = task.F
		default:
			panic("unknown operation")
		}
	}
	// Send the new best value if the evaluation is better than any we've
	// found so far. Keep this separate from findBestAndUpdateTask so that
	// we only send an iteration if we find a better location.
	if !cma.ForgetBest {
		best := cma.bestIdx()
		if best != -1 && cma.fs[best] < cma.bestF {
			task := tasks[0]
			task.F = cma.fs[best]
			copy(task.X, cma.xs.RawRowView(best))
			task.Op = MajorIteration
			task.ID = -1
			operations <- task
		}
	}
	close(operations)
}

// update computes the new parameters (mean, Cholesky, etc.). It does not
// update any of the synchronization parameters (sentIdx, receivedIdx).
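//
// The covariance adaptation below relies on mat.Cholesky.SymRankOne. As a
// sketch of that primitive (not part of this method), updating a
// factorization of a matrix C with a scaled outer product looks like:
//
//	var chol mat.Cholesky
//	chol.Factorize(mat.NewSymDense(2, []float64{4, 1, 1, 3}))
//	v := mat.NewVecDense(2, []float64{1, 2})
//	var updated mat.Cholesky
//	updated.SymRankOne(&chol, 0.5, v) // updated factorizes C + 0.5*v*vᵀ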
func (cma *CmaEsChol) update() error {
	// Sort the function values to find the elite samples.
	ftmp := make([]float64, cma.pop)
	copy(ftmp, cma.fs)
	indexes := make([]int, cma.pop)
	for i := range indexes {
		indexes[i] = i
	}
	sort.Sort(bestSorter{F: ftmp, Idx: indexes})

	meanOld := make([]float64, len(cma.mean))
	copy(meanOld, cma.mean)

	// m_{t+1} = \sum_{i=1}^mu w_i x_i
	for i := range cma.mean {
		cma.mean[i] = 0
	}
	for i, w := range cma.weights {
		idx := indexes[i] // index of the i-th elite sample.
		floats.AddScaled(cma.mean, w, cma.xs.RawRowView(idx))
	}
	meanDiff := make([]float64, len(cma.mean))
	floats.SubTo(meanDiff, cma.mean, meanOld)

	// p_{c,t+1} = (1-c_c) p_{c,t} + \sqrt(c_c*(2-c_c)*mueff) (m_{t+1}-m_t)/sigma_t
	floats.Scale(1-cma.cc, cma.pc)
	scaleC := math.Sqrt(cma.cc*(2-cma.cc)*cma.muEff) * cma.invSigma
	floats.AddScaled(cma.pc, scaleC, meanDiff)

	// p_{sigma, t+1} = (1-c_sigma) p_{sigma,t} + \sqrt(c_s*(2-c_s)*mueff) A_t^-1 (m_{t+1}-m_t)/sigma_t
	floats.Scale(1-cma.cs, cma.ps)
	// First compute A_t^-1 (m_{t+1}-m_t), then add the scaled vector.
	tmp := make([]float64, cma.dim)
	tmpVec := mat.NewVecDense(cma.dim, tmp)
	diffVec := mat.NewVecDense(cma.dim, meanDiff)
	err := tmpVec.SolveVec(cma.chol.RawU().T(), diffVec)
	if err != nil {
		return err
	}
	scaleS := math.Sqrt(cma.cs*(2-cma.cs)*cma.muEff) * cma.invSigma
	floats.AddScaled(cma.ps, scaleS, tmp)

	// Compute the update to A.
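	// Rather than forming the updated covariance matrix explicitly and
	// refactorizing it at O(dim^3) cost, the Cholesky factor is scaled and
	// then corrected with a sequence of rank-one updates, each costing
	// O(dim^2).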
	scaleChol := 1 - cma.c1 - cma.cmu
	if scaleChol == 0 {
		scaleChol = math.SmallestNonzeroFloat64 // enough to kill the old data, but still non-zero.
	}
	cma.chol.Scale(scaleChol, &cma.chol)
	cma.chol.SymRankOne(&cma.chol, cma.c1, mat.NewVecDense(cma.dim, cma.pc))
	for i, w := range cma.weights {
		idx := indexes[i]
		floats.SubTo(tmp, cma.xs.RawRowView(idx), meanOld)
		cma.chol.SymRankOne(&cma.chol, cma.cmu*w*cma.invSigma, tmpVec)
	}

	// sigma_{t+1} = sigma_t * exp(c_sigma/d_sigma * (norm(p_{sigma,t+1})/E[chi] - 1))
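	// invSigma stores 1/sigma, so dividing by the exponential implements the
	// multiplication above.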
	normPs := floats.Norm(cma.ps, 2)
	cma.invSigma /= math.Exp(cma.cs / cma.ds * (normPs/cma.eChi - 1))
	return nil
}

type bestSorter struct {
	F   []float64
	Idx []int
}

func (b bestSorter) Len() int {
	return len(b.F)
}
func (b bestSorter) Less(i, j int) bool {
	return b.F[i] < b.F[j]
}
func (b bestSorter) Swap(i, j int) {
	b.F[i], b.F[j] = b.F[j], b.F[i]
	b.Idx[i], b.Idx[j] = b.Idx[j], b.Idx[i]
}