| // Copyright ©2014 The Gonum Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| package stat |
| |
| import ( |
| "math" |
| "sort" |
| |
| "gonum.org/v1/gonum/floats" |
| ) |
| |
| // CumulantKind specifies the behavior for calculating the empirical CDF or Quantile. |
| type CumulantKind int |
| |
| // List of supported CumulantKind values for the Quantile function. |
| // Constant values should match the R nomenclature. See |
| // https://en.wikipedia.org/wiki/Quantile#Estimating_the_quantiles_of_a_population |
| const ( |
| // Empirical treats the distribution as the actual empirical distribution. |
| Empirical CumulantKind = 1 |
| // LinInterp linearly interpolates the empirical distribution between sample values, with a flat extrapolation. |
| LinInterp CumulantKind = 4 |
| ) |
| |
| // bhattacharyyaCoeff computes the Bhattacharyya coefficient of the probability distributions p and q, given by: |
| // |
| // \sum_i \sqrt{p_i q_i} |
| // |
| // It is assumed that p and q have equal length. |
| func bhattacharyyaCoeff(p, q []float64) float64 { |
| var bc float64 |
| for i, a := range p { |
| bc += math.Sqrt(a * q[i]) |
| } |
| return bc |
| } |
| |
| // Bhattacharyya computes the distance between the probability distributions p and q given by: |
| // |
| // -\ln ( \sum_i \sqrt{p_i q_i} ) |
| // |
| // The lengths of p and q must be equal. It is assumed that p and q sum to 1. |
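| // |
| // As an illustrative sketch with hypothetical distributions, |
| // |
| //	p := []float64{0.25, 0.25, 0.5} |
| //	q := []float64{0.5, 0.25, 0.25} |
| //	d := Bhattacharyya(p, q) // d ≈ 0.044 |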
| func Bhattacharyya(p, q []float64) float64 { |
| if len(p) != len(q) { |
| panic("stat: slice length mismatch") |
| } |
| bc := bhattacharyyaCoeff(p, q) |
| return -math.Log(bc) |
| } |
| |
| // CDF returns the empirical cumulative distribution function value at q, that |
| // is, the fraction of the samples less than or equal to q. The |
| // exact behavior is determined by the CumulantKind. CDF is theoretically |
| // the inverse of the Quantile function, though it may not be the actual inverse |
| // for all values of q and CumulantKinds. |
| // |
| // The x data must be sorted in increasing order. If weights is nil then all |
| // of the weights are 1. If weights is not nil, then len(x) must equal len(weights). |
| // CDF will panic if the length of x is zero. |
| // |
| // CumulantKind behaviors: |
| // - Empirical: Returns the lowest fraction for which q is greater than or equal |
| // to that fraction of samples |
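| // |
| // For example, with hypothetical sorted data, |
| // |
| //	x := []float64{1, 2, 3, 4, 5} |
| //	f := CDF(3, Empirical, x, nil) // f == 0.6 |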
| func CDF(q float64, c CumulantKind, x, weights []float64) float64 { |
| if weights != nil && len(x) != len(weights) { |
| panic("stat: slice length mismatch") |
| } |
| if floats.HasNaN(x) { |
| return math.NaN() |
| } |
| if len(x) == 0 { |
| panic("stat: zero length slice") |
| } |
| if !sort.Float64sAreSorted(x) { |
| panic("x data are not sorted") |
| } |
| |
| if q < x[0] { |
| return 0 |
| } |
| if q >= x[len(x)-1] { |
| return 1 |
| } |
| |
| var sumWeights float64 |
| if weights == nil { |
| sumWeights = float64(len(x)) |
| } else { |
| sumWeights = floats.Sum(weights) |
| } |
| |
| // Calculate the CDF value according to the CumulantKind. |
| switch c { |
| case Empirical: |
| // Accumulate the weight of all samples less than or equal to q. |
| var w float64 |
| for i, v := range x { |
| if v > q { |
| return w / sumWeights |
| } |
| if weights == nil { |
| w++ |
| } else { |
| w += weights[i] |
| } |
| } |
| panic("impossible") |
| default: |
| panic("stat: bad cumulant kind") |
| } |
| } |
| |
| // ChiSquare computes the chi-square distance between the observed frequencies 'obs' and |
| // expected frequencies 'exp' given by: |
| // |
| // \sum_i (obs_i-exp_i)^2 / exp_i |
| // |
| // The lengths of obs and exp must be equal. Entries where both obs_i and exp_i |
| // are zero are skipped. |
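| // |
| // A minimal sketch with hypothetical frequencies: |
| // |
| //	obs := []float64{10, 20, 30} |
| //	exp := []float64{15, 15, 30} |
| //	d := ChiSquare(obs, exp) // d ≈ 3.33 |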
| func ChiSquare(obs, exp []float64) float64 { |
| if len(obs) != len(exp) { |
| panic("stat: slice length mismatch") |
| } |
| var result float64 |
| for i, a := range obs { |
| b := exp[i] |
| if a == 0 && b == 0 { |
| continue |
| } |
| result += (a - b) * (a - b) / b |
| } |
| return result |
| } |
| |
| // CircularMean returns the circular mean of the dataset. |
| // |
| // atan2(\sum_i w_i * sin(alpha_i), \sum_i w_i * cos(alpha_i)) |
| // |
| // If weights is nil then all of the weights are 1. If weights is not nil, then |
| // len(x) must equal len(weights). |
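| // |
| // For example, angles just on either side of ±π average to ±π rather than |
| // to the arithmetic mean of zero (hypothetical data): |
| // |
| //	x := []float64{math.Pi - 0.1, -math.Pi + 0.1} |
| //	m := CircularMean(x, nil) // m ≈ ±π, not 0 |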
| func CircularMean(x, weights []float64) float64 { |
| if weights != nil && len(x) != len(weights) { |
| panic("stat: slice length mismatch") |
| } |
| |
| var aX, aY float64 |
| if weights != nil { |
| for i, v := range x { |
| aX += weights[i] * math.Cos(v) |
| aY += weights[i] * math.Sin(v) |
| } |
| } else { |
| for _, v := range x { |
| aX += math.Cos(v) |
| aY += math.Sin(v) |
| } |
| } |
| |
| return math.Atan2(aY, aX) |
| } |
| |
| // Correlation returns the weighted correlation between the samples of x and y. |
| // |
| // sum_i {w_i (x_i - meanX) * (y_i - meanY)} / ((sum_j {w_j} - 1) * stdX * stdY) |
| // |
| // The lengths of x and y must be equal. If weights is nil then all of the |
| // weights are 1. If weights is not nil, then len(x) must equal len(weights). |
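| // |
| // A minimal sketch with hypothetical data: |
| // |
| //	x := []float64{1, 2, 3} |
| //	y := []float64{2, 4, 6} |
| //	r := Correlation(x, y, nil) // r == 1 for a perfect linear relationship |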
| func Correlation(x, y, weights []float64) float64 { |
| // This is a two-pass corrected implementation. It is an adaptation of the |
| // algorithm used in the MeanVariance function, which applies a correction |
| // to the typical two pass approach. |
| |
| if len(x) != len(y) { |
| panic("stat: slice length mismatch") |
| } |
| xu := Mean(x, weights) |
| yu := Mean(y, weights) |
| var ( |
| sxx float64 |
| syy float64 |
| sxy float64 |
| xcompensation float64 |
| ycompensation float64 |
| ) |
| if weights == nil { |
| for i, xv := range x { |
| yv := y[i] |
| xd := xv - xu |
| yd := yv - yu |
| sxx += xd * xd |
| syy += yd * yd |
| sxy += xd * yd |
| xcompensation += xd |
| ycompensation += yd |
| } |
| // xcompensation and ycompensation are from Chan, et al. |
| // referenced in the MeanVariance function. They are analogous |
| // to the second term in (1.7) in that paper. |
| sxx -= xcompensation * xcompensation / float64(len(x)) |
| syy -= ycompensation * ycompensation / float64(len(x)) |
| |
| return (sxy - xcompensation*ycompensation/float64(len(x))) / math.Sqrt(sxx*syy) |
| |
| } |
| |
| var sumWeights float64 |
| for i, xv := range x { |
| w := weights[i] |
| yv := y[i] |
| xd := xv - xu |
| wxd := w * xd |
| yd := yv - yu |
| wyd := w * yd |
| sxx += wxd * xd |
| syy += wyd * yd |
| sxy += wxd * yd |
| xcompensation += wxd |
| ycompensation += wyd |
| sumWeights += w |
| } |
| // xcompensation and ycompensation are from Chan, et al. |
| // referenced in the MeanVariance function. They are analogous |
| // to the second term in (1.7) in that paper, except they use |
| // the sumWeights instead of the sample count. |
| sxx -= xcompensation * xcompensation / sumWeights |
| syy -= ycompensation * ycompensation / sumWeights |
| |
| return (sxy - xcompensation*ycompensation/sumWeights) / math.Sqrt(sxx*syy) |
| } |
| |
| // Kendall returns the weighted Tau-a Kendall correlation between the |
| // samples of x and y. The Kendall correlation measures the relative number of |
| // concordant and discordant pairs of numbers. If weights are specified then |
| // each pair is weighted by weights[i] * weights[j] and the final sum is |
| // normalized to stay between -1 and 1. |
| // The lengths of x and y must be equal. If weights is nil then all of the |
| // weights are 1. If weights is not nil, then len(x) must equal len(weights). |
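| // |
| // For example, with hypothetical data, |
| // |
| //	x := []float64{1, 2, 3} |
| //	y := []float64{1, 3, 2} |
| //	tau := Kendall(x, y, nil) // tau ≈ 0.333 (2 concordant pairs, 1 discordant) |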
| func Kendall(x, y, weights []float64) float64 { |
| if len(x) != len(y) { |
| panic("stat: slice length mismatch") |
| } |
| |
| var ( |
| cc float64 // number of concordant pairs |
| dc float64 // number of discordant pairs |
| n = len(x) |
| ) |
| |
| if weights == nil { |
| for i := 0; i < n; i++ { |
| for j := i; j < n; j++ { |
| if i == j { |
| continue |
| } |
| if math.Signbit(x[j]-x[i]) == math.Signbit(y[j]-y[i]) { |
| cc++ |
| } else { |
| dc++ |
| } |
| } |
| } |
| return (cc - dc) / float64(n*(n-1)/2) |
| } |
| |
| var sumWeights float64 |
| |
| for i := 0; i < n; i++ { |
| for j := i; j < n; j++ { |
| if i == j { |
| continue |
| } |
| weight := weights[i] * weights[j] |
| if math.Signbit(x[j]-x[i]) == math.Signbit(y[j]-y[i]) { |
| cc += weight |
| } else { |
| dc += weight |
| } |
| sumWeights += weight |
| } |
| } |
| return (cc - dc) / sumWeights |
| } |
| |
| // Covariance returns the weighted covariance between the samples of x and y. |
| // |
| // sum_i {w_i (x_i - meanX) * (y_i - meanY)} / (sum_j {w_j} - 1) |
| // |
| // The lengths of x and y must be equal. If weights is nil then all of the |
| // weights are 1. If weights is not nil, then len(x) must equal len(weights). |
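| // |
| // A minimal sketch with hypothetical data: |
| // |
| //	x := []float64{1, 2, 3} |
| //	y := []float64{2, 4, 6} |
| //	c := Covariance(x, y, nil) // c == 2 |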
| func Covariance(x, y, weights []float64) float64 { |
| // This is a two-pass corrected implementation. It is an adaptation of the |
| // algorithm used in the MeanVariance function, which applies a correction |
| // to the typical two pass approach. |
| |
| if len(x) != len(y) { |
| panic("stat: slice length mismatch") |
| } |
| xu := Mean(x, weights) |
| yu := Mean(y, weights) |
| return covarianceMeans(x, y, weights, xu, yu) |
| } |
| |
| // covarianceMeans returns the weighted covariance between x and y with the means |
| // of x and y already specified. See the documentation of Covariance for more |
| // information. |
| func covarianceMeans(x, y, weights []float64, xu, yu float64) float64 { |
| var ( |
| ss float64 |
| xcompensation float64 |
| ycompensation float64 |
| ) |
| if weights == nil { |
| for i, xv := range x { |
| yv := y[i] |
| xd := xv - xu |
| yd := yv - yu |
| ss += xd * yd |
| xcompensation += xd |
| ycompensation += yd |
| } |
| // xcompensation and ycompensation are from Chan, et al. |
| // referenced in the MeanVariance function. They are analogous |
| // to the second term in (1.7) in that paper. |
| return (ss - xcompensation*ycompensation/float64(len(x))) / float64(len(x)-1) |
| } |
| |
| var sumWeights float64 |
| |
| for i, xv := range x { |
| w := weights[i] |
| yv := y[i] |
| wxd := w * (xv - xu) |
| yd := yv - yu |
| ss += wxd * yd |
| xcompensation += wxd |
| ycompensation += w * yd |
| sumWeights += w |
| } |
| // xcompensation and ycompensation are from Chan, et al. |
| // referenced in the MeanVariance function. They are analogous |
| // to the second term in (1.7) in that paper, except they use |
| // the sumWeights instead of the sample count. |
| return (ss - xcompensation*ycompensation/sumWeights) / (sumWeights - 1) |
| } |
| |
| // CrossEntropy computes the cross-entropy between the two distributions specified |
| // in p and q. The natural logarithm is used. The lengths of p and q must be |
| // equal. |
| func CrossEntropy(p, q []float64) float64 { |
| if len(p) != len(q) { |
| panic("stat: slice length mismatch") |
| } |
| var ce float64 |
| for i, v := range p { |
| if v != 0 { |
| ce -= v * math.Log(q[i]) |
| } |
| } |
| return ce |
| } |
| |
| // Entropy computes the Shannon entropy of a distribution. The natural |
| // logarithm is used. The entropy is given by |
| // |
| // - sum_i (p_i * log_e(p_i)) |
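| // |
| // For example, a uniform two-outcome distribution (hypothetical values): |
| // |
| //	e := Entropy([]float64{0.5, 0.5}) // e == ln(2) ≈ 0.693 |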
| func Entropy(p []float64) float64 { |
| var e float64 |
| for _, v := range p { |
| if v != 0 { // Entropy needs 0 * log(0) == 0. |
| e -= v * math.Log(v) |
| } |
| } |
| return e |
| } |
| |
| // ExKurtosis returns the population excess kurtosis of the sample. |
| // The kurtosis is defined as the fourth central moment divided by the squared |
| // variance. The excess kurtosis subtracts 3.0 so that the excess kurtosis of |
| // the normal distribution is zero. |
| // If weights is nil then all of the weights are 1. If weights is not nil, then |
| // len(x) must equal len(weights). |
| func ExKurtosis(x, weights []float64) float64 { |
| mean, std := MeanStdDev(x, weights) |
| if weights == nil { |
| var e float64 |
| for _, v := range x { |
| z := (v - mean) / std |
| e += z * z * z * z |
| } |
| mul, offset := kurtosisCorrection(float64(len(x))) |
| return e*mul - offset |
| } |
| |
| var ( |
| e float64 |
| sumWeights float64 |
| ) |
| for i, v := range x { |
| z := (v - mean) / std |
| e += weights[i] * z * z * z * z |
| sumWeights += weights[i] |
| } |
| mul, offset := kurtosisCorrection(sumWeights) |
| return e*mul - offset |
| } |
| |
| // kurtosisCorrection returns the multiplier and offset for the sample excess |
| // kurtosis correction, where n is the number of samples. |
| // See https://en.wikipedia.org/wiki/Kurtosis. |
| func kurtosisCorrection(n float64) (mul, offset float64) { |
| return ((n + 1) / (n - 1)) * (n / (n - 2)) * (1 / (n - 3)), 3 * ((n - 1) / (n - 2)) * ((n - 1) / (n - 3)) |
| } |
| |
| // GeometricMean returns the weighted geometric mean of the dataset |
| // |
| // ( \prod_i {x_i ^ w_i} ) ^ (1 / \sum_i {w_i}) |
| // |
| // This only applies with positive x and positive weights. If weights is nil |
| // then all of the weights are 1. If weights is not nil, then len(x) must equal |
| // len(weights). |
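| // |
| // For example, with hypothetical data, |
| // |
| //	g := GeometricMean([]float64{1, 2, 4}, nil) // g ≈ 2 |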
| func GeometricMean(x, weights []float64) float64 { |
| if weights == nil { |
| var s float64 |
| for _, v := range x { |
| s += math.Log(v) |
| } |
| s /= float64(len(x)) |
| return math.Exp(s) |
| } |
| if len(x) != len(weights) { |
| panic("stat: slice length mismatch") |
| } |
| var ( |
| s float64 |
| sumWeights float64 |
| ) |
| for i, v := range x { |
| s += weights[i] * math.Log(v) |
| sumWeights += weights[i] |
| } |
| s /= sumWeights |
| return math.Exp(s) |
| } |
| |
| // HarmonicMean returns the weighted harmonic mean of the dataset |
| // |
| // \sum_i {w_i} / ( \sum_i {w_i / x_i} ) |
| // |
| // This only applies with positive x and positive weights. |
| // If weights is nil then all of the weights are 1. If weights is not nil, then |
| // len(x) must equal len(weights). |
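| // |
| // For example, with hypothetical data, |
| // |
| //	h := HarmonicMean([]float64{1, 2, 4}, nil) // h = 3/(1 + 1/2 + 1/4) ≈ 1.714 |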
| func HarmonicMean(x, weights []float64) float64 { |
| if weights != nil && len(x) != len(weights) { |
| panic("stat: slice length mismatch") |
| } |
| // TODO(btracey): Fix this to make it more efficient and avoid allocation. |
| |
| // The direct computation can be numerically unstable (for example if x is |
| // very small), so the mean is computed in log space: |
| // W = \sum_i {w_i} |
| // hm = exp(log(W) - log(\sum_i w_i / x_i)) |
| |
| logs := make([]float64, len(x)) |
| var W float64 |
| for i := range x { |
| if weights == nil { |
| logs[i] = -math.Log(x[i]) |
| W++ |
| continue |
| } |
| logs[i] = math.Log(weights[i]) - math.Log(x[i]) |
| W += weights[i] |
| } |
| |
| // Sum all of the logs |
| v := floats.LogSumExp(logs) // This computes log(\sum_i { w_i / x_i}). |
| return math.Exp(math.Log(W) - v) |
| } |
| |
| // Hellinger computes the distance between the probability distributions p and q given by: |
| // |
| // \sqrt{ 1 - \sum_i \sqrt{p_i q_i} } |
| // |
| // The lengths of p and q must be equal. It is assumed that p and q sum to 1. |
| func Hellinger(p, q []float64) float64 { |
| if len(p) != len(q) { |
| panic("stat: slice length mismatch") |
| } |
| bc := bhattacharyyaCoeff(p, q) |
| return math.Sqrt(1 - bc) |
| } |
| |
| // Histogram sums up the weighted number of data points in each bin. |
| // The weight of data point x[i] will be placed into count[j] if |
| // dividers[j] <= x[i] < dividers[j+1]. The Span function in the floats package |
| // can assist with bin creation. |
| // |
| // The following conditions on the inputs apply: |
| // - The count variable must either be nil or have length of one less than dividers. |
| // - The values in dividers must be sorted (use the sort package). |
| // - The x values must be sorted. |
| // - If weights is nil then all of the weights are 1. |
| // - If weights is not nil, then len(x) must equal len(weights). |
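| // |
| // For example, with hypothetical data and dividers, |
| // |
| //	dividers := []float64{0, 1, 2, 3} |
| //	x := []float64{0.1, 0.5, 1.5, 2.5} |
| //	counts := Histogram(nil, dividers, x, nil) // counts == []float64{2, 1, 1} |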
| func Histogram(count, dividers, x, weights []float64) []float64 { |
| if weights != nil && len(x) != len(weights) { |
| panic("stat: slice length mismatch") |
| } |
| if len(dividers) < 2 { |
| panic("histogram: fewer than two dividers") |
| } |
| if count == nil { |
| count = make([]float64, len(dividers)-1) |
| } |
| if len(count) != len(dividers)-1 { |
| panic("histogram: bin count mismatch") |
| } |
| if !sort.Float64sAreSorted(dividers) { |
| panic("histogram: dividers are not sorted") |
| } |
| if !sort.Float64sAreSorted(x) { |
| panic("histogram: x data are not sorted") |
| } |
| for i := range count { |
| count[i] = 0 |
| } |
| if len(x) == 0 { |
| return count |
| } |
| if x[0] < dividers[0] { |
| panic("histogram: minimum x value is less than lowest divider") |
| } |
| if dividers[len(dividers)-1] <= x[len(x)-1] { |
| panic("histogram: maximum x value is greater than or equal to highest divider") |
| } |
| |
| idx := 0 |
| comp := dividers[idx+1] |
| if weights == nil { |
| for _, v := range x { |
| if v < comp { |
| // Still in the current bucket. |
| count[idx]++ |
| continue |
| } |
| // Find the next divider where v is less than the divider. |
| for j := idx + 1; j < len(dividers); j++ { |
| if v < dividers[j+1] { |
| idx = j |
| comp = dividers[j+1] |
| break |
| } |
| } |
| count[idx]++ |
| } |
| return count |
| } |
| |
| for i, v := range x { |
| if v < comp { |
| // Still in the current bucket. |
| count[idx] += weights[i] |
| continue |
| } |
| // Need to find the next divider where v is less than the divider. |
| for j := idx + 1; j < len(count); j++ { |
| if v < dividers[j+1] { |
| idx = j |
| comp = dividers[j+1] |
| break |
| } |
| } |
| count[idx] += weights[i] |
| } |
| return count |
| } |
| |
| // JensenShannon computes the Jensen-Shannon divergence between the distributions |
| // p and q. The Jensen-Shannon divergence is defined as |
| // |
| // m = 0.5 * (p + q) |
| // JS(p, q) = 0.5 ( KL(p, m) + KL(q, m) ) |
| // |
| // Unlike Kullback-Leibler, the Jensen-Shannon divergence is symmetric. The value |
| // is between 0 and ln(2). |
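| // |
| // For example, distributions with disjoint support attain the maximum |
| // (hypothetical values): |
| // |
| //	js := JensenShannon([]float64{1, 0}, []float64{0, 1}) // js == ln(2) |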
| func JensenShannon(p, q []float64) float64 { |
| if len(p) != len(q) { |
| panic("stat: slice length mismatch") |
| } |
| var js float64 |
| for i, v := range p { |
| qi := q[i] |
| m := 0.5 * (v + qi) |
| if v != 0 { |
| // add kl from p to m |
| js += 0.5 * v * (math.Log(v) - math.Log(m)) |
| } |
| if qi != 0 { |
| // add kl from q to m |
| js += 0.5 * qi * (math.Log(qi) - math.Log(m)) |
| } |
| } |
| return js |
| } |
| |
| // KolmogorovSmirnov computes the largest distance between two empirical CDFs. |
| // Each dataset x and y consists of sample locations and counts, xWeights and |
| // yWeights, respectively. |
| // |
| // x and y may have different lengths, though len(x) must equal len(xWeights), and |
| // len(y) must equal len(yWeights). Both x and y must be sorted. |
| // |
| // Special cases are: |
| // |
| // = 0 if len(x) == len(y) == 0 |
| // = 1 if len(x) == 0 and len(y) != 0, or len(x) != 0 and len(y) == 0 |
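| // |
| // For example, samples that do not overlap give the maximum distance |
| // (hypothetical data): |
| // |
| //	d := KolmogorovSmirnov([]float64{1, 2}, nil, []float64{10, 20}, nil) // d == 1 |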
| func KolmogorovSmirnov(x, xWeights, y, yWeights []float64) float64 { |
| if xWeights != nil && len(x) != len(xWeights) { |
| panic("stat: slice length mismatch") |
| } |
| if yWeights != nil && len(y) != len(yWeights) { |
| panic("stat: slice length mismatch") |
| } |
| if len(x) == 0 || len(y) == 0 { |
| if len(x) == 0 && len(y) == 0 { |
| return 0 |
| } |
| return 1 |
| } |
| |
| if floats.HasNaN(x) { |
| return math.NaN() |
| } |
| if floats.HasNaN(y) { |
| return math.NaN() |
| } |
| |
| if !sort.Float64sAreSorted(x) { |
| panic("x data are not sorted") |
| } |
| if !sort.Float64sAreSorted(y) { |
| panic("y data are not sorted") |
| } |
| |
| xWeightsNil := xWeights == nil |
| yWeightsNil := yWeights == nil |
| |
| var ( |
| maxDist float64 |
| xSum, ySum float64 |
| xCdf, yCdf float64 |
| xIdx, yIdx int |
| ) |
| |
| if xWeightsNil { |
| xSum = float64(len(x)) |
| } else { |
| xSum = floats.Sum(xWeights) |
| } |
| |
| if yWeightsNil { |
| ySum = float64(len(y)) |
| } else { |
| ySum = floats.Sum(yWeights) |
| } |
| |
| xVal := x[0] |
| yVal := y[0] |
| |
| // Algorithm description: |
| // The goal is to find the maximum difference in the empirical CDFs for the |
| // two datasets. The CDFs are piecewise-constant, and thus the distance |
| // between the CDFs will only change at the values themselves. |
| // |
| // To find the maximum distance, step through the data in ascending order |
| // of value between the two datasets. At each step, compute the empirical CDF |
| // and compare the local distance with the maximum distance. |
| // Due to some corner cases, equal data entries must be tallied simultaneously. |
| for { |
| switch { |
| case xVal < yVal: |
| xVal, xCdf, xIdx = updateKS(xIdx, xCdf, xSum, x, xWeights, xWeightsNil) |
| case yVal < xVal: |
| yVal, yCdf, yIdx = updateKS(yIdx, yCdf, ySum, y, yWeights, yWeightsNil) |
| case xVal == yVal: |
| newX := x[xIdx] |
| newY := y[yIdx] |
| if newX < newY { |
| xVal, xCdf, xIdx = updateKS(xIdx, xCdf, xSum, x, xWeights, xWeightsNil) |
| } else if newY < newX { |
| yVal, yCdf, yIdx = updateKS(yIdx, yCdf, ySum, y, yWeights, yWeightsNil) |
| } else { |
| // Update them both, they'll be equal next time and the right |
| // thing will happen. |
| xVal, xCdf, xIdx = updateKS(xIdx, xCdf, xSum, x, xWeights, xWeightsNil) |
| yVal, yCdf, yIdx = updateKS(yIdx, yCdf, ySum, y, yWeights, yWeightsNil) |
| } |
| default: |
| panic("unreachable") |
| } |
| |
| dist := math.Abs(xCdf - yCdf) |
| if dist > maxDist { |
| maxDist = dist |
| } |
| |
| // Both xCdf and yCdf will equal 1 at the end, so if we have reached the |
| // end of either sample list, the distance is as large as it can be. |
| if xIdx == len(x) || yIdx == len(y) { |
| return maxDist |
| } |
| } |
| } |
| |
| // updateKS gets the next data point from one of the sets. In doing so, it combines |
| // the weight of all the data points of equal value. Upon return, val is the new |
| // value of the data set, newCdf is the total combined CDF up until this point, |
| // and newIdx is the index of the next location in that sample to examine. |
| func updateKS(idx int, cdf, sum float64, values, weights []float64, isNil bool) (val, newCdf float64, newIdx int) { |
| // Sum up all the weights of consecutive values that are equal. |
| if isNil { |
| newCdf = cdf + 1/sum |
| } else { |
| newCdf = cdf + weights[idx]/sum |
| } |
| newIdx = idx + 1 |
| for { |
| if newIdx == len(values) { |
| return values[newIdx-1], newCdf, newIdx |
| } |
| if values[newIdx-1] != values[newIdx] { |
| return values[newIdx], newCdf, newIdx |
| } |
| if isNil { |
| newCdf += 1 / sum |
| } else { |
| newCdf += weights[newIdx] / sum |
| } |
| newIdx++ |
| } |
| } |
| |
| // KullbackLeibler computes the Kullback-Leibler divergence between the |
| // distributions p and q. The natural logarithm is used. |
| // |
| // sum_i(p_i * log(p_i / q_i)) |
| // |
| // Note that the Kullback-Leibler divergence is not symmetric; |
| // KullbackLeibler(p,q) != KullbackLeibler(q,p) |
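| // |
| // A minimal sketch with hypothetical distributions: |
| // |
| //	p := []float64{0.5, 0.5} |
| //	q := []float64{0.25, 0.75} |
| //	kl := KullbackLeibler(p, q) // kl ≈ 0.144 |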
| func KullbackLeibler(p, q []float64) float64 { |
| if len(p) != len(q) { |
| panic("stat: slice length mismatch") |
| } |
| var kl float64 |
| for i, v := range p { |
| if v != 0 { // KL requires 0 * log(0) == 0. |
| kl += v * (math.Log(v) - math.Log(q[i])) |
| } |
| } |
| return kl |
| } |
| |
| // LinearRegression computes the best-fit line |
| // |
| // y = alpha + beta*x |
| // |
| // to the data in x and y with the given weights. If origin is true, the |
| // regression is forced to pass through the origin. |
| // |
| // Specifically, LinearRegression computes the values of alpha and |
| // beta such that the total residual |
| // |
| // \sum_i w[i]*(y[i] - alpha - beta*x[i])^2 |
| // |
| // is minimized. If origin is true, then alpha is forced to be zero. |
| // |
| // The lengths of x and y must be equal. If weights is nil then all of the |
| // weights are 1. If weights is not nil, then len(x) must equal len(weights). |
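| // |
| // For example, with hypothetical data, |
| // |
| //	x := []float64{1, 2, 3} |
| //	y := []float64{2, 4, 6} |
| //	alpha, beta := LinearRegression(x, y, nil, false) // alpha == 0, beta == 2 |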
| func LinearRegression(x, y, weights []float64, origin bool) (alpha, beta float64) { |
| if len(x) != len(y) { |
| panic("stat: slice length mismatch") |
| } |
| if weights != nil && len(weights) != len(x) { |
| panic("stat: slice length mismatch") |
| } |
| |
| w := 1.0 |
| if origin { |
| var x2Sum, xySum float64 |
| for i, xi := range x { |
| if weights != nil { |
| w = weights[i] |
| } |
| yi := y[i] |
| xySum += w * xi * yi |
| x2Sum += w * xi * xi |
| } |
| beta = xySum / x2Sum |
| |
| return 0, beta |
| } |
| |
| xu, xv := MeanVariance(x, weights) |
| yu := Mean(y, weights) |
| cov := covarianceMeans(x, y, weights, xu, yu) |
| beta = cov / xv |
| alpha = yu - beta*xu |
| return alpha, beta |
| } |
| |
| // RSquared returns the coefficient of determination defined as |
| // |
| // R^2 = 1 - \sum_i w[i]*(y[i] - alpha - beta*x[i])^2 / \sum_i w[i]*(y[i] - mean(y))^2 |
| // |
| // for the line |
| // |
| // y = alpha + beta*x |
| // |
| // and the data in x and y with the given weights. |
| // |
| // The lengths of x and y must be equal. If weights is nil then all of the |
| // weights are 1. If weights is not nil, then len(x) must equal len(weights). |
| func RSquared(x, y, weights []float64, alpha, beta float64) float64 { |
| if len(x) != len(y) { |
| panic("stat: slice length mismatch") |
| } |
| if weights != nil && len(weights) != len(x) { |
| panic("stat: slice length mismatch") |
| } |
| |
| w := 1.0 |
| yMean := Mean(y, weights) |
| var res, tot, d float64 |
| for i, xi := range x { |
| if weights != nil { |
| w = weights[i] |
| } |
| yi := y[i] |
| fi := alpha + beta*xi |
| d = yi - fi |
| res += w * d * d |
| d = yi - yMean |
| tot += w * d * d |
| } |
| return 1 - res/tot |
| } |
| |
| // RSquaredFrom returns the coefficient of determination defined as |
| // |
| // R^2 = 1 - \sum_i w[i]*(estimate[i] - value[i])^2 / \sum_i w[i]*(value[i] - mean(values))^2 |
| // |
| // for the estimated values in estimates and the observed values in values, |
| // with the given weights. |
| // |
| // The lengths of estimates and values must be equal. If weights is nil then |
| // all of the weights are 1. If weights is not nil, then len(values) must |
| // equal len(weights). |
| func RSquaredFrom(estimates, values, weights []float64) float64 { |
| if len(estimates) != len(values) { |
| panic("stat: slice length mismatch") |
| } |
| if weights != nil && len(weights) != len(values) { |
| panic("stat: slice length mismatch") |
| } |
| |
| w := 1.0 |
| mean := Mean(values, weights) |
| var res, tot, d float64 |
| for i, val := range values { |
| if weights != nil { |
| w = weights[i] |
| } |
| d = val - estimates[i] |
| res += w * d * d |
| d = val - mean |
| tot += w * d * d |
| } |
| return 1 - res/tot |
| } |
| |
| // RNoughtSquared returns the coefficient of determination defined as |
| // |
| // R₀^2 = \sum_i w[i]*(beta*x[i])^2 / \sum_i w[i]*y[i]^2 |
| // |
| // for the line |
| // |
| // y = beta*x |
| // |
| // and the data in x and y with the given weights. RNoughtSquared should |
| // only be used for best-fit lines regressed through the origin. |
| // |
| // The lengths of x and y must be equal. If weights is nil then all of the |
| // weights are 1. If weights is not nil, then len(x) must equal len(weights). |
| func RNoughtSquared(x, y, weights []float64, beta float64) float64 { |
| if len(x) != len(y) { |
| panic("stat: slice length mismatch") |
| } |
| if weights != nil && len(weights) != len(x) { |
| panic("stat: slice length mismatch") |
| } |
| |
| w := 1.0 |
| var ssr, tot float64 |
| for i, xi := range x { |
| if weights != nil { |
| w = weights[i] |
| } |
| fi := beta * xi |
| ssr += w * fi * fi |
| yi := y[i] |
| tot += w * yi * yi |
| } |
| return ssr / tot |
| } |
| |
| // Mean computes the weighted mean of the data set. |
| // |
| // sum_i {w_i * x_i} / sum_i {w_i} |
| // |
| // If weights is nil then all of the weights are 1. If weights is not nil, then |
| // len(x) must equal len(weights). |
| func Mean(x, weights []float64) float64 { |
| if weights == nil { |
| return floats.Sum(x) / float64(len(x)) |
| } |
| if len(x) != len(weights) { |
| panic("stat: slice length mismatch") |
| } |
| var ( |
| sumValues float64 |
| sumWeights float64 |
| ) |
| for i, w := range weights { |
| sumValues += w * x[i] |
| sumWeights += w |
| } |
| return sumValues / sumWeights |
| } |
| |
| // Mode returns the most common value in the dataset specified by x and the |
| // given weights. Strict float64 equality is used when comparing values, so users |
| // should take caution. If several values are the mode, any of them may be returned. |
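| // |
| // For example, with hypothetical data, |
| // |
| //	v, n := Mode([]float64{1, 2, 2, 3}, nil) // v == 2, n == 2 |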
| func Mode(x, weights []float64) (val float64, count float64) { |
| if weights != nil && len(x) != len(weights) { |
| panic("stat: slice length mismatch") |
| } |
| if len(x) == 0 { |
| return 0, 0 |
| } |
| m := make(map[float64]float64) |
| if weights == nil { |
| for _, v := range x { |
| m[v]++ |
| } |
| } else { |
| for i, v := range x { |
| m[v] += weights[i] |
| } |
| } |
| var ( |
| maxCount float64 |
| max float64 |
| ) |
| for val, count := range m { |
| if count > maxCount { |
| maxCount = count |
| max = val |
| } |
| } |
| return max, maxCount |
| } |
| |
| // BivariateMoment computes the weighted mixed moment between the samples x and y. |
| // |
| // E[(x - μ_x)^r*(y - μ_y)^s] |
| // |
| // No degrees of freedom correction is done. |
| // The lengths of x and y must be equal. If weights is nil then all of the |
| // weights are 1. If weights is not nil, then len(x) must equal len(weights). |
| func BivariateMoment(r, s float64, x, y, weights []float64) float64 { |
| meanX := Mean(x, weights) |
| meanY := Mean(y, weights) |
| if len(x) != len(y) { |
| panic("stat: slice length mismatch") |
| } |
| if weights == nil { |
| var m float64 |
| for i, vx := range x { |
| vy := y[i] |
| m += math.Pow(vx-meanX, r) * math.Pow(vy-meanY, s) |
| } |
| return m / float64(len(x)) |
| } |
| if len(weights) != len(x) { |
| panic("stat: slice length mismatch") |
| } |
| var ( |
| m float64 |
| sumWeights float64 |
| ) |
| for i, vx := range x { |
| vy := y[i] |
| w := weights[i] |
| m += w * math.Pow(vx-meanX, r) * math.Pow(vy-meanY, s) |
| sumWeights += w |
| } |
| return m / sumWeights |
| } |
| |
| // Moment computes the weighted n^th moment of the samples, |
| // |
| // E[(x - μ)^N] |
| // |
| // No degrees of freedom correction is done. |
| // If weights is nil then all of the weights are 1. If weights is not nil, then |
| // len(x) must equal len(weights). |
| func Moment(moment float64, x, weights []float64) float64 { |
| // This also checks that x and weights have the same length. |
| mean := Mean(x, weights) |
| if weights == nil { |
| var m float64 |
| for _, v := range x { |
| m += math.Pow(v-mean, moment) |
| } |
| return m / float64(len(x)) |
| } |
| var ( |
| m float64 |
| sumWeights float64 |
| ) |
| for i, v := range x { |
| w := weights[i] |
| m += w * math.Pow(v-mean, moment) |
| sumWeights += w |
| } |
| return m / sumWeights |
| } |
| |
| // MomentAbout computes the weighted n^th moment of the samples about |
| // the given mean \mu, |
| // |
| // E[(x - μ)^N] |
| // |
| // No degrees of freedom correction is done. |
| // If weights is nil then all of the weights are 1. If weights is not nil, then |
| // len(x) must equal len(weights). |
| func MomentAbout(moment float64, x []float64, mean float64, weights []float64) float64 { |
| if weights == nil { |
| var m float64 |
| for _, v := range x { |
| m += math.Pow(v-mean, moment) |
| } |
| m /= float64(len(x)) |
| return m |
| } |
| if len(weights) != len(x) { |
| panic("stat: slice length mismatch") |
| } |
| var ( |
| m float64 |
| sumWeights float64 |
| ) |
| for i, v := range x { |
| m += weights[i] * math.Pow(v-mean, moment) |
| sumWeights += weights[i] |
| } |
| return m / sumWeights |
| } |
| |
| // Quantile returns the value q such that the fraction p of the samples is less |
| // than or equal to q. The exact behavior is determined by the |
| // CumulantKind, and p should be a number between 0 and 1. Quantile is theoretically |
| // the inverse of the CDF function, though it may not be the actual inverse |
| // for all values of p and CumulantKinds. |
| // |
| // The x data must be sorted in increasing order. If weights is nil then all |
| // of the weights are 1. If weights is not nil, then len(x) must equal len(weights). |
| // Quantile will panic if the length of x is zero. |
| // |
| // CumulantKind behaviors: |
| // - Empirical: Returns the lowest value q for which q is greater than or equal |
| // to the fraction p of samples |
| // - LinInterp: Returns the linearly interpolated value |
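| // |
| // For example, the empirical median of hypothetical data: |
| // |
| //	x := []float64{1, 2, 3, 4, 5} |
| //	m := Quantile(0.5, Empirical, x, nil) // m == 3 |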
| func Quantile(p float64, c CumulantKind, x, weights []float64) float64 { |
| if !(p >= 0 && p <= 1) { |
| panic("stat: percentile out of bounds") |
| } |
| |
| if weights != nil && len(x) != len(weights) { |
| panic("stat: slice length mismatch") |
| } |
| if len(x) == 0 { |
| panic("stat: zero length slice") |
| } |
| if floats.HasNaN(x) { |
| return math.NaN() // This is needed because the algorithm breaks otherwise. |
| } |
| if !sort.Float64sAreSorted(x) { |
| panic("x data are not sorted") |
| } |
| |
| var sumWeights float64 |
| if weights == nil { |
| sumWeights = float64(len(x)) |
| } else { |
| sumWeights = floats.Sum(weights) |
| } |
| switch c { |
| case Empirical: |
| return empiricalQuantile(p, x, weights, sumWeights) |
| case LinInterp: |
| return linInterpQuantile(p, x, weights, sumWeights) |
| default: |
| panic("stat: bad cumulant kind") |
| } |
| } |
| |
| func empiricalQuantile(p float64, x, weights []float64, sumWeights float64) float64 { |
| var cumsum float64 |
| fidx := p * sumWeights |
| for i := range x { |
| if weights == nil { |
| cumsum++ |
| } else { |
| cumsum += weights[i] |
| } |
| if cumsum >= fidx { |
| return x[i] |
| } |
| } |
| panic("impossible") |
| } |
| |
| func linInterpQuantile(p float64, x, weights []float64, sumWeights float64) float64 { |
| var cumsum float64 |
| fidx := p * sumWeights |
| for i := range x { |
| if weights == nil { |
| cumsum++ |
| } else { |
| cumsum += weights[i] |
| } |
| if cumsum >= fidx { |
| if i == 0 { |
| return x[0] |
| } |
| t := cumsum - fidx |
| if weights != nil { |
| t /= weights[i] |
| } |
| return t*x[i-1] + (1-t)*x[i] |
| } |
| } |
| panic("impossible") |
| } |
| |
| // Skew computes the skewness of the sample data. |
| // If weights is nil then all of the weights are 1. If weights is not nil, then |
| // len(x) must equal len(weights). |
| // When weights sum to 1 or less, a biased variance estimator should be used. |
| func Skew(x, weights []float64) float64 { |
| mean, std := MeanStdDev(x, weights) |
| if weights == nil { |
| var s float64 |
| for _, v := range x { |
| z := (v - mean) / std |
| s += z * z * z |
| } |
| return s * skewCorrection(float64(len(x))) |
| } |
| var ( |
| s float64 |
| sumWeights float64 |
| ) |
| for i, v := range x { |
| z := (v - mean) / std |
| s += weights[i] * z * z * z |
| sumWeights += weights[i] |
| } |
| return s * skewCorrection(sumWeights) |
| } |
| |
| // From: http://www.amstat.org/publications/jse/v19n2/doane.pdf page 7 |
| func skewCorrection(n float64) float64 { |
| return (n / (n - 1)) * (1 / (n - 2)) |
| } |
| |
| // SortWeighted rearranges the data in x along with their corresponding |
| // weights so that the x data are sorted. The data is sorted in place. |
| // Weights may be nil, but if weights is non-nil then it must have the same |
| // length as x. |
| func SortWeighted(x, weights []float64) { |
| if weights == nil { |
| sort.Float64s(x) |
| return |
| } |
| if len(x) != len(weights) { |
| panic("stat: slice length mismatch") |
| } |
| sort.Sort(weightSorter{ |
| x: x, |
| w: weights, |
| }) |
| } |
| |
| type weightSorter struct { |
| x []float64 |
| w []float64 |
| } |
| |
| func (w weightSorter) Len() int { return len(w.x) } |
| func (w weightSorter) Less(i, j int) bool { return w.x[i] < w.x[j] } |
| func (w weightSorter) Swap(i, j int) { |
| w.x[i], w.x[j] = w.x[j], w.x[i] |
| w.w[i], w.w[j] = w.w[j], w.w[i] |
| } |
| |
| // SortWeightedLabeled rearranges the data in x along with their |
| // corresponding weights and boolean labels so that the x data are sorted. |
| // The data is sorted in place. Weights and labels may be nil; if either |
| // is non-nil it must have the same length as x. |
| func SortWeightedLabeled(x []float64, labels []bool, weights []float64) { |
| if labels == nil { |
| SortWeighted(x, weights) |
| return |
| } |
| if weights == nil { |
| if len(x) != len(labels) { |
| panic("stat: slice length mismatch") |
| } |
| sort.Sort(labelSorter{ |
| x: x, |
| l: labels, |
| }) |
| return |
| } |
| if len(x) != len(labels) || len(x) != len(weights) { |
| panic("stat: slice length mismatch") |
| } |
| sort.Sort(weightLabelSorter{ |
| x: x, |
| l: labels, |
| w: weights, |
| }) |
| } |
| |
| type labelSorter struct { |
| x []float64 |
| l []bool |
| } |
| |
| func (a labelSorter) Len() int { return len(a.x) } |
| func (a labelSorter) Less(i, j int) bool { return a.x[i] < a.x[j] } |
| func (a labelSorter) Swap(i, j int) { |
| a.x[i], a.x[j] = a.x[j], a.x[i] |
| a.l[i], a.l[j] = a.l[j], a.l[i] |
| } |
| |
| type weightLabelSorter struct { |
| x []float64 |
| l []bool |
| w []float64 |
| } |
| |
| func (a weightLabelSorter) Len() int { return len(a.x) } |
| func (a weightLabelSorter) Less(i, j int) bool { return a.x[i] < a.x[j] } |
| func (a weightLabelSorter) Swap(i, j int) { |
| a.x[i], a.x[j] = a.x[j], a.x[i] |
| a.l[i], a.l[j] = a.l[j], a.l[i] |
| a.w[i], a.w[j] = a.w[j], a.w[i] |
| } |
| |
| // StdDev returns the sample standard deviation. |
| func StdDev(x, weights []float64) float64 { |
| _, std := MeanStdDev(x, weights) |
| return std |
| } |
| |
| // MeanStdDev returns the sample mean and unbiased standard deviation. |
| // When weights sum to 1 or less, a biased variance estimator should be used. |
| func MeanStdDev(x, weights []float64) (mean, std float64) { |
| mean, variance := MeanVariance(x, weights) |
| return mean, math.Sqrt(variance) |
| } |
| |
| // StdErr returns the standard error in the mean with the given values. |
| func StdErr(std, sampleSize float64) float64 { |
| return std / math.Sqrt(sampleSize) |
| } |
| |
| // StdScore returns the standard score (a.k.a. z-score, z-value) for the value x |
| // with the given mean and standard deviation, i.e. |
| // |
| // (x - mean) / std |
| func StdScore(x, mean, std float64) float64 { |
| return (x - mean) / std |
| } |
| |
| // Variance computes the unbiased weighted sample variance: |
| // |
| // \sum_i w_i (x_i - mean)^2 / (sum_i w_i - 1) |
| // |
| // If weights is nil then all of the weights are 1. If weights is not nil, then |
| // len(x) must equal len(weights). |
| // When weights sum to 1 or less, a biased variance estimator should be used. |
| func Variance(x, weights []float64) float64 { |
| _, variance := MeanVariance(x, weights) |
| return variance |
| } |
| |
| // MeanVariance computes the sample mean and unbiased variance, where the mean and variance are |
| // |
| // \sum_i w_i * x_i / (sum_i w_i) |
| // \sum_i w_i (x_i - mean)^2 / (sum_i w_i - 1) |
| // |
| // respectively. |
| // If weights is nil then all of the weights are 1. If weights is not nil, then |
| // len(x) must equal len(weights). |
| // When weights sum to 1 or less, a biased variance estimator should be used. |
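| // |
| // For example, with hypothetical data, |
| // |
| //	mean, variance := MeanVariance([]float64{1, 2, 3, 4}, nil) // mean == 2.5, variance ≈ 1.67 |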
| func MeanVariance(x, weights []float64) (mean, variance float64) { |
| var ( |
| unnormalisedVariance float64 |
| sumWeights float64 |
| ) |
| mean, unnormalisedVariance, sumWeights = meanUnnormalisedVarianceSumWeights(x, weights) |
| return mean, unnormalisedVariance / (sumWeights - 1) |
| } |
| |
| // PopMeanVariance computes the sample mean and biased variance (also known as |
| // "population variance"), where the mean and variance are |
| // |
| // \sum_i w_i * x_i / (sum_i w_i) |
| // \sum_i w_i (x_i - mean)^2 / (sum_i w_i) |
| // |
| // respectively. |
| // If weights is nil then all of the weights are 1. If weights is not nil, then |
| // len(x) must equal len(weights). |
| func PopMeanVariance(x, weights []float64) (mean, variance float64) { |
| var ( |
| unnormalisedVariance float64 |
| sumWeights float64 |
| ) |
| mean, unnormalisedVariance, sumWeights = meanUnnormalisedVarianceSumWeights(x, weights) |
| return mean, unnormalisedVariance / sumWeights |
| } |
| |
| // PopMeanStdDev returns the sample mean and biased standard deviation |
| // (also known as "population standard deviation"). |
| func PopMeanStdDev(x, weights []float64) (mean, std float64) { |
| mean, variance := PopMeanVariance(x, weights) |
| return mean, math.Sqrt(variance) |
| } |
| |
| // PopStdDev returns the population standard deviation, i.e., a square root |
| // of the biased variance estimate. |
| func PopStdDev(x, weights []float64) float64 { |
| _, stDev := PopMeanStdDev(x, weights) |
| return stDev |
| } |
| |
| // PopVariance computes the biased weighted sample variance, also known as the |
| // population variance: |
| // |
| // \sum_i w_i (x_i - mean)^2 / (sum_i w_i) |
| // |
| // If weights is nil then all of the weights are 1. If weights is not nil, then |
| // len(x) must equal len(weights). |
| func PopVariance(x, weights []float64) float64 { |
| _, variance := PopMeanVariance(x, weights) |
| return variance |
| } |
| |
| func meanUnnormalisedVarianceSumWeights(x, weights []float64) (mean, unnormalisedVariance, sumWeights float64) { |
| // This uses the corrected two-pass algorithm (1.7), from "Algorithms for computing |
| // the sample variance: Analysis and recommendations" by Chan, Tony F., Gene H. Golub, |
| // and Randall J. LeVeque. |
| |
| // Note that this will panic if the slice lengths do not match. |
| mean = Mean(x, weights) |
| var ( |
| ss float64 |
| compensation float64 |
| ) |
| if weights == nil { |
| for _, v := range x { |
| d := v - mean |
| ss += d * d |
| compensation += d |
| } |
| unnormalisedVariance = (ss - compensation*compensation/float64(len(x))) |
| return mean, unnormalisedVariance, float64(len(x)) |
| } |
| |
| for i, v := range x { |
| w := weights[i] |
| d := v - mean |
| wd := w * d |
| ss += wd * d |
| compensation += wd |
| sumWeights += w |
| } |
| unnormalisedVariance = (ss - compensation*compensation/sumWeights) |
| return mean, unnormalisedVariance, sumWeights |
| } |