kortschak | 805531d | 2017-11-01 10:30:53 +1030 | [diff] [blame] | 1 | // Copyright ©2016 The Gonum Authors. All rights reserved. |
Armadilloa16 | c6af72d | 2016-08-29 11:08:22 +0930 | [diff] [blame] | 2 | // Use of this source code is governed by a BSD-style |
| 3 | // license that can be found in the LICENSE file. |
| 4 | |
| 5 | package stat_test |
| 6 | |
| 7 | import ( |
| 8 | "fmt" |
| 9 | "log" |
| 10 | |
Brendan Tracey | d33397a | 2017-05-23 00:03:03 -0600 | [diff] [blame] | 11 | "gonum.org/v1/gonum/floats" |
Brendan Tracey | 0d63974 | 2017-06-13 10:28:21 -0600 | [diff] [blame] | 12 | "gonum.org/v1/gonum/mat" |
Brendan Tracey | d33397a | 2017-05-23 00:03:03 -0600 | [diff] [blame] | 13 | "gonum.org/v1/gonum/stat" |
Armadilloa16 | c6af72d | 2016-08-29 11:08:22 +0930 | [diff] [blame] | 14 | ) |
| 15 | |
| 16 | // symView is a helper for getting a View of a SymDense. |
| 17 | type symView struct { |
Brendan Tracey | 0d63974 | 2017-06-13 10:28:21 -0600 | [diff] [blame] | 18 | sym *mat.SymDense |
Armadilloa16 | c6af72d | 2016-08-29 11:08:22 +0930 | [diff] [blame] | 19 | |
| 20 | i, j, r, c int |
| 21 | } |
| 22 | |
| 23 | func (s symView) Dims() (r, c int) { return s.r, s.c } |
| 24 | |
| 25 | func (s symView) At(i, j int) float64 { |
| 26 | if i < 0 || s.r <= i { |
| 27 | panic("i out of bounds") |
| 28 | } |
| 29 | if j < 0 || s.c <= j { |
| 30 | panic("j out of bounds") |
| 31 | } |
| 32 | return s.sym.At(s.i+i, s.j+j) |
| 33 | } |
| 34 | |
Dan Kortschak | 212233d | 2019-11-21 12:39:19 +1030 | [diff] [blame] | 35 | func (s symView) T() mat.Matrix { return mat.Transpose{Matrix: s} } |
Armadilloa16 | c6af72d | 2016-08-29 11:08:22 +0930 | [diff] [blame] | 36 | |
kortschak | ed8a296 | 2017-03-17 18:23:36 +1030 | [diff] [blame] | 37 | func ExampleCC() { |
Armadilloa16 | c6af72d | 2016-08-29 11:08:22 +0930 | [diff] [blame] | 38 | // This example is directly analogous to Example 3.5 on page 87 of |
| 39 | // Koch, Inge. Analysis of multivariate and high-dimensional data. |
| 40 | // Vol. 32. Cambridge University Press, 2013. ISBN: 9780521887939 |
| 41 | |
| 42 | // bostonData is the Boston Housing Data of Harrison and Rubinfeld (1978) |
| 43 | n, _ := bostonData.Dims() |
| 44 | var xd, yd = 7, 4 |
| 45 | // The variables (columns) of bostonData can be partitioned into two sets: |
| 46 | // those that deal with environmental/social variables (xdata), and those |
| 47 | // that contain information regarding the individual (ydata). Because the |
| 48 | // variables can be naturally partitioned in this way, these data are |
| 49 | // appropriate for canonical correlation analysis. The columns (variables) |
| 50 | // of xdata are, in order: |
| 51 | // per capita crime rate by town, |
| 52 | // proportion of non-retail business acres per town, |
| 53 | // nitric oxide concentration (parts per 10 million), |
| 54 | // weighted distances to Boston employment centres, |
| 55 | // index of accessibility to radial highways, |
| 56 | // pupil-teacher ratio by town, and |
| 57 | // proportion of blacks by town. |
| 58 | xdata := bostonData.Slice(0, n, 0, xd) |
| 59 | |
| 60 | // The columns (variables) of ydata are, in order: |
| 61 | // average number of rooms per dwelling, |
| 62 | // proportion of owner-occupied units built prior to 1940, |
| 63 | // full-value property-tax rate per $10000, and |
| 64 | // median value of owner-occupied homes in $1000s. |
| 65 | ydata := bostonData.Slice(0, n, xd, xd+yd) |
| 66 | |
| 67 | // For comparison, calculate the correlation matrix for the original data. |
Brendan Tracey | 0d63974 | 2017-06-13 10:28:21 -0600 | [diff] [blame] | 68 | var cor mat.SymDense |
Armadilloa16 | c6af72d | 2016-08-29 11:08:22 +0930 | [diff] [blame] | 69 | stat.CorrelationMatrix(&cor, bostonData, nil) |
| 70 | |
| 71 | // Extract just those correlations that are between xdata and ydata. |
| 72 | var corRaw = symView{sym: &cor, i: 0, j: xd, r: xd, c: yd} |
| 73 | |
| 74 | // Note that the strongest correlation between individual variables is 0.91 |
| 75 | // between the 5th variable of xdata (index of accessibility to radial |
| 76 | // highways) and the 3rd variable of ydata (full-value property-tax rate per |
| 77 | // $10000). |
Brendan Tracey | 0d63974 | 2017-06-13 10:28:21 -0600 | [diff] [blame] | 78 | fmt.Printf("corRaw = %.4f", mat.Formatted(corRaw, mat.Prefix(" "))) |
Armadilloa16 | c6af72d | 2016-08-29 11:08:22 +0930 | [diff] [blame] | 79 | |
| 80 | // Calculate the canonical correlations. |
kortschak | 8ae4d02 | 2016-12-19 12:26:36 +1030 | [diff] [blame] | 81 | var cc stat.CC |
| 82 | err := cc.CanonicalCorrelations(xdata, ydata, nil) |
Armadilloa16 | c6af72d | 2016-08-29 11:08:22 +0930 | [diff] [blame] | 83 | if err != nil { |
| 84 | log.Fatal(err) |
| 85 | } |
| 86 | |
| 87 | // Unpack cc. |
Brendan Tracey | 5d5638e | 2019-10-09 23:20:26 +0100 | [diff] [blame] | 88 | var pVecs, qVecs, phiVs, psiVs mat.Dense |
kortschak | 03a2dba | 2017-06-21 14:00:44 +0930 | [diff] [blame] | 89 | ccors := cc.CorrsTo(nil) |
Brendan Tracey | 5d5638e | 2019-10-09 23:20:26 +0100 | [diff] [blame] | 90 | cc.LeftTo(&pVecs, true) |
| 91 | cc.RightTo(&qVecs, true) |
| 92 | cc.LeftTo(&phiVs, false) |
| 93 | cc.RightTo(&psiVs, false) |
Armadilloa16 | c6af72d | 2016-08-29 11:08:22 +0930 | [diff] [blame] | 94 | |
| 95 | // Canonical Correlation Matrix, or the correlations between the sphered |
| 96 | // data. |
Brendan Tracey | 0d63974 | 2017-06-13 10:28:21 -0600 | [diff] [blame] | 97 | var corSph mat.Dense |
Brendan Tracey | 5d5638e | 2019-10-09 23:20:26 +0100 | [diff] [blame] | 98 | corSph.CloneFrom(&pVecs) |
Armadilloa16 | c6af72d | 2016-08-29 11:08:22 +0930 | [diff] [blame] | 99 | col := make([]float64, xd) |
| 100 | for j := 0; j < yd; j++ { |
Brendan Tracey | 0d63974 | 2017-06-13 10:28:21 -0600 | [diff] [blame] | 101 | mat.Col(col, j, &corSph) |
Armadilloa16 | c6af72d | 2016-08-29 11:08:22 +0930 | [diff] [blame] | 102 | floats.Scale(ccors[j], col) |
| 103 | corSph.SetCol(j, col) |
| 104 | } |
| 105 | corSph.Product(&corSph, qVecs.T()) |
Brendan Tracey | 0d63974 | 2017-06-13 10:28:21 -0600 | [diff] [blame] | 106 | fmt.Printf("\n\ncorSph = %.4f", mat.Formatted(&corSph, mat.Prefix(" "))) |
Armadilloa16 | c6af72d | 2016-08-29 11:08:22 +0930 | [diff] [blame] | 107 | |
| 108 | // Canonical Correlations. Note that the first canonical correlation is |
| 109 | // 0.95, stronger than the greatest correlation in the original data, and |
| 110 | // much stronger than the greatest correlation in the sphered data. |
| 111 | fmt.Printf("\n\nccors = %.4f", ccors) |
| 112 | |
| 113 | // Left and right eigenvectors of the canonical correlation matrix. |
Brendan Tracey | 5d5638e | 2019-10-09 23:20:26 +0100 | [diff] [blame] | 114 | fmt.Printf("\n\npVecs = %.4f", mat.Formatted(&pVecs, mat.Prefix(" "))) |
| 115 | fmt.Printf("\n\nqVecs = %.4f", mat.Formatted(&qVecs, mat.Prefix(" "))) |
Armadilloa16 | c6af72d | 2016-08-29 11:08:22 +0930 | [diff] [blame] | 116 | |
| 117 | // Canonical Correlation Transforms. These can be useful as they represent |
| 118 | // the canonical variables as linear combinations of the original variables. |
Brendan Tracey | 5d5638e | 2019-10-09 23:20:26 +0100 | [diff] [blame] | 119 | fmt.Printf("\n\nphiVs = %.4f", mat.Formatted(&phiVs, mat.Prefix(" "))) |
| 120 | fmt.Printf("\n\npsiVs = %.4f", mat.Formatted(&psiVs, mat.Prefix(" "))) |
Armadilloa16 | c6af72d | 2016-08-29 11:08:22 +0930 | [diff] [blame] | 121 | |
| 122 | // Output: |
| 123 | // corRaw = ⎡-0.2192 0.3527 0.5828 -0.3883⎤ |
| 124 | // ⎢-0.3917 0.6448 0.7208 -0.4837⎥ |
| 125 | // ⎢-0.3022 0.7315 0.6680 -0.4273⎥ |
| 126 | // ⎢ 0.2052 -0.7479 -0.5344 0.2499⎥ |
| 127 | // ⎢-0.2098 0.4560 0.9102 -0.3816⎥ |
| 128 | // ⎢-0.3555 0.2615 0.4609 -0.5078⎥ |
| 129 | // ⎣ 0.1281 -0.2735 -0.4418 0.3335⎦ |
| 130 | // |
| 131 | // corSph = ⎡ 0.0118 0.0525 0.2300 -0.1363⎤ |
| 132 | // ⎢-0.1810 0.3213 0.3814 -0.1412⎥ |
| 133 | // ⎢ 0.0166 0.2241 0.0104 -0.2235⎥ |
| 134 | // ⎢ 0.0346 -0.5481 -0.0034 -0.1994⎥ |
| 135 | // ⎢ 0.0303 -0.0956 0.7152 0.2039⎥ |
| 136 | // ⎢-0.0298 -0.0022 0.0739 -0.3703⎥ |
| 137 | // ⎣-0.1226 -0.0746 -0.3899 0.1541⎦ |
| 138 | // |
| 139 | // ccors = [0.9451 0.6787 0.5714 0.2010] |
| 140 | // |
Vladimir Chalupecky | 7a88995 | 2021-03-16 21:54:37 +0100 | [diff] [blame] | 141 | // pVecs = ⎡-0.2574 -0.0158 -0.2122 -0.0946⎤ |
| 142 | // ⎢-0.4837 -0.3837 -0.1474 0.6597⎥ |
| 143 | // ⎢-0.0801 -0.3494 -0.3287 -0.2862⎥ |
| 144 | // ⎢ 0.1278 0.7337 -0.4851 0.2248⎥ |
| 145 | // ⎢-0.6969 0.4342 0.3603 0.0291⎥ |
| 146 | // ⎢-0.0991 -0.0503 -0.6384 0.1022⎥ |
| 147 | // ⎣ 0.4260 -0.0323 0.2290 0.6419⎦ |
Armadilloa16 | c6af72d | 2016-08-29 11:08:22 +0930 | [diff] [blame] | 148 | // |
Vladimir Chalupecky | 7a88995 | 2021-03-16 21:54:37 +0100 | [diff] [blame] | 149 | // qVecs = ⎡ 0.0182 0.1583 0.0067 -0.9872⎤ |
| 150 | // ⎢-0.2348 -0.9483 0.1462 -0.1554⎥ |
| 151 | // ⎢-0.9701 0.2406 0.0252 0.0209⎥ |
| 152 | // ⎣ 0.0593 0.1330 0.9889 0.0291⎦ |
Armadilloa16 | c6af72d | 2016-08-29 11:08:22 +0930 | [diff] [blame] | 153 | // |
Vladimir Chalupecky | 7a88995 | 2021-03-16 21:54:37 +0100 | [diff] [blame] | 154 | // phiVs = ⎡-0.0027 -0.0093 -0.0490 -0.0155⎤ |
| 155 | // ⎢-0.0429 0.0242 -0.0361 0.1839⎥ |
| 156 | // ⎢-1.2248 -5.6031 -5.8094 -4.7927⎥ |
| 157 | // ⎢-0.0044 0.3424 -0.4470 0.1150⎥ |
| 158 | // ⎢-0.0742 0.1193 0.1116 0.0022⎥ |
| 159 | // ⎢-0.0233 -0.1046 -0.3853 -0.0161⎥ |
| 160 | // ⎣ 0.0001 -0.0005 0.0030 0.0082⎦ |
Armadilloa16 | c6af72d | 2016-08-29 11:08:22 +0930 | [diff] [blame] | 161 | // |
Vladimir Chalupecky | 7a88995 | 2021-03-16 21:54:37 +0100 | [diff] [blame] | 162 | // psiVs = ⎡ 0.0302 0.3002 -0.0878 -1.9583⎤ |
| 163 | // ⎢-0.0065 -0.0392 0.0118 -0.0061⎥ |
| 164 | // ⎢-0.0052 0.0046 0.0023 0.0008⎥ |
| 165 | // ⎣ 0.0020 -0.0037 0.1293 0.1038⎦ |
Armadilloa16 | c6af72d | 2016-08-29 11:08:22 +0930 | [diff] [blame] | 166 | } |