blob: ef9bfb6735dd6f2d44c7248dd7af53292065fc3c [file] [log] [blame]
kortschak805531d2017-11-01 10:30:53 +10301// Copyright ©2016 The Gonum Authors. All rights reserved.
Armadilloa16c6af72d2016-08-29 11:08:22 +09302// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5package stat_test
6
7import (
8 "fmt"
9 "log"
10
Brendan Traceyd33397a2017-05-23 00:03:03 -060011 "gonum.org/v1/gonum/floats"
Brendan Tracey0d639742017-06-13 10:28:21 -060012 "gonum.org/v1/gonum/mat"
Brendan Traceyd33397a2017-05-23 00:03:03 -060013 "gonum.org/v1/gonum/stat"
Armadilloa16c6af72d2016-08-29 11:08:22 +093014)
15
16// symView is a helper for getting a View of a SymDense.
17type symView struct {
Brendan Tracey0d639742017-06-13 10:28:21 -060018 sym *mat.SymDense
Armadilloa16c6af72d2016-08-29 11:08:22 +093019
20 i, j, r, c int
21}
22
23func (s symView) Dims() (r, c int) { return s.r, s.c }
24
25func (s symView) At(i, j int) float64 {
26 if i < 0 || s.r <= i {
27 panic("i out of bounds")
28 }
29 if j < 0 || s.c <= j {
30 panic("j out of bounds")
31 }
32 return s.sym.At(s.i+i, s.j+j)
33}
34
Dan Kortschak212233d2019-11-21 12:39:19 +103035func (s symView) T() mat.Matrix { return mat.Transpose{Matrix: s} }
Armadilloa16c6af72d2016-08-29 11:08:22 +093036
kortschaked8a2962017-03-17 18:23:36 +103037func ExampleCC() {
Armadilloa16c6af72d2016-08-29 11:08:22 +093038 // This example is directly analogous to Example 3.5 on page 87 of
39 // Koch, Inge. Analysis of multivariate and high-dimensional data.
40 // Vol. 32. Cambridge University Press, 2013. ISBN: 9780521887939
41
42 // bostonData is the Boston Housing Data of Harrison and Rubinfeld (1978)
43 n, _ := bostonData.Dims()
44 var xd, yd = 7, 4
45 // The variables (columns) of bostonData can be partitioned into two sets:
46 // those that deal with environmental/social variables (xdata), and those
47 // that contain information regarding the individual (ydata). Because the
48 // variables can be naturally partitioned in this way, these data are
49 // appropriate for canonical correlation analysis. The columns (variables)
50 // of xdata are, in order:
51 // per capita crime rate by town,
52 // proportion of non-retail business acres per town,
53 // nitric oxide concentration (parts per 10 million),
54 // weighted distances to Boston employment centres,
55 // index of accessibility to radial highways,
56 // pupil-teacher ratio by town, and
57 // proportion of blacks by town.
58 xdata := bostonData.Slice(0, n, 0, xd)
59
60 // The columns (variables) of ydata are, in order:
61 // average number of rooms per dwelling,
62 // proportion of owner-occupied units built prior to 1940,
63 // full-value property-tax rate per $10000, and
64 // median value of owner-occupied homes in $1000s.
65 ydata := bostonData.Slice(0, n, xd, xd+yd)
66
67 // For comparison, calculate the correlation matrix for the original data.
Brendan Tracey0d639742017-06-13 10:28:21 -060068 var cor mat.SymDense
Armadilloa16c6af72d2016-08-29 11:08:22 +093069 stat.CorrelationMatrix(&cor, bostonData, nil)
70
71 // Extract just those correlations that are between xdata and ydata.
72 var corRaw = symView{sym: &cor, i: 0, j: xd, r: xd, c: yd}
73
74 // Note that the strongest correlation between individual variables is 0.91
75 // between the 5th variable of xdata (index of accessibility to radial
76 // highways) and the 3rd variable of ydata (full-value property-tax rate per
77 // $10000).
Brendan Tracey0d639742017-06-13 10:28:21 -060078 fmt.Printf("corRaw = %.4f", mat.Formatted(corRaw, mat.Prefix(" ")))
Armadilloa16c6af72d2016-08-29 11:08:22 +093079
80 // Calculate the canonical correlations.
kortschak8ae4d022016-12-19 12:26:36 +103081 var cc stat.CC
82 err := cc.CanonicalCorrelations(xdata, ydata, nil)
Armadilloa16c6af72d2016-08-29 11:08:22 +093083 if err != nil {
84 log.Fatal(err)
85 }
86
87 // Unpack cc.
Brendan Tracey5d5638e2019-10-09 23:20:26 +010088 var pVecs, qVecs, phiVs, psiVs mat.Dense
kortschak03a2dba2017-06-21 14:00:44 +093089 ccors := cc.CorrsTo(nil)
Brendan Tracey5d5638e2019-10-09 23:20:26 +010090 cc.LeftTo(&pVecs, true)
91 cc.RightTo(&qVecs, true)
92 cc.LeftTo(&phiVs, false)
93 cc.RightTo(&psiVs, false)
Armadilloa16c6af72d2016-08-29 11:08:22 +093094
95 // Canonical Correlation Matrix, or the correlations between the sphered
96 // data.
Brendan Tracey0d639742017-06-13 10:28:21 -060097 var corSph mat.Dense
Brendan Tracey5d5638e2019-10-09 23:20:26 +010098 corSph.CloneFrom(&pVecs)
Armadilloa16c6af72d2016-08-29 11:08:22 +093099 col := make([]float64, xd)
100 for j := 0; j < yd; j++ {
Brendan Tracey0d639742017-06-13 10:28:21 -0600101 mat.Col(col, j, &corSph)
Armadilloa16c6af72d2016-08-29 11:08:22 +0930102 floats.Scale(ccors[j], col)
103 corSph.SetCol(j, col)
104 }
105 corSph.Product(&corSph, qVecs.T())
Brendan Tracey0d639742017-06-13 10:28:21 -0600106 fmt.Printf("\n\ncorSph = %.4f", mat.Formatted(&corSph, mat.Prefix(" ")))
Armadilloa16c6af72d2016-08-29 11:08:22 +0930107
108 // Canonical Correlations. Note that the first canonical correlation is
109 // 0.95, stronger than the greatest correlation in the original data, and
110 // much stronger than the greatest correlation in the sphered data.
111 fmt.Printf("\n\nccors = %.4f", ccors)
112
113 // Left and right eigenvectors of the canonical correlation matrix.
Brendan Tracey5d5638e2019-10-09 23:20:26 +0100114 fmt.Printf("\n\npVecs = %.4f", mat.Formatted(&pVecs, mat.Prefix(" ")))
115 fmt.Printf("\n\nqVecs = %.4f", mat.Formatted(&qVecs, mat.Prefix(" ")))
Armadilloa16c6af72d2016-08-29 11:08:22 +0930116
117 // Canonical Correlation Transforms. These can be useful as they represent
118 // the canonical variables as linear combinations of the original variables.
Brendan Tracey5d5638e2019-10-09 23:20:26 +0100119 fmt.Printf("\n\nphiVs = %.4f", mat.Formatted(&phiVs, mat.Prefix(" ")))
120 fmt.Printf("\n\npsiVs = %.4f", mat.Formatted(&psiVs, mat.Prefix(" ")))
Armadilloa16c6af72d2016-08-29 11:08:22 +0930121
122 // Output:
123 // corRaw = ⎡-0.2192 0.3527 0.5828 -0.3883⎤
124 // ⎢-0.3917 0.6448 0.7208 -0.4837⎥
125 // ⎢-0.3022 0.7315 0.6680 -0.4273⎥
126 // ⎢ 0.2052 -0.7479 -0.5344 0.2499⎥
127 // ⎢-0.2098 0.4560 0.9102 -0.3816⎥
128 // ⎢-0.3555 0.2615 0.4609 -0.5078⎥
129 // ⎣ 0.1281 -0.2735 -0.4418 0.3335⎦
130 //
131 // corSph = ⎡ 0.0118 0.0525 0.2300 -0.1363⎤
132 // ⎢-0.1810 0.3213 0.3814 -0.1412⎥
133 // ⎢ 0.0166 0.2241 0.0104 -0.2235⎥
134 // ⎢ 0.0346 -0.5481 -0.0034 -0.1994⎥
135 // ⎢ 0.0303 -0.0956 0.7152 0.2039⎥
136 // ⎢-0.0298 -0.0022 0.0739 -0.3703⎥
137 // ⎣-0.1226 -0.0746 -0.3899 0.1541⎦
138 //
139 // ccors = [0.9451 0.6787 0.5714 0.2010]
140 //
Vladimir Chalupecky7a889952021-03-16 21:54:37 +0100141 // pVecs = ⎡-0.2574 -0.0158 -0.2122 -0.0946⎤
142 // ⎢-0.4837 -0.3837 -0.1474 0.6597⎥
143 // ⎢-0.0801 -0.3494 -0.3287 -0.2862⎥
144 // ⎢ 0.1278 0.7337 -0.4851 0.2248⎥
145 // ⎢-0.6969 0.4342 0.3603 0.0291⎥
146 // ⎢-0.0991 -0.0503 -0.6384 0.1022⎥
147 // ⎣ 0.4260 -0.0323 0.2290 0.6419⎦
Armadilloa16c6af72d2016-08-29 11:08:22 +0930148 //
Vladimir Chalupecky7a889952021-03-16 21:54:37 +0100149 // qVecs = ⎡ 0.0182 0.1583 0.0067 -0.9872⎤
150 // ⎢-0.2348 -0.9483 0.1462 -0.1554⎥
151 // ⎢-0.9701 0.2406 0.0252 0.0209⎥
152 // ⎣ 0.0593 0.1330 0.9889 0.0291⎦
Armadilloa16c6af72d2016-08-29 11:08:22 +0930153 //
Vladimir Chalupecky7a889952021-03-16 21:54:37 +0100154 // phiVs = ⎡-0.0027 -0.0093 -0.0490 -0.0155⎤
155 // ⎢-0.0429 0.0242 -0.0361 0.1839⎥
156 // ⎢-1.2248 -5.6031 -5.8094 -4.7927⎥
157 // ⎢-0.0044 0.3424 -0.4470 0.1150⎥
158 // ⎢-0.0742 0.1193 0.1116 0.0022⎥
159 // ⎢-0.0233 -0.1046 -0.3853 -0.0161⎥
160 // ⎣ 0.0001 -0.0005 0.0030 0.0082⎦
Armadilloa16c6af72d2016-08-29 11:08:22 +0930161 //
Vladimir Chalupecky7a889952021-03-16 21:54:37 +0100162 // psiVs = ⎡ 0.0302 0.3002 -0.0878 -1.9583⎤
163 // ⎢-0.0065 -0.0392 0.0118 -0.0061⎥
164 // ⎢-0.0052 0.0046 0.0023 0.0008⎥
165 // ⎣ 0.0020 -0.0037 0.1293 0.1038⎦
Armadilloa16c6af72d2016-08-29 11:08:22 +0930166}