| // Copyright ©2016 The gonum Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| package gonum |
| |
| import ( |
| "math" |
| |
| "gonum.org/v1/gonum/blas" |
| "gonum.org/v1/gonum/blas/blas64" |
| ) |
| |
| // Dlaqr5 performs a single small-bulge multi-shift QR sweep on an isolated |
| // block of a Hessenberg matrix. |
| // |
| // wantt and wantz determine whether the quasi-triangular Schur factor and the |
| // orthogonal Schur factor, respectively, will be computed. |
| // |
| // kacc22 specifies the computation mode of far-from-diagonal orthogonal |
| // updates. Permitted values are: |
| // 0: Dlaqr5 will not accumulate reflections and will not use matrix-matrix |
| // multiply to update far-from-diagonal matrix entries. |
| // 1: Dlaqr5 will accumulate reflections and use matrix-matrix multiply to |
| // update far-from-diagonal matrix entries. |
| // 2: Dlaqr5 will accumulate reflections, use matrix-matrix multiply to update |
| // far-from-diagonal matrix entries, and take advantage of 2×2 block |
| // structure during matrix multiplies. |
| // For other values of kacc2 Dlaqr5 will panic. |
| // |
| // n is the order of the Hessenberg matrix H. |
| // |
| // ktop and kbot are indices of the first and last row and column of an isolated |
| // diagonal block upon which the QR sweep will be applied. It must hold that |
| // ktop == 0, or 0 < ktop <= n-1 and H[ktop, ktop-1] == 0, and |
| // kbot == n-1, or 0 <= kbot < n-1 and H[kbot+1, kbot] == 0, |
| // otherwise Dlaqr5 will panic. |
| // |
| // nshfts is the number of simultaneous shifts. It must be positive and even, |
| // otherwise Dlaqr5 will panic. |
| // |
| // sr and si contain the real and imaginary parts, respectively, of the shifts |
| // of origin that define the multi-shift QR sweep. On return both slices may be |
| // reordered by Dlaqr5. Their length must be equal to nshfts, otherwise Dlaqr5 |
| // will panic. |
| // |
| // h and ldh represent the Hessenberg matrix H of size n×n. On return |
| // multi-shift QR sweep with shifts sr+i*si has been applied to the isolated |
| // diagonal block in rows and columns ktop through kbot, inclusive. |
| // |
| // iloz and ihiz specify the rows of Z to which transformations will be applied |
| // if wantz is true. It must hold that 0 <= iloz <= ihiz < n, otherwise Dlaqr5 |
| // will panic. |
| // |
| // z and ldz represent the matrix Z of size n×n. If wantz is true, the QR sweep |
| // orthogonal similarity transformation is accumulated into |
| // z[iloz:ihiz,iloz:ihiz] from the right, otherwise z not referenced. |
| // |
| // v and ldv represent an auxiliary matrix V of size (nshfts/2)×3. Note that V |
| // is transposed with respect to the reference netlib implementation. |
| // |
| // u and ldu represent an auxiliary matrix of size (3*nshfts-3)×(3*nshfts-3). |
| // |
| // wh and ldwh represent an auxiliary matrix of size (3*nshfts-3)×nh. |
| // |
| // wv and ldwv represent an auxiliary matrix of size nv×(3*nshfts-3). |
| // |
| // Dlaqr5 is an internal routine. It is exported for testing purposes. |
| func (impl Implementation) Dlaqr5(wantt, wantz bool, kacc22 int, n, ktop, kbot, nshfts int, sr, si []float64, h []float64, ldh int, iloz, ihiz int, z []float64, ldz int, v []float64, ldv int, u []float64, ldu int, nv int, wv []float64, ldwv int, nh int, wh []float64, ldwh int) { |
| checkMatrix(n, n, h, ldh) |
| if ktop < 0 || n <= ktop { |
| panic("lapack: invalid value of ktop") |
| } |
| if ktop > 0 && h[ktop*ldh+ktop-1] != 0 { |
| panic("lapack: diagonal block is not isolated") |
| } |
| if kbot < 0 || n <= kbot { |
| panic("lapack: invalid value of kbot") |
| } |
| if kbot < n-1 && h[(kbot+1)*ldh+kbot] != 0 { |
| panic("lapack: diagonal block is not isolated") |
| } |
| if nshfts < 0 || nshfts&0x1 != 0 { |
| panic("lapack: invalid number of shifts") |
| } |
| if len(sr) != nshfts || len(si) != nshfts { |
| panic(badSlice) // TODO(vladimir-ch) Another message? |
| } |
| if wantz { |
| if ihiz >= n { |
| panic("lapack: invalid value of ihiz") |
| } |
| if iloz < 0 || ihiz < iloz { |
| panic("lapack: invalid value of iloz") |
| } |
| checkMatrix(n, n, z, ldz) |
| } |
| checkMatrix(nshfts/2, 3, v, ldv) // Transposed w.r.t. lapack. |
| checkMatrix(3*nshfts-3, 3*nshfts-3, u, ldu) |
| checkMatrix(nv, 3*nshfts-3, wv, ldwv) |
| checkMatrix(3*nshfts-3, nh, wh, ldwh) |
| if kacc22 != 0 && kacc22 != 1 && kacc22 != 2 { |
| panic("lapack: invalid value of kacc22") |
| } |
| |
| // If there are no shifts, then there is nothing to do. |
| if nshfts < 2 { |
| return |
| } |
| // If the active block is empty or 1×1, then there is nothing to do. |
| if ktop >= kbot { |
| return |
| } |
| |
| // Shuffle shifts into pairs of real shifts and pairs of complex |
| // conjugate shifts assuming complex conjugate shifts are already |
| // adjacent to one another. |
| for i := 0; i < nshfts-2; i += 2 { |
| if si[i] == -si[i+1] { |
| continue |
| } |
| sr[i], sr[i+1], sr[i+2] = sr[i+1], sr[i+2], sr[i] |
| si[i], si[i+1], si[i+2] = si[i+1], si[i+2], si[i] |
| } |
| |
| // Note: lapack says that nshfts must be even but allows it to be odd |
| // anyway. We panic above if nshfts is not even, so reducing it by one |
| // is unnecessary. The only caller Dlaqr04 uses only even nshfts. |
| // |
| // The original comment and code from lapack-3.6.0/SRC/dlaqr5.f:341: |
| // * ==== NSHFTS is supposed to be even, but if it is odd, |
| // * . then simply reduce it by one. The shuffle above |
| // * . ensures that the dropped shift is real and that |
| // * . the remaining shifts are paired. ==== |
| // * |
| // NS = NSHFTS - MOD( NSHFTS, 2 ) |
| ns := nshfts |
| |
| safmin := dlamchS |
| ulp := dlamchP |
| smlnum := safmin * float64(n) / ulp |
| |
| // Use accumulated reflections to update far-from-diagonal entries? |
| accum := kacc22 == 1 || kacc22 == 2 |
| // If so, exploit the 2×2 block structure? |
| blk22 := ns > 2 && kacc22 == 2 |
| |
| // Clear trash. |
| if ktop+2 <= kbot { |
| h[(ktop+2)*ldh+ktop] = 0 |
| } |
| |
| // nbmps = number of 2-shift bulges in the chain. |
| nbmps := ns / 2 |
| |
| // kdu = width of slab. |
| kdu := 6*nbmps - 3 |
| |
| // Create and chase chains of nbmps bulges. |
| for incol := 3*(1-nbmps) + ktop - 1; incol <= kbot-2; incol += 3*nbmps - 2 { |
| ndcol := incol + kdu |
| if accum { |
| impl.Dlaset(blas.All, kdu, kdu, 0, 1, u, ldu) |
| } |
| |
| // Near-the-diagonal bulge chase. The following loop performs |
| // the near-the-diagonal part of a small bulge multi-shift QR |
| // sweep. Each 6*nbmps-2 column diagonal chunk extends from |
| // column incol to column ndcol (including both column incol and |
| // column ndcol). The following loop chases a 3*nbmps column |
| // long chain of nbmps bulges 3*nbmps-2 columns to the right. |
| // (incol may be less than ktop and ndcol may be greater than |
| // kbot indicating phantom columns from which to chase bulges |
| // before they are actually introduced or to which to chase |
| // bulges beyond column kbot.) |
| for krcol := incol; krcol <= min(incol+3*nbmps-3, kbot-2); krcol++ { |
| // Bulges number mtop to mbot are active double implicit |
| // shift bulges. There may or may not also be small 2×2 |
| // bulge, if there is room. The inactive bulges (if any) |
| // must wait until the active bulges have moved down the |
| // diagonal to make room. The phantom matrix paradigm |
| // described above helps keep track. |
| |
| mtop := max(0, ((ktop-1)-krcol+2)/3) |
| mbot := min(nbmps, (kbot-krcol)/3) - 1 |
| m22 := mbot + 1 |
| bmp22 := (mbot < nbmps-1) && (krcol+3*m22 == kbot-2) |
| |
| // Generate reflections to chase the chain right one |
| // column. (The minimum value of k is ktop-1.) |
| for m := mtop; m <= mbot; m++ { |
| k := krcol + 3*m |
| if k == ktop-1 { |
| impl.Dlaqr1(3, h[ktop*ldh+ktop:], ldh, |
| sr[2*m], si[2*m], sr[2*m+1], si[2*m+1], |
| v[m*ldv:m*ldv+3]) |
| alpha := v[m*ldv] |
| _, v[m*ldv] = impl.Dlarfg(3, alpha, v[m*ldv+1:m*ldv+3], 1) |
| continue |
| } |
| beta := h[(k+1)*ldh+k] |
| v[m*ldv+1] = h[(k+2)*ldh+k] |
| v[m*ldv+2] = h[(k+3)*ldh+k] |
| beta, v[m*ldv] = impl.Dlarfg(3, beta, v[m*ldv+1:m*ldv+3], 1) |
| |
| // A bulge may collapse because of vigilant deflation or |
| // destructive underflow. In the underflow case, try the |
| // two-small-subdiagonals trick to try to reinflate the |
| // bulge. |
| if h[(k+3)*ldh+k] != 0 || h[(k+3)*ldh+k+1] != 0 || h[(k+3)*ldh+k+2] == 0 { |
| // Typical case: not collapsed (yet). |
| h[(k+1)*ldh+k] = beta |
| h[(k+2)*ldh+k] = 0 |
| h[(k+3)*ldh+k] = 0 |
| continue |
| } |
| |
| // Atypical case: collapsed. Attempt to reintroduce |
| // ignoring H[k+1,k] and H[k+2,k]. If the fill |
| // resulting from the new reflector is too large, |
| // then abandon it. Otherwise, use the new one. |
| var vt [3]float64 |
| impl.Dlaqr1(3, h[(k+1)*ldh+k+1:], ldh, sr[2*m], |
| si[2*m], sr[2*m+1], si[2*m+1], vt[:]) |
| alpha := vt[0] |
| _, vt[0] = impl.Dlarfg(3, alpha, vt[1:3], 1) |
| refsum := vt[0] * (h[(k+1)*ldh+k] + vt[1]*h[(k+2)*ldh+k]) |
| |
| dsum := math.Abs(h[k*ldh+k]) + math.Abs(h[(k+1)*ldh+k+1]) + math.Abs(h[(k+2)*ldh+k+2]) |
| if math.Abs(h[(k+2)*ldh+k]-refsum*vt[1])+math.Abs(refsum*vt[2]) > ulp*dsum { |
| // Starting a new bulge here would create |
| // non-negligible fill. Use the old one with |
| // trepidation. |
| h[(k+1)*ldh+k] = beta |
| h[(k+2)*ldh+k] = 0 |
| h[(k+3)*ldh+k] = 0 |
| continue |
| } else { |
| // Starting a new bulge here would create |
| // only negligible fill. Replace the old |
| // reflector with the new one. |
| h[(k+1)*ldh+k] -= refsum |
| h[(k+2)*ldh+k] = 0 |
| h[(k+3)*ldh+k] = 0 |
| v[m*ldv] = vt[0] |
| v[m*ldv+1] = vt[1] |
| v[m*ldv+2] = vt[2] |
| } |
| } |
| |
| // Generate a 2×2 reflection, if needed. |
| if bmp22 { |
| k := krcol + 3*m22 |
| if k == ktop-1 { |
| impl.Dlaqr1(2, h[(k+1)*ldh+k+1:], ldh, |
| sr[2*m22], si[2*m22], sr[2*m22+1], si[2*m22+1], |
| v[m22*ldv:m22*ldv+2]) |
| beta := v[m22*ldv] |
| _, v[m22*ldv] = impl.Dlarfg(2, beta, v[m22*ldv+1:m22*ldv+2], 1) |
| } else { |
| beta := h[(k+1)*ldh+k] |
| v[m22*ldv+1] = h[(k+2)*ldh+k] |
| beta, v[m22*ldv] = impl.Dlarfg(2, beta, v[m22*ldv+1:m22*ldv+2], 1) |
| h[(k+1)*ldh+k] = beta |
| h[(k+2)*ldh+k] = 0 |
| } |
| } |
| |
| // Multiply H by reflections from the left. |
| var jbot int |
| switch { |
| case accum: |
| jbot = min(ndcol, kbot) |
| case wantt: |
| jbot = n - 1 |
| default: |
| jbot = kbot |
| } |
| for j := max(ktop, krcol); j <= jbot; j++ { |
| mend := min(mbot+1, (j-krcol+2)/3) - 1 |
| for m := mtop; m <= mend; m++ { |
| k := krcol + 3*m |
| refsum := v[m*ldv] * (h[(k+1)*ldh+j] + |
| v[m*ldv+1]*h[(k+2)*ldh+j] + v[m*ldv+2]*h[(k+3)*ldh+j]) |
| h[(k+1)*ldh+j] -= refsum |
| h[(k+2)*ldh+j] -= refsum * v[m*ldv+1] |
| h[(k+3)*ldh+j] -= refsum * v[m*ldv+2] |
| } |
| } |
| if bmp22 { |
| k := krcol + 3*m22 |
| for j := max(k+1, ktop); j <= jbot; j++ { |
| refsum := v[m22*ldv] * (h[(k+1)*ldh+j] + v[m22*ldv+1]*h[(k+2)*ldh+j]) |
| h[(k+1)*ldh+j] -= refsum |
| h[(k+2)*ldh+j] -= refsum * v[m22*ldv+1] |
| } |
| } |
| |
| // Multiply H by reflections from the right. Delay filling in the last row |
| // until the vigilant deflation check is complete. |
| var jtop int |
| switch { |
| case accum: |
| jtop = max(ktop, incol) |
| case wantt: |
| jtop = 0 |
| default: |
| jtop = ktop |
| } |
| for m := mtop; m <= mbot; m++ { |
| if v[m*ldv] == 0 { |
| continue |
| } |
| k := krcol + 3*m |
| for j := jtop; j <= min(kbot, k+3); j++ { |
| refsum := v[m*ldv] * (h[j*ldh+k+1] + |
| v[m*ldv+1]*h[j*ldh+k+2] + v[m*ldv+2]*h[j*ldh+k+3]) |
| h[j*ldh+k+1] -= refsum |
| h[j*ldh+k+2] -= refsum * v[m*ldv+1] |
| h[j*ldh+k+3] -= refsum * v[m*ldv+2] |
| } |
| if accum { |
| // Accumulate U. (If necessary, update Z later with with an |
| // efficient matrix-matrix multiply.) |
| kms := k - incol |
| for j := max(0, ktop-incol-1); j < kdu; j++ { |
| refsum := v[m*ldv] * (u[j*ldu+kms] + |
| v[m*ldv+1]*u[j*ldu+kms+1] + v[m*ldv+2]*u[j*ldu+kms+2]) |
| u[j*ldu+kms] -= refsum |
| u[j*ldu+kms+1] -= refsum * v[m*ldv+1] |
| u[j*ldu+kms+2] -= refsum * v[m*ldv+2] |
| } |
| } else if wantz { |
| // U is not accumulated, so update Z now by multiplying by |
| // reflections from the right. |
| for j := iloz; j <= ihiz; j++ { |
| refsum := v[m*ldv] * (z[j*ldz+k+1] + |
| v[m*ldv+1]*z[j*ldz+k+2] + v[m*ldv+2]*z[j*ldz+k+3]) |
| z[j*ldz+k+1] -= refsum |
| z[j*ldz+k+2] -= refsum * v[m*ldv+1] |
| z[j*ldz+k+3] -= refsum * v[m*ldv+2] |
| } |
| } |
| } |
| |
| // Special case: 2×2 reflection (if needed). |
| if bmp22 && v[m22*ldv] != 0 { |
| k := krcol + 3*m22 |
| for j := jtop; j <= min(kbot, k+3); j++ { |
| refsum := v[m22*ldv] * (h[j*ldh+k+1] + v[m22*ldv+1]*h[j*ldh+k+2]) |
| h[j*ldh+k+1] -= refsum |
| h[j*ldh+k+2] -= refsum * v[m22*ldv+1] |
| } |
| if accum { |
| kms := k - incol |
| for j := max(0, ktop-incol-1); j < kdu; j++ { |
| refsum := v[m22*ldv] * (u[j*ldu+kms] + v[m22*ldv+1]*u[j*ldu+kms+1]) |
| u[j*ldu+kms] -= refsum |
| u[j*ldu+kms+1] -= refsum * v[m22*ldv+1] |
| } |
| } else if wantz { |
| for j := iloz; j <= ihiz; j++ { |
| refsum := v[m22*ldv] * (z[j*ldz+k+1] + v[m22*ldv+1]*z[j*ldz+k+2]) |
| z[j*ldz+k+1] -= refsum |
| z[j*ldz+k+2] -= refsum * v[m22*ldv+1] |
| } |
| } |
| } |
| |
| // Vigilant deflation check. |
| mstart := mtop |
| if krcol+3*mstart < ktop { |
| mstart++ |
| } |
| mend := mbot |
| if bmp22 { |
| mend++ |
| } |
| if krcol == kbot-2 { |
| mend++ |
| } |
| for m := mstart; m <= mend; m++ { |
| k := min(kbot-1, krcol+3*m) |
| |
| // The following convergence test requires that the tradition |
| // small-compared-to-nearby-diagonals criterion and the Ahues & |
| // Tisseur (LAWN 122, 1997) criteria both be satisfied. The latter |
| // improves accuracy in some examples. Falling back on an alternate |
| // convergence criterion when tst1 or tst2 is zero (as done here) is |
| // traditional but probably unnecessary. |
| |
| if h[(k+1)*ldh+k] == 0 { |
| continue |
| } |
| tst1 := math.Abs(h[k*ldh+k]) + math.Abs(h[(k+1)*ldh+k+1]) |
| if tst1 == 0 { |
| if k >= ktop+1 { |
| tst1 += math.Abs(h[k*ldh+k-1]) |
| } |
| if k >= ktop+2 { |
| tst1 += math.Abs(h[k*ldh+k-2]) |
| } |
| if k >= ktop+3 { |
| tst1 += math.Abs(h[k*ldh+k-3]) |
| } |
| if k <= kbot-2 { |
| tst1 += math.Abs(h[(k+2)*ldh+k+1]) |
| } |
| if k <= kbot-3 { |
| tst1 += math.Abs(h[(k+3)*ldh+k+1]) |
| } |
| if k <= kbot-4 { |
| tst1 += math.Abs(h[(k+4)*ldh+k+1]) |
| } |
| } |
| if math.Abs(h[(k+1)*ldh+k]) <= math.Max(smlnum, ulp*tst1) { |
| h12 := math.Max(math.Abs(h[(k+1)*ldh+k]), math.Abs(h[k*ldh+k+1])) |
| h21 := math.Min(math.Abs(h[(k+1)*ldh+k]), math.Abs(h[k*ldh+k+1])) |
| h11 := math.Max(math.Abs(h[(k+1)*ldh+k+1]), math.Abs(h[k*ldh+k]-h[(k+1)*ldh+k+1])) |
| h22 := math.Min(math.Abs(h[(k+1)*ldh+k+1]), math.Abs(h[k*ldh+k]-h[(k+1)*ldh+k+1])) |
| scl := h11 + h12 |
| tst2 := h22 * (h11 / scl) |
| if tst2 == 0 || h21*(h12/scl) <= math.Max(smlnum, ulp*tst2) { |
| h[(k+1)*ldh+k] = 0 |
| } |
| } |
| } |
| |
| // Fill in the last row of each bulge. |
| mend = min(nbmps, (kbot-krcol-1)/3) - 1 |
| for m := mtop; m <= mend; m++ { |
| k := krcol + 3*m |
| refsum := v[m*ldv] * v[m*ldv+2] * h[(k+4)*ldh+k+3] |
| h[(k+4)*ldh+k+1] = -refsum |
| h[(k+4)*ldh+k+2] = -refsum * v[m*ldv+1] |
| h[(k+4)*ldh+k+3] -= refsum * v[m*ldv+2] |
| } |
| } |
| |
| // Use U (if accumulated) to update far-from-diagonal entries in H. |
| // If required, use U to update Z as well. |
| if !accum { |
| continue |
| } |
| var jtop, jbot int |
| if wantt { |
| jtop = 0 |
| jbot = n - 1 |
| } else { |
| jtop = ktop |
| jbot = kbot |
| } |
| bi := blas64.Implementation() |
| if !blk22 || incol < ktop || kbot < ndcol || ns <= 2 { |
| // Updates not exploiting the 2×2 block structure of U. k0 and nu keep track |
| // of the location and size of U in the special cases of introducing bulges |
| // and chasing bulges off the bottom. In these special cases and in case the |
| // number of shifts is ns = 2, there is no 2×2 block structure to exploit. |
| |
| k0 := max(0, ktop-incol-1) |
| nu := kdu - max(0, ndcol-kbot) - k0 |
| |
| // Horizontal multiply. |
| for jcol := min(ndcol, kbot) + 1; jcol <= jbot; jcol += nh { |
| jlen := min(nh, jbot-jcol+1) |
| bi.Dgemm(blas.Trans, blas.NoTrans, nu, jlen, nu, |
| 1, u[k0*ldu+k0:], ldu, |
| h[(incol+k0+1)*ldh+jcol:], ldh, |
| 0, wh, ldwh) |
| impl.Dlacpy(blas.All, nu, jlen, wh, ldwh, h[(incol+k0+1)*ldh+jcol:], ldh) |
| } |
| |
| // Vertical multiply. |
| for jrow := jtop; jrow <= max(ktop, incol)-1; jrow += nv { |
| jlen := min(nv, max(ktop, incol)-jrow) |
| bi.Dgemm(blas.NoTrans, blas.NoTrans, jlen, nu, nu, |
| 1, h[jrow*ldh+incol+k0+1:], ldh, |
| u[k0*ldu+k0:], ldu, |
| 0, wv, ldwv) |
| impl.Dlacpy(blas.All, jlen, nu, wv, ldwv, h[jrow*ldh+incol+k0+1:], ldh) |
| } |
| |
| // Z multiply (also vertical). |
| if wantz { |
| for jrow := iloz; jrow <= ihiz; jrow += nv { |
| jlen := min(nv, ihiz-jrow+1) |
| bi.Dgemm(blas.NoTrans, blas.NoTrans, jlen, nu, nu, |
| 1, z[jrow*ldz+incol+k0+1:], ldz, |
| u[k0*ldu+k0:], ldu, |
| 0, wv, ldwv) |
| impl.Dlacpy(blas.All, jlen, nu, wv, ldwv, z[jrow*ldz+incol+k0+1:], ldz) |
| } |
| } |
| |
| continue |
| } |
| |
| // Updates exploiting U's 2×2 block structure. |
| |
| // i2, i4, j2, j4 are the last rows and columns of the blocks. |
| i2 := (kdu + 1) / 2 |
| i4 := kdu |
| j2 := i4 - i2 |
| j4 := kdu |
| |
| // kzs and knz deal with the band of zeros along the diagonal of one of the |
| // triangular blocks. |
| kzs := (j4 - j2) - (ns + 1) |
| knz := ns + 1 |
| |
| // Horizontal multiply. |
| for jcol := min(ndcol, kbot) + 1; jcol <= jbot; jcol += nh { |
| jlen := min(nh, jbot-jcol+1) |
| |
| // Copy bottom of H to top+kzs of scratch (the first kzs |
| // rows get multiplied by zero). |
| impl.Dlacpy(blas.All, knz, jlen, h[(incol+1+j2)*ldh+jcol:], ldh, wh[kzs*ldwh:], ldwh) |
| |
| // Multiply by U21^T. |
| impl.Dlaset(blas.All, kzs, jlen, 0, 0, wh, ldwh) |
| bi.Dtrmm(blas.Left, blas.Upper, blas.Trans, blas.NonUnit, knz, jlen, |
| 1, u[j2*ldu+kzs:], ldu, wh[kzs*ldwh:], ldwh) |
| |
| // Multiply top of H by U11^T. |
| bi.Dgemm(blas.Trans, blas.NoTrans, i2, jlen, j2, |
| 1, u, ldu, h[(incol+1)*ldh+jcol:], ldh, |
| 1, wh, ldwh) |
| |
| // Copy top of H to bottom of WH. |
| impl.Dlacpy(blas.All, j2, jlen, h[(incol+1)*ldh+jcol:], ldh, wh[i2*ldwh:], ldwh) |
| |
| // Multiply by U21^T. |
| bi.Dtrmm(blas.Left, blas.Lower, blas.Trans, blas.NonUnit, j2, jlen, |
| 1, u[i2:], ldu, wh[i2*ldwh:], ldwh) |
| |
| // Multiply by U22. |
| bi.Dgemm(blas.Trans, blas.NoTrans, i4-i2, jlen, j4-j2, |
| 1, u[j2*ldu+i2:], ldu, h[(incol+1+j2)*ldh+jcol:], ldh, |
| 1, wh[i2*ldwh:], ldwh) |
| |
| // Copy it back. |
| impl.Dlacpy(blas.All, kdu, jlen, wh, ldwh, h[(incol+1)*ldh+jcol:], ldh) |
| } |
| |
| // Vertical multiply. |
| for jrow := jtop; jrow <= max(incol, ktop)-1; jrow += nv { |
| jlen := min(nv, max(incol, ktop)-jrow) |
| |
| // Copy right of H to scratch (the first kzs columns get multiplied |
| // by zero). |
| impl.Dlacpy(blas.All, jlen, knz, h[jrow*ldh+incol+1+j2:], ldh, wv[kzs:], ldwv) |
| |
| // Multiply by U21. |
| impl.Dlaset(blas.All, jlen, kzs, 0, 0, wv, ldwv) |
| bi.Dtrmm(blas.Right, blas.Upper, blas.NoTrans, blas.NonUnit, jlen, knz, |
| 1, u[j2*ldu+kzs:], ldu, wv[kzs:], ldwv) |
| |
| // Multiply by U11. |
| bi.Dgemm(blas.NoTrans, blas.NoTrans, jlen, i2, j2, |
| 1, h[jrow*ldh+incol+1:], ldh, u, ldu, |
| 1, wv, ldwv) |
| |
| // Copy left of H to right of scratch. |
| impl.Dlacpy(blas.All, jlen, j2, h[jrow*ldh+incol+1:], ldh, wv[i2:], ldwv) |
| |
| // Multiply by U21. |
| bi.Dtrmm(blas.Right, blas.Lower, blas.NoTrans, blas.NonUnit, jlen, i4-i2, |
| 1, u[i2:], ldu, wv[i2:], ldwv) |
| |
| // Multiply by U22. |
| bi.Dgemm(blas.NoTrans, blas.NoTrans, jlen, i4-i2, j4-j2, |
| 1, h[jrow*ldh+incol+1+j2:], ldh, u[j2*ldu+i2:], ldu, |
| 1, wv[i2:], ldwv) |
| |
| // Copy it back. |
| impl.Dlacpy(blas.All, jlen, kdu, wv, ldwv, h[jrow*ldh+incol+1:], ldh) |
| } |
| |
| if !wantz { |
| continue |
| } |
| // Multiply Z (also vertical). |
| for jrow := iloz; jrow <= ihiz; jrow += nv { |
| jlen := min(nv, ihiz-jrow+1) |
| |
| // Copy right of Z to left of scratch (first kzs columns get |
| // multiplied by zero). |
| impl.Dlacpy(blas.All, jlen, knz, z[jrow*ldz+incol+1+j2:], ldz, wv[kzs:], ldwv) |
| |
| // Multiply by U12. |
| impl.Dlaset(blas.All, jlen, kzs, 0, 0, wv, ldwv) |
| bi.Dtrmm(blas.Right, blas.Upper, blas.NoTrans, blas.NonUnit, jlen, knz, |
| 1, u[j2*ldu+kzs:], ldu, wv[kzs:], ldwv) |
| |
| // Multiply by U11. |
| bi.Dgemm(blas.NoTrans, blas.NoTrans, jlen, i2, j2, |
| 1, z[jrow*ldz+incol+1:], ldz, u, ldu, |
| 1, wv, ldwv) |
| |
| // Copy left of Z to right of scratch. |
| impl.Dlacpy(blas.All, jlen, j2, z[jrow*ldz+incol+1:], ldz, wv[i2:], ldwv) |
| |
| // Multiply by U21. |
| bi.Dtrmm(blas.Right, blas.Lower, blas.NoTrans, blas.NonUnit, jlen, i4-i2, |
| 1, u[i2:], ldu, wv[i2:], ldwv) |
| |
| // Multiply by U22. |
| bi.Dgemm(blas.NoTrans, blas.NoTrans, jlen, i4-i2, j4-j2, |
| 1, z[jrow*ldz+incol+1+j2:], ldz, u[j2*ldu+i2:], ldu, |
| 1, wv[i2:], ldwv) |
| |
| // Copy the result back to Z. |
| impl.Dlacpy(blas.All, jlen, kdu, wv, ldwv, z[jrow*ldz+incol+1:], ldz) |
| } |
| } |
| } |