lapack/gonum/dlaqr04.go - third_party/github.com/gonum/gonum - Git at Google

 // Copyright ©2016 The gonum Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.

 package gonum

 import (
 	"math"

 	"gonum.org/v1/gonum/blas"
 )

 // Dlaqr04 computes the eigenvalues of a block of an n×n upper Hessenberg matrix
 // H, and optionally the matrices T and Z from the Schur decomposition
 //  H = Z T Z^T
 // where T is an upper quasi-triangular matrix (the Schur form), and Z is the
 // orthogonal matrix of Schur vectors.
 //
 // wantt indicates whether the full Schur form T is required. If wantt is false,
 // then only enough of H will be updated to preserve the eigenvalues.
 //
 // wantz indicates whether the n×n matrix of Schur vectors Z is required. If it
 // is true, the orthogonal similarity transformation will be accumulated into
 // Z[iloz:ihiz+1,ilo:ihi+1], otherwise Z will not be referenced.
 //
 // ilo and ihi determine the block of H on which Dlaqr04 operates. It must hold that
 //  0 <= ilo <= ihi < n,     if n > 0,
 //  ilo == 0 and ihi == -1,  if n == 0,
 // and the block must be isolated, that is,
 //  ilo == 0   or H[ilo,ilo-1] == 0,
 //  ihi == n-1 or H[ihi+1,ihi] == 0,
 // otherwise Dlaqr04 will panic.
 //
 // wr and wi must have length ihi+1.
 //
 // iloz and ihiz specify the rows of Z to which transformations will be applied
 // if wantz is true. It must hold that
 //  0 <= iloz <= ilo,  and  ihi <= ihiz < n,
 // otherwise Dlaqr04 will panic.
 //
 // work must have length at least lwork and lwork must be
 //  lwork >= 1,  if n <= 11,
 //  lwork >= n,  if n > 11,
 // otherwise Dlaqr04 will panic. lwork as large as 6*n may be required for
 // optimal performance. On return, work[0] will contain the optimal value of
 // lwork.
 //
 // If lwork is -1, instead of performing Dlaqr04, the function only estimates the
 // optimal workspace size and stores it into work[0]. Neither h nor z are
 // accessed.
 //
 // recur is the non-negative recursion depth. For recur > 0, Dlaqr04 behaves
 // as DLAQR0, for recur == 0 it behaves as DLAQR4.
 //
 // unconverged indicates whether Dlaqr04 computed all the eigenvalues of H[ilo:ihi+1,ilo:ihi+1].
 //
 // If unconverged is zero and wantt is true, H will contain on return the upper
 // quasi-triangular matrix T from the Schur decomposition. 2×2 diagonal blocks
 // (corresponding to complex conjugate pairs of eigenvalues) will be returned in
 // standard form, with H[i,i] == H[i+1,i+1] and H[i+1,i]*H[i,i+1] < 0.
 //
 // If unconverged is zero and if wantt is false, the contents of h on return is
 // unspecified.
 //
 // If unconverged is zero, all the eigenvalues have been computed and their real
 // and imaginary parts will be stored on return in wr[ilo:ihi+1] and
 // wi[ilo:ihi+1], respectively. If two eigenvalues are computed as a complex
 // conjugate pair, they are stored in consecutive elements of wr and wi, say the
 // i-th and (i+1)th, with wi[i] > 0 and wi[i+1] < 0. If wantt is true, then the
 // eigenvalues are stored in the same order as on the diagonal of the Schur form
 // returned in H, with wr[i] = H[i,i] and, if H[i:i+2,i:i+2] is a 2×2 diagonal
 // block, wi[i] = sqrt(-H[i+1,i]*H[i,i+1]) and wi[i+1] = -wi[i].
 //
 // If unconverged is positive, some eigenvalues have not converged, and
 // wr[unconverged:ihi+1] and wi[unconverged:ihi+1] will contain those
 // eigenvalues which have been successfully computed. Failures are rare.
 //
 // If unconverged is positive and wantt is true, then on return
 //  (initial H)*U = U*(final H),   (*)
 // where U is an orthogonal matrix. The final H is upper Hessenberg and
 // H[unconverged:ihi+1,unconverged:ihi+1] is upper quasi-triangular.
 //
 // If unconverged is positive and wantt is false, on return the remaining
 // unconverged eigenvalues are the eigenvalues of the upper Hessenberg matrix
 // H[ilo:unconverged,ilo:unconverged].
 //
 // If unconverged is positive and wantz is true, then on return
 //  (final Z) = (initial Z)*U,
 // where U is the orthogonal matrix in (*) regardless of the value of wantt.
 //
 // References:
 //  [1] K. Braman, R. Byers, R. Mathias. The Multishift QR Algorithm. Part I:
 //      Maintaining Well-Focused Shifts and Level 3 Performance. SIAM J. Matrix
 //      Anal. Appl. 23(4) (2002), pp. 929—947
 //      URL: http://dx.doi.org/10.1137/S0895479801384573
 //  [2] K. Braman, R. Byers, R. Mathias. The Multishift QR Algorithm. Part II:
 //      Aggressive Early Deflation. SIAM J. Matrix Anal. Appl. 23(4) (2002), pp. 948—973
 //      URL: http://dx.doi.org/10.1137/S0895479801384585
 //
 // Dlaqr04 is an internal routine. It is exported for testing purposes.
 func (impl Implementation) Dlaqr04(wantt, wantz bool, n, ilo, ihi int, h []float64, ldh int, wr, wi []float64, iloz, ihiz int, z []float64, ldz int, work []float64, lwork int, recur int) (unconverged int) {
 	const (
 		// Matrices of order ntiny or smaller must be processed by
 		// Dlahqr because of insufficient subdiagonal scratch space.
 		// This is a hard limit.
 		ntiny = 11
 		// Exceptional deflation windows: try to cure rare slow
 		// convergence by varying the size of the deflation window after
 		// kexnw iterations.
 		kexnw = 5
 		// Exceptional shifts: try to cure rare slow convergence with
 		// ad-hoc exceptional shifts every kexsh iterations.
 		kexsh = 6

 		// See https://github.com/gonum/lapack/pull/151#discussion_r68162802
 		// and the surrounding discussion for an explanation where these
 		// constants come from.
 		// TODO(vladimir-ch): Similar constants for exceptional shifts
 		// are used also in dlahqr.go. The first constant is different
 		// there, it is equal to 3. Why? And does it matter?
 		wilk1 = 0.75
 		wilk2 = -0.4375
 	)

 	switch {
 	case ilo < 0 || max(0, n-1) < ilo:
 		panic(badIlo)
 	case ihi < min(ilo, n-1) || n <= ihi:
 		panic(badIhi)
 	case lwork < 1 && n <= ntiny && lwork != -1:
 		panic(badWork)
 	// TODO(vladimir-ch): Enable if and when we figure out what the minimum
 	// necessary lwork value is. Dlaqr04 says that the minimum is n which
 	// clashes with Dlaqr23's opinion about optimal work when nw <= 2
 	// (independent of n).
 	// case lwork < n && n > ntiny && lwork != -1:
 	// 	panic(badWork)
 	case len(work) < lwork:
 		panic(shortWork)
 	case recur < 0:
 		panic("lapack: recur is negative")
 	}
 	if wantz {
 		if iloz < 0 || ilo < iloz {
 			panic("lapack: invalid value of iloz")
 		}
 		if ihiz < ihi || n <= ihiz {
 			panic("lapack: invalid value of ihiz")
 		}
 	}
 	if lwork != -1 {
 		checkMatrix(n, n, h, ldh)
 		if wantz {
 			checkMatrix(n, n, z, ldz)
 		}
 		switch {
 		case ilo > 0 && h[ilo*ldh+ilo-1] != 0:
 			panic("lapack: block not isolated")
 		case ihi+1 < n && h[(ihi+1)*ldh+ihi] != 0:
 			panic("lapack: block not isolated")
 		case len(wr) != ihi+1:
 			panic("lapack: bad length of wr")
 		case len(wi) != ihi+1:
 			panic("lapack: bad length of wi")
 		}
 	}

 	// Quick return.
 	if n == 0 {
 		work[0] = 1
 		return 0
 	}

 	if n <= ntiny {
 		// Tiny matrices must use Dlahqr.
 		work[0] = 1
 		if lwork == -1 {
 			return 0
 		}
 		return impl.Dlahqr(wantt, wantz, n, ilo, ihi, h, ldh, wr, wi, iloz, ihiz, z, ldz)
 	}

 	// Use small bulge multi-shift QR with aggressive early deflation on
 	// larger-than-tiny matrices.
 	var jbcmpz string
 	if wantt {
 		jbcmpz = "S"
 	} else {
 		jbcmpz = "E"
 	}
 	if wantz {
 		jbcmpz += "V"
 	} else {
 		jbcmpz += "N"
 	}

 	var fname string
 	if recur > 0 {
 		fname = "DLAQR0"
 	} else {
 		fname = "DLAQR4"
 	}
 	// nwr is the recommended deflation window size. n is greater than 11,
 	// so there is enough subdiagonal workspace for nwr >= 2 as required.
 	// (In fact, there is enough subdiagonal space for nwr >= 3.)
 	// TODO(vladimir-ch): If there is enough space for nwr >= 3, should we
 	// use it?
 	nwr := impl.Ilaenv(13, fname, jbcmpz, n, ilo, ihi, lwork)
 	nwr = max(2, nwr)
 	nwr = min(ihi-ilo+1, min((n-1)/3, nwr))

 	// nsr is the recommended number of simultaneous shifts. n is greater
 	// than 11, so there is enough subdiagonal workspace for nsr to be even
 	// and greater than or equal to two as required.
 	nsr := impl.Ilaenv(15, fname, jbcmpz, n, ilo, ihi, lwork)
 	nsr = min(nsr, min((n+6)/9, ihi-ilo))
 	nsr = max(2, nsr&^1)

 	// Workspace query call to Dlaqr23.
 	impl.Dlaqr23(wantt, wantz, n, ilo, ihi, nwr+1, nil, 0, iloz, ihiz, nil, 0,
 		nil, nil, nil, 0, n, nil, 0, n, nil, 0, work, -1, recur)
 	// Optimal workspace is max(Dlaqr5, Dlaqr23).
 	lwkopt := max(3*nsr/2, int(work[0]))
 	// Quick return in case of workspace query.
 	if lwork == -1 {
 		work[0] = float64(lwkopt)
 		return 0
 	}

 	// Dlahqr/Dlaqr04 crossover point.
 	nmin := impl.Ilaenv(12, fname, jbcmpz, n, ilo, ihi, lwork)
 	nmin = max(ntiny, nmin)

 	// Nibble determines when to skip a multi-shift QR sweep (Dlaqr5).
 	nibble := impl.Ilaenv(14, fname, jbcmpz, n, ilo, ihi, lwork)
 	nibble = max(0, nibble)

 	// Computation mode of far-from-diagonal orthogonal updates in Dlaqr5.
 	kacc22 := impl.Ilaenv(16, fname, jbcmpz, n, ilo, ihi, lwork)
 	kacc22 = max(0, min(kacc22, 2))

 	// nwmax is the largest possible deflation window for which there is
 	// sufficient workspace.
 	nwmax := min((n-1)/3, lwork/2)
 	nw := nwmax // Start with maximum deflation window size.

 	// nsmax is the largest number of simultaneous shifts for which there is
 	// sufficient workspace.
 	nsmax := min((n+6)/9, 2*lwork/3) &^ 1

 	ndfl := 1 // Number of iterations since last deflation.
 	ndec := 0 // Deflation window size decrement.

 	// Main loop.
 	var (
 		itmax = max(30, 2*kexsh) * max(10, (ihi-ilo+1))
 		it    = 0
 	)
 	for kbot := ihi; kbot >= ilo; {
 		if it == itmax {
 			unconverged = kbot + 1
 			break
 		}
 		it++

 		// Locate active block.
 		ktop := ilo
 		for k := kbot; k >= ilo+1; k-- {
 			if h[k*ldh+k-1] == 0 {
 				ktop = k
 				break
 			}
 		}

 		// Select deflation window size nw.
 		//
 		// Typical Case:
 		//  If possible and advisable, nibble the entire active block.
 		//  If not, use size min(nwr,nwmax) or min(nwr+1,nwmax)
 		//  depending upon which has the smaller corresponding
 		//  subdiagonal entry (a heuristic).
 		//
 		// Exceptional Case:
 		//  If there have been no deflations in kexnw or more
 		//  iterations, then vary the deflation window size. At first,
 		//  because larger windows are, in general, more powerful than
 		//  smaller ones, rapidly increase the window to the maximum
 		//  possible. Then, gradually reduce the window size.
 		nh := kbot - ktop + 1
 		nwupbd := min(nh, nwmax)
 		if ndfl < kexnw {
 			nw = min(nwupbd, nwr)
 		} else {
 			nw = min(nwupbd, 2*nw)
 		}
 		if nw < nwmax {
 			if nw >= nh-1 {
 				nw = nh
 			} else {
 				kwtop := kbot - nw + 1
 				if math.Abs(h[kwtop*ldh+kwtop-1]) > math.Abs(h[(kwtop-1)*ldh+kwtop-2]) {
 					nw++
 				}
 			}
 		}
 		if ndfl < kexnw {
 			ndec = -1
 		} else if ndec >= 0 || nw >= nwupbd {
 			ndec++
 			if nw-ndec < 2 {
 				ndec = 0
 			}
 			nw -= ndec
 		}

 		// Split workspace under the subdiagonal of H into:
 		//  - an nw×nw work array V in the lower left-hand corner,
 		//  - an nw×nhv horizontal work array along the bottom edge (nhv
 		//    must be at least nw but more is better),
 		//  - an nve×nw vertical work array along the left-hand-edge
 		//    (nhv can be any positive integer but more is better).
 		kv := n - nw
 		kt := nw
 		kwv := nw + 1
 		nhv := n - kwv - kt
 		// Aggressive early deflation.
 		ls, ld := impl.Dlaqr23(wantt, wantz, n, ktop, kbot, nw,
 			h, ldh, iloz, ihiz, z, ldz, wr[:kbot+1], wi[:kbot+1],
 			h[kv*ldh:], ldh, nhv, h[kv*ldh+kt:], ldh, nhv, h[kwv*ldh:], ldh, work, lwork, recur)

 		// Adjust kbot accounting for new deflations.
 		kbot -= ld
 		// ks points to the shifts.
 		ks := kbot - ls + 1

 		// Skip an expensive QR sweep if there is a (partly heuristic)
 		// reason to expect that many eigenvalues will deflate without
 		// it. Here, the QR sweep is skipped if many eigenvalues have
 		// just been deflated or if the remaining active block is small.
 		if ld > 0 && (100*ld > nw*nibble || kbot-ktop+1 <= min(nmin, nwmax)) {
 			// ld is positive, note progress.
 			ndfl = 1
 			continue
 		}

 		// ns is the nominal number of simultaneous shifts. This may be
 		// lowered (slightly) if Dlaqr23 did not provide that many
 		// shifts.
 		ns := min(min(nsmax, nsr), max(2, kbot-ktop)) &^ 1

 		// If there have been no deflations in a multiple of kexsh
 		// iterations, then try exceptional shifts. Otherwise use shifts
 		// provided by Dlaqr23 above or from the eigenvalues of a
 		// trailing principal submatrix.
 		if ndfl%kexsh == 0 {
 			ks = kbot - ns + 1
 			for i := kbot; i > max(ks, ktop+1); i -= 2 {
 				ss := math.Abs(h[i*ldh+i-1]) + math.Abs(h[(i-1)*ldh+i-2])
 				aa := wilk1*ss + h[i*ldh+i]
 				_, _, _, _, wr[i-1], wi[i-1], wr[i], wi[i], _, _ =
 					impl.Dlanv2(aa, ss, wilk2*ss, aa)
 			}
 			if ks == ktop {
 				wr[ks+1] = h[(ks+1)*ldh+ks+1]
 				wi[ks+1] = 0
 				wr[ks] = wr[ks+1]
 				wi[ks] = wi[ks+1]
 			}
 		} else {
 			// If we got ns/2 or fewer shifts, use Dlahqr or recur
 			// into Dlaqr04 on a trailing principal submatrix to get
 			// more. Since ns <= nsmax <=(n+6)/9, there is enough
 			// space below the subdiagonal to fit an ns×ns scratch
 			// array.
 			if kbot-ks+1 <= ns/2 {
 				ks = kbot - ns + 1
 				kt = n - ns
 				impl.Dlacpy(blas.All, ns, ns, h[ks*ldh+ks:], ldh, h[kt*ldh:], ldh)
 				if ns > nmin && recur > 0 {
 					ks += impl.Dlaqr04(false, false, ns, 1, ns-1, h[kt*ldh:], ldh,
 						wr[ks:ks+ns], wi[ks:ks+ns], 0, 0, nil, 0, work, lwork, recur-1)
 				} else {
 					ks += impl.Dlahqr(false, false, ns, 0, ns-1, h[kt*ldh:], ldh,
 						wr[ks:ks+ns], wi[ks:ks+ns], 0, 0, nil, 0)
 				}
 				// In case of a rare QR failure use eigenvalues
 				// of the trailing 2×2 principal submatrix.
 				if ks >= kbot {
 					aa := h[(kbot-1)*ldh+kbot-1]
 					bb := h[(kbot-1)*ldh+kbot]
 					cc := h[kbot*ldh+kbot-1]
 					dd := h[kbot*ldh+kbot]
 					_, _, _, _, wr[kbot-1], wi[kbot-1], wr[kbot], wi[kbot], _, _ =
 						impl.Dlanv2(aa, bb, cc, dd)
 					ks = kbot - 1
 				}
 			}

 			if kbot-ks+1 > ns {
 				// Sorting the shifts helps a little. Bubble
 				// sort keeps complex conjugate pairs together.
 				sorted := false
 				for k := kbot; k > ks; k-- {
 					if sorted {
 						break
 					}
 					sorted = true
 					for i := ks; i < k; i++ {
 						if math.Abs(wr[i])+math.Abs(wi[i]) >= math.Abs(wr[i+1])+math.Abs(wi[i+1]) {
 							continue
 						}
 						sorted = false
 						wr[i], wr[i+1] = wr[i+1], wr[i]
 						wi[i], wi[i+1] = wi[i+1], wi[i]
 					}
 				}
 			}

 			// Shuffle shifts into pairs of real shifts and pairs of
 			// complex conjugate shifts using the fact that complex
 			// conjugate shifts are already adjacent to one another.
 			// TODO(vladimir-ch): The shuffling here could probably
 			// be removed but I'm not sure right now and it's safer
 			// to leave it.
 			for i := kbot; i > ks+1; i -= 2 {
 				if wi[i] == -wi[i-1] {
 					continue
 				}
 				wr[i], wr[i-1], wr[i-2] = wr[i-1], wr[i-2], wr[i]
 				wi[i], wi[i-1], wi[i-2] = wi[i-1], wi[i-2], wi[i]
 			}
 		}

 		// If there are only two shifts and both are real, then use only one.
 		if kbot-ks+1 == 2 && wi[kbot] == 0 {
 			if math.Abs(wr[kbot]-h[kbot*ldh+kbot]) < math.Abs(wr[kbot-1]-h[kbot*ldh+kbot]) {
 				wr[kbot-1] = wr[kbot]
 			} else {
 				wr[kbot] = wr[kbot-1]
 			}
 		}

 		// Use up to ns of the the smallest magnitude shifts. If there
 		// aren't ns shifts available, then use them all, possibly
 		// dropping one to make the number of shifts even.
 		ns = min(ns, kbot-ks+1) &^ 1
 		ks = kbot - ns + 1

 		// Split workspace under the subdiagonal into:
 		// - a kdu×kdu work array U in the lower left-hand-corner,
 		// - a kdu×nhv horizontal work array WH along the bottom edge
 		//   (nhv must be at least kdu but more is better),
 		// - an nhv×kdu vertical work array WV along the left-hand-edge
 		//   (nhv must be at least kdu but more is better).
 		kdu := 3*ns - 3
 		ku := n - kdu
 		kwh := kdu
 		kwv = kdu + 3
 		nhv = n - kwv - kdu
 		// Small-bulge multi-shift QR sweep.
 		impl.Dlaqr5(wantt, wantz, kacc22, n, ktop, kbot, ns,
 			wr[ks:ks+ns], wi[ks:ks+ns], h, ldh, iloz, ihiz, z, ldz,
 			work, 3, h[ku*ldh:], ldh, nhv, h[kwv*ldh:], ldh, nhv, h[ku*ldh+kwh:], ldh)

 		// Note progress (or the lack of it).
 		if ld > 0 {
 			ndfl = 1
 		} else {
 			ndfl++
 		}
 	}

 	work[0] = float64(lwkopt)
 	return unconverged
 }
	// Copyright ©2016 The gonum Authors. All rights reserved.
	// Use of this source code is governed by a BSD-style
	// license that can be found in the LICENSE file.

	package gonum

	import (
	"math"

	"gonum.org/v1/gonum/blas"
	)

	// Dlaqr04 computes the eigenvalues of a block of an n×n upper Hessenberg matrix
	// H, and optionally the matrices T and Z from the Schur decomposition
	// H = Z T Z^T
	// where T is an upper quasi-triangular matrix (the Schur form), and Z is the
	// orthogonal matrix of Schur vectors.
	//
	// wantt indicates whether the full Schur form T is required. If wantt is false,
	// then only enough of H will be updated to preserve the eigenvalues.
	//
	// wantz indicates whether the n×n matrix of Schur vectors Z is required. If it
	// is true, the orthogonal similarity transformation will be accumulated into
	// Z[iloz:ihiz+1,ilo:ihi+1], otherwise Z will not be referenced.
	//
	// ilo and ihi determine the block of H on which Dlaqr04 operates. It must hold that
	// 0 <= ilo <= ihi < n, if n > 0,
	// ilo == 0 and ihi == -1, if n == 0,
	// and the block must be isolated, that is,
	// ilo == 0 or H[ilo,ilo-1] == 0,
	// ihi == n-1 or H[ihi+1,ihi] == 0,
	// otherwise Dlaqr04 will panic.
	//
	// wr and wi must have length ihi+1.
	//
	// iloz and ihiz specify the rows of Z to which transformations will be applied
	// if wantz is true. It must hold that
	// 0 <= iloz <= ilo, and ihi <= ihiz < n,
	// otherwise Dlaqr04 will panic.
	//
	// work must have length at least lwork and lwork must be
	// lwork >= 1, if n <= 11,
	// lwork >= n, if n > 11,
	// otherwise Dlaqr04 will panic. lwork as large as 6*n may be required for
	// optimal performance. On return, work[0] will contain the optimal value of
	// lwork.
	//
	// If lwork is -1, instead of performing Dlaqr04, the function only estimates the
	// optimal workspace size and stores it into work[0]. Neither h nor z are
	// accessed.
	//
	// recur is the non-negative recursion depth. For recur > 0, Dlaqr04 behaves
	// as DLAQR0, for recur == 0 it behaves as DLAQR4.
	//
	// unconverged indicates whether Dlaqr04 computed all the eigenvalues of H[ilo:ihi+1,ilo:ihi+1].
	//
	// If unconverged is zero and wantt is true, H will contain on return the upper
	// quasi-triangular matrix T from the Schur decomposition. 2×2 diagonal blocks
	// (corresponding to complex conjugate pairs of eigenvalues) will be returned in
	// standard form, with H[i,i] == H[i+1,i+1] and H[i+1,i]*H[i,i+1] < 0.
	//
	// If unconverged is zero and if wantt is false, the contents of h on return is
	// unspecified.
	//
	// If unconverged is zero, all the eigenvalues have been computed and their real
	// and imaginary parts will be stored on return in wr[ilo:ihi+1] and
	// wi[ilo:ihi+1], respectively. If two eigenvalues are computed as a complex
	// conjugate pair, they are stored in consecutive elements of wr and wi, say the
	// i-th and (i+1)th, with wi[i] > 0 and wi[i+1] < 0. If wantt is true, then the
	// eigenvalues are stored in the same order as on the diagonal of the Schur form
	// returned in H, with wr[i] = H[i,i] and, if H[i:i+2,i:i+2] is a 2×2 diagonal
	// block, wi[i] = sqrt(-H[i+1,i]*H[i,i+1]) and wi[i+1] = -wi[i].
	//
	// If unconverged is positive, some eigenvalues have not converged, and
	// wr[unconverged:ihi+1] and wi[unconverged:ihi+1] will contain those
	// eigenvalues which have been successfully computed. Failures are rare.
	//
	// If unconverged is positive and wantt is true, then on return
	// (initial H)U = U(final H), (*)
	// where U is an orthogonal matrix. The final H is upper Hessenberg and
	// H[unconverged:ihi+1,unconverged:ihi+1] is upper quasi-triangular.
	//
	// If unconverged is positive and wantt is false, on return the remaining
	// unconverged eigenvalues are the eigenvalues of the upper Hessenberg matrix
	// H[ilo:unconverged,ilo:unconverged].
	//
	// If unconverged is positive and wantz is true, then on return
	// (final Z) = (initial Z)*U,
	// where U is the orthogonal matrix in (*) regardless of the value of wantt.
	//
	// References:
	// [1] K. Braman, R. Byers, R. Mathias. The Multishift QR Algorithm. Part I:
	// Maintaining Well-Focused Shifts and Level 3 Performance. SIAM J. Matrix
	// Anal. Appl. 23(4) (2002), pp. 929—947
	// URL: http://dx.doi.org/10.1137/S0895479801384573
	// [2] K. Braman, R. Byers, R. Mathias. The Multishift QR Algorithm. Part II:
	// Aggressive Early Deflation. SIAM J. Matrix Anal. Appl. 23(4) (2002), pp. 948—973
	// URL: http://dx.doi.org/10.1137/S0895479801384585
	//
	// Dlaqr04 is an internal routine. It is exported for testing purposes.
	func (impl Implementation) Dlaqr04(wantt, wantz bool, n, ilo, ihi int, h []float64, ldh int, wr, wi []float64, iloz, ihiz int, z []float64, ldz int, work []float64, lwork int, recur int) (unconverged int) {
	const (
	// Matrices of order ntiny or smaller must be processed by
	// Dlahqr because of insufficient subdiagonal scratch space.
	// This is a hard limit.
	ntiny = 11
	// Exceptional deflation windows: try to cure rare slow
	// convergence by varying the size of the deflation window after
	// kexnw iterations.
	kexnw = 5
	// Exceptional shifts: try to cure rare slow convergence with
	// ad-hoc exceptional shifts every kexsh iterations.
	kexsh = 6

	// See https://github.com/gonum/lapack/pull/151#discussion_r68162802
	// and the surrounding discussion for an explanation where these
	// constants come from.
	// TODO(vladimir-ch): Similar constants for exceptional shifts
	// are used also in dlahqr.go. The first constant is different
	// there, it is equal to 3. Why? And does it matter?
	wilk1 = 0.75
	wilk2 = -0.4375
	)

	switch {
	case ilo < 0 \|\| max(0, n-1) < ilo:
	panic(badIlo)
	case ihi < min(ilo, n-1) \|\| n <= ihi:
	panic(badIhi)
	case lwork < 1 && n <= ntiny && lwork != -1:
	panic(badWork)
	// TODO(vladimir-ch): Enable if and when we figure out what the minimum
	// necessary lwork value is. Dlaqr04 says that the minimum is n which
	// clashes with Dlaqr23's opinion about optimal work when nw <= 2
	// (independent of n).
	// case lwork < n && n > ntiny && lwork != -1:
	// panic(badWork)
	case len(work) < lwork:
	panic(shortWork)
	case recur < 0:
	panic("lapack: recur is negative")
	}
	if wantz {
	if iloz < 0 \|\| ilo < iloz {
	panic("lapack: invalid value of iloz")
	}
	if ihiz < ihi \|\| n <= ihiz {
	panic("lapack: invalid value of ihiz")
	}
	}
	if lwork != -1 {
	checkMatrix(n, n, h, ldh)
	if wantz {
	checkMatrix(n, n, z, ldz)
	}
	switch {
	case ilo > 0 && h[ilo*ldh+ilo-1] != 0:
	panic("lapack: block not isolated")
	case ihi+1 < n && h[(ihi+1)*ldh+ihi] != 0:
	panic("lapack: block not isolated")
	case len(wr) != ihi+1:
	panic("lapack: bad length of wr")
	case len(wi) != ihi+1:
	panic("lapack: bad length of wi")
	}
	}

	// Quick return.
	if n == 0 {
	work[0] = 1
	return 0
	}

	if n <= ntiny {
	// Tiny matrices must use Dlahqr.
	work[0] = 1
	if lwork == -1 {
	return 0
	}
	return impl.Dlahqr(wantt, wantz, n, ilo, ihi, h, ldh, wr, wi, iloz, ihiz, z, ldz)
	}

	// Use small bulge multi-shift QR with aggressive early deflation on
	// larger-than-tiny matrices.
	var jbcmpz string
	if wantt {
	jbcmpz = "S"
	} else {
	jbcmpz = "E"
	}
	if wantz {
	jbcmpz += "V"
	} else {
	jbcmpz += "N"
	}

	var fname string
	if recur > 0 {
	fname = "DLAQR0"
	} else {
	fname = "DLAQR4"
	}
	// nwr is the recommended deflation window size. n is greater than 11,
	// so there is enough subdiagonal workspace for nwr >= 2 as required.
	// (In fact, there is enough subdiagonal space for nwr >= 3.)
	// TODO(vladimir-ch): If there is enough space for nwr >= 3, should we
	// use it?
	nwr := impl.Ilaenv(13, fname, jbcmpz, n, ilo, ihi, lwork)
	nwr = max(2, nwr)
	nwr = min(ihi-ilo+1, min((n-1)/3, nwr))

	// nsr is the recommended number of simultaneous shifts. n is greater
	// than 11, so there is enough subdiagonal workspace for nsr to be even
	// and greater than or equal to two as required.
	nsr := impl.Ilaenv(15, fname, jbcmpz, n, ilo, ihi, lwork)
	nsr = min(nsr, min((n+6)/9, ihi-ilo))
	nsr = max(2, nsr&^1)

	// Workspace query call to Dlaqr23.
	impl.Dlaqr23(wantt, wantz, n, ilo, ihi, nwr+1, nil, 0, iloz, ihiz, nil, 0,
	nil, nil, nil, 0, n, nil, 0, n, nil, 0, work, -1, recur)
	// Optimal workspace is max(Dlaqr5, Dlaqr23).
	lwkopt := max(3*nsr/2, int(work[0]))
	// Quick return in case of workspace query.
	if lwork == -1 {
	work[0] = float64(lwkopt)
	return 0
	}

	// Dlahqr/Dlaqr04 crossover point.
	nmin := impl.Ilaenv(12, fname, jbcmpz, n, ilo, ihi, lwork)
	nmin = max(ntiny, nmin)

	// Nibble determines when to skip a multi-shift QR sweep (Dlaqr5).
	nibble := impl.Ilaenv(14, fname, jbcmpz, n, ilo, ihi, lwork)
	nibble = max(0, nibble)

	// Computation mode of far-from-diagonal orthogonal updates in Dlaqr5.
	kacc22 := impl.Ilaenv(16, fname, jbcmpz, n, ilo, ihi, lwork)
	kacc22 = max(0, min(kacc22, 2))

	// nwmax is the largest possible deflation window for which there is
	// sufficient workspace.
	nwmax := min((n-1)/3, lwork/2)
	nw := nwmax // Start with maximum deflation window size.

	// nsmax is the largest number of simultaneous shifts for which there is
	// sufficient workspace.
	nsmax := min((n+6)/9, 2*lwork/3) &^ 1

	ndfl := 1 // Number of iterations since last deflation.
	ndec := 0 // Deflation window size decrement.

	// Main loop.
	var (
	itmax = max(30, 2kexsh) max(10, (ihi-ilo+1))
	it = 0
	)
	for kbot := ihi; kbot >= ilo; {
	if it == itmax {
	unconverged = kbot + 1
	break
	}
	it++

	// Locate active block.
	ktop := ilo
	for k := kbot; k >= ilo+1; k-- {
	if h[k*ldh+k-1] == 0 {
	ktop = k
	break
	}
	}

	// Select deflation window size nw.
	//
	// Typical Case:
	// If possible and advisable, nibble the entire active block.
	// If not, use size min(nwr,nwmax) or min(nwr+1,nwmax)
	// depending upon which has the smaller corresponding
	// subdiagonal entry (a heuristic).
	//
	// Exceptional Case:
	// If there have been no deflations in kexnw or more
	// iterations, then vary the deflation window size. At first,
	// because larger windows are, in general, more powerful than
	// smaller ones, rapidly increase the window to the maximum
	// possible. Then, gradually reduce the window size.
	nh := kbot - ktop + 1
	nwupbd := min(nh, nwmax)
	if ndfl < kexnw {
	nw = min(nwupbd, nwr)
	} else {
	nw = min(nwupbd, 2*nw)
	}
	if nw < nwmax {
	if nw >= nh-1 {
	nw = nh
	} else {
	kwtop := kbot - nw + 1
	if math.Abs(h[kwtopldh+kwtop-1]) > math.Abs(h[(kwtop-1)ldh+kwtop-2]) {
	nw++
	}
	}
	}
	if ndfl < kexnw {
	ndec = -1
	} else if ndec >= 0 \|\| nw >= nwupbd {
	ndec++
	if nw-ndec < 2 {
	ndec = 0
	}
	nw -= ndec
	}

	// Split workspace under the subdiagonal of H into:
	// - an nw×nw work array V in the lower left-hand corner,
	// - an nw×nhv horizontal work array along the bottom edge (nhv
	// must be at least nw but more is better),
	// - an nve×nw vertical work array along the left-hand-edge
	// (nhv can be any positive integer but more is better).
	kv := n - nw
	kt := nw
	kwv := nw + 1
	nhv := n - kwv - kt
	// Aggressive early deflation.
	ls, ld := impl.Dlaqr23(wantt, wantz, n, ktop, kbot, nw,
	h, ldh, iloz, ihiz, z, ldz, wr[:kbot+1], wi[:kbot+1],
	h[kvldh:], ldh, nhv, h[kvldh+kt:], ldh, nhv, h[kwv*ldh:], ldh, work, lwork, recur)

	// Adjust kbot accounting for new deflations.
	kbot -= ld
	// ks points to the shifts.
	ks := kbot - ls + 1

	// Skip an expensive QR sweep if there is a (partly heuristic)
	// reason to expect that many eigenvalues will deflate without
	// it. Here, the QR sweep is skipped if many eigenvalues have
	// just been deflated or if the remaining active block is small.
	if ld > 0 && (100ld > nwnibble \|\| kbot-ktop+1 <= min(nmin, nwmax)) {
	// ld is positive, note progress.
	ndfl = 1
	continue
	}

	// ns is the nominal number of simultaneous shifts. This may be
	// lowered (slightly) if Dlaqr23 did not provide that many
	// shifts.
	ns := min(min(nsmax, nsr), max(2, kbot-ktop)) &^ 1

	// If there have been no deflations in a multiple of kexsh
	// iterations, then try exceptional shifts. Otherwise use shifts
	// provided by Dlaqr23 above or from the eigenvalues of a
	// trailing principal submatrix.
	if ndfl%kexsh == 0 {
	ks = kbot - ns + 1
	for i := kbot; i > max(ks, ktop+1); i -= 2 {
	ss := math.Abs(h[ildh+i-1]) + math.Abs(h[(i-1)ldh+i-2])
	aa := wilk1ss + h[ildh+i]
	_, _, _, _, wr[i-1], wi[i-1], wr[i], wi[i], _, _ =
	impl.Dlanv2(aa, ss, wilk2*ss, aa)
	}
	if ks == ktop {
	wr[ks+1] = h[(ks+1)*ldh+ks+1]
	wi[ks+1] = 0
	wr[ks] = wr[ks+1]
	wi[ks] = wi[ks+1]
	}
	} else {
	// If we got ns/2 or fewer shifts, use Dlahqr or recur
	// into Dlaqr04 on a trailing principal submatrix to get
	// more. Since ns <= nsmax <=(n+6)/9, there is enough
	// space below the subdiagonal to fit an ns×ns scratch
	// array.
	if kbot-ks+1 <= ns/2 {
	ks = kbot - ns + 1
	kt = n - ns
	impl.Dlacpy(blas.All, ns, ns, h[ksldh+ks:], ldh, h[ktldh:], ldh)
	if ns > nmin && recur > 0 {
	ks += impl.Dlaqr04(false, false, ns, 1, ns-1, h[kt*ldh:], ldh,
	wr[ks:ks+ns], wi[ks:ks+ns], 0, 0, nil, 0, work, lwork, recur-1)
	} else {
	ks += impl.Dlahqr(false, false, ns, 0, ns-1, h[kt*ldh:], ldh,
	wr[ks:ks+ns], wi[ks:ks+ns], 0, 0, nil, 0)
	}
	// In case of a rare QR failure use eigenvalues
	// of the trailing 2×2 principal submatrix.
	if ks >= kbot {
	aa := h[(kbot-1)*ldh+kbot-1]
	bb := h[(kbot-1)*ldh+kbot]
	cc := h[kbot*ldh+kbot-1]
	dd := h[kbot*ldh+kbot]
	_, _, _, _, wr[kbot-1], wi[kbot-1], wr[kbot], wi[kbot], _, _ =
	impl.Dlanv2(aa, bb, cc, dd)
	ks = kbot - 1
	}
	}

	if kbot-ks+1 > ns {
	// Sorting the shifts helps a little. Bubble
	// sort keeps complex conjugate pairs together.
	sorted := false
	for k := kbot; k > ks; k-- {
	if sorted {
	break
	}
	sorted = true
	for i := ks; i < k; i++ {
	if math.Abs(wr[i])+math.Abs(wi[i]) >= math.Abs(wr[i+1])+math.Abs(wi[i+1]) {
	continue
	}
	sorted = false
	wr[i], wr[i+1] = wr[i+1], wr[i]
	wi[i], wi[i+1] = wi[i+1], wi[i]
	}
	}
	}

	// Shuffle shifts into pairs of real shifts and pairs of
	// complex conjugate shifts using the fact that complex
	// conjugate shifts are already adjacent to one another.
	// TODO(vladimir-ch): The shuffling here could probably
	// be removed but I'm not sure right now and it's safer
	// to leave it.
	for i := kbot; i > ks+1; i -= 2 {
	if wi[i] == -wi[i-1] {
	continue
	}
	wr[i], wr[i-1], wr[i-2] = wr[i-1], wr[i-2], wr[i]
	wi[i], wi[i-1], wi[i-2] = wi[i-1], wi[i-2], wi[i]
	}
	}

	// If there are only two shifts and both are real, then use only one.
	if kbot-ks+1 == 2 && wi[kbot] == 0 {
	if math.Abs(wr[kbot]-h[kbotldh+kbot]) < math.Abs(wr[kbot-1]-h[kbotldh+kbot]) {
	wr[kbot-1] = wr[kbot]
	} else {
	wr[kbot] = wr[kbot-1]
	}
	}

	// Use up to ns of the the smallest magnitude shifts. If there
	// aren't ns shifts available, then use them all, possibly
	// dropping one to make the number of shifts even.
	ns = min(ns, kbot-ks+1) &^ 1
	ks = kbot - ns + 1

	// Split workspace under the subdiagonal into:
	// - a kdu×kdu work array U in the lower left-hand-corner,
	// - a kdu×nhv horizontal work array WH along the bottom edge
	// (nhv must be at least kdu but more is better),
	// - an nhv×kdu vertical work array WV along the left-hand-edge
	// (nhv must be at least kdu but more is better).
	kdu := 3*ns - 3
	ku := n - kdu
	kwh := kdu
	kwv = kdu + 3
	nhv = n - kwv - kdu
	// Small-bulge multi-shift QR sweep.
	impl.Dlaqr5(wantt, wantz, kacc22, n, ktop, kbot, ns,
	wr[ks:ks+ns], wi[ks:ks+ns], h, ldh, iloz, ihiz, z, ldz,
	work, 3, h[kuldh:], ldh, nhv, h[kwvldh:], ldh, nhv, h[ku*ldh+kwh:], ldh)

	// Note progress (or the lack of it).
	if ld > 0 {
	ndfl = 1
	} else {
	ndfl++
	}
	}

	work[0] = float64(lwkopt)
	return unconverged
	}