Move the matrix stage of SkLinearBitmapPipeline over to using SkSmallAllocator.

The rest of the stages will follow. When all stages are completed,
this should significantly reduce stack use in the typical case.

This is a step in removing the baroque stage system and moving towards
a SkRasterPipeline stage system.

BUG=skia:
GOLD_TRYBOT_URL= https://gold.skia.org/search?issue=2484273002

Review-Url: https://codereview.chromium.org/2484273002
diff --git a/src/core/SkLinearBitmapPipeline.cpp b/src/core/SkLinearBitmapPipeline.cpp
index d6634bc..ca42e02 100644
--- a/src/core/SkLinearBitmapPipeline.cpp
+++ b/src/core/SkLinearBitmapPipeline.cpp
@@ -88,9 +88,9 @@
         : fNext{next}
         , fStrategy{std::forward<Args>(args)...}{ }
 
-    MatrixStage(Next* next, const MatrixStage& stage)
+    MatrixStage(Next* next, MatrixStage* stage)
         : fNext{next}
-        , fStrategy{stage.fStrategy} { }
+        , fStrategy{stage->fStrategy} { }
 
     void SK_VECTORCALL pointListFew(int n, Sk4s xs, Sk4s ys) override {
         fStrategy.processPoints(&xs, &ys);
@@ -128,39 +128,6 @@
 using PerspectiveMatrix = MatrixStage<PerspectiveMatrixStrategy, Next>;
 
 
-static SkLinearBitmapPipeline::PointProcessorInterface* choose_matrix(
-    SkLinearBitmapPipeline::PointProcessorInterface* next,
-    const SkMatrix& inverse,
-    SkLinearBitmapPipeline::MatrixStage* matrixProc) {
-    if (inverse.hasPerspective()) {
-        matrixProc->initStage<PerspectiveMatrix<>>(
-            next,
-            SkVector{inverse.getTranslateX(), inverse.getTranslateY()},
-            SkVector{inverse.getScaleX(), inverse.getScaleY()},
-            SkVector{inverse.getSkewX(), inverse.getSkewY()},
-            SkVector{inverse.getPerspX(), inverse.getPerspY()},
-            inverse.get(SkMatrix::kMPersp2));
-    } else if (inverse.getSkewX() != 0.0f || inverse.getSkewY() != 0.0f) {
-        matrixProc->initStage<AffineMatrix<>>(
-            next,
-            SkVector{inverse.getTranslateX(), inverse.getTranslateY()},
-            SkVector{inverse.getScaleX(), inverse.getScaleY()},
-            SkVector{inverse.getSkewX(), inverse.getSkewY()});
-    } else if (inverse.getScaleX() != 1.0f || inverse.getScaleY() != 1.0f) {
-        matrixProc->initStage<ScaleMatrix<>>(
-            next,
-            SkVector{inverse.getTranslateX(), inverse.getTranslateY()},
-            SkVector{inverse.getScaleX(), inverse.getScaleY()});
-    } else if (inverse.getTranslateX() != 0.0f || inverse.getTranslateY() != 0.0f) {
-        matrixProc->initStage<TranslateMatrix<>>(
-            next,
-            SkVector{inverse.getTranslateX(), inverse.getTranslateY()});
-    } else {
-        return next;
-    }
-    return matrixProc->get();
-}
-
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Tile Stage
 
@@ -658,7 +625,7 @@
         srcPixmap, paintColor, &fSampleStage, &fAccessor);
     auto tilerStage   = choose_tiler(samplerStage, dimensions, xTile, yTile,
                                      filterQuality, dx, &fTileStage);
-    fFirstStage       = choose_matrix(tilerStage, adjustedInverse, &fMatrixStage);
+    fFirstStage       = ChooseMatrix(tilerStage, adjustedInverse);
     fLastStage        = blenderStage;
 }
 
@@ -720,8 +687,7 @@
     auto sampleStage = fSampleStage.get();
     auto tilerStage = pipeline.fTileStage.cloneStageTo(sampleStage, &fTileStage);
     tilerStage = (tilerStage != nullptr) ? tilerStage : sampleStage;
-    auto matrixStage = pipeline.fMatrixStage.cloneStageTo(tilerStage, &fMatrixStage);
-    matrixStage = (matrixStage != nullptr) ? matrixStage : tilerStage;
+    auto matrixStage = pipeline.fMatrixStageCloner(tilerStage, &fMemory);
     fFirstStage = matrixStage;
 }
 
@@ -740,3 +706,56 @@
     // first pixel to the center of the last pixel. This implies that length is count-1.
     fFirstStage->pointSpan(Span{{x + 0.5f, y + 0.5f}, count - 1.0f, count});
 }
+
+SkLinearBitmapPipeline::PointProcessorInterface*  // Builds the matrix stage in fMemory.
+SkLinearBitmapPipeline::ChooseMatrix(PointProcessorInterface* next, const SkMatrix& inverse) {
+    if (inverse.hasPerspective()) {  // Perspective: offset, scale, skew and persp terms.
+        auto matrixStage = fMemory.createT<PerspectiveMatrix<>>(
+            next,
+            SkVector{inverse.getTranslateX(), inverse.getTranslateY()},
+            SkVector{inverse.getScaleX(), inverse.getScaleY()},
+            SkVector{inverse.getSkewX(), inverse.getSkewY()},
+            SkVector{inverse.getPerspX(), inverse.getPerspY()},
+            inverse.get(SkMatrix::kMPersp2));
+        fMatrixStageCloner =  // Captures the raw stage pointer; clone copies its strategy.
+            [matrixStage](PointProcessorInterface* cloneNext, MemoryAllocator* memory) {
+                return memory->createT<PerspectiveMatrix<>>(cloneNext, matrixStage);
+            };
+        return matrixStage;
+    } else if (inverse.getSkewX() != 0.0f || inverse.getSkewY() != 0.0f) {  // Any skew: affine.
+        auto matrixStage = fMemory.createT<AffineMatrix<>>(
+            next,
+            SkVector{inverse.getTranslateX(), inverse.getTranslateY()},
+            SkVector{inverse.getScaleX(), inverse.getScaleY()},
+            SkVector{inverse.getSkewX(), inverse.getSkewY()});
+        fMatrixStageCloner =  // Same cloning scheme as the perspective branch.
+            [matrixStage](PointProcessorInterface* cloneNext, MemoryAllocator* memory) {
+                return memory->createT<AffineMatrix<>>(cloneNext, matrixStage);
+            };
+        return matrixStage;
+    } else if (inverse.getScaleX() != 1.0f || inverse.getScaleY() != 1.0f) {  // Scale + offset.
+        auto matrixStage = fMemory.createT<ScaleMatrix<>>(
+            next,
+            SkVector{inverse.getTranslateX(), inverse.getTranslateY()},
+            SkVector{inverse.getScaleX(), inverse.getScaleY()});
+        fMatrixStageCloner =  // Same cloning scheme as the perspective branch.
+            [matrixStage](PointProcessorInterface* cloneNext, MemoryAllocator* memory) {
+                return memory->createT<ScaleMatrix<>>(cloneNext, matrixStage);
+            };
+        return matrixStage;
+    } else if (inverse.getTranslateX() != 0.0f || inverse.getTranslateY() != 0.0f) {  // Offset.
+        auto matrixStage = fMemory.createT<TranslateMatrix<>>(
+            next,
+            SkVector{inverse.getTranslateX(), inverse.getTranslateY()});
+        fMatrixStageCloner =  // Same cloning scheme as the perspective branch.
+            [matrixStage](PointProcessorInterface* cloneNext, MemoryAllocator* memory) {
+                return memory->createT<TranslateMatrix<>>(cloneNext, matrixStage);
+            };
+        return matrixStage;
+    } else {  // None of the tests above fired: no matrix stage is created.
+        fMatrixStageCloner = [](PointProcessorInterface* cloneNext, MemoryAllocator* memory) {
+            return cloneNext;  // Clone allocates nothing; it passes the next stage through.
+        };
+        return next;  // 'next' itself is handed back in place of a matrix stage.
+    }
+}
diff --git a/src/core/SkLinearBitmapPipeline.h b/src/core/SkLinearBitmapPipeline.h
index 4436de9..ea78489 100644
--- a/src/core/SkLinearBitmapPipeline.h
+++ b/src/core/SkLinearBitmapPipeline.h
@@ -12,6 +12,7 @@
 #include "SkImageInfo.h"
 #include "SkMatrix.h"
 #include "SkShader.h"
+#include "SkSmallAllocator.h"
 
 class SkEmbeddableLinearPipeline;
 
@@ -125,15 +126,23 @@
     class PixelAccessorInterface;
 
     // These values were generated by the assert above in Stage::init{Sink|Stage}.
-    using MatrixStage  = Stage<PointProcessorInterface,     56, PointProcessorInterface>;
     using TileStage    = Stage<PointProcessorInterface,     48, SampleProcessorInterface>;
     using SampleStage  = Stage<SampleProcessorInterface,   160, BlendProcessorInterface>;
     using BlenderStage = Stage<BlendProcessorInterface,     48>;
     using Accessor     = PolyMemory<PixelAccessorInterface, 64>;
 
 private:
+    PointProcessorInterface* ChooseMatrix(
+        PointProcessorInterface* next,
+        const SkMatrix& inverse);
+
+    using MemoryAllocator = SkSmallAllocator<64, 1>;
+    using MatrixCloner =
+        std::function<PointProcessorInterface* (PointProcessorInterface*, MemoryAllocator*)>;
+
+    MemoryAllocator          fMemory;
     PointProcessorInterface* fFirstStage;
-    MatrixStage              fMatrixStage;
+    MatrixCloner             fMatrixStageCloner;
     TileStage                fTileStage;
     SampleStage              fSampleStage;
     BlenderStage             fBlenderStage;
diff --git a/src/core/SkLinearBitmapPipeline_matrix.h b/src/core/SkLinearBitmapPipeline_matrix.h
index 2eb475d..78f7231 100644
--- a/src/core/SkLinearBitmapPipeline_matrix.h
+++ b/src/core/SkLinearBitmapPipeline_matrix.h
@@ -17,13 +17,13 @@
         : fXOffset{X(offset)}
         , fYOffset{Y(offset)} { }
 
-    void processPoints(Sk4s* xs, Sk4s* ys) {
+    void processPoints(Sk4s* xs, Sk4s* ys) const {
         *xs = *xs + fXOffset;
         *ys = *ys + fYOffset;
     }
 
     template <typename Next>
-    bool maybeProcessSpan(Span span, Next* next) {
+    bool maybeProcessSpan(Span span, Next* next) const {
         SkPoint start; SkScalar length; int count;
         std::tie(start, length, count) = span;
         next->pointSpan(Span{start + SkPoint{fXOffset, fYOffset}, length, count});
@@ -39,13 +39,13 @@
     ScaleMatrixStrategy(SkVector offset, SkVector scale)
         : fXOffset{X(offset)}, fYOffset{Y(offset)}
         ,  fXScale{X(scale)},   fYScale{Y(scale)} { }
-    void processPoints(Sk4s* xs, Sk4s* ys) {
+    void processPoints(Sk4s* xs, Sk4s* ys) const {
         *xs = *xs * fXScale + fXOffset;
         *ys = *ys * fYScale + fYOffset;
     }
 
     template <typename Next>
-    bool maybeProcessSpan(Span span, Next* next) {
+    bool maybeProcessSpan(Span span, Next* next) const {
         SkPoint start; SkScalar length; int count;
         std::tie(start, length, count) = span;
         SkPoint newStart =
@@ -66,7 +66,7 @@
         : fXOffset{X(offset)}, fYOffset{Y(offset)}
         , fXScale{X(scale)},   fYScale{Y(scale)}
         , fXSkew{X(skew)},     fYSkew{Y(skew)} { }
-    void processPoints(Sk4s* xs, Sk4s* ys) {
+    void processPoints(Sk4s* xs, Sk4s* ys) const {
         Sk4s newXs = fXScale * *xs +  fXSkew * *ys + fXOffset;
         Sk4s newYs =  fYSkew * *xs + fYScale * *ys + fYOffset;
 
@@ -75,7 +75,7 @@
     }
 
     template <typename Next>
-    bool maybeProcessSpan(Span span, Next* next) {
+    bool maybeProcessSpan(Span span, Next* next) const {
         return false;
     }
 
@@ -92,7 +92,7 @@
         : fXOffset{X(offset)}, fYOffset{Y(offset)}, fZOffset{zOffset}
         , fXScale{X(scale)},   fYScale{Y(scale)}
         , fXSkew{X(skew)},     fYSkew{Y(skew)}, fZXSkew{X(zSkew)}, fZYSkew{Y(zSkew)} { }
-    void processPoints(Sk4s* xs, Sk4s* ys) {
+    void processPoints(Sk4s* xs, Sk4s* ys) const {
         Sk4s newXs = fXScale * *xs +  fXSkew * *ys + fXOffset;
         Sk4s newYs =  fYSkew * *xs + fYScale * *ys + fYOffset;
         Sk4s newZs =  fZXSkew * *xs + fZYSkew * *ys + fZOffset;
@@ -102,7 +102,7 @@
     }
 
     template <typename Next>
-    bool maybeProcessSpan(Span span, Next* next) {
+    bool maybeProcessSpan(Span span, Next* next) const {
         return false;
     }