Support SkImageShader in SkRasterPipeline blitter

First of many CLs, I'm sure.

This handles only RGBA/BGRA 8888 sources (linear or sRGB-encoded) with an affine matrix, clamp/clamp tiling, and nearest-neighbor sampling; onAppendStages() returns false for everything else.

GOLD_TRYBOT_URL= https://gold.skia.org/search?issue=4906
CQ_INCLUDE_TRYBOTS=master.client.skia:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD-Trybot

Change-Id: I99f7508852b3d44b6f52f7a0bee29a793af35c48
Reviewed-on: https://skia-review.googlesource.com/4906
Reviewed-by: Herb Derby <herb@google.com>
Commit-Queue: Mike Klein <mtklein@chromium.org>
diff --git a/src/core/SkRasterPipeline.h b/src/core/SkRasterPipeline.h
index b361ab6..6345f11 100644
--- a/src/core/SkRasterPipeline.h
+++ b/src/core/SkRasterPipeline.h
@@ -71,10 +71,14 @@
     M(clear) M(modulate) M(multiply) M(plus_) M(screen) M(xor_)  \
     M(colorburn) M(colordodge) M(darken) M(difference)           \
     M(exclusion) M(hardlight) M(lighten) M(overlay) M(softlight) \
-    M(luminance_to_alpha) M(matrix_3x4) M(matrix_4x5)            \
+    M(luminance_to_alpha)                                        \
+    M(matrix_2x3) M(matrix_3x4) M(matrix_4x5)                    \
     M(parametric_r) M(parametric_g) M(parametric_b)              \
     M(table_r) M(table_g) M(table_b)                             \
-    M(color_lookup_table) M(lab_to_xyz) M(swap_rb)
+    M(color_lookup_table) M(lab_to_xyz) M(swap_rb)               \
+    M(clamp_x) M(mirror_x) M(repeat_x)                           \
+    M(clamp_y) M(mirror_y) M(repeat_y)                           \
+    M(nearest_565) M(nearest_8888) M(nearest_srgb) M(nearest_f16)
 
 class SkRasterPipeline {
 public:
diff --git a/src/image/SkImageShader.cpp b/src/image/SkImageShader.cpp
index 8f8f6a3..152007d 100644
--- a/src/image/SkImageShader.cpp
+++ b/src/image/SkImageShader.cpp
@@ -10,8 +10,10 @@
 #include "SkColorShader.h"
 #include "SkColorTable.h"
 #include "SkEmptyShader.h"
+#include "SkFixedAlloc.h"
 #include "SkImage_Base.h"
 #include "SkImageShader.h"
+#include "SkPM4fPriv.h"
 #include "SkReadBuffer.h"
 #include "SkWriteBuffer.h"
 
@@ -92,7 +94,7 @@
     // widen that, we have to reject bitmaps that are larger.
     //
     static const int kMaxSize = 65535;
-    
+
     return w > kMaxSize || h > kMaxSize;
 }
 
@@ -103,16 +105,16 @@
     // HWUI does not support color shaders (see b/22390304)
     return false;
 #endif
-    
+
     if (1 != image->width() || 1 != image->height()) {
         return false;
     }
-    
+
     SkPixmap pmap;
     if (!image->peekPixels(&pmap)) {
         return false;
     }
-    
+
     switch (pmap.colorType()) {
         case kN32_SkColorType:
             *color = SkUnPreMultiply::PMColorToColor(*pmap.addr32(0, 0));
@@ -263,3 +265,112 @@
 SkFlattenable::Register("SkBitmapProcShader", SkBitmapProcShader_CreateProc, kSkShader_Type);
 SK_DEFINE_FLATTENABLE_REGISTRAR_GROUP_END
 
+// Appends raster-pipeline stages that sample fImage, or returns false if the case is unsupported.
+bool SkImageShader::onAppendStages(SkRasterPipeline* p, SkColorSpace* dst, SkFallbackAlloc* scratch,
+                                   const SkMatrix& ctm, SkFilterQuality quality) const {
+    SkPixmap pm;
+    if (!fImage->peekPixels(&pm)) {
+        return false;
+    }
+    auto info = pm.info();
+    // Invert (ctm * local matrix): the matrix_2x3 stage appended below applies the result to (r,g),
+    // which the clamp_* and nearest_* stages then treat as image x,y.
+    auto matrix = SkMatrix::Concat(ctm, this->getLocalMatrix());
+    if (!matrix.invert(&matrix)) {
+        return false;
+    }
+
+    // TODO: perspective
+    if (!matrix.asAffine(nullptr)) {
+        return false;
+    }
+
+    // TODO: all formats
+    switch (info.colorType()) {
+        case kRGBA_8888_SkColorType:
+        case kBGRA_8888_SkColorType:
+//      case   kRGB_565_SkColorType:
+//      case  kRGBA_F16_SkColorType:
+            break;
+        default: return false;
+    }
+
+    // TODO: all tile modes
+    if (fTileModeX != kClamp_TileMode || fTileModeY != kClamp_TileMode) {
+        return false;
+    }
+
+    // TODO: bilerp
+    if (quality != kNone_SkFilterQuality) {
+        return false;
+    }
+
+    // TODO: mtklein doesn't understand why we do this: nudge translate one ULP toward its floor.
+    if (quality == kNone_SkFilterQuality) {  // always true today; see the bilerp check above
+        if (matrix.getScaleX() >= 0) {
+            matrix.setTranslateX(nextafterf(matrix.getTranslateX(),
+                                            floorf(matrix.getTranslateX())));
+        }
+        if (matrix.getScaleY() >= 0) {
+            matrix.setTranslateY(nextafterf(matrix.getTranslateY(),
+                                            floorf(matrix.getTranslateY())));
+        }
+    }
+
+    struct context {  // (pixels, stride) prefix must match NearestCtx in SkRasterPipeline_opts.h
+        const void* pixels;
+        int         stride;  // row pitch in pixels, not bytes
+        int         width;   // read by the *_x tiling stages
+        int         height;  // read by the *_y tiling stages
+        float       matrix[6];  // filled by SkMatrix::asAffine() below
+    };
+    auto ctx = scratch->make<context>();
+
+    ctx->pixels   = pm.addr();
+    ctx->stride   = pm.rowBytesAsPixels();
+    ctx->width    = pm.width();
+    ctx->height   = pm.height();
+    SkAssertResult(matrix.asAffine(ctx->matrix));  // affine-ness was already checked above
+
+    p->append(SkRasterPipeline::matrix_2x3, &ctx->matrix);
+
+    switch (fTileModeX) {  // only the clamp case is reachable until the tile-mode check is relaxed
+        case kClamp_TileMode:  p->append(SkRasterPipeline::clamp_x,  &ctx->width); break;
+        case kMirror_TileMode: p->append(SkRasterPipeline::mirror_x, &ctx->width); break;
+        case kRepeat_TileMode: p->append(SkRasterPipeline::repeat_x, &ctx->width); break;
+    }
+    switch (fTileModeY) {  // likewise, only clamp is reachable for now
+        case kClamp_TileMode:  p->append(SkRasterPipeline::clamp_y,  &ctx->height); break;
+        case kMirror_TileMode: p->append(SkRasterPipeline::mirror_y, &ctx->height); break;
+        case kRepeat_TileMode: p->append(SkRasterPipeline::repeat_y, &ctx->height); break;
+    }
+
+    switch(info.colorType()) {  // F16 and 565 are rejected above, so only 8888 is reachable today
+        case kRGBA_8888_SkColorType:
+        case kBGRA_8888_SkColorType:
+            if (info.gammaCloseToSRGB() && dst) {  // decode sRGB only when dst is non-null
+                p->append(SkRasterPipeline::nearest_srgb, ctx);
+            } else {
+                p->append(SkRasterPipeline::nearest_8888, ctx);
+            }
+            break;
+        case kRGBA_F16_SkColorType:
+            p->append(SkRasterPipeline::nearest_f16, ctx);
+            break;
+        case kRGB_565_SkColorType:
+            p->append(SkRasterPipeline::nearest_565, ctx);
+            break;
+
+        default:
+            SkASSERT(false);
+            break;
+    }
+
+    if (info.colorType() == kBGRA_8888_SkColorType) {  // nearest_* unpack the low byte into r
+        p->append(SkRasterPipeline::swap_rb);
+    }
+    if (info.alphaType() == kUnpremul_SkAlphaType) {
+        p->append(SkRasterPipeline::premul);
+    }
+    return append_gamut_transform(p, scratch, info.colorSpace(), dst);
+}
diff --git a/src/image/SkImageShader.h b/src/image/SkImageShader.h
index 8905881..e3bab5c 100644
--- a/src/image/SkImageShader.h
+++ b/src/image/SkImageShader.h
@@ -37,6 +37,9 @@
 #endif
     SkImage* onIsAImage(SkMatrix*, TileMode*) const override;
 
+    bool onAppendStages(SkRasterPipeline*, SkColorSpace*, SkFallbackAlloc*,
+                        const SkMatrix& ctm, SkFilterQuality) const override;
+
     sk_sp<SkImage>  fImage;
     const TileMode  fTileModeX;
     const TileMode  fTileModeY;
diff --git a/src/opts/SkNx_neon.h b/src/opts/SkNx_neon.h
index c85d583..3f96375 100644
--- a/src/opts/SkNx_neon.h
+++ b/src/opts/SkNx_neon.h
@@ -514,10 +514,13 @@
     return vqmovn_u16(vcombine_u16(_16, _16));
 }
 
+template<> AI /*static*/ Sk4i SkNx_cast<int32_t, uint8_t>(const Sk4b& src) {
+    uint16x8_t _16 = vmovl_u8(src.fVec);  // zero-extend u8 -> u16
+    return vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(_16)));  // low 4 lanes -> u32, typed as s32
+}
+
 template<> AI /*static*/ Sk4f SkNx_cast<float, uint8_t>(const Sk4b& src) {
-    uint16x8_t _16 = vmovl_u8 (src.fVec) ;
-    uint32x4_t _32 = vmovl_u16(vget_low_u16(_16));
-    return vcvtq_f32_u32(_32);
+    return vcvtq_f32_s32(SkNx_cast<int32_t>(src).fVec);  // lanes are 0..255, so s32 == u32 here
 }
 
 template<> AI /*static*/ Sk16b SkNx_cast<uint8_t, float>(const Sk16f& src) {
diff --git a/src/opts/SkNx_sse.h b/src/opts/SkNx_sse.h
index a4783c6..3410741 100644
--- a/src/opts/SkNx_sse.h
+++ b/src/opts/SkNx_sse.h
@@ -636,8 +636,12 @@
         return _mm256_fmadd_ps(a.fVec, b.fVec, c.fVec);
     }
 
+    template<> AI /*static*/ Sk8i SkNx_cast<int>(const Sk8b& src) {
+        return _mm256_cvtepu8_epi32(src.fVec);  // zero-extend 8 bytes to 8 32-bit lanes
+    }
+
     template<> AI /*static*/ Sk8f SkNx_cast<float>(const Sk8b& src) {
-        return _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(src.fVec));
+        return _mm256_cvtepi32_ps(SkNx_cast<int>(src).fVec);  // widen to int32, then to float
     }
 
     template<> AI /*static*/ Sk8f SkNx_cast<float>(const Sk8i& src) {
@@ -700,15 +704,18 @@
 #endif
 }
 
-template<> AI /*static*/ Sk4f SkNx_cast<float, uint8_t>(const Sk4b& src) {
+template<> AI /*static*/ Sk4i SkNx_cast<int32_t, uint8_t>(const Sk4b& src) {  // 4 bytes -> 4 ints
 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
     const int _ = ~0;
-    auto _32 = _mm_shuffle_epi8(src.fVec, _mm_setr_epi8(0,_,_,_, 1,_,_,_, 2,_,_,_, 3,_,_,_));
+    return _mm_shuffle_epi8(src.fVec, _mm_setr_epi8(0,_,_,_, 1,_,_,_, 2,_,_,_, 3,_,_,_));
 #else
-    auto _16 = _mm_unpacklo_epi8(src.fVec, _mm_setzero_si128()),
-         _32 = _mm_unpacklo_epi16(_16,     _mm_setzero_si128());
+    auto _16 = _mm_unpacklo_epi8(src.fVec, _mm_setzero_si128());  // interleave with zeros: 8 -> 16
+    return _mm_unpacklo_epi16(_16, _mm_setzero_si128());          // and again: 16 -> 32 bits
 #endif
-    return _mm_cvtepi32_ps(_32);
+}
+
+template<> AI /*static*/ Sk4f SkNx_cast<float, uint8_t>(const Sk4b& src) {
+    return _mm_cvtepi32_ps(SkNx_cast<int32_t>(src).fVec);  // widen to int32 first, then to float
+}
 
 template<> AI /*static*/ Sk4f SkNx_cast<float, uint16_t>(const Sk4h& src) {
diff --git a/src/opts/SkRasterPipeline_opts.h b/src/opts/SkRasterPipeline_opts.h
index f1aa250..39366b3 100644
--- a/src/opts/SkRasterPipeline_opts.h
+++ b/src/opts/SkRasterPipeline_opts.h
@@ -31,6 +31,7 @@
     using SkNf = SkNx<N, float>;
     using SkNi = SkNx<N, int>;
     using SkNh = SkNx<N, uint16_t>;
+    using SkNb = SkNx<N, uint8_t>;
 
     struct BodyStage;
     struct TailStage;
@@ -548,6 +549,16 @@
     r = g = b = 0;
 }
 
+STAGE(matrix_2x3, true) {
+    auto m = (const float*)ctx;
+    // m is column-major: (r,g) -> (r*m[0] + g*m[2] + m[4], r*m[1] + g*m[3] + m[5]); b,a not written.
+    auto fma = [](const SkNf& f, const SkNf& m, const SkNf& a) { return SkNx_fma(f,m,a); };
+    auto R = fma(r,m[0], fma(g,m[2], m[4])),
+         G = fma(r,m[1], fma(g,m[3], m[5]));
+    r = R;  // assign only after both results are computed, since each reads the old r and g
+    g = G;
+}
+
 STAGE(matrix_3x4, true) {
     auto m = (const float*)ctx;
 
@@ -659,6 +670,75 @@
     SkTSwap(r, b);
 }
 
+STAGE(clamp_x, true) {
+    auto w = *(const int*)ctx;  // ctx points at an int; SkImageShader passes the image width
+    r = SkNf::Max(0, SkNf::Min(r, SkNf(w - 0.5f)));  // clamp r to [0, w - 0.5]
+}
+STAGE(clamp_y, true) {
+    auto h = *(const int*)ctx;  // ctx points at an int; SkImageShader passes the image height
+    g = SkNf::Max(0, SkNf::Min(g, SkNf(h - 0.5f)));  // clamp g to [0, h - 0.5]
+}
+
+STAGE(mirror_x, true) {}  // TODO: no-op stub; SkImageShader currently rejects non-clamp tiling
+STAGE(mirror_y, true) {}  // TODO: no-op stub
+
+STAGE(repeat_x, true) {}  // TODO: no-op stub
+STAGE(repeat_y, true) {}  // TODO: no-op stub
+
+struct NearestCtx {  // must match the start of `context` in SkImageShader::onAppendStages()
+    const void* pixels;  // base of the source pixels; the 8888/srgb stages read it as uint32_t
+    int         stride;  // row pitch in pixels (multiplied by y to form a pixel index)
+};
+
+STAGE(nearest_565, true) {}  // TODO: no-op stub; SkImageShader currently rejects 565 sources
+STAGE(nearest_f16, true) {}  // TODO: no-op stub; SkImageShader currently rejects F16 sources
+
+STAGE(nearest_8888, true) {
+    auto nc = (const NearestCtx*)ctx;
+    // r,g are used as image x,y here; convert them to integer pixel coordinates.
+    SkNi ix = SkNx_cast<int>(r),
+         iy = SkNx_cast<int>(g);
+    SkNi offset = iy*nc->stride + ix;  // per-lane pixel index from nc->pixels
+
+    auto p = (const uint32_t*)nc->pixels;
+    uint8_t R[N], G[N], B[N], A[N];  // uninitialized: a tail stage fills only the first `tail`
+    for (size_t i = 0; i < (kIsTail ? tail : N); i++) {
+        uint32_t rgba = p[offset[i]];  // one scalar load per active lane
+        R[i] = rgba >>  0;
+        G[i] = rgba >>  8;
+        B[i] = rgba >> 16;
+        A[i] = rgba >> 24;
+    }
+
+    r = SkNx_cast<float>(SkNb::Load(R)) * (1/255.0f);  // bytes 0..255 -> floats 0..1
+    g = SkNx_cast<float>(SkNb::Load(G)) * (1/255.0f);
+    b = SkNx_cast<float>(SkNb::Load(B)) * (1/255.0f);
+    a = SkNx_cast<float>(SkNb::Load(A)) * (1/255.0f);
+}
+
+STAGE(nearest_srgb, true) {
+    auto nc = (const NearestCtx*)ctx;
+    // Same gather as nearest_8888; only the byte -> float conversion of r,g,b differs.
+    SkNi ix = SkNx_cast<int>(r),
+         iy = SkNx_cast<int>(g);
+    SkNi offset = iy*nc->stride + ix;  // per-lane pixel index from nc->pixels
+
+    auto p = (const uint32_t*)nc->pixels;
+    uint8_t R[N], G[N], B[N], A[N];  // uninitialized: a tail stage fills only the first `tail`
+    for (size_t i = 0; i < (kIsTail ? tail : N); i++) {
+        uint32_t rgba = p[offset[i]];  // one scalar load per active lane
+        R[i] = rgba >>  0;
+        G[i] = rgba >>  8;
+        B[i] = rgba >> 16;
+        A[i] = rgba >> 24;
+    }
+
+    r = sk_linear_from_srgb_math(SkNx_cast<int>(SkNb::Load(R)));
+    g = sk_linear_from_srgb_math(SkNx_cast<int>(SkNb::Load(G)));
+    b = sk_linear_from_srgb_math(SkNx_cast<int>(SkNb::Load(B)));
+    a = SkNx_cast<float>(SkNb::Load(A)) * (1/255.0f);  // alpha is only scaled, not sRGB-decoded
+}
+
 template <typename Fn>
 SI Fn enum_to_Fn(SkRasterPipeline::StockStage st) {
     switch (st) {