ARGBToUV for SSE: use pshufb/pmaddubsw

Was ARGBToJ420_Opt (377 ms)
Now ARGBToJ420_Opt (340 ms)
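
Replaces the previous shufps/punpcklbw deinterleave with a single pshufb
per register, pairing each channel's two bytes from adjacent pixels so one
pmaddubsw against 0x0101 performs the horizontal add. A minimal standalone
sketch of that pairing step in intrinsics (the helper name is illustrative,
not part of this change):

  #include <stdint.h>
  #include <tmmintrin.h>  // SSSE3

  // Same byte pattern as kShuffleAARRGGBB (one 16-byte lane).
  static const int8_t kShuf[16] = {0, 4,  1, 5,  2,  6,  3,  7,
                                   8, 12, 9, 13, 10, 14, 11, 15};

  // Illustrative helper, not in libyuv. In: 4 ARGB pixels (16 bytes).
  // Out: 8 words holding per-channel sums of adjacent pixel pairs:
  // B0+B1, G0+G1, R0+R1, A0+A1, B2+B3, ...
  static inline __m128i PairChannels(__m128i argb4) {
    const __m128i shuf = _mm_loadu_si128((const __m128i*)kShuf);
    const __m128i ones = _mm_set1_epi8(1);  // the 0x0101 madd constant
    __m128i paired = _mm_shuffle_epi8(argb4, shuf);  // pshufb
    return _mm_maddubs_epi16(paired, ones);          // pmaddubsw
  }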

Bug: None
Change-Id: Iada2d6e9ecdb141b9e2acbdf343f890e4aaebe34
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/6967754
Reviewed-by: Justin Green <greenjustin@google.com>
diff --git a/source/row_gcc.cc b/source/row_gcc.cc
index c282169..c8c91e0 100644
--- a/source/row_gcc.cc
+++ b/source/row_gcc.cc
@@ -1733,6 +1733,13 @@
 #endif  // HAS_ARGBTOUV444ROW_AVX2
 
 #ifdef HAS_ARGBTOUVROW_SSSE3
+
+// ARGBARGB to AARRGGBB shuffle
+static const lvec8 kShuffleAARRGGBB = {
+    0, 4, 1, 5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15,
+    0, 4, 1, 5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15,
+};
+
 // 8x2 -> 4x1 ARGB pixels converted to 4 U and 4 V
 // ARGBToUV does rounding average of 4 ARGB pixels
 void ARGBToUVMatrixRow_SSSE3(const uint8_t* src_argb,
@@ -1742,69 +1749,52 @@
                              int width,
                              const struct RgbUVConstants* rgbuvconstants) {
   asm volatile(
-      "pcmpeqb     %%xmm4,%%xmm4                 \n"  // 0x0101
-      "pabsb       %%xmm4,%%xmm4                 \n"
-      "movdqa      %0,%%xmm6                     \n"  // ARGB to U
-      "movdqa      %1,%%xmm7                     \n"  // ARGB to V
-      :
-      : "m"(rgbuvconstants->kRGBToU),  // %0
-        "m"(rgbuvconstants->kRGBToV)   // %1
-      : "memory", "cc");
+      "movdqa     %5,%%xmm4                     \n"  // RGBToU
+      "movdqa     %6,%%xmm5                     \n"  // RGBToV
+      "pcmpeqb    %%xmm6,%%xmm6                 \n"  // 0x0101
+      "pabsb      %%xmm6,%%xmm6                 \n"
+      "movdqa     %7,%%xmm7                     \n"  // kShuffleAARRGGBB
+      "sub         %1,%2                        \n"
 
-  asm volatile(
-
-      "sub         %1,%2                         \n"
-
-      LABELALIGN
       "1:          \n"
-      "movdqu      (%0),%%xmm0                   \n"  // Read 8 ARGB Pixels
-      "movdqu      0x10(%0),%%xmm5               \n"
-      "movdqa      %%xmm0,%%xmm1                 \n"
-      "shufps      $0x88,%%xmm5,%%xmm0           \n"  // Even pixels
-      "shufps      $0xdd,%%xmm5,%%xmm1           \n"  // Odd pixels
-      "movdqa      %%xmm0,%%xmm5                 \n"
-      "punpcklbw   %%xmm1,%%xmm0                 \n"  // aarrgbb
-      "punpckhbw   %%xmm5,%%xmm1                 \n"
-      "pmaddubsw   %%xmm4,%%xmm0                 \n"  // paired add argb
-      "pmaddubsw   %%xmm4,%%xmm1                 \n"
+      "movdqu     (%0),%%xmm0                   \n"  // Read 8x2 ARGB Pixels
+      "movdqu     0x10(%0),%%xmm1               \n"
+      "movdqu     0x00(%0,%4,1),%%xmm2          \n"
+      "movdqu     0x10(%0,%4,1),%%xmm3          \n"
+      "pshufb     %%xmm7,%%xmm0                 \n"  // aarrggbb
+      "pshufb     %%xmm7,%%xmm1                 \n"
+      "pshufb     %%xmm7,%%xmm2                 \n"
+      "pshufb     %%xmm7,%%xmm3                 \n"
+      "pmaddubsw  %%xmm6,%%xmm0                 \n"  // 8x2 -> 4x2
+      "pmaddubsw  %%xmm6,%%xmm1                 \n"
+      "pmaddubsw  %%xmm6,%%xmm2                 \n"
+      "pmaddubsw  %%xmm6,%%xmm3                 \n"
+      "paddw      %%xmm2,%%xmm0                 \n"  // 4x2 -> 4x1
+      "paddw      %%xmm3,%%xmm1                 \n"
+      "pxor       %%xmm2,%%xmm2                 \n"  // 0 for vpavgw
+      "psrlw      $1,%%xmm0                     \n"
+      "psrlw      $1,%%xmm1                     \n"
+      "pavgw      %%xmm2,%%xmm0                 \n"
+      "pavgw      %%xmm2,%%xmm1                 \n"
+      "packuswb   %%xmm1,%%xmm0                 \n"  // mutates
 
-      "movdqu      0x00(%0,%4,1),%%xmm2          \n"  // Read 2nd row
-      "movdqu      0x10(%0,%4,1),%%xmm5          \n"
-      "movdqa      %%xmm2,%%xmm3                 \n"
-      "shufps      $0x88,%%xmm5,%%xmm2           \n"  // Even
-      "shufps      $0xdd,%%xmm5,%%xmm3           \n"  // Odd pixels
-      "movdqa      %%xmm2,%%xmm5                 \n"
-      "punpcklbw   %%xmm3,%%xmm2                 \n"  // aarrgbb
-      "punpckhbw   %%xmm5,%%xmm3                 \n"
-      "pmaddubsw   %%xmm4,%%xmm2                 \n"  // argb
-      "pmaddubsw   %%xmm4,%%xmm3                 \n"
+      "movdqa     %%xmm6,%%xmm2                 \n"
+      "psllw      $15,%%xmm2                    \n"  // 0x8000
+      "movdqa     %%xmm0,%%xmm1                 \n"
+      "pmaddubsw  %%xmm5,%%xmm1                 \n"  // 4 V
+      "pmaddubsw  %%xmm4,%%xmm0                 \n"  // 4 U
+      "phaddw     %%xmm1,%%xmm0                 \n"  // uuuuvvvv
+      "psubw      %%xmm0,%%xmm2                 \n"
+      "psrlw      $0x8,%%xmm2                   \n"
+      "packuswb   %%xmm2,%%xmm2                 \n"
+      "movd       %%xmm2,(%1)                   \n"  // Write 4 U's
+      "pshufd     $0x55,%%xmm2,%%xmm2           \n"  // Copy V to low 4 bytes
+      "movd       %%xmm2,0x00(%1,%2,1)          \n"  // Write 4 V's
 
-      "pxor        %%xmm5,%%xmm5                 \n"  // constant 0 for pavgw
-      "paddw       %%xmm2,%%xmm0                 \n"
-      "paddw       %%xmm3,%%xmm1                 \n"
-      "psrlw       $1,%%xmm0                     \n"  // round
-      "psrlw       $1,%%xmm1                     \n"
-      "pavgw       %%xmm5,%%xmm0                 \n"
-      "pavgw       %%xmm5,%%xmm1                 \n"
-      "packuswb    %%xmm1,%%xmm0                 \n"  // 4 ARGB pixels
-
-      "movdqa      %%xmm0,%%xmm1                 \n"
-      "pmaddubsw   %%xmm6,%%xmm0                 \n"  // u
-      "pmaddubsw   %%xmm7,%%xmm1                 \n"  // v
-      "phaddw      %%xmm1,%%xmm0                 \n"  // uuuuvvvv
-
-      "movdqa      %5,%%xmm2                     \n"  // 0x8000
-      "psubw       %%xmm0,%%xmm2                 \n"  // unsigned 0 to 0xffff
-      "psrlw       $0x8,%%xmm2                   \n"
-      "packuswb    %%xmm2,%%xmm2                 \n"
-      "movd        %%xmm2,(%1)                   \n"  // Write 4 U's
-      "shufps      $0xdd,%%xmm2,%%xmm2           \n"
-      "movd        %%xmm2,0x00(%1,%2,1)          \n"  // Write 4 V's
-
-      "lea         0x20(%0),%0                   \n"
-      "lea         0x4(%1),%1                    \n"
-      "subl        $0x8,%3                       \n"
-      "jg          1b                            \n"
+      "lea         0x20(%0),%0                  \n"
+      "lea         0x4(%1),%1                   \n"
+      "subl        $0x8,%3                      \n"
+      "jg          1b                           \n"
       : "+r"(src_argb),  // %0
         "+r"(dst_u),     // %1
         "+r"(dst_v),     // %2
@@ -1814,21 +1804,17 @@
         "+rm"(width)  // %3
 #endif
       : "r"((intptr_t)(src_stride_argb)),  // %4
-        "m"(kAddUV128)                     // %5
-
-      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
-        "xmm7");
+        "m"(rgbuvconstants->kRGBToU),      // %5
+        "m"(rgbuvconstants->kRGBToV),      // %6
+        "m"(kShuffleAARRGGBB)              // %7
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
+                        "xmm6", "xmm7");
 }
+
 #endif  // HAS_ARGBTOUVROW_SSSE3
 
 #ifdef HAS_ARGBTOUVROW_AVX2
 
-// ARGBARGB to AARRGGBB shuffle
-static const lvec8 kShuffleAARRGGBB = {
-    0, 4, 1, 5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15,
-    0, 4, 1, 5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15,
-};
-
 // 16x2 -> 8x1 ARGB pixels converted to 8 U and 8 V
 // ARGBToUV does rounding average of 4 ARGB pixels
 void ARGBToUVMatrixRow_AVX2(const uint8_t* src_argb,
@@ -1888,7 +1874,11 @@
       : "+r"(src_argb),  // %0
         "+r"(dst_u),     // %1
         "+r"(dst_v),     // %2
-        "+r"(width)      // %3
+#if defined(__i386__)
+        "+m"(width)  // %3
+#else
+        "+rm"(width)  // %3
+#endif
       : "r"((intptr_t)(src_stride_argb)),  // %4
         "m"(rgbuvconstants->kRGBToU),      // %5
         "m"(rgbuvconstants->kRGBToV),      // %6