highbd x86: consolidate tran_low_t conversions

Create new helper files specifically for converting tran_low_t types.

Change-Id: I7c4c458ef910f3b3d10a3cfbf9df4de7682fd905
diff --git a/libs.mk b/libs.mk
index 36935bd..f1e9242 100644
--- a/libs.mk
+++ b/libs.mk
@@ -149,6 +149,7 @@
 INSTALL-SRCS-no += $(BUILD_PFX)vpx_config.c
 ifeq ($(ARCH_X86)$(ARCH_X86_64),yes)
 INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += third_party/x86inc/x86inc.asm
+INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += vpx_dsp/x86/bitdepth_conversion_sse2.asm
 endif
 CODEC_EXPORTS-yes += vpx/exports_com
 CODEC_EXPORTS-$(CONFIG_ENCODERS) += vpx/exports_enc
@@ -204,6 +205,7 @@
     third_party/x86inc/x86inc.asm \
     vpx_config.asm \
     vpx_ports/x86_abi_support.asm \
+    vpx_dsp/x86/bitdepth_conversion_sse2.asm \
 
 vpx.$(VCPROJ_SFX): $(CODEC_SRCS) vpx.def
 	@echo "    [CREATE] $@"
diff --git a/vp9/encoder/x86/vp9_dct_sse2.asm b/vp9/encoder/x86/vp9_dct_sse2.asm
index ced37bd..e24cabb 100644
--- a/vp9/encoder/x86/vp9_dct_sse2.asm
+++ b/vp9/encoder/x86/vp9_dct_sse2.asm
@@ -11,6 +11,7 @@
 %define private_prefix vp9
 
 %include "third_party/x86inc/x86inc.asm"
+%include "vpx_dsp/x86/bitdepth_conversion_sse2.asm"
 
 SECTION .text
 
@@ -62,25 +63,7 @@
   psllw           m0,        2
   psllw           m1,        2
 
-%if CONFIG_VP9_HIGHBITDEPTH
-  ; sign extension
-  mova            m2,             m0
-  mova            m3,             m1
-  punpcklwd       m0,             m0
-  punpcklwd       m1,             m1
-  punpckhwd       m2,             m2
-  punpckhwd       m3,             m3
-  psrad           m0,             16
-  psrad           m1,             16
-  psrad           m2,             16
-  psrad           m3,             16
-  mova            [outputq],      m0
-  mova            [outputq + 16], m2
-  mova            [outputq + 32], m1
-  mova            [outputq + 48], m3
-%else
-  mova            [outputq],      m0
-  mova            [outputq + 16], m1
-%endif
+  STORE_TRAN_LOW 0, outputq, 0, 2, 3
+  STORE_TRAN_LOW 1, outputq, 1, 2, 3
 
   RET
diff --git a/vp9/encoder/x86/vp9_dct_ssse3.c b/vp9/encoder/x86/vp9_dct_ssse3.c
index b3c3d7b..9140085 100644
--- a/vp9/encoder/x86/vp9_dct_ssse3.c
+++ b/vp9/encoder/x86/vp9_dct_ssse3.c
@@ -14,7 +14,7 @@
 #include "./vp9_rtcd.h"
 #include "./vpx_config.h"
 #include "vpx_dsp/vpx_dsp_common.h"
-#include "vpx_dsp/x86/fdct.h"
+#include "vpx_dsp/x86/bitdepth_conversion_sse2.h"
 #include "vpx_dsp/x86/inv_txfm_sse2.h"
 #include "vpx_dsp/x86/txfm_common_sse2.h"
 
diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk
index e11eda4..a0f7d75 100644
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -13,6 +13,12 @@
 
 DSP_SRCS-$(HAVE_MSA)    += mips/macros_msa.h
 
+DSP_SRCS-$(HAVE_SSE2)   += x86/bitdepth_conversion_sse2.h
+# This file is included in libs.mk. Including it here would cause it to be
+# compiled into an object. Even as an empty file, this would create an
+# executable section on the stack.
+#DSP_SRCS-$(HAVE_SSE2)   += x86/bitdepth_conversion_sse2$(ASM)
+
 # bit reader
 DSP_SRCS-yes += prob.h
 DSP_SRCS-yes += prob.c
@@ -245,7 +251,6 @@
 DSP_SRCS-yes            += quantize.c
 DSP_SRCS-yes            += quantize.h
 
-DSP_SRCS-$(HAVE_SSE2)   += x86/fdct.h
 DSP_SRCS-$(HAVE_SSE2)   += x86/quantize_sse2.c
 ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
 DSP_SRCS-$(HAVE_SSE2)   += x86/highbd_quantize_intrin_sse2.c
diff --git a/vpx_dsp/x86/avg_intrin_sse2.c b/vpx_dsp/x86/avg_intrin_sse2.c
index 93063e6..4e89e07 100644
--- a/vpx_dsp/x86/avg_intrin_sse2.c
+++ b/vpx_dsp/x86/avg_intrin_sse2.c
@@ -12,7 +12,7 @@
 
 #include "./vpx_dsp_rtcd.h"
 #include "vpx/vpx_integer.h"
-#include "vpx_dsp/x86/fdct.h"
+#include "vpx_dsp/x86/bitdepth_conversion_sse2.h"
 #include "vpx_ports/mem.h"
 
 void vpx_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp,
diff --git a/vpx_dsp/x86/avg_ssse3_x86_64.asm b/vpx_dsp/x86/avg_ssse3_x86_64.asm
index d170a44..4b486c1 100644
--- a/vpx_dsp/x86/avg_ssse3_x86_64.asm
+++ b/vpx_dsp/x86/avg_ssse3_x86_64.asm
@@ -9,6 +9,7 @@
 ;
 
 %include "third_party/x86inc/x86inc.asm"
+%include "vpx_dsp/x86/bitdepth_conversion_sse2.asm"
 
 SECTION .text
 
@@ -94,20 +95,6 @@
   SWAP               7, 9
 %endmacro
 
-%if CONFIG_VP9_HIGHBITDEPTH
-; store %1 to outputq + %2
-; uses m8-m10 as scratch registers
-%macro STORE_TRAN_LOW 2
-  pxor                           m8, m8
-  mova                           m9, m%1
-  mova                          m10, m%1
-  pcmpgtw                        m8, m%1
-  punpcklwd                      m9, m8
-  punpckhwd                     m10, m8
-  mova               [outputq + %2], m9
-  mova          [outputq + %2 + 16], m10
-%endmacro
-%endif
 
 INIT_XMM ssse3
 cglobal hadamard_8x8, 3, 5, 11, input, stride, output
@@ -130,25 +117,14 @@
   TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9, 10
   HMD8_1D
 
-%if CONFIG_VP9_HIGHBITDEPTH
-  STORE_TRAN_LOW                  0, 0
-  STORE_TRAN_LOW                  1, 32
-  STORE_TRAN_LOW                  2, 64
-  STORE_TRAN_LOW                  3, 96
-  STORE_TRAN_LOW                  4, 128
-  STORE_TRAN_LOW                  5, 160
-  STORE_TRAN_LOW                  6, 192
-  STORE_TRAN_LOW                  7, 224
-%else
-  mova              [outputq +   0], m0
-  mova              [outputq +  16], m1
-  mova              [outputq +  32], m2
-  mova              [outputq +  48], m3
-  mova              [outputq +  64], m4
-  mova              [outputq +  80], m5
-  mova              [outputq +  96], m6
-  mova              [outputq + 112], m7
-%endif
+  STORE_TRAN_LOW 0, outputq, 0, 8, 9
+  STORE_TRAN_LOW 1, outputq, 1, 8, 9
+  STORE_TRAN_LOW 2, outputq, 2, 8, 9
+  STORE_TRAN_LOW 3, outputq, 3, 8, 9
+  STORE_TRAN_LOW 4, outputq, 4, 8, 9
+  STORE_TRAN_LOW 5, outputq, 5, 8, 9
+  STORE_TRAN_LOW 6, outputq, 6, 8, 9
+  STORE_TRAN_LOW 7, outputq, 7, 8, 9
 
   RET
 %endif
diff --git a/vpx_dsp/x86/bitdepth_conversion_sse2.asm b/vpx_dsp/x86/bitdepth_conversion_sse2.asm
new file mode 100644
index 0000000..b2df520
--- /dev/null
+++ b/vpx_dsp/x86/bitdepth_conversion_sse2.asm
@@ -0,0 +1,66 @@
+;
+;  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+; TODO(johannkoenig): Add the necessary include guards to vpx_config.asm.
+; vpx_config.asm is not guarded so can not be included twice. Because this will
+; be used in conjunction with x86_abi_support.asm or x86inc.asm, it must be
+; included after those files.
+
+; Increment register by sizeof() tran_low_t * 8.
+%macro INCREMENT_TRAN_LOW 1
+%if CONFIG_VP9_HIGHBITDEPTH
+  add %1, 32
+%else
+  add %1, 16
+%endif
+%endmacro
+
+; Increment %1 by sizeof() tran_low_t * %2.
+%macro INCREMENT_ELEMENTS_TRAN_LOW 2
+%if CONFIG_VP9_HIGHBITDEPTH
+  lea %1, [%1 + %2 * 4]
+%else
+  lea %1, [%1 + %2 * 2]
+%endif
+%endmacro
+
+; Load %2 + %3 into m%1.
+; %3 is the offset in elements, not bits.
+; If tran_low_t is 16 bits (low bit depth configuration) then load the value
+; directly. If tran_low_t is 32 bits (high bit depth configuration) then pack
+; the values down to 16 bits.
+%macro LOAD_TRAN_LOW 3
+%if CONFIG_VP9_HIGHBITDEPTH
+  mova     m%1, [%2 + %3 * 32]
+  packssdw m%1, [%2 + %3 * 32 + 16]
+%else
+  mova     m%1, [%2 + %3 * 16]
+%endif
+%endmacro
+
+; Store m%1 to %2 + %3.
+; %3 is the offset in elements, not bits.
+; If tran_low_t is 16 bits (low bit depth configuration) then store the value
+; directly. If tran_low_t is 32 bits (high bit depth configuration) then sign
+; extend the values first.
+; Uses m%4-m%6 as scratch registers for high bit depth.
+%macro STORE_TRAN_LOW 5
+%if CONFIG_VP9_HIGHBITDEPTH
+  pxor                      m%4, m%4
+  mova                      m%5, m%1
+  pcmpgtw                   m%4, m%1
+  punpcklwd                 m%5, m%4
+  punpckhwd                 m%1, m%4
+  mova      [%2 + %3 * 32 +  0], m%5
+  mova      [%2 + %3 * 32 + 16], m%1
+%else
+  mova           [%2 + %3 * 16], m%1
+%endif
+%endmacro
diff --git a/vpx_dsp/x86/fdct.h b/vpx_dsp/x86/bitdepth_conversion_sse2.h
similarity index 96%
rename from vpx_dsp/x86/fdct.h
rename to vpx_dsp/x86/bitdepth_conversion_sse2.h
index 54a6d81..bff62a4 100644
--- a/vpx_dsp/x86/fdct.h
+++ b/vpx_dsp/x86/bitdepth_conversion_sse2.h
@@ -1,5 +1,5 @@
 /*
- *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
  *
  *  Use of this source code is governed by a BSD-style license
  *  that can be found in the LICENSE file in the root of the source
diff --git a/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm b/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm
index 6f8cfe5..0c3ed67 100644
--- a/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm
+++ b/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm
@@ -9,6 +9,7 @@
 ;
 
 %include "third_party/x86inc/x86inc.asm"
+%include "vpx_dsp/x86/bitdepth_conversion_sse2.asm"
 
 SECTION_RODATA
 
@@ -230,21 +231,10 @@
 
   lea        r3, [2 * strideq]
 
-%if CONFIG_VP9_HIGHBITDEPTH
-  mova       m0, [inputq +   0]
-  packssdw   m0, [inputq +  16]
-  mova       m1, [inputq +  32]
-  packssdw   m1, [inputq +  48]
-  mova       m2, [inputq +  64]
-  packssdw   m2, [inputq +  80]
-  mova       m3, [inputq +  96]
-  packssdw   m3, [inputq + 112]
-%else
-  mova       m0, [inputq +  0]
-  mova       m1, [inputq + 16]
-  mova       m2, [inputq + 32]
-  mova       m3, [inputq + 48]
-%endif
+  LOAD_TRAN_LOW 0, inputq, 0
+  LOAD_TRAN_LOW 1, inputq, 1
+  LOAD_TRAN_LOW 2, inputq, 2
+  LOAD_TRAN_LOW 3, inputq, 3
 
   punpcklwd  m0, m1
   punpcklwd  m2, m3
@@ -752,33 +742,14 @@
   lea             r4, [rsp + transposed_in]
 
 idct32x32_34_transpose:
-%if CONFIG_VP9_HIGHBITDEPTH
-  mova            m0, [r3 +       0]
-  packssdw        m0, [r3 +      16]
-  mova            m1, [r3 + 32 *  4]
-  packssdw        m1, [r3 + 32 *  4 + 16]
-  mova            m2, [r3 + 32 *  8]
-  packssdw        m2, [r3 + 32 *  8 + 16]
-  mova            m3, [r3 + 32 * 12]
-  packssdw        m3, [r3 + 32 * 12 + 16]
-  mova            m4, [r3 + 32 * 16]
-  packssdw        m4, [r3 + 32 * 16 + 16]
-  mova            m5, [r3 + 32 * 20]
-  packssdw        m5, [r3 + 32 * 20 + 16]
-  mova            m6, [r3 + 32 * 24]
-  packssdw        m6, [r3 + 32 * 24 + 16]
-  mova            m7, [r3 + 32 * 28]
-  packssdw        m7, [r3 + 32 * 28 + 16]
-%else
-  mova            m0, [r3 +       0]
-  mova            m1, [r3 + 16 *  4]
-  mova            m2, [r3 + 16 *  8]
-  mova            m3, [r3 + 16 * 12]
-  mova            m4, [r3 + 16 * 16]
-  mova            m5, [r3 + 16 * 20]
-  mova            m6, [r3 + 16 * 24]
-  mova            m7, [r3 + 16 * 28]
-%endif
+  LOAD_TRAN_LOW 0, r3,  0
+  LOAD_TRAN_LOW 1, r3,  4
+  LOAD_TRAN_LOW 2, r3,  8
+  LOAD_TRAN_LOW 3, r3, 12
+  LOAD_TRAN_LOW 4, r3, 16
+  LOAD_TRAN_LOW 5, r3, 20
+  LOAD_TRAN_LOW 6, r3, 24
+  LOAD_TRAN_LOW 7, r3, 28
 
   TRANSPOSE8X8  0, 1, 2, 3, 4, 5, 6, 7, 9, 10
 
@@ -1182,33 +1153,15 @@
   mov             r7, 2
 
 idct32x32_135_transpose:
-%if CONFIG_VP9_HIGHBITDEPTH
-  mova            m0, [r3 +       0]
-  packssdw        m0, [r3 +      16]
-  mova            m1, [r3 + 32 *  4]
-  packssdw        m1, [r3 + 32 *  4 + 16]
-  mova            m2, [r3 + 32 *  8]
-  packssdw        m2, [r3 + 32 *  8 + 16]
-  mova            m3, [r3 + 32 * 12]
-  packssdw        m3, [r3 + 32 * 12 + 16]
-  mova            m4, [r3 + 32 * 16]
-  packssdw        m4, [r3 + 32 * 16 + 16]
-  mova            m5, [r3 + 32 * 20]
-  packssdw        m5, [r3 + 32 * 20 + 16]
-  mova            m6, [r3 + 32 * 24]
-  packssdw        m6, [r3 + 32 * 24 + 16]
-  mova            m7, [r3 + 32 * 28]
-  packssdw        m7, [r3 + 32 * 28 + 16]
-%else
-  mova            m0, [r3 +       0]
-  mova            m1, [r3 + 16 *  4]
-  mova            m2, [r3 + 16 *  8]
-  mova            m3, [r3 + 16 * 12]
-  mova            m4, [r3 + 16 * 16]
-  mova            m5, [r3 + 16 * 20]
-  mova            m6, [r3 + 16 * 24]
-  mova            m7, [r3 + 16 * 28]
-%endif
+  LOAD_TRAN_LOW 0, r3,  0
+  LOAD_TRAN_LOW 1, r3,  4
+  LOAD_TRAN_LOW 2, r3,  8
+  LOAD_TRAN_LOW 3, r3, 12
+  LOAD_TRAN_LOW 4, r3, 16
+  LOAD_TRAN_LOW 5, r3, 20
+  LOAD_TRAN_LOW 6, r3, 24
+  LOAD_TRAN_LOW 7, r3, 28
+
   TRANSPOSE8X8  0, 1, 2, 3, 4, 5, 6, 7, 9, 10
 
   mova [r4 +      0], m0
@@ -1220,22 +1173,14 @@
   mova [r4 + 16 * 6], m6
   mova [r4 + 16 * 7], m7
 
-%if CONFIG_VP9_HIGHBITDEPTH
-  add             r3, 32
-%else
-  add             r3, 16
-%endif
+  INCREMENT_TRAN_LOW r3
   add             r4, 16 * 8
   dec             r7
   jne idct32x32_135_transpose
 
   IDCT32X32_135 16*0, 16*32, 16*64, 16*96
   lea            stp, [stp + 16 * 8]
-%if CONFIG_VP9_HIGHBITDEPTH
-  lea         inputq, [inputq + 32 * 32]
-%else
-  lea         inputq, [inputq + 16 * 32]
-%endif
+  INCREMENT_ELEMENTS_TRAN_LOW inputq, 8*32
   dec             r6
   jnz idct32x32_135
 
@@ -1646,33 +1591,14 @@
   mov             r7, 4
 
 idct32x32_1024_transpose:
-%if CONFIG_VP9_HIGHBITDEPTH
-  mova            m0, [r3 +       0]
-  packssdw        m0, [r3 +      16]
-  mova            m1, [r3 + 32 *  4]
-  packssdw        m1, [r3 + 32 *  4 + 16]
-  mova            m2, [r3 + 32 *  8]
-  packssdw        m2, [r3 + 32 *  8 + 16]
-  mova            m3, [r3 + 32 * 12]
-  packssdw        m3, [r3 + 32 * 12 + 16]
-  mova            m4, [r3 + 32 * 16]
-  packssdw        m4, [r3 + 32 * 16 + 16]
-  mova            m5, [r3 + 32 * 20]
-  packssdw        m5, [r3 + 32 * 20 + 16]
-  mova            m6, [r3 + 32 * 24]
-  packssdw        m6, [r3 + 32 * 24 + 16]
-  mova            m7, [r3 + 32 * 28]
-  packssdw        m7, [r3 + 32 * 28 + 16]
-%else
-  mova            m0, [r3 +       0]
-  mova            m1, [r3 + 16 *  4]
-  mova            m2, [r3 + 16 *  8]
-  mova            m3, [r3 + 16 * 12]
-  mova            m4, [r3 + 16 * 16]
-  mova            m5, [r3 + 16 * 20]
-  mova            m6, [r3 + 16 * 24]
-  mova            m7, [r3 + 16 * 28]
-%endif
+  LOAD_TRAN_LOW 0, r3,  0
+  LOAD_TRAN_LOW 1, r3,  4
+  LOAD_TRAN_LOW 2, r3,  8
+  LOAD_TRAN_LOW 3, r3, 12
+  LOAD_TRAN_LOW 4, r3, 16
+  LOAD_TRAN_LOW 5, r3, 20
+  LOAD_TRAN_LOW 6, r3, 24
+  LOAD_TRAN_LOW 7, r3, 28
 
   TRANSPOSE8X8  0, 1, 2, 3, 4, 5, 6, 7, 9, 10
 
@@ -1684,11 +1610,7 @@
   mova [r4 + 16 * 5], m5
   mova [r4 + 16 * 6], m6
   mova [r4 + 16 * 7], m7
-%if CONFIG_VP9_HIGHBITDEPTH
-  add             r3, 32
-%else
-  add             r3, 16
-%endif
+  INCREMENT_TRAN_LOW r3
   add             r4, 16 * 8
   dec             r7
   jne idct32x32_1024_transpose
@@ -1696,11 +1618,7 @@
   IDCT32X32_1024 16*0, 16*32, 16*64, 16*96
 
   lea            stp, [stp + 16 * 8]
-%if CONFIG_VP9_HIGHBITDEPTH
-  lea         inputq, [inputq + 32 * 32]
-%else
-  lea         inputq, [inputq + 16 * 32]
-%endif
+  INCREMENT_ELEMENTS_TRAN_LOW inputq, 8*32
   dec             r6
   jnz idct32x32_1024
 
diff --git a/vpx_dsp/x86/inv_wht_sse2.asm b/vpx_dsp/x86/inv_wht_sse2.asm
index fbbcd76..a9c52dc 100644
--- a/vpx_dsp/x86/inv_wht_sse2.asm
+++ b/vpx_dsp/x86/inv_wht_sse2.asm
@@ -9,6 +9,7 @@
 ;
 
 %include "third_party/x86inc/x86inc.asm"
+%include "vpx_dsp/x86/bitdepth_conversion_sse2.asm"
 
 SECTION .text
 
@@ -82,15 +83,8 @@
 
 INIT_XMM sse2
 cglobal iwht4x4_16_add, 3, 3, 7, input, output, stride
-%if CONFIG_VP9_HIGHBITDEPTH
-  mova            m0,        [inputq +  0]
-  packssdw        m0,        [inputq + 16]
-  mova            m1,        [inputq + 32]
-  packssdw        m1,        [inputq + 48]
-%else
-  mova            m0,        [inputq +  0]
-  mova            m1,        [inputq + 16]
-%endif
+  LOAD_TRAN_LOW    0, inputq, 0
+  LOAD_TRAN_LOW    1, inputq, 1
   psraw           m0,        2
   psraw           m1,        2
 
diff --git a/vpx_dsp/x86/quantize_sse2.c b/vpx_dsp/x86/quantize_sse2.c
index 0580a7b..32721be 100644
--- a/vpx_dsp/x86/quantize_sse2.c
+++ b/vpx_dsp/x86/quantize_sse2.c
@@ -13,7 +13,7 @@
 
 #include "./vpx_dsp_rtcd.h"
 #include "vpx/vpx_integer.h"
-#include "vpx_dsp/x86/fdct.h"
+#include "vpx_dsp/x86/bitdepth_conversion_sse2.h"
 
 void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                          int skip_block, const int16_t *zbin_ptr,