avcodec/x86/h264_intrapred: Add AVX2 horizontal pred versions

pred8x8_horizontal_8_c:                   6.9
pred8x8_horizontal_8_sse2:                9.9 ( 0.70x)
pred8x8_horizontal_8_ssse3:               9.5 ( 0.73x)
pred8x8_horizontal_8_avx2:                5.1 ( 1.35x)

pred16x16_horizontal_8_c:                10.9
pred16x16_horizontal_8_sse2:             15.0 ( 0.72x)
pred16x16_horizontal_8_ssse3:            11.7 ( 0.93x)
pred16x16_horizontal_8_avx2:              9.6 ( 1.13x)

The new functions are cheap in terms of code size: they occupy only 2*48 bytes in total.

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
diff --git a/libavcodec/x86/h264_intrapred.asm b/libavcodec/x86/h264_intrapred.asm
index ad4e267..d3bf6a6 100644
--- a/libavcodec/x86/h264_intrapred.asm
+++ b/libavcodec/x86/h264_intrapred.asm
@@ -71,10 +71,14 @@
 %macro PRED16x16_H 0
 cglobal pred16x16_horizontal_8, 2,3
     mov       r2, 8
-%if cpuflag(ssse3)
+%if cpuflag(ssse3) && notcpuflag(avx2)
     mova      m2, [pb_3]
 %endif
 .loop:
+%if cpuflag(avx2)
+    vpbroadcastb m0, [r0+r1*0-1]
+    vpbroadcastb m1, [r0+r1*1-1]
+%else
     movd      m0, [r0+r1*0-4]
     movd      m1, [r0+r1*1-4]
 
@@ -87,6 +91,7 @@
     SPLATW    m0, m0, 3
     SPLATW    m1, m1, 3
 %endif
+%endif
 
     mova [r0+r1*0], m0
     mova [r0+r1*1], m1
@@ -100,6 +105,8 @@
 PRED16x16_H
 INIT_XMM ssse3
 PRED16x16_H
+INIT_XMM avx2
+PRED16x16_H
 
 ;-----------------------------------------------------------------------------
 ; void ff_pred16x16_dc_8(uint8_t *src, ptrdiff_t stride)
@@ -586,12 +593,17 @@
 %macro PRED8x8_H 0
 cglobal pred8x8_horizontal_8, 2,3,3
     mov       r2, 4
-%if cpuflag(ssse3)
+%if cpuflag(ssse3) && notcpuflag(avx2)
     mova      m2, [pb_3]
 %endif
 .loop:
+%if cpuflag(avx2)
+    vpbroadcastb m0, [r0+r1*0-1]
+    vpbroadcastb m1, [r0+r1*1-1]
+%else
     SPLATB_LOAD m0, r0+r1*0-1, m2
     SPLATB_LOAD m1, r0+r1*1-1, m2
+%endif
     movq [r0+r1*0], m0
     movq [r0+r1*1], m1
     lea       r0, [r0+r1*2]
@@ -604,6 +616,8 @@
 PRED8x8_H
 INIT_XMM ssse3
 PRED8x8_H
+INIT_XMM avx2
+PRED8x8_H
 
 ;-----------------------------------------------------------------------------
 ; void ff_pred8x8_top_dc_8_sse2(uint8_t *src, ptrdiff_t stride)
diff --git a/libavcodec/x86/h264_intrapred_init.c b/libavcodec/x86/h264_intrapred_init.c
index 5b308f6..b5d8269 100644
--- a/libavcodec/x86/h264_intrapred_init.c
+++ b/libavcodec/x86/h264_intrapred_init.c
@@ -102,6 +102,7 @@
 PRED16x16(vertical, 8, sse)
 PRED16x16(horizontal, 8, sse2)
 PRED16x16(horizontal, 8, ssse3)
+PRED16x16(horizontal, 8, avx2)
 PRED16x16(dc, 8, sse2)
 PRED16x16(dc, 8, ssse3)
 PRED16x16(plane_h264, 8, sse2)
@@ -119,6 +120,7 @@
 PRED8x8(vertical, 8, sse2)
 PRED8x8(horizontal, 8, sse2)
 PRED8x8(horizontal, 8, ssse3)
+PRED8x8(horizontal, 8, avx2)
 PRED8x8(plane, 8, sse2)
 PRED8x8(plane, 8, ssse3)
 PRED8x8(tm_vp8, 8, sse2)
@@ -256,6 +258,9 @@
         }
 
         if(EXTERNAL_AVX2(cpu_flags)){
+            h->pred16x16[HOR_PRED8x8          ] = ff_pred16x16_horizontal_8_avx2;
+            if (chroma_format_idc <= 1)
+                h->pred8x8  [HOR_PRED8x8      ] = ff_pred8x8_horizontal_8_avx2;
             if (codec_id == AV_CODEC_ID_VP8) {
                 h->pred16x16[PLANE_PRED8x8    ] = ff_pred16x16_tm_vp8_8_avx2;
             }