avcodec/x86/h264_intrapred: Add AVX2 horizontal pred versions

pred8x8_horizontal_8_c:        6.9
pred8x8_horizontal_8_sse2:     9.9 ( 0.70x)
pred8x8_horizontal_8_ssse3:    9.5 ( 0.73x)
pred8x8_horizontal_8_avx2:     5.1 ( 1.35x)
pred16x16_horizontal_8_c:     10.9
pred16x16_horizontal_8_sse2:  15.0 ( 0.72x)
pred16x16_horizontal_8_ssse3: 11.7 ( 0.93x)
pred16x16_horizontal_8_avx2:   9.6 ( 1.13x)

The new functions are cheap and only occupy 2*48B.

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
diff --git a/libavcodec/x86/h264_intrapred.asm b/libavcodec/x86/h264_intrapred.asm index ad4e267..d3bf6a6 100644 --- a/libavcodec/x86/h264_intrapred.asm +++ b/libavcodec/x86/h264_intrapred.asm
@@ -71,10 +71,14 @@ %macro PRED16x16_H 0 cglobal pred16x16_horizontal_8, 2,3 mov r2, 8 -%if cpuflag(ssse3) +%if cpuflag(ssse3) && notcpuflag(avx2) mova m2, [pb_3] %endif .loop: +%if cpuflag(avx2) + vpbroadcastb m0, [r0+r1*0-1] + vpbroadcastb m1, [r0+r1*1-1] +%else movd m0, [r0+r1*0-4] movd m1, [r0+r1*1-4] @@ -87,6 +91,7 @@ SPLATW m0, m0, 3 SPLATW m1, m1, 3 %endif +%endif mova [r0+r1*0], m0 mova [r0+r1*1], m1 @@ -100,6 +105,8 @@ PRED16x16_H INIT_XMM ssse3 PRED16x16_H +INIT_XMM avx2 +PRED16x16_H ;----------------------------------------------------------------------------- ; void ff_pred16x16_dc_8(uint8_t *src, ptrdiff_t stride) @@ -586,12 +593,17 @@ %macro PRED8x8_H 0 cglobal pred8x8_horizontal_8, 2,3,3 mov r2, 4 -%if cpuflag(ssse3) +%if cpuflag(ssse3) && notcpuflag(avx2) mova m2, [pb_3] %endif .loop: +%if cpuflag(avx2) + vpbroadcastb m0, [r0+r1*0-1] + vpbroadcastb m1, [r0+r1*1-1] +%else SPLATB_LOAD m0, r0+r1*0-1, m2 SPLATB_LOAD m1, r0+r1*1-1, m2 +%endif movq [r0+r1*0], m0 movq [r0+r1*1], m1 lea r0, [r0+r1*2] @@ -604,6 +616,8 @@ PRED8x8_H INIT_XMM ssse3 PRED8x8_H +INIT_XMM avx2 +PRED8x8_H ;----------------------------------------------------------------------------- ; void ff_pred8x8_top_dc_8_sse2(uint8_t *src, ptrdiff_t stride)
diff --git a/libavcodec/x86/h264_intrapred_init.c b/libavcodec/x86/h264_intrapred_init.c index 5b308f6..b5d8269 100644 --- a/libavcodec/x86/h264_intrapred_init.c +++ b/libavcodec/x86/h264_intrapred_init.c
@@ -102,6 +102,7 @@ PRED16x16(vertical, 8, sse) PRED16x16(horizontal, 8, sse2) PRED16x16(horizontal, 8, ssse3) +PRED16x16(horizontal, 8, avx2) PRED16x16(dc, 8, sse2) PRED16x16(dc, 8, ssse3) PRED16x16(plane_h264, 8, sse2) @@ -119,6 +120,7 @@ PRED8x8(vertical, 8, sse2) PRED8x8(horizontal, 8, sse2) PRED8x8(horizontal, 8, ssse3) +PRED8x8(horizontal, 8, avx2) PRED8x8(plane, 8, sse2) PRED8x8(plane, 8, ssse3) PRED8x8(tm_vp8, 8, sse2) @@ -256,6 +258,9 @@ } if(EXTERNAL_AVX2(cpu_flags)){ + h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_horizontal_8_avx2; + if (chroma_format_idc <= 1) + h->pred8x8 [HOR_PRED8x8 ] = ff_pred8x8_horizontal_8_avx2; if (codec_id == AV_CODEC_ID_VP8) { h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_tm_vp8_8_avx2; }