| /* functable.c -- Choose relevant optimized functions at runtime |
| * Copyright (C) 2017 Hans Kristian Rosbach |
| * For conditions of distribution and use, see copyright notice in zlib.h |
| */ |
| #ifndef DISABLE_RUNTIME_CPU_DETECTION |
| |
| #include "zbuild.h" |
| |
| #if defined(_MSC_VER) |
| # include <intrin.h> |
| #endif |
| |
| #include "functable.h" |
| #include "cpu_features.h" |
| #include "arch_functions.h" |
| |
/* Publishing primitives for the global function table.
 *
 * FUNCTABLE_ASSIGN(VAR, FUNC_NAME) copies the pointer VAR.FUNC_NAME into
 * functable.FUNC_NAME, using a pointer-sized atomic store where the compiler
 * provides one. FUNCTABLE_BARRIER() is the fence issued once all members have
 * been copied.
 */
#if defined(__GNUC__) || defined(__clang__)
/* GCC/Clang: generic atomic builtins, sequentially consistent ordering. */
#  define FUNCTABLE_BARRIER() __atomic_thread_fence(__ATOMIC_SEQ_CST)
#  define FUNCTABLE_ASSIGN(VAR, FUNC_NAME) \
    __atomic_store(&(functable.FUNC_NAME), &(VAR.FUNC_NAME), __ATOMIC_SEQ_CST)
#elif defined(_MSC_VER)
/* MSVC: interlocked pointer exchange; the previous value is discarded. */
#  define FUNCTABLE_ASSIGN(VAR, FUNC_NAME) \
    _InterlockedExchangePointer((void * volatile *)&(functable.FUNC_NAME), (void *)(VAR.FUNC_NAME))
#  ifdef ARCH_ARM
/* ARM additionally needs a hardware data memory barrier. */
#    define FUNCTABLE_BARRIER() do { \
        _ReadWriteBarrier(); \
        __dmb(0xB); /* _ARM_BARRIER_ISH */ \
        _ReadWriteBarrier(); \
    } while (0)
#  else
#    define FUNCTABLE_BARRIER() _ReadWriteBarrier()
#  endif
#else
/* Unknown compiler: fall back to a plain volatile store and no fence. */
#  warning Unable to detect atomic intrinsic support.
#  define FUNCTABLE_BARRIER() do { /* Empty */ } while (0)
#  define FUNCTABLE_ASSIGN(VAR, FUNC_NAME) \
    *((void * volatile *)&(functable.FUNC_NAME)) = (void *)(VAR.FUNC_NAME)
#endif
| |
/* Verify that VAR.FUNC_NAME is non-NULL, then publish it into functable.
 * If the pointer is NULL, a message is written to stderr and the enclosing
 * function returns 1, so the macro may only be used inside a function that
 * returns int.
 * This allows inflateinit/deflateinit functions to gracefully return Z_VERSION_ERROR
 * if functable initialization fails.
 *
 * The body is wrapped in do { } while (0) so the macro expands to exactly one
 * statement and stays correct inside an unbraced if/else; the caller supplies
 * the terminating semicolon.
 */
#define FUNCTABLE_VERIFY_ASSIGN(VAR, FUNC_NAME) \
    do { \
        if (!(VAR).FUNC_NAME) { \
            fprintf(stderr, "Zlib-ng functable failed initialization!\n"); \
            return 1; \
        } \
        FUNCTABLE_ASSIGN(VAR, FUNC_NAME); \
    } while (0)
| |
/* Functable init & abort on failure.
 * Runs init_functable(); if it reports failure (non-zero), a message is
 * written to stderr and the process is terminated with abort().
 * Abort is needed because some stub functions are reachable without first
 * calling any inflateinit/deflateinit functions, and have no error propagation.
 *
 * The body is wrapped in do { } while (0) so the macro is a single statement;
 * the caller supplies the terminating semicolon.
 */
#define FUNCTABLE_INIT_ABORT \
    do { \
        if (init_functable()) { \
            fprintf(stderr, "Zlib-ng functable failed initialization!\n"); \
            abort(); \
        } \
    } while (0)
| |
/* No-op force_init implementation.
 * init_functable() stores this in the table so that force_init calls made
 * after initialization do nothing and report success (0).
 */
static int force_init_empty(void) {
    const int already_initialized = 0;
    return already_initialized;
}
| |
| /* Functable initialization. |
| * Selects the best available optimized functions appropriate for the runtime cpu. |
| */ |
| static int init_functable(void) { |
| struct functable_s ft; |
| struct cpu_features cf; |
| |
| memset(&ft, 0, sizeof(struct functable_s)); |
| cpu_check_features(&cf); |
| ft.force_init = &force_init_empty; |
| |
| // Set up generic C code fallbacks |
| #ifndef WITH_ALL_FALLBACKS |
| // Only use necessary generic functions when no suitable simd versions are available. |
| # ifdef X86_SSE2_NATIVE |
| // x86_64 always has SSE2 |
| ft.adler32 = &adler32_c; |
| ft.adler32_copy = &adler32_copy_c; |
| ft.crc32 = &crc32_braid; |
| ft.crc32_copy = &crc32_copy_braid; |
| # elif defined(ARM_NEON_NATIVE) |
| # ifndef ARM_CRC32_NATIVE |
| ft.crc32 = &crc32_braid; |
| ft.crc32_copy = &crc32_copy_braid; |
| # endif |
| # elif defined(POWER8_VSX_NATIVE) |
| # ifndef POWER9_NATIVE |
| ft.compare256 = &compare256_c; |
| ft.longest_match = &longest_match_c; |
| ft.longest_match_slow = &longest_match_slow_c; |
| # endif |
| # ifndef POWER8_VSX_CRC32_NATIVE |
| ft.crc32 = &crc32_braid; |
| ft.crc32_copy = &crc32_copy_braid; |
| # endif |
| # elif defined(LOONGARCH_LSX_NATIVE) |
| # ifndef LOONGARCH_CRC |
| ft.crc32 = &crc32_braid; |
| ft.crc32_copy = &crc32_copy_braid; |
| # endif |
| # elif defined(RISCV_RVV_NATIVE) |
| # ifndef RISCV_ZBC_NATIVE |
| ft.crc32 = &crc32_braid; |
| ft.crc32_copy = &crc32_copy_braid; |
| # endif |
| # elif defined(S390_VX_NATIVE) |
| ft.adler32 = &adler32_c; |
| ft.adler32_copy = &adler32_copy_c; |
| ft.chunkmemset_safe = &chunkmemset_safe_c; |
| ft.compare256 = &compare256_c; |
| ft.inflate_fast = &inflate_fast_c; |
| ft.longest_match = &longest_match_c; |
| ft.longest_match_slow = &longest_match_slow_c; |
| ft.slide_hash = &slide_hash_c; |
| # endif |
| #else // WITH_ALL_FALLBACKS |
| ft.adler32 = &adler32_c; |
| ft.adler32_copy = &adler32_copy_c; |
| ft.chunkmemset_safe = &chunkmemset_safe_c; |
| ft.compare256 = &compare256_c; |
| ft.crc32 = &crc32_braid; |
| ft.crc32_copy = &crc32_copy_braid; |
| ft.inflate_fast = &inflate_fast_c; |
| ft.longest_match = &longest_match_c; |
| ft.longest_match_slow = &longest_match_slow_c; |
| ft.slide_hash = &slide_hash_c; |
| #endif |
| |
| // Select arch-optimized functions |
| #ifdef WITH_OPTIM |
| |
| // Chorba generic C fallback |
| #ifndef WITHOUT_CHORBA |
| ft.crc32 = &crc32_chorba; |
| ft.crc32_copy = &crc32_copy_chorba; |
| #endif |
| |
| // X86 - SSE2 |
| #ifdef X86_SSE2 |
| # ifndef X86_SSE2_NATIVE |
| if (cf.x86.has_sse2) |
| # endif |
| { |
| # ifndef X86_AVX2_NATIVE |
| ft.chunkmemset_safe = &chunkmemset_safe_sse2; |
| ft.compare256 = &compare256_sse2; |
| ft.inflate_fast = &inflate_fast_sse2; |
| ft.longest_match = &longest_match_sse2; |
| ft.longest_match_slow = &longest_match_slow_sse2; |
| ft.slide_hash = &slide_hash_sse2; |
| # endif |
| # if !defined(WITHOUT_CHORBA_SSE) && !defined(X86_PCLMULQDQ_NATIVE) |
| ft.crc32 = &crc32_chorba_sse2; |
| ft.crc32_copy = &crc32_copy_chorba_sse2; |
| # endif |
| } |
| #endif |
| // X86 - SSSE3 |
| #ifdef X86_SSSE3 |
| # ifndef X86_SSSE3_NATIVE |
| if (cf.x86.has_ssse3) |
| # endif |
| { |
| ft.adler32 = &adler32_ssse3; |
| ft.adler32_copy = &adler32_copy_ssse3; |
| # ifndef X86_AVX2_NATIVE |
| ft.chunkmemset_safe = &chunkmemset_safe_ssse3; |
| ft.inflate_fast = &inflate_fast_ssse3; |
| # endif |
| } |
| #endif |
| |
| // X86 - SSE4.1 |
| #if defined(X86_SSE41) && !defined(X86_PCLMULQDQ_NATIVE) |
| # ifndef X86_SSE41_NATIVE |
| if (cf.x86.has_sse41) |
| # endif |
| { |
| # ifndef WITHOUT_CHORBA_SSE |
| ft.crc32 = &crc32_chorba_sse41; |
| ft.crc32_copy = &crc32_copy_chorba_sse41; |
| # endif |
| } |
| #endif |
| |
| // X86 - SSE4.2 |
| #if defined(X86_SSE42) && !defined(X86_AVX512_NATIVE) |
| # ifndef X86_SSE42_NATIVE |
| if (cf.x86.has_sse42) |
| # endif |
| { |
| ft.adler32_copy = &adler32_copy_sse42; |
| } |
| #endif |
| // X86 - PCLMUL |
| #if defined(X86_PCLMULQDQ_CRC) && !defined(X86_VPCLMULQDQ_NATIVE) |
| # ifndef X86_PCLMULQDQ_NATIVE |
| if (cf.x86.has_pclmulqdq) |
| # endif |
| { |
| ft.crc32 = &crc32_pclmulqdq; |
| ft.crc32_copy = &crc32_copy_pclmulqdq; |
| } |
| #endif |
| // X86 - AVX2 |
| #ifdef X86_AVX2 |
| /* BMI2 support is all but implicit with AVX2 but let's sanity check this just in case. Enabling BMI2 allows for |
| * flagless shifts, resulting in fewer flag stalls for the pipeline, and allows us to set destination registers |
| * for the shift results as an operand, eliminating several register-register moves when the original value needs |
| * to remain intact. They also allow for a count operand that isn't the CL register, avoiding contention there */ |
| # ifndef X86_AVX2_NATIVE |
| if (cf.x86.has_avx2 && cf.x86.has_bmi2) |
| # endif |
| { |
| # ifndef X86_AVX512_NATIVE |
| ft.adler32 = &adler32_avx2; |
| ft.adler32_copy = &adler32_copy_avx2; |
| ft.chunkmemset_safe = &chunkmemset_safe_avx2; |
| ft.compare256 = &compare256_avx2; |
| ft.inflate_fast = &inflate_fast_avx2; |
| ft.longest_match = &longest_match_avx2; |
| ft.longest_match_slow = &longest_match_slow_avx2; |
| # endif |
| ft.slide_hash = &slide_hash_avx2; |
| } |
| #endif |
| // X86 - AVX512 (F,DQ,BW,Vl) |
| #ifdef X86_AVX512 |
| # ifndef X86_AVX512_NATIVE |
| if (cf.x86.has_avx512_common) |
| # endif |
| { |
| # ifndef X86_AVX512VNNI_NATIVE |
| ft.adler32 = &adler32_avx512; |
| ft.adler32_copy = &adler32_copy_avx512; |
| # endif |
| ft.chunkmemset_safe = &chunkmemset_safe_avx512; |
| ft.compare256 = &compare256_avx512; |
| ft.inflate_fast = &inflate_fast_avx512; |
| ft.longest_match = &longest_match_avx512; |
| ft.longest_match_slow = &longest_match_slow_avx512; |
| } |
| #endif |
| #ifdef X86_AVX512VNNI |
| # ifndef X86_AVX512VNNI_NATIVE |
| if (cf.x86.has_avx512vnni) |
| # endif |
| { |
| ft.adler32 = &adler32_avx512_vnni; |
| ft.adler32_copy = &adler32_copy_avx512_vnni; |
| } |
| #endif |
| // X86 - VPCLMULQDQ (AVX2) |
| #ifdef X86_VPCLMULQDQ_AVX2 |
| # ifndef X86_VPCLMULQDQ_AVX2_NATIVE |
| if (cf.x86.has_pclmulqdq && cf.x86.has_avx2 && cf.x86.has_vpclmulqdq) |
| # endif |
| { |
| ft.crc32 = &crc32_vpclmulqdq_avx2; |
| ft.crc32_copy = &crc32_copy_vpclmulqdq_avx2; |
| } |
| #endif |
| // X86 - VPCLMULQDQ (AVX-512) |
| #ifdef X86_VPCLMULQDQ_AVX512 |
| # ifndef X86_VPCLMULQDQ_AVX512_NATIVE |
| if (cf.x86.has_pclmulqdq && cf.x86.has_avx512_common && cf.x86.has_vpclmulqdq) |
| # endif |
| { |
| ft.crc32 = &crc32_vpclmulqdq_avx512; |
| ft.crc32_copy = &crc32_copy_vpclmulqdq_avx512; |
| } |
| #endif |
| |
| |
| // ARM - SIMD |
| #if defined(ARM_SIMD) && !defined(ARM_NEON_NATIVE) |
| # ifndef ARM_SIMD_NATIVE |
| if (cf.arm.has_simd) |
| # endif |
| { |
| ft.slide_hash = &slide_hash_armv6; |
| } |
| #endif |
| // ARM - NEON |
| #ifdef ARM_NEON |
| # ifndef ARM_NEON_NATIVE |
| if (cf.arm.has_neon) |
| # endif |
| { |
| ft.adler32 = &adler32_neon; |
| ft.adler32_copy = &adler32_copy_neon; |
| ft.chunkmemset_safe = &chunkmemset_safe_neon; |
| ft.compare256 = &compare256_neon; |
| ft.inflate_fast = &inflate_fast_neon; |
| ft.longest_match = &longest_match_neon; |
| ft.longest_match_slow = &longest_match_slow_neon; |
| ft.slide_hash = &slide_hash_neon; |
| } |
| #endif |
| // ARM - CRC32 |
| #if defined(ARM_CRC32) && !defined(ARM_PMULL_EOR3_NATIVE) |
| # ifndef ARM_CRC32_NATIVE |
| if (cf.arm.has_crc32) |
| # endif |
| { |
| ft.crc32 = &crc32_armv8; |
| ft.crc32_copy = &crc32_copy_armv8; |
| } |
| #endif |
| // ARM - PMULL EOR3 |
| #ifdef ARM_PMULL_EOR3 |
| # ifndef ARM_PMULL_EOR3_NATIVE |
| if (cf.arm.has_crc32 && cf.arm.has_pmull && cf.arm.has_eor3 && cf.arm.has_fast_pmull) |
| # endif |
| { |
| ft.crc32 = &crc32_armv8_pmull_eor3; |
| ft.crc32_copy = &crc32_copy_armv8_pmull_eor3; |
| } |
| #endif |
| |
| // Power - VMX |
| #ifdef PPC_VMX |
| # ifndef PPC_VMX_NATIVE |
| if (cf.power.has_altivec) |
| # endif |
| { |
| ft.adler32 = &adler32_vmx; |
| ft.adler32_copy = &adler32_copy_vmx; |
| ft.slide_hash = &slide_hash_vmx; |
| } |
| #endif |
| // Power8 - VSX |
| #ifdef POWER8_VSX |
| # ifndef POWER8_VSX_NATIVE |
| if (cf.power.has_arch_2_07) |
| # endif |
| { |
| ft.adler32 = &adler32_power8; |
| ft.adler32_copy = &adler32_copy_power8; |
| ft.chunkmemset_safe = &chunkmemset_safe_power8; |
| ft.inflate_fast = &inflate_fast_power8; |
| ft.slide_hash = &slide_hash_power8; |
| } |
| #endif |
| #ifdef POWER8_VSX_CRC32 |
| # ifndef POWER8_VSX_CRC32_NATIVE |
| if (cf.power.has_arch_2_07) |
| # endif |
| { |
| ft.crc32 = &crc32_power8; |
| ft.crc32_copy = &crc32_copy_power8; |
| } |
| #endif |
| // Power9 |
| #ifdef POWER9 |
| # ifndef POWER9_NATIVE |
| if (cf.power.has_arch_3_00) |
| # endif |
| { |
| ft.compare256 = &compare256_power9; |
| ft.longest_match = &longest_match_power9; |
| ft.longest_match_slow = &longest_match_slow_power9; |
| } |
| #endif |
| |
| |
| // RISCV - RVV |
| #ifdef RISCV_RVV |
| # ifndef RISCV_RVV_NATIVE |
| if (cf.riscv.has_rvv) |
| # endif |
| { |
| ft.adler32 = &adler32_rvv; |
| ft.adler32_copy = &adler32_copy_rvv; |
| ft.chunkmemset_safe = &chunkmemset_safe_rvv; |
| ft.compare256 = &compare256_rvv; |
| ft.inflate_fast = &inflate_fast_rvv; |
| ft.longest_match = &longest_match_rvv; |
| ft.longest_match_slow = &longest_match_slow_rvv; |
| ft.slide_hash = &slide_hash_rvv; |
| } |
| #endif |
| |
| // RISCV - ZBC |
| #ifdef RISCV_CRC32_ZBC |
| # ifndef RISCV_ZBC_NATIVE |
| if (cf.riscv.has_zbc) |
| # endif |
| { |
| ft.crc32 = &crc32_riscv64_zbc; |
| ft.crc32_copy = &crc32_copy_riscv64_zbc; |
| } |
| #endif |
| |
| // S390 |
| #ifdef S390_VX |
| # ifndef S390_VX_NATIVE |
| if (cf.s390.has_vx) |
| # endif |
| { |
| ft.crc32 = &crc32_s390_vx; |
| ft.crc32_copy = &crc32_copy_s390_vx; |
| ft.slide_hash = &slide_hash_vx; |
| } |
| #endif |
| |
| // LOONGARCH |
| #ifdef LOONGARCH_CRC |
| # ifndef LOONGARCH_CRC_NATIVE |
| if (cf.loongarch.has_crc) |
| # endif |
| { |
| ft.crc32 = &crc32_loongarch64; |
| ft.crc32_copy = &crc32_copy_loongarch64; |
| } |
| #endif |
| #if defined(LOONGARCH_LSX) && !defined(LOONGARCH_LASX_NATIVE) |
| # ifndef LOONGARCH_LSX_NATIVE |
| if (cf.loongarch.has_lsx) |
| # endif |
| { |
| ft.adler32 = &adler32_lsx; |
| ft.adler32_copy = &adler32_copy_lsx; |
| ft.chunkmemset_safe = &chunkmemset_safe_lsx; |
| ft.compare256 = &compare256_lsx; |
| ft.inflate_fast = &inflate_fast_lsx; |
| ft.longest_match = &longest_match_lsx; |
| ft.longest_match_slow = &longest_match_slow_lsx; |
| ft.slide_hash = &slide_hash_lsx; |
| } |
| #endif |
| #ifdef LOONGARCH_LASX |
| # ifndef LOONGARCH_LASX_NATIVE |
| if (cf.loongarch.has_lasx) |
| # endif |
| { |
| ft.adler32 = &adler32_lasx; |
| ft.adler32_copy = &adler32_copy_lasx; |
| ft.chunkmemset_safe = &chunkmemset_safe_lasx; |
| ft.compare256 = &compare256_lasx; |
| ft.inflate_fast = &inflate_fast_lasx; |
| ft.longest_match = &longest_match_lasx; |
| ft.longest_match_slow = &longest_match_slow_lasx; |
| ft.slide_hash = &slide_hash_lasx; |
| } |
| #endif |
| |
| #endif // WITH_OPTIM |
| |
| // Assign function pointers individually for atomic operation |
| FUNCTABLE_ASSIGN(ft, force_init); |
| FUNCTABLE_VERIFY_ASSIGN(ft, adler32); |
| FUNCTABLE_VERIFY_ASSIGN(ft, adler32_copy); |
| FUNCTABLE_VERIFY_ASSIGN(ft, chunkmemset_safe); |
| FUNCTABLE_VERIFY_ASSIGN(ft, compare256); |
| FUNCTABLE_VERIFY_ASSIGN(ft, crc32); |
| FUNCTABLE_VERIFY_ASSIGN(ft, crc32_copy); |
| FUNCTABLE_VERIFY_ASSIGN(ft, inflate_fast); |
| FUNCTABLE_VERIFY_ASSIGN(ft, longest_match); |
| FUNCTABLE_VERIFY_ASSIGN(ft, longest_match_slow); |
| FUNCTABLE_VERIFY_ASSIGN(ft, slide_hash); |
| |
| // Memory barrier for weak memory order CPUs |
| FUNCTABLE_BARRIER(); |
| |
| return Z_OK; |
| } |
| |
| /* stub functions */ |
/* Initial force_init entry: runs init_functable() and hands its status back
 * to the caller unchanged.
 */
static int force_init_stub(void) {
    const int status = init_functable();
    return status;
}
| |
| static uint32_t adler32_stub(uint32_t adler, const uint8_t* buf, size_t len) { |
| FUNCTABLE_INIT_ABORT; |
| return functable.adler32(adler, buf, len); |
| } |
| |
| static uint32_t adler32_copy_stub(uint32_t adler, uint8_t* dst, const uint8_t* src, size_t len) { |
| FUNCTABLE_INIT_ABORT; |
| return functable.adler32_copy(adler, dst, src, len); |
| } |
| |
| static uint8_t* chunkmemset_safe_stub(uint8_t* out, uint8_t *from, size_t len, size_t left) { |
| FUNCTABLE_INIT_ABORT; |
| return functable.chunkmemset_safe(out, from, len, left); |
| } |
| |
| static uint32_t compare256_stub(const uint8_t* src0, const uint8_t* src1) { |
| FUNCTABLE_INIT_ABORT; |
| return functable.compare256(src0, src1); |
| } |
| |
| static uint32_t crc32_stub(uint32_t crc, const uint8_t* buf, size_t len) { |
| FUNCTABLE_INIT_ABORT; |
| return functable.crc32(crc, buf, len); |
| } |
| |
| static uint32_t crc32_copy_stub(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len) { |
| FUNCTABLE_INIT_ABORT; |
| return functable.crc32_copy(crc, dst, src, len); |
| } |
| |
| static void inflate_fast_stub(PREFIX3(stream) *strm, uint32_t start) { |
| FUNCTABLE_INIT_ABORT; |
| functable.inflate_fast(strm, start); |
| } |
| |
| static uint32_t longest_match_stub(deflate_state* const s, uint32_t cur_match) { |
| FUNCTABLE_INIT_ABORT; |
| return functable.longest_match(s, cur_match); |
| } |
| |
| static uint32_t longest_match_slow_stub(deflate_state* const s, uint32_t cur_match) { |
| FUNCTABLE_INIT_ABORT; |
| return functable.longest_match_slow(s, cur_match); |
| } |
| |
| static void slide_hash_stub(deflate_state* s) { |
| FUNCTABLE_INIT_ABORT; |
| functable.slide_hash(s); |
| } |
| |
| /* functable init */ |
| Z_INTERNAL struct functable_s functable = { |
| force_init_stub, |
| adler32_stub, |
| adler32_copy_stub, |
| chunkmemset_safe_stub, |
| compare256_stub, |
| crc32_stub, |
| crc32_copy_stub, |
| inflate_fast_stub, |
| longest_match_stub, |
| longest_match_slow_stub, |
| slide_hash_stub, |
| }; |
| |
| #endif |