blob: fad863ecb64a0ccdf55881ecc74c348d1aa9d32f [file]
/* functable.c -- Choose relevant optimized functions at runtime
* Copyright (C) 2017 Hans Kristian Rosbach
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifndef DISABLE_RUNTIME_CPU_DETECTION
#include "zbuild.h"
#if defined(_MSC_VER)
# include <intrin.h>
#endif
#include "functable.h"
#include "cpu_features.h"
#include "arch_functions.h"
/* Platform has pointer size atomic store */
/* FUNCTABLE_ASSIGN(VAR, FUNC_NAME) stores the function pointer VAR.FUNC_NAME into the
 * member of the same name in the global functable.
 * FUNCTABLE_BARRIER() is the memory barrier used together with those stores.
 * Both are selected per compiler below.
 */
/* GCC/Clang: __atomic builtins with __ATOMIC_SEQ_CST for both the store and the fence. */
#if defined(__GNUC__) || defined(__clang__)
# define FUNCTABLE_ASSIGN(VAR, FUNC_NAME) \
__atomic_store(&(functable.FUNC_NAME), &(VAR.FUNC_NAME), __ATOMIC_SEQ_CST)
# define FUNCTABLE_BARRIER() __atomic_thread_fence(__ATOMIC_SEQ_CST)
/* MSVC: the store is done with _InterlockedExchangePointer (declared in <intrin.h>). */
#elif defined(_MSC_VER)
# define FUNCTABLE_ASSIGN(VAR, FUNC_NAME) \
_InterlockedExchangePointer((void * volatile *)&(functable.FUNC_NAME), (void *)(VAR.FUNC_NAME))
/* MSVC on ARM: a __dmb(0xB) instruction wrapped in _ReadWriteBarrier() calls. */
# ifdef ARCH_ARM
# define FUNCTABLE_BARRIER() do { \
_ReadWriteBarrier(); \
__dmb(0xB); /* _ARM_BARRIER_ISH */ \
_ReadWriteBarrier(); \
} while (0)
/* MSVC on other architectures: _ReadWriteBarrier() only. */
# else
# define FUNCTABLE_BARRIER() _ReadWriteBarrier()
# endif
/* Unknown compiler: emit a build warning, store through a volatile pointer and
 * use an empty barrier. No atomic intrinsic is used on this path.
 */
#else
# warning Unable to detect atomic intrinsic support.
# define FUNCTABLE_ASSIGN(VAR, FUNC_NAME) \
*((void * volatile *)&(functable.FUNC_NAME)) = (void *)(VAR.FUNC_NAME)
# define FUNCTABLE_BARRIER() do { /* Empty */ } while (0)
#endif
/* Verify that VAR.FUNC_NAME is non-NULL before assigning it to functable.
 * On a NULL pointer a diagnostic is written to stderr and the ENCLOSING function
 * returns 1, so this macro may only be used inside a function returning int.
 * This allows inflateinit/deflateinit functions to gracefully return Z_VERSION_ERROR
 * if functable initialization fails.
 *
 * The body is wrapped in do { } while (0) so the macro expands to exactly one
 * statement and stays correct inside unbraced if/else bodies; the caller supplies
 * the terminating semicolon.
 */
#define FUNCTABLE_VERIFY_ASSIGN(VAR, FUNC_NAME) \
    do { \
        if (!(VAR).FUNC_NAME) { \
            fprintf(stderr, "Zlib-ng functable failed initialization!\n"); \
            return 1; \
        } \
        FUNCTABLE_ASSIGN(VAR, FUNC_NAME); \
    } while (0)
/* Functable init & abort on failure.
 * Runs init_functable(); if it reports failure (non-zero), prints a diagnostic to
 * stderr and calls abort().
 * Abort is needed because some stub functions are reachable without first
 * calling any inflateinit/deflateinit functions, and have no error propagation.
 *
 * The body is wrapped in do { } while (0) so the macro expands to exactly one
 * statement; the caller supplies the terminating semicolon.
 */
#define FUNCTABLE_INIT_ABORT \
    do { \
        if (init_functable()) { \
            fprintf(stderr, "Zlib-ng functable failed initialization!\n"); \
            abort(); \
        } \
    } while (0)
/* No-op replacement for the force_init entry.
 * init_functable() stores this function in ft.force_init, so once the table has
 * been set up a call through force_init does nothing and reports success (0).
 */
static int force_init_empty(void) {
    const int already_initialized = 0;

    return already_initialized;
}
/* Functable initialization.
 * Selects the best available optimized functions appropriate for the runtime cpu.
 *
 * A local table is filled in three steps:
 *   1. generic C fallbacks (all of them with WITH_ALL_FALLBACKS, otherwise only the
 *      ones not covered by a *_NATIVE instruction set),
 *   2. with WITH_OPTIM, arch-specific implementations; later sections overwrite
 *      earlier ones, and each section is guarded by a runtime cpu feature check
 *      unless the matching *_NATIVE macro is defined,
 *   3. every pointer is checked for NULL and stored into the global functable.
 *
 * Returns Z_OK when every entry was assigned. Returns 1 (from
 * FUNCTABLE_VERIFY_ASSIGN) as soon as an entry is found to be NULL; in that case
 * functable.force_init is left untouched, so the next call through force_init
 * runs the initialization again and reports the failure again.
 */
static int init_functable(void) {
    struct functable_s ft;
    struct cpu_features cf;

    // Start from an all-NULL table so unassigned entries are detected below
    memset(&ft, 0, sizeof(ft));
    cpu_check_features(&cf);
    ft.force_init = &force_init_empty;

    // Set up generic C code fallbacks
#ifndef WITH_ALL_FALLBACKS
    // Only use necessary generic functions when no suitable simd versions are available.
#  ifdef X86_SSE2_NATIVE
    // x86_64 always has SSE2
    ft.adler32 = &adler32_c;
    ft.adler32_copy = &adler32_copy_c;
    ft.crc32 = &crc32_braid;
    ft.crc32_copy = &crc32_copy_braid;
#  elif defined(ARM_NEON_NATIVE)
#    ifndef ARM_CRC32_NATIVE
    ft.crc32 = &crc32_braid;
    ft.crc32_copy = &crc32_copy_braid;
#    endif
#  elif defined(POWER8_VSX_NATIVE)
#    ifndef POWER9_NATIVE
    ft.compare256 = &compare256_c;
    ft.longest_match = &longest_match_c;
    ft.longest_match_slow = &longest_match_slow_c;
#    endif
#    ifndef POWER8_VSX_CRC32_NATIVE
    ft.crc32 = &crc32_braid;
    ft.crc32_copy = &crc32_copy_braid;
#    endif
#  elif defined(LOONGARCH_LSX_NATIVE)
    // NOTE(review): the sibling branches test a *_NATIVE macro here, while the runtime
    // check further down is guarded by LOONGARCH_CRC_NATIVE - confirm LOONGARCH_CRC is intended.
#    ifndef LOONGARCH_CRC
    ft.crc32 = &crc32_braid;
    ft.crc32_copy = &crc32_copy_braid;
#    endif
#  elif defined(RISCV_RVV_NATIVE)
#    ifndef RISCV_ZBC_NATIVE
    ft.crc32 = &crc32_braid;
    ft.crc32_copy = &crc32_copy_braid;
#    endif
#  elif defined(S390_VX_NATIVE)
    ft.adler32 = &adler32_c;
    ft.adler32_copy = &adler32_copy_c;
    ft.chunkmemset_safe = &chunkmemset_safe_c;
    ft.compare256 = &compare256_c;
    ft.inflate_fast = &inflate_fast_c;
    ft.longest_match = &longest_match_c;
    ft.longest_match_slow = &longest_match_slow_c;
    ft.slide_hash = &slide_hash_c;
#  endif
#else // WITH_ALL_FALLBACKS
    ft.adler32 = &adler32_c;
    ft.adler32_copy = &adler32_copy_c;
    ft.chunkmemset_safe = &chunkmemset_safe_c;
    ft.compare256 = &compare256_c;
    ft.crc32 = &crc32_braid;
    ft.crc32_copy = &crc32_copy_braid;
    ft.inflate_fast = &inflate_fast_c;
    ft.longest_match = &longest_match_c;
    ft.longest_match_slow = &longest_match_slow_c;
    ft.slide_hash = &slide_hash_c;
#endif

    // Select arch-optimized functions
#ifdef WITH_OPTIM

    // Chorba generic C fallback
#ifndef WITHOUT_CHORBA
    ft.crc32 = &crc32_chorba;
    ft.crc32_copy = &crc32_copy_chorba;
#endif

    // X86 - SSE2
#ifdef X86_SSE2
#  ifndef X86_SSE2_NATIVE
    if (cf.x86.has_sse2)
#  endif
    {
#  ifndef X86_AVX2_NATIVE
        ft.chunkmemset_safe = &chunkmemset_safe_sse2;
        ft.compare256 = &compare256_sse2;
        ft.inflate_fast = &inflate_fast_sse2;
        ft.longest_match = &longest_match_sse2;
        ft.longest_match_slow = &longest_match_slow_sse2;
        ft.slide_hash = &slide_hash_sse2;
#  endif
#  if !defined(WITHOUT_CHORBA_SSE) && !defined(X86_PCLMULQDQ_NATIVE)
        ft.crc32 = &crc32_chorba_sse2;
        ft.crc32_copy = &crc32_copy_chorba_sse2;
#  endif
    }
#endif
    // X86 - SSSE3
#ifdef X86_SSSE3
#  ifndef X86_SSSE3_NATIVE
    if (cf.x86.has_ssse3)
#  endif
    {
        ft.adler32 = &adler32_ssse3;
        ft.adler32_copy = &adler32_copy_ssse3;
#  ifndef X86_AVX2_NATIVE
        ft.chunkmemset_safe = &chunkmemset_safe_ssse3;
        ft.inflate_fast = &inflate_fast_ssse3;
#  endif
    }
#endif
    // X86 - SSE4.1
#if defined(X86_SSE41) && !defined(X86_PCLMULQDQ_NATIVE)
#  ifndef X86_SSE41_NATIVE
    if (cf.x86.has_sse41)
#  endif
    {
#  ifndef WITHOUT_CHORBA_SSE
        ft.crc32 = &crc32_chorba_sse41;
        ft.crc32_copy = &crc32_copy_chorba_sse41;
#  endif
    }
#endif
    // X86 - SSE4.2
#if defined(X86_SSE42) && !defined(X86_AVX512_NATIVE)
#  ifndef X86_SSE42_NATIVE
    if (cf.x86.has_sse42)
#  endif
    {
        ft.adler32_copy = &adler32_copy_sse42;
    }
#endif
    // X86 - PCLMUL
#if defined(X86_PCLMULQDQ_CRC) && !defined(X86_VPCLMULQDQ_NATIVE)
#  ifndef X86_PCLMULQDQ_NATIVE
    if (cf.x86.has_pclmulqdq)
#  endif
    {
        ft.crc32 = &crc32_pclmulqdq;
        ft.crc32_copy = &crc32_copy_pclmulqdq;
    }
#endif
    // X86 - AVX2
#ifdef X86_AVX2
    /* BMI2 support is all but implicit with AVX2 but let's sanity check this just in case. Enabling BMI2 allows for
     * flagless shifts, resulting in fewer flag stalls for the pipeline, and allows us to set destination registers
     * for the shift results as an operand, eliminating several register-register moves when the original value needs
     * to remain intact. They also allow for a count operand that isn't the CL register, avoiding contention there */
#  ifndef X86_AVX2_NATIVE
    if (cf.x86.has_avx2 && cf.x86.has_bmi2)
#  endif
    {
#  ifndef X86_AVX512_NATIVE
        ft.adler32 = &adler32_avx2;
        ft.adler32_copy = &adler32_copy_avx2;
        ft.chunkmemset_safe = &chunkmemset_safe_avx2;
        ft.compare256 = &compare256_avx2;
        ft.inflate_fast = &inflate_fast_avx2;
        ft.longest_match = &longest_match_avx2;
        ft.longest_match_slow = &longest_match_slow_avx2;
#  endif
        ft.slide_hash = &slide_hash_avx2;
    }
#endif
    // X86 - AVX512 (F,DQ,BW,Vl)
#ifdef X86_AVX512
#  ifndef X86_AVX512_NATIVE
    if (cf.x86.has_avx512_common)
#  endif
    {
#  ifndef X86_AVX512VNNI_NATIVE
        ft.adler32 = &adler32_avx512;
        ft.adler32_copy = &adler32_copy_avx512;
#  endif
        ft.chunkmemset_safe = &chunkmemset_safe_avx512;
        ft.compare256 = &compare256_avx512;
        ft.inflate_fast = &inflate_fast_avx512;
        ft.longest_match = &longest_match_avx512;
        ft.longest_match_slow = &longest_match_slow_avx512;
    }
#endif
#ifdef X86_AVX512VNNI
#  ifndef X86_AVX512VNNI_NATIVE
    if (cf.x86.has_avx512vnni)
#  endif
    {
        ft.adler32 = &adler32_avx512_vnni;
        ft.adler32_copy = &adler32_copy_avx512_vnni;
    }
#endif
    // X86 - VPCLMULQDQ (AVX2)
#ifdef X86_VPCLMULQDQ_AVX2
#  ifndef X86_VPCLMULQDQ_AVX2_NATIVE
    if (cf.x86.has_pclmulqdq && cf.x86.has_avx2 && cf.x86.has_vpclmulqdq)
#  endif
    {
        ft.crc32 = &crc32_vpclmulqdq_avx2;
        ft.crc32_copy = &crc32_copy_vpclmulqdq_avx2;
    }
#endif
    // X86 - VPCLMULQDQ (AVX-512)
#ifdef X86_VPCLMULQDQ_AVX512
#  ifndef X86_VPCLMULQDQ_AVX512_NATIVE
    if (cf.x86.has_pclmulqdq && cf.x86.has_avx512_common && cf.x86.has_vpclmulqdq)
#  endif
    {
        ft.crc32 = &crc32_vpclmulqdq_avx512;
        ft.crc32_copy = &crc32_copy_vpclmulqdq_avx512;
    }
#endif

    // ARM - SIMD
#if defined(ARM_SIMD) && !defined(ARM_NEON_NATIVE)
#  ifndef ARM_SIMD_NATIVE
    if (cf.arm.has_simd)
#  endif
    {
        ft.slide_hash = &slide_hash_armv6;
    }
#endif
    // ARM - NEON
#ifdef ARM_NEON
#  ifndef ARM_NEON_NATIVE
    if (cf.arm.has_neon)
#  endif
    {
        ft.adler32 = &adler32_neon;
        ft.adler32_copy = &adler32_copy_neon;
        ft.chunkmemset_safe = &chunkmemset_safe_neon;
        ft.compare256 = &compare256_neon;
        ft.inflate_fast = &inflate_fast_neon;
        ft.longest_match = &longest_match_neon;
        ft.longest_match_slow = &longest_match_slow_neon;
        ft.slide_hash = &slide_hash_neon;
    }
#endif
    // ARM - CRC32
#if defined(ARM_CRC32) && !defined(ARM_PMULL_EOR3_NATIVE)
#  ifndef ARM_CRC32_NATIVE
    if (cf.arm.has_crc32)
#  endif
    {
        ft.crc32 = &crc32_armv8;
        ft.crc32_copy = &crc32_copy_armv8;
    }
#endif
    // ARM - PMULL EOR3
#ifdef ARM_PMULL_EOR3
#  ifndef ARM_PMULL_EOR3_NATIVE
    if (cf.arm.has_crc32 && cf.arm.has_pmull && cf.arm.has_eor3 && cf.arm.has_fast_pmull)
#  endif
    {
        ft.crc32 = &crc32_armv8_pmull_eor3;
        ft.crc32_copy = &crc32_copy_armv8_pmull_eor3;
    }
#endif

    // Power - VMX
#ifdef PPC_VMX
#  ifndef PPC_VMX_NATIVE
    if (cf.power.has_altivec)
#  endif
    {
        ft.adler32 = &adler32_vmx;
        ft.adler32_copy = &adler32_copy_vmx;
        ft.slide_hash = &slide_hash_vmx;
    }
#endif
    // Power8 - VSX
#ifdef POWER8_VSX
#  ifndef POWER8_VSX_NATIVE
    if (cf.power.has_arch_2_07)
#  endif
    {
        ft.adler32 = &adler32_power8;
        ft.adler32_copy = &adler32_copy_power8;
        ft.chunkmemset_safe = &chunkmemset_safe_power8;
        ft.inflate_fast = &inflate_fast_power8;
        ft.slide_hash = &slide_hash_power8;
    }
#endif
#ifdef POWER8_VSX_CRC32
#  ifndef POWER8_VSX_CRC32_NATIVE
    if (cf.power.has_arch_2_07)
#  endif
    {
        ft.crc32 = &crc32_power8;
        ft.crc32_copy = &crc32_copy_power8;
    }
#endif
    // Power9
#ifdef POWER9
#  ifndef POWER9_NATIVE
    if (cf.power.has_arch_3_00)
#  endif
    {
        ft.compare256 = &compare256_power9;
        ft.longest_match = &longest_match_power9;
        ft.longest_match_slow = &longest_match_slow_power9;
    }
#endif

    // RISCV - RVV
#ifdef RISCV_RVV
#  ifndef RISCV_RVV_NATIVE
    if (cf.riscv.has_rvv)
#  endif
    {
        ft.adler32 = &adler32_rvv;
        ft.adler32_copy = &adler32_copy_rvv;
        ft.chunkmemset_safe = &chunkmemset_safe_rvv;
        ft.compare256 = &compare256_rvv;
        ft.inflate_fast = &inflate_fast_rvv;
        ft.longest_match = &longest_match_rvv;
        ft.longest_match_slow = &longest_match_slow_rvv;
        ft.slide_hash = &slide_hash_rvv;
    }
#endif
    // RISCV - ZBC
#ifdef RISCV_CRC32_ZBC
#  ifndef RISCV_ZBC_NATIVE
    if (cf.riscv.has_zbc)
#  endif
    {
        ft.crc32 = &crc32_riscv64_zbc;
        ft.crc32_copy = &crc32_copy_riscv64_zbc;
    }
#endif

    // S390
#ifdef S390_VX
#  ifndef S390_VX_NATIVE
    if (cf.s390.has_vx)
#  endif
    {
        ft.crc32 = &crc32_s390_vx;
        ft.crc32_copy = &crc32_copy_s390_vx;
        ft.slide_hash = &slide_hash_vx;
    }
#endif

    // LOONGARCH
#ifdef LOONGARCH_CRC
#  ifndef LOONGARCH_CRC_NATIVE
    if (cf.loongarch.has_crc)
#  endif
    {
        ft.crc32 = &crc32_loongarch64;
        ft.crc32_copy = &crc32_copy_loongarch64;
    }
#endif
#if defined(LOONGARCH_LSX) && !defined(LOONGARCH_LASX_NATIVE)
#  ifndef LOONGARCH_LSX_NATIVE
    if (cf.loongarch.has_lsx)
#  endif
    {
        ft.adler32 = &adler32_lsx;
        ft.adler32_copy = &adler32_copy_lsx;
        ft.chunkmemset_safe = &chunkmemset_safe_lsx;
        ft.compare256 = &compare256_lsx;
        ft.inflate_fast = &inflate_fast_lsx;
        ft.longest_match = &longest_match_lsx;
        ft.longest_match_slow = &longest_match_slow_lsx;
        ft.slide_hash = &slide_hash_lsx;
    }
#endif
#ifdef LOONGARCH_LASX
#  ifndef LOONGARCH_LASX_NATIVE
    if (cf.loongarch.has_lasx)
#  endif
    {
        ft.adler32 = &adler32_lasx;
        ft.adler32_copy = &adler32_copy_lasx;
        ft.chunkmemset_safe = &chunkmemset_safe_lasx;
        ft.compare256 = &compare256_lasx;
        ft.inflate_fast = &inflate_fast_lasx;
        ft.longest_match = &longest_match_lasx;
        ft.longest_match_slow = &longest_match_slow_lasx;
        ft.slide_hash = &slide_hash_lasx;
    }
#endif

#endif // WITH_OPTIM

    // Assign function pointers individually for atomic operation.
    // Each entry is checked for NULL first; a NULL entry makes this function return 1.
    FUNCTABLE_VERIFY_ASSIGN(ft, adler32);
    FUNCTABLE_VERIFY_ASSIGN(ft, adler32_copy);
    FUNCTABLE_VERIFY_ASSIGN(ft, chunkmemset_safe);
    FUNCTABLE_VERIFY_ASSIGN(ft, compare256);
    FUNCTABLE_VERIFY_ASSIGN(ft, crc32);
    FUNCTABLE_VERIFY_ASSIGN(ft, crc32_copy);
    FUNCTABLE_VERIFY_ASSIGN(ft, inflate_fast);
    FUNCTABLE_VERIFY_ASSIGN(ft, longest_match);
    FUNCTABLE_VERIFY_ASSIGN(ft, longest_match_slow);
    FUNCTABLE_VERIFY_ASSIGN(ft, slide_hash);

    // Replace force_init with the no-op only after every entry above was verified
    // and stored. If verification fails, force_init keeps pointing at the
    // initializing stub, so a later call reports the failure again instead of
    // returning success on a table that still contains stubs.
    FUNCTABLE_ASSIGN(ft, force_init);

    // Memory barrier for weak memory order CPUs
    FUNCTABLE_BARRIER();

    return Z_OK;
}
#if !defined(_WIN32) && (defined(__GNUC__) || defined(__clang__))
/* On non-Windows GCC/Clang builds this function carries the constructor attribute
 * and runs FUNCTABLE_INIT_ABORT, i.e. it initializes the functable and aborts
 * the process if init_functable() reports a failure.
 */
__attribute__((constructor))
static void functable_constructor(void)
{
    FUNCTABLE_INIT_ABORT;
}
#endif
/* stub functions */
/* Initial functable.force_init entry.
 * Runs init_functable() and passes its status straight back to the caller
 * (no abort on failure, unlike the other stubs).
 */
static int force_init_stub(void)
{
    const int init_status = init_functable();

    return init_status;
}
/* Initial functable.adler32 entry.
 * Initializes the functable (aborting on failure) and then forwards the call,
 * with unchanged arguments, to whatever functable.adler32 points at afterwards.
 */
static uint32_t adler32_stub(uint32_t adler, const uint8_t *buf, size_t len)
{
    FUNCTABLE_INIT_ABORT;

    const uint32_t checksum = functable.adler32(adler, buf, len);
    return checksum;
}
/* Initial functable.adler32_copy entry.
 * Initializes the functable (aborting on failure) and then forwards the call,
 * with unchanged arguments, to whatever functable.adler32_copy points at afterwards.
 */
static uint32_t adler32_copy_stub(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len)
{
    FUNCTABLE_INIT_ABORT;

    const uint32_t checksum = functable.adler32_copy(adler, dst, src, len);
    return checksum;
}
/* Initial functable.chunkmemset_safe entry.
 * Initializes the functable (aborting on failure) and then forwards the call,
 * with unchanged arguments, to whatever functable.chunkmemset_safe points at afterwards.
 */
static uint8_t *chunkmemset_safe_stub(uint8_t *out, uint8_t *from, size_t len, size_t left)
{
    FUNCTABLE_INIT_ABORT;

    uint8_t *const result = functable.chunkmemset_safe(out, from, len, left);
    return result;
}
/* Initial functable.compare256 entry.
 * Initializes the functable (aborting on failure) and then forwards the call,
 * with unchanged arguments, to whatever functable.compare256 points at afterwards.
 */
static uint32_t compare256_stub(const uint8_t *src0, const uint8_t *src1)
{
    FUNCTABLE_INIT_ABORT;

    const uint32_t result = functable.compare256(src0, src1);
    return result;
}
/* Initial functable.crc32 entry.
 * Initializes the functable (aborting on failure) and then forwards the call,
 * with unchanged arguments, to whatever functable.crc32 points at afterwards.
 */
static uint32_t crc32_stub(uint32_t crc, const uint8_t *buf, size_t len)
{
    FUNCTABLE_INIT_ABORT;

    const uint32_t checksum = functable.crc32(crc, buf, len);
    return checksum;
}
/* Initial functable.crc32_copy entry.
 * Initializes the functable (aborting on failure) and then forwards the call,
 * with unchanged arguments, to whatever functable.crc32_copy points at afterwards.
 */
static uint32_t crc32_copy_stub(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len)
{
    FUNCTABLE_INIT_ABORT;

    const uint32_t checksum = functable.crc32_copy(crc, dst, src, len);
    return checksum;
}
/* Initial functable.inflate_fast entry.
 * Initializes the functable (aborting on failure) and then forwards the call,
 * with unchanged arguments, to whatever functable.inflate_fast points at afterwards.
 */
static void inflate_fast_stub(PREFIX3(stream) *strm, uint32_t start)
{
    FUNCTABLE_INIT_ABORT;

    functable.inflate_fast(strm, start);
}
/* Initial functable.longest_match entry.
 * Initializes the functable (aborting on failure) and then forwards the call,
 * with unchanged arguments, to whatever functable.longest_match points at afterwards.
 */
static uint32_t longest_match_stub(deflate_state *const s, uint32_t cur_match)
{
    FUNCTABLE_INIT_ABORT;

    const uint32_t match_len = functable.longest_match(s, cur_match);
    return match_len;
}
/* Initial functable.longest_match_slow entry.
 * Initializes the functable (aborting on failure) and then forwards the call,
 * with unchanged arguments, to whatever functable.longest_match_slow points at afterwards.
 */
static uint32_t longest_match_slow_stub(deflate_state *const s, uint32_t cur_match)
{
    FUNCTABLE_INIT_ABORT;

    const uint32_t match_len = functable.longest_match_slow(s, cur_match);
    return match_len;
}
/* Initial functable.slide_hash entry.
 * Initializes the functable (aborting on failure) and then forwards the call,
 * with unchanged argument, to whatever functable.slide_hash points at afterwards.
 */
static void slide_hash_stub(deflate_state *s)
{
    FUNCTABLE_INIT_ABORT;

    functable.slide_hash(s);
}
/* functable init */
Z_INTERNAL struct functable_s functable = {
force_init_stub,
adler32_stub,
adler32_copy_stub,
chunkmemset_safe_stub,
compare256_stub,
crc32_stub,
crc32_copy_stub,
inflate_fast_stub,
longest_match_stub,
longest_match_slow_stub,
slide_hash_stub,
};
#endif