Have "x86_avx2" require PCLMUL, POPCNT and SSE4.2
The std/crc32 benchmark numbers get worse. The benches still run AVX2
code, but it runs slower for an unknown reason. The std/gzip and std/png
benchmark numbers are mixed.
Still, it's cleaner for Wuffs' "x86_avx2" to be a superset of
"x86_sse42" (and, apart from PCLMUL, they're now similar to Clang 12 and
GCC 11's "x86-64-v3" and "x86-64-v2" micro-architecture feature levels,
respectively).
Maybe future compiler versions can claw back the std/crc32 performance.
On a mid-range x86_64 laptop (2016, Skylake):
name old speed new speed delta
wuffs_crc32_ieee_10k/clang9 8.07GB/s ± 0% 7.61GB/s ± 1% -5.72% (p=0.000 n=8+8)
wuffs_crc32_ieee_100k/clang9 11.0GB/s ± 1% 9.7GB/s ± 3% -11.45% (p=0.000 n=7+8)
wuffs_crc32_ieee_10k/gcc10 9.77GB/s ± 1% 7.72GB/s ± 1% -21.00% (p=0.000 n=10+9)
wuffs_crc32_ieee_100k/gcc10 12.7GB/s ± 2% 11.9GB/s ± 2% -5.84% (p=0.000 n=9+9)
wuffs_gzip_decode_10k/clang9 217MB/s ± 2% 221MB/s ± 0% +1.80% (p=0.000 n=10+8)
wuffs_gzip_decode_100k/clang9 280MB/s ± 1% 283MB/s ± 0% +1.38% (p=0.000 n=9+8)
wuffs_gzip_decode_10k/gcc10 215MB/s ± 1% 220MB/s ± 1% +2.52% (p=0.000 n=10+9)
wuffs_gzip_decode_100k/gcc10 272MB/s ± 0% 274MB/s ± 1% +0.74% (p=0.000 n=9+10)
wuffs_png_decode_image_19k_8bpp/clang9 136MB/s ± 0% 142MB/s ± 0% +3.95% (p=0.000 n=8+8)
wuffs_png_decode_image_40k_24bpp/clang9 171MB/s ± 1% 172MB/s ± 0% +0.84% (p=0.000 n=10+8)
wuffs_png_decode_image_77k_8bpp/clang9 492MB/s ± 3% 501MB/s ± 2% +1.88% (p=0.009 n=10+10)
wuffs_png_decode_image_552k_32bpp_ignore_checksum/clang9 456MB/s ± 1% 461MB/s ± 1% +1.04% (p=0.000 n=9+9)
wuffs_png_decode_image_552k_32bpp_verify_checksum/clang9 439MB/s ± 1% 440MB/s ± 3% ~ (p=0.173 n=8+10)
wuffs_png_decode_image_4002k_24bpp/clang9 172MB/s ± 1% 174MB/s ± 1% +1.29% (p=0.005 n=10+10)
wuffs_png_decode_image_19k_8bpp/gcc10 152MB/s ± 0% 149MB/s ± 1% -1.92% (p=0.000 n=9+9)
wuffs_png_decode_image_40k_24bpp/gcc10 184MB/s ± 0% 186MB/s ± 1% +1.37% (p=0.000 n=8+9)
wuffs_png_decode_image_77k_8bpp/gcc10 550MB/s ± 1% 542MB/s ± 2% -1.46% (p=0.000 n=10+10)
wuffs_png_decode_image_552k_32bpp_ignore_checksum/gcc10 473MB/s ± 1% 478MB/s ± 1% +1.23% (p=0.000 n=9+10)
wuffs_png_decode_image_552k_32bpp_verify_checksum/gcc10 453MB/s ± 1% 458MB/s ± 1% +0.91% (p=0.000 n=9+10)
wuffs_png_decode_image_4002k_24bpp/gcc10 185MB/s ± 2% 188MB/s ± 0% +1.82% (p=0.000 n=9+8)
diff --git a/internal/cgen/base/fundamental-public.h b/internal/cgen/base/fundamental-public.h
index 191c621..11cfa4a 100644
--- a/internal/cgen/base/fundamental-public.h
+++ b/internal/cgen/base/fundamental-public.h
@@ -76,6 +76,8 @@
// Similarly, "cpu_arch >= x86_sse42" requires SSE4.2 but also PCLMUL and
// POPCNT. This is checked at runtime via cpuid, not at compile time.
+//
+// Likewise, "cpu_arch >= x86_avx2" also requires PCLMUL, POPCNT and SSE4.2.
#if defined(__x86_64__)
#include <cpuid.h>
#include <x86intrin.h>
@@ -154,6 +156,11 @@
// GCC defines these macros but MSVC does not.
// - bit_AVX2 = (1 << 5)
const unsigned int avx2_ebx7 = 0x00000020;
+ // GCC defines these macros but MSVC does not.
+ // - bit_PCLMUL = (1 << 1)
+ // - bit_POPCNT = (1 << 23)
+ // - bit_SSE4_2 = (1 << 20)
+ const unsigned int avx2_ecx1 = 0x00900002;
// clang defines __GNUC__ and clang-cl defines _MSC_VER (but not __GNUC__).
#if defined(__GNUC__)
@@ -163,13 +170,24 @@
unsigned int edx7 = 0;
if (__get_cpuid_count(7, 0, &eax7, &ebx7, &ecx7, &edx7) &&
((ebx7 & avx2_ebx7) == avx2_ebx7)) {
- return true;
+ unsigned int eax1 = 0;
+ unsigned int ebx1 = 0;
+ unsigned int ecx1 = 0;
+ unsigned int edx1 = 0;
+ if (__get_cpuid(1, &eax1, &ebx1, &ecx1, &edx1) &&
+ ((ecx1 & avx2_ecx1) == avx2_ecx1)) {
+ return true;
+ }
}
#elif defined(_MSC_VER) // defined(__GNUC__)
int x7[4];
__cpuidex(x7, 7, 0);
if ((((unsigned int)(x7[1])) & avx2_ebx7) == avx2_ebx7) {
- return true;
+ int x1[4];
+ __cpuid(x1, 1);
+ if ((((unsigned int)(x1[2])) & avx2_ecx1) == avx2_ecx1) {
+ return true;
+ }
}
#else
#error "WUFFS_BASE__CPU_ARCH__ETC combined with an unsupported compiler"
diff --git a/internal/cgen/data/data.go b/internal/cgen/data/data.go
index aac57f8..413ee06 100644
--- a/internal/cgen/data/data.go
+++ b/internal/cgen/data/data.go
@@ -55,15 +55,16 @@
"fine WUFFS_VERSION_PRE_RELEASE_LABEL \"work.in.progress\"\n#define WUFFS_VERSION_BUILD_METADATA_COMMIT_COUNT 0\n#define WUFFS_VERSION_BUILD_METADATA_COMMIT_DATE 0\n#define WUFFS_VERSION_STRING \"0.0.0+0.00000000\"\n\n" +
"" +
"// ---------------- Configuration\n\n// Define WUFFS_CONFIG__AVOID_CPU_ARCH to avoid any code tied to a specific CPU\n// architecture, such as SSE SIMD for the x86 CPU family.\n#if defined(WUFFS_CONFIG__AVOID_CPU_ARCH) // (#if-chain ref AVOID_CPU_ARCH_0)\n// No-op.\n#else // (#if-chain ref AVOID_CPU_ARCH_0)\n\n// The \"defined(__clang__)\" isn't redundant. While vanilla clang defines\n// __GNUC__, clang-cl (which mimics MSVC's cl.exe) does not.\n#if defined(__GNUC__) || defined(__clang__)\n#define WUFFS_BASE__MAYBE_ATTRIBUTE_TARGET(arg) __attribute__((target(arg)))\n#else\n#define WUFFS_BASE__MAYBE_ATTRIBUTE_TARGET(arg)\n#endif // defined(__GNUC__) || defined(__clang__)\n\n#if defined(__GNUC__) // (#if-chain ref AVOID_CPU_ARCH_1)\n\n// To simplify Wuffs code, \"cpu_arch >= arm_xxx\" requires xxx but also\n// unaligned little-endian load/stores.\n#if defined(__ARM_FEATURE_UNALIGNED) && defined(__BYTE_ORDER__) && \\\n (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)\n// Not all gcc versions define __ARM_ACLE, even if they support crc32" +
- "\n// intrinsics. Look for __ARM_FEATURE_CRC32 instead.\n#if defined(__ARM_FEATURE_CRC32)\n#include <arm_acle.h>\n#define WUFFS_BASE__CPU_ARCH__ARM_CRC32\n#endif // defined(__ARM_FEATURE_CRC32)\n#if defined(__ARM_NEON)\n#include <arm_neon.h>\n#define WUFFS_BASE__CPU_ARCH__ARM_NEON\n#endif // defined(__ARM_NEON)\n#endif // defined(__ARM_FEATURE_UNALIGNED) etc\n\n// Similarly, \"cpu_arch >= x86_sse42\" requires SSE4.2 but also PCLMUL and\n// POPCNT. This is checked at runtime via cpuid, not at compile time.\n#if defined(__x86_64__)\n#include <cpuid.h>\n#include <x86intrin.h>\n#define WUFFS_BASE__CPU_ARCH__X86_64\n#endif // defined(__x86_64__)\n\n#elif defined(_MSC_VER) // (#if-chain ref AVOID_CPU_ARCH_1)\n\n#if defined(_M_X64)\n#if defined(__AVX__) || defined(__clang__)\n\n// We need <intrin.h> for the __cpuid function.\n#include <intrin.h>\n// That's not enough for X64 SIMD, with clang-cl, if we want to use\n// \"__attribute__((target(arg)))\" without e.g. \"/arch:AVX\".\n//\n// Some web pages suggest that <immintrin.h> is all you need, as i" +
- "t pulls in\n// the earlier SIMD families like SSE4.2, but that doesn't seem to work in\n// practice, possibly for the same reason that just <intrin.h> doesn't work.\n#include <immintrin.h> // AVX, AVX2, FMA, POPCNT\n#include <nmmintrin.h> // SSE4.2\n#include <wmmintrin.h> // AES, PCLMUL\n#define WUFFS_BASE__CPU_ARCH__X86_64\n\n#else // defined(__AVX__) || defined(__clang__)\n\n// clang-cl (which defines both __clang__ and _MSC_VER) supports\n// \"__attribute__((target(arg)))\".\n//\n// For MSVC's cl.exe (unlike clang or gcc), SIMD capability is a compile-time\n// property of the source file (e.g. a /arch:AVX or -mavx compiler flag), not\n// of individual functions (that can be conditionally selected at runtime).\n#pragma message(\"Wuffs with MSVC+X64 needs /arch:AVX for best performance\")\n\n#endif // defined(__AVX__) || defined(__clang__)\n#endif // defined(_M_X64)\n\n#endif // (#if-chain ref AVOID_CPU_ARCH_1)\n#endif // (#if-chain ref AVOID_CPU_ARCH_0)\n\n" +
+ "\n// intrinsics. Look for __ARM_FEATURE_CRC32 instead.\n#if defined(__ARM_FEATURE_CRC32)\n#include <arm_acle.h>\n#define WUFFS_BASE__CPU_ARCH__ARM_CRC32\n#endif // defined(__ARM_FEATURE_CRC32)\n#if defined(__ARM_NEON)\n#include <arm_neon.h>\n#define WUFFS_BASE__CPU_ARCH__ARM_NEON\n#endif // defined(__ARM_NEON)\n#endif // defined(__ARM_FEATURE_UNALIGNED) etc\n\n// Similarly, \"cpu_arch >= x86_sse42\" requires SSE4.2 but also PCLMUL and\n// POPCNT. This is checked at runtime via cpuid, not at compile time.\n//\n// Likewise, \"cpu_arch >= x86_avx2\" also requires PCLMUL, POPCNT and SSE4.2.\n#if defined(__x86_64__)\n#include <cpuid.h>\n#include <x86intrin.h>\n#define WUFFS_BASE__CPU_ARCH__X86_64\n#endif // defined(__x86_64__)\n\n#elif defined(_MSC_VER) // (#if-chain ref AVOID_CPU_ARCH_1)\n\n#if defined(_M_X64)\n#if defined(__AVX__) || defined(__clang__)\n\n// We need <intrin.h> for the __cpuid function.\n#include <intrin.h>\n// That's not enough for X64 SIMD, with clang-cl, if we want to use\n// \"__attribute__((target(arg)))\" without e.g. \"/" +
+ "arch:AVX\".\n//\n// Some web pages suggest that <immintrin.h> is all you need, as it pulls in\n// the earlier SIMD families like SSE4.2, but that doesn't seem to work in\n// practice, possibly for the same reason that just <intrin.h> doesn't work.\n#include <immintrin.h> // AVX, AVX2, FMA, POPCNT\n#include <nmmintrin.h> // SSE4.2\n#include <wmmintrin.h> // AES, PCLMUL\n#define WUFFS_BASE__CPU_ARCH__X86_64\n\n#else // defined(__AVX__) || defined(__clang__)\n\n// clang-cl (which defines both __clang__ and _MSC_VER) supports\n// \"__attribute__((target(arg)))\".\n//\n// For MSVC's cl.exe (unlike clang or gcc), SIMD capability is a compile-time\n// property of the source file (e.g. a /arch:AVX or -mavx compiler flag), not\n// of individual functions (that can be conditionally selected at runtime).\n#pragma message(\"Wuffs with MSVC+X64 needs /arch:AVX for best performance\")\n\n#endif // defined(__AVX__) || defined(__clang__)\n#endif // defined(_M_X64)\n\n#endif // (#if-chain ref AVOID_CPU_ARCH_1)\n#endif // (#if-chain ref AVOID_CPU_" +
+ "ARCH_0)\n\n" +
"" +
"// --------\n\n// Define WUFFS_CONFIG__STATIC_FUNCTIONS to make all of Wuffs' functions have\n// static storage. The motivation is discussed in the \"ALLOW STATIC\n// IMPLEMENTATION\" section of\n// https://raw.githubusercontent.com/nothings/stb/master/docs/stb_howto.txt\n#if defined(WUFFS_CONFIG__STATIC_FUNCTIONS)\n#define WUFFS_BASE__MAYBE_STATIC static\n#else\n#define WUFFS_BASE__MAYBE_STATIC\n#endif // defined(WUFFS_CONFIG__STATIC_FUNCTIONS)\n\n" +
"" +
- "// ---------------- CPU Architecture\n\nstatic inline bool //\nwuffs_base__cpu_arch__have_arm_crc32() {\n#if defined(WUFFS_BASE__CPU_ARCH__ARM_CRC32)\n return true;\n#else\n return false;\n#endif // defined(WUFFS_BASE__CPU_ARCH__ARM_CRC32)\n}\n\nstatic inline bool //\nwuffs_base__cpu_arch__have_arm_neon() {\n#if defined(WUFFS_BASE__CPU_ARCH__ARM_NEON)\n return true;\n#else\n return false;\n#endif // defined(WUFFS_BASE__CPU_ARCH__ARM_NEON)\n}\n\nstatic inline bool //\nwuffs_base__cpu_arch__have_x86_avx2() {\n#if defined(WUFFS_BASE__CPU_ARCH__X86_64)\n // GCC defines these macros but MSVC does not.\n // - bit_AVX2 = (1 << 5)\n const unsigned int avx2_ebx7 = 0x00000020;\n\n // clang defines __GNUC__ and clang-cl defines _MSC_VER (but not __GNUC__).\n#if defined(__GNUC__)\n unsigned int eax7 = 0;\n unsigned int ebx7 = 0;\n unsigned int ecx7 = 0;\n unsigned int edx7 = 0;\n if (__get_cpuid_count(7, 0, &eax7, &ebx7, &ecx7, &edx7) &&\n ((ebx7 & avx2_ebx7) == avx2_ebx7)) {\n return true;\n }\n#elif defined(_MSC_VER) // defin" +
- "ed(__GNUC__)\n int x7[4];\n __cpuidex(x7, 7, 0);\n if ((((unsigned int)(x7[1])) & avx2_ebx7) == avx2_ebx7) {\n return true;\n }\n#else\n#error \"WUFFS_BASE__CPU_ARCH__ETC combined with an unsupported compiler\"\n#endif // defined(__GNUC__); defined(_MSC_VER)\n#endif // defined(WUFFS_BASE__CPU_ARCH__X86_64)\n return false;\n}\n\nstatic inline bool //\nwuffs_base__cpu_arch__have_x86_bmi2() {\n#if defined(WUFFS_BASE__CPU_ARCH__X86_64)\n // GCC defines these macros but MSVC does not.\n // - bit_BMI2 = (1 << 8)\n const unsigned int bmi2_ebx7 = 0x00000100;\n\n // clang defines __GNUC__ and clang-cl defines _MSC_VER (but not __GNUC__).\n#if defined(__GNUC__)\n unsigned int eax7 = 0;\n unsigned int ebx7 = 0;\n unsigned int ecx7 = 0;\n unsigned int edx7 = 0;\n if (__get_cpuid_count(7, 0, &eax7, &ebx7, &ecx7, &edx7) &&\n ((ebx7 & bmi2_ebx7) == bmi2_ebx7)) {\n return true;\n }\n#elif defined(_MSC_VER) // defined(__GNUC__)\n int x7[4];\n __cpuidex(x7, 7, 0);\n if ((((unsigned int)(x7[1])) & bmi2_ebx7) == bmi2_ebx7) {\n " +
- " return true;\n }\n#else\n#error \"WUFFS_BASE__CPU_ARCH__ETC combined with an unsupported compiler\"\n#endif // defined(__GNUC__); defined(_MSC_VER)\n#endif // defined(WUFFS_BASE__CPU_ARCH__X86_64)\n return false;\n}\n\nstatic inline bool //\nwuffs_base__cpu_arch__have_x86_sse42() {\n#if defined(WUFFS_BASE__CPU_ARCH__X86_64)\n // GCC defines these macros but MSVC does not.\n // - bit_PCLMUL = (1 << 1)\n // - bit_POPCNT = (1 << 23)\n // - bit_SSE4_2 = (1 << 20)\n const unsigned int sse42_ecx1 = 0x00900002;\n\n // clang defines __GNUC__ and clang-cl defines _MSC_VER (but not __GNUC__).\n#if defined(__GNUC__)\n unsigned int eax1 = 0;\n unsigned int ebx1 = 0;\n unsigned int ecx1 = 0;\n unsigned int edx1 = 0;\n if (__get_cpuid(1, &eax1, &ebx1, &ecx1, &edx1) &&\n ((ecx1 & sse42_ecx1) == sse42_ecx1)) {\n return true;\n }\n#elif defined(_MSC_VER) // defined(__GNUC__)\n int x1[4];\n __cpuid(x1, 1);\n if ((((unsigned int)(x1[2])) & sse42_ecx1) == sse42_ecx1) {\n return true;\n }\n#else\n#error \"WUFFS_BASE__CPU_ARCH__ET" +
- "C combined with an unsupported compiler\"\n#endif // defined(__GNUC__); defined(_MSC_VER)\n#endif // defined(WUFFS_BASE__CPU_ARCH__X86_64)\n return false;\n}\n\n" +
+ "// ---------------- CPU Architecture\n\nstatic inline bool //\nwuffs_base__cpu_arch__have_arm_crc32() {\n#if defined(WUFFS_BASE__CPU_ARCH__ARM_CRC32)\n return true;\n#else\n return false;\n#endif // defined(WUFFS_BASE__CPU_ARCH__ARM_CRC32)\n}\n\nstatic inline bool //\nwuffs_base__cpu_arch__have_arm_neon() {\n#if defined(WUFFS_BASE__CPU_ARCH__ARM_NEON)\n return true;\n#else\n return false;\n#endif // defined(WUFFS_BASE__CPU_ARCH__ARM_NEON)\n}\n\nstatic inline bool //\nwuffs_base__cpu_arch__have_x86_avx2() {\n#if defined(WUFFS_BASE__CPU_ARCH__X86_64)\n // GCC defines these macros but MSVC does not.\n // - bit_AVX2 = (1 << 5)\n const unsigned int avx2_ebx7 = 0x00000020;\n // GCC defines these macros but MSVC does not.\n // - bit_PCLMUL = (1 << 1)\n // - bit_POPCNT = (1 << 23)\n // - bit_SSE4_2 = (1 << 20)\n const unsigned int avx2_ecx1 = 0x00900002;\n\n // clang defines __GNUC__ and clang-cl defines _MSC_VER (but not __GNUC__).\n#if defined(__GNUC__)\n unsigned int eax7 = 0;\n unsigned int ebx7 = 0;\n unsigned int ecx7 =" +
+ " 0;\n unsigned int edx7 = 0;\n if (__get_cpuid_count(7, 0, &eax7, &ebx7, &ecx7, &edx7) &&\n ((ebx7 & avx2_ebx7) == avx2_ebx7)) {\n unsigned int eax1 = 0;\n unsigned int ebx1 = 0;\n unsigned int ecx1 = 0;\n unsigned int edx1 = 0;\n if (__get_cpuid(1, &eax1, &ebx1, &ecx1, &edx1) &&\n ((ecx1 & avx2_ecx1) == avx2_ecx1)) {\n return true;\n }\n }\n#elif defined(_MSC_VER) // defined(__GNUC__)\n int x7[4];\n __cpuidex(x7, 7, 0);\n if ((((unsigned int)(x7[1])) & avx2_ebx7) == avx2_ebx7) {\n int x1[4];\n __cpuid(x1, 1);\n if ((((unsigned int)(x1[2])) & avx2_ecx1) == avx2_ecx1) {\n return true;\n }\n }\n#else\n#error \"WUFFS_BASE__CPU_ARCH__ETC combined with an unsupported compiler\"\n#endif // defined(__GNUC__); defined(_MSC_VER)\n#endif // defined(WUFFS_BASE__CPU_ARCH__X86_64)\n return false;\n}\n\nstatic inline bool //\nwuffs_base__cpu_arch__have_x86_bmi2() {\n#if defined(WUFFS_BASE__CPU_ARCH__X86_64)\n // GCC defines these macros but MSVC does not.\n // - bit_BMI2 = (1 << 8)\n const u" +
+ "nsigned int bmi2_ebx7 = 0x00000100;\n\n // clang defines __GNUC__ and clang-cl defines _MSC_VER (but not __GNUC__).\n#if defined(__GNUC__)\n unsigned int eax7 = 0;\n unsigned int ebx7 = 0;\n unsigned int ecx7 = 0;\n unsigned int edx7 = 0;\n if (__get_cpuid_count(7, 0, &eax7, &ebx7, &ecx7, &edx7) &&\n ((ebx7 & bmi2_ebx7) == bmi2_ebx7)) {\n return true;\n }\n#elif defined(_MSC_VER) // defined(__GNUC__)\n int x7[4];\n __cpuidex(x7, 7, 0);\n if ((((unsigned int)(x7[1])) & bmi2_ebx7) == bmi2_ebx7) {\n return true;\n }\n#else\n#error \"WUFFS_BASE__CPU_ARCH__ETC combined with an unsupported compiler\"\n#endif // defined(__GNUC__); defined(_MSC_VER)\n#endif // defined(WUFFS_BASE__CPU_ARCH__X86_64)\n return false;\n}\n\nstatic inline bool //\nwuffs_base__cpu_arch__have_x86_sse42() {\n#if defined(WUFFS_BASE__CPU_ARCH__X86_64)\n // GCC defines these macros but MSVC does not.\n // - bit_PCLMUL = (1 << 1)\n // - bit_POPCNT = (1 << 23)\n // - bit_SSE4_2 = (1 << 20)\n const unsigned int sse42_ecx1 = 0x00900002;\n\n // clan" +
+ "g defines __GNUC__ and clang-cl defines _MSC_VER (but not __GNUC__).\n#if defined(__GNUC__)\n unsigned int eax1 = 0;\n unsigned int ebx1 = 0;\n unsigned int ecx1 = 0;\n unsigned int edx1 = 0;\n if (__get_cpuid(1, &eax1, &ebx1, &ecx1, &edx1) &&\n ((ecx1 & sse42_ecx1) == sse42_ecx1)) {\n return true;\n }\n#elif defined(_MSC_VER) // defined(__GNUC__)\n int x1[4];\n __cpuid(x1, 1);\n if ((((unsigned int)(x1[2])) & sse42_ecx1) == sse42_ecx1) {\n return true;\n }\n#else\n#error \"WUFFS_BASE__CPU_ARCH__ETC combined with an unsupported compiler\"\n#endif // defined(__GNUC__); defined(_MSC_VER)\n#endif // defined(WUFFS_BASE__CPU_ARCH__X86_64)\n return false;\n}\n\n" +
"" +
"// ---------------- Fundamentals\n\n// Wuffs assumes that:\n// - converting a uint32_t to a size_t will never overflow.\n// - converting a size_t to a uint64_t will never overflow.\n#if defined(__WORDSIZE)\n#if (__WORDSIZE != 32) && (__WORDSIZE != 64)\n#error \"Wuffs requires a word size of either 32 or 64 bits\"\n#endif\n#endif\n\n// Clang also defines \"__GNUC__\".\n#if defined(__GNUC__)\n#define WUFFS_BASE__POTENTIALLY_UNUSED __attribute__((unused))\n#define WUFFS_BASE__WARN_UNUSED_RESULT __attribute__((warn_unused_result))\n#else\n#define WUFFS_BASE__POTENTIALLY_UNUSED\n#define WUFFS_BASE__WARN_UNUSED_RESULT\n#endif\n\n" +
"" +
diff --git a/release/c/wuffs-unsupported-snapshot.c b/release/c/wuffs-unsupported-snapshot.c
index 18a4465..02ce6b7 100644
--- a/release/c/wuffs-unsupported-snapshot.c
+++ b/release/c/wuffs-unsupported-snapshot.c
@@ -115,6 +115,8 @@
// Similarly, "cpu_arch >= x86_sse42" requires SSE4.2 but also PCLMUL and
// POPCNT. This is checked at runtime via cpuid, not at compile time.
+//
+// Likewise, "cpu_arch >= x86_avx2" also requires PCLMUL, POPCNT and SSE4.2.
#if defined(__x86_64__)
#include <cpuid.h>
#include <x86intrin.h>
@@ -193,6 +195,11 @@
// GCC defines these macros but MSVC does not.
// - bit_AVX2 = (1 << 5)
const unsigned int avx2_ebx7 = 0x00000020;
+ // GCC defines these macros but MSVC does not.
+ // - bit_PCLMUL = (1 << 1)
+ // - bit_POPCNT = (1 << 23)
+ // - bit_SSE4_2 = (1 << 20)
+ const unsigned int avx2_ecx1 = 0x00900002;
// clang defines __GNUC__ and clang-cl defines _MSC_VER (but not __GNUC__).
#if defined(__GNUC__)
@@ -202,13 +209,24 @@
unsigned int edx7 = 0;
if (__get_cpuid_count(7, 0, &eax7, &ebx7, &ecx7, &edx7) &&
((ebx7 & avx2_ebx7) == avx2_ebx7)) {
- return true;
+ unsigned int eax1 = 0;
+ unsigned int ebx1 = 0;
+ unsigned int ecx1 = 0;
+ unsigned int edx1 = 0;
+ if (__get_cpuid(1, &eax1, &ebx1, &ecx1, &edx1) &&
+ ((ecx1 & avx2_ecx1) == avx2_ecx1)) {
+ return true;
+ }
}
#elif defined(_MSC_VER) // defined(__GNUC__)
int x7[4];
__cpuidex(x7, 7, 0);
if ((((unsigned int)(x7[1])) & avx2_ebx7) == avx2_ebx7) {
- return true;
+ int x1[4];
+ __cpuid(x1, 1);
+ if ((((unsigned int)(x1[2])) & avx2_ecx1) == avx2_ecx1) {
+ return true;
+ }
}
#else
#error "WUFFS_BASE__CPU_ARCH__ETC combined with an unsupported compiler"