Have "x86_avx2" require PCLMUL, POPCNT and SSE4.2

The std/crc32 benchmark numbers get worse (by roughly 6% to 21% in the
table below). The benchmarks still run the AVX2 code, but it runs
slower for an unknown reason. The std/gzip numbers improve slightly and
the std/png numbers are mixed.

Still, it's cleaner for Wuffs' "x86_avx2" to be a superset of
"x86_sse42" (and apart from PCLMUL, they're now similar to Clang 12 and
GCC 11's "x86-64-v3" and "x86-64-v2" micro-architecture feature levels,
respectively). Maybe future compiler versions can claw back the
std/crc32 performance.

On a mid-range x86_64 laptop (2016, Skylake):

name                                                               old speed      new speed      delta

wuffs_crc32_ieee_10k/clang9                                        8.07GB/s ± 0%  7.61GB/s ± 1%   -5.72%  (p=0.000 n=8+8)
wuffs_crc32_ieee_100k/clang9                                       11.0GB/s ± 1%   9.7GB/s ± 3%  -11.45%  (p=0.000 n=7+8)

wuffs_crc32_ieee_10k/gcc10                                         9.77GB/s ± 1%  7.72GB/s ± 1%  -21.00%  (p=0.000 n=10+9)
wuffs_crc32_ieee_100k/gcc10                                        12.7GB/s ± 2%  11.9GB/s ± 2%   -5.84%  (p=0.000 n=9+9)

wuffs_gzip_decode_10k/clang9                                        217MB/s ± 2%   221MB/s ± 0%   +1.80%  (p=0.000 n=10+8)
wuffs_gzip_decode_100k/clang9                                       280MB/s ± 1%   283MB/s ± 0%   +1.38%  (p=0.000 n=9+8)

wuffs_gzip_decode_10k/gcc10                                         215MB/s ± 1%   220MB/s ± 1%   +2.52%  (p=0.000 n=10+9)
wuffs_gzip_decode_100k/gcc10                                        272MB/s ± 0%   274MB/s ± 1%   +0.74%  (p=0.000 n=9+10)

wuffs_png_decode_image_19k_8bpp/clang9                              136MB/s ± 0%   142MB/s ± 0%   +3.95%  (p=0.000 n=8+8)
wuffs_png_decode_image_40k_24bpp/clang9                             171MB/s ± 1%   172MB/s ± 0%   +0.84%  (p=0.000 n=10+8)
wuffs_png_decode_image_77k_8bpp/clang9                              492MB/s ± 3%   501MB/s ± 2%   +1.88%  (p=0.009 n=10+10)
wuffs_png_decode_image_552k_32bpp_ignore_checksum/clang9            456MB/s ± 1%   461MB/s ± 1%   +1.04%  (p=0.000 n=9+9)
wuffs_png_decode_image_552k_32bpp_verify_checksum/clang9            439MB/s ± 1%   440MB/s ± 3%     ~     (p=0.173 n=8+10)
wuffs_png_decode_image_4002k_24bpp/clang9                           172MB/s ± 1%   174MB/s ± 1%   +1.29%  (p=0.005 n=10+10)

wuffs_png_decode_image_19k_8bpp/gcc10                               152MB/s ± 0%   149MB/s ± 1%   -1.92%  (p=0.000 n=9+9)
wuffs_png_decode_image_40k_24bpp/gcc10                              184MB/s ± 0%   186MB/s ± 1%   +1.37%  (p=0.000 n=8+9)
wuffs_png_decode_image_77k_8bpp/gcc10                               550MB/s ± 1%   542MB/s ± 2%   -1.46%  (p=0.000 n=10+10)
wuffs_png_decode_image_552k_32bpp_ignore_checksum/gcc10             473MB/s ± 1%   478MB/s ± 1%   +1.23%  (p=0.000 n=9+10)
wuffs_png_decode_image_552k_32bpp_verify_checksum/gcc10             453MB/s ± 1%   458MB/s ± 1%   +0.91%  (p=0.000 n=9+10)
wuffs_png_decode_image_4002k_24bpp/gcc10                            185MB/s ± 2%   188MB/s ± 0%   +1.82%  (p=0.000 n=9+8)
diff --git a/internal/cgen/base/fundamental-public.h b/internal/cgen/base/fundamental-public.h
index 191c621..11cfa4a 100644
--- a/internal/cgen/base/fundamental-public.h
+++ b/internal/cgen/base/fundamental-public.h
@@ -76,6 +76,8 @@
 
 // Similarly, "cpu_arch >= x86_sse42" requires SSE4.2 but also PCLMUL and
 // POPCNT. This is checked at runtime via cpuid, not at compile time.
+//
+// Likewise, "cpu_arch >= x86_avx2" also requires PCLMUL, POPCNT and SSE4.2.
 #if defined(__x86_64__)
 #include <cpuid.h>
 #include <x86intrin.h>
@@ -154,6 +156,11 @@
   // GCC defines these macros but MSVC does not.
   //  - bit_AVX2 = (1 <<  5)
   const unsigned int avx2_ebx7 = 0x00000020;
+  // GCC defines these macros but MSVC does not.
+  //  - bit_PCLMUL = (1 <<  1)
+  //  - bit_POPCNT = (1 << 23)
+  //  - bit_SSE4_2 = (1 << 20)
+  const unsigned int avx2_ecx1 = 0x00900002;
 
   // clang defines __GNUC__ and clang-cl defines _MSC_VER (but not __GNUC__).
 #if defined(__GNUC__)
@@ -163,13 +170,24 @@
   unsigned int edx7 = 0;
   if (__get_cpuid_count(7, 0, &eax7, &ebx7, &ecx7, &edx7) &&
       ((ebx7 & avx2_ebx7) == avx2_ebx7)) {
-    return true;
+    unsigned int eax1 = 0;
+    unsigned int ebx1 = 0;
+    unsigned int ecx1 = 0;
+    unsigned int edx1 = 0;
+    if (__get_cpuid(1, &eax1, &ebx1, &ecx1, &edx1) &&
+        ((ecx1 & avx2_ecx1) == avx2_ecx1)) {
+      return true;
+    }
   }
 #elif defined(_MSC_VER)  // defined(__GNUC__)
   int x7[4];
   __cpuidex(x7, 7, 0);
   if ((((unsigned int)(x7[1])) & avx2_ebx7) == avx2_ebx7) {
-    return true;
+    int x1[4];
+    __cpuid(x1, 1);
+    if ((((unsigned int)(x1[2])) & avx2_ecx1) == avx2_ecx1) {
+      return true;
+    }
   }
 #else
 #error "WUFFS_BASE__CPU_ARCH__ETC combined with an unsupported compiler"
diff --git a/internal/cgen/data/data.go b/internal/cgen/data/data.go
index aac57f8..413ee06 100644
--- a/internal/cgen/data/data.go
+++ b/internal/cgen/data/data.go
@@ -55,15 +55,16 @@
 	"fine WUFFS_VERSION_PRE_RELEASE_LABEL \"work.in.progress\"\n#define WUFFS_VERSION_BUILD_METADATA_COMMIT_COUNT 0\n#define WUFFS_VERSION_BUILD_METADATA_COMMIT_DATE 0\n#define WUFFS_VERSION_STRING \"0.0.0+0.00000000\"\n\n" +
 	"" +
 	"// ---------------- Configuration\n\n// Define WUFFS_CONFIG__AVOID_CPU_ARCH to avoid any code tied to a specific CPU\n// architecture, such as SSE SIMD for the x86 CPU family.\n#if defined(WUFFS_CONFIG__AVOID_CPU_ARCH)  // (#if-chain ref AVOID_CPU_ARCH_0)\n// No-op.\n#else  // (#if-chain ref AVOID_CPU_ARCH_0)\n\n// The \"defined(__clang__)\" isn't redundant. While vanilla clang defines\n// __GNUC__, clang-cl (which mimics MSVC's cl.exe) does not.\n#if defined(__GNUC__) || defined(__clang__)\n#define WUFFS_BASE__MAYBE_ATTRIBUTE_TARGET(arg) __attribute__((target(arg)))\n#else\n#define WUFFS_BASE__MAYBE_ATTRIBUTE_TARGET(arg)\n#endif  // defined(__GNUC__) || defined(__clang__)\n\n#if defined(__GNUC__)  // (#if-chain ref AVOID_CPU_ARCH_1)\n\n// To simplify Wuffs code, \"cpu_arch >= arm_xxx\" requires xxx but also\n// unaligned little-endian load/stores.\n#if defined(__ARM_FEATURE_UNALIGNED) && defined(__BYTE_ORDER__) && \\\n    (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)\n// Not all gcc versions define __ARM_ACLE, even if they support crc32" +
-	"\n// intrinsics. Look for __ARM_FEATURE_CRC32 instead.\n#if defined(__ARM_FEATURE_CRC32)\n#include <arm_acle.h>\n#define WUFFS_BASE__CPU_ARCH__ARM_CRC32\n#endif  // defined(__ARM_FEATURE_CRC32)\n#if defined(__ARM_NEON)\n#include <arm_neon.h>\n#define WUFFS_BASE__CPU_ARCH__ARM_NEON\n#endif  // defined(__ARM_NEON)\n#endif  // defined(__ARM_FEATURE_UNALIGNED) etc\n\n// Similarly, \"cpu_arch >= x86_sse42\" requires SSE4.2 but also PCLMUL and\n// POPCNT. This is checked at runtime via cpuid, not at compile time.\n#if defined(__x86_64__)\n#include <cpuid.h>\n#include <x86intrin.h>\n#define WUFFS_BASE__CPU_ARCH__X86_64\n#endif  // defined(__x86_64__)\n\n#elif defined(_MSC_VER)  // (#if-chain ref AVOID_CPU_ARCH_1)\n\n#if defined(_M_X64)\n#if defined(__AVX__) || defined(__clang__)\n\n// We need <intrin.h> for the __cpuid function.\n#include <intrin.h>\n// That's not enough for X64 SIMD, with clang-cl, if we want to use\n// \"__attribute__((target(arg)))\" without e.g. \"/arch:AVX\".\n//\n// Some web pages suggest that <immintrin.h> is all you need, as i" +
-	"t pulls in\n// the earlier SIMD families like SSE4.2, but that doesn't seem to work in\n// practice, possibly for the same reason that just <intrin.h> doesn't work.\n#include <immintrin.h>  // AVX, AVX2, FMA, POPCNT\n#include <nmmintrin.h>  // SSE4.2\n#include <wmmintrin.h>  // AES, PCLMUL\n#define WUFFS_BASE__CPU_ARCH__X86_64\n\n#else  // defined(__AVX__) || defined(__clang__)\n\n// clang-cl (which defines both __clang__ and _MSC_VER) supports\n// \"__attribute__((target(arg)))\".\n//\n// For MSVC's cl.exe (unlike clang or gcc), SIMD capability is a compile-time\n// property of the source file (e.g. a /arch:AVX or -mavx compiler flag), not\n// of individual functions (that can be conditionally selected at runtime).\n#pragma message(\"Wuffs with MSVC+X64 needs /arch:AVX for best performance\")\n\n#endif  // defined(__AVX__) || defined(__clang__)\n#endif  // defined(_M_X64)\n\n#endif  // (#if-chain ref AVOID_CPU_ARCH_1)\n#endif  // (#if-chain ref AVOID_CPU_ARCH_0)\n\n" +
+	"\n// intrinsics. Look for __ARM_FEATURE_CRC32 instead.\n#if defined(__ARM_FEATURE_CRC32)\n#include <arm_acle.h>\n#define WUFFS_BASE__CPU_ARCH__ARM_CRC32\n#endif  // defined(__ARM_FEATURE_CRC32)\n#if defined(__ARM_NEON)\n#include <arm_neon.h>\n#define WUFFS_BASE__CPU_ARCH__ARM_NEON\n#endif  // defined(__ARM_NEON)\n#endif  // defined(__ARM_FEATURE_UNALIGNED) etc\n\n// Similarly, \"cpu_arch >= x86_sse42\" requires SSE4.2 but also PCLMUL and\n// POPCNT. This is checked at runtime via cpuid, not at compile time.\n//\n// Likewise, \"cpu_arch >= x86_avx2\" also requires PCLMUL, POPCNT and SSE4.2.\n#if defined(__x86_64__)\n#include <cpuid.h>\n#include <x86intrin.h>\n#define WUFFS_BASE__CPU_ARCH__X86_64\n#endif  // defined(__x86_64__)\n\n#elif defined(_MSC_VER)  // (#if-chain ref AVOID_CPU_ARCH_1)\n\n#if defined(_M_X64)\n#if defined(__AVX__) || defined(__clang__)\n\n// We need <intrin.h> for the __cpuid function.\n#include <intrin.h>\n// That's not enough for X64 SIMD, with clang-cl, if we want to use\n// \"__attribute__((target(arg)))\" without e.g. \"/" +
+	"arch:AVX\".\n//\n// Some web pages suggest that <immintrin.h> is all you need, as it pulls in\n// the earlier SIMD families like SSE4.2, but that doesn't seem to work in\n// practice, possibly for the same reason that just <intrin.h> doesn't work.\n#include <immintrin.h>  // AVX, AVX2, FMA, POPCNT\n#include <nmmintrin.h>  // SSE4.2\n#include <wmmintrin.h>  // AES, PCLMUL\n#define WUFFS_BASE__CPU_ARCH__X86_64\n\n#else  // defined(__AVX__) || defined(__clang__)\n\n// clang-cl (which defines both __clang__ and _MSC_VER) supports\n// \"__attribute__((target(arg)))\".\n//\n// For MSVC's cl.exe (unlike clang or gcc), SIMD capability is a compile-time\n// property of the source file (e.g. a /arch:AVX or -mavx compiler flag), not\n// of individual functions (that can be conditionally selected at runtime).\n#pragma message(\"Wuffs with MSVC+X64 needs /arch:AVX for best performance\")\n\n#endif  // defined(__AVX__) || defined(__clang__)\n#endif  // defined(_M_X64)\n\n#endif  // (#if-chain ref AVOID_CPU_ARCH_1)\n#endif  // (#if-chain ref AVOID_CPU_" +
+	"ARCH_0)\n\n" +
 	"" +
 	"// --------\n\n// Define WUFFS_CONFIG__STATIC_FUNCTIONS to make all of Wuffs' functions have\n// static storage. The motivation is discussed in the \"ALLOW STATIC\n// IMPLEMENTATION\" section of\n// https://raw.githubusercontent.com/nothings/stb/master/docs/stb_howto.txt\n#if defined(WUFFS_CONFIG__STATIC_FUNCTIONS)\n#define WUFFS_BASE__MAYBE_STATIC static\n#else\n#define WUFFS_BASE__MAYBE_STATIC\n#endif  // defined(WUFFS_CONFIG__STATIC_FUNCTIONS)\n\n" +
 	"" +
-	"// ---------------- CPU Architecture\n\nstatic inline bool  //\nwuffs_base__cpu_arch__have_arm_crc32() {\n#if defined(WUFFS_BASE__CPU_ARCH__ARM_CRC32)\n  return true;\n#else\n  return false;\n#endif  // defined(WUFFS_BASE__CPU_ARCH__ARM_CRC32)\n}\n\nstatic inline bool  //\nwuffs_base__cpu_arch__have_arm_neon() {\n#if defined(WUFFS_BASE__CPU_ARCH__ARM_NEON)\n  return true;\n#else\n  return false;\n#endif  // defined(WUFFS_BASE__CPU_ARCH__ARM_NEON)\n}\n\nstatic inline bool  //\nwuffs_base__cpu_arch__have_x86_avx2() {\n#if defined(WUFFS_BASE__CPU_ARCH__X86_64)\n  // GCC defines these macros but MSVC does not.\n  //  - bit_AVX2 = (1 <<  5)\n  const unsigned int avx2_ebx7 = 0x00000020;\n\n  // clang defines __GNUC__ and clang-cl defines _MSC_VER (but not __GNUC__).\n#if defined(__GNUC__)\n  unsigned int eax7 = 0;\n  unsigned int ebx7 = 0;\n  unsigned int ecx7 = 0;\n  unsigned int edx7 = 0;\n  if (__get_cpuid_count(7, 0, &eax7, &ebx7, &ecx7, &edx7) &&\n      ((ebx7 & avx2_ebx7) == avx2_ebx7)) {\n    return true;\n  }\n#elif defined(_MSC_VER)  // defin" +
-	"ed(__GNUC__)\n  int x7[4];\n  __cpuidex(x7, 7, 0);\n  if ((((unsigned int)(x7[1])) & avx2_ebx7) == avx2_ebx7) {\n    return true;\n  }\n#else\n#error \"WUFFS_BASE__CPU_ARCH__ETC combined with an unsupported compiler\"\n#endif  // defined(__GNUC__); defined(_MSC_VER)\n#endif  // defined(WUFFS_BASE__CPU_ARCH__X86_64)\n  return false;\n}\n\nstatic inline bool  //\nwuffs_base__cpu_arch__have_x86_bmi2() {\n#if defined(WUFFS_BASE__CPU_ARCH__X86_64)\n  // GCC defines these macros but MSVC does not.\n  //  - bit_BMI2 = (1 <<  8)\n  const unsigned int bmi2_ebx7 = 0x00000100;\n\n  // clang defines __GNUC__ and clang-cl defines _MSC_VER (but not __GNUC__).\n#if defined(__GNUC__)\n  unsigned int eax7 = 0;\n  unsigned int ebx7 = 0;\n  unsigned int ecx7 = 0;\n  unsigned int edx7 = 0;\n  if (__get_cpuid_count(7, 0, &eax7, &ebx7, &ecx7, &edx7) &&\n      ((ebx7 & bmi2_ebx7) == bmi2_ebx7)) {\n    return true;\n  }\n#elif defined(_MSC_VER)  // defined(__GNUC__)\n  int x7[4];\n  __cpuidex(x7, 7, 0);\n  if ((((unsigned int)(x7[1])) & bmi2_ebx7) == bmi2_ebx7) {\n   " +
-	" return true;\n  }\n#else\n#error \"WUFFS_BASE__CPU_ARCH__ETC combined with an unsupported compiler\"\n#endif  // defined(__GNUC__); defined(_MSC_VER)\n#endif  // defined(WUFFS_BASE__CPU_ARCH__X86_64)\n  return false;\n}\n\nstatic inline bool  //\nwuffs_base__cpu_arch__have_x86_sse42() {\n#if defined(WUFFS_BASE__CPU_ARCH__X86_64)\n  // GCC defines these macros but MSVC does not.\n  //  - bit_PCLMUL = (1 <<  1)\n  //  - bit_POPCNT = (1 << 23)\n  //  - bit_SSE4_2 = (1 << 20)\n  const unsigned int sse42_ecx1 = 0x00900002;\n\n  // clang defines __GNUC__ and clang-cl defines _MSC_VER (but not __GNUC__).\n#if defined(__GNUC__)\n  unsigned int eax1 = 0;\n  unsigned int ebx1 = 0;\n  unsigned int ecx1 = 0;\n  unsigned int edx1 = 0;\n  if (__get_cpuid(1, &eax1, &ebx1, &ecx1, &edx1) &&\n      ((ecx1 & sse42_ecx1) == sse42_ecx1)) {\n    return true;\n  }\n#elif defined(_MSC_VER)  // defined(__GNUC__)\n  int x1[4];\n  __cpuid(x1, 1);\n  if ((((unsigned int)(x1[2])) & sse42_ecx1) == sse42_ecx1) {\n    return true;\n  }\n#else\n#error \"WUFFS_BASE__CPU_ARCH__ET" +
-	"C combined with an unsupported compiler\"\n#endif  // defined(__GNUC__); defined(_MSC_VER)\n#endif  // defined(WUFFS_BASE__CPU_ARCH__X86_64)\n  return false;\n}\n\n" +
+	"// ---------------- CPU Architecture\n\nstatic inline bool  //\nwuffs_base__cpu_arch__have_arm_crc32() {\n#if defined(WUFFS_BASE__CPU_ARCH__ARM_CRC32)\n  return true;\n#else\n  return false;\n#endif  // defined(WUFFS_BASE__CPU_ARCH__ARM_CRC32)\n}\n\nstatic inline bool  //\nwuffs_base__cpu_arch__have_arm_neon() {\n#if defined(WUFFS_BASE__CPU_ARCH__ARM_NEON)\n  return true;\n#else\n  return false;\n#endif  // defined(WUFFS_BASE__CPU_ARCH__ARM_NEON)\n}\n\nstatic inline bool  //\nwuffs_base__cpu_arch__have_x86_avx2() {\n#if defined(WUFFS_BASE__CPU_ARCH__X86_64)\n  // GCC defines these macros but MSVC does not.\n  //  - bit_AVX2 = (1 <<  5)\n  const unsigned int avx2_ebx7 = 0x00000020;\n  // GCC defines these macros but MSVC does not.\n  //  - bit_PCLMUL = (1 <<  1)\n  //  - bit_POPCNT = (1 << 23)\n  //  - bit_SSE4_2 = (1 << 20)\n  const unsigned int avx2_ecx1 = 0x00900002;\n\n  // clang defines __GNUC__ and clang-cl defines _MSC_VER (but not __GNUC__).\n#if defined(__GNUC__)\n  unsigned int eax7 = 0;\n  unsigned int ebx7 = 0;\n  unsigned int ecx7 =" +
+	" 0;\n  unsigned int edx7 = 0;\n  if (__get_cpuid_count(7, 0, &eax7, &ebx7, &ecx7, &edx7) &&\n      ((ebx7 & avx2_ebx7) == avx2_ebx7)) {\n    unsigned int eax1 = 0;\n    unsigned int ebx1 = 0;\n    unsigned int ecx1 = 0;\n    unsigned int edx1 = 0;\n    if (__get_cpuid(1, &eax1, &ebx1, &ecx1, &edx1) &&\n        ((ecx1 & avx2_ecx1) == avx2_ecx1)) {\n      return true;\n    }\n  }\n#elif defined(_MSC_VER)  // defined(__GNUC__)\n  int x7[4];\n  __cpuidex(x7, 7, 0);\n  if ((((unsigned int)(x7[1])) & avx2_ebx7) == avx2_ebx7) {\n    int x1[4];\n    __cpuid(x1, 1);\n    if ((((unsigned int)(x1[2])) & avx2_ecx1) == avx2_ecx1) {\n      return true;\n    }\n  }\n#else\n#error \"WUFFS_BASE__CPU_ARCH__ETC combined with an unsupported compiler\"\n#endif  // defined(__GNUC__); defined(_MSC_VER)\n#endif  // defined(WUFFS_BASE__CPU_ARCH__X86_64)\n  return false;\n}\n\nstatic inline bool  //\nwuffs_base__cpu_arch__have_x86_bmi2() {\n#if defined(WUFFS_BASE__CPU_ARCH__X86_64)\n  // GCC defines these macros but MSVC does not.\n  //  - bit_BMI2 = (1 <<  8)\n  const u" +
+	"nsigned int bmi2_ebx7 = 0x00000100;\n\n  // clang defines __GNUC__ and clang-cl defines _MSC_VER (but not __GNUC__).\n#if defined(__GNUC__)\n  unsigned int eax7 = 0;\n  unsigned int ebx7 = 0;\n  unsigned int ecx7 = 0;\n  unsigned int edx7 = 0;\n  if (__get_cpuid_count(7, 0, &eax7, &ebx7, &ecx7, &edx7) &&\n      ((ebx7 & bmi2_ebx7) == bmi2_ebx7)) {\n    return true;\n  }\n#elif defined(_MSC_VER)  // defined(__GNUC__)\n  int x7[4];\n  __cpuidex(x7, 7, 0);\n  if ((((unsigned int)(x7[1])) & bmi2_ebx7) == bmi2_ebx7) {\n    return true;\n  }\n#else\n#error \"WUFFS_BASE__CPU_ARCH__ETC combined with an unsupported compiler\"\n#endif  // defined(__GNUC__); defined(_MSC_VER)\n#endif  // defined(WUFFS_BASE__CPU_ARCH__X86_64)\n  return false;\n}\n\nstatic inline bool  //\nwuffs_base__cpu_arch__have_x86_sse42() {\n#if defined(WUFFS_BASE__CPU_ARCH__X86_64)\n  // GCC defines these macros but MSVC does not.\n  //  - bit_PCLMUL = (1 <<  1)\n  //  - bit_POPCNT = (1 << 23)\n  //  - bit_SSE4_2 = (1 << 20)\n  const unsigned int sse42_ecx1 = 0x00900002;\n\n  // clan" +
+	"g defines __GNUC__ and clang-cl defines _MSC_VER (but not __GNUC__).\n#if defined(__GNUC__)\n  unsigned int eax1 = 0;\n  unsigned int ebx1 = 0;\n  unsigned int ecx1 = 0;\n  unsigned int edx1 = 0;\n  if (__get_cpuid(1, &eax1, &ebx1, &ecx1, &edx1) &&\n      ((ecx1 & sse42_ecx1) == sse42_ecx1)) {\n    return true;\n  }\n#elif defined(_MSC_VER)  // defined(__GNUC__)\n  int x1[4];\n  __cpuid(x1, 1);\n  if ((((unsigned int)(x1[2])) & sse42_ecx1) == sse42_ecx1) {\n    return true;\n  }\n#else\n#error \"WUFFS_BASE__CPU_ARCH__ETC combined with an unsupported compiler\"\n#endif  // defined(__GNUC__); defined(_MSC_VER)\n#endif  // defined(WUFFS_BASE__CPU_ARCH__X86_64)\n  return false;\n}\n\n" +
 	"" +
 	"// ---------------- Fundamentals\n\n// Wuffs assumes that:\n//  - converting a uint32_t to a size_t will never overflow.\n//  - converting a size_t to a uint64_t will never overflow.\n#if defined(__WORDSIZE)\n#if (__WORDSIZE != 32) && (__WORDSIZE != 64)\n#error \"Wuffs requires a word size of either 32 or 64 bits\"\n#endif\n#endif\n\n// Clang also defines \"__GNUC__\".\n#if defined(__GNUC__)\n#define WUFFS_BASE__POTENTIALLY_UNUSED __attribute__((unused))\n#define WUFFS_BASE__WARN_UNUSED_RESULT __attribute__((warn_unused_result))\n#else\n#define WUFFS_BASE__POTENTIALLY_UNUSED\n#define WUFFS_BASE__WARN_UNUSED_RESULT\n#endif\n\n" +
 	"" +
diff --git a/release/c/wuffs-unsupported-snapshot.c b/release/c/wuffs-unsupported-snapshot.c
index 18a4465..02ce6b7 100644
--- a/release/c/wuffs-unsupported-snapshot.c
+++ b/release/c/wuffs-unsupported-snapshot.c
@@ -115,6 +115,8 @@
 
 // Similarly, "cpu_arch >= x86_sse42" requires SSE4.2 but also PCLMUL and
 // POPCNT. This is checked at runtime via cpuid, not at compile time.
+//
+// Likewise, "cpu_arch >= x86_avx2" also requires PCLMUL, POPCNT and SSE4.2.
 #if defined(__x86_64__)
 #include <cpuid.h>
 #include <x86intrin.h>
@@ -193,6 +195,11 @@
   // GCC defines these macros but MSVC does not.
   //  - bit_AVX2 = (1 <<  5)
   const unsigned int avx2_ebx7 = 0x00000020;
+  // GCC defines these macros but MSVC does not.
+  //  - bit_PCLMUL = (1 <<  1)
+  //  - bit_POPCNT = (1 << 23)
+  //  - bit_SSE4_2 = (1 << 20)
+  const unsigned int avx2_ecx1 = 0x00900002;
 
   // clang defines __GNUC__ and clang-cl defines _MSC_VER (but not __GNUC__).
 #if defined(__GNUC__)
@@ -202,13 +209,24 @@
   unsigned int edx7 = 0;
   if (__get_cpuid_count(7, 0, &eax7, &ebx7, &ecx7, &edx7) &&
       ((ebx7 & avx2_ebx7) == avx2_ebx7)) {
-    return true;
+    unsigned int eax1 = 0;
+    unsigned int ebx1 = 0;
+    unsigned int ecx1 = 0;
+    unsigned int edx1 = 0;
+    if (__get_cpuid(1, &eax1, &ebx1, &ecx1, &edx1) &&
+        ((ecx1 & avx2_ecx1) == avx2_ecx1)) {
+      return true;
+    }
   }
 #elif defined(_MSC_VER)  // defined(__GNUC__)
   int x7[4];
   __cpuidex(x7, 7, 0);
   if ((((unsigned int)(x7[1])) & avx2_ebx7) == avx2_ebx7) {
-    return true;
+    int x1[4];
+    __cpuid(x1, 1);
+    if ((((unsigned int)(x1[2])) & avx2_ecx1) == avx2_ecx1) {
+      return true;
+    }
   }
 #else
 #error "WUFFS_BASE__CPU_ARCH__ETC combined with an unsupported compiler"