Use ruy's newly curated default set of paths.
This:
- on all CPU architectures, removes the reference code path, which is never used (the need for a portable non-SIMD fallback is already filled by the 'standard c++' path).
- on x86, with --define=tflite_with_ruy=true (non-default), this removes a couple of experimental paths, keeping only AVX2 and AVX512 paths.
- on x86 for per-channel-quantized ops where ruy is unconditionally used (no alternative implementation), this enables AVX2 and AVX512 paths. This should fix slowness issues for such per-channel-quantized ops on x86.
PiperOrigin-RevId: 306981662
diff --git a/ruy/path.h b/ruy/path.h
index 6da87aa..697ddfc 100644
--- a/ruy/path.h
+++ b/ruy/path.h
@@ -28,10 +28,10 @@
// Path::kNeon means using NEON instructions, and Path::kNeonDotprod means
// also using the newer NEON dot-product instructions.
//
-// Different Path enum values are defined on different CPU architectures,
+// Different Path enum values are defined on different CPU Archs,
// corresponding to different SIMD ISA extensions available there.
//
-// There are two special Path's universally defined on all CPU architectures:
+// There are two special Path's universally defined on all CPU Archs:
// kReference and kStandardCpp. From a user's perspective, they are similar
// in that both are slow, portable, standard-c++-only implementation paths.
// They differ in that kStandardCpp is structurally similar to the actual
@@ -76,7 +76,7 @@
kStandardCpp = 0x2,
#if RUY_PLATFORM(ARM)
- // ARM architectures.
+ // ARM Archs.
//
// Optimized path using a widely available subset of ARM NEON instructions.
kNeon = 0x4,
@@ -86,7 +86,7 @@
#endif // RUY_PLATFORM(ARM)
#if RUY_PLATFORM(X86)
- // x86 architectures.
+ // x86 Archs.
//
// TODO(b/147376783): SSE 4.2 and AVX-VNNI support is incomplete /
// placeholder.
@@ -128,37 +128,78 @@
return static_cast<Path>(~static_cast<std::uint32_t>(p));
}
+inline constexpr bool Disjoint(Path p, Path q) {
+ return (p & q) == Path::kNone;
+}
+
inline Path GetMostSignificantPath(Path path_mask) {
return static_cast<Path>(round_down_pot(static_cast<int>(path_mask)));
}
-// ruy::kAllPaths represents all Path's that make sense to on a given
-// base architecture.
-#ifdef __linux__
+// We define three disjoint sets of paths.
+//
+// kNonArchPaths is the set of paths that are defined regardless of
+// the CPU architecture. These paths are slow, but portable.
+constexpr Path kNonArchPaths = Path::kReference | Path::kStandardCpp;
+
+// The other two are specific to each CPU architecture. Note that these sets
+// do NOT include a fallback for when none of these architecture paths are
+// supported at runtime by the CPU. For that, see the other constants defined
+// further below.
+//
+// kDefaultArchPaths is the set of architecture-specific paths that
+// we recommend for most users. It is part of kDefaultPaths defined
+// below.
+//
+// kExtraArchPaths is the set of all other architecture-specific paths
+// that for whatever reason we're not recommending to most users at the moment.
+// Typically that would include work-in-progress paths, or paths targeting
+// minority hardware that isn't the best compromise of code size to performance
+// for most users.
+
#if RUY_PLATFORM(NEON_64)
-constexpr Path kAllPaths =
- Path::kReference | Path::kStandardCpp | Path::kNeon | Path::kNeonDotprod;
-#elif RUY_PLATFORM(NEON_32)
-constexpr Path kAllPaths = Path::kReference | Path::kStandardCpp | Path::kNeon;
-#elif RUY_PLATFORM(X86)
-constexpr Path kAllPaths = Path::kReference | Path::kStandardCpp |
- Path::kSse42 | Path::kAvx2 | Path::kAvx512 |
- Path::kAvxVnni;
+#ifdef __linux__
+constexpr Path kDefaultArchPaths = Path::kNeon | Path::kNeonDotprod;
#else
-constexpr Path kAllPaths = Path::kReference | Path::kStandardCpp;
-#endif
-#else // __linux__
// We don't know how to do runtime dotprod detection outside of linux for now.
-#if RUY_PLATFORM(NEON)
-constexpr Path kAllPaths = Path::kReference | Path::kStandardCpp | Path::kNeon;
-#elif RUY_PLATFORM(X86)
-constexpr Path kAllPaths = Path::kReference | Path::kStandardCpp |
- Path::kSse42 | Path::kAvx2 | Path::kAvx512 |
- Path::kAvxVnni;
-#else
-constexpr Path kAllPaths = Path::kReference | Path::kStandardCpp;
+constexpr Path kDefaultArchPaths = Path::kNeon;
#endif
-#endif // __linux__
+constexpr Path kExtraArchPaths = Path::kNone;
+#elif RUY_PLATFORM(NEON_32)
+constexpr Path kDefaultArchPaths = Path::kNeon;
+constexpr Path kExtraArchPaths = Path::kNone;
+#elif RUY_PLATFORM(X86)
+constexpr Path kDefaultArchPaths = Path::kAvx2 | Path::kAvx512;
+constexpr Path kExtraArchPaths = Path::kSse42 | Path::kAvxVnni;
+#else
+constexpr Path kDefaultArchPaths = Path::kNone;
+constexpr Path kExtraArchPaths = Path::kNone;
+#endif
+
+// Enforce that kDefaultArchPaths, kExtraArchPaths and
+// kNonArchPaths are mutually disjoint.
+static_assert(Disjoint(kDefaultArchPaths, kExtraArchPaths), "");
+static_assert(Disjoint(kDefaultArchPaths, kNonArchPaths), "");
+static_assert(Disjoint(kExtraArchPaths, kNonArchPaths), "");
+
+// We now define two aggregate sets of paths for convenience, including
+// both architecture-specific paths and some portable fallbacks.
+//
+// kDefaultPaths is the set of paths that we recommend most users to use.
+// It is what ruy::Mul(...), the entry point not taking an explicit Path value,
+// uses.
+// Note that kReference is left out of it: there should be no need for it in
+// user applications (not counting debugging). The need for some portable
+// fallback when no architecture-specific path can be used, is filled already by
+// kStandardCpp.
+constexpr Path kDefaultPaths = Path::kStandardCpp | kDefaultArchPaths;
+
+// kAllPaths is the set of all paths that are available to compile.
+// In addition to the Default paths, it also includes the extra
+// architecture paths, as well as the reference path.
+constexpr Path kAllPaths = kNonArchPaths | kDefaultArchPaths | kExtraArchPaths;
+
+static_assert(Disjoint(kDefaultPaths, ~kAllPaths), "");
} // namespace ruy
diff --git a/ruy/ruy.h b/ruy/ruy.h
index b5d3871..2260d71 100644
--- a/ruy/ruy.h
+++ b/ruy/ruy.h
@@ -74,8 +74,8 @@
void Mul(const Matrix<LhsScalar>& lhs, const Matrix<RhsScalar>& rhs,
const MulParamsType& mul_params, Context* context,
Matrix<DstScalar>* dst) {
- DispatchMul<ruy::kAllPaths, LhsScalar, RhsScalar, DstScalar, MulParamsType>(
- lhs, rhs, mul_params, context, dst);
+ DispatchMul<ruy::kDefaultPaths, LhsScalar, RhsScalar, DstScalar,
+ MulParamsType>(lhs, rhs, mul_params, context, dst);
}
// Variant of ruy::Mul allowing to specify a custom OR-ed set of Path's to