Performance improvement for absl::AsciiStrToUpper() and absl::AsciiStrToLower()

PiperOrigin-RevId: 608661989
Change-Id: Ibfd94f8b2d23fd232bf93904ed68e11a400b3644
diff --git a/absl/strings/ascii.cc b/absl/strings/ascii.cc
index 5460b2c..8f778a4 100644
--- a/absl/strings/ascii.cc
+++ b/absl/strings/ascii.cc
@@ -15,10 +15,8 @@
 #include "absl/strings/ascii.h"
 
 #include <climits>
-#include <cstdint>
 #include <cstring>
 #include <string>
-#include <type_traits>
 
 #include "absl/base/config.h"
 #include "absl/base/nullability.h"
@@ -162,19 +160,6 @@
 };
 // clang-format on
 
-template <class T>
-static constexpr T BroadcastByte(unsigned char value) {
-  static_assert(std::is_integral<T>::value && sizeof(T) <= sizeof(uint64_t) &&
-                    std::is_unsigned<T>::value,
-                "only unsigned integers up to 64-bit allowed");
-  T result = value;
-  constexpr size_t result_bit_width = sizeof(result) * CHAR_BIT;
-  result |= result << ((CHAR_BIT << 0) & (result_bit_width - 1));
-  result |= result << ((CHAR_BIT << 1) & (result_bit_width - 1));
-  result |= result << ((CHAR_BIT << 2) & (result_bit_width - 1));
-  return result;
-}
-
 // Returns whether `c` is in the a-z/A-Z range (w.r.t. `ToUpper`).
 // Implemented by:
 //  1. Pushing the a-z/A-Z range to [SCHAR_MIN, SCHAR_MIN + 26).
@@ -190,46 +175,8 @@
 }
 
 template <bool ToUpper>
-static constexpr char* PartialAsciiStrCaseFold(absl::Nonnull<char*> p,
-                                               absl::Nonnull<char*> end) {
-  using vec_t = size_t;
-  const size_t n = static_cast<size_t>(end - p);
-
-  // SWAR algorithm: http://0x80.pl/notesen/2016-01-06-swar-swap-case.html
-  constexpr char ch_a = ToUpper ? 'a' : 'A', ch_z = ToUpper ? 'z' : 'Z';
-  char* const swar_end = p + (n / sizeof(vec_t)) * sizeof(vec_t);
-  while (p < swar_end) {
-    vec_t v = vec_t();
-
-    // memcpy the vector, but constexpr
-    for (size_t i = 0; i < sizeof(vec_t); ++i) {
-      v |= static_cast<vec_t>(static_cast<unsigned char>(p[i]))
-           << (i * CHAR_BIT);
-    }
-
-    constexpr unsigned int msb = 1u << (CHAR_BIT - 1);
-    const vec_t v_msb = v & BroadcastByte<vec_t>(msb);
-    const vec_t v_nonascii_mask = (v_msb << 1) - (v_msb >> (CHAR_BIT - 1));
-    const vec_t v_nonascii = v & v_nonascii_mask;
-    const vec_t v_ascii = v & ~v_nonascii_mask;
-    const vec_t a = v_ascii + BroadcastByte<vec_t>(msb - ch_a - 0),
-                z = v_ascii + BroadcastByte<vec_t>(msb - ch_z - 1);
-    v = v_nonascii | (v_ascii ^ ((a ^ z) & BroadcastByte<vec_t>(msb)) >> 2);
-
-    // memcpy the vector, but constexpr
-    for (size_t i = 0; i < sizeof(vec_t); ++i) {
-      p[i] = static_cast<char>(v >> (i * CHAR_BIT));
-    }
-
-    p += sizeof(v);
-  }
-
-  return p;
-}
-
-template <bool ToUpper>
-static constexpr void AsciiStrCaseFold(absl::Nonnull<char*> p,
-                                       absl::Nonnull<char*> end) {
+constexpr void AsciiStrCaseFold(absl::Nonnull<char*> p,
+                                absl::Nonnull<char*> end) {
   // The upper- and lowercase versions of ASCII characters differ by only 1 bit.
   // When we need to flip the case, we can xor with this bit to achieve the
   // desired result. Note that the choice of 'a' and 'A' here is arbitrary. We
@@ -237,17 +184,15 @@
   // have the same single bit difference.
   constexpr unsigned char kAsciiCaseBitFlip = 'a' ^ 'A';
 
-  using vec_t = size_t;
-  // TODO(b/316380338): When FDO becomes able to vectorize these,
-  // revert this manual optimization and just leave the naive loop.
-  if (static_cast<size_t>(end - p) >= sizeof(vec_t)) {
-    p = ascii_internal::PartialAsciiStrCaseFold<ToUpper>(p, end);
-  }
-  while (p < end) {
+#ifdef __clang__
+// Temporary workaround until the bug (presumably b/316380338) is fixed.
+// NOLINTNEXTLINE(whitespace/line_length)
+#pragma clang loop vectorize(enable)
+#endif
+  for (; p < end; ++p) {
     unsigned char v = static_cast<unsigned char>(*p);
     v ^= AsciiInAZRange<ToUpper>(v) ? kAsciiCaseBitFlip : 0;
     *p = static_cast<char>(v);
-    ++p;
   }
 }