Improve the bytemap computation.

Change-Id: Ie3c81a7f6621d1f9b84f807a2679df5c3cccf897
Reviewed-on: https://code-review.googlesource.com/5242
Reviewed-by: Paul Wankadia <junyer@google.com>
diff --git a/re2/prog.cc b/re2/prog.cc
index cd49923..ccdd568 100644
--- a/re2/prog.cc
+++ b/re2/prog.cc
@@ -318,7 +318,7 @@
  public:
   ByteMapBuilder() {
     // This allows the outer loop in Build() to be simple.
-    bitmap_.Set(255);
+    subranges_.Set(255);
   }
 
   // Marks the [lo, hi] subrange.
@@ -329,35 +329,64 @@
     DCHECK_LE(hi, 255);
     DCHECK_LE(lo, hi);
 
-    // We just track the end of each subrange. :)
+    if (lo == 0 && hi == 255)
+      return;
+
+    // We track the end of each subrange. We also track which subranges
+    // are "present" (i.e. were marked) so that the remaining, "absent"
+    // subranges can subsequently be made to share a single byte class.
     lo--;
-    if (0 <= lo)
-      bitmap_.Set(lo);
-    bitmap_.Set(hi);
+    if (0 <= lo && !subranges_.Test(lo)) {
+      subranges_.Set(lo);
+      int next = subranges_.FindNextSetBit(lo+1);
+      if (present_.Test(next))
+        present_.Set(lo);
+    }
+    if (!subranges_.Test(hi))
+      subranges_.Set(hi);
+    // Flag as "present" the subranges above lo up to and including the
+    // subrange ending at hi.
+    int c = lo+1;
+    while (c < 256) {
+      int next = subranges_.FindNextSetBit(c);
+      if (!present_.Test(next))
+        present_.Set(next);
+      if (next == hi)
+        break;
+      c = next+1;
+    }
   }
 
   // Builds the bytemap from the subranges.
-  // Returns the number of subranges.
+  // Returns the number of byte classes.
   int Build(uint8* bytemap) {
-    uint8 b = 0;
+    int present = 0;
+    int absent = -1;
     int c = 0;
     while (c < 256) {
-      // Find the end of this subrange.
-      int next = bitmap_.FindNextSetBit(c);
-      DCHECK_GE(next, 0);
-      DCHECK_LE(next, 255);
-
+      int next = subranges_.FindNextSetBit(c);
+      uint8 b = 0;
+      if (present_.Test(next)) {
+        b = static_cast<uint8>(present);
+        present++;
+      } else {
+        if (absent == -1) {
+          absent = present;
+          present++;
+        }
+        b = static_cast<uint8>(absent);
+      }
       while (c <= next) {
         bytemap[c] = b;
         c++;
       }
-      b++;
     }
-    return b;
+    return present;
   }
 
  private:
-  Bitmap256 bitmap_;
+  Bitmap256 subranges_;
+  Bitmap256 present_;
 
   DISALLOW_COPY_AND_ASSIGN(ByteMapBuilder);
 };
diff --git a/re2/testing/compile_test.cc b/re2/testing/compile_test.cc
index 6e363e7..6675548 100644
--- a/re2/testing/compile_test.cc
+++ b/re2/testing/compile_test.cc
@@ -170,11 +170,11 @@
   DumpByteMap("[0-9A-Fa-f]+", Regexp::PerlX|Regexp::Latin1, &bytemap);
   EXPECT_EQ("[00-2f] -> 0\n"
             "[30-39] -> 1\n"
-            "[3a-40] -> 2\n"
-            "[41-46] -> 3\n"
-            "[47-60] -> 4\n"
-            "[61-66] -> 5\n"
-            "[67-ff] -> 6\n",
+            "[3a-40] -> 0\n"
+            "[41-46] -> 2\n"
+            "[47-60] -> 0\n"
+            "[61-66] -> 3\n"
+            "[67-ff] -> 0\n",
             bytemap);
 }
 
@@ -192,14 +192,14 @@
             "[80-8f] -> 3\n"
             "[90-9f] -> 4\n"
             "[a0-bf] -> 5\n"
-            "[c0-c1] -> 6\n"
-            "[c2-df] -> 7\n"
-            "[e0-e0] -> 8\n"
-            "[e1-ef] -> 9\n"
-            "[f0-f0] -> 10\n"
-            "[f1-f3] -> 11\n"
-            "[f4-f4] -> 12\n"
-            "[f5-ff] -> 13\n",
+            "[c0-c1] -> 1\n"
+            "[c2-df] -> 6\n"
+            "[e0-e0] -> 7\n"
+            "[e1-ef] -> 8\n"
+            "[f0-f0] -> 9\n"
+            "[f1-f3] -> 10\n"
+            "[f4-f4] -> 11\n"
+            "[f5-ff] -> 1\n",
             bytemap);
 }
 
diff --git a/util/bitmap.h b/util/bitmap.h
index 02fa923..91519e6 100644
--- a/util/bitmap.h
+++ b/util/bitmap.h
@@ -23,7 +23,7 @@
     DCHECK_GE(c, 0);
     DCHECK_LE(c, 255);
 
-    return (words_[c >> 6] & (1ULL << (c & 63))) != 0;
+    return (words_[c / 64] & (1ULL << (c % 64))) != 0;
   }
 
   // Sets the bit with index c.
@@ -31,7 +31,7 @@
     DCHECK_GE(c, 0);
     DCHECK_LE(c, 255);
 
-    words_[c >> 6] |= (1ULL << (c & 63));
+    words_[c / 64] |= (1ULL << (c % 64));
   }
 
   // Finds the next non-zero bit with index >= c.
@@ -41,53 +41,23 @@
     DCHECK_LE(c, 255);
 
     // Mask out any lower bits.
-    int i = c >> 6;
-    uint64 word = words_[i] & (~0ULL << (c & 63));
+    int i = c / 64;
+    uint64 word = words_[i] & (~0ULL << (c % 64));
     if (word != 0)
-      return (i << 6) + FindLSBSet(word);
+      return (i * 64) + FindLSBSet(word);
     i++;
     switch (i) {
       case 1:
         if (words_[1] != 0)
-          return (1 << 6) + FindLSBSet(words_[1]);
+          return (1 * 64) + FindLSBSet(words_[1]);
         // Fall through.
       case 2:
         if (words_[2] != 0)
-          return (2 << 6) + FindLSBSet(words_[2]);
+          return (2 * 64) + FindLSBSet(words_[2]);
         // Fall through.
       case 3:
         if (words_[3] != 0)
-          return (3 << 6) + FindLSBSet(words_[3]);
-        // Fall through.
-      default:
-        return -1;
-    }
-  }
-
-  // Finds the previous non-zero bit with index <= c.
-  // Returns -1 if no such bit exists.
-  int FindPrevSetBit(int c) const {
-    DCHECK_GE(c, 0);
-    DCHECK_LE(c, 255);
-
-    // Mask out any higher bits.
-    int i = c >> 6;
-    uint64 word = words_[i] & ~((~0ULL - 1) << (c & 63));
-    if (word != 0)
-      return (i << 6) + FindMSBSet(word);
-    i--;
-    switch (i) {
-      case 2:
-        if (words_[2] != 0)
-          return (2 << 6) + FindMSBSet(words_[2]);
-        // Fall through.
-      case 1:
-        if (words_[1] != 0)
-          return (1 << 6) + FindMSBSet(words_[1]);
-        // Fall through.
-      case 0:
-        if (words_[0] != 0)
-          return (0 << 6) + FindMSBSet(words_[0]);
+          return (3 * 64) + FindLSBSet(words_[3]);
         // Fall through.
       default:
         return -1;
@@ -110,21 +80,6 @@
 #endif
   }
 
-  // Finds the most significant non-zero bit in n.
-  static int FindMSBSet(uint64 n) {
-    DCHECK_NE(n, 0);
-
-#if defined(__GNUC__)
-    return 63 - __builtin_clzll(n);
-#elif defined(_MSC_VER)
-    int c;
-    _BitScanReverse64(&c, n);
-    return c;
-#else
-#error "bit scan reverse not implemented"
-#endif
-  }
-
   uint64 words_[4];
 };