Improve the bytemap computation some more. Change-Id: I319462fa445124228e55dabc7025948ccc92a486 Reviewed-on: https://code-review.googlesource.com/5250 Reviewed-by: Paul Wankadia <junyer@google.com>

commit: 1051d1c4ac12ee9606d3d38ed26211a42c1905a5 [log] [tgz]
author: Paul Wankadia <junyer@google.com> Fri Jul 15 19:53:39 2016 +1000
committer: Paul Wankadia <junyer@google.com> Fri Jul 15 09:56:58 2016 +0000
tree: 3238544ee899dde16b6b75cdcb6410618a370b7a
parent: bd963ac5a1a6368249a0b90ab21100091ae88d80 [diff]
diff --git a/re2/prog.cc b/re2/prog.cc
index 4573998..b99ff9b 100644
--- a/re2/prog.cc
+++ b/re2/prog.cc

@@ -314,22 +314,43 @@
   return flags;
 }
 
+// ByteMapBuilder implements a coloring algorithm.
+//
+// The first phase is a series of "mark and merge" batches: we mark one or more
+// [lo-hi] ranges, then merge them into our internal state. Batching is not for
+// performance; rather, it means that the ranges are treated indistinguishably.
+//
+// Internally, the ranges are represented using a bitmap that stores the splits
+// and a vector that stores the colors; both of them are indexed by the ranges'
+// last bytes. Thus, in order to merge a [lo-hi] range, we split at lo-1 and at
+// hi (if not already split), then recolor each range in between. The color map
+// (i.e. from the old color to the new color) is maintained for the lifetime of
+// the batch and so underpins this somewhat obscure approach to set operations.
+//
+// The second phase builds the bytemap from our internal state: we recolor each
+// range, then store the new color (which is now the byte class) in each of the
+// corresponding array elements. Finally, we output the number of byte classes.
 class ByteMapBuilder {
  public:
   ByteMapBuilder() {
-    ranges_.Set(255);
+    // Initial state: the [0-255] range has color 0.
+    splits_.Set(255);
+    colors_.resize(256);
+    nextcolor_ = 1;
   }
 
-  // Marks the [lo, hi] range.
   void Mark(int lo, int hi);
-
-  // Builds the bytemap from the ranges.
-  // Returns the number of byte classes.
-  int Build(uint8* bytemap);
+  void Merge();
+  void Build(uint8* bytemap, int* bytemap_range);
 
  private:
-  Bitmap256 ranges_;
-  Bitmap256 present_;
+  int Recolor(int oldcolor);
+
+  Bitmap256 splits_;
+  vector<int> colors_;
+  int nextcolor_;
+  vector<pair<int, int>> colormap_;
+  vector<pair<int, int>> batch_;
 
   DISALLOW_COPY_AND_ASSIGN(ByteMapBuilder);
 };
@@ -341,61 +362,78 @@
   DCHECK_LE(hi, 255);
   DCHECK_LE(lo, hi);
 
+  // Ignore any [0-255] ranges. They cause us to recolor every range, which
+  // has no effect on the eventual result and is therefore a waste of time.
   if (lo == 0 && hi == 255)
     return;
 
-  // We track the end of each range. We also track which ranges
-  // are "present" (i.e. were marked) so that the remaining, "absent"
-  // ranges can subsequently be made to share a single byte class.
-  lo--;
-  if (0 <= lo && !ranges_.Test(lo)) {
-    ranges_.Set(lo);
-    int next = ranges_.FindNextSetBit(lo+1);
-    if (present_.Test(next))
-      present_.Set(lo);
-  }
-  if (!ranges_.Test(hi))
-    ranges_.Set(hi);
-  // Flag as "present" the ranges above lo up to and including the
-  // range ending at hi.
-  int c = lo+1;
-  while (c < 256) {
-    int next = ranges_.FindNextSetBit(c);
-    if (!present_.Test(next))
-      present_.Set(next);
-    if (next == hi)
-      break;
-    c = next+1;
-  }
+  batch_.emplace_back(lo, hi);
 }
 
-int ByteMapBuilder::Build(uint8* bytemap) {
-  int present = 0;
-  int absent = -1;
+void ByteMapBuilder::Merge() {
+  for (vector<pair<int, int>>::const_iterator it = batch_.begin();
+       it != batch_.end();
+       ++it) {
+    int lo = it->first-1;
+    int hi = it->second;
+
+    if (0 <= lo && !splits_.Test(lo)) {
+      splits_.Set(lo);
+      int next = splits_.FindNextSetBit(lo+1);
+      colors_[lo] = colors_[next];
+    }
+    if (!splits_.Test(hi)) {
+      splits_.Set(hi);
+      int next = splits_.FindNextSetBit(hi+1);
+      colors_[hi] = colors_[next];
+    }
+
+    int c = lo+1;
+    while (c < 256) {
+      int next = splits_.FindNextSetBit(c);
+      colors_[next] = Recolor(colors_[next]);
+      if (next == hi)
+        break;
+      c = next+1;
+    }
+  }
+  colormap_.clear();
+  batch_.clear();
+}
+
+void ByteMapBuilder::Build(uint8* bytemap, int* bytemap_range) {
+  // Reset in order to obtain byte classes numbered from 0.
+  nextcolor_ = 0;
+
   int c = 0;
   while (c < 256) {
-    int next = ranges_.FindNextSetBit(c);
-    uint8 b = 0;
-    if (present_.Test(next)) {
-      b = static_cast<uint8>(present);
-      present++;
-    } else {
-      if (absent == -1) {
-        absent = present;
-        present++;
-      }
-      b = static_cast<uint8>(absent);
-    }
+    int next = splits_.FindNextSetBit(c);
+    uint8 b = static_cast<uint8>(Recolor(colors_[next]));
     while (c <= next) {
       bytemap[c] = b;
       c++;
     }
   }
-  return present;
+
+  *bytemap_range = nextcolor_;
+}
+
+int ByteMapBuilder::Recolor(int oldcolor) {
+  // Yes, this is a linear search. There can be at most 256
+  // colors and there will typically be far fewer than that.
+  vector<pair<int, int>>::const_iterator it = std::find_if(
+      colormap_.begin(), colormap_.end(),
+      [&](const pair<int, int>& kv) -> bool { return kv.first == oldcolor; });
+  if (it != colormap_.end())
+    return it->second;
+  int newcolor = nextcolor_;
+  nextcolor_++;
+  colormap_.emplace_back(oldcolor, newcolor);
+  return newcolor;
 }
 
 void Prog::ComputeByteMap() {
-  // Fill in byte map with byte classes for the program.
+  // Fill in bytemap with byte classes for the program.
   // Ranges of bytes that are treated indistinguishably
   // will be mapped to a single byte class.
   ByteMapBuilder builder;
@@ -412,38 +450,48 @@
       int hi = ip->hi();
       builder.Mark(lo, hi);
       if (ip->foldcase() && lo <= 'z' && hi >= 'a') {
-        if (lo < 'a')
-          lo = 'a';
-        if (hi > 'z')
-          hi = 'z';
-        if (lo <= hi)
-          builder.Mark(lo + 'A' - 'a', hi + 'A' - 'a');
+        int foldlo = lo;
+        int foldhi = hi;
+        if (foldlo < 'a')
+          foldlo = 'a';
+        if (foldhi > 'z')
+          foldhi = 'z';
+        if (foldlo <= foldhi)
+          builder.Mark(foldlo + 'A' - 'a', foldhi + 'A' - 'a');
       }
+      builder.Merge();
     } else if (ip->opcode() == kInstEmptyWidth) {
       if (ip->empty() & (kEmptyBeginLine|kEmptyEndLine) &&
           !marked_line_boundaries) {
         builder.Mark('\n', '\n');
+        builder.Merge();
         marked_line_boundaries = true;
       }
       if (ip->empty() & (kEmptyWordBoundary|kEmptyNonWordBoundary) &&
           !marked_word_boundaries) {
-        int j;
-        for (int i = 0; i < 256; i = j) {
-          for (j = i + 1; j < 256 &&
-                          Prog::IsWordChar(static_cast<uint8>(i)) ==
-                              Prog::IsWordChar(static_cast<uint8>(j));
-               j++)
-            ;
-          builder.Mark(i, j - 1);
+        // We require two batches here: the first for ranges that are word
+        // characters, the second for ranges that are not word characters.
+        for (bool isword : {true, false}) {
+          int j;
+          for (int i = 0; i < 256; i = j) {
+            for (j = i + 1; j < 256 &&
+                            Prog::IsWordChar(static_cast<uint8>(i)) ==
+                                Prog::IsWordChar(static_cast<uint8>(j));
+                 j++)
+              ;
+            if (Prog::IsWordChar(static_cast<uint8>(i)) == isword)
+              builder.Mark(i, j - 1);
+          }
+          builder.Merge();
         }
         marked_word_boundaries = true;
       }
     }
   }
 
-  bytemap_range_ = builder.Build(bytemap_);
+  builder.Build(bytemap_, &bytemap_range_);
 
-  if (0) {  // For debugging: use trivial byte map.
+  if (0) {  // For debugging: use trivial bytemap.
     for (int i = 0; i < 256; i++)
       bytemap_[i] = static_cast<uint8>(i);
     bytemap_range_ = 256;

diff --git a/re2/prog.h b/re2/prog.h
index 9c8026f..8499851 100644
--- a/re2/prog.h
+++ b/re2/prog.h

@@ -259,7 +259,7 @@
   // for testing purposes.  Returns number of states.
   int BuildEntireDFA(MatchKind kind);
 
-  // Compute byte map.
+  // Compute bytemap.
   void ComputeByteMap();
 
   // Computes whether all matches must begin with the same first

diff --git a/re2/testing/compile_test.cc b/re2/testing/compile_test.cc
index 2cf2dd2..7897b38 100644
--- a/re2/testing/compile_test.cc
+++ b/re2/testing/compile_test.cc

@@ -178,7 +178,7 @@
             "[3a-40] -> 0\n"
             "[41-46] -> 2\n"
             "[47-60] -> 0\n"
-            "[61-66] -> 3\n"
+            "[61-66] -> 2\n"
             "[67-ff] -> 0\n",
             bytemap);
 
@@ -186,22 +186,22 @@
   DumpByteMap("\\b", Regexp::LikePerl|Regexp::Latin1, &bytemap);
   EXPECT_EQ("[00-2f] -> 0\n"
             "[30-39] -> 1\n"
-            "[3a-40] -> 2\n"
-            "[41-5a] -> 3\n"
-            "[5b-5e] -> 4\n"
-            "[5f-5f] -> 5\n"
-            "[60-60] -> 6\n"
-            "[61-7a] -> 7\n"
-            "[7b-ff] -> 8\n",
+            "[3a-40] -> 0\n"
+            "[41-5a] -> 1\n"
+            "[5b-5e] -> 0\n"
+            "[5f-5f] -> 1\n"
+            "[60-60] -> 0\n"
+            "[61-7a] -> 1\n"
+            "[7b-ff] -> 0\n",
             bytemap);
 
   // FIXME: The ASCII case-folding optimization creates too many byte classes!
   DumpByteMap("[^_]", Regexp::LikePerl|Regexp::Latin1, &bytemap);
   EXPECT_EQ("[00-40] -> 0\n"
             "[41-5a] -> 1\n"
-            "[5b-5e] -> 2\n"
-            "[5f-5f] -> 3\n"
-            "[60-ff] -> 4\n",
+            "[5b-5e] -> 0\n"
+            "[5f-5f] -> 2\n"
+            "[60-ff] -> 3\n",
             bytemap);
 }
 

diff --git a/util/util.h b/util/util.h
index c75fad0..733222d 100644
--- a/util/util.h
+++ b/util/util.h

@@ -33,6 +33,7 @@
 #include <atomic>
 #include <mutex>        // For std::call_once
 #include <unordered_set>
+#include <initializer_list>
 
 // Use std names.
 using std::set;
commit	1051d1c4ac12ee9606d3d38ed26211a42c1905a5	[log] [tgz]
author	Paul Wankadia <junyer@google.com>	Fri Jul 15 19:53:39 2016 +1000
committer	Paul Wankadia <junyer@google.com>	Fri Jul 15 09:56:58 2016 +0000
tree	3238544ee899dde16b6b75cdcb6410618a370b7a
parent	bd963ac5a1a6368249a0b90ab21100091ae88d80 [diff]