Consolidate bytemap logic into Prog::ComputeByteMap().

Change-Id: I9740bba924632c869efc3c395295c86748994c62
Reviewed-on: https://code-review.googlesource.com/4764
Reviewed-by: Paul Wankadia <junyer@google.com>
diff --git a/re2/compile.cc b/re2/compile.cc
index 5faf7f6..117679f 100644
--- a/re2/compile.cc
+++ b/re2/compile.cc
@@ -396,15 +396,6 @@
   if (id < 0)
     return NoMatch();
   inst_[id].InitByteRange(lo, hi, foldcase, 0);
-  prog_->MarkByteRange(lo, hi);
-  if (foldcase && lo <= 'z' && hi >= 'a') {
-    if (lo < 'a')
-      lo = 'a';
-    if (hi > 'z')
-      hi = 'z';
-    if (lo <= hi)
-      prog_->MarkByteRange(lo + 'A' - 'a', hi + 'A' - 'a');
-  }
   return Frag(id, PatchList::Mk(id << 1));
 }
 
@@ -432,19 +423,6 @@
   if (id < 0)
     return NoMatch();
   inst_[id].InitEmptyWidth(empty, 0);
-  if (empty & (kEmptyBeginLine|kEmptyEndLine))
-    prog_->MarkByteRange('\n', '\n');
-  if (empty & (kEmptyWordBoundary|kEmptyNonWordBoundary)) {
-    int j;
-    for (int i = 0; i < 256; i = j) {
-      for (j = i + 1; j < 256 &&
-                      Prog::IsWordChar(static_cast<uint8>(i)) ==
-                          Prog::IsWordChar(static_cast<uint8>(j));
-           j++)
-        ;
-      prog_->MarkByteRange(i, j-1);
-    }
-  }
   return Frag(id, PatchList::Mk(id << 1));
 }
 
@@ -1215,11 +1193,9 @@
   prog_->size_ = inst_len_;
   inst_ = NULL;
 
-  // Compute byte map.
-  prog_->ComputeByteMap();
-
   prog_->Optimize();
   prog_->Flatten();
+  prog_->ComputeByteMap();
 
   // Record remaining memory for DFA.
   if (max_mem_ <= 0) {
diff --git a/re2/prog.cc b/re2/prog.cc
index 9fcf418..931903a 100644
--- a/re2/prog.cc
+++ b/re2/prog.cc
@@ -305,31 +305,61 @@
   return flags;
 }
 
-void Prog::MarkByteRange(int lo, int hi) {
-  DCHECK_GE(lo, 0);
-  DCHECK_GE(hi, 0);
-  DCHECK_LE(lo, 255);
-  DCHECK_LE(hi, 255);
-  DCHECK_LE(lo, hi);
-  if (0 < lo && lo <= 255)
-    byterange_.Set(lo - 1);
-  if (0 <= hi && hi <= 255)
-    byterange_.Set(hi);
-}
-
 void Prog::ComputeByteMap() {
-  // Fill in bytemap with byte classes for prog_.
-  // Ranges of bytes that are treated as indistinguishable
-  // by the regexp program are mapped to a single byte class.
-  // The vector prog_->byterange() marks the end of each
-  // such range.
-  const Bitmap<256>& v = byterange();
+  // Fill in byte map with byte classes for the program.
+  // Ranges of bytes that are treated indistinguishably
+  // are mapped to a single byte class.
+  Bitmap<256> v;
+
+  // Don't repeat the work for \b and \B.
+  bool done_word_boundaries = false;
+
+  for (int id = 0; id < static_cast<int>(size()); id++) {
+    Inst* ip = inst(id);
+    if (ip->opcode() == kInstByteRange) {
+      int lo = ip->lo();
+      int hi = ip->hi();
+      if (0 < lo)
+        v.Set(lo - 1);
+      v.Set(hi);
+      if (ip->foldcase() && lo <= 'z' && hi >= 'a') {
+        if (lo < 'a')
+          lo = 'a';
+        if (hi > 'z')
+          hi = 'z';
+        if (lo <= hi) {
+          v.Set(lo + 'A' - 'a' - 1);
+          v.Set(hi + 'A' - 'a');
+        }
+      }
+    } else if (ip->opcode() == kInstEmptyWidth) {
+      if (ip->empty() & (kEmptyBeginLine|kEmptyEndLine)) {
+        v.Set('\n' - 1);
+        v.Set('\n');
+      }
+      if (ip->empty() & (kEmptyWordBoundary|kEmptyNonWordBoundary)) {
+        if (done_word_boundaries)
+          continue;
+        int j;
+        for (int i = 0; i < 256; i = j) {
+          for (j = i + 1; j < 256 &&
+                          Prog::IsWordChar(static_cast<uint8>(i)) ==
+                              Prog::IsWordChar(static_cast<uint8>(j));
+               j++)
+            ;
+          v.Set(i - 1);
+          v.Set(j - 1);
+        }
+        done_word_boundaries = true;
+      }
+    }
+  }
 
   COMPILE_ASSERT(8*sizeof(v.Word(0)) == 32, wordsize);
   uint8 n = 0;
   uint32 bits = 0;
   for (int i = 0; i < 256; i++) {
-    if ((i&31) == 0)
+    if ((i & 31) == 0)
       bits = v.Word(i >> 5);
     bytemap_[i] = n;
     n += bits & 1;
diff --git a/re2/prog.h b/re2/prog.h
index ee2093d..a9bd448 100644
--- a/re2/prog.h
+++ b/re2/prog.h
@@ -218,7 +218,6 @@
   void set_reversed(bool reversed) { reversed_ = reversed; }
   int list_count() { return list_count_; }
   int inst_count(InstOp op) { return inst_count_[op]; }
-  const Bitmap<256>& byterange() { return byterange_; }
   void set_dfa_mem(int64 dfa_mem) { dfa_mem_ = dfa_mem; }
   int64 dfa_mem() { return dfa_mem_; }
   int flags() { return flags_; }
@@ -235,12 +234,6 @@
   string DumpUnanchored();
   string DumpByteMap();
 
-  // Record that at some point in the prog, the bytes in the range
-  // lo-hi (inclusive) are treated as different from bytes outside the range.
-  // Tracking this lets the DFA collapse commonly-treated byte ranges
-  // when recording state pointers, greatly reducing its memory footprint.
-  void MarkByteRange(int lo, int hi);
-
   // Returns the set of kEmpty flags that are in effect at
   // position p within context.
   static uint32 EmptyFlags(const StringPiece& context, const char* p);
@@ -396,8 +389,6 @@
   std::atomic<DFA*> dfa_longest_;   // DFA cached for kLongestMatch and kFullMatch
   int64 dfa_mem_;      // Maximum memory for DFAs.
 
-  Bitmap<256> byterange_;    // byterange.Get(x) true if x ends a
-                             // commonly-treated byte range.
   uint8 bytemap_[256];       // map from input bytes to byte classes
 
   uint8* onepass_nodes_;     // data for OnePass nodes
diff --git a/re2/testing/compile_test.cc b/re2/testing/compile_test.cc
index cba8033..0c598e0 100644
--- a/re2/testing/compile_test.cc
+++ b/re2/testing/compile_test.cc
@@ -147,11 +147,10 @@
 static struct Latin1ByteRange {
   int lo;
   int hi;
-  int ec;  // equivalence class
 } latin1ranges[] = {
-  { 0x00, 0x09, 0 },
-  { 0x0A, 0x0A, 1 },
-  { 0x0B, 0xFF, 2 },
+  { 0x00, 0x09 },
+  { 0x0A, 0x0A },
+  { 0x0B, 0xFF },
 };
 
 TEST(TestCompile, Latin1Ranges) {
@@ -162,7 +161,7 @@
   EXPECT_EQ(prog->bytemap_range(), arraysize(latin1ranges));
   for (int i = 0; i < arraysize(latin1ranges); i++)
     for (int j = latin1ranges[i].lo; j <= latin1ranges[i].hi; j++)
-      EXPECT_EQ(prog->bytemap()[j], latin1ranges[i].ec) << " byte " << j;
+      EXPECT_EQ(prog->bytemap()[j], i) << " byte " << j;
   delete prog;
   re->Decref();
 }
@@ -173,22 +172,21 @@
 static struct UTF8ByteRange {
   int lo;
   int hi;
-  int ec;  // equivalence class
 } utf8ranges[] = {
-  { 0x00, 0x09, 0 },
-  { 0x0A, 0x0A, 1 },
-  { 0x0B, 0x7F, 2 },
-  { 0x80, 0x8F, 3 },
-  { 0x90, 0x9F, 4 },
-  { 0xA0, 0xBF, 5 },
-  { 0xC0, 0xC1, 6 },
-  { 0xC2, 0xDF, 7 },
-  { 0xE0, 0xE0, 8 },
-  { 0xE1, 0xEF, 9 },
-  { 0xF0, 0xF0, 10 },
-  { 0xF1, 0xF3, 11 },
-  { 0xF4, 0xF4, 12 },
-  { 0xF5, 0xFF, 13 },
+  { 0x00, 0x09 },
+  { 0x0A, 0x0A },
+  { 0x0B, 0x7F },
+  { 0x80, 0x8F },
+  { 0x90, 0x9F },
+  { 0xA0, 0xBF },
+  { 0xC0, 0xC1 },
+  { 0xC2, 0xDF },
+  { 0xE0, 0xE0 },
+  { 0xE1, 0xEF },
+  { 0xF0, 0xF0 },
+  { 0xF1, 0xF3 },
+  { 0xF4, 0xF4 },
+  { 0xF5, 0xFF },
 };
 
 TEST(TestCompile, UTF8Ranges) {
@@ -199,7 +197,7 @@
   EXPECT_EQ(prog->bytemap_range(), arraysize(utf8ranges));
   for (int i = 0; i < arraysize(utf8ranges); i++)
     for (int j = utf8ranges[i].lo; j <= utf8ranges[i].hi; j++)
-      EXPECT_EQ(prog->bytemap()[j], utf8ranges[i].ec) << " byte " << j;
+      EXPECT_EQ(prog->bytemap()[j], i) << " byte " << j;
   delete prog;
   re->Decref();
 }