Add an experimental program fanout computation.

Change-Id: I92cc607d40d9370be7eba14fe258c08a6fff91c1
Reviewed-on: https://code-review.googlesource.com/2311
Reviewed-by: Russ Cox <rsc@swtch.com>
diff --git a/re2/nfa.cc b/re2/nfa.cc
index b588c7e..57a18fe 100644
--- a/re2/nfa.cc
+++ b/re2/nfa.cc
@@ -706,5 +706,52 @@
   return true;
 }
 
-}  // namespace re2
+// For each instruction i in the program reachable from the start, compute the
+// number of instructions reachable from i by following only empty transitions
+// and record that count as fanout[i].
+//
+// fanout holds the results and is also the work queue for the outer iteration.
+// reachable holds the reached nodes for the inner iteration.
+void Prog::Fanout(SparseArray<int>* fanout) {
+  DCHECK_EQ(fanout->max_size(), size());
+  SparseSet reachable(size());
+  fanout->clear();
+  fanout->set_new(start(), 0);
+  for (SparseArray<int>::iterator i = fanout->begin(); i != fanout->end(); ++i) {
+    int* count = &i->second;
+    reachable.clear();
+    reachable.insert(i->index());
+    for (SparseSet::iterator j = reachable.begin(); j != reachable.end(); ++j) {
+      Prog::Inst* ip = inst(*j);
+      switch (ip->opcode()) {
+        default:
+          LOG(DFATAL) << "unhandled " << ip->opcode() << " in Prog::Fanout()";
+          break;
 
+        case kInstByteRange:
+          (*count)++;
+          if (!fanout->has_index(ip->out())) {
+            fanout->set_new(ip->out(), 0);
+          }
+          break;
+
+        case kInstAlt:
+        case kInstAltMatch:
+          reachable.insert(ip->out1());
+          // fall through
+
+        case kInstCapture:
+        case kInstEmptyWidth:
+        case kInstNop:
+          reachable.insert(ip->out());
+          break;
+
+        case kInstMatch:
+        case kInstFail:
+          break;
+      }
+    }
+  }
+}
+
+}  // namespace re2
diff --git a/re2/prog.h b/re2/prog.h
index 64707c3..26c8676 100644
--- a/re2/prog.h
+++ b/re2/prog.h
@@ -10,6 +10,7 @@
 #define RE2_PROG_H__
 
 #include "util/util.h"
+#include "util/sparse_array.h"
 #include "re2/re2.h"
 
 namespace re2 {
@@ -329,6 +330,10 @@
   // Returns true on success, false on error.
   bool PossibleMatchRange(string* min, string* max, int maxlen);
 
+  // EXPERIMENTAL! SUBJECT TO CHANGE!
+  // Outputs the program fanout into the given sparse array.
+  void Fanout(SparseArray<int>* fanout);
+
   // Compiles a collection of regexps to Prog.  Each regexp will have
   // its own Match instruction recording the index in the vector.
   static Prog* CompileSet(const RE2::Options& options, RE2::Anchor anchor,
diff --git a/re2/re2.cc b/re2/re2.cc
index 2283f69..3a2262e 100644
--- a/re2/re2.cc
+++ b/re2/re2.cc
@@ -16,6 +16,7 @@
 #include "util/atomicops.h"
 #include "util/util.h"
 #include "util/flags.h"
+#include "util/sparse_array.h"
 #include "re2/prog.h"
 #include "re2/regexp.h"
 
@@ -272,6 +273,23 @@
   return prog_->size();
 }
 
+int RE2::ProgramFanout(map<int, int>* histogram) const {
+  if (prog_ == NULL)
+    return -1;
+  SparseArray<int> fanout(prog_->size());
+  prog_->Fanout(&fanout);
+  histogram->clear();
+  for (SparseArray<int>::iterator i = fanout.begin(); i != fanout.end(); ++i) {
+    // TODO(junyer): Optimise this?
+    int bucket = 0;
+    while (1 << bucket < i->second) {
+      bucket++;
+    }
+    (*histogram)[bucket]++;
+  }
+  return histogram->rbegin()->first;
+}
+
 // Returns named_groups_, computing it if needed.
 const map<string, int>&  RE2::NamedCapturingGroups() const {
   MutexLock l(mutex_);
diff --git a/re2/re2.h b/re2/re2.h
index 85a711f..6bf7ac6 100644
--- a/re2/re2.h
+++ b/re2/re2.h
@@ -293,6 +293,11 @@
   // Larger numbers are more expensive than smaller numbers.
   int ProgramSize() const;
 
+  // EXPERIMENTAL! SUBJECT TO CHANGE!
+  // Outputs the program fanout as a histogram bucketed by powers of 2.
+  // Returns the number of the largest non-empty bucket.
+  int ProgramFanout(map<int, int>* histogram) const;
+
   // Returns the underlying Regexp; not for general use.
   // Returns entire_regexp_ so that callers don't need
   // to know about prefix_ and prefix_foldcase_.
diff --git a/re2/testing/re2_test.cc b/re2/testing/re2_test.cc
index efd3a20..8505195 100644
--- a/re2/testing/re2_test.cc
+++ b/re2/testing/re2_test.cc
@@ -462,13 +462,38 @@
 TEST(ProgramSize, BigProgram) {
   RE2 re_simple("simple regexp");
   RE2 re_medium("medium.*regexp");
-  RE2 re_complex("hard.{1,128}regexp");
+  RE2 re_complex("complex.{1,128}regexp");
 
   CHECK_GT(re_simple.ProgramSize(), 0);
   CHECK_GT(re_medium.ProgramSize(), re_simple.ProgramSize());
   CHECK_GT(re_complex.ProgramSize(), re_medium.ProgramSize());
 }
 
+TEST(ProgramFanout, BigProgram) {
+  RE2 re1("(?:(?:(?:(?:(?:.)?){1})*)+)");
+  RE2 re10("(?:(?:(?:(?:(?:.)?){10})*)+)");
+  RE2 re100("(?:(?:(?:(?:(?:.)?){100})*)+)");
+  RE2 re1000("(?:(?:(?:(?:(?:.)?){1000})*)+)");
+
+  map<int, int> histogram;
+
+  // 3 is the largest non-empty bucket and has 1 element.
+  CHECK_EQ(3, re1.ProgramFanout(&histogram));
+  CHECK_EQ(1, histogram[3]);
+
+  // 7 is the largest non-empty bucket and has 10 elements.
+  CHECK_EQ(7, re10.ProgramFanout(&histogram));
+  CHECK_EQ(10, histogram[7]);
+
+  // 10 is the largest non-empty bucket and has 100 elements.
+  CHECK_EQ(10, re100.ProgramFanout(&histogram));
+  CHECK_EQ(100, histogram[10]);
+
+  // 13 is the largest non-empty bucket and has 1000 elements.
+  CHECK_EQ(13, re1000.ProgramFanout(&histogram));
+  CHECK_EQ(1000, histogram[13]);
+}
+
 // Issue 956519: handling empty character sets was
 // causing NULL dereference.  This tests a few empty character sets.
 // (The way to get an empty character set is to negate a full one.)