Add an experimental program fanout computation.
Change-Id: I92cc607d40d9370be7eba14fe258c08a6fff91c1
Reviewed-on: https://code-review.googlesource.com/2311
Reviewed-by: Russ Cox <rsc@swtch.com>
diff --git a/re2/nfa.cc b/re2/nfa.cc
index b588c7e..57a18fe 100644
--- a/re2/nfa.cc
+++ b/re2/nfa.cc
@@ -706,5 +706,52 @@
return true;
}
-} // namespace re2
+// For each instruction i in the program reachable from the start, compute the
+// number of instructions reachable from i by following only empty transitions
+// and record that count as fanout[i].
+//
+// fanout holds the results and is also the work queue for the outer iteration.
+// reachable holds the reached nodes for the inner iteration.
+void Prog::Fanout(SparseArray<int>* fanout) {
+ DCHECK_EQ(fanout->max_size(), size());
+ SparseSet reachable(size());
+ fanout->clear();
+ fanout->set_new(start(), 0);
+ for (SparseArray<int>::iterator i = fanout->begin(); i != fanout->end(); ++i) {
+ int* count = &i->second;
+ reachable.clear();
+ reachable.insert(i->index());
+ for (SparseSet::iterator j = reachable.begin(); j != reachable.end(); ++j) {
+ Prog::Inst* ip = inst(*j);
+ switch (ip->opcode()) {
+ default:
+ LOG(DFATAL) << "unhandled " << ip->opcode() << " in Prog::Fanout()";
+ break;
+ case kInstByteRange:
+ (*count)++;
+ if (!fanout->has_index(ip->out())) {
+ fanout->set_new(ip->out(), 0);
+ }
+ break;
+
+ case kInstAlt:
+ case kInstAltMatch:
+ reachable.insert(ip->out1());
+ // fall through
+
+ case kInstCapture:
+ case kInstEmptyWidth:
+ case kInstNop:
+ reachable.insert(ip->out());
+ break;
+
+ case kInstMatch:
+ case kInstFail:
+ break;
+ }
+ }
+ }
+}
+
+} // namespace re2
diff --git a/re2/prog.h b/re2/prog.h
index 64707c3..26c8676 100644
--- a/re2/prog.h
+++ b/re2/prog.h
@@ -10,6 +10,7 @@
#define RE2_PROG_H__
#include "util/util.h"
+#include "util/sparse_array.h"
#include "re2/re2.h"
namespace re2 {
@@ -329,6 +330,10 @@
// Returns true on success, false on error.
bool PossibleMatchRange(string* min, string* max, int maxlen);
+ // EXPERIMENTAL! SUBJECT TO CHANGE!
+ // Outputs the program fanout into the given sparse array.
+ void Fanout(SparseArray<int>* fanout);
+
// Compiles a collection of regexps to Prog. Each regexp will have
// its own Match instruction recording the index in the vector.
static Prog* CompileSet(const RE2::Options& options, RE2::Anchor anchor,
diff --git a/re2/re2.cc b/re2/re2.cc
index 2283f69..3a2262e 100644
--- a/re2/re2.cc
+++ b/re2/re2.cc
@@ -16,6 +16,7 @@
#include "util/atomicops.h"
#include "util/util.h"
#include "util/flags.h"
+#include "util/sparse_array.h"
#include "re2/prog.h"
#include "re2/regexp.h"
@@ -272,6 +273,23 @@
return prog_->size();
}
+int RE2::ProgramFanout(map<int, int>* histogram) const {
+ if (prog_ == NULL)
+ return -1;
+ SparseArray<int> fanout(prog_->size());
+ prog_->Fanout(&fanout);
+ histogram->clear();
+ for (SparseArray<int>::iterator i = fanout.begin(); i != fanout.end(); ++i) {
+ // TODO(junyer): Optimise this?
+ int bucket = 0;
+ while (1 << bucket < i->second) {
+ bucket++;
+ }
+ (*histogram)[bucket]++;
+ }
+ return histogram->rbegin()->first;
+}
+
// Returns named_groups_, computing it if needed.
const map<string, int>& RE2::NamedCapturingGroups() const {
MutexLock l(mutex_);
diff --git a/re2/re2.h b/re2/re2.h
index 85a711f..6bf7ac6 100644
--- a/re2/re2.h
+++ b/re2/re2.h
@@ -293,6 +293,11 @@
// Larger numbers are more expensive than smaller numbers.
int ProgramSize() const;
+ // EXPERIMENTAL! SUBJECT TO CHANGE!
+ // Outputs the program fanout as a histogram bucketed by powers of 2.
+ // Returns the number of the largest non-empty bucket.
+ int ProgramFanout(map<int, int>* histogram) const;
+
// Returns the underlying Regexp; not for general use.
// Returns entire_regexp_ so that callers don't need
// to know about prefix_ and prefix_foldcase_.
diff --git a/re2/testing/re2_test.cc b/re2/testing/re2_test.cc
index efd3a20..8505195 100644
--- a/re2/testing/re2_test.cc
+++ b/re2/testing/re2_test.cc
@@ -462,13 +462,38 @@
TEST(ProgramSize, BigProgram) {
RE2 re_simple("simple regexp");
RE2 re_medium("medium.*regexp");
- RE2 re_complex("hard.{1,128}regexp");
+ RE2 re_complex("complex.{1,128}regexp");
CHECK_GT(re_simple.ProgramSize(), 0);
CHECK_GT(re_medium.ProgramSize(), re_simple.ProgramSize());
CHECK_GT(re_complex.ProgramSize(), re_medium.ProgramSize());
}
+TEST(ProgramFanout, BigProgram) {
+ RE2 re1("(?:(?:(?:(?:(?:.)?){1})*)+)");
+ RE2 re10("(?:(?:(?:(?:(?:.)?){10})*)+)");
+ RE2 re100("(?:(?:(?:(?:(?:.)?){100})*)+)");
+ RE2 re1000("(?:(?:(?:(?:(?:.)?){1000})*)+)");
+
+ map<int, int> histogram;
+
+ // 3 is the largest non-empty bucket and has 1 element.
+ CHECK_EQ(3, re1.ProgramFanout(&histogram));
+ CHECK_EQ(1, histogram[3]);
+
+ // 7 is the largest non-empty bucket and has 10 elements.
+ CHECK_EQ(7, re10.ProgramFanout(&histogram));
+ CHECK_EQ(10, histogram[7]);
+
+ // 10 is the largest non-empty bucket and has 100 elements.
+ CHECK_EQ(10, re100.ProgramFanout(&histogram));
+ CHECK_EQ(100, histogram[10]);
+
+ // 13 is the largest non-empty bucket and has 1000 elements.
+ CHECK_EQ(13, re1000.ProgramFanout(&histogram));
+ CHECK_EQ(1000, histogram[13]);
+}
+
// Issue 956519: handling empty character sets was
// causing NULL dereference. This tests a few empty character sets.
// (The way to get an empty character set is to negate a full one.)