Enqueue only the ByteRange instructions that match.

Change-Id: Ic9f6f6771ea8bee95cec11cc5df490137456fcd4
Reviewed-on: https://code-review.googlesource.com/4671
Reviewed-by: Paul Wankadia <junyer@google.com>
diff --git a/re2/nfa.cc b/re2/nfa.cc
index 0a5cfd1..23edcb3 100644
--- a/re2/nfa.cc
+++ b/re2/nfa.cc
@@ -84,17 +84,21 @@
   inline Thread* Incref(Thread* t);
   inline void Decref(Thread* t);
 
-  // Add id0 (or its children, following unlabeled arrows)
-  // to the workqueue q with associated capture info.
-  void AddToThreadq(Threadq* q, int id0, int flag,
+  // Follows all empty arrows from id0 and enqueues all the states reached.
+  // Enqueues only the ByteRange instructions that match byte c.
+  // The bits in flag (Bol, Eol, etc.) specify whether ^, $ and \b match.
+  // p is the current input position, and t0 is the current thread.
+  void AddToThreadq(Threadq* q, int id0, int c, int flag,
                     const char* p, Thread* t0);
 
   // Run runq on byte c, appending new states to nextq.
   // Updates matched_ and match_ as new, better matches are found.
-  // p is position of the next byte (the one after c)
-  // in the input string, used when processing capturing parens.
-  // flag is the bitwise or of Bol, Eol, etc., specifying whether
-  // ^, $ and \b match the current input point (after c).
+  // p is the position of the previous byte (the one before c)
+  // in the input string, used when processing Match instructions.
+  // flag is the bitwise OR of Bol, Eol, etc., specifying whether
+  // ^, $ and \b match the current input position (after c).
+  // Frees all the threads on runq.
+  // If there is a shortcut to the end, returns that shortcut.
   inline int Step(Threadq* runq, Threadq* nextq, int c, int flag, const char* p);
 
   // Returns text version of capture information, for debugging.
@@ -195,9 +199,10 @@
 }
 
 // Follows all empty arrows from id0 and enqueues all the states reached.
+// Enqueues only the ByteRange instructions that match byte c.
 // The bits in flag (Bol, Eol, etc.) specify whether ^, $ and \b match.
 // p is the current input position, and t0 is the current thread.
-void NFA::AddToThreadq(Threadq* q, int id0, int flag,
+void NFA::AddToThreadq(Threadq* q, int id0, int c, int flag,
                        const char* p, Thread* t0) {
   if (id0 == 0)
     return;
@@ -286,14 +291,19 @@
       a = AddState(ip->out());
       goto Loop;
 
-    case kInstMatch:
     case kInstByteRange:
+      if (!ip->Matches(c))
+        goto Next;
+      // Fallthrough intended.
+
+    case kInstMatch:
       // Save state; will pick up at next byte.
       t = Incref(t0);
       *tp = t;
       if (Debug)
         fprintf(stderr, " + %d%s\n", id, FormatCapture(t0->capture).c_str());
 
+    Next:
       if (ip->last())
         break;
       a = AddState(id+1);
@@ -313,11 +323,11 @@
 }
 
 // Run runq on byte c, appending new states to nextq.
-// Updates match as new, better matches are found.
-// p is position of the byte c in the input string,
-// used when processing capturing parens.
-// flag is the bitwise or of Bol, Eol, etc., specifying whether
-// ^, $ and \b match the current input point (after c).
+// Updates matched_ and match_ as new, better matches are found.
+// p is the position of the previous byte (the one before c)
+// in the input string, used when processing Match instructions.
+// flag is the bitwise OR of Bol, Eol, etc., specifying whether
+// ^, $ and \b match the current input position (after c).
 // Frees all the threads on runq.
 // If there is a shortcut to the end, returns that shortcut.
 int NFA::Step(Threadq* runq, Threadq* nextq, int c, int flag, const char* p) {
@@ -346,8 +356,7 @@
         break;
 
       case kInstByteRange:
-        if (ip->Matches(c))
-          AddToThreadq(nextq, ip->out(), flag, p+1, t);
+        AddToThreadq(nextq, ip->out(), c, flag, p+1, t);
         break;
 
       case kInstAltMatch:
@@ -492,14 +501,10 @@
   runq->clear();
   nextq->clear();
   memset(&match_[0], 0, ncapture_*sizeof match_[0]);
-  const char* bp = context.begin();
-  int c = -1;
   int wasword = 0;
 
-  if (text.begin() > context.begin()) {
-    c = text.begin()[-1] & 0xFF;
-    wasword = Prog::IsWordChar(static_cast<uint8>(c));
-  }
+  if (text.begin() > context.begin())
+    wasword = Prog::IsWordChar(text.begin()[-1] & 0xFF);
 
   // Loop over the text, stepping the machine.
   for (const char* p = text.begin();; p++) {
@@ -529,7 +534,15 @@
       flag |= kEmptyNonWordBoundary;
 
     if (Debug) {
-      fprintf(stderr, "%c[%#x/%d/%d]:", p > text.end() ? '$' : p == bp ? '^' : c, flag, isword, wasword);
+      int c = 0;
+      if (p == context.begin())
+        c = '^';
+      else if (p > text.end())
+        c = '$';
+      else if (p < text.end())
+        c = p[0] & 0xFF;
+
+      fprintf(stderr, "%c[%#x/%d/%d]:", c, flag, isword, wasword);
       for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) {
         Thread* t = i->second;
         if (t == NULL)
@@ -539,11 +552,12 @@
       fprintf(stderr, "\n");
     }
 
-    // Process previous character (waited until now to avoid
-    // repeating the flag computation above).
-    // This is a no-op the first time around the loop, because
-    // runq is empty.
-    int id = Step(runq, nextq, c, flag, p-1);
+    // Note that we pass p-1 to Step() because it needs the previous pointer
+    // value in order to handle Match instructions appropriately. It plumbs
+    // c and flag through to AddToThreadq() along with p-1+1, which is p.
+    //
+    // This is a no-op the first time around the loop because runq is empty.
+    int id = Step(runq, nextq, p < text.end() ? p[0] & 0xFF : -1, flag, p-1);
     DCHECK_EQ(runq->size(), 0);
     swap(nextq, runq);
     nextq->clear();
@@ -604,7 +618,7 @@
       Thread* t = AllocThread();
       CopyCapture(t->capture, match_);
       t->capture[0] = p;
-      AddToThreadq(runq, start_, flag, p, t);
+      AddToThreadq(runq, start_, p < text.end() ? p[0] & 0xFF : -1, flag, p, t);
       Decref(t);
     }
 
@@ -615,13 +629,7 @@
       break;
     }
 
-    if (p == text.end())
-      c = 0;
-    else
-      c = *p & 0xFF;
     wasword = isword;
-
-    // Will run step(runq, nextq, c, ...) on next iteration.  See above.
   }
 
   for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i)