Consolidate bytemap logic into Prog::ComputeByteMap().
Change-Id: I9740bba924632c869efc3c395295c86748994c62
Reviewed-on: https://code-review.googlesource.com/4764
Reviewed-by: Paul Wankadia <junyer@google.com>
diff --git a/re2/compile.cc b/re2/compile.cc
index 5faf7f6..117679f 100644
--- a/re2/compile.cc
+++ b/re2/compile.cc
@@ -396,15 +396,6 @@
if (id < 0)
return NoMatch();
inst_[id].InitByteRange(lo, hi, foldcase, 0);
- prog_->MarkByteRange(lo, hi);
- if (foldcase && lo <= 'z' && hi >= 'a') {
- if (lo < 'a')
- lo = 'a';
- if (hi > 'z')
- hi = 'z';
- if (lo <= hi)
- prog_->MarkByteRange(lo + 'A' - 'a', hi + 'A' - 'a');
- }
return Frag(id, PatchList::Mk(id << 1));
}
@@ -432,19 +423,6 @@
if (id < 0)
return NoMatch();
inst_[id].InitEmptyWidth(empty, 0);
- if (empty & (kEmptyBeginLine|kEmptyEndLine))
- prog_->MarkByteRange('\n', '\n');
- if (empty & (kEmptyWordBoundary|kEmptyNonWordBoundary)) {
- int j;
- for (int i = 0; i < 256; i = j) {
- for (j = i + 1; j < 256 &&
- Prog::IsWordChar(static_cast<uint8>(i)) ==
- Prog::IsWordChar(static_cast<uint8>(j));
- j++)
- ;
- prog_->MarkByteRange(i, j-1);
- }
- }
return Frag(id, PatchList::Mk(id << 1));
}
@@ -1215,11 +1193,9 @@
prog_->size_ = inst_len_;
inst_ = NULL;
- // Compute byte map.
- prog_->ComputeByteMap();
-
prog_->Optimize();
prog_->Flatten();
+ prog_->ComputeByteMap();
// Record remaining memory for DFA.
if (max_mem_ <= 0) {
diff --git a/re2/prog.cc b/re2/prog.cc
index 9fcf418..931903a 100644
--- a/re2/prog.cc
+++ b/re2/prog.cc
@@ -305,31 +305,61 @@
return flags;
}
-void Prog::MarkByteRange(int lo, int hi) {
- DCHECK_GE(lo, 0);
- DCHECK_GE(hi, 0);
- DCHECK_LE(lo, 255);
- DCHECK_LE(hi, 255);
- DCHECK_LE(lo, hi);
- if (0 < lo && lo <= 255)
- byterange_.Set(lo - 1);
- if (0 <= hi && hi <= 255)
- byterange_.Set(hi);
-}
-
void Prog::ComputeByteMap() {
- // Fill in bytemap with byte classes for prog_.
- // Ranges of bytes that are treated as indistinguishable
- // by the regexp program are mapped to a single byte class.
- // The vector prog_->byterange() marks the end of each
- // such range.
- const Bitmap<256>& v = byterange();
+ // Fill in byte map with byte classes for the program.
+ // Ranges of bytes that are treated indistinguishably
+ // are mapped to a single byte class.
+ Bitmap<256> v;
+
+ // Don't repeat the work for \b and \B.
+ bool done_word_boundaries = false;
+
+ for (int id = 0; id < static_cast<int>(size()); id++) {
+ Inst* ip = inst(id);
+ if (ip->opcode() == kInstByteRange) {
+ int lo = ip->lo();
+ int hi = ip->hi();
+ if (0 < lo)
+ v.Set(lo - 1);
+ v.Set(hi);
+ if (ip->foldcase() && lo <= 'z' && hi >= 'a') {
+ if (lo < 'a')
+ lo = 'a';
+ if (hi > 'z')
+ hi = 'z';
+ if (lo <= hi) {
+ v.Set(lo + 'A' - 'a' - 1);
+ v.Set(hi + 'A' - 'a');
+ }
+ }
+ } else if (ip->opcode() == kInstEmptyWidth) {
+ if (ip->empty() & (kEmptyBeginLine|kEmptyEndLine)) {
+ v.Set('\n' - 1);
+ v.Set('\n');
+ }
+ if (ip->empty() & (kEmptyWordBoundary|kEmptyNonWordBoundary)) {
+ if (done_word_boundaries)
+ continue;
+ int j;
+ for (int i = 0; i < 256; i = j) {
+ for (j = i + 1; j < 256 &&
+ Prog::IsWordChar(static_cast<uint8>(i)) ==
+ Prog::IsWordChar(static_cast<uint8>(j));
+ j++)
+ ;
+ v.Set(i - 1);
+ v.Set(j - 1);
+ }
+ done_word_boundaries = true;
+ }
+ }
+ }
COMPILE_ASSERT(8*sizeof(v.Word(0)) == 32, wordsize);
uint8 n = 0;
uint32 bits = 0;
for (int i = 0; i < 256; i++) {
- if ((i&31) == 0)
+ if ((i & 31) == 0)
bits = v.Word(i >> 5);
bytemap_[i] = n;
n += bits & 1;
diff --git a/re2/prog.h b/re2/prog.h
index ee2093d..a9bd448 100644
--- a/re2/prog.h
+++ b/re2/prog.h
@@ -218,7 +218,6 @@
void set_reversed(bool reversed) { reversed_ = reversed; }
int list_count() { return list_count_; }
int inst_count(InstOp op) { return inst_count_[op]; }
- const Bitmap<256>& byterange() { return byterange_; }
void set_dfa_mem(int64 dfa_mem) { dfa_mem_ = dfa_mem; }
int64 dfa_mem() { return dfa_mem_; }
int flags() { return flags_; }
@@ -235,12 +234,6 @@
string DumpUnanchored();
string DumpByteMap();
- // Record that at some point in the prog, the bytes in the range
- // lo-hi (inclusive) are treated as different from bytes outside the range.
- // Tracking this lets the DFA collapse commonly-treated byte ranges
- // when recording state pointers, greatly reducing its memory footprint.
- void MarkByteRange(int lo, int hi);
-
// Returns the set of kEmpty flags that are in effect at
// position p within context.
static uint32 EmptyFlags(const StringPiece& context, const char* p);
@@ -396,8 +389,6 @@
std::atomic<DFA*> dfa_longest_; // DFA cached for kLongestMatch and kFullMatch
int64 dfa_mem_; // Maximum memory for DFAs.
- Bitmap<256> byterange_; // byterange.Get(x) true if x ends a
- // commonly-treated byte range.
uint8 bytemap_[256]; // map from input bytes to byte classes
uint8* onepass_nodes_; // data for OnePass nodes
diff --git a/re2/testing/compile_test.cc b/re2/testing/compile_test.cc
index cba8033..0c598e0 100644
--- a/re2/testing/compile_test.cc
+++ b/re2/testing/compile_test.cc
@@ -147,11 +147,10 @@
static struct Latin1ByteRange {
int lo;
int hi;
- int ec; // equivalence class
} latin1ranges[] = {
- { 0x00, 0x09, 0 },
- { 0x0A, 0x0A, 1 },
- { 0x0B, 0xFF, 2 },
+ { 0x00, 0x09 },
+ { 0x0A, 0x0A },
+ { 0x0B, 0xFF },
};
TEST(TestCompile, Latin1Ranges) {
@@ -162,7 +161,7 @@
EXPECT_EQ(prog->bytemap_range(), arraysize(latin1ranges));
for (int i = 0; i < arraysize(latin1ranges); i++)
for (int j = latin1ranges[i].lo; j <= latin1ranges[i].hi; j++)
- EXPECT_EQ(prog->bytemap()[j], latin1ranges[i].ec) << " byte " << j;
+ EXPECT_EQ(prog->bytemap()[j], i) << " byte " << j;
delete prog;
re->Decref();
}
@@ -173,22 +172,21 @@
static struct UTF8ByteRange {
int lo;
int hi;
- int ec; // equivalence class
} utf8ranges[] = {
- { 0x00, 0x09, 0 },
- { 0x0A, 0x0A, 1 },
- { 0x0B, 0x7F, 2 },
- { 0x80, 0x8F, 3 },
- { 0x90, 0x9F, 4 },
- { 0xA0, 0xBF, 5 },
- { 0xC0, 0xC1, 6 },
- { 0xC2, 0xDF, 7 },
- { 0xE0, 0xE0, 8 },
- { 0xE1, 0xEF, 9 },
- { 0xF0, 0xF0, 10 },
- { 0xF1, 0xF3, 11 },
- { 0xF4, 0xF4, 12 },
- { 0xF5, 0xFF, 13 },
+ { 0x00, 0x09 },
+ { 0x0A, 0x0A },
+ { 0x0B, 0x7F },
+ { 0x80, 0x8F },
+ { 0x90, 0x9F },
+ { 0xA0, 0xBF },
+ { 0xC0, 0xC1 },
+ { 0xC2, 0xDF },
+ { 0xE0, 0xE0 },
+ { 0xE1, 0xEF },
+ { 0xF0, 0xF0 },
+ { 0xF1, 0xF3 },
+ { 0xF4, 0xF4 },
+ { 0xF5, 0xFF },
};
TEST(TestCompile, UTF8Ranges) {
@@ -199,7 +197,7 @@
EXPECT_EQ(prog->bytemap_range(), arraysize(utf8ranges));
for (int i = 0; i < arraysize(utf8ranges); i++)
for (int j = utf8ranges[i].lo; j <= utf8ranges[i].hi; j++)
- EXPECT_EQ(prog->bytemap()[j], utf8ranges[i].ec) << " byte " << j;
+ EXPECT_EQ(prog->bytemap()[j], i) << " byte " << j;
delete prog;
re->Decref();
}