Improve the bytemap computation.
Change-Id: Ie3c81a7f6621d1f9b84f807a2679df5c3cccf897
Reviewed-on: https://code-review.googlesource.com/5242
Reviewed-by: Paul Wankadia <junyer@google.com>
diff --git a/re2/prog.cc b/re2/prog.cc
index cd49923..ccdd568 100644
--- a/re2/prog.cc
+++ b/re2/prog.cc
@@ -318,7 +318,7 @@
public:
ByteMapBuilder() {
// This allows the outer loop in Build() to be simple.
- bitmap_.Set(255);
+ subranges_.Set(255);
}
// Marks the [lo, hi] subrange.
@@ -329,35 +329,64 @@
DCHECK_LE(hi, 255);
DCHECK_LE(lo, hi);
- // We just track the end of each subrange. :)
+ if (lo == 0 && hi == 255)
+ return;
+
+ // We track the end of each subrange. We also track which subranges
+ // are "present" (i.e. were marked) so that the remaining, "absent"
+ // subranges can subsequently be made to share a single byte class.
lo--;
- if (0 <= lo)
- bitmap_.Set(lo);
- bitmap_.Set(hi);
+ if (0 <= lo && !subranges_.Test(lo)) {
+ subranges_.Set(lo);
+ int next = subranges_.FindNextSetBit(lo+1);
+ if (present_.Test(next))
+ present_.Set(lo);
+ }
+ if (!subranges_.Test(hi))
+ subranges_.Set(hi);
+ // Flag as "present" the subranges above lo up to and including the
+ // subrange ending at hi.
+ int c = lo+1;
+ while (c < 256) {
+ int next = subranges_.FindNextSetBit(c);
+ if (!present_.Test(next))
+ present_.Set(next);
+ if (next == hi)
+ break;
+ c = next+1;
+ }
}
// Builds the bytemap from the subranges.
- // Returns the number of subranges.
+ // Returns the number of byte classes.
int Build(uint8* bytemap) {
- uint8 b = 0;
+ int present = 0;
+ int absent = -1;
int c = 0;
while (c < 256) {
- // Find the end of this subrange.
- int next = bitmap_.FindNextSetBit(c);
- DCHECK_GE(next, 0);
- DCHECK_LE(next, 255);
-
+ int next = subranges_.FindNextSetBit(c);
+ uint8 b = 0;
+ if (present_.Test(next)) {
+ b = static_cast<uint8>(present);
+ present++;
+ } else {
+ if (absent == -1) {
+ absent = present;
+ present++;
+ }
+ b = static_cast<uint8>(absent);
+ }
while (c <= next) {
bytemap[c] = b;
c++;
}
- b++;
}
- return b;
+ return present;
}
private:
- Bitmap256 bitmap_;
+ Bitmap256 subranges_;
+ Bitmap256 present_;
DISALLOW_COPY_AND_ASSIGN(ByteMapBuilder);
};
diff --git a/re2/testing/compile_test.cc b/re2/testing/compile_test.cc
index 6e363e7..6675548 100644
--- a/re2/testing/compile_test.cc
+++ b/re2/testing/compile_test.cc
@@ -170,11 +170,11 @@
DumpByteMap("[0-9A-Fa-f]+", Regexp::PerlX|Regexp::Latin1, &bytemap);
EXPECT_EQ("[00-2f] -> 0\n"
"[30-39] -> 1\n"
- "[3a-40] -> 2\n"
- "[41-46] -> 3\n"
- "[47-60] -> 4\n"
- "[61-66] -> 5\n"
- "[67-ff] -> 6\n",
+ "[3a-40] -> 0\n"
+ "[41-46] -> 2\n"
+ "[47-60] -> 0\n"
+ "[61-66] -> 3\n"
+ "[67-ff] -> 0\n",
bytemap);
}
@@ -192,14 +192,14 @@
"[80-8f] -> 3\n"
"[90-9f] -> 4\n"
"[a0-bf] -> 5\n"
- "[c0-c1] -> 6\n"
- "[c2-df] -> 7\n"
- "[e0-e0] -> 8\n"
- "[e1-ef] -> 9\n"
- "[f0-f0] -> 10\n"
- "[f1-f3] -> 11\n"
- "[f4-f4] -> 12\n"
- "[f5-ff] -> 13\n",
+ "[c0-c1] -> 1\n"
+ "[c2-df] -> 6\n"
+ "[e0-e0] -> 7\n"
+ "[e1-ef] -> 8\n"
+ "[f0-f0] -> 9\n"
+ "[f1-f3] -> 10\n"
+ "[f4-f4] -> 11\n"
+ "[f5-ff] -> 1\n",
bytemap);
}
diff --git a/util/bitmap.h b/util/bitmap.h
index 02fa923..91519e6 100644
--- a/util/bitmap.h
+++ b/util/bitmap.h
@@ -23,7 +23,7 @@
DCHECK_GE(c, 0);
DCHECK_LE(c, 255);
- return (words_[c >> 6] & (1ULL << (c & 63))) != 0;
+ return (words_[c / 64] & (1ULL << (c % 64))) != 0;
}
// Sets the bit with index c.
@@ -31,7 +31,7 @@
DCHECK_GE(c, 0);
DCHECK_LE(c, 255);
- words_[c >> 6] |= (1ULL << (c & 63));
+ words_[c / 64] |= (1ULL << (c % 64));
}
// Finds the next non-zero bit with index >= c.
@@ -41,53 +41,23 @@
DCHECK_LE(c, 255);
// Mask out any lower bits.
- int i = c >> 6;
- uint64 word = words_[i] & (~0ULL << (c & 63));
+ int i = c / 64;
+ uint64 word = words_[i] & (~0ULL << (c % 64));
if (word != 0)
- return (i << 6) + FindLSBSet(word);
+ return (i * 64) + FindLSBSet(word);
i++;
switch (i) {
case 1:
if (words_[1] != 0)
- return (1 << 6) + FindLSBSet(words_[1]);
+ return (1 * 64) + FindLSBSet(words_[1]);
// Fall through.
case 2:
if (words_[2] != 0)
- return (2 << 6) + FindLSBSet(words_[2]);
+ return (2 * 64) + FindLSBSet(words_[2]);
// Fall through.
case 3:
if (words_[3] != 0)
- return (3 << 6) + FindLSBSet(words_[3]);
- // Fall through.
- default:
- return -1;
- }
- }
-
- // Finds the previous non-zero bit with index <= c.
- // Returns -1 if no such bit exists.
- int FindPrevSetBit(int c) const {
- DCHECK_GE(c, 0);
- DCHECK_LE(c, 255);
-
- // Mask out any higher bits.
- int i = c >> 6;
- uint64 word = words_[i] & ~((~0ULL - 1) << (c & 63));
- if (word != 0)
- return (i << 6) + FindMSBSet(word);
- i--;
- switch (i) {
- case 2:
- if (words_[2] != 0)
- return (2 << 6) + FindMSBSet(words_[2]);
- // Fall through.
- case 1:
- if (words_[1] != 0)
- return (1 << 6) + FindMSBSet(words_[1]);
- // Fall through.
- case 0:
- if (words_[0] != 0)
- return (0 << 6) + FindMSBSet(words_[0]);
+ return (3 * 64) + FindLSBSet(words_[3]);
// Fall through.
default:
return -1;
@@ -110,21 +80,6 @@
#endif
}
- // Finds the most significant non-zero bit in n.
- static int FindMSBSet(uint64 n) {
- DCHECK_NE(n, 0);
-
-#if defined(__GNUC__)
- return 63 - __builtin_clzll(n);
-#elif defined(_MSC_VER)
- int c;
- _BitScanReverse64(&c, n);
- return c;
-#else
-#error "bit scan reverse not implemented"
-#endif
- }
-
uint64 words_[4];
};