[libfuzzer] Add a UTF-32 variant of the break iterator fuzzer.
This makes it possible to compare the performance and coverage of the
UTF-8 and UTF-32 fuzzers on ClusterFuzz.
diff --git a/fuzzers/BUILD.gn b/fuzzers/BUILD.gn
index 45645c7..1503550 100644
--- a/fuzzers/BUILD.gn
+++ b/fuzzers/BUILD.gn
@@ -11,13 +11,24 @@
group("fuzzers") {
}
+# Header-only helpers shared by the ICU fuzzers in this file.
+source_set("fuzzer_support") {
+  public = [
+    "fuzzer_utils.h",
+  ]
+
+  # fuzzer_utils.h includes base and ICU headers, and the fuzzers that depend
+  # on this target include ICU headers themselves, so these dependencies are
+  # forwarded to dependents instead of being kept private.
+  public_deps = [
+    "//base",
+    "//base:i18n",
+    "//third_party/icu",
+  ]
+  public_configs = [ "//third_party/icu:icu_config" ]
+}
+
fuzzer_test("icu_uregex_open_fuzzer") {
sources = [
"icu_uregex_open_fuzzer.cc",
]
deps = [
- "//base",
- "//third_party/icu",
+ ":fuzzer_support",
]
dict = "icu_regex.dict"
libfuzzer_options = [ "max_len=128" ]
@@ -28,8 +39,7 @@
"icu_unicode_string_codepage_create_fuzzer.cc",
]
deps = [
- "//base",
- "//third_party/icu",
+ ":fuzzer_support",
]
seed_corpus = "//third_party/icu/source/test/testdata"
libfuzzer_options = [ "max_len=10240" ]
@@ -40,9 +50,7 @@
"icu_number_format_fuzzer.cc",
]
deps = [
- "//base",
- "//base:i18n",
- "//third_party/icu",
+ ":fuzzer_support",
]
seed_corpus = "//third_party/icu/source/test/testdata"
libfuzzer_options = [ "max_len=10240" ]
@@ -53,9 +61,19 @@
"icu_break_iterator_fuzzer.cc",
]
deps = [
- "//base",
- "//base:i18n",
- "//third_party/icu",
+ ":fuzzer_support",
+ ]
+ # Same corpus and length cap as the UTF-32 variant so the two are comparable.
+ seed_corpus = "//third_party/icu/source/test/testdata"
+ libfuzzer_options = [ "max_len=10240" ]
+}
+
+fuzzer_test("icu_break_iterator_utf32_fuzzer") {
+ sources = [
+ "icu_break_iterator_utf32_fuzzer.cc",
+ ]
+ deps = [
+ ":fuzzer_support",
]
seed_corpus = "//third_party/icu/source/test/testdata"
libfuzzer_options = [ "max_len=10240" ]
@@ -66,11 +81,8 @@
"icu_ucasemap_fuzzer.cc",
]
deps = [
- "//base",
- "//base:i18n",
- "//third_party/icu",
+ ":fuzzer_support",
]
seed_corpus = "//third_party/icu/source/test/testdata"
libfuzzer_options = [ "max_len=10240" ]
}
-
diff --git a/fuzzers/fuzzer_utils.h b/fuzzers/fuzzer_utils.h
index f9ba849..8b3161a 100644
--- a/fuzzers/fuzzer_utils.h
+++ b/fuzzers/fuzzer_utils.h
@@ -4,10 +4,13 @@
#define THIRD_PARTY_ICU_FUZZERS_FUZZER_UTILS_H_
#include <assert.h>
+#include <algorithm>
#include <random>
+
#include "base/at_exit.h"
#include "base/i18n/icu_util.h"
#include "third_party/icu/source/common/unicode/locid.h"
+#include "third_party/icu/source/common/unicode/uchar.h"
struct IcuEnvironment {
IcuEnvironment() {
@@ -31,4 +34,20 @@
return locales[(*rng)() % num_locales];
}
+// Decodes the |size| bytes at |data| as UTF-8 into an icu::UnicodeString by
+// way of icu::UnicodeString::fromUTF8.
+//
+// Declared inline because the body lives in this header; without it, two
+// translation units of one binary including the header would each emit a
+// definition.
+inline icu::UnicodeString UnicodeStringFromUtf8(const uint8_t* data,
+                                                size_t size) {
+  // The length is narrowed to int32_t explicitly instead of implicitly.
+  return icu::UnicodeString::fromUTF8(icu::StringPiece(
+      reinterpret_cast<const char*>(data), static_cast<int32_t>(size)));
+}
+
+// Builds an icu::UnicodeString from |data| read as a sequence of host-endian
+// UChar32 units. Trailing bytes that do not fill a whole UChar32 are ignored,
+// and every unit is clamped into [0, UCHAR_MAX_VALUE] before conversion.
+//
+// Declared inline because the body lives in this header; without it, two
+// translation units of one binary including the header would each emit a
+// definition.
+inline icu::UnicodeString UnicodeStringFromUtf32(const uint8_t* data,
+                                                 size_t size) {
+  std::vector<UChar32> uchars(size / sizeof(UChar32));
+
+  // Copy rather than cast so |data| is not assumed to be suitably aligned.
+  // Skipped when there is nothing to copy, since either pointer may then be
+  // null.
+  if (!uchars.empty())
+    memcpy(uchars.data(), data, uchars.size() * sizeof(UChar32));
+
+  // UChar32 is a signed type, so both ends are bounded: an upper-only clamp
+  // lets units with the top bit set through as negative values.
+  for (UChar32& c : uchars)
+    c = std::max<UChar32>(0, std::min<UChar32>(c, UCHAR_MAX_VALUE));
+
+  return icu::UnicodeString::fromUTF32(uchars.data(),
+                                       static_cast<int32_t>(uchars.size()));
+}
+
#endif // THIRD_PARTY_ICU_FUZZERS_FUZZER_UTILS_H_
diff --git a/fuzzers/icu_break_iterator_fuzzer.cc b/fuzzers/icu_break_iterator_fuzzer.cc
index b9bc8e5..3b0d336 100644
--- a/fuzzers/icu_break_iterator_fuzzer.cc
+++ b/fuzzers/icu_break_iterator_fuzzer.cc
@@ -11,7 +11,7 @@
// Entry point for LibFuzzer.
extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
UErrorCode status = U_ZERO_ERROR;
- icu::UnicodeString str(reinterpret_cast<const char*>(data), size);
+ icu::UnicodeString str(UnicodeStringFromUtf8(data, size));
auto rng = CreateRng(data, size);
const icu::Locale& locale = GetRandomLocale(&rng);
diff --git a/fuzzers/icu_break_iterator_utf32_fuzzer.cc b/fuzzers/icu_break_iterator_utf32_fuzzer.cc
new file mode 100644
index 0000000..ba66c0a
--- /dev/null
+++ b/fuzzers/icu_break_iterator_utf32_fuzzer.cc
@@ -0,0 +1,46 @@
+// Copyright 2016 The Chromium Authors. All rights reserved.
+
+#include <stddef.h>
+#include <stdint.h>
+#include <memory>
+#include "third_party/icu/fuzzers/fuzzer_utils.h"
+#include "third_party/icu/source/common/unicode/brkiter.h"
+
+IcuEnvironment* env = new IcuEnvironment();
+
+// LibFuzzer entry point. Decodes |data| as UTF-32, then walks every boundary
+// a randomly selected break iterator reports for that text. Always returns 0.
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
+  UErrorCode status = U_ZERO_ERROR;
+  icu::UnicodeString str(UnicodeStringFromUtf32(data, size));
+
+  // One RNG, created from the input bytes, picks the locale and iterator kind.
+  auto rng = CreateRng(data, size);
+  const icu::Locale& locale = GetRandomLocale(&rng);
+  std::unique_ptr<icu::BreakIterator> bi;
+  switch (rng() % 5) {
+    case 0:
+      bi.reset(icu::BreakIterator::createWordInstance(locale, status));
+      break;
+    case 1:
+      bi.reset(icu::BreakIterator::createLineInstance(locale, status));
+      break;
+    case 2:
+      bi.reset(icu::BreakIterator::createCharacterInstance(locale, status));
+      break;
+    case 3:
+      bi.reset(icu::BreakIterator::createSentenceInstance(locale, status));
+      break;
+    case 4:
+      bi.reset(icu::BreakIterator::createTitleInstance(locale, status));
+      break;
+  }
+  if (U_FAILURE(status))
+    return 0;
+
+  // Hand the decoded input to the iterator; otherwise it has no text to scan.
+  bi->setText(str);
+  for (int32_t p = bi->first(); p != icu::BreakIterator::DONE; p = bi->next()) {
+  }
+  return 0;
+}
diff --git a/fuzzers/icu_number_format_fuzzer.cc b/fuzzers/icu_number_format_fuzzer.cc
index cb8c573..ab81e79 100644
--- a/fuzzers/icu_number_format_fuzzer.cc
+++ b/fuzzers/icu_number_format_fuzzer.cc
@@ -21,7 +21,7 @@
icu::NumberFormat::createInstance(locale, status));
if (U_FAILURE(status)) return 0;
- icu::UnicodeString str(reinterpret_cast<const char*>(data), size);
+ icu::UnicodeString str(UnicodeStringFromUtf8(data, size));
icu::Formattable result;
fmt->parse(str, result, status);