blob: 516989fea7f1171ff0e91e0c272a90c0f6c78b5d [file] [log] [blame]
// Copyright 2022 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "src/developer/forensics/utils/redact/replacer.h"

#include <lib/syslog/cpp/macros.h>

#include <algorithm>
#include <cctype>
#include <cstddef>
#include <cstdint>
#include <memory>
#include <string>
#include <string_view>
#include <utility>
#include <vector>

#include <re2/re2.h>

#include "src/developer/forensics/utils/regexp.h"
#include "src/lib/fxl/strings/string_printf.h"
namespace forensics {
// Builds a Replacer that rewrites every match of |pattern| in the text it is given, passing
// |replacement| to RE2::GlobalReplace as the rewrite string. The Replacer's |cache| argument is
// not used.
//
// Logs an error and returns nullptr if |pattern| does not compile.
Replacer ReplaceWithText(const std::string_view pattern, const std::string_view replacement) {
  auto compiled = std::make_unique<re2::RE2>(pattern);
  if (compiled->ok()) {
    std::string rewrite(replacement);
    return [compiled = std::move(compiled), rewrite = std::move(rewrite)](
               RedactionIdCache& cache, std::string& text) -> std::string& {
      RE2::GlobalReplace(&text, *compiled, rewrite);
      return text;
    };
  }

  FX_LOGS(ERROR) << "Failed to compile regexp: \"" << pattern << "\"";
  return nullptr;
}
namespace {
// Describes one replacement to make in a piece of text.
struct Redaction {
  // Index of the first matched character, relative to the start of the original text.
  int64_t original_position;
  // Number of characters in the matched text.
  size_t match_size;
  // Size of |replacement| minus |match_size|; negative when the replacement is shorter than the
  // text it replaces.
  int64_t offset;
  // Text to put in place of the match.
  std::string replacement;
};

// Given |redactions|, replaces the text at position |original_position| with |replacement|. As
// replacements are made, this function accounts for the position of characters shifting due to the
// difference in size between the original text and the replacement text.
//
// |redactions| is assumed to be sorted in order of |original_position|. The matches are assumed to
// not overlap.
void ApplyRedactions(const std::vector<Redaction>& redactions, std::string& text) {
  // Kept signed: the accumulated shift is negative whenever the replacements made so far are
  // shorter than the text they replaced.
  int64_t running_offset = 0;
  for (const Redaction& redaction : redactions) {
    // Earlier replacements only move text that comes after them, so the shifted position of a
    // later, non-overlapping match is never negative.
    const auto position = static_cast<size_t>(redaction.original_position + running_offset);
    text.replace(position, redaction.match_size, redaction.replacement);
    running_offset += redaction.offset;
  }
}
// Finds strings in |text| that match the first capturing group of |regexp| and constructs their
// redacted replacements with |build_redacted|.
//
// A match is skipped when its captured text is empty or when any string from |ignore_prefixes|
// occurs in |text| immediately before it. Redactions are returned in the order the matches occur
// in |text|.
std::vector<Redaction> BuildRedactions(
    const std::string& text, const re2::RE2& regexp,
    const std::vector<std::string>& ignore_prefixes,
    ::fit::function<std::string(const std::string& match)> build_redacted) {
  std::vector<Redaction> redactions;

  const std::string_view full_text(text);
  re2::StringPiece text_view(text);
  re2::StringPiece match;
  size_t remaining = text_view.size();
  while (RE2::FindAndConsume(&text_view, regexp, &match)) {
    if (text_view.size() == remaining) {
      // |regexp| matched the empty string at the very start of |text_view|, so nothing was
      // consumed. Step over one character; otherwise the same empty match is found forever.
      if (text_view.empty()) {
        break;
      }
      text_view.remove_prefix(1);
      remaining = text_view.size();
      continue;
    }
    remaining = text_view.size();

    // Check for emptiness before doing any arithmetic on |match.data()|: a capturing group that
    // did not participate in the match does not point into |text|.
    if (match.empty()) {
      continue;
    }

    // We're working with pointers, but want a relative position within |text| so we need to
    // subtract the original start pointer.
    const size_t match_position = static_cast<size_t>(match.data() - text.data());

    // Compare indices rather than pointers so that no pointer before the buffer that |text| owns
    // is ever formed.
    const bool has_prefix =
        std::any_of(ignore_prefixes.begin(), ignore_prefixes.end(),
                    [full_text, match_position](const std::string_view ignore_prefix) {
                      return ignore_prefix.size() <= match_position &&
                             full_text.substr(match_position - ignore_prefix.size(),
                                              ignore_prefix.size()) == ignore_prefix;
                    });
    if (has_prefix) {
      continue;
    }

    std::string replacement = build_redacted(std::string(match));
    // Computed before |replacement| is moved from.
    const int64_t offset =
        static_cast<int64_t>(replacement.size()) - static_cast<int64_t>(match.size());
    redactions.push_back(Redaction{
        .original_position = match.data() - text.data(),
        .match_size = match.size(),
        .offset = offset,
        .replacement = std::move(replacement),
    });
  }

  return redactions;
}
// Creates a Replacer that finds text matching |pattern| and substitutes each match with whatever
// |build_redacted| returns for it. A match is left alone when one of the strings in
// |ignore_prefixes| sits directly in front of it.
//
// |pattern| must compile and must contain at least one capturing group; otherwise an error is
// logged and nullptr is returned.
Replacer FunctionBasedReplacer(
    const std::string_view pattern, const std::vector<std::string>& ignore_prefixes,
    ::fit::function<std::string(RedactionIdCache& cache, const std::string& match)>
        build_redacted) {
  auto compiled = std::make_unique<re2::RE2>(pattern);
  if (!compiled->ok()) {
    FX_LOGS(ERROR) << "Failed to compile regexp: \"" << pattern << "\"";
    return nullptr;
  }

  const int num_groups = compiled->NumberOfCapturingGroups();
  if (num_groups < 1) {
    FX_LOGS(ERROR) << "Regexp \"" << pattern
                   << "\" expected to have at least 1 capturing group, has " << num_groups;
    return nullptr;
  }

  // The returned closure owns the compiled regexp, the builder and a copy of the prefixes.
  return [compiled = std::move(compiled), builder = std::move(build_redacted),
          prefixes = ignore_prefixes](RedactionIdCache& cache,
                                      std::string& text) mutable -> std::string& {
    ApplyRedactions(BuildRedactions(text, *compiled, prefixes,
                                    [&cache, &builder](const std::string& match) {
                                      return builder(cache, match);
                                    }),
                    text);
    return text;
  };
}
} // namespace
namespace {

// Returns true iff |format_str| contains exactly one "%d" conversion and no other conversion
// specification. "%%" (a literal percent sign) may appear any number of times.
//
// Anything else must be rejected: the string is later handed to a printf-style function together
// with a single int argument, so a different conversion or a dangling '%' would be undefined
// behavior.
bool HasExactlyOneIdSpecifier(const std::string_view format_str) {
  size_t num_specifiers{0};
  // Every '%' starts a two-character sequence, so resume the search just past that sequence.
  for (size_t pos{0}; (pos = format_str.find('%', pos)) != std::string_view::npos; pos += 2) {
    if (pos + 1 >= format_str.size()) {
      // Lone '%' at the end of the string.
      return false;
    }

    const char conversion = format_str[pos + 1];
    if (conversion == 'd') {
      ++num_specifiers;
    } else if (conversion != '%') {
      return false;
    }
  }

  return num_specifiers == 1;
}

}  // namespace

// Builds a Replacer that redacts matches of |pattern| with |format_str|, in which the single "%d"
// is substituted with the id |cache.GetId()| returns for the matched text. Matches directly
// preceded by any string in |ignore_prefixes| are left untouched.
//
// Logs an error and returns nullptr if |format_str| does not contain exactly one "%d" conversion
// (and otherwise only "%%" escapes) or if |pattern| is rejected by FunctionBasedReplacer.
Replacer ReplaceWithIdFormatString(const std::string_view pattern,
                                   const std::string_view format_str,
                                   const std::vector<std::string>& ignore_prefixes) {
  if (!HasExactlyOneIdSpecifier(format_str)) {
    FX_LOGS(ERROR) << "Format string \"" << format_str << "\" expected to have 1 \"%d\" specifier";
    return nullptr;
  }

  return FunctionBasedReplacer(
      pattern, ignore_prefixes,
      [format = std::string(format_str)](RedactionIdCache& cache, const std::string& match) {
        return fxl::StringPrintf(format.c_str(), cache.GetId(match));
      });
}
namespace {
// Matches a dotted-quad IPv4 address delimited by word boundaries; the whole address is the
// single capturing group. The first three components are numbers in the range 0-255, each
// followed by '.'. The last component is either a number in the range 0-255 or a run of ASCII
// letters.
constexpr std::string_view kIPv4Pattern{R"(\b()"
R"((?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3})"
R"((?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9]|[a-zA-Z]+))"
R"()\b)"};
// IPv4 addresses that RedactIPv4() leaves in cleartext:
//
// 0.*.*.* = current network (as source)
// 127.*.*.* = loopback
// 169.254.*.* = link-local addresses
// 224.0.0.* = link-local multicast
// 255.255.255.255 = broadcast
//
// Every '.' that separates components is escaped so it only matches a literal dot.
constexpr re2::LazyRE2 kCleartextIPv4 = MakeLazyRE2(R"(^0\..*)"
                                                    R"(|)"
                                                    R"(^127\..*)"
                                                    R"(|)"
                                                    R"(^169\.254\..*)"
                                                    R"(|)"
                                                    R"(^224\.0\.0\..*)"
                                                    R"(|)"
                                                    R"(^255\.255\.255\.255$)");
// Produces the replacement for the IPv4 address |match|: addresses fully matched by
// |kCleartextIPv4| are returned unchanged, every other address becomes "<REDACTED-IPV4: N>" where
// N is the id |cache| returns for |match|.
std::string RedactIPv4(RedactionIdCache& cache, const std::string& match) {
  if (re2::RE2::FullMatch(match, *kCleartextIPv4)) {
    return match;
  }

  return fxl::StringPrintf("<REDACTED-IPV4: %d>", cache.GetId(match));
}
} // namespace
// Returns a Replacer that finds text matching |kIPv4Pattern| and replaces each match with the
// result of RedactIPv4(). No ignore-prefixes are configured.
Replacer ReplaceIPv4() {
return FunctionBasedReplacer(kIPv4Pattern, /*ignore_prefixes=*/{}, RedactIPv4);
}
namespace {
// Matches a string like "Ipv4Address { addr: [1, 2, 3, 4] }". The two inner capture groups are used
// to replace just the address within the match.
//
// Capture group 1 spans the whole match, group 2 captures the leading "Ipv4Address { " and group 3
// captures the trailing " }". Each of the 4 list elements is 1 to 3 hexadecimal digits.
constexpr re2::LazyRE2 kFidlIpv4 =
MakeLazyRE2(R"(((Ipv4Address { )addr: \[(?:[0-9a-fA-F]{1,3}, ){3}[0-9a-fA-F]{1,3}]( })))");
// Produces the replacement for |match|, text that matched |kFidlIpv4|: everything between the
// second and third capture groups is swapped for "<REDACTED-IPV4: N>", where N is the id |cache|
// returns for the whole of |match|.
std::string RedactFidlIPv4(RedactionIdCache& cache, const std::string& match) {
  const int id = cache.GetId(match);

  std::string redacted(match);
  RE2::GlobalReplace(&redacted, *kFidlIpv4, fxl::StringPrintf("\\2<REDACTED-IPV4: %d>\\3", id));
  return redacted;
}
} // namespace
// Returns a Replacer that finds text matching the |kFidlIpv4| pattern and replaces each match with
// the result of RedactFidlIPv4(). No ignore-prefixes are configured.
Replacer ReplaceFidlIPv4() {
return FunctionBasedReplacer(kFidlIpv4->pattern(), /*ignore_prefixes=*/{}, RedactFidlIPv4);
}
namespace {
// Matches an IPv6 address written in one of four textual forms. The alternatives are wrapped in
// one capturing group, so the whole address is capture group 1. Each field is 1 to 4 hexadecimal
// digits.
constexpr std::string_view kIPv6Pattern{
// IPv6 without ::
R"(()"
R"(\b(?:(?:[[:xdigit:]]{1,4}:){7}[[:xdigit:]]{1,4})\b)"
R"(|)"
// IPv6 with embedded ::
R"(\b(?:(?:[[:xdigit:]]{1,4}:)+:(?:[[:xdigit:]]{1,4}:)*[[:xdigit:]]{1,4})\b)"
R"(|)"
// IPv6 starting with :: and 3-7 non-zero fields
R"(::[[:xdigit:]]{1,4}(?::[[:xdigit:]]{1,4}){2,6}\b)"
R"(|)"
// IPv6 with 3-7 non-zero fields ending with ::
R"(\b[[:xdigit:]]{1,4}(?::[[:xdigit:]]{1,4}){2,6}::)"
R"())"};
// Patterns RedactIPv6() uses to classify an address. All of them are case-insensitive and
// anchored at the start of the address.

// ff.1:** and ff.2:** = local multicast
// Addresses matching this pattern are returned without redaction.
constexpr re2::LazyRE2 kCleartextIPv6 = MakeLazyRE2(R"((?i)^ff[[:xdigit:]][12]:)");
// ff..:** = multicast - display first 2 bytes and redact
// The capture group holds the leading field and its ':' so they can be kept in the output.
constexpr re2::LazyRE2 kMulticastIPv6 = MakeLazyRE2(R"((?i)^(ff[[:xdigit:]][[:xdigit:]]:))");
// fe80/10 = link-local - display first 2 bytes and redact
// The capture group holds the leading field and its ':' so they can be kept in the output.
constexpr re2::LazyRE2 kLinkLocalIPv6 = MakeLazyRE2(R"((?i)^(fe[89ab][[:xdigit:]]:))");
// ::ffff:*:* = IPv4
// Anchored at both ends: "::ffff" followed by exactly two more fields.
constexpr re2::LazyRE2 kIPv4InIPv6 = MakeLazyRE2(R"((?i)^::f{4}(:[[:xdigit:]]{1,4}){2}$)");
// Produces the replacement for the IPv6 address |match|. The first rule that applies wins:
//
//  * |kCleartextIPv6| matches: |match| is returned unchanged and no id is requested from |cache|.
//  * |kMulticastIPv6| matches: the captured prefix followed by "<REDACTED-IPV6-MULTI: N>".
//  * |kLinkLocalIPv6| matches: the captured prefix followed by "<REDACTED-IPV6-LL: N>".
//  * |kIPv4InIPv6| matches all of |match|: "::ffff:<REDACTED-IPV4: N>".
//  * otherwise: "<REDACTED-IPV6: N>".
//
// N is the id |cache| returns for |match|.
std::string RedactIPv6(RedactionIdCache& cache, const std::string& match) {
  const bool is_cleartext = re2::RE2::PartialMatch(match, *kCleartextIPv6);
  if (is_cleartext) {
    return match;
  }

  const int id = cache.GetId(match);

  // Receives the part of the address that stays visible for multicast and link-local addresses.
  std::string visible_prefix;

  const bool is_multicast = re2::RE2::PartialMatch(match, *kMulticastIPv6, &visible_prefix);
  if (is_multicast) {
    return fxl::StringPrintf("%s<REDACTED-IPV6-MULTI: %d>", visible_prefix.c_str(), id);
  }

  const bool is_link_local = re2::RE2::PartialMatch(match, *kLinkLocalIPv6, &visible_prefix);
  if (is_link_local) {
    return fxl::StringPrintf("%s<REDACTED-IPV6-LL: %d>", visible_prefix.c_str(), id);
  }

  const bool is_ipv4_mapped = re2::RE2::FullMatch(match, *kIPv4InIPv6);
  if (is_ipv4_mapped) {
    return fxl::StringPrintf("::ffff:<REDACTED-IPV4: %d>", id);
  }

  return fxl::StringPrintf("<REDACTED-IPV6: %d>", id);
}
} // namespace
// Returns a Replacer that finds text matching |kIPv6Pattern| and replaces each match with the
// result of RedactIPv6(). No ignore-prefixes are configured.
Replacer ReplaceIPv6() {
return FunctionBasedReplacer(kIPv6Pattern, /*ignore_prefixes=*/{}, RedactIPv6);
}
namespace {
// Matches a string like
// "Ipv6Address { addr: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16] }". The two inner
// capture groups are used to replace just the address within the match.
//
// Capture group 1 spans the whole match, group 2 captures the leading "Ipv6Address { " and group 3
// captures the trailing " }". Each of the 16 list elements is 1 to 3 hexadecimal digits.
constexpr re2::LazyRE2 kFidlIpv6 =
MakeLazyRE2(R"(((Ipv6Address { )addr: \[(?:[0-9a-fA-F]{1,3}, ){15}[0-9a-fA-F]{1,3}]( })))");
// Produces the replacement for |match|, text that matched |kFidlIpv6|: everything between the
// second and third capture groups is swapped for "<REDACTED-IPV6: N>", where N is the id |cache|
// returns for the whole of |match|.
std::string RedactFidlIPv6(RedactionIdCache& cache, const std::string& match) {
  const int id = cache.GetId(match);

  std::string redacted(match);
  RE2::GlobalReplace(&redacted, *kFidlIpv6, fxl::StringPrintf("\\2<REDACTED-IPV6: %d>\\3", id));
  return redacted;
}
} // namespace
// Returns a Replacer that finds text matching the |kFidlIpv6| pattern and replaces each match with
// the result of RedactFidlIPv6(). No ignore-prefixes are configured.
Replacer ReplaceFidlIPv6() {
return FunctionBasedReplacer(kFidlIpv6->pattern(), /*ignore_prefixes=*/{}, RedactFidlIPv6);
}
namespace mac_utils {
// Number of bytes CanonicalizeMac() reads from a MAC address.
const size_t NUM_MAC_BYTES = 6;
// Matches a MAC address delimited by word boundaries: 6 bytes of 1 or 2 hexadecimal digits each,
// separated by '.', ':' or '-'.
//
// Capture group 1 spans the whole address. Capture group 2 holds the first 3 bytes together with
// the delimiter that follows each of them; GetOuiPrefix() returns this group.
static constexpr std::string_view kMacPattern{
R"(\b()"
R"(\b((?:[[:xdigit:]]{1,2}(?:[\.:-])){3})(?:[[:xdigit:]]{1,2}(?:[\.:-])){2}[[:xdigit:]]{1,2}\b)"
R"()\b)"};
std::string GetOuiPrefix(const std::string& mac) {
static constexpr re2::LazyRE2 regexp = MakeLazyRE2(kMacPattern.data());
std::string oui;
re2::RE2::FullMatch(mac, *regexp, nullptr, &oui);
return oui;
}
std::string CanonicalizeMac(const std::string& original_mac) {
std::string lowercased_mac(original_mac);
std::transform(lowercased_mac.begin(), lowercased_mac.end(), lowercased_mac.begin(),
[](char c) { return std::tolower(c); });
std::string canonical_mac = "00:00:00:00:00:00";
re2::StringPiece lowercased_mac_view(lowercased_mac);
for (size_t i = 0; i < NUM_MAC_BYTES; ++i) {
re2::StringPiece mac_byte;
re2::RE2::FindAndConsume(&lowercased_mac_view, R"(([[:xdigit:]]{1,2}))", &mac_byte);
if (mac_byte.length() == 2) {
canonical_mac.replace(3 * i, 2, mac_byte.data(), 2);
} else if (mac_byte.length() == 1) {
canonical_mac.replace(3 * i + 1, 1, mac_byte.data(), 1);
} else {
// The regular expression used in |FindAndConsume()| above ensure |mac_byte|
// will have either 1 or 2 characters.
__builtin_unreachable();
}
}
return canonical_mac;
}
} // namespace mac_utils
namespace {
// Produces the replacement for the MAC address |mac|: the prefix returned by
// mac_utils::GetOuiPrefix() followed by "<REDACTED-MAC: N>". N is the id |cache| returns for the
// canonical form of |mac|, so differently formatted spellings of one address share an id.
std::string RedactMac(RedactionIdCache& cache, const std::string& mac) {
  const std::string canonical_mac = mac_utils::CanonicalizeMac(mac);
  const int id = cache.GetId(canonical_mac);

  const std::string oui_prefix = mac_utils::GetOuiPrefix(mac);
  return fxl::StringPrintf("%s<REDACTED-MAC: %d>", oui_prefix.c_str(), id);
}
} // namespace
// Returns a Replacer that finds text matching |mac_utils::kMacPattern| and replaces each match
// with the result of RedactMac(). No ignore-prefixes are configured.
Replacer ReplaceMac() {
return FunctionBasedReplacer(mac_utils::kMacPattern, /*ignore_prefixes=*/{}, RedactMac);
}
namespace {
// Matches a string like "MacAddress { octets: [1, 2, 3, 4, 5, 6] }". The two inner capture groups
// are used to replace just the address within the match.
//
// Capture group 1 spans the whole match, group 2 captures the leading "MacAddress { " and group 3
// captures the trailing " }". Each of the 6 list elements is 1 to 3 hexadecimal digits.
constexpr re2::LazyRE2 kFidlMac =
MakeLazyRE2(R"(((MacAddress { )octets: \[(?:[0-9a-fA-F]{1,3}, ){5}[0-9a-fA-F]{1,3}]( })))");
// Produces the replacement for |match|, text that matched |kFidlMac|: everything between the
// second and third capture groups is swapped for "<REDACTED-MAC: N>", where N is the id |cache|
// returns for the whole of |match|.
std::string RedactFidlMac(RedactionIdCache& cache, const std::string& match) {
  const int id = cache.GetId(match);

  std::string redacted(match);
  RE2::GlobalReplace(&redacted, *kFidlMac, fxl::StringPrintf("\\2<REDACTED-MAC: %d>\\3", id));
  return redacted;
}
} // namespace
// Returns a Replacer that finds text matching the |kFidlMac| pattern and replaces each match with
// the result of RedactFidlMac(). No ignore-prefixes are configured.
Replacer ReplaceFidlMac() {
return FunctionBasedReplacer(kFidlMac->pattern(), /*ignore_prefixes=*/{}, RedactFidlMac);
}
namespace {
// The SSID identifier contains at most 32 pairs of hexadecimal characters, but match any number so
// SSID identifiers with the wrong number of hexadecimal characters are also redacted.
//
// The whole "<ssid-...>" token, angle brackets included, is the single capturing group. Zero
// hexadecimal characters are accepted too, so "<ssid->" matches.
constexpr std::string_view kSsidPattern = R"((<ssid-[[:xdigit:]]*>))";
// Produces the replacement for the SSID identifier |match|: "<REDACTED-SSID: N>", where N is the
// id |cache| returns for |match|.
std::string RedactSsid(RedactionIdCache& cache, const std::string& match) {
  return fxl::StringPrintf("<REDACTED-SSID: %d>", static_cast<int>(cache.GetId(match)));
}
} // namespace
// Returns a Replacer that finds text matching |kSsidPattern| and replaces each match with the
// result of RedactSsid(). No ignore-prefixes are configured.
Replacer ReplaceSsid() {
return FunctionBasedReplacer(kSsidPattern, /*ignore_prefixes=*/{}, RedactSsid);
}
} // namespace forensics