| /* |
| * Copyright (C) 2021 The Android Open Source Project |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| #ifndef SRC_PROTOZERO_FILTERING_FILTER_BYTECODE_PARSER_H_ |
| #define SRC_PROTOZERO_FILTERING_FILTER_BYTECODE_PARSER_H_ |
| |
| #include <stddef.h> |
| #include <stdint.h> |
| |
| #include <vector> |
| |
| namespace protozero { |
| |
| // Loads the proto-encoded bytecode in memory and allows fast lookups for tuples |
| // (msg_index, field_id) to tell if a given field should be allowed or not and, |
| // in the case of nested fields, what is the next message index to recurse into. |
| // This class does two things: |
| // 1. Expands the array of varint from the proto into a vector<uint32_t>. This |
| // is to avoid performing varint decoding on every lookup, at the cost of |
| // some extra memory (2KB-4KB). Note that the expanded vector is not just a |
| // 1:1 copy of the proto one (more below). This is to avoid O(Fields) linear |
| // lookup complexity. |
| // 2. Creates an index of offsets to remember the start word for each message. |
| // This is so we can jump to O(1) to the N-th message when recursing into a |
| // nested fields, without having to scan and find the (N-1)-th END_OF_MESSAGE |
| // marker. |
| // Overall lookups are O(1) for field ids < 128 (kDirectlyIndexLimit) and O(N), |
| // with N being the number of allowed field ranges for other fields. |
| // See comments around |word_| below for the structure of the word vector. |
| class FilterBytecodeParser { |
| public: |
| // Result of a Query() operation |
| struct QueryResult { |
| bool allowed; // Whether the field is allowed at all or no. |
| |
| // If |allowed|==true && simple_field()==false, this tells the message index |
| // of the nested field that should be used when recursing in the parser. |
| uint32_t nested_msg_index; |
| |
| // If |allowed|==true, specifies if the field is of a simple type (varint, |
| // fixed32/64, string or byte) or a nested field that needs recursion. |
| // In the latter case the caller is expected to use |nested_msg_index| for |
| // the next Query() calls. |
| bool simple_field() const { return nested_msg_index == kSimpleField; } |
| }; |
| |
| // Loads a filter. The filter data consists of a sequence of varints which |
| // contains the filter opcodes and a final checksum. |
| bool Load(const void* filter_data, size_t len); |
| |
| // Checks wheter a given field is allowed or not. |
| // msg_index = 0 is the index of the root message, where all queries should |
| // start from (typically perfetto.protos.Trace). |
| QueryResult Query(uint32_t msg_index, uint32_t field_id); |
| |
| void Reset(); |
| void set_suppress_logs_for_fuzzer(bool x) { suppress_logs_for_fuzzer_ = x; } |
| |
| private: |
| static constexpr uint32_t kDirectlyIndexLimit = 128; |
| static constexpr uint32_t kAllowed = 1u << 31u; |
| static constexpr uint32_t kSimpleField = 0x7fffffff; |
| |
| bool LoadInternal(const uint8_t* filter_data, size_t len); |
| |
| // The state of all fields for all messages is stored in one contiguous array. |
| // This is to avoid memory fragmentation and allocator overhead. |
| // We expect a high number of messages (hundreds), but each message is small. |
| // For each message we store two sets of uint32: |
| // 1. A set of "directly indexed" fields, for field ids < 128. |
| // 2. The remainder is a set of ranges. |
| // So each message descriptor consists of a sequence of words as follows: |
| // |
| // [0] -> how many directly indexed fields are stored next (up to 128) |
| // |
| // [1..N] -> One word per field id (See "field state" below). |
| // |
| // [N + 1] -> Start of field id range 1 |
| // [N + 2] -> End of field id range 1 (exclusive, STL-style). |
| // [N + 3] -> Field state for fields in range 1 (below) |
| // |
| // [N + 4] -> Start of field id range 2 |
| // [N + 5] -> End of field id range 2 (exclusive, STL-style). |
| // [N + 6] -> Field state for fields in range 2 (below) |
| |
| // The "field state" word is as follows: |
| // Bit 31: 0 if the field is disallowed, 1 if allowed. |
| // Only directly indexed fields can be 0 (it doesn't make sense to add |
| // a range and then say "btw it's NOT allowed".. don't add it then. |
| // 0 is only used for filling gaps in the directly indexed bucket. |
| // Bits [30..0] (only when MSB == allowed): |
| // 0x7fffffff: The field is "simple" (varint, fixed32/64, string, bytes) and |
| // can be directly passed through in output. No recursion is needed. |
| // [0, 7ffffffe]: The field is a nested submessage. The value is the index |
| // that must be passed as first argument to the next Query() calls. |
| // Note that the message index is purely a monotonic counter in the |
| // filter bytecode, has no proto-equivalent match (unlike field ids). |
| std::vector<uint32_t> words_; |
| |
| // One entry for each message index stored in the filter plus a sentinel at |
| // the end. Maps each message index to the offset in |words_| where the |
| // Nth message start. |
| // message_offset_.size() - 2 == the max message id that can be parsed. |
| std::vector<uint32_t> message_offset_; |
| |
| bool suppress_logs_for_fuzzer_ = false; |
| }; |
| |
| } // namespace protozero |
| |
| #endif // SRC_PROTOZERO_FILTERING_FILTER_BYTECODE_PARSER_H_ |