|  | // Copyright 2020 The Fuchsia Authors. All rights reserved. | 
|  | // Use of this source code is governed by a BSD-style license that can be | 
|  | // found in the LICENSE file. | 
|  |  | 
|  | #ifndef SRC_LIB_CHUNKED_COMPRESSION_CHUNKED_ARCHIVE_H_ | 
|  | #define SRC_LIB_CHUNKED_COMPRESSION_CHUNKED_ARCHIVE_H_ | 
|  |  | 
|  | #include <optional> | 
|  |  | 
|  | #include <fbl/algorithm.h> | 
|  | #include <fbl/array.h> | 
|  | #include <fbl/macros.h> | 
|  |  | 
|  | #include "status.h" | 
|  |  | 
|  | // Format specification for chunked archives. | 
|  | // | 
|  | // A chunked archive has a Header followed by zero or more Frames. | 
|  | // | 
|  | // ## Header | 
|  | // | 
|  | // The header describes the format of the archive and contains the Seek Table which maps the | 
|  | // compressed frames to decompressed space. | 
|  | // | 
|  | // This header describes *Version 2* of the format. All other versions are unsupported. | 
|  | // | 
|  | //       0     1     2     3     4     5     6     7 | 
|  | //    +-----+-----+-----+-----+-----+-----+-----+-----+ | 
|  | //  0 |                 Magic Number                  | | 
|  | //    +-----+-----+-----+-----+-----+-----+-----+-----+ | 
|  | //  8 |  Version  |  Reserved |       Num Frames      |  // Reserved bytes must be zero. | 
|  | //    +-----+-----+-----+-----+-----+-----+-----+-----+ | 
|  | // 16 |    Header CRC32       |        Reserved       |  // Reserved bytes must be zero. | 
|  | //    +-----+-----+-----+-----+-----+-----+-----+-----+ | 
|  | // 24 |                    Reserved                   |  // Reserved bytes must be zero. | 
|  | //    +-----+-----+-----+-----+-----+-----+-----+-----+ | 
|  | // 32 |                                               | | 
|  | // 40 |                   Seek Table                  | | 
|  | // 48 |                     Entry                     | | 
|  | // 56 |                                               | | 
|  | //    +-----+-----+-----+-----+-----+-----+-----+-----+ | 
|  | // .. |                                               | | 
|  | // .. |                   Seek Table                  | | 
|  | // .. |                     Entry                     | | 
|  | // .. |                                               | | 
|  | //    +-----+-----+-----+-----+-----+-----+-----+-----+ | 
|  | // | 
|  | // The Header CRC32 is computed based on the entire header including each Seek Table Entry. | 
|  | // | 
|  | // ### Seek Table | 
|  | // | 
|  | // Each Seek Table Entry describes a contiguous range of data in the compressed space, and where | 
|  | // in the decompressed data it expands to. | 
|  | // | 
|  | //    +-----+-----+-----+-----+-----+-----+-----+-----+ | 
|  | //  0 |               Decompressed Offset             | | 
|  | //    +-----+-----+-----+-----+-----+-----+-----+-----+ | 
|  | //  8 |                Decompressed Size              | | 
|  | //    +-----+-----+-----+-----+-----+-----+-----+-----+ | 
|  | // 16 |                Compressed Offset              | | 
|  | //    +-----+-----+-----+-----+-----+-----+-----+-----+ | 
|  | // 24 |                 Compressed Size               | | 
|  | //    +-----+-----+-----+-----+-----+-----+-----+-----+ | 
|  | // | 
|  | // Seek table entries are *contiguous* in decompressed space, but may be *discontiguous* in | 
|  | // compressed space. This is to support adding alignment/padding to output files to improve storage | 
|  | // access efficiency. | 
|  | // | 
|  | // A seek table can hold at most 1023 entries (which results in a 32KiB header). | 
|  | // | 
|  | // ### Seek Table Invariants | 
|  | // | 
|  | // I0: The first seek table entry must have decompressed offset 0. | 
|  | // I1: The first seek table entry must have compressed offset greater than or equal to the size of | 
|  | //     the header. | 
|  | // I2: Each entry's decompressed offset must be equal to the end of the previous frame (i.e. to | 
|  | //     the previous frame's decompressed offset+length). | 
|  | // I3: Each entry's compressed offset must be greater than or equal to the end of the previous | 
|  | //     frame (i.e. to the previous frame's compressed offset+length). | 
|  | // I4: Each entry must have a non-zero decompressed and compressed length. | 
|  | // I5: No compressed frame may exceed the end of the file. | 
|  | // | 
|  | // ## Compressed Frames | 
|  | // | 
|  | // The compressed frames are contiguous ranges of bytes stored in the file at the offsets described | 
|  | // by their seek table entry. | 
|  | // | 
|  | // Any ranges of bytes in the file not covered by the seek table are ignored. | 
|  |  | 
|  | namespace chunked_compression { | 
|  |  | 
|  | using ArchiveVersionType = uint16_t; | 
|  | using ChunkCountType = uint32_t; | 
|  |  | 
|  | // The magic number is an arbitrary unique value used to identify files as being of this format.  It | 
|  | // can be derived as follows: | 
|  | // | 
|  | //   sha256sum <<< "Fuchsia is a vivid purplish red color" | head -c16 | 
|  | // | 
|  | constexpr size_t kArchiveMagicLength = sizeof(uint64_t); | 
|  | constexpr uint8_t kChunkArchiveMagic[kArchiveMagicLength] = { | 
|  | 0x46, 0x9b, 0x78, 0xef, 0x0f, 0xd0, 0xb2, 0x03, | 
|  | }; | 
|  | constexpr ArchiveVersionType kVersion = 2u; | 
|  |  | 
|  | constexpr size_t kChunkArchiveMagicOffset = 0ul; | 
|  | constexpr size_t kChunkArchiveVersionOffset = 8ul; | 
|  | constexpr size_t kChunkArchiveReserved1Offset = 10ul; | 
|  | constexpr size_t kChunkArchiveNumChunksOffset = 12ul; | 
|  | constexpr size_t kChunkArchiveHeaderCrc32Offset = 16ul; | 
|  | constexpr size_t kChunkArchiveReserved2Offset = 20ul; | 
|  | constexpr size_t kChunkArchiveSeekTableOffset = 32ul; | 
|  |  | 
|  | // A single entry into the seek table. Describes where an extent of decompressed | 
|  | // data lives in the compressed space. | 
|  | struct SeekTableEntry { | 
|  | uint64_t decompressed_offset; | 
|  | uint64_t decompressed_size; | 
|  | uint64_t compressed_offset; | 
|  | uint64_t compressed_size; | 
|  | }; | 
|  | static_assert(sizeof(SeekTableEntry) == 32ul, "Breaking change to archive format"); | 
|  |  | 
|  | constexpr ChunkCountType kChunkArchiveMaxFrames = 1023; | 
|  |  | 
|  | constexpr size_t kChunkArchiveMinHeaderSize = kChunkArchiveSeekTableOffset; | 
|  | constexpr size_t kChunkArchiveMaxHeaderSize = | 
|  | kChunkArchiveSeekTableOffset + (kChunkArchiveMaxFrames * sizeof(SeekTableEntry)); | 
|  |  | 
|  | // This assert just documents the relationship between the maximum number of frames and the actual | 
|  | // maximum header size (32KiB). | 
|  | static_assert(kChunkArchiveMaxHeaderSize == (32 * 1024), ""); | 
|  |  | 
|  | static_assert(kChunkArchiveMagicOffset == 0ul, "Breaking change to archive format"); | 
|  | static_assert(kChunkArchiveVersionOffset == kChunkArchiveMagicOffset + kArchiveMagicLength, | 
|  | "Breaking change to archive format"); | 
|  | static_assert(kChunkArchiveReserved1Offset == | 
|  | kChunkArchiveVersionOffset + sizeof(ArchiveVersionType), | 
|  | "Breaking change to archive format"); | 
|  | static_assert(kChunkArchiveNumChunksOffset == kChunkArchiveReserved1Offset + sizeof(uint16_t), | 
|  | "Breaking change to archive format"); | 
|  | static_assert(kChunkArchiveHeaderCrc32Offset == | 
|  | kChunkArchiveNumChunksOffset + sizeof(ChunkCountType), | 
|  | "Breaking change to archive format"); | 
|  | static_assert(kChunkArchiveReserved2Offset == kChunkArchiveHeaderCrc32Offset + sizeof(uint32_t), | 
|  | "Breaking change to archive format"); | 
|  | static_assert(kChunkArchiveSeekTableOffset == | 
|  | kChunkArchiveReserved2Offset + sizeof(uint32_t) + sizeof(uint64_t), | 
|  | "Breaking change to archive format"); | 
|  |  | 
|  | // A parsed view of a chunked archive's seek table. | 
|  | // Constructed by parsing a buffer containing a raw archive (see |HeaderReader::Parse()|). | 
|  | class SeekTable { | 
|  | public: | 
|  | SeekTable() = default; | 
|  | SeekTable(SeekTable&& o) = default; | 
|  | SeekTable& operator=(SeekTable&& o) = default; | 
|  | ~SeekTable() = default; | 
|  | DISALLOW_COPY_AND_ASSIGN_ALLOW_MOVE(SeekTable); | 
|  |  | 
|  | // Returns a reference to the seek table of the archive. | 
|  | const fbl::Array<SeekTableEntry>& Entries() const { return entries_; } | 
|  |  | 
|  | // Returns the size of the compressed archive. | 
|  | // Equal to the end of the greatest frame (i.e. its offset + size). | 
|  | size_t CompressedSize() const; | 
|  |  | 
|  | // Returns the size of the serialized header (i.e. everything but the actual compressed frames). | 
|  | size_t SerializedHeaderSize() const; | 
|  |  | 
|  | // Returns the expected size of the archive after decompression. | 
|  | // Equal to the end of the greatest frame (i.e. its offset + size). | 
|  | size_t DecompressedSize() const; | 
|  |  | 
|  | // Lookup functions to find the entry in the seek table which covers |offset| in either the | 
|  | // compressed or decompressed space. | 
|  | // Returns the index into |SeekTable()| where the entry is stored, or std::nullopt if the | 
|  | // offset is not covered. (Note that there can be gaps in the *compressed* frames, but the | 
|  | // decompressed frames are contiguous). | 
|  | std::optional<unsigned> EntryForCompressedOffset(size_t offset) const; | 
|  | std::optional<unsigned> EntryForDecompressedOffset(size_t offset) const; | 
|  |  | 
|  | // Allow HeaderReader to initialize these objects with a validated parsed seek table | 
|  | friend class HeaderReader; | 
|  |  | 
|  | private: | 
|  | fbl::Array<SeekTableEntry> entries_; | 
|  | }; | 
|  |  | 
|  | // HeaderReader reads chunked archive headers and produces in-memory SeekTable representations. | 
|  | class HeaderReader { | 
|  | public: | 
|  | HeaderReader() = default; | 
|  | DISALLOW_COPY_ASSIGN_AND_MOVE(HeaderReader); | 
|  |  | 
|  | // Validates that |data| is a valid chunked archive header and fills |out| with a copy of its | 
|  | // seek table. | 
|  | // |len| must be at least long enough to include the entire header; any actual compressed frames | 
|  | // contained in |data| will not be accessed. | 
|  | // |file_length| is the known length of the overall file. This is used for sanity checking the | 
|  | // entries in the seek table. If any compressed frames exceed this length, the header is assumed | 
|  | // to be corrupted. | 
|  | Status Parse(const void* data, size_t len, size_t file_length, SeekTable* out); | 
|  |  | 
|  | // Share validation routines with HeaderWriter | 
|  | friend class HeaderWriter; | 
|  |  | 
|  | private: | 
|  | static Status CheckMagic(const uint8_t* data, size_t len); | 
|  | static Status CheckVersion(const uint8_t* data, size_t len); | 
|  | static Status CheckChecksum(const uint8_t* data, size_t len); | 
|  | static Status GetNumChunks(const uint8_t* data, size_t len, ChunkCountType* num_chunks_out); | 
|  | static Status ParseSeekTable(const uint8_t* data, size_t len, size_t file_length, | 
|  | fbl::Array<SeekTableEntry>* entries_out); | 
|  | static Status CheckSeekTable(const fbl::Array<SeekTableEntry>& seek_table, size_t header_end, | 
|  | size_t file_length); | 
|  | // NOTE: |prev| is optional, and is nullptr if |entry| is the first entry. | 
|  | // (Ideally this would be a std::optional<const T&>, but optional references are not legal.) | 
|  | static Status CheckSeekTableEntry(const SeekTableEntry& entry, const SeekTableEntry* prev, | 
|  | size_t header_end, size_t file_length); | 
|  |  | 
|  | // Computes the CRC32 checksum for |header|. | 
|  | static uint32_t ComputeChecksum(const uint8_t* header, size_t header_length); | 
|  | }; | 
|  |  | 
|  | // HeaderWriter writes chunked archive headers to a target buffer. | 
|  | class HeaderWriter { | 
|  | public: | 
|  | HeaderWriter() = default; | 
|  | HeaderWriter(HeaderWriter&& o) = default; | 
|  | HeaderWriter& operator=(HeaderWriter&& o) = default; | 
|  | DISALLOW_COPY_AND_ASSIGN_ALLOW_MOVE(HeaderWriter); | 
|  |  | 
|  | // Initializes |out| to write a header to |dst| (which is a buffer of |dst_len| bytes). | 
|  | // Exactly |num_frames| will be written. | 
|  | static Status Create(void* dst, size_t dst_len, size_t num_frames, HeaderWriter* out); | 
|  |  | 
|  | // Computes the number of frames which will be used to compress a |size|-byte input. | 
|  | static size_t NumFramesForDataSize(size_t size, size_t chunk_size) { | 
|  | return fbl::round_up(size, chunk_size) / chunk_size; | 
|  | } | 
|  |  | 
|  | // Computes the size of the header for an archive with |num_frames|. | 
|  | static size_t MetadataSizeForNumFrames(size_t num_frames) { | 
|  | return kChunkArchiveSeekTableOffset + (num_frames * sizeof(SeekTableEntry)); | 
|  | } | 
|  |  | 
|  | // Adds a copy of |entry| to the seek table. | 
|  | // Returns an error if |entry| is invalid, overlaps an existing entry, or if the table is already | 
|  | // full. | 
|  | Status AddEntry(const SeekTableEntry& entry); | 
|  |  | 
|  | // Finishes writing the header out to the target buffer. | 
|  | // | 
|  | // Returns an error if the header was not fully initialized (i.e. not every seek table entry | 
|  | // was filled). | 
|  | // | 
|  | // The target buffer is in an undefined state before Finalize() is called, and should not be | 
|  | // serialized until Finalize() returns successfully. | 
|  | // | 
|  | // The HeaderWriter is in an undefined state after Finalize() returns, regardless of | 
|  | // whether Finalize() succeeded or not. | 
|  | Status Finalize(); | 
|  |  | 
|  | private: | 
|  | HeaderWriter(void* dst, size_t dst_len, size_t num_frames); | 
|  |  | 
|  | uint8_t* dst_ = nullptr; | 
|  | size_t dst_length_; | 
|  | SeekTableEntry* entries_ = nullptr; | 
|  | unsigned current_frame_ = 0; | 
|  | ChunkCountType num_frames_; | 
|  | }; | 
|  |  | 
|  | }  // namespace chunked_compression | 
|  |  | 
|  | #endif  // SRC_LIB_CHUNKED_COMPRESSION_CHUNKED_ARCHIVE_H_ |