blob: 0aa0425f1e8ef4eb59c58f270a115607a4aa3ba1 [file] [log] [blame]
// Copyright 2017 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// This file describes the on-disk structure of Blobfs.
#ifndef SRC_STORAGE_BLOBFS_FORMAT_H_
#define SRC_STORAGE_BLOBFS_FORMAT_H_
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <zircon/assert.h>
#include <zircon/compiler.h>
#include <zircon/types.h>
#include <algorithm>
#include <limits>
#include <ostream>
#include <fbl/algorithm.h>
#include <fbl/macros.h>
#include <safemath/safe_conversions.h>
#include "src/storage/lib/vfs/cpp/journal/format.h"
#ifdef __Fuchsia__
#include <zircon/syscalls.h>
#endif
#include "src/lib/digest/digest.h"
#include "src/lib/digest/merkle-tree.h"
namespace blobfs {
// clang-format off
constexpr uint64_t kBlobfsMagic0 = (0xac2153479e694d21ULL);
constexpr uint64_t kBlobfsMagic1 = (0x985000d4d4d3d314ULL);
// Current version of the format. The major version determines backwards-compatibility. The minor
// version can be freely incremented at any time and does not impact backwards-compatibility; the
// more often it is updated, the more granularly we can find out what the oldest driver that has
// touched a filesystem instance.
//
// Minimally, the minor version should be incremented whenever a (backwards-compatible) format
// change is made, but it can also be incremented when major logic changes are made in case there is
// chance of bugs being introduced and we would like to be able to detect if the filesystem has been
// touched by a potentially buggy driver. The kBlobfsCurrentMinorVersion is used to updated the
// oldest_minor_version field in the header when it is opened.
//
// See //src/storage/docs/versioning.md for more.
//
// *************************************************************************************************
//
// IMPORTANT: When changing either kBlobfsCurrentMajorVersion or kBlobfsCurrentMinorVersion:
//
// * Update //third_party/cobalt_config/fuchsia/local_storage/versions.txt
// (submission order does not matter).
//
// * Update //src/storage/blobfs/README.md with what changed.
//
// *************************************************************************************************
constexpr uint32_t kBlobfsCurrentMajorVersion = 0x00000009;
// When this next changes, consider enabling the OldestMinorVersionNotUpdated test.
constexpr uint64_t kBlobfsCurrentMinorVersion = 0x00000004;
// Version 9 introduced a compact merkle tree version. Version 8 uses padded merkle trees. This
// format is controlled by a build flag so version 9 is not necessarily newer (see README.md).
constexpr uint32_t kBlobfsCompactMerkleTreeVersion = 0x00000009;
// Revision 2: introduced a backup superblock.
constexpr uint64_t kBlobfsMinorVersionBackupSuperblock = 0x00000002;
// Revision 3: migrated away from old compression formats.
constexpr uint64_t kBlobfsMinorVersionNoOldCompressionFormats = 0x00000003;
// Revision 4: fixed host-side tool bug which generated a zero-length extent for the null blob.
constexpr uint64_t kBlobfsMinorVersionHostToolHandlesNullBlobCorrectly = 0x00000004;
constexpr uint32_t kBlobFlagClean = 1;
constexpr uint32_t kBlobFlagFVM = 4;
// These constants should all be defined as uint64_t to prevent unintended truncation errors
// when performing arithmetic.
constexpr uint64_t kBlobfsBlockSize = 8192;
constexpr uint64_t kBlobfsBlockBits = (kBlobfsBlockSize * 8);
constexpr uint64_t kBlobfsSuperblockBlocks = 1;
constexpr uint64_t kBlobfsBlockMapStart = 1;
constexpr uint64_t kBlobfsInodeSize = 64;
constexpr uint64_t kBlobfsInodesPerBlock = (kBlobfsBlockSize / kBlobfsInodeSize);
constexpr uint64_t kBlobfsMaxFileSize = kBlobfsBlockSize * std::numeric_limits<uint32_t>::max();
// Known Blobfs metadata locations. Unit of the location is blobfs block.
constexpr uint64_t kSuperblockOffset = 0;
// Blobfs has a backup superblock but only with FVM. The backup superblock only needs to be
// sufficient to get to the journal, since once there, either the primary superblock is valid or
// there's a pending write in the journal to the primary superblock. Note that the backup
// superblock is not there for the purpose of solving random corruption issues i.e. a random
// corruption of the primary superblock will still render the volume unusable. It only exists to
// guard against the potential for corruption whilst updating the primary superblock. In practice,
// after a write to a device but before a successful flush, we will either see the data before the
// write or after the write and not some indeterminate state between so it's unlikely that the
// primary superblock wouldn't be readable, but if it's easy to do so, we should be resilient to the
// case where data is in an indeterminate state between write and flush, and we should have tests
// for this. A backup superblock solves this issue. Regarding random corruption: there are other
// blocks within a filesystem that could be corrupted and would have equally serious consequences,
// so if this is something to be addressed, it might benefit from a holistic solution, although one
// could argue that corruption of the first block on the device is more likely than other blocks.
// Note that there's no need to update the superblock when the primary superblock changes since its
// only purpose is to help locate the journal, so some aspects of the backup superblock are likely
// to be inconsistent.
constexpr uint64_t kFVMBackupSuperblockOffset = 1;
// Blobfs block offset of various filesystem structures, when using the FVM.
constexpr uint64_t kFVMBlockMapStart = 0x10000;
constexpr uint64_t kFVMNodeMapStart = 0x20000;
constexpr uint64_t kFVMJournalStart = 0x30000;
constexpr uint64_t kFVMDataStart = 0x40000;
// clang-format on
// Maximum number of data blocks possible for a single entry:
// - Blobfs Superblock
// - Inode Table Blocks
// - Block Bitmap Blocks
// TODO(https://fxbug.dev/42108049): Calculate the actual upper bound here; this number is not
// necessarily considering the worst cases of fragmentation.
constexpr uint64_t kMaxEntryDataBlocks = 64;
// Minimum possible size for the journal, allowing the maximum size for one entry.
constexpr uint64_t kMinimumJournalBlocks =
fs::kJournalMetadataBlocks + fs::kEntryMetadataBlocks + kMaxEntryDataBlocks;
// This serves as both default inode count when mkfs arguments do not specify
// inode count and as absolute minimum inodes allowed in the fs.
// This value is somewhat arbitrarily chosen. It is large enough to allow us
// to create a few blobs and still small so that resources spent on inodes
// are limited. Mkfs can override this value.
constexpr uint64_t kBlobfsDefaultInodeCount = 10240;
constexpr uint64_t kMinimumDataBlocks = 2;
struct __PACKED alignas(8) Superblock {
uint64_t magic0;
uint64_t magic1;
uint32_t major_version;
uint32_t flags;
uint32_t block_size; // 8K typical.
uint32_t reserved1; // Unused, reserved (for padding).
uint64_t data_block_count; // Number of data blocks in this area.
uint64_t journal_block_count; // Number of journal blocks in this area.
uint64_t inode_count; // Number of blobs in this area.
uint64_t alloc_block_count; // Total number of allocated blocks.
uint64_t alloc_inode_count; // Total number of allocated blobs and container nodes.
// NOTE: prior to https://fuchsia-review.googlesource.com/c/fuchsia/+/404619, |reserved2| was
// explicitly required to be zero. This field may be used for other purposes, but doing so is a
// backwards-incompatible change.
uint64_t reserved2; // Unused.
// The following 6 fields are only valid with (flags & kBlobFlagFVM):
uint64_t slice_size; // Underlying slice size.
uint64_t deprecated1; // Unused but not necessarily 0 (saved total vslices in old vers.).
uint32_t abm_slices; // Slices allocated to block bitmap.
uint32_t ino_slices; // Slices allocated to node map.
uint32_t dat_slices; // Slices allocated to file data section.
uint32_t journal_slices; // Slices allocated to journal section.
// End FVM-specific fields
uint8_t zeroes[8]; // Padding. Set to zeroes, can be reclaimed.
// The oldest minor version corresponding to the kBlobfsCurrentMinorVersion of the software that
// has written to this blobfs instance. When opening for writes, the driver should check this and
// lower it if the current revision is lower than the one stored in this header. This does not say
// anything about backwards-compatibility, that is determined by major_version above.
//
// See //src/storage/docs/versioning.md for more.
uint64_t oldest_minor_version;
uint8_t reserved[8064];
};
static_assert(sizeof(Superblock) == kBlobfsBlockSize, "Invalid blobfs superblock size");
static_assert(kBlobfsBlockSize <= std::numeric_limits<decltype(Superblock::block_size)>::max(),
"Defined block size constant exceeds range of block_size field in superblock!");
constexpr uint64_t SuperblockBlocks(const Superblock& info) { return kBlobfsSuperblockBlocks; }
constexpr uint64_t BlockMapStartBlock(const Superblock& info) {
if (info.flags & kBlobFlagFVM) {
return kFVMBlockMapStart;
} else {
return kBlobfsBlockMapStart;
}
}
constexpr uint64_t BlockMapBlocks(const Superblock& info) {
return fbl::round_up(info.data_block_count, kBlobfsBlockBits) / kBlobfsBlockBits;
}
constexpr uint64_t NodeMapStartBlock(const Superblock& info) {
// Node map immediately follows the block map
if (info.flags & kBlobFlagFVM) {
return kFVMNodeMapStart;
} else {
// Node map immediately follows the block map.
return BlockMapStartBlock(info) + BlockMapBlocks(info);
}
}
constexpr uint64_t NodeBitmapBlocks(const Superblock& info) {
return fbl::round_up(info.inode_count, kBlobfsBlockBits) / kBlobfsBlockBits;
}
constexpr uint64_t NodeMapBlocks(const Superblock& info) {
return fbl::round_up(info.inode_count, kBlobfsInodesPerBlock) / kBlobfsInodesPerBlock;
}
constexpr uint64_t JournalStartBlock(const Superblock& info) {
if (info.flags & kBlobFlagFVM) {
return kFVMJournalStart;
}
// Journal immediately follows the node map.
return NodeMapStartBlock(info) + NodeMapBlocks(info);
}
constexpr uint64_t JournalBlocks(const Superblock& info) { return info.journal_block_count; }
constexpr uint64_t DataStartBlock(const Superblock& info) {
if (info.flags & kBlobFlagFVM) {
return kFVMDataStart;
}
// Data immediately follows the journal.
return JournalStartBlock(info) + JournalBlocks(info);
}
constexpr uint64_t DataBlocks(const Superblock& info) { return info.data_block_count; }
constexpr uint64_t TotalNonDataBlocks(const Superblock& info) {
return SuperblockBlocks(info) + BlockMapBlocks(info) + NodeMapBlocks(info) + JournalBlocks(info);
}
constexpr uint64_t TotalBlocks(const Superblock& info) {
return TotalNonDataBlocks(info) + DataBlocks(info);
}
// States of 'Blob' identified via start block.
constexpr uint64_t kStartBlockMinimum = 1; // Smallest 'data' block possible.
using digest::Digest;
class __PACKED alignas(8) Extent {
public:
static constexpr size_t kBlockOffsetBits = 48;
static constexpr uint64_t kBlockOffsetMax = (1LLU << kBlockOffsetBits) - 1;
static constexpr uint64_t kBlockOffsetMask = kBlockOffsetMax;
static constexpr size_t kBlockCountBits = 16;
static constexpr uint64_t kBlockCountMax = (1LLU << kBlockCountBits) - 1;
static constexpr uint64_t kBlockCountMask = kBlockCountMax << kBlockOffsetBits;
Extent() = default;
// Create an extent starting at |start_block| blocks, spanning |length_blocks|.
// |start_block| must be <= kBlockOffsetMax and |length_blocks| must be <= kBlockCountMax.
// NOLINTNEXTLINE(bugprone-easily-swappable-parameters)
Extent(uint64_t start_block, uint64_t length_blocks) {
SetStart(start_block);
SetLength(length_blocks);
}
// Get the start block offset this Extent refers to.
uint64_t Start() const { return data_ & kBlockOffsetMask; }
// Set the start block offset of this Extent. |start_block| must be <= kBlockOffsetMax.
void SetStart(uint64_t start_block) {
ZX_ASSERT(start_block <= kBlockOffsetMax);
data_ = (data_ & ~kBlockOffsetMask) | (start_block & kBlockOffsetMask);
}
// Get the length, in blocks, of this Extent.
uint64_t Length() const { return (data_ & kBlockCountMask) >> kBlockOffsetBits; }
// Set the length, in blocks, of this Extent. |length_blocks| must be <= kBlockCountMask
void SetLength(uint64_t length_blocks) {
ZX_ASSERT(length_blocks <= kBlockCountMax);
data_ = (data_ & ~kBlockCountMask) | ((length_blocks & kBlockCountMax) << kBlockOffsetBits);
}
bool operator==(const Extent& rhs) const {
return Start() == rhs.Start() && Length() == rhs.Length();
}
private:
uint64_t data_ = 0;
};
// This is inlined because format.cc only compiles on Fuchsia builds (not host).
inline std::ostream& operator<<(std::ostream& stream, const Extent& extent) {
stream << "{start:" << extent.Start() << ", len:" << extent.Length() << "}";
return stream;
}
template <size_t N>
inline std::ostream& operator<<(std::ostream& stream, const Extent (&extents)[N]) {
stream << "[";
for (size_t i = 0; i < N; ++i) {
if (i > 0) {
stream << ", ";
}
stream << extents[i];
}
stream << "]";
return stream;
}
static_assert(sizeof(Extent) == sizeof(uint64_t), "Extent class should only contain data");
// The largest node id representable in a node list.
constexpr uint32_t kMaxNodeId = 0xffffffffu;
// Identifies that the node is allocated.
// Both inodes and extent containers can be allocated.
constexpr uint16_t kBlobFlagAllocated = 1 << 0;
// Bits 1, 3, and 4 were used for obsolete compression flags. They were set only for old internal
// test builds and no devices were released with these flags. They can be assumed to be 0 if needed
// for future uses.
// Identifies that this node is a container for extents.
constexpr uint16_t kBlobFlagExtentContainer = 1 << 2;
// Identifies that the on-disk storage of the blob is chunk-compression compressed.
constexpr uint16_t kBlobFlagChunkCompressed = 1 << 5;
// When adding another compression flag, it must be added to kBlobFlagMaskAnyCompression below.
// Bitmask of all compression flags (this allows additional flags to be added more easily).
constexpr uint16_t kBlobFlagMaskAnyCompression = kBlobFlagChunkCompressed;
// This mask is the mask of all valid flag bits, but we should be tolerant if we encounter bits set
// on a filesystem in case the filesystem has been touched by a future version of blobfs.
constexpr uint16_t kBlobFlagMaskValid =
kBlobFlagAllocated | kBlobFlagExtentContainer | kBlobFlagMaskAnyCompression;
// The number of extents within a normal inode.
constexpr uint64_t kInlineMaxExtents = 1;
// The number of extents within an extent container node.
constexpr uint64_t kContainerMaxExtents = 6;
constexpr uint16_t kBlobNodeVersion = 0;
struct __PACKED alignas(8) NodePrelude {
uint16_t flags = 0;
uint16_t version = kBlobNodeVersion;
// The next node containing this blob's extents.
// Should not be used or read if there are no more extents.
uint32_t next_node = 0;
bool IsAllocated() const { return flags & kBlobFlagAllocated; }
bool IsExtentContainer() const { return flags & kBlobFlagExtentContainer; }
bool IsInode() const { return !IsExtentContainer(); }
bool IsCompressedZstdChunked() const { return flags & kBlobFlagChunkCompressed; }
};
// This is inlined because format.cc only compiles on Fuchsia builds (not host).
inline std::ostream& operator<<(std::ostream& stream, const NodePrelude& prelude) {
stream << "Node {allocated:" << prelude.IsAllocated() << " is_inode:" << prelude.IsInode()
<< " version:" << prelude.version << " next_node:" << prelude.next_node << "}";
return stream;
}
struct ExtentContainer;
struct __PACKED alignas(8) Inode {
NodePrelude header;
uint8_t merkle_root_hash[digest::kSha256Length];
uint64_t blob_size;
// The total number of Blocks used to represent this blob.
uint32_t block_count;
// The total number of Extent objects necessary to represent this blob.
// Identifies when to stop iterating through the node list.
uint16_t extent_count;
uint16_t reserved;
Extent extents[kInlineMaxExtents];
ExtentContainer* AsExtentContainer() { return reinterpret_cast<ExtentContainer*>(this); }
bool IsCompressed() const { return header.flags & kBlobFlagMaskAnyCompression; }
};
// The largest number of extents which can compose a blob.
constexpr uint64_t kMaxExtentsPerBlob = std::numeric_limits<decltype(Inode::extent_count)>::max();
// The largest amount of blocks a given blob can span.
constexpr uint64_t kMaxBlocksPerBlob = kMaxExtentsPerBlob * Extent::kBlockCountMax;
static_assert(kMaxBlocksPerBlob <= std::numeric_limits<decltype(Inode::block_count)>::max(),
"Max blocks per blob exceeds maximum block count in Inode!");
// This is inlined because format.cc only compiles on Fuchsia builds (not host).
inline std::ostream& operator<<(std::ostream& stream, const Inode& inode) {
digest::Digest d(inode.merkle_root_hash);
stream << "Inode {header:" << inode.header << " merkle:" << d.ToString()
<< " blob_size:" << inode.blob_size << " block_count:" << inode.block_count
<< " extent_count:" << inode.extent_count << " extents:" << inode.extents << "}";
return stream;
}
struct __PACKED alignas(8) ExtentContainer {
NodePrelude header;
// The map index of the previous node.
uint32_t previous_node;
// The number of extents within this container.
uint16_t extent_count;
uint16_t reserved;
Extent extents[kContainerMaxExtents];
};
// This is inlined because format.cc only compiles on Fuchsia builds (not host).
inline std::ostream& operator<<(std::ostream& stream, const ExtentContainer& container) {
stream << "ExtentContainer {header:" << container.header
<< " prev_node:" << container.previous_node << " extent_count:" << container.extent_count
<< " extents:" << container.extents << "}";
return stream;
}
static_assert(sizeof(Inode) == sizeof(ExtentContainer), "Extent nodes must be as large as inodes");
static_assert(sizeof(Inode) == kBlobfsInodeSize, "Blobfs Inode size is wrong");
static_assert(kBlobfsBlockSize % kBlobfsInodeSize == 0,
"Blobfs Inodes should fit cleanly within a blobfs block");
} // namespace blobfs
#endif // SRC_STORAGE_BLOBFS_FORMAT_H_