// blob: 6760319c49f8320784f47d77fdf79896eac9b80f [file] [log] [blame] [edit]
// Copyright 2017 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// This file describes the on-disk structure of Blobfs.
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <zircon/assert.h>
#include <zircon/compiler.h>
#include <zircon/types.h>
#include <algorithm>
#include <limits>
#include <ostream>
#include <digest/digest.h>
#include <digest/merkle-tree.h>
#include <fbl/algorithm.h>
#include <fbl/macros.h>
#include <fs/journal/format.h>
// zircon/syscalls.h is only available when building for Fuchsia itself (not for host tools).
#ifdef __Fuchsia__
#include <zircon/syscalls.h>
#endif  // __Fuchsia__
namespace blobfs {
// clang-format off
// Blobfs magic values; the names mirror the |magic0| / |magic1| fields of the superblock.
constexpr uint64_t kBlobfsMagic0 = UINT64_C(0xac2153479e694d21);
constexpr uint64_t kBlobfsMagic1 = UINT64_C(0x985000d4d4d3d314);
// Current version of the format and the revision of the software. The format version determines
// backwards-compatibility. The revision can be freely incremented at any time and does not impact
// backwards-compatibility; the more often it is updated, the more granularly we can find out what
// the oldest revision of the driver is that has touched a filesystem instance.
// Minimally, the revision should be incremented whenever a (backwards-compatible) format change is
// made, but it can also be incremented when major logic changes are made in case there is chance of
// bugs being introduced and we would like to be able to detect if the filesystem has been touched
// by a potentially buggy driver.
// The revision is used to update the oldest_revision field in the header.
// See //src/storage/docs/ for more.
// *************************************************************************************************
// * IMPORTANT: When changing either kBlobfsCurrentFormatVersion or kBlobfsCurrentRevision, be *
// * sure to make an appropriate change to *
// * //third_party/cobalt_config/fuchsia/local_storage/versions.txt (submission order does not *
// * matter). *
// *************************************************************************************************
constexpr uint32_t kBlobfsCurrentFormatVersion = 9;

// Format version 9 is the one that introduced the compact merkle tree.
constexpr uint32_t kBlobfsCompactMerkleTreeVersion = 9;

// Revision 2 brought in the backup superblock.
constexpr uint64_t kBlobfsRevisionBackupSuperblock = 2;

// Revision 3 moved away from the old compression formats.
constexpr uint64_t kBlobfsRevisionNoOldCompressionFormats = 3;

// Revision 4 fixed a host-side tool bug that produced a zero-length extent for the null blob.
constexpr uint64_t kBlobfsRevisionHostToolHandlesNullBlobCorrectly = 4;

// When this next changes, consider enabling the OldestRevisionNotUpdated test.
constexpr uint64_t kBlobfsCurrentRevision = 4;

// Flag bits. kBlobFlagFVM is the bit tested against Superblock::flags by the layout helpers.
constexpr uint32_t kBlobFlagClean = 1u << 0;
constexpr uint32_t kBlobFlagFVM = 1u << 2;

// Basic geometry of the filesystem.
constexpr uint32_t kBlobfsBlockSize = 8 * 1024;
constexpr uint32_t kBlobfsBlockBits = 8 * kBlobfsBlockSize;
constexpr uint32_t kBlobfsSuperblockBlocks = 1;
constexpr uint32_t kBlobfsBlockMapStart = 1;
constexpr uint32_t kBlobfsInodeSize = 64;
constexpr uint32_t kBlobfsInodesPerBlock = kBlobfsBlockSize / kBlobfsInodeSize;
constexpr uint64_t kBlobfsMaxFileSize = sizeof(uint32_t) * kBlobfsBlockSize;
// Known Blobfs metadata locations, expressed in units of blobfs blocks.
constexpr size_t kSuperblockOffset{0};
// Blobfs has a backup superblock but only with FVM. The backup superblock only needs to be
// sufficient to get to the journal, since once there, either the primary superblock is valid or
// there's a pending write in the journal to the primary superblock. Note that the backup
// superblock is not there for the purpose of solving random corruption issues i.e. a random
// corruption of the primary superblock will still render the volume unusable. It only exists to
// guard against the potential for corruption whilst updating the primary superblock. In practice,
// after a write to a device but before a successful flush, we will either see the data before the
// write or after the write and not some indeterminate state in between so it's unlikely that the
// primary superblock wouldn't be readable, but if it's easy to do so, we should be resilient to the
// case where data is in an indeterminate state between write and flush, and we should have tests
// for this. A backup superblock solves this issue. Regarding random corruption: there are other
// blocks within a filesystem that could be corrupted and would have equally serious consequences,
// so if this is something to be addressed, it might benefit from a holistic solution, although one
// could argue that corruption of the first block on the device is more likely than other blocks.
// Note that there's no need to update the backup superblock when the primary superblock changes
// since its only purpose is to help locate the journal, so some aspects of the backup superblock
// are likely to be inconsistent.
constexpr size_t kFVMBackupSuperblockOffset{1};

// When running on top of FVM, each filesystem structure starts at a fixed blobfs block offset.
constexpr size_t kFVMBlockMapStart = 0x1'0000;
constexpr size_t kFVMNodeMapStart = 0x2'0000;
constexpr size_t kFVMJournalStart = 0x3'0000;
constexpr size_t kFVMDataStart = 0x4'0000;
// clang-format on
// Maximum number of data blocks possible for a single entry:
// - Blobfs Superblock
// - Inode Table Blocks
// - Block Bitmap Blocks
// TODO: Calculate the actual upper bound here; this number is not
// necessarily considering the worst cases of fragmentation.
constexpr uint32_t kMaxEntryDataBlocks{64};

// Smallest journal that still has room for one entry of the maximum size: the journal's own
// metadata, the entry's metadata and the entry's data blocks.
constexpr size_t kMinimumJournalBlocks =
    fs::kJournalMetadataBlocks + fs::kEntryMetadataBlocks + kMaxEntryDataBlocks;

// Doubles as the default journal size and as the minimum journal size. The value is somewhat
// arbitrary: large enough to run transactions, small enough to limit the resources spent on the
// journal. Mkfs can override this value.
constexpr size_t kDefaultJournalBlocks = std::max<size_t>(kMinimumJournalBlocks, 16);
// Doubles as the default inode count (used when the mkfs arguments do not specify one) and as
// the absolute minimum number of inodes allowed in the filesystem. The value is somewhat
// arbitrary: large enough to create a few blobs, small enough to limit the resources spent on
// inodes. Mkfs can override this value.
constexpr uint64_t kBlobfsDefaultInodeCount = 10 * 1024;

constexpr size_t kMinimumDataBlocks{2};
struct __PACKED alignas(8) Superblock {
uint64_t magic0;
uint64_t magic1;
uint32_t format_version;
uint32_t flags;
uint32_t block_size; // 8K typical.
uint32_t reserved1; // Unused, reserved (for padding).
uint64_t data_block_count; // Number of data blocks in this area.
uint64_t journal_block_count; // Number of journal blocks in this area.
uint64_t inode_count; // Number of blobs in this area.
uint64_t alloc_block_count; // Total number of allocated blocks.
uint64_t alloc_inode_count; // Total number of allocated blobs and container nodes.
// NOTE: prior to, |reserved2| was
// explicitly required to be zero. This field may be used for other purposes, but doing so is a
// backwards-incompatible change.
uint64_t reserved2; // Unused.
// The following 6 fields are only valid with (flags & kBlobFlagFVM):
uint64_t slice_size; // Underlying slice size.
uint64_t deprecated1; // Unused but not necessarily 0 (saved total vslices in old vers.).
uint32_t abm_slices; // Slices allocated to block bitmap.
uint32_t ino_slices; // Slices allocated to node map.
uint32_t dat_slices; // Slices allocated to file data section.
uint32_t journal_slices; // Slices allocated to journal section.
// End FVM-specific fields
uint8_t zeroes[8]; // Padding. Set to zeroes, can be reclaimed.
// The oldest revision of the software that has written to this blobfs instance. When opening for
// writes, the driver should check this and lower it if the current revision is lower than the one
// stored in this header. This does not say anything about backwards-compatibility, that is
// determined by format_version above.
// See //src/storage/docs/ for more.
uint64_t oldest_revision;
uint8_t reserved[8064];
static_assert(sizeof(Superblock) == kBlobfsBlockSize, "Invalid blobfs superblock size");
// Number of blocks taken up by the superblock. The result is a constant; |info| is not consulted.
constexpr uint64_t SuperblockBlocks(const Superblock& info) {
  return kBlobfsSuperblockBlocks;
}
// Returns the block at which the block map starts: the fixed FVM offset when the FVM flag is
// set in |info|, otherwise the non-FVM start block.
constexpr uint64_t BlockMapStartBlock(const Superblock& info) {
  if (info.flags & kBlobFlagFVM) {
    return kFVMBlockMapStart;
  }
  return kBlobfsBlockMapStart;
}
// Returns the number of blocks needed to hold one bit per data block described by |info|.
constexpr uint64_t BlockMapBlocks(const Superblock& info) {
  return fbl::round_up(info.data_block_count, kBlobfsBlockBits) / kBlobfsBlockBits;
}
// Returns the block at which the node map starts.
constexpr uint64_t NodeMapStartBlock(const Superblock& info) {
  // With FVM the node map lives at a fixed offset.
  if (info.flags & kBlobFlagFVM) {
    return kFVMNodeMapStart;
  }
  // Without FVM the node map immediately follows the block map.
  return BlockMapStartBlock(info) + BlockMapBlocks(info);
}
// Returns the number of blocks needed to hold one bit per inode described by |info|.
constexpr uint64_t NodeBitmapBlocks(const Superblock& info) {
  return fbl::round_up(info.inode_count, kBlobfsBlockBits) / kBlobfsBlockBits;
}
// Returns the number of blocks needed to store |info.inode_count| inodes, rounding up to a whole
// block.
constexpr uint64_t NodeMapBlocks(const Superblock& info) {
  return fbl::round_up(info.inode_count, kBlobfsInodesPerBlock) / kBlobfsInodesPerBlock;
}
// Returns the block at which the journal starts.
constexpr uint64_t JournalStartBlock(const Superblock& info) {
  // With FVM the journal lives at a fixed offset.
  if (info.flags & kBlobFlagFVM) {
    return kFVMJournalStart;
  }
  // Journal immediately follows the node map.
  return NodeMapStartBlock(info) + NodeMapBlocks(info);
}
// Number of journal blocks, as recorded in the superblock.
constexpr uint64_t JournalBlocks(const Superblock& info) {
  return info.journal_block_count;
}
// Returns the block at which the data section starts.
constexpr uint64_t DataStartBlock(const Superblock& info) {
  // With FVM the data section lives at a fixed offset.
  if (info.flags & kBlobFlagFVM) {
    return kFVMDataStart;
  }
  // Data immediately follows the journal.
  return JournalStartBlock(info) + JournalBlocks(info);
}
// Number of data blocks, as recorded in the superblock.
constexpr uint64_t DataBlocks(const Superblock& info) {
  return info.data_block_count;
}
// Returns the combined number of superblock, block map, node map and journal blocks.
constexpr uint64_t TotalNonDataBlocks(const Superblock& info) {
  return SuperblockBlocks(info) + BlockMapBlocks(info) + NodeMapBlocks(info) + JournalBlocks(info);
}
// Returns the number of metadata blocks plus the number of data blocks described by |info|.
constexpr uint64_t TotalBlocks(const Superblock& info) {
  return TotalNonDataBlocks(info) + DataBlocks(info);
}
// States of 'Blob' identified via start block.
// Smallest 'data' block possible.
constexpr uint64_t kStartBlockMinimum{1};
using digest::Digest;
// Bit layout of a packed extent: the low |kBlockOffsetBits| bits hold the block offset and the
// |kBlockCountBits| bits above them hold the block count.
using BlockOffsetType = uint64_t;
constexpr size_t kBlockOffsetBits = 48;
constexpr BlockOffsetType kBlockOffsetMax = (BlockOffsetType{1} << kBlockOffsetBits) - 1;
constexpr uint64_t kBlockOffsetMask = kBlockOffsetMax;

using BlockCountType = uint16_t;
constexpr size_t kBlockCountBits = 16;
constexpr size_t kBlockCountMax = std::numeric_limits<BlockCountType>::max();
// Widen to 64 bits before shifting: |kBlockCountMax| is a size_t, and a 48-bit shift would
// overflow on targets where size_t is only 32 bits wide.
constexpr uint64_t kBlockCountMask = static_cast<uint64_t>(kBlockCountMax) << kBlockOffsetBits;

static_assert(kBlockOffsetBits + kBlockCountBits == 64,
              "Offset and count must exactly fill a 64-bit word");
class Extent {
Extent() = default;
Extent(BlockOffsetType start, BlockCountType length) {
BlockOffsetType Start() const { return data_ & kBlockOffsetMask; }
void SetStart(BlockOffsetType start) {
ZX_DEBUG_ASSERT(start <= kBlockOffsetMax);
data_ = (data_ & ~kBlockOffsetMask) | (start & kBlockOffsetMask);
BlockCountType Length() const {
return static_cast<BlockCountType>((data_ & kBlockCountMask) >> kBlockOffsetBits);
void SetLength(BlockCountType length) {
data_ = (data_ & ~kBlockCountMask) | ((length & kBlockCountMax) << kBlockOffsetBits);
bool operator==(const Extent& rhs) const {
return Start() == rhs.Start() && Length() == rhs.Length();
uint64_t data_ = 0;
// Writes |extent| to |stream| as "{start:<start>, len:<length>}" and returns |stream|.
// This is inlined because it only compiles on Fuchsia builds (not host).
inline std::ostream& operator<<(std::ostream& stream, const Extent& extent) {
  stream << "{start:" << extent.Start() << ", len:" << extent.Length() << "}";
  return stream;
}
// Writes a fixed-size array of extents to |stream| as a comma-separated list in square brackets
// and returns |stream|.
template <size_t N>
inline std::ostream& operator<<(std::ostream& stream, const Extent (&extents)[N]) {
  stream << "[";
  for (size_t i = 0; i < N; ++i) {
    // Separator goes between elements only, never before the first one.
    if (i > 0) {
      stream << ", ";
    }
    stream << extents[i];
  }
  stream << "]";
  return stream;
}
// An Extent must be exactly as large as the single 64-bit word it wraps.
static_assert(sizeof(uint64_t) == sizeof(Extent), "Extent class should only contain data");
// Type used to count the extents of a single blob.
using ExtentCountType = uint16_t;

// Upper bound on the number of extents a blob can be composed of.
constexpr size_t kMaxBlobExtents{std::numeric_limits<ExtentCountType>::max()};

// Largest node id that a node list can represent.
constexpr uint32_t kMaxNodeId = std::numeric_limits<uint32_t>::max();
// Identifies that the node is allocated.
// Both inodes and extent containers can be allocated.
constexpr uint16_t kBlobFlagAllocated = 1 << 0;

// Identifies that the on-disk storage of the blob is LZ4 compressed.
constexpr uint16_t kBlobFlagLZ4Compressed = 1 << 1;

// Identifies that this node is a container for extents.
constexpr uint16_t kBlobFlagExtentContainer = 1 << 2;

// Identifies that the on-disk storage of the blob is ZSTD compressed.
constexpr uint16_t kBlobFlagZSTDCompressed = 1 << 3;

// Identifies that the on-disk storage of the blob is ZSTD-seekable compressed.
constexpr uint16_t kBlobFlagZSTDSeekableCompressed = 1 << 4;

// Identifies that the on-disk storage of the blob is chunk-compression compressed.
constexpr uint16_t kBlobFlagChunkCompressed = 1 << 5;

// When adding another compression flag, it must be added to
// kBlobFlagMaskAnyCompression below.

// Bitmask of all compression flags.
constexpr uint16_t kBlobFlagMaskAnyCompression =
    (kBlobFlagLZ4Compressed | kBlobFlagZSTDCompressed | kBlobFlagZSTDSeekableCompressed |
     kBlobFlagChunkCompressed);
// How many extents are stored directly inside a normal inode.
constexpr uint32_t kInlineMaxExtents{1};

// How many extents are stored inside an extent container node.
constexpr uint32_t kContainerMaxExtents{6};
struct __PACKED NodePrelude {
uint16_t flags;
uint16_t version;
// The next node containing this blob's extents.
// Should not be used or read if there are no more extents.
uint32_t next_node;
bool IsAllocated() const { return flags & kBlobFlagAllocated; }
bool IsExtentContainer() const { return flags & kBlobFlagExtentContainer; }
bool IsInode() const { return !IsExtentContainer(); }
// Writes the allocation state, node kind, version and next-node index of |prelude| to |stream|
// and returns |stream|.
// This is inlined because it only compiles on Fuchsia builds (not host).
inline std::ostream& operator<<(std::ostream& stream, const NodePrelude& prelude) {
  stream << "Node {allocated:" << prelude.IsAllocated() << " is_inode:" << prelude.IsInode()
         << " version:" << prelude.version << " next_node:" << prelude.next_node << "}";
  return stream;
}
struct ExtentContainer;
struct __PACKED alignas(8) Inode {
NodePrelude header;
uint8_t merkle_root_hash[digest::kSha256Length];
uint64_t blob_size;
// The total number of Blocks used to represent this blob.
uint32_t block_count;
// The total number of Extent objects necessary to represent this blob.
// Identifies when to stop iterating through the node list.
ExtentCountType extent_count;
uint16_t reserved;
Extent extents[kInlineMaxExtents];
ExtentContainer* AsExtentContainer() { return reinterpret_cast<ExtentContainer*>(this); }
bool IsCompressed() const { return header.flags & kBlobFlagMaskAnyCompression; }
// Writes the header, merkle root, sizes and inline extents of |inode| to |stream| and returns
// |stream|.
// This is inlined because it only compiles on Fuchsia builds (not host).
inline std::ostream& operator<<(std::ostream& stream, const Inode& inode) {
  digest::Digest d(inode.merkle_root_hash);
  stream << "Inode {header:" << inode.header << " merkle:" << d.ToString()
         << " blob_size:" << inode.blob_size << " block_count:" << inode.block_count
         << " extent_count:" << inode.extent_count << " extents:" << inode.extents << "}";
  return stream;
}
struct __PACKED alignas(8) ExtentContainer {
NodePrelude header;
// The map index of the previous node.
uint32_t previous_node;
// The number of extents within this container.
ExtentCountType extent_count;
uint16_t reserved;
Extent extents[kContainerMaxExtents];
// Writes the header, previous-node index, extent count and extents of |container| to |stream|
// and returns |stream|.
// This is inlined because it only compiles on Fuchsia builds (not host).
inline std::ostream& operator<<(std::ostream& stream, const ExtentContainer& container) {
  stream << "ExtentContainer {header:" << container.header
         << " prev_node:" << container.previous_node << " extent_count:" << container.extent_count
         << " extents:" << container.extents << "}";
  return stream;
}
// Compile-time checks on the node layout: both node kinds have the same size, that size is the
// on-disk inode size, and a whole number of inodes fits in a block.
static_assert(sizeof(ExtentContainer) == sizeof(Inode), "Extent nodes must be as large as inodes");
static_assert(kBlobfsInodeSize == sizeof(Inode), "Blobfs Inode size is wrong");
static_assert((kBlobfsBlockSize % kBlobfsInodeSize) == 0,
              "Blobfs Inodes should fit cleanly within a blobfs block");
} // namespace blobfs