// Copyright 2017 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include <fcntl.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <unistd.h>
#include <digest/digest.h>
#include <digest/merkle-tree.h>
#include <fbl/alloc_checker.h>
#include <fbl/auto_call.h>
#include <fbl/limits.h>
#include <fbl/ref_ptr.h>
#include <lib/fdio/debug.h>
#include <fs/block-txn.h>
#include <fs/ticker.h>
#include <lib/zx/event.h>
#include <lib/async/cpp/task.h>
#include <zircon/compiler.h>
#include <zircon/process.h>
#include <zircon/status.h>
#include <zircon/syscalls.h>
#define ZXDEBUG 0
#include <blobfs/blobfs.h>
#include <blobfs/lz4.h>
using digest::Digest;
using digest::MerkleTree;
namespace blobfs {
namespace {
// Cross-checks the slice bookkeeping stored in the Blobfs superblock |info|
// against what the FVM behind |block_fd| reports.
//
// Volumes without kBlobFlagFVM set are accepted immediately with ZX_OK. For
// FVM-backed volumes the function compares the slice size and the allocated
// length of the block-map, node-map and data regions; when the FVM holds more
// slices than the superblock expects, the surplus is handed to
// ioctl_block_fvm_shrink().
//
// NOTE(review): in this copy of the file the failure branches only show their
// log statement (no return value is visible). Confirm the intended error codes
// before relying on this description.
zx_status_t CheckFvmConsistency(const blobfs_info_t* info, int block_fd) {
// Nothing to validate for a volume that is not managed by FVM.
if ((info->flags & kBlobFlagFVM) == 0) {
return ZX_OK;
fvm_info_t fvm_info;
zx_status_t status = static_cast<zx_status_t>(ioctl_block_fvm_query(block_fd, &fvm_info));
if (status < ZX_OK) {
FS_TRACE_ERROR("blobfs: Unable to query FVM, fd: %d status: 0x%x\n", block_fd, status);
if (info->slice_size != fvm_info.slice_size) {
FS_TRACE_ERROR("blobfs: Slice size did not match expected\n");
const size_t kBlocksPerSlice = info->slice_size / kBlobfsBlockSize;
// Slice counts the superblock claims for each region, in the same order as
// the vslice_start[] entries of the query below.
size_t expected_count[3];
expected_count[0] = info->abm_slices;
expected_count[1] = info->ino_slices;
expected_count[2] = info->dat_slices;
// Ask the FVM about the virtual slice at the start of each of the three regions.
query_request_t request;
request.count = 3;
request.vslice_start[0] = kFVMBlockMapStart / kBlocksPerSlice;
request.vslice_start[1] = kFVMNodeMapStart / kBlocksPerSlice;
request.vslice_start[2] = kFVMDataStart / kBlocksPerSlice;
query_response_t response;
status = static_cast<zx_status_t>(ioctl_block_fvm_vslice_query(block_fd, &request, &response));
if (status < ZX_OK) {
FS_TRACE_ERROR("blobfs: Unable to query slices, status: 0x%x\n", status);
// The FVM must answer for every range that was requested.
if (response.count != request.count) {
FS_TRACE_ERROR("blobfs: Missing slice\n");
for (size_t i = 0; i < request.count; i++) {
size_t blobfs_count = expected_count[i];
size_t fvm_count = response.vslice_range[i].count;
if (!response.vslice_range[i].allocated || fvm_count < blobfs_count) {
// Currently, since Blobfs can only grow new slices, it should not be possible for
// the FVM to report fewer slices than what is reported by Blobfs. In this
// case, automatically fail without trying to resolve the situation, as it is
// possible that Blobfs structures are allocated in the slices that have been lost.
FS_TRACE_ERROR("blobfs: Mismatched slice count\n");
if (fvm_count > blobfs_count) {
// If FVM reports more slices than we expect, try to free remainder.
// The range to release starts right after the last slice Blobfs expects.
extend_request_t shrink;
shrink.length = fvm_count - blobfs_count;
shrink.offset = request.vslice_start[i] + blobfs_count;
ssize_t r;
if ((r = ioctl_block_fvm_shrink(block_fd, &shrink)) != ZX_OK) {
FS_TRACE_ERROR("blobfs: Unable to shrink to expected size, status: %zd\n", r);
return ZX_OK;
// A wrapper around "Enqueue" for content which risks being larger
// than the writeback buffer.
// For content which is smaller than 3/4 the size of the writeback buffer: the
// content is enqueued to |work| without flushing.
// For content which is larger than 3/4 the size of the writeback buffer: flush
// the data by enqueueing it to the writeback thread in chunks until the
// remainder is small enough to comfortably fit within the writeback buffer.
//
// |work|            in/out: the work item to enqueue into; replaced by a freshly
//                   created item after each chunk while blocks remain.
// |blobfs|, |vn|    used to create the replacement work items.
// |vmo|             source of the data.
// |relative_block|  first block within |vmo|.
// |absolute_block|  first destination block passed to Enqueue().
// |nblocks|         number of blocks to enqueue.
//
// Returns ZX_OK, or the status of a failed CreateWork().
//
// NOTE(review): this copy shows |*work| being overwritten by the new item
// without any visible hand-off of the previous item to the writeback thread;
// confirm against the upstream source.
// NOTE(review): if 3/4 of WritebackCapacity() rounds down to zero the chunk
// size is zero and the loop below cannot make progress.
zx_status_t EnqueuePaginated(fbl::unique_ptr<WritebackWork>* work, Blobfs* blobfs, VnodeBlob* vn,
zx_handle_t vmo, uint64_t relative_block, uint64_t absolute_block,
uint64_t nblocks) {
// Largest number of blocks enqueued in a single step.
const size_t kMaxChunkBlocks = (3 * blobfs->WritebackCapacity()) / 4;
uint64_t delta_blocks = fbl::min(nblocks, kMaxChunkBlocks);
while (nblocks > 0) {
(*work)->Enqueue(vmo, relative_block, absolute_block, delta_blocks);
// Advance both cursors past the chunk that was just enqueued.
relative_block += delta_blocks;
absolute_block += delta_blocks;
nblocks -= delta_blocks;
delta_blocks = fbl::min(nblocks, kMaxChunkBlocks);
// More data remains: continue with a fresh work item.
if (nblocks) {
fbl::unique_ptr<WritebackWork> tmp;
zx_status_t status = blobfs->CreateWork(&tmp, vn);
if (status != ZX_OK) {
return status;
*work = fbl::move(tmp);
return ZX_OK;
} // namespace
// Returns a pointer to entry |index| of the mapped node table (|node_map_|
// viewed as an array of blobfs_inode_t). |index| is not range-checked here.
blobfs_inode_t* Blobfs::GetNode(size_t index) const {
return &reinterpret_cast<blobfs_inode_t*>(node_map_->GetData())[index];
// Checks the blob's full contents against the Merkle root stored in |digest_|.
//
// Returns the status produced by MerkleTree::Verify(). On failure the digest
// is logged in string form. Timing and sizes are reported through
// UpdateMerkleVerifyMetrics() regardless of the outcome.
zx_status_t VnodeBlob::Verify() const {
TRACE_DURATION("blobfs", "Blobfs::Verify");
fs::Ticker ticker(blobfs_->CollectingMetrics());
// The null blob has no backing data or tree; pass null pointers for both.
const void* data = inode_.blob_size ? GetData() : nullptr;
const void* tree = inode_.blob_size ? GetMerkle() : nullptr;
const uint64_t data_size = inode_.blob_size;
const uint64_t merkle_size = MerkleTree::GetTreeLength(data_size);
// TODO(smklein): We could lazily verify more of the VMO if
// we could fault in pages on-demand.
// For now, we aggressively verify the entire VMO up front.
Digest digest;
digest = reinterpret_cast<const uint8_t*>(&digest_[0]);
// Verify the whole range [0, data_size).
zx_status_t status = MerkleTree::Verify(data, data_size, tree,
merkle_size, 0, data_size, digest);
blobfs_->UpdateMerkleVerifyMetrics(data_size, merkle_size, ticker.End());
if (status != ZX_OK) {
// Buffer sized for two characters per digest byte plus the terminator.
char name[Digest::kLength * 2 + 1];
ZX_ASSERT(digest.ToString(name, sizeof(name)) == ZX_OK);
FS_TRACE_ERROR("blobfs verify(%s) Failure: %s\n", name, zx_status_get_string(status));
return status;
// Creates the in-memory VMO that holds the blob's Merkle tree followed by its
// data, attaches it to the block device, fills it from disk (decompressing if
// the inode carries kBlobFlagLZ4Compressed) and verifies the result.
//
// Returns ZX_OK immediately if |blob_| already exists; otherwise returns the
// status of the first step that fails.
zx_status_t VnodeBlob::InitVmos() {
TRACE_DURATION("blobfs", "Blobfs::InitVmos");
if (blob_ != nullptr) {
return ZX_OK;
// Reverts blob back to uninitialized state on error.
// NOTE(review): no cancel() of this auto-call is visible on the success path
// in this copy; confirm against the upstream source.
auto cleanup = fbl::MakeAutoCall([this]() { BlobCloseHandles(); });
zx_status_t status;
uint64_t data_blocks = BlobDataBlocks(inode_);
uint64_t merkle_blocks = MerkleTreeBlocks(inode_);
uint64_t num_blocks = data_blocks + merkle_blocks;
// Size the VMO for tree + data, guarding the multiplication against overflow.
size_t vmo_size;
if (mul_overflow(num_blocks, kBlobfsBlockSize, &vmo_size)) {
// NOTE(review): this message has no trailing newline and no return is
// visible after it in this copy.
FS_TRACE_ERROR("Multiplication overflow");
if ((status = fzl::MappedVmo::Create(vmo_size, "blob", &blob_)) != ZX_OK) {
FS_TRACE_ERROR("Failed to initialize vmo; error: %d\n", status);
return status;
if ((status = blobfs_->AttachVmo(blob_->GetVmo(), &vmoid_)) != ZX_OK) {
FS_TRACE_ERROR("Failed to attach VMO to block device; error: %d\n", status);
return status;
// Populate the VMO using the path that matches the on-disk encoding.
if ((inode_.flags & kBlobFlagLZ4Compressed) != 0) {
if ((status = InitCompressed()) != ZX_OK) {
return status;
} else {
if ((status = InitUncompressed()) != ZX_OK) {
return status;
if ((status = Verify()) != ZX_OK) {
return status;
return ZX_OK;
// Loads a compressed blob from disk into the blob VMO.
//
// The Merkle tree is read as-is into the start of the blob VMO (|vmoid_|),
// the compressed payload is read into a temporary VMO, and the payload is
// then decompressed into the region returned by GetData().
//
// Returns ZX_OK on success or the status of the failing step.
//
// NOTE(review): several statements in this copy are cut short (see the notes
// below); this description covers only what is visible.
zx_status_t VnodeBlob::InitCompressed() {
TRACE_DURATION("blobfs", "Blobfs::InitCompressed", "size", inode_.blob_size,
"blocks", inode_.num_blocks);
fs::Ticker ticker(blobfs_->CollectingMetrics());
fs::ReadTxn txn(blobfs_);
// Absolute device block where this blob's storage begins.
uint64_t start = inode_.start_block + DataStartBlock(blobfs_->info_);
uint64_t merkle_blocks = MerkleTreeBlocks(inode_);
fbl::unique_ptr<fzl::MappedVmo> compressed_blob;
// On disk the blob occupies num_blocks; everything after the tree is payload.
size_t compressed_blocks = (inode_.num_blocks - merkle_blocks);
size_t compressed_size;
if (mul_overflow(compressed_blocks, kBlobfsBlockSize, &compressed_size)) {
FS_TRACE_ERROR("Multiplication overflow\n");
// NOTE(review): the Create() call below is missing its final argument(s) in
// this copy.
zx_status_t status = fzl::MappedVmo::Create(compressed_size, "compressed-blob",
if (status != ZX_OK) {
FS_TRACE_ERROR("Failed to initialized compressed vmo; error: %d\n", status);
return status;
vmoid_t compressed_vmoid;
status = blobfs_->AttachVmo(compressed_blob->GetVmo(), &compressed_vmoid);
if (status != ZX_OK) {
FS_TRACE_ERROR("Failed to attach commpressed VMO to blkdev: %d\n", status);
return status;
// NOTE(review): the body of this auto-call is not visible in this copy;
// judging by its name and captures it presumably detaches |compressed_vmoid|.
auto detach = fbl::MakeAutoCall([this, &compressed_vmoid]() {
// Read the uncompressed merkle tree.
txn.Enqueue(vmoid_, 0, start, merkle_blocks);
// Read the compressed data.
txn.Enqueue(compressed_vmoid, 0, start + merkle_blocks, compressed_blocks);
if ((status = txn.Transact()) != ZX_OK) {
FS_TRACE_ERROR("Failed to flush read transaction: %d\n", status);
return status;
// Capture the elapsed time for the disk read before decompression starts.
fs::Duration read_time = ticker.End();
// Decompress the compressed data into the target buffer.
size_t target_size = inode_.blob_size;
status = Decompressor::Decompress(GetData(), &target_size,
compressed_blob->GetData(), &compressed_size);
if (status != ZX_OK) {
FS_TRACE_ERROR("Failed to decompress data: %d\n", status);
return status;
// A short output means the payload did not expand to the recorded blob size.
} else if (target_size != inode_.blob_size) {
FS_TRACE_ERROR("Failed to fully decompress blob (%zu of %zu expected)\n",
target_size, inode_.blob_size);
blobfs_->UpdateMerkleDecompressMetrics((compressed_blocks) * kBlobfsBlockSize,
inode_.blob_size, read_time, ticker.End());
return ZX_OK;
// Loads an uncompressed blob from disk: one read transaction copies the
// Merkle tree and the data blocks into the blob VMO (|vmoid_|) starting at
// VMO block 0.
//
// Returns the status of the read transaction. Disk-read metrics are updated
// regardless of the outcome.
zx_status_t VnodeBlob::InitUncompressed() {
TRACE_DURATION("blobfs", "Blobfs::InitUncompressed", "size", inode_.blob_size,
"blocks", inode_.num_blocks);
fs::Ticker ticker(blobfs_->CollectingMetrics());
fs::ReadTxn txn(blobfs_);
// Absolute device block where this blob's storage begins.
uint64_t start = inode_.start_block + DataStartBlock(blobfs_->info_);
// Read both the uncompressed merkle tree and data.
uint64_t length = BlobDataBlocks(inode_) + MerkleTreeBlocks(inode_);
txn.Enqueue(vmoid_, 0, start, length);
zx_status_t status = txn.Transact();
blobfs_->UpdateMerkleDiskReadMetrics(length * kBlobfsBlockSize, ticker.End());
return status;
// Binds this vnode to node-table entry |node_index| and copies that entry
// into the in-memory |inode_|.
//
// Debug builds assert that the vnode has not been bound before: |map_index_|
// is still zero and |inode_.start_block| is below kStartBlockMinimum.
void VnodeBlob::PopulateInode(size_t node_index) {
ZX_DEBUG_ASSERT(map_index_ == 0);
ZX_DEBUG_ASSERT(inode_.start_block < kStartBlockMinimum);
map_index_ = node_index;
blobfs_inode_t* inode = blobfs_->GetNode(node_index);
inode_ = *inode;
// Returns the blob's size in bytes once it is in the readable state; a blob
// in any other state reports a size of 0.
uint64_t VnodeBlob::SizeData() const {
if (GetState() == kBlobStateReadable) {
return inode_.blob_size;
return 0;
// Constructs a vnode for the blob identified by |digest| on filesystem |bs|.
// The vnode starts in the empty state and is not syncing; the digest bytes
// are copied into |digest_|.
VnodeBlob::VnodeBlob(Blobfs* bs, const Digest& digest)
: blobfs_(bs),
flags_(kBlobStateEmpty), syncing_(false), clone_watcher_(this) {
digest.CopyTo(digest_, sizeof(digest_));
// Constructs the vnode that represents a directory on filesystem |bs|: it
// carries kBlobFlagDirectory in addition to the empty state. |digest_| is not
// set by this constructor.
VnodeBlob::VnodeBlob(Blobfs* bs)
: blobfs_(bs),
flags_(kBlobStateEmpty | kBlobFlagDirectory),
syncing_(false), clone_watcher_(this) {}
// Drops the blob's VMO mapping by resetting |blob_|.
// NOTE(review): only the reset of |blob_| is visible in this copy; confirm
// whether other handles are meant to be released here as well.
void VnodeBlob::BlobCloseHandles() {
blob_ = nullptr;
// Prepares an empty blob to receive |size_data| bytes.
//
// Reserves a node, fills in the known inode fields, and then either
//  - for the null blob (size 0): verifies it and writes its metadata
//    immediately, or
//  - otherwise: creates and attaches the blob VMO, reserves disk blocks, and
//    sets up the compressor when the blob is at least
//    kCompressionMinBytesSaved bytes long.
//
// Returns ZX_OK on success or the status of the failing step.
//
// NOTE(review): this copy contains `goto fail` statements but no visible
// `fail:` label; the last two statements (FreeNode + return status) look like
// the failure path. Some failure branches use `return status` instead of
// `goto fail` and so would skip that cleanup — confirm whether that is intended.
zx_status_t VnodeBlob::SpaceAllocate(uint64_t size_data) {
TRACE_DURATION("blobfs", "Blobfs::SpaceAllocate", "size_data", size_data);
fs::Ticker ticker(blobfs_->CollectingMetrics());
// NOTE(review): the body of this state check is not visible in this copy.
if (GetState() != kBlobStateEmpty) {
// Find a free node, mark it as reserved.
zx_status_t status;
if ((status = blobfs_->ReserveNode(&map_index_)) != ZX_OK) {
return status;
// Initialize the inode with known fields
memset(inode_.merkle_root_hash, 0, Digest::kLength);
inode_.blob_size = size_data;
inode_.num_blocks = MerkleTreeBlocks(inode_) + BlobDataBlocks(inode_);
// Special case for the null blob: We skip the write phase
if (inode_.blob_size == 0) {
// Toss a valid block to the null blob, to distinguish it from
// unallocated nodes.
inode_.start_block = kStartBlockMinimum;
if ((status = Verify()) != ZX_OK) {
return status;
fbl::unique_ptr<WritebackWork> wb;
if ((status = blobfs_->CreateWork(&wb, this)) != ZX_OK) {
return status;
} else if ((status = WriteMetadata(fbl::move(wb))) != ZX_OK) {
fprintf(stderr, "Null blob metadata fail: %d\n", status);
goto fail;
return ZX_OK;
// Open VMOs, so we can begin writing after allocate succeeds.
// NOTE(review): unlike InitVmos(), this size computation is not guarded by
// mul_overflow().
if ((status = fzl::MappedVmo::Create(inode_.num_blocks * kBlobfsBlockSize, "blob", &blob_))
!= ZX_OK) {
goto fail;
if ((status = blobfs_->AttachVmo(blob_->GetVmo(), &vmoid_)) != ZX_OK) {
goto fail;
// Reserve space for the blob.
if ((status = blobfs_->ReserveBlocks(inode_.num_blocks, &inode_.start_block)) != ZX_OK) {
goto fail;
write_info_ = fbl::make_unique<WritebackInfo>();
// Only blobs of at least kCompressionMinBytesSaved bytes are candidates for
// compression; allocate the compressor's output buffer for them.
if (inode_.blob_size >= kCompressionMinBytesSaved) {
size_t max = write_info_->compressor.BufferMax(inode_.blob_size);
status = fzl::MappedVmo::Create(max, "compressed-blob", &write_info_->compressed_blob);
if (status != ZX_OK) {
return status;
// NOTE(review): the Initialize() call below is missing its final argument(s)
// in this copy.
status = write_info_->compressor.Initialize(write_info_->compressed_blob->GetData(),
if (status != ZX_OK) {
fprintf(stderr, "blobfs: Failed to initalize compressor: %d\n", status);
return status;
blobfs_->UpdateAllocationMetrics(size_data, ticker.End());
return ZX_OK;
// Failure path: release the node reserved above. No writeback work is passed
// to FreeNode().
blobfs_->FreeNode(nullptr, map_index_);
return status;
// Returns the address of the blob's data inside the mapped blob VMO. The data
// is located at block index MerkleTreeBlocks(inode_), i.e. after the blocks
// occupied by the Merkle tree. Requires |blob_| to be non-null.
void* VnodeBlob::GetData() const {
return fs::GetBlock(kBlobfsBlockSize, blob_->GetData(), MerkleTreeBlocks(inode_));
// Returns the address of the Merkle tree, which sits at the very start of the
// mapped blob VMO. Requires |blob_| to be non-null.
void* VnodeBlob::GetMerkle() const {
return blob_->GetData();
// Finalizes a blob whose data has been fully written: stores the digest in
// the inode, signals any waiter on |readable_event_|, marks the vnode as
// syncing, and persists the reserved blocks (non-empty blobs only) and the
// node through |wb|.
//
// Must be called in state kBlobStateDataWrite (asserted).
// Returns ZX_OK, or the status of a failed event signal.
//
// NOTE(review): this copy shows no state transition, no release of
// |write_info_| and no submission of |wb| after the Persist* calls; confirm
// against the upstream source.
zx_status_t VnodeBlob::WriteMetadata(fbl::unique_ptr<WritebackWork> wb) {
TRACE_DURATION("blobfs", "Blobfs::WriteMetadata");
assert(GetState() == kBlobStateDataWrite);
// Update the on-disk hash.
memcpy(inode_.merkle_root_hash, &digest_[0], Digest::kLength);
// All data has been written to the containing VMO.
if (readable_event_.is_valid()) {
zx_status_t status = readable_event_.signal(0u, ZX_USER_SIGNAL_0);
if (status != ZX_OK) {
return status;
atomic_store(&syncing_, true);
// Allocate and persist previously reserved blocks/node.
// The null blob has no blocks, so only its node is persisted.
if (inode_.blob_size) {
blobfs_->PersistBlocks(wb.get(), inode_.num_blocks, inode_.start_block);
blobfs_->PersistNode(wb.get(), map_index_, inode_);
// Drop the write info, since we no longer need it.
return ZX_OK;
// Appends |len| bytes from |data| to a blob that is being written and reports
// the number of bytes consumed through |actual|.
//
// Data is buffered in the blob VMO (after the space reserved for the Merkle
// tree) and, if compression is active, also fed to the compressor. Nothing is
// sent to disk until the final byte arrives. At that point the function
//  1. enqueues either the compressed or the uncompressed payload,
//  2. builds the Merkle tree and compares its root with |digest_| (or calls
//     Verify() when the blob has no tree), and
//  3. calls WriteMetadata() to persist the inode.
//
// Returns ZX_OK on success (including a zero-length write) or the status of
// the failing step.
//
// NOTE(review): a few statements in this copy are cut short (marked below).
zx_status_t VnodeBlob::WriteInternal(const void* data, size_t len, size_t* actual) {
TRACE_DURATION("blobfs", "Blobfs::WriteInternal", "data", data, "len", len);
*actual = 0;
if (len == 0) {
return ZX_OK;
const uint64_t merkle_blocks = MerkleTreeBlocks(inode_);
const size_t merkle_bytes = MerkleTreeBlocks(inode_) * kBlobfsBlockSize;
if (GetState() == kBlobStateDataWrite) {
// Never accept more than the number of bytes still missing from the blob.
size_t to_write = fbl::min(len, inode_.blob_size - write_info_->bytes_written);
// Payload bytes live after the Merkle tree region of the VMO.
size_t offset = write_info_->bytes_written + merkle_bytes;
zx_status_t status = zx_vmo_write(blob_->GetVmo(), data, offset, to_write);
if (status != ZX_OK) {
return status;
*actual = to_write;
write_info_->bytes_written += to_write;
if (write_info_->compressor.Compressing()) {
if ((status = write_info_->compressor.Update(data, to_write)) != ZX_OK) {
return status;
// More data to write.
if (write_info_->bytes_written < inode_.blob_size) {
return ZX_OK;
// Only write data to disk once we've buffered the file into memory.
// This gives us a chance to try compressing the blob before we write it back.
fbl::unique_ptr<WritebackWork> wb;
if ((status = blobfs_->CreateWork(&wb, this)) != ZX_OK) {
return status;
// Finish the compression stream, if one is active.
if (write_info_->compressor.Compressing()) {
if ((status = write_info_->compressor.End()) != ZX_OK) {
return status;
// First device block of the payload (the tree occupies the blocks before it).
uint64_t dev_offset = DataStartBlock(blobfs_->info_) + inode_.start_block + merkle_blocks;
if (write_info_->compressor.Compressing()) {
// Compressed path: write only as many blocks as the compressed output needs.
uint64_t blocks = fbl::round_up(write_info_->compressor.Size(),
kBlobfsBlockSize) / kBlobfsBlockSize;
// NOTE(review): the argument list below is shorter than in the uncompressed
// call further down (no VMO argument is visible); the line appears truncated.
if ((status = EnqueuePaginated(&wb, blobfs_, this,
0, dev_offset, blocks)) != ZX_OK) {
return status;
// Give back the reserved blocks that the compressed blob does not need and
// record the new on-disk length and encoding in the inode.
blocks += MerkleTreeBlocks(inode_);
ZX_DEBUG_ASSERT(inode_.num_blocks > blocks);
blobfs_->UnreserveBlocks(inode_.num_blocks - blocks,
inode_.start_block + blocks);
inode_.num_blocks = blocks;
inode_.flags |= kBlobFlagLZ4Compressed;
} else {
// Uncompressed path: write the payload straight from the blob VMO.
uint64_t blocks = fbl::round_up(inode_.blob_size, kBlobfsBlockSize) / kBlobfsBlockSize;
if ((status = EnqueuePaginated(&wb, blobfs_, this, blob_->GetVmo(),
merkle_blocks, dev_offset, blocks)) != ZX_OK) {
return status;
// TODO(smklein): As an optimization, use the CreateInit/Update/Final
// methods to create the merkle tree as we write data, rather than
// waiting until the data is fully downloaded to create the tree.
size_t merkle_size = MerkleTree::GetTreeLength(inode_.blob_size);
fs::Duration generation_time;
if (merkle_size > 0) {
Digest digest;
void* merkle_data = GetMerkle();
const void* blob_data = GetData();
fs::Ticker ticker(blobfs_->CollectingMetrics()); // Tracking generation time.
if ((status = MerkleTree::Create(blob_data, inode_.blob_size, merkle_data,
merkle_size, &digest)) != ZX_OK) {
return status;
} else if (digest != digest_) {
// Downloaded blob did not match provided digest.
// NOTE(review): no error return is visible for this mismatch branch in this
// copy; the Enqueue of the tree below presumably belongs to the success path.
uint64_t dev_offset = DataStartBlock(blobfs_->info_) + inode_.start_block;
wb->Enqueue(blob_->GetVmo(), 0, dev_offset, merkle_blocks);
generation_time = ticker.End();
} else if ((status = Verify()) != ZX_OK) {
// Small blobs may not have associated Merkle Trees, and will
// require validation, since we are not regenerating and checking
// the digest.
return status;
// No more data to write. Flush to disk.
fs::Ticker ticker(blobfs_->CollectingMetrics()); // Tracking enqueue time.
if ((status = WriteMetadata(fbl::move(wb))) != ZX_OK) {
return status;
// NOTE(review): the metrics call below is missing its final argument(s) in
// this copy.
blobfs_->UpdateClientWriteMetrics(to_write, merkle_size, ticker.End(),
return ZX_OK;
// Gives up on compression when the compressed output would not be at least
// kCompressionMinBytesSaved bytes smaller than the blob itself; in that case
// the compressed buffer is released.
// NOTE(review): the subtraction is unsigned, so this relies on blob_size being
// at least kCompressionMinBytesSaved whenever it is called. Only the release of
// |compressed_blob| is visible here; confirm whether the compressor itself is
// meant to be reset too.
void VnodeBlob::ConsiderCompressionAbort() {
if (inode_.blob_size - kCompressionMinBytesSaved < write_info_->compressor.Size()) {
write_info_->compressed_blob = nullptr;
// Hands out a duplicate of the event that is signaled with ZX_USER_SIGNAL_0
// when the blob is readable.
//
// The event is created on the first request; if the blob is already readable
// at that point it is signaled immediately. The duplicate written to |out|
// carries ZX_RIGHTS_BASIC | ZX_RIGHT_READ.
//
// Returns a negative status on failure. On success the return value is
// sizeof(zx_handle_t), NOT ZX_OK.
zx_status_t VnodeBlob::GetReadableEvent(zx_handle_t* out) {
TRACE_DURATION("blobfs", "Blobfs::GetReadableEvent");
zx_status_t status;
// This is the first 'wait until read event' request received.
if (!readable_event_.is_valid()) {
status = zx::event::create(0, &readable_event_);
if (status != ZX_OK) {
return status;
} else if (GetState() == kBlobStateReadable) {
// NOTE(review): the result of this signal() call is not checked.
readable_event_.signal(0u, ZX_USER_SIGNAL_0);
status = zx_handle_duplicate(readable_event_.get(), ZX_RIGHTS_BASIC | ZX_RIGHT_READ, out);
if (status != ZX_OK) {
return status;
return sizeof(zx_handle_t);
// Produces a copy-on-write clone of the blob's data region, marks it
// executable, restricts it to |rights| and returns the handle through |out|.
//
// The clone covers |inode_.blob_size| bytes starting right after the Merkle
// tree, so the tree itself is not exposed. The first time a clone is
// handed out (no object registered on |clone_watcher_|) the vnode takes a
// reference to itself so it stays alive while clients use the memory.
//
// Returns ZX_OK on success or the status of the failing step.
//
// NOTE(review): the bodies of the first two checks (state not readable, empty
// blob) are not visible in this copy, and neither is the code that arms
// |clone_watcher_|.
zx_status_t VnodeBlob::CloneVmo(zx_rights_t rights, zx_handle_t* out) {
TRACE_DURATION("blobfs", "Blobfs::CloneVmo", "rights", rights, "out", out);
if (GetState() != kBlobStateReadable) {
if (inode_.blob_size == 0) {
// Make sure the blob contents are loaded before cloning.
zx_status_t status = InitVmos();
if (status != ZX_OK) {
return status;
// TODO(smklein): Only clone / verify the part of the vmo that
// was requested.
const size_t merkle_bytes = MerkleTreeBlocks(inode_) * kBlobfsBlockSize;
zx_handle_t clone;
if ((status = zx_vmo_clone(blob_->GetVmo(), ZX_VMO_CLONE_COPY_ON_WRITE,
merkle_bytes, inode_.blob_size, &clone)) != ZX_OK) {
return status;
// TODO(mdempsky): Push elsewhere.
if ((status = zx_vmo_replace_as_executable(clone, ZX_HANDLE_INVALID, &clone)) != ZX_OK) {
return status;
// Swap the clone for a handle that carries exactly the requested rights.
if ((status = zx_handle_replace(clone, rights, out)) != ZX_OK) {
return status;
if (clone_watcher_.object() == ZX_HANDLE_INVALID) {
// Keep a reference to "this" alive, preventing the blob
// from being closed while someone may still be using the
// underlying memory.
// We'll release it when no client-held VMOs are in use.
clone_ref_ = fbl::RefPtr<VnodeBlob>(this);
return ZX_OK;
// Wait handler, expected to run when ZX_VMO_ZERO_CHILDREN is observed on the
// watched object (asserted in debug builds). It drops the self-reference held
// in |clone_ref_|.
// |dispatcher|, |wait| and |status| are not used by the visible code.
void VnodeBlob::HandleNoClones(async_dispatcher_t* dispatcher, async::WaitBase* wait,
zx_status_t status, const zx_packet_signal_t* signal) {
ZX_DEBUG_ASSERT((signal->observed & ZX_VMO_ZERO_CHILDREN) != 0);
ZX_DEBUG_ASSERT(clone_watcher_.object() != ZX_HANDLE_INVALID);
clone_ref_ = nullptr;
// Reads up to |len| bytes of blob data starting at byte offset |off| into
// |data| and reports the number of bytes read through |actual|.
//
// Reads at or beyond the end of the blob succeed with |*actual| == 0, and a
// read that would run past the end is shortened to the remaining bytes. The
// blob contents are loaded with InitVmos() first.
//
// Returns ZX_OK on success or the status of InitVmos()/zx_vmo_read().
//
// NOTE(review): the body of the "not readable" check is not visible in this
// copy, and the local Digest |d| is not used by any visible statement.
zx_status_t VnodeBlob::ReadInternal(void* data, size_t len, size_t off, size_t* actual) {
TRACE_DURATION("blobfs", "Blobfs::ReadInternal", "len", len, "off", off);
if (GetState() != kBlobStateReadable) {
// The null blob has nothing to read.
if (inode_.blob_size == 0) {
*actual = 0;
return ZX_OK;
zx_status_t status = InitVmos();
if (status != ZX_OK) {
return status;
Digest d;
d = reinterpret_cast<const uint8_t*>(&digest_[0]);
if (off >= inode_.blob_size) {
*actual = 0;
return ZX_OK;
// Clamp the request to the bytes available after |off|.
if (len > (inode_.blob_size - off)) {
len = inode_.blob_size - off;
// Data starts after the Merkle tree inside the VMO.
const size_t merkle_bytes = MerkleTreeBlocks(inode_) * kBlobfsBlockSize;
status = zx_vmo_read(blob_->GetVmo(), data, merkle_bytes + off, len);
if (status == ZX_OK) {
*actual = len;
return status;
// Marks the blob as deletable by setting kBlobFlagDeletable in |flags_|.
// NOTE(review): the comment below announces a purge attempt, but no such
// statement is visible in this copy.
void VnodeBlob::QueueUnlink() {
flags_ |= kBlobFlagDeletable;
// Attempt to purge in case the blob has been unlinked with no open fds
// Verifies the blob stored in node-table entry |node_index| of |bs|.
//
// A temporary vnode is created from the Merkle root recorded in that entry
// and its Verify() result is returned.
//
// NOTE(review): the allocation-failure branch and the steps between creating
// the vnode and verifying it (the comment mentions setting a "Purged" state)
// are not visible in this copy.
zx_status_t VnodeBlob::VerifyBlob(Blobfs* bs, size_t node_index) {
blobfs_inode_t* inode = bs->GetNode(node_index);
Digest digest(inode->merkle_root_hash);
fbl::AllocChecker ac;
fbl::RefPtr<VnodeBlob> vn =
fbl::AdoptRef(new (&ac) VnodeBlob(bs, digest));
if (!ac.check()) {
// Set blob state to "Purged" so we do not try to add it to the cached map on recycle.
return vn->Verify();
// Verifies the blob stored at |node_index| on this filesystem by forwarding
// to VnodeBlob::VerifyBlob() and returning its status.
zx_status_t Blobfs::VerifyBlob(size_t node_index) {
return VnodeBlob::VerifyBlob(this, node_index);
// Searches, starting at block |start|, for |num_blocks| contiguous blocks
// that are free in both the on-disk allocation map (|block_map_|) and the
// in-memory reservation map (|reserved_blocks_|).
//
// On success the first block of the range is written to |blkno_out| and ZX_OK
// is returned; if |block_map_| has no suitable range the error from its Find()
// is returned.
//
// NOTE(review): the first Find() call is missing its final argument and the
// "advance and retry" part of the loop is not visible in this copy.
zx_status_t Blobfs::FindBlocks(size_t start, size_t num_blocks, size_t* blkno_out) {
while (true) {
// Search for a range of nblocks in block_map_.
size_t block_num;
zx_status_t status = block_map_.Find(false, start, block_map_.size(), num_blocks,
if (status != ZX_OK) {
return status;
// Find out how large the unallocated range is starting from |block_num| so we can search
// the reserved_blocks_ map for this entire range in one call.
size_t upper_limit = block_map_.size();
block_map_.Scan(block_num, block_map_.size(), false, &upper_limit);
size_t max_len = upper_limit - block_num;
// Check the reserved map to see if there are |nblocks| free blocks from |block_num| to
// |block_num + max_len|.
size_t out;
status = reserved_blocks_.Find(false, block_num, block_num + max_len, num_blocks, &out);
// If we found a valid range, return; otherwise start searching from block_num + max_len.
if (status == ZX_OK && out < block_num + max_len) {
*blkno_out = out;
start = out;
return ZX_OK;
// Reserves |num_blocks| contiguous blocks IN MEMORY and reports the index of
// the first one through |block_index_out|.
//
// The search starts at block 0. If no suitable range exists, the volume is
// grown with AddBlocks() and the search is retried from the first index that
// could be satisfied with the help of the newly added space.
//
// Returns ZX_OK on success, ZX_ERR_NO_SPACE when no range is available even
// after trying to grow, or the error reported while marking the range in
// |reserved_blocks_|.
zx_status_t Blobfs::ReserveBlocks(size_t num_blocks, size_t* block_index_out) {
    // Keep the comparison outside the assignment: with the parenthesis in the
    // wrong place |status| would receive the boolean result of `!= ZX_OK`
    // instead of the status code.
    zx_status_t status = FindBlocks(0, num_blocks, block_index_out);
    if (status != ZX_OK) {
        // If we have run out of blocks, attempt to add block slices via FVM.
        // Only the tail of the pre-growth map can combine with the new blocks,
        // so compute the resume point before the map is extended.
        const size_t hint = block_map_.size() - fbl::min(num_blocks, block_map_.size());
        if (AddBlocks(num_blocks) != ZX_OK) {
            return ZX_ERR_NO_SPACE;
        }
        if (FindBlocks(hint, num_blocks, block_index_out) != ZX_OK) {
            return ZX_ERR_NO_SPACE;
        }
    }

    // Mark the range as reserved; surface a failure instead of reporting
    // success for blocks that were never reserved.
    return reserved_blocks_.Set(*block_index_out, *block_index_out + num_blocks);
}
// Releases the in-memory reservation for the |num_blocks| blocks starting at
// |block_index|. Debug builds first check that the whole range is currently
// reserved.
// NOTE(review): the debug assertion below is cut short in this copy, and the
// status returned by Clear() is not visibly checked.
void Blobfs::UnreserveBlocks(size_t num_blocks, size_t block_index) {
// Ensure the blocks are already reserved.
size_t blkno_out;
ZX_DEBUG_ASSERT(reserved_blocks_.Find(true, block_index, block_index +
num_blocks, num_blocks, &blkno_out) ==
zx_status_t status = reserved_blocks_.Clear(block_index, block_index + num_blocks);
// Converts a reservation into a real allocation: the |num_blocks| blocks
// starting at |block_index| are set in the allocation bitmap, counted in
// |info_.alloc_block_count|, removed from the reservation map, and the
// affected bitmap blocks are queued for writing through |wb|.
//
// Debug builds assert that the range is fully reserved and not yet allocated.
// NOTE(review): the statuses returned by Set() and Clear() are not visibly
// checked in this copy.
void Blobfs::PersistBlocks(WritebackWork* wb, size_t num_blocks, size_t block_index) {
TRACE_DURATION("blobfs", "Blobfs::PersistBlocks", "num_blocks", num_blocks);
size_t blkno_out;
// Make sure that blkno + nblocks are already reserved.
ZX_DEBUG_ASSERT(reserved_blocks_.Find(true, block_index, block_index + num_blocks, num_blocks,
&blkno_out) == ZX_OK);
// Make sure that blkno + nblocks are NOT already allocated.
ZX_DEBUG_ASSERT(block_map_.Find(false, block_index, block_index + num_blocks, num_blocks,
&blkno_out) == ZX_OK);
// Allocate blocks in bitmap.
zx_status_t status = block_map_.Set(block_index, block_index + num_blocks);
info_.alloc_block_count += num_blocks;
status = reserved_blocks_.Clear(block_index, block_index + num_blocks);
// Write out to disk.
WriteBitmap(wb, num_blocks, block_index);
// Frees blocks from reserved and allocated maps, updates disk in the latter case.
//
// If the whole range [block_index, block_index + num_blocks) is set in the
// allocation bitmap, it is cleared there, |info_.alloc_block_count| is
// reduced, and the bitmap change is queued through |wb|. The range is then
// cleared from the reservation map.
// NOTE(review): the statuses returned by the Clear() calls are not visibly
// checked in this copy.
void Blobfs::FreeBlocks(WritebackWork* wb, size_t num_blocks, size_t block_index) {
TRACE_DURATION("blobfs", "Blobfs::FreeBlocks", "nblocks", num_blocks, "blkno", block_index);
// Check if blocks were allocated on disk.
size_t blkno_out;
if (block_map_.Find(true, block_index, block_index + num_blocks, num_blocks, &blkno_out)
== ZX_OK) {
zx_status_t status = block_map_.Clear(block_index, block_index + num_blocks);
info_.alloc_block_count -= num_blocks;
WriteBitmap(wb, num_blocks, block_index);
zx_status_t status = reserved_blocks_.Clear(block_index, block_index + num_blocks);
// Finds a node that is free on disk (start_block == kStartBlockFree) and not
// yet reserved in memory, reserves it, and writes its index to
// |node_index_out|.
//
// The scan begins at |free_node_lower_bound_|, which is advanced past the
// node that was handed out (or to the end of the table when nothing was
// found) so later calls can skip the part already known to be in use.
//
// Returns ZX_OK when a node was reserved.
// NOTE(review): the return statement for the "no free node" case is not
// visible in this copy.
zx_status_t Blobfs::FindNode(size_t* node_index_out) {
for (size_t i = free_node_lower_bound_; i < info_.inode_count; ++i) {
if (GetNode(i)->start_block == kStartBlockFree) {
// Found a free node. Mark it as reserved so no one else can allocate it.
if (!reserved_nodes_.Get(i, i + 1, nullptr)) {
reserved_nodes_.Set(i, i + 1);
*node_index_out = i;
// We don't know where the next free node is but we know that there
// are no free nodes until index i.
free_node_lower_bound_ = i + 1;
return ZX_OK;
// There are no free nodes available. Setting free_node_lower_bound_ to
// inodes_count will help to fail fast for next allocation. This will
// also help to find nodes if nodes are added.
free_node_lower_bound_ = info_.inode_count;
// Reserves a node IN MEMORY.
//
// Tries FindNode() first; if that fails, attempts to grow the node table with
// AddInodes() and searches again. The reserved index is written to
// |node_index_out|.
//
// Returns ZX_OK when a node was reserved.
// NOTE(review): the failure returns (AddInodes() failing, or the second
// search failing) are not visible in this copy.
zx_status_t Blobfs::ReserveNode(size_t* node_index_out) {
TRACE_DURATION("blobfs", "Blobfs::ReserveNode");
zx_status_t status;
if ((status = FindNode(node_index_out)) == ZX_OK) {
return ZX_OK;
// If we didn't find any free inodes, try adding more via FVM.
if (AddInodes() != ZX_OK) {
if ((status = FindNode(node_index_out)) == ZX_OK) {
return ZX_OK;
// Commits a previously reserved node: copies |inode| into node-table entry
// |node_index|, drops the in-memory reservation and queues the table block
// for writing through |wb|.
//
// Debug builds assert that |inode| describes an allocated blob
// (start_block >= kStartBlockMinimum), that the table entry is still
// unallocated, and that the node is currently reserved.
// NOTE(review): the trace label below says "AllocateNode" while the function
// is named PersistNode; the status of Clear() is not visibly checked.
void Blobfs::PersistNode(WritebackWork* wb, size_t node_index, const blobfs_inode_t& inode) {
TRACE_DURATION("blobfs", "Blobfs::AllocateNode");
ZX_DEBUG_ASSERT(inode.start_block >= kStartBlockMinimum);
blobfs_inode_t* mapped_inode = GetNode(node_index);
ZX_DEBUG_ASSERT(mapped_inode->start_block < kStartBlockMinimum);
size_t blkno_out;
ZX_DEBUG_ASSERT(reserved_nodes_.Find(true, node_index, node_index + 1, 1, &blkno_out) == ZX_OK);
*mapped_inode = inode;
zx_status_t status = reserved_nodes_.Clear(node_index, node_index + 1);
WriteNode(wb, node_index);
// Releases node |node_index|.
//
// If the node is allocated in the node table it is zeroed and the table block
// is queued for writing, which requires a non-null |wb| (asserted in debug
// builds). |free_node_lower_bound_| is lowered when the freed index is below
// it, and any in-memory reservation for the node is cleared. |wb| may be null
// when the node was only reserved.
// NOTE(review): the status of Clear() is not visibly checked in this copy.
void Blobfs::FreeNode(WritebackWork* wb, size_t node_index) {
TRACE_DURATION("blobfs", "Blobfs::FreeNode", "node_index", node_index);
blobfs_inode_t* mapped_inode = GetNode(node_index);
// Write to disk if node has been allocated within inode table
if (mapped_inode->start_block >= kStartBlockMinimum) {
ZX_DEBUG_ASSERT(wb != nullptr);
*mapped_inode = {};
WriteNode(wb, node_index);
// We update lower bound if the freed node is the smallest free node.
if (free_node_lower_bound_ > node_index) {
free_node_lower_bound_ = node_index;
zx_status_t status = reserved_nodes_.Clear(node_index, node_index + 1);
// Sets up the writeback machinery: creates a mapped VMO of WriteBufferSize()
// bytes and hands it to WritebackBuffer::Create(), which stores the result in
// |writeback_|.
//
// Returns ZX_OK on success or the status of the failing step.
zx_status_t Blobfs::InitializeWriteback() {
zx_status_t status;
fbl::unique_ptr<fzl::MappedVmo> buffer;
if ((status = fzl::MappedVmo::Create(WriteBufferSize(), "blobfs-writeback",
&buffer)) != ZX_OK) {
return status;
// Ownership of the buffer moves into the writeback object.
if ((status = WritebackBuffer::Create(this, fbl::move(buffer), &writeback_)) != ZX_OK) {
return status;
return ZX_OK;
// Tears the filesystem down asynchronously and deletes this Blobfs object.
//
// The work is chained through nested callbacks, in the order given by the
// numbered comments below: external connections are shut down first, then
// internal vnode references are collected under |hash_lock_|, pending work is
// synced, and finally a task posted to the dispatcher destroys the object.
// |on_unmount_| is moved to a local beforehand so it can still be used after
// `delete this`.
//
// Requires |writeback_| to be initialized (asserted in debug builds).
//
// NOTE(review): large parts of the lambda bodies (collecting the vnodes,
// flushing the sync transaction, invoking |cb| and |on_unmount|) are not
// visible in this copy; the summary above follows the remaining comments and
// statements only.
void Blobfs::Shutdown(fs::Vfs::ShutdownCallback cb) {
TRACE_DURATION("blobfs", "Blobfs::Unmount");
ZX_DEBUG_ASSERT_MSG(writeback_ != nullptr, "Shutdown requires writeback thread to sync");
// 1) Shutdown all external connections to blobfs.
ManagedVfs::Shutdown([this, cb = fbl::move(cb)](zx_status_t status) mutable {
// 2a) Shutdown all internal connections to blobfs.
// Store the Vnodes in a vector to avoid destroying
// them while holding the hash lock.
fbl::Vector<fbl::RefPtr<VnodeBlob>> internal_references;
fbl::AutoLock lock(&hash_lock_);
for (auto& blob : open_hash_) {
auto vn = blob.CloneWatcherTeardown();
if (vn != nullptr) {
// 2b) Flush all pending work to blobfs to the underlying storage.
Sync([this, cb = fbl::move(cb)](zx_status_t status) mutable {
async::PostTask(dispatcher(), [this, cb = fbl::move(cb)]() mutable {
// 3) Ensure the underlying disk has also flushed.
fs::WriteTxn sync_txn(this);
// Take the unmount callback out of the object before it is destroyed.
auto on_unmount = fbl::move(on_unmount_);
// Manually destroy Blobfs. The promise of Shutdown is that no
// connections are active, and destroying the Blobfs object
// should terminate all background workers.
delete this;
// Identify to the unmounting channel that we've completed teardown.
// Identify to the mounting thread that the filesystem has
// terminated.
if (on_unmount) {
// Queues, through |wb|, the blocks of the allocation bitmap that cover data
// blocks [start_block, start_block + nblocks).
//
// Each bitmap block describes kBlobfsBlockBits data blocks, so the affected
// bitmap range is found by dividing the first data block and the rounded-up
// end of the range by that constant.
//
// NOTE(review): the TRACE_DURATION call and the computation of
// |bbm_end_block| are cut short in this copy.
void Blobfs::WriteBitmap(WritebackWork* wb, uint64_t nblocks, uint64_t start_block) {
TRACE_DURATION("blobfs", "Blobfs::WriteBitmap", "nblocks", nblocks, "start_block",
uint64_t bbm_start_block = start_block / kBlobfsBlockBits;
uint64_t bbm_end_block = fbl::round_up(start_block + nblocks,
kBlobfsBlockBits) /
// Write back the block allocation bitmap
wb->Enqueue(block_map_.StorageUnsafe()->GetVmo(), bbm_start_block,
BlockMapStartBlock(info_) + bbm_start_block, bbm_end_block - bbm_start_block);
// Queues, through |wb|, the single node-table block that contains entry
// |map_index|.
void Blobfs::WriteNode(WritebackWork* wb, size_t map_index) {
TRACE_DURATION("blobfs", "Blobfs::WriteNode", "map_index", map_index);
// Index of the node-table block that holds this inode.
uint64_t b = (map_index * sizeof(blobfs_inode_t)) / kBlobfsBlockSize;
wb->Enqueue(node_map_->GetVmo(), b, NodeMapStartBlock(info_) + b, 1);
// Creates a new, empty vnode for |digest| and returns it through |out|.
//
// Fails with ZX_ERR_ALREADY_EXISTS if a blob with this digest is already
// known, or with the lookup's own error if the lookup fails for any reason
// other than ZX_ERR_NOT_FOUND. Returns ZX_OK on success.
//
// NOTE(review): the allocation-failure branch and the statement executed
// under |hash_lock_| are not visible in this copy.
zx_status_t Blobfs::NewBlob(const Digest& digest, fbl::RefPtr<VnodeBlob>* out) {
TRACE_DURATION("blobfs", "Blobfs::NewBlob");
zx_status_t status;
// If the blob already exists (or we're having trouble looking up the blob),
// return an error.
if ((status = LookupBlob(digest, nullptr)) != ZX_ERR_NOT_FOUND) {
return (status == ZX_OK) ? ZX_ERR_ALREADY_EXISTS : status;
fbl::AllocChecker ac;
*out = fbl::AdoptRef(new (&ac) VnodeBlob(this, digest));
if (!ac.check()) {
fbl::AutoLock lock(&hash_lock_);
return ZX_OK;
// If no client references to the blob still exist and the blob is either queued for deletion or
// not in a readable state, purge all traces of the blob from blobfs.
// This is only called when we do not expect the blob to be accessed again.
//
// Behavior by state, as far as visible here:
//  - kBlobStateEmpty: returns ZX_OK.
//  - kBlobStateDataWrite / kBlobStateError (and, via fall-through, readable
//    blobs): creates writeback work and frees the blob's node and blocks
//    through it. Returns ZX_OK, or the status of a failed CreateWork().
//
// NOTE(review): the statements inside the empty, readable and default cases,
// and any submission of |wb|, are not visible in this copy.
zx_status_t Blobfs::PurgeBlob(VnodeBlob* vn) {
TRACE_DURATION("blobfs", "Blobfs::PurgeBlob");
switch (vn->GetState()) {
case kBlobStateEmpty: {
return ZX_OK;
case kBlobStateReadable: {
// A readable blob should only be purged if it has been unlinked
case kBlobStateDataWrite:
case kBlobStateError: {
// Capture the blob's on-disk footprint before releasing it.
size_t node_index = vn->GetMapIndex();
uint64_t start_block = vn->GetNode().start_block;
uint64_t nblocks = vn->GetNode().num_blocks;
zx_status_t status;
fbl::unique_ptr<WritebackWork> wb;
if ((status = CreateWork(&wb, vn)) != ZX_OK) {
return status;
FreeNode(wb.get(), node_index);
FreeBlocks(wb.get(), nblocks, start_block);
return ZX_OK;
default: {
// Copies the in-memory superblock |info_| into the mapped |info_vmo_| and
// queues one block from VMO block 0 to device block 0 through |wb|.
void Blobfs::WriteInfo(WritebackWork* wb) {
void* infodata = info_vmo_->GetData();
memcpy(infodata, &info_, sizeof(info_));
wb->Enqueue(info_vmo_->GetVmo(), 0, 0, 1);
// Derives an identifier for this filesystem instance: a throwaway event
// object is created and its koid, read via ZX_INFO_HANDLE_BASIC, is stored in
// |fs_id_|.
//
// Returns ZX_OK on success or the status of the failing kernel call.
zx_status_t Blobfs::CreateFsId() {
zx::event event;
zx_status_t status = zx::event::create(0, &event);
if (status != ZX_OK) {
return status;
zx_info_handle_basic_t info;
status = event.get_info(ZX_INFO_HANDLE_BASIC, &info, sizeof(info), nullptr, nullptr);
if (status != ZX_OK) {
return status;
fs_id_ = info.koid;
return ZX_OK;
// Blobfs-specific view of the opaque directory cookie (fs::vdircookie_t) that
// Readdir() uses to remember where to resume. The static_assert guarantees
// that this view fits inside the generic cookie storage.
typedef struct dircookie {
size_t index;      // Index into node map
uint64_t reserved; // Unused
} dircookie_t;
static_assert(sizeof(dircookie_t) <= sizeof(fs::vdircookie_t),
"Blobfs dircookie too large to fit in IO state");
// Fills |dirents| (capacity |len| bytes) with one entry per allocated blob,
// named by the string form of its Merkle root.
//
// Iteration resumes at the node index stored in |cookie|, and the cookie is
// advanced past each node that was emitted. The number of bytes written is
// reported through |out_actual|.
//
// Returns ZX_OK, or the error from Digest::ToString().
//
// NOTE(review): the df.Next() call and the handling of its result are cut
// short in this copy.
zx_status_t Blobfs::Readdir(fs::vdircookie_t* cookie, void* dirents, size_t len,
size_t* out_actual) {
TRACE_DURATION("blobfs", "Blobfs::Readdir", "len", len);
fs::DirentFiller df(dirents, len);
dircookie_t* c = reinterpret_cast<dircookie_t*>(cookie);
for (size_t i = c->index; i < info_.inode_count; ++i) {
// Only allocated nodes correspond to blobs.
if (GetNode(i)->start_block >= kStartBlockMinimum) {
Digest digest(GetNode(i)->merkle_root_hash);
// Two characters per digest byte plus the terminator.
char name[Digest::kLength * 2 + 1];
zx_status_t r = digest.ToString(name, sizeof(name));
if (r < 0) {
return r;
uint64_t ino = fuchsia_io_INO_UNKNOWN;
if ((r = df.Next(fbl::StringPiece(name, Digest::kLength * 2),
c->index = i + 1;
*out_actual = df.BytesFilled();
return ZX_OK;
// Looks up the vnode for |digest|.
//
// The open-vnode map is consulted first under |hash_lock_|; an entry that
// cannot be upgraded to a strong reference is in the middle of being recycled,
// in which case the lock is dropped and the lookup is retried (see the long
// comment below). Otherwise VnodeUpgradeLocked() is tried.
//
// On success returns ZX_OK and, when |out| is non-null, moves the vnode into
// |*out|. |out| may be null when the caller only wants to test for existence.
//
// NOTE(review): the release lambda's body, the retry/`continue` logic and the
// not-found return are not visible in this copy.
zx_status_t Blobfs::LookupBlob(const Digest& digest, fbl::RefPtr<VnodeBlob>* out) {
TRACE_DURATION("blobfs", "Blobfs::LookupBlob");
const uint8_t* key = digest.AcquireBytes();
auto release = fbl::MakeAutoCall([&digest]() {
// Look up the blob in the maps.
fbl::RefPtr<VnodeBlob> vn;
while (true) {
// Avoid releasing a reference to |vn| while holding |hash_lock_|.
fbl::AutoLock lock(&hash_lock_);
auto raw_vn = open_hash_.find(key).CopyPointer();
if (raw_vn != nullptr) {
vn = fbl::internal::MakeRefPtrUpgradeFromRaw(raw_vn, hash_lock_);
if (vn == nullptr) {
// This condition is only possible if:
// - The raw pointer to the Vnode exists in the open map,
// with refcount == 0.
// - Another thread is fbl_recycling this Vnode, but has not
// yet resurrected it.
// - The vnode is being moved to the close cache, and is
// not yet purged.
// It is not safe for us to attempt to Resurrect the Vnode. If
// we do so, then the caller of LookupBlob may unlink, purge, and
// destroy the Vnode concurrently before the original caller of
// "fbl_recycle" completes.
// Since the window of time for this condition is extremely
// small (between Release and the resurrection of the Vnode),
// and only contains a single flag check, we unlock and try
// again.
} else {
vn = VnodeUpgradeLocked(key);
if (vn != nullptr) {
if (out != nullptr) {
*out = fbl::move(vn);
return ZX_OK;
// Attaches |vmo| to the block device behind Fd() via ioctl_block_attach_vmo().
//
// A duplicate of |vmo| (same rights) is what gets passed to the device, so
// the caller's handle is left untouched. On success the ioctl's output is
// written to |out| and ZX_OK is returned; otherwise the failing status from
// the duplication or the ioctl is returned.
zx_status_t Blobfs::AttachVmo(zx_handle_t vmo, vmoid_t* out) {
    zx_handle_t xfer_vmo;
    zx_status_t status = zx_handle_duplicate(vmo, ZX_RIGHT_SAME_RIGHTS, &xfer_vmo);
    if (status != ZX_OK) {
        return status;
    }

    ssize_t r = ioctl_block_attach_vmo(Fd(), &xfer_vmo, out);
    if (r < 0) {
        // The attach failed, so nobody else owns the duplicate; close it here
        // instead of leaking the handle.
        zx_handle_close(xfer_vmo);
        return static_cast<zx_status_t>(r);
    }
    return ZX_OK;
}
// Detaches the VMO identified by |vmoid| from the block device by issuing a
// single BLOCKIO_CLOSE_VMO request through Transaction(). Returns whatever
// Transaction() returns.
zx_status_t Blobfs::DetachVmo(vmoid_t vmoid) {
    block_fifo_request_t request;
    // Tag the request with this filesystem's block group.
    request.group = BlockGroupID();
    request.vmoid = vmoid;
    request.opcode = BLOCKIO_CLOSE_VMO;
    return Transaction(&request, 1);
}
// Grows the node map by one FVM slice.
//
// Visible steps: ask FVM to extend the node-map region by one slice, grow
// node_map_ to the new block count, update vslice_count / ino_slices /
// inode_count in info_, zero the newly added node blocks in memory, and
// enqueue those blocks on a WritebackWork.
//
// NOTE(review): this definition appears to be missing lines in this view
// (closing braces and the statements inside several error branches), so the
// failure paths and what finally happens to the WritebackWork are not
// documented here.
zx_status_t Blobfs::AddInodes() {
TRACE_DURATION("blobfs", "Blobfs::AddInodes");
// NOTE(review): the body of this FVM-flag check is not visible.
if (!(info_.flags & kBlobFlagFVM)) {
const size_t kBlocksPerSlice = info_.slice_size / kBlobfsBlockSize;
extend_request_t request;
// Request a single slice located right after the slices already counted in
// info_.ino_slices.
request.length = 1;
request.offset = (kFVMNodeMapStart / kBlocksPerSlice) + info_.ino_slices;
if (ioctl_block_fvm_extend(Fd(), &request) < 0) {
fprintf(stderr, "Blobfs::AddInodes fvm_extend failure");
// New totals after the extension: the inode count first, then the number of
// node-map blocks (rounded up) for both the new and the current inode count.
const uint32_t kInodesPerSlice = static_cast<uint32_t>(info_.slice_size / kBlobfsInodeSize);
uint64_t inodes64 = (info_.ino_slices + static_cast<uint32_t>(request.length)) * kInodesPerSlice;
ZX_DEBUG_ASSERT(inodes64 <= fbl::numeric_limits<uint32_t>::max());
uint32_t inodes = static_cast<uint32_t>(inodes64);
uint32_t inoblks = (inodes + kBlobfsInodesPerBlock - 1) / kBlobfsInodesPerBlock;
ZX_DEBUG_ASSERT(info_.inode_count <= fbl::numeric_limits<uint32_t>::max());
uint32_t inoblks_old = (static_cast<uint32_t>(info_.inode_count) + kBlobfsInodesPerBlock - 1) / kBlobfsInodesPerBlock;
ZX_DEBUG_ASSERT(inoblks_old <= inoblks);
// Resize the in-memory node map to cover the new number of blocks.
if (node_map_->Grow(inoblks * kBlobfsBlockSize) != ZX_OK) {
// Record the extra slice and the new inode count in the superblock copy.
info_.vslice_count += request.length;
info_.ino_slices += static_cast<uint32_t>(request.length);
info_.inode_count = inodes;
// Reset new inodes to 0
uintptr_t addr = reinterpret_cast<uintptr_t>(node_map_->GetData());
memset(reinterpret_cast<void*>(addr + kBlobfsBlockSize * inoblks_old), 0,
(kBlobfsBlockSize * (inoblks - inoblks_old)));
zx_status_t status;
fbl::unique_ptr<WritebackWork> wb;
if ((status = CreateWork(&wb, nullptr)) != ZX_OK) {
return status;
// Enqueue only the newly added node-map blocks: the range starts at block
// inoblks_old and spans (inoblks - inoblks_old) blocks.
wb.get()->Enqueue(node_map_->GetVmo(), inoblks_old, NodeMapStartBlock(info_) + inoblks_old,
inoblks - inoblks_old);
return ZX_OK;
// Grows the data region by enough FVM slices to add |nblocks| blocks.
//
// Visible steps: compute the number of slices and the resulting block and
// bitmap-block totals, extend the data region through FVM, grow block_map_,
// enqueue any newly added bitmap blocks on a WritebackWork, and update
// vslice_count / dat_slices / block_count in info_.
//
// NOTE(review): this definition appears to be missing lines in this view
// (closing braces and the statements inside several error branches), so the
// failure paths and what finally happens to the WritebackWork are not
// documented here.
zx_status_t Blobfs::AddBlocks(size_t nblocks) {
TRACE_DURATION("blobfs", "Blobfs::AddBlocks", "nblocks", nblocks);
// NOTE(review): the body of this FVM-flag check is not visible.
if (!(info_.flags & kBlobFlagFVM)) {
const size_t kBlocksPerSlice = info_.slice_size / kBlobfsBlockSize;
extend_request_t request;
// Number of slices required to add nblocks
request.length = (nblocks + kBlocksPerSlice - 1) / kBlocksPerSlice;
// The new slices go right after the slices already counted in
// info_.dat_slices.
request.offset = (kFVMDataStart / kBlocksPerSlice) + info_.dat_slices;
// Block total after the extension, and the number of bitmap blocks needed
// (rounded up) for the new and the current block count.
uint64_t blocks64 = (info_.dat_slices + request.length) * kBlocksPerSlice;
ZX_DEBUG_ASSERT(blocks64 <= fbl::numeric_limits<uint32_t>::max());
uint32_t blocks = static_cast<uint32_t>(blocks64);
uint32_t abmblks = (blocks + kBlobfsBlockBits - 1) / kBlobfsBlockBits;
uint64_t abmblks_old = (info_.block_count + kBlobfsBlockBits - 1) / kBlobfsBlockBits;
ZX_DEBUG_ASSERT(abmblks_old <= abmblks);
// The bitmap is limited to one slice's worth of blocks here.
if (abmblks > kBlocksPerSlice) {
//TODO(planders): Allocate more slices for the block bitmap.
fprintf(stderr, "Blobfs::AddBlocks needs to increase block bitmap size\n");
if (ioctl_block_fvm_extend(Fd(), &request) < 0) {
fprintf(stderr, "Blobfs::AddBlocks FVM Extend failure\n");
// Grow the block bitmap to hold new number of blocks
if (block_map_.Grow(fbl::round_up(blocks, kBlobfsBlockBits)) != ZX_OK) {
// Grow before shrinking to ensure the underlying storage is a multiple
// of kBlobfsBlockSize.
zx_status_t status;
fbl::unique_ptr<WritebackWork> wb;
if ((status = CreateWork(&wb, nullptr)) != ZX_OK) {
return status;
// Since we are extending the bitmap, we need to fill the expanded
// portion of the allocation block bitmap with zeroes.
if (abmblks > abmblks_old) {
uint64_t vmo_offset = abmblks_old;
uint64_t dev_offset = BlockMapStartBlock(info_) + abmblks_old;
uint64_t length = abmblks - abmblks_old;
wb.get()->Enqueue(block_map_.StorageUnsafe()->GetVmo(), vmo_offset, dev_offset, length);
// Record the extra slices and the new block count in the superblock copy.
info_.vslice_count += request.length;
info_.dat_slices += static_cast<uint32_t>(request.length);
info_.block_count = blocks;
return ZX_OK;
// Sync entry point taking a completion callback |closure|.
//
// NOTE(review): only the creation of a WritebackWork is visible in this view;
// the failure branch and the rest of the body (including any use of
// |closure|) are missing, so they are not described here.
void Blobfs::Sync(SyncCallback closure) {
zx_status_t status;
fbl::unique_ptr<WritebackWork> wb;
if ((status = CreateWork(&wb, nullptr)) != ZX_OK) {
// Records blob-allocation metrics: adds |size_data| to the total size of
// created blobs and |duration| to the total allocation time. Does nothing
// unless CollectingMetrics() is true.
void Blobfs::UpdateAllocationMetrics(uint64_t size_data, const fs::Duration& duration) {
    if (CollectingMetrics()) {
        metrics_.blobs_created_total_size += size_data;
        metrics_.total_allocation_time_ticks += duration;
    }
}
// Records blob-lookup metrics: adds |size| to the total size of opened blobs.
// Does nothing unless CollectingMetrics() is true.
void Blobfs::UpdateLookupMetrics(uint64_t size) {
    if (CollectingMetrics()) {
        metrics_.blobs_opened_total_size += size;
    }
}
// Records client-write metrics: adds |data_size| and |merkle_size| to the
// written-bytes counters, and |enqueue_duration| / |generate_duration| to the
// write-enqueue and merkle-generation time totals. Does nothing unless
// CollectingMetrics() is true.
void Blobfs::UpdateClientWriteMetrics(uint64_t data_size, uint64_t merkle_size,
                                      const fs::Duration& enqueue_duration,
                                      const fs::Duration& generate_duration) {
    if (CollectingMetrics()) {
        metrics_.data_bytes_written += data_size;
        metrics_.merkle_bytes_written += merkle_size;
        metrics_.total_write_enqueue_time_ticks += enqueue_duration;
        metrics_.total_merkle_generation_time_ticks += generate_duration;
    }
}
// Records write-back metrics: adds |duration| to the total write-back time
// and |size| to the total bytes written back. Does nothing unless
// CollectingMetrics() is true.
void Blobfs::UpdateWritebackMetrics(uint64_t size, const fs::Duration& duration) {
    if (CollectingMetrics()) {
        metrics_.total_writeback_time_ticks += duration;
        metrics_.total_writeback_bytes_written += size;
    }
}
// Records disk-read metrics: adds |duration| to the total read-from-disk time
// and |size| to the bytes read from disk. Does nothing unless
// CollectingMetrics() is true.
void Blobfs::UpdateMerkleDiskReadMetrics(uint64_t size, const fs::Duration& duration) {
    if (CollectingMetrics()) {
        metrics_.total_read_from_disk_time_ticks += duration;
        metrics_.bytes_read_from_disk += size;
    }
}
// Records decompression metrics: adds |size_compressed| and
// |size_uncompressed| to the compressed-read and decompressed byte counters,
// and |read_duration| / |decompress_duration| to the matching time totals.
// Does nothing unless CollectingMetrics() is true.
void Blobfs::UpdateMerkleDecompressMetrics(uint64_t size_compressed,
                                           uint64_t size_uncompressed,
                                           const fs::Duration& read_duration,
                                           const fs::Duration& decompress_duration) {
    if (CollectingMetrics()) {
        metrics_.bytes_compressed_read_from_disk += size_compressed;
        metrics_.bytes_decompressed_from_disk += size_uncompressed;
        metrics_.total_read_compressed_time_ticks += read_duration;
        metrics_.total_decompress_time_ticks += decompress_duration;
    }
}
// Records verification metrics: adds |size_data| and |size_merkle| to the
// verified-size counters and |duration| to the total verification time. Does
// nothing unless CollectingMetrics() is true.
void Blobfs::UpdateMerkleVerifyMetrics(uint64_t size_data, uint64_t size_merkle,
                                       const fs::Duration& duration) {
    if (CollectingMetrics()) {
        metrics_.blobs_verified_total_size_data += size_data;
        metrics_.blobs_verified_total_size_merkle += size_merkle;
        metrics_.total_verification_time_ticks += duration;
    }
}
// Takes ownership of the block device descriptor |fd| and stores a private
// copy of the superblock pointed to by |info| in info_.
Blobfs::Blobfs(fbl::unique_fd fd, const blobfs_info_t* info)
    : blockfd_(fbl::move(fd)) {
    memcpy(&info_, info, sizeof(blobfs_info_t));
}
// Destructor. Drops the writeback_ member first by resetting it to nullptr.
//
// NOTE(review): the body of the `blockfd_` check and the closing braces are
// not visible in this view, so the remaining teardown is not described here.
Blobfs::~Blobfs() {
writeback_ = nullptr;
if (blockfd_) {
// Creates a Blobfs instance on top of the block device |fd| using the
// superblock |info|.
//
// Steps, in order: validate |info|; query the block device and obtain its
// FIFO; size the in-memory block bitmap and node map; attach their VMOs to
// the device; load both maps from disk; create and attach the superblock VMO;
// create the filesystem id; and initialize the vnodes.
//
// On success ownership of the new instance is moved into |*out| and ZX_OK is
// returned. On failure the status of the first failing step is returned and
// |*out| is left untouched.
zx_status_t Blobfs::Create(fbl::unique_fd fd, const blobfs_info_t* info,
                           fbl::unique_ptr<Blobfs>* out) {
    TRACE_DURATION("blobfs", "Blobfs::Create");
    zx_status_t status = blobfs_check_info(info, TotalBlocks(*info));
    if (status < 0) {
        fprintf(stderr, "blobfs: Check info failure\n");
        return status;
    }

    // Allocate through the AllocChecker so an allocation failure is reported
    // instead of being dereferenced below.
    fbl::AllocChecker ac;
    auto fs = fbl::unique_ptr<Blobfs>(new (&ac) Blobfs(fbl::move(fd), info));
    if (!ac.check()) {
        return ZX_ERR_NO_MEMORY;
    }

    zx::fifo fifo;
    ssize_t r;
    if ((r = ioctl_block_get_info(fs->Fd(), &fs->block_info_)) < 0) {
        return static_cast<zx_status_t>(r);
    } else if (kBlobfsBlockSize % fs->block_info_.block_size != 0) {
        // The blobfs block size must be a whole multiple of the device's.
        return ZX_ERR_IO;
    } else if ((r = ioctl_block_get_fifos(fs->Fd(), fifo.reset_and_get_address())) < 0) {
        fprintf(stderr, "Failed to mount blobfs: Someone else is using the block device\n");
        return static_cast<zx_status_t>(r);
    }

    if ((status = block_client::Client::Create(fbl::move(fifo), &fs->fifo_client_)) != ZX_OK) {
        return status;
    }

    // Keep the block_map_ aligned to a block multiple
    if ((status = fs->block_map_.Reset(BlockMapBlocks(fs->info_) * kBlobfsBlockBits)) < 0) {
        fprintf(stderr, "blobfs: Could not reset block bitmap\n");
        return status;
    } else if ((status = fs->block_map_.Shrink(fs->info_.block_count)) < 0) {
        fprintf(stderr, "blobfs: Could not shrink block bitmap\n");
        return status;
    }

    // The node map must cover exactly NodeMapBlocks() whole blocks.
    size_t nodemap_size = kBlobfsInodeSize * fs->info_.inode_count;
    ZX_DEBUG_ASSERT(fbl::round_up(nodemap_size, kBlobfsBlockSize) == nodemap_size);
    ZX_DEBUG_ASSERT(nodemap_size / kBlobfsBlockSize == NodeMapBlocks(fs->info_));

    if ((status = fzl::MappedVmo::Create(nodemap_size, "nodemap", &fs->node_map_)) != ZX_OK) {
        return status;
    } else if ((status = fs->AttachVmo(fs->block_map_.StorageUnsafe()->GetVmo(),
                                       &fs->block_map_vmoid_)) != ZX_OK) {
        return status;
    } else if ((status = fs->AttachVmo(fs->node_map_->GetVmo(),
                                       &fs->node_map_vmoid_)) != ZX_OK) {
        return status;
    } else if ((status = fs->LoadBitmaps()) < 0) {
        fprintf(stderr, "blobfs: Failed to load bitmaps: %d\n", status);
        return status;
    } else if ((status = fzl::MappedVmo::Create(kBlobfsBlockSize, "blobfs-superblock",
                                                &fs->info_vmo_)) != ZX_OK) {
        fprintf(stderr, "blobfs: Failed to create info vmo: %d\n", status);
        return status;
    } else if ((status = fs->AttachVmo(fs->info_vmo_->GetVmo(),
                                       &fs->info_vmoid_)) != ZX_OK) {
        fprintf(stderr, "blobfs: Failed to attach info vmo: %d\n", status);
        return status;
    } else if ((status = fs->CreateFsId()) != ZX_OK) {
        fprintf(stderr, "blobfs: Failed to create fs_id: %d\n", status);
        return status;
    } else if ((status = fs->InitializeVnodes()) != ZX_OK) {
        // The comparison must sit outside the assignment's parentheses;
        // otherwise |status| receives the boolean result (1) and the real
        // error code is lost.
        fprintf(stderr, "blobfs: Failed to initialize Vnodes\n");
        return status;
    }

    *out = fbl::move(fs);
    return ZX_OK;
}
// Walks the node table while holding hash_lock_ and, for every node whose
// start_block is at least kStartBlockMinimum, creates a VnodeBlob keyed by the
// node's merkle root digest and inserts it into the closed map via
// VnodeInsertClosedLocked(). A failed insertion is reported as a duplicate
// node and its status is returned.
//
// NOTE(review): this definition appears to be missing lines in this view
// (closing braces, the body of the allocation-failure branch, and possibly
// other statements -- |size| is computed but no use of it is visible).
zx_status_t Blobfs::InitializeVnodes() {
fbl::AutoLock lock(&hash_lock_);
for (size_t i = 0; i < info_.inode_count; ++i) {
const blobfs_inode_t* inode = GetNode(i);
if (inode->start_block >= kStartBlockMinimum) {
fbl::AllocChecker ac;
Digest digest(inode->merkle_root_hash);
fbl::RefPtr<VnodeBlob> vn = fbl::AdoptRef(new (&ac) VnodeBlob(this, digest));
// NOTE(review): the allocation-failure handling is not visible here.
if (!ac.check()) {
// Delay reading any data from disk until read.
size_t size = vn->SizeData();
zx_status_t status = VnodeInsertClosedLocked(fbl::move(vn));
if (status != ZX_OK) {
// Render the digest as text for the diagnostic below.
char name[digest::Digest::kLength * 2 + 1];
digest.ToString(name, sizeof(name));
fprintf(stderr, "blobfs: CORRUPTED FILESYSTEM: Duplicate node: "
"%s @ index %zu\n", name, i);
return status;
return ZX_OK;
// Removes |vn| from the open map while holding hash_lock_. Asserts that an
// entry with |vn|'s key was present in open_hash_.
void Blobfs::VnodeReleaseHard(VnodeBlob* vn) {
    fbl::AutoLock lock(&hash_lock_);
    ZX_ASSERT(open_hash_.erase(vn->GetKey()) != nullptr);
}
// Moves |raw_vn| from the open map to the closed map while holding
// hash_lock_. Asserts that the vnode was present in open_hash_ and that
// VnodeInsertClosedLocked() succeeds.
void Blobfs::VnodeReleaseSoft(VnodeBlob* raw_vn) {
    fbl::AutoLock lock(&hash_lock_);
    // Wrap the raw pointer in a RefPtr without adopting it, so it can be
    // handed to VnodeInsertClosedLocked().
    fbl::RefPtr<VnodeBlob> vn = fbl::internal::MakeRefPtrNoAdopt(raw_vn);
    ZX_ASSERT(open_hash_.erase(raw_vn->GetKey()) != nullptr);
    ZX_ASSERT(VnodeInsertClosedLocked(fbl::move(vn)) == ZX_OK);
}
// Inserts |vn| into closed_hash_. After a successful insertion the RefPtr's
// reference is deliberately leaked, because the map stores a raw pointer.
//
// NOTE(review): the statements inside the insertion-failure branch and the
// closing braces are not visible in this view; the failure return value is
// therefore not documented here. The "Locked" suffix suggests the caller must
// hold hash_lock_ -- confirm against the header.
zx_status_t Blobfs::VnodeInsertClosedLocked(fbl::RefPtr<VnodeBlob> vn) {
// To exist in the closed_hash_, this RefPtr must be leaked.
if (!closed_hash_.insert_or_find(vn.get())) {
// Set blob state to "Purged" so we do not try to add it to the cached map on recycle.
__UNUSED auto leak = vn.leak_ref();
return ZX_OK;
// Removes the vnode keyed by |key| from closed_hash_ and returns it as a
// RefPtr, or returns nullptr when no such entry exists. Debug-asserts that
// |key| is not present in open_hash_.
//
// The "Locked" suffix suggests the caller must hold hash_lock_ (LookupBlob
// calls this with the lock held) -- confirm against the header.
fbl::RefPtr<VnodeBlob> Blobfs::VnodeUpgradeLocked(const uint8_t* key) {
    ZX_DEBUG_ASSERT(open_hash_.find(key).CopyPointer() == nullptr);
    VnodeBlob* raw_vn = closed_hash_.erase(key);
    if (raw_vn == nullptr) {
        return nullptr;
    }
    // To have existed in the closed_hash_, this RefPtr must have
    // been leaked.
    return fbl::internal::MakeRefPtrNoAdopt(raw_vn);
}
// Allocates a VnodeBlob using the constructor that takes only the owning
// Blobfs, opens it, and returns it through |out|.
//
// Returns ZX_ERR_NO_MEMORY if the allocation fails, the status from Open() if
// opening fails, and ZX_OK otherwise. |*out| is written only on success.
zx_status_t Blobfs::OpenRootNode(fbl::RefPtr<VnodeBlob>* out) {
    fbl::AllocChecker ac;
    fbl::RefPtr<VnodeBlob> vn =
        fbl::AdoptRef(new (&ac) VnodeBlob(this));
    if (!ac.check()) {
        // Never touch |vn| when the allocation did not succeed.
        return ZX_ERR_NO_MEMORY;
    }

    zx_status_t status = vn->Open(0, nullptr);
    if (status != ZX_OK) {
        return status;
    }

    *out = fbl::move(vn);
    return ZX_OK;
}
// Reads the block bitmap and the node map from disk in one read transaction.
// The bitmap region goes into the VMO registered as block_map_vmoid_ and the
// node-map region into the one registered as node_map_vmoid_, both at offset
// 0. Returns the status of the transaction.
zx_status_t Blobfs::LoadBitmaps() {
    TRACE_DURATION("blobfs", "Blobfs::LoadBitmaps");
    fs::ReadTxn txn(this);
    txn.Enqueue(block_map_vmoid_, 0, BlockMapStartBlock(info_), BlockMapBlocks(info_));
    txn.Enqueue(node_map_vmoid_, 0, NodeMapStartBlock(info_), NodeMapBlocks(info_));
    return txn.Transact();
}
// Reads the superblock from block 0 of |blockfd|, validates it, and builds a
// Blobfs instance on that device.
//
// Validation covers the superblock against the device's block count
// (blobfs_check_info) and against FVM (CheckFvmConsistency). On success the
// new instance is stored in |*out| and ZX_OK is returned; on failure a
// message is printed to stderr and the failing status is returned.
zx_status_t blobfs_create(fbl::unique_ptr<Blobfs>* out, fbl::unique_fd blockfd) {
    zx_status_t status;

    // The buffer is viewed as a blobfs_info_t below, so give it that type's
    // alignment.
    alignas(blobfs_info_t) char block[kBlobfsBlockSize];
    if ((status = readblk(blockfd.get(), 0, static_cast<void*>(block))) < 0) {
        fprintf(stderr, "blobfs: could not read info block\n");
        return status;
    }
    blobfs_info_t* info = reinterpret_cast<blobfs_info_t*>(&block[0]);

    uint64_t blocks;
    if ((status = blobfs_get_blockcount(blockfd.get(), &blocks)) != ZX_OK) {
        fprintf(stderr, "blobfs: cannot find end of underlying device\n");
        return status;
    }

    if ((status = blobfs_check_info(info, blocks)) != ZX_OK) {
        fprintf(stderr, "blobfs: Info check failed\n");
        return status;
    }

    if ((status = CheckFvmConsistency(info, blockfd.get())) != ZX_OK) {
        fprintf(stderr, "blobfs: FVM info check failed\n");
        return status;
    }

    if ((status = Blobfs::Create(fbl::move(blockfd), info, out)) != ZX_OK) {
        fprintf(stderr, "blobfs: mount failed; could not create blobfs\n");
        return status;
    }

    return ZX_OK;
}
// Mounts blobfs from |blockfd| and serves its root directory over |root|.
//
// Visible steps: build the filesystem with blobfs_create(), call
// InitializeWriteback(), open the root vnode, and pass it together with |root|
// to ServeDirectory(). On success the Blobfs object is intentionally released
// from its unique_ptr (see the comment near the end). Any failing step
// returns its status.
//
// NOTE(review): lines appear to be missing in this view -- the closing braces,
// the body of the `options->metrics` check, and any use of |dispatcher| and
// |on_unmount| are not visible.
zx_status_t blobfs_mount(async_dispatcher_t* dispatcher, fbl::unique_fd blockfd,
const blob_options_t* options, zx::channel root,
fbl::Closure on_unmount) {
zx_status_t status;
fbl::unique_ptr<Blobfs> fs;
if ((status = blobfs_create(&fs, fbl::move(blockfd))) != ZX_OK) {
return status;
if ((status = fs->InitializeWriteback()) != ZX_OK) {
return status;
// NOTE(review): what happens when metrics are requested is not visible here.
if (options->metrics) {
fbl::RefPtr<VnodeBlob> vn;
if ((status = fs->OpenRootNode(&vn)) != ZX_OK) {
fprintf(stderr, "blobfs: mount failed; could not get root blob\n");
return status;
if ((status = fs->ServeDirectory(fbl::move(vn), fbl::move(root))) != ZX_OK) {
fprintf(stderr, "blobfs: mount failed; could not serve root directory\n");
return status;
// Shutdown is now responsible for deleting the Blobfs object.
__UNUSED auto r = fs.release();
return ZX_OK;
} // namespace blobfs