blob: c5edb5e686949697599225f6409bedb623e4d882 [file] [log] [blame]
// Copyright 2019 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include <zircon/types.h>
#include <optional>
#include <fbl/vector.h>
#include <fs/journal/format.h>
#include <fs/journal/replay.h>
#include <fs/journal/superblock.h>
#include <fs/transaction/writeback.h>
#include <storage/operation/buffered-operation.h>
#include "entry_view.h"
#include "replay_tree.h"
namespace fs {
namespace {
// Computes the length, in blocks, of the journal entry described by |header|.
//
// The result is the header's payload block count plus |kEntryMetadataBlocks|.
// Returns 0 when that length cannot be used: the sum overflows, the payload is
// empty, or the entry would be larger than the capacity of |journal_buffer|.
uint64_t ParseEntryLength(const storage::VmoBuffer* journal_buffer,
                          const JournalHeaderView& header) {
  const auto payload_blocks = header.PayloadBlocks();

  uint64_t total_blocks = 0;
  if (unlikely(add_overflow(payload_blocks, kEntryMetadataBlocks, &total_blocks))) {
    return 0;
  }
  // Zero-length entries are disallowed.
  if (payload_blocks == 0) {
    return 0;
  }
  // So are entries larger than the journal buffer itself.
  if (total_blocks > journal_buffer->capacity()) {
    return 0;
  }
  ZX_DEBUG_ASSERT(total_blocks != 0);
  return total_blocks;
}
// Attempts to parse the journal entry that begins at block |start| of |journal_buffer|.
//
// An entry is accepted only if all of the following hold:
//  - the block at |start| parses as a header for |sequence_number|;
//  - ParseEntryLength() accepts the length advertised by that header;
//  - the footer carries |kJournalEntryMagic| and the same sequence number as the header;
//  - the footer checksum equals the checksum calculated by the entry view.
//
// On success, payload blocks that were encoded (escaped) are decoded through
// DecodePayloadBlocks() and the entry view is returned. Returns std::nullopt if any of
// the checks above fails.
std::optional<const JournalEntryView> ParseEntry(storage::VmoBuffer* journal_buffer, uint64_t start,
                                                 uint64_t sequence_number) {
  // To know how much of the journal we need to parse, first observe only the header.
  storage::BlockBufferView small_view(journal_buffer, start, kJournalEntryHeaderBlocks);
  const auto header = JournalHeaderView::Create(
      fbl::Span<uint8_t>(static_cast<uint8_t*>(small_view.Data(0)), small_view.BlockSize()),
      sequence_number);
  // This is not a header block.
  if (header.is_error()) {
    return std::nullopt;
  }
  const uint64_t entry_length = ParseEntryLength(journal_buffer, header.value());
  if (!entry_length) {
    return std::nullopt;
  }

  // Looks good enough. Create a JournalEntryView that now includes the footer.
  storage::BlockBufferView view(journal_buffer, start, entry_length);
  JournalEntryView entry_view(std::move(view));
  // Validation only reads the entry, so go through a const reference. Adding constness
  // is an implicit conversion; no cast is required.
  const JournalEntryView& const_entry_view = entry_view;

  // Validate the footer.
  if (const_entry_view.footer()->prefix.magic != kJournalEntryMagic) {
    return std::nullopt;
  }
  if (header.value().SequenceNumber() != const_entry_view.footer()->prefix.sequence_number) {
    return std::nullopt;
  }
  // Validate the contents of the entry itself.
  if (const_entry_view.footer()->checksum != const_entry_view.CalculateChecksum()) {
    return std::nullopt;
  }

  // Decode any blocks within the entry which were previously encoded (escaped).
  //
  // This way, the internal details of on-disk journal storage are hidden from the public
  // API of parsing entries.
  entry_view.DecodePayloadBlocks();
  return entry_view;
}
// Reports whether the block following the entry at |start| parses as a journal header
// carrying |sequence_number| + 1.
//
// WARNING: The entry at |start| is (intentionally) not validated. Only its header is
// parsed, and only so its length can be used to locate the entry that follows it.
// Returns false when that header or its length cannot be parsed, because the following
// entry cannot be located in that case.
bool IsSubsequentEntryValid(storage::VmoBuffer* journal_buffer, uint64_t start,
                            uint64_t sequence_number) {
  storage::BlockBufferView header_view(journal_buffer, start, kJournalEntryHeaderBlocks);
  auto* const header_bytes = reinterpret_cast<uint8_t*>(header_view.Data(0));
  const auto current_header = JournalHeaderView::Create(
      fbl::Span<uint8_t>(header_bytes, header_view.BlockSize()), sequence_number);
  if (current_header.is_error()) {
    // Not a header, so there is no way to find the subsequent entry.
    return false;
  }

  const uint64_t current_length = ParseEntryLength(journal_buffer, current_header.value());
  if (current_length == 0) {
    // Without a usable length the subsequent entry cannot be checked.
    // Two corrupted neighboring entries are treated as an interruption.
    return false;
  }

  // Step past the current entry, wrapping around the end of the journal buffer.
  const uint64_t next_start = (start + current_length) % journal_buffer->capacity();
  auto* const next_bytes = reinterpret_cast<uint8_t*>(journal_buffer->Data(next_start));
  const auto next_header = JournalHeaderView::Create(
      fbl::Span<uint8_t>(next_bytes, journal_buffer->BlockSize()), sequence_number + 1);
  return next_header.is_ok();
}
// Inserts one single-block write operation into |operation_tree| for every payload block
// of |entry|.
//
// Each operation reads from the payload block's position inside |journal_buffer|
// (|entry_start| plus the header blocks plus the payload index, wrapped at the buffer's
// capacity) and targets the device block the entry header records for that index.
void ParseBlocks(const storage::VmoBuffer& journal_buffer, const JournalEntryView& entry,
                 uint64_t entry_start, ReplayTree* operation_tree) {
  const auto& header = entry.header();
  const auto payload_blocks = header.PayloadBlocks();
  const auto first_payload_block = entry_start + kJournalEntryHeaderBlocks;

  // These fields are identical for every block of the entry; fill them in once.
  storage::BufferedOperation write;
  write.vmoid = journal_buffer.vmoid();
  write.op.type = storage::OperationType::kWrite;
  write.op.length = 1;

  for (uint32_t block = 0; block < payload_blocks; ++block) {
    write.op.vmo_offset = (first_payload_block + block) % journal_buffer.capacity();
    write.op.dev_offset = header.TargetBlock(block);
    operation_tree->insert(write);
  }
}
} // namespace
// Walks the journal held in |journal_buffer|, starting at the position and sequence number
// recorded in |info|, and collects the write operations needed to replay it.
//
// Entries are parsed one after another until one fails to parse.
//
// On success returns ZX_OK and:
//  - appends the collected operations to |operations|;
//  - stores the sequence number following the last parsed entry in |out_sequence_number|;
//  - stores the block offset following the last parsed entry in |out_start|.
//
// Errors (none of the output arguments are written in these cases):
//  - the status of |info->Validate()| if the superblock does not validate;
//  - ZX_ERR_IO_DATA_INTEGRITY if |info->start()| is at or beyond the journal capacity, or
//    if an entry fails to parse while the entry after it still looks valid;
//  - ZX_ERR_NOT_SUPPORTED if a revocation entry is encountered.
zx_status_t ParseJournalEntries(const JournalSuperblock* info, storage::VmoBuffer* journal_buffer,
                                fbl::Vector<storage::BufferedOperation>* operations,
                                uint64_t* out_sequence_number, uint64_t* out_start) {
  // |info| must validate before any of its fields are trusted.
  const zx_status_t validate_status = info->Validate();
  if (validate_status != ZX_OK) {
    FS_TRACE_ERROR("Journal Superblock does not validate: %d\n", validate_status);
    return validate_status;
  }
  if (info->start() >= journal_buffer->capacity()) {
    FS_TRACE_ERROR("Journal entries start beyond end of journal capacity (%zu vs %zu)\n",
                   info->start(), journal_buffer->capacity());
    return ZX_ERR_IO_DATA_INTEGRITY;
  }

  // Parse the journal from the recorded start, collecting as many entries as possible.
  uint64_t next_start = info->start();
  uint64_t next_sequence = info->sequence_number();
  FS_TRACE_INFO("replay: entry_start: %zu, sequence_number: %zu\n", next_start, next_sequence);

  ReplayTree pending_writes;
  for (;;) {
    // Parsing is expected to fail eventually; that is how the end of the journal is found.
    const auto entry = ParseEntry(journal_buffer, next_start, next_sequence);
    if (!entry.has_value()) {
      // An invalid entry usually means its write was interrupted partway through.
      // If the entry after it looks valid, however, the entry at |next_start| was
      // corrupted for some unknown reason. Failing to replay committed journal entries
      // may lead to filesystem corruption, so that case is an explicit error.
      if (IsSubsequentEntryValid(journal_buffer, next_start, next_sequence)) {
        return ZX_ERR_IO_DATA_INTEGRITY;
      }
      break;
    }

    const auto& header = entry->header();
    if (header.ObjectType() == JournalObjectType::kRevocation) {
      // TODO(ZX-4752): Revocation records advise us to avoid replaying the provided
      // operations.
      //
      // We should implement this by:
      // 1) Parsing all blocks into a non-|operations| vector
      // 2) Iterate over |operations| and look for collision
      // 3) Omit the intersect
      return ZX_ERR_NOT_SUPPORTED;
    }

    // Collect every operation within this entry for replay.
    ParseBlocks(*journal_buffer, *entry, next_start, &pending_writes);

    // Advance past this entry, wrapping around the end of the journal buffer.
    const auto entry_blocks = header.PayloadBlocks() + kEntryMetadataBlocks;
    next_start = (next_start + entry_blocks) % journal_buffer->capacity();
    // The next entry must carry the sequence number after the one just seen.
    next_sequence = header.SequenceNumber() + 1;
  }

  // Hand back the next sequence number and start offset to use. The caller is
  // responsible for updating the info block, but only after all prior operations have
  // been replayed.
  *out_sequence_number = next_sequence;
  *out_start = next_start;
  for (const auto& [_, range] : pending_writes) {
    operations->push_back(range.container().operation);
  }
  return ZX_OK;
}
// Loads the journal from storage, replays any entries that parse, and updates the journal
// superblock on storage.
//
// The journal occupies |journal_length| blocks on the device starting at |journal_start|.
// The first |kJournalMetadataBlocks| of those hold the journal superblock and the rest
// hold the journal entries.
//
// Steps:
//  1) Read the superblock and the entry blocks through |transaction_handler| into buffers
//     initialized with |registry|.
//  2) Parse the entries with ParseJournalEntries().
//  3) If there are operations to replay, flush them, then write the updated superblock
//     (new start and sequence number) back to |journal_start|.
//
// If |out_journal_superblock| is non-null, the in-memory superblock is moved into it on
// success.
//
// Returns ZX_OK on success. Returns ZX_ERR_INVALID_ARGS if |journal_length| is smaller than
// |kJournalMetadataBlocks|; otherwise returns the status of the first step that fails.
zx_status_t ReplayJournal(fs::TransactionHandler* transaction_handler,
                          storage::VmoidRegistry* registry, uint64_t journal_start,
                          uint64_t journal_length, JournalSuperblock* out_journal_superblock) {
  // The subtraction below is unsigned. Without this check a too-short journal would wrap
  // around to an enormous block count for the journal buffer and the read.
  if (journal_length < kJournalMetadataBlocks) {
    FS_TRACE_ERROR("journal: Journal length (%zu blocks) is smaller than its metadata\n",
                   journal_length);
    return ZX_ERR_INVALID_ARGS;
  }
  const uint64_t journal_entry_start = journal_start + kJournalMetadataBlocks;
  const uint64_t journal_entry_blocks = journal_length - kJournalMetadataBlocks;
  FS_TRACE_DEBUG("replay: Initializing journal superblock\n");

  // Initialize and read the journal superblock and journal buffer.
  auto journal_superblock_buffer = std::make_unique<storage::VmoBuffer>();
  zx_status_t status = journal_superblock_buffer->Initialize(
      registry, kJournalMetadataBlocks, transaction_handler->FsBlockSize(), "journal-superblock");
  if (status != ZX_OK) {
    FS_TRACE_ERROR("journal: Cannot initialize journal info block: %d\n", status);
    return status;
  }

  // Initialize and read the journal itself.
  FS_TRACE_INFO("replay: Initializing journal buffer (%zu blocks)\n", journal_entry_blocks);
  storage::VmoBuffer journal_buffer;
  status = journal_buffer.Initialize(registry, journal_entry_blocks,
                                     transaction_handler->FsBlockSize(), "journal-buffer");
  if (status != ZX_OK) {
    FS_TRACE_ERROR("journal: Cannot initialize journal buffer: %d\n", status);
    return status;
  }

  // Load both buffers from storage in a single read transaction.
  FS_TRACE_DEBUG("replay: Reading from storage\n");
  fs::ReadTxn transaction(transaction_handler);
  transaction.Enqueue(journal_superblock_buffer->vmoid(), 0, journal_start, kJournalMetadataBlocks);
  transaction.Enqueue(journal_buffer.vmoid(), 0, journal_entry_start, journal_entry_blocks);
  status = transaction.Transact();
  if (status != ZX_OK) {
    FS_TRACE_ERROR("journal: Cannot load journal: %d\n", status);
    return status;
  }

  // Parse the journal, deciding which entries should be replayed.
  //
  // NOTE(ZX-4737): This current implementation of replay is built against the specification of
  // the journaling format, not against how the journaling writeback code happens to be
  // implemented. In the current implementation, "write to journal" and "write to final location"
  // are tightly coupled, so although we will replay a multi-entry journal, it is unlikely the
  // disk will end up in that state. However, this use case is supported by this replay code
  // regardless.
  fbl::Vector<storage::BufferedOperation> operations;
  uint64_t sequence_number = 0;
  uint64_t next_entry_start = 0;
  FS_TRACE_DEBUG("replay: Parsing journal entries\n");
  JournalSuperblock journal_superblock(std::move(journal_superblock_buffer));
  status = ParseJournalEntries(&journal_superblock, &journal_buffer, &operations, &sequence_number,
                               &next_entry_start);
  if (status != ZX_OK) {
    FS_TRACE_ERROR("journal: Cannot parse journal entries: %d\n", status);
    return status;
  }

  // Replay the requested journal entries, then the new header.
  if (operations.size() > 0) {
    // Update to the new sequence_number (in-memory).
    journal_superblock.Update(next_entry_start, sequence_number);
    for (const auto& op : operations) {
      FS_TRACE_INFO("replay: writing operation @ dev_offset: %zu, vmo_offset: %zu, length: %zu\n",
                    op.op.dev_offset, op.op.vmo_offset, op.op.length);
    }
    // The replayed blocks are flushed first; the superblock is only written afterwards.
    status = FlushRequests(transaction_handler, operations);
    if (status != ZX_OK) {
      FS_TRACE_ERROR("journal: Cannot replay entries: %d\n", status);
      return status;
    }
    operations.reset();

    FS_TRACE_INFO("replay: New start: %zu, sequence_number: %zu\n", next_entry_start,
                  sequence_number);
    // Write the updated superblock back to the start of the journal.
    storage::BufferedOperation operation;
    operation.vmoid = journal_superblock.buffer().vmoid();
    operation.op.type = storage::OperationType::kWrite;
    operation.op.vmo_offset = 0;
    operation.op.dev_offset = journal_start;
    operation.op.length = kJournalMetadataBlocks;
    operations.push_back(std::move(operation));
    status = FlushRequests(transaction_handler, operations);
    if (status != ZX_OK) {
      FS_TRACE_ERROR("journal: Cannot update journal superblock: %d\n", status);
      return status;
    }
  } else {
    FS_TRACE_DEBUG("replay: Not replaying entries\n");
  }

  // The output superblock is optional.
  if (out_journal_superblock) {
    *out_journal_superblock = std::move(journal_superblock);
  }
  return ZX_OK;
}
} // namespace fs