/*
* Copyright (C) 2021 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "snapuserd.h"
#include <csignal>
#include <optional>
#include <set>
#include <snapuserd/snapuserd_client.h>
namespace android {
namespace snapshot {
using namespace android;
using namespace android::dm;
using android::base::unique_fd;
#define SNAP_LOG(level) LOG(level) << misc_name_ << ": "
#define SNAP_PLOG(level) PLOG(level) << misc_name_ << ": "
/*
* Merging a copy operation involves the following flow:
*
* 1: dm-snapshot layer requests merge for a 4k block. dm-user sends the request
* to the daemon
* 2: daemon reads the source block
* 3: daemon copies the source data
* 4: IO completion sent back to dm-user (a switch from user space to kernel)
* 5: dm-snapshot merges the data to base device
* 6: dm-snapshot sends the merge-completion IO to dm-user
* 7: dm-user re-directs the merge completion IO to daemon (one more switch)
* 8: daemon updates the COW file about the completed merge request (a write
* syscall followed by an fsync)
* 9: daemon sends the IO completion back to dm-user
*
* The above sequence incurs significant overhead, especially when merging one
* 4k block at a time.
*
* The read-ahead layer optimizes this path by reading the data from the base
* device in the background so that the merging thread can retrieve the data
* from the read-ahead cache. Additionally, syncing of merged data is deferred
* to the read-ahead thread, thereby keeping the IO path free of that
* bottleneck.
*
* We create a scratch space of 2MB to store the read-ahead data in the COW
* device.
*
* +-----------------------+
* | Header (fixed) |
* +-----------------------+
* | Scratch space | <-- 2MB
* +-----------------------+
*
* Scratch space is as follows:
*
* +-----------------------+
* | Metadata | <- 4k page
* +-----------------------+
* | Metadata | <- 4k page
* +-----------------------+
* | |
* | Read-ahead data |
* | |
* +-----------------------+
*
* State transitions and communication between read-ahead thread and worker
* thread during merge:
* =====================================================================
*
* Worker Threads Read-Ahead thread
* ------------------------------------------------------------------
*
* |
* |
* --> -----------------READ_AHEAD_BEGIN------------->|
* | | | READ_AHEAD_IN_PROGRESS
* | WAIT |
* | | |
* | |<-----------------IO_IN_PROGRESS---------------
* | | |
* | | IO_IN_PROGRESS WAIT
* | | |
* |<--| |
* | |
* ------------------IO_TERMINATED--------------->|
* END
*
*
* ===================================================================
*
* Example:
*
* We have 6 copy operations to be executed in an OTA and there is an overlap.
* Update-engine will write to the COW file as follows:
*
* Op-1: 20 -> 23
* Op-2: 19 -> 22
* Op-3: 18 -> 21
* Op-4: 17 -> 20
* Op-5: 16 -> 19
* Op-6: 15 -> 18
*
* The read-ahead thread will read all 6 source blocks and store the data in the
* scratch space. Metadata will contain the destination block numbers. Thus,
* scratch space will look something like this:
*
* +--------------+
* | Block 23 |
* | offset - 1 |
* +--------------+
* | Block 22 |
* | offset - 2 |
* +--------------+
* | Block 21 |
* | offset - 3 |
* +--------------+
* ...
* ...
* +--------------+
* | Data-Block 20| <-- offset - 1
* +--------------+
* | Data-Block 19| <-- offset - 2
* +--------------+
* | Data-Block 18| <-- offset - 3
* +--------------+
* ...
* ...
*
* ====================================================================
* IO Path:
*
* Read-ahead will serve the data to worker threads during merge only
* after metadata and data are persisted to the scratch space. Worker
* threads during merge will always retrieve the data from the cache; if the
* cache is not populated, they will wait for the read-ahead thread to finish.
* Furthermore, the number of operations merged will be synced to the header
* only when all the blocks in the read-ahead cache are merged. In the above
* case, when all 6 operations are merged, COW Header is updated with
* num_merge_ops = 6.
*
* Merge resume after crash:
*
* Let's say we have a crash after 5 operations are merged, i.e. after
* Op-5: 16->19 is completed but before Op-6 is merged. Thus, the COW Header
* num_merge_ops will be 0 as not all the ops were merged yet. During the next
* reboot, the read-ahead thread will re-construct the data in-memory from the
* scratch space; when the merge resumes, Op-1 will be re-executed. However,
* data will be served from the read-ahead cache safely even though block 20
* was over-written by Op-4.
*
*/
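/*
 * A minimal illustrative sketch (not part of this file's logic) of the
 * offset arithmetic implied by the layout above: metadata entry "i"
 * describes the data block stored in slot "i" of the data region, which is
 * how ReadAheadIOStart() below advances metadata_offset and file_offset in
 * lockstep. The helper names are hypothetical:
 *
 *   loff_t MetadataEntryOffset(int i) { return i * sizeof(ScratchMetadata); }
 *   loff_t DataBlockOffset(loff_t data_start, int i) {
 *       // data_start corresponds to snapuserd_->GetBufferDataOffset()
 *       return data_start + i * BLOCK_SZ;
 *   }
 */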
ReadAheadThread::ReadAheadThread(const std::string& cow_device, const std::string& backing_device,
const std::string& misc_name,
std::shared_ptr<Snapuserd> snapuserd) {
cow_device_ = cow_device;
backing_store_device_ = backing_device;
misc_name_ = misc_name;
snapuserd_ = snapuserd;
}
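// CheckOverlap() flags overlap_ when a block consumed by one op in the
// current read-ahead batch is also produced by another op in the same
// batch. When overlap_ is set, ReadAheadIOCompleted(overlap_) flushes the
// cached data to the scratch space before the merge consumes it, so that a
// crash mid-merge cannot lose source data that a later merge has
// over-written (see the Op-4 / block 20 example in the header comment).
// Note that source_offset is always zero in this version, so the
// source_block + 1 checks (for sources carrying a sub-block offset) are
// currently inert.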
void ReadAheadThread::CheckOverlap(const CowOperation* cow_op) {
uint64_t source_block = cow_op->source;
uint64_t source_offset = 0;
if (dest_blocks_.count(cow_op->new_block) || source_blocks_.count(source_block) ||
(source_offset > 0 && source_blocks_.count(source_block + 1))) {
overlap_ = true;
}
dest_blocks_.insert(source_block);
if (source_offset > 0) {
dest_blocks_.insert(source_block + 1);
}
source_blocks_.insert(cow_op->new_block);
}
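/*
 * PrepareReadAhead() batches up to *pending_ops ops whose source blocks are
 * consecutive, walking the op list backwards: each subsequent op must
 * satisfy next_offset + nr_consecutive * BLOCK_SZ == *source_offset.
 *
 * Worked illustration using the 6 copy ops from the header comment
 * (offsets in units of BLOCK_SZ):
 *
 *   Op-1: source 20 -> first op; *source_offset = 20
 *   Op-2: source 19 -> 19 + 1 == 20 (consecutive)
 *   ...
 *   Op-6: source 15 -> 15 + 5 == 20 (consecutive)
 *
 * All six ops collapse into one linear read of 6 * BLOCK_SZ bytes starting
 * at block 15; ReadAheadIOStart() rewinds source_offset by
 * (linear_blocks - 1) * BLOCK_SZ before issuing that read.
 */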
void ReadAheadThread::PrepareReadAhead(uint64_t* source_offset, int* pending_ops,
std::vector<uint64_t>& blocks) {
int num_ops = *pending_ops;
int nr_consecutive = 0;
CHECK_NE(source_offset, nullptr);
if (!RAIterDone() && num_ops) {
// Get the first block with offset
const CowOperation* cow_op = GetRAOpIter();
CHECK_NE(cow_op, nullptr);
*source_offset = cow_op->source;
if (cow_op->type == kCowCopyOp) {
*source_offset *= BLOCK_SZ;
}
RAIterNext();
num_ops -= 1;
nr_consecutive = 1;
blocks.push_back(cow_op->new_block);
if (!overlap_) {
CheckOverlap(cow_op);
}
/*
* Find number of consecutive blocks working backwards.
*/
while (!RAIterDone() && num_ops) {
const CowOperation* op = GetRAOpIter();
CHECK_NE(op, nullptr);
uint64_t next_offset = op->source;
if (op->type == kCowCopyOp) {
next_offset *= BLOCK_SZ;
}
if (next_offset + nr_consecutive * BLOCK_SZ != *source_offset) {
break;
}
nr_consecutive += 1;
num_ops -= 1;
blocks.push_back(op->new_block);
RAIterNext();
if (!overlap_) {
CheckOverlap(op);
}
}
}
}
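// Crash-recovery path: rebuild the in-memory new_block -> buffer mapping by
// replaying the ScratchMetadata entries persisted in the scratch space,
// stopping at the zero-filled terminator entry, then verify that the
// rebuilt mapping accounts for every to-be-merged read-ahead op. Anything
// left unaccounted for means the persisted metadata is incomplete, and
// read-ahead must fail rather than serve partial data.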
bool ReadAheadThread::ReconstructDataFromCow() {
std::unordered_map<uint64_t, void*>& read_ahead_buffer_map = snapuserd_->GetReadAheadMap();
read_ahead_buffer_map.clear();
loff_t metadata_offset = 0;
loff_t start_data_offset = snapuserd_->GetBufferDataOffset();
int num_ops = 0;
int total_blocks_merged = 0;
// This memcpy is important as metadata_buffer_ will be an unaligned address and will fault
// on 32-bit systems
std::unique_ptr<uint8_t[]> metadata_buffer =
std::make_unique<uint8_t[]>(snapuserd_->GetBufferMetadataSize());
memcpy(metadata_buffer.get(), metadata_buffer_, snapuserd_->GetBufferMetadataSize());
while (true) {
struct ScratchMetadata* bm = reinterpret_cast<struct ScratchMetadata*>(
(char*)metadata_buffer.get() + metadata_offset);
// Done reading metadata
if (bm->new_block == 0 && bm->file_offset == 0) {
break;
}
loff_t buffer_offset = bm->file_offset - start_data_offset;
void* bufptr = static_cast<void*>((char*)read_ahead_buffer_ + buffer_offset);
read_ahead_buffer_map[bm->new_block] = bufptr;
num_ops += 1;
total_blocks_merged += 1;
metadata_offset += sizeof(struct ScratchMetadata);
}
// We are done re-constructing the mapping; however, we need to make sure
// all the COW operations to-be merged are present in the re-constructed
// mapping.
while (!RAIterDone()) {
const CowOperation* op = GetRAOpIter();
if (read_ahead_buffer_map.find(op->new_block) != read_ahead_buffer_map.end()) {
num_ops -= 1;
snapuserd_->SetFinalBlockMerged(op->new_block);
RAIterNext();
} else {
// We found an op that is not present in the re-constructed
// mapping. This is valid only if every re-constructed entry has
// already been matched (num_ops == 0); otherwise some ops that
// were persisted before the crash were not recovered.
if (num_ops != 0) {
SNAP_LOG(ERROR) << "ReconstructDataFromCow failed. Not all ops recovered."
<< " Pending ops: " << num_ops;
snapuserd_->ReadAheadIOFailed();
return false;
}
break;
}
}
snapuserd_->SetTotalRaBlocksMerged(total_blocks_merged);
snapuserd_->ReconstructDataFromCowFinish();
if (!snapuserd_->ReadAheadIOCompleted(true)) {
SNAP_LOG(ERROR) << "ReadAheadIOCompleted failed...";
snapuserd_->ReadAheadIOFailed();
return false;
}
SNAP_LOG(INFO) << "ReconstructDataFromCow success";
return true;
}
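// Normal read-ahead path: batch consecutive ops via PrepareReadAhead(),
// record each block in both the in-memory mapping and the on-disk
// ScratchMetadata (always followed by a zero-filled entry so that crash
// recovery knows where the metadata ends), then fetch each batch from the
// base device with a single ReadFullyAtOffset() call.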
bool ReadAheadThread::ReadAheadIOStart() {
// Check if the data has to be constructed from the COW file.
// This will be true only once during boot up after a crash
// during merge.
if (snapuserd_->ReconstructDataFromCow()) {
return ReconstructDataFromCow();
}
std::unordered_map<uint64_t, void*>& read_ahead_buffer_map = snapuserd_->GetReadAheadMap();
read_ahead_buffer_map.clear();
int num_ops = (snapuserd_->GetBufferDataSize()) / BLOCK_SZ;
loff_t metadata_offset = 0;
struct ScratchMetadata* bm =
reinterpret_cast<struct ScratchMetadata*>((char*)metadata_buffer_ + metadata_offset);
bm->new_block = 0;
bm->file_offset = 0;
std::vector<uint64_t> blocks;
loff_t buffer_offset = 0;
loff_t offset = 0;
loff_t file_offset = snapuserd_->GetBufferDataOffset();
int total_blocks_merged = 0;
overlap_ = false;
dest_blocks_.clear();
source_blocks_.clear();
while (true) {
uint64_t source_offset;
int linear_blocks;
PrepareReadAhead(&source_offset, &num_ops, blocks);
linear_blocks = blocks.size();
if (linear_blocks == 0) {
// No more blocks to read
SNAP_LOG(DEBUG) << " Read-ahead completed....";
break;
}
// Get the first block in the consecutive set of blocks
source_offset = source_offset - (linear_blocks - 1) * BLOCK_SZ;
size_t io_size = (linear_blocks * BLOCK_SZ);
num_ops -= linear_blocks;
total_blocks_merged += linear_blocks;
// Mark this block as the final block to be merged in this
// entire region. The read-ahead thread will be notified when
// this block is merged so that it can make forward progress.
snapuserd_->SetFinalBlockMerged(blocks.back());
while (linear_blocks) {
uint64_t new_block = blocks.back();
blocks.pop_back();
// Assign the mapping
void* bufptr = static_cast<void*>((char*)read_ahead_buffer_ + offset);
read_ahead_buffer_map[new_block] = bufptr;
offset += BLOCK_SZ;
bm = reinterpret_cast<struct ScratchMetadata*>((char*)metadata_buffer_ +
metadata_offset);
bm->new_block = new_block;
bm->file_offset = file_offset;
metadata_offset += sizeof(struct ScratchMetadata);
file_offset += BLOCK_SZ;
linear_blocks -= 1;
}
// Read from the base device consecutive set of blocks in one shot
if (!android::base::ReadFullyAtOffset(backing_store_fd_,
(char*)read_ahead_buffer_ + buffer_offset, io_size,
source_offset)) {
SNAP_PLOG(ERROR) << "Ordered-op failed. Read from backing store: "
<< backing_store_device_ << "at block :" << source_offset / BLOCK_SZ
<< " offset :" << source_offset % BLOCK_SZ
<< " buffer_offset : " << buffer_offset << " io_size : " << io_size
<< " buf-addr : " << read_ahead_buffer_;
snapuserd_->ReadAheadIOFailed();
return false;
}
// This is important - explicitly zero out the next metadata entry.
// The zero-filled entry marks the end of the metadata when the data
// is re-constructed after a crash.
bm = reinterpret_cast<struct ScratchMetadata*>((char*)metadata_buffer_ + metadata_offset);
bm->new_block = 0;
bm->file_offset = 0;
buffer_offset += io_size;
}
snapuserd_->SetTotalRaBlocksMerged(total_blocks_merged);
// Flush the data only if we have overlapping blocks in the region
if (!snapuserd_->ReadAheadIOCompleted(overlap_)) {
SNAP_LOG(ERROR) << "ReadAheadIOCompleted failed...";
snapuserd_->ReadAheadIOFailed();
return false;
}
return true;
}
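// Thread entry point: read ahead one region at a time, wait for the merge
// of that region to complete, and commit the number of merged read-ahead
// blocks to the COW header before moving on to the next region.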
bool ReadAheadThread::RunThread() {
if (!InitializeFds()) {
return false;
}
InitializeRAIter();
InitializeBuffer();
while (!RAIterDone()) {
if (!ReadAheadIOStart()) {
return false;
}
bool status = snapuserd_->WaitForMergeToComplete();
if (status && !snapuserd_->CommitMerge(snapuserd_->GetTotalRaBlocksMerged())) {
return false;
}
if (!status) break;
}
CloseFds();
SNAP_LOG(INFO) << " ReadAhead thread terminating....";
return true;
}
// Initialization
bool ReadAheadThread::InitializeFds() {
backing_store_fd_.reset(open(backing_store_device_.c_str(), O_RDONLY));
if (backing_store_fd_ < 0) {
SNAP_PLOG(ERROR) << "Open Failed: " << backing_store_device_;
return false;
}
cow_fd_.reset(open(cow_device_.c_str(), O_RDWR));
if (cow_fd_ < 0) {
SNAP_PLOG(ERROR) << "Open Failed: " << cow_device_;
return false;
}
return true;
}
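// The read-ahead ops vector is consumed back-to-front (rbegin/rend); per
// the header example this yields Op-1 first, i.e. the order in which the
// merge will request the blocks.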
void ReadAheadThread::InitializeRAIter() {
std::vector<const CowOperation*>& read_ahead_ops = snapuserd_->GetReadAheadOpsVec();
read_ahead_iter_ = read_ahead_ops.rbegin();
}
bool ReadAheadThread::RAIterDone() {
std::vector<const CowOperation*>& read_ahead_ops = snapuserd_->GetReadAheadOpsVec();
return read_ahead_iter_ == read_ahead_ops.rend();
}
void ReadAheadThread::RAIterNext() {
read_ahead_iter_++;
}
const CowOperation* ReadAheadThread::GetRAOpIter() {
return *read_ahead_iter_;
}
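// Carve the mapped scratch space into its two regions - the metadata pages
// and the read-ahead data area - at the offsets computed by the daemon for
// the layout shown in the header comment.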
void ReadAheadThread::InitializeBuffer() {
void* mapped_addr = snapuserd_->GetMappedAddr();
// Map the scratch space region into memory
metadata_buffer_ =
static_cast<void*>((char*)mapped_addr + snapuserd_->GetBufferMetadataOffset());
read_ahead_buffer_ = static_cast<void*>((char*)mapped_addr + snapuserd_->GetBufferDataOffset());
}
} // namespace snapshot
} // namespace android