/*
* Copyright (C) 2021 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "snapuserd.h"
#include <csignal>
#include <optional>
#include <set>
#include <snapuserd/snapuserd_client.h>
namespace android {
namespace snapshot {
using namespace android;
using namespace android::dm;
using android::base::unique_fd;
#define SNAP_LOG(level) LOG(level) << misc_name_ << ": "
#define SNAP_PLOG(level) PLOG(level) << misc_name_ << ": "
/*
* Merging a copy operation involves the following flow:
*
* 1: dm-snapshot layer requests merge for a 4k block. dm-user sends the request
* to the daemon
* 2: daemon reads the source block
* 3: daemon copies the source data
* 4: IO completion sent back to dm-user (a switch from user space to kernel)
* 5: dm-snapshot merges the data to base device
* 6: dm-snapshot sends the merge-completion IO to dm-user
* 7: dm-user re-directs the merge completion IO to daemon (one more switch)
* 8: daemon updates the COW file about the completed merge request (a write
* syscall followed by an fsync)
* 9: daemon sends the IO completion back to dm-user
*
* The above sequence incurs significant overhead, especially when merging one
* 4k block at a time.
*
* The read-ahead layer optimizes this path by reading the data from the base
* device in the background so that the merging thread can retrieve the data
* from the read-ahead cache. Additionally, syncing of merged data is deferred
* to the read-ahead thread, thereby keeping the IO path free of that
* bottleneck.
*
* We create a scratch space of 2MB to store the read-ahead data in the COW
* device.
*
* +-----------------------+
* | Header (fixed) |
* +-----------------------+
* | Scratch space | <-- 2MB
* +-----------------------+
*
* Scratch space is as follows:
*
* +-----------------------+
* | Metadata | <- 4k page
* +-----------------------+
* | Metadata | <- 4k page
* +-----------------------+
* | |
* | Read-ahead data |
* | |
* +-----------------------+
*
* State transitions and communication between read-ahead thread and worker
* thread during merge:
* =====================================================================
*
* Worker Threads Read-Ahead thread
* ------------------------------------------------------------------
*
* |
* |
* --> -----------------READ_AHEAD_BEGIN------------->|
* | | | READ_AHEAD_IN_PROGRESS
* | WAIT |
* | | |
* | |<-----------------IO_IN_PROGRESS---------------
* | | |
* | | IO_IN_PROGRESS WAIT
* | | |
* |<--| |
* | |
* ------------------IO_TERMINATED--------------->|
* END
*
*
* ===================================================================
*
* Example:
*
* We have 6 copy operations to be executed in an OTA and there is an overlap.
* Update-engine will write to the COW file as follows:
*
* Op-1: 20 -> 23
* Op-2: 19 -> 22
* Op-3: 18 -> 21
* Op-4: 17 -> 20
* Op-5: 16 -> 19
* Op-6: 15 -> 18
*
* The read-ahead thread will read all 6 source blocks and store the data in the
* scratch space. Metadata will contain the destination block numbers. Thus,
* scratch space will look something like this:
*
* +--------------+
* | Block 23 |
* | offset - 1 |
* +--------------+
* | Block 22 |
* | offset - 2 |
* +--------------+
* | Block 21 |
* | offset - 3 |
* +--------------+
* ...
* ...
* +--------------+
* | Data-Block 20| <-- offset - 1
* +--------------+
* | Data-Block 19| <-- offset - 2
* +--------------+
* | Data-Block 18| <-- offset - 3
* +--------------+
* ...
* ...
*
* ====================================================================
* IO Path:
*
* Read-ahead will serve the data to worker threads during merge only
* after metadata and data are persisted to the scratch space. Worker
* threads during merge will always retrieve the data from the cache; if the
* cache is not populated, they will wait for the read-ahead thread to finish.
* Furthermore, the number of operations merged will be synced to the header
* only when all the blocks in the read-ahead cache are merged. In the above
* case, when all 6 operations are merged, COW Header is updated with
* num_merge_ops = 6.
*
* Merge resume after crash:
*
* Let's say we have a crash after 5 operations are merged, i.e. after
* Op-5: 16->19 is completed but before Op-6 is merged. Thus, the COW Header
* num_merge_ops will be 0 as not all the ops were merged yet. During the next
* reboot, the read-ahead thread will re-construct the data in-memory from the
* scratch space; when the merge resumes, Op-1 will be re-executed. However,
* data will be served from the read-ahead cache safely even though block 20
* was over-written by Op-4.
*
*/
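/*
 * A minimal illustrative sketch (not part of this file's logic) of the
 * offset arithmetic implied by the layout above: metadata entry "i"
 * describes the data block stored in slot "i" of the data region, which is
 * how ReadAheadIOStart() below advances metadata_offset and file_offset in
 * lockstep. The helper names are hypothetical:
 *
 *   loff_t MetadataEntryOffset(int i) { return i * sizeof(ScratchMetadata); }
 *   loff_t DataBlockOffset(loff_t data_start, int i) {
 *       // data_start corresponds to snapuserd_->GetBufferDataOffset()
 *       return data_start + i * BLOCK_SZ;
 *   }
 */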
ReadAheadThread::ReadAheadThread(const std::string& cow_device, const std::string& backing_device,
const std::string& misc_name,
std::shared_ptr<Snapuserd> snapuserd) {
cow_device_ = cow_device;
backing_store_device_ = backing_device;
misc_name_ = misc_name;
snapuserd_ = snapuserd;
}
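// CheckOverlap() flags overlap_ when a block consumed by one op in the
// current read-ahead batch is also produced by another op in the same
// batch. When overlap_ is set, ReadAheadIOCompleted(overlap_) flushes the
// cached data to the scratch space before the merge consumes it, so that a
// crash mid-merge cannot lose source data that a later merge has
// over-written (see the Op-4 / block 20 example in the header comment).
// Note that source_offset is always zero in this version, so the
// source_block + 1 checks (for sources carrying a sub-block offset) are
// currently inert.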
void ReadAheadThread::CheckOverlap(const CowOperation* cow_op) {
uint64_t source_block = cow_op->source;
uint64_t source_offset = 0;
if (dest_blocks_.count(cow_op->new_block) || source_blocks_.count(source_block) ||
(source_offset > 0 && source_blocks_.count(source_block + 1))) {
overlap_ = true;
}
dest_blocks_.insert(source_block);
if (source_offset > 0) {
dest_blocks_.insert(source_block + 1);
}
source_blocks_.insert(cow_op->new_block);
}
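/*
 * PrepareReadAhead() batches up to *pending_ops ops whose source blocks are
 * consecutive, walking the op list backwards: each subsequent op must
 * satisfy next_offset + nr_consecutive * BLOCK_SZ == *source_offset.
 *
 * Worked illustration using the 6 copy ops from the header comment
 * (offsets in units of BLOCK_SZ):
 *
 *   Op-1: source 20 -> first op; *source_offset = 20
 *   Op-2: source 19 -> 19 + 1 == 20 (consecutive)
 *   ...
 *   Op-6: source 15 -> 15 + 5 == 20 (consecutive)
 *
 * All six ops collapse into one linear read of 6 * BLOCK_SZ bytes starting
 * at block 15; ReadAheadIOStart() rewinds source_offset by
 * (linear_blocks - 1) * BLOCK_SZ before issuing that read.
 */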
void ReadAheadThread::PrepareReadAhead(uint64_t* source_offset, int* pending_ops,
std::vector<uint64_t>& blocks) {
int num_ops = *pending_ops;
int nr_consecutive = 0;
CHECK_NE(source_offset, nullptr);
if (!RAIterDone() && num_ops) {
// Get the first block with offset
const CowOperation* cow_op = GetRAOpIter();
CHECK_NE(cow_op, nullptr);
*source_offset = cow_op->source;
if (cow_op->type == kCowCopyOp) {
*source_offset *= BLOCK_SZ;
}
RAIterNext();
num_ops -= 1;
nr_consecutive = 1;
blocks.push_back(cow_op->new_block);
if (!overlap_) {
CheckOverlap(cow_op);
}
/*
* Find number of consecutive blocks working backwards.
*/
while (!RAIterDone() && num_ops) {
const CowOperation* op = GetRAOpIter();
CHECK_NE(op, nullptr);
uint64_t next_offset = op->source;
if (op->type == kCowCopyOp) {
next_offset *= BLOCK_SZ;
}
if (next_offset + nr_consecutive * BLOCK_SZ != *source_offset) {
break;
}
nr_consecutive += 1;
num_ops -= 1;
blocks.push_back(op->new_block);
RAIterNext();
if (!overlap_) {
CheckOverlap(op);
}
}
}
}
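// Crash-recovery path: rebuild the in-memory new_block -> buffer mapping by
// replaying the ScratchMetadata entries persisted in the scratch space,
// stopping at the zero-filled terminator entry, then verify that the
// rebuilt mapping accounts for every to-be-merged read-ahead op. Anything
// left unaccounted for means the persisted metadata is incomplete, and
// read-ahead must fail rather than serve partial data.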
bool ReadAheadThread::ReconstructDataFromCow() {
std::unordered_map<uint64_t, void*>& read_ahead_buffer_map = snapuserd_->GetReadAheadMap();
read_ahead_buffer_map.clear();
loff_t metadata_offset = 0;
loff_t start_data_offset = snapuserd_->GetBufferDataOffset();
int num_ops = 0;
int total_blocks_merged = 0;
// This memcpy is important as metadata_buffer_ will be an unaligned address and will fault
// on 32-bit systems
std::unique_ptr<uint8_t[]> metadata_buffer =
std::make_unique<uint8_t[]>(snapuserd_->GetBufferMetadataSize());
memcpy(metadata_buffer.get(), metadata_buffer_, snapuserd_->GetBufferMetadataSize());
while (true) {
struct ScratchMetadata* bm = reinterpret_cast<struct ScratchMetadata*>(
(char*)metadata_buffer.get() + metadata_offset);
// Done reading metadata
if (bm->new_block == 0 && bm->file_offset == 0) {
break;
}
loff_t buffer_offset = bm->file_offset - start_data_offset;
void* bufptr = static_cast<void*>((char*)read_ahead_buffer_ + buffer_offset);
read_ahead_buffer_map[bm->new_block] = bufptr;
num_ops += 1;
total_blocks_merged += 1;
metadata_offset += sizeof(struct ScratchMetadata);
}
// We are done re-constructing the mapping; however, we need to make sure
// all the COW operations to-be merged are present in the re-constructed
// mapping.
while (!RAIterDone()) {
const CowOperation* op = GetRAOpIter();
if (read_ahead_buffer_map.find(op->new_block) != read_ahead_buffer_map.end()) {
num_ops -= 1;
snapuserd_->SetFinalBlockMerged(op->new_block);
RAIterNext();
} else {
// We found an op that is not present in the re-constructed
// mapping. This is valid only if every re-constructed entry has
// already been matched (num_ops == 0); otherwise some ops that
// were persisted before the crash were not recovered.
if (num_ops != 0) {
SNAP_LOG(ERROR) << "ReconstructDataFromCow failed. Not all ops recovered."
<< " Pending ops: " << num_ops;
snapuserd_->ReadAheadIOFailed();
return false;
}
break;
}
}
snapuserd_->SetTotalRaBlocksMerged(total_blocks_merged);
snapuserd_->ReconstructDataFromCowFinish();
if (!snapuserd_->ReadAheadIOCompleted(true)) {
SNAP_LOG(ERROR) << "ReadAheadIOCompleted failed...";
snapuserd_->ReadAheadIOFailed();
return false;
}
SNAP_LOG(INFO) << "ReconstructDataFromCow success";
return true;
}
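// Normal read-ahead path: batch consecutive ops via PrepareReadAhead(),
// record each block in both the in-memory mapping and the on-disk
// ScratchMetadata (always followed by a zero-filled entry so that crash
// recovery knows where the metadata ends), then fetch each batch from the
// base device with a single ReadFullyAtOffset() call.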
bool ReadAheadThread::ReadAheadIOStart() {
// Check if the data has to be constructed from the COW file.
// This will be true only once during boot up after a crash
// during merge.
if (snapuserd_->ReconstructDataFromCow()) {
return ReconstructDataFromCow();
}
std::unordered_map<uint64_t, void*>& read_ahead_buffer_map = snapuserd_->GetReadAheadMap();
read_ahead_buffer_map.clear();
int num_ops = (snapuserd_->GetBufferDataSize()) / BLOCK_SZ;
loff_t metadata_offset = 0;
struct ScratchMetadata* bm =
reinterpret_cast<struct ScratchMetadata*>((char*)metadata_buffer_ + metadata_offset);
bm->new_block = 0;
bm->file_offset = 0;
std::vector<uint64_t> blocks;
loff_t buffer_offset = 0;
loff_t offset = 0;
loff_t file_offset = snapuserd_->GetBufferDataOffset();
int total_blocks_merged = 0;
overlap_ = false;
dest_blocks_.clear();
source_blocks_.clear();
while (true) {
uint64_t source_offset;
int linear_blocks;
PrepareReadAhead(&source_offset, &num_ops, blocks);
linear_blocks = blocks.size();
if (linear_blocks == 0) {
// No more blocks to read
SNAP_LOG(DEBUG) << " Read-ahead completed....";
break;
}
// Get the first block in the consecutive set of blocks
source_offset = source_offset - (linear_blocks - 1) * BLOCK_SZ;
size_t io_size = (linear_blocks * BLOCK_SZ);
num_ops -= linear_blocks;
total_blocks_merged += linear_blocks;
// Mark this block as the final block to be merged in this
// entire region. The read-ahead thread will be notified when
// this block is merged so that it can make forward progress.
snapuserd_->SetFinalBlockMerged(blocks.back());
while (linear_blocks) {
uint64_t new_block = blocks.back();
blocks.pop_back();
// Assign the mapping
void* bufptr = static_cast<void*>((char*)read_ahead_buffer_ + offset);
read_ahead_buffer_map[new_block] = bufptr;
offset += BLOCK_SZ;
bm = reinterpret_cast<struct ScratchMetadata*>((char*)metadata_buffer_ +
metadata_offset);
bm->new_block = new_block;
bm->file_offset = file_offset;
metadata_offset += sizeof(struct ScratchMetadata);
file_offset += BLOCK_SZ;
linear_blocks -= 1;
}
// Read from the base device consecutive set of blocks in one shot
if (!android::base::ReadFullyAtOffset(backing_store_fd_,
(char*)read_ahead_buffer_ + buffer_offset, io_size,
source_offset)) {
SNAP_PLOG(ERROR) << "Ordered-op failed. Read from backing store: "
<< backing_store_device_ << "at block :" << source_offset / BLOCK_SZ
<< " offset :" << source_offset % BLOCK_SZ
<< " buffer_offset : " << buffer_offset << " io_size : " << io_size
<< " buf-addr : " << read_ahead_buffer_;
snapuserd_->ReadAheadIOFailed();
return false;
}
// This is important - explicitly zero out the next metadata entry.
// The zero-filled entry marks the end of the metadata when the data
// is re-constructed after a crash.
bm = reinterpret_cast<struct ScratchMetadata*>((char*)metadata_buffer_ + metadata_offset);
bm->new_block = 0;
bm->file_offset = 0;
buffer_offset += io_size;
}
snapuserd_->SetTotalRaBlocksMerged(total_blocks_merged);
// Flush the data only if we have overlapping blocks in the region
if (!snapuserd_->ReadAheadIOCompleted(overlap_)) {
SNAP_LOG(ERROR) << "ReadAheadIOCompleted failed...";
snapuserd_->ReadAheadIOFailed();
return false;
}
return true;
}
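// Thread entry point: read ahead one region at a time, wait for the merge
// of that region to complete, and commit the number of merged read-ahead
// blocks to the COW header before moving on to the next region.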
bool ReadAheadThread::RunThread() {
if (!InitializeFds()) {
return false;
}
InitializeRAIter();
InitializeBuffer();
while (!RAIterDone()) {
if (!ReadAheadIOStart()) {
return false;
}
bool status = snapuserd_->WaitForMergeToComplete();
if (status && !snapuserd_->CommitMerge(snapuserd_->GetTotalRaBlocksMerged())) {
return false;
}
if (!status) break;
}
CloseFds();
SNAP_LOG(INFO) << " ReadAhead thread terminating....";
return true;
}
// Initialization
bool ReadAheadThread::InitializeFds() {
backing_store_fd_.reset(open(backing_store_device_.c_str(), O_RDONLY));
if (backing_store_fd_ < 0) {
SNAP_PLOG(ERROR) << "Open Failed: " << backing_store_device_;
return false;
}
cow_fd_.reset(open(cow_device_.c_str(), O_RDWR));
if (cow_fd_ < 0) {
SNAP_PLOG(ERROR) << "Open Failed: " << cow_device_;
return false;
}
return true;
}
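// The read-ahead ops vector is consumed back-to-front (rbegin/rend); per
// the header example this yields Op-1 first, i.e. the order in which the
// merge will request the blocks.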
void ReadAheadThread::InitializeRAIter() {
std::vector<const CowOperation*>& read_ahead_ops = snapuserd_->GetReadAheadOpsVec();
read_ahead_iter_ = read_ahead_ops.rbegin();
}
bool ReadAheadThread::RAIterDone() {
std::vector<const CowOperation*>& read_ahead_ops = snapuserd_->GetReadAheadOpsVec();
return read_ahead_iter_ == read_ahead_ops.rend();
}
void ReadAheadThread::RAIterNext() {
read_ahead_iter_++;
}
const CowOperation* ReadAheadThread::GetRAOpIter() {
return *read_ahead_iter_;
}
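// Carve the mapped scratch space into its two regions - the metadata pages
// and the read-ahead data area - at the offsets computed by the daemon for
// the layout shown in the header comment.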
void ReadAheadThread::InitializeBuffer() {
void* mapped_addr = snapuserd_->GetMappedAddr();
// Map the scratch space region into memory
metadata_buffer_ =
static_cast<void*>((char*)mapped_addr + snapuserd_->GetBufferMetadataOffset());
read_ahead_buffer_ = static_cast<void*>((char*)mapped_addr + snapuserd_->GetBufferDataOffset());
}
} // namespace snapshot
} // namespace android