// Copyright 2017 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include <assert.h>
#include <inttypes.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/param.h>
#include <threads.h>

#include <ddk/binding.h>
#include <ddk/device.h>
#include <ddk/driver.h>
#include <ddk/protocol/block.h>
#include <lib/sync/completion.h>
#include <zircon/boot/image.h>
#include <zircon/device/block.h>
#include <zircon/device/ramdisk.h>
#include <zircon/listnode.h>
#include <zircon/process.h>
#include <zircon/syscalls.h>
#include <zircon/types.h>
#define MAX_TRANSFER_SIZE (1 << 19) // 512 KiB
typedef struct {
zx_device_t* zxdev;
} ramctl_device_t;
typedef struct ramdisk_device {
zx_device_t* zxdev;
uintptr_t mapped_addr;
uint64_t blk_size;
uint64_t blk_count;
uint8_t type_guid[ZBI_PARTITION_GUID_LEN];
    mtx_t lock;                   // guards txn_list, blk_counts, asleep, sa_blk_count, dead
    sync_completion_t signal;     // wakes the worker thread when there is work (or on death)
    list_node_t txn_list;         // incoming transactions; guarded by lock
    list_node_t deferred_list;    // writes parked while asleep; touched only by the worker
bool dead;
uint32_t flags;
zx_handle_t vmo;
bool asleep; // true if the ramdisk is "sleeping"
uint64_t sa_blk_count; // number of blocks to sleep after
ramdisk_blk_counts_t blk_counts; // current block counts
thrd_t worker;
char name[NAME_MAX];
} ramdisk_device_t;
typedef struct {
block_op_t op;
list_node_t node;
block_impl_queue_callback completion_cb;
void* cookie;
} ramdisk_txn_t;
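// The block core above allocates each transaction as an opaque buffer of the
// size reported by ramdisk_query() below, so everything after `op` is this
// driver's private per-transaction state; ramdisk_queue() recovers the full
// ramdisk_txn_t from the embedded block_op_t via containerof().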
// The worker thread processes queued block transactions in the background.
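// When the ramdisk is set to sleep after N blocks with
// RAMDISK_FLAG_RESUME_ON_WAKE, a write straddling the threshold is split:
// e.g. with sa_blk_count = 4, an 8-block write lands its first 4 blocks, is
// re-queued on the deferred list covering the remaining 4, and its completion
// callback is withheld until a wake-up replays the queue.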
static int worker_thread(void* arg) {
zx_status_t status = ZX_OK;
ramdisk_device_t* dev = (ramdisk_device_t*)arg;
ramdisk_txn_t* txn = NULL;
bool dead, asleep, defer;
size_t blocks = 0;
for (;;) {
for (;;) {
mtx_lock(&dev->lock);
txn = NULL;
dead = dev->dead;
asleep = dev->asleep;
defer = (dev->flags & RAMDISK_FLAG_RESUME_ON_WAKE) != 0;
blocks = dev->sa_blk_count;
if (!asleep) {
// If we are awake, try grabbing pending transactions from the deferred list.
txn = list_remove_head_type(&dev->deferred_list, ramdisk_txn_t, node);
}
if (txn == NULL) {
// If no transactions were available in the deferred list (or we are asleep),
// grab one from the regular txn_list.
txn = list_remove_head_type(&dev->txn_list, ramdisk_txn_t, node);
}
mtx_unlock(&dev->lock);
if (dead) {
goto goodbye;
}
if (txn == NULL) {
sync_completion_wait(&dev->signal, ZX_TIME_INFINITE);
} else {
sync_completion_reset(&dev->signal);
break;
}
}
size_t txn_blocks = txn->op.rw.length;
if (txn->op.command == BLOCK_OP_READ || blocks == 0 || blocks > txn_blocks) {
            // Use the full transaction length if this is a read (reads always
            // succeed, even while asleep), if no sleep-after count is set, or
            // if the remaining sleep-after count exceeds the transaction length.
blocks = txn_blocks;
}
size_t length = blocks * dev->blk_size;
size_t dev_offset = txn->op.rw.offset_dev * dev->blk_size;
size_t vmo_offset = txn->op.rw.offset_vmo * dev->blk_size;
        void* addr = (void*)(dev->mapped_addr + dev_offset);
if (length > MAX_TRANSFER_SIZE) {
status = ZX_ERR_OUT_OF_RANGE;
} else if (txn->op.command == BLOCK_OP_READ) {
// A read operation should always succeed, even if the ramdisk is "asleep".
status = zx_vmo_write(txn->op.rw.vmo, addr, vmo_offset, length);
} else if (asleep) {
if (defer) {
// If we are asleep but resuming on wake, add txn to the deferred_list.
// deferred_list is only accessed by the worker_thread, so a lock is not needed.
list_add_tail(&dev->deferred_list, &txn->node);
continue;
} else {
status = ZX_ERR_UNAVAILABLE;
}
} else { // BLOCK_OP_WRITE
status = zx_vmo_read(txn->op.rw.vmo, addr, vmo_offset, length);
if (status == ZX_OK && blocks < txn->op.rw.length && defer) {
                // The first `blocks` blocks succeeded, but the transaction is
                // not complete. Since we are deferring until wake-up, trim the
                // transaction to the unwritten remainder and park it on the
                // deferred queue.
                txn->op.rw.length -= blocks;
                txn->op.rw.offset_vmo += blocks;
                txn->op.rw.offset_dev += blocks;
list_add_tail(&dev->deferred_list, &txn->node);
}
}
if (txn->op.command == BLOCK_OP_WRITE) {
            // Update the ramdisk block counts based on the result of this
            // transaction. Reads never fail while asleep, so only writes are
            // counted.
            mtx_lock(&dev->lock);
if (status == ZX_OK) {
dev->blk_counts.successful += blocks;
if (blocks != txn_blocks && !defer) {
// If we are not deferring, then any excess blocks have failed.
dev->blk_counts.failed += txn_blocks - blocks;
status = ZX_ERR_UNAVAILABLE;
}
} else {
dev->blk_counts.failed += txn_blocks;
}
            // Put the ramdisk to sleep once the configured number of blocks
            // has been written. Clamp the subtraction in case an ioctl lowered
            // sa_blk_count while this transaction was in flight.
            if (dev->sa_blk_count > 0) {
                dev->sa_blk_count -= MIN(blocks, dev->sa_blk_count);
                dev->asleep = (dev->sa_blk_count == 0);
            }
mtx_unlock(&dev->lock);
if (defer && blocks != txn_blocks && status == ZX_OK) {
// If we deferred partway through a transaction, hold off on returning the
// result until the remainder of the transaction is completed.
continue;
}
}
if (txn->completion_cb) {
txn->completion_cb(txn->cookie, status, &txn->op);
}
}
goodbye:
while (txn != NULL) {
txn->completion_cb(txn->cookie, ZX_ERR_BAD_STATE, &txn->op);
txn = list_remove_head_type(&dev->deferred_list, ramdisk_txn_t, node);
if (txn == NULL) {
mtx_lock(&dev->lock);
txn = list_remove_head_type(&dev->txn_list, ramdisk_txn_t, node);
mtx_unlock(&dev->lock);
}
}
return 0;
}
static uint64_t sizebytes(ramdisk_device_t* rdev) {
return rdev->blk_size * rdev->blk_count;
}
static void ramdisk_get_info(void* ctx, block_info_t* info) {
ramdisk_device_t* ramdev = ctx;
memset(info, 0, sizeof(*info));
info->block_size = ramdev->blk_size;
info->block_count = ramdev->blk_count;
// Arbitrarily set, but matches the SATA driver for testing
info->max_transfer_size = MAX_TRANSFER_SIZE;
info->flags = ramdev->flags;
}
// implement device protocol:
static void ramdisk_unbind(void* ctx) {
ramdisk_device_t* ramdev = ctx;
mtx_lock(&ramdev->lock);
ramdev->dead = true;
mtx_unlock(&ramdev->lock);
sync_completion_signal(&ramdev->signal);
device_remove(ramdev->zxdev);
}
static zx_status_t ramdisk_ioctl(void* ctx, uint32_t op, const void* cmd, size_t cmd_len,
void* reply, size_t max, size_t* out_actual) {
ramdisk_device_t* ramdev = ctx;
if (ramdev->dead) {
return ZX_ERR_BAD_STATE;
}
switch (op) {
case IOCTL_RAMDISK_UNLINK: {
ramdisk_unbind(ramdev);
return ZX_OK;
}
    case IOCTL_RAMDISK_SET_FLAGS: {
        if (cmd_len < sizeof(uint32_t)) {
            return ZX_ERR_INVALID_ARGS;
        }
        uint32_t* flags = (uint32_t*)cmd;
        // Take the lock so the worker thread sees a consistent value.
        mtx_lock(&ramdev->lock);
        ramdev->flags = *flags;
        mtx_unlock(&ramdev->lock);
        return ZX_OK;
    }
case IOCTL_RAMDISK_WAKE_UP: {
// Reset state and transaction counts
mtx_lock(&ramdev->lock);
ramdev->asleep = false;
memset(&ramdev->blk_counts, 0, sizeof(ramdev->blk_counts));
ramdev->sa_blk_count = 0;
mtx_unlock(&ramdev->lock);
sync_completion_signal(&ramdev->signal);
return ZX_OK;
}
case IOCTL_RAMDISK_SLEEP_AFTER: {
if (cmd_len < sizeof(uint64_t)) {
return ZX_ERR_INVALID_ARGS;
}
uint64_t* blk_count = (uint64_t*)cmd;
mtx_lock(&ramdev->lock);
ramdev->asleep = false;
memset(&ramdev->blk_counts, 0, sizeof(ramdev->blk_counts));
ramdev->sa_blk_count = *blk_count;
if (*blk_count == 0) {
ramdev->asleep = true;
}
mtx_unlock(&ramdev->lock);
return ZX_OK;
}
case IOCTL_RAMDISK_GET_BLK_COUNTS: {
if (max < sizeof(ramdisk_blk_counts_t)) {
return ZX_ERR_INVALID_ARGS;
}
mtx_lock(&ramdev->lock);
memcpy(reply, &ramdev->blk_counts, sizeof(ramdisk_blk_counts_t));
mtx_unlock(&ramdev->lock);
*out_actual = sizeof(ramdisk_blk_counts_t);
return ZX_OK;
}
// Block Protocol
case IOCTL_BLOCK_GET_NAME: {
char* name = reply;
memset(name, 0, max);
strncpy(name, ramdev->name, max);
*out_actual = strnlen(name, max);
return ZX_OK;
}
case IOCTL_BLOCK_GET_INFO: {
block_info_t* info = reply;
if (max < sizeof(*info))
return ZX_ERR_BUFFER_TOO_SMALL;
ramdisk_get_info(ramdev, info);
*out_actual = sizeof(*info);
return ZX_OK;
}
case IOCTL_BLOCK_GET_TYPE_GUID: {
if (max < ZBI_PARTITION_GUID_LEN)
return ZX_ERR_BUFFER_TOO_SMALL;
memcpy(reply, ramdev->type_guid, sizeof(ramdev->type_guid));
*out_actual = sizeof(ramdev->type_guid);
return ZX_OK;
}
case IOCTL_DEVICE_SYNC: {
        // Nothing to do: writes land directly in the backing VMO.
return ZX_OK;
}
default:
return ZX_ERR_NOT_SUPPORTED;
}
}
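// A hypothetical test-side sketch of the failure-injection flow, assuming the
// ioctl_ramdisk_* wrappers conventionally declared in <zircon/device/ramdisk.h>
// (the wrapper names are an assumption; `fd` is an open ramdisk device):
//
//   uint64_t sleep_after = 32;                    // defer/fail writes after 32 blocks
//   ioctl_ramdisk_sleep_after(fd, &sleep_after);
//   // ... issue writes; once 32 blocks have landed, further writes fail, or
//   // are deferred if RAMDISK_FLAG_RESUME_ON_WAKE is set ...
//   ramdisk_blk_counts_t counts;
//   ioctl_ramdisk_get_blk_counts(fd, &counts);    // received/successful/failed
//   ioctl_ramdisk_wake_up(fd);                    // resume and replay deferred writes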
static void ramdisk_queue(void* ctx, block_op_t* bop, block_impl_queue_callback completion_cb,
void* cookie) {
ramdisk_device_t* ramdev = ctx;
ramdisk_txn_t* txn = containerof(bop, ramdisk_txn_t, op);
bool dead;
bool read = false;
    // Mask out the flag bits in place: the worker thread later compares
    // txn->op.command directly against BLOCK_OP_READ / BLOCK_OP_WRITE.
    switch ((txn->op.command &= BLOCK_OP_MASK)) {
case BLOCK_OP_READ:
read = true;
__FALLTHROUGH;
case BLOCK_OP_WRITE:
if ((txn->op.rw.offset_dev >= ramdev->blk_count) ||
((ramdev->blk_count - txn->op.rw.offset_dev) < txn->op.rw.length)) {
completion_cb(cookie, ZX_ERR_OUT_OF_RANGE, bop);
return;
}
mtx_lock(&ramdev->lock);
if (!(dead = ramdev->dead)) {
if (!read) {
ramdev->blk_counts.received += txn->op.rw.length;
}
txn->completion_cb = completion_cb;
txn->cookie = cookie;
list_add_tail(&ramdev->txn_list, &txn->node);
}
mtx_unlock(&ramdev->lock);
if (dead) {
completion_cb(cookie, ZX_ERR_BAD_STATE, bop);
} else {
sync_completion_signal(&ramdev->signal);
}
break;
case BLOCK_OP_FLUSH:
completion_cb(cookie, ZX_OK, bop);
break;
default:
completion_cb(cookie, ZX_ERR_NOT_SUPPORTED, bop);
break;
}
}
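// A minimal sketch of how an upper layer drives this protocol, assuming the
// C protocol helpers conventionally generated in <ddk/protocol/block.h>
// (helper names, `proto`, `data_vmo`, `my_callback`, and `my_cookie` are
// assumptions):
//
//   block_info_t info;
//   size_t bopsz;
//   block_impl_query(&proto, &info, &bopsz);  // bopsz == sizeof(ramdisk_txn_t)
//   block_op_t* bop = calloc(1, bopsz);       // room for the driver's private state
//   bop->command = BLOCK_OP_READ;
//   bop->rw.vmo = data_vmo;                   // destination VMO for the read
//   bop->rw.length = 1;                       // one block
//   bop->rw.offset_dev = 0;
//   bop->rw.offset_vmo = 0;
//   block_impl_queue(&proto, bop, my_callback, my_cookie);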
static void ramdisk_query(void* ctx, block_info_t* bi, size_t* bopsz) {
ramdisk_get_info(ctx, bi);
*bopsz = sizeof(ramdisk_txn_t);
}
static zx_off_t ramdisk_getsize(void* ctx) {
return sizebytes(ctx);
}
static void ramdisk_release(void* ctx) {
ramdisk_device_t* ramdev = ctx;
    // Mark the device dead and wake the worker thread so it can exit.
mtx_lock(&ramdev->lock);
ramdev->dead = true;
mtx_unlock(&ramdev->lock);
sync_completion_signal(&ramdev->signal);
int r;
thrd_join(ramdev->worker, &r);
if (ramdev->vmo != ZX_HANDLE_INVALID) {
zx_vmar_unmap(zx_vmar_root_self(), ramdev->mapped_addr, sizebytes(ramdev));
zx_handle_close(ramdev->vmo);
}
free(ramdev);
}
static block_impl_protocol_ops_t block_ops = {
.query = ramdisk_query,
.queue = ramdisk_queue,
};
static zx_protocol_device_t ramdisk_instance_proto = {
.version = DEVICE_OPS_VERSION,
.ioctl = ramdisk_ioctl,
.get_size = ramdisk_getsize,
.unbind = ramdisk_unbind,
.release = ramdisk_release,
};
// implement ramctl device protocol:
static uint64_t ramdisk_count = 0;
// This always consumes the VMO handle.
static zx_status_t ramctl_config(ramctl_device_t* ramctl, zx_handle_t vmo,
uint64_t blk_size, uint64_t blk_count,
uint8_t* type_guid, void* reply, size_t max,
size_t* out_actual) {
    zx_status_t status = ZX_ERR_INVALID_ARGS;
    // The reply buffer receives the new device's name: "ramdisk-" plus a
    // 64-bit counter fits within 32 bytes.
    if (max < 32) {
        goto fail;
    }
ramdisk_device_t* ramdev = calloc(1, sizeof(ramdisk_device_t));
if (!ramdev) {
status = ZX_ERR_NO_MEMORY;
goto fail;
}
if (mtx_init(&ramdev->lock, mtx_plain) != thrd_success) {
goto fail_free;
}
ramdev->vmo = vmo;
ramdev->blk_size = blk_size;
ramdev->blk_count = blk_count;
if (type_guid) {
memcpy(ramdev->type_guid, type_guid, ZBI_PARTITION_GUID_LEN);
} else {
memset(ramdev->type_guid, 0, ZBI_PARTITION_GUID_LEN);
}
snprintf(ramdev->name, sizeof(ramdev->name),
"ramdisk-%" PRIu64, ramdisk_count++);
status = zx_vmar_map(zx_vmar_root_self(), ZX_VM_PERM_READ | ZX_VM_PERM_WRITE,
0, ramdev->vmo, 0, sizebytes(ramdev), &ramdev->mapped_addr);
if (status != ZX_OK) {
goto fail_mtx;
}
list_initialize(&ramdev->txn_list);
list_initialize(&ramdev->deferred_list);
if (thrd_create(&ramdev->worker, worker_thread, ramdev) != thrd_success) {
goto fail_unmap;
}
device_add_args_t args = {
.version = DEVICE_ADD_ARGS_VERSION,
.name = ramdev->name,
.ctx = ramdev,
.ops = &ramdisk_instance_proto,
.proto_id = ZX_PROTOCOL_BLOCK_IMPL,
.proto_ops = &block_ops,
};
if ((status = device_add(ramctl->zxdev, &args, &ramdev->zxdev)) != ZX_OK) {
ramdisk_release(ramdev);
return status;
}
strcpy(reply, ramdev->name);
*out_actual = strlen(reply);
return ZX_OK;
fail_unmap:
zx_vmar_unmap(zx_vmar_root_self(), ramdev->mapped_addr, sizebytes(ramdev));
fail_mtx:
mtx_destroy(&ramdev->lock);
fail_free:
free(ramdev);
fail:
zx_handle_close(vmo);
return status;
}
static zx_status_t ramctl_ioctl(void* ctx, uint32_t op, const void* cmd,
size_t cmdlen, void* reply, size_t max, size_t* out_actual) {
ramctl_device_t* ramctl = ctx;
switch (op) {
case IOCTL_RAMDISK_CONFIG: {
if (cmdlen != sizeof(ramdisk_ioctl_config_t)) {
return ZX_ERR_INVALID_ARGS;
}
ramdisk_ioctl_config_t* config = (ramdisk_ioctl_config_t*)cmd;
zx_handle_t vmo;
zx_status_t status = zx_vmo_create(
config->blk_size * config->blk_count, 0, &vmo);
if (status == ZX_OK) {
status = ramctl_config(ramctl, vmo,
config->blk_size, config->blk_count,
config->type_guid,
reply, max, out_actual);
}
return status;
}
case IOCTL_RAMDISK_CONFIG_VMO: {
if (cmdlen != sizeof(zx_handle_t)) {
return ZX_ERR_INVALID_ARGS;
}
zx_handle_t* vmo = (zx_handle_t*)cmd;
// Ensure this is the last handle to this VMO; otherwise, the size
// may change from underneath us.
zx_info_handle_count_t info;
zx_status_t status = zx_object_get_info(*vmo, ZX_INFO_HANDLE_COUNT,
&info, sizeof(info),
NULL, NULL);
if (status != ZX_OK || info.handle_count != 1) {
zx_handle_close(*vmo);
return ZX_ERR_INVALID_ARGS;
}
uint64_t vmo_size;
status = zx_vmo_get_size(*vmo, &vmo_size);
if (status != ZX_OK) {
zx_handle_close(*vmo);
return status;
}
return ramctl_config(ramctl, *vmo,
PAGE_SIZE, (vmo_size + PAGE_SIZE - 1) / PAGE_SIZE,
NULL, reply, max, out_actual);
}
default:
return ZX_ERR_NOT_SUPPORTED;
}
}
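// A hypothetical client-side sketch of creating a ramdisk, assuming the
// ioctl_ramdisk_config wrapper and response type conventionally declared in
// <zircon/device/ramdisk.h> (the wrapper name, response type, and device path
// are assumptions):
//
//   int fd = open("/dev/misc/ramctl", O_RDWR);
//   ramdisk_ioctl_config_t config = {
//       .blk_size = 512,
//       .blk_count = 2048,                    // a 1 MiB disk
//   };
//   ramdisk_ioctl_config_response_t response;
//   ssize_t r = ioctl_ramdisk_config(fd, &config, &response);
//   // On success, response.name ("ramdisk-N") identifies the new device.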
static zx_protocol_device_t ramdisk_ctl_proto = {
.version = DEVICE_OPS_VERSION,
.ioctl = ramctl_ioctl,
};
static zx_status_t ramdisk_driver_bind(void* ctx, zx_device_t* parent) {
ramctl_device_t* ramctl = calloc(1, sizeof(ramctl_device_t));
if (ramctl == NULL) {
return ZX_ERR_NO_MEMORY;
}
device_add_args_t args = {
.version = DEVICE_ADD_ARGS_VERSION,
.name = "ramctl",
.ops = &ramdisk_ctl_proto,
.ctx = ramctl,
};
    zx_status_t status = device_add(parent, &args, &ramctl->zxdev);
    if (status != ZX_OK) {
        // device_add failed, so no release hook will run; free the context.
        free(ramctl);
    }
    return status;
}
static zx_driver_ops_t ramdisk_driver_ops = {
.version = DRIVER_OPS_VERSION,
.bind = ramdisk_driver_bind,
};
ZIRCON_DRIVER_BEGIN(ramdisk, ramdisk_driver_ops, "zircon", "0.1", 1)
BI_MATCH_IF(EQ, BIND_PROTOCOL, ZX_PROTOCOL_MISC_PARENT),
ZIRCON_DRIVER_END(ramdisk)