// Copyright 2017 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include <assert.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <threads.h>
#include <ddk/binding.h>
#include <ddk/debug.h>
#include <ddk/device.h>
#include <ddk/driver.h>
#include <ddk/protocol/block.h>
#include <ddk/protocol/pci.h>
#include <ddk/io-buffer.h>
#include <hw/reg.h>
#include <hw/pci.h>
#include <sync/completion.h>
#include <zircon/device/block.h>
#include <zircon/syscalls.h>
#include <zircon/types.h>
#include <zircon/listnode.h>
#include "nvme-hw.h"
// If enabled, gather stats on concurrent io ops,
// pending txns, etc. Stats are retrieved by
// IOCTL_BLOCK_GET_STATS
#define WITH_STATS 1
#define TXN_FLAG_FAILED 1
typedef struct {
block_op_t op;
list_node_t node;
uint16_t pending_utxns;
uint8_t opcode;
uint8_t flags;
} nvme_txn_t;
typedef struct {
zx_paddr_t phys; // io buffer phys base (1 page)
void* virt; // io buffer virt base
nvme_txn_t* txn; // related txn
uint16_t id;
uint16_t reserved0;
uint32_t reserved1;
} nvme_utxn_t;
#define UTXN_COUNT 63
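// One less than the entries in a single-page submission queue (one slot must
// always stay empty to distinguish a full ring from an empty one), and small
// enough to track in the 64-bit utxn_avail bitmask.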
// There's no system constant for this. Ensure it matches reality.
#define PAGE_SHIFT (12ULL)
static_assert(PAGE_SIZE == (1ULL << PAGE_SHIFT), "");
#define PAGE_MASK (PAGE_SIZE - 1ULL)
// Limit the maximum transfer size to 1MB, which fits comfortably
// within our single-scatter-gather-page-per-utxn setup.
#define MAX_XFER (1024*1024)
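// (A 4K scatter gather page holds 512 8-byte physical addresses, enough for
// roughly 2MB of pages; a misaligned 1MB transfer needs at most 257 pages,
// so it fits with plenty of headroom.)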
// Maximum submission and completion queue item counts, for
// queues that are a single page in size.
#define SQMAX (PAGE_SIZE / sizeof(nvme_cmd_t))
#define CQMAX (PAGE_SIZE / sizeof(nvme_cpl_t))
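// Assuming the spec-sized 64-byte submission and 16-byte completion entries
// defined in nvme-hw.h, these come out to 64 and 256 entries per 4K page.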
// per-device state bits (stored in nvme_device_t.flags)
#define FLAG_IRQ_THREAD_STARTED 0x0001
#define FLAG_IO_THREAD_STARTED 0x0002
#define FLAG_SHUTDOWN 0x0004
#define FLAG_HAS_VWC 0x0100
typedef struct {
void* io;
zx_handle_t ioh;
zx_handle_t irqh;
zx_handle_t bti;
uint32_t flags;
mtx_t lock;
// io queue doorbell registers
void* io_sq_tail_db;
void* io_cq_head_db;
nvme_cpl_t* io_cq;
nvme_cmd_t* io_sq;
uint32_t io_nsid;
uint16_t io_cq_head;
uint16_t io_cq_toggle;
uint16_t io_sq_tail;
uint16_t io_sq_head;
uint64_t utxn_avail; // bitmask of available utxns
// The pending list is txns that have been received
// via nvme_queue() and are waiting for io to start.
// The exception is the head of the pending list which may
// be partially started, waiting for more utxns to become
// available.
// The active list consists of txns where all utxns have
// been created and we're waiting for them to complete or
// error out.
list_node_t pending_txns; // inbound txns to process
list_node_t active_txns; // txns in flight
// The io signal completion is signaled from nvme_queue()
// or from the irq thread, notifying the io thread that
// it has work to do.
completion_t io_signal;
uint32_t max_xfer;
block_info_t info;
// admin queue doorbell registers
void* io_admin_sq_tail_db;
void* io_admin_cq_head_db;
// admin queues and state
nvme_cpl_t* admin_cq;
nvme_cmd_t* admin_sq;
uint16_t admin_cq_head;
uint16_t admin_cq_toggle;
uint16_t admin_sq_tail;
uint16_t admin_sq_head;
// context for admin transactions
// presently we serialize these under the admin_lock
mtx_t admin_lock;
completion_t admin_signal;
nvme_cpl_t admin_result;
pci_protocol_t pci;
zx_device_t* zxdev;
size_t iosz;
// source of physical pages for queues and admin commands
io_buffer_t iob;
thrd_t irqthread;
thrd_t iothread;
#if WITH_STATS
size_t stat_concur;
size_t stat_pending;
size_t stat_max_concur;
size_t stat_max_pending;
size_t stat_total_ops;
size_t stat_total_blocks;
#endif
// pool of utxns
nvme_utxn_t utxn[UTXN_COUNT];
} nvme_device_t;
#if WITH_STATS
#define STAT_INC(name) do { nvme->stat_##name++; } while (0)
#define STAT_DEC(name) do { nvme->stat_##name--; } while (0)
#define STAT_DEC_IF(name, c) do { if (c) nvme->stat_##name--; } while (0)
#define STAT_ADD(name, num) do { nvme->stat_##name += num; } while (0)
#define STAT_INC_MAX(name) do { \
if (++nvme->stat_##name > nvme->stat_max_##name) { \
nvme->stat_max_##name = nvme->stat_##name; \
}} while (0)
#else
#define STAT_INC(name) do { } while (0)
#define STAT_DEC(name) do { } while (0)
#define STAT_DEC_IF(name, c) do { } while (0)
#define STAT_ADD(name, num) do { } while (0)
#define STAT_INC_MAX(name) do { } while (0)
#endif
// We break IO transactions down into one or more "micro transactions" (utxn)
// based on the transfer limits of the controller, etc. Each utxn has an
// id associated with it, which is used as the command id for the command
// queued to the NVME device. This id is the same as its index into the
// pool of utxns and the bitmask of free txns, to simplify management.
//
// We maintain a pool of 63 of these, which is the number of commands
// that can be submitted to NVME via a single page submit queue.
//
// The utxns are not protected by locks. Instead, after initialization,
// they may only be touched by the io thread, which is responsible for
// queueing commands and dequeuing completion messages.
static nvme_utxn_t* utxn_get(nvme_device_t* nvme) {
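    // __builtin_ffsll returns the 1-based index of the least significant set
    // bit, or 0 if no bits are set (i.e. no utxns are available).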
uint64_t n = __builtin_ffsll(nvme->utxn_avail);
if (n == 0) {
return NULL;
}
n--;
nvme->utxn_avail &= ~(1ULL << n);
STAT_INC_MAX(concur);
return nvme->utxn + n;
}
static void utxn_put(nvme_device_t* nvme, nvme_utxn_t* utxn) {
uint64_t n = utxn->id;
STAT_DEC(concur);
nvme->utxn_avail |= (1ULL << n);
}
static zx_status_t nvme_admin_cq_get(nvme_device_t* nvme, nvme_cpl_t* cpl) {
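    // The low bit of the completion status is the phase tag; the controller
    // inverts it on each pass through the queue, so an entry is new only when
    // its phase matches our toggle.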
if ((readw(&nvme->admin_cq[nvme->admin_cq_head].status) & 1) != nvme->admin_cq_toggle) {
return ZX_ERR_SHOULD_WAIT;
}
*cpl = nvme->admin_cq[nvme->admin_cq_head];
// advance the head pointer, wrapping and inverting toggle at max
uint16_t next = (nvme->admin_cq_head + 1) & (CQMAX - 1);
if ((nvme->admin_cq_head = next) == 0) {
nvme->admin_cq_toggle ^= 1;
}
// note the new sq head reported by hw
nvme->admin_sq_head = cpl->sq_head;
// ring the doorbell
writel(next, nvme->io_admin_cq_head_db);
return ZX_OK;
}
static zx_status_t nvme_admin_sq_put(nvme_device_t* nvme, nvme_cmd_t* cmd) {
uint16_t next = (nvme->admin_sq_tail + 1) & (SQMAX - 1);
    // if tail+1 == head: queue is full
if (next == nvme->admin_sq_head) {
return ZX_ERR_SHOULD_WAIT;
}
nvme->admin_sq[nvme->admin_sq_tail] = *cmd;
nvme->admin_sq_tail = next;
// ring the doorbell
writel(next, nvme->io_admin_sq_tail_db);
return ZX_OK;
}
static zx_status_t nvme_io_cq_get(nvme_device_t* nvme, nvme_cpl_t* cpl) {
if ((readw(&nvme->io_cq[nvme->io_cq_head].status) & 1) != nvme->io_cq_toggle) {
return ZX_ERR_SHOULD_WAIT;
}
*cpl = nvme->io_cq[nvme->io_cq_head];
// advance the head pointer, wrapping and inverting toggle at max
uint16_t next = (nvme->io_cq_head + 1) & (CQMAX - 1);
if ((nvme->io_cq_head = next) == 0) {
nvme->io_cq_toggle ^= 1;
}
// note the new sq head reported by hw
nvme->io_sq_head = cpl->sq_head;
return ZX_OK;
}
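// The io completion queue head doorbell is rung separately (nvme_io_cq_ack)
// so that a batch of completions drained in io_process_cpls() costs only a
// single doorbell write.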
static void nvme_io_cq_ack(nvme_device_t* nvme) {
// ring the doorbell
writel(nvme->io_cq_head, nvme->io_cq_head_db);
}
static zx_status_t nvme_io_sq_put(nvme_device_t* nvme, nvme_cmd_t* cmd) {
uint16_t next = (nvme->io_sq_tail + 1) & (SQMAX - 1);
    // if tail+1 == head: queue is full
if (next == nvme->io_sq_head) {
return ZX_ERR_SHOULD_WAIT;
}
nvme->io_sq[nvme->io_sq_tail] = *cmd;
nvme->io_sq_tail = next;
// ring the doorbell
writel(next, nvme->io_sq_tail_db);
return ZX_OK;
}
static int irq_thread(void* arg) {
nvme_device_t* nvme = arg;
for (;;) {
zx_status_t r;
uint64_t slots;
if ((r = zx_interrupt_wait(nvme->irqh, &slots)) != ZX_OK) {
zxlogf(ERROR, "nvme: irq wait failed: %d\n", r);
break;
}
nvme_cpl_t cpl;
if (nvme_admin_cq_get(nvme, &cpl) == ZX_OK) {
nvme->admin_result = cpl;
completion_signal(&nvme->admin_signal);
}
completion_signal(&nvme->io_signal);
}
return 0;
}
static zx_status_t nvme_admin_txn(nvme_device_t* nvme, nvme_cmd_t* cmd, nvme_cpl_t* cpl) {
zx_status_t r;
mtx_lock(&nvme->admin_lock);
completion_reset(&nvme->admin_signal);
if ((r = nvme_admin_sq_put(nvme, cmd)) != ZX_OK) {
goto done;
}
if ((r = completion_wait(&nvme->admin_signal, zx_deadline_after(ZX_SEC(1)))) != ZX_OK) {
zxlogf(ERROR, "nvme: admin txn: timed out\n");
goto done;
}
unsigned code = NVME_CPL_STATUS_CODE(nvme->admin_result.status);
if (code != 0) {
zxlogf(ERROR, "nvme: admin txn: nvm error %03x\n", code);
r = ZX_ERR_IO;
}
if (cpl != NULL) {
*cpl = nvme->admin_result;
}
done:
mtx_unlock(&nvme->admin_lock);
return r;
}
static inline void txn_complete(nvme_txn_t* txn, zx_status_t status) {
txn->op.completion_cb(&txn->op, status);
}
// Attempt to generate utxns and queue nvme commands for a txn.
// Returns true if the txn could not be fully processed due to a temporary
// lack of resources (the caller should retry it later), or false if it was
// either completely queued or errored out.
static bool io_process_txn(nvme_device_t* nvme, nvme_txn_t* txn) {
zx_handle_t vmo = txn->op.rw.vmo;
nvme_utxn_t* utxn;
zx_paddr_t* pages;
zx_status_t r;
for (;;) {
// If there are no available utxns, we can't proceed
// and we tell the caller to retain the txn (true)
if ((utxn = utxn_get(nvme)) == NULL) {
return true;
}
uint32_t blocks = txn->op.rw.length;
if (blocks > nvme->max_xfer) {
blocks = nvme->max_xfer;
}
// Total transfer size in bytes
size_t bytes = ((size_t) blocks) * ((size_t) nvme->info.block_size);
// Page offset of first page of transfer
size_t pageoffset = txn->op.rw.offset_vmo & (~PAGE_MASK);
// Byte offset into first page of transfer
size_t byteoffset = txn->op.rw.offset_vmo & PAGE_MASK;
// Total pages mapped / touched
size_t pagecount = (byteoffset + bytes + PAGE_MASK) >> PAGE_SHIFT;
// read disk (OP_READ) -> memory (PERM_WRITE) or
// write memory (PERM_READ) -> disk (OP_WRITE)
uint32_t opt = (txn->opcode == NVME_OP_READ) ? ZX_BTI_PERM_WRITE : ZX_BTI_PERM_READ;
pages = utxn->virt;
if ((r = zx_bti_pin(nvme->bti, opt, vmo, pageoffset, pagecount << PAGE_SHIFT,
pages, pagecount)) != ZX_OK) {
zxlogf(ERROR, "nvme: could not pin pages: %d\n", r);
            goto fail_unpinned; // nothing was pinned, so skip the unpin below
}
nvme_cmd_t cmd;
memset(&cmd, 0, sizeof(cmd));
cmd.cmd = NVME_CMD_CID(utxn->id) | NVME_CMD_PRP | NVME_CMD_NORMAL | NVME_CMD_OPC(txn->opcode);
cmd.nsid = 1;
cmd.u.rw.start_lba = txn->op.rw.offset_dev;
cmd.u.rw.block_count = blocks - 1;
// The NVME command has room for two data pointers inline.
// The first is always the pointer to the first page where data is.
// The second is the second page if pagecount is 2.
// The second is the address of an array of page 2..n if pagecount > 2
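        // For example (with 4K pages), a 16KB read starting 512 bytes into a
        // page touches 5 pages: prp[0] covers the first, partial page and
        // prp[1] points at the on-page list holding pages[1..4].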
cmd.dptr.prp[0] = pages[0] | byteoffset;
if (pagecount == 2) {
cmd.dptr.prp[1] = pages[1];
} else if (pagecount > 2) {
cmd.dptr.prp[1] = utxn->phys + sizeof(uint64_t);
}
zxlogf(TRACE, "nvme: txn=%p utxn id=%u pages=%zu op=%s\n", txn, utxn->id, pagecount,
txn->opcode == NVME_OP_WRITE ? "WR" : "RD");
zxlogf(SPEW, "nvme: prp[0]=%016zx prp[1]=%016zx\n", cmd.dptr.prp[0], cmd.dptr.prp[1]);
zxlogf(SPEW, "nvme: pages[] = { %016zx, %016zx, %016zx, %016zx, ... }\n",
pages[0], pages[1], pages[2], pages[3]);
if ((r = nvme_io_sq_put(nvme, &cmd)) != ZX_OK) {
zxlogf(ERROR, "nvme: could not submit cmd (txn=%p id=%u)\n", txn, utxn->id);
break;
}
utxn->txn = txn;
// keep track of where we are
txn->op.rw.offset_dev += blocks;
txn->op.rw.offset_vmo += bytes;
txn->op.rw.length -= blocks;
txn->pending_utxns++;
// If there's no more remaining, we're done, and we
// move this txn to the active list and tell the
// caller not to retain the txn (false)
if (txn->op.rw.length == 0) {
mtx_lock(&nvme->lock);
list_add_tail(&nvme->active_txns, &txn->node);
mtx_unlock(&nvme->lock);
return false;
}
}
    // failure: unpin (if the pin succeeded), release the utxn, and error out
    if ((r = zx_bti_unpin(nvme->bti, pages[0])) != ZX_OK) {
        zxlogf(ERROR, "nvme: cannot unpin io buffer: %d\n", r);
    }
fail_unpinned:
    utxn_put(nvme, utxn);
mtx_lock(&nvme->lock);
txn->flags |= TXN_FLAG_FAILED;
if (txn->pending_utxns) {
// if there are earlier uncompleted IOs we become active now
// and will finish erroring out when they complete
list_add_tail(&nvme->active_txns, &txn->node);
txn = NULL;
}
mtx_unlock(&nvme->lock);
if (txn != NULL) {
txn_complete(txn, ZX_ERR_INTERNAL);
}
// Either way we tell the caller not to retain the txn (false)
return false;
}
static void io_process_txns(nvme_device_t* nvme) {
nvme_txn_t* txn;
for (;;) {
mtx_lock(&nvme->lock);
txn = list_remove_head_type(&nvme->pending_txns, nvme_txn_t, node);
STAT_DEC_IF(pending, txn != NULL);
mtx_unlock(&nvme->lock);
if (txn == NULL) {
return;
}
if (io_process_txn(nvme, txn)) {
// put txn back at front of queue for further processing later
mtx_lock(&nvme->lock);
list_add_head(&nvme->pending_txns, &txn->node);
STAT_INC_MAX(pending);
mtx_unlock(&nvme->lock);
return;
}
}
}
static void io_process_cpls(nvme_device_t* nvme) {
bool ring_doorbell = false;
nvme_cpl_t cpl;
while (nvme_io_cq_get(nvme, &cpl) == ZX_OK) {
ring_doorbell = true;
if (cpl.cmd_id >= UTXN_COUNT) {
zxlogf(ERROR, "nvme: unexpected cmd id %u\n", cpl.cmd_id);
continue;
}
nvme_utxn_t* utxn = nvme->utxn + cpl.cmd_id;
nvme_txn_t* txn = utxn->txn;
if (txn == NULL) {
zxlogf(ERROR, "nvme: inactive utxn #%u completed?!\n", cpl.cmd_id);
continue;
}
uint32_t code = NVME_CPL_STATUS_CODE(cpl.status);
if (code != 0) {
zxlogf(ERROR, "nvme: utxn #%u txn %p failed: status=%03x\n",
cpl.cmd_id, txn, code);
txn->flags |= TXN_FLAG_FAILED;
// discard any remaining bytes -- no reason to keep creating
// further utxns once one has failed
txn->op.rw.length = 0;
} else {
zxlogf(SPEW, "nvme: utxn #%u txn %p OKAY\n", cpl.cmd_id, txn);
}
zx_status_t r;
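        // the base physical address of the pinned range (the first entry of
        // the utxn's page list) identifies which pin to release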
if ((r = zx_bti_unpin(nvme->bti, ((zx_paddr_t*)utxn->virt)[0])) != ZX_OK) {
zxlogf(ERROR, "nvme: cannot unpin io buffer: %d\n", r);
}
// release the microtransaction
utxn->txn = NULL;
utxn_put(nvme, utxn);
txn->pending_utxns--;
if ((txn->pending_utxns == 0) && (txn->op.rw.length == 0)) {
// remove from either pending or active list
mtx_lock(&nvme->lock);
list_delete(&txn->node);
mtx_unlock(&nvme->lock);
zxlogf(TRACE, "nvme: txn %p %s\n", txn, txn->flags & TXN_FLAG_FAILED ? "error" : "okay");
txn_complete(txn, txn->flags & TXN_FLAG_FAILED ? ZX_ERR_IO : ZX_OK);
}
}
if (ring_doorbell) {
nvme_io_cq_ack(nvme);
}
}
static int io_thread(void* arg) {
nvme_device_t* nvme = arg;
for (;;) {
if (completion_wait(&nvme->io_signal, ZX_TIME_INFINITE)) {
break;
}
if (nvme->flags & FLAG_SHUTDOWN) {
//TODO: cancel out pending IO
zxlogf(INFO, "nvme: io thread exiting\n");
break;
}
completion_reset(&nvme->io_signal);
// process completion messages
io_process_cpls(nvme);
// process work queue
io_process_txns(nvme);
}
return 0;
}
static void nvme_queue(void* ctx, block_op_t* op) {
nvme_device_t* nvme = ctx;
nvme_txn_t* txn = containerof(op, nvme_txn_t, op);
switch (txn->op.command & BLOCK_OP_MASK) {
case BLOCK_OP_READ:
txn->opcode = NVME_OP_READ;
break;
case BLOCK_OP_WRITE:
txn->opcode = NVME_OP_WRITE;
break;
case BLOCK_OP_FLUSH:
// TODO
txn_complete(txn, ZX_OK);
return;
default:
txn_complete(txn, ZX_ERR_NOT_SUPPORTED);
return;
}
if (txn->op.rw.length == 0) {
txn_complete(txn, ZX_ERR_INVALID_ARGS);
return;
}
// Transaction must fit within device
if ((txn->op.rw.offset_dev >= nvme->info.block_count) ||
(nvme->info.block_count - txn->op.rw.offset_dev < txn->op.rw.length)) {
txn_complete(txn, ZX_ERR_OUT_OF_RANGE);
return;
}
// convert vmo offset to a byte offset
txn->op.rw.offset_vmo *= nvme->info.block_size;
txn->pending_utxns = 0;
txn->flags = 0;
zxlogf(SPEW, "nvme: io: %s: %ublks @ blk#%zu\n",
txn->opcode == NVME_OP_WRITE ? "wr" : "rd",
txn->op.rw.length + 1U, txn->op.rw.offset_dev);
STAT_INC(total_ops);
STAT_ADD(total_blocks, txn->op.rw.length);
mtx_lock(&nvme->lock);
list_add_tail(&nvme->pending_txns, &txn->node);
STAT_INC_MAX(pending);
mtx_unlock(&nvme->lock);
completion_signal(&nvme->io_signal);
}
static void nvme_query(void* ctx, block_info_t* info_out, size_t* block_op_size_out) {
nvme_device_t* nvme = ctx;
*info_out = nvme->info;
*block_op_size_out = sizeof(nvme_txn_t);
}
static zx_status_t nvme_ioctl(void* ctx, uint32_t op, const void* cmd, size_t cmdlen, void* reply,
size_t max, size_t* out_actual) {
nvme_device_t* nvme = ctx;
switch (op) {
case IOCTL_BLOCK_GET_INFO: {
if (max < sizeof(block_info_t)) {
return ZX_ERR_BUFFER_TOO_SMALL;
}
size_t sz;
nvme_query(nvme, reply, &sz);
*out_actual = sizeof(block_info_t);
return ZX_OK;
}
case IOCTL_BLOCK_GET_STATS: {
#if WITH_STATS
if (cmdlen != sizeof(bool)) {
return ZX_ERR_INVALID_ARGS;
}
block_stats_t* out = reply;
if (max < sizeof(*out)) {
return ZX_ERR_BUFFER_TOO_SMALL;
}
mtx_lock(&nvme->lock);
out->max_concur = nvme->stat_max_concur;
out->max_pending = nvme->stat_max_pending;
out->total_ops = nvme->stat_total_ops;
out->total_blocks = nvme->stat_total_blocks;
bool clear = *(bool *)cmd;
if (clear) {
nvme->stat_max_concur = 0;
nvme->stat_max_pending = 0;
nvme->stat_total_ops = 0;
nvme->stat_total_blocks = 0;
}
mtx_unlock(&nvme->lock);
*out_actual = sizeof(*out);
return ZX_OK;
#else
return ZX_ERR_NOT_SUPPORTED;
#endif
}
case IOCTL_DEVICE_SYNC: {
return ZX_OK;
}
default:
return ZX_ERR_NOT_SUPPORTED;
}
}
static zx_off_t nvme_get_size(void* ctx) {
nvme_device_t* nvme = ctx;
return nvme->info.block_count * nvme->info.block_size;
}
static zx_status_t nvme_suspend(void* ctx, uint32_t flags) {
return ZX_OK;
}
static zx_status_t nvme_resume(void* ctx, uint32_t flags) {
return ZX_OK;
}
static void nvme_release(void* ctx) {
nvme_device_t* nvme = ctx;
int r;
zxlogf(INFO, "nvme: release\n");
nvme->flags |= FLAG_SHUTDOWN;
if (nvme->ioh != ZX_HANDLE_INVALID) {
pci_enable_bus_master(&nvme->pci, false);
zx_handle_close(nvme->bti);
zx_handle_close(nvme->ioh);
// TODO: risks a handle use-after-close, will be resolved by IRQ api
// changes coming soon
zx_handle_close(nvme->irqh);
}
if (nvme->flags & FLAG_IRQ_THREAD_STARTED) {
thrd_join(nvme->irqthread, &r);
}
if (nvme->flags & FLAG_IO_THREAD_STARTED) {
completion_signal(&nvme->io_signal);
thrd_join(nvme->iothread, &r);
}
// error out any pending txns
mtx_lock(&nvme->lock);
nvme_txn_t* txn;
while ((txn = list_remove_head_type(&nvme->active_txns, nvme_txn_t, node)) != NULL) {
txn_complete(txn, ZX_ERR_PEER_CLOSED);
}
while ((txn = list_remove_head_type(&nvme->pending_txns, nvme_txn_t, node)) != NULL) {
txn_complete(txn, ZX_ERR_PEER_CLOSED);
}
mtx_unlock(&nvme->lock);
io_buffer_release(&nvme->iob);
free(nvme);
}
static zx_protocol_device_t device_ops = {
.version = DEVICE_OPS_VERSION,
.ioctl = nvme_ioctl,
.get_size = nvme_get_size,
.suspend = nvme_suspend,
.resume = nvme_resume,
.release = nvme_release,
};
static void infostring(const char* prefix, uint8_t* str, size_t len) {
char tmp[len + 1];
size_t i;
for (i = 0; i < len; i++) {
uint8_t c = str[i];
if (c == 0) {
break;
}
if ((c < ' ') || (c > 127)) {
c = ' ';
}
tmp[i] = c;
}
tmp[i] = 0;
while (i > 0) {
i--;
if (tmp[i] == ' ') {
tmp[i] = 0;
} else {
break;
}
}
zxlogf(INFO, "nvme: %s'%s'\n", prefix, tmp);
}
// Convenience accessors for BAR0 registers
#define rd32(r) readl(nvme->io + NVME_REG_##r)
#define rd64(r) readll(nvme->io + NVME_REG_##r)
#define wr32(v,r) writel(v, nvme->io + NVME_REG_##r)
#define wr64(v,r) writell(v, nvme->io + NVME_REG_##r)
// dedicated pages from the page pool
#define IDX_ADMIN_SQ 0
#define IDX_ADMIN_CQ 1
#define IDX_IO_SQ 2
#define IDX_IO_CQ 3
#define IDX_SCRATCH 4
#define IDX_UTXN_POOL 5 // this must always be last
#define IO_PAGE_COUNT (IDX_UTXN_POOL + UTXN_COUNT)
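// 5 fixed pages (admin SQ/CQ, io SQ/CQ, scratch) plus one scatter gather page
// per utxn: 68 pages total with UTXN_COUNT == 63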
static inline uint64_t U64(uint8_t* x) {
return *((uint64_t*) (void*) x);
}
static inline uint32_t U32(uint8_t* x) {
return *((uint32_t*) (void*) x);
}
static inline uint16_t U16(uint8_t* x) {
    return *((uint16_t*) (void*) x);
}
#define WAIT_MS 5000
static zx_status_t nvme_init(nvme_device_t* nvme) {
uint32_t n = rd32(VS);
uint64_t cap = rd64(CAP);
zxlogf(INFO, "nvme: version %d.%d.%d\n", n >> 16, (n >> 8) & 0xFF, n & 0xFF);
zxlogf(INFO, "nvme: page size: (MPSMIN): %u (MPSMAX): %u\n",
(unsigned) (1 << NVME_CAP_MPSMIN(cap)),
(unsigned) (1 << NVME_CAP_MPSMAX(cap)));
zxlogf(INFO, "nvme: doorbell stride: %u\n", (unsigned) (1 << NVME_CAP_DSTRD(cap)));
zxlogf(INFO, "nvme: timeout: %u ms\n", (unsigned) (1 << NVME_CAP_TO(cap)));
zxlogf(INFO, "nvme: boot partition support (BPS): %c\n", NVME_CAP_BPS(cap) ? 'Y' : 'N');
zxlogf(INFO, "nvme: supports NVM command set (CSS:NVM): %c\n", NVME_CAP_CSS_NVM(cap) ? 'Y' : 'N');
zxlogf(INFO, "nvme: subsystem reset supported (NSSRS): %c\n", NVME_CAP_NSSRS(cap) ? 'Y' : 'N');
zxlogf(INFO, "nvme: weighted-round-robin (AMS:WRR): %c\n", NVME_CAP_AMS_WRR(cap) ? 'Y' : 'N');
zxlogf(INFO, "nvme: vendor-specific arbitration (AMS:VS): %c\n", NVME_CAP_AMS_VS(cap) ? 'Y' : 'N');
zxlogf(INFO, "nvme: contiquous queues required (CQR): %c\n", NVME_CAP_CQR(cap) ? 'Y' : 'N');
zxlogf(INFO, "nvme: maximum queue entries supported (MQES): %u\n", ((unsigned) NVME_CAP_MQES(cap)) + 1);
if ((1 << NVME_CAP_MPSMIN(cap)) > PAGE_SIZE) {
zxlogf(ERROR, "nvme: minimum page size larger than platform page size\n");
return ZX_ERR_NOT_SUPPORTED;
}
// allocate pages for various queues and the utxn scatter lists
// TODO: these should all be RO to hardware apart from the scratch io page(s)
if (io_buffer_init_with_bti(&nvme->iob, nvme->bti, PAGE_SIZE * IO_PAGE_COUNT, IO_BUFFER_RW) ||
io_buffer_physmap(&nvme->iob)) {
zxlogf(ERROR, "nvme: could not allocate io buffers\n");
return ZX_ERR_NO_MEMORY;
}
// initialize the microtransaction pool
nvme->utxn_avail = 0x7FFFFFFFFFFFFFFFULL;
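    // 63 bits set, one per utxn; bit 63 stays clear since only UTXN_COUNT (63)
    // entries exist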
for (unsigned n = 0; n < UTXN_COUNT; n++) {
nvme->utxn[n].id = n;
nvme->utxn[n].phys = nvme->iob.phys_list[IDX_UTXN_POOL + n];
nvme->utxn[n].virt = nvme->iob.virt + (IDX_UTXN_POOL + n) * PAGE_SIZE;
}
if (rd32(CSTS) & NVME_CSTS_RDY) {
zxlogf(INFO, "nvme: controller is active. resetting...\n");
wr32(rd32(CC) & ~NVME_CC_EN, CC); // disable
}
// ensure previous shutdown (by us or bootloader) has completed
unsigned ms_remain = WAIT_MS;
while (rd32(CSTS) & NVME_CSTS_RDY) {
if (--ms_remain == 0) {
zxlogf(ERROR, "nvme: timed out waiting for CSTS ~RDY\n");
return ZX_ERR_INTERNAL;
}
zx_nanosleep(zx_deadline_after(ZX_MSEC(1)));
}
zxlogf(INFO, "nvme: controller inactive. (after %u ms)\n", WAIT_MS - ms_remain);
// configure admin submission and completion queues
wr64(nvme->iob.phys_list[IDX_ADMIN_SQ], ASQ);
wr64(nvme->iob.phys_list[IDX_ADMIN_CQ], ACQ);
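    // the AQA queue size fields are zero-based, hence the -1 on each count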
wr32(NVME_AQA_ASQS(SQMAX - 1) | NVME_AQA_ACQS(CQMAX - 1), AQA);
zxlogf(INFO, "nvme: enabling\n");
wr32(NVME_CC_EN | NVME_CC_AMS_RR | NVME_CC_MPS(0) |
NVME_CC_IOCQES(NVME_CPL_SHIFT) |
NVME_CC_IOSQES(NVME_CMD_SHIFT), CC);
ms_remain = WAIT_MS;
while (!(rd32(CSTS) & NVME_CSTS_RDY)) {
if (--ms_remain == 0) {
zxlogf(ERROR, "nvme: timed out waiting for CSTS RDY\n");
return ZX_ERR_INTERNAL;
}
zx_nanosleep(zx_deadline_after(ZX_MSEC(1)));
}
zxlogf(INFO, "nvme: controller ready. (after %u ms)\n", WAIT_MS - ms_remain);
// registers and buffers for admin queues
nvme->io_admin_sq_tail_db = nvme->io + NVME_REG_SQnTDBL(0, cap);
nvme->io_admin_cq_head_db = nvme->io + NVME_REG_CQnHDBL(0, cap);
nvme->admin_sq = nvme->iob.virt + PAGE_SIZE * IDX_ADMIN_SQ;
nvme->admin_sq_head = 0;
nvme->admin_sq_tail = 0;
nvme->admin_cq = nvme->iob.virt + PAGE_SIZE * IDX_ADMIN_CQ;
nvme->admin_cq_head = 0;
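    // Completion entries start zeroed and the controller writes phase tag 1 on
    // its first pass through each queue, so both toggles start at 1 (see
    // nvme_admin_cq_get / nvme_io_cq_get).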
nvme->admin_cq_toggle = 1;
// registers and buffers for IO queues
nvme->io_sq_tail_db = nvme->io + NVME_REG_SQnTDBL(1, cap);
nvme->io_cq_head_db = nvme->io + NVME_REG_CQnHDBL(1, cap);
nvme->io_sq = nvme->iob.virt + PAGE_SIZE * IDX_IO_SQ;
nvme->io_sq_head = 0;
nvme->io_sq_tail = 0;
nvme->io_cq = nvme->iob.virt + PAGE_SIZE * IDX_IO_CQ;
nvme->io_cq_head = 0;
nvme->io_cq_toggle = 1;
// scratch page for admin ops
void* scratch = nvme->iob.virt + PAGE_SIZE * IDX_SCRATCH;
if (thrd_create_with_name(&nvme->irqthread, irq_thread, nvme, "nvme-irq-thread")) {
zxlogf(ERROR, "nvme; cannot create irq thread\n");
return ZX_ERR_INTERNAL;
}
nvme->flags |= FLAG_IRQ_THREAD_STARTED;
if (thrd_create_with_name(&nvme->iothread, io_thread, nvme, "nvme-io-thread")) {
zxlogf(ERROR, "nvme; cannot create io thread\n");
return ZX_ERR_INTERNAL;
}
nvme->flags |= FLAG_IO_THREAD_STARTED;
nvme_cmd_t cmd;
// identify device
cmd.cmd = NVME_CMD_CID(0) | NVME_CMD_PRP | NVME_CMD_NORMAL | NVME_CMD_OPC(NVME_ADMIN_OP_IDENTIFY);
cmd.nsid = 0;
cmd.reserved = 0;
cmd.mptr = 0;
cmd.dptr.prp[0] = nvme->iob.phys_list[IDX_SCRATCH];
cmd.dptr.prp[1] = 0;
    cmd.u.raw[0] = 1; // CNS 01: identify controller data structure
if (nvme_admin_txn(nvme, &cmd, NULL) != ZX_OK) {
zxlogf(ERROR, "nvme: device identify op failed\n");
return ZX_ERR_INTERNAL;
}
nvme_identify_t* ci = scratch;
infostring("model: ", ci->MN, sizeof(ci->MN));
infostring("serial number: ", ci->SN, sizeof(ci->SN));
infostring("firmware: ", ci->FR, sizeof(ci->FR));
if ((ci->SQES & 0xF) != NVME_CMD_SHIFT) {
zxlogf(ERROR, "nvme: SQES minimum is not %ub\n", NVME_CMD_SIZE);
return ZX_ERR_NOT_SUPPORTED;
}
if ((ci->CQES & 0xF) != NVME_CPL_SHIFT) {
zxlogf(ERROR, "nvme: CQES minimum is not %ub\n", NVME_CPL_SIZE);
return ZX_ERR_NOT_SUPPORTED;
}
zxlogf(INFO, "nvme: max outstanding commands: %u\n", ci->MAXCMD);
uint32_t nscount = ci->NN;
zxlogf(INFO, "nvme: max namespaces: %u\n", nscount);
zxlogf(INFO, "nvme: scatter gather lists (SGL): %c %08x\n",
(ci->SGLS & 3) ? 'Y' : 'N', ci->SGLS);
// Maximum transfer is in units of 2^n * PAGESIZE, n == 0 means "infinite"
nvme->max_xfer = 0xFFFFFFFF;
if ((ci->MDTS != 0) && (ci->MDTS < (31 - PAGE_SHIFT))) {
nvme->max_xfer = (1 << ci->MDTS) * PAGE_SIZE;
}
zxlogf(INFO, "nvme: max data transfer: %u bytes\n", nvme->max_xfer);
zxlogf(INFO, "nvme: sanitize caps: %u\n", ci->SANICAP & 3);
zxlogf(INFO, "nvme: abort command limit (ACL): %u\n", ci->ACL + 1);
zxlogf(INFO, "nvme: asynch event req limit (AERL): %u\n", ci->AERL + 1);
zxlogf(INFO, "nvme: firmware: slots: %u reset: %c slot1ro: %c\n", (ci->FRMW >> 1) & 3,
(ci->FRMW & (1 << 4)) ? 'N' : 'Y', (ci->FRMW & 1) ? 'Y' : 'N');
zxlogf(INFO, "nvme: host buffer: min/preferred: %u/%u pages\n", ci->HMMIN, ci->HMPRE);
zxlogf(INFO, "nvme: capacity: total/unalloc: %zu/%zu\n", ci->TNVMCAP_LO, ci->UNVMCAP_LO);
if (ci->VWC & 1) {
nvme->flags |= FLAG_HAS_VWC;
}
uint32_t awun = ci->AWUN + 1;
uint32_t awupf = ci->AWUPF + 1;
zxlogf(INFO, "nvme: volatile write cache (VWC): %s\n", nvme->flags & FLAG_HAS_VWC ? "Y" : "N");
zxlogf(INFO, "nvme: atomic write unit (AWUN)/(AWUPF): %u/%u blks\n", awun, awupf);
#define FEATURE(a,b) if (ci->a & a##_##b) zxlogf(INFO, "nvme: feature: %s\n", #b)
FEATURE(OACS, DOORBELL_BUFFER_CONFIG);
FEATURE(OACS, VIRTUALIZATION_MANAGEMENT);
FEATURE(OACS, NVME_MI_SEND_RECV);
FEATURE(OACS, DIRECTIVE_SEND_RECV);
FEATURE(OACS, DEVICE_SELF_TEST);
FEATURE(OACS, NAMESPACE_MANAGEMENT);
FEATURE(OACS, FIRMWARE_DOWNLOAD_COMMIT);
FEATURE(OACS, FORMAT_NVM);
FEATURE(OACS, SECURITY_SEND_RECV);
FEATURE(ONCS, TIMESTAMP);
FEATURE(ONCS, RESERVATIONS);
FEATURE(ONCS, SAVE_SELECT_NONZERO);
FEATURE(ONCS, WRITE_UNCORRECTABLE);
FEATURE(ONCS, COMPARE);
// set feature (number of queues) to 1 iosq and 1 iocq
memset(&cmd, 0, sizeof(cmd));
cmd.cmd = NVME_CMD_CID(0) | NVME_CMD_PRP | NVME_CMD_NORMAL | NVME_CMD_OPC(NVME_ADMIN_OP_SET_FEATURE);
cmd.u.raw[0] = NVME_FEATURE_NUMBER_OF_QUEUES;
cmd.u.raw[1] = 0;
nvme_cpl_t cpl;
if (nvme_admin_txn(nvme, &cmd, &cpl) != ZX_OK) {
zxlogf(ERROR, "nvme: set feature (number queues) op failed\n");
return ZX_ERR_INTERNAL;
}
zxlogf(INFO,"cpl.cmd %08x\n", cpl.cmd);
// create the IO completion queue
memset(&cmd, 0, sizeof(cmd));
cmd.cmd = NVME_CMD_CID(0) | NVME_CMD_PRP | NVME_CMD_NORMAL | NVME_CMD_OPC(NVME_ADMIN_OP_CREATE_IOCQ);
cmd.dptr.prp[0] = nvme->iob.phys_list[IDX_IO_CQ];
cmd.u.raw[0] = ((CQMAX - 1) << 16) | 1; // queue size, queue id
cmd.u.raw[1] = (0 << 16) | 2 | 1; // irq vector, irq enable, phys contig
if (nvme_admin_txn(nvme, &cmd, NULL) != ZX_OK) {
zxlogf(ERROR, "nvme: completion queue creation op failed\n");
return ZX_ERR_INTERNAL;
}
// create the IO submit queue
memset(&cmd, 0, sizeof(cmd));
cmd.cmd = NVME_CMD_CID(0) | NVME_CMD_PRP | NVME_CMD_NORMAL | NVME_CMD_OPC(NVME_ADMIN_OP_CREATE_IOSQ);
cmd.dptr.prp[0] = nvme->iob.phys_list[IDX_IO_SQ];
cmd.u.raw[0] = ((SQMAX - 1) << 16) | 1; // queue size, queue id
cmd.u.raw[1] = (1 << 16) | 0 | 1; // cqid, qprio, phys contig
if (nvme_admin_txn(nvme, &cmd, NULL) != ZX_OK) {
zxlogf(ERROR, "nvme: submit queue creation op failed\n");
return ZX_ERR_INTERNAL;
}
// identify namespace 1
memset(&cmd, 0, sizeof(cmd));
cmd.cmd = NVME_CMD_CID(0) | NVME_CMD_PRP | NVME_CMD_NORMAL | NVME_CMD_OPC(NVME_ADMIN_OP_IDENTIFY);
cmd.nsid = 1;
cmd.dptr.prp[0] = nvme->iob.phys_list[IDX_SCRATCH];
if (nvme_admin_txn(nvme, &cmd, NULL) != ZX_OK) {
zxlogf(ERROR, "nvme: namespace identify op failed\n");
return ZX_ERR_INTERNAL;
}
nvme_identify_ns_t* ni = scratch;
uint32_t nawun = (ni->NSFEAT & NSFEAT_LOCAL_ATOMIC_SIZES) ? (ni->NAWUN + 1U) : awun;
uint32_t nawupf = (ni->NSFEAT & NSFEAT_LOCAL_ATOMIC_SIZES) ? (ni->NAWUPF + 1U) : awupf;
zxlogf(INFO, "nvme: ns: atomic write unit (AWUN)/(AWUPF): %u/%u blks\n", nawun, nawupf);
zxlogf(INFO, "nvme: ns: NABSN/NABO/NABSPF/NOIOB: %u/%u/%u/%u\n",
ni->NABSN, ni->NABO, ni->NABSPF, ni->NOIOB);
// table of block formats
for (unsigned i = 0; i < 16; i++) {
if (ni->LBAF[i]) {
zxlogf(INFO, "nvme: ns: LBA FMT %02d: RP=%u LBADS=2^%ub MS=%ub\n",
i, NVME_LBAFMT_RP(ni->LBAF[i]), NVME_LBAFMT_LBADS(ni->LBAF[i]),
NVME_LBAFMT_MS(ni->LBAF[i]));
}
}
zxlogf(INFO, "nvme: ns: LBA FMT #%u active\n", ni->FLBAS & 0xF);
zxlogf(INFO, "nvme: ns: data protection: caps/set: 0x%02x/%u\n",
ni->DPC & 0x3F, ni->DPS & 3);
uint32_t fmt = ni->LBAF[ni->FLBAS & 0xF];
zxlogf(INFO, "nvme: ns: size/cap/util: %zu/%zu/%zu blks\n", ni->NSSZ, ni->NCAP, ni->NUSE);
nvme->info.block_count = ni->NSSZ;
nvme->info.block_size = 1 << NVME_LBAFMT_LBADS(fmt);
nvme->info.max_transfer_size = 0xFFFFFFFF;
if (NVME_LBAFMT_MS(fmt)) {
zxlogf(ERROR, "nvme: cannot handle LBA format with metadata\n");
return ZX_ERR_NOT_SUPPORTED;
}
if ((nvme->info.block_size < 512) || (nvme->info.block_size > 32768)) {
zxlogf(ERROR, "nvme: cannot handle LBA size of %u\n", nvme->info.block_size);
return ZX_ERR_NOT_SUPPORTED;
}
    // NVME r/w commands operate in block units; the zero-based 16-bit block
    // count field allows at most 65536 (64K) blocks per command:
size_t max_bytes_per_cmd = ((size_t) nvme->info.block_size) * ((size_t) 65536);
if (nvme->max_xfer > max_bytes_per_cmd) {
nvme->max_xfer = max_bytes_per_cmd;
}
// The device may allow transfers larger than we are prepared
// to handle. Clip to our limit.
if (nvme->max_xfer > MAX_XFER) {
nvme->max_xfer = MAX_XFER;
}
// convert to block units
nvme->max_xfer /= nvme->info.block_size;
zxlogf(INFO, "nvme: max transfer per r/w op: %u blocks (%u bytes)\n",
nvme->max_xfer, nvme->max_xfer * nvme->info.block_size);
device_make_visible(nvme->zxdev);
return ZX_OK;
}
block_protocol_ops_t block_ops = {
.query = nvme_query,
.queue = nvme_queue,
};
static zx_status_t nvme_bind(void* ctx, zx_device_t* dev) {
nvme_device_t* nvme;
if ((nvme = calloc(1, sizeof(nvme_device_t))) == NULL) {
return ZX_ERR_NO_MEMORY;
}
list_initialize(&nvme->pending_txns);
list_initialize(&nvme->active_txns);
mtx_init(&nvme->lock, mtx_plain);
mtx_init(&nvme->admin_lock, mtx_plain);
if (device_get_protocol(dev, ZX_PROTOCOL_PCI, &nvme->pci)) {
goto fail;
}
if (pci_map_bar(&nvme->pci, 0u, ZX_CACHE_POLICY_UNCACHED_DEVICE,
&nvme->io, &nvme->iosz, &nvme->ioh)) {
zxlogf(ERROR, "nvme: cannot map registers\n");
goto fail;
}
uint32_t modes[3] = {
ZX_PCIE_IRQ_MODE_MSI_X, ZX_PCIE_IRQ_MODE_MSI, ZX_PCIE_IRQ_MODE_LEGACY,
};
uint32_t nirq = 0;
for (unsigned n = 0; n < countof(modes); n++) {
if ((pci_query_irq_mode(&nvme->pci, modes[n], &nirq) == ZX_OK) &&
(pci_set_irq_mode(&nvme->pci, modes[n], 1) == ZX_OK)) {
zxlogf(INFO, "nvme: irq mode %u, irq count %u (#%u)\n", modes[n], nirq, n);
goto irq_configured;
}
}
zxlogf(ERROR, "nvme: could not configure irqs\n");
goto fail;
irq_configured:
if (pci_map_interrupt(&nvme->pci, 0, &nvme->irqh) != ZX_OK) {
zxlogf(ERROR, "nvme: could not map irq\n");
goto fail;
}
if (pci_enable_bus_master(&nvme->pci, true)) {
zxlogf(ERROR, "nvme: cannot enable bus mastering\n");
goto fail;
}
if (pci_get_bti(&nvme->pci, 0, &nvme->bti) != ZX_OK) {
zxlogf(ERROR, "nvme: cannot obtain bti handle\n");
goto fail;
}
device_add_args_t args = {
.version = DEVICE_ADD_ARGS_VERSION,
.name = "nvme",
.ctx = nvme,
.ops = &device_ops,
.flags = DEVICE_ADD_INVISIBLE,
.proto_id = ZX_PROTOCOL_BLOCK_IMPL,
.proto_ops = &block_ops,
};
if (device_add(dev, &args, &nvme->zxdev)) {
goto fail;
}
if (nvme_init(nvme) != ZX_OK) {
zxlogf(ERROR, "nvme: init failed\n");
device_remove(nvme->zxdev);
return ZX_ERR_INTERNAL;
}
return ZX_OK;
fail:
nvme_release(nvme);
return ZX_ERR_NOT_SUPPORTED;
}
static zx_driver_ops_t driver_ops = {
.version = DRIVER_OPS_VERSION,
.bind = nvme_bind,
};
ZIRCON_DRIVER_BEGIN(nvme, driver_ops, "zircon", "0.1", 4)
BI_ABORT_IF(NE, BIND_PROTOCOL, ZX_PROTOCOL_PCI),
BI_ABORT_IF(NE, BIND_PCI_CLASS, 1), // Mass Storage
BI_ABORT_IF(NE, BIND_PCI_SUBCLASS, 8), // NVM
BI_MATCH_IF(EQ, BIND_PCI_INTERFACE, 2), // NVMHCI
ZIRCON_DRIVER_END(nvme)