| // Copyright 2017 The Fuchsia Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| #include "msd_arm_device.h" |
| |
| #include <fbl/algorithm.h> |
| #include <fbl/string_printf.h> |
| |
| #include <bitset> |
| #include <cinttypes> |
| #include <cstdio> |
| #include <string> |
| |
| #include "job_scheduler.h" |
| #include "magma_util/dlog.h" |
| #include "magma_util/macros.h" |
| #include "magma_vendor_queries.h" |
| #include "platform_barriers.h" |
| #include "platform_port.h" |
| #include "platform_trace.h" |
| |
| #include "registers.h" |
| |
| // This is the index into the MMIO section of the MDI. |
| enum MmioIndex { |
| kMmioIndexRegisters = 0, |
| }; |
| |
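| // Indices into the platform device's interrupt list, used with RegisterInterrupt(). |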
| enum InterruptIndex { |
| kInterruptIndexJob = 0, |
| kInterruptIndexMmu = 1, |
| kInterruptIndexGpu = 2, |
| }; |
| |
| class MsdArmDevice::DumpRequest : public DeviceRequest { |
| public: |
| DumpRequest() {} |
| |
| protected: |
| magma::Status Process(MsdArmDevice* device) override |
| { |
| return device->ProcessDumpStatusToLog(); |
| } |
| }; |
| |
| class MsdArmDevice::PerfCounterSampleCompletedRequest : public DeviceRequest { |
| public: |
| PerfCounterSampleCompletedRequest() {} |
| |
| protected: |
| magma::Status Process(MsdArmDevice* device) override |
| { |
| return device->ProcessPerfCounterSampleCompleted(); |
| } |
| }; |
| |
| class MsdArmDevice::JobInterruptRequest : public DeviceRequest { |
| public: |
| JobInterruptRequest() {} |
| |
| protected: |
| magma::Status Process(MsdArmDevice* device) override { return device->ProcessJobInterrupt(); } |
| }; |
| |
| class MsdArmDevice::MmuInterruptRequest : public DeviceRequest { |
| public: |
| MmuInterruptRequest() {} |
| |
| protected: |
| magma::Status Process(MsdArmDevice* device) override { return device->ProcessMmuInterrupt(); } |
| }; |
| |
| class MsdArmDevice::ScheduleAtomRequest : public DeviceRequest { |
| public: |
| ScheduleAtomRequest() {} |
| |
| protected: |
| magma::Status Process(MsdArmDevice* device) override { return device->ProcessScheduleAtoms(); } |
| }; |
| |
| class MsdArmDevice::CancelAtomsRequest : public DeviceRequest { |
| public: |
| CancelAtomsRequest(std::shared_ptr<MsdArmConnection> connection) : connection_(connection) {} |
| |
| protected: |
| magma::Status Process(MsdArmDevice* device) override |
| { |
| return device->ProcessCancelAtoms(connection_); |
| } |
| |
| std::weak_ptr<MsdArmConnection> connection_; |
| }; |
| |
| class MsdArmDevice::PerfCounterRequest : public DeviceRequest { |
| public: |
| PerfCounterRequest(uint32_t type) : type_(type) {} |
| |
| protected: |
| magma::Status Process(MsdArmDevice* device) override |
| { |
| return device->ProcessPerfCounterRequest(type_); |
| } |
| |
| uint32_t type_; |
| }; |
| |
| ////////////////////////////////////////////////////////////////////////////////////////////////// |
| |
| std::unique_ptr<MsdArmDevice> MsdArmDevice::Create(void* device_handle, bool start_device_thread) |
| { |
| auto device = std::make_unique<MsdArmDevice>(); |
| |
| if (!device->Init(device_handle)) |
| return DRETP(nullptr, "Failed to initialize MsdArmDevice"); |
| |
| if (start_device_thread) |
| device->StartDeviceThread(); |
| |
| return device; |
| } |
| |
| MsdArmDevice::MsdArmDevice() { magic_ = kMagic; } |
| |
| MsdArmDevice::~MsdArmDevice() { Destroy(); } |
| |
| void MsdArmDevice::Destroy() |
| { |
| DLOG("Destroy"); |
| CHECK_THREAD_NOT_CURRENT(device_thread_id_); |
| |
| DisableInterrupts(); |
| |
| interrupt_thread_quit_flag_ = true; |
| |
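| // Wake each interrupt thread so it can observe the quit flag and exit. |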
| if (gpu_interrupt_) |
| gpu_interrupt_->Signal(); |
| if (job_interrupt_) |
| job_interrupt_->Signal(); |
| if (mmu_interrupt_) |
| mmu_interrupt_->Signal(); |
| |
| if (gpu_interrupt_thread_.joinable()) { |
| DLOG("joining GPU interrupt thread"); |
| gpu_interrupt_thread_.join(); |
| DLOG("joined"); |
| } |
| if (job_interrupt_thread_.joinable()) { |
| DLOG("joining Job interrupt thread"); |
| job_interrupt_thread_.join(); |
| DLOG("joined"); |
| } |
| if (mmu_interrupt_thread_.joinable()) { |
| DLOG("joining MMU interrupt thread"); |
| mmu_interrupt_thread_.join(); |
| DLOG("joined"); |
| } |
| device_thread_quit_flag_ = true; |
| |
| if (device_request_semaphore_) |
| device_request_semaphore_->Signal(); |
| |
| if (device_thread_.joinable()) { |
| DLOG("joining device thread"); |
| device_thread_.join(); |
| DLOG("joined"); |
| } |
| } |
| |
| bool MsdArmDevice::Init(void* device_handle) |
| { |
| DLOG("Init"); |
| platform_device_ = magma::PlatformDevice::Create(device_handle); |
| if (!platform_device_) |
| return DRETF(false, "Failed to initialize device"); |
| |
| std::unique_ptr<magma::PlatformMmio> mmio = platform_device_->CpuMapMmio( |
| kMmioIndexRegisters, magma::PlatformMmio::CACHE_POLICY_UNCACHED_DEVICE); |
| if (!mmio) |
| return DRETF(false, "failed to map registers"); |
| |
| register_io_ = std::make_unique<magma::RegisterIo>(std::move(mmio)); |
| |
| gpu_features_.ReadFrom(register_io_.get()); |
| magma::log(magma::LOG_INFO, "ARM mali ID %x", gpu_features_.gpu_id.reg_value()); |
| |
| #if defined(MSD_ARM_ENABLE_CACHE_COHERENCY) |
| if (gpu_features_.coherency_features.ace().get()) { |
| cache_coherency_status_ = kArmMaliCacheCoherencyAce; |
| } else { |
| magma::log(magma::LOG_INFO, "Cache coherency unsupported"); |
| } |
| #endif |
| |
| reset_semaphore_ = magma::PlatformSemaphore::Create(); |
| |
| device_request_semaphore_ = magma::PlatformSemaphore::Create(); |
| device_port_ = magma::PlatformPort::Create(); |
| |
| power_manager_ = std::make_unique<PowerManager>(register_io_.get()); |
| perf_counters_ = std::make_unique<PerformanceCounters>(this); |
| scheduler_ = std::make_unique<JobScheduler>(this, 3); |
| address_manager_ = std::make_unique<AddressManager>(this, gpu_features_.address_space_count); |
| |
| bus_mapper_ = magma::PlatformBusMapper::Create(platform_device_->GetBusTransactionInitiator()); |
| if (!bus_mapper_) |
| return DRETF(false, "Failed to create bus mapper"); |
| |
| if (!InitializeInterrupts()) |
| return false; |
| |
| return InitializeHardware(); |
| } |
| |
| bool MsdArmDevice::InitializeHardware() |
| { |
| cycle_counter_refcount_ = 0; |
| DASSERT(registers::GpuStatus::Get().ReadFrom(register_io_.get()).cycle_count_active().get() == |
| 0); |
| EnableInterrupts(); |
| InitializeHardwareQuirks(&gpu_features_, register_io_.get()); |
| |
| uint64_t enabled_cores = 1; |
| #if defined(MSD_ARM_ENABLE_ALL_CORES) |
| enabled_cores = gpu_features_.shader_present; |
| #endif |
| power_manager_->EnableCores(register_io_.get(), enabled_cores); |
| |
| return true; |
| } |
| |
| std::shared_ptr<MsdArmConnection> MsdArmDevice::Open(msd_client_id_t client_id) |
| { |
| auto connection = MsdArmConnection::Create(client_id, this); |
| if (connection) { |
| std::lock_guard<std::mutex> lock(connection_list_mutex_); |
| connection_list_.push_back(connection); |
| } |
| return connection; |
| } |
| |
| void MsdArmDevice::DeregisterConnection() |
| { |
| std::lock_guard<std::mutex> lock(connection_list_mutex_); |
| connection_list_.erase(std::remove_if(connection_list_.begin(), connection_list_.end(), |
| [](auto& connection) { return connection.expired(); }), |
| connection_list_.end()); |
| } |
| |
| void MsdArmDevice::DumpStatusToLog() { EnqueueDeviceRequest(std::make_unique<DumpRequest>()); } |
| |
| void MsdArmDevice::OutputHangMessage() |
| { |
| magma::log(magma::LOG_WARNING, "Possible GPU hang\n"); |
| ProcessDumpStatusToLog(); |
| } |
| |
| int MsdArmDevice::DeviceThreadLoop() |
| { |
| magma::PlatformThreadHelper::SetCurrentThreadName("DeviceThread"); |
| |
| device_thread_id_ = std::make_unique<magma::PlatformThreadId>(); |
| CHECK_THREAD_IS_CURRENT(device_thread_id_); |
| |
| DLOG("DeviceThreadLoop starting thread 0x%lx", device_thread_id_->id()); |
| |
| std::unique_lock<std::mutex> lock(device_request_mutex_, std::defer_lock); |
| device_request_semaphore_->WaitAsync(device_port_.get()); |
| |
| while (!device_thread_quit_flag_) { |
| auto timeout_duration = scheduler_->GetCurrentTimeoutDuration(); |
| if (timeout_duration <= JobScheduler::Clock::duration::zero()) { |
| scheduler_->HandleTimedOutAtoms(); |
| continue; |
| } |
| uint64_t key; |
| magma::Status status(MAGMA_STATUS_OK); |
| if (timeout_duration < JobScheduler::Clock::duration::max()) { |
| // Add 1 to avoid rounding time down and spinning with timeouts close to 0. |
| int64_t millisecond_timeout = |
| std::chrono::duration_cast<std::chrono::milliseconds>(timeout_duration).count() + 1; |
| status = device_port_->Wait(&key, millisecond_timeout); |
| } else { |
| status = device_port_->Wait(&key); |
| } |
| if (status.ok()) { |
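| // A signal on the request semaphore means the request list has work to drain; |
| // any other port key is forwarded to the scheduler. |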
| if (key == device_request_semaphore_->id()) { |
| device_request_semaphore_->Reset(); |
| device_request_semaphore_->WaitAsync(device_port_.get()); |
| while (!device_thread_quit_flag_) { |
| lock.lock(); |
| if (device_request_list_.empty()) { |
| lock.unlock(); |
| break; |
| } |
| auto request = std::move(device_request_list_.front()); |
| device_request_list_.pop_front(); |
| lock.unlock(); |
| request->ProcessAndReply(this); |
| } |
| } else { |
| scheduler_->PlatformPortSignaled(key); |
| } |
| } |
| } |
| |
| DLOG("DeviceThreadLoop exit"); |
| return 0; |
| } |
| |
| int MsdArmDevice::GpuInterruptThreadLoop() |
| { |
| magma::PlatformThreadHelper::SetCurrentThreadName("Gpu InterruptThread"); |
| DLOG("GPU Interrupt thread started"); |
| |
| while (!interrupt_thread_quit_flag_) { |
| DLOG("GPU waiting for interrupt"); |
| gpu_interrupt_->Wait(); |
| DLOG("GPU Returned from interrupt wait!"); |
| |
| if (interrupt_thread_quit_flag_) |
| break; |
| |
| auto irq_status = registers::GpuIrqFlags::GetStatus().ReadFrom(register_io_.get()); |
| |
| if (!irq_status.reg_value()) { |
| magma::log(magma::LOG_WARNING, "Got unexpected GPU IRQ with no flags set\n"); |
| } |
| |
| auto clear_flags = registers::GpuIrqFlags::GetIrqClear().FromValue(irq_status.reg_value()); |
| // Handle GPU interrupts on this thread, rather than forwarding them to the device thread, |
| // so the device thread can block waiting for them to complete (e.g. on reset). |
| if (irq_status.reset_completed().get()) { |
| DLOG("Received GPU reset completed"); |
| reset_semaphore_->Signal(); |
| irq_status.reset_completed().set(0); |
| } |
| if (irq_status.power_changed_single().get() || irq_status.power_changed_all().get()) { |
| irq_status.power_changed_single().set(0); |
| irq_status.power_changed_all().set(0); |
| power_manager_->ReceivedPowerInterrupt(register_io_.get()); |
| if (power_manager_->l2_ready_status() && |
| (cache_coherency_status_ == kArmMaliCacheCoherencyAce)) { |
| auto enable_reg = registers::CoherencyFeatures::GetEnable().FromValue(0); |
| enable_reg.ace().set(true); |
| enable_reg.WriteTo(register_io_.get()); |
| } |
| } |
| |
| if (irq_status.performance_counter_sample_completed().get()) { |
| irq_status.performance_counter_sample_completed().set(0); |
| // Don't wait for a reply, to ensure there's no deadlock. Clearing the interrupt flag |
| // before the interrupt is actually processed shouldn't matter, because perf_counters_ |
| // ensures only one request happens at a time. |
| EnqueueDeviceRequest(std::make_unique<PerfCounterSampleCompletedRequest>(), true); |
| } |
| |
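| // Any flags still set at this point are unexpected and likely indicate a GPU fault, |
| // so log the fault address and dump device state. |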
| if (irq_status.reg_value()) { |
| magma::log(magma::LOG_WARNING, "Got unexpected GPU IRQ %d\n", irq_status.reg_value()); |
| uint64_t fault_addr = |
| registers::GpuFaultAddress::Get().ReadFrom(register_io_.get()).reg_value(); |
| { |
| std::lock_guard<std::mutex> lock(connection_list_mutex_); |
| for (auto& connection : connection_list_) { |
| auto locked = connection.lock(); |
| if (locked) { |
| uint64_t virtual_address; |
| if (locked->GetVirtualAddressFromPhysical(fault_addr, &virtual_address)) |
| magma::log(magma::LOG_WARNING, |
| "Client %lx has VA %lx mapped to PA %lx\n", |
| locked->client_id(), virtual_address, fault_addr); |
| } |
| } |
| } |
| |
| // Perform the GPU dump immediately, because clearing the irq flags might cause another |
| // GPU fault to be generated, which could overwrite the earlier data. |
| std::string dump; |
| DumpToString(dump, false); |
| magma::log(magma::LOG_INFO, "GPU fault status: %s", dump.c_str()); |
| } |
| |
| if (clear_flags.reg_value()) { |
| clear_flags.WriteTo(register_io_.get()); |
| } |
| } |
| |
| DLOG("GPU Interrupt thread exited"); |
| return 0; |
| } |
| |
| magma::Status MsdArmDevice::ProcessPerfCounterSampleCompleted() |
| { |
| DLOG("Perf Counter sample completed"); |
| |
| uint64_t duration_ms = 0; |
| std::vector<uint32_t> perf_result = perf_counters_->ReadCompleted(&duration_ms); |
| |
| magma::log(magma::LOG_INFO, "Performance counter read complete, duration %lu ms:\n", |
| duration_ms); |
| for (uint32_t i = 0; i < perf_result.size(); ++i) { |
| magma::log(magma::LOG_INFO, "Performance counter %d: %u\n", i, perf_result[i]); |
| } |
| return MAGMA_STATUS_OK; |
| } |
| |
| int MsdArmDevice::JobInterruptThreadLoop() |
| { |
| magma::PlatformThreadHelper::SetCurrentThreadName("Job InterruptThread"); |
| DLOG("Job Interrupt thread started"); |
| |
| while (!interrupt_thread_quit_flag_) { |
| DLOG("Job waiting for interrupt"); |
| job_interrupt_->Wait(); |
| DLOG("Job Returned from interrupt wait!"); |
| |
| if (interrupt_thread_quit_flag_) |
| break; |
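| // Forward the interrupt to the device thread and wait for it to be processed |
| // (ProcessJobInterrupt calls Complete()) before waiting on the interrupt again. |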
| auto request = std::make_unique<JobInterruptRequest>(); |
| auto reply = request->GetReply(); |
| EnqueueDeviceRequest(std::move(request), true); |
| reply->Wait(); |
| } |
| |
| DLOG("Job Interrupt thread exited"); |
| return 0; |
| } |
| |
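| // Returns true if the result code is one the hardware is known to report; callers map |
| // anything else to kArmMaliResultUnknownFault. |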
| static bool IsHardwareResultCode(uint32_t result) |
| { |
| switch (result) { |
| case kArmMaliResultSuccess: |
| case kArmMaliResultSoftStopped: |
| case kArmMaliResultAtomTerminated: |
| |
| case kArmMaliResultConfigFault: |
| case kArmMaliResultPowerFault: |
| case kArmMaliResultReadFault: |
| case kArmMaliResultWriteFault: |
| case kArmMaliResultAffinityFault: |
| case kArmMaliResultBusFault: |
| |
| case kArmMaliResultProgramCounterInvalidFault: |
| case kArmMaliResultEncodingInvalidFault: |
| case kArmMaliResultTypeMismatchFault: |
| case kArmMaliResultOperandFault: |
| case kArmMaliResultTlsFault: |
| case kArmMaliResultBarrierFault: |
| case kArmMaliResultAlignmentFault: |
| case kArmMaliResultDataInvalidFault: |
| case kArmMaliResultTileRangeFault: |
| case kArmMaliResultOutOfMemoryFault: |
| return true; |
| |
| default: |
| return false; |
| } |
| } |
| |
| magma::Status MsdArmDevice::ProcessJobInterrupt() |
| { |
| TRACE_DURATION("magma", "MsdArmDevice::ProcessJobInterrupt"); |
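| // Keep draining the raw status register until no flags remain, since new job |
| // completions can be raised while earlier ones are being processed. |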
| while (true) { |
| auto irq_status = registers::JobIrqFlags::GetRawStat().ReadFrom(register_io_.get()); |
| if (!irq_status.reg_value()) |
| break; |
| auto clear_flags = registers::JobIrqFlags::GetIrqClear().FromValue(irq_status.reg_value()); |
| clear_flags.WriteTo(register_io_.get()); |
| DLOG("Processing job interrupt status %x", irq_status.reg_value()); |
| |
| bool dumped_on_failure = false; |
| uint32_t failed = irq_status.failed_slots().get(); |
| while (failed) { |
| uint32_t slot = __builtin_ffs(failed) - 1; |
| registers::JobSlotRegisters regs(slot); |
| uint32_t raw_result = regs.Status().ReadFrom(register_io_.get()).reg_value(); |
| uint32_t result = |
| IsHardwareResultCode(raw_result) ? raw_result : kArmMaliResultUnknownFault; |
| |
| // Soft stopping isn't counted as an actual failure. |
| if (result != kArmMaliResultSoftStopped && !dumped_on_failure) { |
| magma::log(magma::LOG_WARNING, "Got failed slot bitmask %x with result code %x\n", |
| irq_status.failed_slots().get(), raw_result); |
| ProcessDumpStatusToLog(); |
| dumped_on_failure = true; |
| } |
| |
| uint64_t job_tail = regs.Tail().ReadFrom(register_io_.get()).reg_value(); |
| |
| scheduler_->JobCompleted(slot, static_cast<ArmMaliResultCode>(result), job_tail); |
| failed &= ~(1 << slot); |
| } |
| |
| uint32_t finished = irq_status.finished_slots().get(); |
| while (finished) { |
| uint32_t slot = __builtin_ffs(finished) - 1; |
| scheduler_->JobCompleted(slot, kArmMaliResultSuccess, 0u); |
| finished &= ~(1 << slot); |
| } |
| } |
| job_interrupt_->Complete(); |
| return MAGMA_STATUS_OK; |
| } |
| |
| magma::Status MsdArmDevice::ProcessMmuInterrupt() |
| { |
| auto irq_status = registers::MmuIrqFlags::GetStatus().ReadFrom(register_io_.get()); |
| DLOG("Received MMU IRQ status 0x%x\n", irq_status.reg_value()); |
| |
| uint32_t faulted_slots = irq_status.pf_flags().get() | irq_status.bf_flags().get(); |
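| // Handle page faults and bus faults one address-space slot at a time. |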
| while (faulted_slots) { |
| uint32_t slot = __builtin_ffs(faulted_slots) - 1; |
| |
| // Clear all flags before attempting to page in memory; otherwise, if the atom |
| // continues executing, the next interrupt may be lost. |
| auto clear_flags = registers::MmuIrqFlags::GetIrqClear().FromValue(0); |
| clear_flags.pf_flags().set(1 << slot); |
| clear_flags.bf_flags().set(1 << slot); |
| clear_flags.WriteTo(register_io_.get()); |
| |
| std::shared_ptr<MsdArmConnection> connection; |
| { |
| auto mapping = address_manager_->GetMappingForSlot(slot); |
| if (!mapping) { |
| magma::log(magma::LOG_WARNING, "Fault on idle slot %d\n", slot); |
| } else { |
| connection = mapping->connection(); |
| } |
| } |
| if (connection) { |
| uint64_t address = registers::AsRegisters(slot) |
| .FaultAddress() |
| .ReadFrom(register_io_.get()) |
| .reg_value(); |
| bool kill_context = true; |
| if (irq_status.bf_flags().get() & (1 << slot)) { |
| magma::log(magma::LOG_WARNING, "Bus fault at address 0x%lx on slot %d\n", address, |
| slot); |
| } else { |
| if (connection->PageInMemory(address)) { |
| DLOG("Paged in address %lx\n", address); |
| kill_context = false; |
| } else { |
| magma::log(magma::LOG_WARNING, "Failed to page in address 0x%lx on slot %d\n", |
| address, slot); |
| } |
| } |
| if (kill_context) { |
| ProcessDumpStatusToLog(); |
| |
| connection->set_address_space_lost(); |
| scheduler_->ReleaseMappingsForConnection(connection); |
| // This will invalidate the address slot, causing the job to die |
| // with a fault. |
| address_manager_->ReleaseSpaceMappings(connection->const_address_space()); |
| } |
| } |
| faulted_slots &= ~(1 << slot); |
| } |
| |
| mmu_interrupt_->Complete(); |
| return MAGMA_STATUS_OK; |
| } |
| |
| int MsdArmDevice::MmuInterruptThreadLoop() |
| { |
| magma::PlatformThreadHelper::SetCurrentThreadName("MMU InterruptThread"); |
| DLOG("MMU Interrupt thread started"); |
| |
| while (!interrupt_thread_quit_flag_) { |
| DLOG("MMU waiting for interrupt"); |
| mmu_interrupt_->Wait(); |
| DLOG("MMU Returned from interrupt wait!"); |
| |
| if (interrupt_thread_quit_flag_) |
| break; |
| auto request = std::make_unique<MmuInterruptRequest>(); |
| auto reply = request->GetReply(); |
| EnqueueDeviceRequest(std::move(request), true); |
| reply->Wait(); |
| } |
| |
| DLOG("MMU Interrupt thread exited"); |
| return 0; |
| } |
| |
| void MsdArmDevice::StartDeviceThread() |
| { |
| DASSERT(!device_thread_.joinable()); |
| device_thread_ = std::thread([this] { this->DeviceThreadLoop(); }); |
| |
| gpu_interrupt_thread_ = std::thread([this] { this->GpuInterruptThreadLoop(); }); |
| job_interrupt_thread_ = std::thread([this] { this->JobInterruptThreadLoop(); }); |
| mmu_interrupt_thread_ = std::thread([this] { this->MmuInterruptThreadLoop(); }); |
| } |
| |
| bool MsdArmDevice::InitializeInterrupts() |
| { |
| // The reset-completed flag may already be set when the device is initialized. |
| // Clear it so we don't get a spurious interrupt. |
| auto clear_flags = registers::GpuIrqFlags::GetIrqClear().FromValue(0xffffffff); |
| clear_flags.WriteTo(register_io_.get()); |
| |
| gpu_interrupt_ = platform_device_->RegisterInterrupt(kInterruptIndexGpu); |
| if (!gpu_interrupt_) |
| return DRETF(false, "failed to register GPU interrupt"); |
| |
| job_interrupt_ = platform_device_->RegisterInterrupt(kInterruptIndexJob); |
| if (!job_interrupt_) |
| return DRETF(false, "failed to register JOB interrupt"); |
| |
| mmu_interrupt_ = platform_device_->RegisterInterrupt(kInterruptIndexMmu); |
| if (!mmu_interrupt_) |
| return DRETF(false, "failed to register MMU interrupt"); |
| |
| return true; |
| } |
| |
| void MsdArmDevice::EnableInterrupts() |
| { |
| auto gpu_flags = registers::GpuIrqFlags::GetIrqMask().FromValue(0xffffffff); |
| gpu_flags.WriteTo(register_io_.get()); |
| |
| auto mmu_flags = registers::MmuIrqFlags::GetIrqMask().FromValue(0xffffffff); |
| mmu_flags.WriteTo(register_io_.get()); |
| |
| auto job_flags = registers::JobIrqFlags::GetIrqMask().FromValue(0xffffffff); |
| job_flags.WriteTo(register_io_.get()); |
| } |
| |
| void MsdArmDevice::DisableInterrupts() |
| { |
| if (!register_io_) |
| return; |
| auto gpu_flags = registers::GpuIrqFlags::GetIrqMask().FromValue(0); |
| gpu_flags.WriteTo(register_io_.get()); |
| |
| auto mmu_flags = registers::MmuIrqFlags::GetIrqMask().FromValue(0); |
| mmu_flags.WriteTo(register_io_.get()); |
| |
| auto job_flags = registers::JobIrqFlags::GetIrqMask().FromValue(0); |
| job_flags.WriteTo(register_io_.get()); |
| } |
| |
| void MsdArmDevice::EnqueueDeviceRequest(std::unique_ptr<DeviceRequest> request, bool enqueue_front) |
| { |
| std::unique_lock<std::mutex> lock(device_request_mutex_); |
| if (enqueue_front) { |
| device_request_list_.emplace_front(std::move(request)); |
| } else { |
| device_request_list_.emplace_back(std::move(request)); |
| } |
| device_request_semaphore_->Signal(); |
| } |
| |
| void MsdArmDevice::ScheduleAtom(std::shared_ptr<MsdArmAtom> atom) |
| { |
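| // Only enqueue a schedule request when the list transitions from empty; |
| // ProcessScheduleAtoms drains the entire list in one pass. |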
| bool need_schedule; |
| { |
| std::lock_guard<std::mutex> lock(schedule_mutex_); |
| need_schedule = atoms_to_schedule_.empty(); |
| atoms_to_schedule_.push_back(std::move(atom)); |
| } |
| if (need_schedule) |
| EnqueueDeviceRequest(std::make_unique<ScheduleAtomRequest>()); |
| } |
| |
| void MsdArmDevice::CancelAtoms(std::shared_ptr<MsdArmConnection> connection) |
| { |
| EnqueueDeviceRequest(std::make_unique<CancelAtomsRequest>(connection)); |
| } |
| |
| magma::PlatformPort* MsdArmDevice::GetPlatformPort() { return device_port_.get(); } |
| |
| void MsdArmDevice::UpdateGpuActive(bool active) { power_manager_->UpdateGpuActive(active); } |
| |
| void MsdArmDevice::DumpRegisters(const GpuFeatures& features, magma::RegisterIo* io, |
| DumpState* dump_state) |
| { |
| static struct { |
| const char* name; |
| registers::CoreReadyState::CoreType type; |
| } core_types[] = {{"L2 Cache", registers::CoreReadyState::CoreType::kL2}, |
| {"Shader", registers::CoreReadyState::CoreType::kShader}, |
| {"Tiler", registers::CoreReadyState::CoreType::kTiler}}; |
| |
| static struct { |
| const char* name; |
| registers::CoreReadyState::StatusType type; |
| } status_types[] = { |
| {"Present", registers::CoreReadyState::StatusType::kPresent}, |
| {"Ready", registers::CoreReadyState::StatusType::kReady}, |
| {"Transitioning", registers::CoreReadyState::StatusType::kPowerTransitioning}, |
| {"Power active", registers::CoreReadyState::StatusType::kPowerActive}}; |
| for (size_t i = 0; i < fbl::count_of(core_types); i++) { |
| for (size_t j = 0; j < fbl::count_of(status_types); j++) { |
| uint64_t bitmask = registers::CoreReadyState::ReadBitmask(io, core_types[i].type, |
| status_types[j].type); |
| dump_state->power_states.push_back({core_types[i].name, status_types[j].name, bitmask}); |
| } |
| } |
| |
| dump_state->gpu_fault_status = registers::GpuFaultStatus::Get().ReadFrom(io).reg_value(); |
| dump_state->gpu_fault_address = registers::GpuFaultAddress::Get().ReadFrom(io).reg_value(); |
| dump_state->gpu_status = registers::GpuStatus::Get().ReadFrom(io).reg_value(); |
| dump_state->cycle_count = registers::CycleCount::Get().ReadFrom(io).reg_value(); |
| dump_state->timestamp = registers::Timestamp::Get().ReadFrom(io).reg_value(); |
| |
| for (size_t i = 0; i < features.job_slot_count; i++) { |
| DumpState::JobSlotStatus status; |
| auto js_regs = registers::JobSlotRegisters(i); |
| status.status = js_regs.Status().ReadFrom(io).reg_value(); |
| status.head = js_regs.Head().ReadFrom(io).reg_value(); |
| status.tail = js_regs.Tail().ReadFrom(io).reg_value(); |
| status.config = js_regs.Config().ReadFrom(io).reg_value(); |
| dump_state->job_slot_status.push_back(status); |
| } |
| |
| for (size_t i = 0; i < features.address_space_count; i++) { |
| DumpState::AddressSpaceStatus status; |
| auto as_regs = registers::AsRegisters(i); |
| status.status = as_regs.Status().ReadFrom(io).reg_value(); |
| status.fault_status = as_regs.FaultStatus().ReadFrom(io).reg_value(); |
| status.fault_address = as_regs.FaultAddress().ReadFrom(io).reg_value(); |
| dump_state->address_space_status.push_back(status); |
| } |
| } |
| |
| void MsdArmDevice::Dump(DumpState* dump_state, bool on_device_thread) |
| { |
| DumpRegisters(gpu_features_, register_io_.get(), dump_state); |
| |
| if (on_device_thread) { |
| std::chrono::steady_clock::duration total_time; |
| std::chrono::steady_clock::duration active_time; |
| power_manager_->GetGpuActiveInfo(&total_time, &active_time); |
| dump_state->total_time_ms = |
| std::chrono::duration_cast<std::chrono::milliseconds>(total_time).count(); |
| dump_state->active_time_ms = |
| std::chrono::duration_cast<std::chrono::milliseconds>(active_time).count(); |
| } |
| } |
| |
| void MsdArmDevice::DumpToString(std::string& dump_string, bool on_device_thread) |
| { |
| DumpState dump_state = {}; |
| Dump(&dump_state, on_device_thread); |
| |
| FormatDump(dump_state, dump_string); |
| } |
| |
| void MsdArmDevice::FormatDump(DumpState& dump_state, std::string& dump_string) |
| { |
| dump_string.append("Core power states\n"); |
| for (auto& state : dump_state.power_states) { |
| dump_string += fbl::StringPrintf("Core type %s state %s bitmap: 0x%lx\n", state.core_type, |
| state.status_type, state.bitmask) |
| .c_str(); |
| } |
| dump_string += fbl::StringPrintf("Total ms %" PRIu64 " Active ms %" PRIu64 "\n", |
| dump_state.total_time_ms, dump_state.active_time_ms) |
| .c_str(); |
| dump_string += fbl::StringPrintf("Gpu fault status 0x%x, address 0x%lx\n", |
| dump_state.gpu_fault_status, dump_state.gpu_fault_address) |
| .c_str(); |
| dump_string += fbl::StringPrintf("Gpu status 0x%x\n", dump_state.gpu_status).c_str(); |
| dump_string += fbl::StringPrintf("Gpu cycle count %ld, timestamp %ld\n", dump_state.cycle_count, |
| dump_state.timestamp) |
| .c_str(); |
| for (size_t i = 0; i < dump_state.job_slot_status.size(); i++) { |
| auto* status = &dump_state.job_slot_status[i]; |
| dump_string += |
| fbl::StringPrintf("Job slot %zu status 0x%x head 0x%lx tail 0x%lx config 0x%x\n", i, |
| status->status, status->head, status->tail, status->config) |
| .c_str(); |
| } |
| for (size_t i = 0; i < dump_state.address_space_status.size(); i++) { |
| auto* status = &dump_state.address_space_status[i]; |
| dump_string += |
| fbl::StringPrintf("AS %zu status 0x%x fault status 0x%x fault address 0x%lx\n", i, |
| status->status, status->fault_status, status->fault_address) |
| .c_str(); |
| } |
| } |
| |
| magma::Status MsdArmDevice::ProcessDumpStatusToLog() |
| { |
| std::string dump; |
| DumpToString(dump, true); |
| magma::log(magma::LOG_INFO, "%s", dump.c_str()); |
| return MAGMA_STATUS_OK; |
| } |
| |
| magma::Status MsdArmDevice::ProcessScheduleAtoms() |
| { |
| TRACE_DURATION("magma", "MsdArmDevice::ProcessScheduleAtoms"); |
| std::vector<std::shared_ptr<MsdArmAtom>> atoms_to_schedule; |
| { |
| std::lock_guard<std::mutex> lock(schedule_mutex_); |
| atoms_to_schedule.swap(atoms_to_schedule_); |
| } |
| for (auto& atom : atoms_to_schedule) |
| scheduler_->EnqueueAtom(std::move(atom)); |
| scheduler_->TryToSchedule(); |
| return MAGMA_STATUS_OK; |
| } |
| |
| magma::Status MsdArmDevice::ProcessCancelAtoms(std::weak_ptr<MsdArmConnection> connection) |
| { |
| // It's fine to cancel with an invalid shared_ptr, as that will clear out |
| // atoms for connections that are dead already. |
| scheduler_->CancelAtomsForConnection(connection.lock()); |
| return MAGMA_STATUS_OK; |
| } |
| |
| void MsdArmDevice::ExecuteAtomOnDevice(MsdArmAtom* atom, magma::RegisterIo* register_io) |
| { |
| TRACE_DURATION("magma", "ExecuteAtomOnDevice", "address", atom->gpu_address(), "slot", |
| atom->slot()); |
| DASSERT(atom->slot() < 2u); |
| bool dependencies_finished; |
| atom->UpdateDependencies(&dependencies_finished); |
| DASSERT(dependencies_finished); |
| DASSERT(atom->gpu_address()); |
| |
| // Skip atom if address space can't be assigned. |
| if (!address_manager_->AssignAddressSpace(atom)) { |
| scheduler_->JobCompleted(atom->slot(), kArmMaliResultAtomTerminated, 0u); |
| return; |
| } |
| if (atom->require_cycle_counter()) { |
| DASSERT(!atom->using_cycle_counter()); |
| atom->set_using_cycle_counter(true); |
| |
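| // Start the cycle counter when the first atom that needs it begins executing; |
| // AtomCompleted stops it again when the refcount drops to zero. |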
| if (++cycle_counter_refcount_ == 1) { |
| register_io_->Write32(registers::GpuCommand::kOffset, |
| registers::GpuCommand::kCmdCycleCountStart); |
| } |
| } |
| |
| if (atom->is_protected()) { |
| DASSERT(IsInProtectedMode()); |
| } else { |
| DASSERT(!IsInProtectedMode()); |
| } |
| |
| // Ensure the client's writes/cache flushes to the job chain are complete |
| // before scheduling. Unlikely to be an issue in practice, since several thread |
| // and process hops have already happened. |
| magma::barriers::WriteBarrier(); |
| |
| registers::JobSlotRegisters slot(atom->slot()); |
| slot.HeadNext().FromValue(atom->gpu_address()).WriteTo(register_io); |
| auto config = slot.ConfigNext().FromValue(0); |
| config.address_space().set(atom->address_slot_mapping()->slot_number()); |
| config.start_flush_clean().set(true); |
| config.start_flush_invalidate().set(true); |
| // TODO(MA-367): Enable flush reduction optimization. |
| config.thread_priority().set(8); |
| config.end_flush_clean().set(true); |
| config.end_flush_invalidate().set(true); |
| // Atoms are in unprotected memory, so don't attempt to write to them when |
| // executing in protected mode. |
| bool disable_descriptor_write_back = atom->is_protected(); |
| #if defined(ENABLE_PROTECTED_DEBUG_SWAP_MODE) |
| // In this case, nonprotected-mode atoms also need to abide by protected mode restrictions. |
| disable_descriptor_write_back = true; |
| #endif |
| config.disable_descriptor_write_back().set(disable_descriptor_write_back); |
| config.WriteTo(register_io); |
| |
| // Execute on every powered-on core. |
| slot.AffinityNext().FromValue(UINT64_MAX).WriteTo(register_io); |
| slot.CommandNext().FromValue(registers::JobSlotCommand::kCommandStart).WriteTo(register_io); |
| } |
| |
| void MsdArmDevice::RunAtom(MsdArmAtom* atom) { ExecuteAtomOnDevice(atom, register_io_.get()); } |
| |
| void MsdArmDevice::AtomCompleted(MsdArmAtom* atom, ArmMaliResultCode result) |
| { |
| TRACE_DURATION("magma", "AtomCompleted", "address", atom->gpu_address()); |
| DLOG("Completed job atom: 0x%lx\n", atom->gpu_address()); |
| address_manager_->AtomFinished(atom); |
| if (atom->using_cycle_counter()) { |
| DASSERT(atom->require_cycle_counter()); |
| |
| if (--cycle_counter_refcount_ == 0) { |
| register_io_->Write32(registers::GpuCommand::kOffset, |
| registers::GpuCommand::kCmdCycleCountStop); |
| } |
| atom->set_using_cycle_counter(false); |
| } |
| // Soft stopped atoms will be retried, so this result shouldn't be reported. |
| if (result != kArmMaliResultSoftStopped) { |
| atom->set_result_code(result); |
| auto connection = atom->connection().lock(); |
| // Ensure the client's subsequent reads/writes of memory are ordered after the MMIO |
| // access that reported the job as complete. In practice this is unlikely to be an issue |
| // due to data dependencies and the thread/process hops. |
| magma::barriers::Barrier(); |
| if (connection) |
| connection->SendNotificationData(atom, result); |
| } |
| } |
| |
| void MsdArmDevice::HardStopAtom(MsdArmAtom* atom) |
| { |
| DASSERT(atom->hard_stopped()); |
| registers::JobSlotRegisters slot(atom->slot()); |
| DLOG("Hard stopping atom slot %d\n", atom->slot()); |
| slot.Command() |
| .FromValue(registers::JobSlotCommand::kCommandHardStop) |
| .WriteTo(register_io_.get()); |
| } |
| |
| void MsdArmDevice::SoftStopAtom(MsdArmAtom* atom) |
| { |
| registers::JobSlotRegisters slot(atom->slot()); |
| DLOG("Soft stopping atom slot %d\n", atom->slot()); |
| slot.Command() |
| .FromValue(registers::JobSlotCommand::kCommandSoftStop) |
| .WriteTo(register_io_.get()); |
| } |
| |
| void MsdArmDevice::ReleaseMappingsForAtom(MsdArmAtom* atom) |
| { |
| // The atom should be hung on a fault, so it won't reference memory |
| // afterwards. |
| address_manager_->AtomFinished(atom); |
| } |
| |
| magma_status_t MsdArmDevice::QueryInfo(uint64_t id, uint64_t* value_out) |
| { |
| switch (id) { |
| case MAGMA_QUERY_DEVICE_ID: |
| *value_out = gpu_features_.gpu_id.reg_value(); |
| return MAGMA_STATUS_OK; |
| |
| case kMsdArmVendorQueryL2Present: |
| *value_out = gpu_features_.l2_present; |
| return MAGMA_STATUS_OK; |
| |
| case kMsdArmVendorQueryMaxThreads: |
| *value_out = gpu_features_.thread_max_threads; |
| return MAGMA_STATUS_OK; |
| |
| case kMsdArmVendorQueryThreadMaxBarrierSize: |
| *value_out = gpu_features_.thread_max_barrier_size; |
| return MAGMA_STATUS_OK; |
| |
| case kMsdArmVendorQueryThreadMaxWorkgroupSize: |
| *value_out = gpu_features_.thread_max_workgroup_size; |
| return MAGMA_STATUS_OK; |
| |
| case kMsdArmVendorQueryThreadTlsAlloc: |
| *value_out = gpu_features_.thread_tls_alloc; |
| return MAGMA_STATUS_OK; |
| |
| case kMsdArmVendorQueryShaderPresent: |
| *value_out = gpu_features_.shader_present; |
| return MAGMA_STATUS_OK; |
| |
| case kMsdArmVendorQueryTilerFeatures: |
| *value_out = gpu_features_.tiler_features.reg_value(); |
| return MAGMA_STATUS_OK; |
| |
| case kMsdArmVendorQueryThreadFeatures: |
| *value_out = gpu_features_.thread_features.reg_value(); |
| return MAGMA_STATUS_OK; |
| |
| case kMsdArmVendorQueryL2Features: |
| *value_out = gpu_features_.l2_features.reg_value(); |
| return MAGMA_STATUS_OK; |
| |
| case kMsdArmVendorQueryMemoryFeatures: |
| *value_out = gpu_features_.mem_features.reg_value(); |
| return MAGMA_STATUS_OK; |
| |
| case kMsdArmVendorQueryMmuFeatures: |
| *value_out = gpu_features_.mmu_features.reg_value(); |
| return MAGMA_STATUS_OK; |
| |
| case kMsdArmVendorQueryCoherencyEnabled: |
| *value_out = cache_coherency_status_; |
| return MAGMA_STATUS_OK; |
| |
| case kMsdArmVendorQuerySupportsProtectedMode: |
| *value_out = IsProtectedModeSupported(); |
| return MAGMA_STATUS_OK; |
| |
| default: |
| return DRET_MSG(MAGMA_STATUS_INVALID_ARGS, "unhandled id %" PRIu64, id); |
| } |
| } |
| |
| // static |
| void MsdArmDevice::InitializeHardwareQuirks(GpuFeatures* features, magma::RegisterIo* reg) |
| { |
| auto shader_config = registers::ShaderConfig::Get().FromValue(0); |
| const uint32_t kGpuIdTGOX = 0x7212; |
| uint32_t gpu_product_id = features->gpu_id.product_id().get(); |
| if (gpu_product_id == kGpuIdTGOX) { |
| DLOG("Enabling TLS hashing\n"); |
| shader_config.tls_hashing_enable().set(1); |
| } |
| |
| if (0x750 <= gpu_product_id && gpu_product_id <= 0x880) { |
| DLOG("Enabling LS attr types\n"); |
| // This seems necessary for geometry shaders to work with non-indexed draws with point and |
| // line lists on T8xx and T7xx. |
| shader_config.ls_allow_attr_types().set(1); |
| } |
| |
| shader_config.WriteTo(reg); |
| } |
| |
| bool MsdArmDevice::IsProtectedModeSupported() |
| { |
| uint32_t gpu_product_id = gpu_features_.gpu_id.product_id().get(); |
| // TODO(MA-522): Support protected mode when using ACE cache coherency. Apparently |
| // the L2 needs to be powered down then switched to ACE Lite in that mode. |
| if (cache_coherency_status_ == kArmMaliCacheCoherencyAce) |
| return false; |
| // All Bifrost GPUs should support it. 0x6956 is a Mali-T60x MP4 r0p0, which doesn't. |
| return gpu_product_id != 0x6956 && (gpu_product_id > 0x1000); |
| } |
| |
| void MsdArmDevice::EnterProtectedMode() |
| { |
| // TODO(MA-522): If cache-coherency is enabled, power down L2 and wait for the |
| // completion of that. |
| register_io_->Write32(registers::GpuCommand::kOffset, |
| registers::GpuCommand::kCmdSetProtectedMode); |
| } |
| |
| bool MsdArmDevice::ExitProtectedMode() |
| { |
| // Remove perf counter address mapping. |
| perf_counters_->ForceDisable(); |
| // |force_expire| is false because nothing should have been using an address |
| // space before. Do this before powering down L2 so connections don't try to |
| // hit the MMU while that's happening. |
| address_manager_->ClearAddressMappings(false); |
| |
| if (!PowerDownL2()) { |
| return DRETF(false, "Powering down L2 timed out\n"); |
| } |
| |
| return ResetDevice(); |
| } |
| |
| bool MsdArmDevice::ResetDevice() |
| { |
| DLOG("Resetting device protected mode\n"); |
| // Reset semaphore shouldn't already be signaled. |
| DASSERT(!reset_semaphore_->Wait(0)); |
| |
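| // Issue a soft reset; the GPU interrupt thread signals reset_semaphore_ when the |
| // reset-completed interrupt arrives. |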
| register_io_->Write32(registers::GpuCommand::kOffset, registers::GpuCommand::kCmdSoftReset); |
| |
| if (!reset_semaphore_->Wait(1000)) { |
| magma::log(magma::LOG_WARNING, "Hardware reset timed out"); |
| return false; |
| } |
| |
| if (!InitializeHardware()) { |
| magma::log(magma::LOG_WARNING, "Initialize hardware failed"); |
| return false; |
| } |
| |
| if (!power_manager_->WaitForShaderReady(register_io_.get())) { |
| magma::log(magma::LOG_WARNING, "Waiting for shader ready failed"); |
| return false; |
| } |
| |
| return true; |
| } |
| |
| bool MsdArmDevice::PowerDownL2() |
| { |
| power_manager_->DisableL2(register_io_.get()); |
| return power_manager_->WaitForL2Disable(register_io_.get()); |
| } |
| |
| bool MsdArmDevice::IsInProtectedMode() |
| { |
| return registers::GpuStatus::Get().ReadFrom(register_io_.get()).protected_mode_active().get(); |
| } |
| |
| void MsdArmDevice::RequestPerfCounterOperation(uint32_t type) |
| { |
| EnqueueDeviceRequest(std::make_unique<PerfCounterRequest>(type)); |
| } |
| |
| magma::Status MsdArmDevice::ProcessPerfCounterRequest(uint32_t type) |
| { |
| if (type == (MAGMA_DUMP_TYPE_PERF_COUNTER_ENABLE | MAGMA_DUMP_TYPE_PERF_COUNTERS)) { |
| if (!perf_counters_->TriggerRead(true)) |
| return MAGMA_STATUS_INVALID_ARGS; |
| } else if (type == MAGMA_DUMP_TYPE_PERF_COUNTERS) { |
| if (!perf_counters_->TriggerRead(false)) |
| return MAGMA_STATUS_INVALID_ARGS; |
| } else if (type == MAGMA_DUMP_TYPE_PERF_COUNTER_ENABLE) { |
| if (!perf_counters_->Enable()) |
| return MAGMA_STATUS_INVALID_ARGS; |
| } else { |
| DASSERT(false); |
| return MAGMA_STATUS_INVALID_ARGS; |
| } |
| return MAGMA_STATUS_OK; |
| } |
| |
| ////////////////////////////////////////////////////////////////////////////////////////////////// |
| |
| msd_connection_t* msd_device_open(msd_device_t* dev, msd_client_id_t client_id) |
| { |
| auto connection = MsdArmDevice::cast(dev)->Open(client_id); |
| if (!connection) |
| return DRETP(nullptr, "MsdArmDevice::Open failed"); |
| return new MsdArmAbiConnection(std::move(connection)); |
| } |
| |
| void msd_device_destroy(msd_device_t* dev) { delete MsdArmDevice::cast(dev); } |
| |
| magma_status_t msd_device_query(msd_device_t* device, uint64_t id, uint64_t* value_out) |
| { |
| return MsdArmDevice::cast(device)->QueryInfo(id, value_out); |
| } |
| |
| void msd_device_dump_status(msd_device_t* device, uint32_t dump_type) |
| { |
| uint32_t perf_dump_type = |
| dump_type & (MAGMA_DUMP_TYPE_PERF_COUNTER_ENABLE | MAGMA_DUMP_TYPE_PERF_COUNTERS); |
| if (perf_dump_type) { |
| MsdArmDevice::cast(device)->RequestPerfCounterOperation(perf_dump_type); |
| } |
| if (!dump_type || (dump_type & MAGMA_DUMP_TYPE_NORMAL)) { |
| MsdArmDevice::cast(device)->DumpStatusToLog(); |
| } |
| } |