// Copyright 2017 The Fuchsia Authors
//
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file or at
// https://opensource.org/licenses/MIT

#include <align.h>
#include <debug.h>
#include <lib/boot-options/boot-options.h>
#include <lib/debuglog.h>
#include <lib/fit/defer.h>
#include <lib/instrumentation/asan.h>
#include <lib/power-management/energy-model.h>
#include <lib/power-management/kernel-registry.h>
#include <lib/power-management/port-power-level-controller.h>
#include <lib/relaxed_atomic.h>
#include <lib/syscalls/forward.h>
#include <lib/zbi-format/kernel.h>
#include <lib/zbi-format/zbi.h>
#include <lib/zircon-internal/macros.h>
#include <mexec.h>
#include <platform.h>
#include <string.h>
#include <sys/types.h>
#include <trace.h>
#include <zircon/boot/crash-reason.h>
#include <zircon/compiler.h>
#include <zircon/errors.h>
#include <zircon/rights.h>
#include <zircon/status.h>
#include <zircon/syscalls-next.h>
#include <zircon/syscalls/resource.h>
#include <zircon/syscalls/system.h>
#include <zircon/time.h>
#include <zircon/types.h>

#include <cstddef>
#include <cstdint>
#include <cstdio>

#include <arch/arch_ops.h>
#include <arch/mp.h>
#include <arch/ops.h>
#include <dev/hw_watchdog.h>
#include <dev/interrupt.h>
#include <fbl/alloc_checker.h>
#include <fbl/ref_ptr.h>
#include <kernel/cpu.h>
#include <kernel/idle_power_thread.h>
#include <kernel/mp.h>
#include <kernel/mutex.h>
#include <kernel/percpu.h>
#include <kernel/range_check.h>
#include <kernel/scheduler.h>
#include <kernel/thread.h>
#include <ktl/byte.h>
#include <ktl/span.h>
#include <ktl/unique_ptr.h>
#include <object/event_dispatcher.h>
#include <object/job_dispatcher.h>
#include <object/port_dispatcher.h>
#include <object/process_dispatcher.h>
#include <object/resource.h>
#include <object/user_handles.h>
#include <object/vm_object_dispatcher.h>
#include <platform/halt_helper.h>
#include <platform/halt_token.h>
#include <platform/timer.h>
#include <vm/physmap.h>
#include <vm/pmm.h>
#include <vm/vm.h>
#include <vm/vm_aspace.h>

#include "system_priv.h"

#include <ktl/enforce.h>

#define LOCAL_TRACE 0

// Allocate this many extra bytes at the end of the bootdata for the platform
// to fill in with platform specific boot structures.
const size_t kBootdataPlatformExtraBytes = PAGE_SIZE * 4;

__BEGIN_CDECLS
extern void mexec_asm(void);
extern void mexec_asm_end(void);
__END_CDECLS

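// Allocates pages that are identity mapped (same virtual and physical
// address) in a dedicated low-kernel address space. The mexec trampoline and
// its argument list are staged in such pages so that they remain reachable
// after Activate() switches to the 1:1 aspace.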
class IdentityPageAllocator {
 public:
  explicit IdentityPageAllocator(uintptr_t alloc_start) : alloc_start_(alloc_start) {
    allocated_ = LIST_INITIAL_VALUE(allocated_);
  }
  ~IdentityPageAllocator() { pmm_free(&allocated_); }

  // Allocates a page of memory that has the same physical and virtual
  // addresses.
  zx_status_t Allocate(void** result);

  // Activate the 1:1 address space.
  void Activate();

 private:
  zx_status_t InitializeAspace();
  fbl::RefPtr<VmAspace> aspace_ = nullptr;
  size_t mapping_id_ = 0;
  // Minimum physical/virtual address for all allocations.
  uintptr_t alloc_start_;
  list_node allocated_;
};

zx_status_t IdentityPageAllocator::InitializeAspace() {
  // The aspace has already been initialized; nothing to do.
  if (aspace_) {
    return ZX_OK;
  }

  aspace_ = VmAspace::Create(VmAspace::Type::LowKernel, "identity");
  if (!aspace_) {
    return ZX_ERR_INTERNAL;
  }

  return ZX_OK;
}

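// Allocates |count| pages, each with a physical address at or above
// |lower_bound|, and records their addresses in |paddrs| (note: the array is
// filled back-to-front). Gives up with ZX_ERR_NO_RESOURCES once the search
// passes |limit|. The request either succeeds in full or frees any partially
// allocated pages before returning.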
zx_status_t alloc_pages_greater_than(paddr_t lower_bound, size_t count, size_t limit,
                                     paddr_t* paddrs) {
  struct list_node list = LIST_INITIAL_VALUE(list);

  // We don't support partially completed requests. This function will either
  // allocate |count| pages or no pages at all. If we are unable to fulfill the
  // complete request, we'll clean up any pages that we may have allocated
  // along the way.
  auto pmm_cleanup = fit::defer([&list]() { pmm_free(&list); });

  while (count) {
    // TODO: replace with pmm routine that can allocate while excluding a range.
    size_t actual = 0;
    list_node alloc_list = LIST_INITIAL_VALUE(alloc_list);
    zx_status_t status = pmm_alloc_range(lower_bound, count, &alloc_list);
    if (status == ZX_OK) {
      actual = count;
      if (list_is_empty(&list)) {
        list_move(&alloc_list, &list);
      } else {
        list_splice_after(&alloc_list, list_peek_tail(&list));
      }
    }

    for (size_t i = 0; i < actual; i++) {
      paddrs[count - (i + 1)] = lower_bound + PAGE_SIZE * i;
    }

    count -= actual;
    lower_bound += PAGE_SIZE * (actual + 1);

    // If we're past the limit and still trying to allocate, just give up.
    if (lower_bound >= limit) {
      return ZX_ERR_NO_RESOURCES;
    }
  }

  // Mark all of the pages we allocated as WIRED.
  vm_page_t* p;
  list_for_every_entry (&list, p, vm_page_t, queue_node) {
    p->set_state(vm_page_state::WIRED);
  }

  // Make sure we don't free the pages we just allocated.
  pmm_cleanup.cancel();

  return ZX_OK;
}

zx_status_t IdentityPageAllocator::Allocate(void** result) {
  zx_status_t st;

  // Start by obtaining an unused physical page. This address will eventually
  // be the physical/virtual address of our identity-mapped page.
  // TODO: when https://fxbug.dev/42105842 is completed, we should allocate low memory directly
  // from the pmm rather than using "alloc_pages_greater_than", which is
  // somewhat of a hack.
  paddr_t pa;
  DEBUG_ASSERT(alloc_start_ < 4 * GB);
  st = alloc_pages_greater_than(alloc_start_, 1, 4 * GB - alloc_start_, &pa);
  if (st != ZX_OK) {
    LTRACEF("mexec: failed to allocate page in low memory\n");
    return st;
  }

  // Add this page to the list of allocated pages so that it gets freed when
  // the object is destroyed.
  vm_page_t* page = paddr_to_vm_page(pa);
  DEBUG_ASSERT(page);
  list_add_tail(&allocated_, &page->queue_node);

  // The kernel address space may be in high memory, which cannot be identity
  // mapped, since all kernel virtual addresses might be out of range of the
  // physical address space. For this reason, we need to make a new address
  // space.
  st = InitializeAspace();
  if (st != ZX_OK) {
    return st;
  }

  // Create a new allocation in the new address space that identity maps the
  // target page.
  constexpr uint kPermissionFlagsRWX =
      (ARCH_MMU_FLAG_PERM_READ | ARCH_MMU_FLAG_PERM_WRITE | ARCH_MMU_FLAG_PERM_EXECUTE);
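  // Note: the mapping is RWX because identity pages hold both the mexec
  // trampoline code that we copy in and then execute, and data (the memmov
  // ops list) that we write after switching to the identity aspace.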

  void* addr = reinterpret_cast<void*>(pa);

  // 2 ** 64 = 18446744073709551616
  // len("identity 18446744073709551616\n") == 30, round to sizeof(word) = 32
  char mapping_name[32];
  snprintf(mapping_name, sizeof(mapping_name), "identity %lu", mapping_id_++);

  st = aspace_->AllocPhysical(mapping_name, PAGE_SIZE, &addr, 0, pa,
                              VmAspace::VMM_FLAG_VALLOC_SPECIFIC, kPermissionFlagsRWX);
  if (st != ZX_OK) {
    return st;
  }

  *result = addr;
  return st;
}

void IdentityPageAllocator::Activate() {
  if (!aspace_) {
    panic("Cannot Activate 1:1 Aspace with no 1:1 mappings!");
  }
  vmm_set_active_aspace(aspace_.get());
}

// Takes all the pages in a VMO and creates a copy of them where all the pages
// occupy a physically contiguous region of physical memory.
// TODO(gkalsi): Don't coalesce pages into a physically contiguous region and
// just pass a vectored I/O list to the mexec assembly.
static zx_status_t vmo_coalesce_pages(zx_handle_t vmo_hdl, const size_t extra_bytes, paddr_t* addr,
                                      uint8_t** vaddr, size_t* size) {
  DEBUG_ASSERT(addr);
  if (!addr) {
    return ZX_ERR_INVALID_ARGS;
  }

  DEBUG_ASSERT(size);
  if (!size) {
    return ZX_ERR_INVALID_ARGS;
  }

  ProcessDispatcher* up = ProcessDispatcher::GetCurrent();
  fbl::RefPtr<VmObjectDispatcher> vmo_dispatcher;
  zx_status_t st =
      up->handle_table().GetDispatcherWithRights(*up, vmo_hdl, ZX_RIGHT_READ, &vmo_dispatcher);
  if (st != ZX_OK)
    return st;

  fbl::RefPtr<VmObject> vmo = vmo_dispatcher->vmo();

  const size_t vmo_size = vmo->size();

  const size_t num_pages = ROUNDUP(vmo_size + extra_bytes, PAGE_SIZE) / PAGE_SIZE;

  paddr_t base_addr;
  list_node list = LIST_INITIAL_VALUE(list);
  st = pmm_alloc_contiguous(num_pages, PMM_ALLOC_FLAG_ANY, 0, &base_addr, &list);
  if (st != ZX_OK) {
    // TODO(gkalsi): Free the pages allocated by pmm_alloc_contiguous and
    // return an error.
    panic("Failed to allocate contiguous memory");
  }

  uint8_t* dst_addr = (uint8_t*)paddr_to_physmap(base_addr);

  st = vmo->Read(dst_addr, 0, vmo_size);
  if (st != ZX_OK) {
    // TODO(gkalsi): Free the pages allocated by pmm_alloc_contiguous and
    // return an error.
    panic("Failed to read VMO into contiguous memory");
  }

  arch_clean_invalidate_cache_range((vaddr_t)dst_addr, vmo_size);

  *size = num_pages * PAGE_SIZE;
  *addr = base_addr;
  if (vaddr)
    *vaddr = dst_addr;

  return ZX_OK;
}

// zx_status_t zx_system_mexec_payload_get
zx_status_t sys_system_mexec_payload_get(zx_handle_t resource, user_out_ptr<void> user_buffer,
                                         size_t buffer_size) {
  if (!gBootOptions->enable_debugging_syscalls) {
    return ZX_ERR_NOT_SUPPORTED;
  }

  // Highly privileged; only the mexec resource should have access.
  if (zx_status_t result =
          validate_ranged_resource(resource, ZX_RSRC_KIND_SYSTEM, ZX_RSRC_SYSTEM_MEXEC_BASE, 1);
      result != ZX_OK) {
    return result;
  }

  // Limit the size of the result that we can return to userspace.
  if (buffer_size > kBootdataPlatformExtraBytes) {
    return ZX_ERR_INVALID_ARGS;
  }

  fbl::AllocChecker ac;
  ktl::unique_ptr<ktl::byte[]> buffer{new (&ac) ktl::byte[buffer_size]};
  if (!ac.check()) {
    return ZX_ERR_NO_MEMORY;
  }

  if (auto result = WriteMexecData({buffer.get(), buffer_size}); result.is_error()) {
    return result.error_value();
  } else {
    size_t zbi_size = ktl::move(result).value();
    ZX_DEBUG_ASSERT(zbi_size <= buffer_size);
    return user_buffer.reinterpret<ktl::byte>().copy_array_to_user(buffer.get(), zbi_size);
  }
}

// zx_status_t zx_system_mexec
NO_ASAN zx_status_t sys_system_mexec(zx_handle_t resource, zx_handle_t kernel_vmo,
                                     zx_handle_t bootimage_vmo) {
  if (!gBootOptions->enable_debugging_syscalls) {
    return ZX_ERR_NOT_SUPPORTED;
  }

  zx_status_t result =
      validate_ranged_resource(resource, ZX_RSRC_KIND_SYSTEM, ZX_RSRC_SYSTEM_MEXEC_BASE, 1);
  if (result != ZX_OK)
    return result;

  paddr_t new_kernel_addr;
  size_t new_kernel_len;
  result = vmo_coalesce_pages(kernel_vmo, 0, &new_kernel_addr, NULL, &new_kernel_len);
  if (result != ZX_OK) {
    return result;
  }

  // For kernels that are bootdata based (e.g. x86-64), the location of the
  // entrypoint depends on the bootdata format.
  paddr_t entry64_addr =
      (get_kernel_base_phys() + sizeof(zbi_header_t) +  // ZBI_TYPE_CONTAINER header
       sizeof(zbi_header_t) +                           // ZBI_TYPE_KERNEL header
       offsetof(zbi_kernel_t, entry));
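  // A sketch of the assumed layout at get_kernel_base_phys():
  //   zbi_header_t  (ZBI_TYPE_CONTAINER)
  //   zbi_header_t  (ZBI_TYPE_KERNEL)
  //   zbi_kernel_t  { uint64_t entry; uint64_t reserve_memory_size; }
  // so entry64_addr points at the ZBI kernel item's |entry| field.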

  paddr_t new_bootimage_addr;
  uint8_t* bootimage_buffer;
  size_t bootimage_len;
  result = vmo_coalesce_pages(bootimage_vmo, kBootdataPlatformExtraBytes, &new_bootimage_addr,
                              &bootimage_buffer, &bootimage_len);
  if (result != ZX_OK) {
    return result;
  }

  uintptr_t kernel_image_end = get_kernel_base_phys() + new_kernel_len;

  paddr_t final_bootimage_addr = new_bootimage_addr;
  // For testing purposes, we may want the bootdata at a high address.
  // Alternatively, if our coalesced VMO would overlap the target kernel range,
  // we also need to move it, and placing it high is as good as anywhere else.
  if (gBootOptions->mexec_force_high_ramdisk ||
      Intersects(final_bootimage_addr, bootimage_len, get_kernel_base_phys(), kernel_image_end)) {
    const size_t page_count = bootimage_len / PAGE_SIZE + 1;
    fbl::AllocChecker ac;
    ktl::unique_ptr<paddr_t[]> paddrs(new (&ac) paddr_t[page_count]);
    ASSERT(ac.check());

    // Allocate pages greater than 4GiB to test that we're tolerant of booting
    // with a ramdisk in high memory. This operation can be very expensive and
    // should be replaced with a PMM API that supports allocating from a
    // specific range of memory.
    result = alloc_pages_greater_than(4 * GB, page_count, 8 * GB, paddrs.get());
    ASSERT(result == ZX_OK);

    final_bootimage_addr = paddrs.get()[0];
  }

  IdentityPageAllocator id_alloc(kernel_image_end);
  void* id_page_addr = 0x0;
  result = id_alloc.Allocate(&id_page_addr);
  if (result != ZX_OK) {
    return result;
  }

| LTRACEF("zx_system_mexec allocated identity mapped page at %p\n", id_page_addr); |
| |
| Thread::Current::MigrateToCpu(BOOT_CPU_ID); |
| |
| // We assume that when the system starts, only one CPU is running. We denote |
| // this as the boot CPU. |
| // We want to make sure that this is the CPU that eventually branches into |
| // the new kernel so we attempt to migrate this thread to that cpu. |
| result = platform_halt_secondary_cpus(ZX_TIME_INFINITE); |
| DEBUG_ASSERT(result == ZX_OK); |

  platform_mexec_prep(final_bootimage_addr, bootimage_len);

  const zx_instant_mono_t dlog_deadline = current_time() + ZX_SEC(5);
  dlog_shutdown(dlog_deadline);

  // Give the watchdog one last pet to hold it off until the new image has booted far enough to pet
  // the dog itself (or disable it).
  hw_watchdog_pet();

  arch_disable_ints();

  // WARNING
  // It is unsafe to return from this function beyond this point.
  // This is because we have swapped out the user address space and halted the
  // secondary cores and there is no trivial way to bring both of these back.
  id_alloc.Activate();

  // We're going to copy this into our identity page, so make sure it's no
  // longer than a single page.
  size_t mexec_asm_length = (uintptr_t)mexec_asm_end - (uintptr_t)mexec_asm;
  DEBUG_ASSERT(mexec_asm_length <= PAGE_SIZE);

  __unsanitized_memcpy(id_page_addr, (const void*)mexec_asm, mexec_asm_length);
  arch_sync_cache_range((vaddr_t)id_page_addr, mexec_asm_length);

  // We must pass in an arg that represents a list of memory regions to
  // shuffle around; the args list lives in its own identity-mapped page.
  void* ops_ptr;
  result = id_alloc.Allocate(&ops_ptr);
  DEBUG_ASSERT(result == ZX_OK);
  memmov_ops_t* ops = (memmov_ops_t*)(ops_ptr);

  uint32_t ops_idx = 0;

  // Op to move the new kernel into place.
  ops[ops_idx].src = (void*)new_kernel_addr;
  ops[ops_idx].dst = (void*)get_kernel_base_phys();
  ops[ops_idx].len = new_kernel_len;
  ops_idx++;

  // We can leave the bootimage in place unless we've been asked to move it to
  // high memory.
  if (new_bootimage_addr != final_bootimage_addr) {
    ops[ops_idx].src = (void*)new_bootimage_addr;
    ops[ops_idx].dst = (void*)final_bootimage_addr;
    ops[ops_idx].len = bootimage_len;
    ops_idx++;
  }

  // Null-terminated list.
  ops[ops_idx++] = {0, 0, 0};

  // Make sure that the kernel, when copied, will not overwrite the bootdata,
  // our mexec code, or the copy ops.
  DEBUG_ASSERT(!Intersects(reinterpret_cast<uintptr_t>(ops[0].dst), ops[0].len,
                           reinterpret_cast<uintptr_t>(final_bootimage_addr), bootimage_len));
  DEBUG_ASSERT(!Intersects(reinterpret_cast<uintptr_t>(ops[0].dst), ops[0].len,
                           reinterpret_cast<uintptr_t>(id_page_addr),
                           static_cast<size_t>(PAGE_SIZE)));
  DEBUG_ASSERT(!Intersects(reinterpret_cast<uintptr_t>(ops[0].dst), ops[0].len,
                           reinterpret_cast<uintptr_t>(ops_ptr), static_cast<size_t>(PAGE_SIZE)));

  // Sync because there is code in here that we intend to run.
  arch_sync_cache_range((vaddr_t)id_page_addr, PAGE_SIZE);

  // Clean because we're going to turn the MMU/caches off and we want to make
  // sure that things are still available afterwards.
  arch_clean_cache_range((vaddr_t)id_page_addr, PAGE_SIZE);
  arch_clean_cache_range((vaddr_t)ops_ptr, PAGE_SIZE);

  // Shut down the timer and interrupts. Performing shutdown of these
  // components is critical: we might be using a PV clock or PV EOI signaling,
  // so we must tell our hypervisor to stop updating them to avoid corrupting
  // arbitrary memory post-mexec.
  platform_stop_timer();
  platform_shutdown_timer();
  shutdown_interrupts_curr_cpu();
  shutdown_interrupts();

  // Ask the platform to mexec into the next kernel.
  mexec_asm_func mexec_assembly = (mexec_asm_func)id_page_addr;
  platform_mexec(mexec_assembly, ops, final_bootimage_addr, bootimage_len, entry64_addr);

  panic("Execution should never reach here\n");
  return ZX_OK;
}

// zx_status_t zx_system_powerctl
zx_status_t sys_system_powerctl(zx_handle_t power_rsrc, uint32_t cmd,
                                user_in_ptr<const zx_system_powerctl_arg_t> raw_arg) {
  zx_status_t status;
  if ((status = validate_ranged_resource(power_rsrc, ZX_RSRC_KIND_SYSTEM, ZX_RSRC_SYSTEM_POWER_BASE,
                                         1)) != ZX_OK) {
    return status;
  }

  switch (cmd) {
    case ZX_SYSTEM_POWERCTL_ENABLE_ALL_CPUS: {
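      // With N possible CPUs, this computes a mask with the low N bits set
      // (e.g. N == 4 yields 0b1111); we then hotplug whichever of those CPUs
      // are not already online.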
      cpu_mask_t all_cpus = ((cpu_mask_t)1u << arch_max_num_cpus()) - 1;
      return mp_hotplug_cpu_mask(~mp_get_online_mask() & all_cpus);
    }
    case ZX_SYSTEM_POWERCTL_DISABLE_ALL_CPUS_BUT_PRIMARY: {
      cpu_mask_t primary = cpu_num_to_mask(0);
      return mp_unplug_cpu_mask(mp_get_online_mask() & ~primary, ZX_TIME_INFINITE);
    }
#if defined __x86_64__
    case ZX_SYSTEM_POWERCTL_ACPI_TRANSITION_S_STATE:
      return ZX_ERR_NOT_SUPPORTED;
    case ZX_SYSTEM_POWERCTL_X86_SET_PKG_PL1: {
      zx_system_powerctl_arg_t arg;
      MsrAccess msr;
      status = raw_arg.copy_from_user(&arg);
      if (status != ZX_OK) {
        return status;
      }

      return arch_system_powerctl(cmd, &arg, &msr);
    }
#endif  // __x86_64__
    case ZX_SYSTEM_POWERCTL_REBOOT:
      platform_graceful_halt_helper(HALT_ACTION_REBOOT, ZirconCrashReason::NoCrash,
                                    ZX_TIME_INFINITE);
      break;
    case ZX_SYSTEM_POWERCTL_ACK_KERNEL_INITIATED_REBOOT:
      return HaltToken::Get().AckPendingHalt();
    case ZX_SYSTEM_POWERCTL_REBOOT_BOOTLOADER:
      platform_graceful_halt_helper(HALT_ACTION_REBOOT_BOOTLOADER, ZirconCrashReason::NoCrash,
                                    ZX_TIME_INFINITE);
      break;
    case ZX_SYSTEM_POWERCTL_REBOOT_RECOVERY:
      platform_graceful_halt_helper(HALT_ACTION_REBOOT_RECOVERY, ZirconCrashReason::NoCrash,
                                    ZX_TIME_INFINITE);
      break;
    case ZX_SYSTEM_POWERCTL_SHUTDOWN:
      platform_graceful_halt_helper(HALT_ACTION_SHUTDOWN, ZirconCrashReason::NoCrash,
                                    ZX_TIME_INFINITE);
      break;
    default:
      return ZX_ERR_INVALID_ARGS;
  }
  return ZX_OK;
}

// zx_status_t zx_system_get_event
zx_status_t sys_system_get_event(zx_handle_t root_job, uint32_t kind, zx_handle_t* out) {
  auto up = ProcessDispatcher::GetCurrent();

  fbl::RefPtr<JobDispatcher> job;
  zx_status_t status;
  if (kind == ZX_SYSTEM_EVENT_OUT_OF_MEMORY) {
    status =
        up->handle_table().GetDispatcherWithRights(*up, root_job, ZX_RIGHT_MANAGE_PROCESS, &job);
  } else {
    // We check for the root job below. We should not need to enforce rights beyond that.
    status = up->handle_table().GetDispatcherWithRights(*up, root_job, ZX_RIGHT_NONE, &job);
  }
  if (status != ZX_OK) {
    return status;
  }

  // Validate that the job is in fact the first usermode job (aka root job).
  if (job != GetRootJobDispatcher()) {
    return ZX_ERR_ACCESS_DENIED;
  }

  switch (kind) {
    case ZX_SYSTEM_EVENT_OUT_OF_MEMORY:
    case ZX_SYSTEM_EVENT_IMMINENT_OUT_OF_MEMORY:
    case ZX_SYSTEM_EVENT_MEMORY_PRESSURE_CRITICAL:
    case ZX_SYSTEM_EVENT_MEMORY_PRESSURE_WARNING:
    case ZX_SYSTEM_EVENT_MEMORY_PRESSURE_NORMAL:
      // Do not grant default event rights, as we don't want userspace to, for
      // example, be able to signal this event.
      return up->MakeAndAddHandle(GetMemPressureEvent(kind),
                                  ZX_DEFAULT_SYSTEM_EVENT_LOW_MEMORY_RIGHTS, out);

    default:
      return ZX_ERR_INVALID_ARGS;
  }
}

zx_status_t sys_system_set_performance_info(zx_handle_t resource, uint32_t topic,
                                            user_in_ptr<const void> info_void, size_t count) {
  const zx_status_t validate_status =
      validate_ranged_resource(resource, ZX_RSRC_KIND_SYSTEM, ZX_RSRC_SYSTEM_CPU_BASE, 1);
  if (validate_status != ZX_OK) {
    return validate_status;
  }

  if (topic != ZX_CPU_PERF_SCALE) {
    return ZX_ERR_INVALID_ARGS;
  }

  const size_t num_cpus = percpu::processor_count();
  if (count == 0 || count > num_cpus) {
    return ZX_ERR_OUT_OF_RANGE;
  }

  fbl::AllocChecker checker;
  auto performance_info = ktl::make_unique<zx_cpu_performance_info_t[]>(&checker, count);
  if (!checker.check()) {
    return ZX_ERR_NO_MEMORY;
  }

  auto new_info = info_void.reinterpret<const zx_cpu_performance_info_t>();
  if (new_info.copy_array_from_user(performance_info.get(), count) != ZX_OK) {
    return ZX_ERR_INVALID_ARGS;
  }

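  // Entries must be sorted by logical CPU number with no duplicates; the
  // strictly-increasing check below enforces this, and the same pass rejects
  // out-of-range CPUs and all-zero performance scales.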
  cpu_num_t last_cpu = INVALID_CPU;
  for (auto& info : ktl::span{performance_info.get(), count}) {
    const cpu_num_t cpu = info.logical_cpu_number;
    if (last_cpu != INVALID_CPU && cpu <= last_cpu) {
      return ZX_ERR_INVALID_ARGS;
    }
    last_cpu = cpu;

    const auto [integral, fractional] = info.performance_scale;
    if (cpu >= num_cpus || (integral == 0 && fractional == 0)) {
      return ZX_ERR_OUT_OF_RANGE;
    }
  }

  Scheduler::UpdatePerformanceScales(performance_info.get(), count);
  return ZX_OK;
}

zx_status_t sys_system_get_performance_info(zx_handle_t resource, uint32_t topic, size_t info_count,
                                            user_out_ptr<void> info_void,
                                            user_out_ptr<size_t> output_count) {
  const zx_status_t validate_status =
      validate_ranged_resource(resource, ZX_RSRC_KIND_SYSTEM, ZX_RSRC_SYSTEM_CPU_BASE, 1);
  if (validate_status != ZX_OK) {
    return validate_status;
  }

  const size_t num_cpus = percpu::processor_count();
  if (info_count != num_cpus) {
    return ZX_ERR_OUT_OF_RANGE;
  }

  fbl::AllocChecker checker;
  auto performance_info = ktl::make_unique<zx_cpu_performance_info_t[]>(&checker, info_count);
  if (!checker.check()) {
    return ZX_ERR_NO_MEMORY;
  }

  switch (topic) {
    case ZX_CPU_PERF_SCALE:
      Scheduler::GetPerformanceScales(performance_info.get(), info_count);
      break;

    case ZX_CPU_DEFAULT_PERF_SCALE:
      Scheduler::GetDefaultPerformanceScales(performance_info.get(), info_count);
      break;

    default:
      return ZX_ERR_INVALID_ARGS;
  }

  auto info = info_void.reinterpret<zx_cpu_performance_info_t>();
  if (info.copy_array_to_user(performance_info.get(), info_count) != ZX_OK) {
    return ZX_ERR_INVALID_ARGS;
  }

  if (output_count.copy_to_user(info_count) != ZX_OK) {
    return ZX_ERR_INVALID_ARGS;
  }

  return ZX_OK;
}

// TODO(https://fxbug.dev/42182544): Reconcile with HaltToken, zx_system_powerctl, and
// kernel-initiated-oom-reboot.
zx_status_t sys_system_suspend_enter(zx_handle_t resource, zx_instant_boot_t resume_deadline) {
  const zx_status_t validate_status =
      validate_ranged_resource(resource, ZX_RSRC_KIND_SYSTEM, ZX_RSRC_SYSTEM_CPU_BASE, 1);
  if (validate_status != ZX_OK) {
    return validate_status;
  }

  return IdlePowerThread::TransitionAllActiveToSuspend(resume_deadline);
}

zx_status_t sys_system_set_processor_power_domain(
    zx_handle_t resource, uint64_t options, user_in_ptr<const zx_processor_power_domain_t> domain,
    zx_handle_t port, user_in_ptr<const zx_processor_power_level_t> power_levels,
    size_t num_power_levels, user_in_ptr<const zx_processor_power_level_transition_t> transitions,
    size_t num_transitions) {
  zx_status_t status =
      validate_ranged_resource(resource, ZX_RSRC_KIND_SYSTEM, ZX_RSRC_SYSTEM_CPU_BASE, 1);
  if (status != ZX_OK) {
    return status;
  }

  if (num_power_levels > ZX_MAX_POWER_LEVELS ||
      num_transitions > ZX_MAX_POWER_LEVEL_TRANSFORMATIONS) {
    return ZX_ERR_OUT_OF_RANGE;
  }

  zx_processor_power_domain_t domain_info;
  if (domain.copy_from_user(&domain_info) != ZX_OK) {
    return ZX_ERR_INVALID_ARGS;
  }

  bool all_zero = true;
  for (auto& c : domain_info.cpus.mask) {
    all_zero = all_zero && (c == 0);
  }

  // No need to validate any of the other parameters when we are unregistering
  // a power domain.
  if (all_zero) {
    return power_management::KernelPowerDomainRegistry::Unregister(domain_info.domain_id)
        .status_value();
  }

  if (num_power_levels == 0) {
    return ZX_ERR_INVALID_ARGS;
  }

  size_t max_cpus = arch_max_num_cpus();
  size_t bucket = max_cpus / ZX_CPU_SET_BITS_PER_WORD;
  size_t bits = max_cpus % ZX_CPU_SET_BITS_PER_WORD;
  size_t mask = ~((1ull << bits) - 1);
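  // Example: with max_cpus == 4 and 64-bit mask words, bucket == 0, bits == 4,
  // and mask == ~0b1111, i.e. every bit in word |bucket| that would name a CPU
  // this machine doesn't have.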

  // We are not allowed to set CPUs beyond our maximum number of CPUs.
  if ((domain_info.cpus.mask[bucket] & mask) != 0) {
    return ZX_ERR_INVALID_ARGS;
  }

  for (size_t i = bucket + 1; i < ZX_CPU_SET_MAX_CPUS / ZX_CPU_SET_BITS_PER_WORD; ++i) {
    if (domain_info.cpus.mask[i] != 0) {
      return ZX_ERR_INVALID_ARGS;
    }
  }

  // Check that the port has the required rights.
  ProcessDispatcher* up = ProcessDispatcher::GetCurrent();
  fbl::RefPtr<PortDispatcher> port_dispatcher;
  if (zx_status_t res = up->handle_table().GetDispatcherWithRights(
          *up, port, ZX_RIGHT_WRITE | ZX_RIGHT_READ, &port_dispatcher);
      res != ZX_OK) {
    return res;
  }

  // Set up the power domain and model.
  fbl::AllocChecker ac;
  auto levels = ktl::make_unique<zx_processor_power_level_t[]>(&ac, num_power_levels);
  if (!ac.check()) {
    return ZX_ERR_NO_MEMORY;
  }

  ktl::unique_ptr<zx_processor_power_level_transition_t[]> sparse_transitions = nullptr;

  if (num_transitions > 0) {
    sparse_transitions =
        ktl::make_unique<zx_processor_power_level_transition_t[]>(&ac, num_transitions);
    if (!ac.check()) {
      return ZX_ERR_NO_MEMORY;
    }

    if (zx_status_t res =
            transitions.copy_array_from_user(sparse_transitions.get(), num_transitions);
        res != ZX_OK) {
      return res;
    }
  }

  if (zx_status_t res = power_levels.copy_array_from_user(levels.get(), num_power_levels);
      res != ZX_OK) {
    return res;
  }

  auto model =
      power_management::EnergyModel::Create(ktl::span(levels.get(), num_power_levels),
                                            ktl::span(sparse_transitions.get(), num_transitions));
  if (model.is_error()) {
    return model.error_value();
  }

  auto controller = fbl::MakeRefCountedChecked<power_management::PortPowerLevelController>(
      &ac, ktl::move(port_dispatcher));
  if (!ac.check()) {
    return ZX_ERR_NO_MEMORY;
  }

  auto power_domain = fbl::MakeRefCountedChecked<power_management::PowerDomain>(
      &ac, domain_info.domain_id, domain_info.cpus, ktl::move(model).value(),
      ktl::move(controller));
  if (!ac.check()) {
    return ZX_ERR_NO_MEMORY;
  }

  // Register power domain with the registry and update schedulers.
  return power_management::KernelPowerDomainRegistry::Register(ktl::move(power_domain))
      .status_value();
}

zx_status_t sys_system_set_processor_power_state(
    zx_handle_t port, user_in_ptr<const zx_processor_power_state_t> power_state) {
  if (port == ZX_HANDLE_INVALID) {
    return ZX_ERR_BAD_HANDLE;
  }
  zx_processor_power_state_t ps = {};
  if (auto res = power_state.copy_from_user(&ps); res != ZX_OK) {
    return res;
  }

  ProcessDispatcher* up = ProcessDispatcher::GetCurrent();
  fbl::RefPtr<PortDispatcher> port_dispatcher;
  if (zx_status_t res =
          up->handle_table().GetDispatcherWithRights(*up, port, ZX_RIGHT_READ, &port_dispatcher);
      res != ZX_OK) {
    return res;
  }

  return power_management::KernelPowerDomainRegistry::UpdateDomainPowerLevel(
             ps.domain_id, port_dispatcher->get_koid(),
             static_cast<power_management::ControlInterface>(ps.control_interface),
             ps.control_argument)
      .status_value();
}