// Copyright 2017 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "src/virtualization/bin/vmm/guest.h"
#include <lib/syslog/cpp/macros.h>
#include <lib/zircon-internal/align.h>
#include <zircon/assert.h>
#include <zircon/status.h>
#include <zircon/syscalls/hypervisor.h>
#include <zircon/threads.h>
#include "src/lib/fxl/strings/string_printf.h"
#include "src/virtualization/bin/vmm/pci.h"
#include "src/virtualization/bin/vmm/sysinfo.h"
namespace {
#if __aarch64__
constexpr uint8_t kSpiBase = 32;
#endif
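// Builds a restriction covering everything from |start| to the end of the guest physical address
// space.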
constexpr GuestMemoryRegion RestrictUntilEnd(zx_gpaddr_t start) {
return {start, kGuestMemoryAllRemainingRange};
}
#if __x86_64__
constexpr uint64_t kOneKibibyte = 1ul << 10;
constexpr uint64_t kOneMebibyte = 1ul << 20;
constexpr uint64_t kOneGibibyte = 1ul << 30;
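// Builds a restriction covering the half-open range [start, end).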
constexpr GuestMemoryRegion RestrictRegion(zx_gpaddr_t start, zx_gpaddr_t end) {
return {start, end - start};
}
#endif
// Ranges to avoid allocating guest memory in. These regions must not overlap and must be
// sorted by increasing base address. These requirements are enforced by static_asserts
// below.
constexpr std::array kRestrictedRegions = {
#if __aarch64__
    // On ARM, PCI devices are mapped in at a relatively high address, so it's reasonable to just
    // block off the rest of guest memory.
RestrictUntilEnd(std::min(kDevicePhysBase, kFirstDynamicDeviceAddr)),
#elif __x86_64__
// Reserve regions in the first MiB for use by the BIOS.
RestrictRegion(0x0, 32 * kOneKibibyte),
RestrictRegion(512 * kOneKibibyte, kOneMebibyte),
    // On x86, PCI devices are mapped in somewhere below 4 GiB, and the device range extends up to
    // 4 GiB.
RestrictRegion(kDevicePhysBase, 4 * kOneGibibyte),
// Dynamic devices are mapped in at a very high address, so everything beyond that point
// can be blocked off.
RestrictUntilEnd(kFirstDynamicDeviceAddr),
#endif
};
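// Returns true iff no two entries in kRestrictedRegions overlap; evaluated at compile time by the
// static_assert below.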
constexpr bool CheckForOverlappingRestrictedRegions() {
auto overlaps = [](const GuestMemoryRegion& first, const GuestMemoryRegion& second) -> bool {
const auto& begin = std::min(first, second, GuestMemoryRegion::CompareMinByBase);
const auto& end = std::max(first, second, GuestMemoryRegion::CompareMinByBase);
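    // Note that '>=' also treats abutting regions as overlapping: adjacent restrictions are
    // expected to be merged into a single larger region.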
return begin.base + begin.size >= end.base;
};
for (auto curr = kRestrictedRegions.begin(); curr != kRestrictedRegions.end(); curr++) {
for (auto next = std::next(curr); next != kRestrictedRegions.end(); next++) {
if (overlaps(*curr, *next)) {
return false;
}
}
}
return true;
}
// Compile time check that no regions overlap in kRestrictedRegions. If adding a region that
// overlaps with another, just merge them into one larger region.
static_assert(CheckForOverlappingRestrictedRegions());
constexpr bool CheckRestrictedRegionsAreSorted() {
for (auto curr = kRestrictedRegions.begin(); curr != kRestrictedRegions.end(); curr++) {
if (std::next(curr) == kRestrictedRegions.end()) {
break;
}
if (!GuestMemoryRegion::CompareMinByBase(*curr, *std::next(curr))) {
return false;
}
}
return true;
}
// Compile time check that regions in kRestrictedRegions are sorted by increasing base address.
static_assert(CheckRestrictedRegionsAreSorted());
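// Translates a vmm TrapType into the corresponding ZX_GUEST_TRAP_* kind.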
constexpr uint32_t trap_kind(TrapType type) {
switch (type) {
case TrapType::MMIO_SYNC:
return ZX_GUEST_TRAP_MEM;
case TrapType::MMIO_BELL:
return ZX_GUEST_TRAP_BELL;
case TrapType::PIO_SYNC:
return ZX_GUEST_TRAP_IO;
default:
ZX_PANIC("Unhandled TrapType %d", static_cast<int>(type));
return 0;
}
}
} // namespace
// Static.
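// Returns the compile-time list of restricted ranges for the current architecture.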
cpp20::span<const GuestMemoryRegion> Guest::GetDefaultRestrictionsForArchitecture() {
return kRestrictedRegions;
}
// Static.
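// Rounds |guest_memory| up to the next multiple of the system page size.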
uint64_t Guest::GetPageAlignedGuestMemory(uint64_t guest_memory) {
const uint32_t page_size = zx_system_get_page_size();
uint32_t page_alignment = guest_memory % page_size;
if (page_alignment != 0) {
uint32_t padding = page_size - page_alignment;
FX_LOGS(INFO) << "The requested guest memory (" << guest_memory
<< " bytes) is not a multiple of system page size (" << page_size
<< " bytes), so increasing guest memory by " << padding << " bytes.";
guest_memory += padding;
}
return guest_memory;
}
// Static.
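// Shrinks |region| to its largest page-aligned subregion, returning false if less than one full
// page remains.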
bool Guest::PageAlignGuestMemoryRegion(GuestMemoryRegion& region) {
const uint32_t page_size = zx_system_get_page_size();
  // This guest region is bounded by restricted regions, so its size cannot be increased. If the
  // region is smaller than a page it must simply be discarded.
if (region.size < page_size) {
return false;
}
zx_gpaddr_t start = region.base;
zx_gpaddr_t end = region.base + region.size;
// Round the starting address up to the nearest page, and the ending address down to the nearest
// page.
if (start % page_size != 0) {
start += page_size - (start % page_size);
}
if (end % page_size != 0) {
end -= end % page_size;
}
  // Require a valid region to be at least a single page in size after the adjustments; since both
  // start and end are now page aligned, start < end guarantees this.
if (start >= end) {
return false;
}
region.base = start;
region.size = end - start;
return true;
}
// Static.
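// Distributes |guest_memory| bytes of RAM across the unrestricted gaps between |restrictions|,
// appending the chosen regions to |regions|. Returns false if the gaps cannot hold the requested
// amount of memory.
//
// Illustrative example (simplified to just the two x86 BIOS restrictions): with restrictions
// [0, 32 KiB) and [512 KiB, 1 MiB), a request for 1 MiB produces the regions
// {base = 32 KiB, size = 480 KiB} and {base = 1 MiB, size = 544 KiB}.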
bool Guest::GenerateGuestMemoryRegions(uint64_t guest_memory,
cpp20::span<const GuestMemoryRegion> restrictions,
std::vector<GuestMemoryRegion>* regions) {
  // Special case where there are no restrictions. This currently never happens on any production
  // architecture, as dynamic device addresses must always be assigned.
if (restrictions.empty()) {
regions->push_back({.base = 0x0, .size = guest_memory});
return true;
}
bool first_region = true;
GuestMemoryRegion current_region;
auto restriction = restrictions.begin();
fit::function<bool()> next_range = [&]() -> bool {
if (first_region) {
first_region = false;
if (restriction->base != 0) {
current_region = {0x0, restriction->base};
} else {
return next_range();
}
} else {
if (restriction->size == kGuestMemoryAllRemainingRange) {
return false; // No remaining valid guest memory regions.
}
      // The current unrestricted region extends from the end of the current restriction to the
      // start of the next restriction, or, if this is the last restriction, to the end of the
      // usable guest physical address range.
zx_gpaddr_t unrestricted_base_address = restriction->base + restriction->size;
uint64_t unrestricted_size = std::next(restriction) == restrictions.end()
? kGuestMemoryAllRemainingRange - unrestricted_base_address
: std::next(restriction)->base - unrestricted_base_address;
current_region = {unrestricted_base_address, unrestricted_size};
restriction++;
}
if (!Guest::PageAlignGuestMemoryRegion(current_region)) {
return next_range();
}
return true;
};
uint64_t mem_required = guest_memory;
while (mem_required > 0) {
if (!next_range()) {
FX_LOGS(ERROR) << "Unable to allocate enough guest memory due to guest memory restrictions. "
"Managed to allocate "
<< guest_memory - mem_required << " of " << guest_memory << " bytes";
return false;
}
uint64_t mem_used = std::min(current_region.size, mem_required);
regions->push_back({current_region.base, mem_used});
mem_required -= mem_used;
}
return true;
}
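// Finds the lowest |alignment|-aligned address at or above |base| where |size| bytes fit without
// overlapping any restriction. A single pass suffices because |restrictions| is sorted by
// increasing base address. Returns false if the region collides with the unbounded final
// restriction.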
bool Guest::FitPluggableRegionBase(cpp20::span<const GuestMemoryRegion> restrictions, uint64_t base,
uint64_t size, uint64_t alignment, uint64_t* result_base) {
base = ZX_ALIGN(base, alignment);
for (auto& restriction : restrictions) {
if (restriction.HasOverlap(GuestMemoryRegion{base, size})) {
if (restriction.size == kGuestMemoryAllRemainingRange) {
return false;
}
base = ZX_ALIGN(restriction.base + restriction.size, alignment);
}
}
*result_base = base;
return true;
}
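// Creates the guest and lays out its physical memory: RAM is placed around the architecture's
// restricted ranges, an optional pluggable (virtio-mem) region is appended, and a single
// executable VMO backing all regions is mapped into the guest VMAR.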
zx_status_t Guest::Init(uint64_t guest_memory, uint64_t pluggable_region_size,
uint64_t pluggable_region_alignment) {
zx::resource hypervisor_resource;
zx_status_t status = get_hypervisor_resource(&hypervisor_resource);
if (status != ZX_OK) {
FX_PLOGS(ERROR, status) << "Failed to get hypervisor resource";
return status;
}
status = zx::guest::create(hypervisor_resource, 0, &guest_, &vmar_);
if (status != ZX_OK) {
FX_PLOGS(ERROR, status) << "Failed to create guest";
return status;
}
// If unaligned, round up to the nearest page.
guest_memory = Guest::GetPageAlignedGuestMemory(guest_memory);
// Generate guest memory regions, avoiding device memory.
  if (!Guest::GenerateGuestMemoryRegions(
          guest_memory, Guest::GetDefaultRestrictionsForArchitecture(), &memory_regions_)) {
    FX_PLOGS(ERROR, ZX_ERR_INVALID_ARGS) << "Failed to place guest memory avoiding device memory "
                                            "ranges. Try requesting less memory.";
    return ZX_ERR_INVALID_ARGS;
  }
if (pluggable_region_size > 0) {
uint64_t pluggable_region_base = memory_regions_.back().base + memory_regions_.back().size;
    // Calculate the position of the pluggable memory region.
    if (!Guest::FitPluggableRegionBase(Guest::GetDefaultRestrictionsForArchitecture(),
                                       pluggable_region_base, pluggable_region_size,
                                       pluggable_region_alignment, &mem_pluggable_region_addr_)) {
      status = ZX_ERR_INVALID_ARGS;
      FX_PLOGS(ERROR, status) << "Failed to place pluggable memory region avoiding device memory "
                                 "ranges. Try requesting a smaller pluggable region size.";
      return status;
    }
    memory_regions_.push_back({mem_pluggable_region_addr_, pluggable_region_size});
}
uint64_t vmo_size = memory_regions_.back().base + memory_regions_.back().size;
zx::vmo vmo;
status = zx::vmo::create(vmo_size, 0, &vmo);
if (status != ZX_OK) {
FX_PLOGS(ERROR, status) << "Failed to create VMO of size " << vmo_size;
return status;
}
zx::resource vmex_resource;
status = get_vmex_resource(&vmex_resource);
if (status != ZX_OK) {
FX_PLOGS(ERROR, status) << "Failed to get VMEX resource";
return status;
}
status = vmo.replace_as_executable(vmex_resource, &vmo);
if (status != ZX_OK) {
FX_PLOGS(ERROR, status) << "Failed to make VMO executable";
return status;
}
std::vector<GuestMemoryRegion> vmar_regions = memory_regions_;
if (pluggable_region_size > 0) {
    // We do want a VMAR mapping for the pluggable memory region, but we don't
    // want the pluggable memory region to be part of the e820 memory map.
    // So, leave the pluggable memory region in vmar_regions and remove it
    // from memory_regions_, which is later used to set up the e820 map.
    //
    // TODO(https://fxbug.dev/42051237): Get virtio-mem to take the guest and host VMARs,
    // keep all pluggable memory unmapped, and only map plugged regions. This
    // would require adding EXECUTE and perhaps other flags to the VMO which is
    // passed to virtio-mem. Mapping only plugged regions can make existing
    // virtio-mem tests flaky, because a test is not guaranteed to use the
    // plugged memory to allocate something which requires EXECUTE. A stress
    // test for virtio-mem is needed before making this change.
FX_CHECK(!memory_regions_.empty());
FX_CHECK(memory_regions_.back().base == mem_pluggable_region_addr_);
memory_regions_.pop_back();
}
#if __x86_64__
// x86 has reserved memory from 0 to 32KiB, and 512KiB to 1MiB. While we will not allocate guest
// memory in those regions, we still want to map these regions into the guest VMAR as they are
// not devices and we do not wish to trap on them.
vmar_regions.push_back({0, 32 * kOneKibibyte});
vmar_regions.push_back({512 * kOneKibibyte, 512 * kOneKibibyte});
#endif
for (const GuestMemoryRegion& region : vmar_regions) {
zx_gpaddr_t addr;
status = vmar_.map(ZX_VM_PERM_READ | ZX_VM_PERM_WRITE | ZX_VM_PERM_EXECUTE | ZX_VM_SPECIFIC |
ZX_VM_REQUIRE_NON_RESIZABLE,
region.base, vmo, region.base, region.size, &addr);
if (status != ZX_OK) {
FX_PLOGS(ERROR, status) << "Failed to map guest physical memory region " << region.base
<< " - " << region.base + region.size;
return status;
}
}
status = phys_mem_.Init(vmar_regions, std::move(vmo));
if (status != ZX_OK) {
FX_PLOGS(ERROR, status) << "Failed to initialize guest physical memory";
return status;
}
return ZX_OK;
}
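// Installs a trap of the given |type| over [addr, addr + size) and routes guest accesses to
// |handler|; the mapping is discarded if the trap cannot be set.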
zx_status_t Guest::CreateMapping(TrapType type, uint64_t addr, size_t size, uint64_t offset,
IoHandler* handler, async_dispatcher_t* dispatcher) {
uint32_t kind = trap_kind(type);
mappings_.emplace_front(kind, addr, size, offset, handler);
zx_status_t status = mappings_.front().SetTrap(this, dispatcher);
if (status != ZX_OK) {
mappings_.pop_front();
return status;
}
return ZX_OK;
}
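// Allocates a child VMAR at the fixed guest-physical address |addr|.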
zx_status_t Guest::CreateSubVmar(uint64_t addr, size_t size, zx::vmar* vmar) {
uintptr_t guest_addr;
return vmar_.allocate(ZX_VM_CAN_MAP_READ | ZX_VM_CAN_MAP_WRITE | ZX_VM_SPECIFIC, addr, size, vmar,
&guest_addr);
}
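// Creates and starts VCPU |id|. VCPU-0 must be started before any other VCPU, and repeated
// requests for an already-started VCPU succeed without effect.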
zx_status_t Guest::StartVcpu(uint64_t id, zx_gpaddr_t entry, zx_gpaddr_t boot_ptr) {
if (id >= kMaxVcpus) {
FX_PLOGS(ERROR, ZX_ERR_OUT_OF_RANGE)
<< "Failed to start VCPU-" << id << ", up to " << kMaxVcpus << " VCPUs are supported";
return ZX_ERR_OUT_OF_RANGE;
}
std::lock_guard<std::shared_mutex> lock(mutex_);
if (!vcpus_[0].has_value() && id != 0) {
FX_PLOGS(ERROR, ZX_ERR_BAD_STATE) << "VCPU-0 must be started before other VCPUs";
return ZX_ERR_BAD_STATE;
}
if (vcpus_[id].has_value()) {
    // The guest might make multiple requests to start a particular VCPU. On
    // x86, the guest should send two STARTUP IPIs, but we initialize the VCPU
    // on the first, so we ignore subsequent requests.
return ZX_OK;
}
vcpus_[id].emplace(id, this, entry, boot_ptr);
return vcpus_[id]->Start();
}
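// Delivers |vector| to every VCPU whose bit is set in |mask|. On ARM, an SPI only needs to be
// injected once, so delivery stops after the first targeted VCPU.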
zx_status_t Guest::Interrupt(uint64_t mask, uint32_t vector) {
std::shared_lock<std::shared_mutex> lock(mutex_);
for (size_t id = 0; id != kMaxVcpus; ++id) {
if (!(mask & (1ul << id)) || !vcpus_[id]) {
continue;
}
zx_status_t status = vcpus_[id]->Interrupt(vector);
if (status != ZX_OK) {
return status;
}
#if __aarch64__
if (vector >= kSpiBase) {
break;
}
#endif
}
return ZX_OK;
}
void Guest::set_stop_callback(
fit::function<void(fit::result<::fuchsia::virtualization::GuestError>)> stop_callback) {
stop_callback_ = std::move(stop_callback);
}
void Guest::Stop(fit::result<::fuchsia::virtualization::GuestError> result) {
FX_CHECK(stop_callback_);
stop_callback_(result);
}