| // Copyright 2024 syzkaller project authors. All rights reserved. |
| // Use of this source code is governed by Apache 2 LICENSE that can be found in the LICENSE file. |
| |
| #include <dirent.h> |
| #include <stdlib.h> |
| #include <sys/stat.h> |
| #include <sys/types.h> |
| #include <unistd.h> |
| |
| #include <atomic> |
| #include <string> |
| #include <utility> |
| |
| #ifndef MADV_POPULATE_WRITE |
| #define MADV_POPULATE_WRITE 23 |
| #endif |
| |
| // Size of qemu snapshots and time required to restore a snapshot depend on the amount of memory |
| // the VM touches after boot. For example, a 132 MB snapshot takes around 150ms to restore, |
| // while a 260 MB snapshot takes around 275 ms to restore. |
| // |
| // To reduce size of the snapshot it's recommended to use smaller kernel and setup fewer devices. |
| // For example the following cmdline arguments: |
| // "loop.max_loop=1 dummy_hcd.num=1 vivid.n_devs=2 vivid.multiplanar=1,2 netrom.nr_ndevs=1 rose.rose_ndevs=1" |
| // and CONFIG_USBIP_VHCI_NR_HCS=1 help to reduce snapshot by about 20 MB. Note: we have only 1 proc |
| // in snapshot mode, so we don't need lots of devices. However, our descriptions rely on vivid.n_devs=16 |
| // since they hardcode names like /dev/video36 which follow after these 16 pre-created devices. |
| // |
// Additionally, we could try to use the executor as the init process; this should remove dhcpd/sshd/udevd/klogd/etc.
// We don't even need networking in snapshot mode since we communicate via shared memory.
| |
| static struct { |
| // Ivshmem interrupt doorbell register. |
| volatile uint32* doorbell; |
| volatile rpc::SnapshotHeaderT* hdr; |
| void* input; |
| } ivs; |
| |
| // Finds qemu ivshmem device, see: |
| // https://www.qemu.org/docs/master/specs/ivshmem-spec.html |
| static void FindIvshmemDevices() |
| { |
| std::string result; |
| DIR* devices = opendir("/sys/bus/pci/devices"); |
| if (!devices) |
| fail("opendir(/sys/bus/pci/devices) failed"); |
| void* regs = nullptr; |
| void* input = nullptr; |
| void* output = nullptr; |
| while (auto* dev = readdir(devices)) { |
| if (dev->d_name[0] == '.') |
| continue; |
| const std::string& vendor = ReadTextFile("/sys/bus/pci/devices/%s/vendor", dev->d_name); |
| const std::string& device = ReadTextFile("/sys/bus/pci/devices/%s/device", dev->d_name); |
| debug("PCI device %s: vendor=%s device=%s\n", dev->d_name, vendor.c_str(), device.c_str()); |
| if (vendor != "0x1af4" || device != "0x1110") |
| continue; |
| char filename[1024]; |
| snprintf(filename, sizeof(filename), "/sys/bus/pci/devices/%s/resource2", dev->d_name); |
| int res2 = open(filename, O_RDWR); |
| if (res2 == -1) |
| fail("failed to open ivshmem resource2"); |
| struct stat statbuf; |
| if (fstat(res2, &statbuf)) |
| fail("failed to fstat ivshmem resource2"); |
| debug("ivshmem resource2 size %zu\n", static_cast<size_t>(statbuf.st_size)); |
| // The only way to distinguish the 2 ivshmem regions is by size. |
| if (statbuf.st_size == static_cast<uint64>(rpc::Const::SnapshotDoorbellSize)) { |
| snprintf(filename, sizeof(filename), "/sys/bus/pci/devices/%s/resource0", dev->d_name); |
| int res0 = open(filename, O_RDWR); |
| if (res0 == -1) |
| fail("failed to open ivshmem resource0"); |
| regs = mmap(nullptr, getpagesize(), PROT_READ | PROT_WRITE, MAP_SHARED, res0, 0); |
| close(res0); |
| if (regs == MAP_FAILED) |
| fail("failed to mmap ivshmem resource0"); |
| debug("mapped doorbell registers at %p\n", regs); |
| } else if (statbuf.st_size == static_cast<uint64>(rpc::Const::SnapshotShmemSize)) { |
| input = mmap(nullptr, static_cast<uint64>(rpc::Const::MaxInputSize), |
| PROT_READ, MAP_SHARED, res2, 0); |
| output = mmap(nullptr, static_cast<uint64>(rpc::Const::MaxOutputSize), |
| PROT_READ | PROT_WRITE, MAP_SHARED, res2, |
| static_cast<uint64>(rpc::Const::MaxInputSize)); |
| if (input == MAP_FAILED || output == MAP_FAILED) |
| fail("failed to mmap ivshmem resource2"); |
| debug("mapped shmem input at at %p/%llu\n", |
| input, static_cast<uint64>(rpc::Const::MaxInputSize)); |
| debug("mapped shmem output at at %p/%llu\n", |
| output, static_cast<uint64>(rpc::Const::MaxOutputSize)); |
| #if GOOS_linux |
| if (pkeys_enabled && pkey_mprotect(output, static_cast<uint64>(rpc::Const::MaxOutputSize), |
| PROT_READ | PROT_WRITE, RESERVED_PKEY)) |
| exitf("failed to pkey_mprotect output buffer"); |
| #endif |
| } |
| close(res2); |
| } |
| closedir(devices); |
| if (regs == nullptr || input == nullptr) |
| fail("cannot find ivshmem PCI devices"); |
| ivs.doorbell = static_cast<uint32*>(regs) + 3; |
| ivs.hdr = static_cast<rpc::SnapshotHeaderT*>(output); |
| ivs.input = input; |
| output_data = reinterpret_cast<OutputData*>(static_cast<char*>(output) + sizeof(rpc::SnapshotHeaderT)); |
| output_size = static_cast<uint64>(rpc::Const::MaxOutputSize) - sizeof(rpc::SnapshotHeaderT); |
| } |
| |
| static void SnapshotSetup(char** argv, int argc) |
| { |
| flag_snapshot = true; |
| // This allows to see debug output during early setup. |
| // If debug is not actually enabled, it will be turned off in parse_handshake. |
| flag_debug = true; |
| #if GOOS_linux |
| // In snapshot mode executor output is redirected to /dev/kmsg. |
| // This is required to turn off rate limiting of writes. |
| write_file("/proc/sys/kernel/printk_devkmsg", "on\n"); |
| #endif |
| FindIvshmemDevices(); |
| // Wait for the host to write handshake_req into input memory. |
| while (ivs.hdr->state != rpc::SnapshotState::Handshake) |
| sleep_ms(10); |
| auto msg = flatbuffers::GetRoot<rpc::SnapshotHandshake>(ivs.input); |
| handshake_req req = { |
| .magic = kInMagic, |
| .use_cover_edges = msg->cover_edges(), |
| .is_kernel_64_bit = msg->kernel_64_bit(), |
| .flags = msg->env_flags(), |
| .pid = 0, |
| .sandbox_arg = static_cast<uint64>(msg->sandbox_arg()), |
| .syscall_timeout_ms = static_cast<uint64>(msg->syscall_timeout_ms()), |
| .program_timeout_ms = static_cast<uint64>(msg->program_timeout_ms()), |
| .slowdown_scale = static_cast<uint64>(msg->slowdown()), |
| }; |
| parse_handshake(req); |
| #if SYZ_HAVE_FEATURES |
| setup_sysctl(); |
| setup_cgroups(); |
| #endif |
| #if SYZ_HAVE_SETUP_EXT |
| // This can be defined in common_ext.h. |
| setup_ext(); |
| #endif |
| for (const auto& feat : features) { |
| if (!(msg->features() & feat.id)) |
| continue; |
| debug("setting up feature %s\n", rpc::EnumNameFeature(feat.id)); |
| const char* reason = feat.setup(); |
| if (reason) |
| failmsg("feature setup failed", "reason: %s", reason); |
| } |
| } |
| |
// How much is prefaulted/pre-created before the snapshot is taken (see SnapshotStart).
// Sizes are in bytes.
// Tail of the shared output region.
constexpr size_t kOutputPopulate = 256 * 1024;
// Head of the shared input region.
constexpr size_t kInputPopulate = 64 * 1024;
// Memory starting at the page that contains flag_coverage.
constexpr size_t kGlobalsPopulate = 4 * 1024;
// Memory starting at SYZ_DATA_OFFSET.
constexpr size_t kDataPopulate = 8 * 1024;
// Per-thread coverage buffer (only when coverage is enabled).
constexpr size_t kCoveragePopulate = 64 * 1024;
// Number of threads created ahead of time.
constexpr size_t kThreadsPopulate = 2;
| |
| static void SnapshotSetState(rpc::SnapshotState state) |
| { |
| debug("changing stapshot state %s -> %s\n", |
| rpc::EnumNameSnapshotState(ivs.hdr->state), rpc::EnumNameSnapshotState(state)); |
| std::atomic_signal_fence(std::memory_order_seq_cst); |
| ivs.hdr->state = state; |
| // The register contains VM index shifted by 16 (the host part is VM index 1) |
| // + interrup vector index (0 in our case). |
| *ivs.doorbell = 1 << 16; |
| } |
| |
| // PopulateMemory prefaults anon memory (we want to avoid minor page faults as well). |
| static void PopulateMemory(void* ptr, size_t size) |
| { |
| ptr = (void*)(uintptr_t(ptr) & ~(getpagesize() - 1)); |
| if (madvise(ptr, size, MADV_POPULATE_WRITE)) |
| failmsg("populate madvise failed", "ptr=%p size=%zu", ptr, size); |
| } |
| |
// TouchMemory prefaults non-anon shared memory by reading one byte
// at every page-size step of [ptr, ptr + size). Nothing is written.
static void TouchMemory(void* ptr, size_t size)
{
	const size_t step = getpagesize();
	volatile char* bytes = static_cast<volatile char*>(ptr);
	size_t off = 0;
	while (off < size) {
		// Volatile read so that the access is not optimized away.
		(void)bytes[off];
		off += step;
	}
}
| |
| #if SYZ_EXECUTOR_USES_FORK_SERVER |
| static void SnapshotPrepareParent() |
| { |
| // This allows access to the output region. |
| CoverAccessScope scope(nullptr); |
| TouchMemory((char*)output_data + output_size - kOutputPopulate, kOutputPopulate); |
| // Notify SnapshotStart that we finished prefaulting memory in the parent. |
| output_data->completed = 1; |
| // Wait for the request to come, so that we give it full time slice to execute. |
| // This process will start waiting for the child as soon as we return. |
| while (ivs.hdr->state != rpc::SnapshotState::Execute) |
| ; |
| } |
| #endif |
| |
| static void SnapshotStart() |
| { |
| debug("SnapshotStart\n"); |
| CoverAccessScope scope(nullptr); |
| // Prefault as much memory as we can before the snapshot is taken. |
| // Also pre-create some threads and let them block. |
| // This is intended to make execution after each snapshot restore faster, |
| // as we won't need to do that duplicate work again and again. |
| flag_threaded = true; |
| for (size_t i = 0; i < kThreadsPopulate; i++) { |
| thread_t* th = &threads[i]; |
| thread_create(th, i, flag_coverage); |
| if (flag_coverage) |
| PopulateMemory(th->cov.data, kCoveragePopulate); |
| } |
| TouchMemory((char*)output_data + output_size - kOutputPopulate, kOutputPopulate); |
| TouchMemory(ivs.input, kInputPopulate); |
| PopulateMemory(&flag_coverage, kGlobalsPopulate); |
| PopulateMemory((void*)SYZ_DATA_OFFSET, kDataPopulate); |
| sleep_ms(100); // let threads start and block |
| // Wait for the parent process to prefault as well. |
| while (!output_data->completed) |
| sleep_ms(1); |
| // Notify host that we are ready to be snapshotted. |
| SnapshotSetState(rpc::SnapshotState::Ready); |
| // Snapshot is restored here. |
| // First time we may loop here while the snapshot is taken, |
| // but afterwards we should be restored when the state is already Execute. |
| // Note: we don't use sleep in the loop because we may be snapshotted while in the sleep syscall. |
| // As the result each execution after snapshot restore will be slower as it will need to finish |
| // the sleep and return from the syscall. |
| while (ivs.hdr->state == rpc::SnapshotState::Ready) |
| ; |
| if (ivs.hdr->state == rpc::SnapshotState::Snapshotted) { |
| // First time around, just acknowledge and wait for snapshot restart. |
| SnapshotSetState(rpc::SnapshotState::Executed); |
| for (;;) |
| sleep(1000); |
| } |
| // Resumed for program execution. |
| output_data->Reset(); |
| auto msg = flatbuffers::GetRoot<rpc::SnapshotRequest>(ivs.input); |
| execute_req req = { |
| .magic = kInMagic, |
| .id = 0, |
| .type = rpc::RequestType::Program, |
| .exec_flags = static_cast<uint64>(msg->exec_flags()), |
| .all_call_signal = msg->all_call_signal(), |
| .all_extra_signal = msg->all_extra_signal(), |
| }; |
| parse_execute(req); |
| output_data->num_calls.store(msg->num_calls(), std::memory_order_relaxed); |
| input_data = const_cast<uint8*>(msg->prog_data()->Data()); |
| } |
| |
| NORETURN static void SnapshotDone(bool failed) |
| { |
| debug("SnapshotDone\n"); |
| CoverAccessScope scope(nullptr); |
| uint32 num_calls = output_data->num_calls.load(std::memory_order_relaxed); |
| auto data = finish_output(output_data, 0, 0, num_calls, 0, 0, failed ? kFailStatus : 0, false, nullptr); |
| ivs.hdr->output_offset = data.data() - reinterpret_cast<volatile uint8_t*>(ivs.hdr); |
| ivs.hdr->output_size = data.size(); |
| SnapshotSetState(failed ? rpc::SnapshotState::Failed : rpc::SnapshotState::Executed); |
| // Wait to be restarted from the snapshot. |
| for (;;) |
| sleep(1000); |
| } |