blob: 30c10a6151190a383c02f81369821a42d927d2f8 [file] [log] [blame] [edit]
// Copyright 2015 syzkaller project authors. All rights reserved.
// Use of this source code is governed by Apache 2 LICENSE that can be found in the LICENSE file.
#include <fcntl.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/prctl.h>
#include <sys/syscall.h>
#include <unistd.h>
// KCOV trace modes (argument of KCOV_ENABLE): plain PC trace or comparison
// operand trace (used for comparison-guided fuzzing).
const unsigned long KCOV_TRACE_PC = 0;
const unsigned long KCOV_TRACE_CMP = 1;
// Argument of the KCOV_REMOTE_ENABLE ioctl; N is the number of remote handles.
template <int N>
struct kcov_remote_arg {
uint32 trace_mode; // KCOV_TRACE_PC or KCOV_TRACE_CMP
uint32 area_size; // coverage buffer size of remote (background) threads
uint32 num_handles; // number of entries in handles[]
uint32 pad;
uint64 common_handle;
uint64 handles[N];
};
// The two INIT_TRACE ioctls share command number 1 and differ only in the
// encoded argument size: 32-bit kernels use 32-bit coverage entries.
#define KCOV_INIT_TRACE32 _IOR('c', 1, uint32)
#define KCOV_INIT_TRACE64 _IOR('c', 1, uint64)
#define KCOV_ENABLE _IO('c', 100)
#define KCOV_DISABLE _IO('c', 101)
#define KCOV_REMOTE_ENABLE _IOW('c', 102, kcov_remote_arg<0>)
#define KCOV_SUBSYSTEM_COMMON (0x00ull << 56)
#define KCOV_SUBSYSTEM_USB (0x01ull << 56)
#define KCOV_SUBSYSTEM_MASK (0xffull << 56)
#define KCOV_INSTANCE_MASK (0xffffffffull)
static bool is_gvisor;
// Builds a kcov remote handle out of a subsystem id (top byte) and an
// instance id (low 32 bits). Returns 0 (an invalid handle) when either
// value has bits outside of its designated field.
static inline __u64 kcov_remote_handle(__u64 subsys, __u64 inst)
{
	bool subsys_valid = (subsys & ~KCOV_SUBSYSTEM_MASK) == 0;
	bool inst_valid = (inst & ~KCOV_INSTANCE_MASK) == 0;
	if (!subsys_valid || !inst_valid)
		return 0;
	return subsys | inst;
}
static bool detect_kernel_bitness();
static bool detect_gvisor();
// Linux-specific one-time process setup: die together with the parent,
// detect kernel properties and establish the main data mapping surrounded
// by PROT_NONE guard pages.
static void os_init(int argc, char** argv, char* data, size_t data_size)
{
// Make sure we are killed if the parent (fuzzer) process dies.
prctl(PR_SET_PDEATHSIG, SIGKILL, 0, 0, 0);
is_kernel_64_bit = detect_kernel_bitness();
is_gvisor = detect_gvisor();
// Surround the main data mapping with PROT_NONE pages to make virtual address layout more consistent
// across different configurations (static/non-static build) and C repros.
// One observed case before: executor had a mapping above the data mapping (output region),
// while C repros did not have that mapping above, as the result in one case VMA had next link,
// while in the other it didn't and it caused a bug to not reproduce with the C repro.
void* got = mmap(data - SYZ_PAGE_SIZE, SYZ_PAGE_SIZE, PROT_NONE, MAP_ANON | MAP_PRIVATE | MAP_FIXED, -1, 0);
if (data - SYZ_PAGE_SIZE != got)
failmsg("mmap of left data PROT_NONE page failed", "want %p, got %p", data - SYZ_PAGE_SIZE, got);
got = mmap(data, data_size, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_ANON | MAP_PRIVATE | MAP_FIXED, -1, 0);
if (data != got)
failmsg("mmap of data segment failed", "want %p, got %p", data, got);
got = mmap(data + data_size, SYZ_PAGE_SIZE, PROT_NONE, MAP_ANON | MAP_PRIVATE | MAP_FIXED, -1, 0);
if (data + data_size != got)
failmsg("mmap of right data PROT_NONE page failed", "want %p, got %p", data + data_size, got);
// A SIGCHLD handler makes sleep in loop exit immediately return with EINTR when a child exits.
struct sigaction act = {};
// The handler body is empty: we only need the EINTR effect on blocking calls.
act.sa_handler = [](int) {};
sigaction(SIGCHLD, &act, nullptr);
}
// Dispatches one syscall: either through the generated wrapper (c->call),
// which accepts up to 9 arguments, or through raw syscall(2), which takes
// at most 6.
static intptr_t execute_syscall(const call_t* c, intptr_t a[kMaxArgs])
{
	if (c->call == nullptr)
		return syscall(c->sys_nr, a[0], a[1], a[2], a[3], a[4], a[5]);
	return c->call(a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8]);
}
// Opens a fresh kcov descriptor, pins it at the well-known slot cov->fd and
// configures the trace size. Does not mmap the buffer and does not enable
// collection (see cover_mmap/cover_enable).
static void cover_open(cover_t* cov, bool extra)
{
	int fd = open("/sys/kernel/debug/kcov", O_RDWR);
	if (fd == -1)
		fail("open of /sys/kernel/debug/kcov failed");
	// Move the descriptor to the fixed slot expected by the rest of the executor.
	if (dup2(fd, cov->fd) < 0)
		// Fixed typo in the error message ("filed" -> "failed").
		failmsg("failed to dup cover fd", "from=%d, to=%d", fd, cov->fd);
	close(fd);
	// 32-bit kernels use 32-bit coverage entries, so the init ioctl differs.
	const int kcov_init_trace = is_kernel_64_bit ? KCOV_INIT_TRACE64 : KCOV_INIT_TRACE32;
	const int cover_size = extra ? kExtraCoverSize : kCoverSize;
	if (ioctl(cov->fd, kcov_init_trace, cover_size))
		fail("cover init trace write failed");
	// Buffer size in bytes: entry width matches the kernel word size.
	cov->mmap_alloc_size = cover_size * (is_kernel_64_bit ? 8 : 4);
}
// Intentionally a no-op on Linux (other OS backends protect the coverage
// buffer between executions here).
static void cover_protect(cover_t* cov)
{
}
// Intentionally a no-op on Linux, the counterpart of cover_protect.
static void cover_unprotect(cover_t* cov)
{
}
// Maps the kcov coverage buffer for cov->fd, surrounded by two PROT_NONE
// guard pages so that out-of-bounds accesses fault immediately.
// cover_open must have set cov->mmap_alloc_size beforehand.
static void cover_mmap(cover_t* cov)
{
if (cov->data != NULL)
fail("cover_mmap invoked on an already mmapped cover_t object");
if (cov->mmap_alloc_size == 0)
fail("cover_t structure is corrupted");
// Allocate kcov buffer plus two guard pages surrounding it.
char* mapped = (char*)mmap(NULL, cov->mmap_alloc_size + 2 * SYZ_PAGE_SIZE,
PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0);
if (mapped == MAP_FAILED)
exitf("failed to preallocate kcov buffer");
// Protect the guard pages.
int res = mprotect(mapped, SYZ_PAGE_SIZE, PROT_NONE);
if (res == -1)
exitf("failed to protect kcov guard page");
res = mprotect(mapped + SYZ_PAGE_SIZE + cov->mmap_alloc_size,
SYZ_PAGE_SIZE, PROT_NONE);
if (res == -1)
exitf("failed to protect kcov guard page");
// Now map the kcov buffer to the file, overwriting the existing mapping above.
cov->data = (char*)mmap(mapped + SYZ_PAGE_SIZE, cov->mmap_alloc_size,
PROT_READ | PROT_WRITE, MAP_SHARED | MAP_FIXED, cov->fd, 0);
if (cov->data == MAP_FAILED)
exitf("cover mmap failed");
cov->data_end = cov->data + cov->mmap_alloc_size;
// The first word of the buffer holds the entry count (see cover_collect),
// so coverage entries start one word in.
cov->data_offset = is_kernel_64_bit ? sizeof(uint64_t) : sizeof(uint32_t);
cov->pc_offset = 0;
}
// Starts coverage collection on cov->fd. collect_comps selects comparison
// tracing instead of PC tracing; extra enables remote coverage collected
// from kernel background threads on behalf of this process.
static void cover_enable(cover_t* cov, bool collect_comps, bool extra)
{
unsigned int kcov_mode = collect_comps ? KCOV_TRACE_CMP : KCOV_TRACE_PC;
// The KCOV_ENABLE call should be fatal,
// but in practice ioctl fails with assorted errors (9, 14, 25),
// so we use exitf.
if (!extra) {
if (ioctl(cov->fd, KCOV_ENABLE, kcov_mode))
exitf("cover enable write trace failed, mode=%d", kcov_mode);
return;
}
kcov_remote_arg<1> arg = {
.trace_mode = kcov_mode,
// Coverage buffer size of background threads.
.area_size = kExtraCoverSize,
.num_handles = 1,
};
// NOTE(review): procid + 1 — presumably so that proc 0 does not produce a
// zero (invalid) handle instance; confirm against kcov_remote_handle.
arg.common_handle = kcov_remote_handle(KCOV_SUBSYSTEM_COMMON, procid + 1);
arg.handles[0] = kcov_remote_handle(KCOV_SUBSYSTEM_USB, procid + 1);
if (ioctl(cov->fd, KCOV_REMOTE_ENABLE, &arg))
exitf("remote cover enable write trace failed");
}
// Resets the coverage entry counter so that the next collection starts
// from an empty buffer. cov == 0 means "the current thread's coverage".
static void cover_reset(cover_t* cov)
{
// Callers in common_linux.h don't check this flag.
if (!flag_coverage)
return;
if (cov == 0) {
if (current_thread == 0)
fail("cover_reset: current_thread == 0");
cov = &current_thread->cov;
}
// Zero the counter at the start of the buffer. NOTE(review): this is a
// 64-bit store even on 32-bit kernels (where the counter is 32-bit, see
// cover_collect); presumably the extra bytes only clobber a stale entry
// slot — confirm.
*(uint64*)cov->data = 0;
}
// Reads the number of collected coverage entries from the first word of the
// kcov buffer; the counter width matches the kernel word size.
static void cover_collect(cover_t* cov)
{
	cov->size = is_kernel_64_bit ? *(uint64*)cov->data : (uint64)(*(uint32*)cov->data);
}
// 32-bit PC overload: edge coverage is always usable on Linux.
static bool use_cover_edges(uint32 pc)
{
	(void)pc; // unconditionally true for 32-bit PCs
	return true;
}
// Returns true if addr falls into a known kernel data region (only known
// for x86; gVisor exposes no such address range).
static bool is_kernel_data(uint64 addr)
{
if (is_gvisor)
return false;
#if GOARCH_386 || GOARCH_amd64
// This range corresponds to the first 1TB of the physical memory mapping,
// see Documentation/arch/x86/x86_64/mm.rst.
return addr >= 0xffff880000000000ull && addr < 0xffff890000000000ull;
#else
return false;
#endif
}
// Classifies pc as a kernel text address.
// Returns >0 for yes, <0 for no, 0 for don't know (non-x86 or gVisor).
static int is_kernel_pc(uint64 pc)
{
if (is_gvisor)
return 0;
#if GOARCH_386 || GOARCH_amd64
// Text/modules range for x86_64.
return pc >= 0xffffffff80000000ull && pc < 0xffffffffff000000ull ? 1 : -1;
#else
return 0;
#endif
}
// 64-bit PC overload: edge coverage requires the coverage to be an ordered
// trace of PCs, which gVisor does not provide.
static bool use_cover_edges(uint64 pc)
{
#if GOARCH_amd64 || GOARCH_arm64
if (is_gvisor)
return false; // gvisor coverage is not a trace, so producing edges won't work
#endif
return true;
}
// Returns true if the running kernel is 64-bit. Trivially true for a 64-bit
// userspace build; for a 32-bit build we inspect the address width printed
// in /proc/kallsyms.
static bool detect_kernel_bitness()
{
if (sizeof(void*) == 8)
return true;
// It turns out to be surprisingly hard to understand if the kernel underneath is 64-bits.
// A common method is to look at uname.machine. But it is produced in some involved ways,
// and we will need to know about all strings it returns and in the end it can be overridden
// during build and lie (and there are known precedents of this).
// So instead we look at size of addresses in /proc/kallsyms.
bool wide = true;
int fd = open("/proc/kallsyms", O_RDONLY);
if (fd != -1) {
char buf[16];
// A 32-bit kernel prints 8 hex digits per address, so the 9th character
// of the first line is the separator; default to 64-bit otherwise.
if (read(fd, buf, sizeof(buf)) == sizeof(buf) &&
(buf[8] == ' ' || buf[8] == '\t'))
wide = false;
close(fd);
}
debug("detected %d-bit kernel\n", wide ? 64 : 32);
return wide;
}
// Heuristic gVisor detection: look for its signature banner at the start
// of the kernel log.
static bool detect_gvisor()
{
	char buf[64] = {};
	// 3 stands for undeclared SYSLOG_ACTION_READ_ALL.
	syscall(__NR_syslog, 3, buf, sizeof(buf) - 1);
	// This is a first line of gvisor dmesg.
	const char* match = strstr(buf, "Starting gVisor");
	return match != nullptr;
}
// One does not simply exit.
// _exit can in fact fail.
// syzkaller did manage to generate a seccomp filter that prohibits exit_group syscall.
// Previously, we got into infinite recursion via segv_handler in such a case
// and corrupted output_data, which does matter in our case since it is shared
// with fuzzer process. Loop infinitely instead. Parent will kill us.
// But one does not simply loop either. Compilers are sure that _exit never returns,
// so they remove all code after _exit as dead. Call _exit via volatile indirection.
// And this does not work as well. _exit has own handling of failing exit_group
// in the form of HLT instruction, it will divert control flow from our loop.
// So call the syscall directly.
NORETURN void doexit(int status)
{
// volatile counter keeps the fallback loop from being optimized away.
volatile unsigned i;
syscall(__NR_exit_group, status);
for (i = 0;; i++) {
}
}
// If we need to kill just a single thread (e.g. after cloning), exit_group is not
// the right choice - it will kill all threads, which might eventually lead to
// unnecessary SYZFAIL errors. See doexit above for why we spin instead of
// trusting the syscall to never return.
NORETURN void doexit_thread(int status)
{
	syscall(__NR_exit, status);
	// The volatile counter keeps the compiler from removing the infinite loop.
	volatile unsigned spin = 0;
	for (;;)
		spin++;
}
#define SYZ_HAVE_KCSAN 1
// Writes a KCSAN function filter list to debugfs. If suppress is true,
// reports in the given frames are suppressed; otherwise only reports in
// those frames are shown (whitelist mode).
static void setup_kcsan_filterlist(char** frames, int nframes, bool suppress)
{
int fd = open("/sys/kernel/debug/kcsan", O_WRONLY);
if (fd == -1)
fail("failed to open kcsan debugfs file");
printf("%s KCSAN reports in functions: ",
suppress ? "suppressing" : "only showing");
// Switch the filter to whitelist mode when we only want to show these frames.
if (!suppress)
dprintf(fd, "whitelist\n");
for (int i = 0; i < nframes; ++i) {
printf("'%s' ", frames[i]);
// NOTE(review): '!' presumably adds the function to the filter list —
// verify against the kernel's KCSAN debugfs interface.
dprintf(fd, "!%s\n", frames[i]);
}
printf("\n");
close(fd);
}
// Feature check for NIC VF support: probes for the expected PCI device in
// sysfs. Returns NULL if available, otherwise a reason string.
static const char* setup_nicvf()
{
// This feature has custom checking procedure rather than just rely on running
// a simple program with this feature enabled b/c find_vf_interface cannot be made
// failing. It searches for the nic in init namespace, but then the nic is moved
// to one of the testing namespaces, so if number of procs is more than the number of devices,
// then some of them won't find a nic (the code is also racy, more than one proc
// can find the same device and then moving it will fail for all but one).
// So we have to make find_vf_interface non-failing in case of failures,
// which means we cannot use it for feature checking.
int fd = open("/sys/bus/pci/devices/0000:00:11.0/", O_RDONLY | O_NONBLOCK);
if (fd == -1)
return "PCI device 0000:00:11.0 is not available";
close(fd);
return NULL;
}
// Feature check for devlink PCI support: probes for the expected PCI device
// in sysfs (see the comment in setup_nicvf for why we probe sysfs directly).
// Returns NULL if available, otherwise a reason string.
static const char* setup_devlink_pci()
{
	// See comment in setup_nicvf.
	int dev = open("/sys/bus/pci/devices/0000:00:10.0/", O_RDONLY | O_NONBLOCK);
	if (dev == -1)
		return "PCI device 0000:00:10.0 is not available";
	close(dev);
	return NULL;
}
// Feature check for delayed kcov mmap (kernel commit b3d7fe86fbd0):
// verifies that a kcov fd can be mmapped again after the first mapping is
// abandoned. Returns NULL on success or a reason string.
static const char* setup_delay_kcov()
{
	int fd = open("/sys/kernel/debug/kcov", O_RDWR);
	if (fd == -1)
		return "open of /sys/kernel/debug/kcov failed";
	close(fd);
	cover_t cov = {};
	cov.fd = kCoverFd;
	cover_open(&cov, false);
	cover_mmap(&cov);
	// Abandon the first mapping and map the same fd a second time.
	char* first = cov.data;
	cov.data = nullptr;
	cover_mmap(&cov);
	// If delayed kcov mmap is not supported by the kernel,
	// accesses to the second mapping will crash.
	// Use clock_gettime to check if it's mapped w/o crashing the process:
	// point the output at the second mapping so the kernel's copy_to_user
	// probes it and fails with EFAULT instead of the process faulting.
	// (Writing into a stack timespec would not touch the mapping at all
	// and the check would always pass.)
	const char* error = NULL;
	timespec* ts = (timespec*)cov.data;
	if (clock_gettime(CLOCK_MONOTONIC, ts)) {
		if (errno != EFAULT)
			fail("clock_gettime failed");
		error = "kernel commit b3d7fe86fbd0 is not present";
	} else {
		// The second mapping is real; unmap it together with its guard pages.
		munmap(cov.data - SYZ_PAGE_SIZE, cov.mmap_alloc_size + 2 * SYZ_PAGE_SIZE);
	}
	munmap(first - SYZ_PAGE_SIZE, cov.mmap_alloc_size + 2 * SYZ_PAGE_SIZE);
	close(cov.fd);
	return error;
}
#define SYZ_HAVE_FEATURES 1
// Per-feature setup procedures; each returns NULL when the feature is
// available/configured, or a reason string when it is not.
static feature_t features[] = {
{rpc::Feature::DelayKcovMmap, setup_delay_kcov},
{rpc::Feature::Fault, setup_fault},
{rpc::Feature::Leak, setup_leak},
{rpc::Feature::KCSAN, setup_kcsan},
{rpc::Feature::USBEmulation, setup_usb},
{rpc::Feature::LRWPANEmulation, setup_802154},
{rpc::Feature::BinFmtMisc, setup_binfmt_misc},
{rpc::Feature::Swap, setup_swap},
{rpc::Feature::NicVF, setup_nicvf},
{rpc::Feature::DevlinkPCI, setup_devlink_pci},
};