Merge "llkd: add live-lock daemon"
diff --git a/libcutils/include/private/android_filesystem_config.h b/libcutils/include/private/android_filesystem_config.h
index 8209167..3be8ad0 100644
--- a/libcutils/include/private/android_filesystem_config.h
+++ b/libcutils/include/private/android_filesystem_config.h
@@ -130,6 +130,7 @@
 #define AID_INCIDENTD 1067       /* incidentd daemon */
 #define AID_SECURE_ELEMENT 1068  /* secure element subsystem */
 #define AID_LMKD 1069            /* low memory killer daemon */
+#define AID_LLKD 1070            /* live lock daemon */
 /* Changes to this file must be made in AOSP, *not* in internal branches. */
 
 #define AID_SHELL 2000 /* adb and debug shell user */
diff --git a/llkd/Android.bp b/llkd/Android.bp
new file mode 100644
index 0000000..a6edd26
--- /dev/null
+++ b/llkd/Android.bp
@@ -0,0 +1,42 @@
+cc_library_headers {
+    name: "llkd_headers",
+
+    export_include_dirs: ["include"],
+}
+
+cc_library_static {
+    name: "libllkd",
+
+    srcs: [
+        "libllkd.cpp",
+    ],
+
+    shared_libs: [
+        "libbase",
+        "libcutils",
+        "liblog",
+    ],
+
+    export_include_dirs: ["include"],
+
+    cflags: ["-Werror"],
+}
+
+cc_binary {
+    name: "llkd",
+
+    srcs: [
+        "llkd.cpp",
+    ],
+    shared_libs: [
+        "libbase",
+        "libcutils",
+        "liblog",
+    ],
+    static_libs: [
+        "libllkd",
+    ],
+    cflags: ["-Werror"],
+
+    init_rc: ["llkd.rc"],
+}
diff --git a/llkd/OWNERS b/llkd/OWNERS
new file mode 100644
index 0000000..b6af537
--- /dev/null
+++ b/llkd/OWNERS
@@ -0,0 +1,2 @@
+salyzyn@google.com
+surenb@google.com
diff --git a/llkd/README.md b/llkd/README.md
new file mode 100644
index 0000000..146a998
--- /dev/null
+++ b/llkd/README.md
@@ -0,0 +1,116 @@
+Android Live-LocK Daemon
+========================
+
+Introduction
+------------
+
+Android Live-LocK Daemon (llkd) is used to catch kernel deadlocks and mitigate.
+
+Code is structured to allow integration into another service, either as part
+of the main loop, or spun off as a thread should that be necessary.  A default
+standalone implementation is provided by the llkd component.
+
+The 'C' interface from libllkd component is thus:
+
+    #include "llkd.h"
+    bool llkInit(const char* threadname) /* return true if enabled */
+    unsigned llkCheckMilliseconds(void)  /* ms to sleep for next check */
+
+If a threadname is provided, a thread will be automatically spawned, otherwise
+caller must call llkCheckMilliseconds in its main loop.  Function will return
+the period of time before the next expected call to this handler.
+
+Operations
+----------
+
+If a thread is in D or Z state with no forward progress for longer than
+ro.llk.timeout_ms, or ro.llk.[D|Z].timeout_ms, kill the process or parent
+process respectively.  If another scan shows the same process continues to
+exist, then have a confirmed live-lock condition and need to panic.  Panic
+the kernel in a manner to provide the greatest bugreporting details as to the
+condition.  Add an alarm self watchdog should llkd ever get locked up that is
+double the expected time to flow through the mainloop.  Sampling is every
+ro.llk.check_ms.
+
+Default will not monitor init, or [kthreadd] and all that [kthreadd] spawns.
+This reduces the effectiveness of llkd by limiting its coverage.  If there is
+value in covering [kthreadd] spawned threads, the requirement will be that
+the drivers not remain in a persistent 'D' state, or that they have mechanisms
+to recover the thread should it be killed externally (this is good driver
+coding hygiene, a common request to add such to publicly reviewed kernel.org
+maintained drivers).  For instance use wait_event_interruptible() instead of
+wait_event().  The blacklists can be adjusted accordingly if these
+conditions are met to cover kernel components.
+
+An accompanying gTest set has been added, and will set up a persistent D or Z
+process, with and without forward progress, but not in a live-lock state
+because that would require a buggy kernel, or a module or kernel modification
+to stimulate.  The test will check that llkd will mitigate first by killing
+the appropriate process.  D state is setup by vfork() waiting for exec() in
+child process.  Z state is setup by fork() and an un-waited for child process.
+Should be noted that both of these conditions should never happen on Android
+on purpose, and llkd effectively sweeps up processes that create these
+conditions.  If the test can, it will reconfigure llkd to expedite the test
+duration by adjusting the ro.llk.* Android properties.  Tests run the D state
+with some scheduling progress to ensure that ABA checking prevents false
+triggers.
+
+Android Properties
+------------------
+
+Android Properties llkd responds to (<prop>_ms parameters are in milliseconds):
+
+#### ro.config.low_ram
+default false, if true do not sysrq t (dump all threads).
+
+#### ro.llk.enable
+default false, allow live-lock daemon to be enabled.
+
+#### ro.khungtask.enable
+default false, allow [khungtask] daemon to be enabled.
+
+#### ro.llk.mlockall
+default true, enable call to mlockall().
+
+#### ro.khungtask.timeout
+default value 12 minutes, [khungtask] maximum timelimit.
+
+#### ro.llk.timeout_ms
+default 10 minutes, D or Z maximum timelimit, double this value and it sets
+the alarm watchdog for llkd.
+
+#### ro.llk.D.timeout_ms
+default ro.llk.timeout_ms, D maximum timelimit.
+
+#### ro.llk.Z.timeout_ms
+default ro.llk.timeout_ms, Z maximum timelimit.
+
+#### ro.llk.check_ms
+default 2 minutes samples of threads for D or Z.
+
+#### ro.llk.blacklist.process
+default 0,1,2 (kernel, init and [kthreadd]) plus process names
+init,[kthreadd],[khungtaskd],lmkd,lmkd.llkd,llkd,watchdogd,
+[watchdogd],[watchdogd/0],...,[watchdogd/<get_nprocs-1>].
+
+#### ro.llk.blacklist.parent
+default 0,2 (kernel and [kthreadd]).
+
+#### ro.llk.blacklist.uid
+default <empty>, comma separated list of uid numbers or names.
+
+Architectural Concerns
+----------------------
+
+- Figure out how to communicate the kernel panic better to bootstat canonical
+  boot reason determination.  This may require an alteration to bootstat, or
+  some logging from llkd.  Would like to see boot reason to be
+  watchdog,livelock as a minimum requirement.  Or more specifically would want
+  watchdog,livelock,device or watchdog,livelock,zombie be reported.
+  Currently reports panic,sysrq (user requested panic) or panic depending on
+  system support of pstore.
+- Create kernel module and associated gTest to actually test panic.
+- Create gTest to test out blacklist (ro.llk.blacklist.<properties> generally
+  not be inputs).  Could require more test-only interfaces to libllkd.
+- Speed up gTest using something else than ro.llk.<properties>, which should
+  not be inputs.
diff --git a/llkd/include/llkd.h b/llkd/include/llkd.h
new file mode 100644
index 0000000..2ae28ed
--- /dev/null
+++ b/llkd/include/llkd.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright (C) 2018 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _LLKD_H_
+#define _LLKD_H_
+
+#ifndef LOG_TAG
+#define LOG_TAG "livelock"
+#endif
+
+#include <stdbool.h>
+#include <sys/cdefs.h>
+
+__BEGIN_DECLS
+
+bool llkInit(const char* threadname); /* threadname NULL, not spawned */
+unsigned llkCheckMilliseconds(void);
+
+/* clang-format off */
+#define LLK_ENABLE_PROPERTY            "ro.llk.enable"
+#define LLK_ENABLE_DEFAULT             false
+#define KHT_ENABLE_PROPERTY            "ro.khungtask.enable"
+#define LLK_MLOCKALL_PROPERTY          "ro.llk.mlockall"
+#define LLK_MLOCKALL_DEFAULT           true
+#define LLK_TIMEOUT_MS_PROPERTY        "ro.llk.timeout_ms"
+#define KHT_TIMEOUT_PROPERTY           "ro.khungtask.timeout"
+#define LLK_D_TIMEOUT_MS_PROPERTY      "ro.llk.D.timeout_ms"
+#define LLK_Z_TIMEOUT_MS_PROPERTY      "ro.llk.Z.timeout_ms"
+#define LLK_CHECK_MS_PROPERTY          "ro.llk.check_ms"
+/* LLK_CHECK_MS_DEFAULT = actual timeout_ms / LLK_CHECKS_PER_TIMEOUT_DEFAULT */
+#define LLK_CHECKS_PER_TIMEOUT_DEFAULT 5
+#define LLK_BLACKLIST_PROCESS_PROPERTY "ro.llk.blacklist.process"
+#define LLK_BLACKLIST_PROCESS_DEFAULT  \
+    "0,1,2,init,[kthreadd],[khungtaskd],lmkd,lmkd.llkd,llkd,watchdogd,[watchdogd],[watchdogd/0]"
+#define LLK_BLACKLIST_PARENT_PROPERTY  "ro.llk.blacklist.parent"
+#define LLK_BLACKLIST_PARENT_DEFAULT   "0,2,[kthreadd]"
+#define LLK_BLACKLIST_UID_PROPERTY     "ro.llk.blacklist.uid"
+#define LLK_BLACKLIST_UID_DEFAULT      ""
+/* clang-format on */
+
+__END_DECLS
+
+#ifdef __cplusplus
+extern "C++" { /* In case this included wrapped with __BEGIN_DECLS */
+
+#include <chrono>
+
+__BEGIN_DECLS
+/* C++ code allowed to not specify threadname argument for this C linkage */
+bool llkInit(const char* threadname = nullptr);
+__END_DECLS
+std::chrono::milliseconds llkCheck(bool checkRunning = false);
+
+/* clang-format off */
+#define LLK_TIMEOUT_MS_DEFAULT  std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::minutes(10))
+#define LLK_TIMEOUT_MS_MINIMUM  std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::seconds(10))
+#define LLK_CHECK_MS_MINIMUM    std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::seconds(1))
+/* clang-format on */
+
+} /* extern "C++" */
+#endif /* __cplusplus */
+
+#endif /* _LLKD_H_ */
diff --git a/llkd/libllkd.cpp b/llkd/libllkd.cpp
new file mode 100644
index 0000000..b25eb06
--- /dev/null
+++ b/llkd/libllkd.cpp
@@ -0,0 +1,1159 @@
+/*
+ * Copyright (C) 2018 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "llkd.h"
+
+#include <ctype.h>
+#include <dirent.h>  // opendir() and readdir()
+#include <errno.h>
+#include <fcntl.h>
+#include <pthread.h>
+#include <pwd.h>  // getpwuid()
+#include <signal.h>
+#include <stdint.h>
+#include <sys/cdefs.h>  // ___STRING, __predict_true() and __predict_false()
+#include <sys/mman.h>   // mlockall()
+#include <sys/prctl.h>
+#include <sys/stat.h>     // lstat()
+#include <sys/syscall.h>  // __NR_getdents64
+#include <sys/sysinfo.h>  // get_nprocs_conf()
+#include <sys/types.h>
+#include <time.h>
+#include <unistd.h>
+
+#include <chrono>
+#include <ios>
+#include <sstream>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+
+#include <android-base/file.h>
+#include <android-base/logging.h>
+#include <android-base/parseint.h>
+#include <android-base/properties.h>
+#include <android-base/strings.h>
+#include <cutils/android_get_control_file.h>
+#include <log/log_main.h>
+
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x)))
+
+#define TASK_COMM_LEN 16  // internal kernel, not uapi, from .../linux/include/linux/sched.h
+
+using namespace std::chrono_literals;
+using namespace std::chrono;
+
+namespace {
+
+constexpr pid_t kernelPid = 0;
+constexpr pid_t initPid = 1;
+constexpr pid_t kthreaddPid = 2;
+
+constexpr char procdir[] = "/proc/";
+
+// Configuration
+milliseconds llkUpdate;                              // last check ms signature
+milliseconds llkCycle;                               // ms to next thread check
+bool llkEnable = LLK_ENABLE_DEFAULT;                 // llk daemon enabled
+bool llkRunning = false;                             // thread is running
+bool llkMlockall = LLK_MLOCKALL_DEFAULT;             // run mlocked
+milliseconds llkTimeoutMs = LLK_TIMEOUT_MS_DEFAULT;  // default timeout
+enum { llkStateD, llkStateZ, llkNumStates };         // state indexes
+milliseconds llkStateTimeoutMs[llkNumStates];        // timeout override for each detection state
+milliseconds llkCheckMs;                             // checking interval to inspect any
+                                                     // persistent live-locked states
+bool llkLowRam;                                      // ro.config.low_ram
+bool khtEnable = LLK_ENABLE_DEFAULT;                 // [khungtaskd] panic
+// [khungtaskd] should have a timeout beyond the granularity of llkTimeoutMs.
+// Provides a wide angle of margin b/c khtTimeout is also its granularity.
+seconds khtTimeout = duration_cast<seconds>(llkTimeoutMs * (1 + LLK_CHECKS_PER_TIMEOUT_DEFAULT) /
+                                            LLK_CHECKS_PER_TIMEOUT_DEFAULT);
+
+// Blacklist variables, initialized with comma separated lists of high false
+// positive and/or dangerous references, e.g. without self restart, for pid,
+// ppid, name and uid:
+
+// list of pids, or tids or names to skip. kernel pid (0), init pid (1),
+// [kthreadd] pid (2), ourselves, "init", "[kthreadd]", "lmkd", "llkd" or
+// combinations of watchdogd in kernel and user space.
+std::unordered_set<std::string> llkBlacklistProcess;
+// list of parent pids, comm or cmdline names to skip. default:
+// kernel pid (0), [kthreadd] (2), or ourselves, enforced and implied
+std::unordered_set<std::string> llkBlacklistParent;
+// list of uids, and uid names, to skip, default nothing
+std::unordered_set<std::string> llkBlacklistUid;
+
+class dir {
+  public:
+    enum level { proc, task, numLevels };
+
+  private:
+    int fd;
+    size_t available_bytes;
+    dirent* next;
+    // each directory level picked to be just north of 4K in size
+    static constexpr size_t buffEntries = 15;
+    static dirent buff[numLevels][buffEntries];
+
+    bool fill(enum level index) {
+        if (index >= numLevels) return false;
+        if (available_bytes != 0) return true;
+        if (__predict_false(fd < 0)) return false;
+        // getdents64 has no libc wrapper
+        auto rc = TEMP_FAILURE_RETRY(syscall(__NR_getdents64, fd, buff[index], sizeof(buff[0]), 0));
+        if (rc <= 0) return false;
+        available_bytes = rc;
+        next = buff[index];
+        return true;
+    }
+
+  public:
+    dir() : fd(-1), available_bytes(0), next(nullptr) {}
+
+    explicit dir(const char* directory)
+        : fd(__predict_true(directory != nullptr)
+                 ? ::open(directory, O_CLOEXEC | O_DIRECTORY | O_RDONLY)
+                 : -1),
+          available_bytes(0),
+          next(nullptr) {}
+
+    explicit dir(const std::string&& directory)
+        : fd(::open(directory.c_str(), O_CLOEXEC | O_DIRECTORY | O_RDONLY)),
+          available_bytes(0),
+          next(nullptr) {}
+
+    explicit dir(const std::string& directory)
+        : fd(::open(directory.c_str(), O_CLOEXEC | O_DIRECTORY | O_RDONLY)),
+          available_bytes(0),
+          next(nullptr) {}
+
+    // Don't need any copy or move constructors.
+    explicit dir(const dir& c) = delete;
+    explicit dir(dir& c) = delete;
+    explicit dir(dir&& c) = delete;
+
+    ~dir() {
+        if (fd >= 0) {
+            ::close(fd);
+        }
+    }
+
+    operator bool() const { return fd >= 0; }
+
+    void reset(void) {
+        if (fd >= 0) {
+            ::close(fd);
+            fd = -1;
+            available_bytes = 0;
+            next = nullptr;
+        }
+    }
+
+    dir& reset(const char* directory) {
+        reset();
+        // available_bytes will _always_ be zero here as its value is
+        // intimately tied to fd < 0 or not.
+        fd = ::open(directory, O_CLOEXEC | O_DIRECTORY | O_RDONLY);
+        return *this;
+    }
+
+    void rewind(void) {
+        if (fd >= 0) {
+            ::lseek(fd, off_t(0), SEEK_SET);
+            available_bytes = 0;
+            next = nullptr;
+        }
+    }
+
+    dirent* read(enum level index = proc, dirent* def = nullptr) {
+        if (!fill(index)) return def;
+        auto ret = next;
+        available_bytes -= next->d_reclen;
+        next = reinterpret_cast<dirent*>(reinterpret_cast<char*>(next) + next->d_reclen);
+        return ret;
+    }
+} llkTopDirectory;
+
+dirent dir::buff[dir::numLevels][dir::buffEntries];
+
+// helper functions
+
+bool llkIsMissingExeLink(pid_t tid) {  // true only if readlink of /proc/<tid>/exe fails ENOENT
+    char c;  // one byte of buffer, only the call's success or errno is inspected
+    // CAP_SYS_PTRACE is required to prevent ret == -1, but ENOENT is signal
+    auto ret = ::readlink((procdir + std::to_string(tid) + "/exe").c_str(), &c, sizeof(c));
+    return (ret == -1) && (errno == ENOENT);
+}
+
+// Common routine where caller accepts empty content as error/passthrough.
+// Reduces the churn of reporting read errors in the callers.
+std::string ReadFile(std::string&& path) {
+    std::string content;
+    if (!android::base::ReadFileToString(path, &content)) {
+        PLOG(DEBUG) << "Read " << path << " failed";
+        content = "";
+    }
+    return content;
+}
+
+std::string llkProcGetName(pid_t tid, const char* node = "/cmdline") {
+    std::string content = ReadFile(procdir + std::to_string(tid) + node);
+    static constexpr char needles[] = " \t\r\n";  // including trailing nul
+    auto pos = content.find_first_of(needles, 0, sizeof(needles));
+    if (pos != std::string::npos) {
+        content.erase(pos);
+    }
+    return content;
+}
+
+uid_t llkProcGetUid(pid_t tid) {
+    // Get the process' uid.  The following read from /status is admittedly
+    // racy, prone to corruption due to shape-changes.  The consequences are
+    // not catastrophic as we sample a few times before taking action.
+    //
+    // If /loginuid worked on reliably, or on Android (all tasks report -1)...
+    // Android lmkd causes /cgroup to contain memory:/<dom>/uid_<uid>/pid_<pid>
+    // which is tighter, but also not reliable.
+    std::string content = ReadFile(procdir + std::to_string(tid) + "/status");
+    static constexpr char Uid[] = "\nUid:";
+    auto pos = content.find(Uid);
+    if (pos == std::string::npos) {
+        return -1;
+    }
+    pos += ::strlen(Uid);
+    while ((pos < content.size()) && ::isblank(content[pos])) {
+        ++pos;
+    }
+    content.erase(0, pos);
+    for (pos = 0; (pos < content.size()) && ::isdigit(content[pos]); ++pos) {
+        ;
+    }
+    // Content of form 'Uid:	0	0	0	0', newline is error
+    if ((pos >= content.size()) || !::isblank(content[pos])) {
+        return -1;
+    }
+    content.erase(pos);
+    uid_t ret;
+    if (!android::base::ParseInt(content, &ret, uid_t(0))) {
+        return -1;
+    }
+    return ret;
+}
+
+struct proc {
+    pid_t tid;                     // monitored thread id (in Z or D state).
+    nanoseconds schedUpdate;       // /proc/<tid>/sched "se.avg.lastUpdateTime",
+    uint64_t nrSwitches;           // /proc/<tid>/sched "nr_switches" for
+                                   // refined ABA problem detection, determine
+                                   // forward scheduling progress.
+    milliseconds update;           // llkUpdate millisecond signature of last.
+    milliseconds count;            // duration in state.
+    pid_t pid;                     // /proc/<pid> before iterating through
+                                   // /proc/<pid>/task/<tid> for threads.
+    pid_t ppid;                    // /proc/<tid>/stat field 4 parent pid.
+    uid_t uid;                     // /proc/<tid>/status Uid: field, uid_t(-1) until known.
+    unsigned time;                 // sum of /proc/<tid>/stat field 14 utime &
+                                   // 15 stime for coarse ABA problem detection.
+    std::string cmdline;           // cached /cmdline content
+    char state;                    // /proc/<tid>/stat field 3: Z or D
+                                   // (others we do not monitor: S, R, T or ?)
+    char comm[TASK_COMM_LEN + 3];  // space for adding '[' and ']'
+    bool exeMissingValid;          // exeMissing has been cached
+    bool cmdlineValid;             // cmdline has been cached
+    bool updated;                  // cleared before monitoring pass.
+    bool killed;                   // sent a kill to this thread, next panic...
+
+    void setComm(const char* _comm) { strncpy(comm + 1, _comm, sizeof(comm) - 2); }
+
+    proc(pid_t tid, pid_t pid, pid_t ppid, const char* _comm, int time, char state)
+        : tid(tid),
+          schedUpdate(0),
+          nrSwitches(0),
+          update(llkUpdate),
+          count(0),
+          pid(pid),
+          ppid(ppid),
+          uid(-1),
+          time(time),
+          state(state),
+          exeMissingValid(false),
+          cmdlineValid(false),
+          updated(true),
+          killed(false) {
+        memset(comm, '\0', sizeof(comm));
+        setComm(_comm);
+    }
+
+    const char* getComm(void) {  // comm, wrapped in '[' ']' when the exe link is missing
+        if (comm[1] == '\0') {  // comm Valid?
+            strncpy(comm + 1, llkProcGetName(tid, "/comm").c_str(), sizeof(comm) - 2);
+        }
+        if (!exeMissingValid) {
+            if (llkIsMissingExeLink(tid)) {
+                comm[0] = '[';
+            }
+            exeMissingValid = true;
+        }
+        size_t len = strlen(comm + 1);
+        if (__predict_true(len < (sizeof(comm) - 1))) {
+            if (comm[0] == '[') {
+                if ((comm[len] != ']') && __predict_true(len < (sizeof(comm) - 2))) {
+                    comm[++len] = ']';
+                    comm[++len] = '\0';
+                }
+            } else {
+                if (comm[len] == ']') {
+                    comm[len] = '\0';
+                }
+            }
+        }
+        return &comm[comm[0] != '['];  // skip the unused comm[0] slot when not bracketed
+    }
+
+    const char* getCmdline(void) {  // first token of /cmdline, read once then cached
+        if (!cmdlineValid) {
+            cmdline = llkProcGetName(tid);
+            cmdlineValid = true;
+        }
+        return cmdline.c_str();
+    }
+
+    uid_t getUid(void) {  // uid_t is unsigned: test the unknown (-1) sentinel explicitly
+        if ((uid == 0) || (uid == static_cast<uid_t>(-1))) {  // unknown, or root (may setuid())
+            uid = llkProcGetUid(tid);
+        }
+        return uid;
+    }
+
+    void reset(void) {  // reset cache, if we detected pid rollover
+        uid = -1;
+        state = '?';
+        cmdline = "";
+        comm[0] = '\0';
+        exeMissingValid = false;
+        cmdlineValid = false;
+    }
+};
+
+std::unordered_map<pid_t, proc> tids;
+
+// Check range and setup defaults, in order of propagation:
+//     llkTimeoutMs
+//     llkCheckMs
+//     ...
+// KISS to keep it all self-contained, and called multiple times as parameters
+// are interpreted so that defaults, llkCheckMs and llkCycle make sense.
+void llkValidate() {
+    if (llkTimeoutMs == 0ms) {
+        llkTimeoutMs = LLK_TIMEOUT_MS_DEFAULT;
+    }
+    llkTimeoutMs = std::max(llkTimeoutMs, LLK_TIMEOUT_MS_MINIMUM);
+    if (llkCheckMs == 0ms) {
+        llkCheckMs = llkTimeoutMs / LLK_CHECKS_PER_TIMEOUT_DEFAULT;
+    }
+    llkCheckMs = std::min(llkCheckMs, llkTimeoutMs);
+
+    for (size_t state = 0; state < ARRAY_SIZE(llkStateTimeoutMs); ++state) {
+        if (llkStateTimeoutMs[state] == 0ms) {
+            llkStateTimeoutMs[state] = llkTimeoutMs;
+        }
+        llkStateTimeoutMs[state] =
+            std::min(std::max(llkStateTimeoutMs[state], LLK_TIMEOUT_MS_MINIMUM), llkTimeoutMs);
+        llkCheckMs = std::min(llkCheckMs, llkStateTimeoutMs[state]);
+    }
+
+    llkCheckMs = std::max(llkCheckMs, LLK_CHECK_MS_MINIMUM);
+    if (llkCycle == 0ms) {
+        llkCycle = llkCheckMs;
+    }
+    llkCycle = std::min(llkCycle, llkCheckMs);
+}
+
+milliseconds llkGetTimespecDiffMs(timespec* from, timespec* to) {
+    return duration_cast<milliseconds>(seconds(to->tv_sec - from->tv_sec)) +
+           duration_cast<milliseconds>(nanoseconds(to->tv_nsec - from->tv_nsec));
+}
+
+std::string llkProcGetName(pid_t tid, const char* comm, const char* cmdline) {
+    if ((cmdline != nullptr) && (*cmdline != '\0')) {
+        return cmdline;
+    }
+    if ((comm != nullptr) && (*comm != '\0')) {
+        return comm;
+    }
+
+    // UNLIKELY! Here because killed before we kill it?
+    // Assume change is afoot, do not call llkTidAlloc
+
+    // cmdline ?
+    std::string content = llkProcGetName(tid);
+    if (content.size() != 0) {
+        return content;
+    }
+    // Comm instead?
+    content = llkProcGetName(tid, "/comm");
+    if (llkIsMissingExeLink(tid) && (content.size() != 0)) {
+        return '[' + content + ']';
+    }
+    return content;
+}
+
+int llkKillOneProcess(pid_t pid, char state, pid_t tid, const char* tcomm = nullptr,
+                      const char* tcmdline = nullptr, const char* pcomm = nullptr,
+                      const char* pcmdline = nullptr) {
+    std::string forTid;
+    if (tid != pid) {
+        forTid = " for '" + llkProcGetName(tid, tcomm, tcmdline) + "' (" + std::to_string(tid) + ")";
+    }
+    LOG(INFO) << "Killing '" << llkProcGetName(pid, pcomm, pcmdline) << "' (" << pid
+              << ") to check forward scheduling progress in " << state << " state" << forTid;
+    // CAP_KILL required
+    errno = 0;
+    auto r = ::kill(pid, SIGKILL);
+    if (r) {
+        PLOG(ERROR) << "kill(" << pid << ")=" << r << ' ';
+    }
+
+    return r;
+}
+
+// Kill one process
+int llkKillOneProcess(pid_t pid, proc* tprocp) {
+    return llkKillOneProcess(pid, tprocp->state, tprocp->tid, tprocp->getComm(),
+                             tprocp->getCmdline());
+}
+
+// Kill one process specified by kprocp
+int llkKillOneProcess(proc* kprocp, proc* tprocp) {
+    if (kprocp == nullptr) {
+        return -2;
+    }
+
+    return llkKillOneProcess(kprocp->tid, tprocp->state, tprocp->tid, tprocp->getComm(),
+                             tprocp->getCmdline(), kprocp->getComm(), kprocp->getCmdline());
+}
+
+// Acquire file descriptor from environment, or open and cache it.
+// NB: cache is unnecessary in our current context, pedantically
+//     required to prevent leakage of file descriptors in the future.
+int llkFileToWriteFd(const std::string& file) {
+    static std::unordered_map<std::string, int> cache;
+    auto search = cache.find(file);
+    if (search != cache.end()) return search->second;
+    auto fd = android_get_control_file(file.c_str());
+    if (fd >= 0) return fd;  // descriptor from the environment, returned without caching
+    fd = TEMP_FAILURE_RETRY(::open(file.c_str(), O_WRONLY | O_CLOEXEC));
+    if (fd >= 0) cache.emplace(std::make_pair(file, fd));  // only successful opens are cached
+    return fd;  // negative if the open failed
+}
+
+// Wrap android::base::WriteStringToFile to use android_get_control_file.
+bool llkWriteStringToFile(const std::string& string, const std::string& file) {
+    auto fd = llkFileToWriteFd(file);
+    if (fd < 0) return false;
+    return android::base::WriteStringToFd(string, fd);
+}
+
+bool llkWriteStringToFileConfirm(const std::string& string, const std::string& file) {
+    auto fd = llkFileToWriteFd(file);
+    auto ret = (fd < 0) ? false : android::base::WriteStringToFd(string, fd);
+    std::string content;
+    if (!android::base::ReadFileToString(file, &content)) return ret;
+    return android::base::Trim(content) == string;
+}
+
+void llkPanicKernel(bool dump, pid_t tid) __noreturn;
+void llkPanicKernel(bool dump, pid_t tid) {
+    auto sysrqTriggerFd = llkFileToWriteFd("/proc/sysrq-trigger");
+    if (sysrqTriggerFd < 0) {
+        // DYB
+        llkKillOneProcess(initPid, 'R', tid);
+        // The answer to life, the universe and everything
+        ::exit(42);
+        // NOTREACHED
+    }
+    ::sync();
+    if (dump) {
+        // Show all locks that are held
+        android::base::WriteStringToFd("d", sysrqTriggerFd);
+        // This can trigger hardware watchdog, that is somewhat _ok_.
+        // But useless if pstore configured for <256KB, low ram devices ...
+        if (!llkLowRam) {
+            android::base::WriteStringToFd("t", sysrqTriggerFd);
+        }
+        ::usleep(200000);  // let everything settle
+    }
+    android::base::WriteStringToFd("c", sysrqTriggerFd);
+    // NOTREACHED
+    // DYB
+    llkKillOneProcess(initPid, 'R', tid);
+    // I sat at my desk, stared into the garden and thought '42 will do'.
+    // I typed it out. End of story
+    ::exit(42);
+    // NOTREACHED
+}
+
+void llkAlarmHandler(int) {
+    llkPanicKernel(false, ::getpid());
+}
+
+milliseconds GetUintProperty(const std::string& key, milliseconds def) {
+    return milliseconds(android::base::GetUintProperty(key, static_cast<uint64_t>(def.count()),
+                                                       static_cast<uint64_t>(def.max().count())));
+}
+
+seconds GetUintProperty(const std::string& key, seconds def) {
+    return seconds(android::base::GetUintProperty(key, static_cast<uint64_t>(def.count()),
+                                                  static_cast<uint64_t>(def.max().count())));
+}
+
+proc* llkTidLookup(pid_t tid) {
+    auto search = tids.find(tid);
+    if (search == tids.end()) {
+        return nullptr;
+    }
+    return &search->second;
+}
+
+void llkTidRemove(pid_t tid) {
+    tids.erase(tid);
+}
+
+proc* llkTidAlloc(pid_t tid, pid_t pid, pid_t ppid, const char* comm, int time, char state) {
+    auto it = tids.emplace(std::make_pair(tid, proc(tid, pid, ppid, comm, time, state)));
+    return &it.first->second;
+}
+
+std::string llkFormat(milliseconds ms) {
+    auto sec = duration_cast<seconds>(ms);
+    std::ostringstream s;
+    s << sec.count() << '.';
+    auto f = s.fill('0');
+    auto w = s.width(3);
+    s << std::right << (ms - sec).count();
+    s.width(w);
+    s.fill(f);
+    s << 's';
+    return s.str();
+}
+
+std::string llkFormat(seconds s) {
+    return std::to_string(s.count()) + 's';
+}
+
+std::string llkFormat(bool flag) {
+    return flag ? "true" : "false";
+}
+
+std::string llkFormat(const std::unordered_set<std::string>& blacklist) {
+    std::string ret;  // entries joined with ',' in the set's (unspecified) iteration order
+    for (const auto& entry : blacklist) {  // by reference, no per-entry string copy
+        if (ret.size()) {
+            ret += ",";
+        }
+        ret += entry;
+    }
+    return ret;
+}
+
+// We only officially support comma separators, but wetware being what they
+// are will take some liberty and I do not believe they should be punished.
+std::unordered_set<std::string> llkSplit(const std::string& s,
+                                         const std::string& delimiters = ", \t:") {
+    std::unordered_set<std::string> result;
+
+    size_t base = 0;  // start of the token currently being extracted
+    size_t found;     // position of the next delimiter, npos once past the last one
+    while (true) {
+        found = s.find_first_of(delimiters, base);
+        result.emplace(s.substr(base, found - base));  // empty tokens are inserted too
+        if (found == s.npos) break;
+        base = found + 1;
+    }
+    return result;
+}
+
+bool llkSkipName(const std::string& name,
+                 const std::unordered_set<std::string>& blacklist = llkBlacklistProcess) {
+    if ((name.size() == 0) || (blacklist.size() == 0)) {
+        return false;
+    }
+
+    return blacklist.find(name) != blacklist.end();
+}
+
+bool llkSkipPid(pid_t pid) {
+    return llkSkipName(std::to_string(pid), llkBlacklistProcess);
+}
+
+bool llkSkipPpid(pid_t ppid) {
+    return llkSkipName(std::to_string(ppid), llkBlacklistParent);
+}
+
+bool llkSkipUid(uid_t uid) {
+    // Match by number?
+    if (llkSkipName(std::to_string(uid), llkBlacklistUid)) {
+        return true;
+    }
+
+    // Match by name?
+    auto pwd = ::getpwuid(uid);
+    return (pwd != nullptr) && __predict_true(pwd->pw_name != nullptr) &&
+           __predict_true(pwd->pw_name[0] != '\0') && llkSkipName(pwd->pw_name, llkBlacklistUid);
+}
+
+bool getValidTidDir(dirent* dp, std::string* piddir) {  // true: *piddir = procdir + d_name
+    if (!::isdigit(dp->d_name[0])) {
+        return false;
+    }
+
+    // Corner case can not happen in reality b/c of above ::isdigit check
+    if (__predict_false(dp->d_type != DT_DIR)) {
+        if (__predict_false(dp->d_type == DT_UNKNOWN)) {  // can't b/c procfs
+            struct stat st;
+            *piddir = procdir;
+            *piddir += dp->d_name;
+            return (lstat(piddir->c_str(), &st) == 0) && S_ISDIR(st.st_mode);  // not a bit test
+        }
+        return false;
+    }
+
+    *piddir = procdir;
+    *piddir += dp->d_name;
+    return true;
+}
+
+bool llkIsMonitorState(char state) {  // only 'D' and 'Z' task states are monitored
+    return (state == 'D') || (state == 'Z');
+}
+
+// Returns the integer that follows "<key>...:" in schedString, or -1 if the
+// key or its ':' is missing or the value fails to parse as a number >= 0.
+long long getSchedValue(const std::string& schedString, const char* key) {
+    auto pos = schedString.find(key);
+    if (pos == std::string::npos) {
+        return -1;
+    }
+    pos = schedString.find(':', pos);
+    if (__predict_false(pos == std::string::npos)) {
+        return -1;
+    }
+    while ((++pos < schedString.size()) && ::isblank(static_cast<unsigned char>(schedString[pos]))) {
+    }
+    // Hand ParseInt only the rest of this line rather than the rest of the
+    // file, so text following the number can not make the parse fail.
+    const auto value = schedString.substr(pos, schedString.find('\n', pos) - pos);
+    long long ret;
+    return android::base::ParseInt(value, &ret, static_cast<long long>(0)) ? ret : -1;
+}
+
+// Primary ABA mitigation: any change in scheduler statistics clears procp->count and killed.
+void llkCheckSchedUpdate(proc* procp, const std::string& piddir) {
+    // Audit finds /proc/<tid>/sched is just over 1K, and
+    // is rarely larger than 2K, even less on Android.
+    // For example, the "se.avg.lastUpdateTime" field we are
+    // interested in typically within the primary set in
+    // the first 1K.
+    //
+    // Proc entries can not be read >1K atomically via libbase,
+    // but if there are problems we assume at least a few
+    // samples of reads occur before we take any real action.
+    std::string schedString = ReadFile(piddir + "/sched");
+    if (schedString.size() == 0) {
+        // /schedstat is not as standardized, but in 3.1+
+        // Android devices, the third field is nr_switches
+        // from /sched:
+        schedString = ReadFile(piddir + "/schedstat");
+        if (schedString.size() == 0) {
+            return;
+        }
+        auto val = static_cast<unsigned long long>(-1);
+        if (((::sscanf(schedString.c_str(), "%*d %*d %llu", &val)) == 1) &&
+            (val != static_cast<unsigned long long>(-1)) && (val != 0) &&
+            (val != procp->nrSwitches)) {
+            procp->nrSwitches = val;
+            procp->count = 0ms;
+            procp->killed = false;
+        }
+        return;
+    }
+
+    auto val = getSchedValue(schedString, "\nse.avg.lastUpdateTime");
+    if (val == -1) {
+        val = getSchedValue(schedString, "\nse.avg.last_update_time");  // alternate spelling
+    }
+    if (val != -1) {
+        auto schedUpdate = nanoseconds(val);
+        if (schedUpdate != procp->schedUpdate) {
+            procp->schedUpdate = schedUpdate;
+            procp->count = 0ms;
+            procp->killed = false;
+        }
+    }
+
+    val = getSchedValue(schedString, "\nnr_switches");
+    if (val != -1) {
+        if (static_cast<uint64_t>(val) != procp->nrSwitches) {
+            procp->nrSwitches = val;
+            procp->count = 0ms;
+            procp->killed = false;
+        }
+    }
+}
+
+void llkLogConfig() {  // dump the effective configuration as one multi-line INFO record
+    LOG(INFO) << "ro.config.low_ram=" << llkFormat(llkLowRam)
+              << "\n" LLK_ENABLE_PROPERTY "=" << llkFormat(llkEnable)
+              << "\n" KHT_ENABLE_PROPERTY "=" << llkFormat(khtEnable)
+              << "\n" LLK_MLOCKALL_PROPERTY "=" << llkFormat(llkMlockall)
+              << "\n" KHT_TIMEOUT_PROPERTY "=" << llkFormat(khtTimeout)
+              << "\n" LLK_TIMEOUT_MS_PROPERTY "=" << llkFormat(llkTimeoutMs)
+              << "\n" LLK_D_TIMEOUT_MS_PROPERTY "=" << llkFormat(llkStateTimeoutMs[llkStateD])
+              << "\n" LLK_Z_TIMEOUT_MS_PROPERTY "=" << llkFormat(llkStateTimeoutMs[llkStateZ])
+              << "\n" LLK_CHECK_MS_PROPERTY "=" << llkFormat(llkCheckMs)
+              << "\n" LLK_BLACKLIST_PROCESS_PROPERTY "=" << llkFormat(llkBlacklistProcess)
+              << "\n" LLK_BLACKLIST_PARENT_PROPERTY "=" << llkFormat(llkBlacklistParent)
+              << "\n" LLK_BLACKLIST_UID_PROPERTY "=" << llkFormat(llkBlacklistUid);
+}
+
+void* llkThread(void* obj) {
+    LOG(INFO) << "started";
+    // Make sure this thread is blacklisted both by its tid and by its name.
+    const std::string tidText = std::to_string(::gettid());
+    if (!llkSkipName(tidText)) {
+        llkBlacklistProcess.emplace(tidText);
+    }
+    const std::string threadName = static_cast<const char*>(obj);  // obj carries the name
+    prctl(PR_SET_NAME, threadName.c_str());
+    if (__predict_false(!llkSkipName(threadName))) {
+        llkBlacklistProcess.insert(threadName);
+    }
+    // No longer modifying llkBlacklistProcess.
+    llkRunning = true;
+    llkLogConfig();
+    while (llkRunning) {
+        ::usleep(duration_cast<microseconds>(llkCheck(true)).count());
+    }
+    // NOTREACHED
+    LOG(INFO) << "exiting";
+    return nullptr;
+}
+
+}  // namespace
+
+milliseconds llkCheck(bool checkRunning) {  // one scan of procdir; callers sleep on the result
+    if (!llkEnable || (checkRunning != llkRunning)) {
+        return milliseconds::max();
+    }
+
+    // Reset internal watchdog, which is a healthy engineering margin of
+    // double the maximum wait or cycle time for the mainloop that calls us.
+    //
+    // This alarm is effectively the live lock detection of llkd, as
+    // we understandably can not monitor ourselves otherwise.
+    ::alarm(duration_cast<seconds>(llkTimeoutMs * 2).count());
+
+    // kernel jiffy precision fastest acquisition
+    static timespec last;
+    timespec now;
+    ::clock_gettime(CLOCK_MONOTONIC_COARSE, &now);
+    auto ms = llkGetTimespecDiffMs(&last, &now);
+    if (ms < llkCycle) {  // called before a full cycle elapsed: return what is left
+        return llkCycle - ms;
+    }
+    last = now;
+
+    LOG(VERBOSE) << "opendir(\"" << procdir << "\")";
+    if (__predict_false(!llkTopDirectory)) {
+        // gid containing AID_READPROC required
+        llkTopDirectory.reset(procdir);
+        if (__predict_false(!llkTopDirectory)) {
+            // Most likely reason we could be here is a resource limit.
+            // Keep our processing down to a minimum, but not so low that
+            // we do not recover in a timely manner should the issue be
+            // transitory.
+            LOG(DEBUG) << "opendir(\"" << procdir << "\") failed";
+            return llkTimeoutMs;
+        }
+    }
+
+    for (auto& it : tids) {  // entries still !updated after the scan are dropped below
+        it.second.updated = false;
+    }
+
+    auto prevUpdate = llkUpdate;
+    llkUpdate += ms;
+    ms -= llkCycle;
+    auto myPid = ::getpid();
+    auto myTid = ::gettid();
+    for (auto dp = llkTopDirectory.read(); dp != nullptr; dp = llkTopDirectory.read()) {
+        std::string piddir;
+
+        if (!getValidTidDir(dp, &piddir)) {
+            continue;
+        }
+
+        // Get the process tasks
+        std::string taskdir = piddir + "/task/";
+        int pid = -1;
+        LOG(VERBOSE) << "+opendir(\"" << taskdir << "\")";
+        dir taskDirectory(taskdir);
+        if (__predict_false(!taskDirectory)) {
+            LOG(DEBUG) << "+opendir(\"" << taskdir << "\") failed";
+        }
+        for (auto tp = taskDirectory.read(dir::task, dp); tp != nullptr;
+             tp = taskDirectory.read(dir::task)) {
+            if (!getValidTidDir(tp, &piddir)) {
+                continue;
+            }
+
+            // Get the process stat
+            std::string stat = ReadFile(piddir + "/stat");
+            if (stat.size() == 0) {
+                continue;
+            }
+            unsigned tid = -1;
+            char pdir[TASK_COMM_LEN + 1];
+            char state = '?';
+            unsigned ppid = -1;
+            unsigned utime = -1;
+            unsigned stime = -1;
+            int dummy;
+            pdir[0] = '\0';
+            // tid should not change value
+            auto match = ::sscanf(
+                stat.c_str(),
+                "%u (%" ___STRING(
+                    TASK_COMM_LEN) "[^)]) %c %u %*d %*d %*d %*d %*d %*d %*d %*d %*d %u %u %d",
+                &tid, pdir, &state, &ppid, &utime, &stime, &dummy);
+            if (pid == -1) {  // first task parsed in this directory supplies the pid
+                pid = tid;
+            }
+            LOG(VERBOSE) << "match " << match << ' ' << tid << " (" << pdir << ") " << state << ' '
+                         << ppid << " ... " << utime << ' ' << stime << ' ' << dummy;
+            if (match != 7) {  // not all seven fields converted
+                continue;
+            }
+
+            auto procp = llkTidLookup(tid);
+            if (procp == nullptr) {
+                procp = llkTidAlloc(tid, pid, ppid, pdir, utime + stime, state);
+            } else {
+                // comm can change ...
+                procp->setComm(pdir);
+                procp->updated = true;
+                // pid/ppid/tid wrap?
+                if (((procp->update != prevUpdate) && (procp->update != llkUpdate)) ||
+                    (procp->ppid != ppid) || (procp->pid != pid)) {
+                    procp->reset();
+                } else if (procp->time != (utime + stime)) {  // secondary ABA.
+                    // watching utime+stime granularity jiffy
+                    procp->state = '?';
+                }
+                procp->update = llkUpdate;
+                procp->pid = pid;
+                procp->ppid = ppid;
+                procp->time = utime + stime;
+                if (procp->state != state) {
+                    procp->count = 0ms;
+                    procp->killed = false;
+                    procp->state = state;
+                } else {
+                    procp->count += llkCycle;
+                }
+            }
+
+            // Filter checks in intuitive order of CPU cost to evaluate
+            // If tid unique continue, if ppid or pid unique break
+
+            if (pid == myPid) {
+                break;
+            }
+            if (!llkIsMonitorState(state)) {
+                continue;
+            }
+            if ((tid == myTid) || llkSkipPid(tid)) {
+                continue;
+            }
+            if (llkSkipPpid(ppid)) {
+                break;
+            }
+
+            if (llkSkipName(procp->getComm())) {
+                continue;
+            }
+            if (llkSkipName(procp->getCmdline())) {
+                break;
+            }
+
+            auto pprocp = llkTidLookup(ppid);
+            if (pprocp == nullptr) {
+                pprocp = llkTidAlloc(ppid, ppid, 0, "", 0, '?');
+            }
+            if ((pprocp != nullptr) && (llkSkipName(pprocp->getComm(), llkBlacklistParent) ||
+                                        llkSkipName(pprocp->getCmdline(), llkBlacklistParent))) {
+                break;
+            }
+
+            if ((llkBlacklistUid.size() != 0) && llkSkipUid(procp->getUid())) {
+                continue;
+            }
+
+            // ABA mitigation watching last time schedule activity happened
+            llkCheckSchedUpdate(procp, piddir);
+
+            // Can only fall through to here if registered D or Z state !!!
+            if (procp->count < llkStateTimeoutMs[(state == 'Z') ? llkStateZ : llkStateD]) {
+                LOG(VERBOSE) << state << ' ' << llkFormat(procp->count) << ' ' << ppid << "->"
+                             << pid << "->" << tid << ' ' << procp->getComm();
+                continue;
+            }
+
+            // We have to kill it to determine difference between live lock
+            // and persistent state blocked on a resource.  Is there something
+            // wrong with a process that has no forward scheduling progress in
+            // Z or D?  Yes, generally means improper accounting in the
+            // process, but not always ...
+            //
+            // Whomever we hit with a test kill must accept the Android
+            // Aphorism that everything can be burned to the ground and
+            // must survive.
+            if (procp->killed == false) {
+                procp->killed = true;
+                // confirm: re-read uid before committing to a panic.
+                procp->uid = -1;
+                switch (state) {
+                    case 'Z':  // kill ppid to free up a Zombie
+                        // Killing init will kernel panic without diagnostics
+                        // so skip right to controlled kernel panic with
+                        // diagnostics.
+                        if (ppid == initPid) {
+                            break;
+                        }
+                        LOG(WARNING) << "Z " << llkFormat(procp->count) << ' ' << ppid << "->"
+                                     << pid << "->" << tid << ' ' << procp->getComm() << " [kill]";
+                        if ((llkKillOneProcess(pprocp, procp) >= 0) ||
+                            (llkKillOneProcess(ppid, procp) >= 0)) {
+                            continue;
+                        }
+                        break;
+
+                    case 'D':  // kill tid to free up an uninterruptible D
+                        // If ABA is doing its job, we would not need or
+                        // want the following.  Test kill is a Hail Mary
+                        // to make absolutely sure there is no forward
+                        // scheduling progress.  The cost when ABA is
+                        // not working is we kill a process that likes to
+                        // stay in 'D' state, instead of panicking the
+                        // kernel (worse).
+                        LOG(WARNING) << "D " << llkFormat(procp->count) << ' ' << pid << "->" << tid
+                                     << ' ' << procp->getComm() << " [kill]";
+                        if ((llkKillOneProcess(llkTidLookup(pid), procp) >= 0) ||
+                            (llkKillOneProcess(pid, 'D', tid) >= 0) ||
+                            (llkKillOneProcess(procp, procp) >= 0) ||
+                            (llkKillOneProcess(tid, 'D', tid) >= 0)) {
+                            continue;
+                        }
+                        break;
+                }
+            }
+            // We are here because we have confirmed kernel live-lock
+            LOG(ERROR) << state << ' ' << llkFormat(procp->count) << ' ' << ppid << "->" << pid
+                       << "->" << tid << ' ' << procp->getComm() << " [panic]";
+            llkPanicKernel(true, tid);
+        }
+        LOG(VERBOSE) << "+closedir()";
+    }
+    llkTopDirectory.rewind();
+    LOG(VERBOSE) << "closedir()";
+
+    // garbage collection of old process references
+    for (auto p = tids.begin(); p != tids.end();) {
+        if (!p->second.updated) {
+            IF_ALOG(LOG_VERBOSE, LOG_TAG) {
+                std::string ppidCmdline = llkProcGetName(p->second.ppid, nullptr, nullptr);
+                if (ppidCmdline.size()) {
+                    ppidCmdline = "(" + ppidCmdline + ")";
+                }
+                std::string pidCmdline;
+                if (p->second.pid != p->second.tid) {
+                    pidCmdline = llkProcGetName(p->second.pid, nullptr, p->second.getCmdline());
+                    if (pidCmdline.size()) {
+                        pidCmdline = "(" + pidCmdline + ")";
+                    }
+                }
+                std::string tidCmdline =
+                    llkProcGetName(p->second.tid, p->second.getComm(), p->second.getCmdline());
+                if (tidCmdline.size()) {
+                    tidCmdline = "(" + tidCmdline + ")";
+                }
+                LOG(VERBOSE) << "thread " << p->second.ppid << ppidCmdline << "->" << p->second.pid
+                             << pidCmdline << "->" << p->second.tid << tidCmdline << " removed";
+            }
+            p = tids.erase(p);
+        } else {
+            ++p;
+        }
+    }
+    if (__predict_false(tids.empty())) {
+        llkTopDirectory.reset();
+    }
+
+    llkCycle = llkCheckMs;
+
+    timespec end;
+    ::clock_gettime(CLOCK_MONOTONIC_COARSE, &end);
+    auto milli = llkGetTimespecDiffMs(&now, &end);
+    LOG((milli > 10s) ? ERROR : (milli > 1s) ? WARNING : VERBOSE) << "sample " << llkFormat(milli);
+
+    // cap to minimum sleep for 1 second since last cycle
+    if (llkCycle < (ms + 1s)) {
+        return 1s;
+    }
+    return llkCycle - ms;
+}
+
+unsigned llkCheckMilliseconds() {  // llkCheck() for callers that want a plain ms count
+    return static_cast<unsigned>(llkCheck().count());
+}
+
+bool llkInit(const char* threadname) {  // load config; with threadname, spawn llkThread
+    llkLowRam = android::base::GetBoolProperty("ro.config.low_ram", false);
+    llkEnable = android::base::GetBoolProperty(LLK_ENABLE_PROPERTY, llkEnable);
+    if (llkEnable && !llkTopDirectory.reset(procdir)) {
+        // Most likely reason we could be here is llkd was started
+        // incorrectly without the readproc permissions.  Keep our
+        // processing down to a minimum.
+        llkEnable = false;
+    }
+    khtEnable = android::base::GetBoolProperty(KHT_ENABLE_PROPERTY, khtEnable);
+    llkMlockall = android::base::GetBoolProperty(LLK_MLOCKALL_PROPERTY, llkMlockall);
+    // if LLK_TIMEOUT_MS_PROPERTY was not set, we will use a set
+    // KHT_TIMEOUT_PROPERTY as co-operative guidance for the default value.
+    khtTimeout = GetUintProperty(KHT_TIMEOUT_PROPERTY, khtTimeout);
+    if (khtTimeout == 0s) {
+        khtTimeout = duration_cast<seconds>(llkTimeoutMs * (1 + LLK_CHECKS_PER_TIMEOUT_DEFAULT) /
+                                            LLK_CHECKS_PER_TIMEOUT_DEFAULT);
+    }
+    llkTimeoutMs =
+        khtTimeout * LLK_CHECKS_PER_TIMEOUT_DEFAULT / (1 + LLK_CHECKS_PER_TIMEOUT_DEFAULT);
+    llkTimeoutMs = GetUintProperty(LLK_TIMEOUT_MS_PROPERTY, llkTimeoutMs);
+    llkValidate();  // validate llkTimeoutMs, llkCheckMs and llkCycle
+    llkStateTimeoutMs[llkStateD] = GetUintProperty(LLK_D_TIMEOUT_MS_PROPERTY, llkTimeoutMs);
+    llkStateTimeoutMs[llkStateZ] = GetUintProperty(LLK_Z_TIMEOUT_MS_PROPERTY, llkTimeoutMs);
+    llkCheckMs = GetUintProperty(LLK_CHECK_MS_PROPERTY, llkCheckMs);
+    llkValidate();  // validate all (effectively minus llkTimeoutMs)
+    std::string defaultBlacklistProcess(
+        std::to_string(kernelPid) + "," + std::to_string(initPid) + "," +
+        std::to_string(kthreaddPid) + "," + std::to_string(::getpid()) + "," +
+        std::to_string(::gettid()) + "," LLK_BLACKLIST_PROCESS_DEFAULT);
+    if (threadname) {
+        defaultBlacklistProcess += std::string(",") + threadname;
+    }
+    for (int cpu = 1; cpu < get_nprocs_conf(); ++cpu) {  // NOTE(review): cpu 0 not added; confirm
+        defaultBlacklistProcess += ",[watchdog/" + std::to_string(cpu) + "]";
+    }
+    defaultBlacklistProcess =
+        android::base::GetProperty(LLK_BLACKLIST_PROCESS_PROPERTY, defaultBlacklistProcess);
+    llkBlacklistProcess = llkSplit(defaultBlacklistProcess);
+    if (!llkSkipName("[khungtaskd]")) {  // ALWAYS ignore as special
+        llkBlacklistProcess.emplace("[khungtaskd]");
+    }
+    llkBlacklistParent = llkSplit(android::base::GetProperty(
+        LLK_BLACKLIST_PARENT_PROPERTY, std::to_string(kernelPid) + "," + std::to_string(kthreaddPid) +
+                                           "," LLK_BLACKLIST_PARENT_DEFAULT));
+    llkBlacklistUid =
+        llkSplit(android::base::GetProperty(LLK_BLACKLIST_UID_PROPERTY, LLK_BLACKLIST_UID_DEFAULT));
+
+    // internal watchdog
+    ::signal(SIGALRM, llkAlarmHandler);
+
+    // kernel hung task configuration? Otherwise leave it as-is
+    if (khtEnable) {
+        // EUID must be AID_ROOT to write to /proc/sys/kernel/ nodes, there
+        // are no capability overrides.  For security reasons we do not want
+        // to run as AID_ROOT.  We may not be able to write them successfully,
+        // we will try, but the least we can do is read the values back to
+        // confirm expectations and report whether configured or not.
+        auto configured = llkWriteStringToFileConfirm(std::to_string(khtTimeout.count()),
+                                                      "/proc/sys/kernel/hung_task_timeout_secs");
+        if (configured) {
+            llkWriteStringToFile("65535", "/proc/sys/kernel/hung_task_warnings");
+            llkWriteStringToFile("65535", "/proc/sys/kernel/hung_task_check_count");
+            configured = llkWriteStringToFileConfirm("1", "/proc/sys/kernel/hung_task_panic");
+        }
+        if (configured) {
+            LOG(INFO) << "[khungtaskd] configured";
+        } else {
+            LOG(WARNING) << "[khungtaskd] not configurable";
+        }
+    }
+
+    bool logConfig = true;
+    if (llkEnable) {
+        if (llkMlockall &&
+            // MCL_ONFAULT pins pages as they fault instead of loading
+            // everything immediately all at once. (Which would be bad,
+            // because as of this writing, we have a lot of mapped pages we
+            // never use.) Old kernels will see MCL_ONFAULT and fail with
+            // EINVAL; we ignore this failure.
+            //
+            // N.B. read the man page for mlockall. MCL_CURRENT | MCL_ONFAULT
+            // pins ⊆ MCL_CURRENT, converging to just MCL_CURRENT as we fault
+            // in pages.
+
+            // CAP_IPC_LOCK required
+            mlockall(MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT) && (errno != EINVAL)) {
+            PLOG(WARNING) << "mlockall failed ";
+        }
+
+        if (threadname) {
+            pthread_attr_t attr;
+
+            if (!pthread_attr_init(&attr)) {
+                sched_param param;
+
+                memset(&param, 0, sizeof(param));
+                pthread_attr_setschedparam(&attr, &param);
+                pthread_attr_setschedpolicy(&attr, SCHED_BATCH);
+                if (!pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED)) {
+                    pthread_t thread;
+                    if (!pthread_create(&thread, &attr, llkThread, const_cast<char*>(threadname))) {
+                        // wait a second for thread to start
+                        for (auto retry = 50; retry && !llkRunning; --retry) {
+                            ::usleep(20000);
+                        }
+                        logConfig = !llkRunning;  // printed in llkd context?
+                    } else {
+                        LOG(ERROR) << "failed to spawn llkd thread";
+                    }
+                } else {
+                    LOG(ERROR) << "failed to detach llkd thread";
+                }
+                pthread_attr_destroy(&attr);
+            } else {
+                LOG(ERROR) << "failed to allocate attibutes for llkd thread";
+            }
+        }
+    } else {
+        LOG(DEBUG) << "[khungtaskd] left unconfigured";  // NOTE(review): reached when !llkEnable
+    }
+    if (logConfig) {
+        llkLogConfig();
+    }
+
+    return llkEnable;  // false when disabled by property or when procdir could not be opened
+}
diff --git a/llkd/llkd.cpp b/llkd/llkd.cpp
new file mode 100644
index 0000000..f10253d
--- /dev/null
+++ b/llkd/llkd.cpp
@@ -0,0 +1,50 @@
+/*
+ * Copyright (C) 2018 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "llkd.h"
+
+#include <sched.h>
+#include <unistd.h>
+
+#include <chrono>
+
+#include <android-base/logging.h>
+
+using namespace std::chrono;
+
+int main(int, char**) {  // standalone llkd: initialize, then poll llkCheck() forever
+    LOG(INFO) << "started";
+
+    bool enabled = llkInit();
+
+    // Would like this policy to be automatic as part of libllkd,
+    // but that would be presumptuous and bad side-effect.
+    // Value-initialize instead of memset(): this file never includes <string.h>.
+    struct sched_param param = {};
+    sched_setscheduler(0, SCHED_BATCH, &param);
+
+    while (true) {
+        if (enabled) {
+            ::usleep(duration_cast<microseconds>(llkCheck()).count());
+        } else {
+            ::pause();
+        }
+    }
+    // NOTREACHED
+
+    LOG(INFO) << "exiting";
+    return 0;
+}
diff --git a/llkd/llkd.rc b/llkd/llkd.rc
new file mode 100644
index 0000000..a257e76
--- /dev/null
+++ b/llkd/llkd.rc
@@ -0,0 +1,18 @@
+# Configure the kernel hung task detector ([khungtaskd]) through /proc/sys/kernel/hung_task_*
+on property:ro.khungtask.enable=true
+    write /proc/sys/kernel/hung_task_timeout_secs ${ro.khungtask.timeout:-720}
+    write /proc/sys/kernel/hung_task_warnings 65535
+    write /proc/sys/kernel/hung_task_check_count 65535
+    write /proc/sys/kernel/hung_task_panic 1
+
+on property:ro.llk.enable=true
+    start llkd
+
+service llkd /system/bin/llkd
+    class late_start
+    disabled
+    user llkd
+    group llkd readproc
+    capabilities KILL IPC_LOCK
+    file /proc/sysrq-trigger w
+    writepid /dev/cpuset/system-background/tasks
diff --git a/llkd/tests/Android.bp b/llkd/tests/Android.bp
new file mode 100644
index 0000000..6dd5938
--- /dev/null
+++ b/llkd/tests/Android.bp
@@ -0,0 +1,41 @@
+// Copyright (C) 2018 The Android Open Source Project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+cc_test {
+    name: "llkd_unit_test",
+
+    shared_libs: [
+        "libbase",
+        "liblog",
+    ],
+    header_libs: [
+        "llkd_headers",  // header-only module exporting llkd's "include" directory
+    ],
+
+    target: {
+        android: {  // the test source is only listed for the android target
+            srcs: [
+                "llkd_test.cpp",
+            ],
+        },
+    },
+
+    cflags: [
+        "-Wall",
+        "-Wextra",
+        "-Werror",
+    ],
+
+    compile_multilib: "first",
+}
diff --git a/llkd/tests/llkd_test.cpp b/llkd/tests/llkd_test.cpp
new file mode 100644
index 0000000..e3c95eb
--- /dev/null
+++ b/llkd/tests/llkd_test.cpp
@@ -0,0 +1,243 @@
+/*
+ * Copyright (C) 2018 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <signal.h>
+#include <stdint.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include <chrono>
+#include <iostream>
+#include <string>
+
+#include <android-base/properties.h>
+#include <gtest/gtest.h>
+#include <log/log_time.h>  // for MS_PER_SEC and US_PER_SEC
+
+#include "llkd.h"
+
+using namespace std::chrono;
+using namespace std::chrono_literals;
+
+namespace {
+
+milliseconds GetUintProperty(const std::string& key, milliseconds def) {  // <key> read as ms
+    const uint64_t limit = milliseconds::max().count();  // passed as the maximum accepted value
+    return milliseconds(android::base::GetUintProperty<uint64_t>(key, def.count(), limit));
+}
+
+seconds GetUintProperty(const std::string& key, seconds def) {  // <key> read as whole seconds
+    const uint64_t limit = seconds::max().count();  // passed as the maximum accepted value
+    return seconds(android::base::GetUintProperty<uint64_t>(key, def.count(), limit));
+}
+
+// GTEST_LOG_(WARNING) output is noisy; these std::cerr stream macros produce much less noise.
+// ToDo: look into fixing googletest to produce output that matches style of
+//       all the other status messages, and can switch off __line__ and
+//       __function__ noise
+#define GTEST_LOG_WARNING std::cerr << "[ WARNING  ] "
+#define GTEST_LOG_INFO std::cerr << "[   INFO   ] "
+
+// Properties is _not_ a high performance ABI! Pause for 200ms after a property/service command.
+void rest() {
+    usleep(duration_cast<microseconds>(200ms).count());
+}
+
+void execute(const char* command) {  // as-is when root and it succeeds, else via "su root"
+    const bool ran_as_is = (getuid() == 0) && (system(command) == 0);
+    if (ran_as_is) return;
+    system((std::string("su root ") + command).c_str());
+}
+
+seconds llkdSleepPeriod(char state) {  // how long a test should wait for llkd to act on <state>
+    auto default_enable = android::base::GetBoolProperty(LLK_ENABLE_PROPERTY, LLK_ENABLE_DEFAULT);
+    if (android::base::GetProperty(LLK_ENABLE_PROPERTY, "nothing") == "nothing") {
+        GTEST_LOG_INFO << LLK_ENABLE_PROPERTY " defaults to " << (default_enable ? "true" : "false")
+                       << "\n";
+    }
+    // Hail Mary hope is unconfigured: unless 120s timeout and 10s check, stop llkd and set them.
+    if ((GetUintProperty(LLK_TIMEOUT_MS_PROPERTY, LLK_TIMEOUT_MS_DEFAULT) !=
+         duration_cast<milliseconds>(120s)) ||
+        (GetUintProperty(LLK_CHECK_MS_PROPERTY,
+                         LLK_TIMEOUT_MS_DEFAULT / LLK_CHECKS_PER_TIMEOUT_DEFAULT) !=
+         duration_cast<milliseconds>(10s))) {
+        execute("stop llkd");
+        rest();
+        std::string setprop("setprop ");
+        execute((setprop + LLK_TIMEOUT_MS_PROPERTY + " 120000").c_str());
+        rest();
+        execute((setprop + KHT_TIMEOUT_PROPERTY + " 130").c_str());
+        rest();
+        execute((setprop + LLK_CHECK_MS_PROPERTY + " 10000").c_str());
+        rest();
+        execute((setprop + LLK_ENABLE_PROPERTY + " true").c_str());
+        rest();
+    }
+    default_enable = android::base::GetBoolProperty(LLK_ENABLE_PROPERTY, false);
+    if (default_enable) {  // property reads true (possibly set just above): issue start llkd
+        execute("start llkd");
+        rest();
+        GTEST_LOG_INFO << "llkd enabled\n";
+    } else {
+        GTEST_LOG_WARNING << "llkd disabled\n";
+    }
+
+    /* KISS follows llk_init(): derive the timeouts from the properties and their defaults */
+    milliseconds llkTimeoutMs = LLK_TIMEOUT_MS_DEFAULT;
+    seconds khtTimeout = duration_cast<seconds>(
+        llkTimeoutMs * (1 + LLK_CHECKS_PER_TIMEOUT_DEFAULT) / LLK_CHECKS_PER_TIMEOUT_DEFAULT);
+    khtTimeout = GetUintProperty(KHT_TIMEOUT_PROPERTY, khtTimeout);
+    llkTimeoutMs =
+        khtTimeout * LLK_CHECKS_PER_TIMEOUT_DEFAULT / (1 + LLK_CHECKS_PER_TIMEOUT_DEFAULT);
+    llkTimeoutMs = GetUintProperty(LLK_TIMEOUT_MS_PROPERTY, llkTimeoutMs);
+    if (llkTimeoutMs < LLK_TIMEOUT_MS_MINIMUM) {  // clamp to the minimum
+        llkTimeoutMs = LLK_TIMEOUT_MS_MINIMUM;
+    }
+    milliseconds llkCheckMs = llkTimeoutMs / LLK_CHECKS_PER_TIMEOUT_DEFAULT;
+    auto timeout = GetUintProperty(  // per-state timeout: the 'Z' property, else the 'D' one
+        (state == 'Z') ? LLK_Z_TIMEOUT_MS_PROPERTY : LLK_D_TIMEOUT_MS_PROPERTY, llkTimeoutMs);
+    if (timeout < LLK_TIMEOUT_MS_MINIMUM) {
+        timeout = LLK_TIMEOUT_MS_MINIMUM;
+    }
+
+    if (llkCheckMs > timeout) {  // the default check period is capped by the state timeout
+        llkCheckMs = timeout;
+    }
+    llkCheckMs = GetUintProperty(LLK_CHECK_MS_PROPERTY, llkCheckMs);  // property overrides
+    timeout += llkCheckMs;  // allow one check period on top of the timeout
+    auto sec = duration_cast<seconds>(timeout);
+    if (sec == 0s) {  // never less than one second
+        ++sec;
+    } else if (sec > 59s) {
+        GTEST_LOG_WARNING << "llkd is configured for about " << duration_cast<minutes>(sec).count()
+                          << " minutes to react\n";
+    }
+
+    // 33% margin (rounded up) for the test to naturally timeout waiting for llkd to respond
+    return (sec * 4 + 2s) / 3;
+}
+
+inline void waitForPid(pid_t child_pid) {  // reap child_pid and require that SIGKILL ended it
+    int wstatus;
+    ASSERT_LE(0, waitpid(child_pid, &wstatus, 0));
+    EXPECT_FALSE(WIFEXITED(wstatus)) << "[   INFO   ] exit=" << WEXITSTATUS(wstatus);
+    ASSERT_TRUE(WIFSIGNALED(wstatus));
+    ASSERT_EQ(WTERMSIG(wstatus), SIGKILL);  // the callers expect llkd to have killed the process
+}
+
+}  // namespace
+
+// Helper for tests that simulate a process that is repeatedly in 'D' state but
+// keeps making forward scheduling progress, so llkd is expected to _not_
+// perform any mitigation. sleepfor sets the length of each 'D' state episode;
+// only a warning is logged when the llkd wait period is not longer than that.
+static void llkd_driver_ABA(const microseconds sleepfor) {
+    const auto period = llkdSleepPeriod('D');
+    if (period <= sleepfor) {
+        GTEST_LOG_WARNING << "llkd configuration too short for "
+                          << duration_cast<milliseconds>(sleepfor).count() << "ms work cycle\n";
+        return;
+    }
+
+    auto child_pid = fork();
+    ASSERT_LE(0, child_pid);
+    int wstatus;
+    if (!child_pid) {
+        // Forked child: leave only via exit()/_exit(), returning would re-enter gtest here.
+        // Exit status 0 reports that all cycles completed, 42 reports a failed cycle.
+        auto ratio = period / sleepfor;  // sleepfor cycles per period, one fewer is run
+        if (ratio <= 0) exit(42);
+        while (--ratio > 0) {
+            auto driver_pid = vfork();  // parent is in uninterruptible D state until child exits
+            if (driver_pid < 0) exit(42);
+            if (driver_pid) {  // parent
+                // A waitpid() error or a cycle that did not end with status 42 is a failure.
+                if ((waitpid(driver_pid, &wstatus, 0) < 0) || !WIFEXITED(wstatus) ||
+                    (WEXITSTATUS(wstatus) != 42)) {
+                    exit(42);
+                }
+            } else {
+                usleep(sleepfor.count());
+                _exit(42);  // vfork() child shares the parent's memory, it must not run exit()
+            }
+        }
+        exit(0);
+    }
+    // Success is a normal exit with status 0; a signalled child means something mitigated it.
+    ASSERT_LE(0, waitpid(child_pid, &wstatus, 0));
+    EXPECT_TRUE(WIFEXITED(wstatus));
+    if (WIFEXITED(wstatus)) {
+        EXPECT_EQ(0, WEXITSTATUS(wstatus));
+    }
+    ASSERT_FALSE(WIFSIGNALED(wstatus)) << "[   INFO   ] signo=" << WTERMSIG(wstatus);
+}
+
+TEST(llkd, driver_ABA_fast) {  // 5ms 'D' state work cycles
+    llkd_driver_ABA(milliseconds(5));
+}
+
+TEST(llkd, driver_ABA_slow) {  // one second work cycles
+    llkd_driver_ABA(seconds(1));
+}
+
+TEST(llkd, driver_ABA_glacial) {  // one minute work cycles
+    llkd_driver_ABA(minutes(1));
+}
+
+// Following tests must be last in this file to capture possible errant
+// kernel_panic mitigation failure.
+
+// The following tests simulate processes stuck in 'Z' or 'D' state with
+// no forward scheduling progress, but interruptible. As such the expectation
+// is that llkd will perform kill mitigation and not progress to kernel_panic.
+
+TEST(llkd, zombie) {
+    const seconds wait_time = llkdSleepPeriod('Z');
+
+    // Create a persistent zombie: child_pid forks zombie_pid and never reaps it.
+    const auto child_pid = fork();
+    ASSERT_LE(0, child_pid);
+    if (child_pid == 0) {
+        const pid_t zombie_pid = fork();
+        ASSERT_LE(0, zombie_pid);
+        if (zombie_pid == 0) {
+            sleep(1);
+            exit(0);  // exits without being waited for, this is the zombie
+        }
+        sleep(wait_time.count());  // hold the zombie for the whole llkd wait period
+        exit(42);                  // only reached when nothing killed this process meanwhile
+    }
+
+    waitForPid(child_pid);
+}
+
+TEST(llkd, driver) {
+    const auto period = llkdSleepPeriod('D');
+
+    /* Create a Persistent Device Process, waitForPid() then requires that SIGKILL ended it */
+    auto child_pid = fork();
+    ASSERT_LE(0, child_pid);
+    if (!child_pid) {
+        auto driver_pid = vfork();  // parent is in uninterruptible D state until the child exits
+        if (driver_pid < 0) exit(42);  // no ASSERT: it would return into gtest inside the fork
+        sleep(period.count());
+        if (!driver_pid) _exit(0);  // vfork() child shares parent memory, it must not run exit()
+        exit(42);                   // parent: only reached when it was not killed while stuck
+    }
+
+    waitForPid(child_pid);
+}