Merge "llkd: bootstat: propagate detailed livelock canonical boot reason"
diff --git a/bootstat/bootstat.cpp b/bootstat/bootstat.cpp
index c2688e9..8ce9dfc 100644
--- a/bootstat/bootstat.cpp
+++ b/bootstat/bootstat.cpp
@@ -303,6 +303,9 @@
     {"kernel_panic,init", 158},
     {"kernel_panic,oom", 159},
     {"kernel_panic,stack", 160},
+    {"kernel_panic,sysrq,livelock,alarm", 161},   // llkd
+    {"kernel_panic,sysrq,livelock,driver", 162},  // llkd
+    {"kernel_panic,sysrq,livelock,zombie", 163},  // llkd
 };
 
 // Converts a string value representing the reason the system booted to an
diff --git a/llkd/README.md b/llkd/README.md
index 71319c8..b2ba2a2 100644
--- a/llkd/README.md
+++ b/llkd/README.md
@@ -53,7 +53,9 @@
 conditions.  If the test can, it will reconfigure llkd to expedite the test
 duration by adjusting the ro.llk.* Android properties.  Tests run the D state
 with some scheduling progress to ensure that ABA checking prevents false
-triggers.
+triggers.  If ABA checking is 100% reliable on the platform, ro.llk.killtest
+can be set to false; however, this will cause some of the unit tests to panic
+the kernel instead of using the more graceful kill operation.
 
 Android Properties
 ------------------
@@ -108,13 +110,6 @@
 Architectural Concerns
 ----------------------
 
-- Figure out how to communicate the kernel panic better to bootstat canonical
-  boot reason determination.  This may require an alteration to bootstat, or
-  some logging from llkd.  Would like to see boot reason to be
-  watchdog,livelock as a minimum requirement.  Or more specifically would want
-  watchdog,livelock,device or watchdog,livelock,zombie be reported.
-  Currently reports panic,sysrq (user requested panic) or panic depending on
-  system support of pstore.
 - Create kernel module and associated gTest to actually test panic.
 - Create gTest to test out blacklist (ro.llk.blacklist.<properties> generally
   not be inputs).  Could require more test-only interfaces to libllkd.
diff --git a/llkd/include/llkd.h b/llkd/include/llkd.h
index bd0739b..e3ae4bb 100644
--- a/llkd/include/llkd.h
+++ b/llkd/include/llkd.h
@@ -37,6 +37,8 @@
 #define KHT_ENABLE_PROPERTY            "ro." KHT_ENABLE_WRITEABLE_PROPERTY
 #define LLK_MLOCKALL_PROPERTY          "ro.llk.mlockall"
 #define LLK_MLOCKALL_DEFAULT           true
+#define LLK_KILLTEST_PROPERTY          "ro.llk.killtest"
+#define LLK_KILLTEST_DEFAULT           true
 #define LLK_TIMEOUT_MS_PROPERTY        "ro.llk.timeout_ms"
 #define KHT_TIMEOUT_PROPERTY           "ro.khungtask.timeout"
 #define LLK_D_TIMEOUT_MS_PROPERTY      "ro.llk.D.timeout_ms"
diff --git a/llkd/libllkd.cpp b/llkd/libllkd.cpp
index d828105..f357cc2 100644
--- a/llkd/libllkd.cpp
+++ b/llkd/libllkd.cpp
@@ -70,6 +70,7 @@
 bool llkEnable = LLK_ENABLE_DEFAULT;                 // llk daemon enabled
 bool llkRunning = false;                             // thread is running
 bool llkMlockall = LLK_MLOCKALL_DEFAULT;             // run mlocked
+bool llkTestWithKill = LLK_KILLTEST_DEFAULT;         // issue test kills
 milliseconds llkTimeoutMs = LLK_TIMEOUT_MS_DEFAULT;  // default timeout
 enum { llkStateD, llkStateZ, llkNumStates };         // state indexes
 milliseconds llkStateTimeoutMs[llkNumStates];        // timeout override for each detection state
@@ -292,7 +293,7 @@
           exeMissingValid(false),
           cmdlineValid(false),
           updated(true),
-          killed(false) {
+          killed(!llkTestWithKill) {
         memset(comm, '\0', sizeof(comm));
         setComm(_comm);
     }
@@ -475,8 +476,8 @@
     return android::base::Trim(content) == string;
 }
 
-void llkPanicKernel(bool dump, pid_t tid) __noreturn;
-void llkPanicKernel(bool dump, pid_t tid) {
+void llkPanicKernel(bool dump, pid_t tid, const char* state) __noreturn;
+void llkPanicKernel(bool dump, pid_t tid, const char* state) {
     auto sysrqTriggerFd = llkFileToWriteFd("/proc/sysrq-trigger");
     if (sysrqTriggerFd < 0) {
         // DYB
@@ -496,6 +497,8 @@
         }
         ::usleep(200000);  // let everything settle
     }
+    llkWriteStringToFile(std::string("SysRq : Trigger a crash : 'livelock,") + state + "'\n",
+                         "/dev/kmsg");
     android::base::WriteStringToFd("c", sysrqTriggerFd);
     // NOTREACHED
     // DYB
@@ -507,7 +510,7 @@
 }
 
 void llkAlarmHandler(int) {
-    llkPanicKernel(false, ::getpid());
+    llkPanicKernel(false, ::getpid(), "alarm");
 }
 
 milliseconds GetUintProperty(const std::string& key, milliseconds def) {
@@ -686,7 +689,7 @@
             (val != procp->nrSwitches)) {
             procp->nrSwitches = val;
             procp->count = 0ms;
-            procp->killed = false;
+            procp->killed = !llkTestWithKill;
         }
         return;
     }
@@ -700,7 +703,7 @@
         if (schedUpdate != procp->schedUpdate) {
             procp->schedUpdate = schedUpdate;
             procp->count = 0ms;
-            procp->killed = false;
+            procp->killed = !llkTestWithKill;
         }
     }
 
@@ -709,7 +712,7 @@
         if (static_cast<uint64_t>(val) != procp->nrSwitches) {
             procp->nrSwitches = val;
             procp->count = 0ms;
-            procp->killed = false;
+            procp->killed = !llkTestWithKill;
         }
     }
 }
@@ -719,6 +722,7 @@
               << LLK_ENABLE_PROPERTY "=" << llkFormat(llkEnable) << "\n"
               << KHT_ENABLE_PROPERTY "=" << llkFormat(khtEnable) << "\n"
               << LLK_MLOCKALL_PROPERTY "=" << llkFormat(llkMlockall) << "\n"
+              << LLK_KILLTEST_PROPERTY "=" << llkFormat(llkTestWithKill) << "\n"
               << KHT_TIMEOUT_PROPERTY "=" << llkFormat(khtTimeout) << "\n"
               << LLK_TIMEOUT_MS_PROPERTY "=" << llkFormat(llkTimeoutMs) << "\n"
               << LLK_D_TIMEOUT_MS_PROPERTY "=" << llkFormat(llkStateTimeoutMs[llkStateD]) << "\n"
@@ -869,7 +873,7 @@
                 procp->time = utime + stime;
                 if (procp->state != state) {
                     procp->count = 0ms;
-                    procp->killed = false;
+                    procp->killed = !llkTestWithKill;
                     procp->state = state;
                 } else {
                     procp->count += llkCycle;
@@ -973,7 +977,7 @@
             // We are here because we have confirmed kernel live-lock
             LOG(ERROR) << state << ' ' << llkFormat(procp->count) << ' ' << ppid << "->" << pid
                        << "->" << tid << ' ' << procp->getComm() << " [panic]";
-            llkPanicKernel(true, tid);
+            llkPanicKernel(true, tid, (state == 'Z') ? "zombie" : "driver");
         }
         LOG(VERBOSE) << "+closedir()";
     }
@@ -1045,6 +1049,7 @@
     }
     khtEnable = android::base::GetBoolProperty(KHT_ENABLE_PROPERTY, khtEnable);
     llkMlockall = android::base::GetBoolProperty(LLK_MLOCKALL_PROPERTY, llkMlockall);
+    llkTestWithKill = android::base::GetBoolProperty(LLK_KILLTEST_PROPERTY, llkTestWithKill);
     // if LLK_TIMOUT_MS_PROPERTY was not set, we will use a set
     // KHT_TIMEOUT_PROPERTY as co-operative guidance for the default value.
     khtTimeout = GetUintProperty(KHT_TIMEOUT_PROPERTY, khtTimeout);
diff --git a/llkd/llkd.rc b/llkd/llkd.rc
index f762a5c..e538cdb 100644
--- a/llkd/llkd.rc
+++ b/llkd/llkd.rc
@@ -44,5 +44,6 @@
     user llkd
     group llkd readproc
     capabilities KILL IPC_LOCK
+    file /dev/kmsg w
     file /proc/sysrq-trigger w
     writepid /dev/cpuset/system-background/tasks
diff --git a/llkd/tests/llkd_test.cpp b/llkd/tests/llkd_test.cpp
index 2de1820..3a15ff1 100644
--- a/llkd/tests/llkd_test.cpp
+++ b/llkd/tests/llkd_test.cpp
@@ -154,6 +154,27 @@
     ASSERT_EQ(WTERMSIG(wstatus), SIGKILL);
 }
 
+bool checkKill(const char* reason) {  // returns true only if sys.boot.reason == reason
+    if (android::base::GetBoolProperty(LLK_KILLTEST_PROPERTY, LLK_KILLTEST_DEFAULT)) {
+        return false;  // LLK_KILLTEST_PROPERTY reads as true
+    }
+    auto bootreason = android::base::GetProperty("sys.boot.reason", "nothing");
+    if (bootreason == reason) {
+        GTEST_LOG_INFO << "Expected test result confirmed " << reason << "\n";
+        return true;
+    }
+    GTEST_LOG_WARNING << "Expected test result is " << reason << "\n";
+
+    // apct adjustment if needed: once uncommented, the code below bypasses the test
+    // only for the exact string "false" (set LLK_KILLTEST_PROPERTY to "off" to allow test)
+    // if (android::base::GetProperty(LLK_KILLTEST_PROPERTY, "") == "false") {
+    //     GTEST_LOG_WARNING << "Bypassing test\n";
+    //     return true;
+    // }
+
+    return false;  // boot reason did not match
+}
+
 }  // namespace
 
 // The tests that use this helper are to simulate processes stuck in 'D'
@@ -221,6 +242,10 @@
 // is that llkd will perform kill mitigation and not progress to kernel_panic.
 
 TEST(llkd, zombie) {
+    if (checkKill("kernel_panic,sysrq,livelock,zombie")) {
+        return;
+    }
+
     const auto period = llkdSleepPeriod('Z');
 
     /* Create a Persistent Zombie Process */
@@ -241,6 +266,10 @@
 }
 
 TEST(llkd, driver) {
+    if (checkKill("kernel_panic,sysrq,livelock,driver")) {
+        return;
+    }
+
     const auto period = llkdSleepPeriod('D');
 
     /* Create a Persistent Device Process */