Merge changes I394a7920,Ia847118c,Ic8396eee,I79a85c33,Id015e6a7, ...

* changes:
  lmkd: Select in-kernel vs userspace lmk based on kernel driver presence
  lmkd: Implement kill timeout
  lmkd: Allow killing multiple processes to downgrade memory pressure
  lmkd: Detect the highest level of vmpressure when event is detected
  lmkd: Close cgroup.event_control file when done writing
  lmkd: Remove stale dependency on libprocessgroup
  lmkd: Add ability to trace lmkd kills
  lmkd: add logic to kill the heaviest of the eligible processes
  lmkd: change defaults to disable event upgrade/downgrade logic
  lmkd: add ability to monitor all vmpressure events
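
These changes replace the two hardcoded vmpressure watchers with per-level monitoring built on the memcg v1 pressure notification API: init_mp_common (in the diff below) opens memory.pressure_level, creates an eventfd, and registers the pair through cgroup.event_control. For reference, a minimal standalone sketch of that registration flow follows; it assumes a v1 memory cgroup mounted at /dev/memcg as in lmkd.c, root privileges, and uses the "medium" level purely as an example.

/* Minimal sketch: watch for "medium" memcg vmpressure events (cgroup v1). */
#include <errno.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/eventfd.h>
#include <unistd.h>

int main(void) {
    char buf[64];
    uint64_t evcount;

    /* File the kernel raises vmpressure notifications on */
    int mpfd = open("/dev/memcg/memory.pressure_level", O_RDONLY | O_CLOEXEC);
    /* Control file used to bind an eventfd to the pressure file */
    int evctlfd = open("/dev/memcg/cgroup.event_control", O_WRONLY | O_CLOEXEC);
    /* eventfd that will receive the notifications */
    int evfd = eventfd(0, EFD_CLOEXEC);

    if (mpfd < 0 || evctlfd < 0 || evfd < 0) {
        fprintf(stderr, "setup failed: %s\n", strerror(errno));
        return 1;
    }

    /* cgroup-v1 registration format: "<event_fd> <pressure_fd> <level>" */
    snprintf(buf, sizeof(buf), "%d %d medium", evfd, mpfd);
    if (write(evctlfd, buf, strlen(buf) + 1) < 0) {
        fprintf(stderr, "registration failed: %s\n", strerror(errno));
        return 1;
    }
    /* The control fd is only needed for registration (cf. the close() added below) */
    close(evctlfd);

    /* Each read returns the number of events signaled since the previous read */
    if (read(evfd, &evcount, sizeof(evcount)) == sizeof(evcount))
        printf("medium vmpressure fired %llu time(s)\n", (unsigned long long)evcount);
    return 0;
}
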
diff --git a/lmkd/Android.bp b/lmkd/Android.bp
index 3f8a503..76d308a 100644
--- a/lmkd/Android.bp
+++ b/lmkd/Android.bp
@@ -4,10 +4,17 @@
     srcs: ["lmkd.c"],
     shared_libs: [
         "liblog",
-        "libprocessgroup",
         "libcutils",
     ],
     cflags: ["-Werror"],
 
     init_rc: ["lmkd.rc"],
+
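+    // Compile in kill tracing only on debuggable builds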
+    product_variables: {
+        debuggable: {
+            cflags: [
+                "-DLMKD_TRACE_KILLS"
+            ],
+        },
+    },
 }
diff --git a/lmkd/lmkd.c b/lmkd/lmkd.c
index 15471e0..338e5fa 100644
--- a/lmkd/lmkd.c
+++ b/lmkd/lmkd.c
@@ -29,13 +29,31 @@
 #include <sys/mman.h>
 #include <sys/socket.h>
 #include <sys/types.h>
-#include <time.h>
+#include <sys/sysinfo.h>
 #include <unistd.h>
 
 #include <cutils/properties.h>
 #include <cutils/sockets.h>
 #include <log/log.h>
-#include <processgroup/processgroup.h>
+
+/*
+ * Define LMKD_TRACE_KILLS to record lmkd kills in kernel traces
+ * to profile and correlate with OOM kills
+ */
+#ifdef LMKD_TRACE_KILLS
+
+#define ATRACE_TAG ATRACE_TAG_ALWAYS
+#include <cutils/trace.h>
+
+#define TRACE_KILL_START(pid) ATRACE_INT(__FUNCTION__, pid);
+#define TRACE_KILL_END()      ATRACE_INT(__FUNCTION__, 0);
+
+#else /* LMKD_TRACE_KILLS */
+
+#define TRACE_KILL_START(pid)
+#define TRACE_KILL_END()
+
+#endif /* LMKD_TRACE_KILLS */
 
 #ifndef __unused
 #define __unused __attribute__((__unused__))
@@ -44,8 +62,6 @@
 #define MEMCG_SYSFS_PATH "/dev/memcg/"
 #define MEMCG_MEMORY_USAGE "/dev/memcg/memory.usage_in_bytes"
 #define MEMCG_MEMORYSW_USAGE "/dev/memcg/memory.memsw.usage_in_bytes"
-#define MEMPRESSURE_WATCH_MEDIUM_LEVEL "medium"
-#define MEMPRESSURE_WATCH_CRITICAL_LEVEL "critical"
 #define ZONEINFO_PATH "/proc/zoneinfo"
 #define LINE_MAX 128
 
@@ -72,26 +88,47 @@
 static int use_inkernel_interface = 1;
 static bool has_inkernel_module;
 
-/* memory pressure level medium event */
-static int mpevfd[2];
-#define CRITICAL_INDEX 1
-#define MEDIUM_INDEX 0
+/* memory pressure levels */
+enum vmpressure_level {
+    VMPRESS_LEVEL_LOW = 0,
+    VMPRESS_LEVEL_MEDIUM,
+    VMPRESS_LEVEL_CRITICAL,
+    VMPRESS_LEVEL_COUNT
+};
 
-static int medium_oomadj;
-static int critical_oomadj;
+static const char *level_name[] = {
+    "low",
+    "medium",
+    "critical"
+};
+
+struct mem_size {
+    int free_mem;
+    int free_swap;
+};
+
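+/*
+ * Range of free memory (in pages) seen while handling "low" vmpressure
+ * events; max_free is later used as the free-memory target that kills
+ * try to restore (see mp_event_common).
+ */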
+struct {
+    int min_free; /* recorded but not used yet */
+    int max_free;
+} low_pressure_mem = { -1, -1 };
+
+static int level_oomadj[VMPRESS_LEVEL_COUNT];
+static int mpevfd[VMPRESS_LEVEL_COUNT] = { -1, -1, -1 };
 static bool debug_process_killing;
 static bool enable_pressure_upgrade;
 static int64_t upgrade_pressure;
 static int64_t downgrade_pressure;
 static bool is_go_device;
+static bool kill_heaviest_task;
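+/* Minimum interval between kills, in ms; events arriving sooner are skipped */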
+static unsigned long kill_timeout_ms;
 
 /* control socket listen and data */
 static int ctrl_lfd;
 static int ctrl_dfd = -1;
 static int ctrl_dfd_reopened; /* did we reopen ctrl conn on this loop? */
 
-/* 2 memory pressure levels, 1 ctrl listen socket, 1 ctrl data socket */
-#define MAX_EPOLL_EVENTS 4
+/* 3 memory pressure levels, 1 ctrl listen socket, 1 ctrl data socket */
+#define MAX_EPOLL_EVENTS 5
 static int epollfd;
 static int maxevents;
 
@@ -226,7 +263,7 @@
     return 0;
 }
 
-static void writefilestring(char *path, char *s) {
+static void writefilestring(const char *path, char *s) {
     int fd = open(path, O_WRONLY | O_CLOEXEC);
     int len = strlen(s);
     int ret;
@@ -534,6 +571,18 @@
     return 0;
 }
 
+static int get_free_memory(struct mem_size *ms) {
+    struct sysinfo si;
+
+    if (sysinfo(&si) < 0)
+        return -1;
+
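+    /* sysinfo reports sizes in mem_unit-byte blocks; convert them to pages */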
+    ms->free_mem = (int)(si.freeram * si.mem_unit / PAGE_SIZE);
+    ms->free_swap = (int)(si.freeswap * si.mem_unit / PAGE_SIZE);
+
+    return 0;
+}
+
 static int proc_get_size(int pid) {
     char path[PATH_MAX];
     char line[LINE_MAX];
@@ -586,8 +635,32 @@
     return (struct proc *)adjslot_tail(&procadjslot_list[ADJTOSLOT(oomadj)]);
 }
 
+static struct proc *proc_get_heaviest(int oomadj) {
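+    /*
+     * Pick the process with the largest size (per proc_get_size) at this
+     * oomadj level; stale entries for processes that have already exited
+     * are removed along the way.
+     */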
+    struct adjslot_list *head = &procadjslot_list[ADJTOSLOT(oomadj)];
+    struct adjslot_list *curr = head->next;
+    struct proc *maxprocp = NULL;
+    int maxsize = 0;
+    while (curr != head) {
+        int pid = ((struct proc *)curr)->pid;
+        int tasksize = proc_get_size(pid);
+        if (tasksize <= 0) {
+            struct adjslot_list *next = curr->next;
+            pid_remove(pid);
+            curr = next;
+        } else {
+            if (tasksize > maxsize) {
+                maxsize = tasksize;
+                maxprocp = (struct proc *)curr;
+            }
+            curr = curr->next;
+        }
+    }
+    return maxprocp;
+}
+
 /* Kill one process specified by procp.  Returns the size of the process killed */
-static int kill_one_process(struct proc* procp, int min_score_adj, bool is_critical) {
+static int kill_one_process(struct proc* procp, int min_score_adj,
+                            enum vmpressure_level level) {
     int pid = procp->pid;
     uid_t uid = procp->uid;
     char *taskname;
@@ -606,14 +679,18 @@
         return -1;
     }
 
+    TRACE_KILL_START(pid);
+
+    r = kill(pid, SIGKILL);
     ALOGI(
         "Killing '%s' (%d), uid %d, adj %d\n"
         "   to free %ldkB because system is under %s memory pressure oom_adj %d\n",
-        taskname, pid, uid, procp->oomadj, tasksize * page_k, is_critical ? "critical" : "medium",
-        min_score_adj);
-    r = kill(pid, SIGKILL);
+        taskname, pid, uid, procp->oomadj, tasksize * page_k,
+        level_name[level], min_score_adj);
     pid_remove(pid);
 
+    TRACE_KILL_END();
+
     if (r) {
         ALOGE("kill(%d): errno=%d", pid, errno);
         return -1;
@@ -623,31 +700,40 @@
 }
 
 /*
- * Find a process to kill based on the current (possibly estimated) free memory
- * and cached memory sizes.  Returns the size of the killed processes.
+ * Find processes to kill in order to free the requested number of pages.
+ * If pages_to_free is set to 0 only one process will be killed.
+ * Returns the total size (in pages) of the killed processes.
  */
-static int find_and_kill_process(bool is_critical) {
+static int find_and_kill_processes(enum vmpressure_level level,
+                                   int pages_to_free) {
     int i;
-    int killed_size = 0;
-    int min_score_adj = is_critical ? critical_oomadj : medium_oomadj;
+    int killed_size;
+    int pages_freed = 0;
+    int min_score_adj = level_oomadj[level];
 
     for (i = OOM_SCORE_ADJ_MAX; i >= min_score_adj; i--) {
         struct proc *procp;
 
-retry:
-        procp = proc_adj_lru(i);
+        while (true) {
+            if (is_go_device)
+                procp = proc_adj_lru(i);
+            else
+                procp = proc_get_heaviest(i);
 
-        if (procp) {
-            killed_size = kill_one_process(procp, min_score_adj, is_critical);
-            if (killed_size < 0) {
-                goto retry;
-            } else {
-                return killed_size;
+            if (!procp)
+                break;
+
+            killed_size = kill_one_process(procp, min_score_adj, level);
+            if (killed_size >= 0) {
+                pages_freed += killed_size;
+                if (pages_freed >= pages_to_free) {
+                    return pages_freed;
+                }
             }
         }
     }
 
-    return 0;
+    return pages_freed;
 }
 
 static int64_t get_memory_usage(const char* path) {
@@ -674,33 +760,118 @@
     return mem_usage;
 }
 
-static void mp_event_common(bool is_critical) {
+void record_low_pressure_levels(struct mem_size *free_mem) {
+    if (low_pressure_mem.min_free == -1 ||
+        low_pressure_mem.min_free > free_mem->free_mem) {
+        if (debug_process_killing) {
+            ALOGI("Low pressure min memory update from %d to %d",
+                low_pressure_mem.min_free, free_mem->free_mem);
+        }
+        low_pressure_mem.min_free = free_mem->free_mem;
+    }
+    /*
+     * Free memory at low vmpressure events occasionally gets spikes,
+     * possibly from a stale low vmpressure event reported after memory
+     * was already freed up (no memory pressure should have been reported).
+     * Ignore large jumps in max_free that would mess up our stats.
+     */
+    if (low_pressure_mem.max_free == -1 ||
+        (low_pressure_mem.max_free < free_mem->free_mem &&
+         free_mem->free_mem - low_pressure_mem.max_free < low_pressure_mem.max_free * 0.1)) {
+        if (debug_process_killing) {
+            ALOGI("Low pressure max memory update from %d to %d",
+                low_pressure_mem.max_free, free_mem->free_mem);
+        }
+        low_pressure_mem.max_free = free_mem->free_mem;
+    }
+}
+
+enum vmpressure_level upgrade_level(enum vmpressure_level level) {
+    return (enum vmpressure_level)((level < VMPRESS_LEVEL_CRITICAL) ?
+        level + 1 : level);
+}
+
+enum vmpressure_level downgrade_level(enum vmpressure_level level) {
+    return (enum vmpressure_level)((level > VMPRESS_LEVEL_LOW) ?
+        level - 1 : level);
+}
+
+static inline unsigned long get_time_diff_ms(struct timeval *from,
+                                             struct timeval *to) {
+    return (to->tv_sec - from->tv_sec) * 1000 +
+           (to->tv_usec - from->tv_usec) / 1000;
+}
+
+static void mp_event_common(enum vmpressure_level level) {
     int ret;
     unsigned long long evcount;
-    int index = is_critical ? CRITICAL_INDEX : MEDIUM_INDEX;
     int64_t mem_usage, memsw_usage;
     int64_t mem_pressure;
+    enum vmpressure_level lvl;
+    struct mem_size free_mem;
+    static struct timeval last_report_tm;
+    static unsigned long skip_count = 0;
 
-    ret = read(mpevfd[index], &evcount, sizeof(evcount));
-    if (ret < 0)
-        ALOGE("Error reading memory pressure event fd; errno=%d",
-              errno);
+    /*
+     * Check all event counters from low to critical
+     * and upgrade to the highest-priority level that fired.
+     * Reading each eventfd also resets its event counter.
+     */
+    for (lvl = VMPRESS_LEVEL_LOW; lvl < VMPRESS_LEVEL_COUNT; lvl++) {
+        if (mpevfd[lvl] != -1 &&
+            read(mpevfd[lvl], &evcount, sizeof(evcount)) > 0 &&
+            evcount > 0 && lvl > level) {
+            level = lvl;
+        }
+    }
+
+    if (kill_timeout_ms) {
+        struct timeval curr_tm;
+        gettimeofday(&curr_tm, NULL);
+        if (get_time_diff_ms(&last_report_tm, &curr_tm) < kill_timeout_ms) {
+            skip_count++;
+            return;
+        }
+    }
+
+    if (skip_count > 0) {
+        if (debug_process_killing) {
+            ALOGI("%lu memory pressure events were skipped after a kill!",
+                skip_count);
+        }
+        skip_count = 0;
+    }
+
+    if (get_free_memory(&free_mem) == 0) {
+        if (level == VMPRESS_LEVEL_LOW) {
+            record_low_pressure_levels(&free_mem);
+        }
+    } else {
+        ALOGE("Failed to get free memory!");
+        return;
+    }
+
+    if (level_oomadj[level] > OOM_SCORE_ADJ_MAX) {
+        /* Do not monitor this pressure level */
+        return;
+    }
 
     mem_usage = get_memory_usage(MEMCG_MEMORY_USAGE);
     memsw_usage = get_memory_usage(MEMCG_MEMORYSW_USAGE);
     if (memsw_usage < 0 || mem_usage < 0) {
-        find_and_kill_process(is_critical);
-        return;
+        goto do_kill;
     }
 
     // Calculate percent for swappinness.
     mem_pressure = (mem_usage * 100) / memsw_usage;
 
-    if (enable_pressure_upgrade && !is_critical) {
+    if (enable_pressure_upgrade && level != VMPRESS_LEVEL_CRITICAL) {
         // We are swapping too much.
         if (mem_pressure < upgrade_pressure) {
-            ALOGI("Event upgraded to critical.");
-            is_critical = true;
+            level = upgrade_level(level);
+            if (debug_process_killing) {
+                ALOGI("Event upgraded to %s", level_name[level]);
+            }
         }
     }
 
@@ -708,41 +879,74 @@
     // kill any process, since enough memory is available.
     if (mem_pressure > downgrade_pressure) {
         if (debug_process_killing) {
-            ALOGI("Ignore %s memory pressure", is_critical ? "critical" : "medium");
+            ALOGI("Ignore %s memory pressure", level_name[level]);
         }
         return;
-    } else if (is_critical && mem_pressure > upgrade_pressure) {
+    } else if (level == VMPRESS_LEVEL_CRITICAL &&
+               mem_pressure > upgrade_pressure) {
         if (debug_process_killing) {
             ALOGI("Downgrade critical memory pressure");
         }
-        // Downgrade event to medium, since enough memory available.
-        is_critical = false;
+        // Downgrade event, since enough memory available.
+        level = downgrade_level(level);
     }
 
-    if (find_and_kill_process(is_critical) == 0) {
-        if (debug_process_killing) {
-            ALOGI("Nothing to kill");
+do_kill:
+    if (is_go_device) {
+        /* For Go devices kill only one task */
+        if (find_and_kill_processes(level, 0) == 0) {
+            if (debug_process_killing) {
+                ALOGI("Nothing to kill");
+            }
+        }
+    } else {
+        /* If pressure level is below critical and free swap exceeds the low-pressure watermark, ignore the event */
+        if (level < VMPRESS_LEVEL_CRITICAL && free_mem.free_swap > low_pressure_mem.max_free) {
+            if (debug_process_killing) {
+                ALOGI("Ignoring pressure since %d swap pages are available ", free_mem.free_swap);
+            }
+            return;
+        }
+
+        /* Free up enough memory to downgrade the memory pressure to the low level */
+        if (free_mem.free_mem < low_pressure_mem.max_free) {
+            int pages_to_free = low_pressure_mem.max_free - free_mem.free_mem;
+            if (debug_process_killing) {
+                ALOGI("Trying to free %d pages", pages_to_free);
+            }
+            int pages_freed = find_and_kill_processes(level, pages_to_free);
+            if (pages_freed < pages_to_free) {
+                if (debug_process_killing) {
+                    ALOGI("Unable to free enough memory (pages freed=%d)",
+                        pages_freed);
+                }
+            } else {
+                gettimeofday(&last_report_tm, NULL);
+            }
         }
     }
 }
 
-static void mp_event(uint32_t events __unused) {
-    mp_event_common(false);
+static void mp_event_low(uint32_t events __unused) {
+    mp_event_common(VMPRESS_LEVEL_LOW);
+}
+
+static void mp_event_medium(uint32_t events __unused) {
+    mp_event_common(VMPRESS_LEVEL_MEDIUM);
 }
 
 static void mp_event_critical(uint32_t events __unused) {
-    mp_event_common(true);
+    mp_event_common(VMPRESS_LEVEL_CRITICAL);
 }
 
-static int init_mp_common(char *levelstr, void *event_handler, bool is_critical)
-{
+static bool init_mp_common(void *event_handler, enum vmpressure_level level) {
     int mpfd;
     int evfd;
     int evctlfd;
     char buf[256];
     struct epoll_event epev;
     int ret;
-    int mpevfd_index = is_critical ? CRITICAL_INDEX : MEDIUM_INDEX;
+    const char *levelstr = level_name[level];
 
     mpfd = open(MEMCG_SYSFS_PATH "memory.pressure_level", O_RDONLY | O_CLOEXEC);
     if (mpfd < 0) {
@@ -783,8 +987,9 @@
         goto err;
     }
     maxevents++;
-    mpevfd[mpevfd_index] = evfd;
-    return 0;
+    mpevfd[level] = evfd;
+    close(evctlfd);
+    return true;
 
 err:
     close(evfd);
@@ -793,17 +998,7 @@
 err_open_evctlfd:
     close(mpfd);
 err_open_mpfd:
-    return -1;
-}
-
-static int init_mp_medium()
-{
-    return init_mp_common(MEMPRESSURE_WATCH_MEDIUM_LEVEL, (void *)&mp_event, false);
-}
-
-static int init_mp_critical()
-{
-    return init_mp_common(MEMPRESSURE_WATCH_CRITICAL_LEVEL, (void *)&mp_event_critical, true);
+    return false;
 }
 
 static int init(void) {
@@ -843,15 +1038,18 @@
     maxevents++;
 
     has_inkernel_module = !access(INKERNEL_MINFREE_PATH, W_OK);
-    use_inkernel_interface = has_inkernel_module && !is_go_device;
+    use_inkernel_interface = has_inkernel_module;
 
     if (use_inkernel_interface) {
         ALOGI("Using in-kernel low memory killer interface");
     } else {
-        ret = init_mp_medium();
-        ret |= init_mp_critical();
-        if (ret)
+        if (!init_mp_common((void *)&mp_event_low, VMPRESS_LEVEL_LOW) ||
+            !init_mp_common((void *)&mp_event_medium, VMPRESS_LEVEL_MEDIUM) ||
+            !init_mp_common((void *)&mp_event_critical,
+                            VMPRESS_LEVEL_CRITICAL)) {
             ALOGE("Kernel does not support memory pressure events or in-kernel low memory killer");
+            return -1;
+        }
     }
 
     for (i = 0; i <= ADJTOSLOT(OOM_SCORE_ADJ_MAX); i++) {
@@ -892,13 +1090,27 @@
             .sched_priority = 1,
     };
 
-    medium_oomadj = property_get_int32("ro.lmk.medium", 800);
-    critical_oomadj = property_get_int32("ro.lmk.critical", 0);
+    /* By default disable handling of "low" vmpressure level events */
+    level_oomadj[VMPRESS_LEVEL_LOW] =
+        property_get_int32("ro.lmk.low", OOM_SCORE_ADJ_MAX + 1);
+    level_oomadj[VMPRESS_LEVEL_MEDIUM] =
+        property_get_int32("ro.lmk.medium", 800);
+    level_oomadj[VMPRESS_LEVEL_CRITICAL] =
+        property_get_int32("ro.lmk.critical", 0);
     debug_process_killing = property_get_bool("ro.lmk.debug", false);
-    enable_pressure_upgrade = property_get_bool("ro.lmk.critical_upgrade", false);
-    upgrade_pressure = (int64_t)property_get_int32("ro.lmk.upgrade_pressure", 50);
-    downgrade_pressure = (int64_t)property_get_int32("ro.lmk.downgrade_pressure", 60);
+
+    /* By default disable upgrade/downgrade logic */
+    enable_pressure_upgrade =
+        property_get_bool("ro.lmk.critical_upgrade", false);
+    upgrade_pressure =
+        (int64_t)property_get_int32("ro.lmk.upgrade_pressure", 100);
+    downgrade_pressure =
+        (int64_t)property_get_int32("ro.lmk.downgrade_pressure", 100);
+    kill_heaviest_task =
+        property_get_bool("ro.lmk.kill_heaviest_task", true);
     is_go_device = property_get_bool("ro.config.low_ram", false);
+    kill_timeout_ms =
+        (unsigned long)property_get_int32("ro.lmk.kill_timeout_ms", 0);
 
     // MCL_ONFAULT pins pages as they fault instead of loading
     // everything immediately all at once. (Which would be bad,