WIP priority inheritance
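
Threads gain an effective priority (effec_priority), computed as the
max of base_priority + priority_boost and inherited_priority (-1 when
no inheritance is in effect). The scheduler now queues and requeues
threads by effective priority.

When a thread blocks on a contended mutex it passes its effective
priority to the current holder via sched_inherit_priority(), which
requeues the holder and/or signals a reschedule (locally or via IPI)
as needed. On release, the new holder inherits the highest effective
priority still blocked on the mutex's wait queue (via the new
wait_queue_blocked_priority()), and the releasing thread is deboosted
once it holds no more mutexes (tracked by the new mutexes_held count).

For example, if a priority-24 thread blocks on a mutex held by a
priority-16 thread, the holder runs at effective priority 24 until it
releases its last held mutex.

Note: inheritance is not yet transitive; boosting a holder that is
itself blocked on another mutex does not propagate up the chain.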

Change-Id: Id73ee904ed3681b9fe96ca1e69bc74a3d0576c0b
diff --git a/kernel/include/kernel/sched.h b/kernel/include/kernel/sched.h
index 598975a..640c82a 100644
--- a/kernel/include/kernel/sched.h
+++ b/kernel/include/kernel/sched.h
@@ -15,6 +15,7 @@
 /* not intended to be used by regular kernel code */
 void sched_init_early(void);
 
+void sched_init_thread(thread_t* t, int priority);
 void sched_block(void);
 void sched_yield(void);
 void sched_preempt(void);
@@ -23,6 +24,11 @@
 void sched_unblock_idle(thread_t* t);
 void sched_migrate(thread_t* t);
 
+/* set the inherited priority of a thread, setting *local_resched if the caller should locally reschedule.
+ * pri is clamped to HIGHEST_PRIORITY; negative values disable priority inheritance
+ */
+void sched_inherit_priority(thread_t* t, int pri, bool* local_resched);
+
 /* return true if the thread was placed on the current cpu's run queue */
 /* this usually means the caller should locally reschedule soon */
 bool sched_unblock(thread_t* t) __WARN_UNUSED_RESULT;
diff --git a/kernel/include/kernel/thread.h b/kernel/include/kernel/thread.h
index 158c370..a6c63ff 100644
--- a/kernel/include/kernel/thread.h
+++ b/kernel/include/kernel/thread.h
@@ -95,8 +95,11 @@
      * left the scheduler. */
     zx_duration_t runtime_ns;
 
+    /* priority: effec_priority is the max of base_priority + priority_boost and inherited_priority */
+    int effec_priority;
     int base_priority;
     int priority_boost;
+    int inherited_priority;
 
     /* current cpu the thread is either running on or in the ready queue, undefined otherwise */
     cpu_num_t curr_cpu;
@@ -115,6 +118,9 @@
     /* are we allowed to be interrupted on the current thing we're blocked/sleeping on */
     bool interruptable;
 
+    /* number of mutexes we currently hold */
+    int mutexes_held;
+
     /* pointer to the kernel address space this thread is associated with */
     struct vmm_aspace* aspace;
 
diff --git a/kernel/include/kernel/wait.h b/kernel/include/kernel/wait.h
index 48ffbfd..819b467 100644
--- a/kernel/include/kernel/wait.h
+++ b/kernel/include/kernel/wait.h
@@ -56,6 +56,11 @@
 zx_status_t wait_queue_block_with_mask(wait_queue_t*, zx_time_t deadline,
                                        uint signal_mask);
 
+/* returns the highest effective priority of all the threads blocked on this wait queue.
+ * returns -1 if no threads are blocked.
+ */
+int wait_queue_blocked_priority(wait_queue_t*);
+
 /*
  * release one or more threads from the wait queue.
  * reschedule = should the system reschedule if any is released.
diff --git a/kernel/kernel/mutex.c b/kernel/kernel/mutex.c
index 9088f4c..6c10cb6 100644
--- a/kernel/kernel/mutex.c
+++ b/kernel/kernel/mutex.c
@@ -74,6 +74,7 @@
     oldval = 0;
     if (likely(atomic_cmpxchg_u64(&m->val, &oldval, (uintptr_t)ct))) {
         // acquired it cleanly
+        ct->mutexes_held++;
         return;
     }
 
@@ -100,6 +101,11 @@
         goto retry;
     }
 
+    // have the holder inherit our effective priority
+    // discard the local reschedule flag since we're about to block anyway
+    bool unused;
+    sched_inherit_priority(mutex_holder(m), ct->effec_priority, &unused);
+
     // we have signalled that we're blocking, so drop into the wait queue
     zx_status_t ret = wait_queue_block(&m->wait, ZX_TIME_INFINITE);
     if (unlikely(ret < ZX_OK)) {
@@ -112,6 +118,9 @@
     // someone must have woken us up, we should own the mutex now
     DEBUG_ASSERT(ct == mutex_holder(m));
 
+    // record that we hold it
+    ct->mutexes_held++;
+
     THREAD_UNLOCK(state);
 }
 
@@ -121,13 +130,33 @@
     thread_t* ct = get_current_thread();
     uintptr_t oldval;
 
+    // we're going to release it, so drop our held count first
+    ct->mutexes_held--;
+
     // in case there's no contention, try the fast path
     oldval = (uintptr_t)ct;
     if (likely(atomic_cmpxchg_u64(&m->val, &oldval, 0))) {
         // we're done, exit
+        // if we had inherited a priority, drop it now that we no longer hold any mutexes
+        if (unlikely(ct->inherited_priority >= 0) && ct->mutexes_held == 0) {
+            spin_lock_saved_state_t state;
+            if (!thread_lock_held)
+                spin_lock_irqsave(&thread_lock, state);
+
+            bool local_resched = false;
+            sched_inherit_priority(ct, -1, &local_resched);
+            if (reschedule && local_resched) {
+                sched_reschedule();
+            }
+
+            if (!thread_lock_held)
+                spin_unlock_irqrestore(&thread_lock, state);
+        }
         return;
     }
 
+    DEBUG_ASSERT(ct->mutexes_held >= 0);
+
 // must have been some contention, try the slow release
 
 #if LK_DEBUGLEVEL > 0
@@ -159,9 +188,22 @@
 
     ktrace(TAG_KWAIT_WAKE, (uintptr_t)&m->wait >> 32, (uintptr_t)&m->wait, 1, 0);
 
+    // have the new holder inherit the priority of the highest-priority thread still
+    // blocked on the wait queue. if the queue is now empty this returns -1, which
+    // deboosts the new holder as long as it holds no other mutexes
+    bool local_resched = false;
+    int blocked_priority = wait_queue_blocked_priority(&m->wait);
+    if (blocked_priority >= 0 || t->mutexes_held == 0) {
+        sched_inherit_priority(t, blocked_priority, &local_resched);
+    }
+
+    // deboost ourselves if this was the last mutex we held
+    if (ct->inherited_priority >= 0 && ct->mutexes_held == 0)
+        sched_inherit_priority(ct, -1, &local_resched);
+
     // wake up the new thread, putting it in a run queue on a cpu. reschedule if the local
     // cpu run queue was modified
-    bool local_resched = sched_unblock(t);
+    local_resched |= sched_unblock(t);
     if (reschedule && local_resched)
         sched_reschedule();
 
diff --git a/kernel/kernel/sched.c b/kernel/kernel/sched.c
index 87e1a52..f0bdf89 100644
--- a/kernel/kernel/sched.c
+++ b/kernel/kernel/sched.c
@@ -55,10 +55,14 @@
 static bool local_migrate_if_needed(thread_t* curr_thread);
 
 /* compute the effective priority of a thread */
-static int effec_priority(const thread_t* t) {
+static void compute_effec_priority(thread_t* t) {
     int ep = t->base_priority + t->priority_boost;
+    if (t->inherited_priority > ep)
+        ep = t->inherited_priority;
+
     DEBUG_ASSERT(ep >= LOWEST_PRIORITY && ep <= HIGHEST_PRIORITY);
-    return ep;
+
+    t->effec_priority = ep;
 }
 
 /* boost the priority of the thread by +1 */
@@ -72,6 +76,7 @@
     if (t->priority_boost < MAX_PRIORITY_ADJ &&
         likely((t->base_priority + t->priority_boost) < HIGHEST_PRIORITY)) {
         t->priority_boost++;
+        compute_effec_priority(t);
     }
 }
 
@@ -106,6 +111,7 @@
 
     /* drop a level */
     t->priority_boost--;
+    compute_effec_priority(t);
 }
 
 /* pick a 'random' cpu out of the passed in mask of cpus */
@@ -198,10 +204,8 @@
 static void insert_in_run_queue_head(cpu_num_t cpu, thread_t* t) {
     DEBUG_ASSERT(!list_in_list(&t->queue_node));
 
-    int ep = effec_priority(t);
-
-    list_add_head(&percpu[cpu].run_queue[ep], &t->queue_node);
-    percpu[cpu].run_queue_bitmap |= (1u << ep);
+    list_add_head(&percpu[cpu].run_queue[t->effec_priority], &t->queue_node);
+    percpu[cpu].run_queue_bitmap |= (1u << t->effec_priority);
 
     /* mark the cpu as busy since the run queue now has at least one item in it */
     mp_set_cpu_busy(cpu);
@@ -210,10 +214,8 @@
 static void insert_in_run_queue_tail(cpu_num_t cpu, thread_t* t) {
     DEBUG_ASSERT(!list_in_list(&t->queue_node));
 
-    int ep = effec_priority(t);
-
-    list_add_tail(&percpu[cpu].run_queue[ep], &t->queue_node);
-    percpu[cpu].run_queue_bitmap |= (1u << ep);
+    list_add_tail(&percpu[cpu].run_queue[t->effec_priority], &t->queue_node);
+    percpu[cpu].run_queue_bitmap |= (1u << t->effec_priority);
 
     /* mark the cpu as busy since the run queue now has at least one item in it */
     mp_set_cpu_busy(cpu);
@@ -248,6 +250,13 @@
     return &c->idle_thread;
 }
 
+void sched_init_thread(thread_t* t, int priority) {
+    t->base_priority = priority;
+    t->priority_boost = 0;
+    t->inherited_priority = -1;
+    compute_effec_priority(t);
+}
+
 void sched_block(void) {
     DEBUG_ASSERT(spin_lock_held(&thread_lock));
 
@@ -528,9 +537,8 @@
         DEBUG_ASSERT(is_valid_cpu_num(t->curr_cpu));
 
         struct percpu* c = &percpu[t->curr_cpu];
-        int pri = effec_priority(t);
-        if (list_is_empty(&c->run_queue[pri])) {
-            c->run_queue_bitmap &= ~(1u << pri);
+        if (list_is_empty(&c->run_queue[t->effec_priority])) {
+            c->run_queue_bitmap &= ~(1u << t->effec_priority);
         }
 
         find_cpu_and_insert(t, &local_resched, &accum_cpu_mask);
@@ -549,6 +557,75 @@
     }
 }
 
+/* set the inherited priority of a thread to the higher of its current inherited value and pri */
+/* pri < 0 disables priority inheritance and reverts to the naturally computed value */
+void sched_inherit_priority(thread_t* t, int pri, bool* local_resched) {
+    DEBUG_ASSERT(spin_lock_held(&thread_lock));
+
+    if (pri > HIGHEST_PRIORITY)
+        pri = HIGHEST_PRIORITY;
+
+    // a non-negative pri that doesn't raise the current inherited priority is a no-op
+    if (pri >= 0 && pri <= t->inherited_priority)
+        return;
+
+    // adjust the priority and remember the old value
+    t->inherited_priority = pri;
+    int old_ep = t->effec_priority;
+    compute_effec_priority(t);
+    if (old_ep == t->effec_priority) {
+        // same effective priority, nothing to do
+        return;
+    }
+
+    // see if we need to do something based on the state of the thread
+    cpu_mask_t accum_cpu_mask = 0;
+    switch (t->state) {
+    case THREAD_RUNNING:
+        if (t->effec_priority < old_ep) {
+            // we're currently running and dropped our effective priority, might want to resched
+            if (t == get_current_thread()) {
+                *local_resched = true;
+            } else {
+                accum_cpu_mask = cpu_num_to_mask(t->curr_cpu);
+            }
+        }
+        break;
+    case THREAD_READY:
+        // it's sitting in a run queue somewhere, remove and add back to the proper queue on that cpu
+        DEBUG_ASSERT_MSG(list_in_list(&t->queue_node), "thread %p name %s curr_cpu %u\n", t, t->name, t->curr_cpu);
+        list_delete(&t->queue_node);
+
+        DEBUG_ASSERT(is_valid_cpu_num(t->curr_cpu));
+
+        struct percpu* c = &percpu[t->curr_cpu];
+        if (list_is_empty(&c->run_queue[old_ep])) {
+            c->run_queue_bitmap &= ~(1u << old_ep);
+        }
+
+        if (t->effec_priority > old_ep) {
+            insert_in_run_queue_head(t->curr_cpu, t);
+            if (t->curr_cpu == arch_curr_cpu_num()) {
+                *local_resched = true;
+            } else {
+                accum_cpu_mask = cpu_num_to_mask(t->curr_cpu);
+            }
+        } else {
+            insert_in_run_queue_tail(t->curr_cpu, t);
+        }
+
+        break;
+    default:
+        // the other states do not matter, exit
+        return;
+    }
+
+    // send reschedule IPIs to any other cpus whose runnable threads we adjusted above
+    if (accum_cpu_mask) {
+        mp_reschedule(MP_IPI_TARGET_MASK, accum_cpu_mask, 0);
+    }
+}
+
 /* preemption timer that is set whenever a thread is scheduled */
 static enum handler_return sched_timer_tick(struct timer* t, zx_time_t now, void* arg) {
     /* if the preemption timer went off on the idle or a real time thread, ignore it */
@@ -692,10 +769,13 @@
     /* set some optional target debug leds */
     target_set_debug_led(0, !thread_is_idle(newthread));
 
-    TRACE_CONTEXT_SWITCH("cpu %u, old %p (%s, pri %d:%d, flags 0x%x), new %p (%s, pri %d:%d, flags 0x%x)\n",
-                         cpu, oldthread, oldthread->name, oldthread->base_priority, oldthread->priority_boost,
+    TRACE_CONTEXT_SWITCH("cpu %u old %p (%s, pri %d [%d:%d], flags 0x%x) "
+                         "new %p (%s, pri %d [%d:%d], flags 0x%x)\n",
+                         cpu, oldthread, oldthread->name, oldthread->effec_priority,
+                         oldthread->base_priority, oldthread->priority_boost,
                          oldthread->flags, newthread, newthread->name,
-                         newthread->base_priority, newthread->priority_boost, newthread->flags);
+                         newthread->effec_priority, newthread->base_priority,
+                         newthread->priority_boost, newthread->flags);
 
     /* check that the old thread has not blown its stack just before pushing its context */
     if (THREAD_STACK_BOUNDS_CHECK &&
diff --git a/kernel/kernel/thread.c b/kernel/kernel/thread.c
index 121ede0..6612c32 100644
--- a/kernel/kernel/thread.c
+++ b/kernel/kernel/thread.c
@@ -122,8 +122,6 @@
 
     t->entry = entry;
     t->arg = arg;
-    t->base_priority = priority;
-    t->priority_boost = 0;
     t->state = THREAD_INITIAL;
     t->signals = 0;
     t->blocking_wait_queue = NULL;
@@ -136,6 +134,8 @@
     t->retcode = 0;
     wait_queue_init(&t->retcode_wait_queue);
 
+    sched_init_thread(t, priority);
+
     /* create the stack */
     if (!stack) {
         if (THREAD_STACK_BOUNDS_CHECK) {
@@ -1168,10 +1168,11 @@
 
     if (full_dump) {
         dprintf(INFO, "dump_thread: t %p (%s:%s)\n", t, oname, t->name);
-        dprintf(INFO, "\tstate %s, curr/last cpu %d/%d, cpu_affinity %#x, priority %d:%d, "
+        dprintf(INFO, "\tstate %s, curr/last cpu %d/%d, cpu_affinity %#x, priority %d [%d:%d,%d], "
                       "remaining time slice %" PRIu64 "\n",
-                thread_state_to_str(t->state), (int)t->curr_cpu, (int)t->last_cpu, t->cpu_affinity, t->base_priority,
-                t->priority_boost, t->remaining_time_slice);
+                thread_state_to_str(t->state), (int)t->curr_cpu, (int)t->last_cpu, t->cpu_affinity,
+                t->effec_priority, t->base_priority,
+                t->priority_boost, t->inherited_priority, t->remaining_time_slice);
         dprintf(INFO, "\truntime_ns %" PRIu64 ", runtime_s %" PRIu64 "\n",
                 runtime, runtime / 1000000000);
         dprintf(INFO, "\tstack %p, stack_size %zu\n", t->stack, t->stack_size);
@@ -1182,15 +1183,16 @@
                 (t->flags & THREAD_FLAG_REAL_TIME) ? "Rt" : "",
                 (t->flags & THREAD_FLAG_IDLE) ? "Id" : "",
                 (t->flags & THREAD_FLAG_DEBUG_STACK_BOUNDS_CHECK) ? "Sc" : "");
-        dprintf(INFO, "\twait queue %p, blocked_status %d, interruptable %d\n",
-                t->blocking_wait_queue, t->blocked_status, t->interruptable);
+        dprintf(INFO, "\twait queue %p, blocked_status %d, interruptable %d, mutexes held %d\n",
+                t->blocking_wait_queue, t->blocked_status, t->interruptable, t->mutexes_held);
         dprintf(INFO, "\taspace %p\n", t->aspace);
         dprintf(INFO, "\tuser_thread %p, pid %" PRIu64 ", tid %" PRIu64 "\n",
                 t->user_thread, t->user_pid, t->user_tid);
         arch_dump_thread(t);
     } else {
-        printf("thr %p st %4s pri %2d:%d pid %" PRIu64 " tid %" PRIu64 " (%s:%s)\n",
-               t, thread_state_to_str(t->state), t->base_priority, t->priority_boost, t->user_pid,
+        printf("thr %p st %4s m %d pri %2d [%d:%d,%d] pid %" PRIu64 " tid %" PRIu64 " (%s:%s)\n",
+               t, thread_state_to_str(t->state), t->mutexes_held, t->effec_priority, t->base_priority,
+               t->priority_boost, t->inherited_priority, t->user_pid,
                t->user_tid, oname, t->name);
     }
 }
diff --git a/kernel/kernel/wait.c b/kernel/kernel/wait.c
index c8bda5b..f1b59d9 100644
--- a/kernel/kernel/wait.c
+++ b/kernel/kernel/wait.c
@@ -48,17 +48,17 @@
         list_initialize(&t->queue_node);
         list_add_head(&wait->heads, &t->wait_queue_heads_node);
     } else {
-        int pri = t->base_priority; // TODO: get the effective priority from sched.c
+        int pri = t->effec_priority;
 
         // walk through the sorted list of wait queue heads
         thread_t* temp;
         list_for_every_entry(&wait->heads, temp, thread_t, wait_queue_heads_node) {
-            if (pri > temp->base_priority) {
+            if (pri > temp->effec_priority) {
                 // insert ourself here as a new queue head
                 list_initialize(&t->queue_node);
                 list_add_before(&temp->wait_queue_heads_node, &t->wait_queue_heads_node);
                 return;
-            } else if (temp->base_priority == pri) {
+            } else if (temp->effec_priority == pri) {
                 // same priority, add ourself to the tail of this queue
                 list_add_tail(&temp->queue_node, &t->queue_node);
                 list_clear_node(&t->wait_queue_heads_node);
@@ -114,6 +114,14 @@
     }
 }
 
+int wait_queue_blocked_priority(wait_queue_t* wait) {
+    thread_t* t = list_peek_head_type(&wait->heads, thread_t, wait_queue_heads_node);
+    if (!t)
+        return -1;
+
+    return t->effec_priority;
+}
+
 static zx_status_t wait_queue_block_worker(wait_queue_t* wait, zx_time_t deadline,
                                            uint signal_mask) {
     timer_t timer;