From bb8c13d61a629276a162c1d2b1a20a815cbcfbb7 Mon Sep 17 00:00:00 2001 | |
From: Borislav Petkov <bp@suse.de> | |
Date: Wed, 14 Mar 2018 19:36:15 +0100 | |
Subject: x86/microcode: Fix CPU synchronization routine | |
Emanuel reported an issue with a hang during microcode update because my | |
dumb idea to use one atomic synchronization variable for both rendezvous | |
- before and after update - was simply bollocks: | |
microcode: microcode_reload_late: late_cpus: 4 | |
microcode: __reload_late: cpu 2 entered | |
microcode: __reload_late: cpu 1 entered | |
microcode: __reload_late: cpu 3 entered | |
microcode: __reload_late: cpu 0 entered | |
microcode: __reload_late: cpu 1 left | |
microcode: Timeout while waiting for CPUs rendezvous, remaining: 1 | |
CPU1 above would finish, leave and the others will still spin waiting for | |
it to join. | |
So do two synchronization atomics instead, which makes the code a lot more | |
straightforward. | |
Also, since the update is serialized and it also takes quite some time per | |
microcode engine, increase the exit timeout by the number of CPUs on the | |
system. | |
That's ok because the moment all CPUs are done, that timeout will be cut | |
short. | |
Furthermore, panic when some of the CPUs timeout when returning from a | |
microcode update: we can't allow a system with not all cores updated. | |
Also, as an optimization, do not do the exit sync if microcode wasn't | |
updated. | |
Reported-by: Emanuel Czirai <xftroxgpx@protonmail.com> | |
Signed-off-by: Borislav Petkov <bp@suse.de> | |
Signed-off-by: Thomas Gleixner <tglx@linutronix.de> | |
Tested-by: Emanuel Czirai <xftroxgpx@protonmail.com> | |
Tested-by: Ashok Raj <ashok.raj@intel.com> | |
Tested-by: Tom Lendacky <thomas.lendacky@amd.com> | |
Link: https://lkml.kernel.org/r/20180314183615.17629-2-bp@alien8.de | |
--- | |
arch/x86/kernel/cpu/microcode/core.c | 68 ++++++++++++++++++++++-------------- | |
1 file changed, 41 insertions(+), 27 deletions(-) | |
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c | |
index 9f0fe5b..10c4fc2 100644 | |
--- a/arch/x86/kernel/cpu/microcode/core.c | |
+++ b/arch/x86/kernel/cpu/microcode/core.c | |
@@ -517,7 +517,29 @@ static int check_online_cpus(void) | |
return -EINVAL; | |
} | |
-static atomic_t late_cpus; | |
+static atomic_t late_cpus_in; | |
+static atomic_t late_cpus_out; | |
+ | |
+static int __wait_for_cpus(atomic_t *t, long long timeout) | |
+{ | |
+ int all_cpus = num_online_cpus(); | |
+ | |
+ atomic_inc(t); | |
+ | |
+ while (atomic_read(t) < all_cpus) { | |
+ if (timeout < SPINUNIT) { | |
+ pr_err("Timeout while waiting for CPUs rendezvous, remaining: %d\n", | |
+ all_cpus - atomic_read(t)); | |
+ return 1; | |
+ } | |
+ | |
+ ndelay(SPINUNIT); | |
+ timeout -= SPINUNIT; | |
+ | |
+ touch_nmi_watchdog(); | |
+ } | |
+ return 0; | |
+} | |
/* | |
* Returns: | |
@@ -527,30 +549,16 @@ static atomic_t late_cpus; | |
*/ | |
static int __reload_late(void *info) | |
{ | |
- unsigned int timeout = NSEC_PER_SEC; | |
- int all_cpus = num_online_cpus(); | |
int cpu = smp_processor_id(); | |
enum ucode_state err; | |
int ret = 0; | |
- atomic_dec(&late_cpus); | |
- | |
/* | |
* Wait for all CPUs to arrive. A load will not be attempted unless all | |
* CPUs show up. | |
* */ | |
- while (atomic_read(&late_cpus)) { | |
- if (timeout < SPINUNIT) { | |
- pr_err("Timeout while waiting for CPUs rendezvous, remaining: %d\n", | |
- atomic_read(&late_cpus)); | |
- return -1; | |
- } | |
- | |
- ndelay(SPINUNIT); | |
- timeout -= SPINUNIT; | |
- | |
- touch_nmi_watchdog(); | |
- } | |
+ if (__wait_for_cpus(&late_cpus_in, NSEC_PER_SEC)) | |
+ return -1; | |
spin_lock(&update_lock); | |
apply_microcode_local(&err); | |
@@ -558,15 +566,22 @@ static int __reload_late(void *info) | |
if (err > UCODE_NFOUND) { | |
pr_warn("Error reloading microcode on CPU %d\n", cpu); | |
- ret = -1; | |
- } else if (err == UCODE_UPDATED) { | |
+ return -1; | |
+ /* siblings return UCODE_OK because their engine got updated already */ | |
+ } else if (err == UCODE_UPDATED || err == UCODE_OK) { | |
ret = 1; | |
+ } else { | |
+ return ret; | |
} | |
- atomic_inc(&late_cpus); | |
- | |
- while (atomic_read(&late_cpus) != all_cpus) | |
- cpu_relax(); | |
+ /* | |
+ * Increase the wait timeout to a safe value here since we're | |
+ * serializing the microcode update and that could take a while on a | |
+ * large number of CPUs. And that is fine as the *actual* timeout will | |
+ * be determined by the last CPU finished updating and thus cut short. | |
+ */ | |
+ if (__wait_for_cpus(&late_cpus_out, NSEC_PER_SEC * num_online_cpus())) | |
+ panic("Timeout during microcode update!\n"); | |
return ret; | |
} | |
@@ -579,12 +594,11 @@ static int microcode_reload_late(void) | |
{ | |
int ret; | |
- atomic_set(&late_cpus, num_online_cpus()); | |
+ atomic_set(&late_cpus_in, 0); | |
+ atomic_set(&late_cpus_out, 0); | |
ret = stop_machine_cpuslocked(__reload_late, NULL, cpu_online_mask); | |
- if (ret < 0) | |
- return ret; | |
- else if (ret > 0) | |
+ if (ret > 0) | |
microcode_check(); | |
return ret; | |
-- | |
cgit v1.1 | |