2269 lines
75 KiB
Diff
2269 lines
75 KiB
Diff
From c4c5118da78e18eca012fc4d95f21690b3bf654b Mon Sep 17 00:00:00 2001
|
|
From: "Chang S. Bae" <chang.seok.bae@intel.com>
|
|
Date: Thu, 28 May 2020 16:13:47 -0400
|
|
Subject: [PATCH 01/21] x86/ptrace: Prevent ptrace from clearing the FS/GS
|
|
selector
|
|
|
|
When a ptracer writes a ptracee's FS/GSBASE with a different value, the
|
|
selector is also cleared. This behavior is not correct as the selector
|
|
should be preserved.
|
|
|
|
Update only the base value and leave the selector intact. To simplify the
|
|
code further remove the conditional checking for the same value as this
|
|
code is not performance critical.
|
|
|
|
The only recognizable downside of this change is when the selector is
|
|
already nonzero on write. The base will be reloaded according to the
|
|
selector. But the case is highly unexpected in real usages.
|
|
|
|
[ tglx: Massage changelog ]
|
|
|
|
Suggested-by: Andy Lutomirski <luto@kernel.org>
|
|
Signed-off-by: Chang S. Bae <chang.seok.bae@intel.com>
|
|
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
|
|
Cc: "H . Peter Anvin" <hpa@zytor.com>
|
|
Cc: Andi Kleen <ak@linux.intel.com>
|
|
Cc: H. Peter Anvin <hpa@zytor.com>
|
|
Link: https://lkml.kernel.org/r/9040CFCD-74BD-4C17-9A01-B9B713CF6B10@intel.com
|
|
Signed-off-by: Sasha Levin <sashal@kernel.org>
|
|
---
|
|
arch/x86/kernel/ptrace.c | 17 ++---------------
|
|
1 file changed, 2 insertions(+), 15 deletions(-)
|
|
|
|
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
|
|
index f0e1ddbc2..cc56efb75 100644
|
|
--- a/arch/x86/kernel/ptrace.c
|
|
+++ b/arch/x86/kernel/ptrace.c
|
|
@@ -380,25 +380,12 @@ static int putreg(struct task_struct *child,
|
|
case offsetof(struct user_regs_struct,fs_base):
|
|
if (value >= TASK_SIZE_MAX)
|
|
return -EIO;
|
|
- /*
|
|
- * When changing the FS base, use do_arch_prctl_64()
|
|
- * to set the index to zero and to set the base
|
|
- * as requested.
|
|
- *
|
|
- * NB: This behavior is nonsensical and likely needs to
|
|
- * change when FSGSBASE support is added.
|
|
- */
|
|
- if (child->thread.fsbase != value)
|
|
- return do_arch_prctl_64(child, ARCH_SET_FS, value);
|
|
+ x86_fsbase_write_task(child, value);
|
|
return 0;
|
|
case offsetof(struct user_regs_struct,gs_base):
|
|
- /*
|
|
- * Exactly the same here as the %fs handling above.
|
|
- */
|
|
if (value >= TASK_SIZE_MAX)
|
|
return -EIO;
|
|
- if (child->thread.gsbase != value)
|
|
- return do_arch_prctl_64(child, ARCH_SET_GS, value);
|
|
+ x86_gsbase_write_task(child, value);
|
|
return 0;
|
|
#endif
|
|
}
|
|
--
|
|
2.27.0.112.g101b3204f3
|
|
|
|
|
|
From fb69a74fb8c45de09325c406459ccf88aadf21f8 Mon Sep 17 00:00:00 2001
|
|
From: Andy Lutomirski <luto@kernel.org>
|
|
Date: Thu, 28 May 2020 16:13:48 -0400
|
|
Subject: [PATCH 02/21] x86/cpu: Add 'unsafe_fsgsbase' to enable CR4.FSGSBASE
|
|
|
|
This is temporary. It will allow the next few patches to be tested
|
|
incrementally.
|
|
|
|
Setting unsafe_fsgsbase is a root hole. Don't do it.
|
|
|
|
Signed-off-by: Andy Lutomirski <luto@kernel.org>
|
|
Signed-off-by: Chang S. Bae <chang.seok.bae@intel.com>
|
|
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
|
|
Reviewed-by: Andi Kleen <ak@linux.intel.com>
|
|
Reviewed-by: Andy Lutomirski <luto@kernel.org>
|
|
Cc: Ravi Shankar <ravi.v.shankar@intel.com>
|
|
Cc: Andrew Morton <akpm@linux-foundation.org>
|
|
Cc: Randy Dunlap <rdunlap@infradead.org>
|
|
Cc: H. Peter Anvin <hpa@zytor.com>
|
|
Link: https://lkml.kernel.org/r/1557309753-24073-4-git-send-email-chang.seok.bae@intel.com
|
|
Signed-off-by: Sasha Levin <sashal@kernel.org>
|
|
---
|
|
.../admin-guide/kernel-parameters.txt | 3 +++
|
|
arch/x86/kernel/cpu/common.c | 24 +++++++++++++++++++
|
|
2 files changed, 27 insertions(+)
|
|
|
|
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
|
|
index ffa53aaa6..588f4eb13 100644
|
|
--- a/Documentation/admin-guide/kernel-parameters.txt
|
|
+++ b/Documentation/admin-guide/kernel-parameters.txt
|
|
@@ -3033,6 +3033,9 @@
|
|
no5lvl [X86-64] Disable 5-level paging mode. Forces
|
|
kernel to use 4-level paging instead.
|
|
|
|
+ unsafe_fsgsbase [X86] Allow FSGSBASE instructions. This will be
|
|
+ replaced with a nofsgsbase flag.
|
|
+
|
|
no_console_suspend
|
|
[HW] Never suspend the console
|
|
Disable suspending of consoles during suspend and
|
|
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
|
|
index 8293ee514..7e2954ba6 100644
|
|
--- a/arch/x86/kernel/cpu/common.c
|
|
+++ b/arch/x86/kernel/cpu/common.c
|
|
@@ -418,6 +418,22 @@ static void __init setup_cr_pinning(void)
|
|
static_key_enable(&cr_pinning.key);
|
|
}
|
|
|
|
+/*
|
|
+ * Temporary hack: FSGSBASE is unsafe until a few kernel code paths are
|
|
+ * updated. This allows us to get the kernel ready incrementally.
|
|
+ *
|
|
+ * Once all the pieces are in place, these will go away and be replaced with
|
|
+ * a nofsgsbase chicken flag.
|
|
+ */
|
|
+static bool unsafe_fsgsbase;
|
|
+
|
|
+static __init int setup_unsafe_fsgsbase(char *arg)
|
|
+{
|
|
+ unsafe_fsgsbase = true;
|
|
+ return 1;
|
|
+}
|
|
+__setup("unsafe_fsgsbase", setup_unsafe_fsgsbase);
|
|
+
|
|
/*
|
|
* Protection Keys are not available in 32-bit mode.
|
|
*/
|
|
@@ -1511,6 +1527,14 @@ static void identify_cpu(struct cpuinfo_x86 *c)
|
|
setup_smap(c);
|
|
setup_umip(c);
|
|
|
|
+ /* Enable FSGSBASE instructions if available. */
|
|
+ if (cpu_has(c, X86_FEATURE_FSGSBASE)) {
|
|
+ if (unsafe_fsgsbase)
|
|
+ cr4_set_bits(X86_CR4_FSGSBASE);
|
|
+ else
|
|
+ clear_cpu_cap(c, X86_FEATURE_FSGSBASE);
|
|
+ }
|
|
+
|
|
/*
|
|
* The vendor-specific functions might have changed features.
|
|
* Now we do "generic changes."
|
|
--
|
|
2.27.0.112.g101b3204f3
|
|
|
|
|
|
From a0d40e8fe282e44cf9380817005769755dc7375e Mon Sep 17 00:00:00 2001
|
|
From: Andi Kleen <ak@linux.intel.com>
|
|
Date: Thu, 28 May 2020 16:13:49 -0400
|
|
Subject: [PATCH 03/21] x86/fsgsbase/64: Add intrinsics for FSGSBASE
|
|
instructions
|
|
|
|
[ luto: Rename the variables from FS and GS to FSBASE and GSBASE and
|
|
make <asm/fsgsbase.h> safe to include on 32-bit kernels. ]
|
|
|
|
Signed-off-by: Andi Kleen <ak@linux.intel.com>
|
|
Signed-off-by: Andy Lutomirski <luto@kernel.org>
|
|
Signed-off-by: Chang S. Bae <chang.seok.bae@intel.com>
|
|
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
|
|
Reviewed-by: Andy Lutomirski <luto@kernel.org>
|
|
Reviewed-by: Andi Kleen <ak@linux.intel.com>
|
|
Cc: Ravi Shankar <ravi.v.shankar@intel.com>
|
|
Cc: H. Peter Anvin <hpa@zytor.com>
|
|
Link: https://lkml.kernel.org/r/1557309753-24073-6-git-send-email-chang.seok.bae@intel.com
|
|
Signed-off-by: Sasha Levin <sashal@kernel.org>
|
|
---
|
|
arch/x86/include/asm/fsgsbase.h | 30 ++++++++++++++++++++++++++++++
|
|
1 file changed, 30 insertions(+)
|
|
|
|
diff --git a/arch/x86/include/asm/fsgsbase.h b/arch/x86/include/asm/fsgsbase.h
|
|
index bca4c743d..fdd117749 100644
|
|
--- a/arch/x86/include/asm/fsgsbase.h
|
|
+++ b/arch/x86/include/asm/fsgsbase.h
|
|
@@ -19,6 +19,36 @@ extern unsigned long x86_gsbase_read_task(struct task_struct *task);
|
|
extern void x86_fsbase_write_task(struct task_struct *task, unsigned long fsbase);
|
|
extern void x86_gsbase_write_task(struct task_struct *task, unsigned long gsbase);
|
|
|
|
+/* Must be protected by X86_FEATURE_FSGSBASE check. */
|
|
+
|
|
+static __always_inline unsigned long rdfsbase(void)
|
|
+{
|
|
+ unsigned long fsbase;
|
|
+
|
|
+ asm volatile("rdfsbase %0" : "=r" (fsbase) :: "memory");
|
|
+
|
|
+ return fsbase;
|
|
+}
|
|
+
|
|
+static __always_inline unsigned long rdgsbase(void)
|
|
+{
|
|
+ unsigned long gsbase;
|
|
+
|
|
+ asm volatile("rdgsbase %0" : "=r" (gsbase) :: "memory");
|
|
+
|
|
+ return gsbase;
|
|
+}
|
|
+
|
|
+static __always_inline void wrfsbase(unsigned long fsbase)
|
|
+{
|
|
+ asm volatile("wrfsbase %0" :: "r" (fsbase) : "memory");
|
|
+}
|
|
+
|
|
+static __always_inline void wrgsbase(unsigned long gsbase)
|
|
+{
|
|
+ asm volatile("wrgsbase %0" :: "r" (gsbase) : "memory");
|
|
+}
|
|
+
|
|
/* Helper functions for reading/writing FS/GS base */
|
|
|
|
static inline unsigned long x86_fsbase_read_cpu(void)
|
|
--
|
|
2.27.0.112.g101b3204f3
|
|
|
|
|
|
From c1416bba2416d252b4077f7722a4cde35f4a4a8e Mon Sep 17 00:00:00 2001
|
|
From: "Chang S. Bae" <chang.seok.bae@intel.com>
|
|
Date: Thu, 28 May 2020 16:13:50 -0400
|
|
Subject: [PATCH 04/21] x86/fsgsbase/64: Enable FSGSBASE instructions in helper
|
|
functions
|
|
|
|
Add cpu feature conditional FSGSBASE access to the relevant helper
|
|
functions. That allows to accelerate certain FS/GS base operations in
|
|
subsequent changes.
|
|
|
|
Note, that while possible, the user space entry/exit GSBASE operations are
|
|
not going to use the new FSGSBASE instructions. The reason is that it would
|
|
require additional storage for the user space value which adds more
|
|
complexity to the low level code and experiments have shown marginal
|
|
benefit. This may be revisited later but for now the SWAPGS based handling
|
|
in the entry code is preserved except for the paranoid entry/exit code.
|
|
|
|
To preserve the SWAPGS entry mechanism introduce __[rd|wr]gsbase_inactive()
|
|
helpers. Note, for Xen PV, paravirt hooks can be added later as they might
|
|
allow a very efficient but different implementation.
|
|
|
|
[ tglx: Massaged changelog ]
|
|
|
|
Signed-off-by: Chang S. Bae <chang.seok.bae@intel.com>
|
|
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
|
|
Cc: Andy Lutomirski <luto@kernel.org>
|
|
Cc: Andi Kleen <ak@linux.intel.com>
|
|
Cc: Ravi Shankar <ravi.v.shankar@intel.com>
|
|
Cc: Andrew Cooper <andrew.cooper3@citrix.com>
|
|
Cc: H. Peter Anvin <hpa@zytor.com>
|
|
Link: https://lkml.kernel.org/r/1557309753-24073-7-git-send-email-chang.seok.bae@intel.com
|
|
Signed-off-by: Sasha Levin <sashal@kernel.org>
|
|
---
|
|
arch/x86/include/asm/fsgsbase.h | 27 +++++++-------
|
|
arch/x86/kernel/process_64.c | 64 +++++++++++++++++++++++++++++++++
|
|
2 files changed, 76 insertions(+), 15 deletions(-)
|
|
|
|
diff --git a/arch/x86/include/asm/fsgsbase.h b/arch/x86/include/asm/fsgsbase.h
|
|
index fdd117749..aefd53767 100644
|
|
--- a/arch/x86/include/asm/fsgsbase.h
|
|
+++ b/arch/x86/include/asm/fsgsbase.h
|
|
@@ -49,35 +49,32 @@ static __always_inline void wrgsbase(unsigned long gsbase)
|
|
asm volatile("wrgsbase %0" :: "r" (gsbase) : "memory");
|
|
}
|
|
|
|
+#include <asm/cpufeature.h>
|
|
+
|
|
/* Helper functions for reading/writing FS/GS base */
|
|
|
|
static inline unsigned long x86_fsbase_read_cpu(void)
|
|
{
|
|
unsigned long fsbase;
|
|
|
|
- rdmsrl(MSR_FS_BASE, fsbase);
|
|
+ if (static_cpu_has(X86_FEATURE_FSGSBASE))
|
|
+ fsbase = rdfsbase();
|
|
+ else
|
|
+ rdmsrl(MSR_FS_BASE, fsbase);
|
|
|
|
return fsbase;
|
|
}
|
|
|
|
-static inline unsigned long x86_gsbase_read_cpu_inactive(void)
|
|
-{
|
|
- unsigned long gsbase;
|
|
-
|
|
- rdmsrl(MSR_KERNEL_GS_BASE, gsbase);
|
|
-
|
|
- return gsbase;
|
|
-}
|
|
-
|
|
static inline void x86_fsbase_write_cpu(unsigned long fsbase)
|
|
{
|
|
- wrmsrl(MSR_FS_BASE, fsbase);
|
|
+ if (static_cpu_has(X86_FEATURE_FSGSBASE))
|
|
+ wrfsbase(fsbase);
|
|
+ else
|
|
+ wrmsrl(MSR_FS_BASE, fsbase);
|
|
}
|
|
|
|
-static inline void x86_gsbase_write_cpu_inactive(unsigned long gsbase)
|
|
-{
|
|
- wrmsrl(MSR_KERNEL_GS_BASE, gsbase);
|
|
-}
|
|
+extern unsigned long x86_gsbase_read_cpu_inactive(void);
|
|
+extern void x86_gsbase_write_cpu_inactive(unsigned long gsbase);
|
|
|
|
#endif /* CONFIG_X86_64 */
|
|
|
|
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
|
|
index 5ef9d8f25..062468982 100644
|
|
--- a/arch/x86/kernel/process_64.c
|
|
+++ b/arch/x86/kernel/process_64.c
|
|
@@ -150,6 +150,40 @@ enum which_selector {
|
|
GS
|
|
};
|
|
|
|
+/*
|
|
+ * Out of line to be protected from kprobes. It is not used on Xen
|
|
+ * paravirt. When paravirt support is needed, it needs to be renamed
|
|
+ * with native_ prefix.
|
|
+ */
|
|
+static noinline unsigned long __rdgsbase_inactive(void)
|
|
+{
|
|
+ unsigned long gsbase;
|
|
+
|
|
+ lockdep_assert_irqs_disabled();
|
|
+
|
|
+ native_swapgs();
|
|
+ gsbase = rdgsbase();
|
|
+ native_swapgs();
|
|
+
|
|
+ return gsbase;
|
|
+}
|
|
+NOKPROBE_SYMBOL(__rdgsbase_inactive);
|
|
+
|
|
+/*
|
|
+ * Out of line to be protected from kprobes. It is not used on Xen
|
|
+ * paravirt. When paravirt support is needed, it needs to be renamed
|
|
+ * with native_ prefix.
|
|
+ */
|
|
+static noinline void __wrgsbase_inactive(unsigned long gsbase)
|
|
+{
|
|
+ lockdep_assert_irqs_disabled();
|
|
+
|
|
+ native_swapgs();
|
|
+ wrgsbase(gsbase);
|
|
+ native_swapgs();
|
|
+}
|
|
+NOKPROBE_SYMBOL(__wrgsbase_inactive);
|
|
+
|
|
/*
|
|
* Saves the FS or GS base for an outgoing thread if FSGSBASE extensions are
|
|
* not available. The goal is to be reasonably fast on non-FSGSBASE systems.
|
|
@@ -328,6 +362,36 @@ static unsigned long x86_fsgsbase_read_task(struct task_struct *task,
|
|
return base;
|
|
}
|
|
|
|
+unsigned long x86_gsbase_read_cpu_inactive(void)
|
|
+{
|
|
+ unsigned long gsbase;
|
|
+
|
|
+ if (static_cpu_has(X86_FEATURE_FSGSBASE)) {
|
|
+ unsigned long flags;
|
|
+
|
|
+ local_irq_save(flags);
|
|
+ gsbase = __rdgsbase_inactive();
|
|
+ local_irq_restore(flags);
|
|
+ } else {
|
|
+ rdmsrl(MSR_KERNEL_GS_BASE, gsbase);
|
|
+ }
|
|
+
|
|
+ return gsbase;
|
|
+}
|
|
+
|
|
+void x86_gsbase_write_cpu_inactive(unsigned long gsbase)
|
|
+{
|
|
+ if (static_cpu_has(X86_FEATURE_FSGSBASE)) {
|
|
+ unsigned long flags;
|
|
+
|
|
+ local_irq_save(flags);
|
|
+ __wrgsbase_inactive(gsbase);
|
|
+ local_irq_restore(flags);
|
|
+ } else {
|
|
+ wrmsrl(MSR_KERNEL_GS_BASE, gsbase);
|
|
+ }
|
|
+}
|
|
+
|
|
unsigned long x86_fsbase_read_task(struct task_struct *task)
|
|
{
|
|
unsigned long fsbase;
|
|
--
|
|
2.27.0.112.g101b3204f3
|
|
|
|
|
|
From c347037e60e92afacafcbb4b17242d0e6fff2e7c Mon Sep 17 00:00:00 2001
|
|
From: Andy Lutomirski <luto@kernel.org>
|
|
Date: Thu, 28 May 2020 16:13:51 -0400
|
|
Subject: [PATCH 05/21] x86/process/64: Use FSBSBASE in switch_to() if
|
|
available
|
|
|
|
With the new FSGSBASE instructions, FS and GSABSE can be efficiently read
|
|
and writen in __switch_to(). Use that capability to preserve the full
|
|
state.
|
|
|
|
This will enable user code to do whatever it wants with the new
|
|
instructions without any kernel-induced gotchas. (There can still be
|
|
architectural gotchas: movl %gs,%eax; movl %eax,%gs may change GSBASE if
|
|
WRGSBASE was used, but users are expected to read the CPU manual before
|
|
doing things like that.)
|
|
|
|
This is a considerable speedup. It seems to save about 100 cycles
|
|
per context switch compared to the baseline 4.6-rc1 behavior on a
|
|
Skylake laptop.
|
|
|
|
[ chang: 5~10% performance improvements were seen with a context switch
|
|
benchmark that ran threads with different FS/GSBASE values (to the
|
|
baseline 4.16). Minor edit on the changelog. ]
|
|
|
|
[ tglx: Masaage changelog ]
|
|
|
|
Signed-off-by: Andy Lutomirski <luto@kernel.org>
|
|
Signed-off-by: Chang S. Bae <chang.seok.bae@intel.com>
|
|
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
|
|
Reviewed-by: Andi Kleen <ak@linux.intel.com>
|
|
Cc: Ravi Shankar <ravi.v.shankar@intel.com>
|
|
Cc: H. Peter Anvin <hpa@zytor.com>
|
|
Link: https://lkml.kernel.org/r/1557309753-24073-8-git-send-email-chang.seok.bae@intel.com
|
|
Signed-off-by: Sasha Levin <sashal@kernel.org>
|
|
---
|
|
arch/x86/kernel/process_64.c | 34 ++++++++++++++++++++++++++++------
|
|
1 file changed, 28 insertions(+), 6 deletions(-)
|
|
|
|
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
|
|
index 062468982..85c7f9cab 100644
|
|
--- a/arch/x86/kernel/process_64.c
|
|
+++ b/arch/x86/kernel/process_64.c
|
|
@@ -233,8 +233,18 @@ static __always_inline void save_fsgs(struct task_struct *task)
|
|
{
|
|
savesegment(fs, task->thread.fsindex);
|
|
savesegment(gs, task->thread.gsindex);
|
|
- save_base_legacy(task, task->thread.fsindex, FS);
|
|
- save_base_legacy(task, task->thread.gsindex, GS);
|
|
+ if (static_cpu_has(X86_FEATURE_FSGSBASE)) {
|
|
+ /*
|
|
+ * If FSGSBASE is enabled, we can't make any useful guesses
|
|
+ * about the base, and user code expects us to save the current
|
|
+ * value. Fortunately, reading the base directly is efficient.
|
|
+ */
|
|
+ task->thread.fsbase = rdfsbase();
|
|
+ task->thread.gsbase = __rdgsbase_inactive();
|
|
+ } else {
|
|
+ save_base_legacy(task, task->thread.fsindex, FS);
|
|
+ save_base_legacy(task, task->thread.gsindex, GS);
|
|
+ }
|
|
}
|
|
|
|
#if IS_ENABLED(CONFIG_KVM)
|
|
@@ -313,10 +323,22 @@ static __always_inline void load_seg_legacy(unsigned short prev_index,
|
|
static __always_inline void x86_fsgsbase_load(struct thread_struct *prev,
|
|
struct thread_struct *next)
|
|
{
|
|
- load_seg_legacy(prev->fsindex, prev->fsbase,
|
|
- next->fsindex, next->fsbase, FS);
|
|
- load_seg_legacy(prev->gsindex, prev->gsbase,
|
|
- next->gsindex, next->gsbase, GS);
|
|
+ if (static_cpu_has(X86_FEATURE_FSGSBASE)) {
|
|
+ /* Update the FS and GS selectors if they could have changed. */
|
|
+ if (unlikely(prev->fsindex || next->fsindex))
|
|
+ loadseg(FS, next->fsindex);
|
|
+ if (unlikely(prev->gsindex || next->gsindex))
|
|
+ loadseg(GS, next->gsindex);
|
|
+
|
|
+ /* Update the bases. */
|
|
+ wrfsbase(next->fsbase);
|
|
+ __wrgsbase_inactive(next->gsbase);
|
|
+ } else {
|
|
+ load_seg_legacy(prev->fsindex, prev->fsbase,
|
|
+ next->fsindex, next->fsbase, FS);
|
|
+ load_seg_legacy(prev->gsindex, prev->gsbase,
|
|
+ next->gsindex, next->gsbase, GS);
|
|
+ }
|
|
}
|
|
|
|
static unsigned long x86_fsgsbase_read_task(struct task_struct *task,
|
|
--
|
|
2.27.0.112.g101b3204f3
|
|
|
|
|
|
From 3162620086f8b40ff0559d098d23eeee5a75f341 Mon Sep 17 00:00:00 2001
|
|
From: Thomas Gleixner <tglx@linutronix.de>
|
|
Date: Thu, 28 May 2020 16:13:52 -0400
|
|
Subject: [PATCH 06/21] x86/process/64: Make save_fsgs() public available
|
|
|
|
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
|
|
Signed-off-by: Sasha Levin <sashal@kernel.org>
|
|
---
|
|
arch/x86/include/asm/processor.h | 4 +---
|
|
arch/x86/kernel/process_64.c | 15 +++++++++------
|
|
arch/x86/kvm/vmx/vmx.c | 2 +-
|
|
3 files changed, 11 insertions(+), 10 deletions(-)
|
|
|
|
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
|
|
index 3bcf27caf..809bc013d 100644
|
|
--- a/arch/x86/include/asm/processor.h
|
|
+++ b/arch/x86/include/asm/processor.h
|
|
@@ -456,10 +456,8 @@ static inline unsigned long cpu_kernelmode_gs_base(int cpu)
|
|
DECLARE_PER_CPU(unsigned int, irq_count);
|
|
extern asmlinkage void ignore_sysret(void);
|
|
|
|
-#if IS_ENABLED(CONFIG_KVM)
|
|
/* Save actual FS/GS selectors and bases to current->thread */
|
|
-void save_fsgs_for_kvm(void);
|
|
-#endif
|
|
+void current_save_fsgs(void);
|
|
#else /* X86_64 */
|
|
#ifdef CONFIG_STACKPROTECTOR
|
|
/*
|
|
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
|
|
index 85c7f9cab..aefb30bc5 100644
|
|
--- a/arch/x86/kernel/process_64.c
|
|
+++ b/arch/x86/kernel/process_64.c
|
|
@@ -247,18 +247,21 @@ static __always_inline void save_fsgs(struct task_struct *task)
|
|
}
|
|
}
|
|
|
|
-#if IS_ENABLED(CONFIG_KVM)
|
|
/*
|
|
* While a process is running,current->thread.fsbase and current->thread.gsbase
|
|
- * may not match the corresponding CPU registers (see save_base_legacy()). KVM
|
|
- * wants an efficient way to save and restore FSBASE and GSBASE.
|
|
- * When FSGSBASE extensions are enabled, this will have to use RD{FS,GS}BASE.
|
|
+ * may not match the corresponding CPU registers (see save_base_legacy()).
|
|
*/
|
|
-void save_fsgs_for_kvm(void)
|
|
+void current_save_fsgs(void)
|
|
{
|
|
+ unsigned long flags;
|
|
+
|
|
+ /* Interrupts need to be off for FSGSBASE */
|
|
+ local_irq_save(flags);
|
|
save_fsgs(current);
|
|
+ local_irq_restore(flags);
|
|
}
|
|
-EXPORT_SYMBOL_GPL(save_fsgs_for_kvm);
|
|
+#if IS_ENABLED(CONFIG_KVM)
|
|
+EXPORT_SYMBOL_GPL(current_save_fsgs);
|
|
#endif
|
|
|
|
static __always_inline void loadseg(enum which_selector which,
|
|
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
|
|
index d7aa0dfab..430774bff 100644
|
|
--- a/arch/x86/kvm/vmx/vmx.c
|
|
+++ b/arch/x86/kvm/vmx/vmx.c
|
|
@@ -1167,7 +1167,7 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
|
|
|
|
gs_base = cpu_kernelmode_gs_base(cpu);
|
|
if (likely(is_64bit_mm(current->mm))) {
|
|
- save_fsgs_for_kvm();
|
|
+ current_save_fsgs();
|
|
fs_sel = current->thread.fsindex;
|
|
gs_sel = current->thread.gsindex;
|
|
fs_base = current->thread.fsbase;
|
|
--
|
|
2.27.0.112.g101b3204f3
|
|
|
|
|
|
From f91977e48cbdb921d6cd6768fd9701643aa3d6cd Mon Sep 17 00:00:00 2001
|
|
From: "Chang S. Bae" <chang.seok.bae@intel.com>
|
|
Date: Thu, 28 May 2020 16:13:53 -0400
|
|
Subject: [PATCH 07/21] x86/process/64: Use FSGSBASE instructions on thread
|
|
copy and ptrace
|
|
|
|
When FSGSBASE is enabled, copying threads and reading fsbase and gsbase
|
|
using ptrace must read the actual values.
|
|
|
|
When copying a thread, use save_fsgs() and copy the saved values. For
|
|
ptrace, the bases must be read from memory regardless of the selector if
|
|
FSGSBASE is enabled.
|
|
|
|
[ tglx: Invoke __rdgsbase_inactive() with interrupts disabled ]
|
|
[ luto: Massage changelog ]
|
|
|
|
Suggested-by: Andy Lutomirski <luto@kernel.org>
|
|
Signed-off-by: Chang S. Bae <chang.seok.bae@intel.com>
|
|
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
|
|
Cc: "H . Peter Anvin" <hpa@zytor.com>
|
|
Cc: Andi Kleen <ak@linux.intel.com>
|
|
Cc: Ravi Shankar <ravi.v.shankar@intel.com>
|
|
Cc: H. Peter Anvin <hpa@zytor.com>
|
|
Link: https://lkml.kernel.org/r/1557309753-24073-9-git-send-email-chang.seok.bae@intel.com
|
|
Signed-off-by: Sasha Levin <sashal@kernel.org>
|
|
---
|
|
arch/x86/kernel/process.c | 10 ++++++----
|
|
arch/x86/kernel/process_64.c | 6 ++++--
|
|
2 files changed, 10 insertions(+), 6 deletions(-)
|
|
|
|
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
|
|
index 8f4533c1a..c34924d91 100644
|
|
--- a/arch/x86/kernel/process.c
|
|
+++ b/arch/x86/kernel/process.c
|
|
@@ -140,10 +140,12 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
|
|
memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
|
|
|
|
#ifdef CONFIG_X86_64
|
|
- savesegment(gs, p->thread.gsindex);
|
|
- p->thread.gsbase = p->thread.gsindex ? 0 : current->thread.gsbase;
|
|
- savesegment(fs, p->thread.fsindex);
|
|
- p->thread.fsbase = p->thread.fsindex ? 0 : current->thread.fsbase;
|
|
+ current_save_fsgs();
|
|
+ p->thread.fsindex = current->thread.fsindex;
|
|
+ p->thread.fsbase = current->thread.fsbase;
|
|
+ p->thread.gsindex = current->thread.gsindex;
|
|
+ p->thread.gsbase = current->thread.gsbase;
|
|
+
|
|
savesegment(es, p->thread.es);
|
|
savesegment(ds, p->thread.ds);
|
|
#else
|
|
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
|
|
index aefb30bc5..0bcb48a12 100644
|
|
--- a/arch/x86/kernel/process_64.c
|
|
+++ b/arch/x86/kernel/process_64.c
|
|
@@ -423,7 +423,8 @@ unsigned long x86_fsbase_read_task(struct task_struct *task)
|
|
|
|
if (task == current)
|
|
fsbase = x86_fsbase_read_cpu();
|
|
- else if (task->thread.fsindex == 0)
|
|
+ else if (static_cpu_has(X86_FEATURE_FSGSBASE) ||
|
|
+ (task->thread.fsindex == 0))
|
|
fsbase = task->thread.fsbase;
|
|
else
|
|
fsbase = x86_fsgsbase_read_task(task, task->thread.fsindex);
|
|
@@ -437,7 +438,8 @@ unsigned long x86_gsbase_read_task(struct task_struct *task)
|
|
|
|
if (task == current)
|
|
gsbase = x86_gsbase_read_cpu_inactive();
|
|
- else if (task->thread.gsindex == 0)
|
|
+ else if (static_cpu_has(X86_FEATURE_FSGSBASE) ||
|
|
+ (task->thread.gsindex == 0))
|
|
gsbase = task->thread.gsbase;
|
|
else
|
|
gsbase = x86_fsgsbase_read_task(task, task->thread.gsindex);
|
|
--
|
|
2.27.0.112.g101b3204f3
|
|
|
|
|
|
From aaa66b23a4968a6828236e3bebd65f8cf6cc075e Mon Sep 17 00:00:00 2001
|
|
From: Tony Luck <tony.luck@intel.com>
|
|
Date: Thu, 28 May 2020 16:13:54 -0400
|
|
Subject: [PATCH 08/21] x86/speculation/swapgs: Check FSGSBASE in enabling
|
|
SWAPGS mitigation
|
|
|
|
Before enabling FSGSBASE the kernel could safely assume that the content
|
|
of GS base was a user address. Thus any speculative access as the result
|
|
of a mispredicted branch controlling the execution of SWAPGS would be to
|
|
a user address. So systems with speculation-proof SMAP did not need to
|
|
add additional LFENCE instructions to mitigate.
|
|
|
|
With FSGSBASE enabled a hostile user can set GS base to a kernel address.
|
|
So they can make the kernel speculatively access data they wish to leak
|
|
via a side channel. This means that SMAP provides no protection.
|
|
|
|
Add FSGSBASE as an additional condition to enable the fence-based SWAPGS
|
|
mitigation.
|
|
|
|
Signed-off-by: Tony Luck <tony.luck@intel.com>
|
|
Signed-off-by: Chang S. Bae <chang.seok.bae@intel.com>
|
|
Cc: Thomas Gleixner <tglx@linutronix.de>
|
|
Cc: Borislav Petkov <bp@alien8.de>
|
|
Cc: Andy Lutomirski <luto@kernel.org>
|
|
Cc: H. Peter Anvin <hpa@zytor.com>
|
|
Cc: Dave Hansen <dave.hansen@intel.com>
|
|
Cc: Tony Luck <tony.luck@intel.com>
|
|
Cc: Andi Kleen <ak@linux.intel.com>
|
|
Signed-off-by: Sasha Levin <sashal@kernel.org>
|
|
---
|
|
arch/x86/kernel/cpu/bugs.c | 6 ++----
|
|
1 file changed, 2 insertions(+), 4 deletions(-)
|
|
|
|
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
|
|
index b53dcff21..b3004dda4 100644
|
|
--- a/arch/x86/kernel/cpu/bugs.c
|
|
+++ b/arch/x86/kernel/cpu/bugs.c
|
|
@@ -543,14 +543,12 @@ static void __init spectre_v1_select_mitigation(void)
|
|
* If FSGSBASE is enabled, the user can put a kernel address in
|
|
* GS, in which case SMAP provides no protection.
|
|
*
|
|
- * [ NOTE: Don't check for X86_FEATURE_FSGSBASE until the
|
|
- * FSGSBASE enablement patches have been merged. ]
|
|
- *
|
|
* If FSGSBASE is disabled, the user can only put a user space
|
|
* address in GS. That makes an attack harder, but still
|
|
* possible if there's no SMAP protection.
|
|
*/
|
|
- if (!smap_works_speculatively()) {
|
|
+ if (boot_cpu_has(X86_FEATURE_FSGSBASE) ||
|
|
+ !smap_works_speculatively()) {
|
|
/*
|
|
* Mitigation can be provided from SWAPGS itself or
|
|
* PTI as the CR3 write in the Meltdown mitigation
|
|
--
|
|
2.27.0.112.g101b3204f3
|
|
|
|
|
|
From 900b2defa0185481b6f8a05b01ddea8844249547 Mon Sep 17 00:00:00 2001
|
|
From: "Chang S. Bae" <chang.seok.bae@intel.com>
|
|
Date: Thu, 28 May 2020 16:13:55 -0400
|
|
Subject: [PATCH 09/21] x86/entry/64: Switch CR3 before SWAPGS in paranoid
|
|
entry
|
|
|
|
When FSGSBASE is enabled, the GSBASE handling in paranoid entry will need
|
|
to retrieve the kernel GSBASE which requires that the kernel page table is
|
|
active.
|
|
|
|
As the CR3 switch to the kernel page tables (PTI is active) does not depend
|
|
on kernel GSBASE, move the CR3 switch in front of the GSBASE handling.
|
|
|
|
Comment the EBX content while at it.
|
|
|
|
No functional change.
|
|
|
|
[ tglx: Rewrote changelog and comments ]
|
|
|
|
Signed-off-by: Chang S. Bae <chang.seok.bae@intel.com>
|
|
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
|
|
Cc: Andy Lutomirski <luto@kernel.org>
|
|
Cc: "H . Peter Anvin" <hpa@zytor.com>
|
|
Cc: Andi Kleen <ak@linux.intel.com>
|
|
Cc: Ravi Shankar <ravi.v.shankar@intel.com>
|
|
Cc: Dave Hansen <dave.hansen@linux.intel.com>
|
|
Cc: H. Peter Anvin <hpa@zytor.com>
|
|
Link: https://lkml.kernel.org/r/1557309753-24073-11-git-send-email-chang.seok.bae@intel.com
|
|
Signed-off-by: Sasha Levin <sashal@kernel.org>
|
|
---
|
|
arch/x86/entry/entry_64.S | 32 ++++++++++++++++++++++++--------
|
|
1 file changed, 24 insertions(+), 8 deletions(-)
|
|
|
|
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
|
|
index 3063aa909..3b9ccba6c 100644
|
|
--- a/arch/x86/entry/entry_64.S
|
|
+++ b/arch/x86/entry/entry_64.S
|
|
@@ -1220,13 +1220,6 @@ SYM_CODE_START_LOCAL(paranoid_entry)
|
|
cld
|
|
PUSH_AND_CLEAR_REGS save_ret=1
|
|
ENCODE_FRAME_POINTER 8
|
|
- movl $1, %ebx
|
|
- movl $MSR_GS_BASE, %ecx
|
|
- rdmsr
|
|
- testl %edx, %edx
|
|
- js 1f /* negative -> in kernel */
|
|
- SWAPGS
|
|
- xorl %ebx, %ebx
|
|
|
|
1:
|
|
/*
|
|
@@ -1238,9 +1231,29 @@ SYM_CODE_START_LOCAL(paranoid_entry)
|
|
* This is also why CS (stashed in the "iret frame" by the
|
|
* hardware at entry) can not be used: this may be a return
|
|
* to kernel code, but with a user CR3 value.
|
|
+ *
|
|
+ * Switching CR3 does not depend on kernel GSBASE so it can
|
|
+ * be done before switching to the kernel GSBASE. This is
|
|
+ * required for FSGSBASE because the kernel GSBASE has to
|
|
+ * be retrieved from a kernel internal table.
|
|
*/
|
|
SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14
|
|
|
|
+ /* EBX = 1 -> kernel GSBASE active, no restore required */
|
|
+ movl $1, %ebx
|
|
+ /*
|
|
+ * The kernel-enforced convention is a negative GSBASE indicates
|
|
+ * a kernel value. No SWAPGS needed on entry and exit.
|
|
+ */
|
|
+ movl $MSR_GS_BASE, %ecx
|
|
+ rdmsr
|
|
+ testl %edx, %edx
|
|
+ jns .Lparanoid_entry_swapgs
|
|
+ ret
|
|
+
|
|
+.Lparanoid_entry_swapgs:
|
|
+ SWAPGS
|
|
+
|
|
/*
|
|
* The above SAVE_AND_SWITCH_TO_KERNEL_CR3 macro doesn't do an
|
|
* unconditional CR3 write, even in the PTI case. So do an lfence
|
|
@@ -1248,6 +1261,8 @@ SYM_CODE_START_LOCAL(paranoid_entry)
|
|
*/
|
|
FENCE_SWAPGS_KERNEL_ENTRY
|
|
|
|
+ /* EBX = 0 -> SWAPGS required on exit */
|
|
+ xorl %ebx, %ebx
|
|
ret
|
|
SYM_CODE_END(paranoid_entry)
|
|
|
|
@@ -1267,7 +1282,8 @@ SYM_CODE_START_LOCAL(paranoid_exit)
|
|
UNWIND_HINT_REGS
|
|
DISABLE_INTERRUPTS(CLBR_ANY)
|
|
TRACE_IRQS_OFF_DEBUG
|
|
- testl %ebx, %ebx /* swapgs needed? */
|
|
+ /* If EBX is 0, SWAPGS is required */
|
|
+ testl %ebx, %ebx
|
|
jnz .Lparanoid_exit_no_swapgs
|
|
TRACE_IRQS_IRETQ
|
|
/* Always restore stashed CR3 value (see paranoid_entry) */
|
|
--
|
|
2.27.0.112.g101b3204f3
|
|
|
|
|
|
From bee58bcdc0ee08fedc95a80bf3bd883247e72a83 Mon Sep 17 00:00:00 2001
|
|
From: "Chang S. Bae" <chang.seok.bae@intel.com>
|
|
Date: Thu, 28 May 2020 16:13:56 -0400
|
|
Subject: [PATCH 10/21] x86/entry/64: Introduce the FIND_PERCPU_BASE macro
|
|
|
|
GSBASE is used to find per-CPU data in the kernel. But when GSBASE is
|
|
unknown, the per-CPU base can be found from the per_cpu_offset table with a
|
|
CPU NR. The CPU NR is extracted from the limit field of the CPUNODE entry
|
|
in GDT, or by the RDPID instruction. This is a prerequisite for using
|
|
FSGSBASE in the low level entry code.
|
|
|
|
Also, add the GAS-compatible RDPID macro as binutils 2.21 do not support
|
|
it. Support is added in version 2.27.
|
|
|
|
[ tglx: Massaged changelog ]
|
|
|
|
Suggested-by: H. Peter Anvin <hpa@zytor.com>
|
|
Signed-off-by: Chang S. Bae <chang.seok.bae@intel.com>
|
|
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
|
|
Cc: Andy Lutomirski <luto@kernel.org>
|
|
Cc: Andi Kleen <ak@linux.intel.com>
|
|
Cc: Ravi Shankar <ravi.v.shankar@intel.com>
|
|
Cc: Dave Hansen <dave.hansen@linux.intel.com>
|
|
Link: https://lkml.kernel.org/r/1557309753-24073-12-git-send-email-chang.seok.bae@intel.com
|
|
Signed-off-by: Sasha Levin <sashal@kernel.org>
|
|
---
|
|
arch/x86/entry/calling.h | 34 ++++++++++++++++++++++++++++++++++
|
|
arch/x86/include/asm/inst.h | 15 +++++++++++++++
|
|
2 files changed, 49 insertions(+)
|
|
|
|
diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
|
|
index 1c7f13bb6..1b95cdc19 100644
|
|
--- a/arch/x86/entry/calling.h
|
|
+++ b/arch/x86/entry/calling.h
|
|
@@ -6,6 +6,7 @@
|
|
#include <asm/percpu.h>
|
|
#include <asm/asm-offsets.h>
|
|
#include <asm/processor-flags.h>
|
|
+#include <asm/inst.h>
|
|
|
|
/*
|
|
|
|
@@ -349,6 +350,39 @@ For 32-bit we have the following conventions - kernel is built with
|
|
#endif
|
|
.endm
|
|
|
|
+#ifdef CONFIG_SMP
|
|
+
|
|
+/*
|
|
+ * CPU/node NR is loaded from the limit (size) field of a special segment
|
|
+ * descriptor entry in GDT.
|
|
+ */
|
|
+.macro LOAD_CPU_AND_NODE_SEG_LIMIT reg:req
|
|
+ movq $__CPUNODE_SEG, \reg
|
|
+ lsl \reg, \reg
|
|
+.endm
|
|
+
|
|
+/*
|
|
+ * Fetch the per-CPU GSBASE value for this processor and put it in @reg.
|
|
+ * We normally use %gs for accessing per-CPU data, but we are setting up
|
|
+ * %gs here and obviously can not use %gs itself to access per-CPU data.
|
|
+ */
|
|
+.macro GET_PERCPU_BASE reg:req
|
|
+ ALTERNATIVE \
|
|
+ "LOAD_CPU_AND_NODE_SEG_LIMIT \reg", \
|
|
+ "RDPID \reg", \
|
|
+ X86_FEATURE_RDPID
|
|
+ andq $VDSO_CPUNODE_MASK, \reg
|
|
+ movq __per_cpu_offset(, \reg, 8), \reg
|
|
+.endm
|
|
+
|
|
+#else
|
|
+
|
|
+.macro GET_PERCPU_BASE reg:req
|
|
+ movq pcpu_unit_offsets(%rip), \reg
|
|
+.endm
|
|
+
|
|
+#endif /* CONFIG_SMP */
|
|
+
|
|
/*
|
|
* This does 'call enter_from_user_mode' unless we can avoid it based on
|
|
* kernel config or using the static jump infrastructure.
|
|
diff --git a/arch/x86/include/asm/inst.h b/arch/x86/include/asm/inst.h
|
|
index f5a796da0..d063841a1 100644
|
|
--- a/arch/x86/include/asm/inst.h
|
|
+++ b/arch/x86/include/asm/inst.h
|
|
@@ -306,6 +306,21 @@
|
|
.endif
|
|
MODRM 0xc0 movq_r64_xmm_opd1 movq_r64_xmm_opd2
|
|
.endm
|
|
+
|
|
+.macro RDPID opd
|
|
+ REG_TYPE rdpid_opd_type \opd
|
|
+ .if rdpid_opd_type == REG_TYPE_R64
|
|
+ R64_NUM rdpid_opd \opd
|
|
+ .else
|
|
+ R32_NUM rdpid_opd \opd
|
|
+ .endif
|
|
+ .byte 0xf3
|
|
+ .if rdpid_opd > 7
|
|
+ PFX_REX rdpid_opd 0
|
|
+ .endif
|
|
+ .byte 0x0f, 0xc7
|
|
+ MODRM 0xc0 rdpid_opd 0x7
|
|
+.endm
|
|
#endif
|
|
|
|
#endif
|
|
--
|
|
2.27.0.112.g101b3204f3
|
|
|
|
|
|
From 9326ff1d674fbd2f9bba99a4658ff462b986a7c9 Mon Sep 17 00:00:00 2001
|
|
From: "Chang S. Bae" <chang.seok.bae@intel.com>
|
|
Date: Thu, 28 May 2020 16:13:57 -0400
|
|
Subject: [PATCH 11/21] x86/entry/64: Handle FSGSBASE enabled paranoid
|
|
entry/exit
|
|
|
|
Without FSGSBASE, user space cannot change GSBASE other than through a
|
|
PRCTL. The kernel enforces that the user space GSBASE value is positive as
|
|
negative values are used for detecting the kernel space GSBASE value in the
|
|
paranoid entry code.
|
|
|
|
If FSGSBASE is enabled, user space can set arbitrary GSBASE values without
|
|
kernel intervention, including negative ones, which breaks the paranoid
|
|
entry assumptions.
|
|
|
|
To avoid this, paranoid entry needs to unconditionally save the current
|
|
GSBASE value independent of the interrupted context, retrieve and write the
|
|
kernel GSBASE and unconditionally restore the saved value on exit. The
|
|
restore happens either in paranoid_exit or in the special exit path of the
|
|
NMI low level code.
|
|
|
|
All other entry code paths which use unconditional SWAPGS are not affected
|
|
as they do not depend on the actual content.
|
|
|
|
[ tglx: Massaged changelogs and comments ]
|
|
|
|
Suggested-by: H. Peter Anvin <hpa@zytor.com>
|
|
Suggested-by: Andy Lutomirski <luto@kernel.org>
|
|
Suggested-by: Thomas Gleixner <tglx@linutronix.de>
|
|
Signed-off-by: Chang S. Bae <chang.seok.bae@intel.com>
|
|
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
|
|
Cc: Andi Kleen <ak@linux.intel.com>
|
|
Cc: Ravi Shankar <ravi.v.shankar@intel.com>
|
|
Cc: Dave Hansen <dave.hansen@linux.intel.com>
|
|
Link: https://lkml.kernel.org/r/1557309753-24073-13-git-send-email-chang.seok.bae@intel.com
|
|
Signed-off-by: Sasha Levin <sashal@kernel.org>
|
|
---
|
|
arch/x86/entry/calling.h | 6 +++
|
|
arch/x86/entry/entry_64.S | 107 ++++++++++++++++++++++++++++++--------
|
|
2 files changed, 91 insertions(+), 22 deletions(-)
|
|
|
|
diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
|
|
index 1b95cdc19..57335f948 100644
|
|
--- a/arch/x86/entry/calling.h
|
|
+++ b/arch/x86/entry/calling.h
|
|
@@ -342,6 +342,12 @@ For 32-bit we have the following conventions - kernel is built with
|
|
#endif
|
|
.endm
|
|
|
|
+.macro SAVE_AND_SET_GSBASE scratch_reg:req save_reg:req
|
|
+ rdgsbase \save_reg
|
|
+ GET_PERCPU_BASE \scratch_reg
|
|
+ wrgsbase \scratch_reg
|
|
+.endm
|
|
+
|
|
#endif /* CONFIG_X86_64 */
|
|
|
|
.macro STACKLEAK_ERASE
|
|
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
|
|
index 3b9ccba6c..53246c470 100644
|
|
--- a/arch/x86/entry/entry_64.S
|
|
+++ b/arch/x86/entry/entry_64.S
|
|
@@ -38,6 +38,7 @@
|
|
#include <asm/export.h>
|
|
#include <asm/frame.h>
|
|
#include <asm/nospec-branch.h>
|
|
+#include <asm/fsgsbase.h>
|
|
#include <linux/err.h>
|
|
|
|
#include "calling.h"
|
|
@@ -921,7 +922,6 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt
|
|
.endif
|
|
|
|
.if \paranoid
|
|
- /* this procedure expect "no swapgs" flag in ebx */
|
|
jmp paranoid_exit
|
|
.else
|
|
jmp error_exit
|
|
@@ -1211,9 +1211,14 @@ idtentry machine_check do_mce has_error_code=0 paranoid=1
|
|
#endif
|
|
|
|
/*
|
|
- * Save all registers in pt_regs, and switch gs if needed.
|
|
- * Use slow, but surefire "are we in kernel?" check.
|
|
- * Return: ebx=0: need swapgs on exit, ebx=1: otherwise
|
|
+ * Save all registers in pt_regs. Return GSBASE related information
|
|
+ * in EBX depending on the availability of the FSGSBASE instructions:
|
|
+ *
|
|
+ * FSGSBASE R/EBX
|
|
+ * N 0 -> SWAPGS on exit
|
|
+ * 1 -> no SWAPGS on exit
|
|
+ *
|
|
+ * Y GSBASE value at entry, must be restored in paranoid_exit
|
|
*/
|
|
SYM_CODE_START_LOCAL(paranoid_entry)
|
|
UNWIND_HINT_FUNC
|
|
@@ -1221,7 +1226,6 @@ SYM_CODE_START_LOCAL(paranoid_entry)
|
|
PUSH_AND_CLEAR_REGS save_ret=1
|
|
ENCODE_FRAME_POINTER 8
|
|
|
|
-1:
|
|
/*
|
|
* Always stash CR3 in %r14. This value will be restored,
|
|
* verbatim, at exit. Needed if paranoid_entry interrupted
|
|
@@ -1239,6 +1243,28 @@ SYM_CODE_START_LOCAL(paranoid_entry)
|
|
*/
|
|
SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14
|
|
|
|
+ /*
|
|
+ * Handling GSBASE depends on the availability of FSGSBASE.
|
|
+ *
|
|
+ * Without FSGSBASE the kernel enforces that negative GSBASE
|
|
+ * values indicate kernel GSBASE. With FSGSBASE no assumptions
|
|
+ * can be made about the GSBASE value when entering from user
|
|
+ * space.
|
|
+ */
|
|
+ ALTERNATIVE "jmp .Lparanoid_entry_checkgs", "", X86_FEATURE_FSGSBASE
|
|
+
|
|
+ /*
|
|
+ * Read the current GSBASE and store it in %rbx unconditionally,
|
|
+ * retrieve and set the current CPUs kernel GSBASE. The stored value
|
|
+ * has to be restored in paranoid_exit unconditionally.
|
|
+ *
|
|
+ * The MSR write ensures that no subsequent load is based on a
|
|
+ * mispredicted GSBASE. No extra FENCE required.
|
|
+ */
|
|
+ SAVE_AND_SET_GSBASE scratch_reg=%rax save_reg=%rbx
|
|
+ ret
|
|
+
|
|
+.Lparanoid_entry_checkgs:
|
|
/* EBX = 1 -> kernel GSBASE active, no restore required */
|
|
movl $1, %ebx
|
|
/*
|
|
@@ -1273,28 +1299,48 @@ SYM_CODE_END(paranoid_entry)
|
|
*
|
|
* We may be returning to very strange contexts (e.g. very early
|
|
* in syscall entry), so checking for preemption here would
|
|
- * be complicated. Fortunately, we there's no good reason
|
|
- * to try to handle preemption here.
|
|
+ * be complicated. Fortunately, there's no good reason to try
|
|
+ * to handle preemption here.
|
|
+ *
|
|
+ * R/EBX contains the GSBASE related information depending on the
|
|
+ * availability of the FSGSBASE instructions:
|
|
+ *
|
|
+ * FSGSBASE R/EBX
|
|
+ * N 0 -> SWAPGS on exit
|
|
+ * 1 -> no SWAPGS on exit
|
|
*
|
|
- * On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it)
|
|
+ * Y User space GSBASE, must be restored unconditionally
|
|
*/
|
|
SYM_CODE_START_LOCAL(paranoid_exit)
|
|
UNWIND_HINT_REGS
|
|
DISABLE_INTERRUPTS(CLBR_ANY)
|
|
- TRACE_IRQS_OFF_DEBUG
|
|
- /* If EBX is 0, SWAPGS is required */
|
|
- testl %ebx, %ebx
|
|
- jnz .Lparanoid_exit_no_swapgs
|
|
- TRACE_IRQS_IRETQ
|
|
- /* Always restore stashed CR3 value (see paranoid_entry) */
|
|
- RESTORE_CR3 scratch_reg=%rbx save_reg=%r14
|
|
- SWAPGS_UNSAFE_STACK
|
|
- jmp restore_regs_and_return_to_kernel
|
|
-.Lparanoid_exit_no_swapgs:
|
|
+ /*
|
|
+ * The order of operations is important. IRQ tracing requires
|
|
+ * kernel GSBASE and CR3. RESTORE_CR3 requires kernel GSBASE.
|
|
+ *
|
|
+ * NB to anyone to try to optimize this code: this code does
|
|
+ * not execute at all for exceptions from user mode. Those
|
|
+ * exceptions go through error_exit instead.
|
|
+ */
|
|
TRACE_IRQS_IRETQ_DEBUG
|
|
- /* Always restore stashed CR3 value (see paranoid_entry) */
|
|
- RESTORE_CR3 scratch_reg=%rbx save_reg=%r14
|
|
- jmp restore_regs_and_return_to_kernel
|
|
+
|
|
+ RESTORE_CR3 scratch_reg=%rax save_reg=%r14
|
|
+
|
|
+ /* Handle the three GSBASE cases */
|
|
+ ALTERNATIVE "jmp .Lparanoid_exit_checkgs", "", X86_FEATURE_FSGSBASE
|
|
+
|
|
+ /* With FSGSBASE enabled, unconditionally restore GSBASE */
|
|
+ wrgsbase %rbx
|
|
+ jmp restore_regs_and_return_to_kernel
|
|
+
|
|
+.Lparanoid_exit_checkgs:
|
|
+ /* On non-FSGSBASE systems, conditionally do SWAPGS */
|
|
+ testl %ebx, %ebx
|
|
+ jnz restore_regs_and_return_to_kernel
|
|
+
|
|
+ /* We are returning to a context with user GSBASE */
|
|
+ SWAPGS_UNSAFE_STACK
|
|
+ jmp restore_regs_and_return_to_kernel
|
|
SYM_CODE_END(paranoid_exit)
|
|
|
|
/*
|
|
@@ -1702,10 +1748,27 @@ end_repeat_nmi:
|
|
/* Always restore stashed CR3 value (see paranoid_entry) */
|
|
RESTORE_CR3 scratch_reg=%r15 save_reg=%r14
|
|
|
|
- testl %ebx, %ebx /* swapgs needed? */
|
|
+ /*
|
|
+ * The above invocation of paranoid_entry stored the GSBASE
|
|
+ * related information in R/EBX depending on the availability
|
|
+ * of FSGSBASE.
|
|
+ *
|
|
+ * If FSGSBASE is enabled, restore the saved GSBASE value
|
|
+ * unconditionally, otherwise take the conditional SWAPGS path.
|
|
+ */
|
|
+ ALTERNATIVE "jmp nmi_no_fsgsbase", "", X86_FEATURE_FSGSBASE
|
|
+
|
|
+ wrgsbase %rbx
|
|
+ jmp nmi_restore
|
|
+
|
|
+nmi_no_fsgsbase:
|
|
+ /* EBX == 0 -> invoke SWAPGS */
|
|
+ testl %ebx, %ebx
|
|
jnz nmi_restore
|
|
+
|
|
nmi_swapgs:
|
|
SWAPGS_UNSAFE_STACK
|
|
+
|
|
nmi_restore:
|
|
POP_REGS
|
|
|
|
--
|
|
2.27.0.112.g101b3204f3
|
|
|
|
|
|
From a50c828845a3f9fc3d087841a53e17c470e0dfcb Mon Sep 17 00:00:00 2001
|
|
From: Andy Lutomirski <luto@kernel.org>
|
|
Date: Thu, 28 May 2020 16:13:58 -0400
|
|
Subject: [PATCH 12/21] x86/cpu: Enable FSGSBASE on 64bit by default and add a
|
|
chicken bit
|
|
|
|
Now that FSGSBASE is fully supported, remove unsafe_fsgsbase, enable
|
|
FSGSBASE by default, and add nofsgsbase to disable it.
|
|
|
|
Signed-off-by: Andy Lutomirski <luto@kernel.org>
|
|
Signed-off-by: Chang S. Bae <chang.seok.bae@intel.com>
|
|
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
|
|
Reviewed-by: Andi Kleen <ak@linux.intel.com>
|
|
Cc: Ravi Shankar <ravi.v.shankar@intel.com>
|
|
Cc: H. Peter Anvin <hpa@zytor.com>
|
|
Link: https://lkml.kernel.org/r/1557309753-24073-17-git-send-email-chang.seok.bae@intel.com
|
|
Signed-off-by: Sasha Levin <sashal@kernel.org>
|
|
---
|
|
.../admin-guide/kernel-parameters.txt | 3 +-
|
|
arch/x86/kernel/cpu/common.c | 32 ++++++++-----------
|
|
2 files changed, 15 insertions(+), 20 deletions(-)
|
|
|
|
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
|
|
index 588f4eb13..91f45239b 100644
|
|
--- a/Documentation/admin-guide/kernel-parameters.txt
|
|
+++ b/Documentation/admin-guide/kernel-parameters.txt
|
|
@@ -3033,8 +3033,7 @@
|
|
no5lvl [X86-64] Disable 5-level paging mode. Forces
|
|
kernel to use 4-level paging instead.
|
|
|
|
- unsafe_fsgsbase [X86] Allow FSGSBASE instructions. This will be
|
|
- replaced with a nofsgsbase flag.
|
|
+ nofsgsbase [X86] Disables FSGSBASE instructions.
|
|
|
|
no_console_suspend
|
|
[HW] Never suspend the console
|
|
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
|
|
index 7e2954ba6..9d891f63b 100644
|
|
--- a/arch/x86/kernel/cpu/common.c
|
|
+++ b/arch/x86/kernel/cpu/common.c
|
|
@@ -418,21 +418,21 @@ static void __init setup_cr_pinning(void)
|
|
static_key_enable(&cr_pinning.key);
|
|
}
|
|
|
|
-/*
|
|
- * Temporary hack: FSGSBASE is unsafe until a few kernel code paths are
|
|
- * updated. This allows us to get the kernel ready incrementally.
|
|
- *
|
|
- * Once all the pieces are in place, these will go away and be replaced with
|
|
- * a nofsgsbase chicken flag.
|
|
- */
|
|
-static bool unsafe_fsgsbase;
|
|
-
|
|
-static __init int setup_unsafe_fsgsbase(char *arg)
|
|
+static __init int x86_nofsgsbase_setup(char *arg)
|
|
{
|
|
- unsafe_fsgsbase = true;
|
|
+ /* Require an exact match without trailing characters. */
|
|
+ if (strlen(arg))
|
|
+ return 0;
|
|
+
|
|
+ /* Do not emit a message if the feature is not present. */
|
|
+ if (!boot_cpu_has(X86_FEATURE_FSGSBASE))
|
|
+ return 1;
|
|
+
|
|
+ setup_clear_cpu_cap(X86_FEATURE_FSGSBASE);
|
|
+ pr_info("FSGSBASE disabled via kernel command line\n");
|
|
return 1;
|
|
}
|
|
-__setup("unsafe_fsgsbase", setup_unsafe_fsgsbase);
|
|
+__setup("nofsgsbase", x86_nofsgsbase_setup);
|
|
|
|
/*
|
|
* Protection Keys are not available in 32-bit mode.
|
|
@@ -1528,12 +1528,8 @@ static void identify_cpu(struct cpuinfo_x86 *c)
|
|
setup_umip(c);
|
|
|
|
/* Enable FSGSBASE instructions if available. */
|
|
- if (cpu_has(c, X86_FEATURE_FSGSBASE)) {
|
|
- if (unsafe_fsgsbase)
|
|
- cr4_set_bits(X86_CR4_FSGSBASE);
|
|
- else
|
|
- clear_cpu_cap(c, X86_FEATURE_FSGSBASE);
|
|
- }
|
|
+ if (cpu_has(c, X86_FEATURE_FSGSBASE))
|
|
+ cr4_set_bits(X86_CR4_FSGSBASE);
|
|
|
|
/*
|
|
* The vendor-specific functions might have changed features.
|
|
--
|
|
2.27.0.112.g101b3204f3
|
|
|
|
|
|
From 81c0b794d059bdbd98019ccf9eb238694a9b92e0 Mon Sep 17 00:00:00 2001
|
|
From: Andi Kleen <ak@linux.intel.com>
|
|
Date: Thu, 28 May 2020 16:13:59 -0400
|
|
Subject: [PATCH 13/21] x86/elf: Enumerate kernel FSGSBASE capability in
|
|
AT_HWCAP2
|
|
|
|
The kernel needs to explicitly enable FSGSBASE. So, the application needs
|
|
to know if it can safely use these instructions. Just looking at the CPUID
|
|
bit is not enough because it may be running in a kernel that does not
|
|
enable the instructions.
|
|
|
|
One way for the application would be to just try and catch the SIGILL.
|
|
But that is difficult to do in libraries which may not want to overwrite
|
|
the signal handlers of the main application.
|
|
|
|
Enumerate the enabled FSGSBASE capability in bit 1 of AT_HWCAP2 in the ELF
|
|
aux vector. AT_HWCAP2 is already used by PPC for similar purposes.
|
|
|
|
The application can access it open coded or by using the getauxval()
|
|
function in newer versions of glibc.
|
|
|
|
[ tglx: Massaged changelog ]
|
|
|
|
Signed-off-by: Andi Kleen <ak@linux.intel.com>
|
|
Signed-off-by: Chang S. Bae <chang.seok.bae@intel.com>
|
|
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
|
|
Cc: Andy Lutomirski <luto@kernel.org>
|
|
Cc: Ravi Shankar <ravi.v.shankar@intel.com>
|
|
Cc: H. Peter Anvin <hpa@zytor.com>
|
|
Link: https://lkml.kernel.org/r/1557309753-24073-18-git-send-email-chang.seok.bae@intel.com
|
|
Signed-off-by: Sasha Levin <sashal@kernel.org>
|
|
---
|
|
arch/x86/include/uapi/asm/hwcap2.h | 3 +++
|
|
arch/x86/kernel/cpu/common.c | 4 +++-
|
|
2 files changed, 6 insertions(+), 1 deletion(-)
|
|
|
|
diff --git a/arch/x86/include/uapi/asm/hwcap2.h b/arch/x86/include/uapi/asm/hwcap2.h
|
|
index 8b2effe6e..5fdfcb470 100644
|
|
--- a/arch/x86/include/uapi/asm/hwcap2.h
|
|
+++ b/arch/x86/include/uapi/asm/hwcap2.h
|
|
@@ -5,4 +5,7 @@
|
|
/* MONITOR/MWAIT enabled in Ring 3 */
|
|
#define HWCAP2_RING3MWAIT (1 << 0)
|
|
|
|
+/* Kernel allows FSGSBASE instructions available in Ring 3 */
|
|
+#define HWCAP2_FSGSBASE BIT(1)
|
|
+
|
|
#endif
|
|
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
|
|
index 9d891f63b..74f6a08e7 100644
|
|
--- a/arch/x86/kernel/cpu/common.c
|
|
+++ b/arch/x86/kernel/cpu/common.c
|
|
@@ -1528,8 +1528,10 @@ static void identify_cpu(struct cpuinfo_x86 *c)
|
|
setup_umip(c);
|
|
|
|
/* Enable FSGSBASE instructions if available. */
|
|
- if (cpu_has(c, X86_FEATURE_FSGSBASE))
|
|
+ if (cpu_has(c, X86_FEATURE_FSGSBASE)) {
|
|
cr4_set_bits(X86_CR4_FSGSBASE);
|
|
+ elf_hwcap2 |= HWCAP2_FSGSBASE;
|
|
+ }
|
|
|
|
/*
|
|
* The vendor-specific functions might have changed features.
|
|
--
|
|
2.27.0.112.g101b3204f3
|
|
|
|
|
|
From be779b20ff6e867cb4acb2749325bb5b3426d6dd Mon Sep 17 00:00:00 2001
|
|
From: Thomas Gleixner <tglx@linutronix.de>
|
|
Date: Thu, 28 May 2020 16:14:00 -0400
|
|
Subject: [PATCH 14/21] Documentation/x86/64: Add documentation for GS/FS
|
|
addressing mode
|
|
|
|
Explain how the GS/FS based addressing can be utilized in user space
|
|
applications along with the differences between the generic prctl() based
|
|
GS/FS base control and the FSGSBASE version available on newer CPUs.
|
|
|
|
Originally-by: Andi Kleen <ak@linux.intel.com>
|
|
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
|
|
Signed-off-by: Chang S. Bae <chang.seok.bae@intel.com>
|
|
Reviewed-by: Tony Luck <tony.luck@intel.com>
|
|
Cc: Thomas Gleixner <tglx@linutronix.de>
|
|
Cc: Borislav Petkov <bp@alien8.de>
|
|
Cc: Andy Lutomirski <luto@kernel.org>
|
|
Cc: H. Peter Anvin <hpa@zytor.com>
|
|
Cc: Dave Hansen <dave.hansen@intel.com>
|
|
Cc: Tony Luck <tony.luck@intel.com>
|
|
Cc: Andi Kleen <ak@linux.intel.com>
|
|
Cc: Randy Dunlap <rdunlap@infradead.org>
|
|
Cc: Jonathan Corbet <corbet@lwn.net>
|
|
Signed-off-by: Sasha Levin <sashal@kernel.org>
|
|
---
|
|
Documentation/x86/x86_64/fsgs.rst | 199 +++++++++++++++++++++++++++++
|
|
Documentation/x86/x86_64/index.rst | 1 +
|
|
2 files changed, 200 insertions(+)
|
|
create mode 100644 Documentation/x86/x86_64/fsgs.rst
|
|
|
|
diff --git a/Documentation/x86/x86_64/fsgs.rst b/Documentation/x86/x86_64/fsgs.rst
|
|
new file mode 100644
|
|
index 000000000..50960e09e
|
|
--- /dev/null
|
|
+++ b/Documentation/x86/x86_64/fsgs.rst
|
|
@@ -0,0 +1,199 @@
|
|
+.. SPDX-License-Identifier: GPL-2.0
|
|
+
|
|
+Using FS and GS segments in user space applications
|
|
+===================================================
|
|
+
|
|
+The x86 architecture supports segmentation. Instructions which access
|
|
+memory can use segment register based addressing mode. The following
|
|
+notation is used to address a byte within a segment:
|
|
+
|
|
+ Segment-register:Byte-address
|
|
+
|
|
+The segment base address is added to the Byte-address to compute the
|
|
+resulting virtual address which is accessed. This allows to access multiple
|
|
+instances of data with the identical Byte-address, i.e. the same code. The
|
|
+selection of a particular instance is purely based on the base-address in
|
|
+the segment register.
|
|
+
|
|
+In 32-bit mode the CPU provides 6 segments, which also support segment
|
|
+limits. The limits can be used to enforce address space protections.
|
|
+
|
|
+In 64-bit mode the CS/SS/DS/ES segments are ignored and the base address is
|
|
+always 0 to provide a full 64bit address space. The FS and GS segments are
|
|
+still functional in 64-bit mode.
|
|
+
|
|
+Common FS and GS usage
|
|
+------------------------------
|
|
+
|
|
+The FS segment is commonly used to address Thread Local Storage (TLS). FS
|
|
+is usually managed by runtime code or a threading library. Variables
|
|
+declared with the '__thread' storage class specifier are instantiated per
|
|
+thread and the compiler emits the FS: address prefix for accesses to these
|
|
+variables. Each thread has its own FS base address so common code can be
|
|
+used without complex address offset calculations to access the per thread
|
|
+instances. Applications should not use FS for other purposes when they use
|
|
+runtimes or threading libraries which manage the per thread FS.
|
|
+
|
|
+The GS segment has no common use and can be used freely by
|
|
+applications. GCC and Clang support GS based addressing via address space
|
|
+identifiers.
|
|
+
|
|
+Reading and writing the FS/GS base address
|
|
+------------------------------------------
|
|
+
|
|
+There exist two mechanisms to read and write the FS/GS base address:
|
|
+
|
|
+ - the arch_prctl() system call
|
|
+
|
|
+ - the FSGSBASE instruction family
|
|
+
|
|
+Accessing FS/GS base with arch_prctl()
|
|
+--------------------------------------
|
|
+
|
|
+ The arch_prctl(2) based mechanism is available on all 64-bit CPUs and all
|
|
+ kernel versions.
|
|
+
|
|
+ Reading the base:
|
|
+
|
|
+ arch_prctl(ARCH_GET_FS, &fsbase);
|
|
+ arch_prctl(ARCH_GET_GS, &gsbase);
|
|
+
|
|
+ Writing the base:
|
|
+
|
|
+ arch_prctl(ARCH_SET_FS, fsbase);
|
|
+ arch_prctl(ARCH_SET_GS, gsbase);
|
|
+
|
|
+ The ARCH_SET_GS prctl may be disabled depending on kernel configuration
|
|
+ and security settings.
|
|
+
|
|
+Accessing FS/GS base with the FSGSBASE instructions
|
|
+---------------------------------------------------
|
|
+
|
|
+ With the Ivy Bridge CPU generation Intel introduced a new set of
|
|
+ instructions to access the FS and GS base registers directly from user
|
|
+ space. These instructions are also supported on AMD Family 17H CPUs. The
|
|
+ following instructions are available:
|
|
+
|
|
+ =============== ===========================
|
|
+ RDFSBASE %reg Read the FS base register
|
|
+ RDGSBASE %reg Read the GS base register
|
|
+ WRFSBASE %reg Write the FS base register
|
|
+ WRGSBASE %reg Write the GS base register
|
|
+ =============== ===========================
|
|
+
|
|
+ The instructions avoid the overhead of the arch_prctl() syscall and allow
|
|
+ more flexible usage of the FS/GS addressing modes in user space
|
|
+ applications. This does not prevent conflicts between threading libraries
|
|
+ and runtimes which utilize FS and applications which want to use it for
|
|
+ their own purpose.
|
|
+
|
|
+FSGSBASE instructions enablement
|
|
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
|
+ The instructions are enumerated in CPUID leaf 7, bit 0 of EBX. If
|
|
+ available /proc/cpuinfo shows 'fsgsbase' in the flag entry of the CPUs.
|
|
+
|
|
+ The availability of the instructions does not enable them
|
|
+ automatically. The kernel has to enable them explicitly in CR4. The
|
|
+ reason for this is that older kernels make assumptions about the values in
|
|
+ the GS register and enforce them when GS base is set via
|
|
+ arch_prctl(). Allowing user space to write arbitrary values to GS base
|
|
+ would violate these assumptions and cause malfunction.
|
|
+
|
|
+ On kernels which do not enable FSGSBASE the execution of the FSGSBASE
|
|
+ instructions will fault with a #UD exception.
|
|
+
|
|
+ The kernel provides reliable information about the enabled state in the
|
|
+ ELF AUX vector. If the HWCAP2_FSGSBASE bit is set in the AUX vector, the
|
|
+ kernel has FSGSBASE instructions enabled and applications can use them.
|
|
+ The following code example shows how this detection works::
|
|
+
|
|
+ #include <sys/auxv.h>
|
|
+ #include <elf.h>
|
|
+
|
|
+ /* Will be eventually in asm/hwcap.h */
|
|
+ #ifndef HWCAP2_FSGSBASE
|
|
+ #define HWCAP2_FSGSBASE (1 << 1)
|
|
+ #endif
|
|
+
|
|
+ ....
|
|
+
|
|
+ unsigned val = getauxval(AT_HWCAP2);
|
|
+
|
|
+ if (val & HWCAP2_FSGSBASE)
|
|
+ printf("FSGSBASE enabled\n");
|
|
+
|
|
+FSGSBASE instructions compiler support
|
|
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
|
+
|
|
+GCC version 4.6.4 and newer provide instrinsics for the FSGSBASE
|
|
+instructions. Clang 5 supports them as well.
|
|
+
|
|
+ =================== ===========================
|
|
+ _readfsbase_u64() Read the FS base register
|
|
+ _readfsbase_u64() Read the GS base register
|
|
+ _writefsbase_u64() Write the FS base register
|
|
+ _writegsbase_u64() Write the GS base register
|
|
+ =================== ===========================
|
|
+
|
|
+To utilize these instrinsics <immintrin.h> must be included in the source
|
|
+code and the compiler option -mfsgsbase has to be added.
|
|
+
|
|
+Compiler support for FS/GS based addressing
|
|
+-------------------------------------------
|
|
+
|
|
+GCC version 6 and newer provide support for FS/GS based addressing via
|
|
+Named Address Spaces. GCC implements the following address space
|
|
+identifiers for x86:
|
|
+
|
|
+ ========= ====================================
|
|
+ __seg_fs Variable is addressed relative to FS
|
|
+ __seg_gs Variable is addressed relative to GS
|
|
+ ========= ====================================
|
|
+
|
|
+The preprocessor symbols __SEG_FS and __SEG_GS are defined when these
|
|
+address spaces are supported. Code which implements fallback modes should
|
|
+check whether these symbols are defined. Usage example::
|
|
+
|
|
+ #ifdef __SEG_GS
|
|
+
|
|
+ long data0 = 0;
|
|
+ long data1 = 1;
|
|
+
|
|
+ long __seg_gs *ptr;
|
|
+
|
|
+ /* Check whether FSGSBASE is enabled by the kernel (HWCAP2_FSGSBASE) */
|
|
+ ....
|
|
+
|
|
+ /* Set GS base to point to data0 */
|
|
+ _writegsbase_u64(&data0);
|
|
+
|
|
+ /* Access offset 0 of GS */
|
|
+ ptr = 0;
|
|
+ printf("data0 = %ld\n", *ptr);
|
|
+
|
|
+ /* Set GS base to point to data1 */
|
|
+ _writegsbase_u64(&data1);
|
|
+ /* ptr still addresses offset 0! */
|
|
+ printf("data1 = %ld\n", *ptr);
|
|
+
|
|
+
|
|
+Clang does not provide the GCC address space identifiers, but it provides
|
|
+address spaces via an attribute based mechanism in Clang 2.6 and newer
|
|
+versions:
|
|
+
|
|
+ ==================================== =====================================
|
|
+ __attribute__((address_space(256)) Variable is addressed relative to GS
|
|
+ __attribute__((address_space(257)) Variable is addressed relative to FS
|
|
+ ==================================== =====================================
|
|
+
|
|
+FS/GS based addressing with inline assembly
|
|
+-------------------------------------------
|
|
+
|
|
+In case the compiler does not support address spaces, inline assembly can
|
|
+be used for FS/GS based addressing mode::
|
|
+
|
|
+ mov %fs:offset, %reg
|
|
+ mov %gs:offset, %reg
|
|
+
|
|
+ mov %reg, %fs:offset
|
|
+ mov %reg, %gs:offset
|
|
diff --git a/Documentation/x86/x86_64/index.rst b/Documentation/x86/x86_64/index.rst
|
|
index d6eaaa5a3..a56070fc8 100644
|
|
--- a/Documentation/x86/x86_64/index.rst
|
|
+++ b/Documentation/x86/x86_64/index.rst
|
|
@@ -14,3 +14,4 @@ x86_64 Support
|
|
fake-numa-for-cpusets
|
|
cpu-hotplug-spec
|
|
machinecheck
|
|
+ fsgs
|
|
--
|
|
2.27.0.112.g101b3204f3
|
|
|
|
|
|
From aa373b15eb7fc3d76170411197e536684b8c55e8 Mon Sep 17 00:00:00 2001
|
|
From: "Chang S. Bae" <chang.seok.bae@intel.com>
|
|
Date: Thu, 28 May 2020 16:14:01 -0400
|
|
Subject: [PATCH 15/21] selftests/x86/fsgsbase: Test GS selector on
|
|
ptracer-induced GS base write
|
|
|
|
The test validates that the selector is not changed when a ptracer writes
|
|
the ptracee's GS base.
|
|
|
|
Originally-by: Andy Lutomirski <luto@kernel.org>
|
|
Signed-off-by: Chang S. Bae <chang.seok.bae@intel.com>
|
|
Reviewed-by: Tony Luck <tony.luck@intel.com>
|
|
Cc: Thomas Gleixner <tglx@linutronix.de>
|
|
Cc: Borislav Petkov <bp@alien8.de>
|
|
Cc: Andy Lutomirski <luto@kernel.org>
|
|
Cc: H. Peter Anvin <hpa@zytor.com>
|
|
Cc: Dave Hansen <dave.hansen@intel.com>
|
|
Cc: Tony Luck <tony.luck@intel.com>
|
|
Cc: Andi Kleen <ak@linux.intel.com>
|
|
Signed-off-by: Sasha Levin <sashal@kernel.org>
|
|
---
|
|
tools/testing/selftests/x86/fsgsbase.c | 21 +++++++++++++++------
|
|
1 file changed, 15 insertions(+), 6 deletions(-)
|
|
|
|
diff --git a/tools/testing/selftests/x86/fsgsbase.c b/tools/testing/selftests/x86/fsgsbase.c
|
|
index 15a329da5..950a48b2e 100644
|
|
--- a/tools/testing/selftests/x86/fsgsbase.c
|
|
+++ b/tools/testing/selftests/x86/fsgsbase.c
|
|
@@ -465,7 +465,7 @@ static void test_ptrace_write_gsbase(void)
|
|
wait(&status);
|
|
|
|
if (WSTOPSIG(status) == SIGTRAP) {
|
|
- unsigned long gs, base;
|
|
+ unsigned long gs;
|
|
unsigned long gs_offset = USER_REGS_OFFSET(gs);
|
|
unsigned long base_offset = USER_REGS_OFFSET(gs_base);
|
|
|
|
@@ -481,7 +481,6 @@ static void test_ptrace_write_gsbase(void)
|
|
err(1, "PTRACE_POKEUSER");
|
|
|
|
gs = ptrace(PTRACE_PEEKUSER, child, gs_offset, NULL);
|
|
- base = ptrace(PTRACE_PEEKUSER, child, base_offset, NULL);
|
|
|
|
/*
|
|
* In a non-FSGSBASE system, the nonzero selector will load
|
|
@@ -489,11 +488,21 @@ static void test_ptrace_write_gsbase(void)
|
|
* selector value is changed or not by the GSBASE write in
|
|
* a ptracer.
|
|
*/
|
|
- if (gs == 0 && base == 0xFF) {
|
|
- printf("[OK]\tGS was reset as expected\n");
|
|
- } else {
|
|
+ if (gs != *shared_scratch) {
|
|
nerrs++;
|
|
- printf("[FAIL]\tGS=0x%lx, GSBASE=0x%lx (should be 0, 0xFF)\n", gs, base);
|
|
+ printf("[FAIL]\tGS changed to %lx\n", gs);
|
|
+
|
|
+ /*
|
|
+ * On older kernels, poking a nonzero value into the
|
|
+ * base would zero the selector. On newer kernels,
|
|
+ * this behavior has changed -- poking the base
|
|
+ * changes only the base and, if FSGSBASE is not
|
|
+ * available, this may not effect.
|
|
+ */
|
|
+ if (gs == 0)
|
|
+ printf("\tNote: this is expected behavior on older kernels.\n");
|
|
+ } else {
|
|
+ printf("[OK]\tGS remained 0x%hx\n", *shared_scratch);
|
|
}
|
|
}
|
|
|
|
--
|
|
2.27.0.112.g101b3204f3
|
|
|
|
|
|
From 877a6d6273f3e3a73e9df1cf94ddf3d23cc9a6c1 Mon Sep 17 00:00:00 2001
|
|
From: "Chang S. Bae" <chang.seok.bae@intel.com>
|
|
Date: Thu, 28 May 2020 16:14:02 -0400
|
|
Subject: [PATCH 16/21] selftests/x86/fsgsbase: Test ptracer-induced GS base
|
|
write with FSGSBASE
|
|
|
|
This validates that GS selector and base are independently preserved in
|
|
ptrace commands.
|
|
|
|
Suggested-by: Andy Lutomirski <luto@kernel.org>
|
|
Signed-off-by: Chang S. Bae <chang.seok.bae@intel.com>
|
|
Reviewed-by: Tony Luck <tony.luck@intel.com>
|
|
Cc: Thomas Gleixner <tglx@linutronix.de>
|
|
Cc: Borislav Petkov <bp@alien8.de>
|
|
Cc: Andy Lutomirski <luto@kernel.org>
|
|
Cc: H. Peter Anvin <hpa@zytor.com>
|
|
Cc: Dave Hansen <dave.hansen@intel.com>
|
|
Cc: Tony Luck <tony.luck@intel.com>
|
|
Cc: Andi Kleen <ak@linux.intel.com>
|
|
Signed-off-by: Sasha Levin <sashal@kernel.org>
|
|
---
|
|
tools/testing/selftests/x86/fsgsbase.c | 11 +++++++++--
|
|
1 file changed, 9 insertions(+), 2 deletions(-)
|
|
|
|
diff --git a/tools/testing/selftests/x86/fsgsbase.c b/tools/testing/selftests/x86/fsgsbase.c
|
|
index 950a48b2e..9a4349813 100644
|
|
--- a/tools/testing/selftests/x86/fsgsbase.c
|
|
+++ b/tools/testing/selftests/x86/fsgsbase.c
|
|
@@ -465,7 +465,7 @@ static void test_ptrace_write_gsbase(void)
|
|
wait(&status);
|
|
|
|
if (WSTOPSIG(status) == SIGTRAP) {
|
|
- unsigned long gs;
|
|
+ unsigned long gs, base;
|
|
unsigned long gs_offset = USER_REGS_OFFSET(gs);
|
|
unsigned long base_offset = USER_REGS_OFFSET(gs_base);
|
|
|
|
@@ -481,6 +481,7 @@ static void test_ptrace_write_gsbase(void)
|
|
err(1, "PTRACE_POKEUSER");
|
|
|
|
gs = ptrace(PTRACE_PEEKUSER, child, gs_offset, NULL);
|
|
+ base = ptrace(PTRACE_PEEKUSER, child, base_offset, NULL);
|
|
|
|
/*
|
|
* In a non-FSGSBASE system, the nonzero selector will load
|
|
@@ -501,8 +502,14 @@ static void test_ptrace_write_gsbase(void)
|
|
*/
|
|
if (gs == 0)
|
|
printf("\tNote: this is expected behavior on older kernels.\n");
|
|
+ } else if (have_fsgsbase && (base != 0xFF)) {
|
|
+ nerrs++;
|
|
+ printf("[FAIL]\tGSBASE changed to %lx\n", base);
|
|
} else {
|
|
- printf("[OK]\tGS remained 0x%hx\n", *shared_scratch);
|
|
+ printf("[OK]\tGS remained 0x%hx", *shared_scratch);
|
|
+ if (have_fsgsbase)
|
|
+ printf(" and GSBASE changed to 0xFF");
|
|
+ printf("\n");
|
|
}
|
|
}
|
|
|
|
--
|
|
2.27.0.112.g101b3204f3
|
|
|
|
|
|
From 58cf585e43711f1b1b10c6e736d8843b4f9989c5 Mon Sep 17 00:00:00 2001
|
|
From: Andy Lutomirski <luto@kernel.org>
|
|
Date: Fri, 19 Jun 2020 16:46:33 -0700
|
|
Subject: [PATCH 17/21] selftests/x86/fsgsbase: Fix a comment in the
|
|
ptrace_write_gsbase test
|
|
|
|
A comment was unclear. Fix it.
|
|
|
|
Fixes: 5e7ec8578fa3 ("selftests/x86/fsgsbase: Test ptracer-induced GS base write with FSGSBASE")
|
|
Signed-off-by: Andy Lutomirski <luto@kernel.org>
|
|
---
|
|
tools/testing/selftests/x86/fsgsbase.c | 3 ++-
|
|
1 file changed, 2 insertions(+), 1 deletion(-)
|
|
|
|
diff --git a/tools/testing/selftests/x86/fsgsbase.c b/tools/testing/selftests/x86/fsgsbase.c
|
|
index 9a4349813..f47495d2f 100644
|
|
--- a/tools/testing/selftests/x86/fsgsbase.c
|
|
+++ b/tools/testing/selftests/x86/fsgsbase.c
|
|
@@ -498,7 +498,8 @@ static void test_ptrace_write_gsbase(void)
|
|
* base would zero the selector. On newer kernels,
|
|
* this behavior has changed -- poking the base
|
|
* changes only the base and, if FSGSBASE is not
|
|
- * available, this may not effect.
|
|
+ * available, this may have no effect once the tracee
|
|
+ * is resumed.
|
|
*/
|
|
if (gs == 0)
|
|
printf("\tNote: this is expected behavior on older kernels.\n");
|
|
--
|
|
2.27.0.112.g101b3204f3
|
|
|
|
|
|
From e0070bef72602b943dd2b989a4f7e978611de0f6 Mon Sep 17 00:00:00 2001
|
|
From: Andy Lutomirski <luto@kernel.org>
|
|
Date: Fri, 19 Jun 2020 22:20:35 -0700
|
|
Subject: [PATCH 18/21] selftests/x86/fsgsbase: Add a missing memory constraint
|
|
|
|
The manual call to set_thread_area() via int $0x80 was missing any
|
|
indication that the descriptor was a pointer, causing gcc to
|
|
occasionally generate wrong code. Add the missing constraint.
|
|
|
|
Signed-off-by: Andy Lutomirski <luto@kernel.org>
|
|
---
|
|
tools/testing/selftests/x86/fsgsbase.c | 3 ++-
|
|
1 file changed, 2 insertions(+), 1 deletion(-)
|
|
|
|
diff --git a/tools/testing/selftests/x86/fsgsbase.c b/tools/testing/selftests/x86/fsgsbase.c
|
|
index f47495d2f..998319553 100644
|
|
--- a/tools/testing/selftests/x86/fsgsbase.c
|
|
+++ b/tools/testing/selftests/x86/fsgsbase.c
|
|
@@ -285,7 +285,8 @@ static unsigned short load_gs(void)
|
|
/* 32-bit set_thread_area */
|
|
long ret;
|
|
asm volatile ("int $0x80"
|
|
- : "=a" (ret) : "a" (243), "b" (low_desc)
|
|
+ : "=a" (ret), "+m" (*low_desc)
|
|
+ : "a" (243), "b" (low_desc)
|
|
: "r8", "r9", "r10", "r11");
|
|
memcpy(&desc, low_desc, sizeof(desc));
|
|
munmap(low_desc, sizeof(desc));
|
|
--
|
|
2.27.0.112.g101b3204f3
|
|
|
|
|
|
From bf7b9854f69be5c958efe5fb670ee544426bfdf4 Mon Sep 17 00:00:00 2001
|
|
From: Andy Lutomirski <luto@kernel.org>
|
|
Date: Sat, 20 Jun 2020 08:29:44 -0700
|
|
Subject: [PATCH 19/21] x86/ptrace: Fix 32-bit PTRACE_SETREGS vs fsbase and
|
|
gsbase
|
|
|
|
Debuggers expect that doing PTRACE_GETREGS, then poking at a tracee
|
|
and maybe letting it run for a while, then doing PTRACE_SETREGS will
|
|
put the tracee back where it was. In the specific case of a 32-bit
|
|
tracer and tracee, the PTRACE_GETREGS/SETREGS data structure doesn't
|
|
have fs_base or gs_base fields, so FSBASE and GSBASE fields are
|
|
never stored anywhere. Everything used to still work because
|
|
nonzero FS or GS would result in full reloads of the segment registers
|
|
when the tracee resumes, and the bases associated with FS==0 or
|
|
GS==0 are irrelevant to 32-bit code.
|
|
|
|
Adding FSGSBASE support broke this: when FSGSBASE is enabled, FSBASE
|
|
and GSBASE are now restored independently of FS and GS for all tasks
|
|
when context-switched in. This means that, if a 32-bit tracer
|
|
restores a previous state using PTRACE_SETREGS but the tracee's
|
|
pre-restore and post-restore bases don't match, then the tracee is
|
|
resumed with the wrong base.
|
|
|
|
Fix it by explicitly loading the base when a 32-bit tracer pokes FS
|
|
or GS on a 64-bit kernel.
|
|
|
|
Also add a test case.
|
|
|
|
Fixes: 673903495c85 ("x86/process/64: Use FSBSBASE in switch_to() if available")
|
|
Cc: Sasha Levin <sashal@kernel.org>
|
|
Signed-off-by: Andy Lutomirski <luto@kernel.org>
|
|
---
|
|
arch/x86/include/asm/fsgsbase.h | 2 +
|
|
arch/x86/kernel/process_64.c | 4 +-
|
|
arch/x86/kernel/ptrace.c | 43 ++-
|
|
tools/testing/selftests/x86/Makefile | 2 +-
|
|
.../testing/selftests/x86/fsgsbase_restore.c | 245 ++++++++++++++++++
|
|
5 files changed, 280 insertions(+), 16 deletions(-)
|
|
create mode 100644 tools/testing/selftests/x86/fsgsbase_restore.c
|
|
|
|
diff --git a/arch/x86/include/asm/fsgsbase.h b/arch/x86/include/asm/fsgsbase.h
|
|
index aefd53767..d55264641 100644
|
|
--- a/arch/x86/include/asm/fsgsbase.h
|
|
+++ b/arch/x86/include/asm/fsgsbase.h
|
|
@@ -75,6 +75,8 @@ static inline void x86_fsbase_write_cpu(unsigned long fsbase)
|
|
|
|
extern unsigned long x86_gsbase_read_cpu_inactive(void);
|
|
extern void x86_gsbase_write_cpu_inactive(unsigned long gsbase);
|
|
+extern unsigned long x86_fsgsbase_read_task(struct task_struct *task,
|
|
+ unsigned short selector);
|
|
|
|
#endif /* CONFIG_X86_64 */
|
|
|
|
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
|
|
index 0bcb48a12..b4e56f827 100644
|
|
--- a/arch/x86/kernel/process_64.c
|
|
+++ b/arch/x86/kernel/process_64.c
|
|
@@ -344,8 +344,8 @@ static __always_inline void x86_fsgsbase_load(struct thread_struct *prev,
|
|
}
|
|
}
|
|
|
|
-static unsigned long x86_fsgsbase_read_task(struct task_struct *task,
|
|
- unsigned short selector)
|
|
+unsigned long x86_fsgsbase_read_task(struct task_struct *task,
|
|
+ unsigned short selector)
|
|
{
|
|
unsigned short idx = selector >> 3;
|
|
unsigned long base;
|
|
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
|
|
index cc56efb75..3ae271830 100644
|
|
--- a/arch/x86/kernel/ptrace.c
|
|
+++ b/arch/x86/kernel/ptrace.c
|
|
@@ -282,17 +282,9 @@ static int set_segment_reg(struct task_struct *task,
|
|
return -EIO;
|
|
|
|
/*
|
|
- * This function has some ABI oddities.
|
|
- *
|
|
- * A 32-bit ptracer probably expects that writing FS or GS will change
|
|
- * FSBASE or GSBASE respectively. In the absence of FSGSBASE support,
|
|
- * this code indeed has that effect. When FSGSBASE is added, this
|
|
- * will require a special case.
|
|
- *
|
|
- * For existing 64-bit ptracers, writing FS or GS *also* currently
|
|
- * changes the base if the selector is nonzero the next time the task
|
|
- * is run. This behavior may not be needed, and trying to preserve it
|
|
- * when FSGSBASE is added would be complicated at best.
|
|
+ * Writes to FS and GS will change the stored selector. Whether
|
|
+ * this changes the segment base as well depends on whether
|
|
+ * FSGSBASE is enabled.
|
|
*/
|
|
|
|
switch (offset) {
|
|
@@ -868,14 +860,39 @@ long arch_ptrace(struct task_struct *child, long request,
|
|
static int putreg32(struct task_struct *child, unsigned regno, u32 value)
|
|
{
|
|
struct pt_regs *regs = task_pt_regs(child);
|
|
+ int ret;
|
|
|
|
switch (regno) {
|
|
|
|
SEG32(cs);
|
|
SEG32(ds);
|
|
SEG32(es);
|
|
- SEG32(fs);
|
|
- SEG32(gs);
|
|
+
|
|
+ /*
|
|
+ * A 32-bit ptracer on a 64-bit kernel expects that writing
|
|
+ * FS or GS will also update the base. This is needed for
|
|
+ * operations like PTRACE_SETREGS to fully restore a saved
|
|
+ * CPU state.
|
|
+ */
|
|
+
|
|
+ case offsetof(struct user32, regs.fs):
|
|
+ ret = set_segment_reg(child,
|
|
+ offsetof(struct user_regs_struct, fs),
|
|
+ value);
|
|
+ if (ret == 0)
|
|
+ child->thread.fsbase =
|
|
+ x86_fsgsbase_read_task(child, value);
|
|
+ break;
|
|
+
|
|
+ case offsetof(struct user32, regs.gs):
|
|
+ ret = set_segment_reg(child,
|
|
+ offsetof(struct user_regs_struct, gs),
|
|
+ value);
|
|
+ if (ret == 0)
|
|
+ child->thread.gsbase =
|
|
+ x86_fsgsbase_read_task(child, value);
|
|
+ break;
|
|
+
|
|
SEG32(ss);
|
|
|
|
R32(ebx, bx);
|
|
diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile
|
|
index 5d49bfec1..8aee616b3 100644
|
|
--- a/tools/testing/selftests/x86/Makefile
|
|
+++ b/tools/testing/selftests/x86/Makefile
|
|
@@ -13,7 +13,7 @@ CAN_BUILD_WITH_NOPIE := $(shell ./check_cc.sh $(CC) trivial_program.c -no-pie)
|
|
TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt test_mremap_vdso \
|
|
check_initial_reg_state sigreturn iopl ioperm \
|
|
protection_keys test_vdso test_vsyscall mov_ss_trap \
|
|
- syscall_arg_fault
|
|
+ syscall_arg_fault fsgsbase_restore
|
|
TARGETS_C_32BIT_ONLY := entry_from_vm86 test_syscall_vdso unwind_vdso \
|
|
test_FCMOV test_FCOMI test_FISTTP \
|
|
vdso_restorer
|
|
diff --git a/tools/testing/selftests/x86/fsgsbase_restore.c b/tools/testing/selftests/x86/fsgsbase_restore.c
|
|
new file mode 100644
|
|
index 000000000..70502a708
|
|
--- /dev/null
|
|
+++ b/tools/testing/selftests/x86/fsgsbase_restore.c
|
|
@@ -0,0 +1,245 @@
|
|
+// SPDX-License-Identifier: GPL-2.0-only
|
|
+/*
|
|
+ * fsgsbase_restore.c, test ptrace vs fsgsbase
|
|
+ * Copyright (c) 2020 Andy Lutomirski
|
|
+ *
|
|
+ * This test case simulates a tracer redirecting tracee execution to
|
|
+ * a function and then restoring tracee state using PTRACE_GETREGS and
|
|
+ * PTRACE_SETREGS. This is similar to what gdb does when doing
|
|
+ * 'p func()'. The catch is that this test has the called function
|
|
+ * modify a segment register. This makes sure that ptrace correctly
|
|
+ * restores segment state when using PTRACE_SETREGS.
|
|
+ *
|
|
+ * This is not part of fsgsbase.c, because that test is 64-bit only.
|
|
+ */
|
|
+
|
|
+#define _GNU_SOURCE
|
|
+#include <stdio.h>
|
|
+#include <stdlib.h>
|
|
+#include <stdbool.h>
|
|
+#include <string.h>
|
|
+#include <sys/syscall.h>
|
|
+#include <unistd.h>
|
|
+#include <err.h>
|
|
+#include <sys/user.h>
|
|
+#include <asm/prctl.h>
|
|
+#include <sys/prctl.h>
|
|
+#include <asm/ldt.h>
|
|
+#include <sys/mman.h>
|
|
+#include <stddef.h>
|
|
+#include <sys/ptrace.h>
|
|
+#include <sys/wait.h>
|
|
+#include <stdint.h>
|
|
+
|
|
+#define EXPECTED_VALUE 0x1337f00d
|
|
+
|
|
+#ifdef __x86_64__
|
|
+# define SEG "%gs"
|
|
+#else
|
|
+# define SEG "%fs"
|
|
+#endif
|
|
+
|
|
+static unsigned int dereference_seg_base(void)
|
|
+{
|
|
+ int ret;
|
|
+ asm volatile ("mov %" SEG ":(0), %0" : "=rm" (ret));
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+static void init_seg(void)
|
|
+{
|
|
+ unsigned int *target = mmap(
|
|
+ NULL, sizeof(unsigned int),
|
|
+ PROT_READ | PROT_WRITE,
|
|
+ MAP_PRIVATE | MAP_ANONYMOUS | MAP_32BIT, -1, 0);
|
|
+ if (target == MAP_FAILED)
|
|
+ err(1, "mmap");
|
|
+
|
|
+ *target = EXPECTED_VALUE;
|
|
+
|
|
+ printf("\tsegment base address = 0x%lx\n", (unsigned long)target);
|
|
+
|
|
+ struct user_desc desc = {
|
|
+ .entry_number = 0,
|
|
+ .base_addr = (unsigned int)(uintptr_t)target,
|
|
+ .limit = sizeof(unsigned int) - 1,
|
|
+ .seg_32bit = 1,
|
|
+ .contents = 0, /* Data, grow-up */
|
|
+ .read_exec_only = 0,
|
|
+ .limit_in_pages = 0,
|
|
+ .seg_not_present = 0,
|
|
+ .useable = 0
|
|
+ };
|
|
+ if (false && syscall(SYS_modify_ldt, 1, &desc, sizeof(desc)) == 0) {
|
|
+ printf("\tusing LDT slot 0\n");
|
|
+ asm volatile ("mov %0, %" SEG :: "rm" ((unsigned short)0x7));
|
|
+ } else {
|
|
+ /* No modify_ldt for us (configured out, perhaps) */
|
|
+
|
|
+ struct user_desc *low_desc = mmap(
|
|
+ NULL, sizeof(desc),
|
|
+ PROT_READ | PROT_WRITE,
|
|
+ MAP_PRIVATE | MAP_ANONYMOUS | MAP_32BIT, -1, 0);
|
|
+ memcpy(low_desc, &desc, sizeof(desc));
|
|
+
|
|
+ low_desc->entry_number = -1;
|
|
+
|
|
+ /* 32-bit set_thread_area */
|
|
+ long ret;
|
|
+ asm volatile ("int $0x80"
|
|
+ : "=a" (ret), "+m" (*low_desc)
|
|
+ : "a" (243), "b" (low_desc)
|
|
+#ifdef __x86_64__
|
|
+ : "r8", "r9", "r10", "r11"
|
|
+#endif
|
|
+ );
|
|
+ memcpy(&desc, low_desc, sizeof(desc));
|
|
+ munmap(low_desc, sizeof(desc));
|
|
+
|
|
+ if (ret != 0) {
|
|
+ printf("[NOTE]\tcould not create a segment -- can't test anything\n");
|
|
+ exit(0);
|
|
+ }
|
|
+ printf("\tusing GDT slot %d\n", desc.entry_number);
|
|
+
|
|
+ unsigned short sel = (unsigned short)((desc.entry_number << 3) | 0x3);
|
|
+ asm volatile ("mov %0, %" SEG :: "rm" (sel));
|
|
+ }
|
|
+}
|
|
+
|
|
+static void tracee_zap_segment(void)
|
|
+{
|
|
+ /*
|
|
+ * The tracer will redirect execution here. This is meant to
|
|
+ * work like gdb's 'p func()' feature. The tricky bit is that
|
|
+ * we modify a segment register in order to make sure that ptrace
|
|
+ * can correctly restore segment registers.
|
|
+ */
|
|
+ printf("\tTracee: in tracee_zap_segment()\n");
|
|
+
|
|
+ /*
|
|
+ * Write a nonzero selector with base zero to the segment register.
|
|
+ * Using a null selector would defeat the test on AMD pre-Zen2
|
|
+ * CPUs, as such CPUs don't clear the base when loading a null
|
|
+ * selector.
|
|
+ */
|
|
+ unsigned short sel;
|
|
+ asm volatile ("mov %%ss, %0\n\t"
|
|
+ "mov %0, %" SEG
|
|
+ : "=rm" (sel));
|
|
+
|
|
+ pid_t pid = getpid(), tid = syscall(SYS_gettid);
|
|
+
|
|
+ printf("\tTracee is going back to sleep\n");
|
|
+ syscall(SYS_tgkill, pid, tid, SIGSTOP);
|
|
+
|
|
+ /* Should not get here. */
|
|
+ while (true) {
|
|
+ printf("[FAIL]\tTracee hit unreachable code\n");
|
|
+ pause();
|
|
+ }
|
|
+}
|
|
+
|
|
+int main()
|
|
+{
|
|
+ printf("\tSetting up a segment\n");
|
|
+ init_seg();
|
|
+
|
|
+ unsigned int val = dereference_seg_base();
|
|
+ if (val != EXPECTED_VALUE) {
|
|
+ printf("[FAIL]\tseg[0] == %x; should be %x\n", val, EXPECTED_VALUE);
|
|
+ return 1;
|
|
+ }
|
|
+ printf("[OK]\tThe segment points to the right place.\n");
|
|
+
|
|
+ pid_t chld = fork();
|
|
+ if (chld < 0)
|
|
+ err(1, "fork");
|
|
+
|
|
+ if (chld == 0) {
|
|
+ prctl(PR_SET_PDEATHSIG, SIGKILL, 0, 0, 0, 0);
|
|
+
|
|
+ if (ptrace(PTRACE_TRACEME, 0, 0, 0) != 0)
|
|
+ err(1, "PTRACE_TRACEME");
|
|
+
|
|
+ pid_t pid = getpid(), tid = syscall(SYS_gettid);
|
|
+
|
|
+ printf("\tTracee will take a nap until signaled\n");
|
|
+ syscall(SYS_tgkill, pid, tid, SIGSTOP);
|
|
+
|
|
+ printf("\tTracee was resumed. Will re-check segment.\n");
|
|
+
|
|
+ val = dereference_seg_base();
|
|
+ if (val != EXPECTED_VALUE) {
|
|
+ printf("[FAIL]\tseg[0] == %x; should be %x\n", val, EXPECTED_VALUE);
|
|
+ exit(1);
|
|
+ }
|
|
+
|
|
+ printf("[OK]\tThe segment points to the right place.\n");
|
|
+ exit(0);
|
|
+ }
|
|
+
|
|
+ int status;
|
|
+
|
|
+ /* Wait for SIGSTOP. */
|
|
+ if (waitpid(chld, &status, 0) != chld || !WIFSTOPPED(status))
|
|
+ err(1, "waitpid");
|
|
+
|
|
+ struct user_regs_struct regs;
|
|
+
|
|
+ if (ptrace(PTRACE_GETREGS, chld, NULL, ®s) != 0)
|
|
+ err(1, "PTRACE_GETREGS");
|
|
+
|
|
+#ifdef __x86_64__
|
|
+ printf("\tChild GS=0x%lx, GSBASE=0x%lx\n", (unsigned long)regs.gs, (unsigned long)regs.gs_base);
|
|
+#else
|
|
+ printf("\tChild FS=0x%lx\n", (unsigned long)regs.xfs);
|
|
+#endif
|
|
+
|
|
+ struct user_regs_struct regs2 = regs;
|
|
+#ifdef __x86_64__
|
|
+ regs2.rip = (unsigned long)tracee_zap_segment;
|
|
+ regs2.rsp -= 128; /* Don't clobber the redzone. */
|
|
+#else
|
|
+ regs2.eip = (unsigned long)tracee_zap_segment;
|
|
+#endif
|
|
+
|
|
+ printf("\tTracer: redirecting tracee to tracee_zap_segment()\n");
|
|
+ if (ptrace(PTRACE_SETREGS, chld, NULL, ®s2) != 0)
|
|
+ err(1, "PTRACE_GETREGS");
|
|
+ if (ptrace(PTRACE_CONT, chld, NULL, NULL) != 0)
|
|
+ err(1, "PTRACE_GETREGS");
|
|
+
|
|
+ /* Wait for SIGSTOP. */
|
|
+ if (waitpid(chld, &status, 0) != chld || !WIFSTOPPED(status))
|
|
+ err(1, "waitpid");
|
|
+
|
|
+ printf("\tTracer: restoring tracee state\n");
|
|
+ if (ptrace(PTRACE_SETREGS, chld, NULL, ®s) != 0)
|
|
+ err(1, "PTRACE_GETREGS");
|
|
+ if (ptrace(PTRACE_DETACH, chld, NULL, NULL) != 0)
|
|
+ err(1, "PTRACE_GETREGS");
|
|
+
|
|
+ /* Wait for SIGSTOP. */
|
|
+ if (waitpid(chld, &status, 0) != chld)
|
|
+ err(1, "waitpid");
|
|
+
|
|
+ if (WIFSIGNALED(status)) {
|
|
+ printf("[FAIL]\tTracee crashed\n");
|
|
+ return 1;
|
|
+ }
|
|
+
|
|
+ if (!WIFEXITED(status)) {
|
|
+ printf("[FAIL]\tTracee stopped for an unexpected reason: %d\n", status);
|
|
+ return 1;
|
|
+ }
|
|
+
|
|
+ int exitcode = WEXITSTATUS(status);
|
|
+ if (exitcode != 0) {
|
|
+ printf("[FAIL]\tTracee reported failure\n");
|
|
+ return 1;
|
|
+ }
|
|
+
|
|
+ printf("[OK]\tAll is well.\n");
|
|
+ return 0;
|
|
+}
|
|
--
|
|
2.27.0.112.g101b3204f3
|
|
|
|
|
|
From 41b34f043560c89213a144c3d55ffd051b322a94 Mon Sep 17 00:00:00 2001
|
|
From: Andy Lutomirski <luto@kernel.org>
|
|
Date: Wed, 27 May 2020 16:02:36 -0700
|
|
Subject: [PATCH 20/21] selftests/x86: Add a syscall_arg_fault_64 test for
|
|
negative GSBASE
|
|
|
|
If the kernel erroneously allows WRGSBASE and user code writes a
|
|
negative value, paranoid_entry will get confused. Check for this by
|
|
writing a negative value to GSBASE and doing SYSENTER with TF set. A
|
|
successful run looks like:
|
|
|
|
[RUN] SYSENTER with TF, invalid state, and GSBASE < 0
|
|
[SKIP] Illegal instruction
|
|
|
|
A failed run causes a kernel hang, and I believe it's because we
|
|
double-fault and then get a never ending series of page faults and,
|
|
when we exhaust the double fault stack, we double fault again,
|
|
starting the process over.
|
|
|
|
Signed-off-by: Andy Lutomirski <luto@kernel.org>
|
|
Signed-off-by: Borislav Petkov <bp@suse.de>
|
|
Link: https://lkml.kernel.org/r/f4f71efc91b9eae5e3dae21c9aee1c70cf5f370e.1590620529.git.luto@kernel.org
|
|
---
|
|
.../testing/selftests/x86/syscall_arg_fault.c | 26 +++++++++++++++++++
|
|
1 file changed, 26 insertions(+)
|
|
|
|
diff --git a/tools/testing/selftests/x86/syscall_arg_fault.c b/tools/testing/selftests/x86/syscall_arg_fault.c
|
|
index bc0ecc2e8..62fba4086 100644
|
|
--- a/tools/testing/selftests/x86/syscall_arg_fault.c
|
|
+++ b/tools/testing/selftests/x86/syscall_arg_fault.c
|
|
@@ -72,6 +72,7 @@ static void sigsegv_or_sigbus(int sig, siginfo_t *info, void *ctx_void)
|
|
if (ax != -EFAULT && ax != -ENOSYS) {
|
|
printf("[FAIL]\tAX had the wrong value: 0x%lx\n",
|
|
(unsigned long)ax);
|
|
+ printf("\tIP = 0x%lx\n", (unsigned long)ctx->uc_mcontext.gregs[REG_IP]);
|
|
n_errs++;
|
|
} else {
|
|
printf("[OK]\tSeems okay\n");
|
|
@@ -226,5 +227,30 @@ int main()
|
|
}
|
|
set_eflags(get_eflags() & ~X86_EFLAGS_TF);
|
|
|
|
+#ifdef __x86_64__
|
|
+ printf("[RUN]\tSYSENTER with TF, invalid state, and GSBASE < 0\n");
|
|
+
|
|
+ if (sigsetjmp(jmpbuf, 1) == 0) {
|
|
+ sigtrap_consecutive_syscalls = 0;
|
|
+
|
|
+ asm volatile ("wrgsbase %%rax\n\t"
|
|
+ :: "a" (0xffffffffffff0000UL));
|
|
+
|
|
+ set_eflags(get_eflags() | X86_EFLAGS_TF);
|
|
+ asm volatile (
|
|
+ "movl $-1, %%eax\n\t"
|
|
+ "movl $-1, %%ebx\n\t"
|
|
+ "movl $-1, %%ecx\n\t"
|
|
+ "movl $-1, %%edx\n\t"
|
|
+ "movl $-1, %%esi\n\t"
|
|
+ "movl $-1, %%edi\n\t"
|
|
+ "movl $-1, %%ebp\n\t"
|
|
+ "movl $-1, %%esp\n\t"
|
|
+ "sysenter"
|
|
+ : : : "memory", "flags");
|
|
+ }
|
|
+ set_eflags(get_eflags() & ~X86_EFLAGS_TF);
|
|
+#endif
|
|
+
|
|
return 0;
|
|
}
|
|
--
|
|
2.27.0.112.g101b3204f3
|
|
|
|
|
|
From a1a6a47bf355c2a4d9f17fdb313306cb7a935bc4 Mon Sep 17 00:00:00 2001
|
|
From: Oleksandr Natalenko <oleksandr@redhat.com>
|
|
Date: Fri, 26 Jun 2020 09:32:08 +0200
|
|
Subject: [PATCH 21/21] fsgsbase-5.7: remove erroneous if expression bit
|
|
|
|
Signed-off-by: Oleksandr Natalenko <oleksandr@redhat.com>
|
|
---
|
|
tools/testing/selftests/x86/fsgsbase_restore.c | 2 +-
|
|
1 file changed, 1 insertion(+), 1 deletion(-)
|
|
|
|
diff --git a/tools/testing/selftests/x86/fsgsbase_restore.c b/tools/testing/selftests/x86/fsgsbase_restore.c
|
|
index 70502a708..6fffadc51 100644
|
|
--- a/tools/testing/selftests/x86/fsgsbase_restore.c
|
|
+++ b/tools/testing/selftests/x86/fsgsbase_restore.c
|
|
@@ -70,7 +70,7 @@ static void init_seg(void)
|
|
.seg_not_present = 0,
|
|
.useable = 0
|
|
};
|
|
- if (false && syscall(SYS_modify_ldt, 1, &desc, sizeof(desc)) == 0) {
|
|
+ if (syscall(SYS_modify_ldt, 1, &desc, sizeof(desc)) == 0) {
|
|
printf("\tusing LDT slot 0\n");
|
|
asm volatile ("mov %0, %" SEG :: "rm" ((unsigned short)0x7));
|
|
} else {
|
|
--
|
|
2.27.0.112.g101b3204f3
|
|
|