NACK/Cmnt: [Jammy][PATCH 5/6] x86/entry: Convert INT 0x80 emulation to IDTENTRY
Manuel Diewald
manuel.diewald at canonical.com
Wed Jun 19 08:47:48 UTC 2024
On Tue, Jun 11, 2024 at 04:11:44PM -0400, Yuxuan Luo wrote:
> From: Thomas Gleixner <tglx at linutronix.de>
>
> There is no real reason to have a separate ASM entry point implementation
> for the legacy INT 0x80 syscall emulation on 64-bit.
>
> IDTENTRY provides all the functionality needed with the only difference
> that it does not:
>
> - save the syscall number (AX) into pt_regs::orig_ax
> - set pt_regs::ax to -ENOSYS
>
> Both can be done safely in the C code of an IDTENTRY before invoking any of
> the syscall related functions which depend on this convention.
>
> Aside of ASM code reduction this prepares for detecting and handling a
> local APIC injected vector 0x80.
>
> [ kirill.shutemov: More verbose comments ]
> Suggested-by: Linus Torvalds <torvalds at linuxfoundation.org>
> Signed-off-by: Thomas Gleixner <tglx at linutronix.de>
> Signed-off-by: Kirill A. Shutemov <kirill.shutemov at linux.intel.com>
> Signed-off-by: Dave Hansen <dave.hansen at linux.intel.com>
> Reviewed-by: Borislav Petkov (AMD) <bp at alien8.de>
> Cc: <stable at vger.kernel.org> # v6.0+
> (backported from commit be5341eb0d43b1e754799498bd2e8756cc167a41)
> [yuxuan.luo:
> - entry_64_compat.S: ignore the conflict and remove the macro.
> - proto.h: ignore the conflict and remove the declarations.
> ]
> CVE-2024-25744
> Signed-off-by: Yuxuan Luo <yuxuan.luo at canonical.com>
> ---
> arch/x86/entry/common.c | 58 ++++++++++++++++-
> arch/x86/entry/entry_64_compat.S | 106 -------------------------------
> arch/x86/include/asm/idtentry.h | 4 ++
> arch/x86/include/asm/proto.h | 4 --
> arch/x86/kernel/idt.c | 2 +-
> arch/x86/xen/enlighten_pv.c | 2 +-
> arch/x86/xen/xen-asm.S | 2 +-
> 7 files changed, 64 insertions(+), 114 deletions(-)
>
> diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
> index 3ea32cbca6513..5adc7a17f37c9 100644
> --- a/arch/x86/entry/common.c
> +++ b/arch/x86/entry/common.c
> @@ -119,7 +119,62 @@ static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs, int nr)
> }
> }
>
> -/* Handles int $0x80 */
> +#ifdef CONFIG_IA32_EMULATION
> +/**
> + * int80_emulation - 32-bit legacy syscall entry
> + *
> + * This entry point can be used by 32-bit and 64-bit programs to perform
> + * 32-bit system calls. Instances of INT $0x80 can be found inline in
> + * various programs and libraries. It is also used by the vDSO's
> + * __kernel_vsyscall fallback for hardware that doesn't support a faster
> + * entry method. Restarted 32-bit system calls also fall back to INT
> + * $0x80 regardless of what instruction was originally used to do the
> + * system call.
> + *
> + * This is considered a slow path. It is not used by most libc
> + * implementations on modern hardware except during process startup.
> + *
> + * The arguments for the INT $0x80 based syscall are on stack in the
> + * pt_regs structure:
> + * eax: system call number
> + * ebx, ecx, edx, esi, edi, ebp: arg1 - arg 6
> + */
> +DEFINE_IDTENTRY_RAW(int80_emulation)
This code was changed by upstream commit
7390db8aea0d64e9deb28b8e1ce716f5020c7ee5 (backports are available for 6.1
and 6.6) to mitigate native BHI. We need to include those changes in
this patchset.
> +{
> + int nr;
> +
> + /* Establish kernel context. */
> + enter_from_user_mode(regs);
> +
> + instrumentation_begin();
> + add_random_kstack_offset();
> +
> + /*
> + * The low level idtentry code pushed -1 into regs::orig_ax
> + * and regs::ax contains the syscall number.
> + *
> + * User tracing code (ptrace or signal handlers) might assume
> + * that the regs::orig_ax contains a 32-bit number on invoking
> + * a 32-bit syscall.
> + *
> + * Establish the syscall convention by saving the 32bit truncated
> + * syscall number in regs::orig_ax and by invalidating regs::ax.
> + */
> + regs->orig_ax = regs->ax & GENMASK(31, 0);
> + regs->ax = -ENOSYS;
> +
> + nr = syscall_32_enter(regs);
> +
> + local_irq_enable();
> + nr = syscall_enter_from_user_mode_work(regs, nr);
> + do_syscall_32_irqs_on(regs, nr);
> +
> + instrumentation_end();
> + syscall_exit_to_user_mode(regs);
> +}
> +#else /* CONFIG_IA32_EMULATION */
> +
> +/* Handles int $0x80 on a 32bit kernel */
> __visible noinstr void do_int80_syscall_32(struct pt_regs *regs)
> {
> int nr = syscall_32_enter(regs);
> @@ -138,6 +193,7 @@ __visible noinstr void do_int80_syscall_32(struct pt_regs *regs)
> instrumentation_end();
> syscall_exit_to_user_mode(regs);
> }
> +#endif /* !CONFIG_IA32_EMULATION */
>
> static noinstr bool __do_fast_syscall_32(struct pt_regs *regs)
> {
> diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
> index d03f0cfbcb1e8..1b0ebbfd0d7f2 100644
> --- a/arch/x86/entry/entry_64_compat.S
> +++ b/arch/x86/entry/entry_64_compat.S
> @@ -324,109 +324,3 @@ sysret32_from_system_call:
> CLEAR_CPU_BUFFERS
> sysretl
> SYM_CODE_END(entry_SYSCALL_compat)
> -
> -/*
> - * 32-bit legacy system call entry.
> - *
> - * 32-bit x86 Linux system calls traditionally used the INT $0x80
> - * instruction. INT $0x80 lands here.
> - *
> - * This entry point can be used by 32-bit and 64-bit programs to perform
> - * 32-bit system calls. Instances of INT $0x80 can be found inline in
> - * various programs and libraries. It is also used by the vDSO's
> - * __kernel_vsyscall fallback for hardware that doesn't support a faster
> - * entry method. Restarted 32-bit system calls also fall back to INT
> - * $0x80 regardless of what instruction was originally used to do the
> - * system call.
> - *
> - * This is considered a slow path. It is not used by most libc
> - * implementations on modern hardware except during process startup.
> - *
> - * Arguments:
> - * eax system call number
> - * ebx arg1
> - * ecx arg2
> - * edx arg3
> - * esi arg4
> - * edi arg5
> - * ebp arg6
> - */
> -SYM_CODE_START(entry_INT80_compat)
> - UNWIND_HINT_ENTRY
> - /*
> - * Interrupts are off on entry.
> - */
> - ASM_CLAC /* Do this early to minimize exposure */
> - SWAPGS
> -
> - /*
> - * User tracing code (ptrace or signal handlers) might assume that
> - * the saved RAX contains a 32-bit number when we're invoking a 32-bit
> - * syscall. Just in case the high bits are nonzero, zero-extend
> - * the syscall number. (This could almost certainly be deleted
> - * with no ill effects.)
> - */
> - movl %eax, %eax
> -
> - /* switch to thread stack expects orig_ax and rdi to be pushed */
> - pushq %rax /* pt_regs->orig_ax */
> - pushq %rdi /* pt_regs->di */
> -
> - /* Need to switch before accessing the thread stack. */
> - SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi
> -
> - /* In the Xen PV case we already run on the thread stack. */
> - ALTERNATIVE "", "jmp .Lint80_keep_stack", X86_FEATURE_XENPV
> -
> - movq %rsp, %rdi
> - movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
> -
> - pushq 6*8(%rdi) /* regs->ss */
> - pushq 5*8(%rdi) /* regs->rsp */
> - pushq 4*8(%rdi) /* regs->eflags */
> - pushq 3*8(%rdi) /* regs->cs */
> - pushq 2*8(%rdi) /* regs->ip */
> - pushq 1*8(%rdi) /* regs->orig_ax */
> - pushq (%rdi) /* pt_regs->di */
> -.Lint80_keep_stack:
> -
> - pushq %rsi /* pt_regs->si */
> - xorl %esi, %esi /* nospec si */
> - pushq %rdx /* pt_regs->dx */
> - xorl %edx, %edx /* nospec dx */
> - pushq %rcx /* pt_regs->cx */
> - xorl %ecx, %ecx /* nospec cx */
> - pushq $-ENOSYS /* pt_regs->ax */
> - pushq %r8 /* pt_regs->r8 */
> - xorl %r8d, %r8d /* nospec r8 */
> - pushq %r9 /* pt_regs->r9 */
> - xorl %r9d, %r9d /* nospec r9 */
> - pushq %r10 /* pt_regs->r10*/
> - xorl %r10d, %r10d /* nospec r10 */
> - pushq %r11 /* pt_regs->r11 */
> - xorl %r11d, %r11d /* nospec r11 */
> - pushq %rbx /* pt_regs->rbx */
> - xorl %ebx, %ebx /* nospec rbx */
> - pushq %rbp /* pt_regs->rbp */
> - xorl %ebp, %ebp /* nospec rbp */
> - pushq %r12 /* pt_regs->r12 */
> - xorl %r12d, %r12d /* nospec r12 */
> - pushq %r13 /* pt_regs->r13 */
> - xorl %r13d, %r13d /* nospec r13 */
> - pushq %r14 /* pt_regs->r14 */
> - xorl %r14d, %r14d /* nospec r14 */
> - pushq %r15 /* pt_regs->r15 */
> - xorl %r15d, %r15d /* nospec r15 */
> -
> - UNWIND_HINT_REGS
> -
> - cld
> -
> - IBRS_ENTER
> - UNTRAIN_RET
> - CLEAR_BRANCH_HISTORY
CLEAR_BRANCH_HISTORY is part of the native BHI mitigation that we backported
from linux-5.15.y. Removing this macro without also backporting the
mitigation for the IDTENTRY path will make us vulnerable to CVE-2024-2201
again. I think the issue here is that we backported the fixes for
CVE-2024-25744 from linux-6.7.y, which did not receive the native BHI
mitigation patches. We need to backport the changes from upstream commit
7390db8aea0d64e9deb28b8e1ce716f5020c7ee5 in order to include the mitigation
in the IDTENTRY code for int80 emulation.
> -
> - movq %rsp, %rdi
> - call do_int80_syscall_32
> - jmp swapgs_restore_regs_and_return_to_usermode
> -SYM_CODE_END(entry_INT80_compat)
> diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h
> index 1345088e99025..2ab668956741d 100644
> --- a/arch/x86/include/asm/idtentry.h
> +++ b/arch/x86/include/asm/idtentry.h
> @@ -567,6 +567,10 @@ DECLARE_IDTENTRY_RAW(X86_TRAP_UD, exc_invalid_op);
> DECLARE_IDTENTRY_RAW(X86_TRAP_BP, exc_int3);
> DECLARE_IDTENTRY_RAW_ERRORCODE(X86_TRAP_PF, exc_page_fault);
>
> +#if defined(CONFIG_IA32_EMULATION)
> +DECLARE_IDTENTRY_RAW(IA32_SYSCALL_VECTOR, int80_emulation);
> +#endif
> +
> #ifdef CONFIG_X86_MCE
> #ifdef CONFIG_X86_64
> DECLARE_IDTENTRY_MCE(X86_TRAP_MC, exc_machine_check);
> diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h
> index feed36d44d044..c4d331fe65ffd 100644
> --- a/arch/x86/include/asm/proto.h
> +++ b/arch/x86/include/asm/proto.h
> @@ -28,10 +28,6 @@ void entry_SYSENTER_compat(void);
> void __end_entry_SYSENTER_compat(void);
> void entry_SYSCALL_compat(void);
> void entry_SYSCALL_compat_safe_stack(void);
> -void entry_INT80_compat(void);
> -#ifdef CONFIG_XEN_PV
> -void xen_entry_INT80_compat(void);
> -#endif
> #endif
>
> void x86_configure_nx(void);
> diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c
> index df0fa695bb09c..b9e806ac1de77 100644
> --- a/arch/x86/kernel/idt.c
> +++ b/arch/x86/kernel/idt.c
> @@ -109,7 +109,7 @@ static const __initconst struct idt_data def_idts[] = {
>
> SYSG(X86_TRAP_OF, asm_exc_overflow),
> #if defined(CONFIG_IA32_EMULATION)
> - SYSG(IA32_SYSCALL_VECTOR, entry_INT80_compat),
> + SYSG(IA32_SYSCALL_VECTOR, asm_int80_emulation),
> #elif defined(CONFIG_X86_32)
> SYSG(IA32_SYSCALL_VECTOR, entry_INT80_32),
> #endif
> diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
> index 998db0257e2ad..47aabc173b108 100644
> --- a/arch/x86/xen/enlighten_pv.c
> +++ b/arch/x86/xen/enlighten_pv.c
> @@ -609,7 +609,7 @@ static struct trap_array_entry trap_array[] = {
> TRAP_ENTRY(exc_int3, false ),
> TRAP_ENTRY(exc_overflow, false ),
> #ifdef CONFIG_IA32_EMULATION
> - { entry_INT80_compat, xen_entry_INT80_compat, false },
> + TRAP_ENTRY(int80_emulation, false ),
> #endif
> TRAP_ENTRY(exc_page_fault, false ),
> TRAP_ENTRY(exc_divide_error, false ),
> diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S
> index 1b757a1ee1bb6..56f2407564c2a 100644
> --- a/arch/x86/xen/xen-asm.S
> +++ b/arch/x86/xen/xen-asm.S
> @@ -151,7 +151,7 @@ xen_pv_trap asm_xenpv_exc_machine_check
> #endif /* CONFIG_X86_MCE */
> xen_pv_trap asm_exc_simd_coprocessor_error
> #ifdef CONFIG_IA32_EMULATION
> -xen_pv_trap entry_INT80_compat
> +xen_pv_trap asm_int80_emulation
> #endif
> xen_pv_trap asm_exc_xen_unknown_trap
> xen_pv_trap asm_exc_xen_hypervisor_callback
> --
> 2.34.1
>
>
> --
> kernel-team mailing list
> kernel-team at lists.ubuntu.com
> https://lists.ubuntu.com/mailman/listinfo/kernel-team
The Focal patchset looks good to me, so in theory we could apply it
as-is, without it being resubmitted.
--
Manuel
-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 833 bytes
Desc: not available
URL: <https://lists.ubuntu.com/archives/kernel-team/attachments/20240619/2b23cc59/attachment.sig>
More information about the kernel-team
mailing list