2025-03-14 15:12:16 +00:00
|
|
|
// SPDX-License-Identifier: GPL-2.0-only
|
|
|
|
/* 64-bit system call dispatch */
|
2005-04-16 22:20:36 +00:00
|
|
|
|
|
|
|
#include <linux/linkage.h>
|
|
|
|
#include <linux/sys.h>
|
|
|
|
#include <linux/cache.h>
|
2019-10-08 22:40:48 +00:00
|
|
|
#include <linux/syscalls.h>
|
2025-03-14 15:12:16 +00:00
|
|
|
#include <linux/entry-common.h>
|
|
|
|
#include <linux/nospec.h>
|
2013-08-05 22:02:35 +00:00
|
|
|
#include <asm/syscall.h>
|
2005-04-16 22:20:36 +00:00
|
|
|
|
2021-05-17 07:38:10 +00:00
|
|
|
/*
 * First expansion of the generated syscall tables: emit an extern
 * prototype for every 64-bit (and, when enabled, x32) syscall entry
 * stub so the table and dispatchers below can reference them.
 */
#define __SYSCALL(nr, sym) extern long __x64_##sym(const struct pt_regs *);
#define __SYSCALL_NORETURN(nr, sym) extern long __noreturn __x64_##sym(const struct pt_regs *);
#include <asm/syscalls_64.h>
#ifdef CONFIG_X86_X32_ABI
#include <asm/syscalls_x32.h>
#endif
#undef __SYSCALL
#undef __SYSCALL_NORETURN

/*
 * For the remaining expansions, noreturn syscalls need no special
 * treatment — alias the NORETURN variant to plain __SYSCALL.
 */
#define __SYSCALL_NORETURN __SYSCALL
|
2005-04-16 22:20:36 +00:00
|
|
|
|
2024-04-03 23:36:44 +00:00
|
|
|
/*
|
|
|
|
* The sys_call_table[] is no longer used for system calls, but
|
|
|
|
* kernel/trace/trace_syscalls.c still wants to know the system
|
|
|
|
* call address.
|
|
|
|
*/
|
x86/syscalls: Stop filling syscall arrays with *_sys_ni_syscall
This is a follow-up cleanup after switching to the generic syscalltbl.sh.
The old x86 specific script skipped non-existing syscalls. So, the
generated syscalls_64.h, for example, had a big hole in the syscall numbers
335-423 range. That is why there exists [0 ... __NR_*_syscall_max] =
&__*_sys_ni_syscall.
The new script, scripts/syscalltbl.sh automatically fills holes
with __SYSCALL(<nr>, sys_ni_syscall), hence such ugly code can
go away. The designated initializers, '[nr] =' are also unneeded.
Also, there is no need to give __NR_*_syscall_max+1 because the array
size is implied by the number of syscalls in the generated headers.
Hence, there is no need to include <asm/unistd.h>, either.
Signed-off-by: Masahiro Yamada <masahiroy@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/20210517073815.97426-4-masahiroy@kernel.org
2021-05-17 07:38:11 +00:00
|
|
|
/* Second expansion: emit each entry as a function pointer for the table. */
#define __SYSCALL(nr, sym) __x64_##sym,

const sys_call_ptr_t sys_call_table[] = {
#include <asm/syscalls_64.h>
};
#undef __SYSCALL
|
2024-04-03 23:36:44 +00:00
|
|
|
|
|
|
|
/* Third expansion: emit each entry as a switch case calling its stub. */
#define __SYSCALL(nr, sym) case nr: return __x64_##sym(regs);

/*
 * Dispatch a native 64-bit system call.
 *
 * @regs: user register state; the syscall arguments are read from it
 *        by the __x64_* stubs.
 * @nr:   syscall number; callers are expected to have range-checked it
 *        (out-of-range numbers fall through to sys_ni_syscall).
 *
 * Returns the syscall's return value.
 */
long x64_sys_call(const struct pt_regs *regs, unsigned int nr)
{
	switch (nr) {
	#include <asm/syscalls_64.h>
	default: return __x64_sys_ni_syscall(regs);
	}
}
|
2025-03-14 15:12:16 +00:00
|
|
|
|
2025-03-14 15:12:17 +00:00
|
|
|
#ifdef CONFIG_X86_X32_ABI
|
|
|
|
long x32_sys_call(const struct pt_regs *regs, unsigned int nr)
|
|
|
|
{
|
|
|
|
switch (nr) {
|
|
|
|
#include <asm/syscalls_x32.h>
|
|
|
|
default: return __x64_sys_ni_syscall(regs);
|
|
|
|
}
|
2025-03-14 15:12:19 +00:00
|
|
|
}
|
2025-03-14 15:12:17 +00:00
|
|
|
#endif
|
|
|
|
|
2025-03-14 15:12:16 +00:00
|
|
|
/*
 * Attempt to dispatch @nr as a native 64-bit system call.
 *
 * Returns true if @nr was in range and the syscall was invoked (its
 * return value stored in regs->ax), false if @nr is not a valid
 * native syscall number.
 */
static __always_inline bool do_syscall_x64(struct pt_regs *regs, int nr)
{
	/*
	 * View the number as unsigned: negative values become huge and
	 * fail the range check below.
	 */
	unsigned int unr = nr;

	if (unlikely(unr >= NR_syscalls))
		return false;

	/* Clamp the index under speculation before using it. */
	unr = array_index_nospec(unr, NR_syscalls);
	regs->ax = x64_sys_call(regs, unr);
	return true;
}
|
|
|
|
|
|
|
|
/*
 * Attempt to dispatch @nr as an x32 ABI system call.
 *
 * Returns true if x32 is enabled and @nr (after removing
 * __X32_SYSCALL_BIT) was in range, with the syscall's return value
 * stored in regs->ax; false otherwise.
 */
static __always_inline bool do_syscall_x32(struct pt_regs *regs, int nr)
{
	/*
	 * Rebase the number onto the x32 table. Numbers below
	 * __X32_SYSCALL_BIT wrap to huge unsigned values and fail the
	 * range check.
	 */
	unsigned int xnr = nr - __X32_SYSCALL_BIT;

	if (!IS_ENABLED(CONFIG_X86_X32_ABI))
		return false;

	if (unlikely(xnr >= X32_NR_syscalls))
		return false;

	/* Clamp the index under speculation before using it. */
	xnr = array_index_nospec(xnr, X32_NR_syscalls);
	regs->ax = x32_sys_call(regs, xnr);
	return true;
}
|
|
|
|
|
|
|
|
/*
 * Main C entry point for 64-bit system calls.
 *
 * @regs: user register state at syscall entry.
 * @nr:   raw syscall number from userspace (may be -1).
 *
 * Returns true to return using SYSRET, or false to use IRET.
 */
__visible noinstr bool do_syscall_64(struct pt_regs *regs, int nr)
{
	add_random_kstack_offset();
	/* Entry work may rewrite the syscall number; use the returned value. */
	nr = syscall_enter_from_user_mode(regs, nr);

	instrumentation_begin();

	if (!do_syscall_x64(regs, nr) && !do_syscall_x32(regs, nr) && nr != -1) {
		/* Invalid system call, but still a system call. */
		regs->ax = __x64_sys_ni_syscall(regs);
	}

	instrumentation_end();
	syscall_exit_to_user_mode(regs);

	/*
	 * Check that the register state is valid for using SYSRET to exit
	 * to userspace. Otherwise use the slower but fully capable IRET
	 * exit path.
	 */

	/* XEN PV guests always use the IRET path */
	if (cpu_feature_enabled(X86_FEATURE_XENPV))
		return false;

	/* SYSRET requires RCX == RIP and R11 == EFLAGS */
	if (unlikely(regs->cx != regs->ip || regs->r11 != regs->flags))
		return false;

	/* CS and SS must match the values set in MSR_STAR */
	if (unlikely(regs->cs != __USER_CS || regs->ss != __USER_DS))
		return false;

	/*
	 * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
	 * in kernel space. This essentially lets the user take over
	 * the kernel, since userspace controls RSP.
	 *
	 * TASK_SIZE_MAX covers all user-accessible addresses other than
	 * the deprecated vsyscall page.
	 */
	if (unlikely(regs->ip >= TASK_SIZE_MAX))
		return false;

	/*
	 * SYSRET cannot restore RF. It can restore TF, but unlike IRET,
	 * restoring TF results in a trap from userspace immediately after
	 * SYSRET.
	 */
	if (unlikely(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF)))
		return false;

	/* Use SYSRET to exit to userspace */
	return true;
}
|