if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY) {
    /*
     * Subtlety here: if ptrace pokes something larger than
     * 2^32-1 into orig_ax, this truncates it.  This may or
     * may not be necessary, but it matches the old asm
     * behavior.
     */
    nr = syscall_trace_enter(regs);
}

if (likely(nr < IA32_NR_syscalls)) {
    /*
     * It's possible that a 32-bit syscall implementation
     * takes a 64-bit parameter but nonetheless assumes that
     * the high bits are zero.  Make sure we zero-extend all
     * of the args.
     */
    regs->ax = ia32_sys_call_table[nr](
        (unsigned int)regs->bx, (unsigned int)regs->cx,
        (unsigned int)regs->dx, (unsigned int)regs->si,
        (unsigned int)regs->di, (unsigned int)regs->bp);
}
#ifdef __i386__
/* this struct defines the way the registers are stored on the
   stack during a system call. */

#ifndef __KERNEL__

struct pt_regs {
    long ebx;
    long ecx;
    long edx;
    long esi;
    long edi;
    long ebp;
    long eax;
    int  xds;
    int  xes;
    int  xfs;
    int  xgs;
    long orig_eax;
    long eip;
    int  xcs;
    long eflags;
    long esp;
    int  xss;
};
if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY) {
    /*
     * Subtlety here: if ptrace pokes something larger than
     * 2^32-1 into orig_ax, this truncates it.  This may or
     * may not be necessary, but it matches the old asm
     * behavior.
     */
    nr = syscall_trace_enter(regs);
}

if (likely(nr < IA32_NR_syscalls)) {
    /*
     * It's possible that a 32-bit syscall implementation
     * takes a 64-bit parameter but nonetheless assumes that
     * the high bits are zero.  Make sure we zero-extend all
     * of the args.
     */
    regs->ax = ia32_sys_call_table[nr](
        (unsigned int)regs->bx, (unsigned int)regs->cx,
        (unsigned int)regs->dx, (unsigned int)regs->si,
        (unsigned int)regs->di, (unsigned int)regs->bp);
}

__visible const sys_call_ptr_t ia32_sys_call_table[__NR_syscall_compat_max+1] = {
    /*
     * Smells like a compiler bug -- it doesn't work
     * when the & below is removed.
     */
    [0 ... __NR_syscall_compat_max] = &sys_ni_syscall,
#include <asm/syscalls_32.h>
};
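For reference, the user-space side of this legacy path is nothing more than a software interrupt issued with the register convention the dispatch code above consumes: eax carries the system call number, and ebx/ecx/edx/esi/edi/ebp carry up to six arguments. The sketch below is not kernel code, only an illustration (the helper name int80_write is made up); build it as a 32-bit program with gcc -m32, and note that the ebx constraint can conflict with PIC code generation on i386:

/* Minimal sketch: a raw int $0x80 system call from 32-bit user space. */
#include <sys/syscall.h>    /* SYS_write */

static long int80_write(int fd, const void *buf, unsigned long len)
{
    long ret;

    /* eax = syscall number, ebx/ecx/edx = the first three arguments;
     * the kernel returns the result (or -errno) in eax. */
    asm volatile("int $0x80"
                 : "=a" (ret)
                 : "0" ((long)SYS_write), "b" ((long)fd),
                   "c" ((long)buf), "d" ((long)len)
                 : "memory");
    return ret;
}

int main(void)
{
    static const char msg[] = "hello via int $0x80\n";

    int80_write(1, msg, sizeof(msg) - 1);
    return 0;
}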
The new instructions were introduced because implementing system calls through a software interrupt is simply too slow. Starting with the Pentium II (Family 6, Model 3, Stepping 3), Intel x86 CPUs therefore support a new pair of system call instructions, sysenter/sysexit: the former switches from a lower privilege level to ring 0, and the latter switches from ring 0 back to a lower privilege level. Neither performs privilege-level checks (CPL, DPL), and neither pushes anything onto the stack.
The sysenter instruction is described in the Intel SDM. To begin with, the CPU has a set of special registers called Model-Specific Registers (MSRs), which play an important role while the operating system is running. These registers can only be read and written with the dedicated instructions RDMSR and WRMSR.
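As a concrete illustration (not from the kernel tree), the sketch below reads the three SYSENTER-related MSRs from user space through the msr driver's /dev/cpu/0/msr interface, which executes RDMSR on our behalf: the file offset is the MSR index and each read returns the 64-bit value. The helper name read_msr is made up; running it requires root and the msr module:

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

#define MSR_IA32_SYSENTER_CS   0x174  /* kernel CS loaded by SYSENTER */
#define MSR_IA32_SYSENTER_ESP  0x175  /* kernel stack pointer loaded by SYSENTER */
#define MSR_IA32_SYSENTER_EIP  0x176  /* kernel entry point loaded by SYSENTER */

static uint64_t read_msr(int fd, uint32_t index)
{
    uint64_t val = 0;

    /* The msr driver interprets the file offset as the MSR index. */
    pread(fd, &val, sizeof(val), index);
    return val;
}

int main(void)
{
    int fd = open("/dev/cpu/0/msr", O_RDONLY);

    if (fd < 0) {
        perror("open /dev/cpu/0/msr");
        return 1;
    }
    printf("SYSENTER_CS  = 0x%llx\n", (unsigned long long)read_msr(fd, MSR_IA32_SYSENTER_CS));
    printf("SYSENTER_ESP = 0x%llx\n", (unsigned long long)read_msr(fd, MSR_IA32_SYSENTER_ESP));
    printf("SYSENTER_EIP = 0x%llx\n", (unsigned long long)read_msr(fd, MSR_IA32_SYSENTER_EIP));
    close(fd);
    return 0;
}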
/*
 * Set up the CPU state needed to execute SYSENTER/SYSEXIT instructions
 * on 32-bit kernels:
 */
#ifdef CONFIG_X86_32
void enable_sep_cpu(void)
{
    struct tss_struct *tss;
    int cpu;

    if (!boot_cpu_has(X86_FEATURE_SEP))
        return;

    cpu = get_cpu();
    tss = &per_cpu(cpu_tss, cpu);

    /*
     * We cache MSR_IA32_SYSENTER_CS's value in the TSS's ss1 field --
     * see the big comment in struct x86_hw_tss's definition.
     */
__kernel_vsyscall:
    CFI_STARTPROC
    /*
     * Reshuffle regs so that any of the entry instructions
     * will preserve enough state.
     *
     * A really nice entry sequence would be:
     *  pushl %edx
     *  pushl %ecx
     *  movl  %esp, %ecx
     *
     * Unfortunately, naughty Android versions between July and December
     * 2015 actually hardcode the traditional Linux SYSENTER entry
     * sequence.  That is severely broken for a number of reasons (ask
     * anyone with an AMD CPU, for example).  Nonetheless, we try to keep
     * it working approximately as well as it ever worked.
     *
     * This link may elucidate some of the history:
     *   https://android-review.googlesource.com/#/q/Iac3295376d61ef83e713ac9b528f3b50aa780cd7
     * personally, I find it hard to understand what's going on there.
     *
     * Note to future user developers: DO NOT USE SYSENTER IN YOUR CODE.
     * Execute an indirect call to the address in the AT_SYSINFO auxv
     * entry.  That is the ONLY correct way to make a fast 32-bit system
     * call on Linux.  (Open-coding int $0x80 is also fine, but it's
     * slow.)
     */
    pushl   %ecx
    CFI_ADJUST_CFA_OFFSET   4
    CFI_REL_OFFSET          ecx, 0
    pushl   %edx
    CFI_ADJUST_CFA_OFFSET   4
    CFI_REL_OFFSET          edx, 0
    pushl   %ebp
    CFI_ADJUST_CFA_OFFSET   4
    CFI_REL_OFFSET          ebp, 0
#ifdef CONFIG_X86_64
    /* If SYSENTER (Intel) or SYSCALL32 (AMD) is available, use it. */
    ALTERNATIVE_2 "", SYSENTER_SEQUENCE, X86_FEATURE_SYSENTER32, \
                      SYSCALL_SEQUENCE,  X86_FEATURE_SYSCALL32
#else
    ALTERNATIVE "", SYSENTER_SEQUENCE, X86_FEATURE_SEP
#endif

    /* Enter using int $0x80 */
    int     $0x80
GLOBAL(int80_landing_pad)

    /*
     * Restore EDX and ECX in case they were clobbered.  EBP is not
     * clobbered (the kernel restores it), but it's cleaner and
     * probably faster to pop it than to adjust ESP using addl.
     */
    popl    %ebp
    CFI_RESTORE             ebp
    CFI_ADJUST_CFA_OFFSET   -4
    popl    %edx
    CFI_RESTORE             edx
    CFI_ADJUST_CFA_OFFSET   -4
    popl    %ecx
    CFI_RESTORE             ecx
    CFI_ADJUST_CFA_OFFSET   -4
    ret
    CFI_ENDPROC
The ALTERNATIVE_2 macro is really making a choice: if the CPU has X86_FEATURE_SYSENTER32 (Intel), SYSENTER_SEQUENCE is patched in and executed; if it has X86_FEATURE_SYSCALL32 (AMD), SYSCALL_SEQUENCE is used instead. If neither feature is present, the alternative expands to nothing, so execution simply falls through to the int $0x80 below and the call degrades to the traditional (legacy) way of entering the kernel.
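The big comment in __kernel_vsyscall above spells out how user space is supposed to use this: call indirectly through the AT_SYSINFO auxv entry, which points at __kernel_vsyscall, instead of open-coding SYSENTER. A minimal user-space sketch (not from the kernel tree; assumes a 32-bit build with gcc -m32, and falls back to int $0x80 when the vDSO entry is absent):

#include <elf.h>            /* AT_SYSINFO */
#include <stdio.h>
#include <sys/auxv.h>       /* getauxval() */
#include <sys/syscall.h>    /* SYS_getpid */

int main(void)
{
    void *vsys = (void *)getauxval(AT_SYSINFO);   /* __kernel_vsyscall */
    long ret;

    if (vsys) {
        /* eax = syscall number; __kernel_vsyscall and the kernel preserve
         * the other registers, so only eax needs to be an output. */
        asm volatile("call *%1"
                     : "=a" (ret)
                     : "r" (vsys), "0" ((long)SYS_getpid)
                     : "memory");
    } else {
        /* No vDSO entry (e.g. booted with vdso=0): fall back to the
         * slow-but-always-available int $0x80. */
        asm volatile("int $0x80"
                     : "=a" (ret)
                     : "0" ((long)SYS_getpid)
                     : "memory");
    }
    printf("getpid() via AT_SYSINFO = %ld\n", ret);
    return 0;
}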
    /*
     * SYSENTER doesn't filter flags, so we need to clear NT, AC
     * and TF ourselves.  To save a few cycles, we can check whether
     * either was set instead of doing an unconditional popfq.
     * This needs to happen before enabling interrupts so that
     * we don't get preempted with NT set.
     *
     * If TF is set, we will single-step all the way to here -- do_debug
     * will ignore all the traps.  (Yes, this is slow, but so is
     * single-stepping in general.  This allows us to avoid having
     * a more complicated code to handle the case where a user program
     * forces us to single-step through the SYSENTER entry code.)
     *
     * NB.: .Lsysenter_fix_flags is a label with the code under it moved
     * out-of-line as an optimization: NT is unlikely to be set in the
     * majority of the cases and instead of polluting the I$ unnecessarily,
     * we're keeping that code behind a branch which will predict as
     * not-taken and therefore its instructions won't be fetched.
     */
    testl   $X86_EFLAGS_NT|X86_EFLAGS_AC|X86_EFLAGS_TF, PT_EFLAGS(%esp)
    jnz     .Lsysenter_fix_flags
.Lsysenter_flags_fixed:

    /*
     * User mode is traced as though IRQs are on, and SYSENTER
     * turned them off.
     */
    TRACE_IRQS_OFF
/* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */
__visible long do_fast_syscall_32(struct pt_regs *regs)
{
    /*
     * Called using the internal vDSO SYSENTER/SYSCALL32 calling
     * convention.  Adjust regs so it looks like we entered using int80.
     */

    /*
     * SYSENTER loses EIP, and even SYSCALL32 needs us to skip forward
     * so that 'regs->ip -= 2' lands back on an int $0x80 instruction.
     * Fix it up.
     */
    regs->ip = landing_pad;
enter_from_user_mode();
local_irq_enable();
    /* Fetch EBP from where the vDSO stashed it. */
    if (
#ifdef CONFIG_X86_64
        /*
         * Micro-optimization: the pointer we're following is explicitly
         * 32 bits, so it can't be out of range.
         */
        __get_user(*(u32 *)&regs->bp,
                   (u32 __user __force *)(unsigned long)(u32)regs->sp)
#else
        get_user(*(u32 *)&regs->bp,
                 (u32 __user __force *)(unsigned long)(u32)regs->sp)
#endif
        ) {

        /* User code screwed up. */
        local_irq_disable();
        regs->ax = -EFAULT;
        prepare_exit_to_usermode(regs);
        return 0;   /* Keep it simple: use IRET. */
    }

    /* Now this is just like a normal syscall. */
    do_syscall_32_irqs_on(regs);

#ifdef CONFIG_X86_64
    /*
     * Opportunistic SYSRETL: if possible, try to return using SYSRETL.
     * SYSRETL is available on all 64-bit CPUs, so we don't need to
     * bother with SYSEXIT.
     *
     * Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP,
     * because the ECX fixup above will ensure that this is essentially
     * never the case.
     */
    return regs->cs == __USER32_CS && regs->ss == __USER_DS &&
           regs->ip == landing_pad &&
           (regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF)) == 0;
#else
    /*
     * Opportunistic SYSEXIT: if possible, try to return using SYSEXIT.
     *
     * Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP,
     * because the ECX fixup above will ensure that this is essentially
     * never the case.
     *
     * We don't allow syscalls at all from VM86 mode, but we still
     * need to check VM, because we might be returning from sys_vm86.
     */
    return static_cpu_has(X86_FEATURE_SEP) &&
           regs->cs == __USER_CS && regs->ss == __USER_DS &&
           regs->ip == landing_pad &&
           (regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF |
                           X86_EFLAGS_VM)) == 0;
#endif
}
#endif
    /*
     * Restore all flags except IF.  (We restore IF separately because
     * STI gives a one-instruction window in which we won't be interrupted,
     * whereas POPF does not.)
     */
    addl    $PT_EFLAGS-PT_DS, %esp      /* point esp at pt_regs->flags */
    btr     $X86_EFLAGS_IF_BIT, (%esp)
    popfl

    /*
     * Return back to the vDSO, which will pop ecx and edx.
     * Don't bother with DS and ES (they already contain __USER_DS).
     */
    sti
    sysexit
#ifdef CONFIG_IA32_EMULATION
    wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat);
    /*
     * This only works on Intel CPUs.
     * On AMD CPUs these MSRs are 32-bit, CPU truncates MSR_IA32_SYSENTER_EIP.
     * This does not cause SYSENTER to jump to the wrong location, because
     * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit).
     */
    wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
    wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
    wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat);
#else
    wrmsrl(MSR_CSTAR, (unsigned long)ignore_sysret);
    wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG);
    wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
    wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL);
#endif

    /* Flags to clear on syscall */
    wrmsrl(MSR_SYSCALL_MASK,
           X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|
           X86_EFLAGS_IOPL|X86_EFLAGS_AC|X86_EFLAGS_NT);
}
ENTRY(entry_SYSCALL_64)
    /*
     * Interrupts are off on entry.
     * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
     * it is too small to ever cause noticeable irq latency.
     */
    SWAPGS_UNSAFE_STACK
    SWITCH_KERNEL_CR3_NO_STACK
    /*
     * A hypervisor implementation might want to use a label
     * after the swapgs, so that it can do the swapgs
     * for the guest and jump here on syscall.
     */
GLOBAL(entry_SYSCALL_64_after_swapgs)

    /*
     * If we need to do entry work or if we guess we'll need to do
     * exit work, go straight to the slow path.
     */
    movq    PER_CPU_VAR(current_task), %r11
    testl   $_TIF_WORK_SYSCALL_ENTRY|_TIF_ALLWORK_MASK, TASK_TI_flags(%r11)
    jnz     entry_SYSCALL64_slow_path

entry_SYSCALL_64_fastpath:
    /*
     * Easy case: enable interrupts and issue the syscall.  If the syscall
     * needs pt_regs, we'll call a stub that disables interrupts again
     * and jumps to the slow path.
     */
    TRACE_IRQS_ON
    ENABLE_INTERRUPTS(CLBR_NONE)
#if __SYSCALL_MASK == ~0
    cmpq    $__NR_syscall_max, %rax
#else
    andl    $__SYSCALL_MASK, %eax
    cmpl    $__NR_syscall_max, %eax
#endif
    ja      1f                      /* return -ENOSYS (already in pt_regs->ax) */
    movq    %r10, %rcx

    /*
     * This call instruction is handled specially in stub_ptregs_64.
     * It might end up jumping to the slow path.  If it jumps, RAX
     * and all argument registers are clobbered.
     */
    call    *sys_call_table(, %rax, 8)
.Lentry_SYSCALL_64_after_fastpath_call:

    movq    %rax, RAX(%rsp)
1:
    /*
     * If we get here, then we know that pt_regs is clean for SYSRET64.
     * If we see that no exit work is required (which we are required
     * to check with IRQs off), then we can go straight to SYSRET64.
     */
    DISABLE_INTERRUPTS(CLBR_NONE)
    TRACE_IRQS_OFF
    movq    PER_CPU_VAR(current_task), %r11
    testl   $_TIF_ALLWORK_MASK, TASK_TI_flags(%r11)
    jnz     1f

    LOCKDEP_SYS_EXIT
    TRACE_IRQS_ON                   /* user mode is traced as IRQs on */
    movq    RIP(%rsp), %rcx
    movq    EFLAGS(%rsp), %r11
    RESTORE_C_REGS_EXCEPT_RCX_R11
    /*
     * This opens a window where we have a user CR3, but are
     * running in the kernel.  This makes using the CS
     * register useless for telling whether or not we need to
     * switch CR3 in NMIs.  Normal interrupts are OK because
     * they are off here.
     */
    SWITCH_USER_CR3
    movq    RSP(%rsp), %rsp
    USERGS_SYSRET64

1:
    /*
     * The fast path looked good when we started, but something changed
     * along the way and we need to switch to the slow path.  Calling
     * raise(3) will trigger this, for example.  IRQs are off.
     */
    TRACE_IRQS_ON
    ENABLE_INTERRUPTS(CLBR_NONE)
    SAVE_EXTRA_REGS
    movq    %rsp, %rdi
    call    syscall_return_slowpath /* returns with IRQs disabled */
    jmp     return_from_SYSCALL_64

entry_SYSCALL64_slow_path:
    /* IRQs are off. */
    SAVE_EXTRA_REGS
    movq    %rsp, %rdi
    call    do_syscall_64           /* returns with IRQs disabled */

return_from_SYSCALL_64:
    RESTORE_EXTRA_REGS
    TRACE_IRQS_IRETQ                /* we're about to change IF */

    /*
     * Try to use SYSRET instead of IRET if we're returning to
     * a completely clean 64-bit userspace context.
     */
    movq    RCX(%rsp), %rcx
    movq    RIP(%rsp), %r11
    cmpq    %rcx, %r11              /* RCX == RIP */
    jne     opportunistic_sysret_failed
    /*
     * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
     * in kernel space.  This essentially lets the user take over
     * the kernel, since userspace controls RSP.
     *
     * If width of "canonical tail" ever becomes variable, this will need
     * to be updated to remain correct on both old and new CPUs.
     */
    .ifne __VIRTUAL_MASK_SHIFT - 47
    .error "virtual address width changed -- SYSRET checks need update"
    .endif

    /* Change top 16 bits to be the sign-extension of 47th bit */
    shl     $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
    sar     $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx

    /* If this changed %rcx, it was not canonical */
    cmpq    %rcx, %r11
    jne     opportunistic_sysret_failed

    cmpq    $__USER_CS, CS(%rsp)    /* CS must match SYSRET */
    jne     opportunistic_sysret_failed

    /*
     * SYSCALL clears RF when it saves RFLAGS in R11 and SYSRET cannot
     * restore RF properly.  If the slowpath sets it for whatever reason, we
     * need to restore it correctly.
     *
     * SYSRET can restore TF, but unlike IRET, restoring TF results in a
     * trap from userspace immediately after SYSRET.  This would cause an
     * infinite loop whenever #DB happens with register state that satisfies
     * the opportunistic SYSRET conditions.  For example, single-stepping
     * this user code:
     *
     *           movq   $stuck_here, %rcx
     *           pushfq
     *           popq   %r11
     *   stuck_here:
     *
     * would never get past 'stuck_here'.
     */
    testq   $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11
    jnz     opportunistic_sysret_failed
/* nothing to check for RSP */
    cmpq    $__USER_DS, SS(%rsp)    /* SS must match SYSRET */
    jne     opportunistic_sysret_failed

    /*
     * We win!  This label is here just for ease of understanding
     * perf profiles.  Nothing jumps here.
     */
syscall_return_via_sysret:
    /* rcx and r11 are already restored (see code above) */
    RESTORE_C_REGS_EXCEPT_RCX_R11
    /*
     * This opens a window where we have a user CR3, but are
     * running in the kernel.  This makes using the CS
     * register useless for telling whether or not we need to
     * switch CR3 in NMIs.  Normal interrupts are OK because
     * they are off here.
     */
    SWITCH_USER_CR3
    movq    RSP(%rsp), %rsp
    USERGS_SYSRET64

opportunistic_sysret_failed:
    /*
     * This opens a window where we have a user CR3, but are
     * running in the kernel.  This makes using the CS
     * register useless for telling whether or not we need to
     * switch CR3 in NMIs.  Normal interrupts are OK because
     * they are off here.
     */
    SWITCH_USER_CR3
    SWAPGS
    jmp     restore_c_regs_and_iret
END(entry_SYSCALL_64)
    if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY)
        nr = syscall_trace_enter(regs);

    /*
     * NB: Native and x32 syscalls are dispatched from the same
     * table.  The only functional difference is the x32 bit in
     * regs->orig_ax, which changes the behavior of some syscalls.
     */
    if (likely((nr & __SYSCALL_MASK) < NR_syscalls)) {
        regs->ax = sys_call_table[nr & __SYSCALL_MASK](
            regs->di, regs->si, regs->dx,
            regs->r10, regs->r8, regs->r9);
    }
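For completeness, here is what the user-space side of this 64-bit path looks like. The sketch below is not kernel code; it issues a raw SYSCALL with the register convention visible in the dispatch above (rax = syscall number, arguments in rdi/rsi/rdx, then r10/r8/r9), and it must treat rcx and r11 as clobbered because the hardware uses them to save RIP and RFLAGS, which is also why entry_SYSCALL_64 does `movq %r10, %rcx` before calling the C handler. The helper name raw_syscall3 is made up for this example:

#include <sys/syscall.h>    /* SYS_write */

static long raw_syscall3(long nr, long a1, long a2, long a3)
{
    long ret;

    /* rax = number, rdi/rsi/rdx = first three arguments; the fourth
     * would go in r10 (not rcx, which SYSCALL overwrites with RIP). */
    asm volatile("syscall"
                 : "=a" (ret)
                 : "0" (nr), "D" (a1), "S" (a2), "d" (a3)
                 : "rcx", "r11", "memory");
    return ret;
}

int main(void)
{
    static const char msg[] = "hello via SYSCALL\n";

    raw_syscall3(SYS_write, 1, (long)msg, sizeof(msg) - 1);
    return 0;
}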