updating to mainline 4.14.9

Jake Day 2017-12-25 15:25:41 -05:00
parent 5934ac1393
commit 97cd4e0224
205 changed files with 3418 additions and 1789 deletions

config
View file

@@ -1,6 +1,6 @@
#
# Automatically generated file; DO NOT EDIT.
# Linux/x86_64 4.14.8-jakeday Kernel Configuration
# Linux/x86_64 4.14.9-jakeday Kernel Configuration
#
CONFIG_64BIT=y
CONFIG_X86_64=y
@@ -8688,8 +8688,7 @@ CONFIG_DEBUG_FS=y
# CONFIG_HEADERS_CHECK is not set
# CONFIG_DEBUG_SECTION_MISMATCH is not set
CONFIG_SECTION_MISMATCH_WARN_ONLY=y
CONFIG_FRAME_POINTER=y
# CONFIG_STACK_VALIDATION is not set
CONFIG_STACK_VALIDATION=y
# CONFIG_DEBUG_FORCE_WEAK_PER_CPU is not set
CONFIG_MAGIC_SYSRQ=y
CONFIG_MAGIC_SYSRQ_DEFAULT_ENABLE=0x01b6
@@ -8927,9 +8926,9 @@ CONFIG_OPTIMIZE_INLINING=y
# CONFIG_DEBUG_NMI_SELFTEST is not set
CONFIG_X86_DEBUG_FPU=y
CONFIG_PUNIT_ATOM_DEBUG=m
CONFIG_FRAME_POINTER_UNWINDER=y
# CONFIG_ORC_UNWINDER is not set
# CONFIG_GUESS_UNWINDER is not set
CONFIG_UNWINDER_ORC=y
# CONFIG_UNWINDER_FRAME_POINTER is not set
# CONFIG_UNWINDER_GUESS is not set
#
# Security options

View file

@@ -4,7 +4,7 @@ ORC unwinder
Overview
--------
The kernel CONFIG_ORC_UNWINDER option enables the ORC unwinder, which is
The kernel CONFIG_UNWINDER_ORC option enables the ORC unwinder, which is
similar in concept to a DWARF unwinder. The difference is that the
format of the ORC data is much simpler than DWARF, which in turn allows
the ORC unwinder to be much simpler and faster.
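For reference (an editorial sketch, not part of the patch): the ORC data described above is a sorted table of small fixed-size records that objtool emits at build time, one per range of instruction addresses. Its layout in this series is roughly the following (see arch/x86/include/asm/orc_types.h):

struct orc_entry {
	s16		sp_offset;	/* how to recover the previous stack pointer */
	s16		bp_offset;	/* how to recover the saved base pointer */
	unsigned	sp_reg:4;	/* register sp_offset is relative to */
	unsigned	bp_reg:4;	/* register bp_offset is relative to */
	unsigned	type:2;		/* call frame, regs, iret regs, ... */
} __packed;

To unwind one frame, the kernel binary-searches this table for the entry covering the current instruction pointer and applies sp_reg/sp_offset to find the caller's frame; no per-function prologue bookkeeping is needed at run time.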

View file

@@ -34,7 +34,7 @@ ff92000000000000 - ffd1ffffffffffff (=54 bits) vmalloc/ioremap space
ffd2000000000000 - ffd3ffffffffffff (=49 bits) hole
ffd4000000000000 - ffd5ffffffffffff (=49 bits) virtual memory map (512TB)
... unused hole ...
ffd8000000000000 - fff7ffffffffffff (=53 bits) kasan shadow memory (8PB)
ffdf000000000000 - fffffc0000000000 (=53 bits) kasan shadow memory (8PB)
... unused hole ...
ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
... unused hole ...
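A quick sanity check of the new kasan range (editorial sketch, not from the patch): KASAN keeps one shadow byte per 8 bytes of address space, shadow(addr) = (addr >> 3) + KASAN_SHADOW_OFFSET, and the arch/x86/Kconfig hunk later in this commit makes 0xdffffc0000000000 that offset for both paging modes:

#include <stdio.h>
#include <stdint.h>

#define KASAN_SHADOW_OFFSET 0xdffffc0000000000ULL

/* one shadow byte covers an 8-byte granule */
static uint64_t kasan_mem_to_shadow(uint64_t addr)
{
	return (addr >> 3) + KASAN_SHADOW_OFFSET;
}

int main(void)
{
	/* The top of the address space maps to fffffbffffffffff,
	 * just below the region's new end, fffffc0000000000. */
	printf("%llx\n", (unsigned long long)kasan_mem_to_shadow(~0ULL));
	return 0;
}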

View file

@@ -1,7 +1,7 @@
# SPDX-License-Identifier: GPL-2.0
VERSION = 4
PATCHLEVEL = 14
SUBLEVEL = 8
SUBLEVEL = 9
EXTRAVERSION =
NAME = Petit Gorille
@@ -935,8 +935,8 @@ ifdef CONFIG_STACK_VALIDATION
ifeq ($(has_libelf),1)
objtool_target := tools/objtool FORCE
else
ifdef CONFIG_ORC_UNWINDER
$(error "Cannot generate ORC metadata for CONFIG_ORC_UNWINDER=y, please install libelf-dev, libelf-devel or elfutils-libelf-devel")
ifdef CONFIG_UNWINDER_ORC
$(error "Cannot generate ORC metadata for CONFIG_UNWINDER_ORC=y, please install libelf-dev, libelf-devel or elfutils-libelf-devel")
else
$(warning "Cannot use CONFIG_STACK_VALIDATION=y, please install libelf-dev, libelf-devel or elfutils-libelf-devel")
endif

View file

@@ -244,7 +244,7 @@ CONFIG_USB_STORAGE_ONETOUCH=m
CONFIG_USB_STORAGE_KARMA=m
CONFIG_USB_STORAGE_CYPRESS_ATACB=m
CONFIG_USB_STORAGE_ENE_UB6250=m
CONFIG_USB_UAS=m
CONFIG_USB_UAS=y
CONFIG_USB_DWC3=y
CONFIG_USB_DWC2=y
CONFIG_USB_HSIC_USB3503=y

View file

@@ -126,8 +126,7 @@ extern unsigned long profile_pc(struct pt_regs *regs);
/*
* kprobe-based event tracer support
*/
#include <linux/stddef.h>
#include <linux/types.h>
#include <linux/compiler.h>
#define MAX_REG_OFFSET (offsetof(struct pt_regs, ARM_ORIG_r0))
extern int regs_query_register_offset(const char *name);

View file

@@ -51,6 +51,13 @@ enum fixed_addresses {
FIX_EARLYCON_MEM_BASE,
FIX_TEXT_POKE0,
#ifdef CONFIG_ACPI_APEI_GHES
/* Used for GHES mapping from assorted contexts */
FIX_APEI_GHES_IRQ,
FIX_APEI_GHES_NMI,
#endif /* CONFIG_ACPI_APEI_GHES */
__end_of_permanent_fixed_addresses,
/*

View file

@@ -276,9 +276,12 @@ void arch_touch_nmi_watchdog(void)
{
unsigned long ticks = tb_ticks_per_usec * wd_timer_period_ms * 1000;
int cpu = smp_processor_id();
u64 tb = get_tb();
if (get_tb() - per_cpu(wd_timer_tb, cpu) >= ticks)
watchdog_timer_interrupt(cpu);
if (tb - per_cpu(wd_timer_tb, cpu) >= ticks) {
per_cpu(wd_timer_tb, cpu) = tb;
wd_smp_clear_cpu_pending(cpu, tb);
}
}
EXPORT_SYMBOL(arch_touch_nmi_watchdog);

View file

@@ -762,7 +762,8 @@ emit_clear:
func = (u8 *) __bpf_call_base + imm;
/* Save skb pointer if we need to re-cache skb data */
if (bpf_helper_changes_pkt_data(func))
if ((ctx->seen & SEEN_SKB) &&
bpf_helper_changes_pkt_data(func))
PPC_BPF_STL(3, 1, bpf_jit_stack_local(ctx));
bpf_jit_emit_func_call(image, ctx, (u64)func);
@@ -771,7 +772,8 @@ emit_clear:
PPC_MR(b2p[BPF_REG_0], 3);
/* refresh skb cache */
if (bpf_helper_changes_pkt_data(func)) {
if ((ctx->seen & SEEN_SKB) &&
bpf_helper_changes_pkt_data(func)) {
/* reload skb pointer to r3 */
PPC_BPF_LL(3, 1, bpf_jit_stack_local(ctx));
bpf_jit_emit_skb_loads(image, ctx);

View file

@@ -530,14 +530,19 @@ static int xmon_core(struct pt_regs *regs, int fromipi)
waiting:
secondary = 1;
spin_begin();
while (secondary && !xmon_gate) {
if (in_xmon == 0) {
if (fromipi)
if (fromipi) {
spin_end();
goto leave;
}
secondary = test_and_set_bit(0, &in_xmon);
}
barrier();
spin_cpu_relax();
touch_nmi_watchdog();
}
spin_end();
if (!secondary && !xmon_gate) {
/* we are the first cpu to come in */
@@ -568,21 +573,25 @@ static int xmon_core(struct pt_regs *regs, int fromipi)
mb();
xmon_gate = 1;
barrier();
touch_nmi_watchdog();
}
cmdloop:
while (in_xmon) {
if (secondary) {
spin_begin();
if (cpu == xmon_owner) {
if (!test_and_set_bit(0, &xmon_taken)) {
secondary = 0;
spin_end();
continue;
}
/* missed it */
while (cpu == xmon_owner)
barrier();
spin_cpu_relax();
}
barrier();
spin_cpu_relax();
touch_nmi_watchdog();
} else {
cmd = cmds(regs);
if (cmd != 0) {

View file

@@ -55,8 +55,7 @@ struct bpf_jit {
#define SEEN_LITERAL 8 /* code uses literals */
#define SEEN_FUNC 16 /* calls C functions */
#define SEEN_TAIL_CALL 32 /* code uses tail calls */
#define SEEN_SKB_CHANGE 64 /* code changes skb data */
#define SEEN_REG_AX 128 /* code uses constant blinding */
#define SEEN_REG_AX 64 /* code uses constant blinding */
#define SEEN_STACK (SEEN_FUNC | SEEN_MEM | SEEN_SKB)
/*
@@ -448,12 +447,12 @@ static void bpf_jit_prologue(struct bpf_jit *jit)
EMIT6_DISP_LH(0xe3000000, 0x0024, REG_W1, REG_0,
REG_15, 152);
}
if (jit->seen & SEEN_SKB)
if (jit->seen & SEEN_SKB) {
emit_load_skb_data_hlen(jit);
if (jit->seen & SEEN_SKB_CHANGE)
/* stg %b1,ST_OFF_SKBP(%r0,%r15) */
EMIT6_DISP_LH(0xe3000000, 0x0024, BPF_REG_1, REG_0, REG_15,
STK_OFF_SKBP);
}
}
/*
@@ -983,8 +982,8 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, int i
EMIT2(0x0d00, REG_14, REG_W1);
/* lgr %b0,%r2: load return value into %b0 */
EMIT4(0xb9040000, BPF_REG_0, REG_2);
if (bpf_helper_changes_pkt_data((void *)func)) {
jit->seen |= SEEN_SKB_CHANGE;
if ((jit->seen & SEEN_SKB) &&
bpf_helper_changes_pkt_data((void *)func)) {
/* lg %b1,ST_OFF_SKBP(%r15) */
EMIT6_DISP_LH(0xe3000000, 0x0004, BPF_REG_1, REG_0,
REG_15, STK_OFF_SKBP);

View file

@@ -7,6 +7,7 @@
#if defined(__sparc__) && defined(__arch64__)
#ifndef __ASSEMBLY__
#include <linux/compiler.h>
#include <linux/threads.h>
#include <asm/switch_to.h>

View file

@@ -1245,14 +1245,16 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx)
u8 *func = ((u8 *)__bpf_call_base) + imm;
ctx->saw_call = true;
if (ctx->saw_ld_abs_ind && bpf_helper_changes_pkt_data(func))
emit_reg_move(bpf2sparc[BPF_REG_1], L7, ctx);
emit_call((u32 *)func, ctx);
emit_nop(ctx);
emit_reg_move(O0, bpf2sparc[BPF_REG_0], ctx);
if (bpf_helper_changes_pkt_data(func) && ctx->saw_ld_abs_ind)
load_skb_regs(ctx, bpf2sparc[BPF_REG_6]);
if (ctx->saw_ld_abs_ind && bpf_helper_changes_pkt_data(func))
load_skb_regs(ctx, L7);
break;
}

View file

@@ -1,4 +1,5 @@
generic-y += barrier.h
generic-y += bpf_perf_event.h
generic-y += bug.h
generic-y += clkdev.h
generic-y += current.h

View file

@@ -41,7 +41,7 @@
typedef int (*initcall_t)(void);
typedef void (*exitcall_t)(void);
#include <linux/compiler.h>
#include <linux/compiler_types.h>
/* These are for everybody (although not all archs will actually
discard it in modules) */

View file

@@ -108,7 +108,7 @@ config X86
select HAVE_ARCH_AUDITSYSCALL
select HAVE_ARCH_HUGE_VMAP if X86_64 || X86_PAE
select HAVE_ARCH_JUMP_LABEL
select HAVE_ARCH_KASAN if X86_64 && SPARSEMEM_VMEMMAP
select HAVE_ARCH_KASAN if X86_64
select HAVE_ARCH_KGDB
select HAVE_ARCH_KMEMCHECK
select HAVE_ARCH_MMAP_RND_BITS if MMU
@@ -171,7 +171,7 @@ config X86
select HAVE_PERF_USER_STACK_DUMP
select HAVE_RCU_TABLE_FREE
select HAVE_REGS_AND_STACK_ACCESS_API
select HAVE_RELIABLE_STACKTRACE if X86_64 && FRAME_POINTER_UNWINDER && STACK_VALIDATION
select HAVE_RELIABLE_STACKTRACE if X86_64 && UNWINDER_FRAME_POINTER && STACK_VALIDATION
select HAVE_STACK_VALIDATION if X86_64
select HAVE_SYSCALL_TRACEPOINTS
select HAVE_UNSTABLE_SCHED_CLOCK
@@ -303,7 +303,6 @@ config ARCH_SUPPORTS_DEBUG_PAGEALLOC
config KASAN_SHADOW_OFFSET
hex
depends on KASAN
default 0xdff8000000000000 if X86_5LEVEL
default 0xdffffc0000000000
config HAVE_INTEL_TXT

View file

@@ -359,28 +359,14 @@ config PUNIT_ATOM_DEBUG
choice
prompt "Choose kernel unwinder"
default FRAME_POINTER_UNWINDER
default UNWINDER_ORC if X86_64
default UNWINDER_FRAME_POINTER if X86_32
---help---
This determines which method will be used for unwinding kernel stack
traces for panics, oopses, bugs, warnings, perf, /proc/<pid>/stack,
livepatch, lockdep, and more.
config FRAME_POINTER_UNWINDER
bool "Frame pointer unwinder"
select FRAME_POINTER
---help---
This option enables the frame pointer unwinder for unwinding kernel
stack traces.
The unwinder itself is fast and it uses less RAM than the ORC
unwinder, but the kernel text size will grow by ~3% and the kernel's
overall performance will degrade by roughly 5-10%.
This option is recommended if you want to use the livepatch
consistency model, as this is currently the only way to get a
reliable stack trace (CONFIG_HAVE_RELIABLE_STACKTRACE).
config ORC_UNWINDER
config UNWINDER_ORC
bool "ORC unwinder"
depends on X86_64
select STACK_VALIDATION
@@ -396,7 +382,22 @@ config ORC_UNWINDER
Enabling this option will increase the kernel's runtime memory usage
by roughly 2-4MB, depending on your kernel config.
config GUESS_UNWINDER
config UNWINDER_FRAME_POINTER
bool "Frame pointer unwinder"
select FRAME_POINTER
---help---
This option enables the frame pointer unwinder for unwinding kernel
stack traces.
The unwinder itself is fast and it uses less RAM than the ORC
unwinder, but the kernel text size will grow by ~3% and the kernel's
overall performance will degrade by roughly 5-10%.
This option is recommended if you want to use the livepatch
consistency model, as this is currently the only way to get a
reliable stack trace (CONFIG_HAVE_RELIABLE_STACKTRACE).
config UNWINDER_GUESS
bool "Guess unwinder"
depends on EXPERT
---help---
@@ -411,7 +412,7 @@ config GUESS_UNWINDER
endchoice
config FRAME_POINTER
depends on !ORC_UNWINDER && !GUESS_UNWINDER
depends on !UNWINDER_ORC && !UNWINDER_GUESS
bool
endmenu
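To make the trade-off in the help texts above concrete, here is an editorial sketch of what the frame pointer unwinder walks; struct stack_frame mirrors the kernel's definition in arch/x86/include/asm/stacktrace.h, while the loop is illustrative rather than the kernel's actual unwind_next_frame():

struct stack_frame {
	struct stack_frame *next_frame;	/* caller's saved RBP */
	unsigned long return_address;	/* pushed by the CALL instruction */
};

/* Two loads per frame: the unwinder itself is cheap. The ~3% text
 * growth and 5-10% slowdown come from every function maintaining
 * this chain in its prologue and epilogue. */
static void walk_frames(const struct stack_frame *fp)
{
	while (fp) {
		printk("  %pS\n", (void *)fp->return_address);
		fp = fp->next_frame;
	}
}

CONFIG_UNWINDER_ORC avoids that per-function overhead by consulting objtool-generated tables instead, which is why it selects STACK_VALIDATION and costs the extra 2-4MB of runtime memory mentioned in its help text.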

View file

@@ -1,5 +1,5 @@
CONFIG_NOHIGHMEM=y
# CONFIG_HIGHMEM4G is not set
# CONFIG_HIGHMEM64G is not set
CONFIG_GUESS_UNWINDER=y
# CONFIG_FRAME_POINTER_UNWINDER is not set
CONFIG_UNWINDER_GUESS=y
# CONFIG_UNWINDER_FRAME_POINTER is not set

View file

@@ -299,6 +299,7 @@ CONFIG_DEBUG_STACKOVERFLOW=y
# CONFIG_DEBUG_RODATA_TEST is not set
CONFIG_DEBUG_BOOT_PARAMS=y
CONFIG_OPTIMIZE_INLINING=y
CONFIG_UNWINDER_ORC=y
CONFIG_SECURITY=y
CONFIG_SECURITY_NETWORK=y
CONFIG_SECURITY_SELINUX=y

View file

@@ -142,56 +142,25 @@ For 32-bit we have the following conventions - kernel is built with
UNWIND_HINT_REGS offset=\offset
.endm
.macro RESTORE_EXTRA_REGS offset=0
movq 0*8+\offset(%rsp), %r15
movq 1*8+\offset(%rsp), %r14
movq 2*8+\offset(%rsp), %r13
movq 3*8+\offset(%rsp), %r12
movq 4*8+\offset(%rsp), %rbp
movq 5*8+\offset(%rsp), %rbx
UNWIND_HINT_REGS offset=\offset extra=0
.macro POP_EXTRA_REGS
popq %r15
popq %r14
popq %r13
popq %r12
popq %rbp
popq %rbx
.endm
.macro RESTORE_C_REGS_HELPER rstor_rax=1, rstor_rcx=1, rstor_r11=1, rstor_r8910=1, rstor_rdx=1
.if \rstor_r11
movq 6*8(%rsp), %r11
.endif
.if \rstor_r8910
movq 7*8(%rsp), %r10
movq 8*8(%rsp), %r9
movq 9*8(%rsp), %r8
.endif
.if \rstor_rax
movq 10*8(%rsp), %rax
.endif
.if \rstor_rcx
movq 11*8(%rsp), %rcx
.endif
.if \rstor_rdx
movq 12*8(%rsp), %rdx
.endif
movq 13*8(%rsp), %rsi
movq 14*8(%rsp), %rdi
UNWIND_HINT_IRET_REGS offset=16*8
.endm
.macro RESTORE_C_REGS
RESTORE_C_REGS_HELPER 1,1,1,1,1
.endm
.macro RESTORE_C_REGS_EXCEPT_RAX
RESTORE_C_REGS_HELPER 0,1,1,1,1
.endm
.macro RESTORE_C_REGS_EXCEPT_RCX
RESTORE_C_REGS_HELPER 1,0,1,1,1
.endm
.macro RESTORE_C_REGS_EXCEPT_R11
RESTORE_C_REGS_HELPER 1,1,0,1,1
.endm
.macro RESTORE_C_REGS_EXCEPT_RCX_R11
RESTORE_C_REGS_HELPER 1,0,0,1,1
.endm
.macro REMOVE_PT_GPREGS_FROM_STACK addskip=0
subq $-(15*8+\addskip), %rsp
.macro POP_C_REGS
popq %r11
popq %r10
popq %r9
popq %r8
popq %rax
popq %rcx
popq %rdx
popq %rsi
popq %rdi
.endm
.macro icebp

View file

@@ -941,7 +941,8 @@ ENTRY(debug)
movl %esp, %eax # pt_regs pointer
/* Are we currently on the SYSENTER stack? */
PER_CPU(cpu_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx)
movl PER_CPU_VAR(cpu_entry_area), %ecx
addl $CPU_ENTRY_AREA_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx
subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */
cmpl $SIZEOF_SYSENTER_stack, %ecx
jb .Ldebug_from_sysenter_stack
@@ -984,7 +985,8 @@ ENTRY(nmi)
movl %esp, %eax # pt_regs pointer
/* Are we currently on the SYSENTER stack? */
PER_CPU(cpu_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx)
movl PER_CPU_VAR(cpu_entry_area), %ecx
addl $CPU_ENTRY_AREA_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx
subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */
cmpl $SIZEOF_SYSENTER_stack, %ecx
jb .Lnmi_from_sysenter_stack

View file

@@ -136,6 +136,64 @@ END(native_usergs_sysret64)
* with them due to bugs in both AMD and Intel CPUs.
*/
.pushsection .entry_trampoline, "ax"
/*
* The code in here gets remapped into cpu_entry_area's trampoline. This means
* that the assembler and linker have the wrong idea as to where this code
* lives (and, in fact, it's mapped more than once, so it's not even at a
* fixed address). So we can't reference any symbols outside the entry
* trampoline and expect it to work.
*
* Instead, we carefully abuse %rip-relative addressing.
* _entry_trampoline(%rip) refers to the start of the remapped entry
* trampoline. We can thus find cpu_entry_area with this macro:
*/
#define CPU_ENTRY_AREA \
_entry_trampoline - CPU_ENTRY_AREA_entry_trampoline(%rip)
/* The top word of the SYSENTER stack is hot and is usable as scratch space. */
#define RSP_SCRATCH CPU_ENTRY_AREA_SYSENTER_stack + \
SIZEOF_SYSENTER_stack - 8 + CPU_ENTRY_AREA
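/*
 * Editorial illustration: combining the two macros above, an operand
 * like RSP_SCRATCH expands to the single %rip-relative expression
 *
 *   CPU_ENTRY_AREA_SYSENTER_stack + SIZEOF_SYSENTER_stack - 8
 *       + _entry_trampoline - CPU_ENTRY_AREA_entry_trampoline(%rip)
 *
 * i.e. "this mapping's cpu_entry_area plus the offset of the SYSENTER
 * stack's top word", computed without any absolute symbol address.
 */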
ENTRY(entry_SYSCALL_64_trampoline)
UNWIND_HINT_EMPTY
swapgs
/* Stash the user RSP. */
movq %rsp, RSP_SCRATCH
/* Load the top of the task stack into RSP */
movq CPU_ENTRY_AREA_tss + TSS_sp1 + CPU_ENTRY_AREA, %rsp
/* Start building the simulated IRET frame. */
pushq $__USER_DS /* pt_regs->ss */
pushq RSP_SCRATCH /* pt_regs->sp */
pushq %r11 /* pt_regs->flags */
pushq $__USER_CS /* pt_regs->cs */
pushq %rcx /* pt_regs->ip */
/*
* x86 lacks a near absolute jump, and we can't jump to the real
* entry text with a relative jump. We could push the target
* address and then use retq, but this destroys the pipeline on
* many CPUs (wasting over 20 cycles on Sandy Bridge). Instead,
* spill RDI and restore it in a second-stage trampoline.
*/
pushq %rdi
movq $entry_SYSCALL_64_stage2, %rdi
jmp *%rdi
END(entry_SYSCALL_64_trampoline)
.popsection
ENTRY(entry_SYSCALL_64_stage2)
UNWIND_HINT_EMPTY
popq %rdi
jmp entry_SYSCALL_64_after_hwframe
END(entry_SYSCALL_64_stage2)
ENTRY(entry_SYSCALL_64)
UNWIND_HINT_EMPTY
/*
@@ -221,10 +279,9 @@ entry_SYSCALL_64_fastpath:
TRACE_IRQS_ON /* user mode is traced as IRQs on */
movq RIP(%rsp), %rcx
movq EFLAGS(%rsp), %r11
RESTORE_C_REGS_EXCEPT_RCX_R11
movq RSP(%rsp), %rsp
addq $6*8, %rsp /* skip extra regs -- they were preserved */
UNWIND_HINT_EMPTY
USERGS_SYSRET64
jmp .Lpop_c_regs_except_rcx_r11_and_sysret
1:
/*
@@ -246,17 +303,18 @@ entry_SYSCALL64_slow_path:
call do_syscall_64 /* returns with IRQs disabled */
return_from_SYSCALL_64:
RESTORE_EXTRA_REGS
TRACE_IRQS_IRETQ /* we're about to change IF */
/*
* Try to use SYSRET instead of IRET if we're returning to
* a completely clean 64-bit userspace context.
* a completely clean 64-bit userspace context. If we're not,
* go to the slow exit path.
*/
movq RCX(%rsp), %rcx
movq RIP(%rsp), %r11
cmpq %rcx, %r11 /* RCX == RIP */
jne opportunistic_sysret_failed
cmpq %rcx, %r11 /* SYSRET requires RCX == RIP */
jne swapgs_restore_regs_and_return_to_usermode
/*
* On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
@@ -274,14 +332,14 @@ return_from_SYSCALL_64:
/* If this changed %rcx, it was not canonical */
cmpq %rcx, %r11
jne opportunistic_sysret_failed
jne swapgs_restore_regs_and_return_to_usermode
cmpq $__USER_CS, CS(%rsp) /* CS must match SYSRET */
jne opportunistic_sysret_failed
jne swapgs_restore_regs_and_return_to_usermode
movq R11(%rsp), %r11
cmpq %r11, EFLAGS(%rsp) /* R11 == RFLAGS */
jne opportunistic_sysret_failed
jne swapgs_restore_regs_and_return_to_usermode
/*
* SYSCALL clears RF when it saves RFLAGS in R11 and SYSRET cannot
@@ -302,12 +360,12 @@ return_from_SYSCALL_64:
* would never get past 'stuck_here'.
*/
testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11
jnz opportunistic_sysret_failed
jnz swapgs_restore_regs_and_return_to_usermode
/* nothing to check for RSP */
cmpq $__USER_DS, SS(%rsp) /* SS must match SYSRET */
jne opportunistic_sysret_failed
jne swapgs_restore_regs_and_return_to_usermode
/*
* We win! This label is here just for ease of understanding
@@ -315,14 +373,36 @@ return_from_SYSCALL_64:
*/
syscall_return_via_sysret:
/* rcx and r11 are already restored (see code above) */
RESTORE_C_REGS_EXCEPT_RCX_R11
movq RSP(%rsp), %rsp
UNWIND_HINT_EMPTY
USERGS_SYSRET64
POP_EXTRA_REGS
.Lpop_c_regs_except_rcx_r11_and_sysret:
popq %rsi /* skip r11 */
popq %r10
popq %r9
popq %r8
popq %rax
popq %rsi /* skip rcx */
popq %rdx
popq %rsi
opportunistic_sysret_failed:
SWAPGS
jmp restore_c_regs_and_iret
/*
* Now all regs are restored except RSP and RDI.
* Save old stack pointer and switch to trampoline stack.
*/
movq %rsp, %rdi
movq PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp
pushq RSP-RDI(%rdi) /* RSP */
pushq (%rdi) /* RDI */
/*
* We are on the trampoline stack. All regs except RDI are live.
* We can do future final exit work right here.
*/
popq %rdi
popq %rsp
USERGS_SYSRET64
END(entry_SYSCALL_64)
ENTRY(stub_ptregs_64)
@@ -423,8 +503,7 @@ ENTRY(ret_from_fork)
movq %rsp, %rdi
call syscall_return_slowpath /* returns with IRQs disabled */
TRACE_IRQS_ON /* user mode is traced as IRQS on */
SWAPGS
jmp restore_regs_and_iret
jmp swapgs_restore_regs_and_return_to_usermode
1:
/* kernel thread */
@@ -457,12 +536,13 @@ END(irq_entries_start)
.macro DEBUG_ENTRY_ASSERT_IRQS_OFF
#ifdef CONFIG_DEBUG_ENTRY
pushfq
testl $X86_EFLAGS_IF, (%rsp)
pushq %rax
SAVE_FLAGS(CLBR_RAX)
testl $X86_EFLAGS_IF, %eax
jz .Lokay_\@
ud2
.Lokay_\@:
addq $8, %rsp
popq %rax
#endif
.endm
@@ -554,6 +634,13 @@ END(irq_entries_start)
/* 0(%rsp): ~(interrupt number) */
.macro interrupt func
cld
testb $3, CS-ORIG_RAX(%rsp)
jz 1f
SWAPGS
call switch_to_thread_stack
1:
ALLOC_PT_GPREGS_ON_STACK
SAVE_C_REGS
SAVE_EXTRA_REGS
@@ -563,12 +650,8 @@ END(irq_entries_start)
jz 1f
/*
* IRQ from user mode. Switch to kernel gsbase and inform context
* tracking that we're in kernel mode.
*/
SWAPGS
/*
* IRQ from user mode.
*
* We need to tell lockdep that IRQs are off. We can't do this until
* we fix gsbase, and we should do it before enter_from_user_mode
* (which can take locks). Since TRACE_IRQS_OFF is idempotent,
@@ -612,8 +695,52 @@ GLOBAL(retint_user)
mov %rsp,%rdi
call prepare_exit_to_usermode
TRACE_IRQS_IRETQ
GLOBAL(swapgs_restore_regs_and_return_to_usermode)
#ifdef CONFIG_DEBUG_ENTRY
/* Assert that pt_regs indicates user mode. */
testb $3, CS(%rsp)
jnz 1f
ud2
1:
#endif
POP_EXTRA_REGS
popq %r11
popq %r10
popq %r9
popq %r8
popq %rax
popq %rcx
popq %rdx
popq %rsi
/*
* The stack is now user RDI, orig_ax, RIP, CS, EFLAGS, RSP, SS.
* Save old stack pointer and switch to trampoline stack.
*/
movq %rsp, %rdi
movq PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp
/* Copy the IRET frame to the trampoline stack. */
pushq 6*8(%rdi) /* SS */
pushq 5*8(%rdi) /* RSP */
pushq 4*8(%rdi) /* EFLAGS */
pushq 3*8(%rdi) /* CS */
pushq 2*8(%rdi) /* RIP */
/* Push user RDI on the trampoline stack. */
pushq (%rdi)
/*
* We are on the trampoline stack. All regs except RDI are live.
* We can do future final exit work right here.
*/
/* Restore RDI. */
popq %rdi
SWAPGS
jmp restore_regs_and_iret
INTERRUPT_RETURN
/* Returning to kernel space */
retint_kernel:
@@ -633,15 +760,17 @@ retint_kernel:
*/
TRACE_IRQS_IRETQ
/*
* At this label, code paths which return to kernel and to user,
* which come from interrupts/exception and from syscalls, merge.
*/
GLOBAL(restore_regs_and_iret)
RESTORE_EXTRA_REGS
restore_c_regs_and_iret:
RESTORE_C_REGS
REMOVE_PT_GPREGS_FROM_STACK 8
GLOBAL(restore_regs_and_return_to_kernel)
#ifdef CONFIG_DEBUG_ENTRY
/* Assert that pt_regs indicates kernel mode. */
testb $3, CS(%rsp)
jz 1f
ud2
1:
#endif
POP_EXTRA_REGS
POP_C_REGS
addq $8, %rsp /* skip regs->orig_ax */
INTERRUPT_RETURN
ENTRY(native_iret)
@@ -805,7 +934,33 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt
/*
* Exception entry points.
*/
#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss) + (TSS_ist + ((x) - 1) * 8)
#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss_rw) + (TSS_ist + ((x) - 1) * 8)
/*
* Switch to the thread stack. This is called with the IRET frame and
* orig_ax on the stack. (That is, RDI..R12 are not on the stack and
* space has not been allocated for them.)
*/
ENTRY(switch_to_thread_stack)
UNWIND_HINT_FUNC
pushq %rdi
movq %rsp, %rdi
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
UNWIND_HINT sp_offset=16 sp_reg=ORC_REG_DI
pushq 7*8(%rdi) /* regs->ss */
pushq 6*8(%rdi) /* regs->rsp */
pushq 5*8(%rdi) /* regs->eflags */
pushq 4*8(%rdi) /* regs->cs */
pushq 3*8(%rdi) /* regs->ip */
pushq 2*8(%rdi) /* regs->orig_ax */
pushq 8(%rdi) /* return address */
UNWIND_HINT_FUNC
movq (%rdi), %rdi
ret
END(switch_to_thread_stack)
.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1
ENTRY(\sym)
@@ -818,17 +973,18 @@ ENTRY(\sym)
ASM_CLAC
.ifeq \has_error_code
.if \has_error_code == 0
pushq $-1 /* ORIG_RAX: no syscall to restart */
.endif
ALLOC_PT_GPREGS_ON_STACK
.if \paranoid
.if \paranoid == 1
.if \paranoid < 2
testb $3, CS(%rsp) /* If coming from userspace, switch stacks */
jnz 1f
jnz .Lfrom_usermode_switch_stack_\@
.endif
.if \paranoid
call paranoid_entry
.else
call error_entry
@@ -870,20 +1026,15 @@ ENTRY(\sym)
jmp error_exit
.endif
.if \paranoid == 1
.if \paranoid < 2
/*
* Paranoid entry from userspace. Switch stacks and treat it
* Entry from userspace. Switch stacks and treat it
* as a normal entry. This means that paranoid handlers
* run in real process context if user_mode(regs).
*/
1:
.Lfrom_usermode_switch_stack_\@:
call error_entry
movq %rsp, %rdi /* pt_regs pointer */
call sync_regs
movq %rax, %rsp /* switch stack */
movq %rsp, %rdi /* pt_regs pointer */
.if \has_error_code
@@ -1059,6 +1210,7 @@ idtentry int3 do_int3 has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK
idtentry stack_segment do_stack_segment has_error_code=1
#ifdef CONFIG_XEN
idtentry xennmi do_nmi has_error_code=0
idtentry xendebug do_debug has_error_code=0
idtentry xenint3 do_int3 has_error_code=0
#endif
@@ -1112,17 +1264,14 @@ ENTRY(paranoid_exit)
DISABLE_INTERRUPTS(CLBR_ANY)
TRACE_IRQS_OFF_DEBUG
testl %ebx, %ebx /* swapgs needed? */
jnz paranoid_exit_no_swapgs
jnz .Lparanoid_exit_no_swapgs
TRACE_IRQS_IRETQ
SWAPGS_UNSAFE_STACK
jmp paranoid_exit_restore
paranoid_exit_no_swapgs:
jmp .Lparanoid_exit_restore
.Lparanoid_exit_no_swapgs:
TRACE_IRQS_IRETQ_DEBUG
paranoid_exit_restore:
RESTORE_EXTRA_REGS
RESTORE_C_REGS
REMOVE_PT_GPREGS_FROM_STACK 8
INTERRUPT_RETURN
.Lparanoid_exit_restore:
jmp restore_regs_and_return_to_kernel
END(paranoid_exit)
/*
@@ -1146,6 +1295,14 @@ ENTRY(error_entry)
SWAPGS
.Lerror_entry_from_usermode_after_swapgs:
/* Put us onto the real thread stack. */
popq %r12 /* save return addr in %r12 */
movq %rsp, %rdi /* arg0 = pt_regs pointer */
call sync_regs
movq %rax, %rsp /* switch stack */
ENCODE_FRAME_POINTER
pushq %r12
/*
* We need to tell lockdep that IRQs are off. We can't do this until
* we fix gsbase, and we should do it before enter_from_user_mode
@@ -1223,10 +1380,13 @@ ENTRY(error_exit)
jmp retint_user
END(error_exit)
/* Runs on exception stack */
/* XXX: broken on Xen PV */
/*
* Runs on exception stack. Xen PV does not go through this path at all,
* so we can use real assembly here.
*/
ENTRY(nmi)
UNWIND_HINT_IRET_REGS
/*
* We allow breakpoints in NMIs. If a breakpoint occurs, then
* the iretq it performs will take us out of NMI context.
@@ -1284,7 +1444,7 @@ ENTRY(nmi)
* stacks lest we corrupt the "NMI executing" variable.
*/
SWAPGS_UNSAFE_STACK
swapgs
cld
movq %rsp, %rdx
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
@@ -1328,8 +1488,7 @@ ENTRY(nmi)
* Return back to user mode. We must *not* do the normal exit
* work, because we don't want to enable interrupts.
*/
SWAPGS
jmp restore_regs_and_iret
jmp swapgs_restore_regs_and_return_to_usermode
.Lnmi_from_kernel:
/*
@@ -1450,7 +1609,7 @@ nested_nmi_out:
popq %rdx
/* We are returning to kernel mode, so this cannot result in a fault. */
INTERRUPT_RETURN
iretq
first_nmi:
/* Restore rdx. */
@@ -1481,7 +1640,7 @@ first_nmi:
pushfq /* RFLAGS */
pushq $__KERNEL_CS /* CS */
pushq $1f /* RIP */
INTERRUPT_RETURN /* continues at repeat_nmi below */
iretq /* continues at repeat_nmi below */
UNWIND_HINT_IRET_REGS
1:
#endif
@@ -1544,29 +1703,34 @@ end_repeat_nmi:
nmi_swapgs:
SWAPGS_UNSAFE_STACK
nmi_restore:
RESTORE_EXTRA_REGS
RESTORE_C_REGS
POP_EXTRA_REGS
POP_C_REGS
/* Point RSP at the "iret" frame. */
REMOVE_PT_GPREGS_FROM_STACK 6*8
/*
* Skip orig_ax and the "outermost" frame to point RSP at the "iret"
* frame.
*/
addq $6*8, %rsp
/*
* Clear "NMI executing". Set DF first so that we can easily
* distinguish the remaining code between here and IRET from
* the SYSCALL entry and exit paths. On a native kernel, we
* could just inspect RIP, but, on paravirt kernels,
* INTERRUPT_RETURN can translate into a jump into a
* hypercall page.
* the SYSCALL entry and exit paths.
*
* We arguably should just inspect RIP instead, but I (Andy) wrote
* this code when I had the misapprehension that Xen PV supported
* NMIs, and Xen PV would break that approach.
*/
std
movq $0, 5*8(%rsp) /* clear "NMI executing" */
/*
* INTERRUPT_RETURN reads the "iret" frame and exits the NMI
* stack in a single instruction. We are returning to kernel
* mode, so this cannot result in a fault.
* iretq reads the "iret" frame and exits the NMI stack in a
* single instruction. We are returning to kernel mode, so this
* cannot result in a fault. Similarly, we don't need to worry
* about espfix64 on the way back to kernel mode.
*/
INTERRUPT_RETURN
iretq
END(nmi)
ENTRY(ignore_sysret)

View file

@@ -48,7 +48,7 @@
*/
ENTRY(entry_SYSENTER_compat)
/* Interrupts are off on entry. */
SWAPGS_UNSAFE_STACK
SWAPGS
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
/*
@@ -306,8 +306,11 @@ ENTRY(entry_INT80_compat)
*/
movl %eax, %eax
/* Construct struct pt_regs on stack (iret frame is already on stack) */
pushq %rax /* pt_regs->orig_ax */
/* switch to thread stack expects orig_ax to be pushed */
call switch_to_thread_stack
pushq %rdi /* pt_regs->di */
pushq %rsi /* pt_regs->si */
pushq %rdx /* pt_regs->dx */
@@ -337,8 +340,7 @@ ENTRY(entry_INT80_compat)
/* Go back to user mode. */
TRACE_IRQS_ON
SWAPGS
jmp restore_regs_and_iret
jmp swapgs_restore_regs_and_return_to_usermode
END(entry_INT80_compat)
ENTRY(stub32_clone)

View file

@@ -1,6 +1,6 @@
# SPDX-License-Identifier: GPL-2.0
out := $(obj)/../../include/generated/asm
uapi := $(obj)/../../include/generated/uapi/asm
out := arch/$(SRCARCH)/include/generated/asm
uapi := arch/$(SRCARCH)/include/generated/uapi/asm
# Create output directory if not already present
_dummy := $(shell [ -d '$(out)' ] || mkdir -p '$(out)') \

View file

@@ -2371,7 +2371,7 @@ static unsigned long get_segment_base(unsigned int segment)
struct ldt_struct *ldt;
/* IRQs are off, so this synchronizes with smp_store_release */
ldt = lockless_dereference(current->active_mm->context.ldt);
ldt = READ_ONCE(current->active_mm->context.ldt);
if (!ldt || idx >= ldt->nr_entries)
return 0;
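Editorial note on why this one-line substitution is safe: lockless_dereference() was READ_ONCE() plus smp_read_barrier_depends(); on every architecture except Alpha the address dependency already provides that ordering, and upstream folded the Alpha barrier into READ_ONCE() itself around this release, making the two equivalent. Schematically (the writer side mirrors install_ldt(); a fragment, not compilable on its own):

/* writer: publish a fully initialized LDT */
smp_store_release(&mm->context.ldt, new_ldt);

/* reader, IRQs off, as in get_segment_base() above: the address
 * dependency carried by READ_ONCE() orders the nr_entries load
 * after the pointer load, so a half-constructed ldt_struct is
 * never observed. */
ldt = READ_ONCE(current->active_mm->context.ldt);
if (!ldt || idx >= ldt->nr_entries)
	return 0;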

View file

@@ -2958,6 +2958,10 @@ static unsigned long intel_pmu_free_running_flags(struct perf_event *event)
if (event->attr.use_clockid)
flags &= ~PERF_SAMPLE_TIME;
if (!event->attr.exclude_kernel)
flags &= ~PERF_SAMPLE_REGS_USER;
if (event->attr.sample_regs_user & ~PEBS_REGS)
flags &= ~(PERF_SAMPLE_REGS_USER | PERF_SAMPLE_REGS_INTR);
return flags;
}

View file

@@ -85,13 +85,15 @@ struct amd_nb {
* Flags PEBS can handle without a PMI.
*
* TID can only be handled by flushing at context switch.
* REGS_USER can be handled for events limited to ring 3.
*
*/
#define PEBS_FREERUNNING_FLAGS \
(PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_ADDR | \
PERF_SAMPLE_ID | PERF_SAMPLE_CPU | PERF_SAMPLE_STREAM_ID | \
PERF_SAMPLE_DATA_SRC | PERF_SAMPLE_IDENTIFIER | \
PERF_SAMPLE_TRANSACTION | PERF_SAMPLE_PHYS_ADDR)
PERF_SAMPLE_TRANSACTION | PERF_SAMPLE_PHYS_ADDR | \
PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER)
/*
* A debug store configuration.
@@ -110,6 +112,26 @@ struct debug_store {
u64 pebs_event_reset[MAX_PEBS_EVENTS];
};
#define PEBS_REGS \
(PERF_REG_X86_AX | \
PERF_REG_X86_BX | \
PERF_REG_X86_CX | \
PERF_REG_X86_DX | \
PERF_REG_X86_DI | \
PERF_REG_X86_SI | \
PERF_REG_X86_SP | \
PERF_REG_X86_BP | \
PERF_REG_X86_IP | \
PERF_REG_X86_FLAGS | \
PERF_REG_X86_R8 | \
PERF_REG_X86_R9 | \
PERF_REG_X86_R10 | \
PERF_REG_X86_R11 | \
PERF_REG_X86_R12 | \
PERF_REG_X86_R13 | \
PERF_REG_X86_R14 | \
PERF_REG_X86_R15)
/*
* Per register state.
*/

View file

@@ -113,7 +113,7 @@ void hyperv_init(void)
u64 guest_id;
union hv_x64_msr_hypercall_contents hypercall_msr;
if (x86_hyper != &x86_hyper_ms_hyperv)
if (x86_hyper_type != X86_HYPER_MS_HYPERV)
return;
/* Allocate percpu VP index */

View file

@@ -45,7 +45,7 @@ static inline bool rdrand_long(unsigned long *v)
bool ok;
unsigned int retry = RDRAND_RETRY_LOOPS;
do {
asm volatile(RDRAND_LONG "\n\t"
asm volatile(RDRAND_LONG
CC_SET(c)
: CC_OUT(c) (ok), "=a" (*v));
if (ok)
@@ -59,7 +59,7 @@ static inline bool rdrand_int(unsigned int *v)
bool ok;
unsigned int retry = RDRAND_RETRY_LOOPS;
do {
asm volatile(RDRAND_INT "\n\t"
asm volatile(RDRAND_INT
CC_SET(c)
: CC_OUT(c) (ok), "=a" (*v));
if (ok)
@@ -71,7 +71,7 @@ static inline bool rdrand_int(unsigned int *v)
static inline bool rdseed_long(unsigned long *v)
{
bool ok;
asm volatile(RDSEED_LONG "\n\t"
asm volatile(RDSEED_LONG
CC_SET(c)
: CC_OUT(c) (ok), "=a" (*v));
return ok;
@@ -80,7 +80,7 @@ static inline bool rdseed_long(unsigned long *v)
static inline bool rdseed_int(unsigned int *v)
{
bool ok;
asm volatile(RDSEED_INT "\n\t"
asm volatile(RDSEED_INT
CC_SET(c)
: CC_OUT(c) (ok), "=a" (*v));
return ok;

View file

@@ -143,7 +143,7 @@ static __always_inline void __clear_bit(long nr, volatile unsigned long *addr)
static __always_inline bool clear_bit_unlock_is_negative_byte(long nr, volatile unsigned long *addr)
{
bool negative;
asm volatile(LOCK_PREFIX "andb %2,%1\n\t"
asm volatile(LOCK_PREFIX "andb %2,%1"
CC_SET(s)
: CC_OUT(s) (negative), ADDR
: "ir" ((char) ~(1 << nr)) : "memory");
@@ -246,7 +246,7 @@ static __always_inline bool __test_and_set_bit(long nr, volatile unsigned long *
{
bool oldbit;
asm("bts %2,%1\n\t"
asm("bts %2,%1"
CC_SET(c)
: CC_OUT(c) (oldbit), ADDR
: "Ir" (nr));
@@ -286,7 +286,7 @@ static __always_inline bool __test_and_clear_bit(long nr, volatile unsigned long
{
bool oldbit;
asm volatile("btr %2,%1\n\t"
asm volatile("btr %2,%1"
CC_SET(c)
: CC_OUT(c) (oldbit), ADDR
: "Ir" (nr));
@@ -298,7 +298,7 @@ static __always_inline bool __test_and_change_bit(long nr, volatile unsigned lon
{
bool oldbit;
asm volatile("btc %2,%1\n\t"
asm volatile("btc %2,%1"
CC_SET(c)
: CC_OUT(c) (oldbit), ADDR
: "Ir" (nr) : "memory");
@@ -329,7 +329,7 @@ static __always_inline bool variable_test_bit(long nr, volatile const unsigned l
{
bool oldbit;
asm volatile("bt %2,%1\n\t"
asm volatile("bt %2,%1"
CC_SET(c)
: CC_OUT(c) (oldbit)
: "m" (*(unsigned long *)addr), "Ir" (nr));

View file

@@ -7,6 +7,7 @@
*/
#include <linux/types.h>
#include <linux/sched.h>
#include <linux/sched/task_stack.h>
#include <asm/processor.h>
#include <asm/user32.h>
#include <asm/unistd.h>

View file

@@ -126,16 +126,17 @@ extern const char * const x86_bug_flags[NBUGINTS*32];
#define boot_cpu_has(bit) cpu_has(&boot_cpu_data, bit)
#define set_cpu_cap(c, bit) set_bit(bit, (unsigned long *)((c)->x86_capability))
#define clear_cpu_cap(c, bit) clear_bit(bit, (unsigned long *)((c)->x86_capability))
#define setup_clear_cpu_cap(bit) do { \
clear_cpu_cap(&boot_cpu_data, bit); \
set_bit(bit, (unsigned long *)cpu_caps_cleared); \
} while (0)
extern void setup_clear_cpu_cap(unsigned int bit);
extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit);
#define setup_force_cpu_cap(bit) do { \
set_cpu_cap(&boot_cpu_data, bit); \
set_bit(bit, (unsigned long *)cpu_caps_set); \
} while (0)
#define setup_force_cpu_bug(bit) setup_force_cpu_cap(bit)
#if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_X86_FAST_FEATURE_TESTS)
/*
* Static testing of CPU features. Used the same as boot_cpu_has().

View file

@@ -13,173 +13,176 @@
/*
* Defines x86 CPU feature bits
*/
#define NCAPINTS 18 /* N 32-bit words worth of info */
#define NBUGINTS 1 /* N 32-bit bug flags */
#define NCAPINTS 18 /* N 32-bit words worth of info */
#define NBUGINTS 1 /* N 32-bit bug flags */
/*
* Note: If the comment begins with a quoted string, that string is used
* in /proc/cpuinfo instead of the macro name. If the string is "",
* this feature bit is not displayed in /proc/cpuinfo at all.
*
* When adding new features here that depend on other features,
* please update the table in kernel/cpu/cpuid-deps.c as well.
*/
/* Intel-defined CPU features, CPUID level 0x00000001 (edx), word 0 */
#define X86_FEATURE_FPU ( 0*32+ 0) /* Onboard FPU */
#define X86_FEATURE_VME ( 0*32+ 1) /* Virtual Mode Extensions */
#define X86_FEATURE_DE ( 0*32+ 2) /* Debugging Extensions */
#define X86_FEATURE_PSE ( 0*32+ 3) /* Page Size Extensions */
#define X86_FEATURE_TSC ( 0*32+ 4) /* Time Stamp Counter */
#define X86_FEATURE_MSR ( 0*32+ 5) /* Model-Specific Registers */
#define X86_FEATURE_PAE ( 0*32+ 6) /* Physical Address Extensions */
#define X86_FEATURE_MCE ( 0*32+ 7) /* Machine Check Exception */
#define X86_FEATURE_CX8 ( 0*32+ 8) /* CMPXCHG8 instruction */
#define X86_FEATURE_APIC ( 0*32+ 9) /* Onboard APIC */
#define X86_FEATURE_SEP ( 0*32+11) /* SYSENTER/SYSEXIT */
#define X86_FEATURE_MTRR ( 0*32+12) /* Memory Type Range Registers */
#define X86_FEATURE_PGE ( 0*32+13) /* Page Global Enable */
#define X86_FEATURE_MCA ( 0*32+14) /* Machine Check Architecture */
#define X86_FEATURE_CMOV ( 0*32+15) /* CMOV instructions */
/* (plus FCMOVcc, FCOMI with FPU) */
#define X86_FEATURE_PAT ( 0*32+16) /* Page Attribute Table */
#define X86_FEATURE_PSE36 ( 0*32+17) /* 36-bit PSEs */
#define X86_FEATURE_PN ( 0*32+18) /* Processor serial number */
#define X86_FEATURE_CLFLUSH ( 0*32+19) /* CLFLUSH instruction */
#define X86_FEATURE_DS ( 0*32+21) /* "dts" Debug Store */
#define X86_FEATURE_ACPI ( 0*32+22) /* ACPI via MSR */
#define X86_FEATURE_MMX ( 0*32+23) /* Multimedia Extensions */
#define X86_FEATURE_FXSR ( 0*32+24) /* FXSAVE/FXRSTOR, CR4.OSFXSR */
#define X86_FEATURE_XMM ( 0*32+25) /* "sse" */
#define X86_FEATURE_XMM2 ( 0*32+26) /* "sse2" */
#define X86_FEATURE_SELFSNOOP ( 0*32+27) /* "ss" CPU self snoop */
#define X86_FEATURE_HT ( 0*32+28) /* Hyper-Threading */
#define X86_FEATURE_ACC ( 0*32+29) /* "tm" Automatic clock control */
#define X86_FEATURE_IA64 ( 0*32+30) /* IA-64 processor */
#define X86_FEATURE_PBE ( 0*32+31) /* Pending Break Enable */
/* Intel-defined CPU features, CPUID level 0x00000001 (EDX), word 0 */
#define X86_FEATURE_FPU ( 0*32+ 0) /* Onboard FPU */
#define X86_FEATURE_VME ( 0*32+ 1) /* Virtual Mode Extensions */
#define X86_FEATURE_DE ( 0*32+ 2) /* Debugging Extensions */
#define X86_FEATURE_PSE ( 0*32+ 3) /* Page Size Extensions */
#define X86_FEATURE_TSC ( 0*32+ 4) /* Time Stamp Counter */
#define X86_FEATURE_MSR ( 0*32+ 5) /* Model-Specific Registers */
#define X86_FEATURE_PAE ( 0*32+ 6) /* Physical Address Extensions */
#define X86_FEATURE_MCE ( 0*32+ 7) /* Machine Check Exception */
#define X86_FEATURE_CX8 ( 0*32+ 8) /* CMPXCHG8 instruction */
#define X86_FEATURE_APIC ( 0*32+ 9) /* Onboard APIC */
#define X86_FEATURE_SEP ( 0*32+11) /* SYSENTER/SYSEXIT */
#define X86_FEATURE_MTRR ( 0*32+12) /* Memory Type Range Registers */
#define X86_FEATURE_PGE ( 0*32+13) /* Page Global Enable */
#define X86_FEATURE_MCA ( 0*32+14) /* Machine Check Architecture */
#define X86_FEATURE_CMOV ( 0*32+15) /* CMOV instructions (plus FCMOVcc, FCOMI with FPU) */
#define X86_FEATURE_PAT ( 0*32+16) /* Page Attribute Table */
#define X86_FEATURE_PSE36 ( 0*32+17) /* 36-bit PSEs */
#define X86_FEATURE_PN ( 0*32+18) /* Processor serial number */
#define X86_FEATURE_CLFLUSH ( 0*32+19) /* CLFLUSH instruction */
#define X86_FEATURE_DS ( 0*32+21) /* "dts" Debug Store */
#define X86_FEATURE_ACPI ( 0*32+22) /* ACPI via MSR */
#define X86_FEATURE_MMX ( 0*32+23) /* Multimedia Extensions */
#define X86_FEATURE_FXSR ( 0*32+24) /* FXSAVE/FXRSTOR, CR4.OSFXSR */
#define X86_FEATURE_XMM ( 0*32+25) /* "sse" */
#define X86_FEATURE_XMM2 ( 0*32+26) /* "sse2" */
#define X86_FEATURE_SELFSNOOP ( 0*32+27) /* "ss" CPU self snoop */
#define X86_FEATURE_HT ( 0*32+28) /* Hyper-Threading */
#define X86_FEATURE_ACC ( 0*32+29) /* "tm" Automatic clock control */
#define X86_FEATURE_IA64 ( 0*32+30) /* IA-64 processor */
#define X86_FEATURE_PBE ( 0*32+31) /* Pending Break Enable */
/* AMD-defined CPU features, CPUID level 0x80000001, word 1 */
/* Don't duplicate feature flags which are redundant with Intel! */
#define X86_FEATURE_SYSCALL ( 1*32+11) /* SYSCALL/SYSRET */
#define X86_FEATURE_MP ( 1*32+19) /* MP Capable. */
#define X86_FEATURE_NX ( 1*32+20) /* Execute Disable */
#define X86_FEATURE_MMXEXT ( 1*32+22) /* AMD MMX extensions */
#define X86_FEATURE_FXSR_OPT ( 1*32+25) /* FXSAVE/FXRSTOR optimizations */
#define X86_FEATURE_GBPAGES ( 1*32+26) /* "pdpe1gb" GB pages */
#define X86_FEATURE_RDTSCP ( 1*32+27) /* RDTSCP */
#define X86_FEATURE_LM ( 1*32+29) /* Long Mode (x86-64) */
#define X86_FEATURE_3DNOWEXT ( 1*32+30) /* AMD 3DNow! extensions */
#define X86_FEATURE_3DNOW ( 1*32+31) /* 3DNow! */
#define X86_FEATURE_SYSCALL ( 1*32+11) /* SYSCALL/SYSRET */
#define X86_FEATURE_MP ( 1*32+19) /* MP Capable */
#define X86_FEATURE_NX ( 1*32+20) /* Execute Disable */
#define X86_FEATURE_MMXEXT ( 1*32+22) /* AMD MMX extensions */
#define X86_FEATURE_FXSR_OPT ( 1*32+25) /* FXSAVE/FXRSTOR optimizations */
#define X86_FEATURE_GBPAGES ( 1*32+26) /* "pdpe1gb" GB pages */
#define X86_FEATURE_RDTSCP ( 1*32+27) /* RDTSCP */
#define X86_FEATURE_LM ( 1*32+29) /* Long Mode (x86-64, 64-bit support) */
#define X86_FEATURE_3DNOWEXT ( 1*32+30) /* AMD 3DNow extensions */
#define X86_FEATURE_3DNOW ( 1*32+31) /* 3DNow */
/* Transmeta-defined CPU features, CPUID level 0x80860001, word 2 */
#define X86_FEATURE_RECOVERY ( 2*32+ 0) /* CPU in recovery mode */
#define X86_FEATURE_LONGRUN ( 2*32+ 1) /* Longrun power control */
#define X86_FEATURE_LRTI ( 2*32+ 3) /* LongRun table interface */
#define X86_FEATURE_RECOVERY ( 2*32+ 0) /* CPU in recovery mode */
#define X86_FEATURE_LONGRUN ( 2*32+ 1) /* Longrun power control */
#define X86_FEATURE_LRTI ( 2*32+ 3) /* LongRun table interface */
/* Other features, Linux-defined mapping, word 3 */
/* This range is used for feature bits which conflict or are synthesized */
#define X86_FEATURE_CXMMX ( 3*32+ 0) /* Cyrix MMX extensions */
#define X86_FEATURE_K6_MTRR ( 3*32+ 1) /* AMD K6 nonstandard MTRRs */
#define X86_FEATURE_CYRIX_ARR ( 3*32+ 2) /* Cyrix ARRs (= MTRRs) */
#define X86_FEATURE_CENTAUR_MCR ( 3*32+ 3) /* Centaur MCRs (= MTRRs) */
/* cpu types for specific tunings: */
#define X86_FEATURE_K8 ( 3*32+ 4) /* "" Opteron, Athlon64 */
#define X86_FEATURE_K7 ( 3*32+ 5) /* "" Athlon */
#define X86_FEATURE_P3 ( 3*32+ 6) /* "" P3 */
#define X86_FEATURE_P4 ( 3*32+ 7) /* "" P4 */
#define X86_FEATURE_CONSTANT_TSC ( 3*32+ 8) /* TSC ticks at a constant rate */
#define X86_FEATURE_UP ( 3*32+ 9) /* smp kernel running on up */
#define X86_FEATURE_ART ( 3*32+10) /* Platform has always running timer (ART) */
#define X86_FEATURE_ARCH_PERFMON ( 3*32+11) /* Intel Architectural PerfMon */
#define X86_FEATURE_PEBS ( 3*32+12) /* Precise-Event Based Sampling */
#define X86_FEATURE_BTS ( 3*32+13) /* Branch Trace Store */
#define X86_FEATURE_SYSCALL32 ( 3*32+14) /* "" syscall in ia32 userspace */
#define X86_FEATURE_SYSENTER32 ( 3*32+15) /* "" sysenter in ia32 userspace */
#define X86_FEATURE_REP_GOOD ( 3*32+16) /* rep microcode works well */
#define X86_FEATURE_MFENCE_RDTSC ( 3*32+17) /* "" Mfence synchronizes RDTSC */
#define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) /* "" Lfence synchronizes RDTSC */
#define X86_FEATURE_ACC_POWER ( 3*32+19) /* AMD Accumulated Power Mechanism */
#define X86_FEATURE_NOPL ( 3*32+20) /* The NOPL (0F 1F) instructions */
#define X86_FEATURE_ALWAYS ( 3*32+21) /* "" Always-present feature */
#define X86_FEATURE_XTOPOLOGY ( 3*32+22) /* cpu topology enum extensions */
#define X86_FEATURE_TSC_RELIABLE ( 3*32+23) /* TSC is known to be reliable */
#define X86_FEATURE_NONSTOP_TSC ( 3*32+24) /* TSC does not stop in C states */
#define X86_FEATURE_CPUID ( 3*32+25) /* CPU has CPUID instruction itself */
#define X86_FEATURE_EXTD_APICID ( 3*32+26) /* has extended APICID (8 bits) */
#define X86_FEATURE_AMD_DCM ( 3*32+27) /* multi-node processor */
#define X86_FEATURE_APERFMPERF ( 3*32+28) /* APERFMPERF */
#define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* TSC doesn't stop in S3 state */
#define X86_FEATURE_TSC_KNOWN_FREQ ( 3*32+31) /* TSC has known frequency */
#define X86_FEATURE_CXMMX ( 3*32+ 0) /* Cyrix MMX extensions */
#define X86_FEATURE_K6_MTRR ( 3*32+ 1) /* AMD K6 nonstandard MTRRs */
#define X86_FEATURE_CYRIX_ARR ( 3*32+ 2) /* Cyrix ARRs (= MTRRs) */
#define X86_FEATURE_CENTAUR_MCR ( 3*32+ 3) /* Centaur MCRs (= MTRRs) */
/* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
#define X86_FEATURE_XMM3 ( 4*32+ 0) /* "pni" SSE-3 */
#define X86_FEATURE_PCLMULQDQ ( 4*32+ 1) /* PCLMULQDQ instruction */
#define X86_FEATURE_DTES64 ( 4*32+ 2) /* 64-bit Debug Store */
#define X86_FEATURE_MWAIT ( 4*32+ 3) /* "monitor" Monitor/Mwait support */
#define X86_FEATURE_DSCPL ( 4*32+ 4) /* "ds_cpl" CPL Qual. Debug Store */
#define X86_FEATURE_VMX ( 4*32+ 5) /* Hardware virtualization */
#define X86_FEATURE_SMX ( 4*32+ 6) /* Safer mode */
#define X86_FEATURE_EST ( 4*32+ 7) /* Enhanced SpeedStep */
#define X86_FEATURE_TM2 ( 4*32+ 8) /* Thermal Monitor 2 */
#define X86_FEATURE_SSSE3 ( 4*32+ 9) /* Supplemental SSE-3 */
#define X86_FEATURE_CID ( 4*32+10) /* Context ID */
#define X86_FEATURE_SDBG ( 4*32+11) /* Silicon Debug */
#define X86_FEATURE_FMA ( 4*32+12) /* Fused multiply-add */
#define X86_FEATURE_CX16 ( 4*32+13) /* CMPXCHG16B */
#define X86_FEATURE_XTPR ( 4*32+14) /* Send Task Priority Messages */
#define X86_FEATURE_PDCM ( 4*32+15) /* Performance Capabilities */
#define X86_FEATURE_PCID ( 4*32+17) /* Process Context Identifiers */
#define X86_FEATURE_DCA ( 4*32+18) /* Direct Cache Access */
#define X86_FEATURE_XMM4_1 ( 4*32+19) /* "sse4_1" SSE-4.1 */
#define X86_FEATURE_XMM4_2 ( 4*32+20) /* "sse4_2" SSE-4.2 */
#define X86_FEATURE_X2APIC ( 4*32+21) /* x2APIC */
#define X86_FEATURE_MOVBE ( 4*32+22) /* MOVBE instruction */
#define X86_FEATURE_POPCNT ( 4*32+23) /* POPCNT instruction */
#define X86_FEATURE_TSC_DEADLINE_TIMER ( 4*32+24) /* Tsc deadline timer */
#define X86_FEATURE_AES ( 4*32+25) /* AES instructions */
#define X86_FEATURE_XSAVE ( 4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */
#define X86_FEATURE_OSXSAVE ( 4*32+27) /* "" XSAVE enabled in the OS */
#define X86_FEATURE_AVX ( 4*32+28) /* Advanced Vector Extensions */
#define X86_FEATURE_F16C ( 4*32+29) /* 16-bit fp conversions */
#define X86_FEATURE_RDRAND ( 4*32+30) /* The RDRAND instruction */
#define X86_FEATURE_HYPERVISOR ( 4*32+31) /* Running on a hypervisor */
/* CPU types for specific tunings: */
#define X86_FEATURE_K8 ( 3*32+ 4) /* "" Opteron, Athlon64 */
#define X86_FEATURE_K7 ( 3*32+ 5) /* "" Athlon */
#define X86_FEATURE_P3 ( 3*32+ 6) /* "" P3 */
#define X86_FEATURE_P4 ( 3*32+ 7) /* "" P4 */
#define X86_FEATURE_CONSTANT_TSC ( 3*32+ 8) /* TSC ticks at a constant rate */
#define X86_FEATURE_UP ( 3*32+ 9) /* SMP kernel running on UP */
#define X86_FEATURE_ART ( 3*32+10) /* Always running timer (ART) */
#define X86_FEATURE_ARCH_PERFMON ( 3*32+11) /* Intel Architectural PerfMon */
#define X86_FEATURE_PEBS ( 3*32+12) /* Precise-Event Based Sampling */
#define X86_FEATURE_BTS ( 3*32+13) /* Branch Trace Store */
#define X86_FEATURE_SYSCALL32 ( 3*32+14) /* "" syscall in IA32 userspace */
#define X86_FEATURE_SYSENTER32 ( 3*32+15) /* "" sysenter in IA32 userspace */
#define X86_FEATURE_REP_GOOD ( 3*32+16) /* REP microcode works well */
#define X86_FEATURE_MFENCE_RDTSC ( 3*32+17) /* "" MFENCE synchronizes RDTSC */
#define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) /* "" LFENCE synchronizes RDTSC */
#define X86_FEATURE_ACC_POWER ( 3*32+19) /* AMD Accumulated Power Mechanism */
#define X86_FEATURE_NOPL ( 3*32+20) /* The NOPL (0F 1F) instructions */
#define X86_FEATURE_ALWAYS ( 3*32+21) /* "" Always-present feature */
#define X86_FEATURE_XTOPOLOGY ( 3*32+22) /* CPU topology enum extensions */
#define X86_FEATURE_TSC_RELIABLE ( 3*32+23) /* TSC is known to be reliable */
#define X86_FEATURE_NONSTOP_TSC ( 3*32+24) /* TSC does not stop in C states */
#define X86_FEATURE_CPUID ( 3*32+25) /* CPU has CPUID instruction itself */
#define X86_FEATURE_EXTD_APICID ( 3*32+26) /* Extended APICID (8 bits) */
#define X86_FEATURE_AMD_DCM ( 3*32+27) /* AMD multi-node processor */
#define X86_FEATURE_APERFMPERF ( 3*32+28) /* P-State hardware coordination feedback capability (APERF/MPERF MSRs) */
#define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* TSC doesn't stop in S3 state */
#define X86_FEATURE_TSC_KNOWN_FREQ ( 3*32+31) /* TSC has known frequency */
/* Intel-defined CPU features, CPUID level 0x00000001 (ECX), word 4 */
#define X86_FEATURE_XMM3 ( 4*32+ 0) /* "pni" SSE-3 */
#define X86_FEATURE_PCLMULQDQ ( 4*32+ 1) /* PCLMULQDQ instruction */
#define X86_FEATURE_DTES64 ( 4*32+ 2) /* 64-bit Debug Store */
#define X86_FEATURE_MWAIT ( 4*32+ 3) /* "monitor" MONITOR/MWAIT support */
#define X86_FEATURE_DSCPL ( 4*32+ 4) /* "ds_cpl" CPL-qualified (filtered) Debug Store */
#define X86_FEATURE_VMX ( 4*32+ 5) /* Hardware virtualization */
#define X86_FEATURE_SMX ( 4*32+ 6) /* Safer Mode eXtensions */
#define X86_FEATURE_EST ( 4*32+ 7) /* Enhanced SpeedStep */
#define X86_FEATURE_TM2 ( 4*32+ 8) /* Thermal Monitor 2 */
#define X86_FEATURE_SSSE3 ( 4*32+ 9) /* Supplemental SSE-3 */
#define X86_FEATURE_CID ( 4*32+10) /* Context ID */
#define X86_FEATURE_SDBG ( 4*32+11) /* Silicon Debug */
#define X86_FEATURE_FMA ( 4*32+12) /* Fused multiply-add */
#define X86_FEATURE_CX16 ( 4*32+13) /* CMPXCHG16B instruction */
#define X86_FEATURE_XTPR ( 4*32+14) /* Send Task Priority Messages */
#define X86_FEATURE_PDCM ( 4*32+15) /* Perf/Debug Capabilities MSR */
#define X86_FEATURE_PCID ( 4*32+17) /* Process Context Identifiers */
#define X86_FEATURE_DCA ( 4*32+18) /* Direct Cache Access */
#define X86_FEATURE_XMM4_1 ( 4*32+19) /* "sse4_1" SSE-4.1 */
#define X86_FEATURE_XMM4_2 ( 4*32+20) /* "sse4_2" SSE-4.2 */
#define X86_FEATURE_X2APIC ( 4*32+21) /* X2APIC */
#define X86_FEATURE_MOVBE ( 4*32+22) /* MOVBE instruction */
#define X86_FEATURE_POPCNT ( 4*32+23) /* POPCNT instruction */
#define X86_FEATURE_TSC_DEADLINE_TIMER ( 4*32+24) /* TSC deadline timer */
#define X86_FEATURE_AES ( 4*32+25) /* AES instructions */
#define X86_FEATURE_XSAVE ( 4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV instructions */
#define X86_FEATURE_OSXSAVE ( 4*32+27) /* "" XSAVE instruction enabled in the OS */
#define X86_FEATURE_AVX ( 4*32+28) /* Advanced Vector Extensions */
#define X86_FEATURE_F16C ( 4*32+29) /* 16-bit FP conversions */
#define X86_FEATURE_RDRAND ( 4*32+30) /* RDRAND instruction */
#define X86_FEATURE_HYPERVISOR ( 4*32+31) /* Running on a hypervisor */
/* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */
#define X86_FEATURE_XSTORE ( 5*32+ 2) /* "rng" RNG present (xstore) */
#define X86_FEATURE_XSTORE_EN ( 5*32+ 3) /* "rng_en" RNG enabled */
#define X86_FEATURE_XCRYPT ( 5*32+ 6) /* "ace" on-CPU crypto (xcrypt) */
#define X86_FEATURE_XCRYPT_EN ( 5*32+ 7) /* "ace_en" on-CPU crypto enabled */
#define X86_FEATURE_ACE2 ( 5*32+ 8) /* Advanced Cryptography Engine v2 */
#define X86_FEATURE_ACE2_EN ( 5*32+ 9) /* ACE v2 enabled */
#define X86_FEATURE_PHE ( 5*32+10) /* PadLock Hash Engine */
#define X86_FEATURE_PHE_EN ( 5*32+11) /* PHE enabled */
#define X86_FEATURE_PMM ( 5*32+12) /* PadLock Montgomery Multiplier */
#define X86_FEATURE_PMM_EN ( 5*32+13) /* PMM enabled */
#define X86_FEATURE_XSTORE ( 5*32+ 2) /* "rng" RNG present (xstore) */
#define X86_FEATURE_XSTORE_EN ( 5*32+ 3) /* "rng_en" RNG enabled */
#define X86_FEATURE_XCRYPT ( 5*32+ 6) /* "ace" on-CPU crypto (xcrypt) */
#define X86_FEATURE_XCRYPT_EN ( 5*32+ 7) /* "ace_en" on-CPU crypto enabled */
#define X86_FEATURE_ACE2 ( 5*32+ 8) /* Advanced Cryptography Engine v2 */
#define X86_FEATURE_ACE2_EN ( 5*32+ 9) /* ACE v2 enabled */
#define X86_FEATURE_PHE ( 5*32+10) /* PadLock Hash Engine */
#define X86_FEATURE_PHE_EN ( 5*32+11) /* PHE enabled */
#define X86_FEATURE_PMM ( 5*32+12) /* PadLock Montgomery Multiplier */
#define X86_FEATURE_PMM_EN ( 5*32+13) /* PMM enabled */
/* More extended AMD flags: CPUID level 0x80000001, ecx, word 6 */
#define X86_FEATURE_LAHF_LM ( 6*32+ 0) /* LAHF/SAHF in long mode */
#define X86_FEATURE_CMP_LEGACY ( 6*32+ 1) /* If yes HyperThreading not valid */
#define X86_FEATURE_SVM ( 6*32+ 2) /* Secure virtual machine */
#define X86_FEATURE_EXTAPIC ( 6*32+ 3) /* Extended APIC space */
#define X86_FEATURE_CR8_LEGACY ( 6*32+ 4) /* CR8 in 32-bit mode */
#define X86_FEATURE_ABM ( 6*32+ 5) /* Advanced bit manipulation */
#define X86_FEATURE_SSE4A ( 6*32+ 6) /* SSE-4A */
#define X86_FEATURE_MISALIGNSSE ( 6*32+ 7) /* Misaligned SSE mode */
#define X86_FEATURE_3DNOWPREFETCH ( 6*32+ 8) /* 3DNow prefetch instructions */
#define X86_FEATURE_OSVW ( 6*32+ 9) /* OS Visible Workaround */
#define X86_FEATURE_IBS ( 6*32+10) /* Instruction Based Sampling */
#define X86_FEATURE_XOP ( 6*32+11) /* extended AVX instructions */
#define X86_FEATURE_SKINIT ( 6*32+12) /* SKINIT/STGI instructions */
#define X86_FEATURE_WDT ( 6*32+13) /* Watchdog timer */
#define X86_FEATURE_LWP ( 6*32+15) /* Light Weight Profiling */
#define X86_FEATURE_FMA4 ( 6*32+16) /* 4 operands MAC instructions */
#define X86_FEATURE_TCE ( 6*32+17) /* translation cache extension */
#define X86_FEATURE_NODEID_MSR ( 6*32+19) /* NodeId MSR */
#define X86_FEATURE_TBM ( 6*32+21) /* trailing bit manipulations */
#define X86_FEATURE_TOPOEXT ( 6*32+22) /* topology extensions CPUID leafs */
#define X86_FEATURE_PERFCTR_CORE ( 6*32+23) /* core performance counter extensions */
#define X86_FEATURE_PERFCTR_NB ( 6*32+24) /* NB performance counter extensions */
#define X86_FEATURE_BPEXT (6*32+26) /* data breakpoint extension */
#define X86_FEATURE_PTSC ( 6*32+27) /* performance time-stamp counter */
#define X86_FEATURE_PERFCTR_LLC ( 6*32+28) /* Last Level Cache performance counter extensions */
#define X86_FEATURE_MWAITX ( 6*32+29) /* MWAIT extension (MONITORX/MWAITX) */
/* More extended AMD flags: CPUID level 0x80000001, ECX, word 6 */
#define X86_FEATURE_LAHF_LM ( 6*32+ 0) /* LAHF/SAHF in long mode */
#define X86_FEATURE_CMP_LEGACY ( 6*32+ 1) /* If yes HyperThreading not valid */
#define X86_FEATURE_SVM ( 6*32+ 2) /* Secure Virtual Machine */
#define X86_FEATURE_EXTAPIC ( 6*32+ 3) /* Extended APIC space */
#define X86_FEATURE_CR8_LEGACY ( 6*32+ 4) /* CR8 in 32-bit mode */
#define X86_FEATURE_ABM ( 6*32+ 5) /* Advanced bit manipulation */
#define X86_FEATURE_SSE4A ( 6*32+ 6) /* SSE-4A */
#define X86_FEATURE_MISALIGNSSE ( 6*32+ 7) /* Misaligned SSE mode */
#define X86_FEATURE_3DNOWPREFETCH ( 6*32+ 8) /* 3DNow prefetch instructions */
#define X86_FEATURE_OSVW ( 6*32+ 9) /* OS Visible Workaround */
#define X86_FEATURE_IBS ( 6*32+10) /* Instruction Based Sampling */
#define X86_FEATURE_XOP ( 6*32+11) /* extended AVX instructions */
#define X86_FEATURE_SKINIT ( 6*32+12) /* SKINIT/STGI instructions */
#define X86_FEATURE_WDT ( 6*32+13) /* Watchdog timer */
#define X86_FEATURE_LWP ( 6*32+15) /* Light Weight Profiling */
#define X86_FEATURE_FMA4 ( 6*32+16) /* 4 operands MAC instructions */
#define X86_FEATURE_TCE ( 6*32+17) /* Translation Cache Extension */
#define X86_FEATURE_NODEID_MSR ( 6*32+19) /* NodeId MSR */
#define X86_FEATURE_TBM ( 6*32+21) /* Trailing Bit Manipulations */
#define X86_FEATURE_TOPOEXT ( 6*32+22) /* Topology extensions CPUID leafs */
#define X86_FEATURE_PERFCTR_CORE ( 6*32+23) /* Core performance counter extensions */
#define X86_FEATURE_PERFCTR_NB ( 6*32+24) /* NB performance counter extensions */
#define X86_FEATURE_BPEXT ( 6*32+26) /* Data breakpoint extension */
#define X86_FEATURE_PTSC ( 6*32+27) /* Performance time-stamp counter */
#define X86_FEATURE_PERFCTR_LLC ( 6*32+28) /* Last Level Cache performance counter extensions */
#define X86_FEATURE_MWAITX ( 6*32+29) /* MWAIT extension (MONITORX/MWAITX instructions) */
/*
* Auxiliary flags: Linux defined - For features scattered in various
@@ -187,146 +190,155 @@
*
* Reuse free bits when adding new feature flags!
*/
#define X86_FEATURE_RING3MWAIT ( 7*32+ 0) /* Ring 3 MONITOR/MWAIT */
#define X86_FEATURE_CPUID_FAULT ( 7*32+ 1) /* Intel CPUID faulting */
#define X86_FEATURE_CPB ( 7*32+ 2) /* AMD Core Performance Boost */
#define X86_FEATURE_EPB ( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */
#define X86_FEATURE_CAT_L3 ( 7*32+ 4) /* Cache Allocation Technology L3 */
#define X86_FEATURE_CAT_L2 ( 7*32+ 5) /* Cache Allocation Technology L2 */
#define X86_FEATURE_CDP_L3 ( 7*32+ 6) /* Code and Data Prioritization L3 */
#define X86_FEATURE_RING3MWAIT ( 7*32+ 0) /* Ring 3 MONITOR/MWAIT instructions */
#define X86_FEATURE_CPUID_FAULT ( 7*32+ 1) /* Intel CPUID faulting */
#define X86_FEATURE_CPB ( 7*32+ 2) /* AMD Core Performance Boost */
#define X86_FEATURE_EPB ( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */
#define X86_FEATURE_CAT_L3 ( 7*32+ 4) /* Cache Allocation Technology L3 */
#define X86_FEATURE_CAT_L2 ( 7*32+ 5) /* Cache Allocation Technology L2 */
#define X86_FEATURE_CDP_L3 ( 7*32+ 6) /* Code and Data Prioritization L3 */
#define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */
#define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */
#define X86_FEATURE_SME ( 7*32+10) /* AMD Secure Memory Encryption */
#define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */
#define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */
#define X86_FEATURE_SME ( 7*32+10) /* AMD Secure Memory Encryption */
#define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */
#define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */
#define X86_FEATURE_AVX512_4VNNIW (7*32+16) /* AVX-512 Neural Network Instructions */
#define X86_FEATURE_AVX512_4FMAPS (7*32+17) /* AVX-512 Multiply Accumulation Single precision */
#define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */
#define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */
#define X86_FEATURE_AVX512_4VNNIW ( 7*32+16) /* AVX-512 Neural Network Instructions */
#define X86_FEATURE_AVX512_4FMAPS ( 7*32+17) /* AVX-512 Multiply Accumulation Single precision */
#define X86_FEATURE_MBA ( 7*32+18) /* Memory Bandwidth Allocation */
#define X86_FEATURE_MBA ( 7*32+18) /* Memory Bandwidth Allocation */
/* Virtualization flags: Linux defined, word 8 */
#define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */
#define X86_FEATURE_VNMI ( 8*32+ 1) /* Intel Virtual NMI */
#define X86_FEATURE_FLEXPRIORITY ( 8*32+ 2) /* Intel FlexPriority */
#define X86_FEATURE_EPT ( 8*32+ 3) /* Intel Extended Page Table */
#define X86_FEATURE_VPID ( 8*32+ 4) /* Intel Virtual Processor ID */
#define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */
#define X86_FEATURE_VNMI ( 8*32+ 1) /* Intel Virtual NMI */
#define X86_FEATURE_FLEXPRIORITY ( 8*32+ 2) /* Intel FlexPriority */
#define X86_FEATURE_EPT ( 8*32+ 3) /* Intel Extended Page Table */
#define X86_FEATURE_VPID ( 8*32+ 4) /* Intel Virtual Processor ID */
#define X86_FEATURE_VMMCALL ( 8*32+15) /* Prefer vmmcall to vmcall */
#define X86_FEATURE_XENPV ( 8*32+16) /* "" Xen paravirtual guest */
#define X86_FEATURE_VMMCALL ( 8*32+15) /* Prefer VMMCALL to VMCALL */
#define X86_FEATURE_XENPV ( 8*32+16) /* "" Xen paravirtual guest */
/* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */
#define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/
#define X86_FEATURE_TSC_ADJUST ( 9*32+ 1) /* TSC adjustment MSR 0x3b */
#define X86_FEATURE_BMI1 ( 9*32+ 3) /* 1st group bit manipulation extensions */
#define X86_FEATURE_HLE ( 9*32+ 4) /* Hardware Lock Elision */
#define X86_FEATURE_AVX2 ( 9*32+ 5) /* AVX2 instructions */
#define X86_FEATURE_SMEP ( 9*32+ 7) /* Supervisor Mode Execution Protection */
#define X86_FEATURE_BMI2 ( 9*32+ 8) /* 2nd group bit manipulation extensions */
#define X86_FEATURE_ERMS ( 9*32+ 9) /* Enhanced REP MOVSB/STOSB */
#define X86_FEATURE_INVPCID ( 9*32+10) /* Invalidate Processor Context ID */
#define X86_FEATURE_RTM ( 9*32+11) /* Restricted Transactional Memory */
#define X86_FEATURE_CQM ( 9*32+12) /* Cache QoS Monitoring */
#define X86_FEATURE_MPX ( 9*32+14) /* Memory Protection Extension */
#define X86_FEATURE_RDT_A ( 9*32+15) /* Resource Director Technology Allocation */
#define X86_FEATURE_AVX512F ( 9*32+16) /* AVX-512 Foundation */
#define X86_FEATURE_AVX512DQ ( 9*32+17) /* AVX-512 DQ (Double/Quad granular) Instructions */
#define X86_FEATURE_RDSEED ( 9*32+18) /* The RDSEED instruction */
#define X86_FEATURE_ADX ( 9*32+19) /* The ADCX and ADOX instructions */
#define X86_FEATURE_SMAP ( 9*32+20) /* Supervisor Mode Access Prevention */
#define X86_FEATURE_AVX512IFMA ( 9*32+21) /* AVX-512 Integer Fused Multiply-Add instructions */
#define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* CLFLUSHOPT instruction */
#define X86_FEATURE_CLWB ( 9*32+24) /* CLWB instruction */
#define X86_FEATURE_AVX512PF ( 9*32+26) /* AVX-512 Prefetch */
#define X86_FEATURE_AVX512ER ( 9*32+27) /* AVX-512 Exponential and Reciprocal */
#define X86_FEATURE_AVX512CD ( 9*32+28) /* AVX-512 Conflict Detection */
#define X86_FEATURE_SHA_NI ( 9*32+29) /* SHA1/SHA256 Instruction Extensions */
#define X86_FEATURE_AVX512BW ( 9*32+30) /* AVX-512 BW (Byte/Word granular) Instructions */
#define X86_FEATURE_AVX512VL ( 9*32+31) /* AVX-512 VL (128/256 Vector Length) Extensions */
/* Intel-defined CPU features, CPUID level 0x00000007:0 (EBX), word 9 */
#define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* RDFSBASE, WRFSBASE, RDGSBASE, WRGSBASE instructions */
#define X86_FEATURE_TSC_ADJUST ( 9*32+ 1) /* TSC adjustment MSR 0x3B */
#define X86_FEATURE_BMI1 ( 9*32+ 3) /* 1st group bit manipulation extensions */
#define X86_FEATURE_HLE ( 9*32+ 4) /* Hardware Lock Elision */
#define X86_FEATURE_AVX2 ( 9*32+ 5) /* AVX2 instructions */
#define X86_FEATURE_SMEP ( 9*32+ 7) /* Supervisor Mode Execution Protection */
#define X86_FEATURE_BMI2 ( 9*32+ 8) /* 2nd group bit manipulation extensions */
#define X86_FEATURE_ERMS ( 9*32+ 9) /* Enhanced REP MOVSB/STOSB instructions */
#define X86_FEATURE_INVPCID ( 9*32+10) /* Invalidate Processor Context ID */
#define X86_FEATURE_RTM ( 9*32+11) /* Restricted Transactional Memory */
#define X86_FEATURE_CQM ( 9*32+12) /* Cache QoS Monitoring */
#define X86_FEATURE_MPX ( 9*32+14) /* Memory Protection Extension */
#define X86_FEATURE_RDT_A ( 9*32+15) /* Resource Director Technology Allocation */
#define X86_FEATURE_AVX512F ( 9*32+16) /* AVX-512 Foundation */
#define X86_FEATURE_AVX512DQ ( 9*32+17) /* AVX-512 DQ (Double/Quad granular) Instructions */
#define X86_FEATURE_RDSEED ( 9*32+18) /* RDSEED instruction */
#define X86_FEATURE_ADX ( 9*32+19) /* ADCX and ADOX instructions */
#define X86_FEATURE_SMAP ( 9*32+20) /* Supervisor Mode Access Prevention */
#define X86_FEATURE_AVX512IFMA ( 9*32+21) /* AVX-512 Integer Fused Multiply-Add instructions */
#define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* CLFLUSHOPT instruction */
#define X86_FEATURE_CLWB ( 9*32+24) /* CLWB instruction */
#define X86_FEATURE_AVX512PF ( 9*32+26) /* AVX-512 Prefetch */
#define X86_FEATURE_AVX512ER ( 9*32+27) /* AVX-512 Exponential and Reciprocal */
#define X86_FEATURE_AVX512CD ( 9*32+28) /* AVX-512 Conflict Detection */
#define X86_FEATURE_SHA_NI ( 9*32+29) /* SHA1/SHA256 Instruction Extensions */
#define X86_FEATURE_AVX512BW ( 9*32+30) /* AVX-512 BW (Byte/Word granular) Instructions */
#define X86_FEATURE_AVX512VL ( 9*32+31) /* AVX-512 VL (128/256 Vector Length) Extensions */
/* Extended state features, CPUID level 0x0000000d:1 (eax), word 10 */
#define X86_FEATURE_XSAVEOPT (10*32+ 0) /* XSAVEOPT */
#define X86_FEATURE_XSAVEC (10*32+ 1) /* XSAVEC */
#define X86_FEATURE_XGETBV1 (10*32+ 2) /* XGETBV with ECX = 1 */
#define X86_FEATURE_XSAVES (10*32+ 3) /* XSAVES/XRSTORS */
/* Extended state features, CPUID level 0x0000000d:1 (EAX), word 10 */
#define X86_FEATURE_XSAVEOPT (10*32+ 0) /* XSAVEOPT instruction */
#define X86_FEATURE_XSAVEC (10*32+ 1) /* XSAVEC instruction */
#define X86_FEATURE_XGETBV1 (10*32+ 2) /* XGETBV with ECX = 1 instruction */
#define X86_FEATURE_XSAVES (10*32+ 3) /* XSAVES/XRSTORS instructions */
/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:0 (edx), word 11 */
#define X86_FEATURE_CQM_LLC (11*32+ 1) /* LLC QoS if 1 */
/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:0 (EDX), word 11 */
#define X86_FEATURE_CQM_LLC (11*32+ 1) /* LLC QoS if 1 */
/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:1 (edx), word 12 */
#define X86_FEATURE_CQM_OCCUP_LLC (12*32+ 0) /* LLC occupancy monitoring if 1 */
#define X86_FEATURE_CQM_MBM_TOTAL (12*32+ 1) /* LLC Total MBM monitoring */
#define X86_FEATURE_CQM_MBM_LOCAL (12*32+ 2) /* LLC Local MBM monitoring */
/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:1 (EDX), word 12 */
#define X86_FEATURE_CQM_OCCUP_LLC (12*32+ 0) /* LLC occupancy monitoring */
#define X86_FEATURE_CQM_MBM_TOTAL (12*32+ 1) /* LLC Total MBM monitoring */
#define X86_FEATURE_CQM_MBM_LOCAL (12*32+ 2) /* LLC Local MBM monitoring */
/* AMD-defined CPU features, CPUID level 0x80000008 (ebx), word 13 */
#define X86_FEATURE_CLZERO (13*32+0) /* CLZERO instruction */
#define X86_FEATURE_IRPERF (13*32+1) /* Instructions Retired Count */
/* AMD-defined CPU features, CPUID level 0x80000008 (EBX), word 13 */
#define X86_FEATURE_CLZERO (13*32+ 0) /* CLZERO instruction */
#define X86_FEATURE_IRPERF (13*32+ 1) /* Instructions Retired Count */
#define X86_FEATURE_XSAVEERPTR (13*32+ 2) /* Always save/restore FP error pointers */
/* Thermal and Power Management Leaf, CPUID level 0x00000006 (eax), word 14 */
#define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */
#define X86_FEATURE_IDA (14*32+ 1) /* Intel Dynamic Acceleration */
#define X86_FEATURE_ARAT (14*32+ 2) /* Always Running APIC Timer */
#define X86_FEATURE_PLN (14*32+ 4) /* Intel Power Limit Notification */
#define X86_FEATURE_PTS (14*32+ 6) /* Intel Package Thermal Status */
#define X86_FEATURE_HWP (14*32+ 7) /* Intel Hardware P-states */
#define X86_FEATURE_HWP_NOTIFY (14*32+ 8) /* HWP Notification */
#define X86_FEATURE_HWP_ACT_WINDOW (14*32+ 9) /* HWP Activity Window */
#define X86_FEATURE_HWP_EPP (14*32+10) /* HWP Energy Perf. Preference */
#define X86_FEATURE_HWP_PKG_REQ (14*32+11) /* HWP Package Level Request */
/* Thermal and Power Management Leaf, CPUID level 0x00000006 (EAX), word 14 */
#define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */
#define X86_FEATURE_IDA (14*32+ 1) /* Intel Dynamic Acceleration */
#define X86_FEATURE_ARAT (14*32+ 2) /* Always Running APIC Timer */
#define X86_FEATURE_PLN (14*32+ 4) /* Intel Power Limit Notification */
#define X86_FEATURE_PTS (14*32+ 6) /* Intel Package Thermal Status */
#define X86_FEATURE_HWP (14*32+ 7) /* Intel Hardware P-states */
#define X86_FEATURE_HWP_NOTIFY (14*32+ 8) /* HWP Notification */
#define X86_FEATURE_HWP_ACT_WINDOW (14*32+ 9) /* HWP Activity Window */
#define X86_FEATURE_HWP_EPP (14*32+10) /* HWP Energy Perf. Preference */
#define X86_FEATURE_HWP_PKG_REQ (14*32+11) /* HWP Package Level Request */
/* AMD SVM Feature Identification, CPUID level 0x8000000a (edx), word 15 */
#define X86_FEATURE_NPT (15*32+ 0) /* Nested Page Table support */
#define X86_FEATURE_LBRV (15*32+ 1) /* LBR Virtualization support */
#define X86_FEATURE_SVML (15*32+ 2) /* "svm_lock" SVM locking MSR */
#define X86_FEATURE_NRIPS (15*32+ 3) /* "nrip_save" SVM next_rip save */
#define X86_FEATURE_TSCRATEMSR (15*32+ 4) /* "tsc_scale" TSC scaling support */
#define X86_FEATURE_VMCBCLEAN (15*32+ 5) /* "vmcb_clean" VMCB clean bits support */
#define X86_FEATURE_FLUSHBYASID (15*32+ 6) /* flush-by-ASID support */
#define X86_FEATURE_DECODEASSISTS (15*32+ 7) /* Decode Assists support */
#define X86_FEATURE_PAUSEFILTER (15*32+10) /* filtered pause intercept */
#define X86_FEATURE_PFTHRESHOLD (15*32+12) /* pause filter threshold */
#define X86_FEATURE_AVIC (15*32+13) /* Virtual Interrupt Controller */
#define X86_FEATURE_V_VMSAVE_VMLOAD (15*32+15) /* Virtual VMSAVE VMLOAD */
#define X86_FEATURE_VGIF (15*32+16) /* Virtual GIF */
/* AMD SVM Feature Identification, CPUID level 0x8000000a (EDX), word 15 */
#define X86_FEATURE_NPT (15*32+ 0) /* Nested Page Table support */
#define X86_FEATURE_LBRV (15*32+ 1) /* LBR Virtualization support */
#define X86_FEATURE_SVML (15*32+ 2) /* "svm_lock" SVM locking MSR */
#define X86_FEATURE_NRIPS (15*32+ 3) /* "nrip_save" SVM next_rip save */
#define X86_FEATURE_TSCRATEMSR (15*32+ 4) /* "tsc_scale" TSC scaling support */
#define X86_FEATURE_VMCBCLEAN (15*32+ 5) /* "vmcb_clean" VMCB clean bits support */
#define X86_FEATURE_FLUSHBYASID (15*32+ 6) /* flush-by-ASID support */
#define X86_FEATURE_DECODEASSISTS (15*32+ 7) /* Decode Assists support */
#define X86_FEATURE_PAUSEFILTER (15*32+10) /* filtered pause intercept */
#define X86_FEATURE_PFTHRESHOLD (15*32+12) /* pause filter threshold */
#define X86_FEATURE_AVIC (15*32+13) /* Virtual Interrupt Controller */
#define X86_FEATURE_V_VMSAVE_VMLOAD (15*32+15) /* Virtual VMSAVE VMLOAD */
#define X86_FEATURE_VGIF (15*32+16) /* Virtual GIF */
/* Intel-defined CPU features, CPUID level 0x00000007:0 (ecx), word 16 */
#define X86_FEATURE_AVX512VBMI (16*32+ 1) /* AVX512 Vector Bit Manipulation instructions*/
#define X86_FEATURE_PKU (16*32+ 3) /* Protection Keys for Userspace */
#define X86_FEATURE_OSPKE (16*32+ 4) /* OS Protection Keys Enable */
#define X86_FEATURE_AVX512_VPOPCNTDQ (16*32+14) /* POPCNT for vectors of DW/QW */
#define X86_FEATURE_LA57 (16*32+16) /* 5-level page tables */
#define X86_FEATURE_RDPID (16*32+22) /* RDPID instruction */
/* Intel-defined CPU features, CPUID level 0x00000007:0 (ECX), word 16 */
#define X86_FEATURE_AVX512VBMI (16*32+ 1) /* AVX512 Vector Bit Manipulation instructions */
#define X86_FEATURE_UMIP (16*32+ 2) /* User Mode Instruction Protection */
#define X86_FEATURE_PKU (16*32+ 3) /* Protection Keys for Userspace */
#define X86_FEATURE_OSPKE (16*32+ 4) /* OS Protection Keys Enable */
#define X86_FEATURE_AVX512_VBMI2 (16*32+ 6) /* Additional AVX512 Vector Bit Manipulation Instructions */
#define X86_FEATURE_GFNI (16*32+ 8) /* Galois Field New Instructions */
#define X86_FEATURE_VAES (16*32+ 9) /* Vector AES */
#define X86_FEATURE_VPCLMULQDQ (16*32+10) /* Carry-Less Multiplication Double Quadword */
#define X86_FEATURE_AVX512_VNNI (16*32+11) /* Vector Neural Network Instructions */
#define X86_FEATURE_AVX512_BITALG (16*32+12) /* Support for VPOPCNT[B,W] and VPSHUF-BITQMB instructions */
#define X86_FEATURE_AVX512_VPOPCNTDQ (16*32+14) /* POPCNT for vectors of DW/QW */
#define X86_FEATURE_LA57 (16*32+16) /* 5-level page tables */
#define X86_FEATURE_RDPID (16*32+22) /* RDPID instruction */
/* AMD-defined CPU features, CPUID level 0x80000007 (ebx), word 17 */
#define X86_FEATURE_OVERFLOW_RECOV (17*32+0) /* MCA overflow recovery support */
#define X86_FEATURE_SUCCOR (17*32+1) /* Uncorrectable error containment and recovery */
#define X86_FEATURE_SMCA (17*32+3) /* Scalable MCA */
/* AMD-defined CPU features, CPUID level 0x80000007 (EBX), word 17 */
#define X86_FEATURE_OVERFLOW_RECOV (17*32+ 0) /* MCA overflow recovery support */
#define X86_FEATURE_SUCCOR (17*32+ 1) /* Uncorrectable error containment and recovery */
#define X86_FEATURE_SMCA (17*32+ 3) /* Scalable MCA */
/*
* BUG word(s)
*/
#define X86_BUG(x) (NCAPINTS*32 + (x))
#define X86_BUG(x) (NCAPINTS*32 + (x))
#define X86_BUG_F00F X86_BUG(0) /* Intel F00F */
#define X86_BUG_FDIV X86_BUG(1) /* FPU FDIV */
#define X86_BUG_COMA X86_BUG(2) /* Cyrix 6x86 coma */
#define X86_BUG_AMD_TLB_MMATCH X86_BUG(3) /* "tlb_mmatch" AMD Erratum 383 */
#define X86_BUG_AMD_APIC_C1E X86_BUG(4) /* "apic_c1e" AMD Erratum 400 */
#define X86_BUG_11AP X86_BUG(5) /* Bad local APIC aka 11AP */
#define X86_BUG_FXSAVE_LEAK X86_BUG(6) /* FXSAVE leaks FOP/FIP/FOP */
#define X86_BUG_CLFLUSH_MONITOR X86_BUG(7) /* AAI65, CLFLUSH required before MONITOR */
#define X86_BUG_SYSRET_SS_ATTRS X86_BUG(8) /* SYSRET doesn't fix up SS attrs */
#define X86_BUG_F00F X86_BUG(0) /* Intel F00F */
#define X86_BUG_FDIV X86_BUG(1) /* FPU FDIV */
#define X86_BUG_COMA X86_BUG(2) /* Cyrix 6x86 coma */
#define X86_BUG_AMD_TLB_MMATCH X86_BUG(3) /* "tlb_mmatch" AMD Erratum 383 */
#define X86_BUG_AMD_APIC_C1E X86_BUG(4) /* "apic_c1e" AMD Erratum 400 */
#define X86_BUG_11AP X86_BUG(5) /* Bad local APIC aka 11AP */
#define X86_BUG_FXSAVE_LEAK X86_BUG(6) /* FXSAVE leaks FOP/FIP/FOP */
#define X86_BUG_CLFLUSH_MONITOR X86_BUG(7) /* AAI65, CLFLUSH required before MONITOR */
#define X86_BUG_SYSRET_SS_ATTRS X86_BUG(8) /* SYSRET doesn't fix up SS attrs */
#ifdef CONFIG_X86_32
/*
* 64-bit kernels don't use X86_BUG_ESPFIX. Make the define conditional
* to avoid confusion.
*/
#define X86_BUG_ESPFIX X86_BUG(9) /* "" IRET to 16-bit SS corrupts ESP/RSP high bits */
#define X86_BUG_ESPFIX X86_BUG(9) /* "" IRET to 16-bit SS corrupts ESP/RSP high bits */
#endif
#define X86_BUG_NULL_SEG X86_BUG(10) /* Nulling a selector preserves the base */
#define X86_BUG_SWAPGS_FENCE X86_BUG(11) /* SWAPGS without input dep on GS */
#define X86_BUG_MONITOR X86_BUG(12) /* IPI required to wake up remote CPU */
#define X86_BUG_AMD_E400 X86_BUG(13) /* CPU is among the affected by Erratum 400 */
#define X86_BUG_NULL_SEG X86_BUG(10) /* Nulling a selector preserves the base */
#define X86_BUG_SWAPGS_FENCE X86_BUG(11) /* SWAPGS without input dep on GS */
#define X86_BUG_MONITOR X86_BUG(12) /* IPI required to wake up remote CPU */
#define X86_BUG_AMD_E400 X86_BUG(13) /* CPU is among the affected by Erratum 400 */
#endif /* _ASM_X86_CPUFEATURES_H */
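
These word/bit positions are not used raw; kernel code tests them through the cpufeature accessors. A minimal sketch of the usual pattern (in-kernel context assumed, helper name hypothetical):

#include <asm/cpufeature.h>

/* Sketch: gate a code path on feature bits defined above. */
static bool sketch_has_avx512vl(void)
{
	/* boot_cpu_has() resolves the word/bit encoding against x86_capability. */
	return boot_cpu_has(X86_FEATURE_AVX512F) &&
	       boot_cpu_has(X86_FEATURE_AVX512VL);
}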

View file

@@ -60,17 +60,10 @@ static inline struct desc_struct *get_current_gdt_rw(void)
return this_cpu_ptr(&gdt_page)->gdt;
}
/* Get the fixmap index for a specific processor */
static inline unsigned int get_cpu_gdt_ro_index(int cpu)
{
return FIX_GDT_REMAP_BEGIN + cpu;
}
/* Provide the fixmap address of the remapped GDT */
static inline struct desc_struct *get_cpu_gdt_ro(int cpu)
{
unsigned int idx = get_cpu_gdt_ro_index(cpu);
return (struct desc_struct *)__fix_to_virt(idx);
return (struct desc_struct *)&get_cpu_entry_area(cpu)->gdt;
}
/* Provide the current read-only GDT */
@@ -185,7 +178,7 @@ static inline void set_tssldt_descriptor(void *d, unsigned long addr,
#endif
}
static inline void __set_tss_desc(unsigned cpu, unsigned int entry, void *addr)
static inline void __set_tss_desc(unsigned cpu, unsigned int entry, struct x86_hw_tss *addr)
{
struct desc_struct *d = get_cpu_gdt_rw(cpu);
tss_desc tss;

View file

@@ -44,6 +44,45 @@ extern unsigned long __FIXADDR_TOP;
PAGE_SIZE)
#endif
/*
* cpu_entry_area is a percpu region in the fixmap that contains things
* needed by the CPU and early entry/exit code. Real types aren't used
* for all fields here to avoid circular header dependencies.
*
* Every field is a virtual alias of some other allocated backing store.
* There is no direct allocation of a struct cpu_entry_area.
*/
struct cpu_entry_area {
char gdt[PAGE_SIZE];
/*
* The GDT is just below SYSENTER_stack and thus serves (on x86_64) as
* a read-only guard page.
*/
struct SYSENTER_stack_page SYSENTER_stack_page;
/*
* On x86_64, the TSS is mapped RO. On x86_32, it's mapped RW because
* we need task switches to work, and task switches write to the TSS.
*/
struct tss_struct tss;
char entry_trampoline[PAGE_SIZE];
#ifdef CONFIG_X86_64
/*
* Exception stacks used for IST entries.
*
* In the future, this should have a separate slot for each stack
* with guard pages between them.
*/
char exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ];
#endif
};
#define CPU_ENTRY_AREA_PAGES (sizeof(struct cpu_entry_area) / PAGE_SIZE)
extern void setup_cpu_entry_areas(void);
/*
* Here we define all the compile-time 'special' virtual
@@ -101,8 +140,14 @@ enum fixed_addresses {
FIX_LNW_VRTC,
#endif
/* Fixmap entries to remap the GDTs, one per processor. */
FIX_GDT_REMAP_BEGIN,
FIX_GDT_REMAP_END = FIX_GDT_REMAP_BEGIN + NR_CPUS - 1,
FIX_CPU_ENTRY_AREA_TOP,
FIX_CPU_ENTRY_AREA_BOTTOM = FIX_CPU_ENTRY_AREA_TOP + (CPU_ENTRY_AREA_PAGES * NR_CPUS) - 1,
#ifdef CONFIG_ACPI_APEI_GHES
/* Used for GHES mapping from assorted contexts */
FIX_APEI_GHES_IRQ,
FIX_APEI_GHES_NMI,
#endif
__end_of_permanent_fixed_addresses,
@@ -185,5 +230,30 @@ void __init *early_memremap_decrypted_wp(resource_size_t phys_addr,
void __early_set_fixmap(enum fixed_addresses idx,
phys_addr_t phys, pgprot_t flags);
static inline unsigned int __get_cpu_entry_area_page_index(int cpu, int page)
{
BUILD_BUG_ON(sizeof(struct cpu_entry_area) % PAGE_SIZE != 0);
return FIX_CPU_ENTRY_AREA_BOTTOM - cpu*CPU_ENTRY_AREA_PAGES - page;
}
#define __get_cpu_entry_area_offset_index(cpu, offset) ({ \
BUILD_BUG_ON(offset % PAGE_SIZE != 0); \
__get_cpu_entry_area_page_index(cpu, offset / PAGE_SIZE); \
})
#define get_cpu_entry_area_index(cpu, field) \
__get_cpu_entry_area_offset_index((cpu), offsetof(struct cpu_entry_area, field))
static inline struct cpu_entry_area *get_cpu_entry_area(int cpu)
{
return (struct cpu_entry_area *)__fix_to_virt(__get_cpu_entry_area_page_index(cpu, 0));
}
static inline struct SYSENTER_stack *cpu_SYSENTER_stack(int cpu)
{
return &get_cpu_entry_area(cpu)->SYSENTER_stack_page.stack;
}
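
To illustrate how the helpers above compose, a hedged sketch resolving another cpu_entry_area field (the wrapper is hypothetical; syscall_init() later open-codes this lookup):

/* Sketch: fixmap-based lookup of one CPU's entry trampoline page. */
static inline void *sketch_entry_trampoline(int cpu)
{
	return get_cpu_entry_area(cpu)->entry_trampoline;
}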
#endif /* !__ASSEMBLY__ */
#endif /* _ASM_X86_FIXMAP_H */

View file

@@ -20,14 +20,22 @@
#ifndef _ASM_X86_HYPERVISOR_H
#define _ASM_X86_HYPERVISOR_H
/* x86 hypervisor types */
enum x86_hypervisor_type {
X86_HYPER_NATIVE = 0,
X86_HYPER_VMWARE,
X86_HYPER_MS_HYPERV,
X86_HYPER_XEN_PV,
X86_HYPER_XEN_HVM,
X86_HYPER_KVM,
};
#ifdef CONFIG_HYPERVISOR_GUEST
#include <asm/kvm_para.h>
#include <asm/x86_init.h>
#include <asm/xen/hypervisor.h>
/*
* x86 hypervisor information
*/
struct hypervisor_x86 {
/* Hypervisor name */
const char *name;
@@ -35,40 +43,27 @@ struct hypervisor_x86 {
/* Detection routine */
uint32_t (*detect)(void);
/* Platform setup (run once per boot) */
void (*init_platform)(void);
/* Hypervisor type */
enum x86_hypervisor_type type;
/* X2APIC detection (run once per boot) */
bool (*x2apic_available)(void);
/* init time callbacks */
struct x86_hyper_init init;
/* pin current vcpu to specified physical cpu (run rarely) */
void (*pin_vcpu)(int);
/* called during init_mem_mapping() to setup early mappings. */
void (*init_mem_mapping)(void);
/* runtime callbacks */
struct x86_hyper_runtime runtime;
};
extern const struct hypervisor_x86 *x86_hyper;
/* Recognized hypervisors */
extern const struct hypervisor_x86 x86_hyper_vmware;
extern const struct hypervisor_x86 x86_hyper_ms_hyperv;
extern const struct hypervisor_x86 x86_hyper_xen_pv;
extern const struct hypervisor_x86 x86_hyper_xen_hvm;
extern const struct hypervisor_x86 x86_hyper_kvm;
extern enum x86_hypervisor_type x86_hyper_type;
extern void init_hypervisor_platform(void);
extern bool hypervisor_x2apic_available(void);
extern void hypervisor_pin_vcpu(int cpu);
static inline void hypervisor_init_mem_mapping(void)
static inline bool hypervisor_is_type(enum x86_hypervisor_type type)
{
if (x86_hyper && x86_hyper->init_mem_mapping)
x86_hyper->init_mem_mapping();
return x86_hyper_type == type;
}
#else
static inline void init_hypervisor_platform(void) { }
static inline bool hypervisor_x2apic_available(void) { return false; }
static inline void hypervisor_init_mem_mapping(void) { }
static inline bool hypervisor_is_type(enum x86_hypervisor_type type)
{
return type == X86_HYPER_NATIVE;
}
#endif /* CONFIG_HYPERVISOR_GUEST */
#endif /* _ASM_X86_HYPERVISOR_H */
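
Callers that used to compare x86_hyper pointers now test the enum. A minimal usage sketch (wrapper name hypothetical):

#include <asm/hypervisor.h>

/* Sketch: the enum-based check that replaces pointer comparisons. */
static bool sketch_running_on_xen_pv(void)
{
	return hypervisor_is_type(X86_HYPER_XEN_PV);
}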

View file

@@ -142,6 +142,9 @@ static inline notrace unsigned long arch_local_irq_save(void)
swapgs; \
sysretl
#ifdef CONFIG_DEBUG_ENTRY
#define SAVE_FLAGS(x) pushfq; popq %rax
#endif
#else
#define INTERRUPT_RETURN iret
#define ENABLE_INTERRUPTS_SYSEXIT sti; sysexit

View file

@@ -26,6 +26,7 @@ extern void die(const char *, struct pt_regs *,long);
extern int __must_check __die(const char *, struct pt_regs *, long);
extern void show_stack_regs(struct pt_regs *regs);
extern void __show_regs(struct pt_regs *regs, int all);
extern void show_iret_regs(struct pt_regs *regs);
extern unsigned long oops_begin(void);
extern void oops_end(unsigned long, struct pt_regs *, int signr);

View file

@@ -73,8 +73,8 @@ static inline void load_mm_ldt(struct mm_struct *mm)
#ifdef CONFIG_MODIFY_LDT_SYSCALL
struct ldt_struct *ldt;
/* lockless_dereference synchronizes with smp_store_release */
ldt = lockless_dereference(mm->context.ldt);
/* READ_ONCE synchronizes with smp_store_release */
ldt = READ_ONCE(mm->context.ldt);
/*
* Any change to mm->context.ldt is followed by an IPI to all

View file

@@ -6,7 +6,7 @@
#include <asm/orc_types.h>
struct mod_arch_specific {
#ifdef CONFIG_ORC_UNWINDER
#ifdef CONFIG_UNWINDER_ORC
unsigned int num_orcs;
int *orc_unwind_ip;
struct orc_entry *orc_unwind;

View file

@@ -16,10 +16,9 @@
#include <linux/cpumask.h>
#include <asm/frame.h>
static inline void load_sp0(struct tss_struct *tss,
struct thread_struct *thread)
static inline void load_sp0(unsigned long sp0)
{
PVOP_VCALL2(pv_cpu_ops.load_sp0, tss, thread);
PVOP_VCALL1(pv_cpu_ops.load_sp0, sp0);
}
/* The paravirtualized CPUID instruction. */
@@ -928,6 +927,15 @@ extern void default_banner(void);
PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret64), \
CLBR_NONE, \
jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_usergs_sysret64))
#ifdef CONFIG_DEBUG_ENTRY
#define SAVE_FLAGS(clobbers) \
PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_save_fl), clobbers, \
PV_SAVE_REGS(clobbers | CLBR_CALLEE_SAVE); \
call PARA_INDIRECT(pv_irq_ops+PV_IRQ_save_fl); \
PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);)
#endif
#endif /* CONFIG_X86_32 */
#endif /* __ASSEMBLY__ */

View file

@@ -134,7 +134,7 @@ struct pv_cpu_ops {
void (*alloc_ldt)(struct desc_struct *ldt, unsigned entries);
void (*free_ldt)(struct desc_struct *ldt, unsigned entries);
void (*load_sp0)(struct tss_struct *tss, struct thread_struct *t);
void (*load_sp0)(unsigned long sp0);
void (*set_iopl_mask)(unsigned mask);

View file

@@ -526,7 +526,7 @@ static inline bool x86_this_cpu_variable_test_bit(int nr,
{
bool oldbit;
asm volatile("bt "__percpu_arg(2)",%1\n\t"
asm volatile("bt "__percpu_arg(2)",%1"
CC_SET(c)
: CC_OUT(c) (oldbit)
: "m" (*(unsigned long __percpu *)addr), "Ir" (nr));

View file

@@ -200,10 +200,9 @@ enum page_cache_mode {
#define _PAGE_ENC (_AT(pteval_t, sme_me_mask))
#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \
_PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_ENC)
#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \
_PAGE_DIRTY | _PAGE_ENC)
#define _PAGE_TABLE (_KERNPG_TABLE | _PAGE_USER)
#define __PAGE_KERNEL_ENC (__PAGE_KERNEL | _PAGE_ENC)
#define __PAGE_KERNEL_ENC_WP (__PAGE_KERNEL_WP | _PAGE_ENC)
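
The _PAGE_TABLE change is a pure re-expression, not a behavior change; a hedged comparison (macro name hypothetical):

/* Sketch: the pre-refactor spelling, shown for comparison. */
#define SKETCH_OLD_PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \
			       _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_ENC)
/* _KERNPG_TABLE | _PAGE_USER expands to exactly the same bit set. */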

View file

@@ -162,9 +162,9 @@ enum cpuid_regs_idx {
extern struct cpuinfo_x86 boot_cpu_data;
extern struct cpuinfo_x86 new_cpu_data;
extern struct tss_struct doublefault_tss;
extern __u32 cpu_caps_cleared[NCAPINTS];
extern __u32 cpu_caps_set[NCAPINTS];
extern struct x86_hw_tss doublefault_tss;
extern __u32 cpu_caps_cleared[NCAPINTS + NBUGINTS];
extern __u32 cpu_caps_set[NCAPINTS + NBUGINTS];
#ifdef CONFIG_SMP
DECLARE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info);
@@ -252,6 +252,11 @@ static inline void load_cr3(pgd_t *pgdir)
write_cr3(__sme_pa(pgdir));
}
/*
* Note that while the legacy 'TSS' name comes from 'Task State Segment',
* on modern x86 CPUs the TSS also holds information important to 64-bit mode,
* unrelated to the task-switch mechanism:
*/
#ifdef CONFIG_X86_32
/* This is the TSS defined by the hardware. */
struct x86_hw_tss {
@@ -304,7 +309,13 @@ struct x86_hw_tss {
struct x86_hw_tss {
u32 reserved1;
u64 sp0;
/*
* We store cpu_current_top_of_stack in sp1 so it's always accessible.
* Linux does not use ring 1, so sp1 is not otherwise needed.
*/
u64 sp1;
u64 sp2;
u64 reserved2;
u64 ist[7];
@@ -322,12 +333,22 @@ struct x86_hw_tss {
#define IO_BITMAP_BITS 65536
#define IO_BITMAP_BYTES (IO_BITMAP_BITS/8)
#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long))
#define IO_BITMAP_OFFSET offsetof(struct tss_struct, io_bitmap)
#define IO_BITMAP_OFFSET (offsetof(struct tss_struct, io_bitmap) - offsetof(struct tss_struct, x86_tss))
#define INVALID_IO_BITMAP_OFFSET 0x8000
struct SYSENTER_stack {
unsigned long words[64];
};
struct SYSENTER_stack_page {
struct SYSENTER_stack stack;
} __aligned(PAGE_SIZE);
struct tss_struct {
/*
* The hardware state:
* The fixed hardware portion. This must not cross a page boundary
* at risk of violating the SDM's advice and potentially triggering
* errata.
*/
struct x86_hw_tss x86_tss;
@@ -338,18 +359,9 @@ struct tss_struct {
* be within the limit.
*/
unsigned long io_bitmap[IO_BITMAP_LONGS + 1];
} __aligned(PAGE_SIZE);
#ifdef CONFIG_X86_32
/*
* Space for the temporary SYSENTER stack.
*/
unsigned long SYSENTER_stack_canary;
unsigned long SYSENTER_stack[64];
#endif
} ____cacheline_aligned;
DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss);
DECLARE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw);
/*
* sizeof(unsigned long) coming from an extra "long" at the end
@@ -363,6 +375,9 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss);
#ifdef CONFIG_X86_32
DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack);
#else
/* The RO copy can't be accessed with this_cpu_xyz(), so use the RW copy. */
#define cpu_current_top_of_stack cpu_tss_rw.x86_tss.sp1
#endif
/*
@@ -431,7 +446,9 @@ typedef struct {
struct thread_struct {
/* Cached TLS descriptors: */
struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES];
#ifdef CONFIG_X86_32
unsigned long sp0;
#endif
unsigned long sp;
#ifdef CONFIG_X86_32
unsigned long sysenter_cs;
@@ -518,16 +535,9 @@ static inline void native_set_iopl_mask(unsigned mask)
}
static inline void
native_load_sp0(struct tss_struct *tss, struct thread_struct *thread)
native_load_sp0(unsigned long sp0)
{
tss->x86_tss.sp0 = thread->sp0;
#ifdef CONFIG_X86_32
/* Only happens when SEP is enabled, no need to test "SEP"arately: */
if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) {
tss->x86_tss.ss1 = thread->sysenter_cs;
wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
}
#endif
this_cpu_write(cpu_tss_rw.x86_tss.sp0, sp0);
}
static inline void native_swapgs(void)
@@ -539,12 +549,18 @@ static inline void native_swapgs(void)
static inline unsigned long current_top_of_stack(void)
{
#ifdef CONFIG_X86_64
return this_cpu_read_stable(cpu_tss.x86_tss.sp0);
#else
/* sp0 on x86_32 is special in and around vm86 mode. */
/*
* We can't read directly from tss.sp0: sp0 on x86_32 is special in
* and around vm86 mode and sp0 on x86_64 is special because of the
* entry trampoline.
*/
return this_cpu_read_stable(cpu_current_top_of_stack);
#endif
}
static inline bool on_thread_stack(void)
{
return (unsigned long)(current_top_of_stack() -
current_stack_pointer) < THREAD_SIZE;
}
#ifdef CONFIG_PARAVIRT
@@ -552,10 +568,9 @@ static inline unsigned long current_top_of_stack(void)
#else
#define __cpuid native_cpuid
static inline void load_sp0(struct tss_struct *tss,
struct thread_struct *thread)
static inline void load_sp0(unsigned long sp0)
{
native_load_sp0(tss, thread);
native_load_sp0(sp0);
}
#define set_iopl_mask native_set_iopl_mask
@@ -804,6 +819,15 @@ static inline void spin_lock_prefetch(const void *x)
#define TOP_OF_INIT_STACK ((unsigned long)&init_stack + sizeof(init_stack) - \
TOP_OF_KERNEL_STACK_PADDING)
#define task_top_of_stack(task) ((unsigned long)(task_pt_regs(task) + 1))
#define task_pt_regs(task) \
({ \
unsigned long __ptr = (unsigned long)task_stack_page(task); \
__ptr += THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING; \
((struct pt_regs *)__ptr) - 1; \
})
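
With task_pt_regs() now defined stack-relative on both 32-bit and 64-bit, a usage sketch (helper name hypothetical):

/* Sketch: read a stopped task's user-mode IP, e.g. from a tracer. */
static unsigned long sketch_task_user_ip(struct task_struct *task)
{
	return task_pt_regs(task)->ip;
}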
#ifdef CONFIG_X86_32
/*
* User space process size: 3GB (default).
@@ -823,23 +847,6 @@ static inline void spin_lock_prefetch(const void *x)
.addr_limit = KERNEL_DS, \
}
/*
* TOP_OF_KERNEL_STACK_PADDING reserves 8 bytes on top of the ring0 stack.
* This is necessary to guarantee that the entire "struct pt_regs"
* is accessible even if the CPU haven't stored the SS/ESP registers
* on the stack (interrupt gate does not save these registers
* when switching to the same priv ring).
* Therefore beware: accessing the ss/esp fields of the
* "struct pt_regs" is possible, but they may contain the
* completely wrong values.
*/
#define task_pt_regs(task) \
({ \
unsigned long __ptr = (unsigned long)task_stack_page(task); \
__ptr += THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING; \
((struct pt_regs *)__ptr) - 1; \
})
#define KSTK_ESP(task) (task_pt_regs(task)->sp)
#else
@@ -873,11 +880,9 @@ static inline void spin_lock_prefetch(const void *x)
#define STACK_TOP_MAX TASK_SIZE_MAX
#define INIT_THREAD { \
.sp0 = TOP_OF_INIT_STACK, \
.addr_limit = KERNEL_DS, \
}
#define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.sp0 - 1)
extern unsigned long KSTK_ESP(struct task_struct *task);
#endif /* CONFIG_X86_64 */

View file

@@ -136,9 +136,9 @@ static inline int v8086_mode(struct pt_regs *regs)
#endif
}
#ifdef CONFIG_X86_64
static inline bool user_64bit_mode(struct pt_regs *regs)
{
#ifdef CONFIG_X86_64
#ifndef CONFIG_PARAVIRT
/*
* On non-paravirt systems, this is the only long mode CPL 3
@@ -149,8 +149,12 @@ static inline bool user_64bit_mode(struct pt_regs *regs)
/* Headers are too twisted for this to go in paravirt.h. */
return regs->cs == __USER_CS || regs->cs == pv_info.extra_user_64bit_cs;
#endif
#else /* !CONFIG_X86_64 */
return false;
#endif
}
#ifdef CONFIG_X86_64
#define current_user_stack_pointer() current_pt_regs()->sp
#define compat_user_stack_pointer() current_pt_regs()->sp
#endif

View file

@@ -29,7 +29,7 @@ cc_label: \
#define __GEN_RMWcc(fullop, var, cc, clobbers, ...) \
do { \
bool c; \
asm volatile (fullop ";" CC_SET(cc) \
asm volatile (fullop CC_SET(cc) \
: [counter] "+m" (var), CC_OUT(cc) (c) \
: __VA_ARGS__ : clobbers); \
return c; \

View file

@@ -16,6 +16,7 @@ enum stack_type {
STACK_TYPE_TASK,
STACK_TYPE_IRQ,
STACK_TYPE_SOFTIRQ,
STACK_TYPE_SYSENTER,
STACK_TYPE_EXCEPTION,
STACK_TYPE_EXCEPTION_LAST = STACK_TYPE_EXCEPTION + N_EXCEPTION_STACKS-1,
};
@@ -28,6 +29,8 @@ struct stack_info {
bool in_task_stack(unsigned long *stack, struct task_struct *task,
struct stack_info *info);
bool in_sysenter_stack(unsigned long *stack, struct stack_info *info);
int get_stack_info(unsigned long *stack, struct task_struct *task,
struct stack_info *info, unsigned long *visit_mask);

View file

@@ -2,6 +2,8 @@
#ifndef _ASM_X86_SWITCH_TO_H
#define _ASM_X86_SWITCH_TO_H
#include <linux/sched/task_stack.h>
struct task_struct; /* one of the stranger aspects of C forward declarations */
struct task_struct *__switch_to_asm(struct task_struct *prev,
@@ -73,4 +75,28 @@ do { \
((last) = __switch_to_asm((prev), (next))); \
} while (0)
#ifdef CONFIG_X86_32
static inline void refresh_sysenter_cs(struct thread_struct *thread)
{
/* Only happens when SEP is enabled, no need to test "SEP"arately: */
if (unlikely(this_cpu_read(cpu_tss_rw.x86_tss.ss1) == thread->sysenter_cs))
return;
this_cpu_write(cpu_tss_rw.x86_tss.ss1, thread->sysenter_cs);
wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
}
#endif
/* This is used when switching tasks or entering/exiting vm86 mode. */
static inline void update_sp0(struct task_struct *task)
{
/* On x86_64, sp0 always points to the entry trampoline stack, which is constant: */
#ifdef CONFIG_X86_32
load_sp0(task->thread.sp0);
#else
if (static_cpu_has(X86_FEATURE_XENPV))
load_sp0(task_top_of_stack(task));
#endif
}
#endif /* _ASM_X86_SWITCH_TO_H */
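
For orientation, a hedged sketch of how a context-switch path might use these helpers (the caller is hypothetical; the real call sites live in process.c and process_32.c):

/* Sketch: per-switch bookkeeping with the helpers above. */
static void sketch_switch_cpu_state(struct task_struct *next_p)
{
	/* Refresh sp0: always on x86_32, only for Xen PV on x86_64. */
	update_sp0(next_p);
#ifdef CONFIG_X86_32
	/* Lazily sync the cached SYSENTER CS into the TSS. */
	refresh_sysenter_cs(&next_p->thread);
#endif
}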

View file

@@ -207,7 +207,7 @@ static inline int arch_within_stack_frames(const void * const stack,
#else /* !__ASSEMBLY__ */
#ifdef CONFIG_X86_64
# define cpu_current_top_of_stack (cpu_tss + TSS_sp0)
# define cpu_current_top_of_stack (cpu_tss_rw + TSS_sp1)
#endif
#endif

View file

@@ -34,11 +34,6 @@ DECLARE_EVENT_CLASS(x86_fpu,
)
);
DEFINE_EVENT(x86_fpu, x86_fpu_state,
TP_PROTO(struct fpu *fpu),
TP_ARGS(fpu)
);
DEFINE_EVENT(x86_fpu, x86_fpu_before_save,
TP_PROTO(struct fpu *fpu),
TP_ARGS(fpu)
@@ -74,11 +69,6 @@ DEFINE_EVENT(x86_fpu, x86_fpu_activate_state,
TP_ARGS(fpu)
);
DEFINE_EVENT(x86_fpu, x86_fpu_deactivate_state,
TP_PROTO(struct fpu *fpu),
TP_ARGS(fpu)
);
DEFINE_EVENT(x86_fpu, x86_fpu_init_state,
TP_PROTO(struct fpu *fpu),
TP_ARGS(fpu)

View file

@@ -38,9 +38,9 @@ asmlinkage void simd_coprocessor_error(void);
#if defined(CONFIG_X86_64) && defined(CONFIG_XEN_PV)
asmlinkage void xen_divide_error(void);
asmlinkage void xen_xennmi(void);
asmlinkage void xen_xendebug(void);
asmlinkage void xen_xenint3(void);
asmlinkage void xen_nmi(void);
asmlinkage void xen_overflow(void);
asmlinkage void xen_bounds(void);
asmlinkage void xen_invalid_op(void);
@@ -75,7 +75,6 @@ dotraplinkage void do_segment_not_present(struct pt_regs *, long);
dotraplinkage void do_stack_segment(struct pt_regs *, long);
#ifdef CONFIG_X86_64
dotraplinkage void do_double_fault(struct pt_regs *, long);
asmlinkage struct pt_regs *sync_regs(struct pt_regs *);
#endif
dotraplinkage void do_general_protection(struct pt_regs *, long);
dotraplinkage void do_page_fault(struct pt_regs *, unsigned long);
@@ -145,4 +144,22 @@ enum {
X86_TRAP_IRET = 32, /* 32, IRET Exception */
};
/*
* Page fault error code bits:
*
* bit 0 == 0: no page found 1: protection fault
* bit 1 == 0: read access 1: write access
* bit 2 == 0: kernel-mode access 1: user-mode access
* bit 3 == 1: use of reserved bit detected
* bit 4 == 1: fault was an instruction fetch
* bit 5 == 1: protection keys block access
*/
enum x86_pf_error_code {
X86_PF_PROT = 1 << 0,
X86_PF_WRITE = 1 << 1,
X86_PF_USER = 1 << 2,
X86_PF_RSVD = 1 << 3,
X86_PF_INSTR = 1 << 4,
X86_PF_PK = 1 << 5,
};
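
A short decoding sketch for these bits (helper name hypothetical):

/* Sketch: classify a page-fault error code using the X86_PF_* bits. */
static bool sketch_is_user_write_fault(unsigned long error_code)
{
	return (error_code & (X86_PF_USER | X86_PF_WRITE)) ==
	       (X86_PF_USER | X86_PF_WRITE);
}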
#endif /* _ASM_X86_TRAPS_H */

View file

@@ -7,17 +7,20 @@
#include <asm/ptrace.h>
#include <asm/stacktrace.h>
#define IRET_FRAME_OFFSET (offsetof(struct pt_regs, ip))
#define IRET_FRAME_SIZE (sizeof(struct pt_regs) - IRET_FRAME_OFFSET)
struct unwind_state {
struct stack_info stack_info;
unsigned long stack_mask;
struct task_struct *task;
int graph_idx;
bool error;
#if defined(CONFIG_ORC_UNWINDER)
#if defined(CONFIG_UNWINDER_ORC)
bool signal, full_regs;
unsigned long sp, bp, ip;
struct pt_regs *regs;
#elif defined(CONFIG_FRAME_POINTER_UNWINDER)
#elif defined(CONFIG_UNWINDER_FRAME_POINTER)
bool got_irq;
unsigned long *bp, *orig_sp, ip;
struct pt_regs *regs;
@@ -51,7 +54,11 @@ void unwind_start(struct unwind_state *state, struct task_struct *task,
__unwind_start(state, task, regs, first_frame);
}
#if defined(CONFIG_ORC_UNWINDER) || defined(CONFIG_FRAME_POINTER_UNWINDER)
#if defined(CONFIG_UNWINDER_ORC) || defined(CONFIG_UNWINDER_FRAME_POINTER)
/*
* WARNING: The entire pt_regs may not be safe to dereference. In some cases,
* only the iret frame registers are accessible. Use with caution!
*/
static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state)
{
if (unwind_done(state))
@@ -66,7 +73,7 @@ static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state)
}
#endif
#ifdef CONFIG_ORC_UNWINDER
#ifdef CONFIG_UNWINDER_ORC
void unwind_init(void);
void unwind_module_init(struct module *mod, void *orc_ip, size_t orc_ip_size,
void *orc, size_t orc_size);

View file

@@ -114,6 +114,18 @@ struct x86_init_pci {
void (*fixup_irqs)(void);
};
/**
* struct x86_hyper_init - x86 hypervisor init functions
* @init_platform: platform setup
* @x2apic_available: X2APIC detection
* @init_mem_mapping: setup early mappings during init_mem_mapping()
*/
struct x86_hyper_init {
void (*init_platform)(void);
bool (*x2apic_available)(void);
void (*init_mem_mapping)(void);
};
/**
* struct x86_init_ops - functions for platform specific setup
*
@@ -127,6 +139,7 @@ struct x86_init_ops {
struct x86_init_timers timers;
struct x86_init_iommu iommu;
struct x86_init_pci pci;
struct x86_hyper_init hyper;
};
/**
@@ -199,6 +212,15 @@ struct x86_legacy_features {
struct x86_legacy_devices devices;
};
/**
* struct x86_hyper_runtime - x86 hypervisor specific runtime callbacks
*
* @pin_vcpu: pin current vcpu to specified physical cpu (run rarely)
*/
struct x86_hyper_runtime {
void (*pin_vcpu)(int cpu);
};
/**
* struct x86_platform_ops - platform specific runtime functions
* @calibrate_cpu: calibrate CPU
@@ -218,6 +240,7 @@ struct x86_legacy_features {
* possible in x86_early_init_platform_quirks() by
* only using the current x86_hardware_subarch
* semantics.
* @hyper: x86 hypervisor specific runtime callbacks
*/
struct x86_platform_ops {
unsigned long (*calibrate_cpu)(void);
@@ -233,6 +256,7 @@ struct x86_platform_ops {
void (*apic_post_init)(void);
struct x86_legacy_features legacy;
void (*set_legacy_features)(void);
struct x86_hyper_runtime hyper;
};
struct pci_dev;

View file

@@ -152,5 +152,8 @@
#define CX86_ARR_BASE 0xc4
#define CX86_RCR_BASE 0xdc
#define CR0_STATE (X86_CR0_PE | X86_CR0_MP | X86_CR0_ET | \
X86_CR0_NE | X86_CR0_WP | X86_CR0_AM | \
X86_CR0_PG)
#endif /* _UAPI_ASM_X86_PROCESSOR_FLAGS_H */

View file

@@ -25,9 +25,9 @@ endif
KASAN_SANITIZE_head$(BITS).o := n
KASAN_SANITIZE_dumpstack.o := n
KASAN_SANITIZE_dumpstack_$(BITS).o := n
KASAN_SANITIZE_stacktrace.o := n
KASAN_SANITIZE_stacktrace.o := n
KASAN_SANITIZE_paravirt.o := n
OBJECT_FILES_NON_STANDARD_head_$(BITS).o := y
OBJECT_FILES_NON_STANDARD_relocate_kernel_$(BITS).o := y
OBJECT_FILES_NON_STANDARD_ftrace_$(BITS).o := y
OBJECT_FILES_NON_STANDARD_test_nx.o := y
@@ -128,9 +128,9 @@ obj-$(CONFIG_PERF_EVENTS) += perf_regs.o
obj-$(CONFIG_TRACING) += tracepoint.o
obj-$(CONFIG_SCHED_MC_PRIO) += itmt.o
obj-$(CONFIG_ORC_UNWINDER) += unwind_orc.o
obj-$(CONFIG_FRAME_POINTER_UNWINDER) += unwind_frame.o
obj-$(CONFIG_GUESS_UNWINDER) += unwind_guess.o
obj-$(CONFIG_UNWINDER_ORC) += unwind_orc.o
obj-$(CONFIG_UNWINDER_FRAME_POINTER) += unwind_frame.o
obj-$(CONFIG_UNWINDER_GUESS) += unwind_guess.o
###
# 64 bit specific files

View file

@@ -1645,7 +1645,7 @@ static __init void try_to_enable_x2apic(int remap_mode)
* under KVM
*/
if (max_physical_apicid > 255 ||
!hypervisor_x2apic_available()) {
!x86_init.hyper.x2apic_available()) {
pr_info("x2apic: IRQ remapping doesn't support X2APIC mode\n");
x2apic_disable();
return;

View file

@@ -920,9 +920,8 @@ static __init void uv_rtc_init(void)
/*
* percpu heartbeat timer
*/
static void uv_heartbeat(unsigned long ignored)
static void uv_heartbeat(struct timer_list *timer)
{
struct timer_list *timer = &uv_scir_info->timer;
unsigned char bits = uv_scir_info->state;
/* Flip heartbeat bit: */
@@ -947,7 +946,7 @@ static int uv_heartbeat_enable(unsigned int cpu)
struct timer_list *timer = &uv_cpu_scir_info(cpu)->timer;
uv_set_cpu_scir_bits(cpu, SCIR_CPU_HEARTBEAT|SCIR_CPU_ACTIVITY);
setup_pinned_timer(timer, uv_heartbeat, cpu);
timer_setup(timer, uv_heartbeat, TIMER_PINNED);
timer->expires = jiffies + SCIR_CPU_HB_INTERVAL;
add_timer_on(timer, cpu);
uv_cpu_scir_info(cpu)->enabled = 1;
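
This hunk follows the 4.14-era timer API conversion: callbacks receive the timer_list pointer and recover their containing object with from_timer() instead of an opaque unsigned long. A generic hedged sketch of the pattern (all names hypothetical):

#include <linux/timer.h>

struct sketch_beat {
	struct timer_list timer;
	unsigned int count;
};

/* Sketch: the post-conversion callback shape used by this hunk. */
static void sketch_beat_fn(struct timer_list *t)
{
	/* from_timer() recovers the structure embedding the timer. */
	struct sketch_beat *b = from_timer(b, t, timer);

	b->count++;
	mod_timer(&b->timer, jiffies + HZ);
}

static void sketch_beat_start(struct sketch_beat *b)
{
	timer_setup(&b->timer, sketch_beat_fn, TIMER_PINNED);
	mod_timer(&b->timer, jiffies + HZ);
}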

View file

@@ -93,4 +93,10 @@ void common(void) {
BLANK();
DEFINE(PTREGS_SIZE, sizeof(struct pt_regs));
/* Layout info for cpu_entry_area */
OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss);
OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline);
OFFSET(CPU_ENTRY_AREA_SYSENTER_stack, cpu_entry_area, SYSENTER_stack_page);
DEFINE(SIZEOF_SYSENTER_stack, sizeof(struct SYSENTER_stack));
}

View file

@@ -47,13 +47,8 @@ void foo(void)
BLANK();
/* Offset from the sysenter stack to tss.sp0 */
DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) -
offsetofend(struct tss_struct, SYSENTER_stack));
/* Offset from cpu_tss to SYSENTER_stack */
OFFSET(CPU_TSS_SYSENTER_stack, tss_struct, SYSENTER_stack);
/* Size of SYSENTER_stack */
DEFINE(SIZEOF_SYSENTER_stack, sizeof(((struct tss_struct *)0)->SYSENTER_stack));
DEFINE(TSS_sysenter_sp0, offsetof(struct cpu_entry_area, tss.x86_tss.sp0) -
offsetofend(struct cpu_entry_area, SYSENTER_stack_page.stack));
#ifdef CONFIG_CC_STACKPROTECTOR
BLANK();

View file

@@ -23,6 +23,9 @@ int main(void)
#ifdef CONFIG_PARAVIRT
OFFSET(PV_CPU_usergs_sysret64, pv_cpu_ops, usergs_sysret64);
OFFSET(PV_CPU_swapgs, pv_cpu_ops, swapgs);
#ifdef CONFIG_DEBUG_ENTRY
OFFSET(PV_IRQ_save_fl, pv_irq_ops, save_fl);
#endif
BLANK();
#endif
@@ -63,6 +66,7 @@ int main(void)
OFFSET(TSS_ist, tss_struct, x86_tss.ist);
OFFSET(TSS_sp0, tss_struct, x86_tss.sp0);
OFFSET(TSS_sp1, tss_struct, x86_tss.sp1);
BLANK();
#ifdef CONFIG_CC_STACKPROTECTOR

View file

@@ -23,6 +23,7 @@ obj-y += rdrand.o
obj-y += match.o
obj-y += bugs.o
obj-$(CONFIG_CPU_FREQ) += aperfmperf.o
obj-y += cpuid-deps.o
obj-$(CONFIG_PROC_FS) += proc.o
obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o

View file

@@ -804,8 +804,11 @@ static void init_amd(struct cpuinfo_x86 *c)
case 0x17: init_amd_zn(c); break;
}
/* Enable workaround for FXSAVE leak */
if (c->x86 >= 6)
/*
* Enable workaround for FXSAVE leak on CPUs
* without the XSaveErPtr feature
*/
if ((c->x86 >= 6) && (!cpu_has(c, X86_FEATURE_XSAVEERPTR)))
set_cpu_bug(c, X86_BUG_FXSAVE_LEAK);
cpu_detect_cache_sizes(c);

View file

@@ -452,8 +452,8 @@ static const char *table_lookup_model(struct cpuinfo_x86 *c)
return NULL; /* Not found */
}
__u32 cpu_caps_cleared[NCAPINTS];
__u32 cpu_caps_set[NCAPINTS];
__u32 cpu_caps_cleared[NCAPINTS + NBUGINTS];
__u32 cpu_caps_set[NCAPINTS + NBUGINTS];
void load_percpu_segment(int cpu)
{
@@ -466,27 +466,116 @@ void load_percpu_segment(int cpu)
load_stack_canary_segment();
}
/* Setup the fixmap mapping only once per-processor */
static inline void setup_fixmap_gdt(int cpu)
#ifdef CONFIG_X86_32
/* The 32-bit entry code needs to find cpu_entry_area. */
DEFINE_PER_CPU(struct cpu_entry_area *, cpu_entry_area);
#endif
#ifdef CONFIG_X86_64
/*
* Special IST stacks which the CPU switches to when it calls
* an IST-marked descriptor entry. Up to 7 stacks (hardware
* limit), all of them are 4K, except the debug stack which
* is 8K.
*/
static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = {
[0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ,
[DEBUG_STACK - 1] = DEBUG_STKSZ
};
static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);
#endif
static DEFINE_PER_CPU_PAGE_ALIGNED(struct SYSENTER_stack_page,
SYSENTER_stack_storage);
static void __init
set_percpu_fixmap_pages(int idx, void *ptr, int pages, pgprot_t prot)
{
for ( ; pages; pages--, idx--, ptr += PAGE_SIZE)
__set_fixmap(idx, per_cpu_ptr_to_phys(ptr), prot);
}
/* Setup the fixmap mappings only once per-processor */
static void __init setup_cpu_entry_area(int cpu)
{
#ifdef CONFIG_X86_64
/* On 64-bit systems, we use a read-only fixmap GDT. */
pgprot_t prot = PAGE_KERNEL_RO;
extern char _entry_trampoline[];
/* On 64-bit systems, we use a read-only fixmap GDT and TSS. */
pgprot_t gdt_prot = PAGE_KERNEL_RO;
pgprot_t tss_prot = PAGE_KERNEL_RO;
#else
/*
* On native 32-bit systems, the GDT cannot be read-only because
* our double fault handler uses a task gate, and entering through
* a task gate needs to change an available TSS to busy. If the GDT
* is read-only, that will triple fault.
* a task gate needs to change an available TSS to busy. If the
* GDT is read-only, that will triple fault. The TSS cannot be
* read-only because the CPU writes to it on task switches.
*
* On Xen PV, the GDT must be read-only because the hypervisor requires
* it.
* On Xen PV, the GDT must be read-only because the hypervisor
* requires it.
*/
pgprot_t prot = boot_cpu_has(X86_FEATURE_XENPV) ?
pgprot_t gdt_prot = boot_cpu_has(X86_FEATURE_XENPV) ?
PAGE_KERNEL_RO : PAGE_KERNEL;
pgprot_t tss_prot = PAGE_KERNEL;
#endif
__set_fixmap(get_cpu_gdt_ro_index(cpu), get_cpu_gdt_paddr(cpu), prot);
__set_fixmap(get_cpu_entry_area_index(cpu, gdt), get_cpu_gdt_paddr(cpu), gdt_prot);
set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, SYSENTER_stack_page),
per_cpu_ptr(&SYSENTER_stack_storage, cpu), 1,
PAGE_KERNEL);
/*
* The Intel SDM says (Volume 3, 7.2.1):
*
* Avoid placing a page boundary in the part of the TSS that the
* processor reads during a task switch (the first 104 bytes). The
* processor may not correctly perform address translations if a
* boundary occurs in this area. During a task switch, the processor
* reads and writes into the first 104 bytes of each TSS (using
* contiguous physical addresses beginning with the physical address
* of the first byte of the TSS). So, after TSS access begins, if
* part of the 104 bytes is not physically contiguous, the processor
* will access incorrect information without generating a page-fault
* exception.
*
* There are also a lot of errata involving the TSS spanning a page
* boundary. Assert that we're not doing that.
*/
BUILD_BUG_ON((offsetof(struct tss_struct, x86_tss) ^
offsetofend(struct tss_struct, x86_tss)) & PAGE_MASK);
BUILD_BUG_ON(sizeof(struct tss_struct) % PAGE_SIZE != 0);
set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, tss),
&per_cpu(cpu_tss_rw, cpu),
sizeof(struct tss_struct) / PAGE_SIZE,
tss_prot);
#ifdef CONFIG_X86_32
per_cpu(cpu_entry_area, cpu) = get_cpu_entry_area(cpu);
#endif
#ifdef CONFIG_X86_64
BUILD_BUG_ON(sizeof(exception_stacks) % PAGE_SIZE != 0);
BUILD_BUG_ON(sizeof(exception_stacks) !=
sizeof(((struct cpu_entry_area *)0)->exception_stacks));
set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, exception_stacks),
&per_cpu(exception_stacks, cpu),
sizeof(exception_stacks) / PAGE_SIZE,
PAGE_KERNEL);
__set_fixmap(get_cpu_entry_area_index(cpu, entry_trampoline),
__pa_symbol(_entry_trampoline), PAGE_KERNEL_RX);
#endif
}
void __init setup_cpu_entry_areas(void)
{
unsigned int cpu;
for_each_possible_cpu(cpu)
setup_cpu_entry_area(cpu);
}
/* Load the original GDT from the per-cpu structure */
@@ -723,7 +812,7 @@ static void apply_forced_caps(struct cpuinfo_x86 *c)
{
int i;
for (i = 0; i < NCAPINTS; i++) {
for (i = 0; i < NCAPINTS + NBUGINTS; i++) {
c->x86_capability[i] &= ~cpu_caps_cleared[i];
c->x86_capability[i] |= cpu_caps_set[i];
}
@@ -1225,7 +1314,7 @@ void enable_sep_cpu(void)
return;
cpu = get_cpu();
tss = &per_cpu(cpu_tss, cpu);
tss = &per_cpu(cpu_tss_rw, cpu);
/*
* We cache MSR_IA32_SYSENTER_CS's value in the TSS's ss1 field --
@@ -1234,11 +1323,7 @@ void enable_sep_cpu(void)
tss->x86_tss.ss1 = __KERNEL_CS;
wrmsr(MSR_IA32_SYSENTER_CS, tss->x86_tss.ss1, 0);
wrmsr(MSR_IA32_SYSENTER_ESP,
(unsigned long)tss + offsetofend(struct tss_struct, SYSENTER_stack),
0);
wrmsr(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_SYSENTER_stack(cpu) + 1), 0);
wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)entry_SYSENTER_32, 0);
put_cpu();
@@ -1301,18 +1386,16 @@ void print_cpu_info(struct cpuinfo_x86 *c)
pr_cont(")\n");
}
static __init int setup_disablecpuid(char *arg)
/*
* clearcpuid= was already parsed in fpu__init_parse_early_param.
* But we need to keep a dummy __setup around, otherwise it would
* show up as an environment variable for init.
*/
static __init int setup_clearcpuid(char *arg)
{
int bit;
if (get_option(&arg, &bit) && bit >= 0 && bit < NCAPINTS * 32)
setup_clear_cpu_cap(bit);
else
return 0;
return 1;
}
__setup("clearcpuid=", setup_disablecpuid);
__setup("clearcpuid=", setup_clearcpuid);
#ifdef CONFIG_X86_64
DEFINE_PER_CPU_FIRST(union irq_stack_union,
@@ -1334,25 +1417,19 @@ DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1;
DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT;
EXPORT_PER_CPU_SYMBOL(__preempt_count);
/*
* Special IST stacks which the CPU switches to when it calls
* an IST-marked descriptor entry. Up to 7 stacks (hardware
* limit), all of them are 4K, except the debug stack which
* is 8K.
*/
static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = {
[0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ,
[DEBUG_STACK - 1] = DEBUG_STKSZ
};
static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);
/* May not be marked __init: used by software suspend */
void syscall_init(void)
{
extern char _entry_trampoline[];
extern char entry_SYSCALL_64_trampoline[];
int cpu = smp_processor_id();
unsigned long SYSCALL64_entry_trampoline =
(unsigned long)get_cpu_entry_area(cpu)->entry_trampoline +
(entry_SYSCALL_64_trampoline - _entry_trampoline);
wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);
wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
wrmsrl(MSR_LSTAR, SYSCALL64_entry_trampoline);
#ifdef CONFIG_IA32_EMULATION
wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat);
@@ -1363,7 +1440,7 @@ void syscall_init(void)
* AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit).
*/
wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
wrmsrl_safe(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_SYSENTER_stack(cpu) + 1));
wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat);
#else
wrmsrl(MSR_CSTAR, (unsigned long)ignore_sysret);
@@ -1507,7 +1584,7 @@ void cpu_init(void)
if (cpu)
load_ucode_ap();
t = &per_cpu(cpu_tss, cpu);
t = &per_cpu(cpu_tss_rw, cpu);
oist = &per_cpu(orig_ist, cpu);
#ifdef CONFIG_NUMA
@@ -1546,7 +1623,7 @@ void cpu_init(void)
* set up and load the per-CPU TSS
*/
if (!oist->ist[0]) {
char *estacks = per_cpu(exception_stacks, cpu);
char *estacks = get_cpu_entry_area(cpu)->exception_stacks;
for (v = 0; v < N_EXCEPTION_STACKS; v++) {
estacks += exception_stack_sizes[v];
@@ -1557,7 +1634,7 @@ void cpu_init(void)
}
}
t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
t->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;
/*
* <= is required because the CPU will access up to
@@ -1572,9 +1649,14 @@ void cpu_init(void)
initialize_tlbstate_and_flush();
enter_lazy_tlb(&init_mm, me);
load_sp0(t, &current->thread);
set_tss_desc(cpu, t);
/*
* Initialize the TSS. sp0 points to the entry trampoline stack
* regardless of what task is running.
*/
set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss);
load_TR_desc();
load_sp0((unsigned long)(cpu_SYSENTER_stack(cpu) + 1));
load_mm_ldt(&init_mm);
clear_all_debug_regs();
@@ -1585,7 +1667,6 @@ void cpu_init(void)
if (is_uv_system())
uv_cpu_init();
setup_fixmap_gdt(cpu);
load_fixmap_gdt(cpu);
}
@@ -1595,8 +1676,7 @@ void cpu_init(void)
{
int cpu = smp_processor_id();
struct task_struct *curr = current;
struct tss_struct *t = &per_cpu(cpu_tss, cpu);
struct thread_struct *thread = &curr->thread;
struct tss_struct *t = &per_cpu(cpu_tss_rw, cpu);
wait_for_master_cpu(cpu);
@@ -1627,12 +1707,16 @@ void cpu_init(void)
initialize_tlbstate_and_flush();
enter_lazy_tlb(&init_mm, curr);
load_sp0(t, thread);
set_tss_desc(cpu, t);
/*
* Initialize the TSS. Don't bother initializing sp0, as the initial
* task never enters user mode.
*/
set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss);
load_TR_desc();
load_mm_ldt(&init_mm);
t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
t->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;
#ifdef CONFIG_DOUBLEFAULT
/* Set up doublefault TSS pointer in the GDT */
@@ -1644,7 +1728,6 @@ void cpu_init(void)
fpu__init_cpu();
setup_fixmap_gdt(cpu);
load_fixmap_gdt(cpu);
}
#endif

View file

@@ -0,0 +1,121 @@
/* Declare dependencies between CPUID features */
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/module.h>
#include <asm/cpufeature.h>
struct cpuid_dep {
unsigned int feature;
unsigned int depends;
};
/*
* Table of CPUID features that depend on others.
*
* This only includes dependencies that can be usefully disabled, not
* features part of the base set (like FPU).
*
* Note this all is not __init / __initdata because it can be
* called from cpu hotplug. It shouldn't do anything in this case,
* but it's difficult to tell that to the init reference checker.
*/
static const struct cpuid_dep cpuid_deps[] = {
{ X86_FEATURE_XSAVEOPT, X86_FEATURE_XSAVE },
{ X86_FEATURE_XSAVEC, X86_FEATURE_XSAVE },
{ X86_FEATURE_XSAVES, X86_FEATURE_XSAVE },
{ X86_FEATURE_AVX, X86_FEATURE_XSAVE },
{ X86_FEATURE_PKU, X86_FEATURE_XSAVE },
{ X86_FEATURE_MPX, X86_FEATURE_XSAVE },
{ X86_FEATURE_XGETBV1, X86_FEATURE_XSAVE },
{ X86_FEATURE_FXSR_OPT, X86_FEATURE_FXSR },
{ X86_FEATURE_XMM, X86_FEATURE_FXSR },
{ X86_FEATURE_XMM2, X86_FEATURE_XMM },
{ X86_FEATURE_XMM3, X86_FEATURE_XMM2 },
{ X86_FEATURE_XMM4_1, X86_FEATURE_XMM2 },
{ X86_FEATURE_XMM4_2, X86_FEATURE_XMM2 },
{ X86_FEATURE_XMM3, X86_FEATURE_XMM2 },
{ X86_FEATURE_PCLMULQDQ, X86_FEATURE_XMM2 },
{ X86_FEATURE_SSSE3, X86_FEATURE_XMM2, },
{ X86_FEATURE_F16C, X86_FEATURE_XMM2, },
{ X86_FEATURE_AES, X86_FEATURE_XMM2 },
{ X86_FEATURE_SHA_NI, X86_FEATURE_XMM2 },
{ X86_FEATURE_FMA, X86_FEATURE_AVX },
{ X86_FEATURE_AVX2, X86_FEATURE_AVX, },
{ X86_FEATURE_AVX512F, X86_FEATURE_AVX, },
{ X86_FEATURE_AVX512IFMA, X86_FEATURE_AVX512F },
{ X86_FEATURE_AVX512PF, X86_FEATURE_AVX512F },
{ X86_FEATURE_AVX512ER, X86_FEATURE_AVX512F },
{ X86_FEATURE_AVX512CD, X86_FEATURE_AVX512F },
{ X86_FEATURE_AVX512DQ, X86_FEATURE_AVX512F },
{ X86_FEATURE_AVX512BW, X86_FEATURE_AVX512F },
{ X86_FEATURE_AVX512VL, X86_FEATURE_AVX512F },
{ X86_FEATURE_AVX512VBMI, X86_FEATURE_AVX512F },
{ X86_FEATURE_AVX512_VBMI2, X86_FEATURE_AVX512VL },
{ X86_FEATURE_GFNI, X86_FEATURE_AVX512VL },
{ X86_FEATURE_VAES, X86_FEATURE_AVX512VL },
{ X86_FEATURE_VPCLMULQDQ, X86_FEATURE_AVX512VL },
{ X86_FEATURE_AVX512_VNNI, X86_FEATURE_AVX512VL },
{ X86_FEATURE_AVX512_BITALG, X86_FEATURE_AVX512VL },
{ X86_FEATURE_AVX512_4VNNIW, X86_FEATURE_AVX512F },
{ X86_FEATURE_AVX512_4FMAPS, X86_FEATURE_AVX512F },
{ X86_FEATURE_AVX512_VPOPCNTDQ, X86_FEATURE_AVX512F },
{}
};
static inline void clear_feature(struct cpuinfo_x86 *c, unsigned int feature)
{
/*
* Note: This could use the non atomic __*_bit() variants, but the
* rest of the cpufeature code uses atomics as well, so keep it for
* consistency. Cleanup all of it separately.
*/
if (!c) {
clear_cpu_cap(&boot_cpu_data, feature);
set_bit(feature, (unsigned long *)cpu_caps_cleared);
} else {
clear_bit(feature, (unsigned long *)c->x86_capability);
}
}
/* Take the capabilities and the BUG bits into account */
#define MAX_FEATURE_BITS ((NCAPINTS + NBUGINTS) * sizeof(u32) * 8)
static void do_clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int feature)
{
DECLARE_BITMAP(disable, MAX_FEATURE_BITS);
const struct cpuid_dep *d;
bool changed;
if (WARN_ON(feature >= MAX_FEATURE_BITS))
return;
clear_feature(c, feature);
/* Collect all features to disable, handling dependencies */
memset(disable, 0, sizeof(disable));
__set_bit(feature, disable);
/* Loop until we get a stable state. */
do {
changed = false;
for (d = cpuid_deps; d->feature; d++) {
if (!test_bit(d->depends, disable))
continue;
if (__test_and_set_bit(d->feature, disable))
continue;
changed = true;
clear_feature(c, d->feature);
}
} while (changed);
}
void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int feature)
{
do_clear_cpu_cap(c, feature);
}
void setup_clear_cpu_cap(unsigned int feature)
{
do_clear_cpu_cap(NULL, feature);
}
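
Because the walk follows cpuid_deps transitively, one clear can fan out widely; a hedged usage sketch:

/* Sketch: disabling XSAVE cascades to every dependent bit in the table. */
static void sketch_disable_xsave_early(void)
{
	/* Also clears XSAVEOPT, XSAVEC, XSAVES, AVX, PKU, MPX, AVX2, AVX512F, ... */
	setup_clear_cpu_cap(X86_FEATURE_XSAVE);
}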

View file

@@ -26,6 +26,12 @@
#include <asm/processor.h>
#include <asm/hypervisor.h>
extern const struct hypervisor_x86 x86_hyper_vmware;
extern const struct hypervisor_x86 x86_hyper_ms_hyperv;
extern const struct hypervisor_x86 x86_hyper_xen_pv;
extern const struct hypervisor_x86 x86_hyper_xen_hvm;
extern const struct hypervisor_x86 x86_hyper_kvm;
static const __initconst struct hypervisor_x86 * const hypervisors[] =
{
#ifdef CONFIG_XEN_PV
@@ -41,54 +47,52 @@ static const __initconst struct hypervisor_x86 * const hypervisors[] =
#endif
};
const struct hypervisor_x86 *x86_hyper;
EXPORT_SYMBOL(x86_hyper);
enum x86_hypervisor_type x86_hyper_type;
EXPORT_SYMBOL(x86_hyper_type);
static inline void __init
static inline const struct hypervisor_x86 * __init
detect_hypervisor_vendor(void)
{
const struct hypervisor_x86 *h, * const *p;
const struct hypervisor_x86 *h = NULL, * const *p;
uint32_t pri, max_pri = 0;
for (p = hypervisors; p < hypervisors + ARRAY_SIZE(hypervisors); p++) {
h = *p;
pri = h->detect();
if (pri != 0 && pri > max_pri) {
pri = (*p)->detect();
if (pri > max_pri) {
max_pri = pri;
x86_hyper = h;
h = *p;
}
}
if (max_pri)
pr_info("Hypervisor detected: %s\n", x86_hyper->name);
if (h)
pr_info("Hypervisor detected: %s\n", h->name);
return h;
}
static void __init copy_array(const void *src, void *target, unsigned int size)
{
unsigned int i, n = size / sizeof(void *);
const void * const *from = (const void * const *)src;
const void **to = (const void **)target;
for (i = 0; i < n; i++)
if (from[i])
to[i] = from[i];
}
void __init init_hypervisor_platform(void)
{
const struct hypervisor_x86 *h;
detect_hypervisor_vendor();
h = detect_hypervisor_vendor();
if (!x86_hyper)
if (!h)
return;
if (x86_hyper->init_platform)
x86_hyper->init_platform();
}
copy_array(&h->init, &x86_init.hyper, sizeof(h->init));
copy_array(&h->runtime, &x86_platform.hyper, sizeof(h->runtime));
bool __init hypervisor_x2apic_available(void)
{
return x86_hyper &&
x86_hyper->x2apic_available &&
x86_hyper->x2apic_available();
}
void hypervisor_pin_vcpu(int cpu)
{
if (!x86_hyper)
return;
if (x86_hyper->pin_vcpu)
x86_hyper->pin_vcpu(cpu);
else
WARN_ONCE(1, "vcpu pinning requested but not supported!\n");
x86_hyper_type = h->type;
x86_init.hyper.init_platform();
}
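
The rewritten detection above is a plain max-priority scan: each table entry's detect() reports a priority, the highest nonzero value wins, and the winner is handed back to init_hypervisor_platform() instead of being written to a global. A hedged userspace sketch of that selection pattern follows; the entry names and priorities are made up.

#include <stdio.h>

struct hyper { const char *name; unsigned int (*detect)(void); };

static unsigned int det_a(void) { return 0; }	/* not running on A */
static unsigned int det_b(void) { return 2; }	/* B detected, priority 2 */
static unsigned int det_c(void) { return 1; }	/* C detected, priority 1 */

static const struct hyper hypers[] = {
	{ "A", det_a }, { "B", det_b }, { "C", det_c },
};

int main(void)
{
	const struct hyper *h = NULL;
	unsigned int pri, max_pri = 0;

	for (size_t i = 0; i < sizeof(hypers) / sizeof(hypers[0]); i++) {
		pri = hypers[i].detect();
		if (pri > max_pri) {	/* strictly greater: first hit wins ties */
			max_pri = pri;
			h = &hypers[i];
		}
	}
	if (h)
		printf("detected: %s\n", h->name);	/* prints "B" */
	return 0;
}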


@@ -254,9 +254,9 @@ static void __init ms_hyperv_init_platform(void)
#endif
}
const __refconst struct hypervisor_x86 x86_hyper_ms_hyperv = {
const __initconst struct hypervisor_x86 x86_hyper_ms_hyperv = {
.name = "Microsoft Hyper-V",
.detect = ms_hyperv_platform,
.init_platform = ms_hyperv_init_platform,
.type = X86_HYPER_MS_HYPERV,
.init.init_platform = ms_hyperv_init_platform,
};
EXPORT_SYMBOL(x86_hyper_ms_hyperv);


@@ -205,10 +205,10 @@ static bool __init vmware_legacy_x2apic_available(void)
(eax & (1 << VMWARE_PORT_CMD_LEGACY_X2APIC)) != 0;
}
const __refconst struct hypervisor_x86 x86_hyper_vmware = {
const __initconst struct hypervisor_x86 x86_hyper_vmware = {
.name = "VMware",
.detect = vmware_platform,
.init_platform = vmware_platform_setup,
.x2apic_available = vmware_legacy_x2apic_available,
.type = X86_HYPER_VMWARE,
.init.init_platform = vmware_platform_setup,
.init.x2apic_available = vmware_legacy_x2apic_available,
};
EXPORT_SYMBOL(x86_hyper_vmware);


@@ -50,25 +50,23 @@ static void doublefault_fn(void)
cpu_relax();
}
struct tss_struct doublefault_tss __cacheline_aligned = {
.x86_tss = {
.sp0 = STACK_START,
.ss0 = __KERNEL_DS,
.ldt = 0,
.io_bitmap_base = INVALID_IO_BITMAP_OFFSET,
struct x86_hw_tss doublefault_tss __cacheline_aligned = {
.sp0 = STACK_START,
.ss0 = __KERNEL_DS,
.ldt = 0,
.io_bitmap_base = INVALID_IO_BITMAP_OFFSET,
.ip = (unsigned long) doublefault_fn,
/* 0x2 bit is always set */
.flags = X86_EFLAGS_SF | 0x2,
.sp = STACK_START,
.es = __USER_DS,
.cs = __KERNEL_CS,
.ss = __KERNEL_DS,
.ds = __USER_DS,
.fs = __KERNEL_PERCPU,
.ip = (unsigned long) doublefault_fn,
/* 0x2 bit is always set */
.flags = X86_EFLAGS_SF | 0x2,
.sp = STACK_START,
.es = __USER_DS,
.cs = __KERNEL_CS,
.ss = __KERNEL_DS,
.ds = __USER_DS,
.fs = __KERNEL_PERCPU,
.__cr3 = __pa_nodebug(swapper_pg_dir),
}
.__cr3 = __pa_nodebug(swapper_pg_dir),
};
/* dummy for do_double_fault() call */


@@ -43,6 +43,24 @@ bool in_task_stack(unsigned long *stack, struct task_struct *task,
return true;
}
bool in_sysenter_stack(unsigned long *stack, struct stack_info *info)
{
struct SYSENTER_stack *ss = cpu_SYSENTER_stack(smp_processor_id());
void *begin = ss;
void *end = ss + 1;
if ((void *)stack < begin || (void *)stack >= end)
return false;
info->type = STACK_TYPE_SYSENTER;
info->begin = begin;
info->end = end;
info->next_sp = NULL;
return true;
}
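
The begin/end pair above leans on C pointer arithmetic: ss + 1 on a struct pointer points one whole struct past ss, so the two bounds delimit exactly sizeof(*ss) bytes. A small standalone check of that idiom; the struct size here is invented and does not reflect the real SYSENTER stack layout.

#include <stdio.h>

struct SYSENTER_stack { unsigned long words[64]; };	/* invented layout */

int main(void)
{
	struct SYSENTER_stack ss;
	char *begin = (char *)&ss;
	char *end = (char *)(&ss + 1);	/* begin + sizeof(ss) */
	char *probe = begin + 8;	/* some address within the struct */

	printf("span   = %td bytes\n", end - begin);	/* 512 on LP64 */
	printf("inside = %d\n", probe >= begin && probe < end);	/* 1 */
	return 0;
}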
static void printk_stack_address(unsigned long address, int reliable,
char *log_lvl)
{
@@ -50,6 +68,28 @@ static void printk_stack_address(unsigned long address, int reliable,
printk("%s %s%pB\n", log_lvl, reliable ? "" : "? ", (void *)address);
}
void show_iret_regs(struct pt_regs *regs)
{
printk(KERN_DEFAULT "RIP: %04x:%pS\n", (int)regs->cs, (void *)regs->ip);
printk(KERN_DEFAULT "RSP: %04x:%016lx EFLAGS: %08lx", (int)regs->ss,
regs->sp, regs->flags);
}
static void show_regs_safe(struct stack_info *info, struct pt_regs *regs)
{
if (on_stack(info, regs, sizeof(*regs)))
__show_regs(regs, 0);
else if (on_stack(info, (void *)regs + IRET_FRAME_OFFSET,
IRET_FRAME_SIZE)) {
/*
* When an interrupt or exception occurs in entry code, the
* full pt_regs might not have been saved yet. In that case
* just print the iret frame.
*/
show_iret_regs(regs);
}
}
void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
unsigned long *stack, char *log_lvl)
{
@@ -71,31 +111,35 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
* - task stack
* - interrupt stack
* - HW exception stacks (double fault, nmi, debug, mce)
* - SYSENTER stack
*
* x86-32 can have up to three stacks:
* x86-32 can have up to four stacks:
* - task stack
* - softirq stack
* - hardirq stack
* - SYSENTER stack
*/
for (regs = NULL; stack; stack = PTR_ALIGN(stack_info.next_sp, sizeof(long))) {
const char *stack_name;
/*
* If we overflowed the task stack into a guard page, jump back
* to the bottom of the usable stack.
*/
if (task_stack_page(task) - (void *)stack < PAGE_SIZE)
stack = task_stack_page(task);
if (get_stack_info(stack, task, &stack_info, &visit_mask))
break;
if (get_stack_info(stack, task, &stack_info, &visit_mask)) {
/*
* We weren't on a valid stack. It's possible that
* we overflowed a valid stack into a guard page.
* See if the next page up is valid so that we can
* generate some kind of backtrace if this happens.
*/
stack = (unsigned long *)PAGE_ALIGN((unsigned long)stack);
if (get_stack_info(stack, task, &stack_info, &visit_mask))
break;
}
stack_name = stack_type_name(stack_info.type);
if (stack_name)
printk("%s <%s>\n", log_lvl, stack_name);
if (regs && on_stack(&stack_info, regs, sizeof(*regs)))
__show_regs(regs, 0);
if (regs)
show_regs_safe(&stack_info, regs);
/*
* Scan the stack, printing any text addresses we find. At the
@@ -119,7 +163,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
/*
* Don't print regs->ip again if it was already printed
* by __show_regs() below.
* by show_regs_safe() below.
*/
if (regs && stack == &regs->ip)
goto next;
@@ -155,8 +199,8 @@ next:
/* if the frame has entry regs, print them */
regs = unwind_get_entry_regs(&state);
if (regs && on_stack(&stack_info, regs, sizeof(*regs)))
__show_regs(regs, 0);
if (regs)
show_regs_safe(&stack_info, regs);
}
if (stack_name)


@@ -26,6 +26,9 @@ const char *stack_type_name(enum stack_type type)
if (type == STACK_TYPE_SOFTIRQ)
return "SOFTIRQ";
if (type == STACK_TYPE_SYSENTER)
return "SYSENTER";
return NULL;
}
@@ -93,6 +96,9 @@ int get_stack_info(unsigned long *stack, struct task_struct *task,
if (task != current)
goto unknown;
if (in_sysenter_stack(stack, info))
goto recursion_check;
if (in_hardirq_stack(stack, info))
goto recursion_check;


@@ -37,6 +37,9 @@ const char *stack_type_name(enum stack_type type)
if (type == STACK_TYPE_IRQ)
return "IRQ";
if (type == STACK_TYPE_SYSENTER)
return "SYSENTER";
if (type >= STACK_TYPE_EXCEPTION && type <= STACK_TYPE_EXCEPTION_LAST)
return exception_stack_names[type - STACK_TYPE_EXCEPTION];
@@ -115,6 +118,9 @@ int get_stack_info(unsigned long *stack, struct task_struct *task,
if (in_irq_stack(stack, info))
goto recursion_check;
if (in_sysenter_stack(stack, info))
goto recursion_check;
goto unknown;
recursion_check:


@@ -249,6 +249,10 @@ static void __init fpu__init_system_ctx_switch(void)
*/
static void __init fpu__init_parse_early_param(void)
{
char arg[32];
char *argptr = arg;
int bit;
if (cmdline_find_option_bool(boot_command_line, "no387"))
setup_clear_cpu_cap(X86_FEATURE_FPU);
@@ -266,6 +270,13 @@ static void __init fpu__init_parse_early_param(void)
if (cmdline_find_option_bool(boot_command_line, "noxsaves"))
setup_clear_cpu_cap(X86_FEATURE_XSAVES);
if (cmdline_find_option(boot_command_line, "clearcpuid", arg,
sizeof(arg)) &&
get_option(&argptr, &bit) &&
bit >= 0 &&
bit < NCAPINTS * 32)
setup_clear_cpu_cap(bit);
}
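
The clearcpuid= hunk above copies the option value into a fixed buffer, parses one integer with get_option(), and bounds-checks it against the capability bitmap before clearing anything. A userspace approximation of that parse-and-validate step, with strtol() standing in for the kernel's get_option() and an illustrative (not authoritative) NCAPINTS:

#include <stdio.h>
#include <stdlib.h>

#define NCAPINTS 18	/* illustrative capability-word count */

int main(void)
{
	const char *arg = "154";	/* as if booted with clearcpuid=154 */
	char *end;
	long bit = strtol(arg, &end, 0);

	if (end != arg && bit >= 0 && bit < NCAPINTS * 32)
		printf("would clear cpu cap bit %ld\n", bit);
	else
		printf("rejected: not a number in [0, %d)\n", NCAPINTS * 32);
	return 0;
}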
/*


@@ -15,6 +15,7 @@
#include <asm/fpu/xstate.h>
#include <asm/tlbflush.h>
#include <asm/cpufeature.h>
/*
* Although we spell it out in here, the Processor Trace
@@ -36,6 +37,19 @@ static const char *xfeature_names[] =
"unknown xstate feature" ,
};
static short xsave_cpuid_features[] __initdata = {
X86_FEATURE_FPU,
X86_FEATURE_XMM,
X86_FEATURE_AVX,
X86_FEATURE_MPX,
X86_FEATURE_MPX,
X86_FEATURE_AVX512F,
X86_FEATURE_AVX512F,
X86_FEATURE_AVX512F,
X86_FEATURE_INTEL_PT,
X86_FEATURE_PKU,
};
/*
* Mask of xstate features supported by the CPU and the kernel:
*/
@@ -59,26 +73,6 @@ unsigned int fpu_user_xstate_size;
void fpu__xstate_clear_all_cpu_caps(void)
{
setup_clear_cpu_cap(X86_FEATURE_XSAVE);
setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
setup_clear_cpu_cap(X86_FEATURE_XSAVEC);
setup_clear_cpu_cap(X86_FEATURE_XSAVES);
setup_clear_cpu_cap(X86_FEATURE_AVX);
setup_clear_cpu_cap(X86_FEATURE_AVX2);
setup_clear_cpu_cap(X86_FEATURE_AVX512F);
setup_clear_cpu_cap(X86_FEATURE_AVX512IFMA);
setup_clear_cpu_cap(X86_FEATURE_AVX512PF);
setup_clear_cpu_cap(X86_FEATURE_AVX512ER);
setup_clear_cpu_cap(X86_FEATURE_AVX512CD);
setup_clear_cpu_cap(X86_FEATURE_AVX512DQ);
setup_clear_cpu_cap(X86_FEATURE_AVX512BW);
setup_clear_cpu_cap(X86_FEATURE_AVX512VL);
setup_clear_cpu_cap(X86_FEATURE_MPX);
setup_clear_cpu_cap(X86_FEATURE_XGETBV1);
setup_clear_cpu_cap(X86_FEATURE_AVX512VBMI);
setup_clear_cpu_cap(X86_FEATURE_PKU);
setup_clear_cpu_cap(X86_FEATURE_AVX512_4VNNIW);
setup_clear_cpu_cap(X86_FEATURE_AVX512_4FMAPS);
setup_clear_cpu_cap(X86_FEATURE_AVX512_VPOPCNTDQ);
}
/*
@@ -726,6 +720,7 @@ void __init fpu__init_system_xstate(void)
unsigned int eax, ebx, ecx, edx;
static int on_boot_cpu __initdata = 1;
int err;
int i;
WARN_ON_FPU(!on_boot_cpu);
on_boot_cpu = 0;
@@ -759,6 +754,14 @@ void __init fpu__init_system_xstate(void)
goto out_disable;
}
/*
* Clear XSAVE features that are disabled in the normal CPUID.
*/
for (i = 0; i < ARRAY_SIZE(xsave_cpuid_features); i++) {
if (!boot_cpu_has(xsave_cpuid_features[i]))
xfeatures_mask &= ~BIT(i);
}
xfeatures_mask &= fpu__get_supported_xfeatures_mask();
/* Enable xstate instructions to be able to continue with initialization: */
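
The new loop above pairs each xstate component with the CPUID feature that gates it (xsave_cpuid_features) and strips components whose feature is absent, so the mask never advertises state the CPU cannot use. A self-contained sketch of that masking step; the component-to-feature table, the missing feature, and the starting mask are all invented for illustration.

#include <stdio.h>

/* Invented component -> required-feature table, same shape as the kernel's */
static const short xsave_feat[] = { 0, 1, 10, 14, 14, 16, 16, 16, 25, 9 };

static int boot_cpu_has(short feature)
{
	return feature != 14;	/* pretend the feature gating components 3 and 4 is absent */
}

int main(void)
{
	unsigned long long xfeatures_mask = 0x2ffULL;	/* as advertised by CPUID */

	for (unsigned int i = 0; i < sizeof(xsave_feat) / sizeof(xsave_feat[0]); i++)
		if (!boot_cpu_has(xsave_feat[i]))
			xfeatures_mask &= ~(1ULL << i);	/* drop the component */

	printf("xfeatures_mask = %#llx\n", xfeatures_mask);	/* 0x2e7 */
	return 0;
}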


@@ -212,9 +212,6 @@ ENTRY(startup_32_smp)
#endif
.Ldefault_entry:
#define CR0_STATE (X86_CR0_PE | X86_CR0_MP | X86_CR0_ET | \
X86_CR0_NE | X86_CR0_WP | X86_CR0_AM | \
X86_CR0_PG)
movl $(CR0_STATE & ~X86_CR0_PG),%eax
movl %eax,%cr0
@@ -402,7 +399,7 @@ ENTRY(early_idt_handler_array)
# 24(%rsp) error code
i = 0
.rept NUM_EXCEPTION_VECTORS
.ifeq (EXCEPTION_ERRCODE_MASK >> i) & 1
.if ((EXCEPTION_ERRCODE_MASK >> i) & 1) == 0
pushl $0 # Dummy error code, to make stack frame uniform
.endif
pushl $i # 20(%esp) Vector number


@@ -38,11 +38,12 @@
*
*/
#define p4d_index(x) (((x) >> P4D_SHIFT) & (PTRS_PER_P4D-1))
#define pud_index(x) (((x) >> PUD_SHIFT) & (PTRS_PER_PUD-1))
#if defined(CONFIG_XEN_PV) || defined(CONFIG_XEN_PVH)
PGD_PAGE_OFFSET = pgd_index(__PAGE_OFFSET_BASE)
PGD_START_KERNEL = pgd_index(__START_KERNEL_map)
#endif
L3_START_KERNEL = pud_index(__START_KERNEL_map)
.text
@@ -50,6 +51,7 @@ L3_START_KERNEL = pud_index(__START_KERNEL_map)
.code64
.globl startup_64
startup_64:
UNWIND_HINT_EMPTY
/*
* At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 0,
* and someone has loaded an identity mapped page table
@@ -89,6 +91,7 @@ startup_64:
addq $(early_top_pgt - __START_KERNEL_map), %rax
jmp 1f
ENTRY(secondary_startup_64)
UNWIND_HINT_EMPTY
/*
* At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 0,
* and someone has loaded a mapped page table.
@@ -133,6 +136,7 @@ ENTRY(secondary_startup_64)
movq $1f, %rax
jmp *%rax
1:
UNWIND_HINT_EMPTY
/* Check if nx is implemented */
movl $0x80000001, %eax
@@ -150,9 +154,6 @@ ENTRY(secondary_startup_64)
1: wrmsr /* Make changes effective */
/* Setup cr0 */
#define CR0_STATE (X86_CR0_PE | X86_CR0_MP | X86_CR0_ET | \
X86_CR0_NE | X86_CR0_WP | X86_CR0_AM | \
X86_CR0_PG)
movl $CR0_STATE, %eax
/* Make changes effective */
movq %rax, %cr0
@@ -235,7 +236,7 @@ ENTRY(secondary_startup_64)
pushq %rax # target address in negative space
lretq
.Lafter_lret:
ENDPROC(secondary_startup_64)
END(secondary_startup_64)
#include "verify_cpu.S"
@@ -247,6 +248,7 @@ ENDPROC(secondary_startup_64)
*/
ENTRY(start_cpu0)
movq initial_stack(%rip), %rsp
UNWIND_HINT_EMPTY
jmp .Ljump_to_C_code
ENDPROC(start_cpu0)
#endif
@@ -266,26 +268,24 @@ ENDPROC(start_cpu0)
.quad init_thread_union + THREAD_SIZE - SIZEOF_PTREGS
__FINITDATA
bad_address:
jmp bad_address
__INIT
ENTRY(early_idt_handler_array)
# 104(%rsp) %rflags
# 96(%rsp) %cs
# 88(%rsp) %rip
# 80(%rsp) error code
i = 0
.rept NUM_EXCEPTION_VECTORS
.ifeq (EXCEPTION_ERRCODE_MASK >> i) & 1
pushq $0 # Dummy error code, to make stack frame uniform
.if ((EXCEPTION_ERRCODE_MASK >> i) & 1) == 0
UNWIND_HINT_IRET_REGS
pushq $0 # Dummy error code, to make stack frame uniform
.else
UNWIND_HINT_IRET_REGS offset=8
.endif
pushq $i # 72(%rsp) Vector number
jmp early_idt_handler_common
UNWIND_HINT_IRET_REGS
i = i + 1
.fill early_idt_handler_array + i*EARLY_IDT_HANDLER_SIZE - ., 1, 0xcc
.endr
ENDPROC(early_idt_handler_array)
UNWIND_HINT_IRET_REGS offset=16
END(early_idt_handler_array)
early_idt_handler_common:
/*
@@ -313,6 +313,7 @@ early_idt_handler_common:
pushq %r13 /* pt_regs->r13 */
pushq %r14 /* pt_regs->r14 */
pushq %r15 /* pt_regs->r15 */
UNWIND_HINT_REGS
cmpq $14,%rsi /* Page fault? */
jnz 10f
@@ -327,8 +328,8 @@ early_idt_handler_common:
20:
decl early_recursion_flag(%rip)
jmp restore_regs_and_iret
ENDPROC(early_idt_handler_common)
jmp restore_regs_and_return_to_kernel
END(early_idt_handler_common)
__INITDATA
@@ -362,10 +363,7 @@ NEXT_PAGE(early_dynamic_pgts)
.data
#ifndef CONFIG_XEN
NEXT_PAGE(init_top_pgt)
.fill 512,8,0
#else
#if defined(CONFIG_XEN_PV) || defined(CONFIG_XEN_PVH)
NEXT_PAGE(init_top_pgt)
.quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
.org init_top_pgt + PGD_PAGE_OFFSET*8, 0
@@ -382,6 +380,9 @@ NEXT_PAGE(level2_ident_pgt)
* Don't set NX because code runs from these pages.
*/
PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
#else
NEXT_PAGE(init_top_pgt)
.fill 512,8,0
#endif
#ifdef CONFIG_X86_5LEVEL
@@ -435,7 +436,7 @@ ENTRY(phys_base)
EXPORT_SYMBOL(phys_base)
#include "../../x86/xen/xen-head.S"
__PAGE_ALIGNED_BSS
NEXT_PAGE(empty_zero_page)
.skip PAGE_SIZE


@@ -67,7 +67,7 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
* because the ->io_bitmap_max value must match the bitmap
* contents:
*/
tss = &per_cpu(cpu_tss, get_cpu());
tss = &per_cpu(cpu_tss_rw, get_cpu());
if (turn_on)
bitmap_clear(t->io_bitmap_ptr, from, num);


@@ -219,18 +219,6 @@ __visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
/* high bit used in ret_from_ code */
unsigned vector = ~regs->orig_ax;
/*
* NB: Unlike exception entries, IRQ entries do not reliably
* handle context tracking in the low-level entry code. This is
* because syscall entries execute briefly with IRQs on before
* updating context tracking state, so we can take an IRQ from
* kernel mode with CONTEXT_USER. The low-level entry code only
* updates the context if we came from user mode, so we won't
* switch to CONTEXT_KERNEL. We'll fix that once the syscall
* code is cleaned up enough that we can cleanly defer enabling
* IRQs.
*/
entering_irq();
/* entering_irq() tells RCU that we're not quiescent. Check it. */


@@ -57,10 +57,10 @@ static inline void stack_overflow_check(struct pt_regs *regs)
if (regs->sp >= estack_top && regs->sp <= estack_bottom)
return;
WARN_ONCE(1, "do_IRQ(): %s has overflown the kernel stack (cur:%Lx,sp:%lx,irq stk top-bottom:%Lx-%Lx,exception stk top-bottom:%Lx-%Lx)\n",
WARN_ONCE(1, "do_IRQ(): %s has overflown the kernel stack (cur:%Lx,sp:%lx,irq stk top-bottom:%Lx-%Lx,exception stk top-bottom:%Lx-%Lx,ip:%pF)\n",
current->comm, curbase, regs->sp,
irq_stack_top, irq_stack_bottom,
estack_top, estack_bottom);
estack_top, estack_bottom, (void *)regs->ip);
if (sysctl_panic_on_stackoverflow)
panic("low stack detected by irq handler - check messages\n");


@@ -544,12 +544,12 @@ static uint32_t __init kvm_detect(void)
return kvm_cpuid_base();
}
const struct hypervisor_x86 x86_hyper_kvm __refconst = {
const __initconst struct hypervisor_x86 x86_hyper_kvm = {
.name = "KVM",
.detect = kvm_detect,
.x2apic_available = kvm_para_available,
.type = X86_HYPER_KVM,
.init.x2apic_available = kvm_para_available,
};
EXPORT_SYMBOL_GPL(x86_hyper_kvm);
static __init int activate_jump_labels(void)
{


@@ -103,7 +103,7 @@ static void finalize_ldt_struct(struct ldt_struct *ldt)
static void install_ldt(struct mm_struct *current_mm,
struct ldt_struct *ldt)
{
/* Synchronizes with lockless_dereference in load_mm_ldt. */
/* Synchronizes with READ_ONCE in load_mm_ldt. */
smp_store_release(&current_mm->context.ldt, ldt);
/* Activate the LDT for all CPUs using current_mm. */


@@ -10,7 +10,6 @@ DEF_NATIVE(pv_irq_ops, save_fl, "pushfq; popq %rax");
DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax");
DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax");
DEF_NATIVE(pv_mmu_ops, write_cr3, "movq %rdi, %cr3");
DEF_NATIVE(pv_mmu_ops, flush_tlb_single, "invlpg (%rdi)");
DEF_NATIVE(pv_cpu_ops, wbinvd, "wbinvd");
DEF_NATIVE(pv_cpu_ops, usergs_sysret64, "swapgs; sysretq");
@@ -60,7 +59,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
PATCH_SITE(pv_mmu_ops, read_cr2);
PATCH_SITE(pv_mmu_ops, read_cr3);
PATCH_SITE(pv_mmu_ops, write_cr3);
PATCH_SITE(pv_mmu_ops, flush_tlb_single);
PATCH_SITE(pv_cpu_ops, wbinvd);
#if defined(CONFIG_PARAVIRT_SPINLOCKS)
case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock):


@@ -47,9 +47,25 @@
* section. Since TSS's are completely CPU-local, we want them
* on exact cacheline boundaries, to eliminate cacheline ping-pong.
*/
__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss_rw) = {
.x86_tss = {
.sp0 = TOP_OF_INIT_STACK,
/*
* .sp0 is only used when entering ring 0 from a lower
* privilege level. Since the init task never runs anything
* but ring 0 code, there is no need for a valid value here.
* Poison it.
*/
.sp0 = (1UL << (BITS_PER_LONG-1)) + 1,
#ifdef CONFIG_X86_64
/*
* .sp1 is cpu_current_top_of_stack. The init task never
* runs user code, but cpu_current_top_of_stack should still
* be well defined before the first context switch.
*/
.sp1 = TOP_OF_INIT_STACK,
#endif
#ifdef CONFIG_X86_32
.ss0 = __KERNEL_DS,
.ss1 = __KERNEL_CS,
@@ -65,11 +81,8 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
*/
.io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 },
#endif
#ifdef CONFIG_X86_32
.SYSENTER_stack_canary = STACK_END_MAGIC,
#endif
};
EXPORT_PER_CPU_SYMBOL(cpu_tss);
EXPORT_PER_CPU_SYMBOL(cpu_tss_rw);
DEFINE_PER_CPU(bool, __tss_limit_invalid);
EXPORT_PER_CPU_SYMBOL_GPL(__tss_limit_invalid);
@@ -98,7 +111,7 @@ void exit_thread(struct task_struct *tsk)
struct fpu *fpu = &t->fpu;
if (bp) {
struct tss_struct *tss = &per_cpu(cpu_tss, get_cpu());
struct tss_struct *tss = &per_cpu(cpu_tss_rw, get_cpu());
t->io_bitmap_ptr = NULL;
clear_thread_flag(TIF_IO_BITMAP);


@@ -234,7 +234,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
struct fpu *prev_fpu = &prev->fpu;
struct fpu *next_fpu = &next->fpu;
int cpu = smp_processor_id();
struct tss_struct *tss = &per_cpu(cpu_tss, cpu);
struct tss_struct *tss = &per_cpu(cpu_tss_rw, cpu);
/* never put a printk in __switch_to... printk() calls wake_up*() indirectly */
@@ -284,9 +284,11 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
/*
* Reload esp0 and cpu_current_top_of_stack. This changes
* current_thread_info().
* current_thread_info(). Refresh the SYSENTER configuration in
* case prev or next is vm86.
*/
load_sp0(tss, next);
update_sp0(next_p);
refresh_sysenter_cs(next);
this_cpu_write(cpu_current_top_of_stack,
(unsigned long)task_stack_page(next_p) +
THREAD_SIZE);


@@ -69,9 +69,8 @@ void __show_regs(struct pt_regs *regs, int all)
unsigned int fsindex, gsindex;
unsigned int ds, cs, es;
printk(KERN_DEFAULT "RIP: %04lx:%pS\n", regs->cs, (void *)regs->ip);
printk(KERN_DEFAULT "RSP: %04lx:%016lx EFLAGS: %08lx", regs->ss,
regs->sp, regs->flags);
show_iret_regs(regs);
if (regs->orig_ax != -1)
pr_cont(" ORIG_RAX: %016lx\n", regs->orig_ax);
else
@@ -88,6 +87,9 @@ void __show_regs(struct pt_regs *regs, int all)
printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n",
regs->r13, regs->r14, regs->r15);
if (!all)
return;
asm("movl %%ds,%0" : "=r" (ds));
asm("movl %%cs,%0" : "=r" (cs));
asm("movl %%es,%0" : "=r" (es));
@@ -98,9 +100,6 @@ void __show_regs(struct pt_regs *regs, int all)
rdmsrl(MSR_GS_BASE, gs);
rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
if (!all)
return;
cr0 = read_cr0();
cr2 = read_cr2();
cr3 = __read_cr3();
@@ -274,7 +273,6 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
struct inactive_task_frame *frame;
struct task_struct *me = current;
p->thread.sp0 = (unsigned long)task_stack_page(p) + THREAD_SIZE;
childregs = task_pt_regs(p);
fork_frame = container_of(childregs, struct fork_frame, regs);
frame = &fork_frame->frame;
@@ -401,7 +399,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
struct fpu *prev_fpu = &prev->fpu;
struct fpu *next_fpu = &next->fpu;
int cpu = smp_processor_id();
struct tss_struct *tss = &per_cpu(cpu_tss, cpu);
struct tss_struct *tss = &per_cpu(cpu_tss_rw, cpu);
WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) &&
this_cpu_read(irq_count) != -1);
@@ -463,9 +461,10 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
* Switch the PDA and FPU contexts.
*/
this_cpu_write(current_task, next_p);
this_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p));
/* Reload esp0 and ss1. This changes current_thread_info(). */
load_sp0(tss, next);
/* Reload sp0. */
update_sp0(next_p);
/*
* Now maybe reload the debug registers and handle I/O bitmaps


@@ -962,8 +962,7 @@ void common_cpu_up(unsigned int cpu, struct task_struct *idle)
#ifdef CONFIG_X86_32
/* Stack for startup_32 can be just as for start_secondary onwards */
irq_ctx_init(cpu);
per_cpu(cpu_current_top_of_stack, cpu) =
(unsigned long)task_stack_page(idle) + THREAD_SIZE;
per_cpu(cpu_current_top_of_stack, cpu) = task_top_of_stack(idle);
#else
initial_gs = per_cpu_offset(cpu);
#endif


@@ -141,8 +141,7 @@ void ist_begin_non_atomic(struct pt_regs *regs)
* will catch asm bugs and any attempt to use ist_preempt_enable
* from double_fault.
*/
BUG_ON((unsigned long)(current_top_of_stack() -
current_stack_pointer) >= THREAD_SIZE);
BUG_ON(!on_thread_stack());
preempt_enable_no_resched();
}
@@ -349,9 +348,15 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
/*
* If IRET takes a non-IST fault on the espfix64 stack, then we
* end up promoting it to a doublefault. In that case, modify
* the stack to make it look like we just entered the #GP
* handler from user space, similar to bad_iret.
* end up promoting it to a doublefault. In that case, take
* advantage of the fact that we're not using the normal (TSS.sp0)
* stack right now. We can write a fake #GP(0) frame at TSS.sp0
* and then modify our own IRET frame so that, when we return,
* we land directly at the #GP(0) vector with the stack already
* set up according to its expectations.
*
* The net result is that our #GP handler will think that we
* entered from usermode with the bad user context.
*
* No need for ist_enter here because we don't use RCU.
*/
@@ -359,13 +364,26 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
regs->cs == __KERNEL_CS &&
regs->ip == (unsigned long)native_irq_return_iret)
{
struct pt_regs *normal_regs = task_pt_regs(current);
struct pt_regs *gpregs = (struct pt_regs *)this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1;
/* Fake a #GP(0) from userspace. */
memmove(&normal_regs->ip, (void *)regs->sp, 5*8);
normal_regs->orig_ax = 0; /* Missing (lost) #GP error code */
/*
* regs->sp points to the failing IRET frame on the
* ESPFIX64 stack. Copy it to the entry stack. This fills
* in gpregs->ss through gpregs->ip.
*/
memmove(&gpregs->ip, (void *)regs->sp, 5*8);
gpregs->orig_ax = 0; /* Missing (lost) #GP error code */
/*
* Adjust our frame so that we return straight to the #GP
* vector with the expected RSP value. This is safe because
* we won't enable interrupts or schedule before we invoke
* general_protection, so nothing will clobber the stack
* frame we just set up.
*/
regs->ip = (unsigned long)general_protection;
regs->sp = (unsigned long)&normal_regs->orig_ax;
regs->sp = (unsigned long)&gpregs->orig_ax;
return;
}
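
The gpregs computation above relies on the entry-code convention that a task's full pt_regs frame sits in the bytes immediately below TSS.sp0, so (struct pt_regs *)sp0 - 1 addresses that frame. A standalone illustration of just the pointer arithmetic, using a cut-down pt_regs and an invented stack:

#include <stdio.h>

struct pt_regs { unsigned long ip, cs, flags, sp, ss; };	/* cut down */

int main(void)
{
	static unsigned char stack[4096];
	unsigned long sp0 = (unsigned long)&stack[sizeof(stack)];	/* stack top */
	struct pt_regs *gpregs = (struct pt_regs *)sp0 - 1;	/* frame just below */

	printf("sp0    = %#lx\n", sp0);
	printf("gpregs = %#lx (%lu bytes lower)\n",
	       (unsigned long)gpregs, sp0 - (unsigned long)gpregs);
	return 0;
}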
@@ -390,7 +408,7 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
*
* Processors update CR2 whenever a page fault is detected. If a
* second page fault occurs while an earlier page fault is being
* deliv- ered, the faulting linear address of the second fault will
* delivered, the faulting linear address of the second fault will
* overwrite the contents of CR2 (replacing the previous
* address). These updates to CR2 occur even if the page fault
* results in a double fault or occurs during the delivery of a
@@ -601,14 +619,15 @@ NOKPROBE_SYMBOL(do_int3);
#ifdef CONFIG_X86_64
/*
* Help handler running on IST stack to switch off the IST stack if the
* interrupted code was in user mode. The actual stack switch is done in
* entry_64.S
* Help handler running on a per-cpu (IST or entry trampoline) stack
* to switch to the normal thread stack if the interrupted code was in
* user mode. The actual stack switch is done in entry_64.S
*/
asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs)
{
struct pt_regs *regs = task_pt_regs(current);
*regs = *eregs;
struct pt_regs *regs = (struct pt_regs *)this_cpu_read(cpu_current_top_of_stack) - 1;
if (regs != eregs)
*regs = *eregs;
return regs;
}
NOKPROBE_SYMBOL(sync_regs);
@@ -624,13 +643,13 @@ struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s)
/*
* This is called from entry_64.S early in handling a fault
* caused by a bad iret to user mode. To handle the fault
* correctly, we want move our stack frame to task_pt_regs
* and we want to pretend that the exception came from the
* iret target.
* correctly, we want to move our stack frame to where it would
* be had we entered directly on the entry stack (rather than
* just below the IRET frame) and we want to pretend that the
* exception came from the IRET target.
*/
struct bad_iret_stack *new_stack =
container_of(task_pt_regs(current),
struct bad_iret_stack, regs);
(struct bad_iret_stack *)this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1;
/* Copy the IRET target to the new stack. */
memmove(&new_stack->regs.ip, (void *)s->regs.sp, 5*8);
@@ -795,14 +814,6 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
debug_stack_usage_dec();
exit:
#if defined(CONFIG_X86_32)
/*
* This is the most likely code path that involves non-trivial use
* of the SYSENTER stack. Check that we haven't overrun it.
*/
WARN(this_cpu_read(cpu_tss.SYSENTER_stack_canary) != STACK_END_MAGIC,
"Overran or corrupted SYSENTER stack\n");
#endif
ist_exit(regs);
}
NOKPROBE_SYMBOL(do_debug);
@@ -929,6 +940,9 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
void __init trap_init(void)
{
/* Init cpu_entry_area before IST entries are set up */
setup_cpu_entry_areas();
idt_setup_traps();
/*


@@ -253,22 +253,15 @@ unsigned long *unwind_get_return_address_ptr(struct unwind_state *state)
return NULL;
}
static bool stack_access_ok(struct unwind_state *state, unsigned long addr,
static bool stack_access_ok(struct unwind_state *state, unsigned long _addr,
size_t len)
{
struct stack_info *info = &state->stack_info;
void *addr = (void *)_addr;
/*
* If the address isn't on the current stack, switch to the next one.
*
* We may have to traverse multiple stacks to deal with the possibility
* that info->next_sp could point to an empty stack and the address
* could be on a subsequent stack.
*/
while (!on_stack(info, (void *)addr, len))
if (get_stack_info(info->next_sp, state->task, info,
&state->stack_mask))
return false;
if (!on_stack(info, addr, len) &&
(get_stack_info(addr, state->task, info, &state->stack_mask)))
return false;
return true;
}
@@ -283,42 +276,32 @@ static bool deref_stack_reg(struct unwind_state *state, unsigned long addr,
return true;
}
#define REGS_SIZE (sizeof(struct pt_regs))
#define SP_OFFSET (offsetof(struct pt_regs, sp))
#define IRET_REGS_SIZE (REGS_SIZE - offsetof(struct pt_regs, ip))
#define IRET_SP_OFFSET (SP_OFFSET - offsetof(struct pt_regs, ip))
static bool deref_stack_regs(struct unwind_state *state, unsigned long addr,
unsigned long *ip, unsigned long *sp, bool full)
unsigned long *ip, unsigned long *sp)
{
size_t regs_size = full ? REGS_SIZE : IRET_REGS_SIZE;
size_t sp_offset = full ? SP_OFFSET : IRET_SP_OFFSET;
struct pt_regs *regs = (struct pt_regs *)(addr + regs_size - REGS_SIZE);
struct pt_regs *regs = (struct pt_regs *)addr;
if (IS_ENABLED(CONFIG_X86_64)) {
if (!stack_access_ok(state, addr, regs_size))
return false;
/* x86-32 support will be more complicated due to the &regs->sp hack */
BUILD_BUG_ON(IS_ENABLED(CONFIG_X86_32));
*ip = regs->ip;
*sp = regs->sp;
return true;
}
if (!stack_access_ok(state, addr, sp_offset))
if (!stack_access_ok(state, addr, sizeof(struct pt_regs)))
return false;
*ip = regs->ip;
*sp = regs->sp;
return true;
}
if (user_mode(regs)) {
if (!stack_access_ok(state, addr + sp_offset,
REGS_SIZE - SP_OFFSET))
return false;
static bool deref_stack_iret_regs(struct unwind_state *state, unsigned long addr,
unsigned long *ip, unsigned long *sp)
{
struct pt_regs *regs = (void *)addr - IRET_FRAME_OFFSET;
*sp = regs->sp;
} else
*sp = (unsigned long)&regs->sp;
if (!stack_access_ok(state, addr, IRET_FRAME_SIZE))
return false;
*ip = regs->ip;
*sp = regs->sp;
return true;
}
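
deref_stack_iret_regs() above recovers a pt_regs pointer from a bare iret frame: the hardware pushes only the final five fields (ip through ss), so subtracting IRET_FRAME_OFFSET from the frame address yields a struct base whose tail is valid. A layout check with a cut-down struct; the two leading fields are stand-ins, since the real pt_regs has many more.

#include <stdio.h>
#include <stddef.h>

/* Cut-down pt_regs: two stand-in leading fields, then the hardware iret tail */
struct pt_regs { unsigned long bx, cx, ip, cs, flags, sp, ss; };

#define IRET_FRAME_OFFSET offsetof(struct pt_regs, ip)
#define IRET_FRAME_SIZE   (sizeof(struct pt_regs) - IRET_FRAME_OFFSET)

int main(void)
{
	/* buf[2..6] plays the iret frame that the unwinder located */
	unsigned long buf[7] = { 0, 0, 0x1111, 0x10, 0x246, 0x2222, 0x18 };
	char *frame = (char *)&buf[2];
	struct pt_regs *regs = (struct pt_regs *)(frame - IRET_FRAME_OFFSET);

	printf("iret frame spans %zu bytes\n", IRET_FRAME_SIZE);	/* 40 */
	printf("ip = %#lx, sp = %#lx\n", regs->ip, regs->sp);	/* 0x1111, 0x2222 */
	return 0;
}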
@@ -327,7 +310,6 @@ bool unwind_next_frame(struct unwind_state *state)
unsigned long ip_p, sp, orig_ip, prev_sp = state->sp;
enum stack_type prev_type = state->stack_info.type;
struct orc_entry *orc;
struct pt_regs *ptregs;
bool indirect = false;
if (unwind_done(state))
@@ -435,7 +417,7 @@ bool unwind_next_frame(struct unwind_state *state)
break;
case ORC_TYPE_REGS:
if (!deref_stack_regs(state, sp, &state->ip, &state->sp, true)) {
if (!deref_stack_regs(state, sp, &state->ip, &state->sp)) {
orc_warn("can't dereference registers at %p for ip %pB\n",
(void *)sp, (void *)orig_ip);
goto done;
@@ -447,20 +429,14 @@ bool unwind_next_frame(struct unwind_state *state)
break;
case ORC_TYPE_REGS_IRET:
if (!deref_stack_regs(state, sp, &state->ip, &state->sp, false)) {
if (!deref_stack_iret_regs(state, sp, &state->ip, &state->sp)) {
orc_warn("can't dereference iret registers at %p for ip %pB\n",
(void *)sp, (void *)orig_ip);
goto done;
}
ptregs = container_of((void *)sp, struct pt_regs, ip);
if ((unsigned long)ptregs >= prev_sp &&
on_stack(&state->stack_info, ptregs, REGS_SIZE)) {
state->regs = ptregs;
state->full_regs = false;
} else
state->regs = NULL;
state->regs = (void *)sp - IRET_FRAME_OFFSET;
state->full_regs = false;
state->signal = true;
break;
@@ -553,8 +529,18 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task,
}
if (get_stack_info((unsigned long *)state->sp, state->task,
&state->stack_info, &state->stack_mask))
return;
&state->stack_info, &state->stack_mask)) {
/*
* We weren't on a valid stack. It's possible that
* we overflowed a valid stack into a guard page.
* See if the next page up is valid so that we can
* generate some kind of backtrace if this happens.
*/
void *next_page = (void *)PAGE_ALIGN((unsigned long)state->sp);
if (get_stack_info(next_page, state->task, &state->stack_info,
&state->stack_mask))
return;
}
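
This fallback, like the matching one added to show_trace_log_lvl(), retries one page up on the theory that an overflow ran past the stack into its guard page. PAGE_ALIGN rounds an address up to the next page boundary; a quick check of that arithmetic, assuming 4 KiB pages and an invented 64-bit sample address:

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define PAGE_ALIGN(x) (((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

int main(void)
{
	unsigned long sp = 0xffffc90000003f28UL;	/* invented guard-page address */

	/* One page up: the lowest address of the next, possibly valid, page */
	printf("%#lx -> %#lx\n", sp, PAGE_ALIGN(sp));	/* ...3f28 -> ...4000 */
	return 0;
}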
/*
* The caller can provide the address of the first frame directly


@@ -33,7 +33,7 @@
#include <asm/cpufeatures.h>
#include <asm/msr-index.h>
verify_cpu:
ENTRY(verify_cpu)
pushf # Save caller passed flags
push $0 # Kill any dangerous flags
popf
@@ -139,3 +139,4 @@ verify_cpu:
popf # Restore caller passed flags
xorl %eax, %eax
ret
ENDPROC(verify_cpu)


@@ -55,6 +55,7 @@
#include <asm/irq.h>
#include <asm/traps.h>
#include <asm/vm86.h>
#include <asm/switch_to.h>
/*
* Known problems:
@@ -94,7 +95,6 @@
void save_v86_state(struct kernel_vm86_regs *regs, int retval)
{
struct tss_struct *tss;
struct task_struct *tsk = current;
struct vm86plus_struct __user *user;
struct vm86 *vm86 = current->thread.vm86;
@@ -146,12 +146,13 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval)
do_exit(SIGSEGV);
}
tss = &per_cpu(cpu_tss, get_cpu());
preempt_disable();
tsk->thread.sp0 = vm86->saved_sp0;
tsk->thread.sysenter_cs = __KERNEL_CS;
load_sp0(tss, &tsk->thread);
update_sp0(tsk);
refresh_sysenter_cs(&tsk->thread);
vm86->saved_sp0 = 0;
put_cpu();
preempt_enable();
memcpy(&regs->pt, &vm86->regs32, sizeof(struct pt_regs));
@@ -237,7 +238,6 @@ SYSCALL_DEFINE2(vm86, unsigned long, cmd, unsigned long, arg)
static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus)
{
struct tss_struct *tss;
struct task_struct *tsk = current;
struct vm86 *vm86 = tsk->thread.vm86;
struct kernel_vm86_regs vm86regs;
@@ -365,15 +365,17 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus)
vm86->saved_sp0 = tsk->thread.sp0;
lazy_save_gs(vm86->regs32.gs);
tss = &per_cpu(cpu_tss, get_cpu());
/* make room for real-mode segments */
preempt_disable();
tsk->thread.sp0 += 16;
if (static_cpu_has(X86_FEATURE_SEP))
if (static_cpu_has(X86_FEATURE_SEP)) {
tsk->thread.sysenter_cs = 0;
refresh_sysenter_cs(&tsk->thread);
}
load_sp0(tss, &tsk->thread);
put_cpu();
update_sp0(tsk);
preempt_enable();
if (vm86->flags & VM86_SCREEN_BITMAP)
mark_screen_rdonly(tsk->mm);


@@ -107,6 +107,15 @@ SECTIONS
SOFTIRQENTRY_TEXT
*(.fixup)
*(.gnu.warning)
#ifdef CONFIG_X86_64
. = ALIGN(PAGE_SIZE);
_entry_trampoline = .;
*(.entry_trampoline)
. = ALIGN(PAGE_SIZE);
ASSERT(. - _entry_trampoline == PAGE_SIZE, "entry trampoline is too big");
#endif
/* End of text section */
_etext = .;
} :text = 0x9090


@@ -28,6 +28,8 @@ void x86_init_noop(void) { }
void __init x86_init_uint_noop(unsigned int unused) { }
int __init iommu_init_noop(void) { return 0; }
void iommu_shutdown_noop(void) { }
bool __init bool_x86_init_noop(void) { return false; }
void x86_op_int_noop(int cpu) { }
/*
* The platform setup functions are preset with the default functions
@@ -81,6 +83,12 @@ struct x86_init_ops x86_init __initdata = {
.init_irq = x86_default_pci_init_irq,
.fixup_irqs = x86_default_pci_fixup_irqs,
},
.hyper = {
.init_platform = x86_init_noop,
.x2apic_available = bool_x86_init_noop,
.init_mem_mapping = x86_init_noop,
},
};
struct x86_cpuinit_ops x86_cpuinit = {
@@ -101,6 +109,7 @@ struct x86_platform_ops x86_platform __ro_after_init = {
.get_nmi_reason = default_get_nmi_reason,
.save_sched_clock_state = tsc_save_sched_clock_state,
.restore_sched_clock_state = tsc_restore_sched_clock_state,
.hyper.pin_vcpu = x86_op_int_noop,
};
EXPORT_SYMBOL_GPL(x86_platform);


@@ -5476,13 +5476,13 @@ int kvm_mmu_module_init(void)
pte_list_desc_cache = kmem_cache_create("pte_list_desc",
sizeof(struct pte_list_desc),
0, 0, NULL);
0, SLAB_ACCOUNT, NULL);
if (!pte_list_desc_cache)
goto nomem;
mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
sizeof(struct kvm_mmu_page),
0, 0, NULL);
0, SLAB_ACCOUNT, NULL);
if (!mmu_page_header_cache)
goto nomem;


@@ -2295,7 +2295,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
* processors. See 22.2.4.
*/
vmcs_writel(HOST_TR_BASE,
(unsigned long)this_cpu_ptr(&cpu_tss));
(unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss);
vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt); /* 22.2.4 */
/*


@@ -107,10 +107,10 @@ static void delay_mwaitx(unsigned long __loops)
delay = min_t(u64, MWAITX_MAX_LOOPS, loops);
/*
* Use cpu_tss as a cacheline-aligned, seldomly
* Use cpu_tss_rw as a cacheline-aligned, seldomly
* accessed per-cpu variable as the monitor target.
*/
__monitorx(raw_cpu_ptr(&cpu_tss), 0, 0);
__monitorx(raw_cpu_ptr(&cpu_tss_rw), 0, 0);
/*
* AMD, like Intel, supports the EAX hint and EAX=0xf


@@ -29,26 +29,6 @@
#define CREATE_TRACE_POINTS
#include <asm/trace/exceptions.h>
/*
* Page fault error code bits:
*
* bit 0 == 0: no page found 1: protection fault
* bit 1 == 0: read access 1: write access
* bit 2 == 0: kernel-mode access 1: user-mode access
* bit 3 == 1: use of reserved bit detected
* bit 4 == 1: fault was an instruction fetch
* bit 5 == 1: protection keys block access
*/
enum x86_pf_error_code {
PF_PROT = 1 << 0,
PF_WRITE = 1 << 1,
PF_USER = 1 << 2,
PF_RSVD = 1 << 3,
PF_INSTR = 1 << 4,
PF_PK = 1 << 5,
};
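
This enum and its comment move out of fault.c; every hunk below renames the users to the X86_PF_* spellings. Decoding an error code stays simple bit testing, as in this runnable sketch (the bit values mirror the enum above; the sample error code is invented):

#include <stdio.h>

#define X86_PF_PROT	(1UL << 0)
#define X86_PF_WRITE	(1UL << 1)
#define X86_PF_USER	(1UL << 2)
#define X86_PF_RSVD	(1UL << 3)
#define X86_PF_INSTR	(1UL << 4)
#define X86_PF_PK	(1UL << 5)

int main(void)
{
	unsigned long error_code = X86_PF_PROT | X86_PF_WRITE | X86_PF_USER;

	printf("%s fault, %s access, %s mode\n",
	       error_code & X86_PF_PROT  ? "protection" : "not-present",
	       error_code & X86_PF_WRITE ? "write"      : "read/fetch",
	       error_code & X86_PF_USER  ? "user"       : "kernel");
	return 0;
}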
/*
* Returns 0 if mmiotrace is disabled, or if the fault is not
* handled by mmiotrace:
@@ -150,7 +130,7 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
* If it was a exec (instruction fetch) fault on NX page, then
* do not ignore the fault:
*/
if (error_code & PF_INSTR)
if (error_code & X86_PF_INSTR)
return 0;
instr = (void *)convert_ip_to_linear(current, regs);
@@ -180,7 +160,7 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
* siginfo so userspace can discover which protection key was set
* on the PTE.
*
* If we get here, we know that the hardware signaled a PF_PK
* If we get here, we know that the hardware signaled a X86_PF_PK
* fault and that there was a VMA once we got in the fault
* handler. It does *not* guarantee that the VMA we find here
* was the one that we faulted on.
@@ -205,7 +185,7 @@ static void fill_sig_info_pkey(int si_code, siginfo_t *info, u32 *pkey)
/*
* force_sig_info_fault() is called from a number of
* contexts, some of which have a VMA and some of which
* do not. The PF_PK handing happens after we have a
* do not. The X86_PF_PK handling happens after we have a
* valid VMA, so we should never reach this without a
* valid VMA.
*/
@@ -698,7 +678,7 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code,
if (!oops_may_print())
return;
if (error_code & PF_INSTR) {
if (error_code & X86_PF_INSTR) {
unsigned int level;
pgd_t *pgd;
pte_t *pte;
@@ -780,7 +760,7 @@ no_context(struct pt_regs *regs, unsigned long error_code,
*/
if (current->thread.sig_on_uaccess_err && signal) {
tsk->thread.trap_nr = X86_TRAP_PF;
tsk->thread.error_code = error_code | PF_USER;
tsk->thread.error_code = error_code | X86_PF_USER;
tsk->thread.cr2 = address;
/* XXX: hwpoison faults will set the wrong code. */
@@ -898,7 +878,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
struct task_struct *tsk = current;
/* User mode accesses just cause a SIGSEGV */
if (error_code & PF_USER) {
if (error_code & X86_PF_USER) {
/*
* It's possible to have interrupts off here:
*/
@@ -919,7 +899,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
* Instruction fetch faults in the vsyscall page might need
* emulation.
*/
if (unlikely((error_code & PF_INSTR) &&
if (unlikely((error_code & X86_PF_INSTR) &&
((address & ~0xfff) == VSYSCALL_ADDR))) {
if (emulate_vsyscall(regs, address))
return;
@@ -932,7 +912,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
* are always protection faults.
*/
if (address >= TASK_SIZE_MAX)
error_code |= PF_PROT;
error_code |= X86_PF_PROT;
if (likely(show_unhandled_signals))
show_signal_msg(regs, error_code, address, tsk);
@@ -993,11 +973,11 @@ static inline bool bad_area_access_from_pkeys(unsigned long error_code,
if (!boot_cpu_has(X86_FEATURE_OSPKE))
return false;
if (error_code & PF_PK)
if (error_code & X86_PF_PK)
return true;
/* this checks permission keys on the VMA: */
if (!arch_vma_access_permitted(vma, (error_code & PF_WRITE),
(error_code & PF_INSTR), foreign))
if (!arch_vma_access_permitted(vma, (error_code & X86_PF_WRITE),
(error_code & X86_PF_INSTR), foreign))
return true;
return false;
}
@@ -1025,7 +1005,7 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
int code = BUS_ADRERR;
/* Kernel mode? Handle exceptions or die: */
if (!(error_code & PF_USER)) {
if (!(error_code & X86_PF_USER)) {
no_context(regs, error_code, address, SIGBUS, BUS_ADRERR);
return;
}
@@ -1053,14 +1033,14 @@ static noinline void
mm_fault_error(struct pt_regs *regs, unsigned long error_code,
unsigned long address, u32 *pkey, unsigned int fault)
{
if (fatal_signal_pending(current) && !(error_code & PF_USER)) {
if (fatal_signal_pending(current) && !(error_code & X86_PF_USER)) {
no_context(regs, error_code, address, 0, 0);
return;
}
if (fault & VM_FAULT_OOM) {
/* Kernel mode? Handle exceptions or die: */
if (!(error_code & PF_USER)) {
if (!(error_code & X86_PF_USER)) {
no_context(regs, error_code, address,
SIGSEGV, SEGV_MAPERR);
return;
@@ -1085,16 +1065,16 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
static int spurious_fault_check(unsigned long error_code, pte_t *pte)
{
if ((error_code & PF_WRITE) && !pte_write(*pte))
if ((error_code & X86_PF_WRITE) && !pte_write(*pte))
return 0;
if ((error_code & PF_INSTR) && !pte_exec(*pte))
if ((error_code & X86_PF_INSTR) && !pte_exec(*pte))
return 0;
/*
* Note: We do not do lazy flushing on protection key
* changes, so no spurious fault will ever set PF_PK.
* changes, so no spurious fault will ever set X86_PF_PK.
*/
if ((error_code & PF_PK))
if ((error_code & X86_PF_PK))
return 1;
return 1;
@@ -1140,8 +1120,8 @@ spurious_fault(unsigned long error_code, unsigned long address)
* change, so user accesses are not expected to cause spurious
* faults.
*/
if (error_code != (PF_WRITE | PF_PROT)
&& error_code != (PF_INSTR | PF_PROT))
if (error_code != (X86_PF_WRITE | X86_PF_PROT) &&
error_code != (X86_PF_INSTR | X86_PF_PROT))
return 0;
pgd = init_mm.pgd + pgd_index(address);
@@ -1201,19 +1181,19 @@ access_error(unsigned long error_code, struct vm_area_struct *vma)
* always an unconditional error and can never result in
* a follow-up action to resolve the fault, like a COW.
*/
if (error_code & PF_PK)
if (error_code & X86_PF_PK)
return 1;
/*
* Make sure to check the VMA so that we do not perform
* faults just to hit a PF_PK as soon as we fill in a
* faults just to hit an X86_PF_PK as soon as we fill in a
* page.
*/
if (!arch_vma_access_permitted(vma, (error_code & PF_WRITE),
(error_code & PF_INSTR), foreign))
if (!arch_vma_access_permitted(vma, (error_code & X86_PF_WRITE),
(error_code & X86_PF_INSTR), foreign))
return 1;
if (error_code & PF_WRITE) {
if (error_code & X86_PF_WRITE) {
/* write, present and write, not present: */
if (unlikely(!(vma->vm_flags & VM_WRITE)))
return 1;
@@ -1221,7 +1201,7 @@ access_error(unsigned long error_code, struct vm_area_struct *vma)
}
/* read, present: */
if (unlikely(error_code & PF_PROT))
if (unlikely(error_code & X86_PF_PROT))
return 1;
/* read, not present: */
@@ -1244,7 +1224,7 @@ static inline bool smap_violation(int error_code, struct pt_regs *regs)
if (!static_cpu_has(X86_FEATURE_SMAP))
return false;
if (error_code & PF_USER)
if (error_code & X86_PF_USER)
return false;
if (!user_mode(regs) && (regs->flags & X86_EFLAGS_AC))
@@ -1297,7 +1277,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
* protection error (error_code & 9) == 0.
*/
if (unlikely(fault_in_kernel_space(address))) {
if (!(error_code & (PF_RSVD | PF_USER | PF_PROT))) {
if (!(error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) {
if (vmalloc_fault(address) >= 0)
return;
@@ -1325,7 +1305,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
if (unlikely(kprobes_fault(regs)))
return;
if (unlikely(error_code & PF_RSVD))
if (unlikely(error_code & X86_PF_RSVD))
pgtable_bad(regs, error_code, address);
if (unlikely(smap_violation(error_code, regs))) {
@@ -1351,7 +1331,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
*/
if (user_mode(regs)) {
local_irq_enable();
error_code |= PF_USER;
error_code |= X86_PF_USER;
flags |= FAULT_FLAG_USER;
} else {
if (regs->flags & X86_EFLAGS_IF)
@@ -1360,9 +1340,9 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
if (error_code & PF_WRITE)
if (error_code & X86_PF_WRITE)
flags |= FAULT_FLAG_WRITE;
if (error_code & PF_INSTR)
if (error_code & X86_PF_INSTR)
flags |= FAULT_FLAG_INSTRUCTION;
/*
@@ -1382,7 +1362,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
* space check, thus avoiding the deadlock:
*/
if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
if ((error_code & PF_USER) == 0 &&
if (!(error_code & X86_PF_USER) &&
!search_exception_tables(regs->ip)) {
bad_area_nosemaphore(regs, error_code, address, NULL);
return;
@@ -1409,7 +1389,7 @@ retry:
bad_area(regs, error_code, address);
return;
}
if (error_code & PF_USER) {
if (error_code & X86_PF_USER) {
/*
* Accessing the stack below %sp is always a bug.
* The large cushion allows instructions like enter


@@ -671,7 +671,7 @@ void __init init_mem_mapping(void)
load_cr3(swapper_pg_dir);
__flush_tlb_all();
hypervisor_init_mem_mapping();
x86_init.hyper.init_mem_mapping();
early_memtest(0, max_pfn_mapped << PAGE_SHIFT);
}


@@ -1426,16 +1426,16 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node)
#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) && defined(CONFIG_HAVE_BOOTMEM_INFO_NODE)
void register_page_bootmem_memmap(unsigned long section_nr,
struct page *start_page, unsigned long size)
struct page *start_page, unsigned long nr_pages)
{
unsigned long addr = (unsigned long)start_page;
unsigned long end = (unsigned long)(start_page + size);
unsigned long end = (unsigned long)(start_page + nr_pages);
unsigned long next;
pgd_t *pgd;
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
unsigned int nr_pages;
unsigned int nr_pmd_pages;
struct page *page;
for (; addr < end; addr = next) {
@@ -1482,9 +1482,9 @@ void register_page_bootmem_memmap(unsigned long section_nr,
if (pmd_none(*pmd))
continue;
nr_pages = 1 << (get_order(PMD_SIZE));
nr_pmd_pages = 1 << get_order(PMD_SIZE);
page = pmd_page(*pmd);
while (nr_pages--)
while (nr_pmd_pages--)
get_page_bootmem(section_nr, page++,
SECTION_INFO);
}


@@ -4,19 +4,150 @@
#include <linux/bootmem.h>
#include <linux/kasan.h>
#include <linux/kdebug.h>
#include <linux/memblock.h>
#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/sched/task.h>
#include <linux/vmalloc.h>
#include <asm/e820/types.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include <asm/sections.h>
#include <asm/pgtable.h>
extern struct range pfn_mapped[E820_MAX_ENTRIES];
static int __init map_range(struct range *range)
static p4d_t tmp_p4d_table[PTRS_PER_P4D] __initdata __aligned(PAGE_SIZE);
static __init void *early_alloc(size_t size, int nid)
{
return memblock_virt_alloc_try_nid_nopanic(size, size,
__pa(MAX_DMA_ADDRESS), BOOTMEM_ALLOC_ACCESSIBLE, nid);
}
static void __init kasan_populate_pmd(pmd_t *pmd, unsigned long addr,
unsigned long end, int nid)
{
pte_t *pte;
if (pmd_none(*pmd)) {
void *p;
if (boot_cpu_has(X86_FEATURE_PSE) &&
((end - addr) == PMD_SIZE) &&
IS_ALIGNED(addr, PMD_SIZE)) {
p = early_alloc(PMD_SIZE, nid);
if (p && pmd_set_huge(pmd, __pa(p), PAGE_KERNEL))
return;
else if (p)
memblock_free(__pa(p), PMD_SIZE);
}
p = early_alloc(PAGE_SIZE, nid);
pmd_populate_kernel(&init_mm, pmd, p);
}
pte = pte_offset_kernel(pmd, addr);
do {
pte_t entry;
void *p;
if (!pte_none(*pte))
continue;
p = early_alloc(PAGE_SIZE, nid);
entry = pfn_pte(PFN_DOWN(__pa(p)), PAGE_KERNEL);
set_pte_at(&init_mm, addr, pte, entry);
} while (pte++, addr += PAGE_SIZE, addr != end);
}
static void __init kasan_populate_pud(pud_t *pud, unsigned long addr,
unsigned long end, int nid)
{
pmd_t *pmd;
unsigned long next;
if (pud_none(*pud)) {
void *p;
if (boot_cpu_has(X86_FEATURE_GBPAGES) &&
((end - addr) == PUD_SIZE) &&
IS_ALIGNED(addr, PUD_SIZE)) {
p = early_alloc(PUD_SIZE, nid);
if (p && pud_set_huge(pud, __pa(p), PAGE_KERNEL))
return;
else if (p)
memblock_free(__pa(p), PUD_SIZE);
}
p = early_alloc(PAGE_SIZE, nid);
pud_populate(&init_mm, pud, p);
}
pmd = pmd_offset(pud, addr);
do {
next = pmd_addr_end(addr, end);
if (!pmd_large(*pmd))
kasan_populate_pmd(pmd, addr, next, nid);
} while (pmd++, addr = next, addr != end);
}
static void __init kasan_populate_p4d(p4d_t *p4d, unsigned long addr,
unsigned long end, int nid)
{
pud_t *pud;
unsigned long next;
if (p4d_none(*p4d)) {
void *p = early_alloc(PAGE_SIZE, nid);
p4d_populate(&init_mm, p4d, p);
}
pud = pud_offset(p4d, addr);
do {
next = pud_addr_end(addr, end);
if (!pud_large(*pud))
kasan_populate_pud(pud, addr, next, nid);
} while (pud++, addr = next, addr != end);
}
static void __init kasan_populate_pgd(pgd_t *pgd, unsigned long addr,
unsigned long end, int nid)
{
void *p;
p4d_t *p4d;
unsigned long next;
if (pgd_none(*pgd)) {
p = early_alloc(PAGE_SIZE, nid);
pgd_populate(&init_mm, pgd, p);
}
p4d = p4d_offset(pgd, addr);
do {
next = p4d_addr_end(addr, end);
kasan_populate_p4d(p4d, addr, next, nid);
} while (p4d++, addr = next, addr != end);
}
static void __init kasan_populate_shadow(unsigned long addr, unsigned long end,
int nid)
{
pgd_t *pgd;
unsigned long next;
addr = addr & PAGE_MASK;
end = round_up(end, PAGE_SIZE);
pgd = pgd_offset_k(addr);
do {
next = pgd_addr_end(addr, end);
kasan_populate_pgd(pgd, addr, next, nid);
} while (pgd++, addr = next, addr != end);
}
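
All four kasan_populate_*() levels above share one walk idiom: clamp next to the smaller of the next level boundary and end, populate [addr, next), then advance both the table pointer and addr. A flattened single-level sketch of that idiom; the level size and the sample range are invented.

#include <stdio.h>

#define LEVEL_SIZE 0x1000UL	/* stand-in for PMD_SIZE, PUD_SIZE, ... */

static unsigned long addr_end(unsigned long addr, unsigned long end)
{
	unsigned long boundary = (addr + LEVEL_SIZE) & ~(LEVEL_SIZE - 1);
	return boundary < end ? boundary : end;	/* as pmd_addr_end() does */
}

int main(void)
{
	unsigned long addr = 0x2800, end = 0x5200, next;

	do {
		next = addr_end(addr, end);
		printf("populate [%#lx, %#lx)\n", addr, next);
		addr = next;
	} while (addr != end);
	return 0;
}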
static void __init map_range(struct range *range)
{
unsigned long start;
unsigned long end;
@@ -24,15 +155,17 @@ static int __init map_range(struct range *range)
start = (unsigned long)kasan_mem_to_shadow(pfn_to_kaddr(range->start));
end = (unsigned long)kasan_mem_to_shadow(pfn_to_kaddr(range->end));
return vmemmap_populate(start, end, NUMA_NO_NODE);
kasan_populate_shadow(start, end, early_pfn_to_nid(range->start));
}
static void __init clear_pgds(unsigned long start,
unsigned long end)
{
pgd_t *pgd;
/* See comment in kasan_init() */
unsigned long pgd_end = end & PGDIR_MASK;
for (; start < end; start += PGDIR_SIZE) {
for (; start < pgd_end; start += PGDIR_SIZE) {
pgd = pgd_offset_k(start);
/*
* With folded p4d, pgd_clear() is nop, use p4d_clear()
@@ -43,29 +176,61 @@ static void __init clear_pgds(unsigned long start,
else
pgd_clear(pgd);
}
pgd = pgd_offset_k(start);
for (; start < end; start += P4D_SIZE)
p4d_clear(p4d_offset(pgd, start));
}
static inline p4d_t *early_p4d_offset(pgd_t *pgd, unsigned long addr)
{
unsigned long p4d;
if (!IS_ENABLED(CONFIG_X86_5LEVEL))
return (p4d_t *)pgd;
p4d = __pa_nodebug(pgd_val(*pgd)) & PTE_PFN_MASK;
p4d += __START_KERNEL_map - phys_base;
return (p4d_t *)p4d + p4d_index(addr);
}
static void __init kasan_early_p4d_populate(pgd_t *pgd,
unsigned long addr,
unsigned long end)
{
pgd_t pgd_entry;
p4d_t *p4d, p4d_entry;
unsigned long next;
if (pgd_none(*pgd)) {
pgd_entry = __pgd(_KERNPG_TABLE | __pa_nodebug(kasan_zero_p4d));
set_pgd(pgd, pgd_entry);
}
p4d = early_p4d_offset(pgd, addr);
do {
next = p4d_addr_end(addr, end);
if (!p4d_none(*p4d))
continue;
p4d_entry = __p4d(_KERNPG_TABLE | __pa_nodebug(kasan_zero_pud));
set_p4d(p4d, p4d_entry);
} while (p4d++, addr = next, addr != end && p4d_none(*p4d));
}
static void __init kasan_map_early_shadow(pgd_t *pgd)
{
int i;
unsigned long start = KASAN_SHADOW_START;
/* See comment in kasan_init() */
unsigned long addr = KASAN_SHADOW_START & PGDIR_MASK;
unsigned long end = KASAN_SHADOW_END;
unsigned long next;
for (i = pgd_index(start); start < end; i++) {
switch (CONFIG_PGTABLE_LEVELS) {
case 4:
pgd[i] = __pgd(__pa_nodebug(kasan_zero_pud) |
_KERNPG_TABLE);
break;
case 5:
pgd[i] = __pgd(__pa_nodebug(kasan_zero_p4d) |
_KERNPG_TABLE);
break;
default:
BUILD_BUG();
}
start += PGDIR_SIZE;
}
pgd += pgd_index(addr);
do {
next = pgd_addr_end(addr, end);
kasan_early_p4d_populate(pgd, addr, next);
} while (pgd++, addr = next, addr != end);
}
#ifdef CONFIG_KASAN_INLINE
@@ -102,7 +267,7 @@ void __init kasan_early_init(void)
for (i = 0; i < PTRS_PER_PUD; i++)
kasan_zero_pud[i] = __pud(pud_val);
for (i = 0; CONFIG_PGTABLE_LEVELS >= 5 && i < PTRS_PER_P4D; i++)
for (i = 0; IS_ENABLED(CONFIG_X86_5LEVEL) && i < PTRS_PER_P4D; i++)
kasan_zero_p4d[i] = __p4d(p4d_val);
kasan_map_early_shadow(early_top_pgt);
@@ -112,37 +277,76 @@ void __init kasan_early_init(void)
void __init kasan_init(void)
{
int i;
void *shadow_cpu_entry_begin, *shadow_cpu_entry_end;
#ifdef CONFIG_KASAN_INLINE
register_die_notifier(&kasan_die_notifier);
#endif
memcpy(early_top_pgt, init_top_pgt, sizeof(early_top_pgt));
/*
* We use the same shadow offset for 4- and 5-level paging to
* facilitate boot-time switching between paging modes.
* As a result, in 5-level paging mode KASAN_SHADOW_START and
* KASAN_SHADOW_END are not aligned to PGD boundary.
*
* KASAN_SHADOW_START doesn't share PGD with anything else.
* We claim whole PGD entry to make things easier.
*
* KASAN_SHADOW_END lands in the last PGD entry and it collides with
* a bunch of things like kernel code, modules, EFI mapping, etc.
* We need to take extra steps to not overwrite them.
*/
if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
void *ptr;
ptr = (void *)pgd_page_vaddr(*pgd_offset_k(KASAN_SHADOW_END));
memcpy(tmp_p4d_table, (void *)ptr, sizeof(tmp_p4d_table));
set_pgd(&early_top_pgt[pgd_index(KASAN_SHADOW_END)],
__pgd(__pa(tmp_p4d_table) | _KERNPG_TABLE));
}
load_cr3(early_top_pgt);
__flush_tlb_all();
clear_pgds(KASAN_SHADOW_START, KASAN_SHADOW_END);
clear_pgds(KASAN_SHADOW_START & PGDIR_MASK, KASAN_SHADOW_END);
kasan_populate_zero_shadow((void *)KASAN_SHADOW_START,
kasan_populate_zero_shadow((void *)(KASAN_SHADOW_START & PGDIR_MASK),
kasan_mem_to_shadow((void *)PAGE_OFFSET));
for (i = 0; i < E820_MAX_ENTRIES; i++) {
if (pfn_mapped[i].end == 0)
break;
if (map_range(&pfn_mapped[i]))
panic("kasan: unable to allocate shadow!");
map_range(&pfn_mapped[i]);
}
kasan_populate_zero_shadow(
kasan_mem_to_shadow((void *)PAGE_OFFSET + MAXMEM),
kasan_mem_to_shadow((void *)__START_KERNEL_map));
vmemmap_populate((unsigned long)kasan_mem_to_shadow(_stext),
(unsigned long)kasan_mem_to_shadow(_end),
NUMA_NO_NODE);
kasan_populate_shadow((unsigned long)kasan_mem_to_shadow(_stext),
(unsigned long)kasan_mem_to_shadow(_end),
early_pfn_to_nid(__pa(_stext)));
shadow_cpu_entry_begin = (void *)__fix_to_virt(FIX_CPU_ENTRY_AREA_BOTTOM);
shadow_cpu_entry_begin = kasan_mem_to_shadow(shadow_cpu_entry_begin);
shadow_cpu_entry_begin = (void *)round_down((unsigned long)shadow_cpu_entry_begin,
PAGE_SIZE);
shadow_cpu_entry_end = (void *)(__fix_to_virt(FIX_CPU_ENTRY_AREA_TOP) + PAGE_SIZE);
shadow_cpu_entry_end = kasan_mem_to_shadow(shadow_cpu_entry_end);
shadow_cpu_entry_end = (void *)round_up((unsigned long)shadow_cpu_entry_end,
PAGE_SIZE);
kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END),
(void *)KASAN_SHADOW_END);
shadow_cpu_entry_begin);
kasan_populate_shadow((unsigned long)shadow_cpu_entry_begin,
(unsigned long)shadow_cpu_entry_end, 0);
kasan_populate_zero_shadow(shadow_cpu_entry_end, (void *)KASAN_SHADOW_END);
load_cr3(init_top_pgt);
__flush_tlb_all();


@@ -160,17 +160,19 @@ static void do_fpu_end(void)
static void fix_processor_context(void)
{
int cpu = smp_processor_id();
struct tss_struct *t = &per_cpu(cpu_tss, cpu);
#ifdef CONFIG_X86_64
struct desc_struct *desc = get_cpu_gdt_rw(cpu);
tss_desc tss;
#endif
set_tss_desc(cpu, t); /*
* This just modifies memory; should not be
* necessary. But... This is necessary, because
* 386 hardware has concept of busy TSS or some
* similar stupidity.
*/
/*
* We need to reload TR, which requires that we change the
* GDT entry to indicate "available" first.
*
* XXX: This could probably all be replaced by a call to
* force_reload_TR().
*/
set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss);
#ifdef CONFIG_X86_64
memcpy(&tss, &desc[GDT_ENTRY_TSS], sizeof(tss_desc));

Some files were not shown because too many files have changed in this diff.