author | 2018-01-05 15:03:54 +0000
---|---
committer | 2018-01-05 15:03:54 +0000
commit | a08c6f0923abc66cb0192f849780a30c3016e946 (patch)
tree | f4368864b59d22d21a581c68810afec3f0218b39
parent | Linux patch 4.9.74 (diff)
download | linux-patches-a08c6f0923abc66cb0192f849780a30c3016e946.tar.gz, linux-patches-a08c6f0923abc66cb0192f849780a30c3016e946.tar.bz2, linux-patches-a08c6f0923abc66cb0192f849780a30c3016e946.zip
linux kernel 4.9.75
-rw-r--r-- | 0000_README | 4
-rw-r--r-- | 1074_linux-4.9.75.patch | 2577
2 files changed, 2581 insertions, 0 deletions
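The patch that follows is the KAISER (kernel page-table isolation) backport that makes up the bulk of Linux 4.9.75. As a reading aid only — not part of the patch — here is a minimal, standalone C sketch of the CR3 layout the entry-code changes below rely on: the shadow (user) PGD sits one page above the kernel PGD (KAISER_SHADOW_PGD_OFFSET), CR3 bits 11:0 carry the PCID/ASID, and bit 63 is the NOFLUSH hint used when PCID is available. The constants mirror those defined in the patch (asm/kaiser.h, asm/pgtable_types.h); the example CR3 value is hypothetical.

#include <stdint.h>
#include <stdio.h>

/* Constants mirrored from the patch (asm/kaiser.h, asm/pgtable_types.h). */
#define KAISER_SHADOW_PGD_OFFSET 0x1000ULL            /* user PGD = kernel PGD + 4k  */
#define X86_CR3_PCID_ASID_MASK   ((1ULL << 12) - 1)   /* ASID lives in CR3[11:0]     */
#define X86_CR3_PCID_NOFLUSH     (1ULL << 63)         /* keep old PCID's TLB entries */
#define X86_CR3_PCID_ASID_USER   0x80ULL              /* user ASID when PCID is on   */

/* What _SWITCH_TO_KERNEL_CR3 computes: kernel PGD, ASID 0, NOFLUSH if PCID. */
static uint64_t kernel_cr3(uint64_t cr3, int have_pcid)
{
	cr3 &= ~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET);
	return have_pcid ? cr3 | X86_CR3_PCID_NOFLUSH : cr3;
}

/* What or'ing x86_cr3_pcid_user into CR3 yields: shadow PGD, user ASID. */
static uint64_t user_cr3(uint64_t cr3, int have_pcid)
{
	cr3 = kernel_cr3(cr3, 0) | KAISER_SHADOW_PGD_OFFSET;
	return have_pcid ? cr3 | X86_CR3_PCID_ASID_USER | X86_CR3_PCID_NOFLUSH : cr3;
}

int main(void)
{
	uint64_t cr3 = 0x1234000;	/* hypothetical kernel PGD physical address */

	printf("kernel CR3: %#llx\n", (unsigned long long)kernel_cr3(cr3, 1));
	printf("user   CR3: %#llx\n", (unsigned long long)user_cr3(cr3, 1));
	return 0;
}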
diff --git a/0000_README b/0000_README index 350d2c5f..eed33722 100644 --- a/0000_README +++ b/0000_README @@ -339,6 +339,10 @@ Patch: 1073_linux-4.9.74.patch From: http://www.kernel.org Desc: Linux 4.9.74 +Patch: 1074_linux-4.9.75.patch +From: http://www.kernel.org +Desc: Linux 4.9.75 + Patch: 1500_XATTR_USER_PREFIX.patch From: https://bugs.gentoo.org/show_bug.cgi?id=470644 Desc: Support for namespace user.pax.* on tmpfs. diff --git a/1074_linux-4.9.75.patch b/1074_linux-4.9.75.patch new file mode 100644 index 00000000..6299f19d --- /dev/null +++ b/1074_linux-4.9.75.patch @@ -0,0 +1,2577 @@ +diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt +index 152ec4e87b57..5d2676d043de 100644 +--- a/Documentation/kernel-parameters.txt ++++ b/Documentation/kernel-parameters.txt +@@ -2763,6 +2763,8 @@ bytes respectively. Such letter suffixes can also be entirely omitted. + + nojitter [IA-64] Disables jitter checking for ITC timers. + ++ nopti [X86-64] Disable KAISER isolation of kernel from user. ++ + no-kvmclock [X86,KVM] Disable paravirtualized KVM clock driver + + no-kvmapf [X86,KVM] Disable paravirtualized asynchronous page +@@ -3325,6 +3327,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted. + pt. [PARIDE] + See Documentation/blockdev/paride.txt. + ++ pti= [X86_64] ++ Control KAISER user/kernel address space isolation: ++ on - enable ++ off - disable ++ auto - default setting ++ + pty.legacy_count= + [KNL] Number of legacy pty's. Overwrites compiled-in + default number. +diff --git a/Makefile b/Makefile +index 075e429732e7..acbc1b032db2 100644 +--- a/Makefile ++++ b/Makefile +@@ -1,6 +1,6 @@ + VERSION = 4 + PATCHLEVEL = 9 +-SUBLEVEL = 74 ++SUBLEVEL = 75 + EXTRAVERSION = + NAME = Roaring Lionus + +diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h +index 766a5211f827..2728e1b7e4a6 100644 +--- a/arch/x86/boot/compressed/misc.h ++++ b/arch/x86/boot/compressed/misc.h +@@ -9,6 +9,7 @@ + */ + #undef CONFIG_PARAVIRT + #undef CONFIG_PARAVIRT_SPINLOCKS ++#undef CONFIG_PAGE_TABLE_ISOLATION + #undef CONFIG_KASAN + + #include <linux/linkage.h> +diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S +index e7b0e7ff4c58..af4e58132d91 100644 +--- a/arch/x86/entry/entry_64.S ++++ b/arch/x86/entry/entry_64.S +@@ -36,6 +36,7 @@ + #include <asm/smap.h> + #include <asm/pgtable_types.h> + #include <asm/export.h> ++#include <asm/kaiser.h> + #include <linux/err.h> + + /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */ +@@ -146,6 +147,7 @@ ENTRY(entry_SYSCALL_64) + * it is too small to ever cause noticeable irq latency. + */ + SWAPGS_UNSAFE_STACK ++ SWITCH_KERNEL_CR3_NO_STACK + /* + * A hypervisor implementation might want to use a label + * after the swapgs, so that it can do the swapgs +@@ -228,6 +230,14 @@ entry_SYSCALL_64_fastpath: + movq RIP(%rsp), %rcx + movq EFLAGS(%rsp), %r11 + RESTORE_C_REGS_EXCEPT_RCX_R11 ++ /* ++ * This opens a window where we have a user CR3, but are ++ * running in the kernel. This makes using the CS ++ * register useless for telling whether or not we need to ++ * switch CR3 in NMIs. Normal interrupts are OK because ++ * they are off here. ++ */ ++ SWITCH_USER_CR3 + movq RSP(%rsp), %rsp + USERGS_SYSRET64 + +@@ -323,10 +333,26 @@ return_from_SYSCALL_64: + syscall_return_via_sysret: + /* rcx and r11 are already restored (see code above) */ + RESTORE_C_REGS_EXCEPT_RCX_R11 ++ /* ++ * This opens a window where we have a user CR3, but are ++ * running in the kernel. 
This makes using the CS ++ * register useless for telling whether or not we need to ++ * switch CR3 in NMIs. Normal interrupts are OK because ++ * they are off here. ++ */ ++ SWITCH_USER_CR3 + movq RSP(%rsp), %rsp + USERGS_SYSRET64 + + opportunistic_sysret_failed: ++ /* ++ * This opens a window where we have a user CR3, but are ++ * running in the kernel. This makes using the CS ++ * register useless for telling whether or not we need to ++ * switch CR3 in NMIs. Normal interrupts are OK because ++ * they are off here. ++ */ ++ SWITCH_USER_CR3 + SWAPGS + jmp restore_c_regs_and_iret + END(entry_SYSCALL_64) +@@ -424,6 +450,7 @@ ENTRY(ret_from_fork) + movq %rsp, %rdi + call syscall_return_slowpath /* returns with IRQs disabled */ + TRACE_IRQS_ON /* user mode is traced as IRQS on */ ++ SWITCH_USER_CR3 + SWAPGS + jmp restore_regs_and_iret + +@@ -478,6 +505,7 @@ END(irq_entries_start) + * tracking that we're in kernel mode. + */ + SWAPGS ++ SWITCH_KERNEL_CR3 + + /* + * We need to tell lockdep that IRQs are off. We can't do this until +@@ -535,6 +563,7 @@ GLOBAL(retint_user) + mov %rsp,%rdi + call prepare_exit_to_usermode + TRACE_IRQS_IRETQ ++ SWITCH_USER_CR3 + SWAPGS + jmp restore_regs_and_iret + +@@ -612,6 +641,7 @@ native_irq_return_ldt: + + pushq %rdi /* Stash user RDI */ + SWAPGS ++ SWITCH_KERNEL_CR3 + movq PER_CPU_VAR(espfix_waddr), %rdi + movq %rax, (0*8)(%rdi) /* user RAX */ + movq (1*8)(%rsp), %rax /* user RIP */ +@@ -638,6 +668,7 @@ native_irq_return_ldt: + * still points to an RO alias of the ESPFIX stack. + */ + orq PER_CPU_VAR(espfix_stack), %rax ++ SWITCH_USER_CR3 + SWAPGS + movq %rax, %rsp + +@@ -1022,7 +1053,11 @@ idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vec + /* + * Save all registers in pt_regs, and switch gs if needed. + * Use slow, but surefire "are we in kernel?" check. +- * Return: ebx=0: need swapgs on exit, ebx=1: otherwise ++ * ++ * Return: ebx=0: needs swapgs but not SWITCH_USER_CR3 in paranoid_exit ++ * ebx=1: needs neither swapgs nor SWITCH_USER_CR3 in paranoid_exit ++ * ebx=2: needs both swapgs and SWITCH_USER_CR3 in paranoid_exit ++ * ebx=3: needs SWITCH_USER_CR3 but not swapgs in paranoid_exit + */ + ENTRY(paranoid_entry) + cld +@@ -1035,7 +1070,26 @@ ENTRY(paranoid_entry) + js 1f /* negative -> in kernel */ + SWAPGS + xorl %ebx, %ebx +-1: ret ++1: ++#ifdef CONFIG_PAGE_TABLE_ISOLATION ++ /* ++ * We might have come in between a swapgs and a SWITCH_KERNEL_CR3 ++ * on entry, or between a SWITCH_USER_CR3 and a swapgs on exit. ++ * Do a conditional SWITCH_KERNEL_CR3: this could safely be done ++ * unconditionally, but we need to find out whether the reverse ++ * should be done on return (conveyed to paranoid_exit in %ebx). ++ */ ++ ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER ++ testl $KAISER_SHADOW_PGD_OFFSET, %eax ++ jz 2f ++ orl $2, %ebx ++ andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax ++ /* If PCID enabled, set X86_CR3_PCID_NOFLUSH_BIT */ ++ ALTERNATIVE "", "bts $63, %rax", X86_FEATURE_PCID ++ movq %rax, %cr3 ++2: ++#endif ++ ret + END(paranoid_entry) + + /* +@@ -1048,19 +1102,26 @@ END(paranoid_entry) + * be complicated. Fortunately, we there's no good reason + * to try to handle preemption here. 
+ * +- * On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) ++ * On entry: ebx=0: needs swapgs but not SWITCH_USER_CR3 ++ * ebx=1: needs neither swapgs nor SWITCH_USER_CR3 ++ * ebx=2: needs both swapgs and SWITCH_USER_CR3 ++ * ebx=3: needs SWITCH_USER_CR3 but not swapgs + */ + ENTRY(paranoid_exit) + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF_DEBUG +- testl %ebx, %ebx /* swapgs needed? */ ++ TRACE_IRQS_IRETQ_DEBUG ++#ifdef CONFIG_PAGE_TABLE_ISOLATION ++ /* No ALTERNATIVE for X86_FEATURE_KAISER: paranoid_entry sets %ebx */ ++ testl $2, %ebx /* SWITCH_USER_CR3 needed? */ ++ jz paranoid_exit_no_switch ++ SWITCH_USER_CR3 ++paranoid_exit_no_switch: ++#endif ++ testl $1, %ebx /* swapgs needed? */ + jnz paranoid_exit_no_swapgs +- TRACE_IRQS_IRETQ + SWAPGS_UNSAFE_STACK +- jmp paranoid_exit_restore + paranoid_exit_no_swapgs: +- TRACE_IRQS_IRETQ_DEBUG +-paranoid_exit_restore: + RESTORE_EXTRA_REGS + RESTORE_C_REGS + REMOVE_PT_GPREGS_FROM_STACK 8 +@@ -1075,6 +1136,13 @@ ENTRY(error_entry) + cld + SAVE_C_REGS 8 + SAVE_EXTRA_REGS 8 ++ /* ++ * error_entry() always returns with a kernel gsbase and ++ * CR3. We must also have a kernel CR3/gsbase before ++ * calling TRACE_IRQS_*. Just unconditionally switch to ++ * the kernel CR3 here. ++ */ ++ SWITCH_KERNEL_CR3 + xorl %ebx, %ebx + testb $3, CS+8(%rsp) + jz .Lerror_kernelspace +@@ -1235,6 +1303,10 @@ ENTRY(nmi) + */ + + SWAPGS_UNSAFE_STACK ++ /* ++ * percpu variables are mapped with user CR3, so no need ++ * to switch CR3 here. ++ */ + cld + movq %rsp, %rdx + movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp +@@ -1268,12 +1340,34 @@ ENTRY(nmi) + + movq %rsp, %rdi + movq $-1, %rsi ++#ifdef CONFIG_PAGE_TABLE_ISOLATION ++ /* Unconditionally use kernel CR3 for do_nmi() */ ++ /* %rax is saved above, so OK to clobber here */ ++ ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER ++ /* If PCID enabled, NOFLUSH now and NOFLUSH on return */ ++ ALTERNATIVE "", "bts $63, %rax", X86_FEATURE_PCID ++ pushq %rax ++ /* mask off "user" bit of pgd address and 12 PCID bits: */ ++ andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax ++ movq %rax, %cr3 ++2: ++#endif + call do_nmi + ++#ifdef CONFIG_PAGE_TABLE_ISOLATION ++ /* ++ * Unconditionally restore CR3. I know we return to ++ * kernel code that needs user CR3, but do we ever return ++ * to "user mode" where we need the kernel CR3? ++ */ ++ ALTERNATIVE "", "popq %rax; movq %rax, %cr3", X86_FEATURE_KAISER ++#endif ++ + /* + * Return back to user mode. We must *not* do the normal exit +- * work, because we don't want to enable interrupts. Fortunately, +- * do_nmi doesn't modify pt_regs. ++ * work, because we don't want to enable interrupts. Do not ++ * switch to user CR3: we might be going back to kernel code ++ * that had a user CR3 set. + */ + SWAPGS + jmp restore_c_regs_and_iret +@@ -1470,22 +1564,55 @@ end_repeat_nmi: + ALLOC_PT_GPREGS_ON_STACK + + /* +- * Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit +- * as we should not be calling schedule in NMI context. +- * Even with normal interrupts enabled. An NMI should not be +- * setting NEED_RESCHED or anything that normal interrupts and +- * exceptions might do. ++ * Use the same approach as paranoid_entry to handle SWAPGS, but ++ * without CR3 handling since we do that differently in NMIs. No ++ * need to use paranoid_exit as we should not be calling schedule ++ * in NMI context. Even with normal interrupts enabled. 
An NMI ++ * should not be setting NEED_RESCHED or anything that normal ++ * interrupts and exceptions might do. + */ +- call paranoid_entry +- +- /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */ ++ cld ++ SAVE_C_REGS ++ SAVE_EXTRA_REGS ++ movl $1, %ebx ++ movl $MSR_GS_BASE, %ecx ++ rdmsr ++ testl %edx, %edx ++ js 1f /* negative -> in kernel */ ++ SWAPGS ++ xorl %ebx, %ebx ++1: + movq %rsp, %rdi + movq $-1, %rsi ++#ifdef CONFIG_PAGE_TABLE_ISOLATION ++ /* Unconditionally use kernel CR3 for do_nmi() */ ++ /* %rax is saved above, so OK to clobber here */ ++ ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER ++ /* If PCID enabled, NOFLUSH now and NOFLUSH on return */ ++ ALTERNATIVE "", "bts $63, %rax", X86_FEATURE_PCID ++ pushq %rax ++ /* mask off "user" bit of pgd address and 12 PCID bits: */ ++ andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax ++ movq %rax, %cr3 ++2: ++#endif ++ ++ /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */ + call do_nmi + ++#ifdef CONFIG_PAGE_TABLE_ISOLATION ++ /* ++ * Unconditionally restore CR3. We might be returning to ++ * kernel code that needs user CR3, like just just before ++ * a sysret. ++ */ ++ ALTERNATIVE "", "popq %rax; movq %rax, %cr3", X86_FEATURE_KAISER ++#endif ++ + testl %ebx, %ebx /* swapgs needed? */ + jnz nmi_restore + nmi_swapgs: ++ /* We fixed up CR3 above, so no need to switch it here */ + SWAPGS_UNSAFE_STACK + nmi_restore: + RESTORE_EXTRA_REGS +diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S +index e1721dafbcb1..d76a97653980 100644 +--- a/arch/x86/entry/entry_64_compat.S ++++ b/arch/x86/entry/entry_64_compat.S +@@ -13,6 +13,8 @@ + #include <asm/irqflags.h> + #include <asm/asm.h> + #include <asm/smap.h> ++#include <asm/pgtable_types.h> ++#include <asm/kaiser.h> + #include <linux/linkage.h> + #include <linux/err.h> + +@@ -48,6 +50,7 @@ + ENTRY(entry_SYSENTER_compat) + /* Interrupts are off on entry. */ + SWAPGS_UNSAFE_STACK ++ SWITCH_KERNEL_CR3_NO_STACK + movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp + + /* +@@ -184,6 +187,7 @@ ENDPROC(entry_SYSENTER_compat) + ENTRY(entry_SYSCALL_compat) + /* Interrupts are off on entry. */ + SWAPGS_UNSAFE_STACK ++ SWITCH_KERNEL_CR3_NO_STACK + + /* Stash user ESP and switch to the kernel stack. */ + movl %esp, %r8d +@@ -259,6 +263,7 @@ sysret32_from_system_call: + xorq %r8, %r8 + xorq %r9, %r9 + xorq %r10, %r10 ++ SWITCH_USER_CR3 + movq RSP-ORIG_RAX(%rsp), %rsp + swapgs + sysretl +@@ -297,7 +302,7 @@ ENTRY(entry_INT80_compat) + PARAVIRT_ADJUST_EXCEPTION_FRAME + ASM_CLAC /* Do this early to minimize exposure */ + SWAPGS +- ++ SWITCH_KERNEL_CR3_NO_STACK + /* + * User tracing code (ptrace or signal handlers) might assume that + * the saved RAX contains a 32-bit number when we're invoking a 32-bit +@@ -338,6 +343,7 @@ ENTRY(entry_INT80_compat) + + /* Go back to user mode. 
*/ + TRACE_IRQS_ON ++ SWITCH_USER_CR3 + SWAPGS + jmp restore_regs_and_iret + END(entry_INT80_compat) +diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c +index 9dfeeeca0ea8..8e7a3f1df3a5 100644 +--- a/arch/x86/events/intel/ds.c ++++ b/arch/x86/events/intel/ds.c +@@ -2,11 +2,15 @@ + #include <linux/types.h> + #include <linux/slab.h> + ++#include <asm/kaiser.h> + #include <asm/perf_event.h> + #include <asm/insn.h> + + #include "../perf_event.h" + ++static ++DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct debug_store, cpu_debug_store); ++ + /* The size of a BTS record in bytes: */ + #define BTS_RECORD_SIZE 24 + +@@ -268,6 +272,39 @@ void fini_debug_store_on_cpu(int cpu) + + static DEFINE_PER_CPU(void *, insn_buffer); + ++static void *dsalloc(size_t size, gfp_t flags, int node) ++{ ++#ifdef CONFIG_PAGE_TABLE_ISOLATION ++ unsigned int order = get_order(size); ++ struct page *page; ++ unsigned long addr; ++ ++ page = __alloc_pages_node(node, flags | __GFP_ZERO, order); ++ if (!page) ++ return NULL; ++ addr = (unsigned long)page_address(page); ++ if (kaiser_add_mapping(addr, size, __PAGE_KERNEL) < 0) { ++ __free_pages(page, order); ++ addr = 0; ++ } ++ return (void *)addr; ++#else ++ return kmalloc_node(size, flags | __GFP_ZERO, node); ++#endif ++} ++ ++static void dsfree(const void *buffer, size_t size) ++{ ++#ifdef CONFIG_PAGE_TABLE_ISOLATION ++ if (!buffer) ++ return; ++ kaiser_remove_mapping((unsigned long)buffer, size); ++ free_pages((unsigned long)buffer, get_order(size)); ++#else ++ kfree(buffer); ++#endif ++} ++ + static int alloc_pebs_buffer(int cpu) + { + struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; +@@ -278,7 +315,7 @@ static int alloc_pebs_buffer(int cpu) + if (!x86_pmu.pebs) + return 0; + +- buffer = kzalloc_node(x86_pmu.pebs_buffer_size, GFP_KERNEL, node); ++ buffer = dsalloc(x86_pmu.pebs_buffer_size, GFP_KERNEL, node); + if (unlikely(!buffer)) + return -ENOMEM; + +@@ -289,7 +326,7 @@ static int alloc_pebs_buffer(int cpu) + if (x86_pmu.intel_cap.pebs_format < 2) { + ibuffer = kzalloc_node(PEBS_FIXUP_SIZE, GFP_KERNEL, node); + if (!ibuffer) { +- kfree(buffer); ++ dsfree(buffer, x86_pmu.pebs_buffer_size); + return -ENOMEM; + } + per_cpu(insn_buffer, cpu) = ibuffer; +@@ -315,7 +352,8 @@ static void release_pebs_buffer(int cpu) + kfree(per_cpu(insn_buffer, cpu)); + per_cpu(insn_buffer, cpu) = NULL; + +- kfree((void *)(unsigned long)ds->pebs_buffer_base); ++ dsfree((void *)(unsigned long)ds->pebs_buffer_base, ++ x86_pmu.pebs_buffer_size); + ds->pebs_buffer_base = 0; + } + +@@ -329,7 +367,7 @@ static int alloc_bts_buffer(int cpu) + if (!x86_pmu.bts) + return 0; + +- buffer = kzalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node); ++ buffer = dsalloc(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node); + if (unlikely(!buffer)) { + WARN_ONCE(1, "%s: BTS buffer allocation failure\n", __func__); + return -ENOMEM; +@@ -355,19 +393,15 @@ static void release_bts_buffer(int cpu) + if (!ds || !x86_pmu.bts) + return; + +- kfree((void *)(unsigned long)ds->bts_buffer_base); ++ dsfree((void *)(unsigned long)ds->bts_buffer_base, BTS_BUFFER_SIZE); + ds->bts_buffer_base = 0; + } + + static int alloc_ds_buffer(int cpu) + { +- int node = cpu_to_node(cpu); +- struct debug_store *ds; +- +- ds = kzalloc_node(sizeof(*ds), GFP_KERNEL, node); +- if (unlikely(!ds)) +- return -ENOMEM; ++ struct debug_store *ds = per_cpu_ptr(&cpu_debug_store, cpu); + ++ memset(ds, 0, sizeof(*ds)); + per_cpu(cpu_hw_events, cpu).ds = ds; + + return 0; +@@ -381,7 +415,6 @@ static void 
release_ds_buffer(int cpu) + return; + + per_cpu(cpu_hw_events, cpu).ds = NULL; +- kfree(ds); + } + + void release_ds_buffers(void) +diff --git a/arch/x86/include/asm/cmdline.h b/arch/x86/include/asm/cmdline.h +index e01f7f7ccb0c..84ae170bc3d0 100644 +--- a/arch/x86/include/asm/cmdline.h ++++ b/arch/x86/include/asm/cmdline.h +@@ -2,5 +2,7 @@ + #define _ASM_X86_CMDLINE_H + + int cmdline_find_option_bool(const char *cmdline_ptr, const char *option); ++int cmdline_find_option(const char *cmdline_ptr, const char *option, ++ char *buffer, int bufsize); + + #endif /* _ASM_X86_CMDLINE_H */ +diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h +index ed10b5bf9b93..454a37adb823 100644 +--- a/arch/x86/include/asm/cpufeatures.h ++++ b/arch/x86/include/asm/cpufeatures.h +@@ -189,6 +189,7 @@ + + #define X86_FEATURE_CPB ( 7*32+ 2) /* AMD Core Performance Boost */ + #define X86_FEATURE_EPB ( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */ ++#define X86_FEATURE_INVPCID_SINGLE ( 7*32+ 4) /* Effectively INVPCID && CR4.PCIDE=1 */ + + #define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */ + #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ +@@ -197,6 +198,9 @@ + #define X86_FEATURE_AVX512_4VNNIW (7*32+16) /* AVX-512 Neural Network Instructions */ + #define X86_FEATURE_AVX512_4FMAPS (7*32+17) /* AVX-512 Multiply Accumulation Single precision */ + ++/* Because the ALTERNATIVE scheme is for members of the X86_FEATURE club... */ ++#define X86_FEATURE_KAISER ( 7*32+31) /* CONFIG_PAGE_TABLE_ISOLATION w/o nokaiser */ ++ + /* Virtualization flags: Linux defined, word 8 */ + #define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */ + #define X86_FEATURE_VNMI ( 8*32+ 1) /* Intel Virtual NMI */ +diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h +index 12080d87da3b..2ed5a2b3f8f7 100644 +--- a/arch/x86/include/asm/desc.h ++++ b/arch/x86/include/asm/desc.h +@@ -43,7 +43,7 @@ struct gdt_page { + struct desc_struct gdt[GDT_ENTRIES]; + } __attribute__((aligned(PAGE_SIZE))); + +-DECLARE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page); ++DECLARE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(struct gdt_page, gdt_page); + + static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu) + { +diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h +index b90e1053049b..0817d63bce41 100644 +--- a/arch/x86/include/asm/hw_irq.h ++++ b/arch/x86/include/asm/hw_irq.h +@@ -178,7 +178,7 @@ extern char irq_entries_start[]; + #define VECTOR_RETRIGGERED ((void *)~0UL) + + typedef struct irq_desc* vector_irq_t[NR_VECTORS]; +-DECLARE_PER_CPU(vector_irq_t, vector_irq); ++DECLARE_PER_CPU_USER_MAPPED(vector_irq_t, vector_irq); + + #endif /* !ASSEMBLY_ */ + +diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h +new file mode 100644 +index 000000000000..802bbbdfe143 +--- /dev/null ++++ b/arch/x86/include/asm/kaiser.h +@@ -0,0 +1,141 @@ ++#ifndef _ASM_X86_KAISER_H ++#define _ASM_X86_KAISER_H ++ ++#include <uapi/asm/processor-flags.h> /* For PCID constants */ ++ ++/* ++ * This file includes the definitions for the KAISER feature. ++ * KAISER is a counter measure against x86_64 side channel attacks on ++ * the kernel virtual memory. It has a shadow pgd for every process: the ++ * shadow pgd has a minimalistic kernel-set mapped, but includes the whole ++ * user memory. Within a kernel context switch, or when an interrupt is handled, ++ * the pgd is switched to the normal one. 
When the system switches to user mode, ++ * the shadow pgd is enabled. By this, the virtual memory caches are freed, ++ * and the user may not attack the whole kernel memory. ++ * ++ * A minimalistic kernel mapping holds the parts needed to be mapped in user ++ * mode, such as the entry/exit functions of the user space, or the stacks. ++ */ ++ ++#define KAISER_SHADOW_PGD_OFFSET 0x1000 ++ ++#ifdef __ASSEMBLY__ ++#ifdef CONFIG_PAGE_TABLE_ISOLATION ++ ++.macro _SWITCH_TO_KERNEL_CR3 reg ++movq %cr3, \reg ++andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), \reg ++/* If PCID enabled, set X86_CR3_PCID_NOFLUSH_BIT */ ++ALTERNATIVE "", "bts $63, \reg", X86_FEATURE_PCID ++movq \reg, %cr3 ++.endm ++ ++.macro _SWITCH_TO_USER_CR3 reg regb ++/* ++ * regb must be the low byte portion of reg: because we have arranged ++ * for the low byte of the user PCID to serve as the high byte of NOFLUSH ++ * (0x80 for each when PCID is enabled, or 0x00 when PCID and NOFLUSH are ++ * not enabled): so that the one register can update both memory and cr3. ++ */ ++movq %cr3, \reg ++orq PER_CPU_VAR(x86_cr3_pcid_user), \reg ++js 9f ++/* If PCID enabled, FLUSH this time, reset to NOFLUSH for next time */ ++movb \regb, PER_CPU_VAR(x86_cr3_pcid_user+7) ++9: ++movq \reg, %cr3 ++.endm ++ ++.macro SWITCH_KERNEL_CR3 ++ALTERNATIVE "jmp 8f", "pushq %rax", X86_FEATURE_KAISER ++_SWITCH_TO_KERNEL_CR3 %rax ++popq %rax ++8: ++.endm ++ ++.macro SWITCH_USER_CR3 ++ALTERNATIVE "jmp 8f", "pushq %rax", X86_FEATURE_KAISER ++_SWITCH_TO_USER_CR3 %rax %al ++popq %rax ++8: ++.endm ++ ++.macro SWITCH_KERNEL_CR3_NO_STACK ++ALTERNATIVE "jmp 8f", \ ++ __stringify(movq %rax, PER_CPU_VAR(unsafe_stack_register_backup)), \ ++ X86_FEATURE_KAISER ++_SWITCH_TO_KERNEL_CR3 %rax ++movq PER_CPU_VAR(unsafe_stack_register_backup), %rax ++8: ++.endm ++ ++#else /* CONFIG_PAGE_TABLE_ISOLATION */ ++ ++.macro SWITCH_KERNEL_CR3 ++.endm ++.macro SWITCH_USER_CR3 ++.endm ++.macro SWITCH_KERNEL_CR3_NO_STACK ++.endm ++ ++#endif /* CONFIG_PAGE_TABLE_ISOLATION */ ++ ++#else /* __ASSEMBLY__ */ ++ ++#ifdef CONFIG_PAGE_TABLE_ISOLATION ++/* ++ * Upon kernel/user mode switch, it may happen that the address ++ * space has to be switched before the registers have been ++ * stored. To change the address space, another register is ++ * needed. A register therefore has to be stored/restored. ++*/ ++DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup); ++ ++DECLARE_PER_CPU(unsigned long, x86_cr3_pcid_user); ++ ++extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[]; ++ ++extern int kaiser_enabled; ++extern void __init kaiser_check_boottime_disable(void); ++#else ++#define kaiser_enabled 0 ++static inline void __init kaiser_check_boottime_disable(void) {} ++#endif /* CONFIG_PAGE_TABLE_ISOLATION */ ++ ++/* ++ * Kaiser function prototypes are needed even when CONFIG_PAGE_TABLE_ISOLATION is not set, ++ * so as to build with tests on kaiser_enabled instead of #ifdefs. ++ */ ++ ++/** ++ * kaiser_add_mapping - map a virtual memory part to the shadow (user) mapping ++ * @addr: the start address of the range ++ * @size: the size of the range ++ * @flags: The mapping flags of the pages ++ * ++ * The mapping is done on a global scope, so no bigger ++ * synchronization has to be done. the pages have to be ++ * manually unmapped again when they are not needed any longer. 
++ */ ++extern int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags); ++ ++/** ++ * kaiser_remove_mapping - unmap a virtual memory part of the shadow mapping ++ * @addr: the start address of the range ++ * @size: the size of the range ++ */ ++extern void kaiser_remove_mapping(unsigned long start, unsigned long size); ++ ++/** ++ * kaiser_init - Initialize the shadow mapping ++ * ++ * Most parts of the shadow mapping can be mapped upon boot ++ * time. Only per-process things like the thread stacks ++ * or a new LDT have to be mapped at runtime. These boot- ++ * time mappings are permanent and never unmapped. ++ */ ++extern void kaiser_init(void); ++ ++#endif /* __ASSEMBLY */ ++ ++#endif /* _ASM_X86_KAISER_H */ +diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h +index 437feb436efa..2536f90cd30c 100644 +--- a/arch/x86/include/asm/pgtable.h ++++ b/arch/x86/include/asm/pgtable.h +@@ -18,6 +18,12 @@ + #ifndef __ASSEMBLY__ + #include <asm/x86_init.h> + ++#ifdef CONFIG_PAGE_TABLE_ISOLATION ++extern int kaiser_enabled; ++#else ++#define kaiser_enabled 0 ++#endif ++ + void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd); + void ptdump_walk_pgd_level_checkwx(void); + +@@ -690,7 +696,17 @@ static inline pud_t *pud_offset(pgd_t *pgd, unsigned long address) + + static inline int pgd_bad(pgd_t pgd) + { +- return (pgd_flags(pgd) & ~_PAGE_USER) != _KERNPG_TABLE; ++ pgdval_t ignore_flags = _PAGE_USER; ++ /* ++ * We set NX on KAISER pgds that map userspace memory so ++ * that userspace can not meaningfully use the kernel ++ * page table by accident; it will fault on the first ++ * instruction it tries to run. See native_set_pgd(). ++ */ ++ if (kaiser_enabled) ++ ignore_flags |= _PAGE_NX; ++ ++ return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE; + } + + static inline int pgd_none(pgd_t pgd) +@@ -903,7 +919,15 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, + */ + static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count) + { +- memcpy(dst, src, count * sizeof(pgd_t)); ++ memcpy(dst, src, count * sizeof(pgd_t)); ++#ifdef CONFIG_PAGE_TABLE_ISOLATION ++ if (kaiser_enabled) { ++ /* Clone the shadow pgd part as well */ ++ memcpy(native_get_shadow_pgd(dst), ++ native_get_shadow_pgd(src), ++ count * sizeof(pgd_t)); ++ } ++#endif + } + + #define PTE_SHIFT ilog2(PTRS_PER_PTE) +diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h +index 1cc82ece9ac1..ce97c8c6a310 100644 +--- a/arch/x86/include/asm/pgtable_64.h ++++ b/arch/x86/include/asm/pgtable_64.h +@@ -106,9 +106,32 @@ static inline void native_pud_clear(pud_t *pud) + native_set_pud(pud, native_make_pud(0)); + } + ++#ifdef CONFIG_PAGE_TABLE_ISOLATION ++extern pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd); ++ ++static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp) ++{ ++#ifdef CONFIG_DEBUG_VM ++ /* linux/mmdebug.h may not have been included at this point */ ++ BUG_ON(!kaiser_enabled); ++#endif ++ return (pgd_t *)((unsigned long)pgdp | (unsigned long)PAGE_SIZE); ++} ++#else ++static inline pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd) ++{ ++ return pgd; ++} ++static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp) ++{ ++ BUILD_BUG_ON(1); ++ return NULL; ++} ++#endif /* CONFIG_PAGE_TABLE_ISOLATION */ ++ + static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd) + { +- *pgdp = pgd; ++ *pgdp = kaiser_set_shadow_pgd(pgdp, pgd); + } + + static inline void native_pgd_clear(pgd_t *pgd) +diff --git a/arch/x86/include/asm/pgtable_types.h 
b/arch/x86/include/asm/pgtable_types.h +index 8b4de22d6429..f1c8ac468292 100644 +--- a/arch/x86/include/asm/pgtable_types.h ++++ b/arch/x86/include/asm/pgtable_types.h +@@ -119,7 +119,7 @@ + #define _PAGE_DEVMAP (_AT(pteval_t, 0)) + #endif + +-#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) ++#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) + + #define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ + _PAGE_ACCESSED | _PAGE_DIRTY) +@@ -137,6 +137,33 @@ + _PAGE_SOFT_DIRTY) + #define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE) + ++/* The ASID is the lower 12 bits of CR3 */ ++#define X86_CR3_PCID_ASID_MASK (_AC((1<<12)-1,UL)) ++ ++/* Mask for all the PCID-related bits in CR3: */ ++#define X86_CR3_PCID_MASK (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_MASK) ++#define X86_CR3_PCID_ASID_KERN (_AC(0x0,UL)) ++ ++#if defined(CONFIG_PAGE_TABLE_ISOLATION) && defined(CONFIG_X86_64) ++/* Let X86_CR3_PCID_ASID_USER be usable for the X86_CR3_PCID_NOFLUSH bit */ ++#define X86_CR3_PCID_ASID_USER (_AC(0x80,UL)) ++ ++#define X86_CR3_PCID_KERN_FLUSH (X86_CR3_PCID_ASID_KERN) ++#define X86_CR3_PCID_USER_FLUSH (X86_CR3_PCID_ASID_USER) ++#define X86_CR3_PCID_KERN_NOFLUSH (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_KERN) ++#define X86_CR3_PCID_USER_NOFLUSH (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_USER) ++#else ++#define X86_CR3_PCID_ASID_USER (_AC(0x0,UL)) ++/* ++ * PCIDs are unsupported on 32-bit and none of these bits can be ++ * set in CR3: ++ */ ++#define X86_CR3_PCID_KERN_FLUSH (0) ++#define X86_CR3_PCID_USER_FLUSH (0) ++#define X86_CR3_PCID_KERN_NOFLUSH (0) ++#define X86_CR3_PCID_USER_NOFLUSH (0) ++#endif ++ + /* + * The cache modes defined here are used to translate between pure SW usage + * and the HW defined cache mode bits and/or PAT entries. +diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h +index 83db0eae9979..8cb52ee3ade6 100644 +--- a/arch/x86/include/asm/processor.h ++++ b/arch/x86/include/asm/processor.h +@@ -308,7 +308,7 @@ struct tss_struct { + + } ____cacheline_aligned; + +-DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss); ++DECLARE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct tss_struct, cpu_tss); + + #ifdef CONFIG_X86_32 + DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack); +diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h +index 7d2ea6b1f7d9..94146f665a3c 100644 +--- a/arch/x86/include/asm/tlbflush.h ++++ b/arch/x86/include/asm/tlbflush.h +@@ -132,6 +132,24 @@ static inline void cr4_set_bits_and_update_boot(unsigned long mask) + cr4_set_bits(mask); + } + ++/* ++ * Declare a couple of kaiser interfaces here for convenience, ++ * to avoid the need for asm/kaiser.h in unexpected places. 
++ */ ++#ifdef CONFIG_PAGE_TABLE_ISOLATION ++extern int kaiser_enabled; ++extern void kaiser_setup_pcid(void); ++extern void kaiser_flush_tlb_on_return_to_user(void); ++#else ++#define kaiser_enabled 0 ++static inline void kaiser_setup_pcid(void) ++{ ++} ++static inline void kaiser_flush_tlb_on_return_to_user(void) ++{ ++} ++#endif ++ + static inline void __native_flush_tlb(void) + { + /* +@@ -140,6 +158,8 @@ static inline void __native_flush_tlb(void) + * back: + */ + preempt_disable(); ++ if (kaiser_enabled) ++ kaiser_flush_tlb_on_return_to_user(); + native_write_cr3(native_read_cr3()); + preempt_enable(); + } +@@ -149,20 +169,27 @@ static inline void __native_flush_tlb_global_irq_disabled(void) + unsigned long cr4; + + cr4 = this_cpu_read(cpu_tlbstate.cr4); +- /* clear PGE */ +- native_write_cr4(cr4 & ~X86_CR4_PGE); +- /* write old PGE again and flush TLBs */ +- native_write_cr4(cr4); ++ if (cr4 & X86_CR4_PGE) { ++ /* clear PGE and flush TLB of all entries */ ++ native_write_cr4(cr4 & ~X86_CR4_PGE); ++ /* restore PGE as it was before */ ++ native_write_cr4(cr4); ++ } else { ++ /* do it with cr3, letting kaiser flush user PCID */ ++ __native_flush_tlb(); ++ } + } + + static inline void __native_flush_tlb_global(void) + { + unsigned long flags; + +- if (static_cpu_has(X86_FEATURE_INVPCID)) { ++ if (this_cpu_has(X86_FEATURE_INVPCID)) { + /* + * Using INVPCID is considerably faster than a pair of writes + * to CR4 sandwiched inside an IRQ flag save/restore. ++ * ++ * Note, this works with CR4.PCIDE=0 or 1. + */ + invpcid_flush_all(); + return; +@@ -174,24 +201,45 @@ static inline void __native_flush_tlb_global(void) + * be called from deep inside debugging code.) + */ + raw_local_irq_save(flags); +- + __native_flush_tlb_global_irq_disabled(); +- + raw_local_irq_restore(flags); + } + + static inline void __native_flush_tlb_single(unsigned long addr) + { +- asm volatile("invlpg (%0)" ::"r" (addr) : "memory"); ++ /* ++ * SIMICS #GP's if you run INVPCID with type 2/3 ++ * and X86_CR4_PCIDE clear. Shame! ++ * ++ * The ASIDs used below are hard-coded. But, we must not ++ * call invpcid(type=1/2) before CR4.PCIDE=1. Just call ++ * invlpg in the case we are called early. ++ */ ++ ++ if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE)) { ++ if (kaiser_enabled) ++ kaiser_flush_tlb_on_return_to_user(); ++ asm volatile("invlpg (%0)" ::"r" (addr) : "memory"); ++ return; ++ } ++ /* Flush the address out of both PCIDs. */ ++ /* ++ * An optimization here might be to determine addresses ++ * that are only kernel-mapped and only flush the kernel ++ * ASID. But, userspace flushes are probably much more ++ * important performance-wise. ++ * ++ * Make sure to do only a single invpcid when KAISER is ++ * disabled and we have only a single ASID. 
++ */ ++ if (kaiser_enabled) ++ invpcid_flush_one(X86_CR3_PCID_ASID_USER, addr); ++ invpcid_flush_one(X86_CR3_PCID_ASID_KERN, addr); + } + + static inline void __flush_tlb_all(void) + { +- if (boot_cpu_has(X86_FEATURE_PGE)) +- __flush_tlb_global(); +- else +- __flush_tlb(); +- ++ __flush_tlb_global(); + /* + * Note: if we somehow had PCID but not PGE, then this wouldn't work -- + * we'd end up flushing kernel translations for the current ASID but +diff --git a/arch/x86/include/uapi/asm/processor-flags.h b/arch/x86/include/uapi/asm/processor-flags.h +index 567de50a4c2a..6768d1321016 100644 +--- a/arch/x86/include/uapi/asm/processor-flags.h ++++ b/arch/x86/include/uapi/asm/processor-flags.h +@@ -77,7 +77,8 @@ + #define X86_CR3_PWT _BITUL(X86_CR3_PWT_BIT) + #define X86_CR3_PCD_BIT 4 /* Page Cache Disable */ + #define X86_CR3_PCD _BITUL(X86_CR3_PCD_BIT) +-#define X86_CR3_PCID_MASK _AC(0x00000fff,UL) /* PCID Mask */ ++#define X86_CR3_PCID_NOFLUSH_BIT 63 /* Preserve old PCID */ ++#define X86_CR3_PCID_NOFLUSH _BITULL(X86_CR3_PCID_NOFLUSH_BIT) + + /* + * Intel CPU features in CR4 +diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c +index 91588be529b9..918e44772b04 100644 +--- a/arch/x86/kernel/cpu/common.c ++++ b/arch/x86/kernel/cpu/common.c +@@ -93,7 +93,7 @@ static const struct cpu_dev default_cpu = { + + static const struct cpu_dev *this_cpu = &default_cpu; + +-DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { ++DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(struct gdt_page, gdt_page) = { .gdt = { + #ifdef CONFIG_X86_64 + /* + * We need valid kernel segments for data and code in long mode too +@@ -327,8 +327,21 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c) + static void setup_pcid(struct cpuinfo_x86 *c) + { + if (cpu_has(c, X86_FEATURE_PCID)) { +- if (cpu_has(c, X86_FEATURE_PGE)) { ++ if (cpu_has(c, X86_FEATURE_PGE) || kaiser_enabled) { + cr4_set_bits(X86_CR4_PCIDE); ++ /* ++ * INVPCID has two "groups" of types: ++ * 1/2: Invalidate an individual address ++ * 3/4: Invalidate all contexts ++ * ++ * 1/2 take a PCID, but 3/4 do not. So, 3/4 ++ * ignore the PCID argument in the descriptor. ++ * But, we have to be careful not to call 1/2 ++ * with an actual non-zero PCID in them before ++ * we do the above cr4_set_bits(). ++ */ ++ if (cpu_has(c, X86_FEATURE_INVPCID)) ++ set_cpu_cap(c, X86_FEATURE_INVPCID_SINGLE); + } else { + /* + * flush_tlb_all(), as currently implemented, won't +@@ -341,6 +354,7 @@ static void setup_pcid(struct cpuinfo_x86 *c) + clear_cpu_cap(c, X86_FEATURE_PCID); + } + } ++ kaiser_setup_pcid(); + } + + /* +@@ -1365,7 +1379,7 @@ static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = { + [DEBUG_STACK - 1] = DEBUG_STKSZ + }; + +-static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks ++DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(char, exception_stacks + [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]); + + /* May not be marked __init: used by software suspend */ +@@ -1523,6 +1537,14 @@ void cpu_init(void) + * try to read it. + */ + cr4_init_shadow(); ++ if (!kaiser_enabled) { ++ /* ++ * secondary_startup_64() deferred setting PGE in cr4: ++ * probe_page_size_mask() sets it on the boot cpu, ++ * but it needs to be set on each secondary cpu. ++ */ ++ cr4_set_bits(X86_CR4_PGE); ++ } + + /* + * Load microcode on this cpu if a valid microcode is available. 
+diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c +index 04f89caef9c4..e33b38541be3 100644 +--- a/arch/x86/kernel/espfix_64.c ++++ b/arch/x86/kernel/espfix_64.c +@@ -41,6 +41,7 @@ + #include <asm/pgalloc.h> + #include <asm/setup.h> + #include <asm/espfix.h> ++#include <asm/kaiser.h> + + /* + * Note: we only need 6*8 = 48 bytes for the espfix stack, but round +@@ -126,6 +127,15 @@ void __init init_espfix_bsp(void) + /* Install the espfix pud into the kernel page directory */ + pgd_p = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)]; + pgd_populate(&init_mm, pgd_p, (pud_t *)espfix_pud_page); ++ /* ++ * Just copy the top-level PGD that is mapping the espfix ++ * area to ensure it is mapped into the shadow user page ++ * tables. ++ */ ++ if (kaiser_enabled) { ++ set_pgd(native_get_shadow_pgd(pgd_p), ++ __pgd(_KERNPG_TABLE | __pa((pud_t *)espfix_pud_page))); ++ } + + /* Randomize the locations */ + init_espfix_random(); +diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S +index b4421cc191b0..67cd7c1b99da 100644 +--- a/arch/x86/kernel/head_64.S ++++ b/arch/x86/kernel/head_64.S +@@ -190,8 +190,8 @@ ENTRY(secondary_startup_64) + movq $(init_level4_pgt - __START_KERNEL_map), %rax + 1: + +- /* Enable PAE mode and PGE */ +- movl $(X86_CR4_PAE | X86_CR4_PGE), %ecx ++ /* Enable PAE and PSE, but defer PGE until kaiser_enabled is decided */ ++ movl $(X86_CR4_PAE | X86_CR4_PSE), %ecx + movq %rcx, %cr4 + + /* Setup early boot stage 4 level pagetables. */ +@@ -405,6 +405,27 @@ GLOBAL(early_recursion_flag) + .balign PAGE_SIZE; \ + GLOBAL(name) + ++#ifdef CONFIG_PAGE_TABLE_ISOLATION ++/* ++ * Each PGD needs to be 8k long and 8k aligned. We do not ++ * ever go out to userspace with these, so we do not ++ * strictly *need* the second page, but this allows us to ++ * have a single set_pgd() implementation that does not ++ * need to worry about whether it has 4k or 8k to work ++ * with. 
++ * ++ * This ensures PGDs are 8k long: ++ */ ++#define KAISER_USER_PGD_FILL 512 ++/* This ensures they are 8k-aligned: */ ++#define NEXT_PGD_PAGE(name) \ ++ .balign 2 * PAGE_SIZE; \ ++GLOBAL(name) ++#else ++#define NEXT_PGD_PAGE(name) NEXT_PAGE(name) ++#define KAISER_USER_PGD_FILL 0 ++#endif ++ + /* Automate the creation of 1 to 1 mapping pmd entries */ + #define PMDS(START, PERM, COUNT) \ + i = 0 ; \ +@@ -414,9 +435,10 @@ GLOBAL(name) + .endr + + __INITDATA +-NEXT_PAGE(early_level4_pgt) ++NEXT_PGD_PAGE(early_level4_pgt) + .fill 511,8,0 + .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE ++ .fill KAISER_USER_PGD_FILL,8,0 + + NEXT_PAGE(early_dynamic_pgts) + .fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0 +@@ -424,16 +446,18 @@ NEXT_PAGE(early_dynamic_pgts) + .data + + #ifndef CONFIG_XEN +-NEXT_PAGE(init_level4_pgt) ++NEXT_PGD_PAGE(init_level4_pgt) + .fill 512,8,0 ++ .fill KAISER_USER_PGD_FILL,8,0 + #else +-NEXT_PAGE(init_level4_pgt) ++NEXT_PGD_PAGE(init_level4_pgt) + .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE + .org init_level4_pgt + L4_PAGE_OFFSET*8, 0 + .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE + .org init_level4_pgt + L4_START_KERNEL*8, 0 + /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ + .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE ++ .fill KAISER_USER_PGD_FILL,8,0 + + NEXT_PAGE(level3_ident_pgt) + .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE +@@ -444,6 +468,7 @@ NEXT_PAGE(level2_ident_pgt) + */ + PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD) + #endif ++ .fill KAISER_USER_PGD_FILL,8,0 + + NEXT_PAGE(level3_kernel_pgt) + .fill L3_START_KERNEL,8,0 +diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c +index 1423ab1b0312..f480b38a03c3 100644 +--- a/arch/x86/kernel/irqinit.c ++++ b/arch/x86/kernel/irqinit.c +@@ -51,7 +51,7 @@ static struct irqaction irq2 = { + .flags = IRQF_NO_THREAD, + }; + +-DEFINE_PER_CPU(vector_irq_t, vector_irq) = { ++DEFINE_PER_CPU_USER_MAPPED(vector_irq_t, vector_irq) = { + [0 ... NR_VECTORS - 1] = VECTOR_UNUSED, + }; + +diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c +index 5f70014ca602..8bc68cfc0d33 100644 +--- a/arch/x86/kernel/ldt.c ++++ b/arch/x86/kernel/ldt.c +@@ -16,6 +16,7 @@ + #include <linux/slab.h> + #include <linux/vmalloc.h> + #include <linux/uaccess.h> ++#include <linux/kaiser.h> + + #include <asm/ldt.h> + #include <asm/desc.h> +@@ -34,11 +35,21 @@ static void flush_ldt(void *current_mm) + set_ldt(pc->ldt->entries, pc->ldt->size); + } + ++static void __free_ldt_struct(struct ldt_struct *ldt) ++{ ++ if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE) ++ vfree(ldt->entries); ++ else ++ free_page((unsigned long)ldt->entries); ++ kfree(ldt); ++} ++ + /* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. 
*/ + static struct ldt_struct *alloc_ldt_struct(int size) + { + struct ldt_struct *new_ldt; + int alloc_size; ++ int ret; + + if (size > LDT_ENTRIES) + return NULL; +@@ -66,7 +77,13 @@ static struct ldt_struct *alloc_ldt_struct(int size) + return NULL; + } + ++ ret = kaiser_add_mapping((unsigned long)new_ldt->entries, alloc_size, ++ __PAGE_KERNEL); + new_ldt->size = size; ++ if (ret) { ++ __free_ldt_struct(new_ldt); ++ return NULL; ++ } + return new_ldt; + } + +@@ -92,12 +109,10 @@ static void free_ldt_struct(struct ldt_struct *ldt) + if (likely(!ldt)) + return; + ++ kaiser_remove_mapping((unsigned long)ldt->entries, ++ ldt->size * LDT_ENTRY_SIZE); + paravirt_free_ldt(ldt->entries, ldt->size); +- if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE) +- vfree(ldt->entries); +- else +- free_page((unsigned long)ldt->entries); +- kfree(ldt); ++ __free_ldt_struct(ldt); + } + + /* +diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c +index bb3840cedb4f..ee43b36075c7 100644 +--- a/arch/x86/kernel/paravirt_patch_64.c ++++ b/arch/x86/kernel/paravirt_patch_64.c +@@ -9,7 +9,6 @@ DEF_NATIVE(pv_irq_ops, save_fl, "pushfq; popq %rax"); + DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax"); + DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax"); + DEF_NATIVE(pv_mmu_ops, write_cr3, "movq %rdi, %cr3"); +-DEF_NATIVE(pv_mmu_ops, flush_tlb_single, "invlpg (%rdi)"); + DEF_NATIVE(pv_cpu_ops, clts, "clts"); + DEF_NATIVE(pv_cpu_ops, wbinvd, "wbinvd"); + +@@ -59,7 +58,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf, + PATCH_SITE(pv_mmu_ops, read_cr3); + PATCH_SITE(pv_mmu_ops, write_cr3); + PATCH_SITE(pv_cpu_ops, clts); +- PATCH_SITE(pv_mmu_ops, flush_tlb_single); + PATCH_SITE(pv_cpu_ops, wbinvd); + #if defined(CONFIG_PARAVIRT_SPINLOCKS) + case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock): +diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c +index 8e10e72bf6ee..a55b32007785 100644 +--- a/arch/x86/kernel/process.c ++++ b/arch/x86/kernel/process.c +@@ -41,7 +41,7 @@ + * section. Since TSS's are completely CPU-local, we want them + * on exact cacheline boundaries, to eliminate cacheline ping-pong. + */ +-__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = { ++__visible DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct tss_struct, cpu_tss) = { + .x86_tss = { + .sp0 = TOP_OF_INIT_STACK, + #ifdef CONFIG_X86_32 +diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c +index feaab07fa124..6b55012d02a3 100644 +--- a/arch/x86/kernel/setup.c ++++ b/arch/x86/kernel/setup.c +@@ -114,6 +114,7 @@ + #include <asm/microcode.h> + #include <asm/mmu_context.h> + #include <asm/kaslr.h> ++#include <asm/kaiser.h> + + /* + * max_low_pfn_mapped: highest direct mapped pfn under 4GB +@@ -1019,6 +1020,12 @@ void __init setup_arch(char **cmdline_p) + */ + init_hypervisor_platform(); + ++ /* ++ * This needs to happen right after XENPV is set on xen and ++ * kaiser_enabled is checked below in cleanup_highmap(). 
++ */ ++ kaiser_check_boottime_disable(); ++ + x86_init.resources.probe_roms(); + + /* after parse_early_param, so could debug it */ +diff --git a/arch/x86/kernel/tracepoint.c b/arch/x86/kernel/tracepoint.c +index 1c113db9ed57..2bb5ee464df3 100644 +--- a/arch/x86/kernel/tracepoint.c ++++ b/arch/x86/kernel/tracepoint.c +@@ -9,10 +9,12 @@ + #include <linux/atomic.h> + + atomic_t trace_idt_ctr = ATOMIC_INIT(0); ++__aligned(PAGE_SIZE) + struct desc_ptr trace_idt_descr = { NR_VECTORS * 16 - 1, + (unsigned long) trace_idt_table }; + + /* No need to be aligned, but done to keep all IDTs defined the same way. */ ++__aligned(PAGE_SIZE) + gate_desc trace_idt_table[NR_VECTORS] __page_aligned_bss; + + static int trace_irq_vector_refcount; +diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c +index 7e28e6c877d9..73304b1a03cc 100644 +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -773,7 +773,8 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) + return 1; + + /* PCID can not be enabled when cr3[11:0]!=000H or EFER.LMA=0 */ +- if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_MASK) || !is_long_mode(vcpu)) ++ if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_ASID_MASK) || ++ !is_long_mode(vcpu)) + return 1; + } + +diff --git a/arch/x86/lib/cmdline.c b/arch/x86/lib/cmdline.c +index 5cc78bf57232..3261abb21ef4 100644 +--- a/arch/x86/lib/cmdline.c ++++ b/arch/x86/lib/cmdline.c +@@ -104,7 +104,112 @@ __cmdline_find_option_bool(const char *cmdline, int max_cmdline_size, + return 0; /* Buffer overrun */ + } + ++/* ++ * Find a non-boolean option (i.e. option=argument). In accordance with ++ * standard Linux practice, if this option is repeated, this returns the ++ * last instance on the command line. ++ * ++ * @cmdline: the cmdline string ++ * @max_cmdline_size: the maximum size of cmdline ++ * @option: option string to look for ++ * @buffer: memory buffer to return the option argument ++ * @bufsize: size of the supplied memory buffer ++ * ++ * Returns the length of the argument (regardless of if it was ++ * truncated to fit in the buffer), or -1 on not found. ++ */ ++static int ++__cmdline_find_option(const char *cmdline, int max_cmdline_size, ++ const char *option, char *buffer, int bufsize) ++{ ++ char c; ++ int pos = 0, len = -1; ++ const char *opptr = NULL; ++ char *bufptr = buffer; ++ enum { ++ st_wordstart = 0, /* Start of word/after whitespace */ ++ st_wordcmp, /* Comparing this word */ ++ st_wordskip, /* Miscompare, skip */ ++ st_bufcpy, /* Copying this to buffer */ ++ } state = st_wordstart; ++ ++ if (!cmdline) ++ return -1; /* No command line */ ++ ++ /* ++ * This 'pos' check ensures we do not overrun ++ * a non-NULL-terminated 'cmdline' ++ */ ++ while (pos++ < max_cmdline_size) { ++ c = *(char *)cmdline++; ++ if (!c) ++ break; ++ ++ switch (state) { ++ case st_wordstart: ++ if (myisspace(c)) ++ break; ++ ++ state = st_wordcmp; ++ opptr = option; ++ /* fall through */ ++ ++ case st_wordcmp: ++ if ((c == '=') && !*opptr) { ++ /* ++ * We matched all the way to the end of the ++ * option we were looking for, prepare to ++ * copy the argument. ++ */ ++ len = 0; ++ bufptr = buffer; ++ state = st_bufcpy; ++ break; ++ } else if (c == *opptr++) { ++ /* ++ * We are currently matching, so continue ++ * to the next character on the cmdline. 
++ */ ++ break; ++ } ++ state = st_wordskip; ++ /* fall through */ ++ ++ case st_wordskip: ++ if (myisspace(c)) ++ state = st_wordstart; ++ break; ++ ++ case st_bufcpy: ++ if (myisspace(c)) { ++ state = st_wordstart; ++ } else { ++ /* ++ * Increment len, but don't overrun the ++ * supplied buffer and leave room for the ++ * NULL terminator. ++ */ ++ if (++len < bufsize) ++ *bufptr++ = c; ++ } ++ break; ++ } ++ } ++ ++ if (bufsize) ++ *bufptr = '\0'; ++ ++ return len; ++} ++ + int cmdline_find_option_bool(const char *cmdline, const char *option) + { + return __cmdline_find_option_bool(cmdline, COMMAND_LINE_SIZE, option); + } ++ ++int cmdline_find_option(const char *cmdline, const char *option, char *buffer, ++ int bufsize) ++{ ++ return __cmdline_find_option(cmdline, COMMAND_LINE_SIZE, option, ++ buffer, bufsize); ++} +diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile +index 96d2b847e09e..c548b46100cb 100644 +--- a/arch/x86/mm/Makefile ++++ b/arch/x86/mm/Makefile +@@ -37,5 +37,5 @@ obj-$(CONFIG_NUMA_EMU) += numa_emulation.o + + obj-$(CONFIG_X86_INTEL_MPX) += mpx.o + obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o +-obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o +- ++obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o ++obj-$(CONFIG_PAGE_TABLE_ISOLATION) += kaiser.o +diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c +index 0381638168d1..1e779bca4f3e 100644 +--- a/arch/x86/mm/init.c ++++ b/arch/x86/mm/init.c +@@ -177,7 +177,7 @@ static void __init probe_page_size_mask(void) + cr4_set_bits_and_update_boot(X86_CR4_PSE); + + /* Enable PGE if available */ +- if (boot_cpu_has(X86_FEATURE_PGE)) { ++ if (boot_cpu_has(X86_FEATURE_PGE) && !kaiser_enabled) { + cr4_set_bits_and_update_boot(X86_CR4_PGE); + __supported_pte_mask |= _PAGE_GLOBAL; + } else +diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c +index 3e27ded6ac65..7df8e3a79dc0 100644 +--- a/arch/x86/mm/init_64.c ++++ b/arch/x86/mm/init_64.c +@@ -324,6 +324,16 @@ void __init cleanup_highmap(void) + continue; + if (vaddr < (unsigned long) _text || vaddr > end) + set_pmd(pmd, __pmd(0)); ++ else if (kaiser_enabled) { ++ /* ++ * level2_kernel_pgt is initialized with _PAGE_GLOBAL: ++ * clear that now. This is not important, so long as ++ * CR4.PGE remains clear, but it removes an anomaly. ++ * Physical mapping setup below avoids _PAGE_GLOBAL ++ * by use of massage_pgprot() inside pfn_pte() etc. 
++ */ ++ set_pmd(pmd, pmd_clear_flags(*pmd, _PAGE_GLOBAL)); ++ } + } + } + +diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c +new file mode 100644 +index 000000000000..d8376b4ad9f0 +--- /dev/null ++++ b/arch/x86/mm/kaiser.c +@@ -0,0 +1,455 @@ ++#include <linux/bug.h> ++#include <linux/kernel.h> ++#include <linux/errno.h> ++#include <linux/string.h> ++#include <linux/types.h> ++#include <linux/bug.h> ++#include <linux/init.h> ++#include <linux/interrupt.h> ++#include <linux/spinlock.h> ++#include <linux/mm.h> ++#include <linux/uaccess.h> ++ ++#undef pr_fmt ++#define pr_fmt(fmt) "Kernel/User page tables isolation: " fmt ++ ++#include <asm/kaiser.h> ++#include <asm/tlbflush.h> /* to verify its kaiser declarations */ ++#include <asm/pgtable.h> ++#include <asm/pgalloc.h> ++#include <asm/desc.h> ++#include <asm/cmdline.h> ++ ++int kaiser_enabled __read_mostly = 1; ++EXPORT_SYMBOL(kaiser_enabled); /* for inlined TLB flush functions */ ++ ++__visible ++DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup); ++ ++/* ++ * These can have bit 63 set, so we can not just use a plain "or" ++ * instruction to get their value or'd into CR3. It would take ++ * another register. So, we use a memory reference to these instead. ++ * ++ * This is also handy because systems that do not support PCIDs ++ * just end up or'ing a 0 into their CR3, which does no harm. ++ */ ++DEFINE_PER_CPU(unsigned long, x86_cr3_pcid_user); ++ ++/* ++ * At runtime, the only things we map are some things for CPU ++ * hotplug, and stacks for new processes. No two CPUs will ever ++ * be populating the same addresses, so we only need to ensure ++ * that we protect between two CPUs trying to allocate and ++ * populate the same page table page. ++ * ++ * Only take this lock when doing a set_p[4um]d(), but it is not ++ * needed for doing a set_pte(). We assume that only the *owner* ++ * of a given allocation will be doing this for _their_ ++ * allocation. ++ * ++ * This ensures that once a system has been running for a while ++ * and there have been stacks all over and these page tables ++ * are fully populated, there will be no further acquisitions of ++ * this lock. ++ */ ++static DEFINE_SPINLOCK(shadow_table_allocation_lock); ++ ++/* ++ * Returns -1 on error. ++ */ ++static inline unsigned long get_pa_from_mapping(unsigned long vaddr) ++{ ++ pgd_t *pgd; ++ pud_t *pud; ++ pmd_t *pmd; ++ pte_t *pte; ++ ++ pgd = pgd_offset_k(vaddr); ++ /* ++ * We made all the kernel PGDs present in kaiser_init(). ++ * We expect them to stay that way. ++ */ ++ BUG_ON(pgd_none(*pgd)); ++ /* ++ * PGDs are either 512GB or 128TB on all x86_64 ++ * configurations. We don't handle these. ++ */ ++ BUG_ON(pgd_large(*pgd)); ++ ++ pud = pud_offset(pgd, vaddr); ++ if (pud_none(*pud)) { ++ WARN_ON_ONCE(1); ++ return -1; ++ } ++ ++ if (pud_large(*pud)) ++ return (pud_pfn(*pud) << PAGE_SHIFT) | (vaddr & ~PUD_PAGE_MASK); ++ ++ pmd = pmd_offset(pud, vaddr); ++ if (pmd_none(*pmd)) { ++ WARN_ON_ONCE(1); ++ return -1; ++ } ++ ++ if (pmd_large(*pmd)) ++ return (pmd_pfn(*pmd) << PAGE_SHIFT) | (vaddr & ~PMD_PAGE_MASK); ++ ++ pte = pte_offset_kernel(pmd, vaddr); ++ if (pte_none(*pte)) { ++ WARN_ON_ONCE(1); ++ return -1; ++ } ++ ++ return (pte_pfn(*pte) << PAGE_SHIFT) | (vaddr & ~PAGE_MASK); ++} ++ ++/* ++ * This is a relatively normal page table walk, except that it ++ * also tries to allocate page tables pages along the way. ++ * ++ * Returns a pointer to a PTE on success, or NULL on failure. 
++ */ ++static pte_t *kaiser_pagetable_walk(unsigned long address) ++{ ++ pmd_t *pmd; ++ pud_t *pud; ++ pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(address)); ++ gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); ++ ++ if (pgd_none(*pgd)) { ++ WARN_ONCE(1, "All shadow pgds should have been populated"); ++ return NULL; ++ } ++ BUILD_BUG_ON(pgd_large(*pgd) != 0); ++ ++ pud = pud_offset(pgd, address); ++ /* The shadow page tables do not use large mappings: */ ++ if (pud_large(*pud)) { ++ WARN_ON(1); ++ return NULL; ++ } ++ if (pud_none(*pud)) { ++ unsigned long new_pmd_page = __get_free_page(gfp); ++ if (!new_pmd_page) ++ return NULL; ++ spin_lock(&shadow_table_allocation_lock); ++ if (pud_none(*pud)) { ++ set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page))); ++ __inc_zone_page_state(virt_to_page((void *) ++ new_pmd_page), NR_KAISERTABLE); ++ } else ++ free_page(new_pmd_page); ++ spin_unlock(&shadow_table_allocation_lock); ++ } ++ ++ pmd = pmd_offset(pud, address); ++ /* The shadow page tables do not use large mappings: */ ++ if (pmd_large(*pmd)) { ++ WARN_ON(1); ++ return NULL; ++ } ++ if (pmd_none(*pmd)) { ++ unsigned long new_pte_page = __get_free_page(gfp); ++ if (!new_pte_page) ++ return NULL; ++ spin_lock(&shadow_table_allocation_lock); ++ if (pmd_none(*pmd)) { ++ set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page))); ++ __inc_zone_page_state(virt_to_page((void *) ++ new_pte_page), NR_KAISERTABLE); ++ } else ++ free_page(new_pte_page); ++ spin_unlock(&shadow_table_allocation_lock); ++ } ++ ++ return pte_offset_kernel(pmd, address); ++} ++ ++static int kaiser_add_user_map(const void *__start_addr, unsigned long size, ++ unsigned long flags) ++{ ++ int ret = 0; ++ pte_t *pte; ++ unsigned long start_addr = (unsigned long )__start_addr; ++ unsigned long address = start_addr & PAGE_MASK; ++ unsigned long end_addr = PAGE_ALIGN(start_addr + size); ++ unsigned long target_address; ++ ++ /* ++ * It is convenient for callers to pass in __PAGE_KERNEL etc, ++ * and there is no actual harm from setting _PAGE_GLOBAL, so ++ * long as CR4.PGE is not set. But it is nonetheless troubling ++ * to see Kaiser itself setting _PAGE_GLOBAL (now that "nokaiser" ++ * requires that not to be #defined to 0): so mask it off here. ++ */ ++ flags &= ~_PAGE_GLOBAL; ++ ++ for (; address < end_addr; address += PAGE_SIZE) { ++ target_address = get_pa_from_mapping(address); ++ if (target_address == -1) { ++ ret = -EIO; ++ break; ++ } ++ pte = kaiser_pagetable_walk(address); ++ if (!pte) { ++ ret = -ENOMEM; ++ break; ++ } ++ if (pte_none(*pte)) { ++ set_pte(pte, __pte(flags | target_address)); ++ } else { ++ pte_t tmp; ++ set_pte(&tmp, __pte(flags | target_address)); ++ WARN_ON_ONCE(!pte_same(*pte, tmp)); ++ } ++ } ++ return ret; ++} ++ ++static int kaiser_add_user_map_ptrs(const void *start, const void *end, unsigned long flags) ++{ ++ unsigned long size = end - start; ++ ++ return kaiser_add_user_map(start, size, flags); ++} ++ ++/* ++ * Ensure that the top level of the (shadow) page tables are ++ * entirely populated. This ensures that all processes that get ++ * forked have the same entries. This way, we do not have to ++ * ever go set up new entries in older processes. ++ * ++ * Note: we never free these, so there are no updates to them ++ * after this. 
++ */ ++static void __init kaiser_init_all_pgds(void) ++{ ++ pgd_t *pgd; ++ int i = 0; ++ ++ pgd = native_get_shadow_pgd(pgd_offset_k((unsigned long )0)); ++ for (i = PTRS_PER_PGD / 2; i < PTRS_PER_PGD; i++) { ++ pgd_t new_pgd; ++ pud_t *pud = pud_alloc_one(&init_mm, ++ PAGE_OFFSET + i * PGDIR_SIZE); ++ if (!pud) { ++ WARN_ON(1); ++ break; ++ } ++ inc_zone_page_state(virt_to_page(pud), NR_KAISERTABLE); ++ new_pgd = __pgd(_KERNPG_TABLE |__pa(pud)); ++ /* ++ * Make sure not to stomp on some other pgd entry. ++ */ ++ if (!pgd_none(pgd[i])) { ++ WARN_ON(1); ++ continue; ++ } ++ set_pgd(pgd + i, new_pgd); ++ } ++} ++ ++#define kaiser_add_user_map_early(start, size, flags) do { \ ++ int __ret = kaiser_add_user_map(start, size, flags); \ ++ WARN_ON(__ret); \ ++} while (0) ++ ++#define kaiser_add_user_map_ptrs_early(start, end, flags) do { \ ++ int __ret = kaiser_add_user_map_ptrs(start, end, flags); \ ++ WARN_ON(__ret); \ ++} while (0) ++ ++void __init kaiser_check_boottime_disable(void) ++{ ++ bool enable = true; ++ char arg[5]; ++ int ret; ++ ++ if (boot_cpu_has(X86_FEATURE_XENPV)) ++ goto silent_disable; ++ ++ ret = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg)); ++ if (ret > 0) { ++ if (!strncmp(arg, "on", 2)) ++ goto enable; ++ ++ if (!strncmp(arg, "off", 3)) ++ goto disable; ++ ++ if (!strncmp(arg, "auto", 4)) ++ goto skip; ++ } ++ ++ if (cmdline_find_option_bool(boot_command_line, "nopti")) ++ goto disable; ++ ++skip: ++ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) ++ goto disable; ++ ++enable: ++ if (enable) ++ setup_force_cpu_cap(X86_FEATURE_KAISER); ++ ++ return; ++ ++disable: ++ pr_info("disabled\n"); ++ ++silent_disable: ++ kaiser_enabled = 0; ++ setup_clear_cpu_cap(X86_FEATURE_KAISER); ++} ++ ++/* ++ * If anything in here fails, we will likely die on one of the ++ * first kernel->user transitions and init will die. But, we ++ * will have most of the kernel up by then and should be able to ++ * get a clean warning out of it. If we BUG_ON() here, we run ++ * the risk of being before we have good console output. ++ */ ++void __init kaiser_init(void) ++{ ++ int cpu; ++ ++ if (!kaiser_enabled) ++ return; ++ ++ kaiser_init_all_pgds(); ++ ++ for_each_possible_cpu(cpu) { ++ void *percpu_vaddr = __per_cpu_user_mapped_start + ++ per_cpu_offset(cpu); ++ unsigned long percpu_sz = __per_cpu_user_mapped_end - ++ __per_cpu_user_mapped_start; ++ kaiser_add_user_map_early(percpu_vaddr, percpu_sz, ++ __PAGE_KERNEL); ++ } ++ ++ /* ++ * Map the entry/exit text section, which is needed at ++ * switches from user to and from kernel. 
++ */ ++ kaiser_add_user_map_ptrs_early(__entry_text_start, __entry_text_end, ++ __PAGE_KERNEL_RX); ++ ++#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN) ++ kaiser_add_user_map_ptrs_early(__irqentry_text_start, ++ __irqentry_text_end, ++ __PAGE_KERNEL_RX); ++#endif ++ kaiser_add_user_map_early((void *)idt_descr.address, ++ sizeof(gate_desc) * NR_VECTORS, ++ __PAGE_KERNEL_RO); ++#ifdef CONFIG_TRACING ++ kaiser_add_user_map_early(&trace_idt_descr, ++ sizeof(trace_idt_descr), ++ __PAGE_KERNEL); ++ kaiser_add_user_map_early(&trace_idt_table, ++ sizeof(gate_desc) * NR_VECTORS, ++ __PAGE_KERNEL); ++#endif ++ kaiser_add_user_map_early(&debug_idt_descr, sizeof(debug_idt_descr), ++ __PAGE_KERNEL); ++ kaiser_add_user_map_early(&debug_idt_table, ++ sizeof(gate_desc) * NR_VECTORS, ++ __PAGE_KERNEL); ++ ++ pr_info("enabled\n"); ++} ++ ++/* Add a mapping to the shadow mapping, and synchronize the mappings */ ++int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags) ++{ ++ if (!kaiser_enabled) ++ return 0; ++ return kaiser_add_user_map((const void *)addr, size, flags); ++} ++ ++void kaiser_remove_mapping(unsigned long start, unsigned long size) ++{ ++ extern void unmap_pud_range_nofree(pgd_t *pgd, ++ unsigned long start, unsigned long end); ++ unsigned long end = start + size; ++ unsigned long addr, next; ++ pgd_t *pgd; ++ ++ if (!kaiser_enabled) ++ return; ++ pgd = native_get_shadow_pgd(pgd_offset_k(start)); ++ for (addr = start; addr < end; pgd++, addr = next) { ++ next = pgd_addr_end(addr, end); ++ unmap_pud_range_nofree(pgd, addr, next); ++ } ++} ++ ++/* ++ * Page table pages are page-aligned. The lower half of the top ++ * level is used for userspace and the top half for the kernel. ++ * This returns true for user pages that need to get copied into ++ * both the user and kernel copies of the page tables, and false ++ * for kernel pages that should only be in the kernel copy. ++ */ ++static inline bool is_userspace_pgd(pgd_t *pgdp) ++{ ++ return ((unsigned long)pgdp % PAGE_SIZE) < (PAGE_SIZE / 2); ++} ++ ++pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd) ++{ ++ if (!kaiser_enabled) ++ return pgd; ++ /* ++ * Do we need to also populate the shadow pgd? Check _PAGE_USER to ++ * skip cases like kexec and EFI which make temporary low mappings. ++ */ ++ if (pgd.pgd & _PAGE_USER) { ++ if (is_userspace_pgd(pgdp)) { ++ native_get_shadow_pgd(pgdp)->pgd = pgd.pgd; ++ /* ++ * Even if the entry is *mapping* userspace, ensure ++ * that userspace can not use it. This way, if we ++ * get out to userspace running on the kernel CR3, ++ * userspace will crash instead of running. ++ */ ++ if (__supported_pte_mask & _PAGE_NX) ++ pgd.pgd |= _PAGE_NX; ++ } ++ } else if (!pgd.pgd) { ++ /* ++ * pgd_clear() cannot check _PAGE_USER, and is even used to ++ * clear corrupted pgd entries: so just rely on cases like ++ * kexec and EFI never to be using pgd_clear(). ++ */ ++ if (!WARN_ON_ONCE((unsigned long)pgdp & PAGE_SIZE) && ++ is_userspace_pgd(pgdp)) ++ native_get_shadow_pgd(pgdp)->pgd = pgd.pgd; ++ } ++ return pgd; ++} ++ ++void kaiser_setup_pcid(void) ++{ ++ unsigned long user_cr3 = KAISER_SHADOW_PGD_OFFSET; ++ ++ if (this_cpu_has(X86_FEATURE_PCID)) ++ user_cr3 |= X86_CR3_PCID_USER_NOFLUSH; ++ /* ++ * These variables are used by the entry/exit ++ * code to change PCID and pgd and TLB flushing. ++ */ ++ this_cpu_write(x86_cr3_pcid_user, user_cr3); ++} ++ ++/* ++ * Make a note that this cpu will need to flush USER tlb on return to user. 
++ * If cpu does not have PCID, then the NOFLUSH bit will never have been set. ++ */ ++void kaiser_flush_tlb_on_return_to_user(void) ++{ ++ if (this_cpu_has(X86_FEATURE_PCID)) ++ this_cpu_write(x86_cr3_pcid_user, ++ X86_CR3_PCID_USER_FLUSH | KAISER_SHADOW_PGD_OFFSET); ++} ++EXPORT_SYMBOL(kaiser_flush_tlb_on_return_to_user); +diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c +index aed206475aa7..319183d93602 100644 +--- a/arch/x86/mm/kaslr.c ++++ b/arch/x86/mm/kaslr.c +@@ -189,6 +189,6 @@ void __meminit init_trampoline(void) + *pud_tramp = *pud; + } + +- set_pgd(&trampoline_pgd_entry, +- __pgd(_KERNPG_TABLE | __pa(pud_page_tramp))); ++ /* Avoid set_pgd(), in case it's complicated by CONFIG_PAGE_TABLE_ISOLATION */ ++ trampoline_pgd_entry = __pgd(_KERNPG_TABLE | __pa(pud_page_tramp)); + } +diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c +index e3353c97d086..73dcb0e18c1b 100644 +--- a/arch/x86/mm/pageattr.c ++++ b/arch/x86/mm/pageattr.c +@@ -52,6 +52,7 @@ static DEFINE_SPINLOCK(cpa_lock); + #define CPA_FLUSHTLB 1 + #define CPA_ARRAY 2 + #define CPA_PAGES_ARRAY 4 ++#define CPA_FREE_PAGETABLES 8 + + #ifdef CONFIG_PROC_FS + static unsigned long direct_pages_count[PG_LEVEL_NUM]; +@@ -729,10 +730,13 @@ static int split_large_page(struct cpa_data *cpa, pte_t *kpte, + return 0; + } + +-static bool try_to_free_pte_page(pte_t *pte) ++static bool try_to_free_pte_page(struct cpa_data *cpa, pte_t *pte) + { + int i; + ++ if (!(cpa->flags & CPA_FREE_PAGETABLES)) ++ return false; ++ + for (i = 0; i < PTRS_PER_PTE; i++) + if (!pte_none(pte[i])) + return false; +@@ -741,10 +745,13 @@ static bool try_to_free_pte_page(pte_t *pte) + return true; + } + +-static bool try_to_free_pmd_page(pmd_t *pmd) ++static bool try_to_free_pmd_page(struct cpa_data *cpa, pmd_t *pmd) + { + int i; + ++ if (!(cpa->flags & CPA_FREE_PAGETABLES)) ++ return false; ++ + for (i = 0; i < PTRS_PER_PMD; i++) + if (!pmd_none(pmd[i])) + return false; +@@ -753,7 +760,9 @@ static bool try_to_free_pmd_page(pmd_t *pmd) + return true; + } + +-static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end) ++static bool unmap_pte_range(struct cpa_data *cpa, pmd_t *pmd, ++ unsigned long start, ++ unsigned long end) + { + pte_t *pte = pte_offset_kernel(pmd, start); + +@@ -764,22 +773,23 @@ static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end) + pte++; + } + +- if (try_to_free_pte_page((pte_t *)pmd_page_vaddr(*pmd))) { ++ if (try_to_free_pte_page(cpa, (pte_t *)pmd_page_vaddr(*pmd))) { + pmd_clear(pmd); + return true; + } + return false; + } + +-static void __unmap_pmd_range(pud_t *pud, pmd_t *pmd, ++static void __unmap_pmd_range(struct cpa_data *cpa, pud_t *pud, pmd_t *pmd, + unsigned long start, unsigned long end) + { +- if (unmap_pte_range(pmd, start, end)) +- if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud))) ++ if (unmap_pte_range(cpa, pmd, start, end)) ++ if (try_to_free_pmd_page(cpa, (pmd_t *)pud_page_vaddr(*pud))) + pud_clear(pud); + } + +-static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end) ++static void unmap_pmd_range(struct cpa_data *cpa, pud_t *pud, ++ unsigned long start, unsigned long end) + { + pmd_t *pmd = pmd_offset(pud, start); + +@@ -790,7 +800,7 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end) + unsigned long next_page = (start + PMD_SIZE) & PMD_MASK; + unsigned long pre_end = min_t(unsigned long, end, next_page); + +- __unmap_pmd_range(pud, pmd, start, pre_end); ++ __unmap_pmd_range(cpa, pud, pmd, start, 
pre_end); + + start = pre_end; + pmd++; +@@ -803,7 +813,8 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end) + if (pmd_large(*pmd)) + pmd_clear(pmd); + else +- __unmap_pmd_range(pud, pmd, start, start + PMD_SIZE); ++ __unmap_pmd_range(cpa, pud, pmd, ++ start, start + PMD_SIZE); + + start += PMD_SIZE; + pmd++; +@@ -813,17 +824,19 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end) + * 4K leftovers? + */ + if (start < end) +- return __unmap_pmd_range(pud, pmd, start, end); ++ return __unmap_pmd_range(cpa, pud, pmd, start, end); + + /* + * Try again to free the PMD page if haven't succeeded above. + */ + if (!pud_none(*pud)) +- if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud))) ++ if (try_to_free_pmd_page(cpa, (pmd_t *)pud_page_vaddr(*pud))) + pud_clear(pud); + } + +-static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end) ++static void __unmap_pud_range(struct cpa_data *cpa, pgd_t *pgd, ++ unsigned long start, ++ unsigned long end) + { + pud_t *pud = pud_offset(pgd, start); + +@@ -834,7 +847,7 @@ static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end) + unsigned long next_page = (start + PUD_SIZE) & PUD_MASK; + unsigned long pre_end = min_t(unsigned long, end, next_page); + +- unmap_pmd_range(pud, start, pre_end); ++ unmap_pmd_range(cpa, pud, start, pre_end); + + start = pre_end; + pud++; +@@ -848,7 +861,7 @@ static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end) + if (pud_large(*pud)) + pud_clear(pud); + else +- unmap_pmd_range(pud, start, start + PUD_SIZE); ++ unmap_pmd_range(cpa, pud, start, start + PUD_SIZE); + + start += PUD_SIZE; + pud++; +@@ -858,7 +871,7 @@ static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end) + * 2M leftovers? + */ + if (start < end) +- unmap_pmd_range(pud, start, end); ++ unmap_pmd_range(cpa, pud, start, end); + + /* + * No need to try to free the PUD page because we'll free it in +@@ -866,6 +879,24 @@ static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end) + */ + } + ++static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end) ++{ ++ struct cpa_data cpa = { ++ .flags = CPA_FREE_PAGETABLES, ++ }; ++ ++ __unmap_pud_range(&cpa, pgd, start, end); ++} ++ ++void unmap_pud_range_nofree(pgd_t *pgd, unsigned long start, unsigned long end) ++{ ++ struct cpa_data cpa = { ++ .flags = 0, ++ }; ++ ++ __unmap_pud_range(&cpa, pgd, start, end); ++} ++ + static int alloc_pte_page(pmd_t *pmd) + { + pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK); +diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c +index 3feec5af4e67..5aaec8effc5f 100644 +--- a/arch/x86/mm/pgtable.c ++++ b/arch/x86/mm/pgtable.c +@@ -344,14 +344,22 @@ static inline void _pgd_free(pgd_t *pgd) + kmem_cache_free(pgd_cache, pgd); + } + #else ++ ++/* ++ * Instead of one pgd, Kaiser acquires two pgds. Being order-1, it is ++ * both 8k in size and 8k-aligned. That lets us just flip bit 12 ++ * in a pointer to swap between the two 4k halves. 
++ */ ++#define PGD_ALLOCATION_ORDER kaiser_enabled ++ + static inline pgd_t *_pgd_alloc(void) + { +- return (pgd_t *)__get_free_page(PGALLOC_GFP); ++ return (pgd_t *)__get_free_pages(PGALLOC_GFP, PGD_ALLOCATION_ORDER); + } + + static inline void _pgd_free(pgd_t *pgd) + { +- free_page((unsigned long)pgd); ++ free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER); + } + #endif /* CONFIG_X86_PAE */ + +diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c +index 53b72fb4e781..41205de487e7 100644 +--- a/arch/x86/mm/tlb.c ++++ b/arch/x86/mm/tlb.c +@@ -6,13 +6,14 @@ + #include <linux/interrupt.h> + #include <linux/export.h> + #include <linux/cpu.h> ++#include <linux/debugfs.h> + + #include <asm/tlbflush.h> + #include <asm/mmu_context.h> + #include <asm/cache.h> + #include <asm/apic.h> + #include <asm/uv/uv.h> +-#include <linux/debugfs.h> ++#include <asm/kaiser.h> + + /* + * TLB flushing, formerly SMP-only +@@ -34,6 +35,36 @@ struct flush_tlb_info { + unsigned long flush_end; + }; + ++static void load_new_mm_cr3(pgd_t *pgdir) ++{ ++ unsigned long new_mm_cr3 = __pa(pgdir); ++ ++ if (kaiser_enabled) { ++ /* ++ * We reuse the same PCID for different tasks, so we must ++ * flush all the entries for the PCID out when we change tasks. ++ * Flush KERN below, flush USER when returning to userspace in ++ * kaiser's SWITCH_USER_CR3 (_SWITCH_TO_USER_CR3) macro. ++ * ++ * invpcid_flush_single_context(X86_CR3_PCID_ASID_USER) could ++ * do it here, but can only be used if X86_FEATURE_INVPCID is ++ * available - and many machines support pcid without invpcid. ++ * ++ * If X86_CR3_PCID_KERN_FLUSH actually added something, then it ++ * would be needed in the write_cr3() below - if PCIDs enabled. ++ */ ++ BUILD_BUG_ON(X86_CR3_PCID_KERN_FLUSH); ++ kaiser_flush_tlb_on_return_to_user(); ++ } ++ ++ /* ++ * Caution: many callers of this function expect ++ * that load_cr3() is serializing and orders TLB ++ * fills with respect to the mm_cpumask writes. ++ */ ++ write_cr3(new_mm_cr3); ++} ++ + /* + * We cannot call mmdrop() because we are in interrupt context, + * instead update mm->cpu_vm_mask. +@@ -45,7 +76,7 @@ void leave_mm(int cpu) + BUG(); + if (cpumask_test_cpu(cpu, mm_cpumask(active_mm))) { + cpumask_clear_cpu(cpu, mm_cpumask(active_mm)); +- load_cr3(swapper_pg_dir); ++ load_new_mm_cr3(swapper_pg_dir); + /* + * This gets called in the idle path where RCU + * functions differently. Tracing normally +@@ -120,7 +151,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, + * ordering guarantee we need. + * + */ +- load_cr3(next->pgd); ++ load_new_mm_cr3(next->pgd); + + trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); + +@@ -167,7 +198,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, + * As above, load_cr3() is serializing and orders TLB + * fills with respect to the mm_cpumask write. + */ +- load_cr3(next->pgd); ++ load_new_mm_cr3(next->pgd); + trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); + load_mm_cr4(next); + load_mm_ldt(next); +diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h +index dc81e5287ebf..2e6000a4eb2c 100644 +--- a/include/asm-generic/vmlinux.lds.h ++++ b/include/asm-generic/vmlinux.lds.h +@@ -778,7 +778,14 @@ + */ + #define PERCPU_INPUT(cacheline) \ + VMLINUX_SYMBOL(__per_cpu_start) = .; \ ++ VMLINUX_SYMBOL(__per_cpu_user_mapped_start) = .; \ + *(.data..percpu..first) \ ++ . = ALIGN(cacheline); \ ++ *(.data..percpu..user_mapped) \ ++ *(.data..percpu..user_mapped..shared_aligned) \ ++ . 
= ALIGN(PAGE_SIZE); \ ++ *(.data..percpu..user_mapped..page_aligned) \ ++ VMLINUX_SYMBOL(__per_cpu_user_mapped_end) = .; \ + . = ALIGN(PAGE_SIZE); \ + *(.data..percpu..page_aligned) \ + . = ALIGN(cacheline); \ +diff --git a/include/linux/kaiser.h b/include/linux/kaiser.h +new file mode 100644 +index 000000000000..58c55b1589d0 +--- /dev/null ++++ b/include/linux/kaiser.h +@@ -0,0 +1,52 @@ ++#ifndef _LINUX_KAISER_H ++#define _LINUX_KAISER_H ++ ++#ifdef CONFIG_PAGE_TABLE_ISOLATION ++#include <asm/kaiser.h> ++ ++static inline int kaiser_map_thread_stack(void *stack) ++{ ++ /* ++ * Map that page of kernel stack on which we enter from user context. ++ */ ++ return kaiser_add_mapping((unsigned long)stack + ++ THREAD_SIZE - PAGE_SIZE, PAGE_SIZE, __PAGE_KERNEL); ++} ++ ++static inline void kaiser_unmap_thread_stack(void *stack) ++{ ++ /* ++ * Note: may be called even when kaiser_map_thread_stack() failed. ++ */ ++ kaiser_remove_mapping((unsigned long)stack + ++ THREAD_SIZE - PAGE_SIZE, PAGE_SIZE); ++} ++#else ++ ++/* ++ * These stubs are used whenever CONFIG_PAGE_TABLE_ISOLATION is off, which ++ * includes architectures that support KAISER, but have it disabled. ++ */ ++ ++static inline void kaiser_init(void) ++{ ++} ++static inline int kaiser_add_mapping(unsigned long addr, ++ unsigned long size, unsigned long flags) ++{ ++ return 0; ++} ++static inline void kaiser_remove_mapping(unsigned long start, ++ unsigned long size) ++{ ++} ++static inline int kaiser_map_thread_stack(void *stack) ++{ ++ return 0; ++} ++static inline void kaiser_unmap_thread_stack(void *stack) ++{ ++} ++ ++#endif /* !CONFIG_PAGE_TABLE_ISOLATION */ ++#endif /* _LINUX_KAISER_H */ +diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h +index fff21a82780c..490f5a83f947 100644 +--- a/include/linux/mmzone.h ++++ b/include/linux/mmzone.h +@@ -124,8 +124,9 @@ enum zone_stat_item { + NR_SLAB_UNRECLAIMABLE, + NR_PAGETABLE, /* used for pagetables */ + NR_KERNEL_STACK_KB, /* measured in KiB */ +- /* Second 128 byte cacheline */ ++ NR_KAISERTABLE, + NR_BOUNCE, ++ /* Second 128 byte cacheline */ + #if IS_ENABLED(CONFIG_ZSMALLOC) + NR_ZSPAGES, /* allocated in zsmalloc */ + #endif +diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h +index 8f16299ca068..8902f23bb770 100644 +--- a/include/linux/percpu-defs.h ++++ b/include/linux/percpu-defs.h +@@ -35,6 +35,12 @@ + + #endif + ++#ifdef CONFIG_PAGE_TABLE_ISOLATION ++#define USER_MAPPED_SECTION "..user_mapped" ++#else ++#define USER_MAPPED_SECTION "" ++#endif ++ + /* + * Base implementations of per-CPU variable declarations and definitions, where + * the section in which the variable is to be placed is provided by the +@@ -115,6 +121,12 @@ + #define DEFINE_PER_CPU(type, name) \ + DEFINE_PER_CPU_SECTION(type, name, "") + ++#define DECLARE_PER_CPU_USER_MAPPED(type, name) \ ++ DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION) ++ ++#define DEFINE_PER_CPU_USER_MAPPED(type, name) \ ++ DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION) ++ + /* + * Declaration/definition used for per-CPU variables that must come first in + * the set of variables. 
+@@ -144,6 +156,14 @@ + DEFINE_PER_CPU_SECTION(type, name, PER_CPU_SHARED_ALIGNED_SECTION) \ + ____cacheline_aligned_in_smp + ++#define DECLARE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(type, name) \ ++ DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION PER_CPU_SHARED_ALIGNED_SECTION) \ ++ ____cacheline_aligned_in_smp ++ ++#define DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(type, name) \ ++ DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION PER_CPU_SHARED_ALIGNED_SECTION) \ ++ ____cacheline_aligned_in_smp ++ + #define DECLARE_PER_CPU_ALIGNED(type, name) \ + DECLARE_PER_CPU_SECTION(type, name, PER_CPU_ALIGNED_SECTION) \ + ____cacheline_aligned +@@ -162,11 +182,21 @@ + #define DEFINE_PER_CPU_PAGE_ALIGNED(type, name) \ + DEFINE_PER_CPU_SECTION(type, name, "..page_aligned") \ + __aligned(PAGE_SIZE) ++/* ++ * Declaration/definition used for per-CPU variables that must be page aligned and need to be mapped in user mode. ++ */ ++#define DECLARE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(type, name) \ ++ DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION"..page_aligned") \ ++ __aligned(PAGE_SIZE) ++ ++#define DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(type, name) \ ++ DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION"..page_aligned") \ ++ __aligned(PAGE_SIZE) + + /* + * Declaration/definition used for per-CPU variables that must be read mostly. + */ +-#define DECLARE_PER_CPU_READ_MOSTLY(type, name) \ ++#define DECLARE_PER_CPU_READ_MOSTLY(type, name) \ + DECLARE_PER_CPU_SECTION(type, name, "..read_mostly") + + #define DEFINE_PER_CPU_READ_MOSTLY(type, name) \ +diff --git a/init/main.c b/init/main.c +index 25bac88bc66e..99f026565608 100644 +--- a/init/main.c ++++ b/init/main.c +@@ -80,6 +80,7 @@ + #include <linux/integrity.h> + #include <linux/proc_ns.h> + #include <linux/io.h> ++#include <linux/kaiser.h> + + #include <asm/io.h> + #include <asm/bugs.h> +@@ -473,6 +474,7 @@ static void __init mm_init(void) + pgtable_init(); + vmalloc_init(); + ioremap_huge_init(); ++ kaiser_init(); + } + + asmlinkage __visible void __init start_kernel(void) +diff --git a/kernel/fork.c b/kernel/fork.c +index 9321b1ad3335..70e10cb49be0 100644 +--- a/kernel/fork.c ++++ b/kernel/fork.c +@@ -58,6 +58,7 @@ + #include <linux/tsacct_kern.h> + #include <linux/cn_proc.h> + #include <linux/freezer.h> ++#include <linux/kaiser.h> + #include <linux/delayacct.h> + #include <linux/taskstats_kern.h> + #include <linux/random.h> +@@ -213,6 +214,7 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) + + static inline void free_thread_stack(struct task_struct *tsk) + { ++ kaiser_unmap_thread_stack(tsk->stack); + #ifdef CONFIG_VMAP_STACK + if (task_stack_vm_area(tsk)) { + unsigned long flags; +@@ -495,6 +497,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) + * functions again. 
+ */ + tsk->stack = stack; ++ ++ err= kaiser_map_thread_stack(tsk->stack); ++ if (err) ++ goto free_stack; + #ifdef CONFIG_VMAP_STACK + tsk->stack_vm_area = stack_vm_area; + #endif +diff --git a/mm/vmstat.c b/mm/vmstat.c +index 604f26a4f696..6a088df04b29 100644 +--- a/mm/vmstat.c ++++ b/mm/vmstat.c +@@ -932,6 +932,7 @@ const char * const vmstat_text[] = { + "nr_slab_unreclaimable", + "nr_page_table_pages", + "nr_kernel_stack", ++ "nr_overhead", + "nr_bounce", + #if IS_ENABLED(CONFIG_ZSMALLOC) + "nr_zspages", +diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c +index 97f9cac98348..e86a34fd5484 100644 +--- a/net/ipv4/tcp_bbr.c ++++ b/net/ipv4/tcp_bbr.c +@@ -843,6 +843,11 @@ static u32 bbr_sndbuf_expand(struct sock *sk) + */ + static u32 bbr_undo_cwnd(struct sock *sk) + { ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->full_bw = 0; /* spurious slow-down; reset full pipe detection */ ++ bbr->full_bw_cnt = 0; ++ bbr_reset_lt_bw_sampling(sk); + return tcp_sk(sk)->snd_cwnd; + } + +diff --git a/security/Kconfig b/security/Kconfig +index 118f4549404e..32f36b40e9f0 100644 +--- a/security/Kconfig ++++ b/security/Kconfig +@@ -31,6 +31,16 @@ config SECURITY + + If you are unsure how to answer this question, answer N. + ++config PAGE_TABLE_ISOLATION ++ bool "Remove the kernel mapping in user mode" ++ default y ++ depends on X86_64 && SMP ++ help ++ This enforces a strict kernel and user space isolation, in order ++ to close hardware side channels on kernel address information. ++ ++ If you are unsure how to answer this question, answer Y. ++ + config SECURITYFS + bool "Enable the securityfs filesystem" + help +diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h +index a39629206864..f79669a38c0c 100644 +--- a/tools/arch/x86/include/asm/cpufeatures.h ++++ b/tools/arch/x86/include/asm/cpufeatures.h +@@ -197,6 +197,9 @@ + #define X86_FEATURE_AVX512_4VNNIW (7*32+16) /* AVX-512 Neural Network Instructions */ + #define X86_FEATURE_AVX512_4FMAPS (7*32+17) /* AVX-512 Multiply Accumulation Single precision */ + ++/* Because the ALTERNATIVE scheme is for members of the X86_FEATURE club... */ ++#define X86_FEATURE_KAISER ( 7*32+31) /* CONFIG_PAGE_TABLE_ISOLATION w/o nokaiser */ ++ + /* Virtualization flags: Linux defined, word 8 */ + #define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */ + #define X86_FEATURE_VNMI ( 8*32+ 1) /* Intel Virtual NMI */ |