summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAlice Ferrazzi <alicef@gentoo.org>2018-01-05 15:03:54 +0000
committerAlice Ferrazzi <alicef@gentoo.org>2018-01-05 15:03:54 +0000
commita08c6f0923abc66cb0192f849780a30c3016e946 (patch)
treef4368864b59d22d21a581c68810afec3f0218b39
parentLinux patch 4.9.74 (diff)
downloadlinux-patches-a08c6f0923abc66cb0192f849780a30c3016e946.tar.gz
linux-patches-a08c6f0923abc66cb0192f849780a30c3016e946.tar.bz2
linux-patches-a08c6f0923abc66cb0192f849780a30c3016e946.zip
linux kernel 4.9.75
-rw-r--r--0000_README4
-rw-r--r--1074_linux-4.9.75.patch2577
2 files changed, 2581 insertions, 0 deletions
diff --git a/0000_README b/0000_README
index 350d2c5f..eed33722 100644
--- a/0000_README
+++ b/0000_README
@@ -339,6 +339,10 @@ Patch: 1073_linux-4.9.74.patch
From: http://www.kernel.org
Desc: Linux 4.9.74
+Patch: 1074_linux-4.9.75.patch
+From: http://www.kernel.org
+Desc: Linux 4.9.75
+
Patch: 1500_XATTR_USER_PREFIX.patch
From: https://bugs.gentoo.org/show_bug.cgi?id=470644
Desc: Support for namespace user.pax.* on tmpfs.
diff --git a/1074_linux-4.9.75.patch b/1074_linux-4.9.75.patch
new file mode 100644
index 00000000..6299f19d
--- /dev/null
+++ b/1074_linux-4.9.75.patch
@@ -0,0 +1,2577 @@
+diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
+index 152ec4e87b57..5d2676d043de 100644
+--- a/Documentation/kernel-parameters.txt
++++ b/Documentation/kernel-parameters.txt
+@@ -2763,6 +2763,8 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
+
+ nojitter [IA-64] Disables jitter checking for ITC timers.
+
++ nopti [X86-64] Disable KAISER isolation of kernel from user.
++
+ no-kvmclock [X86,KVM] Disable paravirtualized KVM clock driver
+
+ no-kvmapf [X86,KVM] Disable paravirtualized asynchronous page
+@@ -3325,6 +3327,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
+ pt. [PARIDE]
+ See Documentation/blockdev/paride.txt.
+
++ pti= [X86_64]
++ Control KAISER user/kernel address space isolation:
++ on - enable
++ off - disable
++ auto - default setting
++
+ pty.legacy_count=
+ [KNL] Number of legacy pty's. Overwrites compiled-in
+ default number.
+diff --git a/Makefile b/Makefile
+index 075e429732e7..acbc1b032db2 100644
+--- a/Makefile
++++ b/Makefile
+@@ -1,6 +1,6 @@
+ VERSION = 4
+ PATCHLEVEL = 9
+-SUBLEVEL = 74
++SUBLEVEL = 75
+ EXTRAVERSION =
+ NAME = Roaring Lionus
+
+diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
+index 766a5211f827..2728e1b7e4a6 100644
+--- a/arch/x86/boot/compressed/misc.h
++++ b/arch/x86/boot/compressed/misc.h
+@@ -9,6 +9,7 @@
+ */
+ #undef CONFIG_PARAVIRT
+ #undef CONFIG_PARAVIRT_SPINLOCKS
++#undef CONFIG_PAGE_TABLE_ISOLATION
+ #undef CONFIG_KASAN
+
+ #include <linux/linkage.h>
+diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
+index e7b0e7ff4c58..af4e58132d91 100644
+--- a/arch/x86/entry/entry_64.S
++++ b/arch/x86/entry/entry_64.S
+@@ -36,6 +36,7 @@
+ #include <asm/smap.h>
+ #include <asm/pgtable_types.h>
+ #include <asm/export.h>
++#include <asm/kaiser.h>
+ #include <linux/err.h>
+
+ /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
+@@ -146,6 +147,7 @@ ENTRY(entry_SYSCALL_64)
+ * it is too small to ever cause noticeable irq latency.
+ */
+ SWAPGS_UNSAFE_STACK
++ SWITCH_KERNEL_CR3_NO_STACK
+ /*
+ * A hypervisor implementation might want to use a label
+ * after the swapgs, so that it can do the swapgs
+@@ -228,6 +230,14 @@ entry_SYSCALL_64_fastpath:
+ movq RIP(%rsp), %rcx
+ movq EFLAGS(%rsp), %r11
+ RESTORE_C_REGS_EXCEPT_RCX_R11
++ /*
++ * This opens a window where we have a user CR3, but are
++ * running in the kernel. This makes using the CS
++ * register useless for telling whether or not we need to
++ * switch CR3 in NMIs. Normal interrupts are OK because
++ * they are off here.
++ */
++ SWITCH_USER_CR3
+ movq RSP(%rsp), %rsp
+ USERGS_SYSRET64
+
+@@ -323,10 +333,26 @@ return_from_SYSCALL_64:
+ syscall_return_via_sysret:
+ /* rcx and r11 are already restored (see code above) */
+ RESTORE_C_REGS_EXCEPT_RCX_R11
++ /*
++ * This opens a window where we have a user CR3, but are
++ * running in the kernel. This makes using the CS
++ * register useless for telling whether or not we need to
++ * switch CR3 in NMIs. Normal interrupts are OK because
++ * they are off here.
++ */
++ SWITCH_USER_CR3
+ movq RSP(%rsp), %rsp
+ USERGS_SYSRET64
+
+ opportunistic_sysret_failed:
++ /*
++ * This opens a window where we have a user CR3, but are
++ * running in the kernel. This makes using the CS
++ * register useless for telling whether or not we need to
++ * switch CR3 in NMIs. Normal interrupts are OK because
++ * they are off here.
++ */
++ SWITCH_USER_CR3
+ SWAPGS
+ jmp restore_c_regs_and_iret
+ END(entry_SYSCALL_64)
+@@ -424,6 +450,7 @@ ENTRY(ret_from_fork)
+ movq %rsp, %rdi
+ call syscall_return_slowpath /* returns with IRQs disabled */
+ TRACE_IRQS_ON /* user mode is traced as IRQS on */
++ SWITCH_USER_CR3
+ SWAPGS
+ jmp restore_regs_and_iret
+
+@@ -478,6 +505,7 @@ END(irq_entries_start)
+ * tracking that we're in kernel mode.
+ */
+ SWAPGS
++ SWITCH_KERNEL_CR3
+
+ /*
+ * We need to tell lockdep that IRQs are off. We can't do this until
+@@ -535,6 +563,7 @@ GLOBAL(retint_user)
+ mov %rsp,%rdi
+ call prepare_exit_to_usermode
+ TRACE_IRQS_IRETQ
++ SWITCH_USER_CR3
+ SWAPGS
+ jmp restore_regs_and_iret
+
+@@ -612,6 +641,7 @@ native_irq_return_ldt:
+
+ pushq %rdi /* Stash user RDI */
+ SWAPGS
++ SWITCH_KERNEL_CR3
+ movq PER_CPU_VAR(espfix_waddr), %rdi
+ movq %rax, (0*8)(%rdi) /* user RAX */
+ movq (1*8)(%rsp), %rax /* user RIP */
+@@ -638,6 +668,7 @@ native_irq_return_ldt:
+ * still points to an RO alias of the ESPFIX stack.
+ */
+ orq PER_CPU_VAR(espfix_stack), %rax
++ SWITCH_USER_CR3
+ SWAPGS
+ movq %rax, %rsp
+
+@@ -1022,7 +1053,11 @@ idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vec
+ /*
+ * Save all registers in pt_regs, and switch gs if needed.
+ * Use slow, but surefire "are we in kernel?" check.
+- * Return: ebx=0: need swapgs on exit, ebx=1: otherwise
++ *
++ * Return: ebx=0: needs swapgs but not SWITCH_USER_CR3 in paranoid_exit
++ * ebx=1: needs neither swapgs nor SWITCH_USER_CR3 in paranoid_exit
++ * ebx=2: needs both swapgs and SWITCH_USER_CR3 in paranoid_exit
++ * ebx=3: needs SWITCH_USER_CR3 but not swapgs in paranoid_exit
+ */
+ ENTRY(paranoid_entry)
+ cld
+@@ -1035,7 +1070,26 @@ ENTRY(paranoid_entry)
+ js 1f /* negative -> in kernel */
+ SWAPGS
+ xorl %ebx, %ebx
+-1: ret
++1:
++#ifdef CONFIG_PAGE_TABLE_ISOLATION
++ /*
++ * We might have come in between a swapgs and a SWITCH_KERNEL_CR3
++ * on entry, or between a SWITCH_USER_CR3 and a swapgs on exit.
++ * Do a conditional SWITCH_KERNEL_CR3: this could safely be done
++ * unconditionally, but we need to find out whether the reverse
++ * should be done on return (conveyed to paranoid_exit in %ebx).
++ */
++ ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER
++ testl $KAISER_SHADOW_PGD_OFFSET, %eax
++ jz 2f
++ orl $2, %ebx
++ andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax
++ /* If PCID enabled, set X86_CR3_PCID_NOFLUSH_BIT */
++ ALTERNATIVE "", "bts $63, %rax", X86_FEATURE_PCID
++ movq %rax, %cr3
++2:
++#endif
++ ret
+ END(paranoid_entry)
+
+ /*
+@@ -1048,19 +1102,26 @@ END(paranoid_entry)
+ * be complicated. Fortunately, we there's no good reason
+ * to try to handle preemption here.
+ *
+- * On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it)
++ * On entry: ebx=0: needs swapgs but not SWITCH_USER_CR3
++ * ebx=1: needs neither swapgs nor SWITCH_USER_CR3
++ * ebx=2: needs both swapgs and SWITCH_USER_CR3
++ * ebx=3: needs SWITCH_USER_CR3 but not swapgs
+ */
+ ENTRY(paranoid_exit)
+ DISABLE_INTERRUPTS(CLBR_NONE)
+ TRACE_IRQS_OFF_DEBUG
+- testl %ebx, %ebx /* swapgs needed? */
++ TRACE_IRQS_IRETQ_DEBUG
++#ifdef CONFIG_PAGE_TABLE_ISOLATION
++ /* No ALTERNATIVE for X86_FEATURE_KAISER: paranoid_entry sets %ebx */
++ testl $2, %ebx /* SWITCH_USER_CR3 needed? */
++ jz paranoid_exit_no_switch
++ SWITCH_USER_CR3
++paranoid_exit_no_switch:
++#endif
++ testl $1, %ebx /* swapgs needed? */
+ jnz paranoid_exit_no_swapgs
+- TRACE_IRQS_IRETQ
+ SWAPGS_UNSAFE_STACK
+- jmp paranoid_exit_restore
+ paranoid_exit_no_swapgs:
+- TRACE_IRQS_IRETQ_DEBUG
+-paranoid_exit_restore:
+ RESTORE_EXTRA_REGS
+ RESTORE_C_REGS
+ REMOVE_PT_GPREGS_FROM_STACK 8
+@@ -1075,6 +1136,13 @@ ENTRY(error_entry)
+ cld
+ SAVE_C_REGS 8
+ SAVE_EXTRA_REGS 8
++ /*
++ * error_entry() always returns with a kernel gsbase and
++ * CR3. We must also have a kernel CR3/gsbase before
++ * calling TRACE_IRQS_*. Just unconditionally switch to
++ * the kernel CR3 here.
++ */
++ SWITCH_KERNEL_CR3
+ xorl %ebx, %ebx
+ testb $3, CS+8(%rsp)
+ jz .Lerror_kernelspace
+@@ -1235,6 +1303,10 @@ ENTRY(nmi)
+ */
+
+ SWAPGS_UNSAFE_STACK
++ /*
++ * percpu variables are mapped with user CR3, so no need
++ * to switch CR3 here.
++ */
+ cld
+ movq %rsp, %rdx
+ movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
+@@ -1268,12 +1340,34 @@ ENTRY(nmi)
+
+ movq %rsp, %rdi
+ movq $-1, %rsi
++#ifdef CONFIG_PAGE_TABLE_ISOLATION
++ /* Unconditionally use kernel CR3 for do_nmi() */
++ /* %rax is saved above, so OK to clobber here */
++ ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER
++ /* If PCID enabled, NOFLUSH now and NOFLUSH on return */
++ ALTERNATIVE "", "bts $63, %rax", X86_FEATURE_PCID
++ pushq %rax
++ /* mask off "user" bit of pgd address and 12 PCID bits: */
++ andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax
++ movq %rax, %cr3
++2:
++#endif
+ call do_nmi
+
++#ifdef CONFIG_PAGE_TABLE_ISOLATION
++ /*
++ * Unconditionally restore CR3. I know we return to
++ * kernel code that needs user CR3, but do we ever return
++ * to "user mode" where we need the kernel CR3?
++ */
++ ALTERNATIVE "", "popq %rax; movq %rax, %cr3", X86_FEATURE_KAISER
++#endif
++
+ /*
+ * Return back to user mode. We must *not* do the normal exit
+- * work, because we don't want to enable interrupts. Fortunately,
+- * do_nmi doesn't modify pt_regs.
++ * work, because we don't want to enable interrupts. Do not
++ * switch to user CR3: we might be going back to kernel code
++ * that had a user CR3 set.
+ */
+ SWAPGS
+ jmp restore_c_regs_and_iret
+@@ -1470,22 +1564,55 @@ end_repeat_nmi:
+ ALLOC_PT_GPREGS_ON_STACK
+
+ /*
+- * Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit
+- * as we should not be calling schedule in NMI context.
+- * Even with normal interrupts enabled. An NMI should not be
+- * setting NEED_RESCHED or anything that normal interrupts and
+- * exceptions might do.
++ * Use the same approach as paranoid_entry to handle SWAPGS, but
++ * without CR3 handling since we do that differently in NMIs. No
++ * need to use paranoid_exit as we should not be calling schedule
++ * in NMI context. Even with normal interrupts enabled. An NMI
++ * should not be setting NEED_RESCHED or anything that normal
++ * interrupts and exceptions might do.
+ */
+- call paranoid_entry
+-
+- /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
++ cld
++ SAVE_C_REGS
++ SAVE_EXTRA_REGS
++ movl $1, %ebx
++ movl $MSR_GS_BASE, %ecx
++ rdmsr
++ testl %edx, %edx
++ js 1f /* negative -> in kernel */
++ SWAPGS
++ xorl %ebx, %ebx
++1:
+ movq %rsp, %rdi
+ movq $-1, %rsi
++#ifdef CONFIG_PAGE_TABLE_ISOLATION
++ /* Unconditionally use kernel CR3 for do_nmi() */
++ /* %rax is saved above, so OK to clobber here */
++ ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER
++ /* If PCID enabled, NOFLUSH now and NOFLUSH on return */
++ ALTERNATIVE "", "bts $63, %rax", X86_FEATURE_PCID
++ pushq %rax
++ /* mask off "user" bit of pgd address and 12 PCID bits: */
++ andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax
++ movq %rax, %cr3
++2:
++#endif
++
++ /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
+ call do_nmi
+
++#ifdef CONFIG_PAGE_TABLE_ISOLATION
++ /*
++ * Unconditionally restore CR3. We might be returning to
++ * kernel code that needs user CR3, like just just before
++ * a sysret.
++ */
++ ALTERNATIVE "", "popq %rax; movq %rax, %cr3", X86_FEATURE_KAISER
++#endif
++
+ testl %ebx, %ebx /* swapgs needed? */
+ jnz nmi_restore
+ nmi_swapgs:
++ /* We fixed up CR3 above, so no need to switch it here */
+ SWAPGS_UNSAFE_STACK
+ nmi_restore:
+ RESTORE_EXTRA_REGS
+diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
+index e1721dafbcb1..d76a97653980 100644
+--- a/arch/x86/entry/entry_64_compat.S
++++ b/arch/x86/entry/entry_64_compat.S
+@@ -13,6 +13,8 @@
+ #include <asm/irqflags.h>
+ #include <asm/asm.h>
+ #include <asm/smap.h>
++#include <asm/pgtable_types.h>
++#include <asm/kaiser.h>
+ #include <linux/linkage.h>
+ #include <linux/err.h>
+
+@@ -48,6 +50,7 @@
+ ENTRY(entry_SYSENTER_compat)
+ /* Interrupts are off on entry. */
+ SWAPGS_UNSAFE_STACK
++ SWITCH_KERNEL_CR3_NO_STACK
+ movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
+
+ /*
+@@ -184,6 +187,7 @@ ENDPROC(entry_SYSENTER_compat)
+ ENTRY(entry_SYSCALL_compat)
+ /* Interrupts are off on entry. */
+ SWAPGS_UNSAFE_STACK
++ SWITCH_KERNEL_CR3_NO_STACK
+
+ /* Stash user ESP and switch to the kernel stack. */
+ movl %esp, %r8d
+@@ -259,6 +263,7 @@ sysret32_from_system_call:
+ xorq %r8, %r8
+ xorq %r9, %r9
+ xorq %r10, %r10
++ SWITCH_USER_CR3
+ movq RSP-ORIG_RAX(%rsp), %rsp
+ swapgs
+ sysretl
+@@ -297,7 +302,7 @@ ENTRY(entry_INT80_compat)
+ PARAVIRT_ADJUST_EXCEPTION_FRAME
+ ASM_CLAC /* Do this early to minimize exposure */
+ SWAPGS
+-
++ SWITCH_KERNEL_CR3_NO_STACK
+ /*
+ * User tracing code (ptrace or signal handlers) might assume that
+ * the saved RAX contains a 32-bit number when we're invoking a 32-bit
+@@ -338,6 +343,7 @@ ENTRY(entry_INT80_compat)
+
+ /* Go back to user mode. */
+ TRACE_IRQS_ON
++ SWITCH_USER_CR3
+ SWAPGS
+ jmp restore_regs_and_iret
+ END(entry_INT80_compat)
+diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
+index 9dfeeeca0ea8..8e7a3f1df3a5 100644
+--- a/arch/x86/events/intel/ds.c
++++ b/arch/x86/events/intel/ds.c
+@@ -2,11 +2,15 @@
+ #include <linux/types.h>
+ #include <linux/slab.h>
+
++#include <asm/kaiser.h>
+ #include <asm/perf_event.h>
+ #include <asm/insn.h>
+
+ #include "../perf_event.h"
+
++static
++DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct debug_store, cpu_debug_store);
++
+ /* The size of a BTS record in bytes: */
+ #define BTS_RECORD_SIZE 24
+
+@@ -268,6 +272,39 @@ void fini_debug_store_on_cpu(int cpu)
+
+ static DEFINE_PER_CPU(void *, insn_buffer);
+
++static void *dsalloc(size_t size, gfp_t flags, int node)
++{
++#ifdef CONFIG_PAGE_TABLE_ISOLATION
++ unsigned int order = get_order(size);
++ struct page *page;
++ unsigned long addr;
++
++ page = __alloc_pages_node(node, flags | __GFP_ZERO, order);
++ if (!page)
++ return NULL;
++ addr = (unsigned long)page_address(page);
++ if (kaiser_add_mapping(addr, size, __PAGE_KERNEL) < 0) {
++ __free_pages(page, order);
++ addr = 0;
++ }
++ return (void *)addr;
++#else
++ return kmalloc_node(size, flags | __GFP_ZERO, node);
++#endif
++}
++
++static void dsfree(const void *buffer, size_t size)
++{
++#ifdef CONFIG_PAGE_TABLE_ISOLATION
++ if (!buffer)
++ return;
++ kaiser_remove_mapping((unsigned long)buffer, size);
++ free_pages((unsigned long)buffer, get_order(size));
++#else
++ kfree(buffer);
++#endif
++}
++
+ static int alloc_pebs_buffer(int cpu)
+ {
+ struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+@@ -278,7 +315,7 @@ static int alloc_pebs_buffer(int cpu)
+ if (!x86_pmu.pebs)
+ return 0;
+
+- buffer = kzalloc_node(x86_pmu.pebs_buffer_size, GFP_KERNEL, node);
++ buffer = dsalloc(x86_pmu.pebs_buffer_size, GFP_KERNEL, node);
+ if (unlikely(!buffer))
+ return -ENOMEM;
+
+@@ -289,7 +326,7 @@ static int alloc_pebs_buffer(int cpu)
+ if (x86_pmu.intel_cap.pebs_format < 2) {
+ ibuffer = kzalloc_node(PEBS_FIXUP_SIZE, GFP_KERNEL, node);
+ if (!ibuffer) {
+- kfree(buffer);
++ dsfree(buffer, x86_pmu.pebs_buffer_size);
+ return -ENOMEM;
+ }
+ per_cpu(insn_buffer, cpu) = ibuffer;
+@@ -315,7 +352,8 @@ static void release_pebs_buffer(int cpu)
+ kfree(per_cpu(insn_buffer, cpu));
+ per_cpu(insn_buffer, cpu) = NULL;
+
+- kfree((void *)(unsigned long)ds->pebs_buffer_base);
++ dsfree((void *)(unsigned long)ds->pebs_buffer_base,
++ x86_pmu.pebs_buffer_size);
+ ds->pebs_buffer_base = 0;
+ }
+
+@@ -329,7 +367,7 @@ static int alloc_bts_buffer(int cpu)
+ if (!x86_pmu.bts)
+ return 0;
+
+- buffer = kzalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node);
++ buffer = dsalloc(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node);
+ if (unlikely(!buffer)) {
+ WARN_ONCE(1, "%s: BTS buffer allocation failure\n", __func__);
+ return -ENOMEM;
+@@ -355,19 +393,15 @@ static void release_bts_buffer(int cpu)
+ if (!ds || !x86_pmu.bts)
+ return;
+
+- kfree((void *)(unsigned long)ds->bts_buffer_base);
++ dsfree((void *)(unsigned long)ds->bts_buffer_base, BTS_BUFFER_SIZE);
+ ds->bts_buffer_base = 0;
+ }
+
+ static int alloc_ds_buffer(int cpu)
+ {
+- int node = cpu_to_node(cpu);
+- struct debug_store *ds;
+-
+- ds = kzalloc_node(sizeof(*ds), GFP_KERNEL, node);
+- if (unlikely(!ds))
+- return -ENOMEM;
++ struct debug_store *ds = per_cpu_ptr(&cpu_debug_store, cpu);
+
++ memset(ds, 0, sizeof(*ds));
+ per_cpu(cpu_hw_events, cpu).ds = ds;
+
+ return 0;
+@@ -381,7 +415,6 @@ static void release_ds_buffer(int cpu)
+ return;
+
+ per_cpu(cpu_hw_events, cpu).ds = NULL;
+- kfree(ds);
+ }
+
+ void release_ds_buffers(void)
+diff --git a/arch/x86/include/asm/cmdline.h b/arch/x86/include/asm/cmdline.h
+index e01f7f7ccb0c..84ae170bc3d0 100644
+--- a/arch/x86/include/asm/cmdline.h
++++ b/arch/x86/include/asm/cmdline.h
+@@ -2,5 +2,7 @@
+ #define _ASM_X86_CMDLINE_H
+
+ int cmdline_find_option_bool(const char *cmdline_ptr, const char *option);
++int cmdline_find_option(const char *cmdline_ptr, const char *option,
++ char *buffer, int bufsize);
+
+ #endif /* _ASM_X86_CMDLINE_H */
+diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
+index ed10b5bf9b93..454a37adb823 100644
+--- a/arch/x86/include/asm/cpufeatures.h
++++ b/arch/x86/include/asm/cpufeatures.h
+@@ -189,6 +189,7 @@
+
+ #define X86_FEATURE_CPB ( 7*32+ 2) /* AMD Core Performance Boost */
+ #define X86_FEATURE_EPB ( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */
++#define X86_FEATURE_INVPCID_SINGLE ( 7*32+ 4) /* Effectively INVPCID && CR4.PCIDE=1 */
+
+ #define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */
+ #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */
+@@ -197,6 +198,9 @@
+ #define X86_FEATURE_AVX512_4VNNIW (7*32+16) /* AVX-512 Neural Network Instructions */
+ #define X86_FEATURE_AVX512_4FMAPS (7*32+17) /* AVX-512 Multiply Accumulation Single precision */
+
++/* Because the ALTERNATIVE scheme is for members of the X86_FEATURE club... */
++#define X86_FEATURE_KAISER ( 7*32+31) /* CONFIG_PAGE_TABLE_ISOLATION w/o nokaiser */
++
+ /* Virtualization flags: Linux defined, word 8 */
+ #define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */
+ #define X86_FEATURE_VNMI ( 8*32+ 1) /* Intel Virtual NMI */
+diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
+index 12080d87da3b..2ed5a2b3f8f7 100644
+--- a/arch/x86/include/asm/desc.h
++++ b/arch/x86/include/asm/desc.h
+@@ -43,7 +43,7 @@ struct gdt_page {
+ struct desc_struct gdt[GDT_ENTRIES];
+ } __attribute__((aligned(PAGE_SIZE)));
+
+-DECLARE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page);
++DECLARE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(struct gdt_page, gdt_page);
+
+ static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
+ {
+diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
+index b90e1053049b..0817d63bce41 100644
+--- a/arch/x86/include/asm/hw_irq.h
++++ b/arch/x86/include/asm/hw_irq.h
+@@ -178,7 +178,7 @@ extern char irq_entries_start[];
+ #define VECTOR_RETRIGGERED ((void *)~0UL)
+
+ typedef struct irq_desc* vector_irq_t[NR_VECTORS];
+-DECLARE_PER_CPU(vector_irq_t, vector_irq);
++DECLARE_PER_CPU_USER_MAPPED(vector_irq_t, vector_irq);
+
+ #endif /* !ASSEMBLY_ */
+
+diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h
+new file mode 100644
+index 000000000000..802bbbdfe143
+--- /dev/null
++++ b/arch/x86/include/asm/kaiser.h
+@@ -0,0 +1,141 @@
++#ifndef _ASM_X86_KAISER_H
++#define _ASM_X86_KAISER_H
++
++#include <uapi/asm/processor-flags.h> /* For PCID constants */
++
++/*
++ * This file includes the definitions for the KAISER feature.
++ * KAISER is a counter measure against x86_64 side channel attacks on
++ * the kernel virtual memory. It has a shadow pgd for every process: the
++ * shadow pgd has a minimalistic kernel-set mapped, but includes the whole
++ * user memory. Within a kernel context switch, or when an interrupt is handled,
++ * the pgd is switched to the normal one. When the system switches to user mode,
++ * the shadow pgd is enabled. By this, the virtual memory caches are freed,
++ * and the user may not attack the whole kernel memory.
++ *
++ * A minimalistic kernel mapping holds the parts needed to be mapped in user
++ * mode, such as the entry/exit functions of the user space, or the stacks.
++ */
++
++#define KAISER_SHADOW_PGD_OFFSET 0x1000
++
++#ifdef __ASSEMBLY__
++#ifdef CONFIG_PAGE_TABLE_ISOLATION
++
++.macro _SWITCH_TO_KERNEL_CR3 reg
++movq %cr3, \reg
++andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), \reg
++/* If PCID enabled, set X86_CR3_PCID_NOFLUSH_BIT */
++ALTERNATIVE "", "bts $63, \reg", X86_FEATURE_PCID
++movq \reg, %cr3
++.endm
++
++.macro _SWITCH_TO_USER_CR3 reg regb
++/*
++ * regb must be the low byte portion of reg: because we have arranged
++ * for the low byte of the user PCID to serve as the high byte of NOFLUSH
++ * (0x80 for each when PCID is enabled, or 0x00 when PCID and NOFLUSH are
++ * not enabled): so that the one register can update both memory and cr3.
++ */
++movq %cr3, \reg
++orq PER_CPU_VAR(x86_cr3_pcid_user), \reg
++js 9f
++/* If PCID enabled, FLUSH this time, reset to NOFLUSH for next time */
++movb \regb, PER_CPU_VAR(x86_cr3_pcid_user+7)
++9:
++movq \reg, %cr3
++.endm
++
++.macro SWITCH_KERNEL_CR3
++ALTERNATIVE "jmp 8f", "pushq %rax", X86_FEATURE_KAISER
++_SWITCH_TO_KERNEL_CR3 %rax
++popq %rax
++8:
++.endm
++
++.macro SWITCH_USER_CR3
++ALTERNATIVE "jmp 8f", "pushq %rax", X86_FEATURE_KAISER
++_SWITCH_TO_USER_CR3 %rax %al
++popq %rax
++8:
++.endm
++
++.macro SWITCH_KERNEL_CR3_NO_STACK
++ALTERNATIVE "jmp 8f", \
++ __stringify(movq %rax, PER_CPU_VAR(unsafe_stack_register_backup)), \
++ X86_FEATURE_KAISER
++_SWITCH_TO_KERNEL_CR3 %rax
++movq PER_CPU_VAR(unsafe_stack_register_backup), %rax
++8:
++.endm
++
++#else /* CONFIG_PAGE_TABLE_ISOLATION */
++
++.macro SWITCH_KERNEL_CR3
++.endm
++.macro SWITCH_USER_CR3
++.endm
++.macro SWITCH_KERNEL_CR3_NO_STACK
++.endm
++
++#endif /* CONFIG_PAGE_TABLE_ISOLATION */
++
++#else /* __ASSEMBLY__ */
++
++#ifdef CONFIG_PAGE_TABLE_ISOLATION
++/*
++ * Upon kernel/user mode switch, it may happen that the address
++ * space has to be switched before the registers have been
++ * stored. To change the address space, another register is
++ * needed. A register therefore has to be stored/restored.
++*/
++DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
++
++DECLARE_PER_CPU(unsigned long, x86_cr3_pcid_user);
++
++extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[];
++
++extern int kaiser_enabled;
++extern void __init kaiser_check_boottime_disable(void);
++#else
++#define kaiser_enabled 0
++static inline void __init kaiser_check_boottime_disable(void) {}
++#endif /* CONFIG_PAGE_TABLE_ISOLATION */
++
++/*
++ * Kaiser function prototypes are needed even when CONFIG_PAGE_TABLE_ISOLATION is not set,
++ * so as to build with tests on kaiser_enabled instead of #ifdefs.
++ */
++
++/**
++ * kaiser_add_mapping - map a virtual memory part to the shadow (user) mapping
++ * @addr: the start address of the range
++ * @size: the size of the range
++ * @flags: The mapping flags of the pages
++ *
++ * The mapping is done on a global scope, so no bigger
++ * synchronization has to be done. the pages have to be
++ * manually unmapped again when they are not needed any longer.
++ */
++extern int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags);
++
++/**
++ * kaiser_remove_mapping - unmap a virtual memory part of the shadow mapping
++ * @addr: the start address of the range
++ * @size: the size of the range
++ */
++extern void kaiser_remove_mapping(unsigned long start, unsigned long size);
++
++/**
++ * kaiser_init - Initialize the shadow mapping
++ *
++ * Most parts of the shadow mapping can be mapped upon boot
++ * time. Only per-process things like the thread stacks
++ * or a new LDT have to be mapped at runtime. These boot-
++ * time mappings are permanent and never unmapped.
++ */
++extern void kaiser_init(void);
++
++#endif /* __ASSEMBLY */
++
++#endif /* _ASM_X86_KAISER_H */
+diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
+index 437feb436efa..2536f90cd30c 100644
+--- a/arch/x86/include/asm/pgtable.h
++++ b/arch/x86/include/asm/pgtable.h
+@@ -18,6 +18,12 @@
+ #ifndef __ASSEMBLY__
+ #include <asm/x86_init.h>
+
++#ifdef CONFIG_PAGE_TABLE_ISOLATION
++extern int kaiser_enabled;
++#else
++#define kaiser_enabled 0
++#endif
++
+ void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd);
+ void ptdump_walk_pgd_level_checkwx(void);
+
+@@ -690,7 +696,17 @@ static inline pud_t *pud_offset(pgd_t *pgd, unsigned long address)
+
+ static inline int pgd_bad(pgd_t pgd)
+ {
+- return (pgd_flags(pgd) & ~_PAGE_USER) != _KERNPG_TABLE;
++ pgdval_t ignore_flags = _PAGE_USER;
++ /*
++ * We set NX on KAISER pgds that map userspace memory so
++ * that userspace can not meaningfully use the kernel
++ * page table by accident; it will fault on the first
++ * instruction it tries to run. See native_set_pgd().
++ */
++ if (kaiser_enabled)
++ ignore_flags |= _PAGE_NX;
++
++ return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE;
+ }
+
+ static inline int pgd_none(pgd_t pgd)
+@@ -903,7 +919,15 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm,
+ */
+ static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
+ {
+- memcpy(dst, src, count * sizeof(pgd_t));
++ memcpy(dst, src, count * sizeof(pgd_t));
++#ifdef CONFIG_PAGE_TABLE_ISOLATION
++ if (kaiser_enabled) {
++ /* Clone the shadow pgd part as well */
++ memcpy(native_get_shadow_pgd(dst),
++ native_get_shadow_pgd(src),
++ count * sizeof(pgd_t));
++ }
++#endif
+ }
+
+ #define PTE_SHIFT ilog2(PTRS_PER_PTE)
+diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
+index 1cc82ece9ac1..ce97c8c6a310 100644
+--- a/arch/x86/include/asm/pgtable_64.h
++++ b/arch/x86/include/asm/pgtable_64.h
+@@ -106,9 +106,32 @@ static inline void native_pud_clear(pud_t *pud)
+ native_set_pud(pud, native_make_pud(0));
+ }
+
++#ifdef CONFIG_PAGE_TABLE_ISOLATION
++extern pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd);
++
++static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp)
++{
++#ifdef CONFIG_DEBUG_VM
++ /* linux/mmdebug.h may not have been included at this point */
++ BUG_ON(!kaiser_enabled);
++#endif
++ return (pgd_t *)((unsigned long)pgdp | (unsigned long)PAGE_SIZE);
++}
++#else
++static inline pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd)
++{
++ return pgd;
++}
++static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp)
++{
++ BUILD_BUG_ON(1);
++ return NULL;
++}
++#endif /* CONFIG_PAGE_TABLE_ISOLATION */
++
+ static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
+ {
+- *pgdp = pgd;
++ *pgdp = kaiser_set_shadow_pgd(pgdp, pgd);
+ }
+
+ static inline void native_pgd_clear(pgd_t *pgd)
+diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
+index 8b4de22d6429..f1c8ac468292 100644
+--- a/arch/x86/include/asm/pgtable_types.h
++++ b/arch/x86/include/asm/pgtable_types.h
+@@ -119,7 +119,7 @@
+ #define _PAGE_DEVMAP (_AT(pteval_t, 0))
+ #endif
+
+-#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
++#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
+
+ #define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \
+ _PAGE_ACCESSED | _PAGE_DIRTY)
+@@ -137,6 +137,33 @@
+ _PAGE_SOFT_DIRTY)
+ #define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE)
+
++/* The ASID is the lower 12 bits of CR3 */
++#define X86_CR3_PCID_ASID_MASK (_AC((1<<12)-1,UL))
++
++/* Mask for all the PCID-related bits in CR3: */
++#define X86_CR3_PCID_MASK (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_MASK)
++#define X86_CR3_PCID_ASID_KERN (_AC(0x0,UL))
++
++#if defined(CONFIG_PAGE_TABLE_ISOLATION) && defined(CONFIG_X86_64)
++/* Let X86_CR3_PCID_ASID_USER be usable for the X86_CR3_PCID_NOFLUSH bit */
++#define X86_CR3_PCID_ASID_USER (_AC(0x80,UL))
++
++#define X86_CR3_PCID_KERN_FLUSH (X86_CR3_PCID_ASID_KERN)
++#define X86_CR3_PCID_USER_FLUSH (X86_CR3_PCID_ASID_USER)
++#define X86_CR3_PCID_KERN_NOFLUSH (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_KERN)
++#define X86_CR3_PCID_USER_NOFLUSH (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_USER)
++#else
++#define X86_CR3_PCID_ASID_USER (_AC(0x0,UL))
++/*
++ * PCIDs are unsupported on 32-bit and none of these bits can be
++ * set in CR3:
++ */
++#define X86_CR3_PCID_KERN_FLUSH (0)
++#define X86_CR3_PCID_USER_FLUSH (0)
++#define X86_CR3_PCID_KERN_NOFLUSH (0)
++#define X86_CR3_PCID_USER_NOFLUSH (0)
++#endif
++
+ /*
+ * The cache modes defined here are used to translate between pure SW usage
+ * and the HW defined cache mode bits and/or PAT entries.
+diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
+index 83db0eae9979..8cb52ee3ade6 100644
+--- a/arch/x86/include/asm/processor.h
++++ b/arch/x86/include/asm/processor.h
+@@ -308,7 +308,7 @@ struct tss_struct {
+
+ } ____cacheline_aligned;
+
+-DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss);
++DECLARE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct tss_struct, cpu_tss);
+
+ #ifdef CONFIG_X86_32
+ DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack);
+diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
+index 7d2ea6b1f7d9..94146f665a3c 100644
+--- a/arch/x86/include/asm/tlbflush.h
++++ b/arch/x86/include/asm/tlbflush.h
+@@ -132,6 +132,24 @@ static inline void cr4_set_bits_and_update_boot(unsigned long mask)
+ cr4_set_bits(mask);
+ }
+
++/*
++ * Declare a couple of kaiser interfaces here for convenience,
++ * to avoid the need for asm/kaiser.h in unexpected places.
++ */
++#ifdef CONFIG_PAGE_TABLE_ISOLATION
++extern int kaiser_enabled;
++extern void kaiser_setup_pcid(void);
++extern void kaiser_flush_tlb_on_return_to_user(void);
++#else
++#define kaiser_enabled 0
++static inline void kaiser_setup_pcid(void)
++{
++}
++static inline void kaiser_flush_tlb_on_return_to_user(void)
++{
++}
++#endif
++
+ static inline void __native_flush_tlb(void)
+ {
+ /*
+@@ -140,6 +158,8 @@ static inline void __native_flush_tlb(void)
+ * back:
+ */
+ preempt_disable();
++ if (kaiser_enabled)
++ kaiser_flush_tlb_on_return_to_user();
+ native_write_cr3(native_read_cr3());
+ preempt_enable();
+ }
+@@ -149,20 +169,27 @@ static inline void __native_flush_tlb_global_irq_disabled(void)
+ unsigned long cr4;
+
+ cr4 = this_cpu_read(cpu_tlbstate.cr4);
+- /* clear PGE */
+- native_write_cr4(cr4 & ~X86_CR4_PGE);
+- /* write old PGE again and flush TLBs */
+- native_write_cr4(cr4);
++ if (cr4 & X86_CR4_PGE) {
++ /* clear PGE and flush TLB of all entries */
++ native_write_cr4(cr4 & ~X86_CR4_PGE);
++ /* restore PGE as it was before */
++ native_write_cr4(cr4);
++ } else {
++ /* do it with cr3, letting kaiser flush user PCID */
++ __native_flush_tlb();
++ }
+ }
+
+ static inline void __native_flush_tlb_global(void)
+ {
+ unsigned long flags;
+
+- if (static_cpu_has(X86_FEATURE_INVPCID)) {
++ if (this_cpu_has(X86_FEATURE_INVPCID)) {
+ /*
+ * Using INVPCID is considerably faster than a pair of writes
+ * to CR4 sandwiched inside an IRQ flag save/restore.
++ *
++ * Note, this works with CR4.PCIDE=0 or 1.
+ */
+ invpcid_flush_all();
+ return;
+@@ -174,24 +201,45 @@ static inline void __native_flush_tlb_global(void)
+ * be called from deep inside debugging code.)
+ */
+ raw_local_irq_save(flags);
+-
+ __native_flush_tlb_global_irq_disabled();
+-
+ raw_local_irq_restore(flags);
+ }
+
+ static inline void __native_flush_tlb_single(unsigned long addr)
+ {
+- asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
++ /*
++ * SIMICS #GP's if you run INVPCID with type 2/3
++ * and X86_CR4_PCIDE clear. Shame!
++ *
++ * The ASIDs used below are hard-coded. But, we must not
++ * call invpcid(type=1/2) before CR4.PCIDE=1. Just call
++ * invlpg in the case we are called early.
++ */
++
++ if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE)) {
++ if (kaiser_enabled)
++ kaiser_flush_tlb_on_return_to_user();
++ asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
++ return;
++ }
++ /* Flush the address out of both PCIDs. */
++ /*
++ * An optimization here might be to determine addresses
++ * that are only kernel-mapped and only flush the kernel
++ * ASID. But, userspace flushes are probably much more
++ * important performance-wise.
++ *
++ * Make sure to do only a single invpcid when KAISER is
++ * disabled and we have only a single ASID.
++ */
++ if (kaiser_enabled)
++ invpcid_flush_one(X86_CR3_PCID_ASID_USER, addr);
++ invpcid_flush_one(X86_CR3_PCID_ASID_KERN, addr);
+ }
+
+ static inline void __flush_tlb_all(void)
+ {
+- if (boot_cpu_has(X86_FEATURE_PGE))
+- __flush_tlb_global();
+- else
+- __flush_tlb();
+-
++ __flush_tlb_global();
+ /*
+ * Note: if we somehow had PCID but not PGE, then this wouldn't work --
+ * we'd end up flushing kernel translations for the current ASID but
+diff --git a/arch/x86/include/uapi/asm/processor-flags.h b/arch/x86/include/uapi/asm/processor-flags.h
+index 567de50a4c2a..6768d1321016 100644
+--- a/arch/x86/include/uapi/asm/processor-flags.h
++++ b/arch/x86/include/uapi/asm/processor-flags.h
+@@ -77,7 +77,8 @@
+ #define X86_CR3_PWT _BITUL(X86_CR3_PWT_BIT)
+ #define X86_CR3_PCD_BIT 4 /* Page Cache Disable */
+ #define X86_CR3_PCD _BITUL(X86_CR3_PCD_BIT)
+-#define X86_CR3_PCID_MASK _AC(0x00000fff,UL) /* PCID Mask */
++#define X86_CR3_PCID_NOFLUSH_BIT 63 /* Preserve old PCID */
++#define X86_CR3_PCID_NOFLUSH _BITULL(X86_CR3_PCID_NOFLUSH_BIT)
+
+ /*
+ * Intel CPU features in CR4
+diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
+index 91588be529b9..918e44772b04 100644
+--- a/arch/x86/kernel/cpu/common.c
++++ b/arch/x86/kernel/cpu/common.c
+@@ -93,7 +93,7 @@ static const struct cpu_dev default_cpu = {
+
+ static const struct cpu_dev *this_cpu = &default_cpu;
+
+-DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
++DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(struct gdt_page, gdt_page) = { .gdt = {
+ #ifdef CONFIG_X86_64
+ /*
+ * We need valid kernel segments for data and code in long mode too
+@@ -327,8 +327,21 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c)
+ static void setup_pcid(struct cpuinfo_x86 *c)
+ {
+ if (cpu_has(c, X86_FEATURE_PCID)) {
+- if (cpu_has(c, X86_FEATURE_PGE)) {
++ if (cpu_has(c, X86_FEATURE_PGE) || kaiser_enabled) {
+ cr4_set_bits(X86_CR4_PCIDE);
++ /*
++ * INVPCID has two "groups" of types:
++ * 1/2: Invalidate an individual address
++ * 3/4: Invalidate all contexts
++ *
++ * 1/2 take a PCID, but 3/4 do not. So, 3/4
++ * ignore the PCID argument in the descriptor.
++ * But, we have to be careful not to call 1/2
++ * with an actual non-zero PCID in them before
++ * we do the above cr4_set_bits().
++ */
++ if (cpu_has(c, X86_FEATURE_INVPCID))
++ set_cpu_cap(c, X86_FEATURE_INVPCID_SINGLE);
+ } else {
+ /*
+ * flush_tlb_all(), as currently implemented, won't
+@@ -341,6 +354,7 @@ static void setup_pcid(struct cpuinfo_x86 *c)
+ clear_cpu_cap(c, X86_FEATURE_PCID);
+ }
+ }
++ kaiser_setup_pcid();
+ }
+
+ /*
+@@ -1365,7 +1379,7 @@ static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = {
+ [DEBUG_STACK - 1] = DEBUG_STKSZ
+ };
+
+-static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
++DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(char, exception_stacks
+ [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);
+
+ /* May not be marked __init: used by software suspend */
+@@ -1523,6 +1537,14 @@ void cpu_init(void)
+ * try to read it.
+ */
+ cr4_init_shadow();
++ if (!kaiser_enabled) {
++ /*
++ * secondary_startup_64() deferred setting PGE in cr4:
++ * probe_page_size_mask() sets it on the boot cpu,
++ * but it needs to be set on each secondary cpu.
++ */
++ cr4_set_bits(X86_CR4_PGE);
++ }
+
+ /*
+ * Load microcode on this cpu if a valid microcode is available.
+diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
+index 04f89caef9c4..e33b38541be3 100644
+--- a/arch/x86/kernel/espfix_64.c
++++ b/arch/x86/kernel/espfix_64.c
+@@ -41,6 +41,7 @@
+ #include <asm/pgalloc.h>
+ #include <asm/setup.h>
+ #include <asm/espfix.h>
++#include <asm/kaiser.h>
+
+ /*
+ * Note: we only need 6*8 = 48 bytes for the espfix stack, but round
+@@ -126,6 +127,15 @@ void __init init_espfix_bsp(void)
+ /* Install the espfix pud into the kernel page directory */
+ pgd_p = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)];
+ pgd_populate(&init_mm, pgd_p, (pud_t *)espfix_pud_page);
++ /*
++ * Just copy the top-level PGD that is mapping the espfix
++ * area to ensure it is mapped into the shadow user page
++ * tables.
++ */
++ if (kaiser_enabled) {
++ set_pgd(native_get_shadow_pgd(pgd_p),
++ __pgd(_KERNPG_TABLE | __pa((pud_t *)espfix_pud_page)));
++ }
+
+ /* Randomize the locations */
+ init_espfix_random();
+diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
+index b4421cc191b0..67cd7c1b99da 100644
+--- a/arch/x86/kernel/head_64.S
++++ b/arch/x86/kernel/head_64.S
+@@ -190,8 +190,8 @@ ENTRY(secondary_startup_64)
+ movq $(init_level4_pgt - __START_KERNEL_map), %rax
+ 1:
+
+- /* Enable PAE mode and PGE */
+- movl $(X86_CR4_PAE | X86_CR4_PGE), %ecx
++ /* Enable PAE and PSE, but defer PGE until kaiser_enabled is decided */
++ movl $(X86_CR4_PAE | X86_CR4_PSE), %ecx
+ movq %rcx, %cr4
+
+ /* Setup early boot stage 4 level pagetables. */
+@@ -405,6 +405,27 @@ GLOBAL(early_recursion_flag)
+ .balign PAGE_SIZE; \
+ GLOBAL(name)
+
++#ifdef CONFIG_PAGE_TABLE_ISOLATION
++/*
++ * Each PGD needs to be 8k long and 8k aligned. We do not
++ * ever go out to userspace with these, so we do not
++ * strictly *need* the second page, but this allows us to
++ * have a single set_pgd() implementation that does not
++ * need to worry about whether it has 4k or 8k to work
++ * with.
++ *
++ * This ensures PGDs are 8k long:
++ */
++#define KAISER_USER_PGD_FILL 512
++/* This ensures they are 8k-aligned: */
++#define NEXT_PGD_PAGE(name) \
++ .balign 2 * PAGE_SIZE; \
++GLOBAL(name)
++#else
++#define NEXT_PGD_PAGE(name) NEXT_PAGE(name)
++#define KAISER_USER_PGD_FILL 0
++#endif
++
+ /* Automate the creation of 1 to 1 mapping pmd entries */
+ #define PMDS(START, PERM, COUNT) \
+ i = 0 ; \
+@@ -414,9 +435,10 @@ GLOBAL(name)
+ .endr
+
+ __INITDATA
+-NEXT_PAGE(early_level4_pgt)
++NEXT_PGD_PAGE(early_level4_pgt)
+ .fill 511,8,0
+ .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
++ .fill KAISER_USER_PGD_FILL,8,0
+
+ NEXT_PAGE(early_dynamic_pgts)
+ .fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0
+@@ -424,16 +446,18 @@ NEXT_PAGE(early_dynamic_pgts)
+ .data
+
+ #ifndef CONFIG_XEN
+-NEXT_PAGE(init_level4_pgt)
++NEXT_PGD_PAGE(init_level4_pgt)
+ .fill 512,8,0
++ .fill KAISER_USER_PGD_FILL,8,0
+ #else
+-NEXT_PAGE(init_level4_pgt)
++NEXT_PGD_PAGE(init_level4_pgt)
+ .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
+ .org init_level4_pgt + L4_PAGE_OFFSET*8, 0
+ .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
+ .org init_level4_pgt + L4_START_KERNEL*8, 0
+ /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
+ .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
++ .fill KAISER_USER_PGD_FILL,8,0
+
+ NEXT_PAGE(level3_ident_pgt)
+ .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
+@@ -444,6 +468,7 @@ NEXT_PAGE(level2_ident_pgt)
+ */
+ PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
+ #endif
++ .fill KAISER_USER_PGD_FILL,8,0
+
+ NEXT_PAGE(level3_kernel_pgt)
+ .fill L3_START_KERNEL,8,0
+diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
+index 1423ab1b0312..f480b38a03c3 100644
+--- a/arch/x86/kernel/irqinit.c
++++ b/arch/x86/kernel/irqinit.c
+@@ -51,7 +51,7 @@ static struct irqaction irq2 = {
+ .flags = IRQF_NO_THREAD,
+ };
+
+-DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
++DEFINE_PER_CPU_USER_MAPPED(vector_irq_t, vector_irq) = {
+ [0 ... NR_VECTORS - 1] = VECTOR_UNUSED,
+ };
+
+diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
+index 5f70014ca602..8bc68cfc0d33 100644
+--- a/arch/x86/kernel/ldt.c
++++ b/arch/x86/kernel/ldt.c
+@@ -16,6 +16,7 @@
+ #include <linux/slab.h>
+ #include <linux/vmalloc.h>
+ #include <linux/uaccess.h>
++#include <linux/kaiser.h>
+
+ #include <asm/ldt.h>
+ #include <asm/desc.h>
+@@ -34,11 +35,21 @@ static void flush_ldt(void *current_mm)
+ set_ldt(pc->ldt->entries, pc->ldt->size);
+ }
+
++static void __free_ldt_struct(struct ldt_struct *ldt)
++{
++ if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE)
++ vfree(ldt->entries);
++ else
++ free_page((unsigned long)ldt->entries);
++ kfree(ldt);
++}
++
+ /* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. */
+ static struct ldt_struct *alloc_ldt_struct(int size)
+ {
+ struct ldt_struct *new_ldt;
+ int alloc_size;
++ int ret;
+
+ if (size > LDT_ENTRIES)
+ return NULL;
+@@ -66,7 +77,13 @@ static struct ldt_struct *alloc_ldt_struct(int size)
+ return NULL;
+ }
+
++ ret = kaiser_add_mapping((unsigned long)new_ldt->entries, alloc_size,
++ __PAGE_KERNEL);
+ new_ldt->size = size;
++ if (ret) {
++ __free_ldt_struct(new_ldt);
++ return NULL;
++ }
+ return new_ldt;
+ }
+
+@@ -92,12 +109,10 @@ static void free_ldt_struct(struct ldt_struct *ldt)
+ if (likely(!ldt))
+ return;
+
++ kaiser_remove_mapping((unsigned long)ldt->entries,
++ ldt->size * LDT_ENTRY_SIZE);
+ paravirt_free_ldt(ldt->entries, ldt->size);
+- if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE)
+- vfree(ldt->entries);
+- else
+- free_page((unsigned long)ldt->entries);
+- kfree(ldt);
++ __free_ldt_struct(ldt);
+ }
+
+ /*
+diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c
+index bb3840cedb4f..ee43b36075c7 100644
+--- a/arch/x86/kernel/paravirt_patch_64.c
++++ b/arch/x86/kernel/paravirt_patch_64.c
+@@ -9,7 +9,6 @@ DEF_NATIVE(pv_irq_ops, save_fl, "pushfq; popq %rax");
+ DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax");
+ DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax");
+ DEF_NATIVE(pv_mmu_ops, write_cr3, "movq %rdi, %cr3");
+-DEF_NATIVE(pv_mmu_ops, flush_tlb_single, "invlpg (%rdi)");
+ DEF_NATIVE(pv_cpu_ops, clts, "clts");
+ DEF_NATIVE(pv_cpu_ops, wbinvd, "wbinvd");
+
+@@ -59,7 +58,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
+ PATCH_SITE(pv_mmu_ops, read_cr3);
+ PATCH_SITE(pv_mmu_ops, write_cr3);
+ PATCH_SITE(pv_cpu_ops, clts);
+- PATCH_SITE(pv_mmu_ops, flush_tlb_single);
+ PATCH_SITE(pv_cpu_ops, wbinvd);
+ #if defined(CONFIG_PARAVIRT_SPINLOCKS)
+ case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock):
+diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
+index 8e10e72bf6ee..a55b32007785 100644
+--- a/arch/x86/kernel/process.c
++++ b/arch/x86/kernel/process.c
+@@ -41,7 +41,7 @@
+ * section. Since TSS's are completely CPU-local, we want them
+ * on exact cacheline boundaries, to eliminate cacheline ping-pong.
+ */
+-__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
++__visible DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct tss_struct, cpu_tss) = {
+ .x86_tss = {
+ .sp0 = TOP_OF_INIT_STACK,
+ #ifdef CONFIG_X86_32
+diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
+index feaab07fa124..6b55012d02a3 100644
+--- a/arch/x86/kernel/setup.c
++++ b/arch/x86/kernel/setup.c
+@@ -114,6 +114,7 @@
+ #include <asm/microcode.h>
+ #include <asm/mmu_context.h>
+ #include <asm/kaslr.h>
++#include <asm/kaiser.h>
+
+ /*
+ * max_low_pfn_mapped: highest direct mapped pfn under 4GB
+@@ -1019,6 +1020,12 @@ void __init setup_arch(char **cmdline_p)
+ */
+ init_hypervisor_platform();
+
++ /*
++ * This needs to happen right after XENPV is set on xen and
++ * kaiser_enabled is checked below in cleanup_highmap().
++ */
++ kaiser_check_boottime_disable();
++
+ x86_init.resources.probe_roms();
+
+ /* after parse_early_param, so could debug it */
+diff --git a/arch/x86/kernel/tracepoint.c b/arch/x86/kernel/tracepoint.c
+index 1c113db9ed57..2bb5ee464df3 100644
+--- a/arch/x86/kernel/tracepoint.c
++++ b/arch/x86/kernel/tracepoint.c
+@@ -9,10 +9,12 @@
+ #include <linux/atomic.h>
+
+ atomic_t trace_idt_ctr = ATOMIC_INIT(0);
++__aligned(PAGE_SIZE)
+ struct desc_ptr trace_idt_descr = { NR_VECTORS * 16 - 1,
+ (unsigned long) trace_idt_table };
+
+ /* No need to be aligned, but done to keep all IDTs defined the same way. */
++__aligned(PAGE_SIZE)
+ gate_desc trace_idt_table[NR_VECTORS] __page_aligned_bss;
+
+ static int trace_irq_vector_refcount;
+diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
+index 7e28e6c877d9..73304b1a03cc 100644
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -773,7 +773,8 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+ return 1;
+
+ /* PCID can not be enabled when cr3[11:0]!=000H or EFER.LMA=0 */
+- if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_MASK) || !is_long_mode(vcpu))
++ if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_ASID_MASK) ||
++ !is_long_mode(vcpu))
+ return 1;
+ }
+
+diff --git a/arch/x86/lib/cmdline.c b/arch/x86/lib/cmdline.c
+index 5cc78bf57232..3261abb21ef4 100644
+--- a/arch/x86/lib/cmdline.c
++++ b/arch/x86/lib/cmdline.c
+@@ -104,7 +104,112 @@ __cmdline_find_option_bool(const char *cmdline, int max_cmdline_size,
+ return 0; /* Buffer overrun */
+ }
+
++/*
++ * Find a non-boolean option (i.e. option=argument). In accordance with
++ * standard Linux practice, if this option is repeated, this returns the
++ * last instance on the command line.
++ *
++ * @cmdline: the cmdline string
++ * @max_cmdline_size: the maximum size of cmdline
++ * @option: option string to look for
++ * @buffer: memory buffer to return the option argument
++ * @bufsize: size of the supplied memory buffer
++ *
++ * Returns the length of the argument (regardless of if it was
++ * truncated to fit in the buffer), or -1 on not found.
++ */
++static int
++__cmdline_find_option(const char *cmdline, int max_cmdline_size,
++ const char *option, char *buffer, int bufsize)
++{
++ char c;
++ int pos = 0, len = -1;
++ const char *opptr = NULL;
++ char *bufptr = buffer;
++ enum {
++ st_wordstart = 0, /* Start of word/after whitespace */
++ st_wordcmp, /* Comparing this word */
++ st_wordskip, /* Miscompare, skip */
++ st_bufcpy, /* Copying this to buffer */
++ } state = st_wordstart;
++
++ if (!cmdline)
++ return -1; /* No command line */
++
++ /*
++ * This 'pos' check ensures we do not overrun
++ * a non-NULL-terminated 'cmdline'
++ */
++ while (pos++ < max_cmdline_size) {
++ c = *(char *)cmdline++;
++ if (!c)
++ break;
++
++ switch (state) {
++ case st_wordstart:
++ if (myisspace(c))
++ break;
++
++ state = st_wordcmp;
++ opptr = option;
++ /* fall through */
++
++ case st_wordcmp:
++ if ((c == '=') && !*opptr) {
++ /*
++ * We matched all the way to the end of the
++ * option we were looking for, prepare to
++ * copy the argument.
++ */
++ len = 0;
++ bufptr = buffer;
++ state = st_bufcpy;
++ break;
++ } else if (c == *opptr++) {
++ /*
++ * We are currently matching, so continue
++ * to the next character on the cmdline.
++ */
++ break;
++ }
++ state = st_wordskip;
++ /* fall through */
++
++ case st_wordskip:
++ if (myisspace(c))
++ state = st_wordstart;
++ break;
++
++ case st_bufcpy:
++ if (myisspace(c)) {
++ state = st_wordstart;
++ } else {
++ /*
++ * Increment len, but don't overrun the
++ * supplied buffer and leave room for the
++ * NULL terminator.
++ */
++ if (++len < bufsize)
++ *bufptr++ = c;
++ }
++ break;
++ }
++ }
++
++ if (bufsize)
++ *bufptr = '\0';
++
++ return len;
++}
++
+ int cmdline_find_option_bool(const char *cmdline, const char *option)
+ {
+ return __cmdline_find_option_bool(cmdline, COMMAND_LINE_SIZE, option);
+ }
++
++int cmdline_find_option(const char *cmdline, const char *option, char *buffer,
++ int bufsize)
++{
++ return __cmdline_find_option(cmdline, COMMAND_LINE_SIZE, option,
++ buffer, bufsize);
++}
+diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
+index 96d2b847e09e..c548b46100cb 100644
+--- a/arch/x86/mm/Makefile
++++ b/arch/x86/mm/Makefile
+@@ -37,5 +37,5 @@ obj-$(CONFIG_NUMA_EMU) += numa_emulation.o
+
+ obj-$(CONFIG_X86_INTEL_MPX) += mpx.o
+ obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o
+-obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o
+-
++obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o
++obj-$(CONFIG_PAGE_TABLE_ISOLATION) += kaiser.o
+diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
+index 0381638168d1..1e779bca4f3e 100644
+--- a/arch/x86/mm/init.c
++++ b/arch/x86/mm/init.c
+@@ -177,7 +177,7 @@ static void __init probe_page_size_mask(void)
+ cr4_set_bits_and_update_boot(X86_CR4_PSE);
+
+ /* Enable PGE if available */
+- if (boot_cpu_has(X86_FEATURE_PGE)) {
++ if (boot_cpu_has(X86_FEATURE_PGE) && !kaiser_enabled) {
+ cr4_set_bits_and_update_boot(X86_CR4_PGE);
+ __supported_pte_mask |= _PAGE_GLOBAL;
+ } else
+diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
+index 3e27ded6ac65..7df8e3a79dc0 100644
+--- a/arch/x86/mm/init_64.c
++++ b/arch/x86/mm/init_64.c
+@@ -324,6 +324,16 @@ void __init cleanup_highmap(void)
+ continue;
+ if (vaddr < (unsigned long) _text || vaddr > end)
+ set_pmd(pmd, __pmd(0));
++ else if (kaiser_enabled) {
++ /*
++ * level2_kernel_pgt is initialized with _PAGE_GLOBAL:
++ * clear that now. This is not important, so long as
++ * CR4.PGE remains clear, but it removes an anomaly.
++ * Physical mapping setup below avoids _PAGE_GLOBAL
++ * by use of massage_pgprot() inside pfn_pte() etc.
++ */
++ set_pmd(pmd, pmd_clear_flags(*pmd, _PAGE_GLOBAL));
++ }
+ }
+ }
+
+diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c
+new file mode 100644
+index 000000000000..d8376b4ad9f0
+--- /dev/null
++++ b/arch/x86/mm/kaiser.c
+@@ -0,0 +1,455 @@
++#include <linux/bug.h>
++#include <linux/kernel.h>
++#include <linux/errno.h>
++#include <linux/string.h>
++#include <linux/types.h>
++#include <linux/bug.h>
++#include <linux/init.h>
++#include <linux/interrupt.h>
++#include <linux/spinlock.h>
++#include <linux/mm.h>
++#include <linux/uaccess.h>
++
++#undef pr_fmt
++#define pr_fmt(fmt) "Kernel/User page tables isolation: " fmt
++
++#include <asm/kaiser.h>
++#include <asm/tlbflush.h> /* to verify its kaiser declarations */
++#include <asm/pgtable.h>
++#include <asm/pgalloc.h>
++#include <asm/desc.h>
++#include <asm/cmdline.h>
++
++int kaiser_enabled __read_mostly = 1;
++EXPORT_SYMBOL(kaiser_enabled); /* for inlined TLB flush functions */
++
++__visible
++DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
++
++/*
++ * These can have bit 63 set, so we can not just use a plain "or"
++ * instruction to get their value or'd into CR3. It would take
++ * another register. So, we use a memory reference to these instead.
++ *
++ * This is also handy because systems that do not support PCIDs
++ * just end up or'ing a 0 into their CR3, which does no harm.
++ */
++DEFINE_PER_CPU(unsigned long, x86_cr3_pcid_user);
++
++/*
++ * At runtime, the only things we map are some things for CPU
++ * hotplug, and stacks for new processes. No two CPUs will ever
++ * be populating the same addresses, so we only need to ensure
++ * that we protect between two CPUs trying to allocate and
++ * populate the same page table page.
++ *
++ * Only take this lock when doing a set_p[4um]d(), but it is not
++ * needed for doing a set_pte(). We assume that only the *owner*
++ * of a given allocation will be doing this for _their_
++ * allocation.
++ *
++ * This ensures that once a system has been running for a while
++ * and there have been stacks all over and these page tables
++ * are fully populated, there will be no further acquisitions of
++ * this lock.
++ */
++static DEFINE_SPINLOCK(shadow_table_allocation_lock);
++
++/*
++ * Returns -1 on error.
++ */
++static inline unsigned long get_pa_from_mapping(unsigned long vaddr)
++{
++ pgd_t *pgd;
++ pud_t *pud;
++ pmd_t *pmd;
++ pte_t *pte;
++
++ pgd = pgd_offset_k(vaddr);
++ /*
++ * We made all the kernel PGDs present in kaiser_init().
++ * We expect them to stay that way.
++ */
++ BUG_ON(pgd_none(*pgd));
++ /*
++ * PGDs are either 512GB or 128TB on all x86_64
++ * configurations. We don't handle these.
++ */
++ BUG_ON(pgd_large(*pgd));
++
++ pud = pud_offset(pgd, vaddr);
++ if (pud_none(*pud)) {
++ WARN_ON_ONCE(1);
++ return -1;
++ }
++
++ if (pud_large(*pud))
++ return (pud_pfn(*pud) << PAGE_SHIFT) | (vaddr & ~PUD_PAGE_MASK);
++
++ pmd = pmd_offset(pud, vaddr);
++ if (pmd_none(*pmd)) {
++ WARN_ON_ONCE(1);
++ return -1;
++ }
++
++ if (pmd_large(*pmd))
++ return (pmd_pfn(*pmd) << PAGE_SHIFT) | (vaddr & ~PMD_PAGE_MASK);
++
++ pte = pte_offset_kernel(pmd, vaddr);
++ if (pte_none(*pte)) {
++ WARN_ON_ONCE(1);
++ return -1;
++ }
++
++ return (pte_pfn(*pte) << PAGE_SHIFT) | (vaddr & ~PAGE_MASK);
++}
++
++/*
++ * This is a relatively normal page table walk, except that it
++ * also tries to allocate page tables pages along the way.
++ *
++ * Returns a pointer to a PTE on success, or NULL on failure.
++ */
++static pte_t *kaiser_pagetable_walk(unsigned long address)
++{
++ pmd_t *pmd;
++ pud_t *pud;
++ pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(address));
++ gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
++
++ if (pgd_none(*pgd)) {
++ WARN_ONCE(1, "All shadow pgds should have been populated");
++ return NULL;
++ }
++ BUILD_BUG_ON(pgd_large(*pgd) != 0);
++
++ pud = pud_offset(pgd, address);
++ /* The shadow page tables do not use large mappings: */
++ if (pud_large(*pud)) {
++ WARN_ON(1);
++ return NULL;
++ }
++ if (pud_none(*pud)) {
++ unsigned long new_pmd_page = __get_free_page(gfp);
++ if (!new_pmd_page)
++ return NULL;
++ spin_lock(&shadow_table_allocation_lock);
++ if (pud_none(*pud)) {
++ set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page)));
++ __inc_zone_page_state(virt_to_page((void *)
++ new_pmd_page), NR_KAISERTABLE);
++ } else
++ free_page(new_pmd_page);
++ spin_unlock(&shadow_table_allocation_lock);
++ }
++
++ pmd = pmd_offset(pud, address);
++ /* The shadow page tables do not use large mappings: */
++ if (pmd_large(*pmd)) {
++ WARN_ON(1);
++ return NULL;
++ }
++ if (pmd_none(*pmd)) {
++ unsigned long new_pte_page = __get_free_page(gfp);
++ if (!new_pte_page)
++ return NULL;
++ spin_lock(&shadow_table_allocation_lock);
++ if (pmd_none(*pmd)) {
++ set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page)));
++ __inc_zone_page_state(virt_to_page((void *)
++ new_pte_page), NR_KAISERTABLE);
++ } else
++ free_page(new_pte_page);
++ spin_unlock(&shadow_table_allocation_lock);
++ }
++
++ return pte_offset_kernel(pmd, address);
++}
++
++static int kaiser_add_user_map(const void *__start_addr, unsigned long size,
++ unsigned long flags)
++{
++ int ret = 0;
++ pte_t *pte;
++ unsigned long start_addr = (unsigned long )__start_addr;
++ unsigned long address = start_addr & PAGE_MASK;
++ unsigned long end_addr = PAGE_ALIGN(start_addr + size);
++ unsigned long target_address;
++
++ /*
++ * It is convenient for callers to pass in __PAGE_KERNEL etc,
++ * and there is no actual harm from setting _PAGE_GLOBAL, so
++ * long as CR4.PGE is not set. But it is nonetheless troubling
++ * to see Kaiser itself setting _PAGE_GLOBAL (now that "nokaiser"
++ * requires that not to be #defined to 0): so mask it off here.
++ */
++ flags &= ~_PAGE_GLOBAL;
++
++ for (; address < end_addr; address += PAGE_SIZE) {
++ target_address = get_pa_from_mapping(address);
++ if (target_address == -1) {
++ ret = -EIO;
++ break;
++ }
++ pte = kaiser_pagetable_walk(address);
++ if (!pte) {
++ ret = -ENOMEM;
++ break;
++ }
++ if (pte_none(*pte)) {
++ set_pte(pte, __pte(flags | target_address));
++ } else {
++ pte_t tmp;
++ set_pte(&tmp, __pte(flags | target_address));
++ WARN_ON_ONCE(!pte_same(*pte, tmp));
++ }
++ }
++ return ret;
++}
++
++static int kaiser_add_user_map_ptrs(const void *start, const void *end, unsigned long flags)
++{
++ unsigned long size = end - start;
++
++ return kaiser_add_user_map(start, size, flags);
++}
++
++/*
++ * Ensure that the top level of the (shadow) page tables are
++ * entirely populated. This ensures that all processes that get
++ * forked have the same entries. This way, we do not have to
++ * ever go set up new entries in older processes.
++ *
++ * Note: we never free these, so there are no updates to them
++ * after this.
++ */
++static void __init kaiser_init_all_pgds(void)
++{
++ pgd_t *pgd;
++ int i = 0;
++
++ pgd = native_get_shadow_pgd(pgd_offset_k((unsigned long )0));
++ for (i = PTRS_PER_PGD / 2; i < PTRS_PER_PGD; i++) {
++ pgd_t new_pgd;
++ pud_t *pud = pud_alloc_one(&init_mm,
++ PAGE_OFFSET + i * PGDIR_SIZE);
++ if (!pud) {
++ WARN_ON(1);
++ break;
++ }
++ inc_zone_page_state(virt_to_page(pud), NR_KAISERTABLE);
++ new_pgd = __pgd(_KERNPG_TABLE |__pa(pud));
++ /*
++ * Make sure not to stomp on some other pgd entry.
++ */
++ if (!pgd_none(pgd[i])) {
++ WARN_ON(1);
++ continue;
++ }
++ set_pgd(pgd + i, new_pgd);
++ }
++}
++
++#define kaiser_add_user_map_early(start, size, flags) do { \
++ int __ret = kaiser_add_user_map(start, size, flags); \
++ WARN_ON(__ret); \
++} while (0)
++
++#define kaiser_add_user_map_ptrs_early(start, end, flags) do { \
++ int __ret = kaiser_add_user_map_ptrs(start, end, flags); \
++ WARN_ON(__ret); \
++} while (0)
++
++void __init kaiser_check_boottime_disable(void)
++{
++ bool enable = true;
++ char arg[5];
++ int ret;
++
++ if (boot_cpu_has(X86_FEATURE_XENPV))
++ goto silent_disable;
++
++ ret = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg));
++ if (ret > 0) {
++ if (!strncmp(arg, "on", 2))
++ goto enable;
++
++ if (!strncmp(arg, "off", 3))
++ goto disable;
++
++ if (!strncmp(arg, "auto", 4))
++ goto skip;
++ }
++
++ if (cmdline_find_option_bool(boot_command_line, "nopti"))
++ goto disable;
++
++skip:
++ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
++ goto disable;
++
++enable:
++ if (enable)
++ setup_force_cpu_cap(X86_FEATURE_KAISER);
++
++ return;
++
++disable:
++ pr_info("disabled\n");
++
++silent_disable:
++ kaiser_enabled = 0;
++ setup_clear_cpu_cap(X86_FEATURE_KAISER);
++}
++
++/*
++ * If anything in here fails, we will likely die on one of the
++ * first kernel->user transitions and init will die. But, we
++ * will have most of the kernel up by then and should be able to
++ * get a clean warning out of it. If we BUG_ON() here, we run
++ * the risk of being before we have good console output.
++ */
++void __init kaiser_init(void)
++{
++ int cpu;
++
++ if (!kaiser_enabled)
++ return;
++
++ kaiser_init_all_pgds();
++
++ for_each_possible_cpu(cpu) {
++ void *percpu_vaddr = __per_cpu_user_mapped_start +
++ per_cpu_offset(cpu);
++ unsigned long percpu_sz = __per_cpu_user_mapped_end -
++ __per_cpu_user_mapped_start;
++ kaiser_add_user_map_early(percpu_vaddr, percpu_sz,
++ __PAGE_KERNEL);
++ }
++
++ /*
++ * Map the entry/exit text section, which is needed at
++ * switches from user to and from kernel.
++ */
++ kaiser_add_user_map_ptrs_early(__entry_text_start, __entry_text_end,
++ __PAGE_KERNEL_RX);
++
++#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN)
++ kaiser_add_user_map_ptrs_early(__irqentry_text_start,
++ __irqentry_text_end,
++ __PAGE_KERNEL_RX);
++#endif
++ kaiser_add_user_map_early((void *)idt_descr.address,
++ sizeof(gate_desc) * NR_VECTORS,
++ __PAGE_KERNEL_RO);
++#ifdef CONFIG_TRACING
++ kaiser_add_user_map_early(&trace_idt_descr,
++ sizeof(trace_idt_descr),
++ __PAGE_KERNEL);
++ kaiser_add_user_map_early(&trace_idt_table,
++ sizeof(gate_desc) * NR_VECTORS,
++ __PAGE_KERNEL);
++#endif
++ kaiser_add_user_map_early(&debug_idt_descr, sizeof(debug_idt_descr),
++ __PAGE_KERNEL);
++ kaiser_add_user_map_early(&debug_idt_table,
++ sizeof(gate_desc) * NR_VECTORS,
++ __PAGE_KERNEL);
++
++ pr_info("enabled\n");
++}
++
++/* Add a mapping to the shadow mapping, and synchronize the mappings */
++int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags)
++{
++ if (!kaiser_enabled)
++ return 0;
++ return kaiser_add_user_map((const void *)addr, size, flags);
++}
++
++void kaiser_remove_mapping(unsigned long start, unsigned long size)
++{
++ extern void unmap_pud_range_nofree(pgd_t *pgd,
++ unsigned long start, unsigned long end);
++ unsigned long end = start + size;
++ unsigned long addr, next;
++ pgd_t *pgd;
++
++ if (!kaiser_enabled)
++ return;
++ pgd = native_get_shadow_pgd(pgd_offset_k(start));
++ for (addr = start; addr < end; pgd++, addr = next) {
++ next = pgd_addr_end(addr, end);
++ unmap_pud_range_nofree(pgd, addr, next);
++ }
++}
++
++/*
++ * Page table pages are page-aligned. The lower half of the top
++ * level is used for userspace and the top half for the kernel.
++ * This returns true for user pages that need to get copied into
++ * both the user and kernel copies of the page tables, and false
++ * for kernel pages that should only be in the kernel copy.
++ */
++static inline bool is_userspace_pgd(pgd_t *pgdp)
++{
++ return ((unsigned long)pgdp % PAGE_SIZE) < (PAGE_SIZE / 2);
++}
++
++pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd)
++{
++ if (!kaiser_enabled)
++ return pgd;
++ /*
++ * Do we need to also populate the shadow pgd? Check _PAGE_USER to
++ * skip cases like kexec and EFI which make temporary low mappings.
++ */
++ if (pgd.pgd & _PAGE_USER) {
++ if (is_userspace_pgd(pgdp)) {
++ native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
++ /*
++ * Even if the entry is *mapping* userspace, ensure
++ * that userspace can not use it. This way, if we
++ * get out to userspace running on the kernel CR3,
++ * userspace will crash instead of running.
++ */
++ if (__supported_pte_mask & _PAGE_NX)
++ pgd.pgd |= _PAGE_NX;
++ }
++ } else if (!pgd.pgd) {
++ /*
++ * pgd_clear() cannot check _PAGE_USER, and is even used to
++ * clear corrupted pgd entries: so just rely on cases like
++ * kexec and EFI never to be using pgd_clear().
++ */
++ if (!WARN_ON_ONCE((unsigned long)pgdp & PAGE_SIZE) &&
++ is_userspace_pgd(pgdp))
++ native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
++ }
++ return pgd;
++}
++
++void kaiser_setup_pcid(void)
++{
++ unsigned long user_cr3 = KAISER_SHADOW_PGD_OFFSET;
++
++ if (this_cpu_has(X86_FEATURE_PCID))
++ user_cr3 |= X86_CR3_PCID_USER_NOFLUSH;
++ /*
++ * These variables are used by the entry/exit
++ * code to change PCID and pgd and TLB flushing.
++ */
++ this_cpu_write(x86_cr3_pcid_user, user_cr3);
++}
++
++/*
++ * Make a note that this cpu will need to flush USER tlb on return to user.
++ * If cpu does not have PCID, then the NOFLUSH bit will never have been set.
++ */
++void kaiser_flush_tlb_on_return_to_user(void)
++{
++ if (this_cpu_has(X86_FEATURE_PCID))
++ this_cpu_write(x86_cr3_pcid_user,
++ X86_CR3_PCID_USER_FLUSH | KAISER_SHADOW_PGD_OFFSET);
++}
++EXPORT_SYMBOL(kaiser_flush_tlb_on_return_to_user);
+diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c
+index aed206475aa7..319183d93602 100644
+--- a/arch/x86/mm/kaslr.c
++++ b/arch/x86/mm/kaslr.c
+@@ -189,6 +189,6 @@ void __meminit init_trampoline(void)
+ *pud_tramp = *pud;
+ }
+
+- set_pgd(&trampoline_pgd_entry,
+- __pgd(_KERNPG_TABLE | __pa(pud_page_tramp)));
++ /* Avoid set_pgd(), in case it's complicated by CONFIG_PAGE_TABLE_ISOLATION */
++ trampoline_pgd_entry = __pgd(_KERNPG_TABLE | __pa(pud_page_tramp));
+ }
+diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
+index e3353c97d086..73dcb0e18c1b 100644
+--- a/arch/x86/mm/pageattr.c
++++ b/arch/x86/mm/pageattr.c
+@@ -52,6 +52,7 @@ static DEFINE_SPINLOCK(cpa_lock);
+ #define CPA_FLUSHTLB 1
+ #define CPA_ARRAY 2
+ #define CPA_PAGES_ARRAY 4
++#define CPA_FREE_PAGETABLES 8
+
+ #ifdef CONFIG_PROC_FS
+ static unsigned long direct_pages_count[PG_LEVEL_NUM];
+@@ -729,10 +730,13 @@ static int split_large_page(struct cpa_data *cpa, pte_t *kpte,
+ return 0;
+ }
+
+-static bool try_to_free_pte_page(pte_t *pte)
++static bool try_to_free_pte_page(struct cpa_data *cpa, pte_t *pte)
+ {
+ int i;
+
++ if (!(cpa->flags & CPA_FREE_PAGETABLES))
++ return false;
++
+ for (i = 0; i < PTRS_PER_PTE; i++)
+ if (!pte_none(pte[i]))
+ return false;
+@@ -741,10 +745,13 @@ static bool try_to_free_pte_page(pte_t *pte)
+ return true;
+ }
+
+-static bool try_to_free_pmd_page(pmd_t *pmd)
++static bool try_to_free_pmd_page(struct cpa_data *cpa, pmd_t *pmd)
+ {
+ int i;
+
++ if (!(cpa->flags & CPA_FREE_PAGETABLES))
++ return false;
++
+ for (i = 0; i < PTRS_PER_PMD; i++)
+ if (!pmd_none(pmd[i]))
+ return false;
+@@ -753,7 +760,9 @@ static bool try_to_free_pmd_page(pmd_t *pmd)
+ return true;
+ }
+
+-static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end)
++static bool unmap_pte_range(struct cpa_data *cpa, pmd_t *pmd,
++ unsigned long start,
++ unsigned long end)
+ {
+ pte_t *pte = pte_offset_kernel(pmd, start);
+
+@@ -764,22 +773,23 @@ static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end)
+ pte++;
+ }
+
+- if (try_to_free_pte_page((pte_t *)pmd_page_vaddr(*pmd))) {
++ if (try_to_free_pte_page(cpa, (pte_t *)pmd_page_vaddr(*pmd))) {
+ pmd_clear(pmd);
+ return true;
+ }
+ return false;
+ }
+
+-static void __unmap_pmd_range(pud_t *pud, pmd_t *pmd,
++static void __unmap_pmd_range(struct cpa_data *cpa, pud_t *pud, pmd_t *pmd,
+ unsigned long start, unsigned long end)
+ {
+- if (unmap_pte_range(pmd, start, end))
+- if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud)))
++ if (unmap_pte_range(cpa, pmd, start, end))
++ if (try_to_free_pmd_page(cpa, (pmd_t *)pud_page_vaddr(*pud)))
+ pud_clear(pud);
+ }
+
+-static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
++static void unmap_pmd_range(struct cpa_data *cpa, pud_t *pud,
++ unsigned long start, unsigned long end)
+ {
+ pmd_t *pmd = pmd_offset(pud, start);
+
+@@ -790,7 +800,7 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
+ unsigned long next_page = (start + PMD_SIZE) & PMD_MASK;
+ unsigned long pre_end = min_t(unsigned long, end, next_page);
+
+- __unmap_pmd_range(pud, pmd, start, pre_end);
++ __unmap_pmd_range(cpa, pud, pmd, start, pre_end);
+
+ start = pre_end;
+ pmd++;
+@@ -803,7 +813,8 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
+ if (pmd_large(*pmd))
+ pmd_clear(pmd);
+ else
+- __unmap_pmd_range(pud, pmd, start, start + PMD_SIZE);
++ __unmap_pmd_range(cpa, pud, pmd,
++ start, start + PMD_SIZE);
+
+ start += PMD_SIZE;
+ pmd++;
+@@ -813,17 +824,19 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
+ * 4K leftovers?
+ */
+ if (start < end)
+- return __unmap_pmd_range(pud, pmd, start, end);
++ return __unmap_pmd_range(cpa, pud, pmd, start, end);
+
+ /*
+ * Try again to free the PMD page if haven't succeeded above.
+ */
+ if (!pud_none(*pud))
+- if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud)))
++ if (try_to_free_pmd_page(cpa, (pmd_t *)pud_page_vaddr(*pud)))
+ pud_clear(pud);
+ }
+
+-static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
++static void __unmap_pud_range(struct cpa_data *cpa, pgd_t *pgd,
++ unsigned long start,
++ unsigned long end)
+ {
+ pud_t *pud = pud_offset(pgd, start);
+
+@@ -834,7 +847,7 @@ static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
+ unsigned long next_page = (start + PUD_SIZE) & PUD_MASK;
+ unsigned long pre_end = min_t(unsigned long, end, next_page);
+
+- unmap_pmd_range(pud, start, pre_end);
++ unmap_pmd_range(cpa, pud, start, pre_end);
+
+ start = pre_end;
+ pud++;
+@@ -848,7 +861,7 @@ static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
+ if (pud_large(*pud))
+ pud_clear(pud);
+ else
+- unmap_pmd_range(pud, start, start + PUD_SIZE);
++ unmap_pmd_range(cpa, pud, start, start + PUD_SIZE);
+
+ start += PUD_SIZE;
+ pud++;
+@@ -858,7 +871,7 @@ static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
+ * 2M leftovers?
+ */
+ if (start < end)
+- unmap_pmd_range(pud, start, end);
++ unmap_pmd_range(cpa, pud, start, end);
+
+ /*
+ * No need to try to free the PUD page because we'll free it in
+@@ -866,6 +879,24 @@ static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
+ */
+ }
+
++static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
++{
++ struct cpa_data cpa = {
++ .flags = CPA_FREE_PAGETABLES,
++ };
++
++ __unmap_pud_range(&cpa, pgd, start, end);
++}
++
++void unmap_pud_range_nofree(pgd_t *pgd, unsigned long start, unsigned long end)
++{
++ struct cpa_data cpa = {
++ .flags = 0,
++ };
++
++ __unmap_pud_range(&cpa, pgd, start, end);
++}
++
+ static int alloc_pte_page(pmd_t *pmd)
+ {
+ pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
+diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
+index 3feec5af4e67..5aaec8effc5f 100644
+--- a/arch/x86/mm/pgtable.c
++++ b/arch/x86/mm/pgtable.c
+@@ -344,14 +344,22 @@ static inline void _pgd_free(pgd_t *pgd)
+ kmem_cache_free(pgd_cache, pgd);
+ }
+ #else
++
++/*
++ * Instead of one pgd, Kaiser acquires two pgds. Being order-1, it is
++ * both 8k in size and 8k-aligned. That lets us just flip bit 12
++ * in a pointer to swap between the two 4k halves.
++ */
++#define PGD_ALLOCATION_ORDER kaiser_enabled
++
+ static inline pgd_t *_pgd_alloc(void)
+ {
+- return (pgd_t *)__get_free_page(PGALLOC_GFP);
++ return (pgd_t *)__get_free_pages(PGALLOC_GFP, PGD_ALLOCATION_ORDER);
+ }
+
+ static inline void _pgd_free(pgd_t *pgd)
+ {
+- free_page((unsigned long)pgd);
++ free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER);
+ }
+ #endif /* CONFIG_X86_PAE */
+
+diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
+index 53b72fb4e781..41205de487e7 100644
+--- a/arch/x86/mm/tlb.c
++++ b/arch/x86/mm/tlb.c
+@@ -6,13 +6,14 @@
+ #include <linux/interrupt.h>
+ #include <linux/export.h>
+ #include <linux/cpu.h>
++#include <linux/debugfs.h>
+
+ #include <asm/tlbflush.h>
+ #include <asm/mmu_context.h>
+ #include <asm/cache.h>
+ #include <asm/apic.h>
+ #include <asm/uv/uv.h>
+-#include <linux/debugfs.h>
++#include <asm/kaiser.h>
+
+ /*
+ * TLB flushing, formerly SMP-only
+@@ -34,6 +35,36 @@ struct flush_tlb_info {
+ unsigned long flush_end;
+ };
+
++static void load_new_mm_cr3(pgd_t *pgdir)
++{
++ unsigned long new_mm_cr3 = __pa(pgdir);
++
++ if (kaiser_enabled) {
++ /*
++ * We reuse the same PCID for different tasks, so we must
++ * flush all the entries for the PCID out when we change tasks.
++ * Flush KERN below, flush USER when returning to userspace in
++ * kaiser's SWITCH_USER_CR3 (_SWITCH_TO_USER_CR3) macro.
++ *
++ * invpcid_flush_single_context(X86_CR3_PCID_ASID_USER) could
++ * do it here, but can only be used if X86_FEATURE_INVPCID is
++ * available - and many machines support pcid without invpcid.
++ *
++ * If X86_CR3_PCID_KERN_FLUSH actually added something, then it
++ * would be needed in the write_cr3() below - if PCIDs enabled.
++ */
++ BUILD_BUG_ON(X86_CR3_PCID_KERN_FLUSH);
++ kaiser_flush_tlb_on_return_to_user();
++ }
++
++ /*
++ * Caution: many callers of this function expect
++ * that load_cr3() is serializing and orders TLB
++ * fills with respect to the mm_cpumask writes.
++ */
++ write_cr3(new_mm_cr3);
++}
++
+ /*
+ * We cannot call mmdrop() because we are in interrupt context,
+ * instead update mm->cpu_vm_mask.
+@@ -45,7 +76,7 @@ void leave_mm(int cpu)
+ BUG();
+ if (cpumask_test_cpu(cpu, mm_cpumask(active_mm))) {
+ cpumask_clear_cpu(cpu, mm_cpumask(active_mm));
+- load_cr3(swapper_pg_dir);
++ load_new_mm_cr3(swapper_pg_dir);
+ /*
+ * This gets called in the idle path where RCU
+ * functions differently. Tracing normally
+@@ -120,7 +151,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
+ * ordering guarantee we need.
+ *
+ */
+- load_cr3(next->pgd);
++ load_new_mm_cr3(next->pgd);
+
+ trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
+
+@@ -167,7 +198,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
+ * As above, load_cr3() is serializing and orders TLB
+ * fills with respect to the mm_cpumask write.
+ */
+- load_cr3(next->pgd);
++ load_new_mm_cr3(next->pgd);
+ trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
+ load_mm_cr4(next);
+ load_mm_ldt(next);
+diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
+index dc81e5287ebf..2e6000a4eb2c 100644
+--- a/include/asm-generic/vmlinux.lds.h
++++ b/include/asm-generic/vmlinux.lds.h
+@@ -778,7 +778,14 @@
+ */
+ #define PERCPU_INPUT(cacheline) \
+ VMLINUX_SYMBOL(__per_cpu_start) = .; \
++ VMLINUX_SYMBOL(__per_cpu_user_mapped_start) = .; \
+ *(.data..percpu..first) \
++ . = ALIGN(cacheline); \
++ *(.data..percpu..user_mapped) \
++ *(.data..percpu..user_mapped..shared_aligned) \
++ . = ALIGN(PAGE_SIZE); \
++ *(.data..percpu..user_mapped..page_aligned) \
++ VMLINUX_SYMBOL(__per_cpu_user_mapped_end) = .; \
+ . = ALIGN(PAGE_SIZE); \
+ *(.data..percpu..page_aligned) \
+ . = ALIGN(cacheline); \
+diff --git a/include/linux/kaiser.h b/include/linux/kaiser.h
+new file mode 100644
+index 000000000000..58c55b1589d0
+--- /dev/null
++++ b/include/linux/kaiser.h
+@@ -0,0 +1,52 @@
++#ifndef _LINUX_KAISER_H
++#define _LINUX_KAISER_H
++
++#ifdef CONFIG_PAGE_TABLE_ISOLATION
++#include <asm/kaiser.h>
++
++static inline int kaiser_map_thread_stack(void *stack)
++{
++ /*
++ * Map that page of kernel stack on which we enter from user context.
++ */
++ return kaiser_add_mapping((unsigned long)stack +
++ THREAD_SIZE - PAGE_SIZE, PAGE_SIZE, __PAGE_KERNEL);
++}
++
++static inline void kaiser_unmap_thread_stack(void *stack)
++{
++ /*
++ * Note: may be called even when kaiser_map_thread_stack() failed.
++ */
++ kaiser_remove_mapping((unsigned long)stack +
++ THREAD_SIZE - PAGE_SIZE, PAGE_SIZE);
++}
++#else
++
++/*
++ * These stubs are used whenever CONFIG_PAGE_TABLE_ISOLATION is off, which
++ * includes architectures that support KAISER, but have it disabled.
++ */
++
++static inline void kaiser_init(void)
++{
++}
++static inline int kaiser_add_mapping(unsigned long addr,
++ unsigned long size, unsigned long flags)
++{
++ return 0;
++}
++static inline void kaiser_remove_mapping(unsigned long start,
++ unsigned long size)
++{
++}
++static inline int kaiser_map_thread_stack(void *stack)
++{
++ return 0;
++}
++static inline void kaiser_unmap_thread_stack(void *stack)
++{
++}
++
++#endif /* !CONFIG_PAGE_TABLE_ISOLATION */
++#endif /* _LINUX_KAISER_H */
+diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
+index fff21a82780c..490f5a83f947 100644
+--- a/include/linux/mmzone.h
++++ b/include/linux/mmzone.h
+@@ -124,8 +124,9 @@ enum zone_stat_item {
+ NR_SLAB_UNRECLAIMABLE,
+ NR_PAGETABLE, /* used for pagetables */
+ NR_KERNEL_STACK_KB, /* measured in KiB */
+- /* Second 128 byte cacheline */
++ NR_KAISERTABLE,
+ NR_BOUNCE,
++ /* Second 128 byte cacheline */
+ #if IS_ENABLED(CONFIG_ZSMALLOC)
+ NR_ZSPAGES, /* allocated in zsmalloc */
+ #endif
+diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h
+index 8f16299ca068..8902f23bb770 100644
+--- a/include/linux/percpu-defs.h
++++ b/include/linux/percpu-defs.h
+@@ -35,6 +35,12 @@
+
+ #endif
+
++#ifdef CONFIG_PAGE_TABLE_ISOLATION
++#define USER_MAPPED_SECTION "..user_mapped"
++#else
++#define USER_MAPPED_SECTION ""
++#endif
++
+ /*
+ * Base implementations of per-CPU variable declarations and definitions, where
+ * the section in which the variable is to be placed is provided by the
+@@ -115,6 +121,12 @@
+ #define DEFINE_PER_CPU(type, name) \
+ DEFINE_PER_CPU_SECTION(type, name, "")
+
++#define DECLARE_PER_CPU_USER_MAPPED(type, name) \
++ DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION)
++
++#define DEFINE_PER_CPU_USER_MAPPED(type, name) \
++ DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION)
++
+ /*
+ * Declaration/definition used for per-CPU variables that must come first in
+ * the set of variables.
+@@ -144,6 +156,14 @@
+ DEFINE_PER_CPU_SECTION(type, name, PER_CPU_SHARED_ALIGNED_SECTION) \
+ ____cacheline_aligned_in_smp
+
++#define DECLARE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(type, name) \
++ DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION PER_CPU_SHARED_ALIGNED_SECTION) \
++ ____cacheline_aligned_in_smp
++
++#define DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(type, name) \
++ DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION PER_CPU_SHARED_ALIGNED_SECTION) \
++ ____cacheline_aligned_in_smp
++
+ #define DECLARE_PER_CPU_ALIGNED(type, name) \
+ DECLARE_PER_CPU_SECTION(type, name, PER_CPU_ALIGNED_SECTION) \
+ ____cacheline_aligned
+@@ -162,11 +182,21 @@
+ #define DEFINE_PER_CPU_PAGE_ALIGNED(type, name) \
+ DEFINE_PER_CPU_SECTION(type, name, "..page_aligned") \
+ __aligned(PAGE_SIZE)
++/*
++ * Declaration/definition used for per-CPU variables that must be page aligned and need to be mapped in user mode.
++ */
++#define DECLARE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(type, name) \
++ DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION"..page_aligned") \
++ __aligned(PAGE_SIZE)
++
++#define DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(type, name) \
++ DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION"..page_aligned") \
++ __aligned(PAGE_SIZE)
+
+ /*
+ * Declaration/definition used for per-CPU variables that must be read mostly.
+ */
+-#define DECLARE_PER_CPU_READ_MOSTLY(type, name) \
++#define DECLARE_PER_CPU_READ_MOSTLY(type, name) \
+ DECLARE_PER_CPU_SECTION(type, name, "..read_mostly")
+
+ #define DEFINE_PER_CPU_READ_MOSTLY(type, name) \
+diff --git a/init/main.c b/init/main.c
+index 25bac88bc66e..99f026565608 100644
+--- a/init/main.c
++++ b/init/main.c
+@@ -80,6 +80,7 @@
+ #include <linux/integrity.h>
+ #include <linux/proc_ns.h>
+ #include <linux/io.h>
++#include <linux/kaiser.h>
+
+ #include <asm/io.h>
+ #include <asm/bugs.h>
+@@ -473,6 +474,7 @@ static void __init mm_init(void)
+ pgtable_init();
+ vmalloc_init();
+ ioremap_huge_init();
++ kaiser_init();
+ }
+
+ asmlinkage __visible void __init start_kernel(void)
+diff --git a/kernel/fork.c b/kernel/fork.c
+index 9321b1ad3335..70e10cb49be0 100644
+--- a/kernel/fork.c
++++ b/kernel/fork.c
+@@ -58,6 +58,7 @@
+ #include <linux/tsacct_kern.h>
+ #include <linux/cn_proc.h>
+ #include <linux/freezer.h>
++#include <linux/kaiser.h>
+ #include <linux/delayacct.h>
+ #include <linux/taskstats_kern.h>
+ #include <linux/random.h>
+@@ -213,6 +214,7 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
+
+ static inline void free_thread_stack(struct task_struct *tsk)
+ {
++ kaiser_unmap_thread_stack(tsk->stack);
+ #ifdef CONFIG_VMAP_STACK
+ if (task_stack_vm_area(tsk)) {
+ unsigned long flags;
+@@ -495,6 +497,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
+ * functions again.
+ */
+ tsk->stack = stack;
++
++ err= kaiser_map_thread_stack(tsk->stack);
++ if (err)
++ goto free_stack;
+ #ifdef CONFIG_VMAP_STACK
+ tsk->stack_vm_area = stack_vm_area;
+ #endif
+diff --git a/mm/vmstat.c b/mm/vmstat.c
+index 604f26a4f696..6a088df04b29 100644
+--- a/mm/vmstat.c
++++ b/mm/vmstat.c
+@@ -932,6 +932,7 @@ const char * const vmstat_text[] = {
+ "nr_slab_unreclaimable",
+ "nr_page_table_pages",
+ "nr_kernel_stack",
++ "nr_overhead",
+ "nr_bounce",
+ #if IS_ENABLED(CONFIG_ZSMALLOC)
+ "nr_zspages",
+diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c
+index 97f9cac98348..e86a34fd5484 100644
+--- a/net/ipv4/tcp_bbr.c
++++ b/net/ipv4/tcp_bbr.c
+@@ -843,6 +843,11 @@ static u32 bbr_sndbuf_expand(struct sock *sk)
+ */
+ static u32 bbr_undo_cwnd(struct sock *sk)
+ {
++ struct bbr *bbr = inet_csk_ca(sk);
++
++ bbr->full_bw = 0; /* spurious slow-down; reset full pipe detection */
++ bbr->full_bw_cnt = 0;
++ bbr_reset_lt_bw_sampling(sk);
+ return tcp_sk(sk)->snd_cwnd;
+ }
+
+diff --git a/security/Kconfig b/security/Kconfig
+index 118f4549404e..32f36b40e9f0 100644
+--- a/security/Kconfig
++++ b/security/Kconfig
+@@ -31,6 +31,16 @@ config SECURITY
+
+ If you are unsure how to answer this question, answer N.
+
++config PAGE_TABLE_ISOLATION
++ bool "Remove the kernel mapping in user mode"
++ default y
++ depends on X86_64 && SMP
++ help
++ This enforces a strict kernel and user space isolation, in order
++ to close hardware side channels on kernel address information.
++
++ If you are unsure how to answer this question, answer Y.
++
+ config SECURITYFS
+ bool "Enable the securityfs filesystem"
+ help
+diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h
+index a39629206864..f79669a38c0c 100644
+--- a/tools/arch/x86/include/asm/cpufeatures.h
++++ b/tools/arch/x86/include/asm/cpufeatures.h
+@@ -197,6 +197,9 @@
+ #define X86_FEATURE_AVX512_4VNNIW (7*32+16) /* AVX-512 Neural Network Instructions */
+ #define X86_FEATURE_AVX512_4FMAPS (7*32+17) /* AVX-512 Multiply Accumulation Single precision */
+
++/* Because the ALTERNATIVE scheme is for members of the X86_FEATURE club... */
++#define X86_FEATURE_KAISER ( 7*32+31) /* CONFIG_PAGE_TABLE_ISOLATION w/o nokaiser */
++
+ /* Virtualization flags: Linux defined, word 8 */
+ #define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */
+ #define X86_FEATURE_VNMI ( 8*32+ 1) /* Intel Virtual NMI */