Merge branch 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 mm updates from Ingo Molnar:
 "The changes in here are:

   - text_poke() fixes and an extensive set of executability lockdowns,
     to (hopefully) eliminate the last residual circumstances under
     which we are using W|X mappings even temporarily on x86 kernels.
     This required a broad range of surgery in text patching facilities,
     module loading, trampoline handling and other bits.

   - tweak page fault messages to be more informative and more
     structured.

   - remove DISCONTIGMEM support on x86-32 and make SPARSEMEM the
     default.

   - reduce KASLR granularity on 5-level paging kernels from 512 GB to
     1 GB.

   - misc other changes and updates"

* 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (36 commits)
  x86/mm: Initialize PGD cache during mm initialization
  x86/alternatives: Add comment about module removal races
  x86/kprobes: Use vmalloc special flag
  x86/ftrace: Use vmalloc special flag
  bpf: Use vmalloc special flag
  modules: Use vmalloc special flag
  mm/vmalloc: Add flag for freeing of special permsissions
  mm/hibernation: Make hibernation handle unmapped pages
  x86/mm/cpa: Add set_direct_map_*() functions
  x86/alternatives: Remove the return value of text_poke_*()
  x86/jump-label: Remove support for custom text poker
  x86/modules: Avoid breaking W^X while loading modules
  x86/kprobes: Set instruction page as executable
  x86/ftrace: Set trampoline pages as executable
  x86/kgdb: Avoid redundant comparison of patched code
  x86/alternatives: Use temporary mm for text poking
  x86/alternatives: Initialize temporary mm for patching
  fork: Provide a function for copying init_mm
  uprobes: Initialize uprobes earlier
  x86/mm: Save debug registers when loading a temporary mm
  ...
This commit is contained in:
Linus Torvalds 2019-05-06 16:13:31 -07:00
commit 0bc40e549a
40 changed files with 710 additions and 342 deletions

View File

@ -72,7 +72,7 @@ Complete virtual memory map with 5-level page tables
Notes:
- With 56-bit addresses, user-space memory gets expanded by a factor of 512x,
from 0.125 PB to 64 PB. All kernel mappings shift down to the -64 PT starting
from 0.125 PB to 64 PB. All kernel mappings shift down to the -64 PB starting
offset and many of the regions expand to support the much larger physical
memory supported.
@ -83,7 +83,7 @@ Notes:
0000000000000000 | 0 | 00ffffffffffffff | 64 PB | user-space virtual memory, different per mm
__________________|____________|__________________|_________|___________________________________________________________
| | | |
0000800000000000 | +64 PB | ffff7fffffffffff | ~16K PB | ... huge, still almost 64 bits wide hole of non-canonical
0100000000000000 | +64 PB | feffffffffffffff | ~16K PB | ... huge, still almost 64 bits wide hole of non-canonical
| | | | virtual memory addresses up to the -64 PB
| | | | starting offset of kernel mappings.
__________________|____________|__________________|_________|___________________________________________________________
@ -99,7 +99,7 @@ ____________________________________________________________|___________________
ffd2000000000000 | -11.5 PB | ffd3ffffffffffff | 0.5 PB | ... unused hole
ffd4000000000000 | -11 PB | ffd5ffffffffffff | 0.5 PB | virtual memory map (vmemmap_base)
ffd6000000000000 | -10.5 PB | ffdeffffffffffff | 2.25 PB | ... unused hole
ffdf000000000000 | -8.25 PB | fffffdffffffffff | ~8 PB | KASAN shadow memory
ffdf000000000000 | -8.25 PB | fffffbffffffffff | ~8 PB | KASAN shadow memory
__________________|____________|__________________|_________|____________________________________________________________
|
| Identical layout to the 47-bit one from here on:

View File

@ -249,6 +249,10 @@ config ARCH_HAS_FORTIFY_SOURCE
config ARCH_HAS_SET_MEMORY
bool
# Select if arch has all set_direct_map_invalid/default() functions
config ARCH_HAS_SET_DIRECT_MAP
bool
# Select if arch init_task must go in the __init_task_data section
config ARCH_TASK_STRUCT_ON_STACK
bool

View File

@ -65,6 +65,7 @@ config X86
select ARCH_HAS_UACCESS_FLUSHCACHE if X86_64
select ARCH_HAS_UACCESS_MCSAFE if X86_64 && X86_MCE
select ARCH_HAS_SET_MEMORY
select ARCH_HAS_SET_DIRECT_MAP
select ARCH_HAS_STRICT_KERNEL_RWX
select ARCH_HAS_STRICT_MODULE_RWX
select ARCH_HAS_SYNC_CORE_BEFORE_USERMODE
@ -1592,12 +1593,9 @@ config ARCH_FLATMEM_ENABLE
depends on X86_32 && !NUMA
config ARCH_DISCONTIGMEM_ENABLE
def_bool y
depends on NUMA && X86_32
config ARCH_DISCONTIGMEM_DEFAULT
def_bool y
def_bool n
depends on NUMA && X86_32
depends on BROKEN
config ARCH_SPARSEMEM_ENABLE
def_bool y
@ -1606,8 +1604,7 @@ config ARCH_SPARSEMEM_ENABLE
select SPARSEMEM_VMEMMAP_ENABLE if X86_64
config ARCH_SPARSEMEM_DEFAULT
def_bool y
depends on X86_64
def_bool X86_64 || (NUMA && X86_32)
config ARCH_SELECT_MEMORY_MODEL
def_bool y

View File

@ -103,8 +103,6 @@ enum fixed_addresses {
#ifdef CONFIG_PARAVIRT
FIX_PARAVIRT_BOOTMAP,
#endif
FIX_TEXT_POKE1, /* reserve 2 pages for text_poke() */
FIX_TEXT_POKE0, /* first page is last, because allocation is backward */
#ifdef CONFIG_X86_INTEL_MID
FIX_LNW_VRTC,
#endif

View File

@ -13,6 +13,7 @@
#include <asm/tlbflush.h>
#include <asm/paravirt.h>
#include <asm/mpx.h>
#include <asm/debugreg.h>
extern atomic64_t last_mm_ctx_id;
@ -356,4 +357,59 @@ static inline unsigned long __get_current_cr3_fast(void)
return cr3;
}
typedef struct {
struct mm_struct *mm;
} temp_mm_state_t;
/*
* Using a temporary mm allows to set temporary mappings that are not accessible
* by other CPUs. Such mappings are needed to perform sensitive memory writes
* that override the kernel memory protections (e.g., W^X), without exposing the
* temporary page-table mappings that are required for these write operations to
* other CPUs. Using a temporary mm also allows to avoid TLB shootdowns when the
* mapping is torn down.
*
* Context: The temporary mm needs to be used exclusively by a single core. To
* harden security IRQs must be disabled while the temporary mm is
* loaded, thereby preventing interrupt handler bugs from overriding
* the kernel memory protection.
*/
static inline temp_mm_state_t use_temporary_mm(struct mm_struct *mm)
{
temp_mm_state_t temp_state;
lockdep_assert_irqs_disabled();
temp_state.mm = this_cpu_read(cpu_tlbstate.loaded_mm);
switch_mm_irqs_off(NULL, mm, current);
/*
* If breakpoints are enabled, disable them while the temporary mm is
* used. Userspace might set up watchpoints on addresses that are used
* in the temporary mm, which would lead to wrong signals being sent or
* crashes.
*
* Note that breakpoints are not disabled selectively, which also causes
* kernel breakpoints (e.g., perf's) to be disabled. This might be
* undesirable, but still seems reasonable as the code that runs in the
* temporary mm should be short.
*/
if (hw_breakpoint_active())
hw_breakpoint_disable();
return temp_state;
}
static inline void unuse_temporary_mm(temp_mm_state_t prev_state)
{
lockdep_assert_irqs_disabled();
switch_mm_irqs_off(NULL, prev_state.mm, current);
/*
* Restore the breakpoints if they were disabled before the temporary mm
* was loaded.
*/
if (hw_breakpoint_active())
hw_breakpoint_restore();
}
#endif /* _ASM_X86_MMU_CONTEXT_H */

View File

@ -1021,6 +1021,9 @@ static inline void __meminit init_trampoline_default(void)
/* Default trampoline pgd value */
trampoline_pgd_entry = init_top_pgt[pgd_index(__PAGE_OFFSET)];
}
void __init poking_init(void);
# ifdef CONFIG_RANDOMIZE_MEMORY
void __meminit init_trampoline(void);
# else

View File

@ -85,6 +85,9 @@ int set_pages_nx(struct page *page, int numpages);
int set_pages_ro(struct page *page, int numpages);
int set_pages_rw(struct page *page, int numpages);
int set_direct_map_invalid_noflush(struct page *page);
int set_direct_map_default_noflush(struct page *page);
extern int kernel_set_to_readonly;
void set_kernel_text_rw(void);
void set_kernel_text_ro(void);

View File

@ -18,7 +18,7 @@ static inline void apply_paravirt(struct paravirt_patch_site *start,
#define __parainstructions_end NULL
#endif
extern void *text_poke_early(void *addr, const void *opcode, size_t len);
extern void text_poke_early(void *addr, const void *opcode, size_t len);
/*
* Clear and restore the kernel write-protection flag on the local CPU.
@ -35,8 +35,11 @@ extern void *text_poke_early(void *addr, const void *opcode, size_t len);
* inconsistent instruction while you patch.
*/
extern void *text_poke(void *addr, const void *opcode, size_t len);
extern void *text_poke_kgdb(void *addr, const void *opcode, size_t len);
extern int poke_int3_handler(struct pt_regs *regs);
extern void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler);
extern void text_poke_bp(void *addr, const void *opcode, size_t len, void *handler);
extern int after_bootmem;
extern __ro_after_init struct mm_struct *poking_mm;
extern __ro_after_init unsigned long poking_addr;
#endif /* _ASM_X86_TEXT_PATCHING_H */

View File

@ -274,6 +274,8 @@ static inline bool nmi_uaccess_okay(void)
return true;
}
#define nmi_uaccess_okay nmi_uaccess_okay
/* Initialize cr4 shadow for this CPU. */
static inline void cr4_init_shadow(void)
{

View File

@ -12,6 +12,7 @@
#include <linux/slab.h>
#include <linux/kdebug.h>
#include <linux/kprobes.h>
#include <linux/mmu_context.h>
#include <asm/text-patching.h>
#include <asm/alternative.h>
#include <asm/sections.h>
@ -264,7 +265,7 @@ static void __init_or_module add_nops(void *insns, unsigned int len)
extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
extern s32 __smp_locks[], __smp_locks_end[];
void *text_poke_early(void *addr, const void *opcode, size_t len);
void text_poke_early(void *addr, const void *opcode, size_t len);
/*
* Are we looking at a near JMP with a 1 or 4-byte displacement.
@ -666,16 +667,136 @@ void __init alternative_instructions(void)
* instructions. And on the local CPU you need to be protected again NMI or MCE
* handlers seeing an inconsistent instruction while you patch.
*/
void *__init_or_module text_poke_early(void *addr, const void *opcode,
size_t len)
void __init_or_module text_poke_early(void *addr, const void *opcode,
size_t len)
{
unsigned long flags;
if (boot_cpu_has(X86_FEATURE_NX) &&
is_module_text_address((unsigned long)addr)) {
/*
* Modules text is marked initially as non-executable, so the
* code cannot be running and speculative code-fetches are
* prevented. Just change the code.
*/
memcpy(addr, opcode, len);
} else {
local_irq_save(flags);
memcpy(addr, opcode, len);
local_irq_restore(flags);
sync_core();
/*
* Could also do a CLFLUSH here to speed up CPU recovery; but
* that causes hangs on some VIA CPUs.
*/
}
}
__ro_after_init struct mm_struct *poking_mm;
__ro_after_init unsigned long poking_addr;
static void *__text_poke(void *addr, const void *opcode, size_t len)
{
bool cross_page_boundary = offset_in_page(addr) + len > PAGE_SIZE;
struct page *pages[2] = {NULL};
temp_mm_state_t prev;
unsigned long flags;
pte_t pte, *ptep;
spinlock_t *ptl;
pgprot_t pgprot;
/*
* While boot memory allocator is running we cannot use struct pages as
* they are not yet initialized. There is no way to recover.
*/
BUG_ON(!after_bootmem);
if (!core_kernel_text((unsigned long)addr)) {
pages[0] = vmalloc_to_page(addr);
if (cross_page_boundary)
pages[1] = vmalloc_to_page(addr + PAGE_SIZE);
} else {
pages[0] = virt_to_page(addr);
WARN_ON(!PageReserved(pages[0]));
if (cross_page_boundary)
pages[1] = virt_to_page(addr + PAGE_SIZE);
}
/*
* If something went wrong, crash and burn since recovery paths are not
* implemented.
*/
BUG_ON(!pages[0] || (cross_page_boundary && !pages[1]));
local_irq_save(flags);
memcpy(addr, opcode, len);
/*
* Map the page without the global bit, as TLB flushing is done with
* flush_tlb_mm_range(), which is intended for non-global PTEs.
*/
pgprot = __pgprot(pgprot_val(PAGE_KERNEL) & ~_PAGE_GLOBAL);
/*
* The lock is not really needed, but this allows to avoid open-coding.
*/
ptep = get_locked_pte(poking_mm, poking_addr, &ptl);
/*
* This must not fail; preallocated in poking_init().
*/
VM_BUG_ON(!ptep);
pte = mk_pte(pages[0], pgprot);
set_pte_at(poking_mm, poking_addr, ptep, pte);
if (cross_page_boundary) {
pte = mk_pte(pages[1], pgprot);
set_pte_at(poking_mm, poking_addr + PAGE_SIZE, ptep + 1, pte);
}
/*
* Loading the temporary mm behaves as a compiler barrier, which
* guarantees that the PTE will be set at the time memcpy() is done.
*/
prev = use_temporary_mm(poking_mm);
kasan_disable_current();
memcpy((u8 *)poking_addr + offset_in_page(addr), opcode, len);
kasan_enable_current();
/*
* Ensure that the PTE is only cleared after the instructions of memcpy
* were issued by using a compiler barrier.
*/
barrier();
pte_clear(poking_mm, poking_addr, ptep);
if (cross_page_boundary)
pte_clear(poking_mm, poking_addr + PAGE_SIZE, ptep + 1);
/*
* Loading the previous page-table hierarchy requires a serializing
* instruction that already allows the core to see the updated version.
* Xen-PV is assumed to serialize execution in a similar manner.
*/
unuse_temporary_mm(prev);
/*
* Flushing the TLB might involve IPIs, which would require enabled
* IRQs, but not if the mm is not used, as it is in this point.
*/
flush_tlb_mm_range(poking_mm, poking_addr, poking_addr +
(cross_page_boundary ? 2 : 1) * PAGE_SIZE,
PAGE_SHIFT, false);
/*
* If the text does not match what we just wrote then something is
* fundamentally screwy; there's nothing we can really do about that.
*/
BUG_ON(memcmp(addr, opcode, len));
pte_unmap_unlock(ptep, ptl);
local_irq_restore(flags);
sync_core();
/* Could also do a CLFLUSH here to speed up CPU recovery; but
that causes hangs on some VIA CPUs. */
return addr;
}
@ -689,48 +810,36 @@ void *__init_or_module text_poke_early(void *addr, const void *opcode,
* It means the size must be writable atomically and the address must be aligned
* in a way that permits an atomic write. It also makes sure we fit on a single
* page.
*
* Note that the caller must ensure that if the modified code is part of a
* module, the module would not be removed during poking. This can be achieved
* by registering a module notifier, and ordering module removal and patching
* trough a mutex.
*/
void *text_poke(void *addr, const void *opcode, size_t len)
{
unsigned long flags;
char *vaddr;
struct page *pages[2];
int i;
/*
* While boot memory allocator is runnig we cannot use struct
* pages as they are not yet initialized.
*/
BUG_ON(!after_bootmem);
lockdep_assert_held(&text_mutex);
if (!core_kernel_text((unsigned long)addr)) {
pages[0] = vmalloc_to_page(addr);
pages[1] = vmalloc_to_page(addr + PAGE_SIZE);
} else {
pages[0] = virt_to_page(addr);
WARN_ON(!PageReserved(pages[0]));
pages[1] = virt_to_page(addr + PAGE_SIZE);
}
BUG_ON(!pages[0]);
local_irq_save(flags);
set_fixmap(FIX_TEXT_POKE0, page_to_phys(pages[0]));
if (pages[1])
set_fixmap(FIX_TEXT_POKE1, page_to_phys(pages[1]));
vaddr = (char *)fix_to_virt(FIX_TEXT_POKE0);
memcpy(&vaddr[(unsigned long)addr & ~PAGE_MASK], opcode, len);
clear_fixmap(FIX_TEXT_POKE0);
if (pages[1])
clear_fixmap(FIX_TEXT_POKE1);
local_flush_tlb();
sync_core();
/* Could also do a CLFLUSH here to speed up CPU recovery; but
that causes hangs on some VIA CPUs. */
for (i = 0; i < len; i++)
BUG_ON(((char *)addr)[i] != ((char *)opcode)[i]);
local_irq_restore(flags);
return addr;
return __text_poke(addr, opcode, len);
}
/**
* text_poke_kgdb - Update instructions on a live kernel by kgdb
* @addr: address to modify
* @opcode: source of the copy
* @len: length to copy
*
* Only atomic text poke/set should be allowed when not doing early patching.
* It means the size must be writable atomically and the address must be aligned
* in a way that permits an atomic write. It also makes sure we fit on a single
* page.
*
* Context: should only be used by kgdb, which ensures no other core is running,
* despite the fact it does not hold the text_mutex.
*/
void *text_poke_kgdb(void *addr, const void *opcode, size_t len)
{
return __text_poke(addr, opcode, len);
}
static void do_sync_core(void *info)
@ -788,7 +897,7 @@ NOKPROBE_SYMBOL(poke_int3_handler);
* replacing opcode
* - sync cores
*/
void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler)
void text_poke_bp(void *addr, const void *opcode, size_t len, void *handler)
{
unsigned char int3 = 0xcc;
@ -830,7 +939,5 @@ void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler)
* the writing of the new instruction.
*/
bp_patching_in_progress = false;
return addr;
}

View File

@ -678,12 +678,8 @@ static inline void *alloc_tramp(unsigned long size)
{
return module_alloc(size);
}
static inline void tramp_free(void *tramp, int size)
static inline void tramp_free(void *tramp)
{
int npages = PAGE_ALIGN(size) >> PAGE_SHIFT;
set_memory_nx((unsigned long)tramp, npages);
set_memory_rw((unsigned long)tramp, npages);
module_memfree(tramp);
}
#else
@ -692,7 +688,7 @@ static inline void *alloc_tramp(unsigned long size)
{
return NULL;
}
static inline void tramp_free(void *tramp, int size) { }
static inline void tramp_free(void *tramp) { }
#endif
/* Defined as markers to the end of the ftrace default trampolines */
@ -730,6 +726,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
unsigned long end_offset;
unsigned long op_offset;
unsigned long offset;
unsigned long npages;
unsigned long size;
unsigned long retq;
unsigned long *ptr;
@ -762,6 +759,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
return 0;
*tramp_size = size + RET_SIZE + sizeof(void *);
npages = DIV_ROUND_UP(*tramp_size, PAGE_SIZE);
/* Copy ftrace_caller onto the trampoline memory */
ret = probe_kernel_read(trampoline, (void *)start_offset, size);
@ -806,9 +804,17 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
/* ALLOC_TRAMP flags lets us know we created it */
ops->flags |= FTRACE_OPS_FL_ALLOC_TRAMP;
set_vm_flush_reset_perms(trampoline);
/*
* Module allocation needs to be completed by making the page
* executable. The page is still writable, which is a security hazard,
* but anyhow ftrace breaks W^X completely.
*/
set_memory_x((unsigned long)trampoline, npages);
return (unsigned long)trampoline;
fail:
tramp_free(trampoline, *tramp_size);
tramp_free(trampoline);
return 0;
}
@ -939,7 +945,7 @@ void arch_ftrace_trampoline_free(struct ftrace_ops *ops)
if (!ops || !(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP))
return;
tramp_free((void *)ops->trampoline, ops->trampoline_size);
tramp_free((void *)ops->trampoline);
ops->trampoline = 0;
}

View File

@ -37,7 +37,6 @@ static void bug_at(unsigned char *ip, int line)
static void __ref __jump_label_transform(struct jump_entry *entry,
enum jump_label_type type,
void *(*poker)(void *, const void *, size_t),
int init)
{
union jump_code_union jmp;
@ -50,9 +49,6 @@ static void __ref __jump_label_transform(struct jump_entry *entry,
jmp.offset = jump_entry_target(entry) -
(jump_entry_code(entry) + JUMP_LABEL_NOP_SIZE);
if (early_boot_irqs_disabled)
poker = text_poke_early;
if (type == JUMP_LABEL_JMP) {
if (init) {
expect = default_nop; line = __LINE__;
@ -75,16 +71,19 @@ static void __ref __jump_label_transform(struct jump_entry *entry,
bug_at((void *)jump_entry_code(entry), line);
/*
* Make text_poke_bp() a default fallback poker.
* As long as only a single processor is running and the code is still
* not marked as RO, text_poke_early() can be used; Checking that
* system_state is SYSTEM_BOOTING guarantees it. It will be set to
* SYSTEM_SCHEDULING before other cores are awaken and before the
* code is write-protected.
*
* At the time the change is being done, just ignore whether we
* are doing nop -> jump or jump -> nop transition, and assume
* always nop being the 'currently valid' instruction
*
*/
if (poker) {
(*poker)((void *)jump_entry_code(entry), code,
JUMP_LABEL_NOP_SIZE);
if (init || system_state == SYSTEM_BOOTING) {
text_poke_early((void *)jump_entry_code(entry), code,
JUMP_LABEL_NOP_SIZE);
return;
}
@ -96,7 +95,7 @@ void arch_jump_label_transform(struct jump_entry *entry,
enum jump_label_type type)
{
mutex_lock(&text_mutex);
__jump_label_transform(entry, type, NULL, 0);
__jump_label_transform(entry, type, 0);
mutex_unlock(&text_mutex);
}
@ -126,5 +125,5 @@ __init_or_module void arch_jump_label_transform_static(struct jump_entry *entry,
jlstate = JL_STATE_NO_UPDATE;
}
if (jlstate == JL_STATE_UPDATE)
__jump_label_transform(entry, type, text_poke_early, 1);
__jump_label_transform(entry, type, 1);
}

View File

@ -747,7 +747,6 @@ void kgdb_arch_set_pc(struct pt_regs *regs, unsigned long ip)
int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt)
{
int err;
char opc[BREAK_INSTR_SIZE];
bpt->type = BP_BREAKPOINT;
err = probe_kernel_read(bpt->saved_instr, (char *)bpt->bpt_addr,
@ -759,18 +758,13 @@ int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt)
if (!err)
return err;
/*
* It is safe to call text_poke() because normal kernel execution
* It is safe to call text_poke_kgdb() because normal kernel execution
* is stopped on all cores, so long as the text_mutex is not locked.
*/
if (mutex_is_locked(&text_mutex))
return -EBUSY;
text_poke((void *)bpt->bpt_addr, arch_kgdb_ops.gdb_bpt_instr,
BREAK_INSTR_SIZE);
err = probe_kernel_read(opc, (char *)bpt->bpt_addr, BREAK_INSTR_SIZE);
if (err)
return err;
if (memcmp(opc, arch_kgdb_ops.gdb_bpt_instr, BREAK_INSTR_SIZE))
return -EINVAL;
text_poke_kgdb((void *)bpt->bpt_addr, arch_kgdb_ops.gdb_bpt_instr,
BREAK_INSTR_SIZE);
bpt->type = BP_POKE_BREAKPOINT;
return err;
@ -778,22 +772,17 @@ int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt)
int kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt)
{
int err;
char opc[BREAK_INSTR_SIZE];
if (bpt->type != BP_POKE_BREAKPOINT)
goto knl_write;
/*
* It is safe to call text_poke() because normal kernel execution
* It is safe to call text_poke_kgdb() because normal kernel execution
* is stopped on all cores, so long as the text_mutex is not locked.
*/
if (mutex_is_locked(&text_mutex))
goto knl_write;
text_poke((void *)bpt->bpt_addr, bpt->saved_instr, BREAK_INSTR_SIZE);
err = probe_kernel_read(opc, (char *)bpt->bpt_addr, BREAK_INSTR_SIZE);
if (err || memcmp(opc, bpt->saved_instr, BREAK_INSTR_SIZE))
goto knl_write;
return err;
text_poke_kgdb((void *)bpt->bpt_addr, bpt->saved_instr,
BREAK_INSTR_SIZE);
return 0;
knl_write:
return probe_kernel_write((char *)bpt->bpt_addr,

View File

@ -431,8 +431,21 @@ void *alloc_insn_page(void)
void *page;
page = module_alloc(PAGE_SIZE);
if (page)
set_memory_ro((unsigned long)page & PAGE_MASK, 1);
if (!page)
return NULL;
set_vm_flush_reset_perms(page);
/*
* First make the page read-only, and only then make it executable to
* prevent it from being W+X in between.
*/
set_memory_ro((unsigned long)page, 1);
/*
* TODO: Once additional kernel code protection mechanisms are set, ensure
* that the page was not maliciously altered and it is still zeroed.
*/
set_memory_x((unsigned long)page, 1);
return page;
}
@ -440,8 +453,6 @@ void *alloc_insn_page(void)
/* Recover page to RW mode before releasing it */
void free_insn_page(void *page)
{
set_memory_nx((unsigned long)page & PAGE_MASK, 1);
set_memory_rw((unsigned long)page & PAGE_MASK, 1);
module_memfree(page);
}

View File

@ -87,7 +87,7 @@ void *module_alloc(unsigned long size)
p = __vmalloc_node_range(size, MODULE_ALIGN,
MODULES_VADDR + get_module_load_offset(),
MODULES_END, GFP_KERNEL,
PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
PAGE_KERNEL, 0, NUMA_NO_NODE,
__builtin_return_address(0));
if (p && (kasan_module_alloc(p, size) < 0)) {
vfree(p);

View File

@ -141,11 +141,11 @@ SECTIONS
*(.text.__x86.indirect_thunk)
__indirect_thunk_end = .;
#endif
/* End of text section */
_etext = .;
} :text = 0x9090
/* End of text section */
_etext = .;
NOTES :text :note
EXCEPTION_TABLE(16) :text = 0x9090

View File

@ -360,8 +360,6 @@ static noinline int vmalloc_fault(unsigned long address)
if (!(address >= VMALLOC_START && address < VMALLOC_END))
return -1;
WARN_ON_ONCE(in_nmi());
/*
* Copy kernel mappings over when needed. This can also
* happen within a race in page table update. In the later
@ -604,24 +602,9 @@ static void show_ldttss(const struct desc_ptr *gdt, const char *name, u16 index)
name, index, addr, (desc.limit0 | (desc.limit1 << 16)));
}
/*
* This helper function transforms the #PF error_code bits into
* "[PROT] [USER]" type of descriptive, almost human-readable error strings:
*/
static void err_str_append(unsigned long error_code, char *buf, unsigned long mask, const char *txt)
{
if (error_code & mask) {
if (buf[0])
strcat(buf, " ");
strcat(buf, txt);
}
}
static void
show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long address)
{
char err_txt[64];
if (!oops_may_print())
return;
@ -645,31 +628,29 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long ad
from_kuid(&init_user_ns, current_uid()));
}
pr_alert("BUG: unable to handle kernel %s at %px\n",
address < PAGE_SIZE ? "NULL pointer dereference" : "paging request",
(void *)address);
if (address < PAGE_SIZE && !user_mode(regs))
pr_alert("BUG: kernel NULL pointer dereference, address: %px\n",
(void *)address);
else
pr_alert("BUG: unable to handle page fault for address: %px\n",
(void *)address);
err_txt[0] = 0;
/*
* Note: length of these appended strings including the separation space and the
* zero delimiter must fit into err_txt[].
*/
err_str_append(error_code, err_txt, X86_PF_PROT, "[PROT]" );
err_str_append(error_code, err_txt, X86_PF_WRITE, "[WRITE]");
err_str_append(error_code, err_txt, X86_PF_USER, "[USER]" );
err_str_append(error_code, err_txt, X86_PF_RSVD, "[RSVD]" );
err_str_append(error_code, err_txt, X86_PF_INSTR, "[INSTR]");
err_str_append(error_code, err_txt, X86_PF_PK, "[PK]" );
pr_alert("#PF error: %s\n", error_code ? err_txt : "[normal kernel read fault]");
pr_alert("#PF: %s %s in %s mode\n",
(error_code & X86_PF_USER) ? "user" : "supervisor",
(error_code & X86_PF_INSTR) ? "instruction fetch" :
(error_code & X86_PF_WRITE) ? "write access" :
"read access",
user_mode(regs) ? "user" : "kernel");
pr_alert("#PF: error_code(0x%04lx) - %s\n", error_code,
!(error_code & X86_PF_PROT) ? "not-present page" :
(error_code & X86_PF_RSVD) ? "reserved bit violation" :
(error_code & X86_PF_PK) ? "protection keys violation" :
"permissions violation");
if (!(error_code & X86_PF_USER) && user_mode(regs)) {
struct desc_ptr idt, gdt;
u16 ldtr, tr;
pr_alert("This was a system access from user code\n");
/*
* This can happen for quite a few reasons. The more obvious
* ones are faults accessing the GDT, or LDT. Perhaps

View File

@ -6,6 +6,7 @@
#include <linux/swapfile.h>
#include <linux/swapops.h>
#include <linux/kmemleak.h>
#include <linux/sched/task.h>
#include <asm/set_memory.h>
#include <asm/e820/api.h>
@ -23,6 +24,7 @@
#include <asm/hypervisor.h>
#include <asm/cpufeature.h>
#include <asm/pti.h>
#include <asm/text-patching.h>
/*
* We need to define the tracepoints somewhere, and tlb.c
@ -701,6 +703,41 @@ void __init init_mem_mapping(void)
early_memtest(0, max_pfn_mapped << PAGE_SHIFT);
}
/*
* Initialize an mm_struct to be used during poking and a pointer to be used
* during patching.
*/
void __init poking_init(void)
{
spinlock_t *ptl;
pte_t *ptep;
poking_mm = copy_init_mm();
BUG_ON(!poking_mm);
/*
* Randomize the poking address, but make sure that the following page
* will be mapped at the same PMD. We need 2 pages, so find space for 3,
* and adjust the address if the PMD ends after the first one.
*/
poking_addr = TASK_UNMAPPED_BASE;
if (IS_ENABLED(CONFIG_RANDOMIZE_BASE))
poking_addr += (kaslr_get_random_long("Poking") & PAGE_MASK) %
(TASK_SIZE - TASK_UNMAPPED_BASE - 3 * PAGE_SIZE);
if (((poking_addr + PAGE_SIZE) & ~PMD_MASK) == 0)
poking_addr += PAGE_SIZE;
/*
* We need to trigger the allocation of the page-tables that will be
* needed for poking now. Later, poking may be performed in an atomic
* section, which might cause allocation to fail.
*/
ptep = get_locked_pte(poking_mm, poking_addr, &ptl);
BUG_ON(!ptep);
pte_unmap_unlock(ptep, ptl);
}
/*
* devmem_is_allowed() checks to see if /dev/mem access to a certain address
* is valid. The argument is a physical page number.

View File

@ -125,10 +125,7 @@ void __init kernel_randomize_memory(void)
*/
entropy = remain_entropy / (ARRAY_SIZE(kaslr_regions) - i);
prandom_bytes_state(&rand_state, &rand, sizeof(rand));
if (pgtable_l5_enabled())
entropy = (rand % (entropy + 1)) & P4D_MASK;
else
entropy = (rand % (entropy + 1)) & PUD_MASK;
entropy = (rand % (entropy + 1)) & PUD_MASK;
vaddr += entropy;
*kaslr_regions[i].base = vaddr;
@ -137,84 +134,71 @@ void __init kernel_randomize_memory(void)
* randomization alignment.
*/
vaddr += get_padding(&kaslr_regions[i]);
if (pgtable_l5_enabled())
vaddr = round_up(vaddr + 1, P4D_SIZE);
else
vaddr = round_up(vaddr + 1, PUD_SIZE);
vaddr = round_up(vaddr + 1, PUD_SIZE);
remain_entropy -= entropy;
}
}
static void __meminit init_trampoline_pud(void)
{
unsigned long paddr, paddr_next;
pud_t *pud_page_tramp, *pud, *pud_tramp;
p4d_t *p4d_page_tramp, *p4d, *p4d_tramp;
unsigned long paddr, vaddr;
pgd_t *pgd;
pud_t *pud_page, *pud_page_tramp;
int i;
pud_page_tramp = alloc_low_page();
/*
* There are two mappings for the low 1MB area, the direct mapping
* and the 1:1 mapping for the real mode trampoline:
*
* Direct mapping: virt_addr = phys_addr + PAGE_OFFSET
* 1:1 mapping: virt_addr = phys_addr
*/
paddr = 0;
pgd = pgd_offset_k((unsigned long)__va(paddr));
pud_page = (pud_t *) pgd_page_vaddr(*pgd);
vaddr = (unsigned long)__va(paddr);
pgd = pgd_offset_k(vaddr);
for (i = pud_index(paddr); i < PTRS_PER_PUD; i++, paddr = paddr_next) {
pud_t *pud, *pud_tramp;
unsigned long vaddr = (unsigned long)__va(paddr);
p4d = p4d_offset(pgd, vaddr);
pud = pud_offset(p4d, vaddr);
pud_tramp = pud_page_tramp + pud_index(paddr);
pud = pud_page + pud_index(vaddr);
paddr_next = (paddr & PUD_MASK) + PUD_SIZE;
pud_tramp = pud_page_tramp + pud_index(paddr);
*pud_tramp = *pud;
*pud_tramp = *pud;
}
set_pgd(&trampoline_pgd_entry,
__pgd(_KERNPG_TABLE | __pa(pud_page_tramp)));
}
static void __meminit init_trampoline_p4d(void)
{
unsigned long paddr, paddr_next;
pgd_t *pgd;
p4d_t *p4d_page, *p4d_page_tramp;
int i;
p4d_page_tramp = alloc_low_page();
paddr = 0;
pgd = pgd_offset_k((unsigned long)__va(paddr));
p4d_page = (p4d_t *) pgd_page_vaddr(*pgd);
for (i = p4d_index(paddr); i < PTRS_PER_P4D; i++, paddr = paddr_next) {
p4d_t *p4d, *p4d_tramp;
unsigned long vaddr = (unsigned long)__va(paddr);
if (pgtable_l5_enabled()) {
p4d_page_tramp = alloc_low_page();
p4d_tramp = p4d_page_tramp + p4d_index(paddr);
p4d = p4d_page + p4d_index(vaddr);
paddr_next = (paddr & P4D_MASK) + P4D_SIZE;
*p4d_tramp = *p4d;
set_p4d(p4d_tramp,
__p4d(_KERNPG_TABLE | __pa(pud_page_tramp)));
set_pgd(&trampoline_pgd_entry,
__pgd(_KERNPG_TABLE | __pa(p4d_page_tramp)));
} else {
set_pgd(&trampoline_pgd_entry,
__pgd(_KERNPG_TABLE | __pa(pud_page_tramp)));
}
set_pgd(&trampoline_pgd_entry,
__pgd(_KERNPG_TABLE | __pa(p4d_page_tramp)));
}
/*
* Create PGD aligned trampoline table to allow real mode initialization
* of additional CPUs. Consume only 1 low memory page.
* The real mode trampoline, which is required for bootstrapping CPUs
* occupies only a small area under the low 1MB. See reserve_real_mode()
* for details.
*
* If KASLR is disabled the first PGD entry of the direct mapping is copied
* to map the real mode trampoline.
*
* If KASLR is enabled, copy only the PUD which covers the low 1MB
* area. This limits the randomization granularity to 1GB for both 4-level
* and 5-level paging.
*/
void __meminit init_trampoline(void)
{
if (!kaslr_memory_enabled()) {
init_trampoline_default();
return;
}
if (pgtable_l5_enabled())
init_trampoline_p4d();
else
init_trampoline_pud();
init_trampoline_pud();
}

View File

@ -2209,8 +2209,6 @@ int set_pages_rw(struct page *page, int numpages)
return set_memory_rw(addr, numpages);
}
#ifdef CONFIG_DEBUG_PAGEALLOC
static int __set_pages_p(struct page *page, int numpages)
{
unsigned long tempaddr = (unsigned long) page_address(page);
@ -2249,6 +2247,16 @@ static int __set_pages_np(struct page *page, int numpages)
return __change_page_attr_set_clr(&cpa, 0);
}
int set_direct_map_invalid_noflush(struct page *page)
{
return __set_pages_np(page, 1);
}
int set_direct_map_default_noflush(struct page *page)
{
return __set_pages_p(page, 1);
}
void __kernel_map_pages(struct page *page, int numpages, int enable)
{
if (PageHighMem(page))
@ -2282,7 +2290,6 @@ void __kernel_map_pages(struct page *page, int numpages, int enable)
}
#ifdef CONFIG_HIBERNATION
bool kernel_page_present(struct page *page)
{
unsigned int level;
@ -2294,11 +2301,8 @@ bool kernel_page_present(struct page *page)
pte = lookup_address((unsigned long)page_address(page), &level);
return (pte_val(*pte) & _PAGE_PRESENT);
}
#endif /* CONFIG_HIBERNATION */
#endif /* CONFIG_DEBUG_PAGEALLOC */
int __init kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address,
unsigned numpages, unsigned long page_flags)
{

View File

@ -373,14 +373,14 @@ static void pgd_prepopulate_user_pmd(struct mm_struct *mm,
static struct kmem_cache *pgd_cache;
static int __init pgd_cache_init(void)
void __init pgd_cache_init(void)
{
/*
* When PAE kernel is running as a Xen domain, it does not use
* shared kernel pmd. And this requires a whole page for pgd.
*/
if (!SHARED_KERNEL_PMD)
return 0;
return;
/*
* when PAE kernel is not running as a Xen domain, it uses
@ -390,9 +390,7 @@ static int __init pgd_cache_init(void)
*/
pgd_cache = kmem_cache_create("pgd_cache", PGD_SIZE, PGD_ALIGN,
SLAB_PANIC, NULL);
return 0;
}
core_initcall(pgd_cache_init);
static inline pgd_t *_pgd_alloc(void)
{
@ -420,6 +418,10 @@ static inline void _pgd_free(pgd_t *pgd)
}
#else
void __init pgd_cache_init(void)
{
}
static inline pgd_t *_pgd_alloc(void)
{
return (pgd_t *)__get_free_pages(PGALLOC_GFP, PGD_ALLOCATION_ORDER);

View File

@ -634,7 +634,7 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
this_cpu_write(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen, mm_tlb_gen);
}
static void flush_tlb_func_local(void *info, enum tlb_flush_reason reason)
static void flush_tlb_func_local(const void *info, enum tlb_flush_reason reason)
{
const struct flush_tlb_info *f = info;
@ -722,43 +722,81 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
*/
unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
static DEFINE_PER_CPU_SHARED_ALIGNED(struct flush_tlb_info, flush_tlb_info);
#ifdef CONFIG_DEBUG_VM
static DEFINE_PER_CPU(unsigned int, flush_tlb_info_idx);
#endif
static inline struct flush_tlb_info *get_flush_tlb_info(struct mm_struct *mm,
unsigned long start, unsigned long end,
unsigned int stride_shift, bool freed_tables,
u64 new_tlb_gen)
{
struct flush_tlb_info *info = this_cpu_ptr(&flush_tlb_info);
#ifdef CONFIG_DEBUG_VM
/*
* Ensure that the following code is non-reentrant and flush_tlb_info
* is not overwritten. This means no TLB flushing is initiated by
* interrupt handlers and machine-check exception handlers.
*/
BUG_ON(this_cpu_inc_return(flush_tlb_info_idx) != 1);
#endif
info->start = start;
info->end = end;
info->mm = mm;
info->stride_shift = stride_shift;
info->freed_tables = freed_tables;
info->new_tlb_gen = new_tlb_gen;
return info;
}
static inline void put_flush_tlb_info(void)
{
#ifdef CONFIG_DEBUG_VM
/* Complete reentrency prevention checks */
barrier();
this_cpu_dec(flush_tlb_info_idx);
#endif
}
void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
unsigned long end, unsigned int stride_shift,
bool freed_tables)
{
struct flush_tlb_info *info;
u64 new_tlb_gen;
int cpu;
struct flush_tlb_info info = {
.mm = mm,
.stride_shift = stride_shift,
.freed_tables = freed_tables,
};
cpu = get_cpu();
/* This is also a barrier that synchronizes with switch_mm(). */
info.new_tlb_gen = inc_mm_tlb_gen(mm);
/* Should we flush just the requested range? */
if ((end != TLB_FLUSH_ALL) &&
((end - start) >> stride_shift) <= tlb_single_page_flush_ceiling) {
info.start = start;
info.end = end;
} else {
info.start = 0UL;
info.end = TLB_FLUSH_ALL;
if ((end == TLB_FLUSH_ALL) ||
((end - start) >> stride_shift) > tlb_single_page_flush_ceiling) {
start = 0;
end = TLB_FLUSH_ALL;
}
/* This is also a barrier that synchronizes with switch_mm(). */
new_tlb_gen = inc_mm_tlb_gen(mm);
info = get_flush_tlb_info(mm, start, end, stride_shift, freed_tables,
new_tlb_gen);
if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) {
VM_WARN_ON(irqs_disabled());
lockdep_assert_irqs_enabled();
local_irq_disable();
flush_tlb_func_local(&info, TLB_LOCAL_MM_SHOOTDOWN);
flush_tlb_func_local(info, TLB_LOCAL_MM_SHOOTDOWN);
local_irq_enable();
}
if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids)
flush_tlb_others(mm_cpumask(mm), &info);
flush_tlb_others(mm_cpumask(mm), info);
put_flush_tlb_info();
put_cpu();
}
@ -787,38 +825,48 @@ static void do_kernel_range_flush(void *info)
void flush_tlb_kernel_range(unsigned long start, unsigned long end)
{
/* Balance as user space task's flush, a bit conservative */
if (end == TLB_FLUSH_ALL ||
(end - start) > tlb_single_page_flush_ceiling << PAGE_SHIFT) {
on_each_cpu(do_flush_tlb_all, NULL, 1);
} else {
struct flush_tlb_info info;
info.start = start;
info.end = end;
on_each_cpu(do_kernel_range_flush, &info, 1);
struct flush_tlb_info *info;
preempt_disable();
info = get_flush_tlb_info(NULL, start, end, 0, false, 0);
on_each_cpu(do_kernel_range_flush, info, 1);
put_flush_tlb_info();
preempt_enable();
}
}
/*
* arch_tlbbatch_flush() performs a full TLB flush regardless of the active mm.
* This means that the 'struct flush_tlb_info' that describes which mappings to
* flush is actually fixed. We therefore set a single fixed struct and use it in
* arch_tlbbatch_flush().
*/
static const struct flush_tlb_info full_flush_tlb_info = {
.mm = NULL,
.start = 0,
.end = TLB_FLUSH_ALL,
};
void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
{
struct flush_tlb_info info = {
.mm = NULL,
.start = 0UL,
.end = TLB_FLUSH_ALL,
};
int cpu = get_cpu();
if (cpumask_test_cpu(cpu, &batch->cpumask)) {
VM_WARN_ON(irqs_disabled());
lockdep_assert_irqs_enabled();
local_irq_disable();
flush_tlb_func_local(&info, TLB_LOCAL_SHOOTDOWN);
flush_tlb_func_local(&full_flush_tlb_info, TLB_LOCAL_SHOOTDOWN);
local_irq_enable();
}
if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids)
flush_tlb_others(&batch->cpumask, &info);
flush_tlb_others(&batch->cpumask, &full_flush_tlb_info);
cpumask_clear(&batch->cpumask);

View File

@ -2318,8 +2318,6 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
#elif defined(CONFIG_X86_VSYSCALL_EMULATION)
case VSYSCALL_PAGE:
#endif
case FIX_TEXT_POKE0:
case FIX_TEXT_POKE1:
/* All local page mappings */
pte = pfn_pte(phys, prot);
break;

View File

@ -1126,6 +1126,8 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
static inline void init_espfix_bsp(void) { }
#endif
extern void __init pgd_cache_init(void);
#ifndef __HAVE_ARCH_PFN_MODIFY_ALLOWED
static inline bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot)
{

View File

@ -21,6 +21,15 @@
#include <asm/tlbflush.h>
#include <asm/cacheflush.h>
/*
* Blindly accessing user memory from NMI context can be dangerous
* if we're in the middle of switching the current user task or switching
* the loaded mm.
*/
#ifndef nmi_uaccess_okay
# define nmi_uaccess_okay() true
#endif
#ifdef CONFIG_MMU
/*

View File

@ -20,6 +20,7 @@
#include <linux/set_memory.h>
#include <linux/kallsyms.h>
#include <linux/if_vlan.h>
#include <linux/vmalloc.h>
#include <net/sch_generic.h>
@ -503,7 +504,6 @@ struct bpf_prog {
u16 pages; /* Number of allocated pages */
u16 jited:1, /* Is our filter JIT'ed? */
jit_requested:1,/* archs need to JIT the prog */
undo_set_mem:1, /* Passed set_memory_ro() checkpoint */
gpl_compatible:1, /* Is filter GPL compatible? */
cb_access:1, /* Is control block accessed? */
dst_needed:1, /* Do we need dst entry? */
@ -733,24 +733,15 @@ bpf_ctx_narrow_access_ok(u32 off, u32 size, u32 size_default)
static inline void bpf_prog_lock_ro(struct bpf_prog *fp)
{
fp->undo_set_mem = 1;
set_vm_flush_reset_perms(fp);
set_memory_ro((unsigned long)fp, fp->pages);
}
static inline void bpf_prog_unlock_ro(struct bpf_prog *fp)
{
if (fp->undo_set_mem)
set_memory_rw((unsigned long)fp, fp->pages);
}
static inline void bpf_jit_binary_lock_ro(struct bpf_binary_header *hdr)
{
set_vm_flush_reset_perms(hdr);
set_memory_ro((unsigned long)hdr, hdr->pages);
}
static inline void bpf_jit_binary_unlock_ro(struct bpf_binary_header *hdr)
{
set_memory_rw((unsigned long)hdr, hdr->pages);
set_memory_x((unsigned long)hdr, hdr->pages);
}
static inline struct bpf_binary_header *
@ -788,7 +779,6 @@ void __bpf_prog_free(struct bpf_prog *fp);
static inline void bpf_prog_unlock_free(struct bpf_prog *fp)
{
bpf_prog_unlock_ro(fp);
__bpf_prog_free(fp);
}

View File

@ -2610,37 +2610,31 @@ static inline void kernel_poison_pages(struct page *page, int numpages,
int enable) { }
#endif
#ifdef CONFIG_DEBUG_PAGEALLOC
extern bool _debug_pagealloc_enabled;
extern void __kernel_map_pages(struct page *page, int numpages, int enable);
static inline bool debug_pagealloc_enabled(void)
{
return _debug_pagealloc_enabled;
return IS_ENABLED(CONFIG_DEBUG_PAGEALLOC) && _debug_pagealloc_enabled;
}
#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_ARCH_HAS_SET_DIRECT_MAP)
extern void __kernel_map_pages(struct page *page, int numpages, int enable);
static inline void
kernel_map_pages(struct page *page, int numpages, int enable)
{
if (!debug_pagealloc_enabled())
return;
__kernel_map_pages(page, numpages, enable);
}
#ifdef CONFIG_HIBERNATION
extern bool kernel_page_present(struct page *page);
#endif /* CONFIG_HIBERNATION */
#else /* CONFIG_DEBUG_PAGEALLOC */
#else /* CONFIG_DEBUG_PAGEALLOC || CONFIG_ARCH_HAS_SET_DIRECT_MAP */
static inline void
kernel_map_pages(struct page *page, int numpages, int enable) {}
#ifdef CONFIG_HIBERNATION
static inline bool kernel_page_present(struct page *page) { return true; }
#endif /* CONFIG_HIBERNATION */
static inline bool debug_pagealloc_enabled(void)
{
return false;
}
#endif /* CONFIG_DEBUG_PAGEALLOC */
#endif /* CONFIG_DEBUG_PAGEALLOC || CONFIG_ARCH_HAS_SET_DIRECT_MAP */
#ifdef __HAVE_ARCH_GATE_AREA
extern struct vm_area_struct *get_gate_vma(struct mm_struct *mm);

View File

@ -76,6 +76,7 @@ extern void exit_itimers(struct signal_struct *);
extern long _do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *, unsigned long);
extern long do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *);
struct task_struct *fork_idle(int);
struct mm_struct *copy_init_mm(void);
extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags);
extern long kernel_wait4(pid_t, int __user *, int, struct rusage *);

View File

@ -17,6 +17,17 @@ static inline int set_memory_x(unsigned long addr, int numpages) { return 0; }
static inline int set_memory_nx(unsigned long addr, int numpages) { return 0; }
#endif
#ifndef CONFIG_ARCH_HAS_SET_DIRECT_MAP
static inline int set_direct_map_invalid_noflush(struct page *page)
{
return 0;
}
static inline int set_direct_map_default_noflush(struct page *page)
{
return 0;
}
#endif
#ifndef set_mce_nospec
static inline int set_mce_nospec(unsigned long pfn)
{

View File

@ -115,6 +115,7 @@ struct uprobes_state {
struct xol_area *xol_area;
};
extern void __init uprobes_init(void);
extern int set_swbp(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long vaddr);
extern int set_orig_insn(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long vaddr);
extern bool is_swbp_insn(uprobe_opcode_t *insn);
@ -154,6 +155,10 @@ extern void arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr,
struct uprobes_state {
};
static inline void uprobes_init(void)
{
}
#define uprobe_get_trap_addr(regs) instruction_pointer(regs)
static inline int

View File

@ -21,6 +21,11 @@ struct notifier_block; /* in notifier.h */
#define VM_UNINITIALIZED 0x00000020 /* vm_struct is not fully initialized */
#define VM_NO_GUARD 0x00000040 /* don't add guard page */
#define VM_KASAN 0x00000080 /* has allocated kasan shadow memory */
/*
* Memory with VM_FLUSH_RESET_PERMS cannot be freed in an interrupt or with
* vfree_atomic().
*/
#define VM_FLUSH_RESET_PERMS 0x00000100 /* Reset direct map and flush TLB on unmap */
/* bits [20..32] reserved for arch specific ioremap internals */
/*
@ -142,6 +147,13 @@ extern int map_kernel_range_noflush(unsigned long start, unsigned long size,
pgprot_t prot, struct page **pages);
extern void unmap_kernel_range_noflush(unsigned long addr, unsigned long size);
extern void unmap_kernel_range(unsigned long addr, unsigned long size);
static inline void set_vm_flush_reset_perms(void *addr)
{
struct vm_struct *vm = find_vm_area(addr);
if (vm)
vm->flags |= VM_FLUSH_RESET_PERMS;
}
#else
static inline int
map_kernel_range_noflush(unsigned long start, unsigned long size,
@ -157,6 +169,9 @@ static inline void
unmap_kernel_range(unsigned long addr, unsigned long size)
{
}
static inline void set_vm_flush_reset_perms(void *addr)
{
}
#endif
/* Allocate/destroy a 'vmalloc' VM area. */

View File

@ -504,6 +504,10 @@ void __init __weak thread_stack_cache_init(void)
void __init __weak mem_encrypt_init(void) { }
void __init __weak poking_init(void) { }
void __init __weak pgd_cache_init(void) { }
bool initcall_debug;
core_param(initcall_debug, initcall_debug, bool, 0644);
@ -535,6 +539,7 @@ static void __init mm_init(void)
init_espfix_bsp();
/* Should be run after espfix64 is set up. */
pti_init();
pgd_cache_init();
}
void __init __weak arch_call_rest_init(void)
@ -737,6 +742,7 @@ asmlinkage __visible void __init start_kernel(void)
taskstats_init_early();
delayacct_init();
poking_init();
check_bugs();
acpi_subsystem_init();

View File

@ -848,7 +848,6 @@ void __weak bpf_jit_free(struct bpf_prog *fp)
if (fp->jited) {
struct bpf_binary_header *hdr = bpf_jit_binary_hdr(fp);
bpf_jit_binary_unlock_ro(hdr);
bpf_jit_binary_free(hdr);
WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(fp));

View File

@ -2294,16 +2294,14 @@ static struct notifier_block uprobe_exception_nb = {
.priority = INT_MAX-1, /* notified after kprobes, kgdb */
};
static int __init init_uprobes(void)
void __init uprobes_init(void)
{
int i;
for (i = 0; i < UPROBES_HASH_SZ; i++)
mutex_init(&uprobes_mmap_mutex[i]);
if (percpu_init_rwsem(&dup_mmap_sem))
return -ENOMEM;
BUG_ON(percpu_init_rwsem(&dup_mmap_sem));
return register_die_notifier(&uprobe_exception_nb);
BUG_ON(register_die_notifier(&uprobe_exception_nb));
}
__initcall(init_uprobes);

View File

@ -815,6 +815,7 @@ void __init fork_init(void)
#endif
lockdep_init_task(&init_task);
uprobes_init();
}
int __weak arch_dup_task_struct(struct task_struct *dst,
@ -1298,13 +1299,20 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
complete_vfork_done(tsk);
}
/*
* Allocate a new mm structure and copy contents from the
* mm structure of the passed in task structure.
/**
* dup_mm() - duplicates an existing mm structure
* @tsk: the task_struct with which the new mm will be associated.
* @oldmm: the mm to duplicate.
*
* Allocates a new mm structure and duplicates the provided @oldmm structure
* content into it.
*
* Return: the duplicated mm or NULL on failure.
*/
static struct mm_struct *dup_mm(struct task_struct *tsk)
static struct mm_struct *dup_mm(struct task_struct *tsk,
struct mm_struct *oldmm)
{
struct mm_struct *mm, *oldmm = current->mm;
struct mm_struct *mm;
int err;
mm = allocate_mm();
@ -1371,7 +1379,7 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
}
retval = -ENOMEM;
mm = dup_mm(tsk);
mm = dup_mm(tsk, current->mm);
if (!mm)
goto fail_nomem;
@ -2186,6 +2194,11 @@ struct task_struct *fork_idle(int cpu)
return task;
}
struct mm_struct *copy_init_mm(void)
{
return dup_mm(NULL, &init_mm);
}
/*
* Ok, this is the main fork-routine.
*

View File

@ -98,6 +98,10 @@ DEFINE_MUTEX(module_mutex);
EXPORT_SYMBOL_GPL(module_mutex);
static LIST_HEAD(modules);
/* Work queue for freeing init sections in success case */
static struct work_struct init_free_wq;
static struct llist_head init_free_list;
#ifdef CONFIG_MODULES_TREE_LOOKUP
/*
@ -1949,9 +1953,16 @@ void module_enable_ro(const struct module *mod, bool after_init)
if (!rodata_enabled)
return;
set_vm_flush_reset_perms(mod->core_layout.base);
set_vm_flush_reset_perms(mod->init_layout.base);
frob_text(&mod->core_layout, set_memory_ro);
frob_text(&mod->core_layout, set_memory_x);
frob_rodata(&mod->core_layout, set_memory_ro);
frob_text(&mod->init_layout, set_memory_ro);
frob_text(&mod->init_layout, set_memory_x);
frob_rodata(&mod->init_layout, set_memory_ro);
if (after_init)
@ -1967,15 +1978,6 @@ static void module_enable_nx(const struct module *mod)
frob_writable_data(&mod->init_layout, set_memory_nx);
}
static void module_disable_nx(const struct module *mod)
{
frob_rodata(&mod->core_layout, set_memory_x);
frob_ro_after_init(&mod->core_layout, set_memory_x);
frob_writable_data(&mod->core_layout, set_memory_x);
frob_rodata(&mod->init_layout, set_memory_x);
frob_writable_data(&mod->init_layout, set_memory_x);
}
/* Iterate through all modules and set each module's text as RW */
void set_all_modules_text_rw(void)
{
@ -2019,23 +2021,8 @@ void set_all_modules_text_ro(void)
}
mutex_unlock(&module_mutex);
}
static void disable_ro_nx(const struct module_layout *layout)
{
if (rodata_enabled) {
frob_text(layout, set_memory_rw);
frob_rodata(layout, set_memory_rw);
frob_ro_after_init(layout, set_memory_rw);
}
frob_rodata(layout, set_memory_x);
frob_ro_after_init(layout, set_memory_x);
frob_writable_data(layout, set_memory_x);
}
#else
static void disable_ro_nx(const struct module_layout *layout) { }
static void module_enable_nx(const struct module *mod) { }
static void module_disable_nx(const struct module *mod) { }
#endif
#ifdef CONFIG_LIVEPATCH
@ -2115,6 +2102,11 @@ static void free_module_elf(struct module *mod)
void __weak module_memfree(void *module_region)
{
/*
* This memory may be RO, and freeing RO memory in an interrupt is not
* supported by vmalloc.
*/
WARN_ON(in_interrupt());
vfree(module_region);
}
@ -2166,7 +2158,6 @@ static void free_module(struct module *mod)
mutex_unlock(&module_mutex);
/* This may be empty, but that's OK */
disable_ro_nx(&mod->init_layout);
module_arch_freeing_init(mod);
module_memfree(mod->init_layout.base);
kfree(mod->args);
@ -2176,7 +2167,6 @@ static void free_module(struct module *mod)
lockdep_free_key_range(mod->core_layout.base, mod->core_layout.size);
/* Finally, free the core (containing the module structure) */
disable_ro_nx(&mod->core_layout);
module_memfree(mod->core_layout.base);
}
@ -3415,17 +3405,34 @@ static void do_mod_ctors(struct module *mod)
/* For freeing module_init on success, in case kallsyms traversing */
struct mod_initfree {
struct rcu_head rcu;
struct llist_node node;
void *module_init;
};
static void do_free_init(struct rcu_head *head)
static void do_free_init(struct work_struct *w)
{
struct mod_initfree *m = container_of(head, struct mod_initfree, rcu);
module_memfree(m->module_init);
kfree(m);
struct llist_node *pos, *n, *list;
struct mod_initfree *initfree;
list = llist_del_all(&init_free_list);
synchronize_rcu();
llist_for_each_safe(pos, n, list) {
initfree = container_of(pos, struct mod_initfree, node);
module_memfree(initfree->module_init);
kfree(initfree);
}
}
static int __init modules_wq_init(void)
{
INIT_WORK(&init_free_wq, do_free_init);
init_llist_head(&init_free_list);
return 0;
}
module_init(modules_wq_init);
/*
* This is where the real work happens.
*
@ -3502,7 +3509,6 @@ static noinline int do_init_module(struct module *mod)
#endif
module_enable_ro(mod, true);
mod_tree_remove_init(mod);
disable_ro_nx(&mod->init_layout);
module_arch_freeing_init(mod);
mod->init_layout.base = NULL;
mod->init_layout.size = 0;
@ -3513,14 +3519,18 @@ static noinline int do_init_module(struct module *mod)
* We want to free module_init, but be aware that kallsyms may be
* walking this with preempt disabled. In all the failure paths, we
* call synchronize_rcu(), but we don't want to slow down the success
* path, so use actual RCU here.
* path. module_memfree() cannot be called in an interrupt, so do the
* work and call synchronize_rcu() in a work queue.
*
* Note that module_alloc() on most architectures creates W+X page
* mappings which won't be cleaned up until do_free_init() runs. Any
* code such as mark_rodata_ro() which depends on those mappings to
* be cleaned up needs to sync with the queued work - ie
* rcu_barrier()
*/
call_rcu(&freeinit->rcu, do_free_init);
if (llist_add(&freeinit->node, &init_free_list))
schedule_work(&init_free_wq);
mutex_unlock(&module_mutex);
wake_up_all(&module_wq);
@ -3817,10 +3827,6 @@ static int load_module(struct load_info *info, const char __user *uargs,
module_bug_cleanup(mod);
mutex_unlock(&module_mutex);
/* we can't deallocate the module until we clear memory protection */
module_disable_ro(mod);
module_disable_nx(mod);
ddebug_cleanup:
ftrace_release_mod(mod);
dynamic_debug_remove(mod, info->debug);

View File

@ -1342,8 +1342,9 @@ static inline void do_copy_page(long *dst, long *src)
* safe_copy_page - Copy a page in a safe way.
*
* Check if the page we are going to copy is marked as present in the kernel
* page tables (this always is the case if CONFIG_DEBUG_PAGEALLOC is not set
* and in that case kernel_page_present() always returns 'true').
* page tables. This always is the case if CONFIG_DEBUG_PAGEALLOC or
* CONFIG_ARCH_HAS_SET_DIRECT_MAP is not set. In that case kernel_page_present()
* always returns 'true'.
*/
static void safe_copy_page(void *dst, struct page *s_page)
{

View File

@ -14,6 +14,8 @@
#include <linux/syscalls.h>
#include <linux/error-injection.h>
#include <asm/tlb.h>
#include "trace_probe.h"
#include "trace.h"
@ -163,6 +165,10 @@ BPF_CALL_3(bpf_probe_write_user, void *, unsafe_ptr, const void *, src,
* access_ok() should prevent writing to non-user memory, but in
* some situations (nommu, temporary switch, etc) access_ok() does
* not provide enough validation, hence the check on KERNEL_DS.
*
* nmi_uaccess_okay() ensures the probe is not run in an interim
* state, when the task or mm are switched. This is specifically
* required to prevent the use of temporary mm.
*/
if (unlikely(in_interrupt() ||
@ -170,6 +176,8 @@ BPF_CALL_3(bpf_probe_write_user, void *, unsafe_ptr, const void *, src,
return -EPERM;
if (unlikely(uaccess_kernel()))
return -EPERM;
if (unlikely(!nmi_uaccess_okay()))
return -EPERM;
if (!access_ok(unsafe_ptr, size))
return -EPERM;

View File

@ -1144,7 +1144,9 @@ static __always_inline bool free_pages_prepare(struct page *page,
}
arch_free_page(page, order);
kernel_poison_pages(page, 1 << order, 0);
kernel_map_pages(page, 1 << order, 0);
if (debug_pagealloc_enabled())
kernel_map_pages(page, 1 << order, 0);
kasan_free_nondeferred_pages(page, order);
return true;
@ -2014,7 +2016,8 @@ inline void post_alloc_hook(struct page *page, unsigned int order,
set_page_refcounted(page);
arch_alloc_page(page, order);
kernel_map_pages(page, 1 << order, 1);
if (debug_pagealloc_enabled())
kernel_map_pages(page, 1 << order, 1);
kasan_alloc_pages(page, order);
kernel_poison_pages(page, 1 << order, 1);
set_page_owner(page, order, gfp_flags);

View File

@ -18,6 +18,7 @@
#include <linux/interrupt.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/set_memory.h>
#include <linux/debugobjects.h>
#include <linux/kallsyms.h>
#include <linux/list.h>
@ -1059,24 +1060,9 @@ static void vb_free(const void *addr, unsigned long size)
spin_unlock(&vb->lock);
}
/**
* vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer
*
* The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily
* to amortize TLB flushing overheads. What this means is that any page you
* have now, may, in a former life, have been mapped into kernel virtual
* address by the vmap layer and so there might be some CPUs with TLB entries
* still referencing that page (additional to the regular 1:1 kernel mapping).
*
* vm_unmap_aliases flushes all such lazy mappings. After it returns, we can
* be sure that none of the pages we have control over will have any aliases
* from the vmap layer.
*/
void vm_unmap_aliases(void)
static void _vm_unmap_aliases(unsigned long start, unsigned long end, int flush)
{
unsigned long start = ULONG_MAX, end = 0;
int cpu;
int flush = 0;
if (unlikely(!vmap_initialized))
return;
@ -1113,6 +1099,27 @@ void vm_unmap_aliases(void)
flush_tlb_kernel_range(start, end);
mutex_unlock(&vmap_purge_lock);
}
/**
* vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer
*
* The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily
* to amortize TLB flushing overheads. What this means is that any page you
* have now, may, in a former life, have been mapped into kernel virtual
* address by the vmap layer and so there might be some CPUs with TLB entries
* still referencing that page (additional to the regular 1:1 kernel mapping).
*
* vm_unmap_aliases flushes all such lazy mappings. After it returns, we can
* be sure that none of the pages we have control over will have any aliases
* from the vmap layer.
*/
void vm_unmap_aliases(void)
{
unsigned long start = ULONG_MAX, end = 0;
int flush = 0;
_vm_unmap_aliases(start, end, flush);
}
EXPORT_SYMBOL_GPL(vm_unmap_aliases);
/**
@ -1505,6 +1512,72 @@ struct vm_struct *remove_vm_area(const void *addr)
return NULL;
}
static inline void set_area_direct_map(const struct vm_struct *area,
int (*set_direct_map)(struct page *page))
{
int i;
for (i = 0; i < area->nr_pages; i++)
if (page_address(area->pages[i]))
set_direct_map(area->pages[i]);
}
/* Handle removing and resetting vm mappings related to the vm_struct. */
static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages)
{
unsigned long addr = (unsigned long)area->addr;
unsigned long start = ULONG_MAX, end = 0;
int flush_reset = area->flags & VM_FLUSH_RESET_PERMS;
int i;
/*
* The below block can be removed when all architectures that have
* direct map permissions also have set_direct_map_() implementations.
* This is concerned with resetting the direct map any an vm alias with
* execute permissions, without leaving a RW+X window.
*/
if (flush_reset && !IS_ENABLED(CONFIG_ARCH_HAS_SET_DIRECT_MAP)) {
set_memory_nx(addr, area->nr_pages);
set_memory_rw(addr, area->nr_pages);
}
remove_vm_area(area->addr);
/* If this is not VM_FLUSH_RESET_PERMS memory, no need for the below. */
if (!flush_reset)
return;
/*
* If not deallocating pages, just do the flush of the VM area and
* return.
*/
if (!deallocate_pages) {
vm_unmap_aliases();
return;
}
/*
* If execution gets here, flush the vm mapping and reset the direct
* map. Find the start and end range of the direct mappings to make sure
* the vm_unmap_aliases() flush includes the direct map.
*/
for (i = 0; i < area->nr_pages; i++) {
if (page_address(area->pages[i])) {
start = min(addr, start);
end = max(addr, end);
}
}
/*
* Set direct map to something invalid so that it won't be cached if
* there are any accesses after the TLB flush, then flush the TLB and
* reset the direct map permissions to the default.
*/
set_area_direct_map(area, set_direct_map_invalid_noflush);
_vm_unmap_aliases(start, end, 1);
set_area_direct_map(area, set_direct_map_default_noflush);
}
static void __vunmap(const void *addr, int deallocate_pages)
{
struct vm_struct *area;
@ -1526,7 +1599,8 @@ static void __vunmap(const void *addr, int deallocate_pages)
debug_check_no_locks_freed(area->addr, get_vm_area_size(area));
debug_check_no_obj_freed(area->addr, get_vm_area_size(area));
remove_vm_area(addr);
vm_remove_mappings(area, deallocate_pages);
if (deallocate_pages) {
int i;
@ -1961,8 +2035,9 @@ EXPORT_SYMBOL(vzalloc_node);
*/
void *vmalloc_exec(unsigned long size)
{
return __vmalloc_node(size, 1, GFP_KERNEL, PAGE_KERNEL_EXEC,
NUMA_NO_NODE, __builtin_return_address(0));
return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
GFP_KERNEL, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS,
NUMA_NO_NODE, __builtin_return_address(0));
}
#if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32)