From 46612b751c4941c5c0472ddf04027e877ae5990f Mon Sep 17 00:00:00 2001 From: zhongjiang Date: Tue, 5 Mar 2019 15:41:16 -0800 Subject: [PATCH 001/159] mm: hwpoison: fix thp split handing in soft_offline_in_use_page() When soft_offline_in_use_page() runs on a thp tail page after pmd is split, we trigger the following VM_BUG_ON_PAGE(): Memory failure: 0x3755ff: non anonymous thp __get_any_page: 0x3755ff: unknown zero refcount page type 2fffff80000000 Soft offlining pfn 0x34d805 at process virtual address 0x20fff000 page:ffffea000d360140 count:0 mapcount:0 mapping:0000000000000000 index:0x1 flags: 0x2fffff80000000() raw: 002fffff80000000 ffffea000d360108 ffffea000d360188 0000000000000000 raw: 0000000000000001 0000000000000000 00000000ffffffff 0000000000000000 page dumped because: VM_BUG_ON_PAGE(page_ref_count(page) == 0) ------------[ cut here ]------------ kernel BUG at ./include/linux/mm.h:519! soft_offline_in_use_page() passed refcount and page lock from tail page to head page, which is not needed because we can pass any subpage to split_huge_page(). Naoya had fixed a similar issue in c3901e722b29 ("mm: hwpoison: fix thp split handling in memory_failure()"). But he missed fixing soft offline. Link: http://lkml.kernel.org/r/1551452476-24000-1-git-send-email-zhongjiang@huawei.com Fixes: 61f5d698cc97 ("mm: re-enable THP") Signed-off-by: zhongjiang Acked-by: Naoya Horiguchi Cc: Michal Hocko Cc: Hugh Dickins Cc: Kirill A. Shutemov Cc: Andrea Arcangeli Cc: [4.5+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 831be5ff5f4d..fc8b51744579 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1825,19 +1825,17 @@ static int soft_offline_in_use_page(struct page *page, int flags) struct page *hpage = compound_head(page); if (!PageHuge(page) && PageTransHuge(hpage)) { - lock_page(hpage); - if (!PageAnon(hpage) || unlikely(split_huge_page(hpage))) { - unlock_page(hpage); - if (!PageAnon(hpage)) + lock_page(page); + if (!PageAnon(page) || unlikely(split_huge_page(page))) { + unlock_page(page); + if (!PageAnon(page)) pr_info("soft offline: %#lx: non anonymous thp\n", page_to_pfn(page)); else pr_info("soft offline: %#lx: thp split failed\n", page_to_pfn(page)); - put_hwpoison_page(hpage); + put_hwpoison_page(page); return -EBUSY; } - unlock_page(hpage); - get_hwpoison_page(page); - put_hwpoison_page(hpage); + unlock_page(page); } /* From 7771bdbbfd3d6f204631b6fd9e1bbc30cd15918e Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Tue, 5 Mar 2019 15:41:20 -0800 Subject: [PATCH 002/159] kasan: remove use after scope bugs detection. Use after scope bugs detector seems to be almost entirely useless for the linux kernel. It exists over two years, but I've seen only one valid bug so far [1]. And the bug was fixed before it has been reported. There were some other use-after-scope reports, but they were false-positives due to different reasons like incompatibility with structleak plugin. This feature significantly increases stack usage, especially with GCC < 9 version, and causes a 32K stack overflow. It probably adds performance penalty too. Given all that, let's remove use-after-scope detector entirely. While preparing this patch I've noticed that we mistakenly enable use-after-scope detection for clang compiler regardless of CONFIG_KASAN_EXTRA setting. This is also fixed now. [1] http://lkml.kernel.org/r/<20171129052106.rhgbjhhis53hkgfn@wfg-t540p.sh.intel.com> Link: http://lkml.kernel.org/r/20190111185842.13978-1-aryabinin@virtuozzo.com Signed-off-by: Andrey Ryabinin Acked-by: Will Deacon [arm64] Cc: Qian Cai Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Catalin Marinas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/include/asm/memory.h | 4 ---- lib/Kconfig.debug | 1 - lib/Kconfig.kasan | 10 ---------- lib/test_kasan.c | 24 ------------------------ mm/kasan/generic.c | 19 ------------------- mm/kasan/generic_report.c | 3 --- mm/kasan/kasan.h | 3 --- scripts/Makefile.kasan | 5 ----- scripts/gcc-plugins/Kconfig | 4 ---- 9 files changed, 73 deletions(-) diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index 0c656850eeea..b01ef0180a03 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -80,11 +80,7 @@ */ #ifdef CONFIG_KASAN #define KASAN_SHADOW_SIZE (UL(1) << (VA_BITS - KASAN_SHADOW_SCALE_SHIFT)) -#ifdef CONFIG_KASAN_EXTRA -#define KASAN_THREAD_SHIFT 2 -#else #define KASAN_THREAD_SHIFT 1 -#endif /* CONFIG_KASAN_EXTRA */ #else #define KASAN_SHADOW_SIZE (0) #define KASAN_THREAD_SHIFT 0 diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index d4df5b24d75e..a219f3488ad7 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -222,7 +222,6 @@ config ENABLE_MUST_CHECK config FRAME_WARN int "Warn for stack frames larger than (needs gcc 4.4)" range 0 8192 - default 3072 if KASAN_EXTRA default 2048 if GCC_PLUGIN_LATENT_ENTROPY default 1280 if (!64BIT && PARISC) default 1024 if (!64BIT && !PARISC) diff --git a/lib/Kconfig.kasan b/lib/Kconfig.kasan index 9737059ec58b..9950b660e62d 100644 --- a/lib/Kconfig.kasan +++ b/lib/Kconfig.kasan @@ -78,16 +78,6 @@ config KASAN_SW_TAGS endchoice -config KASAN_EXTRA - bool "KASAN: extra checks" - depends on KASAN_GENERIC && DEBUG_KERNEL && !COMPILE_TEST - help - This enables further checks in generic KASAN, for now it only - includes the address-use-after-scope check that can lead to - excessive kernel stack usage, frame size warnings and longer - compile time. - See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81715 - choice prompt "Instrumentation type" depends on KASAN diff --git a/lib/test_kasan.c b/lib/test_kasan.c index 51b78405bf24..7de2702621dc 100644 --- a/lib/test_kasan.c +++ b/lib/test_kasan.c @@ -480,29 +480,6 @@ static noinline void __init copy_user_test(void) kfree(kmem); } -static noinline void __init use_after_scope_test(void) -{ - volatile char *volatile p; - - pr_info("use-after-scope on int\n"); - { - int local = 0; - - p = (char *)&local; - } - p[0] = 1; - p[3] = 1; - - pr_info("use-after-scope on array\n"); - { - char local[1024] = {0}; - - p = local; - } - p[0] = 1; - p[1023] = 1; -} - static noinline void __init kasan_alloca_oob_left(void) { volatile int i = 10; @@ -682,7 +659,6 @@ static int __init kmalloc_tests_init(void) kasan_alloca_oob_right(); ksize_unpoisons_memory(); copy_user_test(); - use_after_scope_test(); kmem_cache_double_free(); kmem_cache_invalid_free(); kasan_memchr(); diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index ccb6207276e3..504c79363a34 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -275,25 +275,6 @@ EXPORT_SYMBOL(__asan_storeN_noabort); void __asan_handle_no_return(void) {} EXPORT_SYMBOL(__asan_handle_no_return); -/* Emitted by compiler to poison large objects when they go out of scope. */ -void __asan_poison_stack_memory(const void *addr, size_t size) -{ - /* - * Addr is KASAN_SHADOW_SCALE_SIZE-aligned and the object is surrounded - * by redzones, so we simply round up size to simplify logic. - */ - kasan_poison_shadow(addr, round_up(size, KASAN_SHADOW_SCALE_SIZE), - KASAN_USE_AFTER_SCOPE); -} -EXPORT_SYMBOL(__asan_poison_stack_memory); - -/* Emitted by compiler to unpoison large objects when they go into scope. */ -void __asan_unpoison_stack_memory(const void *addr, size_t size) -{ - kasan_unpoison_shadow(addr, size); -} -EXPORT_SYMBOL(__asan_unpoison_stack_memory); - /* Emitted by compiler to poison alloca()ed objects. */ void __asan_alloca_poison(unsigned long addr, size_t size) { diff --git a/mm/kasan/generic_report.c b/mm/kasan/generic_report.c index 5e12035888f2..36c645939bc9 100644 --- a/mm/kasan/generic_report.c +++ b/mm/kasan/generic_report.c @@ -82,9 +82,6 @@ static const char *get_shadow_bug_type(struct kasan_access_info *info) case KASAN_KMALLOC_FREE: bug_type = "use-after-free"; break; - case KASAN_USE_AFTER_SCOPE: - bug_type = "use-after-scope"; - break; case KASAN_ALLOCA_LEFT: case KASAN_ALLOCA_RIGHT: bug_type = "alloca-out-of-bounds"; diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index ea51b2d898ec..3e0c11f7d7a1 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -34,7 +34,6 @@ #define KASAN_STACK_MID 0xF2 #define KASAN_STACK_RIGHT 0xF3 #define KASAN_STACK_PARTIAL 0xF4 -#define KASAN_USE_AFTER_SCOPE 0xF8 /* * alloca redzone shadow values @@ -187,8 +186,6 @@ void __asan_unregister_globals(struct kasan_global *globals, size_t size); void __asan_loadN(unsigned long addr, size_t size); void __asan_storeN(unsigned long addr, size_t size); void __asan_handle_no_return(void); -void __asan_poison_stack_memory(const void *addr, size_t size); -void __asan_unpoison_stack_memory(const void *addr, size_t size); void __asan_alloca_poison(unsigned long addr, size_t size); void __asan_allocas_unpoison(const void *stack_top, const void *stack_bottom); diff --git a/scripts/Makefile.kasan b/scripts/Makefile.kasan index 6deabedc67fc..6410bd22fe38 100644 --- a/scripts/Makefile.kasan +++ b/scripts/Makefile.kasan @@ -27,14 +27,9 @@ else $(call cc-param,asan-globals=1) \ $(call cc-param,asan-instrumentation-with-call-threshold=$(call_threshold)) \ $(call cc-param,asan-stack=$(CONFIG_KASAN_STACK)) \ - $(call cc-param,asan-use-after-scope=1) \ $(call cc-param,asan-instrument-allocas=1) endif -ifdef CONFIG_KASAN_EXTRA -CFLAGS_KASAN += $(call cc-option, -fsanitize-address-use-after-scope) -endif - endif # CONFIG_KASAN_GENERIC ifdef CONFIG_KASAN_SW_TAGS diff --git a/scripts/gcc-plugins/Kconfig b/scripts/gcc-plugins/Kconfig index d45f7f36b859..d9fd9988ef27 100644 --- a/scripts/gcc-plugins/Kconfig +++ b/scripts/gcc-plugins/Kconfig @@ -68,10 +68,6 @@ config GCC_PLUGIN_LATENT_ENTROPY config GCC_PLUGIN_STRUCTLEAK bool "Force initialization of variables containing userspace addresses" - # Currently STRUCTLEAK inserts initialization out of live scope of - # variables from KASAN point of view. This leads to KASAN false - # positive reports. Prohibit this combination for now. - depends on !KASAN_EXTRA help This plugin zero-initializes any structures containing a __user attribute. This can prevent some classes of information From 4117992df66a26fa33908b4969e04801534baab1 Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Tue, 5 Mar 2019 15:41:24 -0800 Subject: [PATCH 003/159] page_poison: play nicely with KASAN KASAN does not play well with the page poisoning (CONFIG_PAGE_POISONING). It triggers false positives in the allocation path: BUG: KASAN: use-after-free in memchr_inv+0x2ea/0x330 Read of size 8 at addr ffff88881f800000 by task swapper/0 CPU: 0 PID: 0 Comm: swapper Not tainted 5.0.0-rc1+ #54 Call Trace: dump_stack+0xe0/0x19a print_address_description.cold.2+0x9/0x28b kasan_report.cold.3+0x7a/0xb5 __asan_report_load8_noabort+0x19/0x20 memchr_inv+0x2ea/0x330 kernel_poison_pages+0x103/0x3d5 get_page_from_freelist+0x15e7/0x4d90 because KASAN has not yet unpoisoned the shadow page for allocation before it checks memchr_inv() but only found a stale poison pattern. Also, false positives in free path, BUG: KASAN: slab-out-of-bounds in kernel_poison_pages+0x29e/0x3d5 Write of size 4096 at addr ffff8888112cc000 by task swapper/0/1 CPU: 5 PID: 1 Comm: swapper/0 Not tainted 5.0.0-rc1+ #55 Call Trace: dump_stack+0xe0/0x19a print_address_description.cold.2+0x9/0x28b kasan_report.cold.3+0x7a/0xb5 check_memory_region+0x22d/0x250 memset+0x28/0x40 kernel_poison_pages+0x29e/0x3d5 __free_pages_ok+0x75f/0x13e0 due to KASAN adds poisoned redzones around slab objects, but the page poisoning needs to poison the whole page. Link: http://lkml.kernel.org/r/20190114233405.67843-1-cai@lca.pw Signed-off-by: Qian Cai Acked-by: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- mm/page_poison.c | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0b9f577b1a2a..10d0f2ed9f69 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1945,8 +1945,8 @@ inline void post_alloc_hook(struct page *page, unsigned int order, arch_alloc_page(page, order); kernel_map_pages(page, 1 << order, 1); - kernel_poison_pages(page, 1 << order, 1); kasan_alloc_pages(page, order); + kernel_poison_pages(page, 1 << order, 1); set_page_owner(page, order, gfp_flags); } diff --git a/mm/page_poison.c b/mm/page_poison.c index f0c15e9017c0..21d4f97cb49b 100644 --- a/mm/page_poison.c +++ b/mm/page_poison.c @@ -6,6 +6,7 @@ #include #include #include +#include static bool want_page_poisoning __read_mostly; @@ -40,7 +41,10 @@ static void poison_page(struct page *page) { void *addr = kmap_atomic(page); + /* KASAN still think the page is in-use, so skip it. */ + kasan_disable_current(); memset(addr, PAGE_POISON, PAGE_SIZE); + kasan_enable_current(); kunmap_atomic(addr); } From bcf6f55a0d05eedd8ebb6ecc60ae3f93205ad833 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 5 Mar 2019 15:41:27 -0800 Subject: [PATCH 004/159] kasan: fix kasan_check_read/write definitions Building little-endian allmodconfig kernels on arm64 started failing with the generated atomic.h implementation, since we now try to call kasan helpers from the EFI stub: aarch64-linux-gnu-ld: drivers/firmware/efi/libstub/arm-stub.stub.o: in function `atomic_set': include/generated/atomic-instrumented.h:44: undefined reference to `__efistub_kasan_check_write' I suspect that we get similar problems in other files that explicitly disable KASAN for some reason but call atomic_t based helper functions. We can fix this by checking the predefined __SANITIZE_ADDRESS__ macro that the compiler sets instead of checking CONFIG_KASAN, but this in turn requires a small hack in mm/kasan/common.c so we do see the extern declaration there instead of the inline function. Link: http://lkml.kernel.org/r/20181211133453.2835077-1-arnd@arndb.de Fixes: b1864b828644 ("locking/atomics: build atomic headers as required") Signed-off-by: Arnd Bergmann Reported-by: Anders Roxell Acked-by: Andrey Ryabinin Cc: Ard Biesheuvel Cc: Will Deacon Cc: Mark Rutland Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Andrey Konovalov Cc: Stephen Rothwell , Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kasan-checks.h | 2 +- mm/kasan/common.c | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/include/linux/kasan-checks.h b/include/linux/kasan-checks.h index d314150658a4..a61dc075e2ce 100644 --- a/include/linux/kasan-checks.h +++ b/include/linux/kasan-checks.h @@ -2,7 +2,7 @@ #ifndef _LINUX_KASAN_CHECKS_H #define _LINUX_KASAN_CHECKS_H -#ifdef CONFIG_KASAN +#if defined(__SANITIZE_ADDRESS__) || defined(__KASAN_INTERNAL) void kasan_check_read(const volatile void *p, unsigned int size); void kasan_check_write(const volatile void *p, unsigned int size); #else diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 09b534fbba17..80bbe62b16cd 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -14,6 +14,8 @@ * */ +#define __KASAN_INTERNAL + #include #include #include From 5c0198b6fb73867dea296a69a944a4fdbceff3d8 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Tue, 5 Mar 2019 15:41:31 -0800 Subject: [PATCH 005/159] kasan: fix coccinelle warnings in kasan_p*_table kasan_p4d_table(), kasan_pmd_table() and kasan_pud_table() are declared as returning bool, but return 0 instead of false, which produces a coccinelle warning. Fix it. Link: http://lkml.kernel.org/r/1fa6fadf644859e8a6a8ecce258444b49be8c7ee.1551716733.git.andreyknvl@google.com Fixes: 0207df4fa1a8 ("kernel/memremap, kasan: make ZONE_DEVICE with work with KASAN") Signed-off-by: Andrey Konovalov Reported-by: kbuild test robot Acked-by: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/init.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/kasan/init.c b/mm/kasan/init.c index 45a1b5e38e1e..fcaa1ca03175 100644 --- a/mm/kasan/init.c +++ b/mm/kasan/init.c @@ -42,7 +42,7 @@ static inline bool kasan_p4d_table(pgd_t pgd) #else static inline bool kasan_p4d_table(pgd_t pgd) { - return 0; + return false; } #endif #if CONFIG_PGTABLE_LEVELS > 3 @@ -54,7 +54,7 @@ static inline bool kasan_pud_table(p4d_t p4d) #else static inline bool kasan_pud_table(p4d_t p4d) { - return 0; + return false; } #endif #if CONFIG_PGTABLE_LEVELS > 2 @@ -66,7 +66,7 @@ static inline bool kasan_pmd_table(pud_t pud) #else static inline bool kasan_pmd_table(pud_t pud) { - return 0; + return false; } #endif pte_t kasan_early_shadow_pte[PTRS_PER_PTE] __page_aligned_bss; From 1d6693fb9d157f64a71be8741c83305782cd98bc Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Tue, 5 Mar 2019 15:41:34 -0800 Subject: [PATCH 006/159] scripts/decode_stacktrace.sh: handle RIP address with segment decode line: RIP: 0010:khugepaged+0x2a2/0x2280 into RIP: 0010:khugepaged (mm/khugepaged.c:1885) Link: http://lkml.kernel.org/r/154660071227.52726.15645307951282727605.stgit@buzz Signed-off-by: Konstantin Khlebnikov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/decode_stacktrace.sh | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/scripts/decode_stacktrace.sh b/scripts/decode_stacktrace.sh index 98a7d63a723e..bcdd45df3f51 100755 --- a/scripts/decode_stacktrace.sh +++ b/scripts/decode_stacktrace.sh @@ -37,6 +37,13 @@ parse_symbol() { symbol=${symbol#\(} symbol=${symbol%\)} + # Strip segment + local segment + if [[ $symbol == *:* ]] ; then + segment=${symbol%%:*}: + symbol=${symbol#*:} + fi + # Strip the symbol name so that we could look it up local name=${symbol%+*} @@ -84,7 +91,7 @@ parse_symbol() { code=${code//$'\n'/' '} # Replace old address with pretty line numbers - symbol="$name ($code)" + symbol="$segment$name ($code)" } decode_code() { From 685536921fa7307ac91ebcd987a1e2400d3b6378 Mon Sep 17 00:00:00 2001 From: Firoz Khan Date: Tue, 5 Mar 2019 15:41:37 -0800 Subject: [PATCH 007/159] sh: remove nargs from __SYSCALL The __SYSCALL macro's arguments are system call number, system call entry name and number of arguments for the system call. Argument- nargs in __SYSCALL(nr, entry, nargs) is neither calculated nor used anywhere. So it would be better to keep the implementation as __SYSCALL(nr, entry). This unifies the implementation with some other architectures too. Link: http://lkml.kernel.org/r/1546443445-21075-2-git-send-email-firoz.khan@linaro.org Signed-off-by: Firoz Khan Cc: Yoshinori Sato Cc: Rich Felker Cc: Simon Horman Cc: Kuninori Morimoto Cc: Greg Kroah-Hartman Cc: Philippe Ombredanne Cc: Thomas Gleixner Cc: Kate Stewart Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sh/kernel/syscalls/syscalltbl.sh | 4 ++-- arch/sh/kernel/syscalls_32.S | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/sh/kernel/syscalls/syscalltbl.sh b/arch/sh/kernel/syscalls/syscalltbl.sh index 85d78d9309ad..904b8e6e625d 100644 --- a/arch/sh/kernel/syscalls/syscalltbl.sh +++ b/arch/sh/kernel/syscalls/syscalltbl.sh @@ -13,10 +13,10 @@ emit() { t_entry="$3" while [ $t_nxt -lt $t_nr ]; do - printf "__SYSCALL(%s, sys_ni_syscall, )\n" "${t_nxt}" + printf "__SYSCALL(%s,sys_ni_syscall)\n" "${t_nxt}" t_nxt=$((t_nxt+1)) done - printf "__SYSCALL(%s, %s, )\n" "${t_nxt}" "${t_entry}" + printf "__SYSCALL(%s,%s)\n" "${t_nxt}" "${t_entry}" } grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | ( diff --git a/arch/sh/kernel/syscalls_32.S b/arch/sh/kernel/syscalls_32.S index 96e9c54a07f5..bd1a9c544767 100644 --- a/arch/sh/kernel/syscalls_32.S +++ b/arch/sh/kernel/syscalls_32.S @@ -10,7 +10,7 @@ #include #include -#define __SYSCALL(nr, entry, nargs) .long entry +#define __SYSCALL(nr, entry) .long entry .data ENTRY(sys_call_table) #include From cc725ef3cb202ef2019a3c67c8913efa05c3cce6 Mon Sep 17 00:00:00 2001 From: Jia Guo Date: Tue, 5 Mar 2019 15:41:41 -0800 Subject: [PATCH 008/159] ocfs2: fix a panic problem caused by o2cb_ctl In the process of creating a node, it will cause NULL pointer dereference in kernel if o2cb_ctl failed in the interval (mkdir, o2cb_set_node_attribute(node_num)] in function o2cb_add_node. The node num is initialized to 0 in function o2nm_node_group_make_item, o2nm_node_group_drop_item will mistake the node number 0 for a valid node number when we delete the node before the node number is set correctly. If the local node number of the current host happens to be 0, cluster->cl_local_node will be set to O2NM_INVALID_NODE_NUM while o2hb_thread still running. The panic stack is generated as follows: o2hb_thread \-o2hb_do_disk_heartbeat \-o2hb_check_own_slot |-slot = ®->hr_slots[o2nm_this_node()]; //o2nm_this_node() return O2NM_INVALID_NODE_NUM We need to check whether the node number is set when we delete the node. Link: http://lkml.kernel.org/r/133d8045-72cc-863e-8eae-5013f9f6bc51@huawei.com Signed-off-by: Jia Guo Reviewed-by: Joseph Qi Acked-by: Jun Piao Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/cluster/nodemanager.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c index 0e4166cc23a0..4ac775e32240 100644 --- a/fs/ocfs2/cluster/nodemanager.c +++ b/fs/ocfs2/cluster/nodemanager.c @@ -621,13 +621,15 @@ static void o2nm_node_group_drop_item(struct config_group *group, struct o2nm_node *node = to_o2nm_node(item); struct o2nm_cluster *cluster = to_o2nm_cluster(group->cg_item.ci_parent); - o2net_disconnect_node(node); + if (cluster->cl_nodes[node->nd_num] == node) { + o2net_disconnect_node(node); - if (cluster->cl_has_local && - (cluster->cl_local_node == node->nd_num)) { - cluster->cl_has_local = 0; - cluster->cl_local_node = O2NM_INVALID_NODE_NUM; - o2net_stop_listening(node); + if (cluster->cl_has_local && + (cluster->cl_local_node == node->nd_num)) { + cluster->cl_has_local = 0; + cluster->cl_local_node = O2NM_INVALID_NODE_NUM; + o2net_stop_listening(node); + } } /* XXX call into net to stop this node from trading messages */ From 5500ab4ed3b8f0749ec584d8c5e2738bc01ea52e Mon Sep 17 00:00:00 2001 From: Gang He Date: Tue, 5 Mar 2019 15:41:45 -0800 Subject: [PATCH 009/159] ocfs2: fix the application IO timeout when fstrim is running The user reported this problem, the upper application IO was timeout when fstrim was running on this ocfs2 partition. the application monitoring resource agent considered that this application did not work, then this node was fenced by the cluster brain (e.g. pacemaker). The root cause is that fstrim thread always holds main_bm meta-file related locks until all the cluster groups are trimmed. This patch will make fstrim thread release main_bm meta-file related locks when each cluster group is trimmed, this will let the current application IO has a chance to claim the clusters from main_bm meta-file. Link: http://lkml.kernel.org/r/20190111090014.31645-1-ghe@suse.com Signed-off-by: Gang He Reviewed-by: Changwei Ge Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/alloc.c | 159 +++++++++++++++++++++++++---------------- fs/ocfs2/dlmglue.c | 5 ++ fs/ocfs2/ocfs2.h | 1 + fs/ocfs2/ocfs2_trace.h | 2 + fs/ocfs2/super.c | 2 + 5 files changed, 106 insertions(+), 63 deletions(-) diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index d1cbb27808e2..6f0999015a44 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -7532,10 +7532,11 @@ static int ocfs2_trim_group(struct super_block *sb, return count; } -int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range) +static +int ocfs2_trim_mainbm(struct super_block *sb, struct fstrim_range *range) { struct ocfs2_super *osb = OCFS2_SB(sb); - u64 start, len, trimmed, first_group, last_group, group; + u64 start, len, trimmed = 0, first_group, last_group = 0, group = 0; int ret, cnt; u32 first_bit, last_bit, minlen; struct buffer_head *main_bm_bh = NULL; @@ -7543,7 +7544,6 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range) struct buffer_head *gd_bh = NULL; struct ocfs2_dinode *main_bm; struct ocfs2_group_desc *gd = NULL; - struct ocfs2_trim_fs_info info, *pinfo = NULL; start = range->start >> osb->s_clustersize_bits; len = range->len >> osb->s_clustersize_bits; @@ -7552,6 +7552,9 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range) if (minlen >= osb->bitmap_cpg || range->len < sb->s_blocksize) return -EINVAL; + trace_ocfs2_trim_mainbm(start, len, minlen); + +next_group: main_bm_inode = ocfs2_get_system_file_inode(osb, GLOBAL_BITMAP_SYSTEM_INODE, OCFS2_INVALID_SLOT); @@ -7570,64 +7573,34 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range) } main_bm = (struct ocfs2_dinode *)main_bm_bh->b_data; - if (start >= le32_to_cpu(main_bm->i_clusters)) { - ret = -EINVAL; - goto out_unlock; - } - - len = range->len >> osb->s_clustersize_bits; - if (start + len > le32_to_cpu(main_bm->i_clusters)) - len = le32_to_cpu(main_bm->i_clusters) - start; - - trace_ocfs2_trim_fs(start, len, minlen); - - ocfs2_trim_fs_lock_res_init(osb); - ret = ocfs2_trim_fs_lock(osb, NULL, 1); - if (ret < 0) { - if (ret != -EAGAIN) { - mlog_errno(ret); - ocfs2_trim_fs_lock_res_uninit(osb); + /* + * Do some check before trim the first group. + */ + if (!group) { + if (start >= le32_to_cpu(main_bm->i_clusters)) { + ret = -EINVAL; goto out_unlock; } - mlog(ML_NOTICE, "Wait for trim on device (%s) to " - "finish, which is running from another node.\n", - osb->dev_str); - ret = ocfs2_trim_fs_lock(osb, &info, 0); - if (ret < 0) { - mlog_errno(ret); - ocfs2_trim_fs_lock_res_uninit(osb); - goto out_unlock; - } + if (start + len > le32_to_cpu(main_bm->i_clusters)) + len = le32_to_cpu(main_bm->i_clusters) - start; - if (info.tf_valid && info.tf_success && - info.tf_start == start && info.tf_len == len && - info.tf_minlen == minlen) { - /* Avoid sending duplicated trim to a shared device */ - mlog(ML_NOTICE, "The same trim on device (%s) was " - "just done from node (%u), return.\n", - osb->dev_str, info.tf_nodenum); - range->len = info.tf_trimlen; - goto out_trimunlock; - } + /* + * Determine first and last group to examine based on + * start and len + */ + first_group = ocfs2_which_cluster_group(main_bm_inode, start); + if (first_group == osb->first_cluster_group_blkno) + first_bit = start; + else + first_bit = start - ocfs2_blocks_to_clusters(sb, + first_group); + last_group = ocfs2_which_cluster_group(main_bm_inode, + start + len - 1); + group = first_group; } - info.tf_nodenum = osb->node_num; - info.tf_start = start; - info.tf_len = len; - info.tf_minlen = minlen; - - /* Determine first and last group to examine based on start and len */ - first_group = ocfs2_which_cluster_group(main_bm_inode, start); - if (first_group == osb->first_cluster_group_blkno) - first_bit = start; - else - first_bit = start - ocfs2_blocks_to_clusters(sb, first_group); - last_group = ocfs2_which_cluster_group(main_bm_inode, start + len - 1); - last_bit = osb->bitmap_cpg; - - trimmed = 0; - for (group = first_group; group <= last_group;) { + do { if (first_bit + len >= osb->bitmap_cpg) last_bit = osb->bitmap_cpg; else @@ -7659,21 +7632,81 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range) group = ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg); else group += ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg); - } - range->len = trimmed * sb->s_blocksize; + } while (0); - info.tf_trimlen = range->len; - info.tf_success = (ret ? 0 : 1); - pinfo = &info; -out_trimunlock: - ocfs2_trim_fs_unlock(osb, pinfo); - ocfs2_trim_fs_lock_res_uninit(osb); out_unlock: ocfs2_inode_unlock(main_bm_inode, 0); brelse(main_bm_bh); + main_bm_bh = NULL; out_mutex: inode_unlock(main_bm_inode); iput(main_bm_inode); + + /* + * If all the groups trim are not done or failed, but we should release + * main_bm related locks for avoiding the current IO starve, then go to + * trim the next group + */ + if (ret >= 0 && group <= last_group) + goto next_group; out: + range->len = trimmed * sb->s_blocksize; + return ret; +} + +int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range) +{ + int ret; + struct ocfs2_super *osb = OCFS2_SB(sb); + struct ocfs2_trim_fs_info info, *pinfo = NULL; + + ocfs2_trim_fs_lock_res_init(osb); + + trace_ocfs2_trim_fs(range->start, range->len, range->minlen); + + ret = ocfs2_trim_fs_lock(osb, NULL, 1); + if (ret < 0) { + if (ret != -EAGAIN) { + mlog_errno(ret); + ocfs2_trim_fs_lock_res_uninit(osb); + return ret; + } + + mlog(ML_NOTICE, "Wait for trim on device (%s) to " + "finish, which is running from another node.\n", + osb->dev_str); + ret = ocfs2_trim_fs_lock(osb, &info, 0); + if (ret < 0) { + mlog_errno(ret); + ocfs2_trim_fs_lock_res_uninit(osb); + return ret; + } + + if (info.tf_valid && info.tf_success && + info.tf_start == range->start && + info.tf_len == range->len && + info.tf_minlen == range->minlen) { + /* Avoid sending duplicated trim to a shared device */ + mlog(ML_NOTICE, "The same trim on device (%s) was " + "just done from node (%u), return.\n", + osb->dev_str, info.tf_nodenum); + range->len = info.tf_trimlen; + goto out; + } + } + + info.tf_nodenum = osb->node_num; + info.tf_start = range->start; + info.tf_len = range->len; + info.tf_minlen = range->minlen; + + ret = ocfs2_trim_mainbm(sb, range); + + info.tf_trimlen = range->len; + info.tf_success = (ret < 0 ? 0 : 1); + pinfo = &info; +out: + ocfs2_trim_fs_unlock(osb, pinfo); + ocfs2_trim_fs_lock_res_uninit(osb); return ret; } diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 7c835824247e..af405586c5b1 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -686,6 +686,9 @@ void ocfs2_trim_fs_lock_res_init(struct ocfs2_super *osb) { struct ocfs2_lock_res *lockres = &osb->osb_trim_fs_lockres; + /* Only one trimfs thread are allowed to work at the same time. */ + mutex_lock(&osb->obs_trim_fs_mutex); + ocfs2_lock_res_init_once(lockres); ocfs2_build_lock_name(OCFS2_LOCK_TYPE_TRIM_FS, 0, 0, lockres->l_name); ocfs2_lock_res_init_common(osb, lockres, OCFS2_LOCK_TYPE_TRIM_FS, @@ -698,6 +701,8 @@ void ocfs2_trim_fs_lock_res_uninit(struct ocfs2_super *osb) ocfs2_simple_drop_lockres(osb, lockres); ocfs2_lock_res_free(lockres); + + mutex_unlock(&osb->obs_trim_fs_mutex); } static void ocfs2_orphan_scan_lock_res_init(struct ocfs2_lock_res *res, diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 4f86ac0027b5..1f029fbe8b8d 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -407,6 +407,7 @@ struct ocfs2_super struct ocfs2_lock_res osb_rename_lockres; struct ocfs2_lock_res osb_nfs_sync_lockres; struct ocfs2_lock_res osb_trim_fs_lockres; + struct mutex obs_trim_fs_mutex; struct ocfs2_dlm_debug *osb_dlm_debug; struct dentry *osb_debug_root; diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h index 2ee76a90ba8f..dc4bce1649c1 100644 --- a/fs/ocfs2/ocfs2_trace.h +++ b/fs/ocfs2/ocfs2_trace.h @@ -712,6 +712,8 @@ TRACE_EVENT(ocfs2_trim_extent, DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_trim_group); +DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_trim_mainbm); + DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_trim_fs); /* End of trace events for fs/ocfs2/alloc.c. */ diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 3415e0b09398..96ae7cedd487 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1847,6 +1847,8 @@ static int ocfs2_mount_volume(struct super_block *sb) if (ocfs2_is_hard_readonly(osb)) goto leave; + mutex_init(&osb->obs_trim_fs_mutex); + status = ocfs2_dlm_init(osb); if (status < 0) { mlog_errno(status); From f402cf03fc4c5576df379e1e252a6afc17658414 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Tue, 5 Mar 2019 15:41:48 -0800 Subject: [PATCH 010/159] ocfs2: Use zero-sized array and struct_size() in kzalloc() Update the code to use a zero-sized array instead of a pointer in structure ocfs2_slot_info and use struct_size() in kzalloc(). Notice that one of the more common cases of allocation size calculations is finding the size of a structure that has a zero-sized array at the end, along with memory for some number of elements for that array. For example: struct foo { int stuff; void *entry[]; }; instance = kzalloc(sizeof(struct foo) + sizeof(void *) * count, GFP_KERNEL); Instead of leaving these open-coded and prone to type mistakes, we can now use the new struct_size() helper: instance = kzalloc(struct_size(instance, entry, count), GFP_KERNEL); This code was detected with the help of Coccinelle. Link: http://lkml.kernel.org/r/20190108191903.GA22056@embeddedor Signed-off-by: Gustavo A. R. Silva Reviewed-by: Andrew Morton Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/slot_map.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c index d7407994f308..ea0756d83250 100644 --- a/fs/ocfs2/slot_map.c +++ b/fs/ocfs2/slot_map.c @@ -55,7 +55,7 @@ struct ocfs2_slot_info { unsigned int si_blocks; struct buffer_head **si_bh; unsigned int si_num_slots; - struct ocfs2_slot *si_slots; + struct ocfs2_slot si_slots[]; }; @@ -420,9 +420,7 @@ int ocfs2_init_slot_info(struct ocfs2_super *osb) struct inode *inode = NULL; struct ocfs2_slot_info *si; - si = kzalloc(sizeof(struct ocfs2_slot_info) + - (sizeof(struct ocfs2_slot) * osb->max_slots), - GFP_KERNEL); + si = kzalloc(struct_size(si, si_slots, osb->max_slots), GFP_KERNEL); if (!si) { status = -ENOMEM; mlog_errno(status); @@ -431,8 +429,6 @@ int ocfs2_init_slot_info(struct ocfs2_super *osb) si->si_extended = ocfs2_uses_extended_slot_map(osb); si->si_num_slots = osb->max_slots; - si->si_slots = (struct ocfs2_slot *)((char *)si + - sizeof(struct ocfs2_slot_info)); inode = ocfs2_get_system_file_inode(osb, SLOT_MAP_SYSTEM_INODE, OCFS2_INVALID_SLOT); From a905737fdd767c75688e1e6de65967923007ec1d Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Tue, 5 Mar 2019 15:41:52 -0800 Subject: [PATCH 011/159] fs/inode.c: inode_set_flags(): replace opencoded set_mask_bits() It seems that commits 5f16f3225b0624 and 00a1a053ebe5, both with same commitlog ("ext4: atomically set inode->i_flags in ext4_set_inode_flags()") introduced the set_mask_bits API, but somehow missed not using it in ext4 in the end. Also, set_mask_bits() is used in fs quite a bit and we can possibly come up with a generic llsc based implementation (w/o the cmpxchg loop) Link: http://lkml.kernel.org/r/1548275584-18096-3-git-send-email-vgupta@synopsys.com Signed-off-by: Vineet Gupta Reviewed-by: Anthony Yznaga Cc: Alexander Viro Cc: Theodore Ts'o Cc: Peter Zijlstra (Intel) Cc: Chris Wilson Cc: Ingo Molnar Cc: Jani Nikula Cc: Miklos Szeredi Cc: Oleg Nesterov Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/inode.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/fs/inode.c b/fs/inode.c index 73432e64f874..e9d97add2b36 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -2093,14 +2093,8 @@ EXPORT_SYMBOL(inode_dio_wait); void inode_set_flags(struct inode *inode, unsigned int flags, unsigned int mask) { - unsigned int old_flags, new_flags; - WARN_ON_ONCE(flags & ~mask); - do { - old_flags = READ_ONCE(inode->i_flags); - new_flags = (old_flags & ~mask) | flags; - } while (unlikely(cmpxchg(&inode->i_flags, old_flags, - new_flags) != old_flags)); + set_mask_bits(&inode->i_flags, mask, flags); } EXPORT_SYMBOL(inode_set_flags); From 5704a06810682683355624923547b41540e2801a Mon Sep 17 00:00:00 2001 From: Shuriyc Chu Date: Tue, 5 Mar 2019 15:41:56 -0800 Subject: [PATCH 012/159] fs/file.c: initialize init_files.resize_wait (Taken from https://bugzilla.kernel.org/show_bug.cgi?id=200647) 'get_unused_fd_flags' in kthread cause kernel crash. It works fine on 4.1, but causes crash after get 64 fds. It also cause crash on ubuntu1404/1604/1804, centos7.5, and the crash messages are almost the same. The crash message on centos7.5 shows below: start fd 61 start fd 62 start fd 63 BUG: unable to handle kernel NULL pointer dereference at (null) IP: __wake_up_common+0x2e/0x90 PGD 0 Oops: 0000 [#1] SMP Modules linked in: test(OE) xt_CHECKSUM iptable_mangle ipt_MASQUERADE nf_nat_masquerade_ipv4 iptable_nat nf_nat_ipv4 nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_conntrack nf_conntrack ipt_REJECT nf_reject_ipv4 tun bridge stp llc ebtable_filter ebtables ip6table_filter ip6_tables iptable_filter devlink sunrpc kvm_intel kvm irqbypass crc32_pclmul ghash_clmulni_intel aesni_intel lrw gf128mul glue_helper ablk_helper cryptd sg ppdev pcspkr virtio_balloon parport_pc parport i2c_piix4 joydev ip_tables xfs libcrc32c sr_mod cdrom sd_mod crc_t10dif crct10dif_generic ata_generic pata_acpi virtio_scsi virtio_console virtio_net cirrus drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops ttm crct10dif_pclmul crct10dif_common crc32c_intel drm ata_piix serio_raw libata virtio_pci virtio_ring i2c_core virtio floppy dm_mirror dm_region_hash dm_log dm_mod CPU: 2 PID: 1820 Comm: test_fd Kdump: loaded Tainted: G OE ------------ 3.10.0-862.3.3.el7.x86_64 #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.10.2-0-g5f4c7b1-prebuilt.qemu-project.org 04/01/2014 task: ffff8e92b9431fa0 ti: ffff8e94247a0000 task.ti: ffff8e94247a0000 RIP: 0010:__wake_up_common+0x2e/0x90 RSP: 0018:ffff8e94247a2d18 EFLAGS: 00010086 RAX: 0000000000000000 RBX: ffffffff9d09daa0 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000000000003 RDI: ffffffff9d09daa0 RBP: ffff8e94247a2d50 R08: 0000000000000000 R09: ffff8e92b95dfda8 R10: 0000000000000000 R11: 0000000000000000 R12: ffffffff9d09daa8 R13: 0000000000000003 R14: 0000000000000000 R15: 0000000000000003 FS: 0000000000000000(0000) GS:ffff8e9434e80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 000000017c686000 CR4: 00000000000207e0 Call Trace: __wake_up+0x39/0x50 expand_files+0x131/0x250 __alloc_fd+0x47/0x170 get_unused_fd_flags+0x30/0x40 test_fd+0x12a/0x1c0 [test] kthread+0xd1/0xe0 ret_from_fork_nospec_begin+0x21/0x21 Code: 66 90 55 48 89 e5 41 57 41 89 f7 41 56 41 89 ce 41 55 41 54 49 89 fc 49 83 c4 08 53 48 83 ec 10 48 8b 47 08 89 55 cc 4c 89 45 d0 <48> 8b 08 49 39 c4 48 8d 78 e8 4c 8d 69 e8 75 08 eb 3b 4c 89 ef RIP __wake_up_common+0x2e/0x90 RSP CR2: 0000000000000000 This issue exists since CentOS 7.5 3.10.0-862 and CentOS 7.4 (3.10.0-693.21.1 ) is ok. Root cause: the item 'resize_wait' is not initialized before being used. Reported-by: Richard Zhang Reviewed-by: Andrew Morton Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/file.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/file.c b/fs/file.c index 3209ee271c41..a10487aa0a84 100644 --- a/fs/file.c +++ b/fs/file.c @@ -457,6 +457,7 @@ struct files_struct init_files = { .full_fds_bits = init_files.full_fds_bits_init, }, .file_lock = __SPIN_LOCK_UNLOCKED(init_files.file_lock), + .resize_wait = __WAIT_QUEUE_HEAD_INITIALIZER(init_files.resize_wait), }; static unsigned int find_next_fd(struct fdtable *fdt, unsigned int start) From edde82b6df70cf10486d7f1e1611151218e6c316 Mon Sep 17 00:00:00 2001 From: Peng Wang Date: Tue, 5 Mar 2019 15:42:00 -0800 Subject: [PATCH 013/159] mm/slub.c: freelist is ensured to be NULL when new_slab() fails new_slab_objects() will return immediately if freelist is not NULL. if (freelist) return freelist; One more assignment operation could be avoided. Link: http://lkml.kernel.org/r/20181229062512.30469-1-rocking@whu.edu.cn Signed-off-by: Peng Wang Reviewed-by: Pekka Enberg Reviewed-by: Andrew Morton Acked-by: David Rientjes Cc: Christoph Lameter Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index dc777761b6b7..a561f909446d 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2482,8 +2482,7 @@ static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags, stat(s, ALLOC_SLAB); c->page = page; *pc = c; - } else - freelist = NULL; + } return freelist; } From 92d1d07daad65c300c7d0b68bbef8867e9895d54 Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Tue, 5 Mar 2019 15:42:03 -0800 Subject: [PATCH 014/159] mm/slab.c: kmemleak no scan alien caches Kmemleak throws endless warnings during boot due to in __alloc_alien_cache(), alc = kmalloc_node(memsize, gfp, node); init_arraycache(&alc->ac, entries, batch); kmemleak_no_scan(ac); Kmemleak does not track the array cache (alc->ac) but the alien cache (alc) instead, so let it track the latter by lifting kmemleak_no_scan() out of init_arraycache(). There is another place that calls init_arraycache(), but alloc_kmem_cache_cpus() uses the percpu allocation where will never be considered as a leak. kmemleak: Found object by alias at 0xffff8007b9aa7e38 CPU: 190 PID: 1 Comm: swapper/0 Not tainted 5.0.0-rc2+ #2 Call trace: dump_backtrace+0x0/0x168 show_stack+0x24/0x30 dump_stack+0x88/0xb0 lookup_object+0x84/0xac find_and_get_object+0x84/0xe4 kmemleak_no_scan+0x74/0xf4 setup_kmem_cache_node+0x2b4/0x35c __do_tune_cpucache+0x250/0x2d4 do_tune_cpucache+0x4c/0xe4 enable_cpucache+0xc8/0x110 setup_cpu_cache+0x40/0x1b8 __kmem_cache_create+0x240/0x358 create_cache+0xc0/0x198 kmem_cache_create_usercopy+0x158/0x20c kmem_cache_create+0x50/0x64 fsnotify_init+0x58/0x6c do_one_initcall+0x194/0x388 kernel_init_freeable+0x668/0x688 kernel_init+0x18/0x124 ret_from_fork+0x10/0x18 kmemleak: Object 0xffff8007b9aa7e00 (size 256): kmemleak: comm "swapper/0", pid 1, jiffies 4294697137 kmemleak: min_count = 1 kmemleak: count = 0 kmemleak: flags = 0x1 kmemleak: checksum = 0 kmemleak: backtrace: kmemleak_alloc+0x84/0xb8 kmem_cache_alloc_node_trace+0x31c/0x3a0 __kmalloc_node+0x58/0x78 setup_kmem_cache_node+0x26c/0x35c __do_tune_cpucache+0x250/0x2d4 do_tune_cpucache+0x4c/0xe4 enable_cpucache+0xc8/0x110 setup_cpu_cache+0x40/0x1b8 __kmem_cache_create+0x240/0x358 create_cache+0xc0/0x198 kmem_cache_create_usercopy+0x158/0x20c kmem_cache_create+0x50/0x64 fsnotify_init+0x58/0x6c do_one_initcall+0x194/0x388 kernel_init_freeable+0x668/0x688 kernel_init+0x18/0x124 kmemleak: Not scanning unknown object at 0xffff8007b9aa7e38 CPU: 190 PID: 1 Comm: swapper/0 Not tainted 5.0.0-rc2+ #2 Call trace: dump_backtrace+0x0/0x168 show_stack+0x24/0x30 dump_stack+0x88/0xb0 kmemleak_no_scan+0x90/0xf4 setup_kmem_cache_node+0x2b4/0x35c __do_tune_cpucache+0x250/0x2d4 do_tune_cpucache+0x4c/0xe4 enable_cpucache+0xc8/0x110 setup_cpu_cache+0x40/0x1b8 __kmem_cache_create+0x240/0x358 create_cache+0xc0/0x198 kmem_cache_create_usercopy+0x158/0x20c kmem_cache_create+0x50/0x64 fsnotify_init+0x58/0x6c do_one_initcall+0x194/0x388 kernel_init_freeable+0x668/0x688 kernel_init+0x18/0x124 ret_from_fork+0x10/0x18 Link: http://lkml.kernel.org/r/20190129184518.39808-1-cai@lca.pw Fixes: 1fe00d50a9e8 ("slab: factor out initialization of array cache") Signed-off-by: Qian Cai Reviewed-by: Andrew Morton Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Catalin Marinas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/mm/slab.c b/mm/slab.c index 91c1863df93d..757e646baa5d 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -550,14 +550,6 @@ static void start_cpu_timer(int cpu) static void init_arraycache(struct array_cache *ac, int limit, int batch) { - /* - * The array_cache structures contain pointers to free object. - * However, when such objects are allocated or transferred to another - * cache the pointers are not cleared and they could be counted as - * valid references during a kmemleak scan. Therefore, kmemleak must - * not scan such objects. - */ - kmemleak_no_scan(ac); if (ac) { ac->avail = 0; ac->limit = limit; @@ -573,6 +565,14 @@ static struct array_cache *alloc_arraycache(int node, int entries, struct array_cache *ac = NULL; ac = kmalloc_node(memsize, gfp, node); + /* + * The array_cache structures contain pointers to free object. + * However, when such objects are allocated or transferred to another + * cache the pointers are not cleared and they could be counted as + * valid references during a kmemleak scan. Therefore, kmemleak must + * not scan such objects. + */ + kmemleak_no_scan(ac); init_arraycache(ac, entries, batchcount); return ac; } @@ -667,6 +667,7 @@ static struct alien_cache *__alloc_alien_cache(int node, int entries, alc = kmalloc_node(memsize, gfp, node); if (alc) { + kmemleak_no_scan(alc); init_arraycache(&alc->ac, entries, batch); spin_lock_init(&alc->lock); } From de810f490db7ed4c1db2bbfa458b2e27681d2ccb Mon Sep 17 00:00:00 2001 From: "Tobin C. Harding" Date: Tue, 5 Mar 2019 15:42:07 -0800 Subject: [PATCH 015/159] include/linux/slub_def.h: comment fixes Capitialize comment string, use C89 comment style, correct grammar/punctuation in comments. Link: http://lkml.kernel.org/r/20190204005713.9463-2-tobin@kernel.org Link: http://lkml.kernel.org/r/20190204005713.9463-3-tobin@kernel.org Link: http://lkml.kernel.org/r/20190204005713.9463-4-tobin@kernel.org Signed-off-by: Tobin C. Harding Reviewed-by: Andrew Morton Reviewed-by: William Kucharski Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slub_def.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 3a1a1dbc6f49..d2153789bd9f 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -81,12 +81,12 @@ struct kmem_cache_order_objects { */ struct kmem_cache { struct kmem_cache_cpu __percpu *cpu_slab; - /* Used for retriving partial slabs etc */ + /* Used for retrieving partial slabs, etc. */ slab_flags_t flags; unsigned long min_partial; - unsigned int size; /* The size of an object including meta data */ - unsigned int object_size;/* The size of an object without meta data */ - unsigned int offset; /* Free pointer offset. */ + unsigned int size; /* The size of an object including metadata */ + unsigned int object_size;/* The size of an object without metadata */ + unsigned int offset; /* Free pointer offset */ #ifdef CONFIG_SLUB_CPU_PARTIAL /* Number of per cpu partial objects to keep around */ unsigned int cpu_partial; @@ -110,7 +110,7 @@ struct kmem_cache { #endif #ifdef CONFIG_MEMCG struct memcg_cache_params memcg_params; - /* for propagation, maximum size of a stored attr */ + /* For propagation, maximum size of a stored attr */ unsigned int max_attr_size; #ifdef CONFIG_SYSFS struct kset *memcg_kset; @@ -151,7 +151,7 @@ struct kmem_cache { #else #define slub_cpu_partial(s) (0) #define slub_set_cpu_partial(s, n) -#endif // CONFIG_SLUB_CPU_PARTIAL +#endif /* CONFIG_SLUB_CPU_PARTIAL */ #ifdef CONFIG_SYSFS #define SLAB_SUPPORTS_SYSFS From 278d7756dff0b4c8089c46abad20a79bcfa66b5b Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Tue, 5 Mar 2019 15:42:10 -0800 Subject: [PATCH 016/159] mm/slub.c: remove an unused addr argument "addr" function argument is not used in alloc_consistency_checks() at all, so remove it. Link: http://lkml.kernel.org/r/20190211123214.35592-1-cai@lca.pw Fixes: becfda68abca ("slub: convert SLAB_DEBUG_FREE to SLAB_CONSISTENCY_CHECKS") Signed-off-by: Qian Cai Reviewed-by: Andrew Morton Acked-by: David Rientjes Cc: Christoph Lameter Cc: Pekka Enberg Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index a561f909446d..037a8ca7f7ca 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1093,8 +1093,7 @@ static void setup_page_debug(struct kmem_cache *s, void *addr, int order) } static inline int alloc_consistency_checks(struct kmem_cache *s, - struct page *page, - void *object, unsigned long addr) + struct page *page, void *object) { if (!check_slab(s, page)) return 0; @@ -1115,7 +1114,7 @@ static noinline int alloc_debug_processing(struct kmem_cache *s, void *object, unsigned long addr) { if (s->flags & SLAB_CONSISTENCY_CHECKS) { - if (!alloc_consistency_checks(s, page, object, addr)) + if (!alloc_consistency_checks(s, page, object)) goto bad; } From a9cd410a3d296846a8125aa43d97a573a354c472 Mon Sep 17 00:00:00 2001 From: Arun KS Date: Tue, 5 Mar 2019 15:42:14 -0800 Subject: [PATCH 017/159] mm/page_alloc.c: memory hotplug: free pages as higher order When freeing pages are done with higher order, time spent on coalescing pages by buddy allocator can be reduced. With section size of 256MB, hot add latency of a single section shows improvement from 50-60 ms to less than 1 ms, hence improving the hot add latency by 60 times. Modify external providers of online callback to align with the change. [arunks@codeaurora.org: v11] Link: http://lkml.kernel.org/r/1547792588-18032-1-git-send-email-arunks@codeaurora.org [akpm@linux-foundation.org: remove unused local, per Arun] [akpm@linux-foundation.org: avoid return of void-returning __free_pages_core(), per Oscar] [akpm@linux-foundation.org: fix it for mm-convert-totalram_pages-and-totalhigh_pages-variables-to-atomic.patch] [arunks@codeaurora.org: v8] Link: http://lkml.kernel.org/r/1547032395-24582-1-git-send-email-arunks@codeaurora.org [arunks@codeaurora.org: v9] Link: http://lkml.kernel.org/r/1547098543-26452-1-git-send-email-arunks@codeaurora.org Link: http://lkml.kernel.org/r/1538727006-5727-1-git-send-email-arunks@codeaurora.org Signed-off-by: Arun KS Reviewed-by: Andrew Morton Acked-by: Michal Hocko Reviewed-by: Oscar Salvador Reviewed-by: Alexander Duyck Cc: K. Y. Srinivasan Cc: Haiyang Zhang Cc: Stephen Hemminger Cc: Boris Ostrovsky Cc: Juergen Gross Cc: Dan Williams Cc: Vlastimil Babka Cc: Joonsoo Kim Cc: Greg Kroah-Hartman Cc: Mathieu Malaterre Cc: "Kirill A. Shutemov" Cc: Souptick Joarder Cc: Mel Gorman Cc: Aaron Lu Cc: Srivatsa Vaddagiri Cc: Vinayak Menon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/hv/hv_balloon.c | 7 ++++--- drivers/xen/balloon.c | 15 +++++++++----- include/linux/memory_hotplug.h | 2 +- mm/internal.h | 1 + mm/memory_hotplug.c | 37 +++++++++++++++++++++++----------- mm/page_alloc.c | 8 ++++---- 6 files changed, 45 insertions(+), 25 deletions(-) diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c index 7c6349a50ef1..a50b7624b2a3 100644 --- a/drivers/hv/hv_balloon.c +++ b/drivers/hv/hv_balloon.c @@ -771,7 +771,7 @@ static void hv_mem_hot_add(unsigned long start, unsigned long size, } } -static void hv_online_page(struct page *pg) +static void hv_online_page(struct page *pg, unsigned int order) { struct hv_hotadd_state *has; unsigned long flags; @@ -780,10 +780,11 @@ static void hv_online_page(struct page *pg) spin_lock_irqsave(&dm_device.ha_lock, flags); list_for_each_entry(has, &dm_device.ha_region_list, list) { /* The page belongs to a different HAS. */ - if ((pfn < has->start_pfn) || (pfn >= has->end_pfn)) + if ((pfn < has->start_pfn) || + (pfn + (1UL << order) > has->end_pfn)) continue; - hv_page_online_one(has, pg); + hv_bring_pgs_online(has, pfn, 1UL << order); break; } spin_unlock_irqrestore(&dm_device.ha_lock, flags); diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index ceb5048de9a7..d107447c47de 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -369,14 +369,19 @@ static enum bp_state reserve_additional_memory(void) return BP_ECANCELED; } -static void xen_online_page(struct page *page) +static void xen_online_page(struct page *page, unsigned int order) { - __online_page_set_limits(page); + unsigned long i, size = (1 << order); + unsigned long start_pfn = page_to_pfn(page); + struct page *p; + pr_debug("Online %lu pages starting at pfn 0x%lx\n", size, start_pfn); mutex_lock(&balloon_mutex); - - __balloon_append(page); - + for (i = 0; i < size; i++) { + p = pfn_to_page(start_pfn + i); + __online_page_set_limits(p); + __balloon_append(p); + } mutex_unlock(&balloon_mutex); } diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 368267c1b71b..52869d6d38b3 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -89,7 +89,7 @@ extern int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn, unsigned long *valid_start, unsigned long *valid_end); extern void __offline_isolated_pages(unsigned long, unsigned long); -typedef void (*online_page_callback_t)(struct page *page); +typedef void (*online_page_callback_t)(struct page *page, unsigned int order); extern int set_online_page_callback(online_page_callback_t callback); extern int restore_online_page_callback(online_page_callback_t callback); diff --git a/mm/internal.h b/mm/internal.h index f4a7bb02decf..536bc2a839b9 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -163,6 +163,7 @@ static inline struct page *pageblock_pfn_to_page(unsigned long start_pfn, extern int __isolate_free_page(struct page *page, unsigned int order); extern void memblock_free_pages(struct page *page, unsigned long pfn, unsigned int order); +extern void __free_pages_core(struct page *page, unsigned int order); extern void prep_compound_page(struct page *page, unsigned int order); extern void post_alloc_hook(struct page *page, unsigned int order, gfp_t gfp_flags); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 1ad28323fb9f..4f07c8ddfdd7 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -47,7 +47,7 @@ * and restore_online_page_callback() for generic callback restore. */ -static void generic_online_page(struct page *page); +static void generic_online_page(struct page *page, unsigned int order); static online_page_callback_t online_page_callback = generic_online_page; static DEFINE_MUTEX(online_page_callback_lock); @@ -656,26 +656,39 @@ void __online_page_free(struct page *page) } EXPORT_SYMBOL_GPL(__online_page_free); -static void generic_online_page(struct page *page) +static void generic_online_page(struct page *page, unsigned int order) { - __online_page_set_limits(page); - __online_page_increment_counters(page); - __online_page_free(page); + __free_pages_core(page, order); + totalram_pages_add(1UL << order); +#ifdef CONFIG_HIGHMEM + if (PageHighMem(page)) + totalhigh_pages_add(1UL << order); +#endif +} + +static int online_pages_blocks(unsigned long start, unsigned long nr_pages) +{ + unsigned long end = start + nr_pages; + int order, onlined_pages = 0; + + while (start < end) { + order = min(MAX_ORDER - 1, + get_order(PFN_PHYS(end) - PFN_PHYS(start))); + (*online_page_callback)(pfn_to_page(start), order); + + onlined_pages += (1UL << order); + start += (1UL << order); + } + return onlined_pages; } static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, void *arg) { - unsigned long i; unsigned long onlined_pages = *(unsigned long *)arg; - struct page *page; if (PageReserved(pfn_to_page(start_pfn))) - for (i = 0; i < nr_pages; i++) { - page = pfn_to_page(start_pfn + i); - (*online_page_callback)(page); - onlined_pages++; - } + onlined_pages += online_pages_blocks(start_pfn, nr_pages); online_mem_sections(start_pfn, start_pfn + nr_pages); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 10d0f2ed9f69..5361bd078493 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1303,7 +1303,7 @@ static void __free_pages_ok(struct page *page, unsigned int order) local_irq_restore(flags); } -static void __init __free_pages_boot_core(struct page *page, unsigned int order) +void __free_pages_core(struct page *page, unsigned int order) { unsigned int nr_pages = 1 << order; struct page *p = page; @@ -1382,7 +1382,7 @@ void __init memblock_free_pages(struct page *page, unsigned long pfn, { if (early_page_uninitialised(pfn)) return; - return __free_pages_boot_core(page, order); + __free_pages_core(page, order); } /* @@ -1472,14 +1472,14 @@ static void __init deferred_free_range(unsigned long pfn, if (nr_pages == pageblock_nr_pages && (pfn & (pageblock_nr_pages - 1)) == 0) { set_pageblock_migratetype(page, MIGRATE_MOVABLE); - __free_pages_boot_core(page, pageblock_order); + __free_pages_core(page, pageblock_order); return; } for (i = 0; i < nr_pages; i++, page++, pfn++) { if ((pfn & (pageblock_nr_pages - 1)) == 0) set_pageblock_migratetype(page, MIGRATE_MOVABLE); - __free_pages_boot_core(page, 0); + __free_pages_core(page, 0); } } From 4d3467e171f8a8ef8f1dd205769cf2f21fbc8e1e Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 5 Mar 2019 15:42:18 -0800 Subject: [PATCH 018/159] mm: balloon: update comment about isolation/migration/compaction Patch series "mm/kdump: allow to exclude pages that are logically offline" Right now, pages inflated as part of a balloon driver will be dumped by dump tools like makedumpfile. While XEN is able to check in the crash kernel whether a certain pfn is actuall backed by memory in the hypervisor (see xen_oldmem_pfn_is_ram) and optimize this case, dumps of virtio-balloon, hv-balloon and VMWare balloon inflated memory will essentially result in zero pages getting allocated by the hypervisor and the dump getting filled with this data. The allocation and reading of zero pages can directly be avoided if a dumping tool could know which pages only contain stale information not to be dumped. Also for XEN, calling into the kernel and asking the hypervisor if a pfn is backed can be avoided if the duming tool would skip such pages right from the beginning. Dumping tools have no idea whether a given page is part of a balloon driver and shall not be dumped. Esp. PG_reserved cannot be used for that purpose as all memory allocated during early boot is also PG_reserved, see discussion at [1]. So some other way of indication is required and a new page flag is frowned upon. We have PG_balloon (MAPCOUNT value), which is essentially unused now. I suggest renaming it to something more generic (PG_offline) to mark pages as logically offline. This flag can than e.g. also be used by virtio-mem in the future to mark subsections as offline. Or by other code that wants to put pages logically offline (e.g. later maybe poisoned pages that shall no longer be used). This series converts PG_balloon to PG_offline, allows dumping tools to query the value to detect such pages and marks pages in the hv-balloon and XEN balloon properly as PG_offline. Note that virtio-balloon already set pages to PG_balloon (and now PG_offline). Please note that this is also helpful for a problem we were seeing under Hyper-V: Dumping logically offline memory (pages kept fake offline while onlining a section via online_page_callback) would under some condicions result in a kernel panic when dumping them. As I don't have access to neither XEN nor Hyper-V nor VMWare installations, this was only tested with the virtio-balloon and pages were properly skipped when dumping. I'll also attach the makedumpfile patch to this series. [1] https://lkml.org/lkml/2018/7/20/566 This patch (of 8): Commit b1123ea6d3b3 ("mm: balloon: use general non-lru movable page feature") reworked balloon handling to make use of the general non-lru movable page feature. The big comment block in balloon_compaction.h contains quite some outdated information. Let's fix this. Link: http://lkml.kernel.org/r/20181119101616.8901-2-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Michael S. Tsirkin Cc: Matthew Wilcox Cc: Michal Hocko Cc: Alexander Duyck Cc: Alexey Dobriyan Cc: Arnd Bergmann Cc: Baoquan He Cc: Borislav Petkov Cc: Boris Ostrovsky Cc: Christian Hansen Cc: Dave Young Cc: David Rientjes Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Jonathan Corbet Cc: Juergen Gross Cc: Julien Freche Cc: Kairui Song Cc: Kazuhito Hagio Cc: "Kirill A. Shutemov" Cc: Konstantin Khlebnikov Cc: "K. Y. Srinivasan" Cc: Len Brown Cc: Lianbo Jiang Cc: Michal Hocko Cc: Mike Rapoport Cc: Miles Chen Cc: Nadav Amit Cc: Naoya Horiguchi Cc: Omar Sandoval Cc: Pankaj gupta Cc: Pavel Machek Cc: Pavel Tatashin Cc: Rafael J. Wysocki Cc: "Rafael J. Wysocki" Cc: Stefano Stabellini Cc: Stephen Hemminger Cc: Stephen Rothwell Cc: Vitaly Kuznetsov Cc: Vlastimil Babka Cc: Xavier Deguillard Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/balloon_compaction.h | 26 +++++++++----------------- 1 file changed, 9 insertions(+), 17 deletions(-) diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h index 53051f3d8f25..cbe50da5a59d 100644 --- a/include/linux/balloon_compaction.h +++ b/include/linux/balloon_compaction.h @@ -4,15 +4,18 @@ * * Common interface definitions for making balloon pages movable by compaction. * - * Despite being perfectly possible to perform ballooned pages migration, they - * make a special corner case to compaction scans because balloon pages are not - * enlisted at any LRU list like the other pages we do compact / migrate. + * Balloon page migration makes use of the general non-lru movable page + * feature. + * + * page->private is used to reference the responsible balloon device. + * page->mapping is used in context of non-lru page migration to reference + * the address space operations for page isolation/migration/compaction. * * As the page isolation scanning step a compaction thread does is a lockless * procedure (from a page standpoint), it might bring some racy situations while * performing balloon page compaction. In order to sort out these racy scenarios * and safely perform balloon's page compaction and migration we must, always, - * ensure following these three simple rules: + * ensure following these simple rules: * * i. when updating a balloon's page ->mapping element, strictly do it under * the following lock order, independently of the far superior @@ -21,19 +24,8 @@ * +--spin_lock_irq(&b_dev_info->pages_lock); * ... page->mapping updates here ... * - * ii. before isolating or dequeueing a balloon page from the balloon device - * pages list, the page reference counter must be raised by one and the - * extra refcount must be dropped when the page is enqueued back into - * the balloon device page list, thus a balloon page keeps its reference - * counter raised only while it is under our special handling; - * - * iii. after the lockless scan step have selected a potential balloon page for - * isolation, re-test the PageBalloon mark and the PagePrivate flag - * under the proper page lock, to ensure isolating a valid balloon page - * (not yet isolated, nor under release procedure) - * - * iv. isolation or dequeueing procedure must clear PagePrivate flag under - * page lock together with removing page from balloon device page list. + * ii. isolation or dequeueing procedure must remove the page from balloon + * device page list under b_dev_info->pages_lock. * * The functions provided by this interface are placed to help on coping with * the aforementioned balloon page corner case, as well as to ensure the simple From ca215086b14b89a0e70fc211314944aa6ce50020 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 5 Mar 2019 15:42:23 -0800 Subject: [PATCH 019/159] mm: convert PG_balloon to PG_offline PG_balloon was introduced to implement page migration/compaction for pages inflated in virtio-balloon. Nowadays, it is only a marker that a page is part of virtio-balloon and therefore logically offline. We also want to make use of this flag in other balloon drivers - for inflated pages or when onlining a section but keeping some pages offline (e.g. used right now by XEN and Hyper-V via set_online_page_callback()). We are going to expose this flag to dump tools like makedumpfile. But instead of exposing PG_balloon, let's generalize the concept of marking pages as logically offline, so it can be reused for other purposes later on. Rename PG_balloon to PG_offline. This is an indicator that the page is logically offline, the content stale and that it should not be touched (e.g. a hypervisor would have to allocate backing storage in order for the guest to dump an unused page). We can then e.g. exclude such pages from dumps. We replace and reuse KPF_BALLOON (23), as this shouldn't really harm (and for now the semantics stay the same). In following patches, we will make use of this bit also in other balloon drivers. While at it, document PGTABLE. [akpm@linux-foundation.org: fix comment text, per David] Link: http://lkml.kernel.org/r/20181119101616.8901-3-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Konstantin Khlebnikov Acked-by: Michael S. Tsirkin Acked-by: Pankaj gupta Cc: Jonathan Corbet Cc: Alexey Dobriyan Cc: Mike Rapoport Cc: Christian Hansen Cc: Vlastimil Babka Cc: "Kirill A. Shutemov" Cc: Stephen Rothwell Cc: Matthew Wilcox Cc: Michal Hocko Cc: Pavel Tatashin Cc: Alexander Duyck Cc: Naoya Horiguchi Cc: Miles Chen Cc: David Rientjes Cc: Kazuhito Hagio Cc: Arnd Bergmann Cc: Baoquan He Cc: Borislav Petkov Cc: Boris Ostrovsky Cc: Dave Young Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Juergen Gross Cc: Julien Freche Cc: Kairui Song Cc: "K. Y. Srinivasan" Cc: Len Brown Cc: Lianbo Jiang Cc: Michal Hocko Cc: Nadav Amit Cc: Omar Sandoval Cc: Pavel Machek Cc: Rafael J. Wysocki Cc: "Rafael J. Wysocki" Cc: Stefano Stabellini Cc: Stephen Hemminger Cc: Vitaly Kuznetsov Cc: Xavier Deguillard Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/mm/pagemap.rst | 9 ++++++--- fs/proc/page.c | 4 ++-- include/linux/balloon_compaction.h | 8 ++++---- include/linux/page-flags.h | 11 +++++++---- include/uapi/linux/kernel-page-flags.h | 2 +- tools/vm/page-types.c | 2 +- 6 files changed, 21 insertions(+), 15 deletions(-) diff --git a/Documentation/admin-guide/mm/pagemap.rst b/Documentation/admin-guide/mm/pagemap.rst index 3f7bade2c231..340a5aee9b80 100644 --- a/Documentation/admin-guide/mm/pagemap.rst +++ b/Documentation/admin-guide/mm/pagemap.rst @@ -75,9 +75,10 @@ number of times a page is mapped. 20. NOPAGE 21. KSM 22. THP - 23. BALLOON + 23. OFFLINE 24. ZERO_PAGE 25. IDLE + 26. PGTABLE * ``/proc/kpagecgroup``. This file contains a 64-bit inode number of the memory cgroup each page is charged to, indexed by PFN. Only available when @@ -118,8 +119,8 @@ Short descriptions to the page flags identical memory pages dynamically shared between one or more processes 22 - THP contiguous pages which construct transparent hugepages -23 - BALLOON - balloon compaction page +23 - OFFLINE + page is logically offline 24 - ZERO_PAGE zero page for pfn_zero or huge_zero page 25 - IDLE @@ -128,6 +129,8 @@ Short descriptions to the page flags Note that this flag may be stale in case the page was accessed via a PTE. To make sure the flag is up-to-date one has to read ``/sys/kernel/mm/page_idle/bitmap`` first. +26 - PGTABLE + page is in use as a page table IO related page flags --------------------- diff --git a/fs/proc/page.c b/fs/proc/page.c index 40b05e0d4274..544d1ee15aee 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -152,8 +152,8 @@ u64 stable_page_flags(struct page *page) else if (page_count(page) == 0 && is_free_buddy_page(page)) u |= 1 << KPF_BUDDY; - if (PageBalloon(page)) - u |= 1 << KPF_BALLOON; + if (PageOffline(page)) + u |= 1 << KPF_OFFLINE; if (PageTable(page)) u |= 1 << KPF_PGTABLE; diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h index cbe50da5a59d..f111c780ef1d 100644 --- a/include/linux/balloon_compaction.h +++ b/include/linux/balloon_compaction.h @@ -95,7 +95,7 @@ extern int balloon_page_migrate(struct address_space *mapping, static inline void balloon_page_insert(struct balloon_dev_info *balloon, struct page *page) { - __SetPageBalloon(page); + __SetPageOffline(page); __SetPageMovable(page, balloon->inode->i_mapping); set_page_private(page, (unsigned long)balloon); list_add(&page->lru, &balloon->pages); @@ -111,7 +111,7 @@ static inline void balloon_page_insert(struct balloon_dev_info *balloon, */ static inline void balloon_page_delete(struct page *page) { - __ClearPageBalloon(page); + __ClearPageOffline(page); __ClearPageMovable(page); set_page_private(page, 0); /* @@ -141,13 +141,13 @@ static inline gfp_t balloon_mapping_gfp_mask(void) static inline void balloon_page_insert(struct balloon_dev_info *balloon, struct page *page) { - __SetPageBalloon(page); + __SetPageOffline(page); list_add(&page->lru, &balloon->pages); } static inline void balloon_page_delete(struct page *page) { - __ClearPageBalloon(page); + __ClearPageOffline(page); list_del(&page->lru); } diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 39b4494e29f1..808b4183e30d 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -671,7 +671,7 @@ PAGEFLAG_FALSE(DoubleMap) /* Reserve 0x0000007f to catch underflows of page_mapcount */ #define PAGE_MAPCOUNT_RESERVE -128 #define PG_buddy 0x00000080 -#define PG_balloon 0x00000100 +#define PG_offline 0x00000100 #define PG_kmemcg 0x00000200 #define PG_table 0x00000400 @@ -706,10 +706,13 @@ static __always_inline void __ClearPage##uname(struct page *page) \ PAGE_TYPE_OPS(Buddy, buddy) /* - * PageBalloon() is true for pages that are on the balloon page list - * (see mm/balloon_compaction.c). + * PageOffline() indicates that the page is logically offline although the + * containing section is online. (e.g. inflated in a balloon driver or + * not onlined when onlining the section). + * The content of these pages is effectively stale. Such pages should not + * be touched (read/write/dump/save) except by their owner. */ -PAGE_TYPE_OPS(Balloon, balloon) +PAGE_TYPE_OPS(Offline, offline) /* * If kmemcg is enabled, the buddy allocator will set PageKmemcg() on diff --git a/include/uapi/linux/kernel-page-flags.h b/include/uapi/linux/kernel-page-flags.h index 21b9113c69da..6f2f2720f3ac 100644 --- a/include/uapi/linux/kernel-page-flags.h +++ b/include/uapi/linux/kernel-page-flags.h @@ -32,7 +32,7 @@ #define KPF_KSM 21 #define KPF_THP 22 -#define KPF_BALLOON 23 +#define KPF_OFFLINE 23 #define KPF_ZERO_PAGE 24 #define KPF_IDLE 25 #define KPF_PGTABLE 26 diff --git a/tools/vm/page-types.c b/tools/vm/page-types.c index 1ff3a6c0367b..6f64b2b93234 100644 --- a/tools/vm/page-types.c +++ b/tools/vm/page-types.c @@ -133,7 +133,7 @@ static const char * const page_flag_names[] = { [KPF_NOPAGE] = "n:nopage", [KPF_KSM] = "x:ksm", [KPF_THP] = "t:thp", - [KPF_BALLOON] = "o:balloon", + [KPF_OFFLINE] = "o:offline", [KPF_PGTABLE] = "g:pgtable", [KPF_ZERO_PAGE] = "z:zero_page", [KPF_IDLE] = "i:idle_page", From e04b742f74c236202b7a505c2688068969d00e65 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 5 Mar 2019 15:42:27 -0800 Subject: [PATCH 020/159] kexec: export PG_offline to VMCOREINFO Right now, pages inflated as part of a balloon driver will be dumped by dump tools like makedumpfile. While XEN is able to check in the crash kernel whether a certain pfn is actuall backed by memory in the hypervisor (see xen_oldmem_pfn_is_ram) and optimize this case, dumps of other balloon inflated memory will essentially result in zero pages getting allocated by the hypervisor and the dump getting filled with this data. The allocation and reading of zero pages can directly be avoided if a dumping tool could know which pages only contain stale information not to be dumped. We now have PG_offline which can be (and already is by virtio-balloon) used for marking pages as logically offline. Follow up patches will make use of this flag also in other balloon implementations. Let's export PG_offline via PAGE_OFFLINE_MAPCOUNT_VALUE, so makedumpfile can directly skip pages that are logically offline and the content therefore stale. Please note that this is also helpful for a problem we were seeing under Hyper-V: Dumping logically offline memory (pages kept fake offline while onlining a section via online_page_callback) would under some condicions result in a kernel panic when dumping them. Link: http://lkml.kernel.org/r/20181119101616.8901-4-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Michael S. Tsirkin Acked-by: Dave Young Cc: "Kirill A. Shutemov" Cc: Baoquan He Cc: Omar Sandoval Cc: Arnd Bergmann Cc: Matthew Wilcox Cc: Michal Hocko Cc: Lianbo Jiang Cc: Borislav Petkov Cc: Kazuhito Hagio Cc: Alexander Duyck Cc: Alexey Dobriyan Cc: Boris Ostrovsky Cc: Christian Hansen Cc: David Rientjes Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Jonathan Corbet Cc: Juergen Gross Cc: Julien Freche Cc: Kairui Song Cc: Konstantin Khlebnikov Cc: "K. Y. Srinivasan" Cc: Len Brown Cc: Michal Hocko Cc: Mike Rapoport Cc: Miles Chen Cc: Nadav Amit Cc: Naoya Horiguchi Cc: Pankaj gupta Cc: Pavel Machek Cc: Pavel Tatashin Cc: Rafael J. Wysocki Cc: "Rafael J. Wysocki" Cc: Stefano Stabellini Cc: Stephen Hemminger Cc: Stephen Rothwell Cc: Vitaly Kuznetsov Cc: Vlastimil Babka Cc: Xavier Deguillard Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/crash_core.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 933cb3e45b98..093c9f917ed0 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -464,6 +464,8 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE); #ifdef CONFIG_HUGETLB_PAGE VMCOREINFO_NUMBER(HUGETLB_PAGE_DTOR); +#define PAGE_OFFLINE_MAPCOUNT_VALUE (~PG_offline) + VMCOREINFO_NUMBER(PAGE_OFFLINE_MAPCOUNT_VALUE); #endif arch_crash_save_vmcoreinfo(); From 77c4adf6a6df6f8f39807eaed48eb73d0eb4261e Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 5 Mar 2019 15:42:32 -0800 Subject: [PATCH 021/159] xen/balloon: mark inflated pages PG_offline Mark inflated and never onlined pages PG_offline, to tell the world that the content is stale and should not be dumped. Link: http://lkml.kernel.org/r/20181119101616.8901-5-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Juergen Gross Cc: Boris Ostrovsky Cc: Stefano Stabellini Cc: Matthew Wilcox Cc: Michal Hocko Cc: "Michael S. Tsirkin" Cc: Alexander Duyck Cc: Alexey Dobriyan Cc: Arnd Bergmann Cc: Baoquan He Cc: Borislav Petkov Cc: Christian Hansen Cc: Dave Young Cc: David Rientjes Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Jonathan Corbet Cc: Julien Freche Cc: Kairui Song Cc: Kazuhito Hagio Cc: "Kirill A. Shutemov" Cc: Konstantin Khlebnikov Cc: "K. Y. Srinivasan" Cc: Len Brown Cc: Lianbo Jiang Cc: Michal Hocko Cc: Mike Rapoport Cc: Miles Chen Cc: Nadav Amit Cc: Naoya Horiguchi Cc: Omar Sandoval Cc: Pankaj gupta Cc: Pavel Machek Cc: Pavel Tatashin Cc: Rafael J. Wysocki Cc: "Rafael J. Wysocki" Cc: Stephen Hemminger Cc: Stephen Rothwell Cc: Vitaly Kuznetsov Cc: Vlastimil Babka Cc: Xavier Deguillard Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/xen/balloon.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index d107447c47de..39b229f9e256 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -380,6 +380,7 @@ static void xen_online_page(struct page *page, unsigned int order) for (i = 0; i < size; i++) { p = pfn_to_page(start_pfn + i); __online_page_set_limits(p); + __SetPageOffline(p); __balloon_append(p); } mutex_unlock(&balloon_mutex); @@ -446,6 +447,7 @@ static enum bp_state increase_reservation(unsigned long nr_pages) xenmem_reservation_va_mapping_update(1, &page, &frame_list[i]); /* Relinquish the page back to the allocator. */ + __ClearPageOffline(page); free_reserved_page(page); } @@ -472,6 +474,7 @@ static enum bp_state decrease_reservation(unsigned long nr_pages, gfp_t gfp) state = BP_EAGAIN; break; } + __SetPageOffline(page); adjust_managed_page_count(page, -1); xenmem_reservation_scrub_page(page); list_add(&page->lru, &pages); From fae42c4d522b9b9c9de21a5cade162f2e7eaf644 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 5 Mar 2019 15:42:36 -0800 Subject: [PATCH 022/159] hv_balloon: mark inflated pages PG_offline Mark inflated and never onlined pages PG_offline, to tell the world that the content is stale and should not be dumped. Link: http://lkml.kernel.org/r/20181119101616.8901-6-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Pankaj gupta Cc: "K. Y. Srinivasan" Cc: Haiyang Zhang Cc: Stephen Hemminger Cc: Kairui Song Cc: Vitaly Kuznetsov Cc: Matthew Wilcox Cc: Michal Hocko Cc: "Michael S. Tsirkin" Cc: Alexander Duyck Cc: Alexey Dobriyan Cc: Arnd Bergmann Cc: Baoquan He Cc: Borislav Petkov Cc: Boris Ostrovsky Cc: Christian Hansen Cc: Dave Young Cc: David Rientjes Cc: Greg Kroah-Hartman Cc: Jonathan Corbet Cc: Juergen Gross Cc: Julien Freche Cc: Kazuhito Hagio Cc: "Kirill A. Shutemov" Cc: Konstantin Khlebnikov Cc: Len Brown Cc: Lianbo Jiang Cc: Michal Hocko Cc: Mike Rapoport Cc: Miles Chen Cc: Nadav Amit Cc: Naoya Horiguchi Cc: Omar Sandoval Cc: Pavel Machek Cc: Pavel Tatashin Cc: Rafael J. Wysocki Cc: "Rafael J. Wysocki" Cc: Stefano Stabellini Cc: Stephen Rothwell Cc: Vlastimil Babka Cc: Xavier Deguillard Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/hv/hv_balloon.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c index a50b7624b2a3..dd475f3bcc8a 100644 --- a/drivers/hv/hv_balloon.c +++ b/drivers/hv/hv_balloon.c @@ -681,8 +681,13 @@ static struct notifier_block hv_memory_nb = { /* Check if the particular page is backed and can be onlined and online it. */ static void hv_page_online_one(struct hv_hotadd_state *has, struct page *pg) { - if (!has_pfn_is_backed(has, page_to_pfn(pg))) + if (!has_pfn_is_backed(has, page_to_pfn(pg))) { + if (!PageOffline(pg)) + __SetPageOffline(pg); return; + } + if (PageOffline(pg)) + __ClearPageOffline(pg); /* This frame is currently backed; online the page. */ __online_page_set_limits(pg); @@ -1202,6 +1207,7 @@ static void free_balloon_pages(struct hv_dynmem_device *dm, for (i = 0; i < num_pages; i++) { pg = pfn_to_page(i + start_frame); + __ClearPageOffline(pg); __free_page(pg); dm->num_pages_ballooned--; } @@ -1214,7 +1220,7 @@ static unsigned int alloc_balloon_pages(struct hv_dynmem_device *dm, struct dm_balloon_response *bl_resp, int alloc_unit) { - unsigned int i = 0; + unsigned int i, j; struct page *pg; if (num_pages < alloc_unit) @@ -1246,6 +1252,10 @@ static unsigned int alloc_balloon_pages(struct hv_dynmem_device *dm, if (alloc_unit != 1) split_page(pg, get_order(alloc_unit << PAGE_SHIFT)); + /* mark all pages offline */ + for (j = 0; j < (1 << get_order(alloc_unit << PAGE_SHIFT)); j++) + __SetPageOffline(pg + j); + bl_resp->range_count++; bl_resp->range_array[i].finfo.start_page = page_to_pfn(pg); From 8165540c7fbc4a638d53907d2d51de6751f4a8ab Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 5 Mar 2019 15:42:41 -0800 Subject: [PATCH 023/159] vmw_balloon: mark inflated pages PG_offline Mark inflated and never onlined pages PG_offline, to tell the world that the content is stale and should not be dumped. [david@redhat.com: use vmballoon_page_in_frames more widely] Link: http://lkml.kernel.org/r/20181122100627.5189-7-david@redhat.com Link: http://lkml.kernel.org/r/20181119101616.8901-7-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Nadav Amit Cc: Xavier Deguillard Cc: Nadav Amit Cc: Arnd Bergmann Cc: Greg Kroah-Hartman Cc: Julien Freche Cc: Matthew Wilcox Cc: Michal Hocko Cc: "Michael S. Tsirkin" Cc: Alexander Duyck Cc: Alexey Dobriyan Cc: Baoquan He Cc: Borislav Petkov Cc: Boris Ostrovsky Cc: Christian Hansen Cc: Dave Young Cc: David Rientjes Cc: Haiyang Zhang Cc: Jonathan Corbet Cc: Juergen Gross Cc: Kairui Song Cc: Kazuhito Hagio Cc: "Kirill A. Shutemov" Cc: Konstantin Khlebnikov Cc: "K. Y. Srinivasan" Cc: Len Brown Cc: Lianbo Jiang Cc: Michal Hocko Cc: Mike Rapoport Cc: Miles Chen Cc: Naoya Horiguchi Cc: Omar Sandoval Cc: Pankaj gupta Cc: Pavel Machek Cc: Pavel Tatashin Cc: Rafael J. Wysocki Cc: "Rafael J. Wysocki" Cc: Stefano Stabellini Cc: Stephen Hemminger Cc: Stephen Rothwell Cc: Vitaly Kuznetsov Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/misc/vmw_balloon.c | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/drivers/misc/vmw_balloon.c b/drivers/misc/vmw_balloon.c index f8240b87df22..869ec842729e 100644 --- a/drivers/misc/vmw_balloon.c +++ b/drivers/misc/vmw_balloon.c @@ -556,6 +556,36 @@ vmballoon_page_in_frames(enum vmballoon_page_size_type page_size) return 1 << vmballoon_page_order(page_size); } +/** + * vmballoon_mark_page_offline() - mark a page as offline + * @page: pointer for the page. + * @page_size: the size of the page. + */ +static void +vmballoon_mark_page_offline(struct page *page, + enum vmballoon_page_size_type page_size) +{ + int i; + + for (i = 0; i < vmballoon_page_in_frames(page_size); i++) + __SetPageOffline(page + i); +} + +/** + * vmballoon_mark_page_online() - mark a page as online + * @page: pointer for the page. + * @page_size: the size of the page. + */ +static void +vmballoon_mark_page_online(struct page *page, + enum vmballoon_page_size_type page_size) +{ + int i; + + for (i = 0; i < vmballoon_page_in_frames(page_size); i++) + __ClearPageOffline(page + i); +} + /** * vmballoon_send_get_target() - Retrieve desired balloon size from the host. * @@ -612,6 +642,7 @@ static int vmballoon_alloc_page_list(struct vmballoon *b, ctl->page_size); if (page) { + vmballoon_mark_page_offline(page, ctl->page_size); /* Success. Add the page to the list and continue. */ list_add(&page->lru, &ctl->pages); continue; @@ -850,6 +881,7 @@ static void vmballoon_release_page_list(struct list_head *page_list, list_for_each_entry_safe(page, tmp, page_list, lru) { list_del(&page->lru); + vmballoon_mark_page_online(page, page_size); __free_pages(page, vmballoon_page_order(page_size)); } From 5b56db37218e6503906c6057c177a84f0a0ba551 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 5 Mar 2019 15:42:45 -0800 Subject: [PATCH 024/159] PM/Hibernate: use pfn_to_online_page() Let's use pfn_to_online_page() instead of pfn_to_page() when checking for saveable pages to not save/restore offline memory sections. Link: http://lkml.kernel.org/r/20181119101616.8901-8-david@redhat.com Signed-off-by: David Hildenbrand Suggested-by: Michal Hocko Acked-by: Michal Hocko Acked-by: Pavel Machek Acked-by: Rafael J. Wysocki Cc: Len Brown Cc: Matthew Wilcox Cc: "Michael S. Tsirkin" Cc: Alexander Duyck Cc: Alexey Dobriyan Cc: Arnd Bergmann Cc: Baoquan He Cc: Borislav Petkov Cc: Boris Ostrovsky Cc: Christian Hansen Cc: Dave Young Cc: David Rientjes Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Jonathan Corbet Cc: Juergen Gross Cc: Julien Freche Cc: Kairui Song Cc: Kazuhito Hagio Cc: "Kirill A. Shutemov" Cc: Konstantin Khlebnikov Cc: "K. Y. Srinivasan" Cc: Lianbo Jiang Cc: Mike Rapoport Cc: Miles Chen Cc: Nadav Amit Cc: Naoya Horiguchi Cc: Omar Sandoval Cc: Pankaj gupta Cc: Pavel Tatashin Cc: "Rafael J. Wysocki" Cc: Stefano Stabellini Cc: Stephen Hemminger Cc: Stephen Rothwell Cc: Vitaly Kuznetsov Cc: Vlastimil Babka Cc: Xavier Deguillard Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/snapshot.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 640b2034edd6..87e6dd57819f 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1215,8 +1215,8 @@ static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn) if (!pfn_valid(pfn)) return NULL; - page = pfn_to_page(pfn); - if (page_zone(page) != zone) + page = pfn_to_online_page(pfn); + if (!page || page_zone(page) != zone) return NULL; BUG_ON(!PageHighMem(page)); @@ -1277,8 +1277,8 @@ static struct page *saveable_page(struct zone *zone, unsigned long pfn) if (!pfn_valid(pfn)) return NULL; - page = pfn_to_page(pfn); - if (page_zone(page) != zone) + page = pfn_to_online_page(pfn); + if (!page || page_zone(page) != zone) return NULL; BUG_ON(PageHighMem(page)); From abd02ac616e32d818a0478e68924beac8ba5e5d8 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 5 Mar 2019 15:42:50 -0800 Subject: [PATCH 025/159] PM/Hibernate: exclude all PageOffline() pages The content of pages that are marked PG_offline is not of interest (e.g. inflated by a balloon driver), let's skip these pages. In saveable_highmem_page(), move the PageReserved() check to a new check along with the PageOffline() check to separate it from the swsusp checks. [david@redhat.com: v2] Link: http://lkml.kernel.org/r/20181122100627.5189-9-david@redhat.com Link: http://lkml.kernel.org/r/20181119101616.8901-9-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Pavel Machek Acked-by: Rafael J. Wysocki Cc: Pavel Machek Cc: Len Brown Cc: Matthew Wilcox Cc: Michal Hocko Cc: "Michael S. Tsirkin" Cc: Alexander Duyck Cc: Alexey Dobriyan Cc: Arnd Bergmann Cc: Baoquan He Cc: Borislav Petkov Cc: Boris Ostrovsky Cc: Christian Hansen Cc: Dave Young Cc: David Rientjes Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Jonathan Corbet Cc: Juergen Gross Cc: Julien Freche Cc: Kairui Song Cc: Kazuhito Hagio Cc: "Kirill A. Shutemov" Cc: Konstantin Khlebnikov Cc: "K. Y. Srinivasan" Cc: Lianbo Jiang Cc: Michal Hocko Cc: Mike Rapoport Cc: Miles Chen Cc: Nadav Amit Cc: Naoya Horiguchi Cc: Omar Sandoval Cc: Pankaj gupta Cc: Pavel Tatashin Cc: "Rafael J. Wysocki" Cc: Stefano Stabellini Cc: Stephen Hemminger Cc: Stephen Rothwell Cc: Vitaly Kuznetsov Cc: Vlastimil Babka Cc: Xavier Deguillard Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/snapshot.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 87e6dd57819f..4802b039b89f 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1221,8 +1221,10 @@ static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn) BUG_ON(!PageHighMem(page)); - if (swsusp_page_is_forbidden(page) || swsusp_page_is_free(page) || - PageReserved(page)) + if (swsusp_page_is_forbidden(page) || swsusp_page_is_free(page)) + return NULL; + + if (PageReserved(page) || PageOffline(page)) return NULL; if (page_is_guard(page)) @@ -1286,6 +1288,9 @@ static struct page *saveable_page(struct zone *zone, unsigned long pfn) if (swsusp_page_is_forbidden(page) || swsusp_page_is_free(page)) return NULL; + if (PageOffline(page)) + return NULL; + if (PageReserved(page) && (!kernel_page_present(page) || pfn_is_nosave(pfn))) return NULL; From 6ade20327dbb808882888ed8ccded71e93067cf9 Mon Sep 17 00:00:00 2001 From: Liviu Dudau Date: Tue, 5 Mar 2019 15:42:54 -0800 Subject: [PATCH 026/159] mm/vmalloc.c: don't dereference possible NULL pointer in __vunmap() find_vmap_area() can return a NULL pointer and we're going to dereference it without checking it first. Use the existing find_vm_area() function which does exactly what we want and checks for the NULL pointer. Link: http://lkml.kernel.org/r/20181228171009.22269-1-liviu@dudau.co.uk Fixes: f3c01d2f3ade ("mm: vmalloc: avoid racy handling of debugobjects in vunmap") Signed-off-by: Liviu Dudau Reviewed-by: Andrew Morton Cc: Chintan Pandya Cc: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 871e41c55e23..806047d7fda3 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1505,7 +1505,7 @@ static void __vunmap(const void *addr, int deallocate_pages) addr)) return; - area = find_vmap_area((unsigned long)addr)->vm; + area = find_vm_area(addr); if (unlikely(!area)) { WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n", addr); From 98fa15f34cb379864757670b8e8743b21456a20e Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Tue, 5 Mar 2019 15:42:58 -0800 Subject: [PATCH 027/159] mm: replace all open encodings for NUMA_NO_NODE Patch series "Replace all open encodings for NUMA_NO_NODE", v3. All these places for replacement were found by running the following grep patterns on the entire kernel code. Please let me know if this might have missed some instances. This might also have replaced some false positives. I will appreciate suggestions, inputs and review. 1. git grep "nid == -1" 2. git grep "node == -1" 3. git grep "nid = -1" 4. git grep "node = -1" This patch (of 2): At present there are multiple places where invalid node number is encoded as -1. Even though implicitly understood it is always better to have macros in there. Replace these open encodings for an invalid node number with the global macro NUMA_NO_NODE. This helps remove NUMA related assumptions like 'invalid node' from various places redirecting them to a common definition. Link: http://lkml.kernel.org/r/1545127933-10711-2-git-send-email-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Reviewed-by: David Hildenbrand Acked-by: Jeff Kirsher [ixgbe] Acked-by: Jens Axboe [mtip32xx] Acked-by: Vinod Koul [dmaengine.c] Acked-by: Michael Ellerman [powerpc] Acked-by: Doug Ledford [drivers/infiniband] Cc: Joseph Qi Cc: Hans Verkuil Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/include/asm/topology.h | 3 ++- arch/ia64/kernel/numa.c | 2 +- arch/ia64/mm/discontig.c | 6 +++--- arch/powerpc/include/asm/pci-bridge.h | 3 ++- arch/powerpc/kernel/paca.c | 3 ++- arch/powerpc/kernel/pci-common.c | 3 ++- arch/powerpc/mm/numa.c | 14 +++++++------- arch/powerpc/platforms/powernv/memtrace.c | 5 +++-- arch/sparc/kernel/pci_fire.c | 3 ++- arch/sparc/kernel/pci_schizo.c | 3 ++- arch/sparc/kernel/psycho_common.c | 3 ++- arch/sparc/kernel/sbus.c | 3 ++- arch/sparc/mm/init_64.c | 6 +++--- arch/x86/include/asm/pci.h | 3 ++- arch/x86/kernel/apic/x2apic_uv_x.c | 7 ++++--- arch/x86/kernel/smpboot.c | 3 ++- drivers/block/mtip32xx/mtip32xx.c | 5 +++-- drivers/dma/dmaengine.c | 4 +++- drivers/infiniband/hw/hfi1/affinity.c | 3 ++- drivers/infiniband/hw/hfi1/init.c | 3 ++- drivers/iommu/dmar.c | 5 +++-- drivers/iommu/intel-iommu.c | 3 ++- drivers/misc/sgi-xp/xpc_uv.c | 3 ++- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 5 +++-- include/linux/device.h | 2 +- init/init_task.c | 3 ++- kernel/kthread.c | 3 ++- kernel/sched/fair.c | 15 ++++++++------- lib/cpumask.c | 3 ++- mm/huge_memory.c | 13 +++++++------ mm/hugetlb.c | 3 ++- mm/ksm.c | 2 +- mm/memory.c | 7 ++++--- mm/memory_hotplug.c | 12 ++++++------ mm/mempolicy.c | 2 +- mm/page_alloc.c | 4 ++-- mm/page_ext.c | 2 +- net/core/pktgen.c | 3 ++- net/qrtr/qrtr.c | 3 ++- 39 files changed, 104 insertions(+), 74 deletions(-) diff --git a/arch/alpha/include/asm/topology.h b/arch/alpha/include/asm/topology.h index e6e13a85796a..5a77a40567fa 100644 --- a/arch/alpha/include/asm/topology.h +++ b/arch/alpha/include/asm/topology.h @@ -4,6 +4,7 @@ #include #include +#include #include #ifdef CONFIG_NUMA @@ -29,7 +30,7 @@ static const struct cpumask *cpumask_of_node(int node) { int cpu; - if (node == -1) + if (node == NUMA_NO_NODE) return cpu_all_mask; cpumask_clear(&node_to_cpumask_map[node]); diff --git a/arch/ia64/kernel/numa.c b/arch/ia64/kernel/numa.c index 92c376279c6d..1315da6c7aeb 100644 --- a/arch/ia64/kernel/numa.c +++ b/arch/ia64/kernel/numa.c @@ -74,7 +74,7 @@ void __init build_cpu_to_node_map(void) cpumask_clear(&node_to_cpu_mask[node]); for_each_possible_early_cpu(cpu) { - node = -1; + node = NUMA_NO_NODE; for (i = 0; i < NR_CPUS; ++i) if (cpu_physical_id(cpu) == node_cpuid[i].phys_id) { node = node_cpuid[i].nid; diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c index 8a965784340c..f9c36750c6a4 100644 --- a/arch/ia64/mm/discontig.c +++ b/arch/ia64/mm/discontig.c @@ -227,7 +227,7 @@ void __init setup_per_cpu_areas(void) * CPUs are put into groups according to node. Walk cpu_map * and create new groups at node boundaries. */ - prev_node = -1; + prev_node = NUMA_NO_NODE; ai->nr_groups = 0; for (unit = 0; unit < nr_units; unit++) { cpu = cpu_map[unit]; @@ -435,7 +435,7 @@ static void __init *memory_less_node_alloc(int nid, unsigned long pernodesize) { void *ptr = NULL; u8 best = 0xff; - int bestnode = -1, node, anynode = 0; + int bestnode = NUMA_NO_NODE, node, anynode = 0; for_each_online_node(node) { if (node_isset(node, memory_less_mask)) @@ -447,7 +447,7 @@ static void __init *memory_less_node_alloc(int nid, unsigned long pernodesize) anynode = node; } - if (bestnode == -1) + if (bestnode == NUMA_NO_NODE) bestnode = anynode; ptr = memblock_alloc_try_nid(pernodesize, PERCPU_PAGE_SIZE, diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h index aee4fcc24990..77fc21278fa2 100644 --- a/arch/powerpc/include/asm/pci-bridge.h +++ b/arch/powerpc/include/asm/pci-bridge.h @@ -10,6 +10,7 @@ #include #include #include +#include struct device_node; @@ -265,7 +266,7 @@ extern int pcibios_map_io_space(struct pci_bus *bus); #ifdef CONFIG_NUMA #define PHB_SET_NODE(PHB, NODE) ((PHB)->node = (NODE)) #else -#define PHB_SET_NODE(PHB, NODE) ((PHB)->node = -1) +#define PHB_SET_NODE(PHB, NODE) ((PHB)->node = NUMA_NO_NODE) #endif #endif /* CONFIG_PPC64 */ diff --git a/arch/powerpc/kernel/paca.c b/arch/powerpc/kernel/paca.c index 913bfca09c4f..b8480127793d 100644 --- a/arch/powerpc/kernel/paca.c +++ b/arch/powerpc/kernel/paca.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -36,7 +37,7 @@ static void *__init alloc_paca_data(unsigned long size, unsigned long align, * which will put its paca in the right place. */ if (cpu == boot_cpuid) { - nid = -1; + nid = NUMA_NO_NODE; memblock_set_bottom_up(true); } else { nid = early_cpu_to_node(cpu); diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c index 88e4f69a09e5..4538e8ddde80 100644 --- a/arch/powerpc/kernel/pci-common.c +++ b/arch/powerpc/kernel/pci-common.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include @@ -132,7 +133,7 @@ struct pci_controller *pcibios_alloc_controller(struct device_node *dev) int nid = of_node_to_nid(dev); if (nid < 0 || !node_online(nid)) - nid = -1; + nid = NUMA_NO_NODE; PHB_SET_NODE(phb, nid); } diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index 87f0dd004295..270cefb75cca 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -215,7 +215,7 @@ static void initialize_distance_lookup_table(int nid, */ static int associativity_to_nid(const __be32 *associativity) { - int nid = -1; + int nid = NUMA_NO_NODE; if (min_common_depth == -1) goto out; @@ -225,7 +225,7 @@ static int associativity_to_nid(const __be32 *associativity) /* POWER4 LPAR uses 0xffff as invalid node */ if (nid == 0xffff || nid >= MAX_NUMNODES) - nid = -1; + nid = NUMA_NO_NODE; if (nid > 0 && of_read_number(associativity, 1) >= distance_ref_points_depth) { @@ -244,7 +244,7 @@ out: */ static int of_node_to_nid_single(struct device_node *device) { - int nid = -1; + int nid = NUMA_NO_NODE; const __be32 *tmp; tmp = of_get_associativity(device); @@ -256,7 +256,7 @@ static int of_node_to_nid_single(struct device_node *device) /* Walk the device tree upwards, looking for an associativity id */ int of_node_to_nid(struct device_node *device) { - int nid = -1; + int nid = NUMA_NO_NODE; of_node_get(device); while (device) { @@ -454,7 +454,7 @@ static int of_drconf_to_nid_single(struct drmem_lmb *lmb) */ static int numa_setup_cpu(unsigned long lcpu) { - int nid = -1; + int nid = NUMA_NO_NODE; struct device_node *cpu; /* @@ -930,7 +930,7 @@ static int hot_add_drconf_scn_to_nid(unsigned long scn_addr) { struct drmem_lmb *lmb; unsigned long lmb_size; - int nid = -1; + int nid = NUMA_NO_NODE; lmb_size = drmem_lmb_size(); @@ -960,7 +960,7 @@ static int hot_add_drconf_scn_to_nid(unsigned long scn_addr) static int hot_add_node_scn_to_nid(unsigned long scn_addr) { struct device_node *memory; - int nid = -1; + int nid = NUMA_NO_NODE; for_each_node_by_type(memory, "memory") { unsigned long start, size; diff --git a/arch/powerpc/platforms/powernv/memtrace.c b/arch/powerpc/platforms/powernv/memtrace.c index 84d038ed3882..248a38ad25c7 100644 --- a/arch/powerpc/platforms/powernv/memtrace.c +++ b/arch/powerpc/platforms/powernv/memtrace.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -223,7 +224,7 @@ static int memtrace_online(void) ent = &memtrace_array[i]; /* We have onlined this chunk previously */ - if (ent->nid == -1) + if (ent->nid == NUMA_NO_NODE) continue; /* Remove from io mappings */ @@ -257,7 +258,7 @@ static int memtrace_online(void) */ debugfs_remove_recursive(ent->dir); pr_info("Added trace memory back to node %d\n", ent->nid); - ent->size = ent->start = ent->nid = -1; + ent->size = ent->start = ent->nid = NUMA_NO_NODE; } if (ret) return ret; diff --git a/arch/sparc/kernel/pci_fire.c b/arch/sparc/kernel/pci_fire.c index be71ae086622..0ca08d455e80 100644 --- a/arch/sparc/kernel/pci_fire.c +++ b/arch/sparc/kernel/pci_fire.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -416,7 +417,7 @@ static int pci_fire_pbm_init(struct pci_pbm_info *pbm, struct device_node *dp = op->dev.of_node; int err; - pbm->numa_node = -1; + pbm->numa_node = NUMA_NO_NODE; pbm->pci_ops = &sun4u_pci_ops; pbm->config_space_reg_bits = 12; diff --git a/arch/sparc/kernel/pci_schizo.c b/arch/sparc/kernel/pci_schizo.c index 934b97c72f7c..421aba00e6b0 100644 --- a/arch/sparc/kernel/pci_schizo.c +++ b/arch/sparc/kernel/pci_schizo.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -1347,7 +1348,7 @@ static int schizo_pbm_init(struct pci_pbm_info *pbm, pbm->next = pci_pbm_root; pci_pbm_root = pbm; - pbm->numa_node = -1; + pbm->numa_node = NUMA_NO_NODE; pbm->pci_ops = &sun4u_pci_ops; pbm->config_space_reg_bits = 8; diff --git a/arch/sparc/kernel/psycho_common.c b/arch/sparc/kernel/psycho_common.c index 81aa91e5c0e6..e90bcb6bad7f 100644 --- a/arch/sparc/kernel/psycho_common.c +++ b/arch/sparc/kernel/psycho_common.c @@ -5,6 +5,7 @@ */ #include #include +#include #include @@ -454,7 +455,7 @@ void psycho_pbm_init_common(struct pci_pbm_info *pbm, struct platform_device *op struct device_node *dp = op->dev.of_node; pbm->name = dp->full_name; - pbm->numa_node = -1; + pbm->numa_node = NUMA_NO_NODE; pbm->chip_type = chip_type; pbm->chip_version = of_getintprop_default(dp, "version#", 0); pbm->chip_revision = of_getintprop_default(dp, "module-revision#", 0); diff --git a/arch/sparc/kernel/sbus.c b/arch/sparc/kernel/sbus.c index 41c5deb581b8..32141e1006c4 100644 --- a/arch/sparc/kernel/sbus.c +++ b/arch/sparc/kernel/sbus.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -561,7 +562,7 @@ static void __init sbus_iommu_init(struct platform_device *op) op->dev.archdata.iommu = iommu; op->dev.archdata.stc = strbuf; - op->dev.archdata.numa_node = -1; + op->dev.archdata.numa_node = NUMA_NO_NODE; reg_base = regs + SYSIO_IOMMUREG_BASE; iommu->iommu_control = reg_base + IOMMU_CONTROL; diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index b4221d3727d0..9e6bd868ba6f 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -976,13 +976,13 @@ static u64 __init memblock_nid_range_sun4u(u64 start, u64 end, int *nid) { int prev_nid, new_nid; - prev_nid = -1; + prev_nid = NUMA_NO_NODE; for ( ; start < end; start += PAGE_SIZE) { for (new_nid = 0; new_nid < num_node_masks; new_nid++) { struct node_mem_mask *p = &node_masks[new_nid]; if ((start & p->mask) == p->match) { - if (prev_nid == -1) + if (prev_nid == NUMA_NO_NODE) prev_nid = new_nid; break; } @@ -1208,7 +1208,7 @@ int of_node_to_nid(struct device_node *dp) md = mdesc_grab(); count = 0; - nid = -1; + nid = NUMA_NO_NODE; mdesc_for_each_node_by_name(md, grp, "group") { if (!scan_arcs_for_cfg_handle(md, grp, cfg_handle)) { nid = count; diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h index 662963681ea6..e662f987dfa2 100644 --- a/arch/x86/include/asm/pci.h +++ b/arch/x86/include/asm/pci.h @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -141,7 +142,7 @@ cpumask_of_pcibus(const struct pci_bus *bus) int node; node = __pcibus_to_node(bus); - return (node == -1) ? cpu_online_mask : + return (node == NUMA_NO_NODE) ? cpu_online_mask : cpumask_of_node(node); } #endif diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index a555da094157..1e225528f0d7 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include @@ -1390,7 +1391,7 @@ static void __init build_socket_tables(void) } /* Set socket -> node values: */ - lnid = -1; + lnid = NUMA_NO_NODE; for_each_present_cpu(cpu) { int nid = cpu_to_node(cpu); int apicid, sockid; @@ -1521,7 +1522,7 @@ static void __init uv_system_init_hub(void) new_hub->pnode = 0xffff; new_hub->numa_blade_id = uv_node_to_blade_id(nodeid); - new_hub->memory_nid = -1; + new_hub->memory_nid = NUMA_NO_NODE; new_hub->nr_possible_cpus = 0; new_hub->nr_online_cpus = 0; } @@ -1538,7 +1539,7 @@ static void __init uv_system_init_hub(void) uv_cpu_info_per(cpu)->p_uv_hub_info = uv_hub_info_list(nodeid); uv_cpu_info_per(cpu)->blade_cpu_id = uv_cpu_hub_info(cpu)->nr_possible_cpus++; - if (uv_cpu_hub_info(cpu)->memory_nid == -1) + if (uv_cpu_hub_info(cpu)->memory_nid == NUMA_NO_NODE) uv_cpu_hub_info(cpu)->memory_nid = cpu_to_node(cpu); /* Init memoryless node: */ diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index ccd1f2a8e557..c91ff9f9fe8a 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -56,6 +56,7 @@ #include #include #include +#include #include #include @@ -841,7 +842,7 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) /* reduce the number of lines printed when booting a large cpu count system */ static void announce_cpu(int cpu, int apicid) { - static int current_node = -1; + static int current_node = NUMA_NO_NODE; int node = early_cpu_to_node(cpu); static int width, node_width; diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 88e8440e75c3..2f3ee4d6af82 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -40,6 +40,7 @@ #include #include #include +#include #include "mtip32xx.h" #define HW_CMD_SLOT_SZ (MTIP_MAX_COMMAND_SLOTS * 32) @@ -4018,9 +4019,9 @@ static int get_least_used_cpu_on_node(int node) /* Helper for selecting a node in round robin mode */ static inline int mtip_get_next_rr_node(void) { - static int next_node = -1; + static int next_node = NUMA_NO_NODE; - if (next_node == -1) { + if (next_node == NUMA_NO_NODE) { next_node = first_online_node; return next_node; } diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c index f1a441ab395d..3a11b1092e80 100644 --- a/drivers/dma/dmaengine.c +++ b/drivers/dma/dmaengine.c @@ -63,6 +63,7 @@ #include #include #include +#include static DEFINE_MUTEX(dma_list_mutex); static DEFINE_IDA(dma_ida); @@ -386,7 +387,8 @@ EXPORT_SYMBOL(dma_issue_pending_all); static bool dma_chan_is_local(struct dma_chan *chan, int cpu) { int node = dev_to_node(chan->device->dev); - return node == -1 || cpumask_test_cpu(cpu, cpumask_of_node(node)); + return node == NUMA_NO_NODE || + cpumask_test_cpu(cpu, cpumask_of_node(node)); } /** diff --git a/drivers/infiniband/hw/hfi1/affinity.c b/drivers/infiniband/hw/hfi1/affinity.c index 2baf38cc1e23..4fe662c3bbc1 100644 --- a/drivers/infiniband/hw/hfi1/affinity.c +++ b/drivers/infiniband/hw/hfi1/affinity.c @@ -48,6 +48,7 @@ #include #include #include +#include #include "hfi.h" #include "affinity.h" @@ -777,7 +778,7 @@ void hfi1_dev_affinity_clean_up(struct hfi1_devdata *dd) _dev_comp_vect_cpu_mask_clean_up(dd, entry); unlock: mutex_unlock(&node_affinity.lock); - dd->node = -1; + dd->node = NUMA_NO_NODE; } /* diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c index 7835eb52e7c5..441b06e2a154 100644 --- a/drivers/infiniband/hw/hfi1/init.c +++ b/drivers/infiniband/hw/hfi1/init.c @@ -54,6 +54,7 @@ #include #include #include +#include #include #include "hfi.h" @@ -1303,7 +1304,7 @@ static struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev, dd->unit = ret; list_add(&dd->list, &hfi1_dev_list); } - dd->node = -1; + dd->node = NUMA_NO_NODE; spin_unlock_irqrestore(&hfi1_devs_lock, flags); idr_preload_end(); diff --git a/drivers/iommu/dmar.c b/drivers/iommu/dmar.c index 58dc70bffd5b..9c49300e9fb7 100644 --- a/drivers/iommu/dmar.c +++ b/drivers/iommu/dmar.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include @@ -477,7 +478,7 @@ static int dmar_parse_one_rhsa(struct acpi_dmar_header *header, void *arg) int node = acpi_map_pxm_to_node(rhsa->proximity_domain); if (!node_online(node)) - node = -1; + node = NUMA_NO_NODE; drhd->iommu->node = node; return 0; } @@ -1062,7 +1063,7 @@ static int alloc_iommu(struct dmar_drhd_unit *drhd) iommu->msagaw = msagaw; iommu->segment = drhd->segment; - iommu->node = -1; + iommu->node = NUMA_NO_NODE; ver = readl(iommu->reg + DMAR_VER_REG); pr_info("%s: reg_base_addr %llx ver %d:%d cap %llx ecap %llx\n", diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 78188bf7e90d..39a33dec4d0b 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -47,6 +47,7 @@ #include #include #include +#include #include #include #include @@ -1716,7 +1717,7 @@ static struct dmar_domain *alloc_domain(int flags) return NULL; memset(domain, 0, sizeof(*domain)); - domain->nid = -1; + domain->nid = NUMA_NO_NODE; domain->flags = flags; domain->has_iotlb_device = false; INIT_LIST_HEAD(&domain->devices); diff --git a/drivers/misc/sgi-xp/xpc_uv.c b/drivers/misc/sgi-xp/xpc_uv.c index 0441abe87880..9e443df44b3b 100644 --- a/drivers/misc/sgi-xp/xpc_uv.c +++ b/drivers/misc/sgi-xp/xpc_uv.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #if defined CONFIG_X86_64 #include @@ -61,7 +62,7 @@ static struct xpc_heartbeat_uv *xpc_heartbeat_uv; XPC_NOTIFY_MSG_SIZE_UV) #define XPC_NOTIFY_IRQ_NAME "xpc_notify" -static int xpc_mq_node = -1; +static int xpc_mq_node = NUMA_NO_NODE; static struct xpc_gru_mq_uv *xpc_activate_mq_uv; static struct xpc_gru_mq_uv *xpc_notify_mq_uv; diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index a4e7584a50cb..e100054a3765 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -6418,7 +6419,7 @@ int ixgbe_setup_tx_resources(struct ixgbe_ring *tx_ring) { struct device *dev = tx_ring->dev; int orig_node = dev_to_node(dev); - int ring_node = -1; + int ring_node = NUMA_NO_NODE; int size; size = sizeof(struct ixgbe_tx_buffer) * tx_ring->count; @@ -6512,7 +6513,7 @@ int ixgbe_setup_rx_resources(struct ixgbe_adapter *adapter, { struct device *dev = rx_ring->dev; int orig_node = dev_to_node(dev); - int ring_node = -1; + int ring_node = NUMA_NO_NODE; int size; size = sizeof(struct ixgbe_rx_buffer) * rx_ring->count; diff --git a/include/linux/device.h b/include/linux/device.h index 6cb4640b6160..4d2f13e8c540 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -1095,7 +1095,7 @@ static inline void set_dev_node(struct device *dev, int node) #else static inline int dev_to_node(struct device *dev) { - return -1; + return NUMA_NO_NODE; } static inline void set_dev_node(struct device *dev, int node) { diff --git a/init/init_task.c b/init/init_task.c index 5aebe3be4d7c..26131e73aa6d 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -154,7 +155,7 @@ struct task_struct init_task .vtime.state = VTIME_SYS, #endif #ifdef CONFIG_NUMA_BALANCING - .numa_preferred_nid = -1, + .numa_preferred_nid = NUMA_NO_NODE, .numa_group = NULL, .numa_faults = NULL, #endif diff --git a/kernel/kthread.c b/kernel/kthread.c index 087d18d771b5..ebebbcf3c5de 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -20,6 +20,7 @@ #include #include #include +#include #include static DEFINE_SPINLOCK(kthread_create_lock); @@ -675,7 +676,7 @@ __kthread_create_worker(int cpu, unsigned int flags, { struct kthread_worker *worker; struct task_struct *task; - int node = -1; + int node = NUMA_NO_NODE; worker = kzalloc(sizeof(*worker), GFP_KERNEL); if (!worker) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 310d0637fe4b..0e6a0ef129c5 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1160,7 +1160,7 @@ void init_numa_balancing(unsigned long clone_flags, struct task_struct *p) /* New address space, reset the preferred nid */ if (!(clone_flags & CLONE_VM)) { - p->numa_preferred_nid = -1; + p->numa_preferred_nid = NUMA_NO_NODE; return; } @@ -1180,13 +1180,13 @@ void init_numa_balancing(unsigned long clone_flags, struct task_struct *p) static void account_numa_enqueue(struct rq *rq, struct task_struct *p) { - rq->nr_numa_running += (p->numa_preferred_nid != -1); + rq->nr_numa_running += (p->numa_preferred_nid != NUMA_NO_NODE); rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p)); } static void account_numa_dequeue(struct rq *rq, struct task_struct *p) { - rq->nr_numa_running -= (p->numa_preferred_nid != -1); + rq->nr_numa_running -= (p->numa_preferred_nid != NUMA_NO_NODE); rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p)); } @@ -1400,7 +1400,7 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page, * two full passes of the "multi-stage node selection" test that is * executed below. */ - if ((p->numa_preferred_nid == -1 || p->numa_scan_seq <= 4) && + if ((p->numa_preferred_nid == NUMA_NO_NODE || p->numa_scan_seq <= 4) && (cpupid_pid_unset(last_cpupid) || cpupid_match_pid(p, last_cpupid))) return true; @@ -1848,7 +1848,7 @@ static void numa_migrate_preferred(struct task_struct *p) unsigned long interval = HZ; /* This task has no NUMA fault statistics yet */ - if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults)) + if (unlikely(p->numa_preferred_nid == NUMA_NO_NODE || !p->numa_faults)) return; /* Periodically retry migrating the task to the preferred node */ @@ -2095,7 +2095,7 @@ static int preferred_group_nid(struct task_struct *p, int nid) static void task_numa_placement(struct task_struct *p) { - int seq, nid, max_nid = -1; + int seq, nid, max_nid = NUMA_NO_NODE; unsigned long max_faults = 0; unsigned long fault_types[2] = { 0, 0 }; unsigned long total_faults; @@ -2638,7 +2638,8 @@ static void update_scan_period(struct task_struct *p, int new_cpu) * the preferred node. */ if (dst_nid == p->numa_preferred_nid || - (p->numa_preferred_nid != -1 && src_nid != p->numa_preferred_nid)) + (p->numa_preferred_nid != NUMA_NO_NODE && + src_nid != p->numa_preferred_nid)) return; } diff --git a/lib/cpumask.c b/lib/cpumask.c index 8d666ab84b5c..087a3e9a0202 100644 --- a/lib/cpumask.c +++ b/lib/cpumask.c @@ -5,6 +5,7 @@ #include #include #include +#include /** * cpumask_next - get the next cpu in a cpumask @@ -206,7 +207,7 @@ unsigned int cpumask_local_spread(unsigned int i, int node) /* Wrap: we always want a cpu. */ i %= num_online_cpus(); - if (node == -1) { + if (node == NUMA_NO_NODE) { for_each_cpu(cpu, cpu_online_mask) if (i-- == 0) return cpu; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index faf357eaf0ce..d066f7ca1ee8 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include @@ -1475,7 +1476,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd) struct anon_vma *anon_vma = NULL; struct page *page; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; - int page_nid = -1, this_nid = numa_node_id(); + int page_nid = NUMA_NO_NODE, this_nid = numa_node_id(); int target_nid, last_cpupid = -1; bool page_locked; bool migrated = false; @@ -1520,7 +1521,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd) */ page_locked = trylock_page(page); target_nid = mpol_misplaced(page, vma, haddr); - if (target_nid == -1) { + if (target_nid == NUMA_NO_NODE) { /* If the page was locked, there are no parallel migrations */ if (page_locked) goto clear_pmdnuma; @@ -1528,7 +1529,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd) /* Migration could have started since the pmd_trans_migrating check */ if (!page_locked) { - page_nid = -1; + page_nid = NUMA_NO_NODE; if (!get_page_unless_zero(page)) goto out_unlock; spin_unlock(vmf->ptl); @@ -1549,14 +1550,14 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd) if (unlikely(!pmd_same(pmd, *vmf->pmd))) { unlock_page(page); put_page(page); - page_nid = -1; + page_nid = NUMA_NO_NODE; goto out_unlock; } /* Bail if we fail to protect against THP splits for any reason */ if (unlikely(!anon_vma)) { put_page(page); - page_nid = -1; + page_nid = NUMA_NO_NODE; goto clear_pmdnuma; } @@ -1618,7 +1619,7 @@ out: if (anon_vma) page_unlock_anon_vma_read(anon_vma); - if (page_nid != -1) + if (page_nid != NUMA_NO_NODE) task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, flags); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 8dfdffc34a99..3c504fa6b460 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -887,7 +888,7 @@ static struct page *dequeue_huge_page_nodemask(struct hstate *h, gfp_t gfp_mask, struct zonelist *zonelist; struct zone *zone; struct zoneref *z; - int node = -1; + int node = NUMA_NO_NODE; zonelist = node_zonelist(nid, gfp_mask); diff --git a/mm/ksm.c b/mm/ksm.c index 6c48ad13b4c9..fd2db6a74d3c 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -598,7 +598,7 @@ static struct stable_node *alloc_stable_node_chain(struct stable_node *dup, chain->chain_prune_time = jiffies; chain->rmap_hlist_len = STABLE_NODE_CHAIN; #if defined (CONFIG_DEBUG_VM) && defined(CONFIG_NUMA) - chain->nid = -1; /* debug */ + chain->nid = NUMA_NO_NODE; /* debug */ #endif ksm_stable_node_chains++; diff --git a/mm/memory.c b/mm/memory.c index e11ca9dd823f..eb40f32295d2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -69,6 +69,7 @@ #include #include #include +#include #include #include @@ -3586,7 +3587,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct page *page = NULL; - int page_nid = -1; + int page_nid = NUMA_NO_NODE; int last_cpupid; int target_nid; bool migrated = false; @@ -3653,7 +3654,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) target_nid = numa_migrate_prep(page, vma, vmf->address, page_nid, &flags); pte_unmap_unlock(vmf->pte, vmf->ptl); - if (target_nid == -1) { + if (target_nid == NUMA_NO_NODE) { put_page(page); goto out; } @@ -3667,7 +3668,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) flags |= TNF_MIGRATE_FAIL; out: - if (page_nid != -1) + if (page_nid != NUMA_NO_NODE) task_numa_fault(last_cpupid, page_nid, 1, flags); return 0; } diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 4f07c8ddfdd7..b3d3c64d15df 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -702,9 +702,9 @@ static void node_states_check_changes_online(unsigned long nr_pages, { int nid = zone_to_nid(zone); - arg->status_change_nid = -1; - arg->status_change_nid_normal = -1; - arg->status_change_nid_high = -1; + arg->status_change_nid = NUMA_NO_NODE; + arg->status_change_nid_normal = NUMA_NO_NODE; + arg->status_change_nid_high = NUMA_NO_NODE; if (!node_state(nid, N_MEMORY)) arg->status_change_nid = nid; @@ -1509,9 +1509,9 @@ static void node_states_check_changes_offline(unsigned long nr_pages, unsigned long present_pages = 0; enum zone_type zt; - arg->status_change_nid = -1; - arg->status_change_nid_normal = -1; - arg->status_change_nid_high = -1; + arg->status_change_nid = NUMA_NO_NODE; + arg->status_change_nid_normal = NUMA_NO_NODE; + arg->status_change_nid_high = NUMA_NO_NODE; /* * Check whether node_states[N_NORMAL_MEMORY] will be changed. diff --git a/mm/mempolicy.c b/mm/mempolicy.c index ee2bce59d2bf..76e7e4bc3335 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2304,7 +2304,7 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long unsigned long pgoff; int thiscpu = raw_smp_processor_id(); int thisnid = cpu_to_node(thiscpu); - int polnid = -1; + int polnid = NUMA_NO_NODE; int ret = -1; pol = get_vma_policy(vma, addr); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5361bd078493..1f9f1409df9b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6016,7 +6016,7 @@ int __meminit __early_pfn_to_nid(unsigned long pfn, return state->last_nid; nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn); - if (nid != -1) { + if (nid != NUMA_NO_NODE) { state->last_start = start_pfn; state->last_end = end_pfn; state->last_nid = nid; @@ -6771,7 +6771,7 @@ unsigned long __init node_map_pfn_alignment(void) { unsigned long accl_mask = 0, last_end = 0; unsigned long start, end, mask; - int last_nid = -1; + int last_nid = NUMA_NO_NODE; int i, nid; for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) { diff --git a/mm/page_ext.c b/mm/page_ext.c index 8c78b8d45117..762d5b7eb523 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -300,7 +300,7 @@ static int __meminit online_page_ext(unsigned long start_pfn, start = SECTION_ALIGN_DOWN(start_pfn); end = SECTION_ALIGN_UP(start_pfn + nr_pages); - if (nid == -1) { + if (nid == NUMA_NO_NODE) { /* * In this case, "nid" already exists and contains valid memory. * "start_pfn" passed to us is a pfn which is an arg for diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 6ac919847ce6..f3f5a78cd062 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -158,6 +158,7 @@ #include #include #include +#include #include #include #include @@ -3625,7 +3626,7 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname) pkt_dev->svlan_cfi = 0; pkt_dev->svlan_id = 0xffff; pkt_dev->burst = 1; - pkt_dev->node = -1; + pkt_dev->node = NUMA_NO_NODE; err = pktgen_setup_dev(t->net, pkt_dev, ifname); if (err) diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c index 86e1e37eb4e8..b37e6e0a1026 100644 --- a/net/qrtr/qrtr.c +++ b/net/qrtr/qrtr.c @@ -15,6 +15,7 @@ #include #include #include /* For TIOCINQ/OUTQ */ +#include #include @@ -101,7 +102,7 @@ static inline struct qrtr_sock *qrtr_sk(struct sock *sk) return container_of(sk, struct qrtr_sock, sk); } -static unsigned int qrtr_local_nid = -1; +static unsigned int qrtr_local_nid = NUMA_NO_NODE; /* for node ids */ static RADIX_TREE(qrtr_nodes, GFP_KERNEL); From 7c9eefe82ca1efec5890678c33e66d5d520c06f4 Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Tue, 5 Mar 2019 15:43:01 -0800 Subject: [PATCH 028/159] tools/: replace open encodings for NUMA_NO_NODE This replaces all open encodings in tools with NUMA_NO_NODE. Also linux/numa.h is now needed for the perf build. [sfr@canb.auug.org.au: fix for replace open encodings for NUMA_NO_NODE] Link: http://lkml.kernel.org/r/20190108131141.730e9c4f@canb.auug.org.au Link: http://lkml.kernel.org/r/1545127933-10711-3-git-send-email-anshuman.khandual@arm.com Signed-off-by: Stephen Rothwell Signed-off-by: Anshuman Khandual Signed-off-by: Stephen Rothwell Cc: David Hildenbrand Cc: Doug Ledford [drivers/infiniband] Cc: Hans Verkuil Cc: Jeff Kirsher [ixgbe] Cc: Jens Axboe [mtip32xx] Cc: Joseph Qi Cc: Michael Ellerman [powerpc] Cc: Vinod Koul [dmaengine.c] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/include/linux/numa.h | 16 ++++++++++++++++ tools/perf/bench/numa.c | 7 ++++--- 2 files changed, 20 insertions(+), 3 deletions(-) create mode 100644 tools/include/linux/numa.h diff --git a/tools/include/linux/numa.h b/tools/include/linux/numa.h new file mode 100644 index 000000000000..110b0e5d0fb0 --- /dev/null +++ b/tools/include/linux/numa.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_NUMA_H +#define _LINUX_NUMA_H + + +#ifdef CONFIG_NODES_SHIFT +#define NODES_SHIFT CONFIG_NODES_SHIFT +#else +#define NODES_SHIFT 0 +#endif + +#define MAX_NUMNODES (1 << NODES_SHIFT) + +#define NUMA_NO_NODE (-1) + +#endif /* _LINUX_NUMA_H */ diff --git a/tools/perf/bench/numa.c b/tools/perf/bench/numa.c index 44195514b19e..98ad783efc69 100644 --- a/tools/perf/bench/numa.c +++ b/tools/perf/bench/numa.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -298,7 +299,7 @@ static cpu_set_t bind_to_node(int target_node) CPU_ZERO(&mask); - if (target_node == -1) { + if (target_node == NUMA_NO_NODE) { for (cpu = 0; cpu < g->p.nr_cpus; cpu++) CPU_SET(cpu, &mask); } else { @@ -339,7 +340,7 @@ static void bind_to_memnode(int node) unsigned long nodemask; int ret; - if (node == -1) + if (node == NUMA_NO_NODE) return; BUG_ON(g->p.nr_nodes > (int)sizeof(nodemask)*8); @@ -1363,7 +1364,7 @@ static void init_thread_data(void) int cpu; /* Allow all nodes by default: */ - td->bind_node = -1; + td->bind_node = NUMA_NO_NODE; /* Allow all CPUs by default: */ CPU_ZERO(&td->bind_cpumask); From 52d1e606ee733921e984770d47539a6bb91e8506 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Tue, 5 Mar 2019 15:43:06 -0800 Subject: [PATCH 029/159] mm: reuse only-pte-mapped KSM page in do_wp_page() Add an optimization for KSM pages almost in the same way that we have for ordinary anonymous pages. If there is a write fault in a page, which is mapped to an only pte, and it is not related to swap cache; the page may be reused without copying its content. [ Note that we do not consider PageSwapCache() pages at least for now, since we don't want to complicate __get_ksm_page(), which has nice optimization based on this (for the migration case). Currenly it is spinning on PageSwapCache() pages, waiting for when they have unfreezed counters (i.e., for the migration finish). But we don't want to make it also spinning on swap cache pages, which we try to reuse, since there is not a very high probability to reuse them. So, for now we do not consider PageSwapCache() pages at all. ] So in reuse_ksm_page() we check for 1) PageSwapCache() and 2) page_stable_node(), to skip a page, which KSM is currently trying to link to stable tree. Then we do page_ref_freeze() to prohibit KSM to merge one more page into the page, we are reusing. After that, nobody can refer to the reusing page: KSM skips !PageSwapCache() pages with zero refcount; and the protection against of all other participants is the same as for reused ordinary anon pages pte lock, page lock and mmap_sem. [akpm@linux-foundation.org: replace BUG_ON()s with WARN_ON()s] Link: http://lkml.kernel.org/r/154471491016.31352.1168978849911555609.stgit@localhost.localdomain Signed-off-by: Kirill Tkhai Reviewed-by: Yang Shi Cc: "Kirill A. Shutemov" Cc: Hugh Dickins Cc: Andrea Arcangeli Cc: Christian Koenig Cc: Claudio Imbrenda Cc: Rik van Riel Cc: Huang Ying Cc: Minchan Kim Cc: Kirill Tkhai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ksm.h | 7 +++++++ mm/ksm.c | 30 ++++++++++++++++++++++++++++-- mm/memory.c | 16 ++++++++++++++-- 3 files changed, 49 insertions(+), 4 deletions(-) diff --git a/include/linux/ksm.h b/include/linux/ksm.h index 161e8164abcf..e48b1e453ff5 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -53,6 +53,8 @@ struct page *ksm_might_need_to_copy(struct page *page, void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc); void ksm_migrate_page(struct page *newpage, struct page *oldpage); +bool reuse_ksm_page(struct page *page, + struct vm_area_struct *vma, unsigned long address); #else /* !CONFIG_KSM */ @@ -86,6 +88,11 @@ static inline void rmap_walk_ksm(struct page *page, static inline void ksm_migrate_page(struct page *newpage, struct page *oldpage) { } +static inline bool reuse_ksm_page(struct page *page, + struct vm_area_struct *vma, unsigned long address) +{ + return false; +} #endif /* CONFIG_MMU */ #endif /* !CONFIG_KSM */ diff --git a/mm/ksm.c b/mm/ksm.c index fd2db6a74d3c..983fbac24bda 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -706,8 +706,9 @@ again: * case this node is no longer referenced, and should be freed; * however, it might mean that the page is under page_ref_freeze(). * The __remove_mapping() case is easy, again the node is now stale; - * but if page is swapcache in migrate_page_move_mapping(), it might - * still be our page, in which case it's essential to keep the node. + * the same is in reuse_ksm_page() case; but if page is swapcache + * in migrate_page_move_mapping(), it might still be our page, + * in which case it's essential to keep the node. */ while (!get_page_unless_zero(page)) { /* @@ -2642,6 +2643,31 @@ again: goto again; } +bool reuse_ksm_page(struct page *page, + struct vm_area_struct *vma, + unsigned long address) +{ +#ifdef CONFIG_DEBUG_VM + if (WARN_ON(is_zero_pfn(page_to_pfn(page))) || + WARN_ON(!page_mapped(page)) || + WARN_ON(!PageLocked(page))) { + dump_page(page, "reuse_ksm_page"); + return false; + } +#endif + + if (PageSwapCache(page) || !page_stable_node(page)) + return false; + /* Prohibit parallel get_ksm_page() */ + if (!page_ref_freeze(page, 1)) + return false; + + page_move_anon_rmap(page, vma); + page->index = linear_page_index(vma, address); + page_ref_unfreeze(page, 1); + + return true; +} #ifdef CONFIG_MIGRATION void ksm_migrate_page(struct page *newpage, struct page *oldpage) { diff --git a/mm/memory.c b/mm/memory.c index eb40f32295d2..222da66f16b4 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2505,8 +2505,11 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) * Take out anonymous pages first, anonymous shared vmas are * not dirty accountable. */ - if (PageAnon(vmf->page) && !PageKsm(vmf->page)) { + if (PageAnon(vmf->page)) { int total_map_swapcount; + if (PageKsm(vmf->page) && (PageSwapCache(vmf->page) || + page_count(vmf->page) != 1)) + goto copy; if (!trylock_page(vmf->page)) { get_page(vmf->page); pte_unmap_unlock(vmf->pte, vmf->ptl); @@ -2521,6 +2524,15 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) } put_page(vmf->page); } + if (PageKsm(vmf->page)) { + bool reused = reuse_ksm_page(vmf->page, vmf->vma, + vmf->address); + unlock_page(vmf->page); + if (!reused) + goto copy; + wp_page_reuse(vmf); + return VM_FAULT_WRITE; + } if (reuse_swap_page(vmf->page, &total_map_swapcount)) { if (total_map_swapcount == 1) { /* @@ -2541,7 +2553,7 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) (VM_WRITE|VM_SHARED))) { return wp_page_shared(vmf); } - +copy: /* * Ok, we need to copy. Oh, well.. */ From 9234bae9b252bbc231abcabfa644a4eb9724250c Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Tue, 5 Mar 2019 15:43:10 -0800 Subject: [PATCH 030/159] mm, slub: make the comment of put_cpu_partial() complete There are two cases when put_cpu_partial() is invoked. * __slab_free * get_partial_node This patch just makes it cover these two cases. Link: http://lkml.kernel.org/r/20181025094437.18951-3-richard.weiyang@gmail.com Signed-off-by: Wei Yang Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 037a8ca7f7ca..d8b1eee2dd86 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2253,8 +2253,8 @@ static void unfreeze_partials(struct kmem_cache *s, } /* - * Put a page that was just frozen (in __slab_free) into a partial page - * slot if available. + * Put a page that was just frozen (in __slab_free|get_partial_node) into a + * partial page slot if available. * * If we did not find a slot then simply move all the partials to the * per node partial list. From 60cd4bcd62384cfa1e5890cebacccf08b3161156 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Tue, 5 Mar 2019 15:43:13 -0800 Subject: [PATCH 031/159] memcg: localize memcg_kmem_enabled() check Move the memcg_kmem_enabled() checks into memcg kmem charge/uncharge functions, so, the users don't have to explicitly check that condition. This is purely code cleanup patch without any functional change. Only the order of checks in memcg_charge_slab() can potentially be changed but the functionally it will be same. This should not matter as memcg_charge_slab() is not in the hot path. Link: http://lkml.kernel.org/r/20190103161203.162375-1-shakeelb@google.com Signed-off-by: Shakeel Butt Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Vladimir Davydov Cc: Roman Gushchin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/pipe.c | 3 +-- include/linux/memcontrol.h | 37 +++++++++++++++++++++++++++++++++---- mm/memcontrol.c | 16 ++++++++-------- mm/page_alloc.c | 4 ++-- mm/slab.h | 4 ---- 5 files changed, 44 insertions(+), 20 deletions(-) diff --git a/fs/pipe.c b/fs/pipe.c index bdc5d3c0977d..51d5fd8840ab 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -140,8 +140,7 @@ static int anon_pipe_buf_steal(struct pipe_inode_info *pipe, struct page *page = buf->page; if (page_count(page) == 1) { - if (memcg_kmem_enabled()) - memcg_kmem_uncharge(page, 0); + memcg_kmem_uncharge(page, 0); __SetPageLocked(page); return 0; } diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 83ae11cbd12c..b0eb29ea0d9c 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1273,12 +1273,12 @@ static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg) struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep); void memcg_kmem_put_cache(struct kmem_cache *cachep); -int memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, - struct mem_cgroup *memcg); #ifdef CONFIG_MEMCG_KMEM -int memcg_kmem_charge(struct page *page, gfp_t gfp, int order); -void memcg_kmem_uncharge(struct page *page, int order); +int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order); +void __memcg_kmem_uncharge(struct page *page, int order); +int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, + struct mem_cgroup *memcg); extern struct static_key_false memcg_kmem_enabled_key; extern struct workqueue_struct *memcg_kmem_cache_wq; @@ -1300,6 +1300,26 @@ static inline bool memcg_kmem_enabled(void) return static_branch_unlikely(&memcg_kmem_enabled_key); } +static inline int memcg_kmem_charge(struct page *page, gfp_t gfp, int order) +{ + if (memcg_kmem_enabled()) + return __memcg_kmem_charge(page, gfp, order); + return 0; +} + +static inline void memcg_kmem_uncharge(struct page *page, int order) +{ + if (memcg_kmem_enabled()) + __memcg_kmem_uncharge(page, order); +} + +static inline int memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, + int order, struct mem_cgroup *memcg) +{ + if (memcg_kmem_enabled()) + return __memcg_kmem_charge_memcg(page, gfp, order, memcg); + return 0; +} /* * helper for accessing a memcg's index. It will be used as an index in the * child cache array in kmem_cache, and also to derive its name. This function @@ -1325,6 +1345,15 @@ static inline void memcg_kmem_uncharge(struct page *page, int order) { } +static inline int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order) +{ + return 0; +} + +static inline void __memcg_kmem_uncharge(struct page *page, int order) +{ +} + #define for_each_memcg_cache_index(_idx) \ for (; NULL; ) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index af7f18b32389..72414bb7e226 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2573,7 +2573,7 @@ void memcg_kmem_put_cache(struct kmem_cache *cachep) } /** - * memcg_kmem_charge_memcg: charge a kmem page + * __memcg_kmem_charge_memcg: charge a kmem page * @page: page to charge * @gfp: reclaim mode * @order: allocation order @@ -2581,7 +2581,7 @@ void memcg_kmem_put_cache(struct kmem_cache *cachep) * * Returns 0 on success, an error code on failure. */ -int memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, +int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, struct mem_cgroup *memcg) { unsigned int nr_pages = 1 << order; @@ -2604,24 +2604,24 @@ int memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, } /** - * memcg_kmem_charge: charge a kmem page to the current memory cgroup + * __memcg_kmem_charge: charge a kmem page to the current memory cgroup * @page: page to charge * @gfp: reclaim mode * @order: allocation order * * Returns 0 on success, an error code on failure. */ -int memcg_kmem_charge(struct page *page, gfp_t gfp, int order) +int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order) { struct mem_cgroup *memcg; int ret = 0; - if (mem_cgroup_disabled() || memcg_kmem_bypass()) + if (memcg_kmem_bypass()) return 0; memcg = get_mem_cgroup_from_current(); if (!mem_cgroup_is_root(memcg)) { - ret = memcg_kmem_charge_memcg(page, gfp, order, memcg); + ret = __memcg_kmem_charge_memcg(page, gfp, order, memcg); if (!ret) __SetPageKmemcg(page); } @@ -2629,11 +2629,11 @@ int memcg_kmem_charge(struct page *page, gfp_t gfp, int order) return ret; } /** - * memcg_kmem_uncharge: uncharge a kmem page + * __memcg_kmem_uncharge: uncharge a kmem page * @page: page to uncharge * @order: allocation order */ -void memcg_kmem_uncharge(struct page *page, int order) +void __memcg_kmem_uncharge(struct page *page, int order) { struct mem_cgroup *memcg = page->mem_cgroup; unsigned int nr_pages = 1 << order; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 1f9f1409df9b..034b8b6043a3 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1056,7 +1056,7 @@ static __always_inline bool free_pages_prepare(struct page *page, if (PageMappingFlags(page)) page->mapping = NULL; if (memcg_kmem_enabled() && PageKmemcg(page)) - memcg_kmem_uncharge(page, order); + __memcg_kmem_uncharge(page, order); if (check_free) bad += free_pages_check(page); if (bad) @@ -4568,7 +4568,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid, out: if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page && - unlikely(memcg_kmem_charge(page, gfp_mask, order) != 0)) { + unlikely(__memcg_kmem_charge(page, gfp_mask, order) != 0)) { __free_pages(page, order); page = NULL; } diff --git a/mm/slab.h b/mm/slab.h index 384105318779..e5e6658eeacc 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -276,8 +276,6 @@ static __always_inline int memcg_charge_slab(struct page *page, gfp_t gfp, int order, struct kmem_cache *s) { - if (!memcg_kmem_enabled()) - return 0; if (is_root_cache(s)) return 0; return memcg_kmem_charge_memcg(page, gfp, order, s->memcg_params.memcg); @@ -286,8 +284,6 @@ static __always_inline int memcg_charge_slab(struct page *page, static __always_inline void memcg_uncharge_slab(struct page *page, int order, struct kmem_cache *s) { - if (!memcg_kmem_enabled()) - return; memcg_kmem_uncharge(page, order); } From 5a82ac715d1fd4f117d7b7e76664c0ea3d09e5e7 Mon Sep 17 00:00:00 2001 From: Roman Penyaev Date: Tue, 5 Mar 2019 15:43:17 -0800 Subject: [PATCH 032/159] mm/vmalloc.c: make vmalloc_32_user() align base kernel virtual address to SHMLBA This patch repeats the original one from David S Miller: 2dca6999eed5 ("mm, perf_event: Make vmalloc_user() align base kernel virtual address to SHMLBA") but for missed vmalloc_32_user() case, which also requires correct alignment of virtual address on kernel side to avoid D-caches aliases. A bit of copy-paste from original patch to recover in memory of what is all about: When a vmalloc'd area is mmap'd into userspace, some kind of co-ordination is necessary for this to work on platforms with cpu D-caches which can have aliases. Otherwise kernel side writes won't be seen properly in userspace and vice versa. If the kernel side mapping and the user side one have the same alignment, modulo SHMLBA, this can work as long as VM_SHARED is shared of VMA and for all current users this is true. VM_SHARED will force SHMLBA alignment of the user side mmap on platforms with D-cache aliasing matters. David S. Miller > What are the user-visible runtime effects of this change? In simple words: proper alignment avoids possible difference in data, seen by different virtual mapings: userspace and kernel in our case. I.e. userspace reads cache line A, kernel writes to cache line B. Both cache lines correspond to the same physical memory (thus aliases). So this should fix data corruption for archs with vivt and vipt caches, e.g. armv6. Personally I've never worked with this archs, I just spotted the strange difference in code: for one case we do alignment, for another - not. I have a strong feeling that David simply missed vmalloc_32_user() case. > > Is a -stable backport needed? No, I do not think so. The only one user of vmalloc_32_user() is virtual frame buffer device drivers/video/fbdev/vfb.c, which has in the description "The main use of this frame buffer device is testing and debugging the frame buffer subsystem. Do NOT enable it for normal systems!". And it seems to me that this vfb.c does not need 32bit addressable pages (vmalloc_32_user() case), because it is virtual device and should not care about things like dma32 zones, etc. Probably is better to clean the code and switch vfb.c from vmalloc_32_user() to vmalloc_user() case and wipe out vmalloc_32_user() from vmalloc.c completely. But I'm not very much sure that this is worth to do, that's so minor, so we can leave it as is. Link: http://lkml.kernel.org/r/20190108110944.23591-1-rpenyaev@suse.de Signed-off-by: Roman Penyaev Reviewed-by: Andrew Morton Cc: Stephen Rothwell Cc: Michal Hocko Cc: David S. Miller Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 806047d7fda3..cb827624c006 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1967,8 +1967,9 @@ void *vmalloc_32_user(unsigned long size) struct vm_struct *area; void *ret; - ret = __vmalloc_node(size, 1, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL, - NUMA_NO_NODE, __builtin_return_address(0)); + ret = __vmalloc_node(size, SHMLBA, GFP_VMALLOC32 | __GFP_ZERO, + PAGE_KERNEL, NUMA_NO_NODE, + __builtin_return_address(0)); if (ret) { area = find_vm_area(ret); area->flags |= VM_USERMAP; From 401592d2e095947344e10ec0623adbcd58934dd4 Mon Sep 17 00:00:00 2001 From: Roman Penyaev Date: Tue, 5 Mar 2019 15:43:20 -0800 Subject: [PATCH 033/159] mm/vmalloc: fix size check for remap_vmalloc_range_partial() When VM_NO_GUARD is not set area->size includes adjacent guard page, thus for correct size checking get_vm_area_size() should be used, but not area->size. This fixes possible kernel oops when userspace tries to mmap an area on 1 page bigger than was allocated by vmalloc_user() call: the size check inside remap_vmalloc_range_partial() accounts non-existing guard page also, so check successfully passes but vmalloc_to_page() returns NULL (guard page does not physically exist). The following code pattern example should trigger an oops: static int oops_mmap(struct file *file, struct vm_area_struct *vma) { void *mem; mem = vmalloc_user(4096); BUG_ON(!mem); /* Do not care about mem leak */ return remap_vmalloc_range(vma, mem, 0); } And userspace simply mmaps size + PAGE_SIZE: mmap(NULL, 8192, PROT_WRITE|PROT_READ, MAP_PRIVATE, fd, 0); Possible candidates for oops which do not have any explicit size checks: *** drivers/media/usb/stkwebcam/stk-webcam.c: v4l_stk_mmap[789] ret = remap_vmalloc_range(vma, sbuf->buffer, 0); Or the following one: *** drivers/video/fbdev/core/fbmem.c static int fb_mmap(struct file *file, struct vm_area_struct * vma) ... res = fb->fb_mmap(info, vma); Where fb_mmap callback calls remap_vmalloc_range() directly without any explicit checks: *** drivers/video/fbdev/vfb.c static int vfb_mmap(struct fb_info *info, struct vm_area_struct *vma) { return remap_vmalloc_range(vma, (void *)info->fix.smem_start, vma->vm_pgoff); } Link: http://lkml.kernel.org/r/20190103145954.16942-2-rpenyaev@suse.de Signed-off-by: Roman Penyaev Acked-by: Michal Hocko Cc: Andrey Ryabinin Cc: Joe Perches Cc: "Luis R. Rodriguez" Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index cb827624c006..421ae07ffb37 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2249,7 +2249,7 @@ int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr, if (!(area->flags & VM_USERMAP)) return -EINVAL; - if (kaddr + size > area->addr + area->size) + if (kaddr + size > area->addr + get_vm_area_size(area)) return -EINVAL; do { From c67dc6247576250a9c9f09adcabad0385a1e7d73 Mon Sep 17 00:00:00 2001 From: Roman Penyaev Date: Tue, 5 Mar 2019 15:43:24 -0800 Subject: [PATCH 034/159] mm/vmalloc: do not call kmemleak_free() on not yet accounted memory __vmalloc_area_node() calls vfree() on error path, which in turn calls kmemleak_free(), but area is not yet accounted by kmemleak_vmalloc(). Link: http://lkml.kernel.org/r/20190103145954.16942-3-rpenyaev@suse.de Signed-off-by: Roman Penyaev Reviewed-by: Andrew Morton Cc: Michal Hocko Cc: Andrey Ryabinin Cc: Joe Perches Cc: "Luis R. Rodriguez" Cc: Catalin Marinas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 421ae07ffb37..351ec73b3288 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1565,6 +1565,14 @@ void vfree_atomic(const void *addr) __vfree_deferred(addr); } +static void __vfree(const void *addr) +{ + if (unlikely(in_interrupt())) + __vfree_deferred(addr); + else + __vunmap(addr, 1); +} + /** * vfree - release memory allocated by vmalloc() * @addr: memory base address @@ -1591,10 +1599,8 @@ void vfree(const void *addr) if (!addr) return; - if (unlikely(in_interrupt())) - __vfree_deferred(addr); - else - __vunmap(addr, 1); + + __vfree(addr); } EXPORT_SYMBOL(vfree); @@ -1709,7 +1715,7 @@ fail: warn_alloc(gfp_mask, NULL, "vmalloc: allocation failure, allocated %ld of %ld bytes", (area->nr_pages*PAGE_SIZE), area->size); - vfree(area->addr); + __vfree(area->addr); return NULL; } From bc84c53525b4199317df1dab414263a68ba4b6f6 Mon Sep 17 00:00:00 2001 From: Roman Penyaev Date: Tue, 5 Mar 2019 15:43:27 -0800 Subject: [PATCH 035/159] mm/vmalloc: pass VM_USERMAP flags directly to __vmalloc_node_range() vmalloc_user*() calls differ from normal vmalloc() only in that they set VM_USERMAP flags for the area. During the whole history of vmalloc.c changes now it is possible simply to pass VM_USERMAP flags directly to __vmalloc_node_range() call instead of finding the area (which obviously takes time) after the allocation. Link: http://lkml.kernel.org/r/20190103145954.16942-4-rpenyaev@suse.de Signed-off-by: Roman Penyaev Acked-by: Michal Hocko Cc: Andrey Ryabinin Cc: Joe Perches Cc: "Luis R. Rodriguez" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 31 ++++++++----------------------- 1 file changed, 8 insertions(+), 23 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 351ec73b3288..895b2c522d86 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1865,18 +1865,10 @@ EXPORT_SYMBOL(vzalloc); */ void *vmalloc_user(unsigned long size) { - struct vm_struct *area; - void *ret; - - ret = __vmalloc_node(size, SHMLBA, - GFP_KERNEL | __GFP_ZERO, - PAGE_KERNEL, NUMA_NO_NODE, - __builtin_return_address(0)); - if (ret) { - area = find_vm_area(ret); - area->flags |= VM_USERMAP; - } - return ret; + return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END, + GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL, + VM_USERMAP, NUMA_NO_NODE, + __builtin_return_address(0)); } EXPORT_SYMBOL(vmalloc_user); @@ -1970,17 +1962,10 @@ EXPORT_SYMBOL(vmalloc_32); */ void *vmalloc_32_user(unsigned long size) { - struct vm_struct *area; - void *ret; - - ret = __vmalloc_node(size, SHMLBA, GFP_VMALLOC32 | __GFP_ZERO, - PAGE_KERNEL, NUMA_NO_NODE, - __builtin_return_address(0)); - if (ret) { - area = find_vm_area(ret); - area->flags |= VM_USERMAP; - } - return ret; + return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END, + GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL, + VM_USERMAP, NUMA_NO_NODE, + __builtin_return_address(0)); } EXPORT_SYMBOL(vmalloc_32_user); From 153178edc7819b5c550e5d498d50697ff9d5f223 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Tue, 5 Mar 2019 15:43:30 -0800 Subject: [PATCH 036/159] vmalloc: export __vmalloc_node_range for CONFIG_TEST_VMALLOC_MODULE Export __vmaloc_node_range() function if CONFIG_TEST_VMALLOC_MODULE is enabled. Some test cases in vmalloc test suite module require and make use of that function. Please note, that it is not supposed to be used for other purposes. We need it only for performance analysis, stressing and stability check of vmalloc allocator. Link: http://lkml.kernel.org/r/20190103142108.20744-2-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Reviewed-by: Andrew Morton Cc: Michal Hocko Cc: Kees Cook Cc: Matthew Wilcox Cc: Shuah Khan Cc: Oleksiy Avramchenko Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 895b2c522d86..e83961767dc1 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1774,6 +1774,15 @@ fail: return NULL; } +/* + * This is only for performance analysis of vmalloc and stress purpose. + * It is required by vmalloc test module, therefore do not use it other + * than that. + */ +#ifdef CONFIG_TEST_VMALLOC_MODULE +EXPORT_SYMBOL_GPL(__vmalloc_node_range); +#endif + /** * __vmalloc_node - allocate virtually contiguous memory * @size: allocation size From 3f21a6b7ef207892841feecc3b9216e1a29c745f Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Tue, 5 Mar 2019 15:43:34 -0800 Subject: [PATCH 037/159] vmalloc: add test driver to analyse vmalloc allocator This adds a new kernel module for analysis of vmalloc allocator. It is only enabled as a module. There are two main reasons this module should be used for: performance evaluation and stressing of vmalloc subsystem. It consists of several test cases. As of now there are 8. The module has five parameters we can specify to change its the behaviour. 1) run_test_mask - set of tests to be run id: 1, name: fix_size_alloc_test id: 2, name: full_fit_alloc_test id: 4, name: long_busy_list_alloc_test id: 8, name: random_size_alloc_test id: 16, name: fix_align_alloc_test id: 32, name: random_size_align_alloc_test id: 64, name: align_shift_alloc_test id: 128, name: pcpu_alloc_test By default all tests are in run test mask. If you want to select some specific tests it is possible to pass the mask. For example for first, second and fourth tests we go 11 value. 2) test_repeat_count - how many times each test should be repeated By default it is one time per test. It is possible to pass any number. As high the value is the test duration gets increased. 3) test_loop_count - internal test loop counter. By default it is set to 1000000. 4) single_cpu_test - use one CPU to run the tests By default this parameter is set to false. It means that all online CPUs execute tests. By setting it to 1, the tests are executed by first online CPU only. 5) sequential_test_order - run tests in sequential order By default this parameter is set to false. It means that before running tests the order is shuffled. It is possible to make it sequential, just set it to 1. Performance analysis: In order to evaluate performance of vmalloc allocations, usually it makes sense to use only one CPU that runs tests, use sequential order, number of repeat tests can be different as well as set of test mask. For example if we want to run all tests, to use one CPU and repeat each test 3 times. Insert the module passing following parameters: single_cpu_test=1 sequential_test_order=1 test_repeat_count=3 with following output: Summary: fix_size_alloc_test passed: 3 failed: 0 repeat: 3 loops: 1000000 avg: 901177 usec Summary: full_fit_alloc_test passed: 3 failed: 0 repeat: 3 loops: 1000000 avg: 1039341 usec Summary: long_busy_list_alloc_test passed: 3 failed: 0 repeat: 3 loops: 1000000 avg: 11775763 usec Summary: random_size_alloc_test passed 3: failed: 0 repeat: 3 loops: 1000000 avg: 6081992 usec Summary: fix_align_alloc_test passed: 3 failed: 0 repeat: 3, loops: 1000000 avg: 2003712 usec Summary: random_size_align_alloc_test passed: 3 failed: 0 repeat: 3 loops: 1000000 avg: 2895689 usec Summary: align_shift_alloc_test passed: 0 failed: 3 repeat: 3 loops: 1000000 avg: 573 usec Summary: pcpu_alloc_test passed: 3 failed: 0 repeat: 3 loops: 1000000 avg: 95802 usec All test took CPU0=192945605995 cycles The align_shift_alloc_test is expected to be failed. Stressing: In order to stress the vmalloc subsystem we run all available test cases on all available CPUs simultaneously. In order to prevent constant behaviour pattern, the test cases array is shuffled by default to randomize the order of test execution. For example if we want to run all tests(default), use all online CPUs(default) with shuffled order(default) and to repeat each test 30 times. The command would be like: modprobe vmalloc_test test_repeat_count=30 Expected results are the system is alive, there are no any BUG_ONs or Kernel Panics the tests are completed, no memory leaks. [urezki@gmail.com: fix 32-bit builds] Link: http://lkml.kernel.org/r/20190106214839.ffvjvmrn52uqog7k@pc636 [urezki@gmail.com: make CONFIG_TEST_VMALLOC depend on CONFIG_MMU] Link: http://lkml.kernel.org/r/20190219085441.s6bg2gpy4esny5vw@pc636 Link: http://lkml.kernel.org/r/20190103142108.20744-3-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Cc: Kees Cook Cc: Matthew Wilcox Cc: Michal Hocko Cc: Oleksiy Avramchenko Cc: Shuah Khan Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/Kconfig.debug | 13 ++ lib/Makefile | 1 + lib/test_vmalloc.c | 551 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 565 insertions(+) create mode 100644 lib/test_vmalloc.c diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index a219f3488ad7..48f584393e28 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1875,6 +1875,19 @@ config TEST_LKM If unsure, say N. +config TEST_VMALLOC + tristate "Test module for stress/performance analysis of vmalloc allocator" + default n + depends on MMU + depends on m + help + This builds the "test_vmalloc" module that should be used for + stress and performance analysis. So, any new change for vmalloc + subsystem can be evaluated from performance and stability point + of view. + + If unsure, say N. + config TEST_USER_COPY tristate "Test user/kernel boundary protections" depends on m diff --git a/lib/Makefile b/lib/Makefile index e1b59da71418..cbfacd55aeca 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -60,6 +60,7 @@ UBSAN_SANITIZE_test_ubsan.o := y obj-$(CONFIG_TEST_KSTRTOX) += test-kstrtox.o obj-$(CONFIG_TEST_LIST_SORT) += test_list_sort.o obj-$(CONFIG_TEST_LKM) += test_module.o +obj-$(CONFIG_TEST_VMALLOC) += test_vmalloc.o obj-$(CONFIG_TEST_OVERFLOW) += test_overflow.o obj-$(CONFIG_TEST_RHASHTABLE) += test_rhashtable.o obj-$(CONFIG_TEST_SORT) += test_sort.o diff --git a/lib/test_vmalloc.c b/lib/test_vmalloc.c new file mode 100644 index 000000000000..83cdcaa82bf6 --- /dev/null +++ b/lib/test_vmalloc.c @@ -0,0 +1,551 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Test module for stress and analyze performance of vmalloc allocator. + * (C) 2018 Uladzislau Rezki (Sony) + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define __param(type, name, init, msg) \ + static type name = init; \ + module_param(name, type, 0444); \ + MODULE_PARM_DESC(name, msg) \ + +__param(bool, single_cpu_test, false, + "Use single first online CPU to run tests"); + +__param(bool, sequential_test_order, false, + "Use sequential stress tests order"); + +__param(int, test_repeat_count, 1, + "Set test repeat counter"); + +__param(int, test_loop_count, 1000000, + "Set test loop counter"); + +__param(int, run_test_mask, INT_MAX, + "Set tests specified in the mask.\n\n" + "\t\tid: 1, name: fix_size_alloc_test\n" + "\t\tid: 2, name: full_fit_alloc_test\n" + "\t\tid: 4, name: long_busy_list_alloc_test\n" + "\t\tid: 8, name: random_size_alloc_test\n" + "\t\tid: 16, name: fix_align_alloc_test\n" + "\t\tid: 32, name: random_size_align_alloc_test\n" + "\t\tid: 64, name: align_shift_alloc_test\n" + "\t\tid: 128, name: pcpu_alloc_test\n" + /* Add a new test case description here. */ +); + +/* + * Depends on single_cpu_test parameter. If it is true, then + * use first online CPU to trigger a test on, otherwise go with + * all online CPUs. + */ +static cpumask_t cpus_run_test_mask = CPU_MASK_NONE; + +/* + * Read write semaphore for synchronization of setup + * phase that is done in main thread and workers. + */ +static DECLARE_RWSEM(prepare_for_test_rwsem); + +/* + * Completion tracking for worker threads. + */ +static DECLARE_COMPLETION(test_all_done_comp); +static atomic_t test_n_undone = ATOMIC_INIT(0); + +static inline void +test_report_one_done(void) +{ + if (atomic_dec_and_test(&test_n_undone)) + complete(&test_all_done_comp); +} + +static int random_size_align_alloc_test(void) +{ + unsigned long size, align, rnd; + void *ptr; + int i; + + for (i = 0; i < test_loop_count; i++) { + get_random_bytes(&rnd, sizeof(rnd)); + + /* + * Maximum 1024 pages, if PAGE_SIZE is 4096. + */ + align = 1 << (rnd % 23); + + /* + * Maximum 10 pages. + */ + size = ((rnd % 10) + 1) * PAGE_SIZE; + + ptr = __vmalloc_node_range(size, align, + VMALLOC_START, VMALLOC_END, + GFP_KERNEL | __GFP_ZERO, + PAGE_KERNEL, + 0, 0, __builtin_return_address(0)); + + if (!ptr) + return -1; + + vfree(ptr); + } + + return 0; +} + +/* + * This test case is supposed to be failed. + */ +static int align_shift_alloc_test(void) +{ + unsigned long align; + void *ptr; + int i; + + for (i = 0; i < BITS_PER_LONG; i++) { + align = ((unsigned long) 1) << i; + + ptr = __vmalloc_node_range(PAGE_SIZE, align, + VMALLOC_START, VMALLOC_END, + GFP_KERNEL | __GFP_ZERO, + PAGE_KERNEL, + 0, 0, __builtin_return_address(0)); + + if (!ptr) + return -1; + + vfree(ptr); + } + + return 0; +} + +static int fix_align_alloc_test(void) +{ + void *ptr; + int i; + + for (i = 0; i < test_loop_count; i++) { + ptr = __vmalloc_node_range(5 * PAGE_SIZE, + THREAD_ALIGN << 1, + VMALLOC_START, VMALLOC_END, + GFP_KERNEL | __GFP_ZERO, + PAGE_KERNEL, + 0, 0, __builtin_return_address(0)); + + if (!ptr) + return -1; + + vfree(ptr); + } + + return 0; +} + +static int random_size_alloc_test(void) +{ + unsigned int n; + void *p; + int i; + + for (i = 0; i < test_loop_count; i++) { + get_random_bytes(&n, sizeof(i)); + n = (n % 100) + 1; + + p = vmalloc(n * PAGE_SIZE); + + if (!p) + return -1; + + *((__u8 *)p) = 1; + vfree(p); + } + + return 0; +} + +static int long_busy_list_alloc_test(void) +{ + void *ptr_1, *ptr_2; + void **ptr; + int rv = -1; + int i; + + ptr = vmalloc(sizeof(void *) * 15000); + if (!ptr) + return rv; + + for (i = 0; i < 15000; i++) + ptr[i] = vmalloc(1 * PAGE_SIZE); + + for (i = 0; i < test_loop_count; i++) { + ptr_1 = vmalloc(100 * PAGE_SIZE); + if (!ptr_1) + goto leave; + + ptr_2 = vmalloc(1 * PAGE_SIZE); + if (!ptr_2) { + vfree(ptr_1); + goto leave; + } + + *((__u8 *)ptr_1) = 0; + *((__u8 *)ptr_2) = 1; + + vfree(ptr_1); + vfree(ptr_2); + } + + /* Success */ + rv = 0; + +leave: + for (i = 0; i < 15000; i++) + vfree(ptr[i]); + + vfree(ptr); + return rv; +} + +static int full_fit_alloc_test(void) +{ + void **ptr, **junk_ptr, *tmp; + int junk_length; + int rv = -1; + int i; + + junk_length = fls(num_online_cpus()); + junk_length *= (32 * 1024 * 1024 / PAGE_SIZE); + + ptr = vmalloc(sizeof(void *) * junk_length); + if (!ptr) + return rv; + + junk_ptr = vmalloc(sizeof(void *) * junk_length); + if (!junk_ptr) { + vfree(ptr); + return rv; + } + + for (i = 0; i < junk_length; i++) { + ptr[i] = vmalloc(1 * PAGE_SIZE); + junk_ptr[i] = vmalloc(1 * PAGE_SIZE); + } + + for (i = 0; i < junk_length; i++) + vfree(junk_ptr[i]); + + for (i = 0; i < test_loop_count; i++) { + tmp = vmalloc(1 * PAGE_SIZE); + + if (!tmp) + goto error; + + *((__u8 *)tmp) = 1; + vfree(tmp); + } + + /* Success */ + rv = 0; + +error: + for (i = 0; i < junk_length; i++) + vfree(ptr[i]); + + vfree(ptr); + vfree(junk_ptr); + + return rv; +} + +static int fix_size_alloc_test(void) +{ + void *ptr; + int i; + + for (i = 0; i < test_loop_count; i++) { + ptr = vmalloc(3 * PAGE_SIZE); + + if (!ptr) + return -1; + + *((__u8 *)ptr) = 0; + + vfree(ptr); + } + + return 0; +} + +static int +pcpu_alloc_test(void) +{ + int rv = 0; +#ifndef CONFIG_NEED_PER_CPU_KM + void __percpu **pcpu; + size_t size, align; + int i; + + pcpu = vmalloc(sizeof(void __percpu *) * 35000); + if (!pcpu) + return -1; + + for (i = 0; i < 35000; i++) { + unsigned int r; + + get_random_bytes(&r, sizeof(i)); + size = (r % (PAGE_SIZE / 4)) + 1; + + /* + * Maximum PAGE_SIZE + */ + get_random_bytes(&r, sizeof(i)); + align = 1 << ((i % 11) + 1); + + pcpu[i] = __alloc_percpu(size, align); + if (!pcpu[i]) + rv = -1; + } + + for (i = 0; i < 35000; i++) + free_percpu(pcpu[i]); + + vfree(pcpu); +#endif + return rv; +} + +struct test_case_desc { + const char *test_name; + int (*test_func)(void); +}; + +static struct test_case_desc test_case_array[] = { + { "fix_size_alloc_test", fix_size_alloc_test }, + { "full_fit_alloc_test", full_fit_alloc_test }, + { "long_busy_list_alloc_test", long_busy_list_alloc_test }, + { "random_size_alloc_test", random_size_alloc_test }, + { "fix_align_alloc_test", fix_align_alloc_test }, + { "random_size_align_alloc_test", random_size_align_alloc_test }, + { "align_shift_alloc_test", align_shift_alloc_test }, + { "pcpu_alloc_test", pcpu_alloc_test }, + /* Add a new test case here. */ +}; + +struct test_case_data { + int test_failed; + int test_passed; + u64 time; +}; + +/* Split it to get rid of: WARNING: line over 80 characters */ +static struct test_case_data + per_cpu_test_data[NR_CPUS][ARRAY_SIZE(test_case_array)]; + +static struct test_driver { + struct task_struct *task; + unsigned long start; + unsigned long stop; + int cpu; +} per_cpu_test_driver[NR_CPUS]; + +static void shuffle_array(int *arr, int n) +{ + unsigned int rnd; + int i, j, x; + + for (i = n - 1; i > 0; i--) { + get_random_bytes(&rnd, sizeof(rnd)); + + /* Cut the range. */ + j = rnd % i; + + /* Swap indexes. */ + x = arr[i]; + arr[i] = arr[j]; + arr[j] = x; + } +} + +static int test_func(void *private) +{ + struct test_driver *t = private; + cpumask_t newmask = CPU_MASK_NONE; + int random_array[ARRAY_SIZE(test_case_array)]; + int index, i, j, ret; + ktime_t kt; + u64 delta; + + cpumask_set_cpu(t->cpu, &newmask); + set_cpus_allowed_ptr(current, &newmask); + + for (i = 0; i < ARRAY_SIZE(test_case_array); i++) + random_array[i] = i; + + if (!sequential_test_order) + shuffle_array(random_array, ARRAY_SIZE(test_case_array)); + + /* + * Block until initialization is done. + */ + down_read(&prepare_for_test_rwsem); + + t->start = get_cycles(); + for (i = 0; i < ARRAY_SIZE(test_case_array); i++) { + index = random_array[i]; + + /* + * Skip tests if run_test_mask has been specified. + */ + if (!((run_test_mask & (1 << index)) >> index)) + continue; + + kt = ktime_get(); + for (j = 0; j < test_repeat_count; j++) { + ret = test_case_array[index].test_func(); + if (!ret) + per_cpu_test_data[t->cpu][index].test_passed++; + else + per_cpu_test_data[t->cpu][index].test_failed++; + } + + /* + * Take an average time that test took. + */ + delta = (u64) ktime_us_delta(ktime_get(), kt); + do_div(delta, (u32) test_repeat_count); + + per_cpu_test_data[t->cpu][index].time = delta; + } + t->stop = get_cycles(); + + up_read(&prepare_for_test_rwsem); + test_report_one_done(); + + /* + * Wait for the kthread_stop() call. + */ + while (!kthread_should_stop()) + msleep(10); + + return 0; +} + +static void +init_test_configurtion(void) +{ + /* + * Reset all data of all CPUs. + */ + memset(per_cpu_test_data, 0, sizeof(per_cpu_test_data)); + + if (single_cpu_test) + cpumask_set_cpu(cpumask_first(cpu_online_mask), + &cpus_run_test_mask); + else + cpumask_and(&cpus_run_test_mask, cpu_online_mask, + cpu_online_mask); + + if (test_repeat_count <= 0) + test_repeat_count = 1; + + if (test_loop_count <= 0) + test_loop_count = 1; +} + +static void do_concurrent_test(void) +{ + int cpu, ret; + + /* + * Set some basic configurations plus sanity check. + */ + init_test_configurtion(); + + /* + * Put on hold all workers. + */ + down_write(&prepare_for_test_rwsem); + + for_each_cpu(cpu, &cpus_run_test_mask) { + struct test_driver *t = &per_cpu_test_driver[cpu]; + + t->cpu = cpu; + t->task = kthread_run(test_func, t, "vmalloc_test/%d", cpu); + + if (!IS_ERR(t->task)) + /* Success. */ + atomic_inc(&test_n_undone); + else + pr_err("Failed to start kthread for %d CPU\n", cpu); + } + + /* + * Now let the workers do their job. + */ + up_write(&prepare_for_test_rwsem); + + /* + * Sleep quiet until all workers are done with 1 second + * interval. Since the test can take a lot of time we + * can run into a stack trace of the hung task. That is + * why we go with completion_timeout and HZ value. + */ + do { + ret = wait_for_completion_timeout(&test_all_done_comp, HZ); + } while (!ret); + + for_each_cpu(cpu, &cpus_run_test_mask) { + struct test_driver *t = &per_cpu_test_driver[cpu]; + int i; + + if (!IS_ERR(t->task)) + kthread_stop(t->task); + + for (i = 0; i < ARRAY_SIZE(test_case_array); i++) { + if (!((run_test_mask & (1 << i)) >> i)) + continue; + + pr_info( + "Summary: %s passed: %d failed: %d repeat: %d loops: %d avg: %llu usec\n", + test_case_array[i].test_name, + per_cpu_test_data[cpu][i].test_passed, + per_cpu_test_data[cpu][i].test_failed, + test_repeat_count, test_loop_count, + per_cpu_test_data[cpu][i].time); + } + + pr_info("All test took CPU%d=%lu cycles\n", + cpu, t->stop - t->start); + } +} + +static int vmalloc_test_init(void) +{ + do_concurrent_test(); + return -EAGAIN; /* Fail will directly unload the module */ +} + +static void vmalloc_test_exit(void) +{ +} + +module_init(vmalloc_test_init) +module_exit(vmalloc_test_exit) + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Uladzislau Rezki"); +MODULE_DESCRIPTION("vmalloc test module"); From a05ef00c97900f69f6e69d88e8a657b7a4ef8cbd Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Tue, 5 Mar 2019 15:43:37 -0800 Subject: [PATCH 038/159] selftests/vm: add script helper for CONFIG_TEST_VMALLOC_MODULE Add the test script for the kernel test driver to analyse vmalloc allocator for benchmarking and stressing purposes. It is just a kernel module loader. You can specify and pass different parameters in order to investigate allocations behaviour. See "usage" output for more details. Also add basic vmalloc smoke test to the "run_vmtests" suite. Link: http://lkml.kernel.org/r/20190103142108.20744-4-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Reviewed-by: Shuah Khan Cc: Kees Cook Cc: Matthew Wilcox Cc: Michal Hocko Cc: Oleksiy Avramchenko Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/run_vmtests | 16 ++ tools/testing/selftests/vm/test_vmalloc.sh | 176 +++++++++++++++++++++ 2 files changed, 192 insertions(+) create mode 100644 tools/testing/selftests/vm/test_vmalloc.sh diff --git a/tools/testing/selftests/vm/run_vmtests b/tools/testing/selftests/vm/run_vmtests index 584a91ae4a8f..951c507a27f7 100755 --- a/tools/testing/selftests/vm/run_vmtests +++ b/tools/testing/selftests/vm/run_vmtests @@ -211,4 +211,20 @@ else echo "[PASS]" fi +echo "------------------------------------" +echo "running vmalloc stability smoke test" +echo "------------------------------------" +./test_vmalloc.sh smoke +ret_val=$? + +if [ $ret_val -eq 0 ]; then + echo "[PASS]" +elif [ $ret_val -eq $ksft_skip ]; then + echo "[SKIP]" + exitcode=$ksft_skip +else + echo "[FAIL]" + exitcode=1 +fi + exit $exitcode diff --git a/tools/testing/selftests/vm/test_vmalloc.sh b/tools/testing/selftests/vm/test_vmalloc.sh new file mode 100644 index 000000000000..06d2bb109f06 --- /dev/null +++ b/tools/testing/selftests/vm/test_vmalloc.sh @@ -0,0 +1,176 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Copyright (C) 2018 Uladzislau Rezki (Sony) +# +# This is a test script for the kernel test driver to analyse vmalloc +# allocator. Therefore it is just a kernel module loader. You can specify +# and pass different parameters in order to: +# a) analyse performance of vmalloc allocations; +# b) stressing and stability check of vmalloc subsystem. + +TEST_NAME="vmalloc" +DRIVER="test_${TEST_NAME}" + +# 1 if fails +exitcode=1 + +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + +# +# Static templates for performance, stressing and smoke tests. +# Also it is possible to pass any supported parameters manualy. +# +PERF_PARAM="single_cpu_test=1 sequential_test_order=1 test_repeat_count=3" +SMOKE_PARAM="single_cpu_test=1 test_loop_count=10000 test_repeat_count=10" +STRESS_PARAM="test_repeat_count=20" + +check_test_requirements() +{ + uid=$(id -u) + if [ $uid -ne 0 ]; then + echo "$0: Must be run as root" + exit $ksft_skip + fi + + if ! which modprobe > /dev/null 2>&1; then + echo "$0: You need modprobe installed" + exit $ksft_skip + fi + + if ! modinfo $DRIVER > /dev/null 2>&1; then + echo "$0: You must have the following enabled in your kernel:" + echo "CONFIG_TEST_VMALLOC=m" + exit $ksft_skip + fi +} + +run_perfformance_check() +{ + echo "Run performance tests to evaluate how fast vmalloc allocation is." + echo "It runs all test cases on one single CPU with sequential order." + + modprobe $DRIVER $PERF_PARAM > /dev/null 2>&1 + echo "Done." + echo "Ccheck the kernel message buffer to see the summary." +} + +run_stability_check() +{ + echo "Run stability tests. In order to stress vmalloc subsystem we run" + echo "all available test cases on all available CPUs simultaneously." + echo "It will take time, so be patient." + + modprobe $DRIVER $STRESS_PARAM > /dev/null 2>&1 + echo "Done." + echo "Check the kernel ring buffer to see the summary." +} + +run_smoke_check() +{ + echo "Run smoke test. Note, this test provides basic coverage." + echo "Please check $0 output how it can be used" + echo "for deep performance analysis as well as stress testing." + + modprobe $DRIVER $SMOKE_PARAM > /dev/null 2>&1 + echo "Done." + echo "Check the kernel ring buffer to see the summary." +} + +usage() +{ + echo -n "Usage: $0 [ performance ] | [ stress ] | | [ smoke ] | " + echo "manual parameters" + echo + echo "Valid tests and parameters:" + echo + modinfo $DRIVER + echo + echo "Example usage:" + echo + echo "# Shows help message" + echo "./${DRIVER}.sh" + echo + echo "# Runs 1 test(id_1), repeats it 5 times on all online CPUs" + echo "./${DRIVER}.sh run_test_mask=1 test_repeat_count=5" + echo + echo -n "# Runs 4 tests(id_1|id_2|id_4|id_16) on one CPU with " + echo "sequential order" + echo -n "./${DRIVER}.sh single_cpu_test=1 sequential_test_order=1 " + echo "run_test_mask=23" + echo + echo -n "# Runs all tests on all online CPUs, shuffled order, repeats " + echo "20 times" + echo "./${DRIVER}.sh test_repeat_count=20" + echo + echo "# Performance analysis" + echo "./${DRIVER}.sh performance" + echo + echo "# Stress testing" + echo "./${DRIVER}.sh stress" + echo + exit 0 +} + +function validate_passed_args() +{ + VALID_ARGS=`modinfo $DRIVER | awk '/parm:/ {print $2}' | sed 's/:.*//'` + + # + # Something has been passed, check it. + # + for passed_arg in $@; do + key=${passed_arg//=*/} + val="${passed_arg:$((${#key}+1))}" + valid=0 + + for valid_arg in $VALID_ARGS; do + if [[ $key = $valid_arg ]] && [[ $val -gt 0 ]]; then + valid=1 + break + fi + done + + if [[ $valid -ne 1 ]]; then + echo "Error: key or value is not correct: ${key} $val" + exit $exitcode + fi + done +} + +function run_manual_check() +{ + # + # Validate passed parameters. If there is wrong one, + # the script exists and does not execute further. + # + validate_passed_args $@ + + echo "Run the test with following parameters: $@" + modprobe $DRIVER $@ > /dev/null 2>&1 + echo "Done." + echo "Check the kernel ring buffer to see the summary." +} + +function run_test() +{ + if [ $# -eq 0 ]; then + usage + else + if [[ "$1" = "performance" ]]; then + run_perfformance_check + elif [[ "$1" = "stress" ]]; then + run_stability_check + elif [[ "$1" = "smoke" ]]; then + run_smoke_check + else + run_manual_check $@ + fi + fi +} + +check_test_requirements +run_test $@ + +exit 0 From 6b7e5cad651a2b1031a4c69a98f87e3532dd4cef Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 5 Mar 2019 15:43:41 -0800 Subject: [PATCH 039/159] mm: remove sysctl_extfrag_handler() sysctl_extfrag_handler() neglects to propagate the return value from proc_dointvec_minmax() to its caller. It's a wrapper that doesn't need to exist, so just use proc_dointvec_minmax() directly. Link: http://lkml.kernel.org/r/20190104032557.3056-1-willy@infradead.org Signed-off-by: Matthew Wilcox Reported-by: Aditya Pakki Acked-by: Mel Gorman Acked-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 2 -- kernel/sysctl.c | 2 +- mm/compaction.c | 8 -------- 3 files changed, 1 insertion(+), 11 deletions(-) diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 68250a57aace..70d0256edd31 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -88,8 +88,6 @@ extern int sysctl_compact_memory; extern int sysctl_compaction_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos); extern int sysctl_extfrag_threshold; -extern int sysctl_extfrag_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos); extern int sysctl_compact_unevictable_allowed; extern int fragmentation_index(struct zone *zone, unsigned int order); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 7578e21a711b..9c78e06f7ba4 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1460,7 +1460,7 @@ static struct ctl_table vm_table[] = { .data = &sysctl_extfrag_threshold, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = sysctl_extfrag_handler, + .proc_handler = proc_dointvec_minmax, .extra1 = &min_extfrag_threshold, .extra2 = &max_extfrag_threshold, }, diff --git a/mm/compaction.c b/mm/compaction.c index ef29490b0f46..c15b4bbc9e9e 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1876,14 +1876,6 @@ int sysctl_compaction_handler(struct ctl_table *table, int write, return 0; } -int sysctl_extfrag_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) -{ - proc_dointvec_minmax(table, write, buffer, length, ppos); - - return 0; -} - #if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA) static ssize_t sysfs_compact_node(struct device *dev, struct device_attribute *attr, From 7ed2c31dabdeb3ee6abe8ff5aac7287821a50cba Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Tue, 5 Mar 2019 15:43:44 -0800 Subject: [PATCH 040/159] mm/hugetlb: distinguish between migratability and movability Patch series "arm64/mm: Enable HugeTLB migration", v4. This patch series enables HugeTLB migration support for all supported huge page sizes at all levels including contiguous bit implementation. Following HugeTLB migration support matrix has been enabled with this patch series. All permutations have been tested except for the 16GB. CONT PTE PMD CONT PMD PUD -------- --- -------- --- 4K: 64K 2M 32M 1G 16K: 2M 32M 1G 64K: 2M 512M 16G First the series adds migration support for PUD based huge pages. It then adds a platform specific hook to query an architecture if a given huge page size is supported for migration while also providing a default fallback option preserving the existing semantics which just checks for (PMD|PUD|PGDIR)_SHIFT macros. The last two patches enables HugeTLB migration on arm64 and subscribe to this new platform specific hook by defining an override. The second patch differentiates between movability and migratability aspects of huge pages and implements hugepage_movable_supported() which can then be used during allocation to decide whether to place the huge page in movable zone or not. This patch (of 5): During huge page allocation it's migratability is checked to determine if it should be placed under movable zones with GFP_HIGHUSER_MOVABLE. But the movability aspect of the huge page could depend on other factors than just migratability. Movability in itself is a distinct property which should not be tied with migratability alone. This differentiates these two and implements an enhanced movability check which also considers huge page size to determine if it is feasible to be placed under a movable zone. At present it just checks for gigantic pages but going forward it can incorporate other enhanced checks. Link: http://lkml.kernel.org/r/1545121450-1663-2-git-send-email-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Reviewed-by: Steve Capper Reviewed-by: Naoya Horiguchi Suggested-by: Michal Hocko Acked-by: Michal Hocko Cc: Catalin Marinas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 30 ++++++++++++++++++++++++++++++ mm/hugetlb.c | 2 +- mm/migrate.c | 2 +- 3 files changed, 32 insertions(+), 2 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 087fd5f48c91..1b858d795731 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -506,6 +506,31 @@ static inline bool hugepage_migration_supported(struct hstate *h) #endif } +/* + * Movability check is different as compared to migration check. + * It determines whether or not a huge page should be placed on + * movable zone or not. Movability of any huge page should be + * required only if huge page size is supported for migration. + * There wont be any reason for the huge page to be movable if + * it is not migratable to start with. Also the size of the huge + * page should be large enough to be placed under a movable zone + * and still feasible enough to be migratable. Just the presence + * in movable zone does not make the migration feasible. + * + * So even though large huge page sizes like the gigantic ones + * are migratable they should not be movable because its not + * feasible to migrate them from movable zone. + */ +static inline bool hugepage_movable_supported(struct hstate *h) +{ + if (!hugepage_migration_supported(h)) + return false; + + if (hstate_is_gigantic(h)) + return false; + return true; +} + static inline spinlock_t *huge_pte_lockptr(struct hstate *h, struct mm_struct *mm, pte_t *pte) { @@ -602,6 +627,11 @@ static inline bool hugepage_migration_supported(struct hstate *h) return false; } +static inline bool hugepage_movable_supported(struct hstate *h) +{ + return false; +} + static inline spinlock_t *huge_pte_lockptr(struct hstate *h, struct mm_struct *mm, pte_t *pte) { diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 3c504fa6b460..2fb3062a3595 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -920,7 +920,7 @@ retry_cpuset: /* Movability of hugepages depends on migration support. */ static inline gfp_t htlb_alloc_mask(struct hstate *h) { - if (hugepage_migration_supported(h)) + if (hugepage_movable_supported(h)) return GFP_HIGHUSER_MOVABLE; else return GFP_HIGHUSER; diff --git a/mm/migrate.c b/mm/migrate.c index 181f5d2718a9..0413596fc523 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1287,7 +1287,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, struct anon_vma *anon_vma = NULL; /* - * Movability of hugepages depends on architectures and hugepage size. + * Migratability of hugepages depends on architectures and their size. * This check is necessary because some callers of hugepage migration * like soft offline and memory hotremove don't walk through page * tables or check whether the hugepage is pmd-based or not before From 9b553bf5eb99dd1b2d8ae23136da46da5c205dfd Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Tue, 5 Mar 2019 15:43:48 -0800 Subject: [PATCH 041/159] mm/hugetlb: enable PUD level huge page migration Architectures like arm64 have PUD level HugeTLB pages for certain configs (1GB huge page is PUD based on ARM64_4K_PAGES base page size) that can be enabled for migration. It can be achieved through checking for PUD_SHIFT order based HugeTLB pages during migration. Link: http://lkml.kernel.org/r/1545121450-1663-3-git-send-email-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Reviewed-by: Naoya Horiguchi Reviewed-by: Steve Capper Acked-by: Michal Hocko Cc: Catalin Marinas Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 1b858d795731..70bcd8973323 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -497,7 +497,8 @@ static inline bool hugepage_migration_supported(struct hstate *h) { #ifdef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION if ((huge_page_shift(h) == PMD_SHIFT) || - (huge_page_shift(h) == PGDIR_SHIFT)) + (huge_page_shift(h) == PUD_SHIFT) || + (huge_page_shift(h) == PGDIR_SHIFT)) return true; else return false; From e693de186414ae66f2a316ff9befcd2b7a6d07b6 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Tue, 5 Mar 2019 15:43:51 -0800 Subject: [PATCH 042/159] mm/hugetlb: enable arch specific huge page size support for migration Architectures like arm64 have HugeTLB page sizes which are different than generic sizes at PMD, PUD, PGD level and implemented via contiguous bits. At present these special size HugeTLB pages cannot be identified through macros like (PMD|PUD|PGDIR)_SHIFT and hence chosen not be migrated. Enabling migration support for these special HugeTLB page sizes along with the generic ones (PMD|PUD|PGD) would require identifying all of them on a given platform. A platform specific hook can precisely enumerate all huge page sizes supported for migration. Instead of comparing against standard huge page orders let hugetlb_migration_support() function call a platform hook arch_hugetlb_migration_support(). Default definition for the platform hook maintains existing semantics which checks standard huge page order. But an architecture can choose to override the default and provide support for a comprehensive set of huge page sizes. Link: http://lkml.kernel.org/r/1545121450-1663-4-git-send-email-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Reviewed-by: Naoya Horiguchi Reviewed-by: Steve Capper Acked-by: Michal Hocko Cc: Catalin Marinas Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 70bcd8973323..4cc3871b65fc 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -493,18 +493,29 @@ static inline pgoff_t basepage_index(struct page *page) extern int dissolve_free_huge_page(struct page *page); extern int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn); -static inline bool hugepage_migration_supported(struct hstate *h) -{ + #ifdef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION +#ifndef arch_hugetlb_migration_supported +static inline bool arch_hugetlb_migration_supported(struct hstate *h) +{ if ((huge_page_shift(h) == PMD_SHIFT) || (huge_page_shift(h) == PUD_SHIFT) || (huge_page_shift(h) == PGDIR_SHIFT)) return true; else return false; -#else - return false; +} #endif +#else +static inline bool arch_hugetlb_migration_supported(struct hstate *h) +{ + return false; +} +#endif + +static inline bool hugepage_migration_supported(struct hstate *h) +{ + return arch_hugetlb_migration_supported(h); } /* From 4a03a058d1fe7558faffab1a831dde508501e85c Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Tue, 5 Mar 2019 15:43:55 -0800 Subject: [PATCH 043/159] arm64/mm: enable HugeTLB migration Let arm64 subscribe to generic HugeTLB page migration framework. Right now this only works on the following PMD and PUD level HugeTLB page sizes with various kernel base page size combinations. CONT PTE PMD CONT PMD PUD -------- --- -------- --- 4K: NA 2M NA 1G 16K: NA 32M NA 64K: NA 512M NA Link: http://lkml.kernel.org/r/1545121450-1663-5-git-send-email-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Reviewed-by: Naoya Horiguchi Reviewed-by: Steve Capper Acked-by: Catalin Marinas Cc: Michal Hocko Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/Kconfig | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index a4168d366127..cfbf307d6dc4 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1467,6 +1467,10 @@ config SYSVIPC_COMPAT def_bool y depends on COMPAT && SYSVIPC +config ARCH_ENABLE_HUGEPAGE_MIGRATION + def_bool y + depends on HUGETLB_PAGE && MIGRATION + menu "Power management options" source "kernel/power/Kconfig" From 5480280d3f2d11d47f9be59d49b20a8d7d1b33e8 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Tue, 5 Mar 2019 15:43:58 -0800 Subject: [PATCH 044/159] arm64/mm: enable HugeTLB migration for contiguous bit HugeTLB pages Let arm64 subscribe to the previously added framework in which architecture can inform whether a given huge page size is supported for migration. This just overrides the default function arch_hugetlb_migration_supported() and enables migration for all possible HugeTLB page sizes on arm64. With this, HugeTLB migration support on arm64 now covers all possible HugeTLB options. CONT PTE PMD CONT PMD PUD -------- --- -------- --- 4K: 64K 2M 32M 1G 16K: 2M 32M 1G 64K: 2M 512M 16G Link: http://lkml.kernel.org/r/1545121450-1663-6-git-send-email-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Reviewed-by: Naoya Horiguchi Reviewed-by: Steve Capper Acked-by: Catalin Marinas Cc: Michal Hocko Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/include/asm/hugetlb.h | 5 +++++ arch/arm64/mm/hugetlbpage.c | 20 ++++++++++++++++++++ 2 files changed, 25 insertions(+) diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h index fb6609875455..c6a07a3b433e 100644 --- a/arch/arm64/include/asm/hugetlb.h +++ b/arch/arm64/include/asm/hugetlb.h @@ -20,6 +20,11 @@ #include +#ifdef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION +#define arch_hugetlb_migration_supported arch_hugetlb_migration_supported +extern bool arch_hugetlb_migration_supported(struct hstate *h); +#endif + #define __HAVE_ARCH_HUGE_PTEP_GET static inline pte_t huge_ptep_get(pte_t *ptep) { diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index 28cbc22d7e30..6b4a47b3adf4 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -27,6 +27,26 @@ #include #include +#ifdef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION +bool arch_hugetlb_migration_supported(struct hstate *h) +{ + size_t pagesize = huge_page_size(h); + + switch (pagesize) { +#ifdef CONFIG_ARM64_4K_PAGES + case PUD_SIZE: +#endif + case PMD_SIZE: + case CONT_PMD_SIZE: + case CONT_PTE_SIZE: + return true; + } + pr_warn("%s: unrecognized huge page size 0x%lx\n", + __func__, pagesize); + return false; +} +#endif + int pmd_huge(pmd_t pmd) { return pmd_val(pmd) && !(pmd_val(pmd) & PMD_TABLE_BIT); From c52e75935f8ded2bd4a75eb08e914bd96802725b Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Tue, 5 Mar 2019 15:44:01 -0800 Subject: [PATCH 045/159] mm: remove extra drain pages on pcp list In the current implementation, there are two places to isolate a range of page: __offline_pages() and alloc_contig_range(). During this procedure, it will drain pages on pcp list. Below is a brief call flow: __offline_pages()/alloc_contig_range() start_isolate_page_range() set_migratetype_isolate() drain_all_pages() drain_all_pages() <--- A This snippet shows the current logic is isolate and drain pcp list for each pageblock and drain pcp list again for the whole range. start_isolate_page_range is responsible for isolating the given pfn range. One part of that job is to make sure that also pages that are on the allocator pcp lists are properly isolated. Otherwise they could be reused and the range wouldn't be completely isolated until the memory is freed back. While there is no strict guarantee here because pages might get allocated at any time before drain_all_pages is called there doesn't seem to be any strong demand for such a guarantee. In any case, draining is already done at the isolation level and there is no need to do it again later by start_isolate_page_range callers (memory hotplug and CMA allocator currently). Therefore remove pointless draining in existing callers to make the code more clear and functionally correct. [mhocko@suse.com: provide a clearer changelog for the last two paragraphs] Link: http://lkml.kernel.org/r/20190105233141.2329-1-richard.weiyang@gmail.com Signed-off-by: Wei Yang Acked-by: Michal Hocko Acked-by: David Hildenbrand Reviewed-by: Oscar Salvador Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 1 - mm/page_alloc.c | 1 - 2 files changed, 2 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index b3d3c64d15df..eb1a28469b66 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1625,7 +1625,6 @@ static int __ref __offline_pages(unsigned long start_pfn, cond_resched(); lru_add_drain_all(); - drain_all_pages(zone); pfn = scan_movable_pages(pfn, end_pfn); if (pfn) { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 034b8b6043a3..8afb6f007f68 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -8196,7 +8196,6 @@ int alloc_contig_range(unsigned long start, unsigned long end, */ lru_add_drain_all(); - drain_all_pages(cc.zone); order = 0; outer_start = start; From 67b8046f42f81aecfca97b50ec3b398bb0ee8e97 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Tue, 5 Mar 2019 15:44:05 -0800 Subject: [PATCH 046/159] mm/memcontrol.c: use struct_size() in kmalloc() One of the more common cases of allocation size calculations is finding the size of a structure that has a zero-sized array at the end, along with memory for some number of elements for that array. For example: struct foo { int stuff; void *entry[]; }; instance = kmalloc(sizeof(struct foo) + sizeof(void *) * count, GFP_KERNEL); Instead of leaving these open-coded and prone to type mistakes, we can now use the new struct_size() helper: instance = kmalloc(struct_size(instance, entry, count), GFP_KERNEL); This code was detected with the help of Coccinelle. Link: http://lkml.kernel.org/r/20190104183726.GA6374@embeddedor Signed-off-by: Gustavo A. R. Silva Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 72414bb7e226..f93f7f22a6f4 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3626,8 +3626,7 @@ static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg, size = thresholds->primary ? thresholds->primary->size + 1 : 1; /* Allocate memory for new array of thresholds */ - new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold), - GFP_KERNEL); + new = kmalloc(struct_size(new, entries, size), GFP_KERNEL); if (!new) { ret = -ENOMEM; goto unlock; From 14ef1fc72a3e86cea6498193c5e04b4619cb8622 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 5 Mar 2019 15:44:08 -0800 Subject: [PATCH 047/159] mm/filemap.c: remove redundant test from find_get_pages_contig After we establish a reference on the page, we check the pointer continues to be in the correct position in i_pages. Checking page->index afterwards is unnecessary; if it were to change, then the pointer to it from the page cache would also move. The check used to be done before grabbing a reference on the page which was racy (see commit 9cbb4cb21b19f ("mm: find_get_pages_contig fixlet")), but nobody noticed that moving the check after grabbing the reference was redundant. Link: http://lkml.kernel.org/r/20190107200224.13260-1-willy@infradead.org Signed-off-by: Matthew Wilcox Reviewed-by: Andrew Morton Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 9f5e323e883e..935fbc29aeb1 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1837,16 +1837,6 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, if (unlikely(page != xas_reload(&xas))) goto put_page; - /* - * must check mapping and index after taking the ref. - * otherwise we can get both false positives and false - * negatives, which is just confusing to the caller. - */ - if (!page->mapping || page_to_pgoff(page) != xas.xa_index) { - put_page(page); - break; - } - pages[ret] = page; if (++ret == nr_pages) break; From 8fd2e0b505d124bbb046ab15de0ff6f8d4babf56 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Tue, 5 Mar 2019 15:44:11 -0800 Subject: [PATCH 048/159] mm: swap: check if swap backing device is congested or not Swap readahead would read in a few pages regardless if the underlying device is busy or not. It may incur long waiting time if the device is congested, and it may also exacerbate the congestion. Use inode_read_congested() to check if the underlying device is busy or not like what file page readahead does. Get inode from swap_info_struct. Although we can add inode information in swap_address_space (address_space->host), it may lead some unexpected side effect, i.e. it may break mapping_cap_account_dirty(). Using inode from swap_info_struct seems simple and good enough. Just does the check in vma_cluster_readahead() since swap_vma_readahead() is just used for non-rotational device which much less likely has congestion than traditional HDD. Although swap slots may be consecutive on swap partition, it still may be fragmented on swap file. This check would help to reduce excessive stall for such case. The test with page_fault1 of will-it-scale (sometimes tracing may just show runtest.py that is the wrapper script of page_fault1), which basically launches NR_CPU threads to generate 128MB anonymous pages for each thread, on my virtual machine with congested HDD shows long tail latency is reduced significantly. Without the patch page_fault1_thr-1490 [023] 129.311706: funcgraph_entry: #57377.796 us | do_swap_page(); page_fault1_thr-1490 [023] 129.369103: funcgraph_entry: 5.642us | do_swap_page(); page_fault1_thr-1490 [023] 129.369119: funcgraph_entry: #1289.592 us | do_swap_page(); page_fault1_thr-1490 [023] 129.370411: funcgraph_entry: 4.957us | do_swap_page(); page_fault1_thr-1490 [023] 129.370419: funcgraph_entry: 1.940us | do_swap_page(); page_fault1_thr-1490 [023] 129.378847: funcgraph_entry: #1411.385 us | do_swap_page(); page_fault1_thr-1490 [023] 129.380262: funcgraph_entry: 3.916us | do_swap_page(); page_fault1_thr-1490 [023] 129.380275: funcgraph_entry: #4287.751 us | do_swap_page(); With the patch runtest.py-1417 [020] 301.925911: funcgraph_entry: #9870.146 us | do_swap_page(); runtest.py-1417 [020] 301.935785: funcgraph_entry: 9.802us | do_swap_page(); runtest.py-1417 [020] 301.935799: funcgraph_entry: 3.551us | do_swap_page(); runtest.py-1417 [020] 301.935806: funcgraph_entry: 2.142us | do_swap_page(); runtest.py-1417 [020] 301.935853: funcgraph_entry: 6.938us | do_swap_page(); runtest.py-1417 [020] 301.935864: funcgraph_entry: 3.765us | do_swap_page(); runtest.py-1417 [020] 301.935871: funcgraph_entry: 3.600us | do_swap_page(); runtest.py-1417 [020] 301.935878: funcgraph_entry: 7.202us | do_swap_page(); [akpm@linux-foundation.org: code cleanup] [yang.shi@linux.alibaba.com: add comment] Link: http://lkml.kernel.org/r/bbc7bda7-62d0-df1a-23ef-d369e865bdca@linux.alibaba.com Link: http://lkml.kernel.org/r/1546543673-108536-1-git-send-email-yang.shi@linux.alibaba.com Signed-off-by: Yang Shi Acked-by: Tim Chen Reviewed-by: Andrew Morton Cc: Huang Ying Cc: Minchan Kim Cc: Daniel Jordan Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swap_state.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/mm/swap_state.c b/mm/swap_state.c index fd2f21e1c60a..56a3cc4f1068 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -543,6 +543,13 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, if (!mask) goto skip; + /* Test swap type to make sure the dereference is safe */ + if (likely(si->flags & (SWP_BLKDEV | SWP_FS))) { + struct inode *inode = si->swap_file->f_mapping->host; + if (inode_read_congested(inode)) + goto skip; + } + do_poll = false; /* Read a page_cluster sized and aligned cluster around offset. */ start_offset = offset & ~mask; From e9f598730ea0dde3b45560766240503367c404b7 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Tue, 5 Mar 2019 15:44:15 -0800 Subject: [PATCH 049/159] mm: swap: add comment for swap_vma_readahead swap_vma_readahead()'s comment is missing, just add it. Link: http://lkml.kernel.org/r/1546543673-108536-2-git-send-email-yang.shi@linux.alibaba.com Signed-off-by: Yang Shi Reviewed-by: Andrew Morton Cc: Huang Ying Cc: Tim Chen Cc: Minchan Kim Cc: Daniel Jordan Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swap_state.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/mm/swap_state.c b/mm/swap_state.c index 56a3cc4f1068..85245fdec8d9 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -523,7 +523,7 @@ static unsigned long swapin_nr_pages(unsigned long offset) * This has been extended to use the NUMA policies from the mm triggering * the readahead. * - * Caller must hold down_read on the vma->vm_mm if vmf->vma is not NULL. + * Caller must hold read mmap_sem if vmf->vma is not NULL. */ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, struct vm_fault *vmf) @@ -698,6 +698,20 @@ static void swap_ra_info(struct vm_fault *vmf, pte_unmap(orig_pte); } +/** + * swap_vma_readahead - swap in pages in hope we need them soon + * @entry: swap entry of this memory + * @gfp_mask: memory allocation flags + * @vmf: fault information + * + * Returns the struct page for entry and addr, after queueing swapin. + * + * Primitive swap readahead code. We simply read in a few pages whoes + * virtual addresses are around the fault address in the same vma. + * + * Caller must hold read mmap_sem if vmf->vma is not NULL. + * + */ static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask, struct vm_fault *vmf) { From d71e53cee7c2e553b85c572e76da778a93d32135 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 5 Mar 2019 15:44:18 -0800 Subject: [PATCH 050/159] mm: shuffle GFP_* flags GFP_KERNEL is one of the most used constant but on archs like arm with fixed length instruction some constants are more equal than the others. Constants with tightly packed bits can be injected directly into instruction stream: 0: e3a00d33 mov r0, #3264 ; 0xcc0 Others require multiple instructions or even loading out of instruction stream: 0: e3a000c0 mov r0, #192 ; 0xc0 4: e3400060 movt r0, #96 ; 0x60 Shuffle GFP_* flags so that GFP_KERNEL/GFP_ATOMIC + __GFP_ZERO bits are close to each other. Savings on arm configs are ~0.1%. Link: http://lkml.kernel.org/r/20190109201838.GA9140@avx2 Signed-off-by: Alexey Dobriyan Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 5f5e25fd6149..fdab7de7490d 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -24,21 +24,21 @@ struct vm_area_struct; #define ___GFP_HIGH 0x20u #define ___GFP_IO 0x40u #define ___GFP_FS 0x80u -#define ___GFP_WRITE 0x100u -#define ___GFP_NOWARN 0x200u -#define ___GFP_RETRY_MAYFAIL 0x400u -#define ___GFP_NOFAIL 0x800u -#define ___GFP_NORETRY 0x1000u -#define ___GFP_MEMALLOC 0x2000u -#define ___GFP_COMP 0x4000u -#define ___GFP_ZERO 0x8000u -#define ___GFP_NOMEMALLOC 0x10000u -#define ___GFP_HARDWALL 0x20000u -#define ___GFP_THISNODE 0x40000u -#define ___GFP_ATOMIC 0x80000u -#define ___GFP_ACCOUNT 0x100000u -#define ___GFP_DIRECT_RECLAIM 0x200000u -#define ___GFP_KSWAPD_RECLAIM 0x400000u +#define ___GFP_ZERO 0x100u +#define ___GFP_ATOMIC 0x200u +#define ___GFP_DIRECT_RECLAIM 0x400u +#define ___GFP_KSWAPD_RECLAIM 0x800u +#define ___GFP_WRITE 0x1000u +#define ___GFP_NOWARN 0x2000u +#define ___GFP_RETRY_MAYFAIL 0x4000u +#define ___GFP_NOFAIL 0x8000u +#define ___GFP_NORETRY 0x10000u +#define ___GFP_MEMALLOC 0x20000u +#define ___GFP_COMP 0x40000u +#define ___GFP_NOMEMALLOC 0x80000u +#define ___GFP_HARDWALL 0x100000u +#define ___GFP_THISNODE 0x200000u +#define ___GFP_ACCOUNT 0x400000u #ifdef CONFIG_LOCKDEP #define ___GFP_NOLOCKDEP 0x800000u #else From 35f12f0f5c3bbd60caba89351f45c8eef8ffd423 Mon Sep 17 00:00:00 2001 From: zhengbin Date: Tue, 5 Mar 2019 15:44:21 -0800 Subject: [PATCH 051/159] mm/filemap: pass inclusive 'end_byte' parameter to filemap_range_has_page The 'end_byte' parameter of filemap_range_has_page is required to be inclusive, so follow the rule. Link: http://lkml.kernel.org/r/1548678679-18122-1-git-send-email-zhengbin13@huawei.com Fixes: 6be96d3ad34a ("fs: return if direct I/O will trigger writeback") Signed-off-by: zhengbin Reviewed-by: Andrew Morton Reviewed-by: Matthew Wilcox Acked-by: Christoph Hellwig Cc: "Darrick J. Wong" Cc: Amir Goldstein Cc: Dave Chinner Cc: Johannes Weiner Cc: Hugh Dickins Cc: Hou Tao Cc: zhangyi (F) Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/filemap.c b/mm/filemap.c index 935fbc29aeb1..e59fdecdab74 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3071,7 +3071,7 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from) if (iocb->ki_flags & IOCB_NOWAIT) { /* If there are pages to writeback, return */ if (filemap_range_has_page(inode->i_mapping, pos, - pos + write_len)) + pos + write_len - 1)) return -EAGAIN; } else { written = filemap_write_and_wait_range(mapping, pos, From c5fbd937b603885f1db3280ca212ed28add895bc Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 5 Mar 2019 15:44:25 -0800 Subject: [PATCH 052/159] mm, compaction: shrink compact_control Patch series "Increase success rates and reduce latency of compaction", v3. This series reduces scan rates and success rates of compaction, primarily by using the free lists to shorten scans, better controlling of skip information and whether multiple scanners can target the same block and capturing pageblocks before being stolen by parallel requests. The series is based on mmotm from January 9th, 2019 with the previous compaction series reverted. I'm mostly using thpscale to measure the impact of the series. The benchmark creates a large file, maps it, faults it, punches holes in the mapping so that the virtual address space is fragmented and then tries to allocate THP. It re-executes for different numbers of threads. From a fragmentation perspective, the workload is relatively benign but it does stress compaction. The overall impact on latencies for a 1-socket machine is baseline patches Amean fault-both-3 3832.09 ( 0.00%) 2748.56 * 28.28%* Amean fault-both-5 4933.06 ( 0.00%) 4255.52 ( 13.73%) Amean fault-both-7 7017.75 ( 0.00%) 6586.93 ( 6.14%) Amean fault-both-12 11610.51 ( 0.00%) 9162.34 * 21.09%* Amean fault-both-18 17055.85 ( 0.00%) 11530.06 * 32.40%* Amean fault-both-24 19306.27 ( 0.00%) 17956.13 ( 6.99%) Amean fault-both-30 22516.49 ( 0.00%) 15686.47 * 30.33%* Amean fault-both-32 23442.93 ( 0.00%) 16564.83 * 29.34%* The allocation success rates are much improved baseline patches Percentage huge-3 85.99 ( 0.00%) 97.96 ( 13.92%) Percentage huge-5 88.27 ( 0.00%) 96.87 ( 9.74%) Percentage huge-7 85.87 ( 0.00%) 94.53 ( 10.09%) Percentage huge-12 82.38 ( 0.00%) 98.44 ( 19.49%) Percentage huge-18 83.29 ( 0.00%) 99.14 ( 19.04%) Percentage huge-24 81.41 ( 0.00%) 97.35 ( 19.57%) Percentage huge-30 80.98 ( 0.00%) 98.05 ( 21.08%) Percentage huge-32 80.53 ( 0.00%) 97.06 ( 20.53%) That's a nearly perfect allocation success rate. The biggest impact is on the scan rates Compaction migrate scanned 55893379 19341254 Compaction free scanned 474739990 11903963 The number of pages scanned for migration was reduced by 65% and the free scanner was reduced by 97.5%. So much less work in exchange for lower latency and better success rates. The series was also evaluated using a workload that heavily fragments memory but the benefits there are also significant, albeit not presented. It was commented that we should be rethinking scanning entirely and to a large extent I agree. However, to achieve that you need a lot of this series in place first so it's best to make the linear scanners as best as possible before ripping them out. This patch (of 22): The isolate and migrate scanners should never isolate more than a pageblock of pages so unsigned int is sufficient saving 8 bytes on a 64-bit build. Link: http://lkml.kernel.org/r/20190118175136.31341-2-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: David Rientjes Cc: Andrea Arcangeli Cc: Dan Carpenter Cc: YueHaibing Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/internal.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index 536bc2a839b9..5564841fce36 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -185,8 +185,8 @@ struct compact_control { struct list_head freepages; /* List of free pages to migrate to */ struct list_head migratepages; /* List of pages being migrated */ struct zone *zone; - unsigned long nr_freepages; /* Number of isolated free pages */ - unsigned long nr_migratepages; /* Number of pages to migrate */ + unsigned int nr_freepages; /* Number of isolated free pages */ + unsigned int nr_migratepages; /* Number of pages to migrate */ unsigned long total_migrate_scanned; unsigned long total_free_scanned; unsigned long free_pfn; /* isolate_freepages search base */ From c5943b9c5312d4fa23175ff146e901b865e4a60a Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 5 Mar 2019 15:44:28 -0800 Subject: [PATCH 053/159] mm, compaction: rearrange compact_control compact_control spans two cache lines with write-intensive lines on both. Rearrange so the most write-intensive fields are in the same cache line. This has a negligible impact on the overall performance of compaction and is more a tidying exercise than anything. Link: http://lkml.kernel.org/r/20190118175136.31341-3-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Andrea Arcangeli Cc: Dan Carpenter Cc: David Rientjes Cc: YueHaibing Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/internal.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index 5564841fce36..867af5425432 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -184,14 +184,14 @@ extern int user_min_free_kbytes; struct compact_control { struct list_head freepages; /* List of free pages to migrate to */ struct list_head migratepages; /* List of pages being migrated */ - struct zone *zone; unsigned int nr_freepages; /* Number of isolated free pages */ unsigned int nr_migratepages; /* Number of pages to migrate */ - unsigned long total_migrate_scanned; - unsigned long total_free_scanned; unsigned long free_pfn; /* isolate_freepages search base */ unsigned long migrate_pfn; /* isolate_migratepages search base */ unsigned long last_migrated_pfn;/* Not yet flushed page being freed */ + struct zone *zone; + unsigned long total_migrate_scanned; + unsigned long total_free_scanned; const gfp_t gfp_mask; /* gfp mask of a direct compactor */ int order; /* order a direct compactor needs */ int migratetype; /* migratetype of direct compactor */ From 566e54e113eb2b669f9300db2c2df400cbb06646 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 5 Mar 2019 15:44:32 -0800 Subject: [PATCH 054/159] mm, compaction: remove last_migrated_pfn from compact_control The last_migrated_pfn field is a bit dubious as to whether it really helps but either way, the information from it can be inferred without increasing the size of compact_control so remove the field. Link: http://lkml.kernel.org/r/20190118175136.31341-4-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Andrea Arcangeli Cc: Dan Carpenter Cc: David Rientjes Cc: YueHaibing Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 25 +++++++++---------------- mm/internal.h | 1 - 2 files changed, 9 insertions(+), 17 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index c15b4bbc9e9e..e59dd7a7564c 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -886,15 +886,6 @@ isolate_success: cc->nr_migratepages++; nr_isolated++; - /* - * Record where we could have freed pages by migration and not - * yet flushed them to buddy allocator. - * - this is the lowest page that was isolated and likely be - * then freed by migration. - */ - if (!cc->last_migrated_pfn) - cc->last_migrated_pfn = low_pfn; - /* Avoid isolating too much */ if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) { ++low_pfn; @@ -918,7 +909,6 @@ isolate_fail: } putback_movable_pages(&cc->migratepages); cc->nr_migratepages = 0; - cc->last_migrated_pfn = 0; nr_isolated = 0; } @@ -1539,6 +1529,7 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro enum compact_result ret; unsigned long start_pfn = zone->zone_start_pfn; unsigned long end_pfn = zone_end_pfn(zone); + unsigned long last_migrated_pfn; const bool sync = cc->mode != MIGRATE_ASYNC; cc->migratetype = gfpflags_to_migratetype(cc->gfp_mask); @@ -1584,7 +1575,7 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro cc->whole_zone = true; } - cc->last_migrated_pfn = 0; + last_migrated_pfn = 0; trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, cc->free_pfn, end_pfn, sync); @@ -1593,12 +1584,14 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) { int err; + unsigned long start_pfn = cc->migrate_pfn; switch (isolate_migratepages(zone, cc)) { case ISOLATE_ABORT: ret = COMPACT_CONTENDED; putback_movable_pages(&cc->migratepages); cc->nr_migratepages = 0; + last_migrated_pfn = 0; goto out; case ISOLATE_NONE: /* @@ -1608,6 +1601,7 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro */ goto check_drain; case ISOLATE_SUCCESS: + last_migrated_pfn = start_pfn; ; } @@ -1639,8 +1633,7 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro cc->migrate_pfn = block_end_pfn( cc->migrate_pfn - 1, cc->order); /* Draining pcplists is useless in this case */ - cc->last_migrated_pfn = 0; - + last_migrated_pfn = 0; } } @@ -1652,18 +1645,18 @@ check_drain: * compact_finished() can detect immediately if allocation * would succeed. */ - if (cc->order > 0 && cc->last_migrated_pfn) { + if (cc->order > 0 && last_migrated_pfn) { int cpu; unsigned long current_block_start = block_start_pfn(cc->migrate_pfn, cc->order); - if (cc->last_migrated_pfn < current_block_start) { + if (last_migrated_pfn < current_block_start) { cpu = get_cpu(); lru_add_drain_cpu(cpu); drain_local_pages(zone); put_cpu(); /* No more flushing until we migrate again */ - cc->last_migrated_pfn = 0; + last_migrated_pfn = 0; } } diff --git a/mm/internal.h b/mm/internal.h index 867af5425432..f40d06d70683 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -188,7 +188,6 @@ struct compact_control { unsigned int nr_migratepages; /* Number of pages to migrate */ unsigned long free_pfn; /* isolate_freepages search base */ unsigned long migrate_pfn; /* isolate_migratepages search base */ - unsigned long last_migrated_pfn;/* Not yet flushed page being freed */ struct zone *zone; unsigned long total_migrate_scanned; unsigned long total_free_scanned; From 40cacbcb324036233a927418441323459d28d19b Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 5 Mar 2019 15:44:36 -0800 Subject: [PATCH 055/159] mm, compaction: remove unnecessary zone parameter in some instances A zone parameter is passed into a number of top-level compaction functions despite the fact that it's already in compact_control. This is harmless but it did need an audit to check if zone actually ever changes meaningfully. This patches removes the parameter in a number of top-level functions. The change could be much deeper but this was enough to briefly clarify the flow. No functional change. Link: http://lkml.kernel.org/r/20190118175136.31341-5-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Andrea Arcangeli Cc: Dan Carpenter Cc: David Rientjes Cc: YueHaibing Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 54 ++++++++++++++++++++++++------------------------- 1 file changed, 26 insertions(+), 28 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index e59dd7a7564c..163841e1b167 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1300,8 +1300,7 @@ static inline bool is_via_compact_memory(int order) return order == -1; } -static enum compact_result __compact_finished(struct zone *zone, - struct compact_control *cc) +static enum compact_result __compact_finished(struct compact_control *cc) { unsigned int order; const int migratetype = cc->migratetype; @@ -1312,7 +1311,7 @@ static enum compact_result __compact_finished(struct zone *zone, /* Compaction run completes if the migrate and free scanner meet */ if (compact_scanners_met(cc)) { /* Let the next compaction start anew. */ - reset_cached_positions(zone); + reset_cached_positions(cc->zone); /* * Mark that the PG_migrate_skip information should be cleared @@ -1321,7 +1320,7 @@ static enum compact_result __compact_finished(struct zone *zone, * based on an allocation request. */ if (cc->direct_compaction) - zone->compact_blockskip_flush = true; + cc->zone->compact_blockskip_flush = true; if (cc->whole_zone) return COMPACT_COMPLETE; @@ -1345,7 +1344,7 @@ static enum compact_result __compact_finished(struct zone *zone, /* Direct compactor: Is a suitable page free? */ for (order = cc->order; order < MAX_ORDER; order++) { - struct free_area *area = &zone->free_area[order]; + struct free_area *area = &cc->zone->free_area[order]; bool can_steal; /* Job done if page is free of the right migratetype */ @@ -1391,13 +1390,12 @@ static enum compact_result __compact_finished(struct zone *zone, return COMPACT_NO_SUITABLE_PAGE; } -static enum compact_result compact_finished(struct zone *zone, - struct compact_control *cc) +static enum compact_result compact_finished(struct compact_control *cc) { int ret; - ret = __compact_finished(zone, cc); - trace_mm_compaction_finished(zone, cc->order, ret); + ret = __compact_finished(cc); + trace_mm_compaction_finished(cc->zone, cc->order, ret); if (ret == COMPACT_NO_SUITABLE_PAGE) ret = COMPACT_CONTINUE; @@ -1524,16 +1522,16 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order, return false; } -static enum compact_result compact_zone(struct zone *zone, struct compact_control *cc) +static enum compact_result compact_zone(struct compact_control *cc) { enum compact_result ret; - unsigned long start_pfn = zone->zone_start_pfn; - unsigned long end_pfn = zone_end_pfn(zone); + unsigned long start_pfn = cc->zone->zone_start_pfn; + unsigned long end_pfn = zone_end_pfn(cc->zone); unsigned long last_migrated_pfn; const bool sync = cc->mode != MIGRATE_ASYNC; cc->migratetype = gfpflags_to_migratetype(cc->gfp_mask); - ret = compaction_suitable(zone, cc->order, cc->alloc_flags, + ret = compaction_suitable(cc->zone, cc->order, cc->alloc_flags, cc->classzone_idx); /* Compaction is likely to fail */ if (ret == COMPACT_SUCCESS || ret == COMPACT_SKIPPED) @@ -1546,8 +1544,8 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro * Clear pageblock skip if there were failures recently and compaction * is about to be retried after being deferred. */ - if (compaction_restarting(zone, cc->order)) - __reset_isolation_suitable(zone); + if (compaction_restarting(cc->zone, cc->order)) + __reset_isolation_suitable(cc->zone); /* * Setup to move all movable pages to the end of the zone. Used cached @@ -1559,16 +1557,16 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro cc->migrate_pfn = start_pfn; cc->free_pfn = pageblock_start_pfn(end_pfn - 1); } else { - cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync]; - cc->free_pfn = zone->compact_cached_free_pfn; + cc->migrate_pfn = cc->zone->compact_cached_migrate_pfn[sync]; + cc->free_pfn = cc->zone->compact_cached_free_pfn; if (cc->free_pfn < start_pfn || cc->free_pfn >= end_pfn) { cc->free_pfn = pageblock_start_pfn(end_pfn - 1); - zone->compact_cached_free_pfn = cc->free_pfn; + cc->zone->compact_cached_free_pfn = cc->free_pfn; } if (cc->migrate_pfn < start_pfn || cc->migrate_pfn >= end_pfn) { cc->migrate_pfn = start_pfn; - zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn; - zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn; + cc->zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn; + cc->zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn; } if (cc->migrate_pfn == start_pfn) @@ -1582,11 +1580,11 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro migrate_prep_local(); - while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) { + while ((ret = compact_finished(cc)) == COMPACT_CONTINUE) { int err; unsigned long start_pfn = cc->migrate_pfn; - switch (isolate_migratepages(zone, cc)) { + switch (isolate_migratepages(cc->zone, cc)) { case ISOLATE_ABORT: ret = COMPACT_CONTENDED; putback_movable_pages(&cc->migratepages); @@ -1653,7 +1651,7 @@ check_drain: if (last_migrated_pfn < current_block_start) { cpu = get_cpu(); lru_add_drain_cpu(cpu); - drain_local_pages(zone); + drain_local_pages(cc->zone); put_cpu(); /* No more flushing until we migrate again */ last_migrated_pfn = 0; @@ -1678,8 +1676,8 @@ out: * Only go back, not forward. The cached pfn might have been * already reset to zone end in compact_finished() */ - if (free_pfn > zone->compact_cached_free_pfn) - zone->compact_cached_free_pfn = free_pfn; + if (free_pfn > cc->zone->compact_cached_free_pfn) + cc->zone->compact_cached_free_pfn = free_pfn; } count_compact_events(COMPACTMIGRATE_SCANNED, cc->total_migrate_scanned); @@ -1716,7 +1714,7 @@ static enum compact_result compact_zone_order(struct zone *zone, int order, INIT_LIST_HEAD(&cc.freepages); INIT_LIST_HEAD(&cc.migratepages); - ret = compact_zone(zone, &cc); + ret = compact_zone(&cc); VM_BUG_ON(!list_empty(&cc.freepages)); VM_BUG_ON(!list_empty(&cc.migratepages)); @@ -1834,7 +1832,7 @@ static void compact_node(int nid) INIT_LIST_HEAD(&cc.freepages); INIT_LIST_HEAD(&cc.migratepages); - compact_zone(zone, &cc); + compact_zone(&cc); VM_BUG_ON(!list_empty(&cc.freepages)); VM_BUG_ON(!list_empty(&cc.migratepages)); @@ -1968,7 +1966,7 @@ static void kcompactd_do_work(pg_data_t *pgdat) if (kthread_should_stop()) return; - status = compact_zone(zone, &cc); + status = compact_zone(&cc); if (status == COMPACT_SUCCESS) { compaction_defer_reset(zone, cc.order, false); From 4469ab98477b290f6728b79f8d225d9d88ce16e3 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 5 Mar 2019 15:44:39 -0800 Subject: [PATCH 056/159] mm, compaction: rename map_pages to split_map_pages It's non-obvious that high-order free pages are split into order-0 pages from the function name. Fix it. Link: http://lkml.kernel.org/r/20190118175136.31341-6-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Andrea Arcangeli Cc: Dan Carpenter Cc: David Rientjes Cc: YueHaibing Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index 163841e1b167..32a88b49f973 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -66,7 +66,7 @@ static unsigned long release_freepages(struct list_head *freelist) return high_pfn; } -static void map_pages(struct list_head *list) +static void split_map_pages(struct list_head *list) { unsigned int i, order, nr_pages; struct page *page, *next; @@ -644,7 +644,7 @@ isolate_freepages_range(struct compact_control *cc, } /* __isolate_free_page() does not map the pages */ - map_pages(&freelist); + split_map_pages(&freelist); if (pfn < end_pfn) { /* Loop terminated early, cleanup. */ @@ -1141,7 +1141,7 @@ static void isolate_freepages(struct compact_control *cc) } /* __isolate_free_page() does not map the pages */ - map_pages(freelist); + split_map_pages(freelist); /* * Record where the free scanner will restart next time. Either we From 806031bb5ec36ed879d64249d5a5cf9c6657f89d Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 5 Mar 2019 15:44:43 -0800 Subject: [PATCH 057/159] mm, migrate: immediately fail migration of a page with no migration handler Pages with no migration handler use a fallback handler which sometimes works and sometimes persistently retries. A historical example was blockdev pages but there are others such as odd refcounting when page->private is used. These are retried multiple times which is wasteful during compaction so this patch will fail migration faster unless the caller specifies MIGRATE_SYNC. This is not expected to help THP allocation success rates but it did reduce latencies very slightly in some cases. 1-socket thpfioscale 4.20.0 4.20.0 noreserved-v2r15 failfast-v2r15 Amean fault-both-1 0.00 ( 0.00%) 0.00 * 0.00%* Amean fault-both-3 3839.67 ( 0.00%) 3833.72 ( 0.15%) Amean fault-both-5 5177.47 ( 0.00%) 4967.15 ( 4.06%) Amean fault-both-7 7245.03 ( 0.00%) 7139.19 ( 1.46%) Amean fault-both-12 11534.89 ( 0.00%) 11326.30 ( 1.81%) Amean fault-both-18 16241.10 ( 0.00%) 16270.70 ( -0.18%) Amean fault-both-24 19075.91 ( 0.00%) 19839.65 ( -4.00%) Amean fault-both-30 22712.11 ( 0.00%) 21707.05 ( 4.43%) Amean fault-both-32 21692.92 ( 0.00%) 21968.16 ( -1.27%) The 2-socket results are not materially different. Scan rates are similar as expected. Link: http://lkml.kernel.org/r/20190118175136.31341-7-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Andrea Arcangeli Cc: Dan Carpenter Cc: David Rientjes Cc: YueHaibing Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/migrate.c b/mm/migrate.c index 0413596fc523..0e9888cb33ad 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -911,7 +911,7 @@ static int fallback_migrate_page(struct address_space *mapping, */ if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL)) - return -EAGAIN; + return mode == MIGRATE_SYNC ? -EAGAIN : -EBUSY; return migrate_page(mapping, newpage, page, mode); } From efe771c7603bc524425070d651e70e9c56c57f28 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 5 Mar 2019 15:44:46 -0800 Subject: [PATCH 058/159] mm, compaction: always finish scanning of a full pageblock When compaction is finishing, it uses a flag to ensure the pageblock is complete but it makes sense to always complete migration of a pageblock. Minimally, skip information is based on a pageblock and partially scanned pageblocks may incur more scanning in the future. The pageblock skip handling also becomes more strict later in the series and the hint is more useful if a complete pageblock was always scanned. The potentially impacts latency as more scanning is done but it's not a consistent win or loss as the scanning is not always a high percentage of the pageblock and sometimes it is offset by future reductions in scanning. Hence, the results are not presented this time due to a misleading mix of gains/losses without any clear pattern. However, full scanning of the pageblock is important for later patches. Link: http://lkml.kernel.org/r/20190118175136.31341-8-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Andrea Arcangeli Cc: Dan Carpenter Cc: David Rientjes Cc: YueHaibing Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 19 ++++++++----------- mm/internal.h | 1 - 2 files changed, 8 insertions(+), 12 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index 32a88b49f973..3d11c209614a 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1331,16 +1331,14 @@ static enum compact_result __compact_finished(struct compact_control *cc) if (is_via_compact_memory(cc->order)) return COMPACT_CONTINUE; - if (cc->finishing_block) { - /* - * We have finished the pageblock, but better check again that - * we really succeeded. - */ - if (IS_ALIGNED(cc->migrate_pfn, pageblock_nr_pages)) - cc->finishing_block = false; - else - return COMPACT_CONTINUE; - } + /* + * Always finish scanning a pageblock to reduce the possibility of + * fallbacks in the future. This is particularly important when + * migration source is unmovable/reclaimable but it's not worth + * special casing. + */ + if (!IS_ALIGNED(cc->migrate_pfn, pageblock_nr_pages)) + return COMPACT_CONTINUE; /* Direct compactor: Is a suitable page free? */ for (order = cc->order; order < MAX_ORDER; order++) { @@ -1382,7 +1380,6 @@ static enum compact_result __compact_finished(struct compact_control *cc) return COMPACT_SUCCESS; } - cc->finishing_block = true; return COMPACT_CONTINUE; } } diff --git a/mm/internal.h b/mm/internal.h index f40d06d70683..9b32f4cab0ae 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -203,7 +203,6 @@ struct compact_control { bool direct_compaction; /* False from kcompactd or /proc/... */ bool whole_zone; /* Whole zone should/has been scanned */ bool contended; /* Signal lock or sched contention */ - bool finishing_block; /* Finishing current pageblock */ }; unsigned long From fd1444b2729289ea3ef6b6096be604f8983e9f9f Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 5 Mar 2019 15:44:50 -0800 Subject: [PATCH 059/159] mm, compaction: ignore the fragmentation avoidance boost for isolation and compaction When pageblocks get fragmented, watermarks are artifically boosted to reclaim pages to avoid further fragmentation events. However, compaction is often either fragmentation-neutral or moving movable pages away from unmovable/reclaimable pages. As the true watermarks are preserved, allow compaction to ignore the boost factor. The expected impact is very slight as the main benefit is that compaction is slightly more likely to succeed when the system has been fragmented very recently. On both 1-socket and 2-socket machines for THP-intensive allocation during fragmentation the success rate was increased by less than 1% which is marginal. However, detailed tracing indicated that failure of migration due to a premature ENOMEM triggered by watermark checks were eliminated. Link: http://lkml.kernel.org/r/20190118175136.31341-9-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Andrea Arcangeli Cc: Dan Carpenter Cc: David Rientjes Cc: YueHaibing Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8afb6f007f68..2e132b9e7a93 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2962,7 +2962,7 @@ int __isolate_free_page(struct page *page, unsigned int order) * watermark, because we already know our high-order page * exists. */ - watermark = min_wmark_pages(zone) + (1UL << order); + watermark = zone->_watermark[WMARK_MIN] + (1UL << order); if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA)) return 0; From 70b44595eafe9c7c235f076d653a268ca1ab9fdb Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 5 Mar 2019 15:44:54 -0800 Subject: [PATCH 060/159] mm, compaction: use free lists to quickly locate a migration source The migration scanner is a linear scan of a zone with a potentiall large search space. Furthermore, many pageblocks are unusable such as those filled with reserved pages or partially filled with pages that cannot migrate. These still get scanned in the common case of allocating a THP and the cost accumulates. The patch uses a partial search of the free lists to locate a migration source candidate that is marked as MOVABLE when allocating a THP. It prefers picking a block with a larger number of free pages already on the basis that there are fewer pages to migrate to free the entire block. The lowest PFN found during searches is tracked as the basis of the start for the linear search after the first search of the free list fails. After the search, the free list is shuffled so that the next search will not encounter the same page. If the search fails then the subsequent searches will be shorter and the linear scanner is used. If this search fails, or if the request is for a small or unmovable/reclaimable allocation then the linear scanner is still used. It is somewhat pointless to use the list search in those cases. Small free pages must be used for the search and there is no guarantee that movable pages are located within that block that are contiguous. 5.0.0-rc1 5.0.0-rc1 noboost-v3r10 findmig-v3r15 Amean fault-both-3 3771.41 ( 0.00%) 3390.40 ( 10.10%) Amean fault-both-5 5409.05 ( 0.00%) 5082.28 ( 6.04%) Amean fault-both-7 7040.74 ( 0.00%) 7012.51 ( 0.40%) Amean fault-both-12 11887.35 ( 0.00%) 11346.63 ( 4.55%) Amean fault-both-18 16718.19 ( 0.00%) 15324.19 ( 8.34%) Amean fault-both-24 21157.19 ( 0.00%) 16088.50 * 23.96%* Amean fault-both-30 21175.92 ( 0.00%) 18723.42 * 11.58%* Amean fault-both-32 21339.03 ( 0.00%) 18612.01 * 12.78%* 5.0.0-rc1 5.0.0-rc1 noboost-v3r10 findmig-v3r15 Percentage huge-3 86.50 ( 0.00%) 89.83 ( 3.85%) Percentage huge-5 92.52 ( 0.00%) 91.96 ( -0.61%) Percentage huge-7 92.44 ( 0.00%) 92.85 ( 0.44%) Percentage huge-12 92.98 ( 0.00%) 92.74 ( -0.25%) Percentage huge-18 91.70 ( 0.00%) 91.71 ( 0.02%) Percentage huge-24 91.59 ( 0.00%) 92.13 ( 0.60%) Percentage huge-30 90.14 ( 0.00%) 93.79 ( 4.04%) Percentage huge-32 90.03 ( 0.00%) 91.27 ( 1.37%) This shows an improvement in allocation latencies with similar allocation success rates. While not presented, there was a 31% reduction in migration scanning and a 8% reduction on system CPU usage. A 2-socket machine showed similar benefits. [mgorman@techsingularity.net: several fixes] Link: http://lkml.kernel.org/r/20190204120111.GL9565@techsingularity.net [vbabka@suse.cz: migrate block that was found-fast, some optimisations] Link: http://lkml.kernel.org/r/20190118175136.31341-10-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Andrea Arcangeli Cc: Dan Carpenter Cc: David Rientjes Cc: YueHaibing Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/gpu/drm/i915/i915_utils.h | 6 - include/linux/list.h | 11 ++ mm/compaction.c | 178 +++++++++++++++++++++++++++++- mm/internal.h | 2 + 4 files changed, 188 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_utils.h b/drivers/gpu/drm/i915/i915_utils.h index 9726df37c4c4..540e20eb032c 100644 --- a/drivers/gpu/drm/i915/i915_utils.h +++ b/drivers/gpu/drm/i915/i915_utils.h @@ -123,12 +123,6 @@ static inline u64 ptr_to_u64(const void *ptr) #include -static inline int list_is_first(const struct list_head *list, - const struct list_head *head) -{ - return head->next == list; -} - static inline void __list_del_many(struct list_head *head, struct list_head *first) { diff --git a/include/linux/list.h b/include/linux/list.h index edb7628e46ed..79626b5ab36c 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -206,6 +206,17 @@ static inline void list_bulk_move_tail(struct list_head *head, head->prev = last; } +/** + * list_is_first -- tests whether @ list is the first entry in list @head + * @list: the entry to test + * @head: the head of the list + */ +static inline int list_is_first(const struct list_head *list, + const struct list_head *head) +{ + return list->prev == head; +} + /** * list_is_last - tests whether @list is the last entry in list @head * @list: the entry to test diff --git a/mm/compaction.c b/mm/compaction.c index 3d11c209614a..55f7ab142af2 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1040,6 +1040,12 @@ static bool suitable_migration_target(struct compact_control *cc, return false; } +static inline unsigned int +freelist_scan_limit(struct compact_control *cc) +{ + return (COMPACT_CLUSTER_MAX >> cc->fast_search_fail) + 1; +} + /* * Test whether the free scanner has reached the same or lower pageblock than * the migration scanner, and compaction should thus terminate. @@ -1050,6 +1056,19 @@ static inline bool compact_scanners_met(struct compact_control *cc) <= (cc->migrate_pfn >> pageblock_order); } +/* Reorder the free list to reduce repeated future searches */ +static void +move_freelist_tail(struct list_head *freelist, struct page *freepage) +{ + LIST_HEAD(sublist); + + if (!list_is_first(freelist, &freepage->lru)) { + list_cut_position(&sublist, freelist, &freepage->lru); + if (!list_empty(&sublist)) + list_splice_tail(&sublist, freelist); + } +} + /* * Based on information in the current compact_control, find blocks * suitable for isolating free pages from and then isolate them. @@ -1207,6 +1226,148 @@ typedef enum { */ int sysctl_compact_unevictable_allowed __read_mostly = 1; +static inline void +update_fast_start_pfn(struct compact_control *cc, unsigned long pfn) +{ + if (cc->fast_start_pfn == ULONG_MAX) + return; + + if (!cc->fast_start_pfn) + cc->fast_start_pfn = pfn; + + cc->fast_start_pfn = min(cc->fast_start_pfn, pfn); +} + +static inline unsigned long +reinit_migrate_pfn(struct compact_control *cc) +{ + if (!cc->fast_start_pfn || cc->fast_start_pfn == ULONG_MAX) + return cc->migrate_pfn; + + cc->migrate_pfn = cc->fast_start_pfn; + cc->fast_start_pfn = ULONG_MAX; + + return cc->migrate_pfn; +} + +/* + * Briefly search the free lists for a migration source that already has + * some free pages to reduce the number of pages that need migration + * before a pageblock is free. + */ +static unsigned long fast_find_migrateblock(struct compact_control *cc) +{ + unsigned int limit = freelist_scan_limit(cc); + unsigned int nr_scanned = 0; + unsigned long distance; + unsigned long pfn = cc->migrate_pfn; + unsigned long high_pfn; + int order; + + /* Skip hints are relied on to avoid repeats on the fast search */ + if (cc->ignore_skip_hint) + return pfn; + + /* + * If the migrate_pfn is not at the start of a zone or the start + * of a pageblock then assume this is a continuation of a previous + * scan restarted due to COMPACT_CLUSTER_MAX. + */ + if (pfn != cc->zone->zone_start_pfn && pfn != pageblock_start_pfn(pfn)) + return pfn; + + /* + * For smaller orders, just linearly scan as the number of pages + * to migrate should be relatively small and does not necessarily + * justify freeing up a large block for a small allocation. + */ + if (cc->order <= PAGE_ALLOC_COSTLY_ORDER) + return pfn; + + /* + * Only allow kcompactd and direct requests for movable pages to + * quickly clear out a MOVABLE pageblock for allocation. This + * reduces the risk that a large movable pageblock is freed for + * an unmovable/reclaimable small allocation. + */ + if (cc->direct_compaction && cc->migratetype != MIGRATE_MOVABLE) + return pfn; + + /* + * When starting the migration scanner, pick any pageblock within the + * first half of the search space. Otherwise try and pick a pageblock + * within the first eighth to reduce the chances that a migration + * target later becomes a source. + */ + distance = (cc->free_pfn - cc->migrate_pfn) >> 1; + if (cc->migrate_pfn != cc->zone->zone_start_pfn) + distance >>= 2; + high_pfn = pageblock_start_pfn(cc->migrate_pfn + distance); + + for (order = cc->order - 1; + order >= PAGE_ALLOC_COSTLY_ORDER && pfn == cc->migrate_pfn && nr_scanned < limit; + order--) { + struct free_area *area = &cc->zone->free_area[order]; + struct list_head *freelist; + unsigned long flags; + struct page *freepage; + + if (!area->nr_free) + continue; + + spin_lock_irqsave(&cc->zone->lock, flags); + freelist = &area->free_list[MIGRATE_MOVABLE]; + list_for_each_entry(freepage, freelist, lru) { + unsigned long free_pfn; + + nr_scanned++; + free_pfn = page_to_pfn(freepage); + if (free_pfn < high_pfn) { + update_fast_start_pfn(cc, free_pfn); + + /* + * Avoid if skipped recently. Ideally it would + * move to the tail but even safe iteration of + * the list assumes an entry is deleted, not + * reordered. + */ + if (get_pageblock_skip(freepage)) { + if (list_is_last(freelist, &freepage->lru)) + break; + + continue; + } + + /* Reorder to so a future search skips recent pages */ + move_freelist_tail(freelist, freepage); + + pfn = pageblock_start_pfn(free_pfn); + cc->fast_search_fail = 0; + set_pageblock_skip(freepage); + break; + } + + if (nr_scanned >= limit) { + cc->fast_search_fail++; + move_freelist_tail(freelist, freepage); + break; + } + } + spin_unlock_irqrestore(&cc->zone->lock, flags); + } + + cc->total_migrate_scanned += nr_scanned; + + /* + * If fast scanning failed then use a cached entry for a page block + * that had free pages as the basis for starting a linear scan. + */ + if (pfn == cc->migrate_pfn) + pfn = reinit_migrate_pfn(cc); + + return pfn; +} + /* * Isolate all pages that can be migrated from the first suitable block, * starting at the block pointed to by the migrate scanner pfn within @@ -1222,16 +1383,25 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, const isolate_mode_t isolate_mode = (sysctl_compact_unevictable_allowed ? ISOLATE_UNEVICTABLE : 0) | (cc->mode != MIGRATE_SYNC ? ISOLATE_ASYNC_MIGRATE : 0); + bool fast_find_block; /* * Start at where we last stopped, or beginning of the zone as - * initialized by compact_zone() + * initialized by compact_zone(). The first failure will use + * the lowest PFN as the starting point for linear scanning. */ - low_pfn = cc->migrate_pfn; + low_pfn = fast_find_migrateblock(cc); block_start_pfn = pageblock_start_pfn(low_pfn); if (block_start_pfn < zone->zone_start_pfn) block_start_pfn = zone->zone_start_pfn; + /* + * fast_find_migrateblock marks a pageblock skipped so to avoid + * the isolation_suitable check below, check whether the fast + * search was successful. + */ + fast_find_block = low_pfn != cc->migrate_pfn && !cc->fast_search_fail; + /* Only scan within a pageblock boundary */ block_end_pfn = pageblock_end_pfn(low_pfn); @@ -1240,6 +1410,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, * Do not cross the free scanner. */ for (; block_end_pfn <= cc->free_pfn; + fast_find_block = false, low_pfn = block_end_pfn, block_start_pfn = block_end_pfn, block_end_pfn += pageblock_nr_pages) { @@ -1259,7 +1430,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, continue; /* If isolation recently failed, do not retry */ - if (!isolation_suitable(cc, page)) + if (!isolation_suitable(cc, page) && !fast_find_block) continue; /* @@ -1550,6 +1721,7 @@ static enum compact_result compact_zone(struct compact_control *cc) * want to compact the whole zone), but check that it is initialised * by ensuring the values are within zone boundaries. */ + cc->fast_start_pfn = 0; if (cc->whole_zone) { cc->migrate_pfn = start_pfn; cc->free_pfn = pageblock_start_pfn(end_pfn - 1); diff --git a/mm/internal.h b/mm/internal.h index 9b32f4cab0ae..983cb975545f 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -188,9 +188,11 @@ struct compact_control { unsigned int nr_migratepages; /* Number of pages to migrate */ unsigned long free_pfn; /* isolate_freepages search base */ unsigned long migrate_pfn; /* isolate_migratepages search base */ + unsigned long fast_start_pfn; /* a pfn to start linear scan from */ struct zone *zone; unsigned long total_migrate_scanned; unsigned long total_free_scanned; + unsigned int fast_search_fail; /* failures to use free list searches */ const gfp_t gfp_mask; /* gfp mask of a direct compactor */ int order; /* order a direct compactor needs */ int migratetype; /* migratetype of direct compactor */ From e380bebe4771548df9bece8b7ad9dab07d9158a6 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 5 Mar 2019 15:44:58 -0800 Subject: [PATCH 061/159] mm, compaction: keep migration source private to a single compaction instance Due to either a fast search of the free list or a linear scan, it is possible for multiple compaction instances to pick the same pageblock for migration. This is lucky for one scanner and increased scanning for all the others. It also allows a race between requests on which first allocates the resulting free block. This patch tests and updates the pageblock skip for the migration scanner carefully. When isolating a block, it will check and skip if the block is already in use. Once the zone lock is acquired, it will be rechecked so that only one scanner can set the pageblock skip for exclusive use. Any scanner contending will continue with a linear scan. The skip bit is still set if no pages can be isolated in a range. While this may result in redundant scanning, it avoids unnecessarily acquiring the zone lock when there are no suitable migration sources. 1-socket thpscale Amean fault-both-1 0.00 ( 0.00%) 0.00 * 0.00%* Amean fault-both-3 3390.40 ( 0.00%) 3024.41 ( 10.80%) Amean fault-both-5 5082.28 ( 0.00%) 4749.30 ( 6.55%) Amean fault-both-7 7012.51 ( 0.00%) 6454.95 ( 7.95%) Amean fault-both-12 11346.63 ( 0.00%) 10324.83 ( 9.01%) Amean fault-both-18 15324.19 ( 0.00%) 12896.82 * 15.84%* Amean fault-both-24 16088.50 ( 0.00%) 13470.60 * 16.27%* Amean fault-both-30 18723.42 ( 0.00%) 17143.99 ( 8.44%) Amean fault-both-32 18612.01 ( 0.00%) 17743.91 ( 4.66%) 5.0.0-rc1 5.0.0-rc1 findmig-v3r15 isolmig-v3r15 Percentage huge-3 89.83 ( 0.00%) 92.96 ( 3.48%) Percentage huge-5 91.96 ( 0.00%) 93.26 ( 1.41%) Percentage huge-7 92.85 ( 0.00%) 93.63 ( 0.84%) Percentage huge-12 92.74 ( 0.00%) 92.80 ( 0.07%) Percentage huge-18 91.71 ( 0.00%) 91.62 ( -0.10%) Percentage huge-24 92.13 ( 0.00%) 91.50 ( -0.69%) Percentage huge-30 93.79 ( 0.00%) 92.73 ( -1.13%) Percentage huge-32 91.27 ( 0.00%) 91.94 ( 0.74%) This shows a reasonable reduction in latency as multiple compaction scanners do not operate on the same blocks with a similar allocation success rate. Compaction migrate scanned 41093126 25646769 Migration scan rates are reduced by 38%. Link: http://lkml.kernel.org/r/20190118175136.31341-11-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Andrea Arcangeli Cc: Dan Carpenter Cc: David Rientjes Cc: YueHaibing Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 124 ++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 99 insertions(+), 25 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index 55f7ab142af2..097572e2ec5d 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -285,13 +285,52 @@ void reset_isolation_suitable(pg_data_t *pgdat) } } +/* + * Sets the pageblock skip bit if it was clear. Note that this is a hint as + * locks are not required for read/writers. Returns true if it was already set. + */ +static bool test_and_set_skip(struct compact_control *cc, struct page *page, + unsigned long pfn) +{ + bool skip; + + /* Do no update if skip hint is being ignored */ + if (cc->ignore_skip_hint) + return false; + + if (!IS_ALIGNED(pfn, pageblock_nr_pages)) + return false; + + skip = get_pageblock_skip(page); + if (!skip && !cc->no_set_skip_hint) + set_pageblock_skip(page); + + return skip; +} + +static void update_cached_migrate(struct compact_control *cc, unsigned long pfn) +{ + struct zone *zone = cc->zone; + + pfn = pageblock_end_pfn(pfn); + + /* Set for isolation rather than compaction */ + if (cc->no_set_skip_hint) + return; + + if (pfn > zone->compact_cached_migrate_pfn[0]) + zone->compact_cached_migrate_pfn[0] = pfn; + if (cc->mode != MIGRATE_ASYNC && + pfn > zone->compact_cached_migrate_pfn[1]) + zone->compact_cached_migrate_pfn[1] = pfn; +} + /* * If no pages were isolated then mark this pageblock to be skipped in the * future. The information is later cleared by __reset_isolation_suitable(). */ static void update_pageblock_skip(struct compact_control *cc, - struct page *page, unsigned long nr_isolated, - bool migrate_scanner) + struct page *page, unsigned long nr_isolated) { struct zone *zone = cc->zone; unsigned long pfn; @@ -310,16 +349,8 @@ static void update_pageblock_skip(struct compact_control *cc, pfn = page_to_pfn(page); /* Update where async and sync compaction should restart */ - if (migrate_scanner) { - if (pfn > zone->compact_cached_migrate_pfn[0]) - zone->compact_cached_migrate_pfn[0] = pfn; - if (cc->mode != MIGRATE_ASYNC && - pfn > zone->compact_cached_migrate_pfn[1]) - zone->compact_cached_migrate_pfn[1] = pfn; - } else { - if (pfn < zone->compact_cached_free_pfn) - zone->compact_cached_free_pfn = pfn; - } + if (pfn < zone->compact_cached_free_pfn) + zone->compact_cached_free_pfn = pfn; } #else static inline bool isolation_suitable(struct compact_control *cc, @@ -334,10 +365,19 @@ static inline bool pageblock_skip_persistent(struct page *page) } static inline void update_pageblock_skip(struct compact_control *cc, - struct page *page, unsigned long nr_isolated, - bool migrate_scanner) + struct page *page, unsigned long nr_isolated) { } + +static void update_cached_migrate(struct compact_control *cc, unsigned long pfn) +{ +} + +static bool test_and_set_skip(struct compact_control *cc, struct page *page, + unsigned long pfn) +{ + return false; +} #endif /* CONFIG_COMPACTION */ /* @@ -567,7 +607,7 @@ isolate_fail: /* Update the pageblock-skip if the whole pageblock was scanned */ if (blockpfn == end_pfn) - update_pageblock_skip(cc, valid_page, total_isolated, false); + update_pageblock_skip(cc, valid_page, total_isolated); cc->total_free_scanned += nr_scanned; if (total_isolated) @@ -702,6 +742,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, unsigned long start_pfn = low_pfn; bool skip_on_failure = false; unsigned long next_skip_pfn = 0; + bool skip_updated = false; /* * Ensure that there are not too many pages isolated from the LRU @@ -768,8 +809,19 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, page = pfn_to_page(low_pfn); - if (!valid_page) + /* + * Check if the pageblock has already been marked skipped. + * Only the aligned PFN is checked as the caller isolates + * COMPACT_CLUSTER_MAX at a time so the second call must + * not falsely conclude that the block should be skipped. + */ + if (!valid_page && IS_ALIGNED(low_pfn, pageblock_nr_pages)) { + if (!cc->ignore_skip_hint && get_pageblock_skip(page)) { + low_pfn = end_pfn; + goto isolate_abort; + } valid_page = page; + } /* * Skip if free. We read page order here without zone lock @@ -850,8 +902,19 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, if (!locked) { locked = compact_trylock_irqsave(zone_lru_lock(zone), &flags, cc); - if (!locked) + + /* Allow future scanning if the lock is contended */ + if (!locked) { + clear_pageblock_skip(page); break; + } + + /* Try get exclusive access under lock */ + if (!skip_updated) { + skip_updated = true; + if (test_and_set_skip(cc, page, low_pfn)) + goto isolate_abort; + } /* Recheck PageLRU and PageCompound under lock */ if (!PageLRU(page)) @@ -929,15 +992,20 @@ isolate_fail: if (unlikely(low_pfn > end_pfn)) low_pfn = end_pfn; +isolate_abort: if (locked) spin_unlock_irqrestore(zone_lru_lock(zone), flags); /* - * Update the pageblock-skip information and cached scanner pfn, - * if the whole pageblock was scanned without isolating any page. + * Updated the cached scanner pfn if the pageblock was scanned + * without isolating a page. The pageblock may not be marked + * skipped already if there were no LRU pages in the block. */ - if (low_pfn == end_pfn) - update_pageblock_skip(cc, valid_page, nr_isolated, true); + if (low_pfn == end_pfn && !nr_isolated) { + if (valid_page && !skip_updated) + set_pageblock_skip(valid_page); + update_cached_migrate(cc, low_pfn); + } trace_mm_compaction_isolate_migratepages(start_pfn, low_pfn, nr_scanned, nr_isolated); @@ -1323,8 +1391,6 @@ static unsigned long fast_find_migrateblock(struct compact_control *cc) nr_scanned++; free_pfn = page_to_pfn(freepage); if (free_pfn < high_pfn) { - update_fast_start_pfn(cc, free_pfn); - /* * Avoid if skipped recently. Ideally it would * move to the tail but even safe iteration of @@ -1341,6 +1407,7 @@ static unsigned long fast_find_migrateblock(struct compact_control *cc) /* Reorder to so a future search skips recent pages */ move_freelist_tail(freelist, freepage); + update_fast_start_pfn(cc, free_pfn); pfn = pageblock_start_pfn(free_pfn); cc->fast_search_fail = 0; set_pageblock_skip(freepage); @@ -1429,8 +1496,15 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, if (!page) continue; - /* If isolation recently failed, do not retry */ - if (!isolation_suitable(cc, page) && !fast_find_block) + /* + * If isolation recently failed, do not retry. Only check the + * pageblock once. COMPACT_CLUSTER_MAX causes a pageblock + * to be visited multiple times. Assume skip was checked + * before making it "skip" so other compaction instances do + * not scan the same block. + */ + if (IS_ALIGNED(low_pfn, pageblock_nr_pages) && + !fast_find_block && !isolation_suitable(cc, page)) continue; /* From 5a811889de10f1ebb8e03a2744be006e909c405c Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 5 Mar 2019 15:45:01 -0800 Subject: [PATCH 062/159] mm, compaction: use free lists to quickly locate a migration target Similar to the migration scanner, this patch uses the free lists to quickly locate a migration target. The search is different in that lower orders will be searched for a suitable high PFN if necessary but the search is still bound. This is justified on the grounds that the free scanner typically scans linearly much more than the migration scanner. If a free page is found, it is isolated and compaction continues if enough pages were isolated. For SYNC* scanning, the full pageblock is scanned for any remaining free pages so that is can be marked for skipping in the near future. 1-socket thpfioscale 5.0.0-rc1 5.0.0-rc1 isolmig-v3r15 findfree-v3r16 Amean fault-both-3 3024.41 ( 0.00%) 3200.68 ( -5.83%) Amean fault-both-5 4749.30 ( 0.00%) 4847.75 ( -2.07%) Amean fault-both-7 6454.95 ( 0.00%) 6658.92 ( -3.16%) Amean fault-both-12 10324.83 ( 0.00%) 11077.62 ( -7.29%) Amean fault-both-18 12896.82 ( 0.00%) 12403.97 ( 3.82%) Amean fault-both-24 13470.60 ( 0.00%) 15607.10 * -15.86%* Amean fault-both-30 17143.99 ( 0.00%) 18752.27 ( -9.38%) Amean fault-both-32 17743.91 ( 0.00%) 21207.54 * -19.52%* The impact on latency is variable but the search is optimistic and sensitive to the exact system state. Success rates are similar but the major impact is to the rate of scanning 5.0.0-rc1 5.0.0-rc1 isolmig-v3r15 findfree-v3r16 Compaction migrate scanned 25646769 29507205 Compaction free scanned 201558184 100359571 The free scan rates are reduced by 50%. The 2-socket reductions for the free scanner are more dramatic which is a likely reflection that the machine has more memory. [dan.carpenter@oracle.com: fix static checker warning] [vbabka@suse.cz: correct number of pages scanned for lower orders] Link: http://lkml.kernel.org/r/20190118175136.31341-12-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Dan Carpenter Cc: Andrea Arcangeli Cc: David Rientjes Cc: YueHaibing Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 218 ++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 213 insertions(+), 5 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index 097572e2ec5d..6d42ea126242 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1124,7 +1124,29 @@ static inline bool compact_scanners_met(struct compact_control *cc) <= (cc->migrate_pfn >> pageblock_order); } -/* Reorder the free list to reduce repeated future searches */ +/* + * Used when scanning for a suitable migration target which scans freelists + * in reverse. Reorders the list such as the unscanned pages are scanned + * first on the next iteration of the free scanner + */ +static void +move_freelist_head(struct list_head *freelist, struct page *freepage) +{ + LIST_HEAD(sublist); + + if (!list_is_last(freelist, &freepage->lru)) { + list_cut_before(&sublist, freelist, &freepage->lru); + if (!list_empty(&sublist)) + list_splice_tail(&sublist, freelist); + } +} + +/* + * Similar to move_freelist_head except used by the migration scanner + * when scanning forward. It's possible for these list operations to + * move against each other if they search the free list exactly in + * lockstep. + */ static void move_freelist_tail(struct list_head *freelist, struct page *freepage) { @@ -1137,6 +1159,186 @@ move_freelist_tail(struct list_head *freelist, struct page *freepage) } } +static void +fast_isolate_around(struct compact_control *cc, unsigned long pfn, unsigned long nr_isolated) +{ + unsigned long start_pfn, end_pfn; + struct page *page = pfn_to_page(pfn); + + /* Do not search around if there are enough pages already */ + if (cc->nr_freepages >= cc->nr_migratepages) + return; + + /* Minimise scanning during async compaction */ + if (cc->direct_compaction && cc->mode == MIGRATE_ASYNC) + return; + + /* Pageblock boundaries */ + start_pfn = pageblock_start_pfn(pfn); + end_pfn = min(start_pfn + pageblock_nr_pages, zone_end_pfn(cc->zone)); + + /* Scan before */ + if (start_pfn != pfn) { + isolate_freepages_block(cc, &start_pfn, pfn, &cc->freepages, false); + if (cc->nr_freepages >= cc->nr_migratepages) + return; + } + + /* Scan after */ + start_pfn = pfn + nr_isolated; + if (start_pfn != end_pfn) + isolate_freepages_block(cc, &start_pfn, end_pfn, &cc->freepages, false); + + /* Skip this pageblock in the future as it's full or nearly full */ + if (cc->nr_freepages < cc->nr_migratepages) + set_pageblock_skip(page); +} + +static unsigned long +fast_isolate_freepages(struct compact_control *cc) +{ + unsigned int limit = min(1U, freelist_scan_limit(cc) >> 1); + unsigned int nr_scanned = 0; + unsigned long low_pfn, min_pfn, high_pfn = 0, highest = 0; + unsigned long nr_isolated = 0; + unsigned long distance; + struct page *page = NULL; + bool scan_start = false; + int order; + + /* Full compaction passes in a negative order */ + if (cc->order <= 0) + return cc->free_pfn; + + /* + * If starting the scan, use a deeper search and use the highest + * PFN found if a suitable one is not found. + */ + if (cc->free_pfn == pageblock_start_pfn(zone_end_pfn(cc->zone) - 1)) { + limit = pageblock_nr_pages >> 1; + scan_start = true; + } + + /* + * Preferred point is in the top quarter of the scan space but take + * a pfn from the top half if the search is problematic. + */ + distance = (cc->free_pfn - cc->migrate_pfn); + low_pfn = pageblock_start_pfn(cc->free_pfn - (distance >> 2)); + min_pfn = pageblock_start_pfn(cc->free_pfn - (distance >> 1)); + + if (WARN_ON_ONCE(min_pfn > low_pfn)) + low_pfn = min_pfn; + + for (order = cc->order - 1; + order >= 0 && !page; + order--) { + struct free_area *area = &cc->zone->free_area[order]; + struct list_head *freelist; + struct page *freepage; + unsigned long flags; + unsigned int order_scanned = 0; + + if (!area->nr_free) + continue; + + spin_lock_irqsave(&cc->zone->lock, flags); + freelist = &area->free_list[MIGRATE_MOVABLE]; + list_for_each_entry_reverse(freepage, freelist, lru) { + unsigned long pfn; + + order_scanned++; + nr_scanned++; + pfn = page_to_pfn(freepage); + + if (pfn >= highest) + highest = pageblock_start_pfn(pfn); + + if (pfn >= low_pfn) { + cc->fast_search_fail = 0; + page = freepage; + break; + } + + if (pfn >= min_pfn && pfn > high_pfn) { + high_pfn = pfn; + + /* Shorten the scan if a candidate is found */ + limit >>= 1; + } + + if (order_scanned >= limit) + break; + } + + /* Use a minimum pfn if a preferred one was not found */ + if (!page && high_pfn) { + page = pfn_to_page(high_pfn); + + /* Update freepage for the list reorder below */ + freepage = page; + } + + /* Reorder to so a future search skips recent pages */ + move_freelist_head(freelist, freepage); + + /* Isolate the page if available */ + if (page) { + if (__isolate_free_page(page, order)) { + set_page_private(page, order); + nr_isolated = 1 << order; + cc->nr_freepages += nr_isolated; + list_add_tail(&page->lru, &cc->freepages); + count_compact_events(COMPACTISOLATED, nr_isolated); + } else { + /* If isolation fails, abort the search */ + order = -1; + page = NULL; + } + } + + spin_unlock_irqrestore(&cc->zone->lock, flags); + + /* + * Smaller scan on next order so the total scan ig related + * to freelist_scan_limit. + */ + if (order_scanned >= limit) + limit = min(1U, limit >> 1); + } + + if (!page) { + cc->fast_search_fail++; + if (scan_start) { + /* + * Use the highest PFN found above min. If one was + * not found, be pessemistic for direct compaction + * and use the min mark. + */ + if (highest) { + page = pfn_to_page(highest); + cc->free_pfn = highest; + } else { + if (cc->direct_compaction) { + page = pfn_to_page(min_pfn); + cc->free_pfn = min_pfn; + } + } + } + } + + if (highest && highest > cc->zone->compact_cached_free_pfn) + cc->zone->compact_cached_free_pfn = highest; + + cc->total_free_scanned += nr_scanned; + if (!page) + return cc->free_pfn; + + low_pfn = page_to_pfn(page); + fast_isolate_around(cc, low_pfn, nr_isolated); + return low_pfn; +} + /* * Based on information in the current compact_control, find blocks * suitable for isolating free pages from and then isolate them. @@ -1151,6 +1353,11 @@ static void isolate_freepages(struct compact_control *cc) unsigned long low_pfn; /* lowest pfn scanner is able to scan */ struct list_head *freelist = &cc->freepages; + /* Try a small search of the free lists for a candidate */ + isolate_start_pfn = fast_isolate_freepages(cc); + if (cc->nr_freepages) + goto splitmap; + /* * Initialise the free scanner. The starting point is where we last * successfully isolated from, zone-cached value, or the end of the @@ -1163,7 +1370,7 @@ static void isolate_freepages(struct compact_control *cc) * is using. */ isolate_start_pfn = cc->free_pfn; - block_start_pfn = pageblock_start_pfn(cc->free_pfn); + block_start_pfn = pageblock_start_pfn(isolate_start_pfn); block_end_pfn = min(block_start_pfn + pageblock_nr_pages, zone_end_pfn(zone)); low_pfn = pageblock_end_pfn(cc->migrate_pfn); @@ -1227,9 +1434,6 @@ static void isolate_freepages(struct compact_control *cc) } } - /* __isolate_free_page() does not map the pages */ - split_map_pages(freelist); - /* * Record where the free scanner will restart next time. Either we * broke from the loop and set isolate_start_pfn based on the last @@ -1237,6 +1441,10 @@ static void isolate_freepages(struct compact_control *cc) * and the loop terminated due to isolate_start_pfn < low_pfn */ cc->free_pfn = isolate_start_pfn; + +splitmap: + /* __isolate_free_page() does not map the pages */ + split_map_pages(freelist); } /* From 804d3121ba5f03af0ab225e2f688ee3ee669c0d2 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 5 Mar 2019 15:45:07 -0800 Subject: [PATCH 063/159] mm, compaction: avoid rescanning the same pageblock multiple times Pageblocks are marked for skip when no pages are isolated after a scan. However, it's possible to hit corner cases where the migration scanner gets stuck near the boundary between the source and target scanner. Due to pages being migrated in blocks of COMPACT_CLUSTER_MAX, pages that are migrated can be reallocated before the pageblock is complete. The pageblock is not necessarily skipped so it can be rescanned multiple times. Similarly, a pageblock with some dirty/writeback pages may fail to migrate and be rescanned until writeback completes which is wasteful. This patch tracks if a pageblock is being rescanned. If so, then the entire pageblock will be migrated as one operation. This narrows the race window during which pages can be reallocated during migration. Secondly, if there are pages that cannot be isolated then the pageblock will still be fully scanned and marked for skipping. On the second rescan, the pageblock skip is set and the migration scanner makes progress. 5.0.0-rc1 5.0.0-rc1 findfree-v3r16 norescan-v3r16 Amean fault-both-1 0.00 ( 0.00%) 0.00 * 0.00%* Amean fault-both-3 3200.68 ( 0.00%) 3002.07 ( 6.21%) Amean fault-both-5 4847.75 ( 0.00%) 4684.47 ( 3.37%) Amean fault-both-7 6658.92 ( 0.00%) 6815.54 ( -2.35%) Amean fault-both-12 11077.62 ( 0.00%) 10864.02 ( 1.93%) Amean fault-both-18 12403.97 ( 0.00%) 12247.52 ( 1.26%) Amean fault-both-24 15607.10 ( 0.00%) 15683.99 ( -0.49%) Amean fault-both-30 18752.27 ( 0.00%) 18620.02 ( 0.71%) Amean fault-both-32 21207.54 ( 0.00%) 19250.28 * 9.23%* 5.0.0-rc1 5.0.0-rc1 findfree-v3r16 norescan-v3r16 Percentage huge-3 96.86 ( 0.00%) 95.00 ( -1.91%) Percentage huge-5 93.72 ( 0.00%) 94.22 ( 0.53%) Percentage huge-7 94.31 ( 0.00%) 92.35 ( -2.08%) Percentage huge-12 92.66 ( 0.00%) 91.90 ( -0.82%) Percentage huge-18 91.51 ( 0.00%) 89.58 ( -2.11%) Percentage huge-24 90.50 ( 0.00%) 90.03 ( -0.52%) Percentage huge-30 91.57 ( 0.00%) 89.14 ( -2.65%) Percentage huge-32 91.00 ( 0.00%) 90.58 ( -0.46%) Negligible difference but this was likely a case when the specific corner case was not hit. A previous run of the same patch based on an earlier iteration of the series showed large differences where migration rates could be halved when the corner case was hit. The specific corner case where migration scan rates go through the roof was due to a dirty/writeback pageblock located at the boundary of the migration/free scanner did not happen in this case. When it does happen, the scan rates multipled by massive margins. Link: http://lkml.kernel.org/r/20190118175136.31341-13-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Andrea Arcangeli Cc: Dan Carpenter Cc: David Rientjes Cc: YueHaibing Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 32 ++++++++++++++++++++++++++------ mm/internal.h | 1 + 2 files changed, 27 insertions(+), 6 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index 6d42ea126242..00a5126b6548 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -949,8 +949,11 @@ isolate_success: cc->nr_migratepages++; nr_isolated++; - /* Avoid isolating too much */ - if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) { + /* + * Avoid isolating too much unless this block is being + * rescanned (e.g. dirty/writeback pages, parallel allocation). + */ + if (cc->nr_migratepages == COMPACT_CLUSTER_MAX && !cc->rescan) { ++low_pfn; break; } @@ -997,11 +1000,14 @@ isolate_abort: spin_unlock_irqrestore(zone_lru_lock(zone), flags); /* - * Updated the cached scanner pfn if the pageblock was scanned - * without isolating a page. The pageblock may not be marked - * skipped already if there were no LRU pages in the block. + * Updated the cached scanner pfn once the pageblock has been scanned + * Pages will either be migrated in which case there is no point + * scanning in the near future or migration failed in which case the + * failure reason may persist. The block is marked for skipping if + * there were no pages isolated in the block or if the block is + * rescanned twice in a row. */ - if (low_pfn == end_pfn && !nr_isolated) { + if (low_pfn == end_pfn && (!nr_isolated || cc->rescan)) { if (valid_page && !skip_updated) set_pageblock_skip(valid_page); update_cached_migrate(cc, low_pfn); @@ -2035,6 +2041,20 @@ static enum compact_result compact_zone(struct compact_control *cc) int err; unsigned long start_pfn = cc->migrate_pfn; + /* + * Avoid multiple rescans which can happen if a page cannot be + * isolated (dirty/writeback in async mode) or if the migrated + * pages are being allocated before the pageblock is cleared. + * The first rescan will capture the entire pageblock for + * migration. If it fails, it'll be marked skip and scanning + * will proceed as normal. + */ + cc->rescan = false; + if (pageblock_start_pfn(last_migrated_pfn) == + pageblock_start_pfn(start_pfn)) { + cc->rescan = true; + } + switch (isolate_migratepages(cc->zone, cc)) { case ISOLATE_ABORT: ret = COMPACT_CONTENDED; diff --git a/mm/internal.h b/mm/internal.h index 983cb975545f..d5b999e5eb5f 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -205,6 +205,7 @@ struct compact_control { bool direct_compaction; /* False from kcompactd or /proc/... */ bool whole_zone; /* Whole zone should/has been scanned */ bool contended; /* Signal lock or sched contention */ + bool rescan; /* Rescanning the same pageblock */ }; unsigned long From cb2dcaf023c2cf12d45289c82d4030d33f7df73e Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 5 Mar 2019 15:45:11 -0800 Subject: [PATCH 064/159] mm, compaction: finish pageblock scanning on contention Async migration aborts on spinlock contention but contention can be high when there are multiple compaction attempts and kswapd is active. The consequence is that the migration scanners move forward uselessly while still contending on locks for longer while leaving suitable migration sources behind. This patch will acquire the lock but track when contention occurs. When it does, the current pageblock will finish as compaction may succeed for that block and then abort. This will have a variable impact on latency as in some cases useless scanning is avoided (reduces latency) but a lock will be contended (increase latency) or a single contended pageblock is scanned that would otherwise have been skipped (increase latency). 5.0.0-rc1 5.0.0-rc1 norescan-v3r16 finishcontend-v3r16 Amean fault-both-1 0.00 ( 0.00%) 0.00 * 0.00%* Amean fault-both-3 3002.07 ( 0.00%) 3153.17 ( -5.03%) Amean fault-both-5 4684.47 ( 0.00%) 4280.52 ( 8.62%) Amean fault-both-7 6815.54 ( 0.00%) 5811.50 * 14.73%* Amean fault-both-12 10864.02 ( 0.00%) 9276.85 ( 14.61%) Amean fault-both-18 12247.52 ( 0.00%) 11032.67 ( 9.92%) Amean fault-both-24 15683.99 ( 0.00%) 14285.70 ( 8.92%) Amean fault-both-30 18620.02 ( 0.00%) 16293.76 * 12.49%* Amean fault-both-32 19250.28 ( 0.00%) 16721.02 * 13.14%* 5.0.0-rc1 5.0.0-rc1 norescan-v3r16 finishcontend-v3r16 Percentage huge-1 0.00 ( 0.00%) 0.00 ( 0.00%) Percentage huge-3 95.00 ( 0.00%) 96.82 ( 1.92%) Percentage huge-5 94.22 ( 0.00%) 95.40 ( 1.26%) Percentage huge-7 92.35 ( 0.00%) 95.92 ( 3.86%) Percentage huge-12 91.90 ( 0.00%) 96.73 ( 5.25%) Percentage huge-18 89.58 ( 0.00%) 96.77 ( 8.03%) Percentage huge-24 90.03 ( 0.00%) 96.05 ( 6.69%) Percentage huge-30 89.14 ( 0.00%) 96.81 ( 8.60%) Percentage huge-32 90.58 ( 0.00%) 97.41 ( 7.54%) There is a variable impact that is mostly good on latency while allocation success rates are slightly higher. System CPU usage is reduced by about 10% but scan rate impact is mixed Compaction migrate scanned 27997659.00 20148867 Compaction free scanned 120782791.00 118324914 Migration scan rates are reduced 28% which is expected as a pageblock is used by the async scanner instead of skipped. The impact on the free scanner is known to be variable. Overall the primary justification for this patch is that completing scanning of a pageblock is very important for later patches. [yuehaibing@huawei.com: fix unused variable warning] Link: http://lkml.kernel.org/r/20190118175136.31341-14-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: YueHaibing Cc: Andrea Arcangeli Cc: Dan Carpenter Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 90 +++++++++++++++++++------------------------------ 1 file changed, 34 insertions(+), 56 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index 00a5126b6548..5325211398f8 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -382,24 +382,25 @@ static bool test_and_set_skip(struct compact_control *cc, struct page *page, /* * Compaction requires the taking of some coarse locks that are potentially - * very heavily contended. For async compaction, back out if the lock cannot - * be taken immediately. For sync compaction, spin on the lock if needed. + * very heavily contended. For async compaction, trylock and record if the + * lock is contended. The lock will still be acquired but compaction will + * abort when the current block is finished regardless of success rate. + * Sync compaction acquires the lock. * - * Returns true if the lock is held - * Returns false if the lock is not held and compaction should abort + * Always returns true which makes it easier to track lock state in callers. */ -static bool compact_trylock_irqsave(spinlock_t *lock, unsigned long *flags, +static bool compact_lock_irqsave(spinlock_t *lock, unsigned long *flags, struct compact_control *cc) { - if (cc->mode == MIGRATE_ASYNC) { - if (!spin_trylock_irqsave(lock, *flags)) { - cc->contended = true; - return false; - } - } else { - spin_lock_irqsave(lock, *flags); + /* Track if the lock is contended in async mode */ + if (cc->mode == MIGRATE_ASYNC && !cc->contended) { + if (spin_trylock_irqsave(lock, *flags)) + return true; + + cc->contended = true; } + spin_lock_irqsave(lock, *flags); return true; } @@ -432,10 +433,8 @@ static bool compact_unlock_should_abort(spinlock_t *lock, } if (need_resched()) { - if (cc->mode == MIGRATE_ASYNC) { + if (cc->mode == MIGRATE_ASYNC) cc->contended = true; - return true; - } cond_resched(); } @@ -455,10 +454,8 @@ static inline bool compact_should_abort(struct compact_control *cc) { /* async compaction aborts if contended */ if (need_resched()) { - if (cc->mode == MIGRATE_ASYNC) { + if (cc->mode == MIGRATE_ASYNC) cc->contended = true; - return true; - } cond_resched(); } @@ -535,18 +532,8 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, * recheck as well. */ if (!locked) { - /* - * The zone lock must be held to isolate freepages. - * Unfortunately this is a very coarse lock and can be - * heavily contended if there are parallel allocations - * or parallel compactions. For async compaction do not - * spin on the lock and we acquire the lock as late as - * possible. - */ - locked = compact_trylock_irqsave(&cc->zone->lock, + locked = compact_lock_irqsave(&cc->zone->lock, &flags, cc); - if (!locked) - break; /* Recheck this is a buddy page under lock */ if (!PageBuddy(page)) @@ -900,15 +887,9 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, /* If we already hold the lock, we can skip some rechecking */ if (!locked) { - locked = compact_trylock_irqsave(zone_lru_lock(zone), + locked = compact_lock_irqsave(zone_lru_lock(zone), &flags, cc); - /* Allow future scanning if the lock is contended */ - if (!locked) { - clear_pageblock_skip(page); - break; - } - /* Try get exclusive access under lock */ if (!skip_updated) { skip_updated = true; @@ -951,9 +932,12 @@ isolate_success: /* * Avoid isolating too much unless this block is being - * rescanned (e.g. dirty/writeback pages, parallel allocation). + * rescanned (e.g. dirty/writeback pages, parallel allocation) + * or a lock is contended. For contention, isolate quickly to + * potentially remove one source of contention. */ - if (cc->nr_migratepages == COMPACT_CLUSTER_MAX && !cc->rescan) { + if (cc->nr_migratepages == COMPACT_CLUSTER_MAX && + !cc->rescan && !cc->contended) { ++low_pfn; break; } @@ -1416,12 +1400,8 @@ static void isolate_freepages(struct compact_control *cc) isolate_freepages_block(cc, &isolate_start_pfn, block_end_pfn, freelist, false); - /* - * If we isolated enough freepages, or aborted due to lock - * contention, terminate. - */ - if ((cc->nr_freepages >= cc->nr_migratepages) - || cc->contended) { + /* Are enough freepages isolated? */ + if (cc->nr_freepages >= cc->nr_migratepages) { if (isolate_start_pfn >= block_end_pfn) { /* * Restart at previous pageblock if more @@ -1463,13 +1443,8 @@ static struct page *compaction_alloc(struct page *migratepage, struct compact_control *cc = (struct compact_control *)data; struct page *freepage; - /* - * Isolate free pages if necessary, and if we are not aborting due to - * contention. - */ if (list_empty(&cc->freepages)) { - if (!cc->contended) - isolate_freepages(cc); + isolate_freepages(cc); if (list_empty(&cc->freepages)) return NULL; @@ -1733,7 +1708,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, low_pfn = isolate_migratepages_block(cc, low_pfn, block_end_pfn, isolate_mode); - if (!low_pfn || cc->contended) + if (!low_pfn) return ISOLATE_ABORT; /* @@ -1763,9 +1738,7 @@ static enum compact_result __compact_finished(struct compact_control *cc) { unsigned int order; const int migratetype = cc->migratetype; - - if (cc->contended || fatal_signal_pending(current)) - return COMPACT_CONTENDED; + int ret; /* Compaction run completes if the migrate and free scanner meet */ if (compact_scanners_met(cc)) { @@ -1800,6 +1773,7 @@ static enum compact_result __compact_finished(struct compact_control *cc) return COMPACT_CONTINUE; /* Direct compactor: Is a suitable page free? */ + ret = COMPACT_NO_SUITABLE_PAGE; for (order = cc->order; order < MAX_ORDER; order++) { struct free_area *area = &cc->zone->free_area[order]; bool can_steal; @@ -1839,11 +1813,15 @@ static enum compact_result __compact_finished(struct compact_control *cc) return COMPACT_SUCCESS; } - return COMPACT_CONTINUE; + ret = COMPACT_CONTINUE; + break; } } - return COMPACT_NO_SUITABLE_PAGE; + if (cc->contended || fatal_signal_pending(current)) + ret = COMPACT_CONTENDED; + + return ret; } static enum compact_result compact_finished(struct compact_control *cc) From 9bebefd59084af7c75b66eeee241bf0777f39b88 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 5 Mar 2019 15:45:14 -0800 Subject: [PATCH 065/159] mm, compaction: check early for huge pages encountered by the migration scanner When scanning for sources or targets, PageCompound is checked for huge pages as they can be skipped quickly but it happens relatively late after a lot of setup and checking. This patch short-cuts the check to make it earlier. It might still change when the lock is acquired but this has less overhead overall. The free scanner advances but the migration scanner does not. Typically the free scanner encounters more movable blocks that change state over the lifetime of the system and also tends to scan more aggressively as it's actively filling its portion of the physical address space with data. This could change in the future but for the moment, this worked better in practice and incurred fewer scan restarts. The impact on latency and allocation success rates is marginal but the free scan rates are reduced by 15% and system CPU usage is reduced by 3.3%. The 2-socket results are not materially different. Link: http://lkml.kernel.org/r/20190118175136.31341-15-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Andrea Arcangeli Cc: Dan Carpenter Cc: David Rientjes Cc: YueHaibing Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index 5325211398f8..e609415059e8 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1061,6 +1061,9 @@ static bool suitable_migration_source(struct compact_control *cc, { int block_mt; + if (pageblock_skip_persistent(page)) + return false; + if ((cc->mode != MIGRATE_ASYNC) || !cc->direct_compaction) return true; @@ -1697,12 +1700,17 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, continue; /* - * For async compaction, also only scan in MOVABLE blocks. - * Async compaction is optimistic to see if the minimum amount - * of work satisfies the allocation. + * For async compaction, also only scan in MOVABLE blocks + * without huge pages. Async compaction is optimistic to see + * if the minimum amount of work satisfies the allocation. + * The cached PFN is updated as it's possible that all + * remaining blocks between source and target are unsuitable + * and the compaction scanners fail to meet. */ - if (!suitable_migration_source(cc, page)) + if (!suitable_migration_source(cc, page)) { + update_cached_migrate(cc, block_end_pfn); continue; + } /* Perform the isolation */ low_pfn = isolate_migratepages_block(cc, low_pfn, From 8854c55f54bcc104e3adae42abe16948286ec75c Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 5 Mar 2019 15:45:18 -0800 Subject: [PATCH 066/159] mm, compaction: keep cached migration PFNs synced for unusable pageblocks Migrate has separate cached PFNs for ASYNC and SYNC* migration on the basis that some migrations will fail in ASYNC mode. However, if the cached PFNs match at the start of scanning and pageblocks are skipped due to having no isolation candidates, then the sync state does not matter. This patch keeps matching cached PFNs in sync until a pageblock with isolation candidates is found. The actual benefit is marginal given that the sync scanner following the async scanner will often skip a number of pageblocks but it's useless work. Any benefit depends heavily on whether the scanners restarted recently. Link: http://lkml.kernel.org/r/20190118175136.31341-16-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Andrea Arcangeli Cc: Dan Carpenter Cc: David Rientjes Cc: YueHaibing Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/mm/compaction.c b/mm/compaction.c index e609415059e8..78ae182aaf34 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1971,6 +1971,7 @@ static enum compact_result compact_zone(struct compact_control *cc) unsigned long end_pfn = zone_end_pfn(cc->zone); unsigned long last_migrated_pfn; const bool sync = cc->mode != MIGRATE_ASYNC; + bool update_cached; cc->migratetype = gfpflags_to_migratetype(cc->gfp_mask); ret = compaction_suitable(cc->zone, cc->order, cc->alloc_flags, @@ -2018,6 +2019,17 @@ static enum compact_result compact_zone(struct compact_control *cc) last_migrated_pfn = 0; + /* + * Migrate has separate cached PFNs for ASYNC and SYNC* migration on + * the basis that some migrations will fail in ASYNC mode. However, + * if the cached PFNs match and pageblocks are skipped due to having + * no isolation candidates, then the sync state does not matter. + * Until a pageblock with isolation candidates is found, keep the + * cached PFNs in sync to avoid revisiting the same blocks. + */ + update_cached = !sync && + cc->zone->compact_cached_migrate_pfn[0] == cc->zone->compact_cached_migrate_pfn[1]; + trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, cc->free_pfn, end_pfn, sync); @@ -2049,6 +2061,11 @@ static enum compact_result compact_zone(struct compact_control *cc) last_migrated_pfn = 0; goto out; case ISOLATE_NONE: + if (update_cached) { + cc->zone->compact_cached_migrate_pfn[1] = + cc->zone->compact_cached_migrate_pfn[0]; + } + /* * We haven't isolated and migrated anything, but * there might still be unflushed migrations from @@ -2056,6 +2073,7 @@ static enum compact_result compact_zone(struct compact_control *cc) */ goto check_drain; case ISOLATE_SUCCESS: + update_cached = false; last_migrated_pfn = start_pfn; ; } From cb810ad294d3c3a454e51b12fbb483bbb7096b98 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 5 Mar 2019 15:45:21 -0800 Subject: [PATCH 067/159] mm, compaction: rework compact_should_abort as compact_check_resched With incremental changes, compact_should_abort no longer makes any documented sense. Rename to compact_check_resched and update the associated comments. There is no benefit other than reducing redundant code and making the intent slightly clearer. It could potentially be merged with earlier patches but it just makes the review slightly harder. Link: http://lkml.kernel.org/r/20190118175136.31341-17-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Andrea Arcangeli Cc: Dan Carpenter Cc: David Rientjes Cc: YueHaibing Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 61 +++++++++++++++++++------------------------------ 1 file changed, 23 insertions(+), 38 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index 78ae182aaf34..68e3c214bcbd 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -404,6 +404,21 @@ static bool compact_lock_irqsave(spinlock_t *lock, unsigned long *flags, return true; } +/* + * Aside from avoiding lock contention, compaction also periodically checks + * need_resched() and records async compaction as contended if necessary. + */ +static inline void compact_check_resched(struct compact_control *cc) +{ + /* async compaction aborts if contended */ + if (need_resched()) { + if (cc->mode == MIGRATE_ASYNC) + cc->contended = true; + + cond_resched(); + } +} + /* * Compaction requires the taking of some coarse locks that are potentially * very heavily contended. The lock should be periodically unlocked to avoid @@ -432,33 +447,7 @@ static bool compact_unlock_should_abort(spinlock_t *lock, return true; } - if (need_resched()) { - if (cc->mode == MIGRATE_ASYNC) - cc->contended = true; - cond_resched(); - } - - return false; -} - -/* - * Aside from avoiding lock contention, compaction also periodically checks - * need_resched() and either schedules in sync compaction or aborts async - * compaction. This is similar to what compact_unlock_should_abort() does, but - * is used where no lock is concerned. - * - * Returns false when no scheduling was needed, or sync compaction scheduled. - * Returns true when async compaction should abort. - */ -static inline bool compact_should_abort(struct compact_control *cc) -{ - /* async compaction aborts if contended */ - if (need_resched()) { - if (cc->mode == MIGRATE_ASYNC) - cc->contended = true; - - cond_resched(); - } + compact_check_resched(cc); return false; } @@ -747,8 +736,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, return 0; } - if (compact_should_abort(cc)) - return 0; + compact_check_resched(cc); if (cc->direct_compaction && (cc->mode == MIGRATE_ASYNC)) { skip_on_failure = true; @@ -1379,12 +1367,10 @@ static void isolate_freepages(struct compact_control *cc) isolate_start_pfn = block_start_pfn) { /* * This can iterate a massively long zone without finding any - * suitable migration targets, so periodically check if we need - * to schedule, or even abort async compaction. + * suitable migration targets, so periodically check resched. */ - if (!(block_start_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages)) - && compact_should_abort(cc)) - break; + if (!(block_start_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages))) + compact_check_resched(cc); page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn, zone); @@ -1677,11 +1663,10 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, /* * This can potentially iterate a massively long zone with * many pageblocks unsuitable, so periodically check if we - * need to schedule, or even abort async compaction. + * need to schedule. */ - if (!(low_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages)) - && compact_should_abort(cc)) - break; + if (!(low_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages))) + compact_check_resched(cc); page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn, zone); From cf66f0700c8f1d7c7c1c1d7e5e846a1836814601 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 5 Mar 2019 15:45:24 -0800 Subject: [PATCH 068/159] mm, compaction: do not consider a need to reschedule as contention Scanning on large machines can take a considerable length of time and eventually need to be rescheduled. This is treated as an abort event but that's not appropriate as the attempt is likely to be retried after making numerous checks and taking another cycle through the page allocator. This patch will check the need to reschedule if necessary but continue the scanning. The main benefit is reduced scanning when compaction is taking a long time or the machine is over-saturated. It also avoids an unnecessary exit of compaction that ends up being retried by the page allocator in the outer loop. 5.0.0-rc1 5.0.0-rc1 synccached-v3r16 noresched-v3r17 Amean fault-both-1 0.00 ( 0.00%) 0.00 * 0.00%* Amean fault-both-3 2958.27 ( 0.00%) 2965.68 ( -0.25%) Amean fault-both-5 4091.90 ( 0.00%) 3995.90 ( 2.35%) Amean fault-both-7 5803.05 ( 0.00%) 5842.12 ( -0.67%) Amean fault-both-12 9481.06 ( 0.00%) 9550.87 ( -0.74%) Amean fault-both-18 14141.51 ( 0.00%) 13304.72 ( 5.92%) Amean fault-both-24 16438.00 ( 0.00%) 14618.59 ( 11.07%) Amean fault-both-30 17531.72 ( 0.00%) 16650.96 ( 5.02%) Amean fault-both-32 17101.96 ( 0.00%) 17145.15 ( -0.25%) Link: http://lkml.kernel.org/r/20190118175136.31341-18-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Andrea Arcangeli Cc: Dan Carpenter Cc: David Rientjes Cc: YueHaibing Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 23 ++++------------------- 1 file changed, 4 insertions(+), 19 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index 68e3c214bcbd..9c7d43fd4655 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -404,21 +404,6 @@ static bool compact_lock_irqsave(spinlock_t *lock, unsigned long *flags, return true; } -/* - * Aside from avoiding lock contention, compaction also periodically checks - * need_resched() and records async compaction as contended if necessary. - */ -static inline void compact_check_resched(struct compact_control *cc) -{ - /* async compaction aborts if contended */ - if (need_resched()) { - if (cc->mode == MIGRATE_ASYNC) - cc->contended = true; - - cond_resched(); - } -} - /* * Compaction requires the taking of some coarse locks that are potentially * very heavily contended. The lock should be periodically unlocked to avoid @@ -447,7 +432,7 @@ static bool compact_unlock_should_abort(spinlock_t *lock, return true; } - compact_check_resched(cc); + cond_resched(); return false; } @@ -736,7 +721,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, return 0; } - compact_check_resched(cc); + cond_resched(); if (cc->direct_compaction && (cc->mode == MIGRATE_ASYNC)) { skip_on_failure = true; @@ -1370,7 +1355,7 @@ static void isolate_freepages(struct compact_control *cc) * suitable migration targets, so periodically check resched. */ if (!(block_start_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages))) - compact_check_resched(cc); + cond_resched(); page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn, zone); @@ -1666,7 +1651,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, * need to schedule. */ if (!(low_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages))) - compact_check_resched(cc); + cond_resched(); page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn, zone); From d097a6f63522547dfc7c75c7084a05b6a7f9e838 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 5 Mar 2019 15:45:28 -0800 Subject: [PATCH 069/159] mm, compaction: reduce premature advancement of the migration target scanner The fast isolation of free pages allows the cached PFN of the free scanner to advance faster than necessary depending on the contents of the free list. The key is that fast_isolate_freepages() can update zone->compact_cached_free_pfn via isolate_freepages_block(). When the fast search fails, the linear scan can start from a point that has skipped valid migration targets, particularly pageblocks with just low-order free pages. This can cause the migration source/target scanners to meet prematurely causing a reset. This patch starts by avoiding an update of the pageblock skip information and cached PFN from isolate_freepages_block() and puts the responsibility of updating that information in the callers. The fast scanner will update the cached PFN if and only if it finds a block that is higher than the existing cached PFN and sets the skip if the pageblock is full or nearly full. The linear scanner will update skipped information and the cached PFN only when a block is completely scanned. The total impact is that the free scanner advances more slowly as it is primarily driven by the linear scanner instead of the fast search. 5.0.0-rc1 5.0.0-rc1 noresched-v3r17 slowfree-v3r17 Amean fault-both-3 2965.68 ( 0.00%) 3036.75 ( -2.40%) Amean fault-both-5 3995.90 ( 0.00%) 4522.24 * -13.17%* Amean fault-both-7 5842.12 ( 0.00%) 6365.35 ( -8.96%) Amean fault-both-12 9550.87 ( 0.00%) 10340.93 ( -8.27%) Amean fault-both-18 13304.72 ( 0.00%) 14732.46 ( -10.73%) Amean fault-both-24 14618.59 ( 0.00%) 16288.96 ( -11.43%) Amean fault-both-30 16650.96 ( 0.00%) 16346.21 ( 1.83%) Amean fault-both-32 17145.15 ( 0.00%) 19317.49 ( -12.67%) The impact to latency is higher than the last version but it appears to be due to a slight increase in the free scan rates which is a potential side-effect of the patch. However, this is necessary for later patches that are more careful about how pageblocks are treated as earlier iterations of those patches hit corner cases where the restarts were punishing and very visible. Link: http://lkml.kernel.org/r/20190118175136.31341-19-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Andrea Arcangeli Cc: Dan Carpenter Cc: David Rientjes Cc: YueHaibing Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 27 ++++++++++----------------- 1 file changed, 10 insertions(+), 17 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index 9c7d43fd4655..452beef0541e 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -330,10 +330,9 @@ static void update_cached_migrate(struct compact_control *cc, unsigned long pfn) * future. The information is later cleared by __reset_isolation_suitable(). */ static void update_pageblock_skip(struct compact_control *cc, - struct page *page, unsigned long nr_isolated) + struct page *page, unsigned long pfn) { struct zone *zone = cc->zone; - unsigned long pfn; if (cc->no_set_skip_hint) return; @@ -341,13 +340,8 @@ static void update_pageblock_skip(struct compact_control *cc, if (!page) return; - if (nr_isolated) - return; - set_pageblock_skip(page); - pfn = page_to_pfn(page); - /* Update where async and sync compaction should restart */ if (pfn < zone->compact_cached_free_pfn) zone->compact_cached_free_pfn = pfn; @@ -365,7 +359,7 @@ static inline bool pageblock_skip_persistent(struct page *page) } static inline void update_pageblock_skip(struct compact_control *cc, - struct page *page, unsigned long nr_isolated) + struct page *page, unsigned long pfn) { } @@ -449,7 +443,7 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, bool strict) { int nr_scanned = 0, total_isolated = 0; - struct page *cursor, *valid_page = NULL; + struct page *cursor; unsigned long flags = 0; bool locked = false; unsigned long blockpfn = *start_pfn; @@ -476,9 +470,6 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, if (!pfn_valid_within(blockpfn)) goto isolate_fail; - if (!valid_page) - valid_page = page; - /* * For compound pages such as THP and hugetlbfs, we can save * potentially a lot of iterations if we skip them at once. @@ -566,10 +557,6 @@ isolate_fail: if (strict && blockpfn < end_pfn) total_isolated = 0; - /* Update the pageblock-skip if the whole pageblock was scanned */ - if (blockpfn == end_pfn) - update_pageblock_skip(cc, valid_page, total_isolated); - cc->total_free_scanned += nr_scanned; if (total_isolated) count_compact_events(COMPACTISOLATED, total_isolated); @@ -1293,8 +1280,10 @@ fast_isolate_freepages(struct compact_control *cc) } } - if (highest && highest > cc->zone->compact_cached_free_pfn) + if (highest && highest >= cc->zone->compact_cached_free_pfn) { + highest -= pageblock_nr_pages; cc->zone->compact_cached_free_pfn = highest; + } cc->total_free_scanned += nr_scanned; if (!page) @@ -1374,6 +1363,10 @@ static void isolate_freepages(struct compact_control *cc) isolate_freepages_block(cc, &isolate_start_pfn, block_end_pfn, freelist, false); + /* Update the skip hint if the full pageblock was scanned */ + if (isolate_start_pfn == block_end_pfn) + update_pageblock_skip(cc, page, block_start_pfn); + /* Are enough freepages isolated? */ if (cc->nr_freepages >= cc->nr_migratepages) { if (isolate_start_pfn >= block_end_pfn) { From dbe2d4e4f12e07c6a2215e3603a5f77056323081 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 5 Mar 2019 15:45:31 -0800 Subject: [PATCH 070/159] mm, compaction: round-robin the order while searching the free lists for a target As compaction proceeds and creates high-order blocks, the free list search gets less efficient as the larger blocks are used as compaction targets. Eventually, the larger blocks will be behind the migration scanner for partially migrated pageblocks and the search fails. This patch round-robins what orders are searched so that larger blocks can be ignored and find smaller blocks that can be used as migration targets. The overall impact was small on 1-socket but it avoids corner cases where the migration/free scanners meet prematurely or situations where many of the pageblocks encountered by the free scanner are almost full instead of being properly packed. Previous testing had indicated that without this patch there were occasional large spikes in the free scanner without this patch. [dan.carpenter@oracle.com: fix static checker warning] Link: http://lkml.kernel.org/r/20190118175136.31341-20-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Andrea Arcangeli Cc: David Rientjes Cc: YueHaibing Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 33 ++++++++++++++++++++++++++++++--- mm/internal.h | 3 ++- 2 files changed, 32 insertions(+), 4 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index 452beef0541e..b3055983a80f 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1147,6 +1147,24 @@ fast_isolate_around(struct compact_control *cc, unsigned long pfn, unsigned long set_pageblock_skip(page); } +/* Search orders in round-robin fashion */ +static int next_search_order(struct compact_control *cc, int order) +{ + order--; + if (order < 0) + order = cc->order - 1; + + /* Search wrapped around? */ + if (order == cc->search_order) { + cc->search_order--; + if (cc->search_order < 0) + cc->search_order = cc->order - 1; + return -1; + } + + return order; +} + static unsigned long fast_isolate_freepages(struct compact_control *cc) { @@ -1183,9 +1201,15 @@ fast_isolate_freepages(struct compact_control *cc) if (WARN_ON_ONCE(min_pfn > low_pfn)) low_pfn = min_pfn; - for (order = cc->order - 1; - order >= 0 && !page; - order--) { + /* + * Search starts from the last successful isolation order or the next + * order to search after a previous failure + */ + cc->search_order = min_t(unsigned int, cc->order - 1, cc->search_order); + + for (order = cc->search_order; + !page && order >= 0; + order = next_search_order(cc, order)) { struct free_area *area = &cc->zone->free_area[order]; struct list_head *freelist; struct page *freepage; @@ -1209,6 +1233,7 @@ fast_isolate_freepages(struct compact_control *cc) if (pfn >= low_pfn) { cc->fast_search_fail = 0; + cc->search_order = order; page = freepage; break; } @@ -2138,6 +2163,7 @@ static enum compact_result compact_zone_order(struct zone *zone, int order, .total_migrate_scanned = 0, .total_free_scanned = 0, .order = order, + .search_order = order, .gfp_mask = gfp_mask, .zone = zone, .mode = (prio == COMPACT_PRIO_ASYNC) ? @@ -2369,6 +2395,7 @@ static void kcompactd_do_work(pg_data_t *pgdat) struct zone *zone; struct compact_control cc = { .order = pgdat->kcompactd_max_order, + .search_order = pgdat->kcompactd_max_order, .total_migrate_scanned = 0, .total_free_scanned = 0, .classzone_idx = pgdat->kcompactd_classzone_idx, diff --git a/mm/internal.h b/mm/internal.h index d5b999e5eb5f..31bb0be6fd52 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -192,7 +192,8 @@ struct compact_control { struct zone *zone; unsigned long total_migrate_scanned; unsigned long total_free_scanned; - unsigned int fast_search_fail; /* failures to use free list searches */ + unsigned short fast_search_fail;/* failures to use free list searches */ + short search_order; /* order to start a fast search at */ const gfp_t gfp_mask; /* gfp mask of a direct compactor */ int order; /* order a direct compactor needs */ int migratetype; /* migratetype of direct compactor */ From 4fca9730c51d51f643f2a3f8f10ebd718349c80f Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 5 Mar 2019 15:45:34 -0800 Subject: [PATCH 071/159] mm, compaction: sample pageblocks for free pages Once fast searching finishes, there is a possibility that the linear scanner is scanning full blocks found by the fast scanner earlier. This patch uses an adaptive stride to sample pageblocks for free pages. The more consecutive full pageblocks encountered, the larger the stride until a pageblock with free pages is found. The scanners might meet slightly sooner but it is an acceptable risk given that the search of the free lists may still encounter the pages and adjust the cached PFN of the free scanner accordingly. 5.0.0-rc1 5.0.0-rc1 roundrobin-v3r17 samplefree-v3r17 Amean fault-both-1 0.00 ( 0.00%) 0.00 * 0.00%* Amean fault-both-3 2752.37 ( 0.00%) 2729.95 ( 0.81%) Amean fault-both-5 4341.69 ( 0.00%) 4397.80 ( -1.29%) Amean fault-both-7 6308.75 ( 0.00%) 6097.61 ( 3.35%) Amean fault-both-12 10241.81 ( 0.00%) 9407.15 ( 8.15%) Amean fault-both-18 13736.09 ( 0.00%) 10857.63 * 20.96%* Amean fault-both-24 16853.95 ( 0.00%) 13323.24 * 20.95%* Amean fault-both-30 15862.61 ( 0.00%) 17345.44 ( -9.35%) Amean fault-both-32 18450.85 ( 0.00%) 16892.00 ( 8.45%) The latency is mildly improved offseting some overhead from earlier patches that are prerequisites for the rest of the series. However, a major impact is on the free scan rate with an 82% reduction. 5.0.0-rc1 5.0.0-rc1 roundrobin-v3r17 samplefree-v3r17 Compaction migrate scanned 21607271 20116887 Compaction free scanned 95336406 16668703 It's also the first time in the series where the number of pages scanned by the migration scanner is greater than the free scanner due to the increased search efficiency. Link: http://lkml.kernel.org/r/20190118175136.31341-21-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Andrea Arcangeli Cc: Dan Carpenter Cc: David Rientjes Cc: YueHaibing Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index b3055983a80f..b83cdb42f249 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -440,6 +440,7 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, unsigned long *start_pfn, unsigned long end_pfn, struct list_head *freelist, + unsigned int stride, bool strict) { int nr_scanned = 0, total_isolated = 0; @@ -449,10 +450,14 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, unsigned long blockpfn = *start_pfn; unsigned int order; + /* Strict mode is for isolation, speed is secondary */ + if (strict) + stride = 1; + cursor = pfn_to_page(blockpfn); /* Isolate free pages. */ - for (; blockpfn < end_pfn; blockpfn++, cursor++) { + for (; blockpfn < end_pfn; blockpfn += stride, cursor += stride) { int isolated; struct page *page = cursor; @@ -614,7 +619,7 @@ isolate_freepages_range(struct compact_control *cc, break; isolated = isolate_freepages_block(cc, &isolate_start_pfn, - block_end_pfn, &freelist, true); + block_end_pfn, &freelist, 0, true); /* * In strict mode, isolate_freepages_block() returns 0 if @@ -1132,7 +1137,7 @@ fast_isolate_around(struct compact_control *cc, unsigned long pfn, unsigned long /* Scan before */ if (start_pfn != pfn) { - isolate_freepages_block(cc, &start_pfn, pfn, &cc->freepages, false); + isolate_freepages_block(cc, &start_pfn, pfn, &cc->freepages, 1, false); if (cc->nr_freepages >= cc->nr_migratepages) return; } @@ -1140,7 +1145,7 @@ fast_isolate_around(struct compact_control *cc, unsigned long pfn, unsigned long /* Scan after */ start_pfn = pfn + nr_isolated; if (start_pfn != end_pfn) - isolate_freepages_block(cc, &start_pfn, end_pfn, &cc->freepages, false); + isolate_freepages_block(cc, &start_pfn, end_pfn, &cc->freepages, 1, false); /* Skip this pageblock in the future as it's full or nearly full */ if (cc->nr_freepages < cc->nr_migratepages) @@ -1332,6 +1337,7 @@ static void isolate_freepages(struct compact_control *cc) unsigned long block_end_pfn; /* end of current pageblock */ unsigned long low_pfn; /* lowest pfn scanner is able to scan */ struct list_head *freelist = &cc->freepages; + unsigned int stride; /* Try a small search of the free lists for a candidate */ isolate_start_pfn = fast_isolate_freepages(cc); @@ -1354,6 +1360,7 @@ static void isolate_freepages(struct compact_control *cc) block_end_pfn = min(block_start_pfn + pageblock_nr_pages, zone_end_pfn(zone)); low_pfn = pageblock_end_pfn(cc->migrate_pfn); + stride = cc->mode == MIGRATE_ASYNC ? COMPACT_CLUSTER_MAX : 1; /* * Isolate free pages until enough are available to migrate the @@ -1364,6 +1371,8 @@ static void isolate_freepages(struct compact_control *cc) block_end_pfn = block_start_pfn, block_start_pfn -= pageblock_nr_pages, isolate_start_pfn = block_start_pfn) { + unsigned long nr_isolated; + /* * This can iterate a massively long zone without finding any * suitable migration targets, so periodically check resched. @@ -1385,8 +1394,8 @@ static void isolate_freepages(struct compact_control *cc) continue; /* Found a block suitable for isolating free pages from. */ - isolate_freepages_block(cc, &isolate_start_pfn, block_end_pfn, - freelist, false); + nr_isolated = isolate_freepages_block(cc, &isolate_start_pfn, + block_end_pfn, freelist, stride, false); /* Update the skip hint if the full pageblock was scanned */ if (isolate_start_pfn == block_end_pfn) @@ -1410,6 +1419,13 @@ static void isolate_freepages(struct compact_control *cc) */ break; } + + /* Adjust stride depending on isolation */ + if (nr_isolated) { + stride = 1; + continue; + } + stride = min_t(unsigned int, COMPACT_CLUSTER_MAX, stride << 1); } /* From e332f741a8dd1ec9a6dc8aa997296ecbfe64323e Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 5 Mar 2019 15:45:38 -0800 Subject: [PATCH 072/159] mm, compaction: be selective about what pageblocks to clear skip hints Pageblock hints are cleared when compaction restarts or kswapd makes enough progress that it can sleep but it's over-eager in that the bit is cleared for migration sources with no LRU pages and migration targets with no free pages. As pageblock skip hint flushes are relatively rare and out-of-band with respect to kswapd, this patch makes a few more expensive checks to see if it's appropriate to even clear the bit. Every pageblock that is not cleared will avoid 512 pages being scanned unnecessarily on x86-64. The impact is variable with different workloads showing small differences in latency, success rates and scan rates. This is expected as clearing the hints is not that common but doing a small amount of work out-of-band to avoid a large amount of work in-band later is generally a good thing. Link: http://lkml.kernel.org/r/20190118175136.31341-22-mgorman@techsingularity.net Signed-off-by: Mel Gorman Signed-off-by: Qian Cai Acked-by: Vlastimil Babka Cc: Andrea Arcangeli Cc: Dan Carpenter Cc: David Rientjes Cc: YueHaibing [cai@lca.pw: no stuck in __reset_isolation_pfn()] Link: http://lkml.kernel.org/r/20190206034732.75687-1-cai@lca.pw Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 2 + mm/compaction.c | 124 +++++++++++++++++++++++++++++++++++------ 2 files changed, 108 insertions(+), 18 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 842f9189537b..90c13cdeefb5 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -480,6 +480,8 @@ struct zone { unsigned long compact_cached_free_pfn; /* pfn where async and sync compaction migration scanner should start */ unsigned long compact_cached_migrate_pfn[2]; + unsigned long compact_init_migrate_pfn; + unsigned long compact_init_free_pfn; #endif #ifdef CONFIG_COMPACTION diff --git a/mm/compaction.c b/mm/compaction.c index b83cdb42f249..3084cee77fda 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -237,6 +237,70 @@ static bool pageblock_skip_persistent(struct page *page) return false; } +static bool +__reset_isolation_pfn(struct zone *zone, unsigned long pfn, bool check_source, + bool check_target) +{ + struct page *page = pfn_to_online_page(pfn); + struct page *end_page; + unsigned long block_pfn; + + if (!page) + return false; + if (zone != page_zone(page)) + return false; + if (pageblock_skip_persistent(page)) + return false; + + /* + * If skip is already cleared do no further checking once the + * restart points have been set. + */ + if (check_source && check_target && !get_pageblock_skip(page)) + return true; + + /* + * If clearing skip for the target scanner, do not select a + * non-movable pageblock as the starting point. + */ + if (!check_source && check_target && + get_pageblock_migratetype(page) != MIGRATE_MOVABLE) + return false; + + /* + * Only clear the hint if a sample indicates there is either a + * free page or an LRU page in the block. One or other condition + * is necessary for the block to be a migration source/target. + */ + block_pfn = pageblock_start_pfn(pfn); + pfn = max(block_pfn, zone->zone_start_pfn); + page = pfn_to_page(pfn); + if (zone != page_zone(page)) + return false; + pfn = block_pfn + pageblock_nr_pages; + pfn = min(pfn, zone_end_pfn(zone)); + end_page = pfn_to_page(pfn); + + do { + if (pfn_valid_within(pfn)) { + if (check_source && PageLRU(page)) { + clear_pageblock_skip(page); + return true; + } + + if (check_target && PageBuddy(page)) { + clear_pageblock_skip(page); + return true; + } + } + + page += (1 << PAGE_ALLOC_COSTLY_ORDER); + pfn += (1 << PAGE_ALLOC_COSTLY_ORDER); + } while (page < end_page); + + return false; +} + /* * This function is called to clear all cached information on pageblocks that * should be skipped for page isolation when the migrate and free page scanner @@ -244,30 +308,54 @@ static bool pageblock_skip_persistent(struct page *page) */ static void __reset_isolation_suitable(struct zone *zone) { - unsigned long start_pfn = zone->zone_start_pfn; - unsigned long end_pfn = zone_end_pfn(zone); - unsigned long pfn; + unsigned long migrate_pfn = zone->zone_start_pfn; + unsigned long free_pfn = zone_end_pfn(zone); + unsigned long reset_migrate = free_pfn; + unsigned long reset_free = migrate_pfn; + bool source_set = false; + bool free_set = false; + + if (!zone->compact_blockskip_flush) + return; zone->compact_blockskip_flush = false; - /* Walk the zone and mark every pageblock as suitable for isolation */ - for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { - struct page *page; - + /* + * Walk the zone and update pageblock skip information. Source looks + * for PageLRU while target looks for PageBuddy. When the scanner + * is found, both PageBuddy and PageLRU are checked as the pageblock + * is suitable as both source and target. + */ + for (; migrate_pfn < free_pfn; migrate_pfn += pageblock_nr_pages, + free_pfn -= pageblock_nr_pages) { cond_resched(); - page = pfn_to_online_page(pfn); - if (!page) - continue; - if (zone != page_zone(page)) - continue; - if (pageblock_skip_persistent(page)) - continue; + /* Update the migrate PFN */ + if (__reset_isolation_pfn(zone, migrate_pfn, true, source_set) && + migrate_pfn < reset_migrate) { + source_set = true; + reset_migrate = migrate_pfn; + zone->compact_init_migrate_pfn = reset_migrate; + zone->compact_cached_migrate_pfn[0] = reset_migrate; + zone->compact_cached_migrate_pfn[1] = reset_migrate; + } - clear_pageblock_skip(page); + /* Update the free PFN */ + if (__reset_isolation_pfn(zone, free_pfn, free_set, true) && + free_pfn > reset_free) { + free_set = true; + reset_free = free_pfn; + zone->compact_init_free_pfn = reset_free; + zone->compact_cached_free_pfn = reset_free; + } } - reset_cached_positions(zone); + /* Leave no distance if no suitable block was reset */ + if (reset_migrate >= reset_free) { + zone->compact_cached_migrate_pfn[0] = migrate_pfn; + zone->compact_cached_migrate_pfn[1] = migrate_pfn; + zone->compact_cached_free_pfn = free_pfn; + } } void reset_isolation_suitable(pg_data_t *pgdat) @@ -1190,7 +1278,7 @@ fast_isolate_freepages(struct compact_control *cc) * If starting the scan, use a deeper search and use the highest * PFN found if a suitable one is not found. */ - if (cc->free_pfn == pageblock_start_pfn(zone_end_pfn(cc->zone) - 1)) { + if (cc->free_pfn >= cc->zone->compact_init_free_pfn) { limit = pageblock_nr_pages >> 1; scan_start = true; } @@ -2017,7 +2105,7 @@ static enum compact_result compact_zone(struct compact_control *cc) cc->zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn; } - if (cc->migrate_pfn == start_pfn) + if (cc->migrate_pfn <= cc->zone->compact_init_migrate_pfn) cc->whole_zone = true; } From 5e1f0f098b4649fad53011246bcaeff011ffdf5d Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 5 Mar 2019 15:45:41 -0800 Subject: [PATCH 073/159] mm, compaction: capture a page under direct compaction Compaction is inherently race-prone as a suitable page freed during compaction can be allocated by any parallel task. This patch uses a capture_control structure to isolate a page immediately when it is freed by a direct compactor in the slow path of the page allocator. The intent is to avoid redundant scanning. 5.0.0-rc1 5.0.0-rc1 selective-v3r17 capture-v3r19 Amean fault-both-1 0.00 ( 0.00%) 0.00 * 0.00%* Amean fault-both-3 2582.11 ( 0.00%) 2563.68 ( 0.71%) Amean fault-both-5 4500.26 ( 0.00%) 4233.52 ( 5.93%) Amean fault-both-7 5819.53 ( 0.00%) 6333.65 ( -8.83%) Amean fault-both-12 9321.18 ( 0.00%) 9759.38 ( -4.70%) Amean fault-both-18 9782.76 ( 0.00%) 10338.76 ( -5.68%) Amean fault-both-24 15272.81 ( 0.00%) 13379.55 * 12.40%* Amean fault-both-30 15121.34 ( 0.00%) 16158.25 ( -6.86%) Amean fault-both-32 18466.67 ( 0.00%) 18971.21 ( -2.73%) Latency is only moderately affected but the devil is in the details. A closer examination indicates that base page fault latency is reduced but latency of huge pages is increased as it takes creater care to succeed. Part of the "problem" is that allocation success rates are close to 100% even when under pressure and compaction gets harder 5.0.0-rc1 5.0.0-rc1 selective-v3r17 capture-v3r19 Percentage huge-3 96.70 ( 0.00%) 98.23 ( 1.58%) Percentage huge-5 96.99 ( 0.00%) 95.30 ( -1.75%) Percentage huge-7 94.19 ( 0.00%) 97.24 ( 3.24%) Percentage huge-12 94.95 ( 0.00%) 97.35 ( 2.53%) Percentage huge-18 96.74 ( 0.00%) 97.30 ( 0.58%) Percentage huge-24 97.07 ( 0.00%) 97.55 ( 0.50%) Percentage huge-30 95.69 ( 0.00%) 98.50 ( 2.95%) Percentage huge-32 96.70 ( 0.00%) 99.27 ( 2.65%) And scan rates are reduced as expected by 6% for the migration scanner and 29% for the free scanner indicating that there is less redundant work. Compaction migrate scanned 20815362 19573286 Compaction free scanned 16352612 11510663 [mgorman@techsingularity.net: remove redundant check] Link: http://lkml.kernel.org/r/20190201143853.GH9565@techsingularity.net Link: http://lkml.kernel.org/r/20190118175136.31341-23-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Andrea Arcangeli Cc: Dan Carpenter Cc: David Rientjes Cc: YueHaibing Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 3 +- include/linux/sched.h | 4 +++ kernel/sched/core.c | 3 ++ mm/compaction.c | 31 ++++++++++++---- mm/internal.h | 9 +++++ mm/page_alloc.c | 73 +++++++++++++++++++++++++++++++++++--- 6 files changed, 111 insertions(+), 12 deletions(-) diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 70d0256edd31..c960923d9ec2 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -93,7 +93,8 @@ extern int sysctl_compact_unevictable_allowed; extern int fragmentation_index(struct zone *zone, unsigned int order); extern enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, unsigned int alloc_flags, - const struct alloc_context *ac, enum compact_priority prio); + const struct alloc_context *ac, enum compact_priority prio, + struct page **page); extern void reset_isolation_suitable(pg_data_t *pgdat); extern enum compact_result compaction_suitable(struct zone *zone, int order, unsigned int alloc_flags, int classzone_idx); diff --git a/include/linux/sched.h b/include/linux/sched.h index f9b43c989577..ebfb34fb9b30 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -47,6 +47,7 @@ struct pid_namespace; struct pipe_inode_info; struct rcu_node; struct reclaim_state; +struct capture_control; struct robust_list_head; struct sched_attr; struct sched_param; @@ -958,6 +959,9 @@ struct task_struct { struct io_context *io_context; +#ifdef CONFIG_COMPACTION + struct capture_control *capture_control; +#endif /* Ptrace state: */ unsigned long ptrace_message; kernel_siginfo_t *last_siginfo; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7cbb5658be80..916e956e92be 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2190,6 +2190,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) INIT_HLIST_HEAD(&p->preempt_notifiers); #endif +#ifdef CONFIG_COMPACTION + p->capture_control = NULL; +#endif init_numa_balancing(clone_flags, p); } diff --git a/mm/compaction.c b/mm/compaction.c index 3084cee77fda..1cc871da3fda 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -2056,7 +2056,8 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order, return false; } -static enum compact_result compact_zone(struct compact_control *cc) +static enum compact_result +compact_zone(struct compact_control *cc, struct capture_control *capc) { enum compact_result ret; unsigned long start_pfn = cc->zone->zone_start_pfn; @@ -2225,6 +2226,11 @@ check_drain: } } + /* Stop if a page has been captured */ + if (capc && capc->page) { + ret = COMPACT_SUCCESS; + break; + } } out: @@ -2258,7 +2264,8 @@ out: static enum compact_result compact_zone_order(struct zone *zone, int order, gfp_t gfp_mask, enum compact_priority prio, - unsigned int alloc_flags, int classzone_idx) + unsigned int alloc_flags, int classzone_idx, + struct page **capture) { enum compact_result ret; struct compact_control cc = { @@ -2279,14 +2286,24 @@ static enum compact_result compact_zone_order(struct zone *zone, int order, .ignore_skip_hint = (prio == MIN_COMPACT_PRIORITY), .ignore_block_suitable = (prio == MIN_COMPACT_PRIORITY) }; + struct capture_control capc = { + .cc = &cc, + .page = NULL, + }; + + if (capture) + current->capture_control = &capc; INIT_LIST_HEAD(&cc.freepages); INIT_LIST_HEAD(&cc.migratepages); - ret = compact_zone(&cc); + ret = compact_zone(&cc, &capc); VM_BUG_ON(!list_empty(&cc.freepages)); VM_BUG_ON(!list_empty(&cc.migratepages)); + *capture = capc.page; + current->capture_control = NULL; + return ret; } @@ -2304,7 +2321,7 @@ int sysctl_extfrag_threshold = 500; */ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, unsigned int alloc_flags, const struct alloc_context *ac, - enum compact_priority prio) + enum compact_priority prio, struct page **capture) { int may_perform_io = gfp_mask & __GFP_IO; struct zoneref *z; @@ -2332,7 +2349,7 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, } status = compact_zone_order(zone, order, gfp_mask, prio, - alloc_flags, ac_classzone_idx(ac)); + alloc_flags, ac_classzone_idx(ac), capture); rc = max(status, rc); /* The allocation should succeed, stop compacting */ @@ -2400,7 +2417,7 @@ static void compact_node(int nid) INIT_LIST_HEAD(&cc.freepages); INIT_LIST_HEAD(&cc.migratepages); - compact_zone(&cc); + compact_zone(&cc, NULL); VM_BUG_ON(!list_empty(&cc.freepages)); VM_BUG_ON(!list_empty(&cc.migratepages)); @@ -2535,7 +2552,7 @@ static void kcompactd_do_work(pg_data_t *pgdat) if (kthread_should_stop()) return; - status = compact_zone(&cc); + status = compact_zone(&cc, NULL); if (status == COMPACT_SUCCESS) { compaction_defer_reset(zone, cc.order, false); diff --git a/mm/internal.h b/mm/internal.h index 31bb0be6fd52..9eeaf2b95166 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -209,6 +209,15 @@ struct compact_control { bool rescan; /* Rescanning the same pageblock */ }; +/* + * Used in direct compaction when a page should be taken from the freelists + * immediately when one is created during the free path. + */ +struct capture_control { + struct compact_control *cc; + struct page *page; +}; + unsigned long isolate_freepages_range(struct compact_control *cc, unsigned long start_pfn, unsigned long end_pfn); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2e132b9e7a93..09bf2c5f8b4b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -789,6 +789,57 @@ static inline int page_is_buddy(struct page *page, struct page *buddy, return 0; } +#ifdef CONFIG_COMPACTION +static inline struct capture_control *task_capc(struct zone *zone) +{ + struct capture_control *capc = current->capture_control; + + return capc && + !(current->flags & PF_KTHREAD) && + !capc->page && + capc->cc->zone == zone && + capc->cc->direct_compaction ? capc : NULL; +} + +static inline bool +compaction_capture(struct capture_control *capc, struct page *page, + int order, int migratetype) +{ + if (!capc || order != capc->cc->order) + return false; + + /* Do not accidentally pollute CMA or isolated regions*/ + if (is_migrate_cma(migratetype) || + is_migrate_isolate(migratetype)) + return false; + + /* + * Do not let lower order allocations polluate a movable pageblock. + * This might let an unmovable request use a reclaimable pageblock + * and vice-versa but no more than normal fallback logic which can + * have trouble finding a high-order free page. + */ + if (order < pageblock_order && migratetype == MIGRATE_MOVABLE) + return false; + + capc->page = page; + return true; +} + +#else +static inline struct capture_control *task_capc(struct zone *zone) +{ + return NULL; +} + +static inline bool +compaction_capture(struct capture_control *capc, struct page *page, + int order, int migratetype) +{ + return false; +} +#endif /* CONFIG_COMPACTION */ + /* * Freeing function for a buddy system allocator. * @@ -822,6 +873,7 @@ static inline void __free_one_page(struct page *page, unsigned long uninitialized_var(buddy_pfn); struct page *buddy; unsigned int max_order; + struct capture_control *capc = task_capc(zone); max_order = min_t(unsigned int, MAX_ORDER, pageblock_order + 1); @@ -837,6 +889,11 @@ static inline void __free_one_page(struct page *page, continue_merging: while (order < max_order - 1) { + if (compaction_capture(capc, page, order, migratetype)) { + __mod_zone_freepage_state(zone, -(1 << order), + migratetype); + return; + } buddy_pfn = __find_buddy_pfn(pfn, order); buddy = page + (buddy_pfn - pfn); @@ -3710,7 +3767,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, unsigned int alloc_flags, const struct alloc_context *ac, enum compact_priority prio, enum compact_result *compact_result) { - struct page *page; + struct page *page = NULL; unsigned long pflags; unsigned int noreclaim_flag; @@ -3721,13 +3778,15 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, noreclaim_flag = memalloc_noreclaim_save(); *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac, - prio); + prio, &page); memalloc_noreclaim_restore(noreclaim_flag); psi_memstall_leave(&pflags); - if (*compact_result <= COMPACT_INACTIVE) + if (*compact_result <= COMPACT_INACTIVE) { + WARN_ON_ONCE(page); return NULL; + } /* * At least in one zone compaction wasn't deferred or skipped, so let's @@ -3735,7 +3794,13 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, */ count_vm_event(COMPACTSTALL); - page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); + /* Prep a captured page if available */ + if (page) + prep_new_page(page, order, gfp_mask, alloc_flags); + + /* Try get a page from the freelist if available */ + if (!page) + page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); if (page) { struct zone *zone = page_zone(page); From 147e1a97c4a0bdd43f55a582a9416bb9092563a9 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 5 Mar 2019 15:45:45 -0800 Subject: [PATCH 074/159] fs: kernfs: add poll file operation Patch series "psi: pressure stall monitors", v3. Android is adopting psi to detect and remedy memory pressure that results in stuttering and decreased responsiveness on mobile devices. Psi gives us the stall information, but because we're dealing with latencies in the millisecond range, periodically reading the pressure files to detect stalls in a timely fashion is not feasible. Psi also doesn't aggregate its averages at a high enough frequency right now. This patch series extends the psi interface such that users can configure sensitive latency thresholds and use poll() and friends to be notified when these are breached. As high-frequency aggregation is costly, it implements an aggregation method that is optimized for fast, short-interval averaging, and makes the aggregation frequency adaptive, such that high-frequency updates only happen while monitored stall events are actively occurring. With these patches applied, Android can monitor for, and ward off, mounting memory shortages before they cause problems for the user. For example, using memory stall monitors in userspace low memory killer daemon (lmkd) we can detect mounting pressure and kill less important processes before device becomes visibly sluggish. In our memory stress testing psi memory monitors produce roughly 10x less false positives compared to vmpressure signals. Having ability to specify multiple triggers for the same psi metric allows other parts of Android framework to monitor memory state of the device and act accordingly. The new interface is straightforward. The user opens one of the pressure files for writing and writes a trigger description into the file descriptor that defines the stall state - some or full, and the maximum stall time over a given window of time. E.g.: /* Signal when stall time exceeds 100ms of a 1s window */ char trigger[] = "full 100000 1000000"; fd = open("/proc/pressure/memory"); write(fd, trigger, sizeof(trigger)); while (poll() >= 0) { ... } close(fd); When the monitored stall state is entered, psi adapts its aggregation frequency according to what the configured time window requires in order to emit event signals in a timely fashion. Once the stalling subsides, aggregation reverts back to normal. The trigger is associated with the open file descriptor. To stop monitoring, the user only needs to close the file descriptor and the trigger is discarded. Patches 1-4 prepare the psi code for polling support. Patch 5 implements the adaptive polling logic, the pressure growth detection optimized for short intervals, and hooks up write() and poll() on the pressure files. The patches were developed in collaboration with Johannes Weiner. This patch (of 5): Kernfs has a standardized poll/notification mechanism for waking all pollers on all fds when a filesystem node changes. To allow polling for custom events, add a .poll callback that can override the default. This is in preparation for pollable cgroup pressure files which have per-fd trigger configurations. Link: http://lkml.kernel.org/r/20190124211518.244221-2-surenb@google.com Signed-off-by: Johannes Weiner Signed-off-by: Suren Baghdasaryan Cc: Dennis Zhou Cc: Ingo Molnar Cc: Jens Axboe Cc: Li Zefan Cc: Peter Zijlstra Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/kernfs/file.c | 31 ++++++++++++++++++++----------- include/linux/kernfs.h | 6 ++++++ 2 files changed, 26 insertions(+), 11 deletions(-) diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index f8d5021a652e..ae948aaa4c53 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -832,26 +832,35 @@ void kernfs_drain_open_files(struct kernfs_node *kn) * to see if it supports poll (Neither 'poll' nor 'select' return * an appropriate error code). When in doubt, set a suitable timeout value. */ +__poll_t kernfs_generic_poll(struct kernfs_open_file *of, poll_table *wait) +{ + struct kernfs_node *kn = kernfs_dentry_node(of->file->f_path.dentry); + struct kernfs_open_node *on = kn->attr.open; + + poll_wait(of->file, &on->poll, wait); + + if (of->event != atomic_read(&on->event)) + return DEFAULT_POLLMASK|EPOLLERR|EPOLLPRI; + + return DEFAULT_POLLMASK; +} + static __poll_t kernfs_fop_poll(struct file *filp, poll_table *wait) { struct kernfs_open_file *of = kernfs_of(filp); struct kernfs_node *kn = kernfs_dentry_node(filp->f_path.dentry); - struct kernfs_open_node *on = kn->attr.open; + __poll_t ret; if (!kernfs_get_active(kn)) - goto trigger; + return DEFAULT_POLLMASK|EPOLLERR|EPOLLPRI; - poll_wait(filp, &on->poll, wait); + if (kn->attr.ops->poll) + ret = kn->attr.ops->poll(of, wait); + else + ret = kernfs_generic_poll(of, wait); kernfs_put_active(kn); - - if (of->event != atomic_read(&on->event)) - goto trigger; - - return DEFAULT_POLLMASK; - - trigger: - return DEFAULT_POLLMASK|EPOLLERR|EPOLLPRI; + return ret; } static void kernfs_notify_workfn(struct work_struct *work) diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index 5b36b1287a5a..0cac1207bb00 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -25,6 +25,7 @@ struct seq_file; struct vm_area_struct; struct super_block; struct file_system_type; +struct poll_table_struct; struct kernfs_open_node; struct kernfs_iattrs; @@ -261,6 +262,9 @@ struct kernfs_ops { ssize_t (*write)(struct kernfs_open_file *of, char *buf, size_t bytes, loff_t off); + __poll_t (*poll)(struct kernfs_open_file *of, + struct poll_table_struct *pt); + int (*mmap)(struct kernfs_open_file *of, struct vm_area_struct *vma); #ifdef CONFIG_DEBUG_LOCK_ALLOC @@ -350,6 +354,8 @@ int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name, int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, const char *new_name, const void *new_ns); int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr); +__poll_t kernfs_generic_poll(struct kernfs_open_file *of, + struct poll_table_struct *pt); void kernfs_notify(struct kernfs_node *kn); const void *kernfs_super_ns(struct super_block *sb); From dc50537bdd1a0804fa2cbc990565ee9a944e66fa Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 5 Mar 2019 15:45:48 -0800 Subject: [PATCH 075/159] kernel: cgroup: add poll file operation Cgroup has a standardized poll/notification mechanism for waking all pollers on all fds when a filesystem node changes. To allow polling for custom events, add a .poll callback that can override the default. This is in preparation for pollable cgroup pressure files which have per-fd trigger configurations. Link: http://lkml.kernel.org/r/20190124211518.244221-3-surenb@google.com Signed-off-by: Johannes Weiner Signed-off-by: Suren Baghdasaryan Cc: Dennis Zhou Cc: Ingo Molnar Cc: Jens Axboe Cc: Li Zefan Cc: Peter Zijlstra Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cgroup-defs.h | 4 ++++ kernel/cgroup/cgroup.c | 12 ++++++++++++ 2 files changed, 16 insertions(+) diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 8fcbae1b8db0..aad3babef007 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -32,6 +32,7 @@ struct kernfs_node; struct kernfs_ops; struct kernfs_open_file; struct seq_file; +struct poll_table_struct; #define MAX_CGROUP_TYPE_NAMELEN 32 #define MAX_CGROUP_ROOT_NAMELEN 64 @@ -574,6 +575,9 @@ struct cftype { ssize_t (*write)(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off); + __poll_t (*poll)(struct kernfs_open_file *of, + struct poll_table_struct *pt); + #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lock_class_key lockdep_key; #endif diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index cef98502b124..17828333f7c3 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3534,6 +3534,16 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, return ret ?: nbytes; } +static __poll_t cgroup_file_poll(struct kernfs_open_file *of, poll_table *pt) +{ + struct cftype *cft = of->kn->priv; + + if (cft->poll) + return cft->poll(of, pt); + + return kernfs_generic_poll(of, pt); +} + static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos) { return seq_cft(seq)->seq_start(seq, ppos); @@ -3572,6 +3582,7 @@ static struct kernfs_ops cgroup_kf_single_ops = { .open = cgroup_file_open, .release = cgroup_file_release, .write = cgroup_file_write, + .poll = cgroup_file_poll, .seq_show = cgroup_seqfile_show, }; @@ -3580,6 +3591,7 @@ static struct kernfs_ops cgroup_kf_ops = { .open = cgroup_file_open, .release = cgroup_file_release, .write = cgroup_file_write, + .poll = cgroup_file_poll, .seq_start = cgroup_seqfile_start, .seq_next = cgroup_seqfile_next, .seq_stop = cgroup_seqfile_stop, From aa9694bb78bf6eb03810108d5f6064fafa4ae1e1 Mon Sep 17 00:00:00 2001 From: Chris Down Date: Tue, 5 Mar 2019 15:45:52 -0800 Subject: [PATCH 076/159] mm, memcg: create mem_cgroup_from_seq This is the start of a series of patches similar to my earlier DEFINE_MEMCG_MAX_OR_VAL work, but with less Macro Magic(tm). There are a bunch of places we go from seq_file to mem_cgroup, which currently requires manually getting the css, then getting the mem_cgroup from the css. It's in enough places now that having mem_cgroup_from_seq makes sense (and also makes the next patch a bit nicer). Link: http://lkml.kernel.org/r/20190124194050.GA31341@chrisdown.name Signed-off-by: Chris Down Acked-by: Johannes Weiner Acked-by: Michal Hocko Cc: Tejun Heo Cc: Roman Gushchin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 10 ++++++++++ mm/memcontrol.c | 24 ++++++++++++------------ mm/slab_common.c | 6 +++--- 3 files changed, 25 insertions(+), 15 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index b0eb29ea0d9c..1f3d880b7ca1 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -429,6 +429,11 @@ static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) } struct mem_cgroup *mem_cgroup_from_id(unsigned short id); +static inline struct mem_cgroup *mem_cgroup_from_seq(struct seq_file *m) +{ + return mem_cgroup_from_css(seq_css(m)); +} + static inline struct mem_cgroup *lruvec_memcg(struct lruvec *lruvec) { struct mem_cgroup_per_node *mz; @@ -937,6 +942,11 @@ static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id) return NULL; } +static inline struct mem_cgroup *mem_cgroup_from_seq(struct seq_file *m) +{ + return NULL; +} + static inline struct mem_cgroup *lruvec_memcg(struct lruvec *lruvec) { return NULL; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f93f7f22a6f4..027abf9935d0 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3337,7 +3337,7 @@ static int memcg_numa_stat_show(struct seq_file *m, void *v) const struct numa_stat *stat; int nid; unsigned long nr; - struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + struct mem_cgroup *memcg = mem_cgroup_from_seq(m); for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask); @@ -3388,7 +3388,7 @@ static const char *const memcg1_event_names[] = { static int memcg_stat_show(struct seq_file *m, void *v) { - struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + struct mem_cgroup *memcg = mem_cgroup_from_seq(m); unsigned long memory, memsw; struct mem_cgroup *mi; unsigned int i; @@ -3820,7 +3820,7 @@ static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg, static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v) { - struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf)); + struct mem_cgroup *memcg = mem_cgroup_from_seq(sf); seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable); seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom); @@ -5363,7 +5363,7 @@ static u64 memory_current_read(struct cgroup_subsys_state *css, static int memory_min_show(struct seq_file *m, void *v) { - struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + struct mem_cgroup *memcg = mem_cgroup_from_seq(m); unsigned long min = READ_ONCE(memcg->memory.min); if (min == PAGE_COUNTER_MAX) @@ -5393,7 +5393,7 @@ static ssize_t memory_min_write(struct kernfs_open_file *of, static int memory_low_show(struct seq_file *m, void *v) { - struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + struct mem_cgroup *memcg = mem_cgroup_from_seq(m); unsigned long low = READ_ONCE(memcg->memory.low); if (low == PAGE_COUNTER_MAX) @@ -5423,7 +5423,7 @@ static ssize_t memory_low_write(struct kernfs_open_file *of, static int memory_high_show(struct seq_file *m, void *v) { - struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + struct mem_cgroup *memcg = mem_cgroup_from_seq(m); unsigned long high = READ_ONCE(memcg->high); if (high == PAGE_COUNTER_MAX) @@ -5460,7 +5460,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of, static int memory_max_show(struct seq_file *m, void *v) { - struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + struct mem_cgroup *memcg = mem_cgroup_from_seq(m); unsigned long max = READ_ONCE(memcg->memory.max); if (max == PAGE_COUNTER_MAX) @@ -5522,7 +5522,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of, static int memory_events_show(struct seq_file *m, void *v) { - struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + struct mem_cgroup *memcg = mem_cgroup_from_seq(m); seq_printf(m, "low %lu\n", atomic_long_read(&memcg->memory_events[MEMCG_LOW])); @@ -5540,7 +5540,7 @@ static int memory_events_show(struct seq_file *m, void *v) static int memory_stat_show(struct seq_file *m, void *v) { - struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + struct mem_cgroup *memcg = mem_cgroup_from_seq(m); struct accumulated_stats acc; int i; @@ -5617,7 +5617,7 @@ static int memory_stat_show(struct seq_file *m, void *v) static int memory_oom_group_show(struct seq_file *m, void *v) { - struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + struct mem_cgroup *memcg = mem_cgroup_from_seq(m); seq_printf(m, "%d\n", memcg->oom_group); @@ -6600,7 +6600,7 @@ static u64 swap_current_read(struct cgroup_subsys_state *css, static int swap_max_show(struct seq_file *m, void *v) { - struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + struct mem_cgroup *memcg = mem_cgroup_from_seq(m); unsigned long max = READ_ONCE(memcg->swap.max); if (max == PAGE_COUNTER_MAX) @@ -6630,7 +6630,7 @@ static ssize_t swap_max_write(struct kernfs_open_file *of, static int swap_events_show(struct seq_file *m, void *v) { - struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + struct mem_cgroup *memcg = mem_cgroup_from_seq(m); seq_printf(m, "max %lu\n", atomic_long_read(&memcg->memory_events[MEMCG_SWAP_MAX])); diff --git a/mm/slab_common.c b/mm/slab_common.c index f9d89c1b5977..cd75b8985707 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -1425,7 +1425,7 @@ void dump_unreclaimable_slab(void) #if defined(CONFIG_MEMCG) void *memcg_slab_start(struct seq_file *m, loff_t *pos) { - struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + struct mem_cgroup *memcg = mem_cgroup_from_seq(m); mutex_lock(&slab_mutex); return seq_list_start(&memcg->kmem_caches, *pos); @@ -1433,7 +1433,7 @@ void *memcg_slab_start(struct seq_file *m, loff_t *pos) void *memcg_slab_next(struct seq_file *m, void *p, loff_t *pos) { - struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + struct mem_cgroup *memcg = mem_cgroup_from_seq(m); return seq_list_next(p, &memcg->kmem_caches, pos); } @@ -1447,7 +1447,7 @@ int memcg_slab_show(struct seq_file *m, void *p) { struct kmem_cache *s = list_entry(p, struct kmem_cache, memcg_params.kmem_caches_node); - struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + struct mem_cgroup *memcg = mem_cgroup_from_seq(m); if (p == memcg->kmem_caches.next) print_slabinfo_header(m); From 677dc9731b54dccaaadbdcea18f8eecc95cee832 Mon Sep 17 00:00:00 2001 From: Chris Down Date: Tue, 5 Mar 2019 15:45:55 -0800 Subject: [PATCH 077/159] mm, memcg: extract memcg maxable seq_file logic to seq_show_memcg_tunable memcg has a significant number of files exposed to kernfs where their value is either exposed directly or is "max" in the case of PAGE_COUNTER_MAX. This patch makes this generic by providing a single function to do this work. In combination with the previous patch adding mem_cgroup_from_seq, this makes all of the seq_show feeder functions significantly more simple. Link: http://lkml.kernel.org/r/20190124194100.GA31425@chrisdown.name Signed-off-by: Chris Down Acked-by: Johannes Weiner Acked-by: Michal Hocko Cc: Tejun Heo Cc: Roman Gushchin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 64 +++++++++++++++---------------------------------- 1 file changed, 19 insertions(+), 45 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 027abf9935d0..59425a35763c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5353,6 +5353,16 @@ static void mem_cgroup_bind(struct cgroup_subsys_state *root_css) root_mem_cgroup->use_hierarchy = false; } +static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value) +{ + if (value == PAGE_COUNTER_MAX) + seq_puts(m, "max\n"); + else + seq_printf(m, "%llu\n", (u64)value * PAGE_SIZE); + + return 0; +} + static u64 memory_current_read(struct cgroup_subsys_state *css, struct cftype *cft) { @@ -5363,15 +5373,8 @@ static u64 memory_current_read(struct cgroup_subsys_state *css, static int memory_min_show(struct seq_file *m, void *v) { - struct mem_cgroup *memcg = mem_cgroup_from_seq(m); - unsigned long min = READ_ONCE(memcg->memory.min); - - if (min == PAGE_COUNTER_MAX) - seq_puts(m, "max\n"); - else - seq_printf(m, "%llu\n", (u64)min * PAGE_SIZE); - - return 0; + return seq_puts_memcg_tunable(m, + READ_ONCE(mem_cgroup_from_seq(m)->memory.min)); } static ssize_t memory_min_write(struct kernfs_open_file *of, @@ -5393,15 +5396,8 @@ static ssize_t memory_min_write(struct kernfs_open_file *of, static int memory_low_show(struct seq_file *m, void *v) { - struct mem_cgroup *memcg = mem_cgroup_from_seq(m); - unsigned long low = READ_ONCE(memcg->memory.low); - - if (low == PAGE_COUNTER_MAX) - seq_puts(m, "max\n"); - else - seq_printf(m, "%llu\n", (u64)low * PAGE_SIZE); - - return 0; + return seq_puts_memcg_tunable(m, + READ_ONCE(mem_cgroup_from_seq(m)->memory.low)); } static ssize_t memory_low_write(struct kernfs_open_file *of, @@ -5423,15 +5419,7 @@ static ssize_t memory_low_write(struct kernfs_open_file *of, static int memory_high_show(struct seq_file *m, void *v) { - struct mem_cgroup *memcg = mem_cgroup_from_seq(m); - unsigned long high = READ_ONCE(memcg->high); - - if (high == PAGE_COUNTER_MAX) - seq_puts(m, "max\n"); - else - seq_printf(m, "%llu\n", (u64)high * PAGE_SIZE); - - return 0; + return seq_puts_memcg_tunable(m, READ_ONCE(mem_cgroup_from_seq(m)->high)); } static ssize_t memory_high_write(struct kernfs_open_file *of, @@ -5460,15 +5448,8 @@ static ssize_t memory_high_write(struct kernfs_open_file *of, static int memory_max_show(struct seq_file *m, void *v) { - struct mem_cgroup *memcg = mem_cgroup_from_seq(m); - unsigned long max = READ_ONCE(memcg->memory.max); - - if (max == PAGE_COUNTER_MAX) - seq_puts(m, "max\n"); - else - seq_printf(m, "%llu\n", (u64)max * PAGE_SIZE); - - return 0; + return seq_puts_memcg_tunable(m, + READ_ONCE(mem_cgroup_from_seq(m)->memory.max)); } static ssize_t memory_max_write(struct kernfs_open_file *of, @@ -6600,15 +6581,8 @@ static u64 swap_current_read(struct cgroup_subsys_state *css, static int swap_max_show(struct seq_file *m, void *v) { - struct mem_cgroup *memcg = mem_cgroup_from_seq(m); - unsigned long max = READ_ONCE(memcg->swap.max); - - if (max == PAGE_COUNTER_MAX) - seq_puts(m, "max\n"); - else - seq_printf(m, "%llu\n", (u64)max * PAGE_SIZE); - - return 0; + return seq_puts_memcg_tunable(m, + READ_ONCE(mem_cgroup_from_seq(m)->swap.max)); } static ssize_t swap_max_write(struct kernfs_open_file *of, From afd07389d3f4933c7f7817a92fb5e053d59a3182 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Tue, 5 Mar 2019 15:45:59 -0800 Subject: [PATCH 078/159] mm/vmalloc.c: fix kernel BUG at mm/vmalloc.c:512! One of the vmalloc stress test case triggers the kernel BUG(): [60.562151] ------------[ cut here ]------------ [60.562154] kernel BUG at mm/vmalloc.c:512! [60.562206] invalid opcode: 0000 [#1] PREEMPT SMP PTI [60.562247] CPU: 0 PID: 430 Comm: vmalloc_test/0 Not tainted 4.20.0+ #161 [60.562293] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014 [60.562351] RIP: 0010:alloc_vmap_area+0x36f/0x390 it can happen due to big align request resulting in overflowing of calculated address, i.e. it becomes 0 after ALIGN()'s fixup. Fix it by checking if calculated address is within vstart/vend range. Link: http://lkml.kernel.org/r/20190124115648.9433-2-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Reviewed-by: Andrew Morton Cc: Ingo Molnar Cc: Joel Fernandes Cc: Matthew Wilcox Cc: Michal Hocko Cc: Oleksiy Avramchenko Cc: Steven Rostedt Cc: Tejun Heo Cc: Thomas Garnier Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index e83961767dc1..77006fa1a90b 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -498,7 +498,11 @@ nocache: } found: - if (addr + size > vend) + /* + * Check also calculated address against the vstart, + * because it can be 0 because of big align request. + */ + if (addr + size > vend || addr < vstart) goto overflow; va->va_start = addr; From 2d432cb7091e99881af803cdd67a31969b863005 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 5 Mar 2019 15:46:02 -0800 Subject: [PATCH 079/159] mm: prevent mapping slab pages to userspace It's never appropriate to map a page allocated by SLAB into userspace. A buggy device driver might try this, or an attacker might be able to find a way to make it happen. Christoph said: : Let's just fail the code. Currently this may work with SLUB. But SLAB : and SLOB overlay fields with mapcount. So you would have a corrupted page : struct if you mapped a slab page to user space. Link: http://lkml.kernel.org/r/20190125173827.2658-1-willy@infradead.org Signed-off-by: Matthew Wilcox Reviewed-by: Kees Cook Acked-by: Pekka Enberg Cc: Rik van Riel Cc: Christoph Lameter Cc: David Rientjes Cc: Joonsoo Kim Cc: Michael Ellerman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memory.c b/mm/memory.c index 222da66f16b4..a9897dcd530f 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1452,7 +1452,7 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr, spinlock_t *ptl; retval = -EINVAL; - if (PageAnon(page)) + if (PageAnon(page) || PageSlab(page)) goto out; retval = -ENOMEM; flush_dcache_page(page); From 0ee930e6cafa048c1925893d0ca89918b2814f2c Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 5 Mar 2019 15:46:06 -0800 Subject: [PATCH 080/159] mm/memory.c: prevent mapping typed pages to userspace Pages which use page_type must never be mapped to userspace as it would destroy their page type. Add an explicit check for this instead of assuming that kernel drivers always get this right. Link: http://lkml.kernel.org/r/20190129053830.3749-1-willy@infradead.org Signed-off-by: Matthew Wilcox Reviewed-by: Kees Cook Reviewed-by: David Hildenbrand Cc: Michael Ellerman Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memory.c b/mm/memory.c index a9897dcd530f..79e0173a7d70 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1452,7 +1452,7 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr, spinlock_t *ptl; retval = -EINVAL; - if (PageAnon(page) || PageSlab(page)) + if (PageAnon(page) || PageSlab(page) || page_has_type(page)) goto out; retval = -ENOMEM; flush_dcache_page(page); From d9f7979c92f7b34469c1ca5d1f3add6681fd567c Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 5 Mar 2019 15:46:09 -0800 Subject: [PATCH 081/159] mm: no need to check return value of debugfs_create functions When calling debugfs functions, there is no need to ever check the return value. The function can work or not, but the code logic should never do something different based on this. Link: http://lkml.kernel.org/r/20190122152151.16139-14-gregkh@linuxfoundation.org Signed-off-by: Greg Kroah-Hartman Cc: Michal Hocko Cc: Vlastimil Babka Cc: David Rientjes Cc: Laura Abbott Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/cma_debug.c | 2 -- mm/failslab.c | 14 ++++---------- mm/gup_benchmark.c | 8 ++------ mm/huge_memory.c | 8 ++------ mm/memblock.c | 3 +-- mm/memory.c | 8 ++------ mm/page_alloc.c | 20 +++++--------------- mm/page_owner.c | 8 +++----- mm/vmstat.c | 15 ++++----------- 9 files changed, 23 insertions(+), 63 deletions(-) diff --git a/mm/cma_debug.c b/mm/cma_debug.c index ad6723e9d110..b55f28fbe831 100644 --- a/mm/cma_debug.c +++ b/mm/cma_debug.c @@ -191,8 +191,6 @@ static int __init cma_debugfs_init(void) int i; cma_debugfs_root = debugfs_create_dir("cma", NULL); - if (!cma_debugfs_root) - return -ENOMEM; for (i = 0; i < cma_area_count; i++) cma_debugfs_add_one(&cma_areas[i], i); diff --git a/mm/failslab.c b/mm/failslab.c index b135ebb88b6f..ec5aad211c5b 100644 --- a/mm/failslab.c +++ b/mm/failslab.c @@ -48,18 +48,12 @@ static int __init failslab_debugfs_init(void) if (IS_ERR(dir)) return PTR_ERR(dir); - if (!debugfs_create_bool("ignore-gfp-wait", mode, dir, - &failslab.ignore_gfp_reclaim)) - goto fail; - if (!debugfs_create_bool("cache-filter", mode, dir, - &failslab.cache_filter)) - goto fail; + debugfs_create_bool("ignore-gfp-wait", mode, dir, + &failslab.ignore_gfp_reclaim); + debugfs_create_bool("cache-filter", mode, dir, + &failslab.cache_filter); return 0; -fail: - debugfs_remove_recursive(dir); - - return -ENOMEM; } late_initcall(failslab_debugfs_init); diff --git a/mm/gup_benchmark.c b/mm/gup_benchmark.c index 5b42d3d4b60a..6c0279e70cc4 100644 --- a/mm/gup_benchmark.c +++ b/mm/gup_benchmark.c @@ -122,12 +122,8 @@ static const struct file_operations gup_benchmark_fops = { static int gup_benchmark_init(void) { - void *ret; - - ret = debugfs_create_file_unsafe("gup_benchmark", 0600, NULL, NULL, - &gup_benchmark_fops); - if (!ret) - pr_warn("Failed to create gup_benchmark in debugfs"); + debugfs_create_file_unsafe("gup_benchmark", 0600, NULL, NULL, + &gup_benchmark_fops); return 0; } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index d066f7ca1ee8..af07527cd971 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2887,12 +2887,8 @@ DEFINE_SIMPLE_ATTRIBUTE(split_huge_pages_fops, NULL, split_huge_pages_set, static int __init split_huge_pages_debugfs(void) { - void *ret; - - ret = debugfs_create_file("split_huge_pages", 0200, NULL, NULL, - &split_huge_pages_fops); - if (!ret) - pr_warn("Failed to create split_huge_pages in debugfs"); + debugfs_create_file("split_huge_pages", 0200, NULL, NULL, + &split_huge_pages_fops); return 0; } late_initcall(split_huge_pages_debugfs); diff --git a/mm/memblock.c b/mm/memblock.c index ea31045ba704..470601115892 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -2005,8 +2005,7 @@ DEFINE_SHOW_ATTRIBUTE(memblock_debug); static int __init memblock_init_debugfs(void) { struct dentry *root = debugfs_create_dir("memblock", NULL); - if (!root) - return -ENXIO; + debugfs_create_file("memory", 0444, root, &memblock.memory, &memblock_debug_fops); debugfs_create_file("reserved", 0444, root, diff --git a/mm/memory.c b/mm/memory.c index 79e0173a7d70..6aff43171a7b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3334,12 +3334,8 @@ DEFINE_DEBUGFS_ATTRIBUTE(fault_around_bytes_fops, static int __init fault_around_debugfs(void) { - void *ret; - - ret = debugfs_create_file_unsafe("fault_around_bytes", 0644, NULL, NULL, - &fault_around_bytes_fops); - if (!ret) - pr_warn("Failed to create fault_around_bytes in debugfs"); + debugfs_create_file_unsafe("fault_around_bytes", 0644, NULL, NULL, + &fault_around_bytes_fops); return 0; } late_initcall(fault_around_debugfs); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 09bf2c5f8b4b..9be9a22ebe35 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3230,24 +3230,14 @@ static int __init fail_page_alloc_debugfs(void) dir = fault_create_debugfs_attr("fail_page_alloc", NULL, &fail_page_alloc.attr); - if (IS_ERR(dir)) - return PTR_ERR(dir); - if (!debugfs_create_bool("ignore-gfp-wait", mode, dir, - &fail_page_alloc.ignore_gfp_reclaim)) - goto fail; - if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir, - &fail_page_alloc.ignore_gfp_highmem)) - goto fail; - if (!debugfs_create_u32("min-order", mode, dir, - &fail_page_alloc.min_order)) - goto fail; + debugfs_create_bool("ignore-gfp-wait", mode, dir, + &fail_page_alloc.ignore_gfp_reclaim); + debugfs_create_bool("ignore-gfp-highmem", mode, dir, + &fail_page_alloc.ignore_gfp_highmem); + debugfs_create_u32("min-order", mode, dir, &fail_page_alloc.min_order); return 0; -fail: - debugfs_remove_recursive(dir); - - return -ENOMEM; } late_initcall(fail_page_alloc_debugfs); diff --git a/mm/page_owner.c b/mm/page_owner.c index 28b06524939f..925b6f44a444 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -625,16 +625,14 @@ static const struct file_operations proc_page_owner_operations = { static int __init pageowner_init(void) { - struct dentry *dentry; - if (!static_branch_unlikely(&page_owner_inited)) { pr_info("page_owner is disabled\n"); return 0; } - dentry = debugfs_create_file("page_owner", 0400, NULL, - NULL, &proc_page_owner_operations); + debugfs_create_file("page_owner", 0400, NULL, NULL, + &proc_page_owner_operations); - return PTR_ERR_OR_ZERO(dentry); + return 0; } late_initcall(pageowner_init) diff --git a/mm/vmstat.c b/mm/vmstat.c index 83b30edc2f7f..36b56f858f0f 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -2121,21 +2121,14 @@ static int __init extfrag_debug_init(void) struct dentry *extfrag_debug_root; extfrag_debug_root = debugfs_create_dir("extfrag", NULL); - if (!extfrag_debug_root) - return -ENOMEM; - if (!debugfs_create_file("unusable_index", 0444, - extfrag_debug_root, NULL, &unusable_file_ops)) - goto fail; + debugfs_create_file("unusable_index", 0444, extfrag_debug_root, NULL, + &unusable_file_ops); - if (!debugfs_create_file("extfrag_index", 0444, - extfrag_debug_root, NULL, &extfrag_file_ops)) - goto fail; + debugfs_create_file("extfrag_index", 0444, extfrag_debug_root, NULL, + &extfrag_file_ops); return 0; -fail: - debugfs_remove_recursive(extfrag_debug_root); - return -ENOMEM; } module_init(extfrag_debug_init); From bbbe48029720d2c6b6733f78d02571a281511adb Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Tue, 5 Mar 2019 15:46:12 -0800 Subject: [PATCH 082/159] mm, oom: remove 'prefer children over parent' heuristic Since the start of the git history of Linux, the kernel after selecting the worst process to be oom-killed, prefer to kill its child (if the child does not share mm with the parent). Later it was changed to prefer to kill a child who is worst. If the parent is still the worst then the parent will be killed. This heuristic assumes that the children did less work than their parent and by killing one of them, the work lost will be less. However this is very workload dependent. If there is a workload which can benefit from this heuristic, can use oom_score_adj to prefer children to be killed before the parent. The select_bad_process() has already selected the worst process in the system/memcg. There is no need to recheck the badness of its children and hoping to find a worse candidate. That's a lot of unneeded racy work. Also the heuristic is dangerous because it make fork bomb like workloads to recover much later because we constantly pick and kill processes which are not memory hogs. So, let's remove this whole heuristic. [akpm@linux-foundation.org: coding-style fixes] Link: http://lkml.kernel.org/r/20190121215850.221745-2-shakeelb@google.com Signed-off-by: Shakeel Butt Acked-by: Michal Hocko Acked-by: Roman Gushchin Acked-by: Johannes Weiner Cc: David Rientjes Cc: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 78 ++++++++++++--------------------------------------- 1 file changed, 18 insertions(+), 60 deletions(-) diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 26ea8636758f..2d5135c1b1ba 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -843,7 +843,7 @@ static bool task_will_free_mem(struct task_struct *task) return ret; } -static void __oom_kill_process(struct task_struct *victim) +static void __oom_kill_process(struct task_struct *victim, const char *message) { struct task_struct *p; struct mm_struct *mm; @@ -874,8 +874,9 @@ static void __oom_kill_process(struct task_struct *victim) */ do_send_sig_info(SIGKILL, SEND_SIG_PRIV, victim, PIDTYPE_TGID); mark_oom_victim(victim); - pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n", - task_pid_nr(victim), victim->comm, K(victim->mm->total_vm), + pr_err("%s: Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n", + message, task_pid_nr(victim), victim->comm, + K(victim->mm->total_vm), K(get_mm_counter(victim->mm, MM_ANONPAGES)), K(get_mm_counter(victim->mm, MM_FILEPAGES)), K(get_mm_counter(victim->mm, MM_SHMEMPAGES))); @@ -926,24 +927,19 @@ static void __oom_kill_process(struct task_struct *victim) * Kill provided task unless it's secured by setting * oom_score_adj to OOM_SCORE_ADJ_MIN. */ -static int oom_kill_memcg_member(struct task_struct *task, void *unused) +static int oom_kill_memcg_member(struct task_struct *task, void *message) { if (task->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) { get_task_struct(task); - __oom_kill_process(task); + __oom_kill_process(task, message); } return 0; } static void oom_kill_process(struct oom_control *oc, const char *message) { - struct task_struct *p = oc->chosen; - unsigned int points = oc->chosen_points; - struct task_struct *victim = p; - struct task_struct *child; - struct task_struct *t; + struct task_struct *victim = oc->chosen; struct mem_cgroup *oom_group; - unsigned int victim_points = 0; static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); @@ -952,57 +948,18 @@ static void oom_kill_process(struct oom_control *oc, const char *message) * its children or threads, just give it access to memory reserves * so it can die quickly */ - task_lock(p); - if (task_will_free_mem(p)) { - mark_oom_victim(p); - wake_oom_reaper(p); - task_unlock(p); - put_task_struct(p); + task_lock(victim); + if (task_will_free_mem(victim)) { + mark_oom_victim(victim); + wake_oom_reaper(victim); + task_unlock(victim); + put_task_struct(victim); return; } - task_unlock(p); + task_unlock(victim); if (__ratelimit(&oom_rs)) - dump_header(oc, p); - - pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n", - message, task_pid_nr(p), p->comm, points); - - /* - * If any of p's children has a different mm and is eligible for kill, - * the one with the highest oom_badness() score is sacrificed for its - * parent. This attempts to lose the minimal amount of work done while - * still freeing memory. - */ - read_lock(&tasklist_lock); - - /* - * The task 'p' might have already exited before reaching here. The - * put_task_struct() will free task_struct 'p' while the loop still try - * to access the field of 'p', so, get an extra reference. - */ - get_task_struct(p); - for_each_thread(p, t) { - list_for_each_entry(child, &t->children, sibling) { - unsigned int child_points; - - if (process_shares_mm(child, p->mm)) - continue; - /* - * oom_badness() returns 0 if the thread is unkillable - */ - child_points = oom_badness(child, - oc->memcg, oc->nodemask, oc->totalpages); - if (child_points > victim_points) { - put_task_struct(victim); - victim = child; - victim_points = child_points; - get_task_struct(victim); - } - } - } - put_task_struct(p); - read_unlock(&tasklist_lock); + dump_header(oc, victim); /* * Do we need to kill the entire memory cgroup? @@ -1011,14 +968,15 @@ static void oom_kill_process(struct oom_control *oc, const char *message) */ oom_group = mem_cgroup_get_oom_group(victim, oc->memcg); - __oom_kill_process(victim); + __oom_kill_process(victim, message); /* * If necessary, kill all tasks in the selected memory cgroup. */ if (oom_group) { mem_cgroup_print_oom_group(oom_group); - mem_cgroup_scan_tasks(oom_group, oom_kill_memcg_member, NULL); + mem_cgroup_scan_tasks(oom_group, oom_kill_memcg_member, + (void*)message); mem_cgroup_put(oom_group); } } From 43cca0b1c51f7432d82e4fdb545c2a03606da149 Mon Sep 17 00:00:00 2001 From: Yang Fan Date: Tue, 5 Mar 2019 15:46:16 -0800 Subject: [PATCH 083/159] mm/mmap.c: remove some redundancy in arch_get_unmapped_area_topdown() The variable 'addr' is redundant in arch_get_unmapped_area_topdown(), just use parameter 'addr0' directly. Then remove the const qualifier of the parameter, and change its name to 'addr'. And in according with other functions, remove the const qualifier of all other no-pointer parameters in function arch_get_unmapped_area_topdown(). Link: http://lkml.kernel.org/r/20190127041112.25599-1-nullptr.cpp@gmail.com Signed-off-by: Yang Fan Reviewed-by: Mike Rapoport Cc: William Kucharski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index fc1809b1bed6..eccba2650ef6 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2126,13 +2126,12 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, */ #ifndef HAVE_ARCH_UNMAPPED_AREA_TOPDOWN unsigned long -arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, - const unsigned long len, const unsigned long pgoff, - const unsigned long flags) +arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags) { struct vm_area_struct *vma, *prev; struct mm_struct *mm = current->mm; - unsigned long addr = addr0; struct vm_unmapped_area_info info; const unsigned long mmap_end = arch_get_mmap_end(addr); From 8aa49762dba3e8ce9a52a9b6da221e61a0c6de08 Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Tue, 5 Mar 2019 15:46:19 -0800 Subject: [PATCH 084/159] mm/page_owner: move config option to mm/Kconfig.debug Move the PAGE_OWNER option from submenu "Compile-time checks and compiler options" to dedicated submenu "Memory Debugging". Link: http://lkml.kernel.org/r/20190120024254.6270-1-changbin.du@gmail.com Signed-off-by: Changbin Du Acked-by: Vlastimil Babka Cc: Masahiro Yamada Cc: Ingo Molnar Cc: Arnd Bergmann Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/Kconfig.debug | 17 ----------------- mm/Kconfig.debug | 17 +++++++++++++++++ 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 48f584393e28..e6a7b01932e6 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -265,23 +265,6 @@ config UNUSED_SYMBOLS you really need it, and what the merge plan to the mainline kernel for your module is. -config PAGE_OWNER - bool "Track page owner" - depends on DEBUG_KERNEL && STACKTRACE_SUPPORT - select DEBUG_FS - select STACKTRACE - select STACKDEPOT - select PAGE_EXTENSION - help - This keeps track of what call chain is the owner of a page, may - help to find bare alloc_page(s) leaks. Even if you include this - feature on your build, it is disabled in default. You should pass - "page_owner=on" to boot parameter in order to enable it. Eats - a fair amount of memory if enabled. See tools/vm/page_owner_sort.c - for user-space helper. - - If unsure, say N. - config DEBUG_FS bool "Debug Filesystem" help diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index 9a7b8b049d04..e3df921208c0 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -39,6 +39,23 @@ config DEBUG_PAGEALLOC_ENABLE_DEFAULT Enable debug page memory allocations by default? This value can be overridden by debug_pagealloc=off|on. +config PAGE_OWNER + bool "Track page owner" + depends on DEBUG_KERNEL && STACKTRACE_SUPPORT + select DEBUG_FS + select STACKTRACE + select STACKDEPOT + select PAGE_EXTENSION + help + This keeps track of what call chain is the owner of a page, may + help to find bare alloc_page(s) leaks. Even if you include this + feature on your build, it is disabled in default. You should pass + "page_owner=on" to boot parameter in order to enable it. Eats + a fair amount of memory if enabled. See tools/vm/page_owner_sort.c + for user-space helper. + + If unsure, say N. + config PAGE_POISONING bool "Poison pages after freeing" select PAGE_POISONING_NO_SANITY if HIBERNATION From 8bb4e7a2ee26c05a94ae6cb0aec2f82a3523cf35 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Tue, 5 Mar 2019 15:46:22 -0800 Subject: [PATCH 085/159] mm: fix some typos in mm directory No functional change. Link: http://lkml.kernel.org/r/20190118235123.27843-1-richard.weiyang@gmail.com Signed-off-by: Wei Yang Reviewed-by: Pekka Enberg Acked-by: Mike Rapoport Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 2 +- mm/migrate.c | 2 +- mm/mmap.c | 8 ++++---- mm/page_alloc.c | 4 ++-- mm/slub.c | 2 +- mm/vmscan.c | 2 +- 6 files changed, 10 insertions(+), 10 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 90c13cdeefb5..6d3290cd1f6f 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1301,7 +1301,7 @@ void memory_present(int nid, unsigned long start, unsigned long end); /* * If it is possible to have holes within a MAX_ORDER_NR_PAGES, then we - * need to check pfn validility within that MAX_ORDER_NR_PAGES block. + * need to check pfn validity within that MAX_ORDER_NR_PAGES block. * pfn_valid_within() should be used in this case; we optimise this away * when we have no holes within a MAX_ORDER_NR_PAGES block. */ diff --git a/mm/migrate.c b/mm/migrate.c index 0e9888cb33ad..5308d6abd384 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -100,7 +100,7 @@ int isolate_movable_page(struct page *page, isolate_mode_t mode) /* * Check PageMovable before holding a PG_lock because page's owner * assumes anybody doesn't touch PG_lock of newly allocated page - * so unconditionally grapping the lock ruins page's owner side. + * so unconditionally grabbing the lock ruins page's owner side. */ if (unlikely(!__PageMovable(page))) goto out_putpage; diff --git a/mm/mmap.c b/mm/mmap.c index eccba2650ef6..41eb48d9b527 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -438,7 +438,7 @@ static void vma_gap_update(struct vm_area_struct *vma) { /* * As it turns out, RB_DECLARE_CALLBACKS() already created a callback - * function that does exacltly what we want. + * function that does exactly what we want. */ vma_gap_callbacks_propagate(&vma->vm_rb, NULL); } @@ -1012,7 +1012,7 @@ static inline int is_mergeable_vma(struct vm_area_struct *vma, * VM_SOFTDIRTY should not prevent from VMA merging, if we * match the flags but dirty bit -- the caller should mark * merged VMA as dirty. If dirty bit won't be excluded from - * comparison, we increase pressue on the memory system forcing + * comparison, we increase pressure on the memory system forcing * the kernel to generate new VMAs when old one could be * extended instead. */ @@ -1115,7 +1115,7 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, * PPPP NNNN PPPPPPPPPPPP PPPPPPPPNNNN PPPPNNNNNNNN * might become case 1 below case 2 below case 3 below * - * It is important for case 8 that the the vma NNNN overlapping the + * It is important for case 8 that the vma NNNN overlapping the * region AAAA is never going to extended over XXXX. Instead XXXX must * be extended in region AAAA and NNNN must be removed. This way in * all cases where vma_merge succeeds, the moment vma_adjust drops the @@ -1645,7 +1645,7 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg) #endif /* __ARCH_WANT_SYS_OLD_MMAP */ /* - * Some shared mappigns will want the pages marked read-only + * Some shared mappings will want the pages marked read-only * to track write events. If so, we'll downgrade vm_page_prot * to the private version (using protection_map[] without the * VM_SHARED bit). diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 9be9a22ebe35..ec250453f5e8 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7551,7 +7551,7 @@ static void __setup_per_zone_wmarks(void) * value here. * * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN) - * deltas control asynch page reclaim, and so should + * deltas control async page reclaim, and so should * not be capped for highmem. */ unsigned long min_pages; @@ -8028,7 +8028,7 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count, /* * Hugepages are not in LRU lists, but they're movable. - * We need not scan over tail pages bacause we don't + * We need not scan over tail pages because we don't * handle each tail page individually in migration. */ if (PageHuge(page)) { diff --git a/mm/slub.c b/mm/slub.c index d8b1eee2dd86..017a2ce5ba23 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2129,7 +2129,7 @@ redo: if (!lock) { lock = 1; /* - * Taking the spinlock removes the possiblity + * Taking the spinlock removes the possibility * that acquire_slab() will see a slab page that * is frozen */ diff --git a/mm/vmscan.c b/mm/vmscan.c index e979705bbf32..63195364ab2e 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3527,7 +3527,7 @@ static bool kswapd_shrink_node(pg_data_t *pgdat, * * kswapd scans the zones in the highmem->normal->dma direction. It skips * zones which have free_pages > high_wmark_pages(zone), but once a zone is - * found to have free_pages <= high_wmark_pages(zone), any page is that zone + * found to have free_pages <= high_wmark_pages(zone), any page in that zone * or lower is eligible for reclaim until at least one usable zone is * balanced. */ From 0cbe3e26abe0cfe7effb67f620a77d46cce628b2 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 5 Mar 2019 15:46:26 -0800 Subject: [PATCH 086/159] mm: update ptep_modify_prot_start/commit to take vm_area_struct as arg Patch series "NestMMU pte upgrade workaround for mprotect", v5. We can upgrade pte access (R -> RW transition) via mprotect. We need to make sure we follow the recommended pte update sequence as outlined in commit bd5050e38aec ("powerpc/mm/radix: Change pte relax sequence to handle nest MMU hang") for such updates. This patch series does that. This patch (of 5): Some architectures may want to call flush_tlb_range from these helpers. Link: http://lkml.kernel.org/r/20190116085035.29729-2-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Cc: Nicholas Piggin Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: Heiko Carstens Cc: Martin Schwidefsky Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/s390/include/asm/pgtable.h | 4 ++-- arch/s390/mm/pgtable.c | 6 ++++-- arch/x86/include/asm/paravirt.h | 11 ++++++----- arch/x86/include/asm/paravirt_types.h | 5 +++-- arch/x86/xen/mmu.h | 4 ++-- arch/x86/xen/mmu_pv.c | 8 ++++---- fs/proc/task_mmu.c | 4 ++-- include/asm-generic/pgtable.h | 16 ++++++++-------- mm/memory.c | 4 ++-- mm/mprotect.c | 4 ++-- 10 files changed, 35 insertions(+), 31 deletions(-) diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 063732414dfb..5d730199e37b 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -1069,8 +1069,8 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, } #define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION -pte_t ptep_modify_prot_start(struct mm_struct *, unsigned long, pte_t *); -void ptep_modify_prot_commit(struct mm_struct *, unsigned long, pte_t *, pte_t); +pte_t ptep_modify_prot_start(struct vm_area_struct *, unsigned long, pte_t *); +void ptep_modify_prot_commit(struct vm_area_struct *, unsigned long, pte_t *, pte_t); #define __HAVE_ARCH_PTEP_CLEAR_FLUSH static inline pte_t ptep_clear_flush(struct vm_area_struct *vma, diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index 689b66f29fc6..71aa01170768 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -301,12 +301,13 @@ pte_t ptep_xchg_lazy(struct mm_struct *mm, unsigned long addr, } EXPORT_SYMBOL(ptep_xchg_lazy); -pte_t ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, +pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { pgste_t pgste; pte_t old; int nodat; + struct mm_struct *mm = vma->vm_mm; preempt_disable(); pgste = ptep_xchg_start(mm, addr, ptep); @@ -319,10 +320,11 @@ pte_t ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, return old; } -void ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, +void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t pte) { pgste_t pgste; + struct mm_struct *mm = vma->vm_mm; if (!MACHINE_HAS_NX) pte_val(pte) &= ~_PAGE_NOEXEC; diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index a97f28d914d5..c5a7f18cce7e 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -422,25 +422,26 @@ static inline pgdval_t pgd_val(pgd_t pgd) } #define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION -static inline pte_t ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, +static inline pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { pteval_t ret; - ret = PVOP_CALL3(pteval_t, mmu.ptep_modify_prot_start, mm, addr, ptep); + ret = PVOP_CALL3(pteval_t, mmu.ptep_modify_prot_start, vma, addr, ptep); return (pte_t) { .pte = ret }; } -static inline void ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, +static inline void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t pte) { + if (sizeof(pteval_t) > sizeof(long)) /* 5 arg words */ - pv_ops.mmu.ptep_modify_prot_commit(mm, addr, ptep, pte); + pv_ops.mmu.ptep_modify_prot_commit(vma, addr, ptep, pte); else PVOP_VCALL4(mmu.ptep_modify_prot_commit, - mm, addr, ptep, pte.pte); + vma, addr, ptep, pte.pte); } static inline void set_pte(pte_t *ptep, pte_t pte) diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 488c59686a73..2474e434a6f7 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -55,6 +55,7 @@ struct task_struct; struct cpumask; struct flush_tlb_info; struct mmu_gather; +struct vm_area_struct; /* * Wrapper type for pointers to code which uses the non-standard @@ -254,9 +255,9 @@ struct pv_mmu_ops { pte_t *ptep, pte_t pteval); void (*set_pmd)(pmd_t *pmdp, pmd_t pmdval); - pte_t (*ptep_modify_prot_start)(struct mm_struct *mm, unsigned long addr, + pte_t (*ptep_modify_prot_start)(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep); - void (*ptep_modify_prot_commit)(struct mm_struct *mm, unsigned long addr, + void (*ptep_modify_prot_commit)(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t pte); struct paravirt_callee_save pte_val; diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h index a7e47cf7ec6c..6e4c6bd62203 100644 --- a/arch/x86/xen/mmu.h +++ b/arch/x86/xen/mmu.h @@ -17,8 +17,8 @@ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn); void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); -pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep); -void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, +pte_t xen_ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep); +void xen_ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t pte); unsigned long xen_read_cr2_direct(void); diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index 0f4fe206dcc2..856a85814f00 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -306,20 +306,20 @@ static void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, __xen_set_pte(ptep, pteval); } -pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, +pte_t xen_ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { /* Just return the pte as-is. We preserve the bits on commit */ - trace_xen_mmu_ptep_modify_prot_start(mm, addr, ptep, *ptep); + trace_xen_mmu_ptep_modify_prot_start(vma->vm_mm, addr, ptep, *ptep); return *ptep; } -void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, +void xen_ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t pte) { struct mmu_update u; - trace_xen_mmu_ptep_modify_prot_commit(mm, addr, ptep, pte); + trace_xen_mmu_ptep_modify_prot_commit(vma->vm_mm, addr, ptep, pte); xen_mc_batch(); u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD; diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 85b0ef890b28..9c2ef731dd5f 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -948,10 +948,10 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma, pte_t ptent = *pte; if (pte_present(ptent)) { - ptent = ptep_modify_prot_start(vma->vm_mm, addr, pte); + ptent = ptep_modify_prot_start(vma, addr, pte); ptent = pte_wrprotect(ptent); ptent = pte_clear_soft_dirty(ptent); - ptep_modify_prot_commit(vma->vm_mm, addr, pte, ptent); + ptep_modify_prot_commit(vma, addr, pte, ptent); } else if (is_swap_pte(ptent)) { ptent = pte_swp_clear_soft_dirty(ptent); set_pte_at(vma->vm_mm, addr, pte, ptent); diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 05e61e6c843f..8b0e933efe26 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -606,7 +606,7 @@ static inline int pmd_none_or_clear_bad(pmd_t *pmd) return 0; } -static inline pte_t __ptep_modify_prot_start(struct mm_struct *mm, +static inline pte_t __ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { @@ -615,10 +615,10 @@ static inline pte_t __ptep_modify_prot_start(struct mm_struct *mm, * non-present, preventing the hardware from asynchronously * updating it. */ - return ptep_get_and_clear(mm, addr, ptep); + return ptep_get_and_clear(vma->vm_mm, addr, ptep); } -static inline void __ptep_modify_prot_commit(struct mm_struct *mm, +static inline void __ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t pte) { @@ -626,7 +626,7 @@ static inline void __ptep_modify_prot_commit(struct mm_struct *mm, * The pte is non-present, so there's no hardware state to * preserve. */ - set_pte_at(mm, addr, ptep, pte); + set_pte_at(vma->vm_mm, addr, ptep, pte); } #ifndef __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION @@ -644,22 +644,22 @@ static inline void __ptep_modify_prot_commit(struct mm_struct *mm, * queue the update to be done at some later time. The update must be * actually committed before the pte lock is released, however. */ -static inline pte_t ptep_modify_prot_start(struct mm_struct *mm, +static inline pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { - return __ptep_modify_prot_start(mm, addr, ptep); + return __ptep_modify_prot_start(vma, addr, ptep); } /* * Commit an update to a pte, leaving any hardware-controlled bits in * the PTE unmodified. */ -static inline void ptep_modify_prot_commit(struct mm_struct *mm, +static inline void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t pte) { - __ptep_modify_prot_commit(mm, addr, ptep, pte); + __ptep_modify_prot_commit(vma, addr, ptep, pte); } #endif /* __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION */ #endif /* CONFIG_MMU */ diff --git a/mm/memory.c b/mm/memory.c index 6aff43171a7b..5ade52502ea0 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3619,12 +3619,12 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) * Make it present again, Depending on how arch implementes non * accessible ptes, some can allow access by kernel mode. */ - pte = ptep_modify_prot_start(vma->vm_mm, vmf->address, vmf->pte); + pte = ptep_modify_prot_start(vma, vmf->address, vmf->pte); pte = pte_modify(pte, vma->vm_page_prot); pte = pte_mkyoung(pte); if (was_writable) pte = pte_mkwrite(pte); - ptep_modify_prot_commit(vma->vm_mm, vmf->address, vmf->pte, pte); + ptep_modify_prot_commit(vma, vmf->address, vmf->pte, pte); update_mmu_cache(vma, vmf->address, vmf->pte); page = vm_normal_page(vma, vmf->address, pte); diff --git a/mm/mprotect.c b/mm/mprotect.c index 36cb358db170..c89ce07923c8 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -110,7 +110,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, continue; } - ptent = ptep_modify_prot_start(mm, addr, pte); + ptent = ptep_modify_prot_start(vma, addr, pte); ptent = pte_modify(ptent, newprot); if (preserve_write) ptent = pte_mk_savedwrite(ptent); @@ -121,7 +121,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, !(vma->vm_flags & VM_SOFTDIRTY))) { ptent = pte_mkwrite(ptent); } - ptep_modify_prot_commit(mm, addr, pte, ptent); + ptep_modify_prot_commit(vma, addr, pte, ptent); pages++; } else if (IS_ENABLED(CONFIG_MIGRATION)) { swp_entry_t entry = pte_to_swp_entry(oldpte); From 04a8645304500be88b3345b65fef7efe58016166 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 5 Mar 2019 15:46:29 -0800 Subject: [PATCH 087/159] mm: update ptep_modify_prot_commit to take old pte value as arg Architectures like ppc64 require to do a conditional tlb flush based on the old and new value of pte. Enable that by passing old pte value as the arg. Link: http://lkml.kernel.org/r/20190116085035.29729-3-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Cc: Benjamin Herrenschmidt Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Martin Schwidefsky Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Paul Mackerras Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/s390/include/asm/pgtable.h | 3 ++- arch/s390/mm/pgtable.c | 2 +- arch/x86/include/asm/paravirt.h | 2 +- fs/proc/task_mmu.c | 8 +++++--- include/asm-generic/pgtable.h | 2 +- mm/memory.c | 8 ++++---- mm/mprotect.c | 6 +++--- 7 files changed, 17 insertions(+), 14 deletions(-) diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 5d730199e37b..76dc344edb8c 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -1070,7 +1070,8 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, #define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION pte_t ptep_modify_prot_start(struct vm_area_struct *, unsigned long, pte_t *); -void ptep_modify_prot_commit(struct vm_area_struct *, unsigned long, pte_t *, pte_t); +void ptep_modify_prot_commit(struct vm_area_struct *, unsigned long, + pte_t *, pte_t, pte_t); #define __HAVE_ARCH_PTEP_CLEAR_FLUSH static inline pte_t ptep_clear_flush(struct vm_area_struct *vma, diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index 71aa01170768..8485d6dc2754 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -321,7 +321,7 @@ pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, } void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, - pte_t *ptep, pte_t pte) + pte_t *ptep, pte_t old_pte, pte_t pte) { pgste_t pgste; struct mm_struct *mm = vma->vm_mm; diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index c5a7f18cce7e..c25c38a05c1c 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -433,7 +433,7 @@ static inline pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned } static inline void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, - pte_t *ptep, pte_t pte) + pte_t *ptep, pte_t old_pte, pte_t pte) { if (sizeof(pteval_t) > sizeof(long)) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 9c2ef731dd5f..beccb0b1d57c 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -948,10 +948,12 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma, pte_t ptent = *pte; if (pte_present(ptent)) { - ptent = ptep_modify_prot_start(vma, addr, pte); - ptent = pte_wrprotect(ptent); + pte_t old_pte; + + old_pte = ptep_modify_prot_start(vma, addr, pte); + ptent = pte_wrprotect(old_pte); ptent = pte_clear_soft_dirty(ptent); - ptep_modify_prot_commit(vma, addr, pte, ptent); + ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent); } else if (is_swap_pte(ptent)) { ptent = pte_swp_clear_soft_dirty(ptent); set_pte_at(vma->vm_mm, addr, pte, ptent); diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 8b0e933efe26..fa782fba51ee 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -657,7 +657,7 @@ static inline pte_t ptep_modify_prot_start(struct vm_area_struct *vma, */ static inline void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, - pte_t *ptep, pte_t pte) + pte_t *ptep, pte_t old_pte, pte_t pte) { __ptep_modify_prot_commit(vma, addr, ptep, pte); } diff --git a/mm/memory.c b/mm/memory.c index 5ade52502ea0..557c6fffedd1 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3599,7 +3599,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) int last_cpupid; int target_nid; bool migrated = false; - pte_t pte; + pte_t pte, old_pte; bool was_writable = pte_savedwrite(vmf->orig_pte); int flags = 0; @@ -3619,12 +3619,12 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) * Make it present again, Depending on how arch implementes non * accessible ptes, some can allow access by kernel mode. */ - pte = ptep_modify_prot_start(vma, vmf->address, vmf->pte); - pte = pte_modify(pte, vma->vm_page_prot); + old_pte = ptep_modify_prot_start(vma, vmf->address, vmf->pte); + pte = pte_modify(old_pte, vma->vm_page_prot); pte = pte_mkyoung(pte); if (was_writable) pte = pte_mkwrite(pte); - ptep_modify_prot_commit(vma, vmf->address, vmf->pte, pte); + ptep_modify_prot_commit(vma, vmf->address, vmf->pte, old_pte, pte); update_mmu_cache(vma, vmf->address, vmf->pte); page = vm_normal_page(vma, vmf->address, pte); diff --git a/mm/mprotect.c b/mm/mprotect.c index c89ce07923c8..028c724dcb1a 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -110,8 +110,8 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, continue; } - ptent = ptep_modify_prot_start(vma, addr, pte); - ptent = pte_modify(ptent, newprot); + oldpte = ptep_modify_prot_start(vma, addr, pte); + ptent = pte_modify(oldpte, newprot); if (preserve_write) ptent = pte_mk_savedwrite(ptent); @@ -121,7 +121,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, !(vma->vm_flags & VM_SOFTDIRTY))) { ptent = pte_mkwrite(ptent); } - ptep_modify_prot_commit(vma, addr, pte, ptent); + ptep_modify_prot_commit(vma, addr, pte, oldpte, ptent); pages++; } else if (IS_ENABLED(CONFIG_MIGRATION)) { swp_entry_t entry = pte_to_swp_entry(oldpte); From 5b323367ef2567dff0d3daff477186002754f0bd Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 5 Mar 2019 15:46:33 -0800 Subject: [PATCH 088/159] arch/powerpc/mm: Nest MMU workaround for mprotect RW upgrade NestMMU requires us to mark the pte invalid and flush the tlb when we do a RW upgrade of pte. We fixed a variant of this in the fault path in bd5050e38aec ("powerpc/mm/radix: Change pte relax sequence to handle nest MMU hang"). Do the same for mprotect upgrades. Hugetlb is handled in the next patch. Link: http://lkml.kernel.org/r/20190116085035.29729-4-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Cc: Benjamin Herrenschmidt Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Martin Schwidefsky Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Paul Mackerras Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/include/asm/book3s/64/pgtable.h | 18 ++++++++++++++ arch/powerpc/include/asm/book3s/64/radix.h | 4 ++++ arch/powerpc/mm/pgtable-book3s64.c | 25 ++++++++++++++++++++ arch/powerpc/mm/pgtable-radix.c | 18 ++++++++++++++ 4 files changed, 65 insertions(+) diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index d8c8d7c9df15..868fcaf56f6b 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -1306,6 +1306,24 @@ static inline int pud_pfn(pud_t pud) BUILD_BUG(); return 0; } +#define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION +pte_t ptep_modify_prot_start(struct vm_area_struct *, unsigned long, pte_t *); +void ptep_modify_prot_commit(struct vm_area_struct *, unsigned long, + pte_t *, pte_t, pte_t); + +/* + * Returns true for a R -> RW upgrade of pte + */ +static inline bool is_pte_rw_upgrade(unsigned long old_val, unsigned long new_val) +{ + if (!(old_val & _PAGE_READ)) + return false; + + if ((!(old_val & _PAGE_WRITE)) && (new_val & _PAGE_WRITE)) + return true; + + return false; +} #endif /* __ASSEMBLY__ */ #endif /* _ASM_POWERPC_BOOK3S_64_PGTABLE_H_ */ diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h index 7d1a3d1543fc..5ab134eeed20 100644 --- a/arch/powerpc/include/asm/book3s/64/radix.h +++ b/arch/powerpc/include/asm/book3s/64/radix.h @@ -127,6 +127,10 @@ extern void radix__ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep pte_t entry, unsigned long address, int psize); +extern void radix__ptep_modify_prot_commit(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, + pte_t old_pte, pte_t pte); + static inline unsigned long __radix_pte_update(pte_t *ptep, unsigned long clr, unsigned long set) { diff --git a/arch/powerpc/mm/pgtable-book3s64.c b/arch/powerpc/mm/pgtable-book3s64.c index ecd31569a120..e7da590c7a78 100644 --- a/arch/powerpc/mm/pgtable-book3s64.c +++ b/arch/powerpc/mm/pgtable-book3s64.c @@ -401,6 +401,31 @@ void arch_report_meminfo(struct seq_file *m) } #endif /* CONFIG_PROC_FS */ +pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, + pte_t *ptep) +{ + unsigned long pte_val; + + /* + * Clear the _PAGE_PRESENT so that no hardware parallel update is + * possible. Also keep the pte_present true so that we don't take + * wrong fault. + */ + pte_val = pte_update(vma->vm_mm, addr, ptep, _PAGE_PRESENT, _PAGE_INVALID, 0); + + return __pte(pte_val); + +} + +void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, + pte_t *ptep, pte_t old_pte, pte_t pte) +{ + if (radix_enabled()) + return radix__ptep_modify_prot_commit(vma, addr, + ptep, old_pte, pte); + set_pte_at(vma->vm_mm, addr, ptep, pte); +} + /* * For hash translation mode, we use the deposited table to store hash slot * information and they are stored at PTRS_PER_PMD offset from related pmd diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c index 931156069a81..dced3cd241c2 100644 --- a/arch/powerpc/mm/pgtable-radix.c +++ b/arch/powerpc/mm/pgtable-radix.c @@ -1063,3 +1063,21 @@ void radix__ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep, } /* See ptesync comment in radix__set_pte_at */ } + +void radix__ptep_modify_prot_commit(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, + pte_t old_pte, pte_t pte) +{ + struct mm_struct *mm = vma->vm_mm; + + /* + * To avoid NMMU hang while relaxing access we need to flush the tlb before + * we set the new value. We need to do this only for radix, because hash + * translation does flush when updating the linux pte. + */ + if (is_pte_rw_upgrade(pte_val(old_pte), pte_val(pte)) && + (atomic_read(&mm->context.copros) > 0)) + radix__flush_tlb_page(vma, addr); + + set_pte_at(mm, addr, ptep, pte); +} From 023bdd00235eb0dcb71fd98f0b8347a9bb85d417 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 5 Mar 2019 15:46:37 -0800 Subject: [PATCH 089/159] mm/hugetlb: add prot_modify_start/commit sequence for hugetlb update Architectures like ppc64 require to do a conditional tlb flush based on the old and new value of pte. Follow the regular pte change protection sequence for hugetlb too. This allows the architectures to override the update sequence. Link: http://lkml.kernel.org/r/20190116085035.29729-5-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Reviewed-by: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Martin Schwidefsky Cc: Nicholas Piggin Cc: Paul Mackerras Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 20 ++++++++++++++++++++ mm/hugetlb.c | 8 +++++--- 2 files changed, 25 insertions(+), 3 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 4cc3871b65fc..54c317c8355f 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -580,6 +580,26 @@ static inline void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr set_huge_pte_at(mm, addr, ptep, pte); } #endif + +#ifndef huge_ptep_modify_prot_start +#define huge_ptep_modify_prot_start huge_ptep_modify_prot_start +static inline pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) +{ + return huge_ptep_get_and_clear(vma->vm_mm, addr, ptep); +} +#endif + +#ifndef huge_ptep_modify_prot_commit +#define huge_ptep_modify_prot_commit huge_ptep_modify_prot_commit +static inline void huge_ptep_modify_prot_commit(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, + pte_t old_pte, pte_t pte) +{ + set_huge_pte_at(vma->vm_mm, addr, ptep, pte); +} +#endif + #else /* CONFIG_HUGETLB_PAGE */ struct hstate {}; #define alloc_huge_page(v, a, r) NULL diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 2fb3062a3595..0c7848fccf93 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4399,10 +4399,12 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, continue; } if (!huge_pte_none(pte)) { - pte = huge_ptep_get_and_clear(mm, address, ptep); - pte = pte_mkhuge(huge_pte_modify(pte, newprot)); + pte_t old_pte; + + old_pte = huge_ptep_modify_prot_start(vma, address, ptep); + pte = pte_mkhuge(huge_pte_modify(old_pte, newprot)); pte = arch_make_huge_pte(pte, vma, NULL, 0); - set_huge_pte_at(mm, address, ptep, pte); + huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte); pages++; } spin_unlock(ptl); From 8ef5cbde6dafce8f30edb53f87cb485ceace63df Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 5 Mar 2019 15:46:40 -0800 Subject: [PATCH 090/159] arch/powerpc/mm/hugetlb: NestMMU workaround for hugetlb mprotect RW upgrade NestMMU requires us to mark the pte invalid and flush the tlb when we do a RW upgrade of pte. We fixed a variant of this in the fault path in bd5050e38aec ("powerpc/mm/radix: Change pte relax sequence to handle nest MMU hang"). Link: http://lkml.kernel.org/r/20190116085035.29729-6-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Reviewed-by: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Martin Schwidefsky Cc: Nicholas Piggin Cc: Paul Mackerras Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/include/asm/book3s/64/hugetlb.h | 12 ++++++++++ arch/powerpc/mm/hugetlbpage-hash64.c | 25 ++++++++++++++++++++ arch/powerpc/mm/hugetlbpage-radix.c | 17 +++++++++++++ 3 files changed, 54 insertions(+) diff --git a/arch/powerpc/include/asm/book3s/64/hugetlb.h b/arch/powerpc/include/asm/book3s/64/hugetlb.h index 5b0177733994..66c1e4f88d65 100644 --- a/arch/powerpc/include/asm/book3s/64/hugetlb.h +++ b/arch/powerpc/include/asm/book3s/64/hugetlb.h @@ -13,6 +13,10 @@ radix__hugetlb_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); +extern void radix__huge_ptep_modify_prot_commit(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, + pte_t old_pte, pte_t pte); + static inline int hstate_get_psize(struct hstate *hstate) { unsigned long shift; @@ -42,4 +46,12 @@ static inline bool gigantic_page_supported(void) /* hugepd entry valid bit */ #define HUGEPD_VAL_BITS (0x8000000000000000UL) +#define huge_ptep_modify_prot_start huge_ptep_modify_prot_start +extern pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep); + +#define huge_ptep_modify_prot_commit huge_ptep_modify_prot_commit +extern void huge_ptep_modify_prot_commit(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, + pte_t old_pte, pte_t new_pte); #endif diff --git a/arch/powerpc/mm/hugetlbpage-hash64.c b/arch/powerpc/mm/hugetlbpage-hash64.c index 2e6a8f9345d3..367ce3a4a503 100644 --- a/arch/powerpc/mm/hugetlbpage-hash64.c +++ b/arch/powerpc/mm/hugetlbpage-hash64.c @@ -121,3 +121,28 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid, *ptep = __pte(new_pte & ~H_PAGE_BUSY); return 0; } + +pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) +{ + unsigned long pte_val; + /* + * Clear the _PAGE_PRESENT so that no hardware parallel update is + * possible. Also keep the pte_present true so that we don't take + * wrong fault. + */ + pte_val = pte_update(vma->vm_mm, addr, ptep, + _PAGE_PRESENT, _PAGE_INVALID, 1); + + return __pte(pte_val); +} + +void huge_ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, + pte_t *ptep, pte_t old_pte, pte_t pte) +{ + + if (radix_enabled()) + return radix__huge_ptep_modify_prot_commit(vma, addr, ptep, + old_pte, pte); + set_huge_pte_at(vma->vm_mm, addr, ptep, pte); +} diff --git a/arch/powerpc/mm/hugetlbpage-radix.c b/arch/powerpc/mm/hugetlbpage-radix.c index 2486bee0f93e..11d9ea28a816 100644 --- a/arch/powerpc/mm/hugetlbpage-radix.c +++ b/arch/powerpc/mm/hugetlbpage-radix.c @@ -90,3 +90,20 @@ radix__hugetlb_get_unmapped_area(struct file *file, unsigned long addr, return vm_unmapped_area(&info); } + +void radix__huge_ptep_modify_prot_commit(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, + pte_t old_pte, pte_t pte) +{ + struct mm_struct *mm = vma->vm_mm; + + /* + * To avoid NMMU hang while relaxing access we need to flush the tlb before + * we set the new value. + */ + if (is_pte_rw_upgrade(pte_val(old_pte), pte_val(pte)) && + (atomic_read(&mm->context.copros) > 0)) + radix__flush_hugetlb_page(vma, addr); + + set_huge_pte_at(vma->vm_mm, addr, ptep, pte); +} From 23a7052a5db478bdacf45bea55e6f50171f5eede Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Tue, 5 Mar 2019 15:46:43 -0800 Subject: [PATCH 091/159] mm/page_alloc.c: check return value of memblock_alloc_node_nopanic() There are two early memory allocations that use memblock_alloc_node_nopanic() and do not check its return value. While this happens very early during boot and chances that the allocation will fail are diminishing, it is still worth to have proper checks for the allocation errors. Link: http://lkml.kernel.org/r/1547734941-944-1-git-send-email-rppt@linux.ibm.com Signed-off-by: Mike Rapoport Reviewed-by: William Kucharski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ec250453f5e8..11a5f50efd97 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6431,10 +6431,14 @@ static void __ref setup_usemap(struct pglist_data *pgdat, { unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize); zone->pageblock_flags = NULL; - if (usemapsize) + if (usemapsize) { zone->pageblock_flags = memblock_alloc_node_nopanic(usemapsize, pgdat->node_id); + if (!zone->pageblock_flags) + panic("Failed to allocate %ld bytes for zone %s pageblock flags on node %d\n", + usemapsize, zone->name, pgdat->node_id); + } } #else static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone, @@ -6664,6 +6668,9 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat) end = ALIGN(end, MAX_ORDER_NR_PAGES); size = (end - start) * sizeof(struct page); map = memblock_alloc_node_nopanic(size, pgdat->node_id); + if (!map) + panic("Failed to allocate %ld bytes for node %d memory map\n", + size, pgdat->node_id); pgdat->node_mem_map = map + offset; } pr_debug("%s: node %d, pgdat %08lx, node_mem_map %08lx\n", From 7775face207922ea62a4e96b9cd45abfdc7b9840 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Tue, 5 Mar 2019 15:46:47 -0800 Subject: [PATCH 092/159] memcg: killed threads should not invoke memcg OOM killer If a memory cgroup contains a single process with many threads (including different process group sharing the mm) then it is possible to trigger a race when the oom killer complains that there are no oom elible tasks and complain into the log which is both annoying and confusing because there is no actual problem. The race looks as follows: P1 oom_reaper P2 try_charge try_charge mem_cgroup_out_of_memory mutex_lock(oom_lock) out_of_memory oom_kill_process(P1,P2) wake_oom_reaper mutex_unlock(oom_lock) oom_reap_task mutex_lock(oom_lock) select_bad_process # no victim The problem is more visible with many threads. Fix this by checking for fatal_signal_pending from mem_cgroup_out_of_memory when the oom_lock is already held. The oom bypass is safe because we do the same early in the try_charge path already. The situation migh have changed in the mean time. It should be safe to check for fatal_signal_pending and tsk_is_oom_victim but for a better code readability abstract the current charge bypass condition into should_force_charge and reuse it from that path. " Link: http://lkml.kernel.org/r/01370f70-e1f6-ebe4-b95e-0df21a0bc15e@i-love.sakura.ne.jp Signed-off-by: Tetsuo Handa Acked-by: Michal Hocko Acked-by: Johannes Weiner Cc: David Rientjes Cc: Kirill Tkhai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 59425a35763c..1eca627566ff 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -248,6 +248,12 @@ enum res_type { iter != NULL; \ iter = mem_cgroup_iter(NULL, iter, NULL)) +static inline bool should_force_charge(void) +{ + return tsk_is_oom_victim(current) || fatal_signal_pending(current) || + (current->flags & PF_EXITING); +} + /* Some nice accessors for the vmpressure. */ struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg) { @@ -1389,8 +1395,13 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, }; bool ret; - mutex_lock(&oom_lock); - ret = out_of_memory(&oc); + if (mutex_lock_killable(&oom_lock)) + return true; + /* + * A few threads which were not waiting at mutex_lock_killable() can + * fail to bail out. Therefore, check again after holding oom_lock. + */ + ret = should_force_charge() || out_of_memory(&oc); mutex_unlock(&oom_lock); return ret; } @@ -2209,9 +2220,7 @@ retry: * bypass the last charges so that they can exit quickly and * free their memory. */ - if (unlikely(tsk_is_oom_victim(current) || - fatal_signal_pending(current) || - current->flags & PF_EXITING)) + if (unlikely(should_force_charge())) goto force; /* From 2e25644e8da4ed3a27e7b8315aaae74660be72dc Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 5 Mar 2019 15:46:50 -0800 Subject: [PATCH 093/159] mm, mempolicy: fix uninit memory access Syzbot with KMSAN reports (excerpt): ================================================================== BUG: KMSAN: uninit-value in mpol_rebind_policy mm/mempolicy.c:353 [inline] BUG: KMSAN: uninit-value in mpol_rebind_mm+0x249/0x370 mm/mempolicy.c:384 CPU: 1 PID: 17420 Comm: syz-executor4 Not tainted 4.20.0-rc7+ #15 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x173/0x1d0 lib/dump_stack.c:113 kmsan_report+0x12e/0x2a0 mm/kmsan/kmsan.c:613 __msan_warning+0x82/0xf0 mm/kmsan/kmsan_instr.c:295 mpol_rebind_policy mm/mempolicy.c:353 [inline] mpol_rebind_mm+0x249/0x370 mm/mempolicy.c:384 update_tasks_nodemask+0x608/0xca0 kernel/cgroup/cpuset.c:1120 update_nodemasks_hier kernel/cgroup/cpuset.c:1185 [inline] update_nodemask kernel/cgroup/cpuset.c:1253 [inline] cpuset_write_resmask+0x2a98/0x34b0 kernel/cgroup/cpuset.c:1728 ... Uninit was created at: kmsan_save_stack_with_flags mm/kmsan/kmsan.c:204 [inline] kmsan_internal_poison_shadow+0x92/0x150 mm/kmsan/kmsan.c:158 kmsan_kmalloc+0xa6/0x130 mm/kmsan/kmsan_hooks.c:176 kmem_cache_alloc+0x572/0xb90 mm/slub.c:2777 mpol_new mm/mempolicy.c:276 [inline] do_mbind mm/mempolicy.c:1180 [inline] kernel_mbind+0x8a7/0x31a0 mm/mempolicy.c:1347 __do_sys_mbind mm/mempolicy.c:1354 [inline] As it's difficult to report where exactly the uninit value resides in the mempolicy object, we have to guess a bit. mm/mempolicy.c:353 contains this part of mpol_rebind_policy(): if (!mpol_store_user_nodemask(pol) && nodes_equal(pol->w.cpuset_mems_allowed, *newmask)) "mpol_store_user_nodemask(pol)" is testing pol->flags, which I couldn't ever see being uninitialized after leaving mpol_new(). So I'll guess it's actually about accessing pol->w.cpuset_mems_allowed on line 354, but still part of statement starting on line 353. For w.cpuset_mems_allowed to be not initialized, and the nodes_equal() reachable for a mempolicy where mpol_set_nodemask() is called in do_mbind(), it seems the only possibility is a MPOL_PREFERRED policy with empty set of nodes, i.e. MPOL_LOCAL equivalent, with MPOL_F_LOCAL flag. Let's exclude such policies from the nodes_equal() check. Note the uninit access should be benign anyway, as rebinding this kind of policy is always a no-op. Therefore no actual need for stable inclusion. Link: http://lkml.kernel.org/r/a71997c3-e8ae-a787-d5ce-3db05768b27c@suse.cz Link: http://lkml.kernel.org/r/73da3e9c-cc84-509e-17d9-0c434bb9967d@suse.cz Signed-off-by: Vlastimil Babka Reported-by: syzbot+b19c2dc2c990ea657a71@syzkaller.appspotmail.com Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Andrea Arcangeli Cc: "Kirill A. Shutemov" Cc: Michal Hocko Cc: David Rientjes Cc: Yisheng Xie Cc: zhong jiang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 76e7e4bc3335..af171ccb56a2 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -350,7 +350,7 @@ static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask) { if (!pol) return; - if (!mpol_store_user_nodemask(pol) && + if (!mpol_store_user_nodemask(pol) && !(pol->flags & MPOL_F_LOCAL) && nodes_equal(pol->w.cpuset_mems_allowed, *newmask)) return; From a9e7c39fa9fd908bc914d691045c96fdc97da7cd Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Tue, 5 Mar 2019 15:46:55 -0800 Subject: [PATCH 094/159] mm/vmscan.c: remove 7th argument of isolate_lru_pages() We may simply check for sc->may_unmap in isolate_lru_pages() instead of doing that in both of its callers. Link: http://lkml.kernel.org/r/154748280735.29962.15867846875217618569.stgit@localhost.localdomain Signed-off-by: Kirill Tkhai Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 63195364ab2e..a1c2f78cb78c 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1653,7 +1653,7 @@ static __always_inline void update_lru_sizes(struct lruvec *lruvec, static unsigned long isolate_lru_pages(unsigned long nr_to_scan, struct lruvec *lruvec, struct list_head *dst, unsigned long *nr_scanned, struct scan_control *sc, - isolate_mode_t mode, enum lru_list lru) + enum lru_list lru) { struct list_head *src = &lruvec->lists[lru]; unsigned long nr_taken = 0; @@ -1662,6 +1662,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, unsigned long skipped = 0; unsigned long scan, total_scan, nr_pages; LIST_HEAD(pages_skipped); + isolate_mode_t mode = (sc->may_unmap ? 0 : ISOLATE_UNMAPPED); scan = 0; for (total_scan = 0; @@ -1900,7 +1901,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, unsigned long nr_reclaimed = 0; unsigned long nr_taken; struct reclaim_stat stat = {}; - isolate_mode_t isolate_mode = 0; int file = is_file_lru(lru); struct pglist_data *pgdat = lruvec_pgdat(lruvec); struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; @@ -1921,13 +1921,10 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, lru_add_drain(); - if (!sc->may_unmap) - isolate_mode |= ISOLATE_UNMAPPED; - spin_lock_irq(&pgdat->lru_lock); nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list, - &nr_scanned, sc, isolate_mode, lru); + &nr_scanned, sc, lru); __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken); reclaim_stat->recent_scanned[file] += nr_taken; @@ -2084,19 +2081,15 @@ static void shrink_active_list(unsigned long nr_to_scan, struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; unsigned nr_deactivate, nr_activate; unsigned nr_rotated = 0; - isolate_mode_t isolate_mode = 0; int file = is_file_lru(lru); struct pglist_data *pgdat = lruvec_pgdat(lruvec); lru_add_drain(); - if (!sc->may_unmap) - isolate_mode |= ISOLATE_UNMAPPED; - spin_lock_irq(&pgdat->lru_lock); nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold, - &nr_scanned, sc, isolate_mode, lru); + &nr_scanned, sc, lru); __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken); reclaim_stat->recent_scanned[file] += nr_taken; From c5bf121e4350a933bd431385e6fcb72a898ecc68 Mon Sep 17 00:00:00 2001 From: Vineeth Remanan Pillai Date: Tue, 5 Mar 2019 15:46:58 -0800 Subject: [PATCH 095/159] mm: refactor swap-in logic out of shmem_getpage_gfp swapin logic can be reused independently without rest of the logic in shmem_getpage_gfp. So lets refactor it out as an independent function. Link: http://lkml.kernel.org/r/20190114153129.4852-1-vpillai@digitalocean.com Signed-off-by: Vineeth Remanan Pillai Reviewed-by: Andrew Morton Cc: Huang Ying Cc: Hugh Dickins Cc: Kelley Nielsen Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 463 +++++++++++++++++++++++++++++------------------------ 1 file changed, 251 insertions(+), 212 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 2c012eee133d..b4d27ef87496 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -123,6 +123,10 @@ static unsigned long shmem_default_max_inodes(void) static bool shmem_should_replace_page(struct page *page, gfp_t gfp); static int shmem_replace_page(struct page **pagep, gfp_t gfp, struct shmem_inode_info *info, pgoff_t index); +static int shmem_swapin_page(struct inode *inode, pgoff_t index, + struct page **pagep, enum sgp_type sgp, + gfp_t gfp, struct vm_area_struct *vma, + vm_fault_t *fault_type); static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, struct page **pagep, enum sgp_type sgp, gfp_t gfp, struct vm_area_struct *vma, @@ -1575,6 +1579,116 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, return error; } +/* + * Swap in the page pointed to by *pagep. + * Caller has to make sure that *pagep contains a valid swapped page. + * Returns 0 and the page in pagep if success. On failure, returns the + * the error code and NULL in *pagep. + */ +static int shmem_swapin_page(struct inode *inode, pgoff_t index, + struct page **pagep, enum sgp_type sgp, + gfp_t gfp, struct vm_area_struct *vma, + vm_fault_t *fault_type) +{ + struct address_space *mapping = inode->i_mapping; + struct shmem_inode_info *info = SHMEM_I(inode); + struct mm_struct *charge_mm = vma ? vma->vm_mm : current->mm; + struct mem_cgroup *memcg; + struct page *page; + swp_entry_t swap; + int error; + + VM_BUG_ON(!*pagep || !xa_is_value(*pagep)); + swap = radix_to_swp_entry(*pagep); + *pagep = NULL; + + /* Look it up and read it in.. */ + page = lookup_swap_cache(swap, NULL, 0); + if (!page) { + /* Or update major stats only when swapin succeeds?? */ + if (fault_type) { + *fault_type |= VM_FAULT_MAJOR; + count_vm_event(PGMAJFAULT); + count_memcg_event_mm(charge_mm, PGMAJFAULT); + } + /* Here we actually start the io */ + page = shmem_swapin(swap, gfp, info, index); + if (!page) { + error = -ENOMEM; + goto failed; + } + } + + /* We have to do this with page locked to prevent races */ + lock_page(page); + if (!PageSwapCache(page) || page_private(page) != swap.val || + !shmem_confirm_swap(mapping, index, swap)) { + error = -EEXIST; + goto unlock; + } + if (!PageUptodate(page)) { + error = -EIO; + goto failed; + } + wait_on_page_writeback(page); + + if (shmem_should_replace_page(page, gfp)) { + error = shmem_replace_page(&page, gfp, info, index); + if (error) + goto failed; + } + + error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg, + false); + if (!error) { + error = shmem_add_to_page_cache(page, mapping, index, + swp_to_radix_entry(swap), gfp); + /* + * We already confirmed swap under page lock, and make + * no memory allocation here, so usually no possibility + * of error; but free_swap_and_cache() only trylocks a + * page, so it is just possible that the entry has been + * truncated or holepunched since swap was confirmed. + * shmem_undo_range() will have done some of the + * unaccounting, now delete_from_swap_cache() will do + * the rest. + */ + if (error) { + mem_cgroup_cancel_charge(page, memcg, false); + delete_from_swap_cache(page); + } + } + if (error) + goto failed; + + mem_cgroup_commit_charge(page, memcg, true, false); + + spin_lock_irq(&info->lock); + info->swapped--; + shmem_recalc_inode(inode); + spin_unlock_irq(&info->lock); + + if (sgp == SGP_WRITE) + mark_page_accessed(page); + + delete_from_swap_cache(page); + set_page_dirty(page); + swap_free(swap); + + *pagep = page; + return 0; +failed: + if (!shmem_confirm_swap(mapping, index, swap)) + error = -EEXIST; +unlock: + if (page) { + unlock_page(page); + put_page(page); + } + + return error; +} + /* * shmem_getpage_gfp - find page in cache, or get from swap, or allocate * @@ -1596,7 +1710,6 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, struct mm_struct *charge_mm; struct mem_cgroup *memcg; struct page *page; - swp_entry_t swap; enum sgp_type sgp_huge = sgp; pgoff_t hindex = index; int error; @@ -1608,17 +1721,23 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, if (sgp == SGP_NOHUGE || sgp == SGP_HUGE) sgp = SGP_CACHE; repeat: - swap.val = 0; - page = find_lock_entry(mapping, index); - if (xa_is_value(page)) { - swap = radix_to_swp_entry(page); - page = NULL; - } - if (sgp <= SGP_CACHE && ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) { - error = -EINVAL; - goto unlock; + return -EINVAL; + } + + sbinfo = SHMEM_SB(inode->i_sb); + charge_mm = vma ? vma->vm_mm : current->mm; + + page = find_lock_entry(mapping, index); + if (xa_is_value(page)) { + error = shmem_swapin_page(inode, index, &page, + sgp, gfp, vma, fault_type); + if (error == -EEXIST) + goto repeat; + + *pagep = page; + return error; } if (page && sgp == SGP_WRITE) @@ -1632,7 +1751,7 @@ repeat: put_page(page); page = NULL; } - if (page || (sgp == SGP_READ && !swap.val)) { + if (page || sgp == SGP_READ) { *pagep = page; return 0; } @@ -1641,215 +1760,138 @@ repeat: * Fast cache lookup did not find it: * bring it back from swap or allocate. */ - sbinfo = SHMEM_SB(inode->i_sb); - charge_mm = vma ? vma->vm_mm : current->mm; - if (swap.val) { - /* Look it up and read it in.. */ - page = lookup_swap_cache(swap, NULL, 0); - if (!page) { - /* Or update major stats only when swapin succeeds?? */ - if (fault_type) { - *fault_type |= VM_FAULT_MAJOR; - count_vm_event(PGMAJFAULT); - count_memcg_event_mm(charge_mm, PGMAJFAULT); - } - /* Here we actually start the io */ - page = shmem_swapin(swap, gfp, info, index); - if (!page) { - error = -ENOMEM; - goto failed; - } - } + if (vma && userfaultfd_missing(vma)) { + *fault_type = handle_userfault(vmf, VM_UFFD_MISSING); + return 0; + } - /* We have to do this with page locked to prevent races */ - lock_page(page); - if (!PageSwapCache(page) || page_private(page) != swap.val || - !shmem_confirm_swap(mapping, index, swap)) { - error = -EEXIST; /* try again */ - goto unlock; - } - if (!PageUptodate(page)) { - error = -EIO; - goto failed; - } - wait_on_page_writeback(page); - - if (shmem_should_replace_page(page, gfp)) { - error = shmem_replace_page(&page, gfp, info, index); - if (error) - goto failed; - } - - error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg, - false); - if (!error) { - error = shmem_add_to_page_cache(page, mapping, index, - swp_to_radix_entry(swap), gfp); - /* - * We already confirmed swap under page lock, and make - * no memory allocation here, so usually no possibility - * of error; but free_swap_and_cache() only trylocks a - * page, so it is just possible that the entry has been - * truncated or holepunched since swap was confirmed. - * shmem_undo_range() will have done some of the - * unaccounting, now delete_from_swap_cache() will do - * the rest. - * Reset swap.val? No, leave it so "failed" goes back to - * "repeat": reading a hole and writing should succeed. - */ - if (error) { - mem_cgroup_cancel_charge(page, memcg, false); - delete_from_swap_cache(page); - } - } - if (error) - goto failed; - - mem_cgroup_commit_charge(page, memcg, true, false); - - spin_lock_irq(&info->lock); - info->swapped--; - shmem_recalc_inode(inode); - spin_unlock_irq(&info->lock); - - if (sgp == SGP_WRITE) - mark_page_accessed(page); - - delete_from_swap_cache(page); - set_page_dirty(page); - swap_free(swap); - - } else { - if (vma && userfaultfd_missing(vma)) { - *fault_type = handle_userfault(vmf, VM_UFFD_MISSING); - return 0; - } - - /* shmem_symlink() */ - if (mapping->a_ops != &shmem_aops) - goto alloc_nohuge; - if (shmem_huge == SHMEM_HUGE_DENY || sgp_huge == SGP_NOHUGE) - goto alloc_nohuge; - if (shmem_huge == SHMEM_HUGE_FORCE) + /* shmem_symlink() */ + if (mapping->a_ops != &shmem_aops) + goto alloc_nohuge; + if (shmem_huge == SHMEM_HUGE_DENY || sgp_huge == SGP_NOHUGE) + goto alloc_nohuge; + if (shmem_huge == SHMEM_HUGE_FORCE) + goto alloc_huge; + switch (sbinfo->huge) { + loff_t i_size; + pgoff_t off; + case SHMEM_HUGE_NEVER: + goto alloc_nohuge; + case SHMEM_HUGE_WITHIN_SIZE: + off = round_up(index, HPAGE_PMD_NR); + i_size = round_up(i_size_read(inode), PAGE_SIZE); + if (i_size >= HPAGE_PMD_SIZE && + i_size >> PAGE_SHIFT >= off) goto alloc_huge; - switch (sbinfo->huge) { - loff_t i_size; - pgoff_t off; - case SHMEM_HUGE_NEVER: - goto alloc_nohuge; - case SHMEM_HUGE_WITHIN_SIZE: - off = round_up(index, HPAGE_PMD_NR); - i_size = round_up(i_size_read(inode), PAGE_SIZE); - if (i_size >= HPAGE_PMD_SIZE && - i_size >> PAGE_SHIFT >= off) - goto alloc_huge; - /* fallthrough */ - case SHMEM_HUGE_ADVISE: - if (sgp_huge == SGP_HUGE) - goto alloc_huge; - /* TODO: implement fadvise() hints */ - goto alloc_nohuge; - } + /* fallthrough */ + case SHMEM_HUGE_ADVISE: + if (sgp_huge == SGP_HUGE) + goto alloc_huge; + /* TODO: implement fadvise() hints */ + goto alloc_nohuge; + } alloc_huge: - page = shmem_alloc_and_acct_page(gfp, inode, index, true); - if (IS_ERR(page)) { -alloc_nohuge: page = shmem_alloc_and_acct_page(gfp, inode, - index, false); - } - if (IS_ERR(page)) { - int retry = 5; - error = PTR_ERR(page); - page = NULL; - if (error != -ENOSPC) - goto failed; - /* - * Try to reclaim some spece by splitting a huge page - * beyond i_size on the filesystem. - */ - while (retry--) { - int ret; - ret = shmem_unused_huge_shrink(sbinfo, NULL, 1); - if (ret == SHRINK_STOP) - break; - if (ret) - goto alloc_nohuge; - } - goto failed; - } - - if (PageTransHuge(page)) - hindex = round_down(index, HPAGE_PMD_NR); - else - hindex = index; - - if (sgp == SGP_WRITE) - __SetPageReferenced(page); - - error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg, - PageTransHuge(page)); - if (error) - goto unacct; - error = shmem_add_to_page_cache(page, mapping, hindex, - NULL, gfp & GFP_RECLAIM_MASK); - if (error) { - mem_cgroup_cancel_charge(page, memcg, - PageTransHuge(page)); - goto unacct; - } - mem_cgroup_commit_charge(page, memcg, false, - PageTransHuge(page)); - lru_cache_add_anon(page); - - spin_lock_irq(&info->lock); - info->alloced += 1 << compound_order(page); - inode->i_blocks += BLOCKS_PER_PAGE << compound_order(page); - shmem_recalc_inode(inode); - spin_unlock_irq(&info->lock); - alloced = true; - - if (PageTransHuge(page) && - DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) < - hindex + HPAGE_PMD_NR - 1) { - /* - * Part of the huge page is beyond i_size: subject - * to shrink under memory pressure. - */ - spin_lock(&sbinfo->shrinklist_lock); - /* - * _careful to defend against unlocked access to - * ->shrink_list in shmem_unused_huge_shrink() - */ - if (list_empty_careful(&info->shrinklist)) { - list_add_tail(&info->shrinklist, - &sbinfo->shrinklist); - sbinfo->shrinklist_len++; - } - spin_unlock(&sbinfo->shrinklist_lock); - } + page = shmem_alloc_and_acct_page(gfp, inode, index, true); + if (IS_ERR(page)) { +alloc_nohuge: + page = shmem_alloc_and_acct_page(gfp, inode, + index, false); + } + if (IS_ERR(page)) { + int retry = 5; + error = PTR_ERR(page); + page = NULL; + if (error != -ENOSPC) + goto unlock; /* - * Let SGP_FALLOC use the SGP_WRITE optimization on a new page. + * Try to reclaim some space by splitting a huge page + * beyond i_size on the filesystem. */ - if (sgp == SGP_FALLOC) - sgp = SGP_WRITE; + while (retry--) { + int ret; + + ret = shmem_unused_huge_shrink(sbinfo, NULL, 1); + if (ret == SHRINK_STOP) + break; + if (ret) + goto alloc_nohuge; + } + goto unlock; + } + + if (PageTransHuge(page)) + hindex = round_down(index, HPAGE_PMD_NR); + else + hindex = index; + + if (sgp == SGP_WRITE) + __SetPageReferenced(page); + + error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg, + PageTransHuge(page)); + if (error) + goto unacct; + error = shmem_add_to_page_cache(page, mapping, hindex, + NULL, gfp & GFP_RECLAIM_MASK); + if (error) { + mem_cgroup_cancel_charge(page, memcg, + PageTransHuge(page)); + goto unacct; + } + mem_cgroup_commit_charge(page, memcg, false, + PageTransHuge(page)); + lru_cache_add_anon(page); + + spin_lock_irq(&info->lock); + info->alloced += 1 << compound_order(page); + inode->i_blocks += BLOCKS_PER_PAGE << compound_order(page); + shmem_recalc_inode(inode); + spin_unlock_irq(&info->lock); + alloced = true; + + if (PageTransHuge(page) && + DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) < + hindex + HPAGE_PMD_NR - 1) { + /* + * Part of the huge page is beyond i_size: subject + * to shrink under memory pressure. + */ + spin_lock(&sbinfo->shrinklist_lock); + /* + * _careful to defend against unlocked access to + * ->shrink_list in shmem_unused_huge_shrink() + */ + if (list_empty_careful(&info->shrinklist)) { + list_add_tail(&info->shrinklist, + &sbinfo->shrinklist); + sbinfo->shrinklist_len++; + } + spin_unlock(&sbinfo->shrinklist_lock); + } + + /* + * Let SGP_FALLOC use the SGP_WRITE optimization on a new page. + */ + if (sgp == SGP_FALLOC) + sgp = SGP_WRITE; clear: - /* - * Let SGP_WRITE caller clear ends if write does not fill page; - * but SGP_FALLOC on a page fallocated earlier must initialize - * it now, lest undo on failure cancel our earlier guarantee. - */ - if (sgp != SGP_WRITE && !PageUptodate(page)) { - struct page *head = compound_head(page); - int i; + /* + * Let SGP_WRITE caller clear ends if write does not fill page; + * but SGP_FALLOC on a page fallocated earlier must initialize + * it now, lest undo on failure cancel our earlier guarantee. + */ + if (sgp != SGP_WRITE && !PageUptodate(page)) { + struct page *head = compound_head(page); + int i; - for (i = 0; i < (1 << compound_order(head)); i++) { - clear_highpage(head + i); - flush_dcache_page(head + i); - } - SetPageUptodate(head); + for (i = 0; i < (1 << compound_order(head)); i++) { + clear_highpage(head + i); + flush_dcache_page(head + i); } + SetPageUptodate(head); } /* Perhaps the file has been truncated since we checked */ @@ -1879,9 +1921,6 @@ unacct: put_page(page); goto alloc_nohuge; } -failed: - if (swap.val && !shmem_confirm_swap(mapping, index, swap)) - error = -EEXIST; unlock: if (page) { unlock_page(page); From b56a2d8af9147a4efe4011b60d93779c0461ca97 Mon Sep 17 00:00:00 2001 From: Vineeth Remanan Pillai Date: Tue, 5 Mar 2019 15:47:03 -0800 Subject: [PATCH 096/159] mm: rid swapoff of quadratic complexity This patch was initially posted by Kelley Nielsen. Reposting the patch with all review comments addressed and with minor modifications and optimizations. Also, folding in the fixes offered by Hugh Dickins and Huang Ying. Tests were rerun and commit message updated with new results. try_to_unuse() is of quadratic complexity, with a lot of wasted effort. It unuses swap entries one by one, potentially iterating over all the page tables for all the processes in the system for each one. This new proposed implementation of try_to_unuse simplifies its complexity to linear. It iterates over the system's mms once, unusing all the affected entries as it walks each set of page tables. It also makes similar changes to shmem_unuse. Improvement swapoff was called on a swap partition containing about 6G of data, in a VM(8cpu, 16G RAM), and calls to unuse_pte_range() were counted. Present implementation....about 1200M calls(8min, avg 80% cpu util). Prototype.................about 9.0K calls(3min, avg 5% cpu util). Details In shmem_unuse(), iterate over the shmem_swaplist and, for each shmem_inode_info that contains a swap entry, pass it to shmem_unuse_inode(), along with the swap type. In shmem_unuse_inode(), iterate over its associated xarray, and store the index and value of each swap entry in an array for passing to shmem_swapin_page() outside of the RCU critical section. In try_to_unuse(), instead of iterating over the entries in the type and unusing them one by one, perhaps walking all the page tables for all the processes for each one, iterate over the mmlist, making one pass. Pass each mm to unuse_mm() to begin its page table walk, and during the walk, unuse all the ptes that have backing store in the swap type received by try_to_unuse(). After the walk, check the type for orphaned swap entries with find_next_to_unuse(), and remove them from the swap cache. If find_next_to_unuse() starts over at the beginning of the type, repeat the check of the shmem_swaplist and the walk a maximum of three times. Change unuse_mm() and the intervening walk functions down to unuse_pte_range() to take the type as a parameter, and to iterate over their entire range, calling the next function down on every iteration. In unuse_pte_range(), make a swap entry from each pte in the range using the passed in type. If it has backing store in the type, call swapin_readahead() to retrieve the page and pass it to unuse_pte(). Pass the count of pages_to_unuse down the page table walks in try_to_unuse(), and return from the walk when the desired number of pages has been swapped back in. Link: http://lkml.kernel.org/r/20190114153129.4852-2-vpillai@digitalocean.com Signed-off-by: Vineeth Remanan Pillai Signed-off-by: Kelley Nielsen Signed-off-by: Huang Ying Acked-by: Hugh Dickins Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/frontswap.h | 7 + include/linux/shmem_fs.h | 3 +- mm/shmem.c | 271 ++++++++++++----------- mm/swapfile.c | 447 +++++++++++++++----------------------- 4 files changed, 328 insertions(+), 400 deletions(-) diff --git a/include/linux/frontswap.h b/include/linux/frontswap.h index 011965c08b93..6d775984905b 100644 --- a/include/linux/frontswap.h +++ b/include/linux/frontswap.h @@ -7,6 +7,13 @@ #include #include +/* + * Return code to denote that requested number of + * frontswap pages are unused(moved to page cache). + * Used in in shmem_unuse and try_to_unuse. + */ +#define FRONTSWAP_PAGES_UNUSED 2 + struct frontswap_ops { void (*init)(unsigned); /* this swap type was just swapon'ed */ int (*store)(unsigned, pgoff_t, struct page *); /* store a page */ diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index f155dc607112..f3fb1edb3526 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -72,7 +72,8 @@ extern void shmem_unlock_mapping(struct address_space *mapping); extern struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, pgoff_t index, gfp_t gfp_mask); extern void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end); -extern int shmem_unuse(swp_entry_t entry, struct page *page); +extern int shmem_unuse(unsigned int type, bool frontswap, + unsigned long *fs_pages_to_unuse); extern unsigned long shmem_swap_usage(struct vm_area_struct *vma); extern unsigned long shmem_partial_swap_usage(struct address_space *mapping, diff --git a/mm/shmem.c b/mm/shmem.c index b4d27ef87496..283a1833dafc 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -36,6 +36,7 @@ #include #include #include +#include #include /* for arch/microblaze update_mmu_cache() */ @@ -1093,159 +1094,184 @@ static void shmem_evict_inode(struct inode *inode) clear_inode(inode); } -static unsigned long find_swap_entry(struct xarray *xa, void *item) +extern struct swap_info_struct *swap_info[]; + +static int shmem_find_swap_entries(struct address_space *mapping, + pgoff_t start, unsigned int nr_entries, + struct page **entries, pgoff_t *indices, + bool frontswap) { - XA_STATE(xas, xa, 0); - unsigned int checked = 0; - void *entry; + XA_STATE(xas, &mapping->i_pages, start); + struct page *page; + unsigned int ret = 0; + + if (!nr_entries) + return 0; rcu_read_lock(); - xas_for_each(&xas, entry, ULONG_MAX) { - if (xas_retry(&xas, entry)) + xas_for_each(&xas, page, ULONG_MAX) { + if (xas_retry(&xas, page)) continue; - if (entry == item) + + if (!xa_is_value(page)) + continue; + + if (frontswap) { + swp_entry_t entry = radix_to_swp_entry(page); + + if (!frontswap_test(swap_info[swp_type(entry)], + swp_offset(entry))) + continue; + } + + indices[ret] = xas.xa_index; + entries[ret] = page; + + if (need_resched()) { + xas_pause(&xas); + cond_resched_rcu(); + } + if (++ret == nr_entries) break; - checked++; - if ((checked % XA_CHECK_SCHED) != 0) - continue; - xas_pause(&xas); - cond_resched_rcu(); } rcu_read_unlock(); - return entry ? xas.xa_index : -1; + return ret; +} + +/* + * Move the swapped pages for an inode to page cache. Returns the count + * of pages swapped in, or the error in case of failure. + */ +static int shmem_unuse_swap_entries(struct inode *inode, struct pagevec pvec, + pgoff_t *indices) +{ + int i = 0; + int ret = 0; + int error = 0; + struct address_space *mapping = inode->i_mapping; + + for (i = 0; i < pvec.nr; i++) { + struct page *page = pvec.pages[i]; + + if (!xa_is_value(page)) + continue; + error = shmem_swapin_page(inode, indices[i], + &page, SGP_CACHE, + mapping_gfp_mask(mapping), + NULL, NULL); + if (error == 0) { + unlock_page(page); + put_page(page); + ret++; + } + if (error == -ENOMEM) + break; + error = 0; + } + return error ? error : ret; } /* * If swap found in inode, free it and move page from swapcache to filecache. */ -static int shmem_unuse_inode(struct shmem_inode_info *info, - swp_entry_t swap, struct page **pagep) +static int shmem_unuse_inode(struct inode *inode, unsigned int type, + bool frontswap, unsigned long *fs_pages_to_unuse) { - struct address_space *mapping = info->vfs_inode.i_mapping; - void *radswap; - pgoff_t index; - gfp_t gfp; - int error = 0; + struct address_space *mapping = inode->i_mapping; + pgoff_t start = 0; + struct pagevec pvec; + pgoff_t indices[PAGEVEC_SIZE]; + bool frontswap_partial = (frontswap && *fs_pages_to_unuse > 0); + int ret = 0; - radswap = swp_to_radix_entry(swap); - index = find_swap_entry(&mapping->i_pages, radswap); - if (index == -1) - return -EAGAIN; /* tell shmem_unuse we found nothing */ + pagevec_init(&pvec); + do { + unsigned int nr_entries = PAGEVEC_SIZE; - /* - * Move _head_ to start search for next from here. - * But be careful: shmem_evict_inode checks list_empty without taking - * mutex, and there's an instant in list_move_tail when info->swaplist - * would appear empty, if it were the only one on shmem_swaplist. - */ - if (shmem_swaplist.next != &info->swaplist) - list_move_tail(&shmem_swaplist, &info->swaplist); + if (frontswap_partial && *fs_pages_to_unuse < PAGEVEC_SIZE) + nr_entries = *fs_pages_to_unuse; - gfp = mapping_gfp_mask(mapping); - if (shmem_should_replace_page(*pagep, gfp)) { - mutex_unlock(&shmem_swaplist_mutex); - error = shmem_replace_page(pagep, gfp, info, index); - mutex_lock(&shmem_swaplist_mutex); - /* - * We needed to drop mutex to make that restrictive page - * allocation, but the inode might have been freed while we - * dropped it: although a racing shmem_evict_inode() cannot - * complete without emptying the page cache, our page lock - * on this swapcache page is not enough to prevent that - - * free_swap_and_cache() of our swap entry will only - * trylock_page(), removing swap from page cache whatever. - * - * We must not proceed to shmem_add_to_page_cache() if the - * inode has been freed, but of course we cannot rely on - * inode or mapping or info to check that. However, we can - * safely check if our swap entry is still in use (and here - * it can't have got reused for another page): if it's still - * in use, then the inode cannot have been freed yet, and we - * can safely proceed (if it's no longer in use, that tells - * nothing about the inode, but we don't need to unuse swap). - */ - if (!page_swapcount(*pagep)) - error = -ENOENT; - } - - /* - * We rely on shmem_swaplist_mutex, not only to protect the swaplist, - * but also to hold up shmem_evict_inode(): so inode cannot be freed - * beneath us (pagelock doesn't help until the page is in pagecache). - */ - if (!error) - error = shmem_add_to_page_cache(*pagep, mapping, index, - radswap, gfp); - if (error != -ENOMEM) { - /* - * Truncation and eviction use free_swap_and_cache(), which - * only does trylock page: if we raced, best clean up here. - */ - delete_from_swap_cache(*pagep); - set_page_dirty(*pagep); - if (!error) { - spin_lock_irq(&info->lock); - info->swapped--; - spin_unlock_irq(&info->lock); - swap_free(swap); + pvec.nr = shmem_find_swap_entries(mapping, start, nr_entries, + pvec.pages, indices, + frontswap); + if (pvec.nr == 0) { + ret = 0; + break; } - } - return error; + + ret = shmem_unuse_swap_entries(inode, pvec, indices); + if (ret < 0) + break; + + if (frontswap_partial) { + *fs_pages_to_unuse -= ret; + if (*fs_pages_to_unuse == 0) { + ret = FRONTSWAP_PAGES_UNUSED; + break; + } + } + + start = indices[pvec.nr - 1]; + } while (true); + + return ret; } /* - * Search through swapped inodes to find and replace swap by page. + * Read all the shared memory data that resides in the swap + * device 'type' back into memory, so the swap device can be + * unused. */ -int shmem_unuse(swp_entry_t swap, struct page *page) +int shmem_unuse(unsigned int type, bool frontswap, + unsigned long *fs_pages_to_unuse) { - struct list_head *this, *next; - struct shmem_inode_info *info; - struct mem_cgroup *memcg; + struct shmem_inode_info *info, *next; + struct inode *inode; + struct inode *prev_inode = NULL; int error = 0; - /* - * There's a faint possibility that swap page was replaced before - * caller locked it: caller will come back later with the right page. - */ - if (unlikely(!PageSwapCache(page) || page_private(page) != swap.val)) - goto out; - - /* - * Charge page using GFP_KERNEL while we can wait, before taking - * the shmem_swaplist_mutex which might hold up shmem_writepage(). - * Charged back to the user (not to caller) when swap account is used. - */ - error = mem_cgroup_try_charge_delay(page, current->mm, GFP_KERNEL, - &memcg, false); - if (error) - goto out; - /* No memory allocation: swap entry occupies the slot for the page */ - error = -EAGAIN; + if (list_empty(&shmem_swaplist)) + return 0; mutex_lock(&shmem_swaplist_mutex); - list_for_each_safe(this, next, &shmem_swaplist) { - info = list_entry(this, struct shmem_inode_info, swaplist); - if (info->swapped) - error = shmem_unuse_inode(info, swap, &page); - else + + /* + * The extra refcount on the inode is necessary to safely dereference + * p->next after re-acquiring the lock. New shmem inodes with swap + * get added to the end of the list and we will scan them all. + */ + list_for_each_entry_safe(info, next, &shmem_swaplist, swaplist) { + if (!info->swapped) { list_del_init(&info->swaplist); + continue; + } + + inode = igrab(&info->vfs_inode); + if (!inode) + continue; + + mutex_unlock(&shmem_swaplist_mutex); + if (prev_inode) + iput(prev_inode); + prev_inode = inode; + + error = shmem_unuse_inode(inode, type, frontswap, + fs_pages_to_unuse); cond_resched(); - if (error != -EAGAIN) + + mutex_lock(&shmem_swaplist_mutex); + next = list_next_entry(info, swaplist); + if (!info->swapped) + list_del_init(&info->swaplist); + if (error) break; - /* found nothing in this: move on to search the next */ } mutex_unlock(&shmem_swaplist_mutex); - if (error) { - if (error != -ENOMEM) - error = 0; - mem_cgroup_cancel_charge(page, memcg, false); - } else - mem_cgroup_commit_charge(page, memcg, true, false); -out: - unlock_page(page); - put_page(page); + if (prev_inode) + iput(prev_inode); + return error; } @@ -1329,7 +1355,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) */ mutex_lock(&shmem_swaplist_mutex); if (list_empty(&info->swaplist)) - list_add_tail(&info->swaplist, &shmem_swaplist); + list_add(&info->swaplist, &shmem_swaplist); if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) { spin_lock_irq(&info->lock); @@ -3886,7 +3912,8 @@ int __init shmem_init(void) return 0; } -int shmem_unuse(swp_entry_t swap, struct page *page) +int shmem_unuse(unsigned int type, bool frontswap, + unsigned long *fs_pages_to_unuse) { return 0; } diff --git a/mm/swapfile.c b/mm/swapfile.c index dbac1d49469d..6de46984d59d 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1799,44 +1799,77 @@ out_nolock: } static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, - unsigned long addr, unsigned long end, - swp_entry_t entry, struct page *page) + unsigned long addr, unsigned long end, + unsigned int type, bool frontswap, + unsigned long *fs_pages_to_unuse) { - pte_t swp_pte = swp_entry_to_pte(entry); + struct page *page; + swp_entry_t entry; pte_t *pte; + struct swap_info_struct *si; + unsigned long offset; int ret = 0; + volatile unsigned char *swap_map; - /* - * We don't actually need pte lock while scanning for swp_pte: since - * we hold page lock and mmap_sem, swp_pte cannot be inserted into the - * page table while we're scanning; though it could get zapped, and on - * some architectures (e.g. x86_32 with PAE) we might catch a glimpse - * of unmatched parts which look like swp_pte, so unuse_pte must - * recheck under pte lock. Scanning without pte lock lets it be - * preemptable whenever CONFIG_PREEMPT but not CONFIG_HIGHPTE. - */ + si = swap_info[type]; pte = pte_offset_map(pmd, addr); do { - /* - * swapoff spends a _lot_ of time in this loop! - * Test inline before going to call unuse_pte. - */ - if (unlikely(pte_same_as_swp(*pte, swp_pte))) { - pte_unmap(pte); - ret = unuse_pte(vma, pmd, addr, entry, page); - if (ret) - goto out; - pte = pte_offset_map(pmd, addr); + struct vm_fault vmf; + + if (!is_swap_pte(*pte)) + continue; + + entry = pte_to_swp_entry(*pte); + if (swp_type(entry) != type) + continue; + + offset = swp_offset(entry); + if (frontswap && !frontswap_test(si, offset)) + continue; + + pte_unmap(pte); + swap_map = &si->swap_map[offset]; + vmf.vma = vma; + vmf.address = addr; + vmf.pmd = pmd; + page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, &vmf); + if (!page) { + if (*swap_map == 0 || *swap_map == SWAP_MAP_BAD) + goto try_next; + return -ENOMEM; } + + lock_page(page); + wait_on_page_writeback(page); + ret = unuse_pte(vma, pmd, addr, entry, page); + if (ret < 0) { + unlock_page(page); + put_page(page); + goto out; + } + + try_to_free_swap(page); + unlock_page(page); + put_page(page); + + if (*fs_pages_to_unuse && !--(*fs_pages_to_unuse)) { + ret = FRONTSWAP_PAGES_UNUSED; + goto out; + } +try_next: + pte = pte_offset_map(pmd, addr); } while (pte++, addr += PAGE_SIZE, addr != end); pte_unmap(pte - 1); + + ret = 0; out: return ret; } static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, unsigned long addr, unsigned long end, - swp_entry_t entry, struct page *page) + unsigned int type, bool frontswap, + unsigned long *fs_pages_to_unuse) { pmd_t *pmd; unsigned long next; @@ -1848,7 +1881,8 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, next = pmd_addr_end(addr, end); if (pmd_none_or_trans_huge_or_clear_bad(pmd)) continue; - ret = unuse_pte_range(vma, pmd, addr, next, entry, page); + ret = unuse_pte_range(vma, pmd, addr, next, type, + frontswap, fs_pages_to_unuse); if (ret) return ret; } while (pmd++, addr = next, addr != end); @@ -1857,7 +1891,8 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, static inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d, unsigned long addr, unsigned long end, - swp_entry_t entry, struct page *page) + unsigned int type, bool frontswap, + unsigned long *fs_pages_to_unuse) { pud_t *pud; unsigned long next; @@ -1868,7 +1903,8 @@ static inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d, next = pud_addr_end(addr, end); if (pud_none_or_clear_bad(pud)) continue; - ret = unuse_pmd_range(vma, pud, addr, next, entry, page); + ret = unuse_pmd_range(vma, pud, addr, next, type, + frontswap, fs_pages_to_unuse); if (ret) return ret; } while (pud++, addr = next, addr != end); @@ -1877,7 +1913,8 @@ static inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d, static inline int unuse_p4d_range(struct vm_area_struct *vma, pgd_t *pgd, unsigned long addr, unsigned long end, - swp_entry_t entry, struct page *page) + unsigned int type, bool frontswap, + unsigned long *fs_pages_to_unuse) { p4d_t *p4d; unsigned long next; @@ -1888,78 +1925,66 @@ static inline int unuse_p4d_range(struct vm_area_struct *vma, pgd_t *pgd, next = p4d_addr_end(addr, end); if (p4d_none_or_clear_bad(p4d)) continue; - ret = unuse_pud_range(vma, p4d, addr, next, entry, page); + ret = unuse_pud_range(vma, p4d, addr, next, type, + frontswap, fs_pages_to_unuse); if (ret) return ret; } while (p4d++, addr = next, addr != end); return 0; } -static int unuse_vma(struct vm_area_struct *vma, - swp_entry_t entry, struct page *page) +static int unuse_vma(struct vm_area_struct *vma, unsigned int type, + bool frontswap, unsigned long *fs_pages_to_unuse) { pgd_t *pgd; unsigned long addr, end, next; int ret; - if (page_anon_vma(page)) { - addr = page_address_in_vma(page, vma); - if (addr == -EFAULT) - return 0; - else - end = addr + PAGE_SIZE; - } else { - addr = vma->vm_start; - end = vma->vm_end; - } + addr = vma->vm_start; + end = vma->vm_end; pgd = pgd_offset(vma->vm_mm, addr); do { next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) continue; - ret = unuse_p4d_range(vma, pgd, addr, next, entry, page); + ret = unuse_p4d_range(vma, pgd, addr, next, type, + frontswap, fs_pages_to_unuse); if (ret) return ret; } while (pgd++, addr = next, addr != end); return 0; } -static int unuse_mm(struct mm_struct *mm, - swp_entry_t entry, struct page *page) +static int unuse_mm(struct mm_struct *mm, unsigned int type, + bool frontswap, unsigned long *fs_pages_to_unuse) { struct vm_area_struct *vma; int ret = 0; - if (!down_read_trylock(&mm->mmap_sem)) { - /* - * Activate page so shrink_inactive_list is unlikely to unmap - * its ptes while lock is dropped, so swapoff can make progress. - */ - activate_page(page); - unlock_page(page); - down_read(&mm->mmap_sem); - lock_page(page); - } + down_read(&mm->mmap_sem); for (vma = mm->mmap; vma; vma = vma->vm_next) { - if (vma->anon_vma && (ret = unuse_vma(vma, entry, page))) - break; + if (vma->anon_vma) { + ret = unuse_vma(vma, type, frontswap, + fs_pages_to_unuse); + if (ret) + break; + } cond_resched(); } up_read(&mm->mmap_sem); - return (ret < 0)? ret: 0; + return ret; } /* * Scan swap_map (or frontswap_map if frontswap parameter is true) - * from current position to next entry still in use. - * Recycle to start on reaching the end, returning 0 when empty. + * from current position to next entry still in use. Return 0 + * if there are no inuse entries after prev till end of the map. */ static unsigned int find_next_to_unuse(struct swap_info_struct *si, unsigned int prev, bool frontswap) { - unsigned int max = si->max; - unsigned int i = prev; + unsigned int i; unsigned char count; /* @@ -1968,20 +1993,7 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, * hits are okay, and sys_swapoff() has already prevented new * allocations from this area (while holding swap_lock). */ - for (;;) { - if (++i >= max) { - if (!prev) { - i = 0; - break; - } - /* - * No entries in use at top of swap_map, - * loop back to start and recheck there. - */ - max = prev + 1; - prev = 0; - i = 1; - } + for (i = prev + 1; i < si->max; i++) { count = READ_ONCE(si->swap_map[i]); if (count && swap_count(count) != SWAP_MAP_BAD) if (!frontswap || frontswap_test(si, i)) @@ -1989,240 +2001,121 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, if ((i % LATENCY_LIMIT) == 0) cond_resched(); } + + if (i == si->max) + i = 0; + return i; } /* - * We completely avoid races by reading each swap page in advance, - * and then search for the process using it. All the necessary - * page table adjustments can then be made atomically. - * - * if the boolean frontswap is true, only unuse pages_to_unuse pages; + * If the boolean frontswap is true, only unuse pages_to_unuse pages; * pages_to_unuse==0 means all pages; ignored if frontswap is false */ +#define SWAP_UNUSE_MAX_TRIES 3 int try_to_unuse(unsigned int type, bool frontswap, unsigned long pages_to_unuse) { + struct mm_struct *prev_mm; + struct mm_struct *mm; + struct list_head *p; + int retval = 0; struct swap_info_struct *si = swap_info[type]; - struct mm_struct *start_mm; - volatile unsigned char *swap_map; /* swap_map is accessed without - * locking. Mark it as volatile - * to prevent compiler doing - * something odd. - */ - unsigned char swcount; struct page *page; swp_entry_t entry; - unsigned int i = 0; - int retval = 0; + unsigned int i; + int retries = 0; - /* - * When searching mms for an entry, a good strategy is to - * start at the first mm we freed the previous entry from - * (though actually we don't notice whether we or coincidence - * freed the entry). Initialize this start_mm with a hold. - * - * A simpler strategy would be to start at the last mm we - * freed the previous entry from; but that would take less - * advantage of mmlist ordering, which clusters forked mms - * together, child after parent. If we race with dup_mmap(), we - * prefer to resolve parent before child, lest we miss entries - * duplicated after we scanned child: using last mm would invert - * that. - */ - start_mm = &init_mm; - mmget(&init_mm); + if (!si->inuse_pages) + return 0; - /* - * Keep on scanning until all entries have gone. Usually, - * one pass through swap_map is enough, but not necessarily: - * there are races when an instance of an entry might be missed. - */ - while ((i = find_next_to_unuse(si, i, frontswap)) != 0) { + if (!frontswap) + pages_to_unuse = 0; + +retry: + retval = shmem_unuse(type, frontswap, &pages_to_unuse); + if (retval) + goto out; + + prev_mm = &init_mm; + mmget(prev_mm); + + spin_lock(&mmlist_lock); + p = &init_mm.mmlist; + while ((p = p->next) != &init_mm.mmlist) { if (signal_pending(current)) { retval = -EINTR; break; } - /* - * Get a page for the entry, using the existing swap - * cache page if there is one. Otherwise, get a clean - * page and read the swap into it. - */ - swap_map = &si->swap_map[i]; - entry = swp_entry(type, i); - page = read_swap_cache_async(entry, - GFP_HIGHUSER_MOVABLE, NULL, 0, false); - if (!page) { - /* - * Either swap_duplicate() failed because entry - * has been freed independently, and will not be - * reused since sys_swapoff() already disabled - * allocation from here, or alloc_page() failed. - */ - swcount = *swap_map; - /* - * We don't hold lock here, so the swap entry could be - * SWAP_MAP_BAD (when the cluster is discarding). - * Instead of fail out, We can just skip the swap - * entry because swapoff will wait for discarding - * finish anyway. - */ - if (!swcount || swcount == SWAP_MAP_BAD) - continue; - retval = -ENOMEM; - break; - } - - /* - * Don't hold on to start_mm if it looks like exiting. - */ - if (atomic_read(&start_mm->mm_users) == 1) { - mmput(start_mm); - start_mm = &init_mm; - mmget(&init_mm); - } - - /* - * Wait for and lock page. When do_swap_page races with - * try_to_unuse, do_swap_page can handle the fault much - * faster than try_to_unuse can locate the entry. This - * apparently redundant "wait_on_page_locked" lets try_to_unuse - * defer to do_swap_page in such a case - in some tests, - * do_swap_page and try_to_unuse repeatedly compete. - */ - wait_on_page_locked(page); - wait_on_page_writeback(page); - lock_page(page); - wait_on_page_writeback(page); - - /* - * Remove all references to entry. - */ - swcount = *swap_map; - if (swap_count(swcount) == SWAP_MAP_SHMEM) { - retval = shmem_unuse(entry, page); - /* page has already been unlocked and released */ - if (retval < 0) - break; + mm = list_entry(p, struct mm_struct, mmlist); + if (!mmget_not_zero(mm)) continue; - } - if (swap_count(swcount) && start_mm != &init_mm) - retval = unuse_mm(start_mm, entry, page); + spin_unlock(&mmlist_lock); + mmput(prev_mm); + prev_mm = mm; + retval = unuse_mm(mm, type, frontswap, &pages_to_unuse); - if (swap_count(*swap_map)) { - int set_start_mm = (*swap_map >= swcount); - struct list_head *p = &start_mm->mmlist; - struct mm_struct *new_start_mm = start_mm; - struct mm_struct *prev_mm = start_mm; - struct mm_struct *mm; - - mmget(new_start_mm); - mmget(prev_mm); - spin_lock(&mmlist_lock); - while (swap_count(*swap_map) && !retval && - (p = p->next) != &start_mm->mmlist) { - mm = list_entry(p, struct mm_struct, mmlist); - if (!mmget_not_zero(mm)) - continue; - spin_unlock(&mmlist_lock); - mmput(prev_mm); - prev_mm = mm; - - cond_resched(); - - swcount = *swap_map; - if (!swap_count(swcount)) /* any usage ? */ - ; - else if (mm == &init_mm) - set_start_mm = 1; - else - retval = unuse_mm(mm, entry, page); - - if (set_start_mm && *swap_map < swcount) { - mmput(new_start_mm); - mmget(mm); - new_start_mm = mm; - set_start_mm = 0; - } - spin_lock(&mmlist_lock); - } - spin_unlock(&mmlist_lock); - mmput(prev_mm); - mmput(start_mm); - start_mm = new_start_mm; - } if (retval) { - unlock_page(page); - put_page(page); - break; + mmput(prev_mm); + goto out; } - /* - * If a reference remains (rare), we would like to leave - * the page in the swap cache; but try_to_unmap could - * then re-duplicate the entry once we drop page lock, - * so we might loop indefinitely; also, that page could - * not be swapped out to other storage meanwhile. So: - * delete from cache even if there's another reference, - * after ensuring that the data has been saved to disk - - * since if the reference remains (rarer), it will be - * read from disk into another page. Splitting into two - * pages would be incorrect if swap supported "shared - * private" pages, but they are handled by tmpfs files. - * - * Given how unuse_vma() targets one particular offset - * in an anon_vma, once the anon_vma has been determined, - * this splitting happens to be just what is needed to - * handle where KSM pages have been swapped out: re-reading - * is unnecessarily slow, but we can fix that later on. - */ - if (swap_count(*swap_map) && - PageDirty(page) && PageSwapCache(page)) { - struct writeback_control wbc = { - .sync_mode = WB_SYNC_NONE, - }; - - swap_writepage(compound_head(page), &wbc); - lock_page(page); - wait_on_page_writeback(page); - } - - /* - * It is conceivable that a racing task removed this page from - * swap cache just before we acquired the page lock at the top, - * or while we dropped it in unuse_mm(). The page might even - * be back in swap cache on another swap area: that we must not - * delete, since it may not have been written out to swap yet. - */ - if (PageSwapCache(page) && - likely(page_private(page) == entry.val) && - (!PageTransCompound(page) || - !swap_page_trans_huge_swapped(si, entry))) - delete_from_swap_cache(compound_head(page)); - - /* - * So we could skip searching mms once swap count went - * to 1, we did not mark any present ptes as dirty: must - * mark page dirty so shrink_page_list will preserve it. - */ - SetPageDirty(page); - unlock_page(page); - put_page(page); - /* * Make sure that we aren't completely killing * interactive performance. */ cond_resched(); - if (frontswap && pages_to_unuse > 0) { - if (!--pages_to_unuse) - break; - } + spin_lock(&mmlist_lock); + } + spin_unlock(&mmlist_lock); + + mmput(prev_mm); + + i = 0; + while ((i = find_next_to_unuse(si, i, frontswap)) != 0) { + + entry = swp_entry(type, i); + page = find_get_page(swap_address_space(entry), i); + if (!page) + continue; + + /* + * It is conceivable that a racing task removed this page from + * swap cache just before we acquired the page lock. The page + * might even be back in swap cache on another swap area. But + * that is okay, try_to_free_swap() only removes stale pages. + */ + lock_page(page); + wait_on_page_writeback(page); + try_to_free_swap(page); + unlock_page(page); + put_page(page); + + /* + * For frontswap, we just need to unuse pages_to_unuse, if + * it was specified. Need not check frontswap again here as + * we already zeroed out pages_to_unuse if not frontswap. + */ + if (pages_to_unuse && --pages_to_unuse == 0) + goto out; } - mmput(start_mm); - return retval; + /* + * Lets check again to see if there are still swap entries in the map. + * If yes, we would need to do retry the unuse logic again. + * Under global memory pressure, swap entries can be reinserted back + * into process space after the mmlist loop above passes over them. + * Its not worth continuosuly retrying to unuse the swap in this case. + * So we try SWAP_UNUSE_MAX_TRIES times. + */ + if (++retries >= SWAP_UNUSE_MAX_TRIES) + retval = -EBUSY; + else if (si->inuse_pages) + goto retry; + +out: + return (retval == FRONTSWAP_PAGES_UNUSED) ? 0 : retval; } /* From 750b317f853899e2e0ed786c117935501c742dc8 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 5 Mar 2019 15:47:07 -0800 Subject: [PATCH 097/159] agp: efficeon: no need to set PG_reserved on GATT tables Patch series "mm: PG_reserved cleanups and documentation", v2. I was recently going over all users of PG_reserved. Short story: it is difficult and sometimes not really clear if setting/checking for PG_reserved is only a relict from the past. Easy to break things. I guess I now have a pretty good idea wh things are like that nowadays and how they evolved. I had way more cleanups in this series inititally, but some architectures take PG_reserved as a way to apply a different caching strategy (for MMIO pages). So I decided to only include the most obvious changes (that are less likely to break something). So the big chunk of manual SetPageReserved users are MMIO/DMA related things on device buffers. Most notably, for device memory we will hopefully soon stop setting PG_reserved. Then the documentation has to be updated. This patch (of 9): The l1 GATT page table is kept in a special on-chip page with 64 entries. We allocate the l2 page table pages via get_zeroed_page() and enter them into the table. These l2 pages are modified accordingly when inserting/removing memory via efficeon_insert_memory and efficeon_remove_memory. Apart from that, these pages are not exposed or ioremap'ed. We can stop setting them reserved (propably copied from generic code). Link: http://lkml.kernel.org/r/20190114125903.24845-2-david@redhat.com Signed-off-by: David Hildenbrand Cc: David Airlie Cc: Arnd Bergmann Cc: Greg Kroah-Hartman Cc: Michal Hocko Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/char/agp/efficeon-agp.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/char/agp/efficeon-agp.c b/drivers/char/agp/efficeon-agp.c index 7f88490b5479..c53f0f9ef5b0 100644 --- a/drivers/char/agp/efficeon-agp.c +++ b/drivers/char/agp/efficeon-agp.c @@ -163,7 +163,6 @@ static int efficeon_free_gatt_table(struct agp_bridge_data *bridge) unsigned long page = efficeon_private.l1_table[index]; if (page) { efficeon_private.l1_table[index] = 0; - ClearPageReserved(virt_to_page((char *)page)); free_page(page); freed++; } @@ -219,7 +218,6 @@ static int efficeon_create_gatt_table(struct agp_bridge_data *bridge) efficeon_free_gatt_table(agp_bridge); return -ENOMEM; } - SetPageReserved(virt_to_page((char *)page)); for (offset = 0; offset < PAGE_SIZE; offset += clflush_chunk) clflush((char *)page+offset); From 446d29645b7d2411a502885fc1cbd1746bcf80be Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 5 Mar 2019 15:47:10 -0800 Subject: [PATCH 098/159] s390/vdso: don't clear PG_reserved The VDSO is part of the kernel image and therefore the struct pages are marked as reserved during boot. As we install a special mapping, the actual struct pages will never be exposed to MM via the page tables. We can therefore leave the pages marked as reserved. Link: http://lkml.kernel.org/r/20190114125903.24845-3-david@redhat.com Signed-off-by: David Hildenbrand Suggested-by: Martin Schwidefsky Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Matthew Wilcox Cc: Mike Rapoport Cc: Michal Hocko Cc: Vasily Gorbik Cc: Kees Cook Cc: Souptick Joarder Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/s390/kernel/vdso.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/s390/kernel/vdso.c b/arch/s390/kernel/vdso.c index 4ff354887db4..e7920a68a12e 100644 --- a/arch/s390/kernel/vdso.c +++ b/arch/s390/kernel/vdso.c @@ -291,7 +291,6 @@ static int __init vdso_init(void) BUG_ON(vdso32_pagelist == NULL); for (i = 0; i < vdso32_pages - 1; i++) { struct page *pg = virt_to_page(vdso32_kbase + i*PAGE_SIZE); - ClearPageReserved(pg); get_page(pg); vdso32_pagelist[i] = pg; } @@ -309,7 +308,6 @@ static int __init vdso_init(void) BUG_ON(vdso64_pagelist == NULL); for (i = 0; i < vdso64_pages - 1; i++) { struct page *pg = virt_to_page(vdso64_kbase + i*PAGE_SIZE); - ClearPageReserved(pg); get_page(pg); vdso64_pagelist[i] = pg; } From f55b74170b6aabc79af8c813b5068d3014e68ef1 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 5 Mar 2019 15:47:14 -0800 Subject: [PATCH 099/159] powerpc/vdso: don't clear PG_reserved The VDSO is part of the kernel image and therefore the struct pages are marked as reserved during boot. As we install a special mapping, the actual struct pages will never be exposed to MM via the page tables. We can therefore leave the pages marked as reserved. Link: http://lkml.kernel.org/r/20190114125903.24845-4-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Michael Ellerman [powerpc] Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: Christophe Leroy Cc: Kees Cook Cc: Michal Hocko Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/kernel/vdso.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c index 7725a9714736..a31b6234fcd7 100644 --- a/arch/powerpc/kernel/vdso.c +++ b/arch/powerpc/kernel/vdso.c @@ -798,7 +798,6 @@ static int __init vdso_init(void) BUG_ON(vdso32_pagelist == NULL); for (i = 0; i < vdso32_pages; i++) { struct page *pg = virt_to_page(vdso32_kbase + i*PAGE_SIZE); - ClearPageReserved(pg); get_page(pg); vdso32_pagelist[i] = pg; } @@ -812,7 +811,6 @@ static int __init vdso_init(void) BUG_ON(vdso64_pagelist == NULL); for (i = 0; i < vdso64_pages; i++) { struct page *pg = virt_to_page(vdso64_kbase + i*PAGE_SIZE); - ClearPageReserved(pg); get_page(pg); vdso64_pagelist[i] = pg; } From 795c230604cb78ee927ca3904ec299b777b5f6c9 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 5 Mar 2019 15:47:18 -0800 Subject: [PATCH 100/159] riscv/vdso: don't clear PG_reserved The VDSO is part of the kernel image and therefore the struct pages are marked as reserved during boot. As we install a special mapping, the actual struct pages will never be exposed to MM via the page tables. We can therefore leave the pages marked as reserved. Link: http://lkml.kernel.org/r/20190114125903.24845-5-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Palmer Dabbelt Reviewed-by: Christoph Hellwig Cc: Palmer Dabbelt Cc: Albert Ou Cc: Tobias Klauser Cc: Michal Hocko Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/riscv/kernel/vdso.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/riscv/kernel/vdso.c b/arch/riscv/kernel/vdso.c index 582cb153eb24..0cd044122234 100644 --- a/arch/riscv/kernel/vdso.c +++ b/arch/riscv/kernel/vdso.c @@ -54,7 +54,6 @@ static int __init vdso_init(void) struct page *pg; pg = virt_to_page(vdso_start + (i << PAGE_SHIFT)); - ClearPageReserved(pg); vdso_pagelist[i] = pg; } vdso_pagelist[i] = virt_to_page(vdso_data); From 5ffb90b39334c857ce365cb48fbc7486fb817b45 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 5 Mar 2019 15:47:21 -0800 Subject: [PATCH 101/159] m68k/mm: use __ClearPageReserved() The PG_reserved flag is cleared from memory that is part of the kernel image (and therefore marked as PG_reserved). Avoid using PG_reserved directly. Link: http://lkml.kernel.org/r/20190114125903.24845-6-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Geert Uytterhoeven Cc: Michal Hocko Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/m68k/mm/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/m68k/mm/memory.c b/arch/m68k/mm/memory.c index b86a2e21693b..227c04fe60d2 100644 --- a/arch/m68k/mm/memory.c +++ b/arch/m68k/mm/memory.c @@ -51,7 +51,7 @@ void __init init_pointer_table(unsigned long ptable) pr_debug("init_pointer_table: %lx, %x\n", ptable, PD_MARKBITS(dp)); /* unreserve the page so it's possible to free that page */ - PD_PAGE(dp)->flags &= ~(1 << PG_reserved); + __ClearPageReserved(PD_PAGE(dp)); init_page_count(PD_PAGE(dp)); return; From aee494424414aa6f511bb837624557e9d3b84823 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 5 Mar 2019 15:47:25 -0800 Subject: [PATCH 102/159] arm64: kexec: no need to ClearPageReserved() This will be done by free_reserved_page(). Link: http://lkml.kernel.org/r/20190114125903.24845-7-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: James Morse Reviewed-by: Bhupesh Sharma Cc: Catalin Marinas Cc: Will Deacon Cc: James Morse Cc: Marc Zyngier Cc: Dave Kleikamp Cc: Mark Rutland Cc: Michal Hocko Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/kernel/machine_kexec.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/arm64/kernel/machine_kexec.c b/arch/arm64/kernel/machine_kexec.c index aa9c94113700..6f0587b5e941 100644 --- a/arch/arm64/kernel/machine_kexec.c +++ b/arch/arm64/kernel/machine_kexec.c @@ -361,7 +361,6 @@ void crash_free_reserved_phys_range(unsigned long begin, unsigned long end) for (addr = begin; addr < end; addr += PAGE_SIZE) { page = phys_to_page(addr); - ClearPageReserved(page); free_reserved_page(page); } } From d9fa9d951779eb8110879f796434876a58321ae9 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 5 Mar 2019 15:47:28 -0800 Subject: [PATCH 103/159] arm64: kdump: no need to mark crashkernel pages manually PG_reserved The crashkernel is reserved via memblock_reserve(). memblock_free_all() will call free_low_memory_core_early(), which will go over all reserved memblocks, marking the pages as PG_reserved. So manually marking pages as PG_reserved is not necessary, they are already in the desired state (otherwise they would have been handed over to the buddy as free pages and bad things would happen). Link: http://lkml.kernel.org/r/20190114125903.24845-8-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Matthias Brugger Reviewed-by: Bhupesh Sharma Cc: Catalin Marinas Cc: Will Deacon Cc: James Morse Cc: David Hildenbrand Cc: Mark Rutland Cc: Dave Kleikamp Cc: Mike Rapoport Cc: Michal Hocko Cc: Florian Fainelli Cc: Stefan Agner Cc: Laura Abbott Cc: Greg Hackmann Cc: Johannes Weiner Cc: Kristina Martsenko Cc: CHANDAN VN Cc: AKASHI Takahiro Cc: Logan Gunthorpe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/kernel/machine_kexec.c | 2 +- arch/arm64/mm/init.c | 27 --------------------------- 2 files changed, 1 insertion(+), 28 deletions(-) diff --git a/arch/arm64/kernel/machine_kexec.c b/arch/arm64/kernel/machine_kexec.c index 6f0587b5e941..66b5d697d943 100644 --- a/arch/arm64/kernel/machine_kexec.c +++ b/arch/arm64/kernel/machine_kexec.c @@ -321,7 +321,7 @@ void crash_post_resume(void) * but does not hold any data of loaded kernel image. * * Note that all the pages in crash dump kernel memory have been initially - * marked as Reserved in kexec_reserve_crashkres_pages(). + * marked as Reserved as memory was allocated via memblock_reserve(). * * In hibernation, the pages which are Reserved and yet "nosave" are excluded * from the hibernation iamge. crash_is_nosave() does thich check for crash diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 7205a9085b4d..c38976b70069 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -118,35 +118,10 @@ static void __init reserve_crashkernel(void) crashk_res.start = crash_base; crashk_res.end = crash_base + crash_size - 1; } - -static void __init kexec_reserve_crashkres_pages(void) -{ -#ifdef CONFIG_HIBERNATION - phys_addr_t addr; - struct page *page; - - if (!crashk_res.end) - return; - - /* - * To reduce the size of hibernation image, all the pages are - * marked as Reserved initially. - */ - for (addr = crashk_res.start; addr < (crashk_res.end + 1); - addr += PAGE_SIZE) { - page = phys_to_page(addr); - SetPageReserved(page); - } -#endif -} #else static void __init reserve_crashkernel(void) { } - -static void __init kexec_reserve_crashkres_pages(void) -{ -} #endif /* CONFIG_KEXEC_CORE */ #ifdef CONFIG_CRASH_DUMP @@ -586,8 +561,6 @@ void __init mem_init(void) /* this will put all unused low memory onto the freelists */ memblock_free_all(); - kexec_reserve_crashkres_pages(); - mem_init_print_info(NULL); /* From 731351d1bd3211101b4de8975540e273bcc99838 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 5 Mar 2019 15:47:32 -0800 Subject: [PATCH 104/159] ia64: perfmon: don't mark buffer pages as PG_reserved In the old days, remap_pfn_range() required pages to be marked as PG_reserved, so they would e.g. never get swapped out. This was required for special mappings. Nowadays, this is fully handled via the VMA (VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP inside remap_pfn_range() to be precise). PG_reserved is no longer required but only a relic from the past. So only architecture specific MM handling might require it (e.g. to detect them as MMIO pages). As there are no architecture specific checks for PageReserved() apart from MCA handling in ia64code, this can go. Use simple vzalloc()/vfree() instead. Note that before calling vzalloc(), size has already been aligned to PAGE_SIZE, no need to align again. Link: http://lkml.kernel.org/r/20190114125903.24845-9-david@redhat.com Signed-off-by: David Hildenbrand Cc: Tony Luck Cc: Fenghua Yu Cc: Oleg Nesterov Cc: David Hildenbrand Cc: David Howells Cc: Mike Rapoport Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/kernel/perfmon.c | 59 +++----------------------------------- 1 file changed, 4 insertions(+), 55 deletions(-) diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c index 46bff1661836..7a969f4c3534 100644 --- a/arch/ia64/kernel/perfmon.c +++ b/arch/ia64/kernel/perfmon.c @@ -583,17 +583,6 @@ pfm_put_task(struct task_struct *task) if (task != current) put_task_struct(task); } -static inline void -pfm_reserve_page(unsigned long a) -{ - SetPageReserved(vmalloc_to_page((void *)a)); -} -static inline void -pfm_unreserve_page(unsigned long a) -{ - ClearPageReserved(vmalloc_to_page((void*)a)); -} - static inline unsigned long pfm_protect_ctx_ctxsw(pfm_context_t *x) { @@ -816,44 +805,6 @@ pfm_reset_msgq(pfm_context_t *ctx) DPRINT(("ctx=%p msgq reset\n", ctx)); } -static void * -pfm_rvmalloc(unsigned long size) -{ - void *mem; - unsigned long addr; - - size = PAGE_ALIGN(size); - mem = vzalloc(size); - if (mem) { - //printk("perfmon: CPU%d pfm_rvmalloc(%ld)=%p\n", smp_processor_id(), size, mem); - addr = (unsigned long)mem; - while (size > 0) { - pfm_reserve_page(addr); - addr+=PAGE_SIZE; - size-=PAGE_SIZE; - } - } - return mem; -} - -static void -pfm_rvfree(void *mem, unsigned long size) -{ - unsigned long addr; - - if (mem) { - DPRINT(("freeing physical buffer @%p size=%lu\n", mem, size)); - addr = (unsigned long) mem; - while ((long) size > 0) { - pfm_unreserve_page(addr); - addr+=PAGE_SIZE; - size-=PAGE_SIZE; - } - vfree(mem); - } - return; -} - static pfm_context_t * pfm_context_alloc(int ctx_flags) { @@ -1498,7 +1449,7 @@ pfm_free_smpl_buffer(pfm_context_t *ctx) /* * free the buffer */ - pfm_rvfree(ctx->ctx_smpl_hdr, ctx->ctx_smpl_size); + vfree(ctx->ctx_smpl_hdr); ctx->ctx_smpl_hdr = NULL; ctx->ctx_smpl_size = 0UL; @@ -2137,7 +2088,7 @@ doit: * All memory free operations (especially for vmalloc'ed memory) * MUST be done with interrupts ENABLED. */ - if (smpl_buf_addr) pfm_rvfree(smpl_buf_addr, smpl_buf_size); + vfree(smpl_buf_addr); /* * return the memory used by the context @@ -2266,10 +2217,8 @@ pfm_smpl_buffer_alloc(struct task_struct *task, struct file *filp, pfm_context_t /* * We do the easy to undo allocations first. - * - * pfm_rvmalloc(), clears the buffer, so there is no leak */ - smpl_buf = pfm_rvmalloc(size); + smpl_buf = vzalloc(size); if (smpl_buf == NULL) { DPRINT(("Can't allocate sampling buffer\n")); return -ENOMEM; @@ -2346,7 +2295,7 @@ pfm_smpl_buffer_alloc(struct task_struct *task, struct file *filp, pfm_context_t error: vm_area_free(vma); error_kmem: - pfm_rvfree(smpl_buf, size); + vfree(smpl_buf); return -ENOMEM; } From 6e2e07cd35f6f72d1950453b170f6bfb6c668c46 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 5 Mar 2019 15:47:36 -0800 Subject: [PATCH 105/159] mm: better document PG_reserved The usage of PG_reserved and how PG_reserved pages are to be treated is buried deep down in different parts of the kernel. Let's shine some light onto these details by documenting current users and expected behavior. Especially, clarify on the "Some of them might not even exist" case. These are physical memory gaps that will never be dumped as they are not marked as IORESOURCE_SYSRAM. PG_reserved does in general not hinder anybody from dumping or swapping. In some cases, these pages will not be stored in the hibernation image. Link: http://lkml.kernel.org/r/20190114125903.24845-10-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Andrew Morton Cc: Stephen Rothwell Cc: Pavel Tatashin Cc: Michal Hocko Cc: Alexander Duyck Cc: Matthew Wilcox Cc: Anthony Yznaga Cc: Miles Chen Cc: Cc: Dan Williams Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 33 +++++++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 808b4183e30d..9f8712a4b1a5 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -17,8 +17,37 @@ /* * Various page->flags bits: * - * PG_reserved is set for special pages, which can never be swapped out. Some - * of them might not even exist... + * PG_reserved is set for special pages. The "struct page" of such a page + * should in general not be touched (e.g. set dirty) except by its owner. + * Pages marked as PG_reserved include: + * - Pages part of the kernel image (including vDSO) and similar (e.g. BIOS, + * initrd, HW tables) + * - Pages reserved or allocated early during boot (before the page allocator + * was initialized). This includes (depending on the architecture) the + * initial vmemmap, initial page tables, crashkernel, elfcorehdr, and much + * much more. Once (if ever) freed, PG_reserved is cleared and they will + * be given to the page allocator. + * - Pages falling into physical memory gaps - not IORESOURCE_SYSRAM. Trying + * to read/write these pages might end badly. Don't touch! + * - The zero page(s) + * - Pages not added to the page allocator when onlining a section because + * they were excluded via the online_page_callback() or because they are + * PG_hwpoison. + * - Pages allocated in the context of kexec/kdump (loaded kernel image, + * control pages, vmcoreinfo) + * - MMIO/DMA pages. Some architectures don't allow to ioremap pages that are + * not marked PG_reserved (as they might be in use by somebody else who does + * not respect the caching strategy). + * - Pages part of an offline section (struct pages of offline sections should + * not be trusted as they will be initialized when first onlined). + * - MCA pages on ia64 + * - Pages holding CPU notes for POWER Firmware Assisted Dump + * - Device memory (e.g. PMEM, DAX, HMM) + * Some PG_reserved pages will be excluded from the hibernation image. + * PG_reserved does in general not hinder anybody from dumping or swapping + * and is no longer required for remap_pfn_range(). ioremap might require it. + * Consequently, PG_reserved for a page mapped into user space can indicate + * the zero page, the vDSO, MMIO pages or device memory. * * The PG_private bitflag is set on pagecache pages if they contain filesystem * specific data (which is normally at page->private). It can be used by From d7fefcc8de9147cc37d0c00df12e7ea4f77999b5 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 5 Mar 2019 15:47:40 -0800 Subject: [PATCH 106/159] mm/cma: add PF flag to force non cma alloc Patch series "mm/kvm/vfio/ppc64: Migrate compound pages out of CMA region", v8. ppc64 uses the CMA area for the allocation of guest page table (hash page table). We won't be able to start guest if we fail to allocate hash page table. We have observed hash table allocation failure because we failed to migrate pages out of CMA region because they were pinned. This happen when we are using VFIO. VFIO on ppc64 pins the entire guest RAM. If the guest RAM pages get allocated out of CMA region, we won't be able to migrate those pages. The pages are also pinned for the lifetime of the guest. Currently we support migration of non-compound pages. With THP and with the addition of hugetlb migration we can end up allocating compound pages from CMA region. This patch series add support for migrating compound pages. This patch (of 4): Add PF_MEMALLOC_NOCMA which make sure any allocation in that context is marked non-movable and hence cannot be satisfied by CMA region. This is useful with get_user_pages_longterm where we want to take a page pin by migrating pages from CMA region. Marking the section PF_MEMALLOC_NOCMA ensures that we avoid unnecessary page migration later. Link: http://lkml.kernel.org/r/20190114095438.32470-2-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Suggested-by: Andrea Arcangeli Reviewed-by: Andrea Arcangeli Cc: Michal Hocko Cc: Alexey Kardashevskiy Cc: David Gibson Cc: Michael Ellerman Cc: Mel Gorman Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 1 + include/linux/sched/mm.h | 48 +++++++++++++++++++++++++++++++++------- 2 files changed, 41 insertions(+), 8 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index ebfb34fb9b30..36ec6e7e8291 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1407,6 +1407,7 @@ extern struct pid *cad_pid; #define PF_UMH 0x02000000 /* I'm an Usermodehelper process */ #define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_allowed */ #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ +#define PF_MEMALLOC_NOCMA 0x10000000 /* All allocation request will have _GFP_MOVABLE cleared */ #define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */ #define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezable */ #define PF_SUSPEND_TASK 0x80000000 /* This thread called freeze_processes() and should not be frozen */ diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 3bfa6a0cbba4..0cd9f10423fb 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -148,17 +148,25 @@ static inline bool in_vfork(struct task_struct *tsk) * Applies per-task gfp context to the given allocation flags. * PF_MEMALLOC_NOIO implies GFP_NOIO * PF_MEMALLOC_NOFS implies GFP_NOFS + * PF_MEMALLOC_NOCMA implies no allocation from CMA region. */ static inline gfp_t current_gfp_context(gfp_t flags) { - /* - * NOIO implies both NOIO and NOFS and it is a weaker context - * so always make sure it makes precedence - */ - if (unlikely(current->flags & PF_MEMALLOC_NOIO)) - flags &= ~(__GFP_IO | __GFP_FS); - else if (unlikely(current->flags & PF_MEMALLOC_NOFS)) - flags &= ~__GFP_FS; + if (unlikely(current->flags & + (PF_MEMALLOC_NOIO | PF_MEMALLOC_NOFS | PF_MEMALLOC_NOCMA))) { + /* + * NOIO implies both NOIO and NOFS and it is a weaker context + * so always make sure it makes precedence + */ + if (current->flags & PF_MEMALLOC_NOIO) + flags &= ~(__GFP_IO | __GFP_FS); + else if (current->flags & PF_MEMALLOC_NOFS) + flags &= ~__GFP_FS; +#ifdef CONFIG_CMA + if (current->flags & PF_MEMALLOC_NOCMA) + flags &= ~__GFP_MOVABLE; +#endif + } return flags; } @@ -248,6 +256,30 @@ static inline void memalloc_noreclaim_restore(unsigned int flags) current->flags = (current->flags & ~PF_MEMALLOC) | flags; } +#ifdef CONFIG_CMA +static inline unsigned int memalloc_nocma_save(void) +{ + unsigned int flags = current->flags & PF_MEMALLOC_NOCMA; + + current->flags |= PF_MEMALLOC_NOCMA; + return flags; +} + +static inline void memalloc_nocma_restore(unsigned int flags) +{ + current->flags = (current->flags & ~PF_MEMALLOC_NOCMA) | flags; +} +#else +static inline unsigned int memalloc_nocma_save(void) +{ + return 0; +} + +static inline void memalloc_nocma_restore(unsigned int flags) +{ +} +#endif + #ifdef CONFIG_MEMCG /** * memalloc_use_memcg - Starts the remote memcg charging scope. From 9a4e9f3b2d7393d50256762c21e7466b4b6b1c9c Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 5 Mar 2019 15:47:44 -0800 Subject: [PATCH 107/159] mm: update get_user_pages_longterm to migrate pages allocated from CMA region This patch updates get_user_pages_longterm to migrate pages allocated out of CMA region. This makes sure that we don't keep non-movable pages (due to page reference count) in the CMA area. This will be used by ppc64 in a later patch to avoid pinning pages in the CMA region. ppc64 uses CMA region for allocation of the hardware page table (hash page table) and not able to migrate pages out of CMA region results in page table allocation failures. One case where we hit this easy is when a guest using a VFIO passthrough device. VFIO locks all the guest's memory and if the guest memory is backed by CMA region, it becomes unmovable resulting in fragmenting the CMA and possibly preventing other guests from allocation a large enough hash page table. NOTE: We allocate the new page without using __GFP_THISNODE Link: http://lkml.kernel.org/r/20190114095438.32470-3-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Cc: Alexey Kardashevskiy Cc: Andrea Arcangeli Cc: David Gibson Cc: Michael Ellerman Cc: Michal Hocko Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 2 + include/linux/mm.h | 3 +- mm/gup.c | 202 +++++++++++++++++++++++++++++++++++----- mm/hugetlb.c | 4 +- 4 files changed, 183 insertions(+), 28 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 54c317c8355f..ea35263eb76b 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -371,6 +371,8 @@ struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid, nodemask_t *nmask); struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma, unsigned long address); +struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask, + int nid, nodemask_t *nmask); int huge_add_to_page_cache(struct page *page, struct address_space *mapping, pgoff_t idx); diff --git a/include/linux/mm.h b/include/linux/mm.h index 80bb6408fe73..20ec56f8e2bb 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1536,7 +1536,8 @@ long get_user_pages_locked(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, int *locked); long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, struct page **pages, unsigned int gup_flags); -#ifdef CONFIG_FS_DAX + +#if defined(CONFIG_FS_DAX) || defined(CONFIG_CMA) long get_user_pages_longterm(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas); diff --git a/mm/gup.c b/mm/gup.c index 75029649baca..22291db50013 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -13,6 +13,9 @@ #include #include #include +#include +#include +#include #include #include @@ -1126,7 +1129,167 @@ long get_user_pages(unsigned long start, unsigned long nr_pages, } EXPORT_SYMBOL(get_user_pages); +#if defined(CONFIG_FS_DAX) || defined (CONFIG_CMA) + #ifdef CONFIG_FS_DAX +static bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages) +{ + long i; + struct vm_area_struct *vma_prev = NULL; + + for (i = 0; i < nr_pages; i++) { + struct vm_area_struct *vma = vmas[i]; + + if (vma == vma_prev) + continue; + + vma_prev = vma; + + if (vma_is_fsdax(vma)) + return true; + } + return false; +} +#else +static inline bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages) +{ + return false; +} +#endif + +#ifdef CONFIG_CMA +static struct page *new_non_cma_page(struct page *page, unsigned long private) +{ + /* + * We want to make sure we allocate the new page from the same node + * as the source page. + */ + int nid = page_to_nid(page); + /* + * Trying to allocate a page for migration. Ignore allocation + * failure warnings. We don't force __GFP_THISNODE here because + * this node here is the node where we have CMA reservation and + * in some case these nodes will have really less non movable + * allocation memory. + */ + gfp_t gfp_mask = GFP_USER | __GFP_NOWARN; + + if (PageHighMem(page)) + gfp_mask |= __GFP_HIGHMEM; + +#ifdef CONFIG_HUGETLB_PAGE + if (PageHuge(page)) { + struct hstate *h = page_hstate(page); + /* + * We don't want to dequeue from the pool because pool pages will + * mostly be from the CMA region. + */ + return alloc_migrate_huge_page(h, gfp_mask, nid, NULL); + } +#endif + if (PageTransHuge(page)) { + struct page *thp; + /* + * ignore allocation failure warnings + */ + gfp_t thp_gfpmask = GFP_TRANSHUGE | __GFP_NOWARN; + + /* + * Remove the movable mask so that we don't allocate from + * CMA area again. + */ + thp_gfpmask &= ~__GFP_MOVABLE; + thp = __alloc_pages_node(nid, thp_gfpmask, HPAGE_PMD_ORDER); + if (!thp) + return NULL; + prep_transhuge_page(thp); + return thp; + } + + return __alloc_pages_node(nid, gfp_mask, 0); +} + +static long check_and_migrate_cma_pages(unsigned long start, long nr_pages, + unsigned int gup_flags, + struct page **pages, + struct vm_area_struct **vmas) +{ + long i; + bool drain_allow = true; + bool migrate_allow = true; + LIST_HEAD(cma_page_list); + +check_again: + for (i = 0; i < nr_pages; i++) { + /* + * If we get a page from the CMA zone, since we are going to + * be pinning these entries, we might as well move them out + * of the CMA zone if possible. + */ + if (is_migrate_cma_page(pages[i])) { + + struct page *head = compound_head(pages[i]); + + if (PageHuge(head)) { + isolate_huge_page(head, &cma_page_list); + } else { + if (!PageLRU(head) && drain_allow) { + lru_add_drain_all(); + drain_allow = false; + } + + if (!isolate_lru_page(head)) { + list_add_tail(&head->lru, &cma_page_list); + mod_node_page_state(page_pgdat(head), + NR_ISOLATED_ANON + + page_is_file_cache(head), + hpage_nr_pages(head)); + } + } + } + } + + if (!list_empty(&cma_page_list)) { + /* + * drop the above get_user_pages reference. + */ + for (i = 0; i < nr_pages; i++) + put_page(pages[i]); + + if (migrate_pages(&cma_page_list, new_non_cma_page, + NULL, 0, MIGRATE_SYNC, MR_CONTIG_RANGE)) { + /* + * some of the pages failed migration. Do get_user_pages + * without migration. + */ + migrate_allow = false; + + if (!list_empty(&cma_page_list)) + putback_movable_pages(&cma_page_list); + } + /* + * We did migrate all the pages, Try to get the page references again + * migrating any new CMA pages which we failed to isolate earlier. + */ + nr_pages = get_user_pages(start, nr_pages, gup_flags, pages, vmas); + if ((nr_pages > 0) && migrate_allow) { + drain_allow = true; + goto check_again; + } + } + + return nr_pages; +} +#else +static inline long check_and_migrate_cma_pages(unsigned long start, long nr_pages, + unsigned int gup_flags, + struct page **pages, + struct vm_area_struct **vmas) +{ + return nr_pages; +} +#endif + /* * This is the same as get_user_pages() in that it assumes we are * operating on the current task's mm, but it goes further to validate @@ -1140,11 +1303,11 @@ EXPORT_SYMBOL(get_user_pages); * Contrast this to iov_iter_get_pages() usages which are transient. */ long get_user_pages_longterm(unsigned long start, unsigned long nr_pages, - unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas_arg) + unsigned int gup_flags, struct page **pages, + struct vm_area_struct **vmas_arg) { struct vm_area_struct **vmas = vmas_arg; - struct vm_area_struct *vma_prev = NULL; + unsigned long flags; long rc, i; if (!pages) @@ -1157,31 +1320,20 @@ long get_user_pages_longterm(unsigned long start, unsigned long nr_pages, return -ENOMEM; } + flags = memalloc_nocma_save(); rc = get_user_pages(start, nr_pages, gup_flags, pages, vmas); - - for (i = 0; i < rc; i++) { - struct vm_area_struct *vma = vmas[i]; - - if (vma == vma_prev) - continue; - - vma_prev = vma; - - if (vma_is_fsdax(vma)) - break; - } - - /* - * Either get_user_pages() failed, or the vma validation - * succeeded, in either case we don't need to put_page() before - * returning. - */ - if (i >= rc) + memalloc_nocma_restore(flags); + if (rc < 0) goto out; - for (i = 0; i < rc; i++) - put_page(pages[i]); - rc = -EOPNOTSUPP; + if (check_dax_vmas(vmas, rc)) { + for (i = 0; i < rc; i++) + put_page(pages[i]); + rc = -EOPNOTSUPP; + goto out; + } + + rc = check_and_migrate_cma_pages(start, rc, gup_flags, pages, vmas); out: if (vmas != vmas_arg) kfree(vmas); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 0c7848fccf93..97b1e0290c66 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1587,8 +1587,8 @@ out_unlock: return page; } -static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask, - int nid, nodemask_t *nmask) +struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask, + int nid, nodemask_t *nmask) { struct page *page; From 678e174c4c16a940ecfd94e52b7bad73062507f0 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 5 Mar 2019 15:47:47 -0800 Subject: [PATCH 108/159] powerpc/mm/iommu: allow migration of cma allocated pages during mm_iommu_do_alloc The current code doesn't do page migration if the page allocated is a compound page. With HugeTLB migration support, we can end up allocating hugetlb pages from CMA region. Also, THP pages can be allocated from CMA region. This patch updates the code to handle compound pages correctly. The patch also switches to a single get_user_pages with the right count, instead of doing one get_user_pages per page. That avoids reading page table multiple times. This is done by using get_user_pages_longterm, because that also takes care of DAX backed pages. DAX pages lifetime is dictated by file system rules and as such, we need to make sure that we free these pages on operations like truncate and punch hole. If we have long term pin on these pages, which are mostly return to userspace with elevated page count, the entity holding the long term pin may not be aware of the fact that file got truncated and the file system blocks possibly got reused. That can result in corruption. The patch also converts the hpas member of mm_iommu_table_group_mem_t to a union. We use the same storage location to store pointers to struct page. We cannot update all the code path use struct page *, because we access hpas in real mode and we can't do that struct page * to pfn conversion in real mode. [aneesh.kumar@linux.ibm.com: address review feedback, update changelog] Link: http://lkml.kernel.org/r/20190227144736.5872-4-aneesh.kumar@linux.ibm.com Link: http://lkml.kernel.org/r/20190114095438.32470-5-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Reviewed-by: Michael Ellerman Cc: Alexey Kardashevskiy Cc: Andrea Arcangeli Cc: David Gibson Cc: Michal Hocko Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/mm/mmu_context_iommu.c | 125 +++++++++------------------- 1 file changed, 38 insertions(+), 87 deletions(-) diff --git a/arch/powerpc/mm/mmu_context_iommu.c b/arch/powerpc/mm/mmu_context_iommu.c index a712a650a8b6..85b4e9f5c615 100644 --- a/arch/powerpc/mm/mmu_context_iommu.c +++ b/arch/powerpc/mm/mmu_context_iommu.c @@ -21,6 +21,7 @@ #include #include #include +#include static DEFINE_MUTEX(mem_list_mutex); @@ -34,8 +35,18 @@ struct mm_iommu_table_group_mem_t { atomic64_t mapped; unsigned int pageshift; u64 ua; /* userspace address */ - u64 entries; /* number of entries in hpas[] */ - u64 *hpas; /* vmalloc'ed */ + u64 entries; /* number of entries in hpas/hpages[] */ + /* + * in mm_iommu_get we temporarily use this to store + * struct page address. + * + * We need to convert ua to hpa in real mode. Make it + * simpler by storing physical address. + */ + union { + struct page **hpages; /* vmalloc'ed */ + phys_addr_t *hpas; + }; #define MM_IOMMU_TABLE_INVALID_HPA ((uint64_t)-1) u64 dev_hpa; /* Device memory base address */ }; @@ -80,64 +91,15 @@ bool mm_iommu_preregistered(struct mm_struct *mm) } EXPORT_SYMBOL_GPL(mm_iommu_preregistered); -/* - * Taken from alloc_migrate_target with changes to remove CMA allocations - */ -struct page *new_iommu_non_cma_page(struct page *page, unsigned long private) -{ - gfp_t gfp_mask = GFP_USER; - struct page *new_page; - - if (PageCompound(page)) - return NULL; - - if (PageHighMem(page)) - gfp_mask |= __GFP_HIGHMEM; - - /* - * We don't want the allocation to force an OOM if possibe - */ - new_page = alloc_page(gfp_mask | __GFP_NORETRY | __GFP_NOWARN); - return new_page; -} - -static int mm_iommu_move_page_from_cma(struct page *page) -{ - int ret = 0; - LIST_HEAD(cma_migrate_pages); - - /* Ignore huge pages for now */ - if (PageCompound(page)) - return -EBUSY; - - lru_add_drain(); - ret = isolate_lru_page(page); - if (ret) - return ret; - - list_add(&page->lru, &cma_migrate_pages); - put_page(page); /* Drop the gup reference */ - - ret = migrate_pages(&cma_migrate_pages, new_iommu_non_cma_page, - NULL, 0, MIGRATE_SYNC, MR_CONTIG_RANGE); - if (ret) { - if (!list_empty(&cma_migrate_pages)) - putback_movable_pages(&cma_migrate_pages); - } - - return 0; -} - static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua, - unsigned long entries, unsigned long dev_hpa, - struct mm_iommu_table_group_mem_t **pmem) + unsigned long entries, unsigned long dev_hpa, + struct mm_iommu_table_group_mem_t **pmem) { struct mm_iommu_table_group_mem_t *mem; - long i, j, ret = 0, locked_entries = 0; + long i, ret, locked_entries = 0; unsigned int pageshift; unsigned long flags; unsigned long cur_ua; - struct page *page = NULL; mutex_lock(&mem_list_mutex); @@ -187,41 +149,25 @@ static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua, goto unlock_exit; } + down_read(&mm->mmap_sem); + ret = get_user_pages_longterm(ua, entries, FOLL_WRITE, mem->hpages, NULL); + up_read(&mm->mmap_sem); + if (ret != entries) { + /* free the reference taken */ + for (i = 0; i < ret; i++) + put_page(mem->hpages[i]); + + vfree(mem->hpas); + kfree(mem); + ret = -EFAULT; + goto unlock_exit; + } + + pageshift = PAGE_SHIFT; for (i = 0; i < entries; ++i) { + struct page *page = mem->hpages[i]; + cur_ua = ua + (i << PAGE_SHIFT); - if (1 != get_user_pages_fast(cur_ua, - 1/* pages */, 1/* iswrite */, &page)) { - ret = -EFAULT; - for (j = 0; j < i; ++j) - put_page(pfn_to_page(mem->hpas[j] >> - PAGE_SHIFT)); - vfree(mem->hpas); - kfree(mem); - goto unlock_exit; - } - /* - * If we get a page from the CMA zone, since we are going to - * be pinning these entries, we might as well move them out - * of the CMA zone if possible. NOTE: faulting in + migration - * can be expensive. Batching can be considered later - */ - if (is_migrate_cma_page(page)) { - if (mm_iommu_move_page_from_cma(page)) - goto populate; - if (1 != get_user_pages_fast(cur_ua, - 1/* pages */, 1/* iswrite */, - &page)) { - ret = -EFAULT; - for (j = 0; j < i; ++j) - put_page(pfn_to_page(mem->hpas[j] >> - PAGE_SHIFT)); - vfree(mem->hpas); - kfree(mem); - goto unlock_exit; - } - } -populate: - pageshift = PAGE_SHIFT; if (mem->pageshift > PAGE_SHIFT && PageCompound(page)) { pte_t *pte; struct page *head = compound_head(page); @@ -239,10 +185,15 @@ populate: local_irq_restore(flags); } mem->pageshift = min(mem->pageshift, pageshift); + /* + * We don't need struct page reference any more, switch + * to physical address. + */ mem->hpas[i] = page_to_pfn(page) << PAGE_SHIFT; } good_exit: + ret = 0; atomic64_set(&mem->mapped, 1); mem->used = 1; mem->ua = ua; From 7f18825174203526a47c127c12a50f897ee0b511 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 5 Mar 2019 15:47:51 -0800 Subject: [PATCH 109/159] powerpc/mm/iommu: allow large IOMMU page size only for hugetlb backing THP pages can get split during different code paths. An incremented reference count does imply we will not split the compound page. But the pmd entry can be converted to level 4 pte entries. Keep the code simpler by allowing large IOMMU page size only if the guest ram is backed by hugetlb pages. Link: http://lkml.kernel.org/r/20190114095438.32470-6-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Cc: Alexey Kardashevskiy Cc: Andrea Arcangeli Cc: David Gibson Cc: Michael Ellerman Cc: Michal Hocko Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/mm/mmu_context_iommu.c | 22 ++++++---------------- 1 file changed, 6 insertions(+), 16 deletions(-) diff --git a/arch/powerpc/mm/mmu_context_iommu.c b/arch/powerpc/mm/mmu_context_iommu.c index 85b4e9f5c615..e7a9c4f6bfca 100644 --- a/arch/powerpc/mm/mmu_context_iommu.c +++ b/arch/powerpc/mm/mmu_context_iommu.c @@ -98,8 +98,6 @@ static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua, struct mm_iommu_table_group_mem_t *mem; long i, ret, locked_entries = 0; unsigned int pageshift; - unsigned long flags; - unsigned long cur_ua; mutex_lock(&mem_list_mutex); @@ -167,22 +165,14 @@ static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua, for (i = 0; i < entries; ++i) { struct page *page = mem->hpages[i]; - cur_ua = ua + (i << PAGE_SHIFT); - if (mem->pageshift > PAGE_SHIFT && PageCompound(page)) { - pte_t *pte; + /* + * Allow to use larger than 64k IOMMU pages. Only do that + * if we are backed by hugetlb. + */ + if ((mem->pageshift > PAGE_SHIFT) && PageHuge(page)) { struct page *head = compound_head(page); - unsigned int compshift = compound_order(head); - unsigned int pteshift; - local_irq_save(flags); /* disables as well */ - pte = find_linux_pte(mm->pgd, cur_ua, NULL, &pteshift); - - /* Double check it is still the same pinned page */ - if (pte && pte_page(*pte) == head && - pteshift == compshift + PAGE_SHIFT) - pageshift = max_t(unsigned int, pteshift, - PAGE_SHIFT); - local_irq_restore(flags); + pageshift = compound_order(head) + PAGE_SHIFT; } mem->pageshift = min(mem->pageshift, pageshift); /* From ab3948f58ff841e51feb845720624665ef5b7ef3 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Tue, 5 Mar 2019 15:47:54 -0800 Subject: [PATCH 110/159] mm/memfd: add an F_SEAL_FUTURE_WRITE seal to memfd Android uses ashmem for sharing memory regions. We are looking forward to migrating all usecases of ashmem to memfd so that we can possibly remove the ashmem driver in the future from staging while also benefiting from using memfd and contributing to it. Note staging drivers are also not ABI and generally can be removed at anytime. One of the main usecases Android has is the ability to create a region and mmap it as writeable, then add protection against making any "future" writes while keeping the existing already mmap'ed writeable-region active. This allows us to implement a usecase where receivers of the shared memory buffer can get a read-only view, while the sender continues to write to the buffer. See CursorWindow documentation in Android for more details: https://developer.android.com/reference/android/database/CursorWindow This usecase cannot be implemented with the existing F_SEAL_WRITE seal. To support the usecase, this patch adds a new F_SEAL_FUTURE_WRITE seal which prevents any future mmap and write syscalls from succeeding while keeping the existing mmap active. A better way to do F_SEAL_FUTURE_WRITE seal was discussed [1] last week where we don't need to modify core VFS structures to get the same behavior of the seal. This solves several side-effects pointed by Andy. self-tests are provided in later patch to verify the expected semantics. [1] https://lore.kernel.org/lkml/20181111173650.GA256781@google.com/ Thanks a lot to Andy for suggestions to improve code. Link: http://lkml.kernel.org/r/20190112203816.85534-2-joel@joelfernandes.org Signed-off-by: Joel Fernandes (Google) Acked-by: John Stultz Cc: Andy Lutomirski Cc: Minchan Kim Cc: Jann Horn Cc: Al Viro Cc: Andy Lutomirski Cc: Hugh Dickins Cc: J. Bruce Fields Cc: Jeff Layton Cc: Marc-Andr Lureau Cc: Matthew Wilcox Cc: Mike Kravetz Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 2 +- include/uapi/linux/fcntl.h | 1 + mm/memfd.c | 3 ++- mm/shmem.c | 25 ++++++++++++++++++++++--- 4 files changed, 26 insertions(+), 5 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index a7fa037b876b..b0eef008de67 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -530,7 +530,7 @@ static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) inode_lock(inode); /* protected by i_mutex */ - if (info->seals & F_SEAL_WRITE) { + if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) { inode_unlock(inode); return -EPERM; } diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h index 6448cdd9a350..a2f8658f1c55 100644 --- a/include/uapi/linux/fcntl.h +++ b/include/uapi/linux/fcntl.h @@ -41,6 +41,7 @@ #define F_SEAL_SHRINK 0x0002 /* prevent file from shrinking */ #define F_SEAL_GROW 0x0004 /* prevent file from growing */ #define F_SEAL_WRITE 0x0008 /* prevent writes */ +#define F_SEAL_FUTURE_WRITE 0x0010 /* prevent future writes while mapped */ /* (1U << 31) is reserved for signed error codes */ /* diff --git a/mm/memfd.c b/mm/memfd.c index 97264c79d2cd..650e65a46b9c 100644 --- a/mm/memfd.c +++ b/mm/memfd.c @@ -131,7 +131,8 @@ static unsigned int *memfd_file_seals_ptr(struct file *file) #define F_ALL_SEALS (F_SEAL_SEAL | \ F_SEAL_SHRINK | \ F_SEAL_GROW | \ - F_SEAL_WRITE) + F_SEAL_WRITE | \ + F_SEAL_FUTURE_WRITE) static int memfd_add_seals(struct file *file, unsigned int seals) { diff --git a/mm/shmem.c b/mm/shmem.c index 283a1833dafc..b3db3779a30a 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2190,6 +2190,24 @@ out_nomem: static int shmem_mmap(struct file *file, struct vm_area_struct *vma) { + struct shmem_inode_info *info = SHMEM_I(file_inode(file)); + + if (info->seals & F_SEAL_FUTURE_WRITE) { + /* + * New PROT_WRITE and MAP_SHARED mmaps are not allowed when + * "future write" seal active. + */ + if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE)) + return -EPERM; + + /* + * Since the F_SEAL_FUTURE_WRITE seals allow for a MAP_SHARED + * read-only mapping, take care to not allow mprotect to revert + * protections. + */ + vma->vm_flags &= ~(VM_MAYWRITE); + } + file_accessed(file); vma->vm_ops = &shmem_vm_ops; if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) && @@ -2440,8 +2458,9 @@ shmem_write_begin(struct file *file, struct address_space *mapping, pgoff_t index = pos >> PAGE_SHIFT; /* i_mutex is held by caller */ - if (unlikely(info->seals & (F_SEAL_WRITE | F_SEAL_GROW))) { - if (info->seals & F_SEAL_WRITE) + if (unlikely(info->seals & (F_SEAL_GROW | + F_SEAL_WRITE | F_SEAL_FUTURE_WRITE))) { + if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) return -EPERM; if ((info->seals & F_SEAL_GROW) && pos + len > inode->i_size) return -EPERM; @@ -2704,7 +2723,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, DECLARE_WAIT_QUEUE_HEAD_ONSTACK(shmem_falloc_waitq); /* protected by i_mutex */ - if (info->seals & F_SEAL_WRITE) { + if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) { error = -EPERM; goto out; } From 544029862cbb1d7903e19f2e58f48d4884e1201b Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Tue, 5 Mar 2019 15:47:58 -0800 Subject: [PATCH 111/159] selftests/memfd: add tests for F_SEAL_FUTURE_WRITE seal Add tests to verify sealing memfds with the F_SEAL_FUTURE_WRITE works as expected. Link: http://lkml.kernel.org/r/20190112203816.85534-3-joel@joelfernandes.org Signed-off-by: Joel Fernandes (Google) Reviewed-by: Shuah Khan Cc: Al Viro Cc: Andy Lutomirski Cc: Hugh Dickins Cc: Jann Horn Cc: J. Bruce Fields Cc: Jeff Layton Cc: John Stultz Cc: Marc-Andr Lureau Cc: Matthew Wilcox Cc: Mike Kravetz Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/memfd/memfd_test.c | 74 ++++++++++++++++++++++ 1 file changed, 74 insertions(+) diff --git a/tools/testing/selftests/memfd/memfd_test.c b/tools/testing/selftests/memfd/memfd_test.c index 10baa1652fc2..c67d32eeb668 100644 --- a/tools/testing/selftests/memfd/memfd_test.c +++ b/tools/testing/selftests/memfd/memfd_test.c @@ -54,6 +54,22 @@ static int mfd_assert_new(const char *name, loff_t sz, unsigned int flags) return fd; } +static int mfd_assert_reopen_fd(int fd_in) +{ + int r, fd; + char path[100]; + + sprintf(path, "/proc/self/fd/%d", fd_in); + + fd = open(path, O_RDWR); + if (fd < 0) { + printf("re-open of existing fd %d failed\n", fd_in); + abort(); + } + + return fd; +} + static void mfd_fail_new(const char *name, unsigned int flags) { int r; @@ -255,6 +271,25 @@ static void mfd_assert_read(int fd) munmap(p, mfd_def_size); } +/* Test that PROT_READ + MAP_SHARED mappings work. */ +static void mfd_assert_read_shared(int fd) +{ + void *p; + + /* verify PROT_READ and MAP_SHARED *is* allowed */ + p = mmap(NULL, + mfd_def_size, + PROT_READ, + MAP_SHARED, + fd, + 0); + if (p == MAP_FAILED) { + printf("mmap() failed: %m\n"); + abort(); + } + munmap(p, mfd_def_size); +} + static void mfd_assert_write(int fd) { ssize_t l; @@ -692,6 +727,44 @@ static void test_seal_write(void) close(fd); } +/* + * Test SEAL_FUTURE_WRITE + * Test whether SEAL_FUTURE_WRITE actually prevents modifications. + */ +static void test_seal_future_write(void) +{ + int fd, fd2; + void *p; + + printf("%s SEAL-FUTURE-WRITE\n", memfd_str); + + fd = mfd_assert_new("kern_memfd_seal_future_write", + mfd_def_size, + MFD_CLOEXEC | MFD_ALLOW_SEALING); + + p = mfd_assert_mmap_shared(fd); + + mfd_assert_has_seals(fd, 0); + + mfd_assert_add_seals(fd, F_SEAL_FUTURE_WRITE); + mfd_assert_has_seals(fd, F_SEAL_FUTURE_WRITE); + + /* read should pass, writes should fail */ + mfd_assert_read(fd); + mfd_assert_read_shared(fd); + mfd_fail_write(fd); + + fd2 = mfd_assert_reopen_fd(fd); + /* read should pass, writes should still fail */ + mfd_assert_read(fd2); + mfd_assert_read_shared(fd2); + mfd_fail_write(fd2); + + munmap(p, mfd_def_size); + close(fd2); + close(fd); +} + /* * Test SEAL_SHRINK * Test whether SEAL_SHRINK actually prevents shrinking @@ -945,6 +1018,7 @@ int main(int argc, char **argv) test_basic(); test_seal_write(); + test_seal_future_write(); test_seal_shrink(); test_seal_grow(); test_seal_resize(); From 59118c42a60b997d277ad04d2309a6ec30682e5e Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Tue, 5 Mar 2019 15:48:02 -0800 Subject: [PATCH 112/159] mm: swap: use mem_cgroup_is_root() instead of deferencing css->parent mem_cgroup_is_root() is the preferred API to check if memcg is root or not. Use it instead of deferencing css->parent. Link: http://lkml.kernel.org/r/1547232913-118148-1-git-send-email-yang.shi@linux.alibaba.com Signed-off-by: Yang Shi Acked-by: Michal Hocko Cc: Huang Ying Cc: Tim Chen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 622025ac1461..649529be91f2 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -625,7 +625,7 @@ static inline int mem_cgroup_swappiness(struct mem_cgroup *memcg) return vm_swappiness; /* root ? */ - if (mem_cgroup_disabled() || !memcg->css.parent) + if (mem_cgroup_disabled() || mem_cgroup_is_root(memcg)) return vm_swappiness; return memcg->swappiness; From 2bb0f34fe3c1f04196cbcf8aa86b0a9371f6938d Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Tue, 5 Mar 2019 15:48:05 -0800 Subject: [PATCH 113/159] mm: vmscan: do not iterate all mem cgroups for global direct reclaim In current implementation, both kswapd and direct reclaim has to iterate all mem cgroups. It is not a problem before offline mem cgroups could be iterated. But, currently with iterating offline mem cgroups, it could be very time consuming. In our workloads, we saw over 400K mem cgroups accumulated in some cases, only a few hundred are online memcgs. Although kswapd could help out to reduce the number of memcgs, direct reclaim still get hit with iterating a number of offline memcgs in some cases. We experienced the responsiveness problems due to this occassionally. A simple test with pref shows it may take around 220ms to iterate 8K memcgs in direct reclaim: dd 13873 [011] 578.542919: vmscan:mm_vmscan_direct_reclaim_begin dd 13873 [011] 578.758689: vmscan:mm_vmscan_direct_reclaim_end So for 400K, it may take around 11 seconds to iterate all memcgs. Here just break the iteration once it reclaims enough pages as what memcg direct reclaim does. This may hurt the fairness among memcgs. But the cached iterator cookie could help to achieve the fairness more or less. Link: http://lkml.kernel.org/r/1548799877-10949-1-git-send-email-yang.shi@linux.alibaba.com Signed-off-by: Yang Shi Acked-by: Johannes Weiner Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index a1c2f78cb78c..07a68dcd5f58 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2747,16 +2747,15 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) sc->nr_reclaimed - reclaimed); /* - * Direct reclaim and kswapd have to scan all memory - * cgroups to fulfill the overall scan target for the - * node. + * Kswapd have to scan all memory cgroups to fulfill + * the overall scan target for the node. * * Limit reclaim, on the other hand, only cares about * nr_to_reclaim pages to be reclaimed and it will * retry with decreasing priority if one round over the * whole hierarchy is not sufficient. */ - if (!global_reclaim(sc) && + if (!current_is_kswapd() && sc->nr_reclaimed >= sc->nr_to_reclaim) { mem_cgroup_iter_break(root, memcg); break; From 1ff9e6e1798c7670ea6a7680a1ad5582df2fa914 Mon Sep 17 00:00:00 2001 From: Chris Down Date: Tue, 5 Mar 2019 15:48:09 -0800 Subject: [PATCH 114/159] mm: memcontrol: expose THP events on a per-memcg basis Currently THP allocation events data is fairly opaque, since you can only get it system-wide. This patch makes it easier to reason about transparent hugepage behaviour on a per-memcg basis. For anonymous THP-backed pages, we already have MEMCG_RSS_HUGE in v1, which is used for v1's rss_huge [sic]. This is reused here as it's fairly involved to untangle NR_ANON_THPS right now to make it per-memcg, since right now some of this is delegated to rmap before we have any memcg actually assigned to the page. It's a good idea to rework that, but let's leave untangling THP allocation for a future patch. [akpm@linux-foundation.org: fix build] [chris@chrisdown.name: fix memcontrol build when THP is disabled] Link: http://lkml.kernel.org/r/20190131160802.GA5777@chrisdown.name Link: http://lkml.kernel.org/r/20190129205852.GA7310@chrisdown.name Signed-off-by: Chris Down Acked-by: Johannes Weiner Cc: Tejun Heo Cc: Roman Gushchin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/cgroup-v2.rst | 16 ++++++++++++++++ mm/huge_memory.c | 2 ++ mm/khugepaged.c | 2 ++ mm/memcontrol.c | 16 ++++++++++++++++ 4 files changed, 36 insertions(+) diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 7bf3f129c68b..53d3288c328b 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1189,6 +1189,10 @@ PAGE_SIZE multiple when read back. Amount of cached filesystem data that was modified and is currently being written back to disk + anon_thp + Amount of memory used in anonymous mappings backed by + transparent hugepages + inactive_anon, active_anon, inactive_file, active_file, unevictable Amount of memory, swap-backed and filesystem-backed, on the internal memory management lists used by the @@ -1248,6 +1252,18 @@ PAGE_SIZE multiple when read back. Amount of reclaimed lazyfree pages + thp_fault_alloc + + Number of transparent hugepages which were allocated to satisfy + a page fault, including COW faults. This counter is not present + when CONFIG_TRANSPARENT_HUGEPAGE is not set. + + thp_collapse_alloc + + Number of transparent hugepages which were allocated to allow + collapsing an existing range of pages. This counter is not + present when CONFIG_TRANSPARENT_HUGEPAGE is not set. + memory.swap.current A read-only single value file which exists on non-root cgroups. diff --git a/mm/huge_memory.c b/mm/huge_memory.c index af07527cd971..d4847026d4b1 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -617,6 +617,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, mm_inc_nr_ptes(vma->vm_mm); spin_unlock(vmf->ptl); count_vm_event(THP_FAULT_ALLOC); + count_memcg_events(memcg, THP_FAULT_ALLOC, 1); } return 0; @@ -1338,6 +1339,7 @@ alloc: } count_vm_event(THP_FAULT_ALLOC); + count_memcg_events(memcg, THP_FAULT_ALLOC, 1); if (!page) clear_huge_page(new_page, vmf->address, HPAGE_PMD_NR); diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 4f017339ddb2..449044378782 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1074,6 +1074,7 @@ static void collapse_huge_page(struct mm_struct *mm, BUG_ON(!pmd_none(*pmd)); page_add_new_anon_rmap(new_page, vma, address, true); mem_cgroup_commit_charge(new_page, memcg, false, true); + count_memcg_events(memcg, THP_COLLAPSE_ALLOC, 1); lru_cache_add_active_or_unevictable(new_page, vma); pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, address, pmd, _pmd); @@ -1502,6 +1503,7 @@ xa_unlocked: page_ref_add(new_page, HPAGE_PMD_NR - 1); set_page_dirty(new_page); mem_cgroup_commit_charge(new_page, memcg, false, true); + count_memcg_events(memcg, THP_COLLAPSE_ALLOC, 1); lru_cache_add_anon(new_page); /* diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 1eca627566ff..30bda8d7fb5c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include #include @@ -5571,6 +5572,15 @@ static int memory_stat_show(struct seq_file *m, void *v) seq_printf(m, "file_writeback %llu\n", (u64)acc.stat[NR_WRITEBACK] * PAGE_SIZE); + /* + * TODO: We should eventually replace our own MEMCG_RSS_HUGE counter + * with the NR_ANON_THP vm counter, but right now it's a pain in the + * arse because it requires migrating the work out of rmap to a place + * where the page->mem_cgroup is set up and stable. + */ + seq_printf(m, "anon_thp %llu\n", + (u64)acc.stat[MEMCG_RSS_HUGE] * PAGE_SIZE); + for (i = 0; i < NR_LRU_LISTS; i++) seq_printf(m, "%s %llu\n", mem_cgroup_lru_names[i], (u64)acc.lru_pages[i] * PAGE_SIZE); @@ -5602,6 +5612,12 @@ static int memory_stat_show(struct seq_file *m, void *v) seq_printf(m, "pglazyfree %lu\n", acc.events[PGLAZYFREE]); seq_printf(m, "pglazyfreed %lu\n", acc.events[PGLAZYFREED]); +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + seq_printf(m, "thp_fault_alloc %lu\n", acc.events[THP_FAULT_ALLOC]); + seq_printf(m, "thp_collapse_alloc %lu\n", + acc.events[THP_COLLAPSE_ALLOC]); +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + return 0; } From 2cee57d1b088778bb8cb35f36076cbe492984a09 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Tue, 5 Mar 2019 15:48:12 -0800 Subject: [PATCH 115/159] mm: ksm: do not block on page lock when searching stable tree ksmd needs to search the stable tree to look for the suitable KSM page, but the KSM page might be locked for a while due to i.e. KSM page rmap walk. Basically it is not a big deal since commit 2c653d0ee2ae ("ksm: introduce ksm_max_page_sharing per page deduplication limit"), since max_page_sharing limits the number of shared KSM pages. But it still sounds not worth waiting for the lock, the page can be skip, then try to merge it in the next scan to avoid potential stall if its content is still intact. Introduce trylock mode to get_ksm_page() to not block on page lock, like what try_to_merge_one_page() does. And, define three possible operations (nolock, lock and trylock) as enum type to avoid stacking up bools and make the code more readable. Return -EBUSY if trylock fails, since NULL means not find suitable KSM page, which is a valid case. With the default max_page_sharing setting (256), there is almost no observed change comparing lock vs trylock. However, with ksm02 of LTP, the reduced ksmd full scan time can be observed, which has set max_page_sharing to 786432. With lock version, ksmd may tak 10s - 11s to run two full scans, with trylock version ksmd may take 8s - 11s to run two full scans. And, the number of pages_sharing and pages_to_scan keep same. Basically, this change has no harm. [hughd@google.com: fix BUG_ON()] Link: http://lkml.kernel.org/r/alpine.LSU.2.11.1902182122280.6914@eggly.anvils Link: http://lkml.kernel.org/r/1548793753-62377-1-git-send-email-yang.shi@linux.alibaba.com Signed-off-by: Yang Shi Signed-off-by: Hugh Dickins Suggested-by: John Hubbard Reviewed-by: Kirill Tkhai Cc: Hugh Dickins Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/ksm.c | 45 +++++++++++++++++++++++++++++++++++---------- 1 file changed, 35 insertions(+), 10 deletions(-) diff --git a/mm/ksm.c b/mm/ksm.c index 983fbac24bda..fc64874dc6f4 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -667,6 +667,12 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node) free_stable_node(stable_node); } +enum get_ksm_page_flags { + GET_KSM_PAGE_NOLOCK, + GET_KSM_PAGE_LOCK, + GET_KSM_PAGE_TRYLOCK +}; + /* * get_ksm_page: checks if the page indicated by the stable node * is still its ksm page, despite having held no reference to it. @@ -686,7 +692,8 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node) * a page to put something that might look like our key in page->mapping. * is on its way to being freed; but it is an anomaly to bear in mind. */ -static struct page *get_ksm_page(struct stable_node *stable_node, bool lock_it) +static struct page *get_ksm_page(struct stable_node *stable_node, + enum get_ksm_page_flags flags) { struct page *page; void *expected_mapping; @@ -729,8 +736,15 @@ again: goto stale; } - if (lock_it) { + if (flags == GET_KSM_PAGE_TRYLOCK) { + if (!trylock_page(page)) { + put_page(page); + return ERR_PTR(-EBUSY); + } + } else if (flags == GET_KSM_PAGE_LOCK) lock_page(page); + + if (flags != GET_KSM_PAGE_NOLOCK) { if (READ_ONCE(page->mapping) != expected_mapping) { unlock_page(page); put_page(page); @@ -764,7 +778,7 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item) struct page *page; stable_node = rmap_item->head; - page = get_ksm_page(stable_node, true); + page = get_ksm_page(stable_node, GET_KSM_PAGE_LOCK); if (!page) goto out; @@ -864,7 +878,7 @@ static int remove_stable_node(struct stable_node *stable_node) struct page *page; int err; - page = get_ksm_page(stable_node, true); + page = get_ksm_page(stable_node, GET_KSM_PAGE_LOCK); if (!page) { /* * get_ksm_page did remove_node_from_stable_tree itself. @@ -1386,7 +1400,7 @@ static struct page *stable_node_dup(struct stable_node **_stable_node_dup, * stable_node parameter itself will be freed from * under us if it returns NULL. */ - _tree_page = get_ksm_page(dup, false); + _tree_page = get_ksm_page(dup, GET_KSM_PAGE_NOLOCK); if (!_tree_page) continue; nr += 1; @@ -1509,7 +1523,7 @@ static struct page *__stable_node_chain(struct stable_node **_stable_node_dup, if (!is_stable_node_chain(stable_node)) { if (is_page_sharing_candidate(stable_node)) { *_stable_node_dup = stable_node; - return get_ksm_page(stable_node, false); + return get_ksm_page(stable_node, GET_KSM_PAGE_NOLOCK); } /* * _stable_node_dup set to NULL means the stable_node @@ -1614,7 +1628,8 @@ again: * wrprotected at all times. Any will work * fine to continue the walk. */ - tree_page = get_ksm_page(stable_node_any, false); + tree_page = get_ksm_page(stable_node_any, + GET_KSM_PAGE_NOLOCK); } VM_BUG_ON(!stable_node_dup ^ !!stable_node_any); if (!tree_page) { @@ -1674,7 +1689,12 @@ again: * It would be more elegant to return stable_node * than kpage, but that involves more changes. */ - tree_page = get_ksm_page(stable_node_dup, true); + tree_page = get_ksm_page(stable_node_dup, + GET_KSM_PAGE_TRYLOCK); + + if (PTR_ERR(tree_page) == -EBUSY) + return ERR_PTR(-EBUSY); + if (unlikely(!tree_page)) /* * The tree may have been rebalanced, @@ -1843,7 +1863,8 @@ again: * wrprotected at all times. Any will work * fine to continue the walk. */ - tree_page = get_ksm_page(stable_node_any, false); + tree_page = get_ksm_page(stable_node_any, + GET_KSM_PAGE_NOLOCK); } VM_BUG_ON(!stable_node_dup ^ !!stable_node_any); if (!tree_page) { @@ -2069,6 +2090,9 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) remove_rmap_item_from_tree(rmap_item); if (kpage) { + if (PTR_ERR(kpage) == -EBUSY) + return; + err = try_to_merge_with_ksm_page(rmap_item, page, kpage); if (!err) { /* @@ -2243,7 +2267,8 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page) list_for_each_entry_safe(stable_node, next, &migrate_nodes, list) { - page = get_ksm_page(stable_node, false); + page = get_ksm_page(stable_node, + GET_KSM_PAGE_NOLOCK); if (page) put_page(page); cond_resched(); From 060f005f074791ec15e3ea111a0b0cac28abab06 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Tue, 5 Mar 2019 15:48:15 -0800 Subject: [PATCH 116/159] mm/vmscan.c: do not allocate duplicate stack variables in shrink_page_list() On path shrink_inactive_list() ---> shrink_page_list() we allocate stack variables for the statistics twice. This is completely useless, and this just consumes stack much more, then we really need. The patch kills duplicate stack variables from shrink_page_list(), and this reduce stack usage and object file size significantly: Stack usage: Before: vmscan.c:1122:22:shrink_page_list 648 static After: vmscan.c:1122:22:shrink_page_list 616 static Size of vmscan.o: text data bss dec hex filename Before: 56866 4720 128 61714 f112 mm/vmscan.o After: 56770 4720 128 61618 f0b2 mm/vmscan.o Link: http://lkml.kernel.org/r/154894900030.5211.12104993874109647641.stgit@localhost.localdomain Signed-off-by: Kirill Tkhai Reviewed-by: Daniel Jordan Reviewed-by: Andrew Morton Acked-by: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 42 +++++++++++++----------------------------- 1 file changed, 13 insertions(+), 29 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 07a68dcd5f58..209c2c78a087 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1106,16 +1106,9 @@ static unsigned long shrink_page_list(struct list_head *page_list, { LIST_HEAD(ret_pages); LIST_HEAD(free_pages); - int pgactivate = 0; - unsigned nr_unqueued_dirty = 0; - unsigned nr_dirty = 0; - unsigned nr_congested = 0; unsigned nr_reclaimed = 0; - unsigned nr_writeback = 0; - unsigned nr_immediate = 0; - unsigned nr_ref_keep = 0; - unsigned nr_unmap_fail = 0; + memset(stat, 0, sizeof(*stat)); cond_resched(); while (!list_empty(page_list)) { @@ -1159,10 +1152,10 @@ static unsigned long shrink_page_list(struct list_head *page_list, */ page_check_dirty_writeback(page, &dirty, &writeback); if (dirty || writeback) - nr_dirty++; + stat->nr_dirty++; if (dirty && !writeback) - nr_unqueued_dirty++; + stat->nr_unqueued_dirty++; /* * Treat this page as congested if the underlying BDI is or if @@ -1174,7 +1167,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, if (((dirty || writeback) && mapping && inode_write_congested(mapping->host)) || (writeback && PageReclaim(page))) - nr_congested++; + stat->nr_congested++; /* * If a page at the tail of the LRU is under writeback, there @@ -1223,7 +1216,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, if (current_is_kswapd() && PageReclaim(page) && test_bit(PGDAT_WRITEBACK, &pgdat->flags)) { - nr_immediate++; + stat->nr_immediate++; goto activate_locked; /* Case 2 above */ @@ -1241,7 +1234,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, * and it's also appropriate in global reclaim. */ SetPageReclaim(page); - nr_writeback++; + stat->nr_writeback++; goto activate_locked; /* Case 3 above */ @@ -1261,7 +1254,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, case PAGEREF_ACTIVATE: goto activate_locked; case PAGEREF_KEEP: - nr_ref_keep++; + stat->nr_ref_keep++; goto keep_locked; case PAGEREF_RECLAIM: case PAGEREF_RECLAIM_CLEAN: @@ -1326,7 +1319,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, if (unlikely(PageTransHuge(page))) flags |= TTU_SPLIT_HUGE_PMD; if (!try_to_unmap(page, flags)) { - nr_unmap_fail++; + stat->nr_unmap_fail++; goto activate_locked; } } @@ -1474,7 +1467,7 @@ activate_locked: VM_BUG_ON_PAGE(PageActive(page), page); if (!PageMlocked(page)) { SetPageActive(page); - pgactivate++; + stat->nr_activate++; count_memcg_page_event(page, PGACTIVATE); } keep_locked: @@ -1489,18 +1482,8 @@ keep: free_unref_page_list(&free_pages); list_splice(&ret_pages, page_list); - count_vm_events(PGACTIVATE, pgactivate); + count_vm_events(PGACTIVATE, stat->nr_activate); - if (stat) { - stat->nr_dirty = nr_dirty; - stat->nr_congested = nr_congested; - stat->nr_unqueued_dirty = nr_unqueued_dirty; - stat->nr_writeback = nr_writeback; - stat->nr_immediate = nr_immediate; - stat->nr_activate = pgactivate; - stat->nr_ref_keep = nr_ref_keep; - stat->nr_unmap_fail = nr_unmap_fail; - } return nr_reclaimed; } @@ -1512,6 +1495,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, .priority = DEF_PRIORITY, .may_unmap = 1, }; + struct reclaim_stat dummy_stat; unsigned long ret; struct page *page, *next; LIST_HEAD(clean_pages); @@ -1525,7 +1509,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, } ret = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc, - TTU_IGNORE_ACCESS, NULL, true); + TTU_IGNORE_ACCESS, &dummy_stat, true); list_splice(&clean_pages, page_list); mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -ret); return ret; @@ -1900,7 +1884,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, unsigned long nr_scanned; unsigned long nr_reclaimed = 0; unsigned long nr_taken; - struct reclaim_stat stat = {}; + struct reclaim_stat stat; int file = is_file_lru(lru); struct pglist_data *pgdat = lruvec_pgdat(lruvec); struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; From c10d38cc8d3e43f946b6c2bf4602c86791587f30 Mon Sep 17 00:00:00 2001 From: Daniel Jordan Date: Tue, 5 Mar 2019 15:48:19 -0800 Subject: [PATCH 117/159] mm, swap: bounds check swap_info array accesses to avoid NULL derefs Dan Carpenter reports a potential NULL dereference in get_swap_page_of_type: Smatch complains that the NULL checks on "si" aren't consistent. This seems like a real bug because we have not ensured that the type is valid and so "si" can be NULL. Add the missing check for NULL, taking care to use a read barrier to ensure CPU1 observes CPU0's updates in the correct order: CPU0 CPU1 alloc_swap_info() if (type >= nr_swapfiles) swap_info[type] = p /* handle invalid entry */ smp_wmb() smp_rmb() ++nr_swapfiles p = swap_info[type] Without smp_rmb, CPU1 might observe CPU0's write to nr_swapfiles before CPU0's write to swap_info[type] and read NULL from swap_info[type]. Ying Huang noticed other places in swapfile.c don't order these reads properly. Introduce swap_type_to_swap_info to encourage correct usage. Use READ_ONCE and WRITE_ONCE to follow the Linux Kernel Memory Model (see tools/memory-model/Documentation/explanation.txt). This ordering need not be enforced in places where swap_lock is held (e.g. si_swapinfo) because swap_lock serializes updates to nr_swapfiles and the swap_info array. Link: http://lkml.kernel.org/r/20190131024410.29859-1-daniel.m.jordan@oracle.com Fixes: ec8acf20afb8 ("swap: add per-partition lock for swapfile") Signed-off-by: Daniel Jordan Reported-by: Dan Carpenter Suggested-by: "Huang, Ying" Reviewed-by: Andrea Parri Acked-by: Peter Zijlstra (Intel) Cc: Alan Stern Cc: Andi Kleen Cc: Dave Hansen Cc: Omar Sandoval Cc: Paul McKenney Cc: Shaohua Li Cc: Stephen Rothwell Cc: Tejun Heo Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 51 +++++++++++++++++++++++++++++---------------------- 1 file changed, 29 insertions(+), 22 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index 6de46984d59d..57e9b1b31d55 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -98,6 +98,15 @@ static atomic_t proc_poll_event = ATOMIC_INIT(0); atomic_t nr_rotate_swap = ATOMIC_INIT(0); +static struct swap_info_struct *swap_type_to_swap_info(int type) +{ + if (type >= READ_ONCE(nr_swapfiles)) + return NULL; + + smp_rmb(); /* Pairs with smp_wmb in alloc_swap_info. */ + return READ_ONCE(swap_info[type]); +} + static inline unsigned char swap_count(unsigned char ent) { return ent & ~SWAP_HAS_CACHE; /* may include COUNT_CONTINUED flag */ @@ -1044,12 +1053,14 @@ noswap: /* The only caller of this function is now suspend routine */ swp_entry_t get_swap_page_of_type(int type) { - struct swap_info_struct *si; + struct swap_info_struct *si = swap_type_to_swap_info(type); pgoff_t offset; - si = swap_info[type]; + if (!si) + goto fail; + spin_lock(&si->lock); - if (si && (si->flags & SWP_WRITEOK)) { + if (si->flags & SWP_WRITEOK) { atomic_long_dec(&nr_swap_pages); /* This is called for allocating swap entry, not cache */ offset = scan_swap_map(si, 1); @@ -1060,6 +1071,7 @@ swp_entry_t get_swap_page_of_type(int type) atomic_long_inc(&nr_swap_pages); } spin_unlock(&si->lock); +fail: return (swp_entry_t) {0}; } @@ -1071,9 +1083,9 @@ static struct swap_info_struct *__swap_info_get(swp_entry_t entry) if (!entry.val) goto out; type = swp_type(entry); - if (type >= nr_swapfiles) + p = swap_type_to_swap_info(type); + if (!p) goto bad_nofile; - p = swap_info[type]; if (!(p->flags & SWP_USED)) goto bad_device; offset = swp_offset(entry); @@ -1697,10 +1709,9 @@ int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p) sector_t swapdev_block(int type, pgoff_t offset) { struct block_device *bdev; + struct swap_info_struct *si = swap_type_to_swap_info(type); - if ((unsigned int)type >= nr_swapfiles) - return 0; - if (!(swap_info[type]->flags & SWP_WRITEOK)) + if (!si || !(si->flags & SWP_WRITEOK)) return 0; return map_swap_entry(swp_entry(type, offset), &bdev); } @@ -2151,7 +2162,7 @@ static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev) struct swap_extent *se; pgoff_t offset; - sis = swap_info[swp_type(entry)]; + sis = swp_swap_info(entry); *bdev = sis->bdev; offset = swp_offset(entry); @@ -2593,9 +2604,7 @@ static void *swap_start(struct seq_file *swap, loff_t *pos) if (!l) return SEQ_START_TOKEN; - for (type = 0; type < nr_swapfiles; type++) { - smp_rmb(); /* read nr_swapfiles before swap_info[type] */ - si = swap_info[type]; + for (type = 0; (si = swap_type_to_swap_info(type)); type++) { if (!(si->flags & SWP_USED) || !si->swap_map) continue; if (!--l) @@ -2615,9 +2624,7 @@ static void *swap_next(struct seq_file *swap, void *v, loff_t *pos) else type = si->type + 1; - for (; type < nr_swapfiles; type++) { - smp_rmb(); /* read nr_swapfiles before swap_info[type] */ - si = swap_info[type]; + for (; (si = swap_type_to_swap_info(type)); type++) { if (!(si->flags & SWP_USED) || !si->swap_map) continue; ++*pos; @@ -2724,14 +2731,14 @@ static struct swap_info_struct *alloc_swap_info(void) } if (type >= nr_swapfiles) { p->type = type; - swap_info[type] = p; + WRITE_ONCE(swap_info[type], p); /* * Write swap_info[type] before nr_swapfiles, in case a * racing procfs swap_start() or swap_next() is reading them. * (We never shrink nr_swapfiles, we never free this entry.) */ smp_wmb(); - nr_swapfiles++; + WRITE_ONCE(nr_swapfiles, nr_swapfiles + 1); } else { kvfree(p); p = swap_info[type]; @@ -3251,7 +3258,7 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage) { struct swap_info_struct *p; struct swap_cluster_info *ci; - unsigned long offset, type; + unsigned long offset; unsigned char count; unsigned char has_cache; int err = -EINVAL; @@ -3259,10 +3266,10 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage) if (non_swap_entry(entry)) goto out; - type = swp_type(entry); - if (type >= nr_swapfiles) + p = swp_swap_info(entry); + if (!p) goto bad_file; - p = swap_info[type]; + offset = swp_offset(entry); if (unlikely(offset >= p->max)) goto out; @@ -3359,7 +3366,7 @@ int swapcache_prepare(swp_entry_t entry) struct swap_info_struct *swp_swap_info(swp_entry_t entry) { - return swap_info[swp_type(entry)]; + return swap_type_to_swap_info(swp_type(entry)); } struct swap_info_struct *page_swap_info(struct page *page) From d342a0b38674867ea67fde47b0e1e60ffe9f17a2 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Tue, 5 Mar 2019 15:48:22 -0800 Subject: [PATCH 118/159] mm,oom: don't kill global init via memory.oom.group Since setting global init process to some memory cgroup is technically possible, oom_kill_memcg_member() must check it. Tasks in /test1 are going to be killed due to memory.oom.group set Memory cgroup out of memory: Killed process 1 (systemd) total-vm:43400kB, anon-rss:1228kB, file-rss:3992kB, shmem-rss:0kB oom_reaper: reaped process 1 (systemd), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB Kernel panic - not syncing: Attempted to kill init! exitcode=0x0000008b #include #include #include #include #include #include int main(int argc, char *argv[]) { static char buffer[10485760]; static int pipe_fd[2] = { EOF, EOF }; unsigned int i; int fd; char buf[64] = { }; if (pipe(pipe_fd)) return 1; if (chdir("/sys/fs/cgroup/")) return 1; fd = open("cgroup.subtree_control", O_WRONLY); write(fd, "+memory", 7); close(fd); mkdir("test1", 0755); fd = open("test1/memory.oom.group", O_WRONLY); write(fd, "1", 1); close(fd); fd = open("test1/cgroup.procs", O_WRONLY); write(fd, "1", 1); snprintf(buf, sizeof(buf) - 1, "%d", getpid()); write(fd, buf, strlen(buf)); close(fd); snprintf(buf, sizeof(buf) - 1, "%lu", sizeof(buffer) * 5); fd = open("test1/memory.max", O_WRONLY); write(fd, buf, strlen(buf)); close(fd); for (i = 0; i < 10; i++) if (fork() == 0) { char c; close(pipe_fd[1]); read(pipe_fd[0], &c, 1); memset(buffer, 0, sizeof(buffer)); sleep(3); _exit(0); } close(pipe_fd[0]); close(pipe_fd[1]); sleep(3); return 0; } [ 37.052923][ T9185] a.out invoked oom-killer: gfp_mask=0xcc0(GFP_KERNEL), order=0, oom_score_adj=0 [ 37.056169][ T9185] CPU: 4 PID: 9185 Comm: a.out Kdump: loaded Not tainted 5.0.0-rc4-next-20190131 #280 [ 37.059205][ T9185] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 04/13/2018 [ 37.062954][ T9185] Call Trace: [ 37.063976][ T9185] dump_stack+0x67/0x95 [ 37.065263][ T9185] dump_header+0x51/0x570 [ 37.066619][ T9185] ? trace_hardirqs_on+0x3f/0x110 [ 37.068171][ T9185] ? _raw_spin_unlock_irqrestore+0x3d/0x70 [ 37.069967][ T9185] oom_kill_process+0x18d/0x210 [ 37.071515][ T9185] out_of_memory+0x11b/0x380 [ 37.072936][ T9185] mem_cgroup_out_of_memory+0xb6/0xd0 [ 37.074601][ T9185] try_charge+0x790/0x820 [ 37.076021][ T9185] mem_cgroup_try_charge+0x42/0x1d0 [ 37.077629][ T9185] mem_cgroup_try_charge_delay+0x11/0x30 [ 37.079370][ T9185] do_anonymous_page+0x105/0x5e0 [ 37.080939][ T9185] __handle_mm_fault+0x9cb/0x1070 [ 37.082485][ T9185] handle_mm_fault+0x1b2/0x3a0 [ 37.083819][ T9185] ? handle_mm_fault+0x47/0x3a0 [ 37.085181][ T9185] __do_page_fault+0x255/0x4c0 [ 37.086529][ T9185] do_page_fault+0x28/0x260 [ 37.087788][ T9185] ? page_fault+0x8/0x30 [ 37.088978][ T9185] page_fault+0x1e/0x30 [ 37.090142][ T9185] RIP: 0033:0x7f8b183aefe0 [ 37.091433][ T9185] Code: 20 f3 44 0f 7f 44 17 d0 f3 44 0f 7f 47 30 f3 44 0f 7f 44 17 c0 48 01 fa 48 83 e2 c0 48 39 d1 74 a3 66 0f 1f 84 00 00 00 00 00 <66> 44 0f 7f 01 66 44 0f 7f 41 10 66 44 0f 7f 41 20 66 44 0f 7f 41 [ 37.096917][ T9185] RSP: 002b:00007fffc5d329e8 EFLAGS: 00010206 [ 37.098615][ T9185] RAX: 00000000006010e0 RBX: 0000000000000008 RCX: 0000000000c30000 [ 37.100905][ T9185] RDX: 00000000010010c0 RSI: 0000000000000000 RDI: 00000000006010e0 [ 37.103349][ T9185] RBP: 0000000000000000 R08: 00007f8b188f4740 R09: 0000000000000000 [ 37.105797][ T9185] R10: 00007fffc5d32420 R11: 00007f8b183aef40 R12: 0000000000000005 [ 37.108228][ T9185] R13: 0000000000000000 R14: ffffffffffffffff R15: 0000000000000000 [ 37.110840][ T9185] memory: usage 51200kB, limit 51200kB, failcnt 125 [ 37.113045][ T9185] memory+swap: usage 0kB, limit 9007199254740988kB, failcnt 0 [ 37.115808][ T9185] kmem: usage 0kB, limit 9007199254740988kB, failcnt 0 [ 37.117660][ T9185] Memory cgroup stats for /test1: cache:0KB rss:49484KB rss_huge:30720KB shmem:0KB mapped_file:0KB dirty:0KB writeback:0KB inactive_anon:0KB active_anon:49700KB inactive_file:0KB active_file:0KB unevictable:0KB [ 37.123371][ T9185] oom-kill:constraint=CONSTRAINT_NONE,nodemask=(null),cpuset=/,mems_allowed=0,oom_memcg=/test1,task_memcg=/test1,task=a.out,pid=9188,uid=0 [ 37.128158][ T9185] Memory cgroup out of memory: Killed process 9188 (a.out) total-vm:14456kB, anon-rss:10324kB, file-rss:504kB, shmem-rss:0kB [ 37.132710][ T9185] Tasks in /test1 are going to be killed due to memory.oom.group set [ 37.132833][ T54] oom_reaper: reaped process 9188 (a.out), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB [ 37.135498][ T9185] Memory cgroup out of memory: Killed process 1 (systemd) total-vm:43400kB, anon-rss:1228kB, file-rss:3992kB, shmem-rss:0kB [ 37.143434][ T9185] Memory cgroup out of memory: Killed process 9182 (a.out) total-vm:14456kB, anon-rss:76kB, file-rss:588kB, shmem-rss:0kB [ 37.144328][ T54] oom_reaper: reaped process 1 (systemd), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB [ 37.147585][ T9185] Memory cgroup out of memory: Killed process 9183 (a.out) total-vm:14456kB, anon-rss:6228kB, file-rss:512kB, shmem-rss:0kB [ 37.157222][ T9185] Memory cgroup out of memory: Killed process 9184 (a.out) total-vm:14456kB, anon-rss:6228kB, file-rss:508kB, shmem-rss:0kB [ 37.157259][ T9185] Memory cgroup out of memory: Killed process 9185 (a.out) total-vm:14456kB, anon-rss:6228kB, file-rss:512kB, shmem-rss:0kB [ 37.157291][ T9185] Memory cgroup out of memory: Killed process 9186 (a.out) total-vm:14456kB, anon-rss:4180kB, file-rss:508kB, shmem-rss:0kB [ 37.157306][ T54] oom_reaper: reaped process 9183 (a.out), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB [ 37.157328][ T9185] Memory cgroup out of memory: Killed process 9187 (a.out) total-vm:14456kB, anon-rss:4180kB, file-rss:512kB, shmem-rss:0kB [ 37.157452][ T9185] Memory cgroup out of memory: Killed process 9189 (a.out) total-vm:14456kB, anon-rss:6228kB, file-rss:512kB, shmem-rss:0kB [ 37.158733][ T9185] Memory cgroup out of memory: Killed process 9190 (a.out) total-vm:14456kB, anon-rss:552kB, file-rss:512kB, shmem-rss:0kB [ 37.160083][ T54] oom_reaper: reaped process 9186 (a.out), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB [ 37.160187][ T54] oom_reaper: reaped process 9189 (a.out), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB [ 37.206941][ T54] oom_reaper: reaped process 9185 (a.out), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB [ 37.212300][ T9185] Memory cgroup out of memory: Killed process 9191 (a.out) total-vm:14456kB, anon-rss:4180kB, file-rss:512kB, shmem-rss:0kB [ 37.212317][ T54] oom_reaper: reaped process 9190 (a.out), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB [ 37.218860][ T9185] Memory cgroup out of memory: Killed process 9192 (a.out) total-vm:14456kB, anon-rss:1080kB, file-rss:512kB, shmem-rss:0kB [ 37.227667][ T54] oom_reaper: reaped process 9192 (a.out), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB [ 37.292323][ T9193] abrt-hook-ccpp (9193) used greatest stack depth: 10480 bytes left [ 37.351843][ T1] Kernel panic - not syncing: Attempted to kill init! exitcode=0x0000008b [ 37.354833][ T1] CPU: 7 PID: 1 Comm: systemd Kdump: loaded Not tainted 5.0.0-rc4-next-20190131 #280 [ 37.357876][ T1] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 04/13/2018 [ 37.361685][ T1] Call Trace: [ 37.363239][ T1] dump_stack+0x67/0x95 [ 37.365010][ T1] panic+0xfc/0x2b0 [ 37.366853][ T1] do_exit+0xd55/0xd60 [ 37.368595][ T1] do_group_exit+0x47/0xc0 [ 37.370415][ T1] get_signal+0x32a/0x920 [ 37.372449][ T1] ? _raw_spin_unlock_irqrestore+0x3d/0x70 [ 37.374596][ T1] do_signal+0x32/0x6e0 [ 37.376430][ T1] ? exit_to_usermode_loop+0x26/0x9b [ 37.378418][ T1] ? prepare_exit_to_usermode+0xa8/0xd0 [ 37.380571][ T1] exit_to_usermode_loop+0x3e/0x9b [ 37.382588][ T1] prepare_exit_to_usermode+0xa8/0xd0 [ 37.384594][ T1] ? page_fault+0x8/0x30 [ 37.386453][ T1] retint_user+0x8/0x18 [ 37.388160][ T1] RIP: 0033:0x7f42c06974a8 [ 37.389922][ T1] Code: Bad RIP value. [ 37.391788][ T1] RSP: 002b:00007ffc3effd388 EFLAGS: 00010213 [ 37.394075][ T1] RAX: 000000000000000e RBX: 00007ffc3effd390 RCX: 0000000000000000 [ 37.396963][ T1] RDX: 000000000000002a RSI: 00007ffc3effd390 RDI: 0000000000000004 [ 37.399550][ T1] RBP: 00007ffc3effd680 R08: 0000000000000000 R09: 0000000000000000 [ 37.402334][ T1] R10: 00000000ffffffff R11: 0000000000000246 R12: 0000000000000001 [ 37.404890][ T1] R13: ffffffffffffffff R14: 0000000000000884 R15: 000056460b1ac3b0 Link: http://lkml.kernel.org/r/201902010336.x113a4EO027170@www262.sakura.ne.jp Fixes: 3d8b38eb81cac813 ("mm, oom: introduce memory.oom.group") Signed-off-by: Tetsuo Handa Acked-by: Michal Hocko Cc: Roman Gushchin Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 2d5135c1b1ba..3a2484884cfd 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -929,7 +929,8 @@ static void __oom_kill_process(struct task_struct *victim, const char *message) */ static int oom_kill_memcg_member(struct task_struct *task, void *message) { - if (task->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) { + if (task->signal->oom_score_adj != OOM_SCORE_ADJ_MIN && + !is_global_init(task)) { get_task_struct(task); __oom_kill_process(task, message); } From b9726c26dc21b15a2faea96fae3a42f2f7fffdcb Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 5 Mar 2019 15:48:26 -0800 Subject: [PATCH 119/159] numa: make "nr_node_ids" unsigned int Number of NUMA nodes can't be negative. This saves a few bytes on x86_64: add/remove: 0/0 grow/shrink: 4/21 up/down: 27/-265 (-238) Function old new delta hv_synic_alloc.cold 88 110 +22 prealloc_shrinker 260 262 +2 bootstrap 249 251 +2 sched_init_numa 1566 1567 +1 show_slab_objects 778 777 -1 s_show 1201 1200 -1 kmem_cache_init 346 345 -1 __alloc_workqueue_key 1146 1145 -1 mem_cgroup_css_alloc 1614 1612 -2 __do_sys_swapon 4702 4699 -3 __list_lru_init 655 651 -4 nic_probe 2379 2374 -5 store_user_store 118 111 -7 red_zone_store 106 99 -7 poison_store 106 99 -7 wq_numa_init 348 338 -10 __kmem_cache_empty 75 65 -10 task_numa_free 186 173 -13 merge_across_nodes_store 351 336 -15 irq_create_affinity_masks 1261 1246 -15 do_numa_crng_init 343 321 -22 task_numa_fault 4760 4737 -23 swapfile_init 179 156 -23 hv_synic_alloc 536 492 -44 apply_wqattrs_prepare 746 695 -51 Link: http://lkml.kernel.org/r/20190201223029.GA15820@avx2 Signed-off-by: Alexey Dobriyan Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/mm/numa.c | 2 +- arch/powerpc/mm/numa.c | 2 +- arch/x86/kernel/setup_percpu.c | 2 +- arch/x86/mm/numa.c | 4 ++-- include/linux/nodemask.h | 4 ++-- mm/list_lru.c | 3 +-- mm/memcontrol.c | 2 +- mm/page_alloc.c | 2 +- mm/slab.c | 3 +-- mm/slub.c | 2 +- mm/swapfile.c | 2 +- mm/vmscan.c | 2 +- 12 files changed, 14 insertions(+), 16 deletions(-) diff --git a/arch/arm64/mm/numa.c b/arch/arm64/mm/numa.c index ae34e3a1cef1..7a0a555b366a 100644 --- a/arch/arm64/mm/numa.c +++ b/arch/arm64/mm/numa.c @@ -120,7 +120,7 @@ static void __init setup_node_to_cpumask_map(void) } /* cpumask_of_node() will now work */ - pr_debug("Node to cpumask map for %d nodes\n", nr_node_ids); + pr_debug("Node to cpumask map for %u nodes\n", nr_node_ids); } /* diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index 270cefb75cca..df1e11ebbabb 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -84,7 +84,7 @@ static void __init setup_node_to_cpumask_map(void) alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]); /* cpumask_of_node() will now work */ - dbg("Node to cpumask map for %d nodes\n", nr_node_ids); + dbg("Node to cpumask map for %u nodes\n", nr_node_ids); } static int __init fake_numa_create_new_node(unsigned long end_pfn, diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index e8796fcd7e5a..13af08827eef 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -171,7 +171,7 @@ void __init setup_per_cpu_areas(void) unsigned long delta; int rc; - pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%u nr_node_ids:%d\n", + pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%u nr_node_ids:%u\n", NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids); /* diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 1308f5408bf7..12c1b7a83ed7 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -123,7 +123,7 @@ void __init setup_node_to_cpumask_map(void) alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]); /* cpumask_of_node() will now work */ - pr_debug("Node to cpumask map for %d nodes\n", nr_node_ids); + pr_debug("Node to cpumask map for %u nodes\n", nr_node_ids); } static int __init numa_add_memblk_to(int nid, u64 start, u64 end, @@ -866,7 +866,7 @@ const struct cpumask *cpumask_of_node(int node) { if (node >= nr_node_ids) { printk(KERN_WARNING - "cpumask_of_node(%d): node > nr_node_ids(%d)\n", + "cpumask_of_node(%d): node > nr_node_ids(%u)\n", node, nr_node_ids); dump_stack(); return cpu_none_mask; diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h index 5a30ad594ccc..962c5e783d50 100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h @@ -444,7 +444,7 @@ static inline int next_memory_node(int nid) return next_node(nid, node_states[N_MEMORY]); } -extern int nr_node_ids; +extern unsigned int nr_node_ids; extern int nr_online_nodes; static inline void node_set_online(int nid) @@ -485,7 +485,7 @@ static inline int num_node_state(enum node_states state) #define first_online_node 0 #define first_memory_node 0 #define next_online_node(nid) (MAX_NUMNODES) -#define nr_node_ids 1 +#define nr_node_ids 1U #define nr_online_nodes 1 #define node_set_online(node) node_set_state((node), N_ONLINE) diff --git a/mm/list_lru.c b/mm/list_lru.c index 5b30625fd365..0730bf8ff39f 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -601,7 +601,6 @@ int __list_lru_init(struct list_lru *lru, bool memcg_aware, struct lock_class_key *key, struct shrinker *shrinker) { int i; - size_t size = sizeof(*lru->node) * nr_node_ids; int err = -ENOMEM; #ifdef CONFIG_MEMCG_KMEM @@ -612,7 +611,7 @@ int __list_lru_init(struct list_lru *lru, bool memcg_aware, #endif memcg_get_cache_ids(); - lru->node = kzalloc(size, GFP_KERNEL); + lru->node = kcalloc(nr_node_ids, sizeof(*lru->node), GFP_KERNEL); if (!lru->node) goto out; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 30bda8d7fb5c..45cd1f84268a 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4429,7 +4429,7 @@ static void mem_cgroup_free(struct mem_cgroup *memcg) static struct mem_cgroup *mem_cgroup_alloc(void) { struct mem_cgroup *memcg; - size_t size; + unsigned int size; int node; size = sizeof(struct mem_cgroup); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 11a5f50efd97..8df43caf2eb7 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -289,7 +289,7 @@ EXPORT_SYMBOL(movable_zone); #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ #if MAX_NUMNODES > 1 -int nr_node_ids __read_mostly = MAX_NUMNODES; +unsigned int nr_node_ids __read_mostly = MAX_NUMNODES; int nr_online_nodes __read_mostly = 1; EXPORT_SYMBOL(nr_node_ids); EXPORT_SYMBOL(nr_online_nodes); diff --git a/mm/slab.c b/mm/slab.c index 757e646baa5d..7510a1b489df 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -677,12 +677,11 @@ static struct alien_cache *__alloc_alien_cache(int node, int entries, static struct alien_cache **alloc_alien_cache(int node, int limit, gfp_t gfp) { struct alien_cache **alc_ptr; - size_t memsize = sizeof(void *) * nr_node_ids; int i; if (limit > 1) limit = 12; - alc_ptr = kzalloc_node(memsize, gfp, node); + alc_ptr = kcalloc_node(nr_node_ids, sizeof(void *), gfp, node); if (!alc_ptr) return NULL; diff --git a/mm/slub.c b/mm/slub.c index 017a2ce5ba23..1b08fbcb7e61 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -4262,7 +4262,7 @@ void __init kmem_cache_init(void) cpuhp_setup_state_nocalls(CPUHP_SLUB_DEAD, "slub:dead", NULL, slub_cpu_dead); - pr_info("SLUB: HWalign=%d, Order=%u-%u, MinObjects=%u, CPUs=%u, Nodes=%d\n", + pr_info("SLUB: HWalign=%d, Order=%u-%u, MinObjects=%u, CPUs=%u, Nodes=%u\n", cache_line_size(), slub_min_order, slub_max_order, slub_min_objects, nr_cpu_ids, nr_node_ids); diff --git a/mm/swapfile.c b/mm/swapfile.c index 57e9b1b31d55..a14257ac0476 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2713,7 +2713,7 @@ static struct swap_info_struct *alloc_swap_info(void) struct swap_info_struct *p; unsigned int type; int i; - int size = sizeof(*p) + nr_node_ids * sizeof(struct plist_node); + unsigned int size = sizeof(*p) + nr_node_ids * sizeof(struct plist_node); p = kvzalloc(size, GFP_KERNEL); if (!p) diff --git a/mm/vmscan.c b/mm/vmscan.c index 209c2c78a087..e1f7ccdc0a90 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -374,7 +374,7 @@ unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone */ int prealloc_shrinker(struct shrinker *shrinker) { - size_t size = sizeof(*shrinker->nr_deferred); + unsigned int size = sizeof(*shrinker->nr_deferred); if (shrinker->flags & SHRINKER_NUMA_AWARE) size *= nr_node_ids; From ce0725f78a56a59bdb07cef003bc6fef722da38e Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 5 Mar 2019 15:48:29 -0800 Subject: [PATCH 120/159] numa: make "nr_online_nodes" unsigned int Number of online NUMA nodes can't be negative as well. This doesn't save space as the variable is used only in 32-bit context, but do it anyway for consistency. Link: http://lkml.kernel.org/r/20190201223151.GB15820@avx2 Signed-off-by: Alexey Dobriyan Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/nodemask.h | 4 ++-- mm/page_alloc.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h index 962c5e783d50..27e7fa36f707 100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h @@ -445,7 +445,7 @@ static inline int next_memory_node(int nid) } extern unsigned int nr_node_ids; -extern int nr_online_nodes; +extern unsigned int nr_online_nodes; static inline void node_set_online(int nid) { @@ -486,7 +486,7 @@ static inline int num_node_state(enum node_states state) #define first_memory_node 0 #define next_online_node(nid) (MAX_NUMNODES) #define nr_node_ids 1U -#define nr_online_nodes 1 +#define nr_online_nodes 1U #define node_set_online(node) node_set_state((node), N_ONLINE) #define node_set_offline(node) node_clear_state((node), N_ONLINE) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8df43caf2eb7..c29828ec9183 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -290,7 +290,7 @@ EXPORT_SYMBOL(movable_zone); #if MAX_NUMNODES > 1 unsigned int nr_node_ids __read_mostly = MAX_NUMNODES; -int nr_online_nodes __read_mostly = 1; +unsigned int nr_online_nodes __read_mostly = 1; EXPORT_SYMBOL(nr_node_ids); EXPORT_SYMBOL(nr_online_nodes); #endif @@ -5664,7 +5664,7 @@ void __ref build_all_zonelists(pg_data_t *pgdat) else page_group_by_mobility_disabled = 0; - pr_info("Built %i zonelists, mobility grouping %s. Total pages: %ld\n", + pr_info("Built %u zonelists, mobility grouping %s. Total pages: %ld\n", nr_online_nodes, page_group_by_mobility_disabled ? "off" : "on", vm_total_pages); From 6d2bef9df7ccf3a2db0160be24f8b92a3f24708a Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Tue, 5 Mar 2019 15:48:33 -0800 Subject: [PATCH 121/159] mm/page_poison: update comment after code moved mm/debug-pagealloc.c is no more, so of course header now needs to be updated. This seems like something checkpatch should be able to catch - worth looking into? Link: http://lkml.kernel.org/r/20190207191113.14039-1-mst@redhat.com Fixes: 8823b1dbc05f ("mm/page_poison.c: enable PAGE_POISONING as a separate option") Signed-off-by: Michael S. Tsirkin Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/poison.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/poison.h b/include/linux/poison.h index 15927ebc22f2..5046bad0c1c5 100644 --- a/include/linux/poison.h +++ b/include/linux/poison.h @@ -30,7 +30,7 @@ */ #define TIMER_ENTRY_STATIC ((void *) 0x300 + POISON_POINTER_DELTA) -/********** mm/debug-pagealloc.c **********/ +/********** mm/page_poison.c **********/ #ifdef CONFIG_PAGE_POISONING_ZERO #define PAGE_POISON 0x00 #else From 92eac16819e47ab919bd8f28ed49f8fadad0954e Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Tue, 5 Mar 2019 15:48:36 -0800 Subject: [PATCH 122/159] docs/mm: vmalloc: re-indent kernel-doc comemnts Some kernel-doc comments in mm/vmalloc.c have leading tab in indentation. This leads to excessive indentation in the generated HTML and to the inconsistency of its layout ([1] vs [2]). Besides, multi-line Note: sections are not handled properly with extra indentation. [1] https://www.kernel.org/doc/html/v4.20/core-api/mm-api.html?#c.vm_map_ram [2] https://www.kernel.org/doc/html/v4.20/core-api/mm-api.html?#c.vfree Link: http://lkml.kernel.org/r/1549549644-4903-2-git-send-email-rppt@linux.ibm.com Signed-off-by: Mike Rapoport Reviewed-by: Andrew Morton Cc: Jonathan Corbet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 345 +++++++++++++++++++++++++-------------------------- 1 file changed, 171 insertions(+), 174 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 77006fa1a90b..03cbba890301 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1191,6 +1191,7 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t pro EXPORT_SYMBOL(vm_map_ram); static struct vm_struct *vmlist __initdata; + /** * vm_area_add_early - add vmap area early during boot * @vm: vm_struct to add @@ -1425,13 +1426,13 @@ struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags, } /** - * get_vm_area - reserve a contiguous kernel virtual area - * @size: size of the area - * @flags: %VM_IOREMAP for I/O mappings or VM_ALLOC + * get_vm_area - reserve a contiguous kernel virtual area + * @size: size of the area + * @flags: %VM_IOREMAP for I/O mappings or VM_ALLOC * - * Search an area of @size in the kernel virtual mapping area, - * and reserved it for out purposes. Returns the area descriptor - * on success or %NULL on failure. + * Search an area of @size in the kernel virtual mapping area, + * and reserved it for out purposes. Returns the area descriptor + * on success or %NULL on failure. */ struct vm_struct *get_vm_area(unsigned long size, unsigned long flags) { @@ -1448,12 +1449,12 @@ struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, } /** - * find_vm_area - find a continuous kernel virtual area - * @addr: base address + * find_vm_area - find a continuous kernel virtual area + * @addr: base address * - * Search for the kernel VM area starting at @addr, and return it. - * It is up to the caller to do all required locking to keep the returned - * pointer valid. + * Search for the kernel VM area starting at @addr, and return it. + * It is up to the caller to do all required locking to keep the returned + * pointer valid. */ struct vm_struct *find_vm_area(const void *addr) { @@ -1467,12 +1468,12 @@ struct vm_struct *find_vm_area(const void *addr) } /** - * remove_vm_area - find and remove a continuous kernel virtual area - * @addr: base address + * remove_vm_area - find and remove a continuous kernel virtual area + * @addr: base address * - * Search for the kernel VM area starting at @addr, and remove it. - * This function returns the found VM area, but using it is NOT safe - * on SMP machines, except for its size or flags. + * Search for the kernel VM area starting at @addr, and remove it. + * This function returns the found VM area, but using it is NOT safe + * on SMP machines, except for its size or flags. */ struct vm_struct *remove_vm_area(const void *addr) { @@ -1552,11 +1553,11 @@ static inline void __vfree_deferred(const void *addr) } /** - * vfree_atomic - release memory allocated by vmalloc() - * @addr: memory base address + * vfree_atomic - release memory allocated by vmalloc() + * @addr: memory base address * - * This one is just like vfree() but can be called in any atomic context - * except NMIs. + * This one is just like vfree() but can be called in any atomic context + * except NMIs. */ void vfree_atomic(const void *addr) { @@ -1578,20 +1579,20 @@ static void __vfree(const void *addr) } /** - * vfree - release memory allocated by vmalloc() - * @addr: memory base address + * vfree - release memory allocated by vmalloc() + * @addr: memory base address * - * Free the virtually continuous memory area starting at @addr, as - * obtained from vmalloc(), vmalloc_32() or __vmalloc(). If @addr is - * NULL, no operation is performed. + * Free the virtually continuous memory area starting at @addr, as + * obtained from vmalloc(), vmalloc_32() or __vmalloc(). If @addr is + * NULL, no operation is performed. * - * Must not be called in NMI context (strictly speaking, only if we don't - * have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling - * conventions for vfree() arch-depenedent would be a really bad idea) + * Must not be called in NMI context (strictly speaking, only if we don't + * have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling + * conventions for vfree() arch-depenedent would be a really bad idea) * - * May sleep if called *not* from interrupt context. + * May sleep if called *not* from interrupt context. * - * NOTE: assumes that the object at @addr has a size >= sizeof(llist_node) + * NOTE: assumes that the object at @addr has a size >= sizeof(llist_node) */ void vfree(const void *addr) { @@ -1609,13 +1610,13 @@ void vfree(const void *addr) EXPORT_SYMBOL(vfree); /** - * vunmap - release virtual mapping obtained by vmap() - * @addr: memory base address + * vunmap - release virtual mapping obtained by vmap() + * @addr: memory base address * - * Free the virtually contiguous memory area starting at @addr, - * which was created from the page array passed to vmap(). + * Free the virtually contiguous memory area starting at @addr, + * which was created from the page array passed to vmap(). * - * Must not be called in interrupt context. + * Must not be called in interrupt context. */ void vunmap(const void *addr) { @@ -1627,17 +1628,17 @@ void vunmap(const void *addr) EXPORT_SYMBOL(vunmap); /** - * vmap - map an array of pages into virtually contiguous space - * @pages: array of page pointers - * @count: number of pages to map - * @flags: vm_area->flags - * @prot: page protection for the mapping + * vmap - map an array of pages into virtually contiguous space + * @pages: array of page pointers + * @count: number of pages to map + * @flags: vm_area->flags + * @prot: page protection for the mapping * - * Maps @count pages from @pages into contiguous kernel virtual - * space. + * Maps @count pages from @pages into contiguous kernel virtual + * space. */ void *vmap(struct page **pages, unsigned int count, - unsigned long flags, pgprot_t prot) + unsigned long flags, pgprot_t prot) { struct vm_struct *area; unsigned long size; /* In bytes */ @@ -1724,20 +1725,20 @@ fail: } /** - * __vmalloc_node_range - allocate virtually contiguous memory - * @size: allocation size - * @align: desired alignment - * @start: vm area range start - * @end: vm area range end - * @gfp_mask: flags for the page level allocator - * @prot: protection mask for the allocated pages - * @vm_flags: additional vm area flags (e.g. %VM_NO_GUARD) - * @node: node to use for allocation or NUMA_NO_NODE - * @caller: caller's return address + * __vmalloc_node_range - allocate virtually contiguous memory + * @size: allocation size + * @align: desired alignment + * @start: vm area range start + * @end: vm area range end + * @gfp_mask: flags for the page level allocator + * @prot: protection mask for the allocated pages + * @vm_flags: additional vm area flags (e.g. %VM_NO_GUARD) + * @node: node to use for allocation or NUMA_NO_NODE + * @caller: caller's return address * - * Allocate enough pages to cover @size from the page level - * allocator with @gfp_mask flags. Map them into contiguous - * kernel virtual space, using a pagetable protection of @prot. + * Allocate enough pages to cover @size from the page level + * allocator with @gfp_mask flags. Map them into contiguous + * kernel virtual space, using a pagetable protection of @prot. */ void *__vmalloc_node_range(unsigned long size, unsigned long align, unsigned long start, unsigned long end, gfp_t gfp_mask, @@ -1788,24 +1789,23 @@ EXPORT_SYMBOL_GPL(__vmalloc_node_range); #endif /** - * __vmalloc_node - allocate virtually contiguous memory - * @size: allocation size - * @align: desired alignment - * @gfp_mask: flags for the page level allocator - * @prot: protection mask for the allocated pages - * @node: node to use for allocation or NUMA_NO_NODE - * @caller: caller's return address + * __vmalloc_node - allocate virtually contiguous memory + * @size: allocation size + * @align: desired alignment + * @gfp_mask: flags for the page level allocator + * @prot: protection mask for the allocated pages + * @node: node to use for allocation or NUMA_NO_NODE + * @caller: caller's return address * - * Allocate enough pages to cover @size from the page level - * allocator with @gfp_mask flags. Map them into contiguous - * kernel virtual space, using a pagetable protection of @prot. + * Allocate enough pages to cover @size from the page level + * allocator with @gfp_mask flags. Map them into contiguous + * kernel virtual space, using a pagetable protection of @prot. * - * Reclaim modifiers in @gfp_mask - __GFP_NORETRY, __GFP_RETRY_MAYFAIL - * and __GFP_NOFAIL are not supported - * - * Any use of gfp flags outside of GFP_KERNEL should be consulted - * with mm people. + * Reclaim modifiers in @gfp_mask - __GFP_NORETRY, __GFP_RETRY_MAYFAIL + * and __GFP_NOFAIL are not supported * + * Any use of gfp flags outside of GFP_KERNEL should be consulted + * with mm people. */ static void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask, pgprot_t prot, @@ -1837,13 +1837,14 @@ void *__vmalloc_node_flags_caller(unsigned long size, int node, gfp_t flags, } /** - * vmalloc - allocate virtually contiguous memory - * @size: allocation size - * Allocate enough pages to cover @size from the page level - * allocator and map them into contiguous kernel virtual space. + * vmalloc - allocate virtually contiguous memory + * @size: allocation size * - * For tight control over page level allocator and protection flags - * use __vmalloc() instead. + * Allocate enough pages to cover @size from the page level + * allocator and map them into contiguous kernel virtual space. + * + * For tight control over page level allocator and protection flags + * use __vmalloc() instead. */ void *vmalloc(unsigned long size) { @@ -1853,14 +1854,15 @@ void *vmalloc(unsigned long size) EXPORT_SYMBOL(vmalloc); /** - * vzalloc - allocate virtually contiguous memory with zero fill - * @size: allocation size - * Allocate enough pages to cover @size from the page level - * allocator and map them into contiguous kernel virtual space. - * The memory allocated is set to zero. + * vzalloc - allocate virtually contiguous memory with zero fill + * @size: allocation size * - * For tight control over page level allocator and protection flags - * use __vmalloc() instead. + * Allocate enough pages to cover @size from the page level + * allocator and map them into contiguous kernel virtual space. + * The memory allocated is set to zero. + * + * For tight control over page level allocator and protection flags + * use __vmalloc() instead. */ void *vzalloc(unsigned long size) { @@ -1886,15 +1888,15 @@ void *vmalloc_user(unsigned long size) EXPORT_SYMBOL(vmalloc_user); /** - * vmalloc_node - allocate memory on a specific node - * @size: allocation size - * @node: numa node + * vmalloc_node - allocate memory on a specific node + * @size: allocation size + * @node: numa node * - * Allocate enough pages to cover @size from the page level - * allocator and map them into contiguous kernel virtual space. + * Allocate enough pages to cover @size from the page level + * allocator and map them into contiguous kernel virtual space. * - * For tight control over page level allocator and protection flags - * use __vmalloc() instead. + * For tight control over page level allocator and protection flags + * use __vmalloc() instead. */ void *vmalloc_node(unsigned long size, int node) { @@ -1923,17 +1925,16 @@ void *vzalloc_node(unsigned long size, int node) EXPORT_SYMBOL(vzalloc_node); /** - * vmalloc_exec - allocate virtually contiguous, executable memory - * @size: allocation size + * vmalloc_exec - allocate virtually contiguous, executable memory + * @size: allocation size * - * Kernel-internal function to allocate enough pages to cover @size - * the page level allocator and map them into contiguous and - * executable kernel virtual space. + * Kernel-internal function to allocate enough pages to cover @size + * the page level allocator and map them into contiguous and + * executable kernel virtual space. * - * For tight control over page level allocator and protection flags - * use __vmalloc() instead. + * For tight control over page level allocator and protection flags + * use __vmalloc() instead. */ - void *vmalloc_exec(unsigned long size) { return __vmalloc_node(size, 1, GFP_KERNEL, PAGE_KERNEL_EXEC, @@ -1953,11 +1954,11 @@ void *vmalloc_exec(unsigned long size) #endif /** - * vmalloc_32 - allocate virtually contiguous memory (32bit addressable) - * @size: allocation size + * vmalloc_32 - allocate virtually contiguous memory (32bit addressable) + * @size: allocation size * - * Allocate enough 32bit PA addressable pages to cover @size from the - * page level allocator and map them into contiguous kernel virtual space. + * Allocate enough 32bit PA addressable pages to cover @size from the + * page level allocator and map them into contiguous kernel virtual space. */ void *vmalloc_32(unsigned long size) { @@ -1968,7 +1969,7 @@ EXPORT_SYMBOL(vmalloc_32); /** * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory - * @size: allocation size + * @size: allocation size * * The resulting memory area is 32bit addressable and zeroed so it can be * mapped to userspace without leaking data. @@ -2064,31 +2065,29 @@ static int aligned_vwrite(char *buf, char *addr, unsigned long count) } /** - * vread() - read vmalloc area in a safe way. - * @buf: buffer for reading data - * @addr: vm address. - * @count: number of bytes to be read. + * vread() - read vmalloc area in a safe way. + * @buf: buffer for reading data + * @addr: vm address. + * @count: number of bytes to be read. * - * Returns # of bytes which addr and buf should be increased. - * (same number to @count). Returns 0 if [addr...addr+count) doesn't - * includes any intersect with alive vmalloc area. + * Returns # of bytes which addr and buf should be increased. + * (same number to @count). Returns 0 if [addr...addr+count) doesn't + * includes any intersect with alive vmalloc area. * - * This function checks that addr is a valid vmalloc'ed area, and - * copy data from that area to a given buffer. If the given memory range - * of [addr...addr+count) includes some valid address, data is copied to - * proper area of @buf. If there are memory holes, they'll be zero-filled. - * IOREMAP area is treated as memory hole and no copy is done. + * This function checks that addr is a valid vmalloc'ed area, and + * copy data from that area to a given buffer. If the given memory range + * of [addr...addr+count) includes some valid address, data is copied to + * proper area of @buf. If there are memory holes, they'll be zero-filled. + * IOREMAP area is treated as memory hole and no copy is done. * - * If [addr...addr+count) doesn't includes any intersects with alive - * vm_struct area, returns 0. @buf should be kernel's buffer. - * - * Note: In usual ops, vread() is never necessary because the caller - * should know vmalloc() area is valid and can use memcpy(). - * This is for routines which have to access vmalloc area without - * any informaion, as /dev/kmem. + * If [addr...addr+count) doesn't includes any intersects with alive + * vm_struct area, returns 0. @buf should be kernel's buffer. * + * Note: In usual ops, vread() is never necessary because the caller + * should know vmalloc() area is valid and can use memcpy(). + * This is for routines which have to access vmalloc area without + * any informaion, as /dev/kmem. */ - long vread(char *buf, char *addr, unsigned long count) { struct vmap_area *va; @@ -2145,31 +2144,30 @@ finished: } /** - * vwrite() - write vmalloc area in a safe way. - * @buf: buffer for source data - * @addr: vm address. - * @count: number of bytes to be read. + * vwrite() - write vmalloc area in a safe way. + * @buf: buffer for source data + * @addr: vm address. + * @count: number of bytes to be read. * - * Returns # of bytes which addr and buf should be incresed. - * (same number to @count). - * If [addr...addr+count) doesn't includes any intersect with valid - * vmalloc area, returns 0. + * Returns # of bytes which addr and buf should be incresed. + * (same number to @count). + * If [addr...addr+count) doesn't includes any intersect with valid + * vmalloc area, returns 0. * - * This function checks that addr is a valid vmalloc'ed area, and - * copy data from a buffer to the given addr. If specified range of - * [addr...addr+count) includes some valid address, data is copied from - * proper area of @buf. If there are memory holes, no copy to hole. - * IOREMAP area is treated as memory hole and no copy is done. + * This function checks that addr is a valid vmalloc'ed area, and + * copy data from a buffer to the given addr. If specified range of + * [addr...addr+count) includes some valid address, data is copied from + * proper area of @buf. If there are memory holes, no copy to hole. + * IOREMAP area is treated as memory hole and no copy is done. * - * If [addr...addr+count) doesn't includes any intersects with alive - * vm_struct area, returns 0. @buf should be kernel's buffer. + * If [addr...addr+count) doesn't includes any intersects with alive + * vm_struct area, returns 0. @buf should be kernel's buffer. * - * Note: In usual ops, vwrite() is never necessary because the caller - * should know vmalloc() area is valid and can use memcpy(). - * This is for routines which have to access vmalloc area without - * any informaion, as /dev/kmem. + * Note: In usual ops, vwrite() is never necessary because the caller + * should know vmalloc() area is valid and can use memcpy(). + * This is for routines which have to access vmalloc area without + * any informaion, as /dev/kmem. */ - long vwrite(char *buf, char *addr, unsigned long count) { struct vmap_area *va; @@ -2221,20 +2219,20 @@ finished: } /** - * remap_vmalloc_range_partial - map vmalloc pages to userspace - * @vma: vma to cover - * @uaddr: target user address to start at - * @kaddr: virtual address of vmalloc kernel memory - * @size: size of map area + * remap_vmalloc_range_partial - map vmalloc pages to userspace + * @vma: vma to cover + * @uaddr: target user address to start at + * @kaddr: virtual address of vmalloc kernel memory + * @size: size of map area * - * Returns: 0 for success, -Exxx on failure + * Returns: 0 for success, -Exxx on failure * - * This function checks that @kaddr is a valid vmalloc'ed area, - * and that it is big enough to cover the range starting at - * @uaddr in @vma. Will return failure if that criteria isn't - * met. + * This function checks that @kaddr is a valid vmalloc'ed area, + * and that it is big enough to cover the range starting at + * @uaddr in @vma. Will return failure if that criteria isn't + * met. * - * Similar to remap_pfn_range() (see mm/memory.c) + * Similar to remap_pfn_range() (see mm/memory.c) */ int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr, void *kaddr, unsigned long size) @@ -2276,18 +2274,18 @@ int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr, EXPORT_SYMBOL(remap_vmalloc_range_partial); /** - * remap_vmalloc_range - map vmalloc pages to userspace - * @vma: vma to cover (map full range of vma) - * @addr: vmalloc memory - * @pgoff: number of pages into addr before first page to map + * remap_vmalloc_range - map vmalloc pages to userspace + * @vma: vma to cover (map full range of vma) + * @addr: vmalloc memory + * @pgoff: number of pages into addr before first page to map * - * Returns: 0 for success, -Exxx on failure + * Returns: 0 for success, -Exxx on failure * - * This function checks that addr is a valid vmalloc'ed area, and - * that it is big enough to cover the vma. Will return failure if - * that criteria isn't met. + * This function checks that addr is a valid vmalloc'ed area, and + * that it is big enough to cover the vma. Will return failure if + * that criteria isn't met. * - * Similar to remap_pfn_range() (see mm/memory.c) + * Similar to remap_pfn_range() (see mm/memory.c) */ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, unsigned long pgoff) @@ -2319,18 +2317,18 @@ static int f(pte_t *pte, pgtable_t table, unsigned long addr, void *data) } /** - * alloc_vm_area - allocate a range of kernel address space - * @size: size of the area - * @ptes: returns the PTEs for the address space + * alloc_vm_area - allocate a range of kernel address space + * @size: size of the area + * @ptes: returns the PTEs for the address space * - * Returns: NULL on failure, vm_struct on success + * Returns: NULL on failure, vm_struct on success * - * This function reserves a range of kernel address space, and - * allocates pagetables to map that range. No actual mappings - * are created. + * This function reserves a range of kernel address space, and + * allocates pagetables to map that range. No actual mappings + * are created. * - * If @ptes is non-NULL, pointers to the PTEs (in init_mm) - * allocated for the VM area are returned. + * If @ptes is non-NULL, pointers to the PTEs (in init_mm) + * allocated for the VM area are returned. */ struct vm_struct *alloc_vm_area(size_t size, pte_t **ptes) { @@ -2756,4 +2754,3 @@ static int __init proc_vmalloc_init(void) module_init(proc_vmalloc_init); #endif - From bc8ff3ca6589d63c6d10f5ee8bed38f74851b469 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Tue, 5 Mar 2019 15:48:39 -0800 Subject: [PATCH 123/159] docs/core-api/mm: fix user memory accessors formatting The descriptions of userspace memory access functions had minor issues with formatting that made kernel-doc unable to properly detect the function/macro names and the return value sections: ./arch/x86/include/asm/uaccess.h:80: info: Scanning doc for ./arch/x86/include/asm/uaccess.h:139: info: Scanning doc for ./arch/x86/include/asm/uaccess.h:231: info: Scanning doc for ./arch/x86/include/asm/uaccess.h:505: info: Scanning doc for ./arch/x86/include/asm/uaccess.h:530: info: Scanning doc for ./arch/x86/lib/usercopy_32.c:58: info: Scanning doc for ./arch/x86/lib/usercopy_32.c:69: warning: No description found for return value of 'clear_user' ./arch/x86/lib/usercopy_32.c:78: info: Scanning doc for ./arch/x86/lib/usercopy_32.c:90: warning: No description found for return value of '__clear_user' Fix the formatting. Link: http://lkml.kernel.org/r/1549549644-4903-3-git-send-email-rppt@linux.ibm.com Signed-off-by: Mike Rapoport Reviewed-by: Andrew Morton Cc: Jonathan Corbet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/uaccess.h | 24 ++++++++++++------------ arch/x86/lib/usercopy_32.c | 8 ++++---- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 5e49a0acb5ee..62004d22524a 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -75,7 +75,7 @@ static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, un #endif /** - * access_ok: - Checks if a user space pointer is valid + * access_ok - Checks if a user space pointer is valid * @addr: User space pointer to start of block to check * @size: Size of block to check * @@ -84,12 +84,12 @@ static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, un * * Checks if a pointer to a block of memory in user space is valid. * - * Returns true (nonzero) if the memory block may be valid, false (zero) - * if it is definitely invalid. - * * Note that, depending on architecture, this function probably just * checks that the pointer is in the user space range - after calling * this function, memory access functions may still return -EFAULT. + * + * Return: true (nonzero) if the memory block may be valid, false (zero) + * if it is definitely invalid. */ #define access_ok(addr, size) \ ({ \ @@ -134,7 +134,7 @@ extern int __get_user_bad(void); __typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL)) /** - * get_user: - Get a simple variable from user space. + * get_user - Get a simple variable from user space. * @x: Variable to store result. * @ptr: Source address, in user space. * @@ -148,7 +148,7 @@ __typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL)) * @ptr must have pointer-to-simple-variable type, and the result of * dereferencing @ptr must be assignable to @x without a cast. * - * Returns zero on success, or -EFAULT on error. + * Return: zero on success, or -EFAULT on error. * On error, the variable @x is set to zero. */ /* @@ -226,7 +226,7 @@ extern void __put_user_4(void); extern void __put_user_8(void); /** - * put_user: - Write a simple value into user space. + * put_user - Write a simple value into user space. * @x: Value to copy to user space. * @ptr: Destination address, in user space. * @@ -240,7 +240,7 @@ extern void __put_user_8(void); * @ptr must have pointer-to-simple-variable type, and @x must be assignable * to the result of dereferencing @ptr. * - * Returns zero on success, or -EFAULT on error. + * Return: zero on success, or -EFAULT on error. */ #define put_user(x, ptr) \ ({ \ @@ -502,7 +502,7 @@ struct __large_struct { unsigned long buf[100]; }; } while (0) /** - * __get_user: - Get a simple variable from user space, with less checking. + * __get_user - Get a simple variable from user space, with less checking. * @x: Variable to store result. * @ptr: Source address, in user space. * @@ -519,7 +519,7 @@ struct __large_struct { unsigned long buf[100]; }; * Caller must check the pointer with access_ok() before calling this * function. * - * Returns zero on success, or -EFAULT on error. + * Return: zero on success, or -EFAULT on error. * On error, the variable @x is set to zero. */ @@ -527,7 +527,7 @@ struct __large_struct { unsigned long buf[100]; }; __get_user_nocheck((x), (ptr), sizeof(*(ptr))) /** - * __put_user: - Write a simple value into user space, with less checking. + * __put_user - Write a simple value into user space, with less checking. * @x: Value to copy to user space. * @ptr: Destination address, in user space. * @@ -544,7 +544,7 @@ struct __large_struct { unsigned long buf[100]; }; * Caller must check the pointer with access_ok() before calling this * function. * - * Returns zero on success, or -EFAULT on error. + * Return: zero on success, or -EFAULT on error. */ #define __put_user(x, ptr) \ diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c index bfd94e7812fc..7d290777246d 100644 --- a/arch/x86/lib/usercopy_32.c +++ b/arch/x86/lib/usercopy_32.c @@ -54,13 +54,13 @@ do { \ } while (0) /** - * clear_user: - Zero a block of memory in user space. + * clear_user - Zero a block of memory in user space. * @to: Destination address, in user space. * @n: Number of bytes to zero. * * Zero a block of memory in user space. * - * Returns number of bytes that could not be cleared. + * Return: number of bytes that could not be cleared. * On success, this will be zero. */ unsigned long @@ -74,14 +74,14 @@ clear_user(void __user *to, unsigned long n) EXPORT_SYMBOL(clear_user); /** - * __clear_user: - Zero a block of memory in user space, with less checking. + * __clear_user - Zero a block of memory in user space, with less checking. * @to: Destination address, in user space. * @n: Number of bytes to zero. * * Zero a block of memory in user space. Caller must check * the specified block with access_ok() before calling this function. * - * Returns number of bytes that could not be cleared. + * Return: number of bytes that could not be cleared. * On success, this will be zero. */ unsigned long From a862f68a8b360086f248cbc3606029441b5f5197 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Tue, 5 Mar 2019 15:48:42 -0800 Subject: [PATCH 124/159] docs/core-api/mm: fix return value descriptions in mm/ Many kernel-doc comments in mm/ have the return value descriptions either misformatted or omitted at all which makes kernel-doc script unhappy: $ make V=1 htmldocs ... ./mm/util.c:36: info: Scanning doc for kstrdup ./mm/util.c:41: warning: No description found for return value of 'kstrdup' ./mm/util.c:57: info: Scanning doc for kstrdup_const ./mm/util.c:66: warning: No description found for return value of 'kstrdup_const' ./mm/util.c:75: info: Scanning doc for kstrndup ./mm/util.c:83: warning: No description found for return value of 'kstrndup' ... Fixing the formatting and adding the missing return value descriptions eliminates ~100 such warnings. Link: http://lkml.kernel.org/r/1549549644-4903-4-git-send-email-rppt@linux.ibm.com Signed-off-by: Mike Rapoport Reviewed-by: Andrew Morton Cc: Jonathan Corbet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/dmapool.c | 13 +++++--- mm/filemap.c | 73 ++++++++++++++++++++++++++++++++++++++------- mm/memory.c | 26 +++++++++++----- mm/mempool.c | 8 +++++ mm/page-writeback.c | 24 ++++++++++----- mm/page_alloc.c | 24 +++++++++++---- mm/readahead.c | 2 ++ mm/slab.c | 14 +++++++++ mm/slab_common.c | 6 ++++ mm/truncate.c | 6 ++-- mm/util.c | 37 ++++++++++++++++------- mm/vmalloc.c | 47 ++++++++++++++++++++++------- 12 files changed, 221 insertions(+), 59 deletions(-) diff --git a/mm/dmapool.c b/mm/dmapool.c index 6d4b97e7e9e9..76a160083506 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -114,10 +114,9 @@ static DEVICE_ATTR(pools, 0444, show_pools, NULL); * @size: size of the blocks in this pool. * @align: alignment requirement for blocks; must be a power of two * @boundary: returned blocks won't cross this power of two boundary - * Context: !in_interrupt() + * Context: not in_interrupt() * - * Returns a dma allocation pool with the requested characteristics, or - * null if one can't be created. Given one of these pools, dma_pool_alloc() + * Given one of these pools, dma_pool_alloc() * may be used to allocate memory. Such memory will all have "consistent" * DMA mappings, accessible by the device and its driver without using * cache flushing primitives. The actual size of blocks allocated may be @@ -127,6 +126,9 @@ static DEVICE_ATTR(pools, 0444, show_pools, NULL); * cross that size boundary. This is useful for devices which have * addressing restrictions on individual DMA transfers, such as not crossing * boundaries of 4KBytes. + * + * Return: a dma allocation pool with the requested characteristics, or + * %NULL if one can't be created. */ struct dma_pool *dma_pool_create(const char *name, struct device *dev, size_t size, size_t align, size_t boundary) @@ -313,7 +315,7 @@ EXPORT_SYMBOL(dma_pool_destroy); * @mem_flags: GFP_* bitmask * @handle: pointer to dma address of block * - * This returns the kernel virtual address of a currently unused block, + * Return: the kernel virtual address of a currently unused block, * and reports its dma address through the handle. * If such a memory block can't be allocated, %NULL is returned. */ @@ -498,6 +500,9 @@ static int dmam_pool_match(struct device *dev, void *res, void *match_data) * * Managed dma_pool_create(). DMA pool created with this function is * automatically destroyed on driver detach. + * + * Return: a managed dma allocation pool with the requested + * characteristics, or %NULL if one can't be created. */ struct dma_pool *dmam_pool_create(const char *name, struct device *dev, size_t size, size_t align, size_t allocation) diff --git a/mm/filemap.c b/mm/filemap.c index e59fdecdab74..ae0022f6106d 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -392,6 +392,8 @@ static int filemap_check_and_keep_errors(struct address_space *mapping) * opposed to a regular memory cleansing writeback. The difference between * these two operations is that if a dirty page/buffer is encountered, it must * be waited upon, and not just skipped over. + * + * Return: %0 on success, negative error code otherwise. */ int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start, loff_t end, int sync_mode) @@ -438,6 +440,8 @@ EXPORT_SYMBOL(filemap_fdatawrite_range); * * This is a mostly non-blocking flush. Not suitable for data-integrity * purposes - I/O may not be started against all dirty pages. + * + * Return: %0 on success, negative error code otherwise. */ int filemap_flush(struct address_space *mapping) { @@ -453,6 +457,9 @@ EXPORT_SYMBOL(filemap_flush); * * Find at least one page in the range supplied, usually used to check if * direct writing in this range will trigger a writeback. + * + * Return: %true if at least one page exists in the specified range, + * %false otherwise. */ bool filemap_range_has_page(struct address_space *mapping, loff_t start_byte, loff_t end_byte) @@ -529,6 +536,8 @@ static void __filemap_fdatawait_range(struct address_space *mapping, * Since the error status of the address space is cleared by this function, * callers are responsible for checking the return value and handling and/or * reporting the error. + * + * Return: error status of the address space. */ int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte, loff_t end_byte) @@ -551,6 +560,8 @@ EXPORT_SYMBOL(filemap_fdatawait_range); * Since the error status of the file is advanced by this function, * callers are responsible for checking the return value and handling and/or * reporting the error. + * + * Return: error status of the address space vs. the file->f_wb_err cursor. */ int file_fdatawait_range(struct file *file, loff_t start_byte, loff_t end_byte) { @@ -572,6 +583,8 @@ EXPORT_SYMBOL(file_fdatawait_range); * Use this function if callers don't handle errors themselves. Expected * call sites are system-wide / filesystem-wide data flushers: e.g. sync(2), * fsfreeze(8) + * + * Return: error status of the address space. */ int filemap_fdatawait_keep_errors(struct address_space *mapping) { @@ -623,6 +636,8 @@ EXPORT_SYMBOL(filemap_write_and_wait); * * Note that @lend is inclusive (describes the last byte to be written) so * that this function can be used to write to the very end-of-file (end = -1). + * + * Return: error status of the address space. */ int filemap_write_and_wait_range(struct address_space *mapping, loff_t lstart, loff_t lend) @@ -678,6 +693,8 @@ EXPORT_SYMBOL(__filemap_set_wb_err); * While we handle mapping->wb_err with atomic operations, the f_wb_err * value is protected by the f_lock since we must ensure that it reflects * the latest value swapped in for this file descriptor. + * + * Return: %0 on success, negative error code otherwise. */ int file_check_and_advance_wb_err(struct file *file) { @@ -720,6 +737,8 @@ EXPORT_SYMBOL(file_check_and_advance_wb_err); * * After writing out and waiting on the data, we check and advance the * f_wb_err cursor to the latest value, and return any errors detected there. + * + * Return: %0 on success, negative error code otherwise. */ int file_write_and_wait_range(struct file *file, loff_t lstart, loff_t lend) { @@ -753,6 +772,8 @@ EXPORT_SYMBOL(file_write_and_wait_range); * caller must do that. * * The remove + add is atomic. This function cannot fail. + * + * Return: %0 */ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) { @@ -867,6 +888,8 @@ error: * * This function is used to add a page to the pagecache. It must be locked. * This function does not add the page to the LRU. The caller must do that. + * + * Return: %0 on success, negative error code otherwise. */ int add_to_page_cache_locked(struct page *page, struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask) @@ -1463,7 +1486,7 @@ EXPORT_SYMBOL(page_cache_prev_miss); * If the slot holds a shadow entry of a previously evicted page, or a * swap entry from shmem/tmpfs, it is returned. * - * Otherwise, %NULL is returned. + * Return: the found page or shadow entry, %NULL if nothing is found. */ struct page *find_get_entry(struct address_space *mapping, pgoff_t offset) { @@ -1521,9 +1544,9 @@ EXPORT_SYMBOL(find_get_entry); * If the slot holds a shadow entry of a previously evicted page, or a * swap entry from shmem/tmpfs, it is returned. * - * Otherwise, %NULL is returned. - * * find_lock_entry() may sleep. + * + * Return: the found page or shadow entry, %NULL if nothing is found. */ struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset) { @@ -1563,12 +1586,14 @@ EXPORT_SYMBOL(find_lock_entry); * - FGP_CREAT: If page is not present then a new page is allocated using * @gfp_mask and added to the page cache and the VM's LRU * list. The page is returned locked and with an increased - * refcount. Otherwise, NULL is returned. + * refcount. * * If FGP_LOCK or FGP_CREAT are specified then the function may sleep even * if the GFP flags specified for FGP_CREAT are atomic. * * If there is a page cache page, it is returned with an increased refcount. + * + * Return: the found page or %NULL otherwise. */ struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset, int fgp_flags, gfp_t gfp_mask) @@ -1656,8 +1681,7 @@ EXPORT_SYMBOL(pagecache_get_page); * Any shadow entries of evicted pages, or swap entries from * shmem/tmpfs, are included in the returned array. * - * find_get_entries() returns the number of pages and shadow entries - * which were found. + * Return: the number of pages and shadow entries which were found. */ unsigned find_get_entries(struct address_space *mapping, pgoff_t start, unsigned int nr_entries, @@ -1727,8 +1751,8 @@ retry: * indexes. There may be holes in the indices due to not-present pages. * We also update @start to index the next page for the traversal. * - * find_get_pages_range() returns the number of pages which were found. If this - * number is smaller than @nr_pages, the end of specified range has been + * Return: the number of pages which were found. If this number is + * smaller than @nr_pages, the end of specified range has been * reached. */ unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start, @@ -1801,7 +1825,7 @@ out: * find_get_pages_contig() works exactly like find_get_pages(), except * that the returned number of pages are guaranteed to be contiguous. * - * find_get_pages_contig() returns the number of pages which were found. + * Return: the number of pages which were found. */ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, unsigned int nr_pages, struct page **pages) @@ -1862,6 +1886,8 @@ EXPORT_SYMBOL(find_get_pages_contig); * * Like find_get_pages, except we only return pages which are tagged with * @tag. We update @index to index the next page for the traversal. + * + * Return: the number of pages which were found. */ unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index, pgoff_t end, xa_mark_t tag, unsigned int nr_pages, @@ -1939,6 +1965,8 @@ EXPORT_SYMBOL(find_get_pages_range_tag); * * Like find_get_entries, except we only return entries which are tagged with * @tag. + * + * Return: the number of entries which were found. */ unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start, xa_mark_t tag, unsigned int nr_entries, @@ -2024,6 +2052,10 @@ static void shrink_readahead_size_eio(struct file *filp, * * This is really ugly. But the goto's actually try to clarify some * of the logic when it comes to error handling etc. + * + * Return: + * * total number of bytes copied, including those the were already @written + * * negative error code if nothing was copied */ static ssize_t generic_file_buffered_read(struct kiocb *iocb, struct iov_iter *iter, ssize_t written) @@ -2285,6 +2317,9 @@ out: * * This is the "read_iter()" routine for all filesystems * that can use the page cache directly. + * Return: + * * number of bytes copied, even for partial reads + * * negative error code if nothing was read */ ssize_t generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) @@ -2352,6 +2387,8 @@ EXPORT_SYMBOL(generic_file_read_iter); * * This adds the requested page to the page cache if it isn't already there, * and schedules an I/O to read in its contents from disk. + * + * Return: %0 on success, negative error code otherwise. */ static int page_cache_read(struct file *file, pgoff_t offset, gfp_t gfp_mask) { @@ -2466,6 +2503,8 @@ static void do_async_mmap_readahead(struct vm_area_struct *vma, * has not been released. * * We never return with VM_FAULT_RETRY and a bit from VM_FAULT_ERROR set. + * + * Return: bitwise-OR of %VM_FAULT_ codes. */ vm_fault_t filemap_fault(struct vm_fault *vmf) { @@ -2851,6 +2890,8 @@ out: * not set, try to fill the page and wait for it to become unlocked. * * If the page does not get brought uptodate, return -EIO. + * + * Return: up to date page on success, ERR_PTR() on failure. */ struct page *read_cache_page(struct address_space *mapping, pgoff_t index, @@ -2871,6 +2912,8 @@ EXPORT_SYMBOL(read_cache_page); * any new page allocations done using the specified allocation flags. * * If the page does not get brought uptodate, return -EIO. + * + * Return: up to date page on success, ERR_PTR() on failure. */ struct page *read_cache_page_gfp(struct address_space *mapping, pgoff_t index, @@ -3254,6 +3297,10 @@ EXPORT_SYMBOL(generic_perform_write); * This function does *not* take care of syncing data in case of O_SYNC write. * A caller has to handle it. This is mainly due to the fact that we want to * avoid syncing under i_mutex. + * + * Return: + * * number of bytes written, even for truncated writes + * * negative error code if no data has been written at all */ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { @@ -3338,6 +3385,10 @@ EXPORT_SYMBOL(__generic_file_write_iter); * This is a wrapper around __generic_file_write_iter() to be used by most * filesystems. It takes care of syncing the file in case of O_SYNC file * and acquires i_mutex as needed. + * Return: + * * negative error code if no data has been written at all of + * vfs_fsync_range() failed for a synchronous write + * * number of bytes written, even for truncated writes */ ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { @@ -3364,8 +3415,7 @@ EXPORT_SYMBOL(generic_file_write_iter); * @gfp_mask: memory allocation flags (and I/O mode) * * The address_space is to try to release any data against the page - * (presumably at page->private). If the release was successful, return '1'. - * Otherwise return zero. + * (presumably at page->private). * * This may also be called if PG_fscache is set on a page, indicating that the * page is known to the local caching routines. @@ -3373,6 +3423,7 @@ EXPORT_SYMBOL(generic_file_write_iter); * The @gfp_mask argument specifies whether I/O may be performed to release * this page (__GFP_IO), and whether the call may block (__GFP_RECLAIM & __GFP_FS). * + * Return: %1 if the release was successful, otherwise return zero. */ int try_to_release_page(struct page *page, gfp_t gfp_mask) { diff --git a/mm/memory.c b/mm/memory.c index 557c6fffedd1..706c4c4a2b8e 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1504,6 +1504,8 @@ out: * under mm->mmap_sem write-lock, so it can change vma->vm_flags. * Caller must set VM_MIXEDMAP on vma if it wants to call this * function from other places, for example from page-fault handler. + * + * Return: %0 on success, negative error code otherwise. */ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, struct page *page) @@ -1831,7 +1833,9 @@ static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd, * @size: size of map area * @prot: page protection flags for this mapping * - * Note: this is only safe if the mm semaphore is held when called. + * Note: this is only safe if the mm semaphore is held when called. + * + * Return: %0 on success, negative error code otherwise. */ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, unsigned long size, pgprot_t prot) @@ -1904,6 +1908,8 @@ EXPORT_SYMBOL(remap_pfn_range); * * NOTE! Some drivers might want to tweak vma->vm_page_prot first to get * whatever write-combining details or similar. + * + * Return: %0 on success, negative error code otherwise. */ int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len) { @@ -2382,12 +2388,13 @@ oom: * * This function handles all that is needed to finish a write page fault in a * shared mapping due to PTE being read-only once the mapped page is prepared. - * It handles locking of PTE and modifying it. The function returns - * VM_FAULT_WRITE on success, 0 when PTE got changed before we acquired PTE - * lock. + * It handles locking of PTE and modifying it. * * The function expects the page to be locked or other protection against * concurrent faults / writeback (such as DAX radix tree locks). + * + * Return: %VM_FAULT_WRITE on success, %0 when PTE got changed before + * we acquired PTE lock. */ vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf) { @@ -3214,6 +3221,8 @@ static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) * * Target users are page handler itself and implementations of * vm_ops->map_pages. + * + * Return: %0 on success, %VM_FAULT_ code in case of error. */ vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg, struct page *page) @@ -3274,11 +3283,12 @@ vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg, * This function handles all that is needed to finish a page fault once the * page to fault in is prepared. It handles locking of PTEs, inserts PTE for * given page, adds reverse page mapping, handles memcg charges and LRU - * addition. The function returns 0 on success, VM_FAULT_ code in case of - * error. + * addition. * * The function expects the page to be locked and on success it consumes a * reference of a page being mapped (for the PTE which maps it). + * + * Return: %0 on success, %VM_FAULT_ code in case of error. */ vm_fault_t finish_fault(struct vm_fault *vmf) { @@ -4159,7 +4169,7 @@ EXPORT_SYMBOL(follow_pte_pmd); * * Only IO mappings and raw PFN mappings are allowed. * - * Returns zero and the pfn at @pfn on success, -ve otherwise. + * Return: zero and the pfn at @pfn on success, -ve otherwise. */ int follow_pfn(struct vm_area_struct *vma, unsigned long address, unsigned long *pfn) @@ -4309,6 +4319,8 @@ int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, * @gup_flags: flags modifying lookup behaviour * * The caller must hold a reference on @mm. + * + * Return: number of bytes copied from source to destination. */ int access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf, int len, unsigned int gup_flags) diff --git a/mm/mempool.c b/mm/mempool.c index 0ef8cc8d1602..85efab3da720 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -222,6 +222,8 @@ EXPORT_SYMBOL(mempool_init_node); * * Like mempool_create(), but initializes the pool in (i.e. embedded in another * structure). + * + * Return: %0 on success, negative error code otherwise. */ int mempool_init(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn, mempool_free_t *free_fn, void *pool_data) @@ -245,6 +247,8 @@ EXPORT_SYMBOL(mempool_init); * functions. This function might sleep. Both the alloc_fn() and the free_fn() * functions might sleep - as long as the mempool_alloc() function is not called * from IRQ contexts. + * + * Return: pointer to the created memory pool object or %NULL on error. */ mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn, mempool_free_t *free_fn, void *pool_data) @@ -289,6 +293,8 @@ EXPORT_SYMBOL(mempool_create_node); * Note, the caller must guarantee that no mempool_destroy is called * while this function is running. mempool_alloc() & mempool_free() * might be called (eg. from IRQ contexts) while this function executes. + * + * Return: %0 on success, negative error code otherwise. */ int mempool_resize(mempool_t *pool, int new_min_nr) { @@ -363,6 +369,8 @@ EXPORT_SYMBOL(mempool_resize); * *never* fails when called from process contexts. (it might * fail if called from an IRQ context.) * Note: using __GFP_ZERO is not supported. + * + * Return: pointer to the allocated element or %NULL on error. */ void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask) { diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 7d1010453fb9..9f61dfec6a1f 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -270,7 +270,7 @@ static void wb_min_max_ratio(struct bdi_writeback *wb, * node_dirtyable_memory - number of dirtyable pages in a node * @pgdat: the node * - * Returns the node's number of pages potentially available for dirty + * Return: the node's number of pages potentially available for dirty * page cache. This is the base value for the per-node dirty limits. */ static unsigned long node_dirtyable_memory(struct pglist_data *pgdat) @@ -355,7 +355,7 @@ static unsigned long highmem_dirtyable_memory(unsigned long total) /** * global_dirtyable_memory - number of globally dirtyable pages * - * Returns the global number of pages potentially available for dirty + * Return: the global number of pages potentially available for dirty * page cache. This is the base value for the global dirty limits. */ static unsigned long global_dirtyable_memory(void) @@ -470,7 +470,7 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) * node_dirty_limit - maximum number of dirty pages allowed in a node * @pgdat: the node * - * Returns the maximum number of dirty pages allowed in a node, based + * Return: the maximum number of dirty pages allowed in a node, based * on the node's dirtyable memory. */ static unsigned long node_dirty_limit(struct pglist_data *pgdat) @@ -495,7 +495,7 @@ static unsigned long node_dirty_limit(struct pglist_data *pgdat) * node_dirty_ok - tells whether a node is within its dirty limits * @pgdat: the node to check * - * Returns %true when the dirty pages in @pgdat are within the node's + * Return: %true when the dirty pages in @pgdat are within the node's * dirty limit, %false if the limit is exceeded. */ bool node_dirty_ok(struct pglist_data *pgdat) @@ -743,9 +743,6 @@ static void mdtc_calc_avail(struct dirty_throttle_control *mdtc, * __wb_calc_thresh - @wb's share of dirty throttling threshold * @dtc: dirty_throttle_context of interest * - * Returns @wb's dirty limit in pages. The term "dirty" in the context of - * dirty balancing includes all PG_dirty, PG_writeback and NFS unstable pages. - * * Note that balance_dirty_pages() will only seriously take it as a hard limit * when sleeping max_pause per page is not enough to keep the dirty pages under * control. For example, when the device is completely stalled due to some error @@ -759,6 +756,9 @@ static void mdtc_calc_avail(struct dirty_throttle_control *mdtc, * * The wb's share of dirty limit will be adapting to its throughput and * bounded by the bdi->min_ratio and/or bdi->max_ratio parameters, if set. + * + * Return: @wb's dirty limit in pages. The term "dirty" in the context of + * dirty balancing includes all PG_dirty, PG_writeback and NFS unstable pages. */ static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc) { @@ -1918,7 +1918,9 @@ EXPORT_SYMBOL(balance_dirty_pages_ratelimited); * @wb: bdi_writeback of interest * * Determines whether background writeback should keep writing @wb or it's - * clean enough. Returns %true if writeback should continue. + * clean enough. + * + * Return: %true if writeback should continue. */ bool wb_over_bg_thresh(struct bdi_writeback *wb) { @@ -2147,6 +2149,8 @@ EXPORT_SYMBOL(tag_pages_for_writeback); * lock/page writeback access order inversion - we should only ever lock * multiple pages in ascending page->index order, and looping back to the start * of the file violates that rule and causes deadlocks. + * + * Return: %0 on success, negative error code otherwise */ int write_cache_pages(struct address_space *mapping, struct writeback_control *wbc, writepage_t writepage, @@ -2305,6 +2309,8 @@ static int __writepage(struct page *page, struct writeback_control *wbc, * * This is a library function, which implements the writepages() * address_space_operation. + * + * Return: %0 on success, negative error code otherwise */ int generic_writepages(struct address_space *mapping, struct writeback_control *wbc) @@ -2351,6 +2357,8 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc) * * Note that the mapping's AS_EIO/AS_ENOSPC flags will be cleared when this * function returns. + * + * Return: %0 on success, negative error code otherwise */ int write_one_page(struct page *page) { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c29828ec9183..4e1d9118ae52 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4816,6 +4816,8 @@ static void *make_alloc_exact(unsigned long addr, unsigned int order, * This function is also limited by MAX_ORDER. * * Memory allocated by this function must be released by free_pages_exact(). + * + * Return: pointer to the allocated area or %NULL in case of error. */ void *alloc_pages_exact(size_t size, gfp_t gfp_mask) { @@ -4836,6 +4838,8 @@ EXPORT_SYMBOL(alloc_pages_exact); * * Like alloc_pages_exact(), but try to allocate on node nid first before falling * back. + * + * Return: pointer to the allocated area or %NULL in case of error. */ void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) { @@ -4869,11 +4873,13 @@ EXPORT_SYMBOL(free_pages_exact); * nr_free_zone_pages - count number of pages beyond high watermark * @offset: The zone index of the highest zone * - * nr_free_zone_pages() counts the number of counts pages which are beyond the + * nr_free_zone_pages() counts the number of pages which are beyond the * high watermark within all zones at or below a given zone index. For each * zone, the number of pages is calculated as: * * nr_free_zone_pages = managed_pages - high_pages + * + * Return: number of pages beyond high watermark. */ static unsigned long nr_free_zone_pages(int offset) { @@ -4900,6 +4906,9 @@ static unsigned long nr_free_zone_pages(int offset) * * nr_free_buffer_pages() counts the number of pages which are beyond the high * watermark within ZONE_DMA and ZONE_NORMAL. + * + * Return: number of pages beyond high watermark within ZONE_DMA and + * ZONE_NORMAL. */ unsigned long nr_free_buffer_pages(void) { @@ -4912,6 +4921,8 @@ EXPORT_SYMBOL_GPL(nr_free_buffer_pages); * * nr_free_pagecache_pages() counts the number of pages which are beyond the * high watermark within all zones. + * + * Return: number of pages beyond high watermark within all zones. */ unsigned long nr_free_pagecache_pages(void) { @@ -5358,7 +5369,8 @@ static int node_load[MAX_NUMNODES]; * from each node to each node in the system), and should also prefer nodes * with no CPUs, since presumably they'll have very little allocation pressure * on them otherwise. - * It returns -1 if no node is found. + * + * Return: node id of the found node or %NUMA_NO_NODE if no node is found. */ static int find_next_best_node(int node, nodemask_t *used_node_mask) { @@ -6269,7 +6281,7 @@ unsigned long __init __absent_pages_in_range(int nid, * @start_pfn: The start PFN to start searching for holes * @end_pfn: The end PFN to stop searching for holes * - * It returns the number of pages frames in memory holes within a range. + * Return: the number of pages frames in memory holes within a range. */ unsigned long __init absent_pages_in_range(unsigned long start_pfn, unsigned long end_pfn) @@ -6826,7 +6838,7 @@ void __init setup_nr_node_ids(void) * model has fine enough granularity to avoid incorrect mapping for the * populated node map. * - * Returns the determined alignment in pfn's. 0 if there is no alignment + * Return: the determined alignment in pfn's. 0 if there is no alignment * requirement (single node). */ unsigned long __init node_map_pfn_alignment(void) @@ -6881,7 +6893,7 @@ static unsigned long __init find_min_pfn_for_node(int nid) /** * find_min_pfn_with_active_regions - Find the minimum PFN registered * - * It returns the minimum PFN based on information provided via + * Return: the minimum PFN based on information provided via * memblock_set_node(). */ unsigned long __init find_min_pfn_with_active_regions(void) @@ -8174,7 +8186,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, * pageblocks in the range. Once isolated, the pageblocks should not * be modified by others. * - * Returns zero on success or negative error code. On success all + * Return: zero on success or negative error code. On success all * pages which PFN is in [start, end) are allocated for the caller and * need to be freed with free_contig_range(). */ diff --git a/mm/readahead.c b/mm/readahead.c index 1ae16522412a..a4593654a26c 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -81,6 +81,8 @@ static void read_cache_pages_invalidate_pages(struct address_space *mapping, * @data: private data for the callback routine. * * Hides the details of the LRU cache etc from the filesystems. + * + * Returns: %0 on success, error return by @filler otherwise */ int read_cache_pages(struct address_space *mapping, struct list_head *pages, int (*filler)(void *, struct page *), void *data) diff --git a/mm/slab.c b/mm/slab.c index 7510a1b489df..28652e4218e0 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1727,6 +1727,8 @@ static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list) * This could be made much more intelligent. For now, try to avoid using * high order pages for slabs. When the gfp() functions are more friendly * towards high-order requests, this should be changed. + * + * Return: number of left-over bytes in a slab */ static size_t calculate_slab_order(struct kmem_cache *cachep, size_t size, slab_flags_t flags) @@ -1975,6 +1977,8 @@ static bool set_on_slab_cache(struct kmem_cache *cachep, * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware * cacheline. This can be beneficial if you're counting cycles as closely * as davem. + * + * Return: a pointer to the created cache or %NULL in case of error */ int __kmem_cache_create(struct kmem_cache *cachep, slab_flags_t flags) { @@ -3542,6 +3546,8 @@ void ___cache_free(struct kmem_cache *cachep, void *objp, * * Allocate an object from this cache. The flags are only relevant * if the cache has no available objects. + * + * Return: pointer to the new object or %NULL in case of error */ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) { @@ -3631,6 +3637,8 @@ EXPORT_SYMBOL(kmem_cache_alloc_trace); * node, which can improve the performance for cpu bound structures. * * Fallback to other node is possible if __GFP_THISNODE is not set. + * + * Return: pointer to the new object or %NULL in case of error */ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) { @@ -3699,6 +3707,8 @@ EXPORT_SYMBOL(__kmalloc_node_track_caller); * @size: how many bytes of memory are required. * @flags: the type of memory to allocate (see kmalloc). * @caller: function caller for debug tracking of the caller + * + * Return: pointer to the allocated memory or %NULL in case of error */ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, unsigned long caller) @@ -4164,6 +4174,8 @@ void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *cachep) * @buffer: user buffer * @count: data length * @ppos: unused + * + * Return: %0 on success, negative error code otherwise. */ ssize_t slabinfo_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) @@ -4457,6 +4469,8 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page, * The caller must guarantee that objp points to a valid object previously * allocated with either kmalloc() or kmem_cache_alloc(). The object * must not be freed during the duration of the call. + * + * Return: size of the actual memory used by @objp in bytes */ size_t ksize(const void *objp) { diff --git a/mm/slab_common.c b/mm/slab_common.c index cd75b8985707..03eeb8b7b4b1 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -939,6 +939,8 @@ EXPORT_SYMBOL(kmem_cache_destroy); * * Releases as many slabs as possible for a cache. * To help debugging, a zero exit status indicates all slabs were released. + * + * Return: %0 if all slabs were released, non-zero otherwise */ int kmem_cache_shrink(struct kmem_cache *cachep) { @@ -1528,6 +1530,8 @@ static __always_inline void *__do_krealloc(const void *p, size_t new_size, * This function is like krealloc() except it never frees the originally * allocated buffer. Use this if you don't want to free the buffer immediately * like, for example, with RCU. + * + * Return: pointer to the allocated memory or %NULL in case of error */ void *__krealloc(const void *p, size_t new_size, gfp_t flags) { @@ -1549,6 +1553,8 @@ EXPORT_SYMBOL(__krealloc); * lesser of the new and old sizes. If @p is %NULL, krealloc() * behaves exactly like kmalloc(). If @new_size is 0 and @p is not a * %NULL pointer, the object pointed to is freed. + * + * Return: pointer to the allocated memory or %NULL in case of error */ void *krealloc(const void *p, size_t new_size, gfp_t flags) { diff --git a/mm/truncate.c b/mm/truncate.c index 798e7ccfb030..b7d3c99f00c9 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -539,6 +539,8 @@ EXPORT_SYMBOL(truncate_inode_pages_final); * invalidate_mapping_pages() will not block on IO activity. It will not * invalidate pages which are dirty, locked, under writeback or mapped into * pagetables. + * + * Return: the number of the pages that were invalidated */ unsigned long invalidate_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t end) @@ -664,7 +666,7 @@ static int do_launder_page(struct address_space *mapping, struct page *page) * Any pages which are found to be mapped into pagetables are unmapped prior to * invalidation. * - * Returns -EBUSY if any pages could not be invalidated. + * Return: -EBUSY if any pages could not be invalidated. */ int invalidate_inode_pages2_range(struct address_space *mapping, pgoff_t start, pgoff_t end) @@ -761,7 +763,7 @@ EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range); * Any pages which are found to be mapped into pagetables are unmapped prior to * invalidation. * - * Returns -EBUSY if any pages could not be invalidated. + * Return: -EBUSY if any pages could not be invalidated. */ int invalidate_inode_pages2(struct address_space *mapping) { diff --git a/mm/util.c b/mm/util.c index 379319b1bcfd..d559bde497a9 100644 --- a/mm/util.c +++ b/mm/util.c @@ -36,6 +36,8 @@ EXPORT_SYMBOL(kfree_const); * kstrdup - allocate space for and copy an existing string * @s: the string to duplicate * @gfp: the GFP mask used in the kmalloc() call when allocating memory + * + * Return: newly allocated copy of @s or %NULL in case of error */ char *kstrdup(const char *s, gfp_t gfp) { @@ -58,9 +60,10 @@ EXPORT_SYMBOL(kstrdup); * @s: the string to duplicate * @gfp: the GFP mask used in the kmalloc() call when allocating memory * - * Function returns source string if it is in .rodata section otherwise it - * fallbacks to kstrdup. - * Strings allocated by kstrdup_const should be freed by kfree_const. + * Note: Strings allocated by kstrdup_const should be freed by kfree_const. + * + * Return: source string if it is in .rodata section otherwise + * fallback to kstrdup. */ const char *kstrdup_const(const char *s, gfp_t gfp) { @@ -78,6 +81,8 @@ EXPORT_SYMBOL(kstrdup_const); * @gfp: the GFP mask used in the kmalloc() call when allocating memory * * Note: Use kmemdup_nul() instead if the size is known exactly. + * + * Return: newly allocated copy of @s or %NULL in case of error */ char *kstrndup(const char *s, size_t max, gfp_t gfp) { @@ -103,6 +108,8 @@ EXPORT_SYMBOL(kstrndup); * @src: memory region to duplicate * @len: memory region length * @gfp: GFP mask to use + * + * Return: newly allocated copy of @src or %NULL in case of error */ void *kmemdup(const void *src, size_t len, gfp_t gfp) { @@ -120,6 +127,9 @@ EXPORT_SYMBOL(kmemdup); * @s: The data to stringify * @len: The size of the data * @gfp: the GFP mask used in the kmalloc() call when allocating memory + * + * Return: newly allocated copy of @s with NUL-termination or %NULL in + * case of error */ char *kmemdup_nul(const char *s, size_t len, gfp_t gfp) { @@ -143,7 +153,7 @@ EXPORT_SYMBOL(kmemdup_nul); * @src: source address in user space * @len: number of bytes to copy * - * Returns an ERR_PTR() on failure. Result is physically + * Return: an ERR_PTR() on failure. Result is physically * contiguous, to be freed by kfree(). */ void *memdup_user(const void __user *src, size_t len) @@ -169,7 +179,7 @@ EXPORT_SYMBOL(memdup_user); * @src: source address in user space * @len: number of bytes to copy * - * Returns an ERR_PTR() on failure. Result may be not + * Return: an ERR_PTR() on failure. Result may be not * physically contiguous. Use kvfree() to free. */ void *vmemdup_user(const void __user *src, size_t len) @@ -193,6 +203,8 @@ EXPORT_SYMBOL(vmemdup_user); * strndup_user - duplicate an existing string from user space * @s: The string to duplicate * @n: Maximum number of bytes to copy, including the trailing NUL. + * + * Return: newly allocated copy of @s or %NULL in case of error */ char *strndup_user(const char __user *s, long n) { @@ -224,7 +236,7 @@ EXPORT_SYMBOL(strndup_user); * @src: source address in user space * @len: number of bytes to copy * - * Returns an ERR_PTR() on failure. + * Return: an ERR_PTR() on failure. */ void *memdup_user_nul(const void __user *src, size_t len) { @@ -310,10 +322,6 @@ EXPORT_SYMBOL_GPL(__get_user_pages_fast); * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. * - * Returns number of pages pinned. This may be fewer than the number - * requested. If nr_pages is 0 or negative, returns 0. If no pages - * were pinned, returns -errno. - * * get_user_pages_fast provides equivalent functionality to get_user_pages, * operating on current and current->mm, with force=0 and vma=NULL. However * unlike get_user_pages, it must be called without mmap_sem held. @@ -325,6 +333,10 @@ EXPORT_SYMBOL_GPL(__get_user_pages_fast); * pages have to be faulted in, it may turn out to be slightly slower so * callers need to carefully consider what to use. On many architectures, * get_user_pages_fast simply falls back to get_user_pages. + * + * Return: number of pages pinned. This may be fewer than the number + * requested. If nr_pages is 0 or negative, returns 0. If no pages + * were pinned, returns -errno. */ int __weak get_user_pages_fast(unsigned long start, int nr_pages, int write, struct page **pages) @@ -386,6 +398,8 @@ EXPORT_SYMBOL(vm_mmap); * * Please note that any use of gfp flags outside of GFP_KERNEL is careful to not * fall back to vmalloc. + * + * Return: pointer to the allocated memory of %NULL in case of failure */ void *kvmalloc_node(size_t size, gfp_t flags, int node) { @@ -729,7 +743,8 @@ error: * @buffer: the buffer to copy to. * @buflen: the length of the buffer. Larger cmdline values are truncated * to this length. - * Returns the size of the cmdline field copied. Note that the copy does + * + * Return: the size of the cmdline field copied. Note that the copy does * not guarantee an ending NULL byte. */ int get_cmdline(struct task_struct *task, char *buffer, int buflen) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 03cbba890301..e86ba6e74b50 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -844,7 +844,7 @@ static void *vmap_block_vaddr(unsigned long va_start, unsigned long pages_off) * @order: how many 2^order pages should be occupied in newly allocated block * @gfp_mask: flags for the page level allocator * - * Returns: virtual address in a newly allocated block or ERR_PTR(-errno) + * Return: virtual address in a newly allocated block or ERR_PTR(-errno) */ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask) { @@ -1433,6 +1433,8 @@ struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags, * Search an area of @size in the kernel virtual mapping area, * and reserved it for out purposes. Returns the area descriptor * on success or %NULL on failure. + * + * Return: the area descriptor on success or %NULL on failure. */ struct vm_struct *get_vm_area(unsigned long size, unsigned long flags) { @@ -1455,6 +1457,8 @@ struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, * Search for the kernel VM area starting at @addr, and return it. * It is up to the caller to do all required locking to keep the returned * pointer valid. + * + * Return: pointer to the found area or %NULL on faulure */ struct vm_struct *find_vm_area(const void *addr) { @@ -1474,6 +1478,8 @@ struct vm_struct *find_vm_area(const void *addr) * Search for the kernel VM area starting at @addr, and remove it. * This function returns the found VM area, but using it is NOT safe * on SMP machines, except for its size or flags. + * + * Return: pointer to the found area or %NULL on faulure */ struct vm_struct *remove_vm_area(const void *addr) { @@ -1636,6 +1642,8 @@ EXPORT_SYMBOL(vunmap); * * Maps @count pages from @pages into contiguous kernel virtual * space. + * + * Return: the address of the area or %NULL on failure */ void *vmap(struct page **pages, unsigned int count, unsigned long flags, pgprot_t prot) @@ -1739,6 +1747,8 @@ fail: * Allocate enough pages to cover @size from the page level * allocator with @gfp_mask flags. Map them into contiguous * kernel virtual space, using a pagetable protection of @prot. + * + * Return: the address of the area or %NULL on failure */ void *__vmalloc_node_range(unsigned long size, unsigned long align, unsigned long start, unsigned long end, gfp_t gfp_mask, @@ -1806,6 +1816,8 @@ EXPORT_SYMBOL_GPL(__vmalloc_node_range); * * Any use of gfp flags outside of GFP_KERNEL should be consulted * with mm people. + * + * Return: pointer to the allocated memory or %NULL on error */ static void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask, pgprot_t prot, @@ -1845,6 +1857,8 @@ void *__vmalloc_node_flags_caller(unsigned long size, int node, gfp_t flags, * * For tight control over page level allocator and protection flags * use __vmalloc() instead. + * + * Return: pointer to the allocated memory or %NULL on error */ void *vmalloc(unsigned long size) { @@ -1863,6 +1877,8 @@ EXPORT_SYMBOL(vmalloc); * * For tight control over page level allocator and protection flags * use __vmalloc() instead. + * + * Return: pointer to the allocated memory or %NULL on error */ void *vzalloc(unsigned long size) { @@ -1877,6 +1893,8 @@ EXPORT_SYMBOL(vzalloc); * * The resulting memory area is zeroed so it can be mapped to userspace * without leaking data. + * + * Return: pointer to the allocated memory or %NULL on error */ void *vmalloc_user(unsigned long size) { @@ -1897,6 +1915,8 @@ EXPORT_SYMBOL(vmalloc_user); * * For tight control over page level allocator and protection flags * use __vmalloc() instead. + * + * Return: pointer to the allocated memory or %NULL on error */ void *vmalloc_node(unsigned long size, int node) { @@ -1916,6 +1936,8 @@ EXPORT_SYMBOL(vmalloc_node); * * For tight control over page level allocator and protection flags * use __vmalloc_node() instead. + * + * Return: pointer to the allocated memory or %NULL on error */ void *vzalloc_node(unsigned long size, int node) { @@ -1934,6 +1956,8 @@ EXPORT_SYMBOL(vzalloc_node); * * For tight control over page level allocator and protection flags * use __vmalloc() instead. + * + * Return: pointer to the allocated memory or %NULL on error */ void *vmalloc_exec(unsigned long size) { @@ -1959,6 +1983,8 @@ void *vmalloc_exec(unsigned long size) * * Allocate enough 32bit PA addressable pages to cover @size from the * page level allocator and map them into contiguous kernel virtual space. + * + * Return: pointer to the allocated memory or %NULL on error */ void *vmalloc_32(unsigned long size) { @@ -1973,6 +1999,8 @@ EXPORT_SYMBOL(vmalloc_32); * * The resulting memory area is 32bit addressable and zeroed so it can be * mapped to userspace without leaking data. + * + * Return: pointer to the allocated memory or %NULL on error */ void *vmalloc_32_user(unsigned long size) { @@ -2070,10 +2098,6 @@ static int aligned_vwrite(char *buf, char *addr, unsigned long count) * @addr: vm address. * @count: number of bytes to be read. * - * Returns # of bytes which addr and buf should be increased. - * (same number to @count). Returns 0 if [addr...addr+count) doesn't - * includes any intersect with alive vmalloc area. - * * This function checks that addr is a valid vmalloc'ed area, and * copy data from that area to a given buffer. If the given memory range * of [addr...addr+count) includes some valid address, data is copied to @@ -2087,6 +2111,10 @@ static int aligned_vwrite(char *buf, char *addr, unsigned long count) * should know vmalloc() area is valid and can use memcpy(). * This is for routines which have to access vmalloc area without * any informaion, as /dev/kmem. + * + * Return: number of bytes for which addr and buf should be increased + * (same number as @count) or %0 if [addr...addr+count) doesn't + * include any intersection with valid vmalloc area */ long vread(char *buf, char *addr, unsigned long count) { @@ -2149,11 +2177,6 @@ finished: * @addr: vm address. * @count: number of bytes to be read. * - * Returns # of bytes which addr and buf should be incresed. - * (same number to @count). - * If [addr...addr+count) doesn't includes any intersect with valid - * vmalloc area, returns 0. - * * This function checks that addr is a valid vmalloc'ed area, and * copy data from a buffer to the given addr. If specified range of * [addr...addr+count) includes some valid address, data is copied from @@ -2167,6 +2190,10 @@ finished: * should know vmalloc() area is valid and can use memcpy(). * This is for routines which have to access vmalloc area without * any informaion, as /dev/kmem. + * + * Return: number of bytes for which addr and buf should be + * increased (same number as @count) or %0 if [addr...addr+count) + * doesn't include any intersection with valid vmalloc area */ long vwrite(char *buf, char *addr, unsigned long count) { From f900482da560941f978b0d36660e96f48ea78752 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 5 Mar 2019 15:48:46 -0800 Subject: [PATCH 125/159] mm/migrate.c: cleanup expected_page_refs() Andrea has noted that page migration code propagates page_mapping(page) through the whole migration stack down to migrate_page() function so it seems stupid to then use page_mapping(page) in expected_page_refs() instead of passed down 'mapping' argument. I agree so let's make expected_page_refs() more in line with the rest of the migration stack. Link: http://lkml.kernel.org/r/20190207112314.24872-1-jack@suse.cz Signed-off-by: Jan Kara Suggested-by: Andrea Arcangeli Reviewed-by: Andrea Arcangeli Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index 5308d6abd384..ac6f4939bb59 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -374,7 +374,7 @@ unlock: } #endif -static int expected_page_refs(struct page *page) +static int expected_page_refs(struct address_space *mapping, struct page *page) { int expected_count = 1; @@ -384,7 +384,7 @@ static int expected_page_refs(struct page *page) */ expected_count += is_device_private_page(page); expected_count += is_device_public_page(page); - if (page_mapping(page)) + if (mapping) expected_count += hpage_nr_pages(page) + page_has_private(page); return expected_count; @@ -405,7 +405,7 @@ int migrate_page_move_mapping(struct address_space *mapping, XA_STATE(xas, &mapping->i_pages, page_index(page)); struct zone *oldzone, *newzone; int dirty; - int expected_count = expected_page_refs(page) + extra_count; + int expected_count = expected_page_refs(mapping, page) + extra_count; if (!mapping) { /* Anonymous page without mapping */ @@ -750,7 +750,7 @@ static int __buffer_migrate_page(struct address_space *mapping, return migrate_page(mapping, newpage, page, mode); /* Check whether page does not have extra refs before we do more work */ - expected_count = expected_page_refs(page); + expected_count = expected_page_refs(mapping, page); if (page_count(page) != expected_count) return -EAGAIN; From 494eec70f054965e2e699db450cde2c08db1c008 Mon Sep 17 00:00:00 2001 From: "john.hubbard@gmail.com" Date: Tue, 5 Mar 2019 15:48:49 -0800 Subject: [PATCH 126/159] mm: page_cache_add_speculative(): refactor out some code duplication From: John Hubbard This combines the common elements of these routines: page_cache_get_speculative() page_cache_add_speculative() This was anticipated by the original author, as shown by the comment in commit ce0ad7f095258 ("powerpc/mm: Lockless get_user_pages_fast() for 64-bit (v3)"): "Same as above, but add instead of inc (could just be merged)" There is no intention to introduce any behavioral change, but there is a small risk of that, due to slightly differing ways of expressing the TINY_RCU and related configurations. This also removes the VM_BUG_ON(in_interrupt()) that was in page_cache_add_speculative(), but not in page_cache_get_speculative(). This provides slightly less detection of such bugs, but it given that it was only there on the "add" path anyway, we can likely do without it just fine. And it removes the VM_BUG_ON_PAGE(PageCompound(page) && page != compound_head(page), page); that page_cache_add_speculative() had. Link: http://lkml.kernel.org/r/20190206231016.22734-2-jhubbard@nvidia.com Signed-off-by: John Hubbard Reviewed-by: Andrew Morton Cc: Benjamin Herrenschmidt Cc: Dave Kleikamp Cc: Hugh Dickins Cc: Jeff Layton Cc: Matthew Wilcox Cc: Nicholas Piggin Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 31 +++++++++---------------------- 1 file changed, 9 insertions(+), 22 deletions(-) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index e2d7039af6a3..b477a70cc2e4 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -164,7 +164,7 @@ void release_pages(struct page **pages, int nr); * will find the page or it will not. Likewise, the old find_get_page could run * either before the insertion or afterwards, depending on timing. */ -static inline int page_cache_get_speculative(struct page *page) +static inline int __page_cache_add_speculative(struct page *page, int count) { #ifdef CONFIG_TINY_RCU # ifdef CONFIG_PREEMPT_COUNT @@ -180,10 +180,10 @@ static inline int page_cache_get_speculative(struct page *page) * SMP requires. */ VM_BUG_ON_PAGE(page_count(page) == 0, page); - page_ref_inc(page); + page_ref_add(page, count); #else - if (unlikely(!get_page_unless_zero(page))) { + if (unlikely(!page_ref_add_unless(page, count, 0))) { /* * Either the page has been freed, or will be freed. * In either case, retry here and the caller should @@ -197,27 +197,14 @@ static inline int page_cache_get_speculative(struct page *page) return 1; } -/* - * Same as above, but add instead of inc (could just be merged) - */ +static inline int page_cache_get_speculative(struct page *page) +{ + return __page_cache_add_speculative(page, 1); +} + static inline int page_cache_add_speculative(struct page *page, int count) { - VM_BUG_ON(in_interrupt()); - -#if !defined(CONFIG_SMP) && defined(CONFIG_TREE_RCU) -# ifdef CONFIG_PREEMPT_COUNT - VM_BUG_ON(!in_atomic() && !irqs_disabled()); -# endif - VM_BUG_ON_PAGE(page_count(page) == 0, page); - page_ref_add(page, count); - -#else - if (unlikely(!page_ref_add_unless(page, count, 0))) - return 0; -#endif - VM_BUG_ON_PAGE(PageCompound(page) && page != compound_head(page), page); - - return 1; + return __page_cache_add_speculative(page, count); } #ifdef CONFIG_NUMA From daf3538ad5a4800507b13bea6f37601da9cc28d5 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Tue, 5 Mar 2019 15:48:53 -0800 Subject: [PATCH 127/159] mm,memory_hotplug: explicitly pass the head to isolate_huge_page isolate_huge_page() expects we pass the head of hugetlb page to it: bool isolate_huge_page(...) { ... VM_BUG_ON_PAGE(!PageHead(page), page); ... } While I really cannot think of any situation where we end up with a non-head page between hands in do_migrate_range(), let us make sure the code is as sane as possible by explicitly passing the Head. Since we already got the pointer, it does not take us extra effort. Link: http://lkml.kernel.org/r/20190208090604.975-1-osalvador@suse.de Signed-off-by: Oscar Salvador Reviewed-by: Andrew Morton Reviewed-by: David Hildenbrand Acked-by: Michal Hocko Cc: Anthony Yznaga Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index eb1a28469b66..ebc8c1e75355 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1378,12 +1378,12 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) if (PageHuge(page)) { struct page *head = compound_head(page); - pfn = page_to_pfn(head) + (1< PFN_SECTION_SHIFT) { ret = -EBUSY; break; } - isolate_huge_page(page, &source); + pfn = page_to_pfn(head) + (1< Date: Tue, 5 Mar 2019 15:48:56 -0800 Subject: [PATCH 128/159] include/linux/compaction.h: fix potential build error Declaration of struct node is required regardless. On UMA systems, including compaction.h without preceding node.h shouldn't cause a build error. Link: http://lkml.kernel.org/r/20190208080437.253322-1-yuzhao@google.com Signed-off-by: Yu Zhao Reviewed-by: Andrew Morton Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/compaction.h b/include/linux/compaction.h index c960923d9ec2..9569e7c786d3 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -226,8 +226,8 @@ static inline void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_i #endif /* CONFIG_COMPACTION */ -#if defined(CONFIG_COMPACTION) && defined(CONFIG_SYSFS) && defined(CONFIG_NUMA) struct node; +#if defined(CONFIG_COMPACTION) && defined(CONFIG_SYSFS) && defined(CONFIG_NUMA) extern int compaction_register_node(struct node *node); extern void compaction_unregister_node(struct node *node); From 402ad96adcce6b5251de41bee0e336b2cf6b049e Mon Sep 17 00:00:00 2001 From: "Tobin C. Harding" Date: Tue, 5 Mar 2019 15:48:59 -0800 Subject: [PATCH 129/159] tools/vm/slabinfo: update options in usage message Currently usage message list only a subset of the available options. should list them all. Update options in usage massage to include all available options. Link: http://lkml.kernel.org/r/20190212001219.27769-2-tobin@kernel.org Signed-off-by: Tobin C. Harding Acked-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/vm/slabinfo.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/vm/slabinfo.c b/tools/vm/slabinfo.c index 334b16db0ebb..d3908c50c87c 100644 --- a/tools/vm/slabinfo.c +++ b/tools/vm/slabinfo.c @@ -110,7 +110,7 @@ static void fatal(const char *x, ...) static void usage(void) { printf("slabinfo 4/15/2011. (c) 2007 sgi/(c) 2011 Linux Foundation.\n\n" - "slabinfo [-ahnpvtsz] [-d debugopts] [slab-regexp]\n" + "slabinfo [-aADefhilnosrStTvz1LXBU] [N=K] [-dafzput] [slab-regexp]\n" "-a|--aliases Show aliases\n" "-A|--activity Most active slabs first\n" "-d|--debug= Set/Clear Debug options\n" From b80fd3080317f561a2d84c0f1046ca74afa147ae Mon Sep 17 00:00:00 2001 From: "Tobin C. Harding" Date: Tue, 5 Mar 2019 15:49:03 -0800 Subject: [PATCH 130/159] tools/vm/slabinfo: put options in alphabetic order Primarily the usage message lists options in alphabetic order however there are a bunch of the options that are not in alphabetic order. Put options in alphabetic order. Link: http://lkml.kernel.org/r/20190212001219.27769-3-tobin@kernel.org Signed-off-by: Tobin C. Harding Acked-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/vm/slabinfo.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/tools/vm/slabinfo.c b/tools/vm/slabinfo.c index d3908c50c87c..5119adb9d009 100644 --- a/tools/vm/slabinfo.c +++ b/tools/vm/slabinfo.c @@ -113,6 +113,7 @@ static void usage(void) "slabinfo [-aADefhilnosrStTvz1LXBU] [N=K] [-dafzput] [slab-regexp]\n" "-a|--aliases Show aliases\n" "-A|--activity Most active slabs first\n" + "-B|--Bytes Show size in bytes\n" "-d|--debug= Set/Clear Debug options\n" "-D|--display-active Switch line format to activity\n" "-e|--empty Show empty slabs\n" @@ -120,21 +121,21 @@ static void usage(void) "-h|--help Show usage information\n" "-i|--inverted Inverted list\n" "-l|--slabs Show slabs\n" + "-L|--Loss Sort by loss\n" "-n|--numa Show NUMA information\n" + "-N|--lines=K Show the first K slabs\n" "-o|--ops Show kmem_cache_ops\n" - "-s|--shrink Shrink slabs\n" "-r|--report Detailed report on single slabs\n" + "-s|--shrink Shrink slabs\n" "-S|--Size Sort by size\n" "-t|--tracking Show alloc/free information\n" "-T|--Totals Show summary information\n" + "-U|--Unreclaim Show unreclaimable slabs only\n" "-v|--validate Validate slabs\n" "-z|--zero Include empty slabs\n" "-1|--1ref Single reference\n" - "-N|--lines=K Show the first K slabs\n" - "-L|--Loss Sort by loss\n" "-X|--Xtotals Show extended summary information\n" - "-B|--Bytes Show size in bytes\n" - "-U|--Unreclaim Show unreclaimable slabs only\n" + "\nValid debug options (FZPUT may be combined)\n" "a / A Switch on all debug options (=FZUP)\n" "- Switch off all debug options\n" From 3c89ff9aedead8ac48ed8bcd1ad8eb66ba6bd219 Mon Sep 17 00:00:00 2001 From: "Tobin C. Harding" Date: Tue, 5 Mar 2019 15:49:07 -0800 Subject: [PATCH 131/159] tools/vm/slabinfo: align usage output columns Usage message uses spaces not tabspaces, a few tabspaces have snuck in making the columns not align correctly when output. Align usage output columns using spaces instead of tabspaces. Link: http://lkml.kernel.org/r/20190212001219.27769-4-tobin@kernel.org Signed-off-by: Tobin C. Harding Acked-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/vm/slabinfo.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/vm/slabinfo.c b/tools/vm/slabinfo.c index 5119adb9d009..97715b73af70 100644 --- a/tools/vm/slabinfo.c +++ b/tools/vm/slabinfo.c @@ -124,13 +124,13 @@ static void usage(void) "-L|--Loss Sort by loss\n" "-n|--numa Show NUMA information\n" "-N|--lines=K Show the first K slabs\n" - "-o|--ops Show kmem_cache_ops\n" - "-r|--report Detailed report on single slabs\n" + "-o|--ops Show kmem_cache_ops\n" + "-r|--report Detailed report on single slabs\n" "-s|--shrink Shrink slabs\n" "-S|--Size Sort by size\n" "-t|--tracking Show alloc/free information\n" "-T|--Totals Show summary information\n" - "-U|--Unreclaim Show unreclaimable slabs only\n" + "-U|--Unreclaim Show unreclaimable slabs only\n" "-v|--validate Validate slabs\n" "-z|--zero Include empty slabs\n" "-1|--1ref Single reference\n" From b2f0246d69c0cc909f64469c9117a9616758d9fc Mon Sep 17 00:00:00 2001 From: "Tobin C. Harding" Date: Tue, 5 Mar 2019 15:49:10 -0800 Subject: [PATCH 132/159] tools/vm/slabinfo: clean up usage menu debug items Attempt to make the usage comment for debug options a little cleaner. Link: http://lkml.kernel.org/r/20190212001219.27769-5-tobin@kernel.org Signed-off-by: Tobin C. Harding Acked-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/vm/slabinfo.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/tools/vm/slabinfo.c b/tools/vm/slabinfo.c index 97715b73af70..73818f1b2ef8 100644 --- a/tools/vm/slabinfo.c +++ b/tools/vm/slabinfo.c @@ -114,7 +114,6 @@ static void usage(void) "-a|--aliases Show aliases\n" "-A|--activity Most active slabs first\n" "-B|--Bytes Show size in bytes\n" - "-d|--debug= Set/Clear Debug options\n" "-D|--display-active Switch line format to activity\n" "-e|--empty Show empty slabs\n" "-f|--first-alias Show first alias\n" @@ -136,14 +135,17 @@ static void usage(void) "-1|--1ref Single reference\n" "-X|--Xtotals Show extended summary information\n" - "\nValid debug options (FZPUT may be combined)\n" - "a / A Switch on all debug options (=FZUP)\n" - "- Switch off all debug options\n" - "f / F Sanity Checks (SLAB_CONSISTENCY_CHECKS)\n" - "z / Z Redzoning\n" - "p / P Poisoning\n" - "u / U Tracking\n" - "t / T Tracing\n" + "\n" + "-d | --debug Switch off all debug options\n" + "-da | --debug=a Switch on all debug options (--debug=FZPU)\n" + + "\n" + "-d[afzput] | --debug=[afzput]\n" + " f | F Sanity Checks (SLAB_CONSISTENCY_CHECKS)\n" + " z | Z Redzoning\n" + " p | P Poisoning\n" + " u | U Tracking\n" + " t | T Tracing\n" ); } From afa00112893f4ca02777c3cf4f93011577af5ffc Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 5 Mar 2019 15:49:13 -0800 Subject: [PATCH 133/159] mm: unexport free_reserved_area This function is only used by built-in code, which makes perfect sense given the purpose of it. Link: http://lkml.kernel.org/r/20190213174621.29297-2-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4e1d9118ae52..3eb01dedfb50 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7341,7 +7341,6 @@ unsigned long free_reserved_area(void *start, void *end, int poison, const char return pages; } -EXPORT_SYMBOL(free_reserved_area); #ifdef CONFIG_HIGHMEM void free_highmem_page(struct page *page) From 5d3ee42f8f5fa5e7ccc8980878fe6e18a129b9ff Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Tue, 5 Mar 2019 15:49:17 -0800 Subject: [PATCH 134/159] mm/shmem: make find_get_pages_range() work for huge page find_get_pages_range() and find_get_pages_range_tag() already correctly increment reference count on head when seeing compound page, but they may still use page index from tail. Page index from tail is always zero, so these functions don't work on huge shmem. This hasn't been a problem because, AFAIK, nobody calls these functions on (huge) shmem. Fix them anyway just in case. Link: http://lkml.kernel.org/r/20190110030838.84446-1-yuzhao@google.com Signed-off-by: Yu Zhao Reviewed-by: William Kucharski Cc: Matthew Wilcox Cc: Amir Goldstein Cc: Dave Chinner Cc: "Darrick J . Wong" Cc: Johannes Weiner Cc: Souptick Joarder Cc: Hugh Dickins Cc: "Kirill A . Shutemov" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index ae0022f6106d..a41e01c472f3 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1789,7 +1789,7 @@ unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start, pages[ret] = page; if (++ret == nr_pages) { - *start = page->index + 1; + *start = xas.xa_index + 1; goto out; } continue; @@ -1927,7 +1927,7 @@ unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index, pages[ret] = page; if (++ret == nr_pages) { - *index = page->index + 1; + *index = xas.xa_index + 1; goto out; } continue; From 2367fab5b397bdd6cbba384e073a2e834b94bc36 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Tue, 5 Mar 2019 15:49:20 -0800 Subject: [PATCH 135/159] MAINTAINERS: add entry for memblock Add entry for memblock in MAINTAINERS file Link: http://lkml.kernel.org/r/20190214093630.GC9063@rapoport-lnx Signed-off-by: Mike Rapoport Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index fbbb96236d98..5f8d5051081e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9814,6 +9814,14 @@ F: kernel/sched/membarrier.c F: include/uapi/linux/membarrier.h F: arch/powerpc/include/asm/membarrier.h +MEMBLOCK +M: Mike Rapoport +L: linux-mm@kvack.org +S: Maintained +F: include/linux/memblock.h +F: mm/memblock.c +F: Documentation/core-api/boot-time-mm.rst + MEMORY MANAGEMENT L: linux-mm@kvack.org W: http://www.linux-mm.org From a33228682c2039f086823ac798242734ec0eb4e5 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 5 Mar 2019 15:49:24 -0800 Subject: [PATCH 136/159] tmpfs: test link accounting with O_TMPFILE Mount tmpfs with "nr_inodes=3" for easy check. Link: http://lkml.kernel.org/r/20190219215016.GA20084@avx2 Signed-off-by: Alexey Dobriyan Reviewed-by: Andrew Morton Cc: Darrick J. Wong Cc: Hugh Dickins Cc: Matej Kupljen Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/Makefile | 1 + tools/testing/selftests/tmpfs/.gitignore | 1 + tools/testing/selftests/tmpfs/Makefile | 7 ++ .../selftests/tmpfs/bug-link-o-tmpfile.c | 67 +++++++++++++++++++ 4 files changed, 76 insertions(+) create mode 100644 tools/testing/selftests/tmpfs/.gitignore create mode 100644 tools/testing/selftests/tmpfs/Makefile create mode 100644 tools/testing/selftests/tmpfs/bug-link-o-tmpfile.c diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index 400ee81a3043..6a94f07c4164 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -48,6 +48,7 @@ TARGETS += sysctl ifneq (1, $(quicktest)) TARGETS += timers endif +TARGETS += tmpfs TARGETS += user TARGETS += vm TARGETS += x86 diff --git a/tools/testing/selftests/tmpfs/.gitignore b/tools/testing/selftests/tmpfs/.gitignore new file mode 100644 index 000000000000..a96838fad74d --- /dev/null +++ b/tools/testing/selftests/tmpfs/.gitignore @@ -0,0 +1 @@ +/bug-link-o-tmpfile diff --git a/tools/testing/selftests/tmpfs/Makefile b/tools/testing/selftests/tmpfs/Makefile new file mode 100644 index 000000000000..953c81299181 --- /dev/null +++ b/tools/testing/selftests/tmpfs/Makefile @@ -0,0 +1,7 @@ +CFLAGS += -Wall -O2 +CFLAGS += -D_GNU_SOURCE + +TEST_GEN_PROGS := +TEST_GEN_PROGS += bug-link-o-tmpfile + +include ../lib.mk diff --git a/tools/testing/selftests/tmpfs/bug-link-o-tmpfile.c b/tools/testing/selftests/tmpfs/bug-link-o-tmpfile.c new file mode 100644 index 000000000000..b5c3ddb90942 --- /dev/null +++ b/tools/testing/selftests/tmpfs/bug-link-o-tmpfile.c @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2019 Alexey Dobriyan + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ +/* Test that open(O_TMPFILE), linkat() doesn't screw accounting. */ +#include +#include +#include +#include +#include +#include +#include +#include + +int main(void) +{ + int fd; + + if (unshare(CLONE_NEWNS) == -1) { + if (errno == ENOSYS || errno == EPERM) { + fprintf(stderr, "error: unshare, errno %d\n", errno); + return 4; + } + fprintf(stderr, "error: unshare, errno %d\n", errno); + return 1; + } + if (mount(NULL, "/", NULL, MS_PRIVATE|MS_REC, NULL) == -1) { + fprintf(stderr, "error: mount '/', errno %d\n", errno); + return 1; + } + + /* Our heroes: 1 root inode, 1 O_TMPFILE inode, 1 permanent inode. */ + if (mount(NULL, "/tmp", "tmpfs", 0, "nr_inodes=3") == -1) { + fprintf(stderr, "error: mount tmpfs, errno %d\n", errno); + return 1; + } + + fd = openat(AT_FDCWD, "/tmp", O_WRONLY|O_TMPFILE, 0600); + if (fd == -1) { + fprintf(stderr, "error: open 1, errno %d\n", errno); + return 1; + } + if (linkat(fd, "", AT_FDCWD, "/tmp/1", AT_EMPTY_PATH) == -1) { + fprintf(stderr, "error: linkat, errno %d\n", errno); + return 1; + } + close(fd); + + fd = openat(AT_FDCWD, "/tmp", O_WRONLY|O_TMPFILE, 0600); + if (fd == -1) { + fprintf(stderr, "error: open 2, errno %d\n", errno); + return 1; + } + + return 0; +} From 5a7f1b2f2fbeb40c735639c9a86910d86fd5ec41 Mon Sep 17 00:00:00 2001 From: Yue Hu Date: Tue, 5 Mar 2019 15:49:27 -0800 Subject: [PATCH 137/159] mm/cma_debug.c: remove static scoped cma_debugfs_root Currently cma_debugfs_root is static storage. That is unnecessary since it will be only used by next cma_debugfs_add_one(). We can just pass it to following calling to save thisspace. Also remove useless idx parameter. Link: http://lkml.kernel.org/r/20190221040130.8940-1-zbestahu@gmail.com Signed-off-by: Yue Hu Reviewed-by: Andrew Morton Cc: Michal Hocko Cc: David Rientjes Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/cma_debug.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/mm/cma_debug.c b/mm/cma_debug.c index b55f28fbe831..8d7b2fd52225 100644 --- a/mm/cma_debug.c +++ b/mm/cma_debug.c @@ -21,8 +21,6 @@ struct cma_mem { unsigned long n; }; -static struct dentry *cma_debugfs_root; - static int cma_debugfs_get(void *data, u64 *val) { unsigned long *p = data; @@ -162,7 +160,7 @@ static int cma_alloc_write(void *data, u64 val) } DEFINE_SIMPLE_ATTRIBUTE(cma_alloc_fops, NULL, cma_alloc_write, "%llu\n"); -static void cma_debugfs_add_one(struct cma *cma, int idx) +static void cma_debugfs_add_one(struct cma *cma, struct dentry *root_dentry) { struct dentry *tmp; char name[16]; @@ -170,7 +168,7 @@ static void cma_debugfs_add_one(struct cma *cma, int idx) scnprintf(name, sizeof(name), "cma-%s", cma->name); - tmp = debugfs_create_dir(name, cma_debugfs_root); + tmp = debugfs_create_dir(name, root_dentry); debugfs_create_file("alloc", 0200, tmp, cma, &cma_alloc_fops); debugfs_create_file("free", 0200, tmp, cma, &cma_free_fops); @@ -188,12 +186,13 @@ static void cma_debugfs_add_one(struct cma *cma, int idx) static int __init cma_debugfs_init(void) { + struct dentry *cma_debugfs_root; int i; cma_debugfs_root = debugfs_create_dir("cma", NULL); for (i = 0; i < cma_area_count; i++) - cma_debugfs_add_one(&cma_areas[i], i); + cma_debugfs_add_one(&cma_areas[i], cma_debugfs_root); return 0; } From 960087445cd263c30ecc32b9e218887a349597ce Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Tue, 5 Mar 2019 15:49:31 -0800 Subject: [PATCH 138/159] mm/swapfile.c: use struct_size() in kvzalloc() One of the more common cases of allocation size calculations is finding the size of a structure that has a zero-sized array at the end, along with memory for some number of elements for that array. For example: struct foo { int stuff; struct boo entry[]; }; size = sizeof(struct foo) + count * sizeof(struct boo); instance = kvzalloc(size, GFP_KERNEL); Instead of leaving these open-coded and prone to type mistakes, we can now use the new struct_size() helper: instance = kvzalloc(struct_size(instance, entry, count), GFP_KERNEL); Notice that, in this case, variable size is not necessary, hence it is removed. This code was detected with the help of Coccinelle. Link: http://lkml.kernel.org/r/20190221154622.GA19599@embeddedor Signed-off-by: Gustavo A. R. Silva Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index a14257ac0476..2b8d9c3fbb47 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2713,9 +2713,8 @@ static struct swap_info_struct *alloc_swap_info(void) struct swap_info_struct *p; unsigned int type; int i; - unsigned int size = sizeof(*p) + nr_node_ids * sizeof(struct plist_node); - p = kvzalloc(size, GFP_KERNEL); + p = kvzalloc(struct_size(p, avail_lists, nr_node_ids), GFP_KERNEL); if (!p) return ERR_PTR(-ENOMEM); From a7ca12f9d905e7437dd3beb9cbb8e85bc2b991f4 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Tue, 5 Mar 2019 15:49:35 -0800 Subject: [PATCH 139/159] mm/workingset: remove unused @mapping argument in workingset_eviction() workingset_eviction() doesn't use and never did use the @mapping argument. Remove it. Link: http://lkml.kernel.org/r/20190228083329.31892-1-aryabinin@virtuozzo.com Signed-off-by: Andrey Ryabinin Acked-by: Johannes Weiner Acked-by: Rik van Riel Acked-by: Vlastimil Babka Acked-by: Mel Gorman Cc: Michal Hocko Cc: William Kucharski Cc: John Hubbard Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 2 +- mm/vmscan.c | 2 +- mm/workingset.c | 5 ++--- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 649529be91f2..fc50e21b3b88 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -307,7 +307,7 @@ struct vma_swap_readahead { }; /* linux/mm/workingset.c */ -void *workingset_eviction(struct address_space *mapping, struct page *page); +void *workingset_eviction(struct page *page); void workingset_refault(struct page *page, void *shadow); void workingset_activation(struct page *page); diff --git a/mm/vmscan.c b/mm/vmscan.c index e1f7ccdc0a90..dda6b80d045f 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -952,7 +952,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, */ if (reclaimed && page_is_file_cache(page) && !mapping_exiting(mapping) && !dax_mapping(mapping)) - shadow = workingset_eviction(mapping, page); + shadow = workingset_eviction(page); __delete_from_page_cache(page, shadow); xa_unlock_irqrestore(&mapping->i_pages, flags); diff --git a/mm/workingset.c b/mm/workingset.c index dcb994f2acc2..0bedf67502d5 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -215,13 +215,12 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat, /** * workingset_eviction - note the eviction of a page from memory - * @mapping: address space the page was backing * @page: the page being evicted * - * Returns a shadow entry to be stored in @mapping->i_pages in place + * Returns a shadow entry to be stored in @page->mapping->i_pages in place * of the evicted @page so that a later refault can be detected. */ -void *workingset_eviction(struct address_space *mapping, struct page *page) +void *workingset_eviction(struct page *page) { struct pglist_data *pgdat = page_pgdat(page); struct mem_cgroup *memcg = page_memcg(page); From f4b7e272b5c0425915e2115068e0a5a20a3a628e Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Tue, 5 Mar 2019 15:49:39 -0800 Subject: [PATCH 140/159] mm: remove zone_lru_lock() function, access ->lru_lock directly We have common pattern to access lru_lock from a page pointer: zone_lru_lock(page_zone(page)) Which is silly, because it unfolds to this: &NODE_DATA(page_to_nid(page))->node_zones[page_zonenum(page)]->zone_pgdat->lru_lock while we can simply do &NODE_DATA(page_to_nid(page))->lru_lock Remove zone_lru_lock() function, since it's only complicate things. Use 'page_pgdat(page)->lru_lock' pattern instead. [aryabinin@virtuozzo.com: a slightly better version of __split_huge_page()] Link: http://lkml.kernel.org/r/20190301121651.7741-1-aryabinin@virtuozzo.com Link: http://lkml.kernel.org/r/20190228083329.31892-2-aryabinin@virtuozzo.com Signed-off-by: Andrey Ryabinin Acked-by: Vlastimil Babka Acked-by: Mel Gorman Cc: Johannes Weiner Cc: Michal Hocko Cc: Rik van Riel Cc: William Kucharski Cc: John Hubbard Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/cgroup-v1/memcg_test.txt | 4 ++-- Documentation/cgroup-v1/memory.txt | 4 ++-- include/linux/mm_types.h | 2 +- include/linux/mmzone.h | 4 ---- mm/compaction.c | 15 ++++++++------- mm/filemap.c | 4 ++-- mm/huge_memory.c | 10 +++++----- mm/memcontrol.c | 14 +++++++------- mm/mlock.c | 14 +++++++------- mm/page_idle.c | 8 ++++---- mm/rmap.c | 2 +- mm/swap.c | 16 ++++++++-------- mm/vmscan.c | 16 ++++++++-------- 13 files changed, 55 insertions(+), 58 deletions(-) diff --git a/Documentation/cgroup-v1/memcg_test.txt b/Documentation/cgroup-v1/memcg_test.txt index 5c7f310f32bb..621e29ffb358 100644 --- a/Documentation/cgroup-v1/memcg_test.txt +++ b/Documentation/cgroup-v1/memcg_test.txt @@ -107,9 +107,9 @@ Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y. 8. LRU Each memcg has its own private LRU. Now, its handling is under global - VM's control (means that it's handled under global zone_lru_lock). + VM's control (means that it's handled under global pgdat->lru_lock). Almost all routines around memcg's LRU is called by global LRU's - list management functions under zone_lru_lock(). + list management functions under pgdat->lru_lock. A special function is mem_cgroup_isolate_pages(). This scans memcg's private LRU and call __isolate_lru_page() to extract a page diff --git a/Documentation/cgroup-v1/memory.txt b/Documentation/cgroup-v1/memory.txt index 3682e99234c2..a347fc9293e5 100644 --- a/Documentation/cgroup-v1/memory.txt +++ b/Documentation/cgroup-v1/memory.txt @@ -267,11 +267,11 @@ When oom event notifier is registered, event will be delivered. Other lock order is following: PG_locked. mm->page_table_lock - zone_lru_lock + pgdat->lru_lock lock_page_cgroup. In many cases, just lock_page_cgroup() is called. per-zone-per-cgroup LRU (cgroup's private LRU) is just guarded by - zone_lru_lock, it has no lock of its own. + pgdat->lru_lock, it has no lock of its own. 2.7 Kernel Memory Extension (CONFIG_MEMCG_KMEM) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 0a36a22228e7..ab9b48420200 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -80,7 +80,7 @@ struct page { struct { /* Page cache and anonymous pages */ /** * @lru: Pageout list, eg. active_list protected by - * zone_lru_lock. Sometimes used as a generic list + * pgdat->lru_lock. Sometimes used as a generic list * by the page owner. */ struct list_head lru; diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 6d3290cd1f6f..fba7741533be 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -730,10 +730,6 @@ typedef struct pglist_data { #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) #define node_end_pfn(nid) pgdat_end_pfn(NODE_DATA(nid)) -static inline spinlock_t *zone_lru_lock(struct zone *zone) -{ - return &zone->zone_pgdat->lru_lock; -} static inline struct lruvec *node_lruvec(struct pglist_data *pgdat) { diff --git a/mm/compaction.c b/mm/compaction.c index 1cc871da3fda..e054276cf397 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -775,6 +775,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, unsigned long end_pfn, isolate_mode_t isolate_mode) { struct zone *zone = cc->zone; + pg_data_t *pgdat = zone->zone_pgdat; unsigned long nr_scanned = 0, nr_isolated = 0; struct lruvec *lruvec; unsigned long flags = 0; @@ -839,8 +840,8 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, * if contended. */ if (!(low_pfn % SWAP_CLUSTER_MAX) - && compact_unlock_should_abort(zone_lru_lock(zone), flags, - &locked, cc)) + && compact_unlock_should_abort(&pgdat->lru_lock, + flags, &locked, cc)) break; if (!pfn_valid_within(low_pfn)) @@ -910,7 +911,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, if (unlikely(__PageMovable(page)) && !PageIsolated(page)) { if (locked) { - spin_unlock_irqrestore(zone_lru_lock(zone), + spin_unlock_irqrestore(&pgdat->lru_lock, flags); locked = false; } @@ -940,7 +941,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, /* If we already hold the lock, we can skip some rechecking */ if (!locked) { - locked = compact_lock_irqsave(zone_lru_lock(zone), + locked = compact_lock_irqsave(&pgdat->lru_lock, &flags, cc); /* Try get exclusive access under lock */ @@ -965,7 +966,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, } } - lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat); + lruvec = mem_cgroup_page_lruvec(page, pgdat); /* Try isolate the page */ if (__isolate_lru_page(page, isolate_mode) != 0) @@ -1007,7 +1008,7 @@ isolate_fail: */ if (nr_isolated) { if (locked) { - spin_unlock_irqrestore(zone_lru_lock(zone), flags); + spin_unlock_irqrestore(&pgdat->lru_lock, flags); locked = false; } putback_movable_pages(&cc->migratepages); @@ -1034,7 +1035,7 @@ isolate_fail: isolate_abort: if (locked) - spin_unlock_irqrestore(zone_lru_lock(zone), flags); + spin_unlock_irqrestore(&pgdat->lru_lock, flags); /* * Updated the cached scanner pfn once the pageblock has been scanned diff --git a/mm/filemap.c b/mm/filemap.c index a41e01c472f3..a3b4021c448f 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -98,8 +98,8 @@ * ->swap_lock (try_to_unmap_one) * ->private_lock (try_to_unmap_one) * ->i_pages lock (try_to_unmap_one) - * ->zone_lru_lock(zone) (follow_page->mark_page_accessed) - * ->zone_lru_lock(zone) (check_pte_range->isolate_lru_page) + * ->pgdat->lru_lock (follow_page->mark_page_accessed) + * ->pgdat->lru_lock (check_pte_range->isolate_lru_page) * ->private_lock (page_remove_rmap->set_page_dirty) * ->i_pages lock (page_remove_rmap->set_page_dirty) * bdi.wb->list_lock (page_remove_rmap->set_page_dirty) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index d4847026d4b1..fcf657886b4b 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2440,11 +2440,11 @@ static void __split_huge_page(struct page *page, struct list_head *list, pgoff_t end, unsigned long flags) { struct page *head = compound_head(page); - struct zone *zone = page_zone(head); + pg_data_t *pgdat = page_pgdat(head); struct lruvec *lruvec; int i; - lruvec = mem_cgroup_page_lruvec(head, zone->zone_pgdat); + lruvec = mem_cgroup_page_lruvec(head, pgdat); /* complete memcg works before add pages to LRU */ mem_cgroup_split_huge_fixup(head); @@ -2475,7 +2475,7 @@ static void __split_huge_page(struct page *page, struct list_head *list, xa_unlock(&head->mapping->i_pages); } - spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags); + spin_unlock_irqrestore(&pgdat->lru_lock, flags); remap_page(head); @@ -2686,7 +2686,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) lru_add_drain(); /* prevent PageLRU to go away from under us, and freeze lru stats */ - spin_lock_irqsave(zone_lru_lock(page_zone(head)), flags); + spin_lock_irqsave(&pgdata->lru_lock, flags); if (mapping) { XA_STATE(xas, &mapping->i_pages, page_index(head)); @@ -2731,7 +2731,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) spin_unlock(&pgdata->split_queue_lock); fail: if (mapping) xa_unlock(&mapping->i_pages); - spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags); + spin_unlock_irqrestore(&pgdata->lru_lock, flags); remap_page(head); ret = -EBUSY; } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 45cd1f84268a..7160cfab8107 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2362,13 +2362,13 @@ static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) static void lock_page_lru(struct page *page, int *isolated) { - struct zone *zone = page_zone(page); + pg_data_t *pgdat = page_pgdat(page); - spin_lock_irq(zone_lru_lock(zone)); + spin_lock_irq(&pgdat->lru_lock); if (PageLRU(page)) { struct lruvec *lruvec; - lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat); + lruvec = mem_cgroup_page_lruvec(page, pgdat); ClearPageLRU(page); del_page_from_lru_list(page, lruvec, page_lru(page)); *isolated = 1; @@ -2378,17 +2378,17 @@ static void lock_page_lru(struct page *page, int *isolated) static void unlock_page_lru(struct page *page, int isolated) { - struct zone *zone = page_zone(page); + pg_data_t *pgdat = page_pgdat(page); if (isolated) { struct lruvec *lruvec; - lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat); + lruvec = mem_cgroup_page_lruvec(page, pgdat); VM_BUG_ON_PAGE(PageLRU(page), page); SetPageLRU(page); add_page_to_lru_list(page, lruvec, page_lru(page)); } - spin_unlock_irq(zone_lru_lock(zone)); + spin_unlock_irq(&pgdat->lru_lock); } static void commit_charge(struct page *page, struct mem_cgroup *memcg, @@ -2674,7 +2674,7 @@ void __memcg_kmem_uncharge(struct page *page, int order) /* * Because tail pages are not marked as "used", set it. We're under - * zone_lru_lock and migration entries setup in all page mappings. + * pgdat->lru_lock and migration entries setup in all page mappings. */ void mem_cgroup_split_huge_fixup(struct page *head) { diff --git a/mm/mlock.c b/mm/mlock.c index 41cc47e28ad6..080f3b36415b 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -182,7 +182,7 @@ static void __munlock_isolation_failed(struct page *page) unsigned int munlock_vma_page(struct page *page) { int nr_pages; - struct zone *zone = page_zone(page); + pg_data_t *pgdat = page_pgdat(page); /* For try_to_munlock() and to serialize with page migration */ BUG_ON(!PageLocked(page)); @@ -194,7 +194,7 @@ unsigned int munlock_vma_page(struct page *page) * might otherwise copy PageMlocked to part of the tail pages before * we clear it in the head page. It also stabilizes hpage_nr_pages(). */ - spin_lock_irq(zone_lru_lock(zone)); + spin_lock_irq(&pgdat->lru_lock); if (!TestClearPageMlocked(page)) { /* Potentially, PTE-mapped THP: do not skip the rest PTEs */ @@ -203,17 +203,17 @@ unsigned int munlock_vma_page(struct page *page) } nr_pages = hpage_nr_pages(page); - __mod_zone_page_state(zone, NR_MLOCK, -nr_pages); + __mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages); if (__munlock_isolate_lru_page(page, true)) { - spin_unlock_irq(zone_lru_lock(zone)); + spin_unlock_irq(&pgdat->lru_lock); __munlock_isolated_page(page); goto out; } __munlock_isolation_failed(page); unlock_out: - spin_unlock_irq(zone_lru_lock(zone)); + spin_unlock_irq(&pgdat->lru_lock); out: return nr_pages - 1; @@ -298,7 +298,7 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone) pagevec_init(&pvec_putback); /* Phase 1: page isolation */ - spin_lock_irq(zone_lru_lock(zone)); + spin_lock_irq(&zone->zone_pgdat->lru_lock); for (i = 0; i < nr; i++) { struct page *page = pvec->pages[i]; @@ -325,7 +325,7 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone) pvec->pages[i] = NULL; } __mod_zone_page_state(zone, NR_MLOCK, delta_munlocked); - spin_unlock_irq(zone_lru_lock(zone)); + spin_unlock_irq(&zone->zone_pgdat->lru_lock); /* Now we can release pins of pages that we are not munlocking */ pagevec_release(&pvec_putback); diff --git a/mm/page_idle.c b/mm/page_idle.c index b9e4b42b33ab..0b39ec0c945c 100644 --- a/mm/page_idle.c +++ b/mm/page_idle.c @@ -31,7 +31,7 @@ static struct page *page_idle_get_page(unsigned long pfn) { struct page *page; - struct zone *zone; + pg_data_t *pgdat; if (!pfn_valid(pfn)) return NULL; @@ -41,13 +41,13 @@ static struct page *page_idle_get_page(unsigned long pfn) !get_page_unless_zero(page)) return NULL; - zone = page_zone(page); - spin_lock_irq(zone_lru_lock(zone)); + pgdat = page_pgdat(page); + spin_lock_irq(&pgdat->lru_lock); if (unlikely(!PageLRU(page))) { put_page(page); page = NULL; } - spin_unlock_irq(zone_lru_lock(zone)); + spin_unlock_irq(&pgdat->lru_lock); return page; } diff --git a/mm/rmap.c b/mm/rmap.c index 0454ecc29537..b30c7c71d1d9 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -27,7 +27,7 @@ * mapping->i_mmap_rwsem * anon_vma->rwsem * mm->page_table_lock or pte_lock - * zone_lru_lock (in mark_page_accessed, isolate_lru_page) + * pgdat->lru_lock (in mark_page_accessed, isolate_lru_page) * swap_lock (in swap_duplicate, swap_info_get) * mmlist_lock (in mmput, drain_mmlist and others) * mapping->private_lock (in __set_page_dirty_buffers) diff --git a/mm/swap.c b/mm/swap.c index 4d7d37eb3c40..301ed4e04320 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -58,16 +58,16 @@ static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs); static void __page_cache_release(struct page *page) { if (PageLRU(page)) { - struct zone *zone = page_zone(page); + pg_data_t *pgdat = page_pgdat(page); struct lruvec *lruvec; unsigned long flags; - spin_lock_irqsave(zone_lru_lock(zone), flags); - lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat); + spin_lock_irqsave(&pgdat->lru_lock, flags); + lruvec = mem_cgroup_page_lruvec(page, pgdat); VM_BUG_ON_PAGE(!PageLRU(page), page); __ClearPageLRU(page); del_page_from_lru_list(page, lruvec, page_off_lru(page)); - spin_unlock_irqrestore(zone_lru_lock(zone), flags); + spin_unlock_irqrestore(&pgdat->lru_lock, flags); } __ClearPageWaiters(page); mem_cgroup_uncharge(page); @@ -322,12 +322,12 @@ static inline void activate_page_drain(int cpu) void activate_page(struct page *page) { - struct zone *zone = page_zone(page); + pg_data_t *pgdat = page_pgdat(page); page = compound_head(page); - spin_lock_irq(zone_lru_lock(zone)); - __activate_page(page, mem_cgroup_page_lruvec(page, zone->zone_pgdat), NULL); - spin_unlock_irq(zone_lru_lock(zone)); + spin_lock_irq(&pgdat->lru_lock); + __activate_page(page, mem_cgroup_page_lruvec(page, pgdat), NULL); + spin_unlock_irq(&pgdat->lru_lock); } #endif diff --git a/mm/vmscan.c b/mm/vmscan.c index dda6b80d045f..a5ad0b35ab8e 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1614,8 +1614,8 @@ static __always_inline void update_lru_sizes(struct lruvec *lruvec, } -/* - * zone_lru_lock is heavily contended. Some of the functions that +/** + * pgdat->lru_lock is heavily contended. Some of the functions that * shrink the lists perform better by taking out a batch of pages * and working on them outside the LRU lock. * @@ -1750,11 +1750,11 @@ int isolate_lru_page(struct page *page) WARN_RATELIMIT(PageTail(page), "trying to isolate tail page"); if (PageLRU(page)) { - struct zone *zone = page_zone(page); + pg_data_t *pgdat = page_pgdat(page); struct lruvec *lruvec; - spin_lock_irq(zone_lru_lock(zone)); - lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat); + spin_lock_irq(&pgdat->lru_lock); + lruvec = mem_cgroup_page_lruvec(page, pgdat); if (PageLRU(page)) { int lru = page_lru(page); get_page(page); @@ -1762,7 +1762,7 @@ int isolate_lru_page(struct page *page) del_page_from_lru_list(page, lruvec, lru); ret = 0; } - spin_unlock_irq(zone_lru_lock(zone)); + spin_unlock_irq(&pgdat->lru_lock); } return ret; } @@ -1990,9 +1990,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, * processes, from rmap. * * If the pages are mostly unmapped, the processing is fast and it is - * appropriate to hold zone_lru_lock across the whole operation. But if + * appropriate to hold pgdat->lru_lock across the whole operation. But if * the pages are mapped, the processing is slow (page_referenced()) so we - * should drop zone_lru_lock around each page. It's impossible to balance + * should drop pgdat->lru_lock around each page. It's impossible to balance * this, so instead we remove the pages from the LRU while processing them. * It is safe to rely on PG_active against the non-LRU pages in here because * nobody will play with that bit on a non-LRU page. From 5f438eee8f2e972e910b558a1a243def508b6a35 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Tue, 5 Mar 2019 15:49:42 -0800 Subject: [PATCH 141/159] mm/compaction: pass pgdat to too_many_isolated() instead of zone too_many_isolated() in mm/compaction.c looks only at node state, so it makes more sense to change argument to pgdat instead of zone. Link: http://lkml.kernel.org/r/20190228083329.31892-3-aryabinin@virtuozzo.com Signed-off-by: Andrey Ryabinin Acked-by: Vlastimil Babka Acked-by: Rik van Riel Acked-by: Mel Gorman Cc: Johannes Weiner Cc: Michal Hocko Cc: William Kucharski Cc: John Hubbard Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index e054276cf397..f171a83707ce 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -738,16 +738,16 @@ isolate_freepages_range(struct compact_control *cc, } /* Similar to reclaim, but different enough that they don't share logic */ -static bool too_many_isolated(struct zone *zone) +static bool too_many_isolated(pg_data_t *pgdat) { unsigned long active, inactive, isolated; - inactive = node_page_state(zone->zone_pgdat, NR_INACTIVE_FILE) + - node_page_state(zone->zone_pgdat, NR_INACTIVE_ANON); - active = node_page_state(zone->zone_pgdat, NR_ACTIVE_FILE) + - node_page_state(zone->zone_pgdat, NR_ACTIVE_ANON); - isolated = node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE) + - node_page_state(zone->zone_pgdat, NR_ISOLATED_ANON); + inactive = node_page_state(pgdat, NR_INACTIVE_FILE) + + node_page_state(pgdat, NR_INACTIVE_ANON); + active = node_page_state(pgdat, NR_ACTIVE_FILE) + + node_page_state(pgdat, NR_ACTIVE_ANON); + isolated = node_page_state(pgdat, NR_ISOLATED_FILE) + + node_page_state(pgdat, NR_ISOLATED_ANON); return isolated > (inactive + active) / 2; } @@ -774,8 +774,7 @@ static unsigned long isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, unsigned long end_pfn, isolate_mode_t isolate_mode) { - struct zone *zone = cc->zone; - pg_data_t *pgdat = zone->zone_pgdat; + pg_data_t *pgdat = cc->zone->zone_pgdat; unsigned long nr_scanned = 0, nr_isolated = 0; struct lruvec *lruvec; unsigned long flags = 0; @@ -791,7 +790,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, * list by either parallel reclaimers or compaction. If there are, * delay for some time until fewer pages are isolated */ - while (unlikely(too_many_isolated(zone))) { + while (unlikely(too_many_isolated(pgdat))) { /* async migration should just abort */ if (cc->mode == MIGRATE_ASYNC) return 0; From 0c81585499601acd1d0e1cbf424cabfaee60628c Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Tue, 5 Mar 2019 15:49:46 -0800 Subject: [PATCH 142/159] mm/page_ext.c: fix an imbalance with kmemleak After offlining a memory block, kmemleak scan will trigger a crash, as it encounters a page ext address that has already been freed during memory offlining. At the beginning in alloc_page_ext(), it calls kmemleak_alloc(), but it does not call kmemleak_free() in free_page_ext(). BUG: unable to handle kernel paging request at ffff888453d00000 PGD 128a01067 P4D 128a01067 PUD 128a04067 PMD 47e09e067 PTE 800ffffbac2ff060 Oops: 0000 [#1] SMP DEBUG_PAGEALLOC KASAN PTI CPU: 1 PID: 1594 Comm: bash Not tainted 5.0.0-rc8+ #15 Hardware name: HP ProLiant DL180 Gen9/ProLiant DL180 Gen9, BIOS U20 10/25/2017 RIP: 0010:scan_block+0xb5/0x290 Code: 85 6e 01 00 00 48 b8 00 00 30 f5 81 88 ff ff 48 39 c3 0f 84 5b 01 00 00 48 89 d8 48 c1 e8 03 42 80 3c 20 00 0f 85 87 01 00 00 <4c> 8b 3b e8 f3 0c fa ff 4c 39 3d 0c 6b 4c 01 0f 87 08 01 00 00 4c RSP: 0018:ffff8881ec57f8e0 EFLAGS: 00010082 RAX: 0000000000000000 RBX: ffff888453d00000 RCX: ffffffffa61e5a54 RDX: 0000000000000000 RSI: 0000000000000008 RDI: ffff888453d00000 RBP: ffff8881ec57f920 R08: fffffbfff4ed588d R09: fffffbfff4ed588c R10: fffffbfff4ed588c R11: ffffffffa76ac463 R12: dffffc0000000000 R13: ffff888453d00ff9 R14: ffff8881f80cef48 R15: ffff8881f80cef48 FS: 00007f6c0e3f8740(0000) GS:ffff8881f7680000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffff888453d00000 CR3: 00000001c4244003 CR4: 00000000001606a0 Call Trace: scan_gray_list+0x269/0x430 kmemleak_scan+0x5a8/0x10f0 kmemleak_write+0x541/0x6ca full_proxy_write+0xf8/0x190 __vfs_write+0xeb/0x980 vfs_write+0x15a/0x4f0 ksys_write+0xd2/0x1b0 __x64_sys_write+0x73/0xb0 do_syscall_64+0xeb/0xaaa entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7f6c0dad73b8 Code: 89 02 48 c7 c0 ff ff ff ff eb b3 0f 1f 80 00 00 00 00 f3 0f 1e fa 48 8d 05 65 63 2d 00 8b 00 85 c0 75 17 b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 58 c3 0f 1f 80 00 00 00 00 41 54 49 89 d4 55 RSP: 002b:00007ffd5b863cb8 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 RAX: ffffffffffffffda RBX: 0000000000000005 RCX: 00007f6c0dad73b8 RDX: 0000000000000005 RSI: 000055a9216e1710 RDI: 0000000000000001 RBP: 000055a9216e1710 R08: 000000000000000a R09: 00007ffd5b863840 R10: 000000000000000a R11: 0000000000000246 R12: 00007f6c0dda9780 R13: 0000000000000005 R14: 00007f6c0dda4740 R15: 0000000000000005 Modules linked in: nls_iso8859_1 nls_cp437 vfat fat kvm_intel kvm irqbypass efivars ip_tables x_tables xfs sd_mod ahci libahci igb i2c_algo_bit libata i2c_core dm_mirror dm_region_hash dm_log dm_mod efivarfs CR2: ffff888453d00000 ---[ end trace ccf646c7456717c5 ]--- Kernel panic - not syncing: Fatal exception Shutting down cpus with NMI Kernel Offset: 0x24c00000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff) ---[ end Kernel panic - not syncing: Fatal exception ]--- Link: http://lkml.kernel.org/r/20190227173147.75650-1-cai@lca.pw Signed-off-by: Qian Cai Reviewed-by: Catalin Marinas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_ext.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/page_ext.c b/mm/page_ext.c index 762d5b7eb523..ab4244920e0f 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -273,6 +273,7 @@ static void free_page_ext(void *addr) table_size = get_entry_size() * PAGES_PER_SECTION; BUG_ON(PageReserved(page)); + kmemleak_free(addr); free_pages_exact(addr, table_size); } } From 0d3bd18a5efd66097ef58622b898d3139790aa9d Mon Sep 17 00:00:00 2001 From: Peng Fan Date: Tue, 5 Mar 2019 15:49:50 -0800 Subject: [PATCH 143/159] mm/cma.c: cma_declare_contiguous: correct err handling In case cma_init_reserved_mem failed, need to free the memblock allocated by memblock_reserve or memblock_alloc_range. Quote Catalin's comments: https://lkml.org/lkml/2019/2/26/482 Kmemleak is supposed to work with the memblock_{alloc,free} pair and it ignores the memblock_reserve() as a memblock_alloc() implementation detail. It is, however, tolerant to memblock_free() being called on a sub-range or just a different range from a previous memblock_alloc(). So the original patch looks fine to me. FWIW: Link: http://lkml.kernel.org/r/20190227144631.16708-1-peng.fan@nxp.com Signed-off-by: Peng Fan Reviewed-by: Catalin Marinas Reviewed-by: Mike Rapoport Cc: Laura Abbott Cc: Joonsoo Kim Cc: Michal Hocko Cc: Vlastimil Babka Cc: Marek Szyprowski Cc: Andrey Konovalov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/cma.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/cma.c b/mm/cma.c index c7b39dd3b4f6..f4f3a8a57d86 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -353,12 +353,14 @@ int __init cma_declare_contiguous(phys_addr_t base, ret = cma_init_reserved_mem(base, size, order_per_bit, name, res_cma); if (ret) - goto err; + goto free_mem; pr_info("Reserved %ld MiB at %pa\n", (unsigned long)size / SZ_1M, &base); return 0; +free_mem: + memblock_free(base, size); err: pr_err("Failed to reserve %ld MiB\n", (unsigned long)size / SZ_1M); return ret; From 82ede7ee38e7ea9dd3cf5f6cc1501172a272337e Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Tue, 5 Mar 2019 15:49:53 -0800 Subject: [PATCH 144/159] mm/memcontrol.c: fix bad line in comment Commit 230671533d64 ("mm: memory.low hierarchical behavior") missed an asterisk in one of the comments. mm/memcontrol.c:5774: warning: bad line: | 0, otherwise. Link: http://lkml.kernel.org/r/20190301143734.94393-1-cai@lca.pw Acked-by: Souptick Joarder Signed-off-by: Qian Cai Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 7160cfab8107..532e0e2a4817 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5752,7 +5752,7 @@ struct cgroup_subsys memory_cgrp_subsys = { * * | memory.current, if memory.current < memory.low * low_usage = | - | 0, otherwise. + * | 0, otherwise. * * * Such definition of the effective memory.low provides the expected From cd02cf1aceeac5b0bbbb2d6fbf614c987dcd396f Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Tue, 5 Mar 2019 15:49:57 -0800 Subject: [PATCH 145/159] mm/hotplug: fix an imbalance with DEBUG_PAGEALLOC When onlining a memory block with DEBUG_PAGEALLOC, it unmaps the pages in the block from kernel, However, it does not map those pages while offlining at the beginning. As the result, it triggers a panic below while onlining on ppc64le as it checks if the pages are mapped before unmapping. However, the imbalance exists for all arches where double-unmappings could happen. Therefore, let kernel map those pages in generic_online_page() before they have being freed into the page allocator for the first time where it will set the page count to one. On the other hand, it works fine during the boot, because at least for IBM POWER8, it does, early_setup early_init_mmu harsh__early_init_mmu htab_initialize [1] htab_bolt_mapping [2] where it effectively map all memblock regions just like kernel_map_linear_page(), so later mem_init() -> memblock_free_all() will unmap them just fine without any imbalance. On other arches without this imbalance checking, it still unmap them once at the most. [1] for_each_memblock(memory, reg) { base = (unsigned long)__va(reg->base); size = reg->size; DBG("creating mapping for region: %lx..%lx (prot: %lx)\n", base, size, prot); BUG_ON(htab_bolt_mapping(base, base + size, __pa(base), prot, mmu_linear_psize, mmu_kernel_ssize)); } [2] linear_map_hash_slots[paddr >> PAGE_SHIFT] = ret | 0x80; kernel BUG at arch/powerpc/mm/hash_utils_64.c:1815! Oops: Exception in kernel mode, sig: 5 [#1] LE SMP NR_CPUS=256 DEBUG_PAGEALLOC NUMA pSeries CPU: 2 PID: 4298 Comm: bash Not tainted 5.0.0-rc7+ #15 NIP: c000000000062670 LR: c00000000006265c CTR: 0000000000000000 REGS: c0000005bf8a75b0 TRAP: 0700 Not tainted (5.0.0-rc7+) MSR: 800000000282b033 CR: 28422842 XER: 00000000 CFAR: c000000000804f44 IRQMASK: 1 NIP [c000000000062670] __kernel_map_pages+0x2e0/0x4f0 LR [c00000000006265c] __kernel_map_pages+0x2cc/0x4f0 Call Trace: __kernel_map_pages+0x2cc/0x4f0 free_unref_page_prepare+0x2f0/0x4d0 free_unref_page+0x44/0x90 __online_page_free+0x84/0x110 online_pages_range+0xc0/0x150 walk_system_ram_range+0xc8/0x120 online_pages+0x280/0x5a0 memory_subsys_online+0x1b4/0x270 device_online+0xc0/0xf0 state_store+0xc0/0x180 dev_attr_store+0x3c/0x60 sysfs_kf_write+0x70/0xb0 kernfs_fop_write+0x10c/0x250 __vfs_write+0x48/0x240 vfs_write+0xd8/0x210 ksys_write+0x70/0x120 system_call+0x5c/0x70 Link: http://lkml.kernel.org/r/20190301220814.97339-1-cai@lca.pw Signed-off-by: Qian Cai Reviewed-by: Andrew Morton Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman [powerpc] Cc: Michal Hocko Cc: Souptick Joarder Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index ebc8c1e75355..6b05576fb4ec 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -658,6 +658,7 @@ EXPORT_SYMBOL_GPL(__online_page_free); static void generic_online_page(struct page *page, unsigned int order) { + kernel_map_pages(page, 1 << order, 1); __free_pages_core(page, order); totalram_pages_add(1UL << order); #ifdef CONFIG_HIGHMEM From 70516b936bb0afac4fa2fc54a5e61870ece998da Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Tue, 5 Mar 2019 15:50:00 -0800 Subject: [PATCH 146/159] mm/huge_memory.c: fix "orig_pud" set but not used Commit a00cc7d9dd93 ("mm, x86: add support for PUD-sized transparent hugepages") introduced pudp_huge_get_and_clear_full() but no one uses its return code. In order to not diverge from pmdp_huge_get_and_clear_full(), just change zap_huge_pud() to not assign the return value from pudp_huge_get_and_clear_full(). mm/huge_memory.c: In function 'zap_huge_pud': mm/huge_memory.c:1982:8: warning: variable 'orig_pud' set but not used [-Wunused-but-set-variable] pud_t orig_pud; ^~~~~~~~ Link: http://lkml.kernel.org/r/20190301221956.97493-1-cai@lca.pw Signed-off-by: Qian Cai Reviewed-by: Andrew Morton Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index fcf657886b4b..404acdcd0455 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1982,7 +1982,6 @@ spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma) int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma, pud_t *pud, unsigned long addr) { - pud_t orig_pud; spinlock_t *ptl; ptl = __pud_trans_huge_lock(pud, vma); @@ -1994,8 +1993,7 @@ int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma, * pgtable_trans_huge_withdraw after finishing pudp related * operations. */ - orig_pud = pudp_huge_get_and_clear_full(tlb->mm, addr, pud, - tlb->fullmm); + pudp_huge_get_and_clear_full(tlb->mm, addr, pud, tlb->fullmm); tlb_remove_pud_tlb_entry(tlb, pud, addr); if (vma_is_dax(vma)) { spin_unlock(ptl); From a9519defc771d574888ffe01e84747889152ec35 Mon Sep 17 00:00:00 2001 From: Greg Thelen Date: Tue, 5 Mar 2019 15:50:03 -0800 Subject: [PATCH 147/159] writeback: fix inode cgroup switching comment Commit 682aa8e1a6a1 ("writeback: implement unlocked_inode_to_wb transaction and use it for stat updates") refers to inode_switch_wb_work_fn() which never got merged. Switch the comments to inode_switch_wbs_work_fn(). Link: http://lkml.kernel.org/r/20190305004617.142590-1-gthelen@google.com Fixes: 682aa8e1a6a1 ("writeback: implement unlocked_inode_to_wb transaction and use it for stat updates") Signed-off-by: Greg Thelen Reviewed-by: Andrew Morton Acked-by: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/backing-dev.h | 2 +- include/linux/fs.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index c28a47cbe355..f9b029180241 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -365,7 +365,7 @@ unlocked_inode_to_wb_begin(struct inode *inode, struct wb_lock_cookie *cookie) rcu_read_lock(); /* - * Paired with store_release in inode_switch_wb_work_fn() and + * Paired with store_release in inode_switch_wbs_work_fn() and * ensures that we see the new wb if we see cleared I_WB_SWITCH. */ cookie->locked = smp_load_acquire(&inode->i_state) & I_WB_SWITCH; diff --git a/include/linux/fs.h b/include/linux/fs.h index fd423fec8d83..08f26046233e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2091,7 +2091,7 @@ static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp) * I_WB_SWITCH Cgroup bdi_writeback switching in progress. Used to * synchronize competing switching instances and to tell * wb stat updates to grab the i_pages lock. See - * inode_switch_wb_work_fn() for details. + * inode_switch_wbs_work_fn() for details. * * I_OVL_INUSE Used by overlayfs to get exclusive ownership on upper * and work dirs among overlayfs mounts. From fc8efd2ddfed3f343c11b693e87140ff358d7ff5 Mon Sep 17 00:00:00 2001 From: Jan Stancek Date: Tue, 5 Mar 2019 15:50:08 -0800 Subject: [PATCH 148/159] mm/memory.c: do_fault: avoid usage of stale vm_area_struct LTP testcase mtest06 [1] can trigger a crash on s390x running 5.0.0-rc8. This is a stress test, where one thread mmaps/writes/munmaps memory area and other thread is trying to read from it: CPU: 0 PID: 2611 Comm: mmap1 Not tainted 5.0.0-rc8+ #51 Hardware name: IBM 2964 N63 400 (z/VM 6.4.0) Krnl PSW : 0404e00180000000 00000000001ac8d8 (__lock_acquire+0x7/0x7a8) Call Trace: ([<0000000000000000>] (null)) [<00000000001adae4>] lock_acquire+0xec/0x258 [<000000000080d1ac>] _raw_spin_lock_bh+0x5c/0x98 [<000000000012a780>] page_table_free+0x48/0x1a8 [<00000000002f6e54>] do_fault+0xdc/0x670 [<00000000002fadae>] __handle_mm_fault+0x416/0x5f0 [<00000000002fb138>] handle_mm_fault+0x1b0/0x320 [<00000000001248cc>] do_dat_exception+0x19c/0x2c8 [<000000000080e5ee>] pgm_check_handler+0x19e/0x200 page_table_free() is called with NULL mm parameter, but because "0" is a valid address on s390 (see S390_lowcore), it keeps going until it eventually crashes in lockdep's lock_acquire. This crash is reproducible at least since 4.14. Problem is that "vmf->vma" used in do_fault() can become stale. Because mmap_sem may be released, other threads can come in, call munmap() and cause "vma" be returned to kmem cache, and get zeroed/re-initialized and re-used: handle_mm_fault | __handle_mm_fault | do_fault | vma = vmf->vma | do_read_fault | __do_fault | vma->vm_ops->fault(vmf); | mmap_sem is released | | | do_munmap() | remove_vma_list() | remove_vma() | vm_area_free() | # vma is released | ... | # same vma is allocated | # from kmem cache | do_mmap() | vm_area_alloc() | memset(vma, 0, ...) | pte_free(vma->vm_mm, ...); | page_table_free | spin_lock_bh(&mm->context.lock);| | Cache mm_struct to avoid using potentially stale "vma". [1] https://github.com/linux-test-project/ltp/blob/master/testcases/kernel/mem/mtest06/mmap1.c Link: http://lkml.kernel.org/r/5b3fdf19e2a5be460a384b936f5b56e13733f1b8.1551595137.git.jstancek@redhat.com Signed-off-by: Jan Stancek Reviewed-by: Andrea Arcangeli Reviewed-by: Matthew Wilcox Acked-by: Rafael Aquini Reviewed-by: Minchan Kim Acked-by: Kirill A. Shutemov Cc: Rik van Riel Cc: Michal Hocko Cc: Huang Ying Cc: Souptick Joarder Cc: Jerome Glisse Cc: Aneesh Kumar K.V Cc: David Hildenbrand Cc: Andrea Arcangeli Cc: David Rientjes Cc: Mel Gorman Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mm/memory.c b/mm/memory.c index 706c4c4a2b8e..47fe250307c7 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3536,10 +3536,13 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf) * but allow concurrent faults). * The mmap_sem may have been released depending on flags and our * return value. See filemap_fault() and __lock_page_or_retry(). + * If mmap_sem is released, vma may become invalid (for example + * by other thread calling munmap()). */ static vm_fault_t do_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; + struct mm_struct *vm_mm = vma->vm_mm; vm_fault_t ret; /* @@ -3580,7 +3583,7 @@ static vm_fault_t do_fault(struct vm_fault *vmf) /* preallocated pagetable is unused: free it */ if (vmf->prealloc_pte) { - pte_free(vma->vm_mm, vmf->prealloc_pte); + pte_free(vm_mm, vmf->prealloc_pte); vmf->prealloc_pte = NULL; } return ret; From d778015ac95bc036af73342c878ab19250e01fe1 Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Tue, 5 Mar 2019 15:50:11 -0800 Subject: [PATCH 149/159] mm/sparse: fix a bad comparison next_present_section_nr() could only return an unsigned number -1, so just check it specifically where compilers will convert -1 to unsigned if needed. mm/sparse.c: In function 'sparse_init_nid': mm/sparse.c:200:20: warning: comparison of unsigned expression >= 0 is always true [-Wtype-limits] ((section_nr >= 0) && \ ^~ mm/sparse.c:478:2: note: in expansion of macro 'for_each_present_section_nr' for_each_present_section_nr(pnum_begin, pnum) { ^~~~~~~~~~~~~~~~~~~~~~~~~~~ mm/sparse.c:200:20: warning: comparison of unsigned expression >= 0 is always true [-Wtype-limits] ((section_nr >= 0) && \ ^~ mm/sparse.c:497:2: note: in expansion of macro 'for_each_present_section_nr' for_each_present_section_nr(pnum_begin, pnum) { ^~~~~~~~~~~~~~~~~~~~~~~~~~~ mm/sparse.c: In function 'sparse_init': mm/sparse.c:200:20: warning: comparison of unsigned expression >= 0 is always true [-Wtype-limits] ((section_nr >= 0) && \ ^~ mm/sparse.c:520:2: note: in expansion of macro 'for_each_present_section_nr' for_each_present_section_nr(pnum_begin + 1, pnum_end) { ^~~~~~~~~~~~~~~~~~~~~~~~~~~ Link: http://lkml.kernel.org/r/20190228181839.86504-1-cai@lca.pw Fixes: c4e1be9ec113 ("mm, sparsemem: break out of loops early") Signed-off-by: Qian Cai Reviewed-by: Andrew Morton Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/sparse.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/sparse.c b/mm/sparse.c index 7ea5dc6c6b19..77a0554fa5bd 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -197,7 +197,7 @@ static inline int next_present_section_nr(int section_nr) } #define for_each_present_section_nr(start, section_nr) \ for (section_nr = next_present_section_nr(start-1); \ - ((section_nr >= 0) && \ + ((section_nr != -1) && \ (section_nr <= __highest_present_section_nr)); \ section_nr = next_present_section_nr(section_nr)) From ea2c3f6f5545610ed0bd8afa8a05355b49d817af Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Tue, 5 Mar 2019 15:50:14 -0800 Subject: [PATCH 150/159] mm,mremap: bail out earlier in mremap_to under map pressure When using mremap() syscall in addition to MREMAP_FIXED flag, mremap() calls mremap_to() which does the following: 1) unmaps the destination region where we are going to move the map 2) If the new region is going to be smaller, we unmap the last part of the old region Then, we will eventually call move_vma() to do the actual move. move_vma() checks whether we are at least 4 maps below max_map_count before going further, otherwise it bails out with -ENOMEM. The problem is that we might have already unmapped the vma's in steps 1) and 2), so it is not possible for userspace to figure out the state of the vmas after it gets -ENOMEM, and it gets tricky for userspace to clean up properly on error path. While it is true that we can return -ENOMEM for more reasons (e.g: see may_expand_vm() or move_page_tables()), I think that we can avoid this scenario if we check early in mremap_to() if the operation has high chances to succeed map-wise. Should that not be the case, we can bail out before we even try to unmap anything, so we make sure the vma's are left untouched in case we are likely to be short of maps. The thumb-rule now is to rely on the worst-scenario case we can have. That is when both vma's (old region and new region) are going to be split in 3, so we get two more maps to the ones we already hold (one per each). If current map count + 2 maps still leads us to 4 maps below the threshold, we are going to pass the check in move_vma(). Of course, this is not free, as it might generate false positives when it is true that we are tight map-wise, but the unmap operation can release several vma's leading us to a good state. Another approach was also investigated [1], but it may be too much hassle for what it brings. [1] https://lore.kernel.org/lkml/20190219155320.tkfkwvqk53tfdojt@d104.suse.de/ Link: http://lkml.kernel.org/r/20190226091314.18446-1-osalvador@suse.de Signed-off-by: Oscar Salvador Acked-by: Vlastimil Babka Acked-by: Kirill A. Shutemov Cc: Hugh Dickins Cc: Joel Fernandes (Google) Cc: Yang Shi Cc: Mel Gorman Cc: Joel Fernandes Cc: Cyril Hrubis Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mremap.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/mm/mremap.c b/mm/mremap.c index 3320616ed93f..e3edef6b7a12 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -516,6 +516,23 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len, if (addr + old_len > new_addr && new_addr + new_len > addr) goto out; + /* + * move_vma() need us to stay 4 maps below the threshold, otherwise + * it will bail out at the very beginning. + * That is a problem if we have already unmaped the regions here + * (new_addr, and old_addr), because userspace will not know the + * state of the vma's after it gets -ENOMEM. + * So, to avoid such scenario we can pre-compute if the whole + * operation has high chances to success map-wise. + * Worst-scenario case is when both vma's (new_addr and old_addr) get + * split in 3 before unmaping it. + * That means 2 more maps (1 for each) to the ones we already hold. + * Check whether current map count plus 2 still leads us to 4 maps below + * the threshold, otherwise return -ENOMEM here to be more safe. + */ + if ((mm->map_count + 2) >= sysctl_max_map_count - 3) + return -ENOMEM; + ret = do_munmap(mm, new_addr, new_len, uf_unmap_early); if (ret) goto out; From 0338c838367b7993edd1c154789c563c13a3b35e Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 5 Mar 2019 15:50:18 -0800 Subject: [PATCH 151/159] proc: return exit code 4 for skipped tests Test harness uses 4 for SKIP, not 2. Link: http://lkml.kernel.org/r/20190108193108.GA12259@avx2 Signed-off-by: Alexey Dobriyan Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/proc/proc-loadavg-001.c | 2 +- tools/testing/selftests/proc/proc-self-map-files-002.c | 2 +- tools/testing/selftests/proc/proc-self-syscall.c | 2 +- tools/testing/selftests/proc/proc-self-wchan.c | 2 +- tools/testing/selftests/proc/read.c | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/proc/proc-loadavg-001.c b/tools/testing/selftests/proc/proc-loadavg-001.c index fcff7047000d..471e2aa28077 100644 --- a/tools/testing/selftests/proc/proc-loadavg-001.c +++ b/tools/testing/selftests/proc/proc-loadavg-001.c @@ -30,7 +30,7 @@ int main(void) if (unshare(CLONE_NEWPID) == -1) { if (errno == ENOSYS || errno == EPERM) - return 2; + return 4; return 1; } diff --git a/tools/testing/selftests/proc/proc-self-map-files-002.c b/tools/testing/selftests/proc/proc-self-map-files-002.c index 85744425b08d..762cb01f2ca7 100644 --- a/tools/testing/selftests/proc/proc-self-map-files-002.c +++ b/tools/testing/selftests/proc/proc-self-map-files-002.c @@ -63,7 +63,7 @@ int main(void) p = mmap((void *)va, PAGE_SIZE, PROT_NONE, MAP_PRIVATE|MAP_FILE|MAP_FIXED, fd, 0); if (p == MAP_FAILED) { if (errno == EPERM) - return 2; + return 4; return 1; } diff --git a/tools/testing/selftests/proc/proc-self-syscall.c b/tools/testing/selftests/proc/proc-self-syscall.c index 5ab5f4810e43..b2993a67a4b3 100644 --- a/tools/testing/selftests/proc/proc-self-syscall.c +++ b/tools/testing/selftests/proc/proc-self-syscall.c @@ -39,7 +39,7 @@ int main(void) fd = open("/proc/self/syscall", O_RDONLY); if (fd == -1) { if (errno == ENOENT) - return 2; + return 4; return 1; } diff --git a/tools/testing/selftests/proc/proc-self-wchan.c b/tools/testing/selftests/proc/proc-self-wchan.c index a38b2fbaa7ad..b467b98a457d 100644 --- a/tools/testing/selftests/proc/proc-self-wchan.c +++ b/tools/testing/selftests/proc/proc-self-wchan.c @@ -27,7 +27,7 @@ int main(void) fd = open("/proc/self/wchan", O_RDONLY); if (fd == -1) { if (errno == ENOENT) - return 2; + return 4; return 1; } diff --git a/tools/testing/selftests/proc/read.c b/tools/testing/selftests/proc/read.c index 563e752e6eba..b2bd8da58efe 100644 --- a/tools/testing/selftests/proc/read.c +++ b/tools/testing/selftests/proc/read.c @@ -126,7 +126,7 @@ int main(void) d = opendir("/proc"); if (!d) - return 2; + return 4; f(d, 0); return 0; } From 756ca74c7f656b6ed3cb60344845878226b658ae Mon Sep 17 00:00:00 2001 From: Chengguang Xu Date: Tue, 5 Mar 2019 15:50:22 -0800 Subject: [PATCH 152/159] fs/proc/self.c: code cleanup for proc_setup_self() Remove unnecessary ERR_PTR()/PTR_ERR() cast in proc_setup_self(). Link: http://lkml.kernel.org/r/20190124030150.8472-1-cgxu519@gmx.com Signed-off-by: Chengguang Xu Reviewed-by: Andrew Morton Cc: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/self.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/fs/proc/self.c b/fs/proc/self.c index 127265e5c55f..57c0a1047250 100644 --- a/fs/proc/self.c +++ b/fs/proc/self.c @@ -38,6 +38,7 @@ int proc_setup_self(struct super_block *s) struct inode *root_inode = d_inode(s->s_root); struct pid_namespace *ns = proc_pid_ns(root_inode); struct dentry *self; + int ret = -ENOMEM; inode_lock(root_inode); self = d_alloc_name(s->s_root, "self"); @@ -51,20 +52,19 @@ int proc_setup_self(struct super_block *s) inode->i_gid = GLOBAL_ROOT_GID; inode->i_op = &proc_self_inode_operations; d_add(self, inode); + ret = 0; } else { dput(self); - self = ERR_PTR(-ENOMEM); } - } else { - self = ERR_PTR(-ENOMEM); } inode_unlock(root_inode); - if (IS_ERR(self)) { + + if (ret) pr_err("proc_fill_super: can't allocate /proc/self\n"); - return PTR_ERR(self); - } - ns->proc_self = self; - return 0; + else + ns->proc_self = self; + + return ret; } void __init proc_self_init(void) From 45f68ab50234e825cdc7aee76a40d227d92eea14 Mon Sep 17 00:00:00 2001 From: Chengguang Xu Date: Tue, 5 Mar 2019 15:50:25 -0800 Subject: [PATCH 153/159] fs/proc/thread_self.c: code cleanup for proc_setup_thread_self() Remove unnecessary ERR_PTR()/PTR_ERR() cast in proc_setup_thread_self(). Link: http://lkml.kernel.org/r/20190124030150.8472-2-cgxu519@gmx.com Signed-off-by: Chengguang Xu Reviewed-by: Andrew Morton Cc: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/thread_self.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c index b905010ca9eb..f61ae53533f5 100644 --- a/fs/proc/thread_self.c +++ b/fs/proc/thread_self.c @@ -38,6 +38,7 @@ int proc_setup_thread_self(struct super_block *s) struct inode *root_inode = d_inode(s->s_root); struct pid_namespace *ns = proc_pid_ns(root_inode); struct dentry *thread_self; + int ret = -ENOMEM; inode_lock(root_inode); thread_self = d_alloc_name(s->s_root, "thread-self"); @@ -51,20 +52,19 @@ int proc_setup_thread_self(struct super_block *s) inode->i_gid = GLOBAL_ROOT_GID; inode->i_op = &proc_thread_self_inode_operations; d_add(thread_self, inode); + ret = 0; } else { dput(thread_self); - thread_self = ERR_PTR(-ENOMEM); } - } else { - thread_self = ERR_PTR(-ENOMEM); } inode_unlock(root_inode); - if (IS_ERR(thread_self)) { + + if (ret) pr_err("proc_fill_super: can't allocate /proc/thread_self\n"); - return PTR_ERR(thread_self); - } - ns->proc_thread_self = thread_self; - return 0; + else + ns->proc_thread_self = thread_self; + + return ret; } void __init proc_thread_self_init(void) From 867aaccf1f2c35eff4706ea69299f731f2a1953e Mon Sep 17 00:00:00 2001 From: Zhikang Zhang Date: Tue, 5 Mar 2019 15:50:29 -0800 Subject: [PATCH 154/159] proc: remove unused argument in proc_pid_lookup() [adobriyan@gmail.com: delete "extern" from prototype] Link: http://lkml.kernel.org/r/20190114195635.GA9372@avx2 Signed-off-by: Zhikang Zhang Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 2 +- fs/proc/internal.h | 2 +- fs/proc/root.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index f5ed9512d193..fcb0180d3988 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3161,7 +3161,7 @@ static struct dentry *proc_pid_instantiate(struct dentry * dentry, return d_splice_alias(inode, dentry); } -struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags) +struct dentry *proc_pid_lookup(struct dentry *dentry, unsigned int flags) { struct task_struct *task; unsigned tgid; diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 95b14196f284..4fc5a9b68f76 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -162,7 +162,7 @@ extern struct inode *proc_pid_make_inode(struct super_block *, struct task_struc extern void pid_update_inode(struct task_struct *, struct inode *); extern int pid_delete_dentry(const struct dentry *); extern int proc_pid_readdir(struct file *, struct dir_context *); -extern struct dentry *proc_pid_lookup(struct inode *, struct dentry *, unsigned int); +struct dentry *proc_pid_lookup(struct dentry *, unsigned int); extern loff_t mem_lseek(struct file *, loff_t, int); /* Lookups */ diff --git a/fs/proc/root.c b/fs/proc/root.c index f4b1a9d2eca6..621e6ec322ca 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -154,7 +154,7 @@ static int proc_root_getattr(const struct path *path, struct kstat *stat, static struct dentry *proc_root_lookup(struct inode * dir, struct dentry * dentry, unsigned int flags) { - if (!proc_pid_lookup(dir, dentry, flags)) + if (!proc_pid_lookup(dentry, flags)) return NULL; return proc_lookup(dir, dentry, flags); From 5713f35c0575a1137b705e13d10f8ee58f2ec7e8 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 5 Mar 2019 15:50:32 -0800 Subject: [PATCH 155/159] proc: read kernel cpu stat pointer once Help gcc generate better code: $ ./scripts/bloat-o-meter ../vmlinux-000 ../vmlinux-001 add/remove: 2/2 grow/shrink: 0/1 up/down: 92/-142 (-50) Function old new delta get_iowait_time.isra - 46 +46 get_idle_time.isra - 46 +46 show_stat 1489 1477 -12 get_iowait_time 65 - -65 get_idle_time 65 - -65 Link: http://lkml.kernel.org/r/20190114195907.GA9680@avx2 Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/stat.c | 60 +++++++++++++++++++++++++++----------------------- 1 file changed, 32 insertions(+), 28 deletions(-) diff --git a/fs/proc/stat.c b/fs/proc/stat.c index 535eda7857cf..49aa0a2b0d9e 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -23,21 +23,21 @@ #ifdef arch_idle_time -static u64 get_idle_time(int cpu) +static u64 get_idle_time(struct kernel_cpustat *kcs, int cpu) { u64 idle; - idle = kcpustat_cpu(cpu).cpustat[CPUTIME_IDLE]; + idle = kcs->cpustat[CPUTIME_IDLE]; if (cpu_online(cpu) && !nr_iowait_cpu(cpu)) idle += arch_idle_time(cpu); return idle; } -static u64 get_iowait_time(int cpu) +static u64 get_iowait_time(struct kernel_cpustat *kcs, int cpu) { u64 iowait; - iowait = kcpustat_cpu(cpu).cpustat[CPUTIME_IOWAIT]; + iowait = kcs->cpustat[CPUTIME_IOWAIT]; if (cpu_online(cpu) && nr_iowait_cpu(cpu)) iowait += arch_idle_time(cpu); return iowait; @@ -45,7 +45,7 @@ static u64 get_iowait_time(int cpu) #else -static u64 get_idle_time(int cpu) +static u64 get_idle_time(struct kernel_cpustat *kcs, int cpu) { u64 idle, idle_usecs = -1ULL; @@ -54,14 +54,14 @@ static u64 get_idle_time(int cpu) if (idle_usecs == -1ULL) /* !NO_HZ or cpu offline so we can rely on cpustat.idle */ - idle = kcpustat_cpu(cpu).cpustat[CPUTIME_IDLE]; + idle = kcs->cpustat[CPUTIME_IDLE]; else idle = idle_usecs * NSEC_PER_USEC; return idle; } -static u64 get_iowait_time(int cpu) +static u64 get_iowait_time(struct kernel_cpustat *kcs, int cpu) { u64 iowait, iowait_usecs = -1ULL; @@ -70,7 +70,7 @@ static u64 get_iowait_time(int cpu) if (iowait_usecs == -1ULL) /* !NO_HZ or cpu offline so we can rely on cpustat.iowait */ - iowait = kcpustat_cpu(cpu).cpustat[CPUTIME_IOWAIT]; + iowait = kcs->cpustat[CPUTIME_IOWAIT]; else iowait = iowait_usecs * NSEC_PER_USEC; @@ -95,16 +95,18 @@ static int show_stat(struct seq_file *p, void *v) getboottime64(&boottime); for_each_possible_cpu(i) { - user += kcpustat_cpu(i).cpustat[CPUTIME_USER]; - nice += kcpustat_cpu(i).cpustat[CPUTIME_NICE]; - system += kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM]; - idle += get_idle_time(i); - iowait += get_iowait_time(i); - irq += kcpustat_cpu(i).cpustat[CPUTIME_IRQ]; - softirq += kcpustat_cpu(i).cpustat[CPUTIME_SOFTIRQ]; - steal += kcpustat_cpu(i).cpustat[CPUTIME_STEAL]; - guest += kcpustat_cpu(i).cpustat[CPUTIME_GUEST]; - guest_nice += kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE]; + struct kernel_cpustat *kcs = &kcpustat_cpu(i); + + user += kcs->cpustat[CPUTIME_USER]; + nice += kcs->cpustat[CPUTIME_NICE]; + system += kcs->cpustat[CPUTIME_SYSTEM]; + idle += get_idle_time(kcs, i); + iowait += get_iowait_time(kcs, i); + irq += kcs->cpustat[CPUTIME_IRQ]; + softirq += kcs->cpustat[CPUTIME_SOFTIRQ]; + steal += kcs->cpustat[CPUTIME_STEAL]; + guest += kcs->cpustat[CPUTIME_GUEST]; + guest_nice += kcs->cpustat[CPUTIME_GUEST_NICE]; sum += kstat_cpu_irqs_sum(i); sum += arch_irq_stat_cpu(i); @@ -130,17 +132,19 @@ static int show_stat(struct seq_file *p, void *v) seq_putc(p, '\n'); for_each_online_cpu(i) { + struct kernel_cpustat *kcs = &kcpustat_cpu(i); + /* Copy values here to work around gcc-2.95.3, gcc-2.96 */ - user = kcpustat_cpu(i).cpustat[CPUTIME_USER]; - nice = kcpustat_cpu(i).cpustat[CPUTIME_NICE]; - system = kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM]; - idle = get_idle_time(i); - iowait = get_iowait_time(i); - irq = kcpustat_cpu(i).cpustat[CPUTIME_IRQ]; - softirq = kcpustat_cpu(i).cpustat[CPUTIME_SOFTIRQ]; - steal = kcpustat_cpu(i).cpustat[CPUTIME_STEAL]; - guest = kcpustat_cpu(i).cpustat[CPUTIME_GUEST]; - guest_nice = kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE]; + user = kcs->cpustat[CPUTIME_USER]; + nice = kcs->cpustat[CPUTIME_NICE]; + system = kcs->cpustat[CPUTIME_SYSTEM]; + idle = get_idle_time(kcs, i); + iowait = get_iowait_time(kcs, i); + irq = kcs->cpustat[CPUTIME_IRQ]; + softirq = kcs->cpustat[CPUTIME_SOFTIRQ]; + steal = kcs->cpustat[CPUTIME_STEAL]; + guest = kcs->cpustat[CPUTIME_GUEST]; + guest_nice = kcs->cpustat[CPUTIME_GUEST_NICE]; seq_printf(p, "cpu%d", i); seq_put_decimal_ull(p, " ", nsec_to_clock_t(user)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(nice)); From 08b55775133b77acc9975ad772b41813cbfea674 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 5 Mar 2019 15:50:35 -0800 Subject: [PATCH 156/159] proc: use seq_puts() everywhere seq_printf() without format specifiers == faster seq_puts() Link: http://lkml.kernel.org/r/20190114200545.GC9680@avx2 Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/array.c | 16 ++++++++-------- fs/proc/base.c | 2 +- fs/proc/task_nommu.c | 2 +- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/fs/proc/array.c b/fs/proc/array.c index 9d428d5a0ac8..2edbb657f859 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -343,28 +343,28 @@ static inline void task_seccomp(struct seq_file *m, struct task_struct *p) #ifdef CONFIG_SECCOMP seq_put_decimal_ull(m, "\nSeccomp:\t", p->seccomp.mode); #endif - seq_printf(m, "\nSpeculation_Store_Bypass:\t"); + seq_puts(m, "\nSpeculation_Store_Bypass:\t"); switch (arch_prctl_spec_ctrl_get(p, PR_SPEC_STORE_BYPASS)) { case -EINVAL: - seq_printf(m, "unknown"); + seq_puts(m, "unknown"); break; case PR_SPEC_NOT_AFFECTED: - seq_printf(m, "not vulnerable"); + seq_puts(m, "not vulnerable"); break; case PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE: - seq_printf(m, "thread force mitigated"); + seq_puts(m, "thread force mitigated"); break; case PR_SPEC_PRCTL | PR_SPEC_DISABLE: - seq_printf(m, "thread mitigated"); + seq_puts(m, "thread mitigated"); break; case PR_SPEC_PRCTL | PR_SPEC_ENABLE: - seq_printf(m, "thread vulnerable"); + seq_puts(m, "thread vulnerable"); break; case PR_SPEC_DISABLE: - seq_printf(m, "globally mitigated"); + seq_puts(m, "globally mitigated"); break; default: - seq_printf(m, "vulnerable"); + seq_puts(m, "vulnerable"); break; } seq_putc(m, '\n'); diff --git a/fs/proc/base.c b/fs/proc/base.c index fcb0180d3988..511b279ec69c 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -456,7 +456,7 @@ static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { if (unlikely(!sched_info_on())) - seq_printf(m, "0 0 0\n"); + seq_puts(m, "0 0 0\n"); else seq_printf(m, "%llu %llu %lu\n", (unsigned long long)task->se.sum_exec_runtime, diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index 0b63d68dedb2..3b7e310297d2 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -178,7 +178,7 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma) seq_file_path(m, file, ""); } else if (mm && is_stack(vma)) { seq_pad(m, ' '); - seq_printf(m, "[stack]"); + seq_puts(m, "[stack]"); } seq_putc(m, '\n'); From e483b0208784146864a2c195e316230647e9d297 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 5 Mar 2019 15:50:39 -0800 Subject: [PATCH 157/159] proc: test /proc/*/maps, smaps, smaps_rollup, statm Start testing VM related fiels found in per-process files. Do it by jiting small executable which brings its address space to precisely known state, then comparing /proc/*/maps, smaps, smaps_rollup, and statm files to expected values. Currently only x86_64 is supported. [adobriyan@gmail.com: exit correctly in /proc/*/maps test] Link: http://lkml.kernel.org/r/20190206073659.GB15311@avx2 Link: http://lkml.kernel.org/r/20190203165806.GA14568@avx2 Signed-off-by: Alexey Dobriyan Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/proc/.gitignore | 1 + tools/testing/selftests/proc/Makefile | 1 + tools/testing/selftests/proc/proc-pid-vm.c | 406 +++++++++++++++++++++ 3 files changed, 408 insertions(+) create mode 100644 tools/testing/selftests/proc/proc-pid-vm.c diff --git a/tools/testing/selftests/proc/.gitignore b/tools/testing/selftests/proc/.gitignore index 29bac5ef9a93..444ad39d3700 100644 --- a/tools/testing/selftests/proc/.gitignore +++ b/tools/testing/selftests/proc/.gitignore @@ -2,6 +2,7 @@ /fd-002-posix-eq /fd-003-kthread /proc-loadavg-001 +/proc-pid-vm /proc-self-map-files-001 /proc-self-map-files-002 /proc-self-syscall diff --git a/tools/testing/selftests/proc/Makefile b/tools/testing/selftests/proc/Makefile index 434d033ee067..5163dc887aa3 100644 --- a/tools/testing/selftests/proc/Makefile +++ b/tools/testing/selftests/proc/Makefile @@ -6,6 +6,7 @@ TEST_GEN_PROGS += fd-001-lookup TEST_GEN_PROGS += fd-002-posix-eq TEST_GEN_PROGS += fd-003-kthread TEST_GEN_PROGS += proc-loadavg-001 +TEST_GEN_PROGS += proc-pid-vm TEST_GEN_PROGS += proc-self-map-files-001 TEST_GEN_PROGS += proc-self-map-files-002 TEST_GEN_PROGS += proc-self-syscall diff --git a/tools/testing/selftests/proc/proc-pid-vm.c b/tools/testing/selftests/proc/proc-pid-vm.c new file mode 100644 index 000000000000..bbe8150d18aa --- /dev/null +++ b/tools/testing/selftests/proc/proc-pid-vm.c @@ -0,0 +1,406 @@ +/* + * Copyright (c) 2019 Alexey Dobriyan + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ +/* + * Fork and exec tiny 1 page executable which precisely controls its VM. + * Test /proc/$PID/maps + * Test /proc/$PID/smaps + * Test /proc/$PID/smaps_rollup + * Test /proc/$PID/statm + * + * FIXME require CONFIG_TMPFS which can be disabled + * FIXME test other values from "smaps" + * FIXME support other archs + */ +#undef NDEBUG +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static inline long sys_execveat(int dirfd, const char *pathname, char **argv, char **envp, int flags) +{ + return syscall(SYS_execveat, dirfd, pathname, argv, envp, flags); +} + +static void make_private_tmp(void) +{ + if (unshare(CLONE_NEWNS) == -1) { + if (errno == ENOSYS || errno == EPERM) { + exit(4); + } + exit(1); + } + if (mount(NULL, "/", NULL, MS_PRIVATE|MS_REC, NULL) == -1) { + exit(1); + } + if (mount(NULL, "/tmp", "tmpfs", 0, NULL) == -1) { + exit(1); + } +} + +static pid_t pid = -1; +static void ate(void) +{ + if (pid > 0) { + kill(pid, SIGTERM); + } +} + +struct elf64_hdr { + uint8_t e_ident[16]; + uint16_t e_type; + uint16_t e_machine; + uint32_t e_version; + uint64_t e_entry; + uint64_t e_phoff; + uint64_t e_shoff; + uint32_t e_flags; + uint16_t e_ehsize; + uint16_t e_phentsize; + uint16_t e_phnum; + uint16_t e_shentsize; + uint16_t e_shnum; + uint16_t e_shstrndx; +}; + +struct elf64_phdr { + uint32_t p_type; + uint32_t p_flags; + uint64_t p_offset; + uint64_t p_vaddr; + uint64_t p_paddr; + uint64_t p_filesz; + uint64_t p_memsz; + uint64_t p_align; +}; + +#ifdef __x86_64__ +#define PAGE_SIZE 4096 +#define VADDR (1UL << 32) +#define MAPS_OFFSET 73 + +#define syscall 0x0f, 0x05 +#define mov_rdi(x) \ + 0x48, 0xbf, \ + (x)&0xff, ((x)>>8)&0xff, ((x)>>16)&0xff, ((x)>>24)&0xff, \ + ((x)>>32)&0xff, ((x)>>40)&0xff, ((x)>>48)&0xff, ((x)>>56)&0xff + +#define mov_rsi(x) \ + 0x48, 0xbe, \ + (x)&0xff, ((x)>>8)&0xff, ((x)>>16)&0xff, ((x)>>24)&0xff, \ + ((x)>>32)&0xff, ((x)>>40)&0xff, ((x)>>48)&0xff, ((x)>>56)&0xff + +#define mov_eax(x) \ + 0xb8, (x)&0xff, ((x)>>8)&0xff, ((x)>>16)&0xff, ((x)>>24)&0xff + +static const uint8_t payload[] = { + /* Casually unmap stack, vDSO and everything else. */ + /* munmap */ + mov_rdi(VADDR + 4096), + mov_rsi((1ULL << 47) - 4096 - VADDR - 4096), + mov_eax(11), + syscall, + + /* Ping parent. */ + /* write(0, &c, 1); */ + 0x31, 0xff, /* xor edi, edi */ + 0x48, 0x8d, 0x35, 0x00, 0x00, 0x00, 0x00, /* lea rsi, [rip] */ + 0xba, 0x01, 0x00, 0x00, 0x00, /* mov edx, 1 */ + mov_eax(1), + syscall, + + /* 1: pause(); */ + mov_eax(34), + syscall, + + 0xeb, 0xf7, /* jmp 1b */ +}; + +static int make_exe(const uint8_t *payload, size_t len) +{ + struct elf64_hdr h; + struct elf64_phdr ph; + + struct iovec iov[3] = { + {&h, sizeof(struct elf64_hdr)}, + {&ph, sizeof(struct elf64_phdr)}, + {(void *)payload, len}, + }; + int fd, fd1; + char buf[64]; + + memset(&h, 0, sizeof(h)); + h.e_ident[0] = 0x7f; + h.e_ident[1] = 'E'; + h.e_ident[2] = 'L'; + h.e_ident[3] = 'F'; + h.e_ident[4] = 2; + h.e_ident[5] = 1; + h.e_ident[6] = 1; + h.e_ident[7] = 0; + h.e_type = 2; + h.e_machine = 0x3e; + h.e_version = 1; + h.e_entry = VADDR + sizeof(struct elf64_hdr) + sizeof(struct elf64_phdr); + h.e_phoff = sizeof(struct elf64_hdr); + h.e_shoff = 0; + h.e_flags = 0; + h.e_ehsize = sizeof(struct elf64_hdr); + h.e_phentsize = sizeof(struct elf64_phdr); + h.e_phnum = 1; + h.e_shentsize = 0; + h.e_shnum = 0; + h.e_shstrndx = 0; + + memset(&ph, 0, sizeof(ph)); + ph.p_type = 1; + ph.p_flags = (1<<2)|1; + ph.p_offset = 0; + ph.p_vaddr = VADDR; + ph.p_paddr = 0; + ph.p_filesz = sizeof(struct elf64_hdr) + sizeof(struct elf64_phdr) + sizeof(payload); + ph.p_memsz = sizeof(struct elf64_hdr) + sizeof(struct elf64_phdr) + sizeof(payload); + ph.p_align = 4096; + + fd = openat(AT_FDCWD, "/tmp", O_WRONLY|O_EXCL|O_TMPFILE, 0700); + if (fd == -1) { + exit(1); + } + + if (writev(fd, iov, 3) != sizeof(struct elf64_hdr) + sizeof(struct elf64_phdr) + len) { + exit(1); + } + + /* Avoid ETXTBSY on exec. */ + snprintf(buf, sizeof(buf), "/proc/self/fd/%u", fd); + fd1 = open(buf, O_RDONLY|O_CLOEXEC); + close(fd); + + return fd1; +} +#endif + +#ifdef __x86_64__ +int main(void) +{ + int pipefd[2]; + int exec_fd; + + atexit(ate); + + make_private_tmp(); + + /* Reserve fd 0 for 1-byte pipe ping from child. */ + close(0); + if (open("/", O_RDONLY|O_DIRECTORY|O_PATH) != 0) { + return 1; + } + + exec_fd = make_exe(payload, sizeof(payload)); + + if (pipe(pipefd) == -1) { + return 1; + } + if (dup2(pipefd[1], 0) != 0) { + return 1; + } + + pid = fork(); + if (pid == -1) { + return 1; + } + if (pid == 0) { + sys_execveat(exec_fd, "", NULL, NULL, AT_EMPTY_PATH); + return 1; + } + + char _; + if (read(pipefd[0], &_, 1) != 1) { + return 1; + } + + struct stat st; + if (fstat(exec_fd, &st) == -1) { + return 1; + } + + /* Generate "head -n1 /proc/$PID/maps" */ + char buf0[256]; + memset(buf0, ' ', sizeof(buf0)); + int len = snprintf(buf0, sizeof(buf0), + "%08lx-%08lx r-xp 00000000 %02lx:%02lx %llu", + VADDR, VADDR + PAGE_SIZE, + MAJOR(st.st_dev), MINOR(st.st_dev), + (unsigned long long)st.st_ino); + buf0[len] = ' '; + snprintf(buf0 + MAPS_OFFSET, sizeof(buf0) - MAPS_OFFSET, + "/tmp/#%llu (deleted)\n", (unsigned long long)st.st_ino); + + + /* Test /proc/$PID/maps */ + { + char buf[256]; + ssize_t rv; + int fd; + + snprintf(buf, sizeof(buf), "/proc/%u/maps", pid); + fd = open(buf, O_RDONLY); + if (fd == -1) { + return 1; + } + rv = read(fd, buf, sizeof(buf)); + assert(rv == strlen(buf0)); + assert(memcmp(buf, buf0, strlen(buf0)) == 0); + } + + /* Test /proc/$PID/smaps */ + { + char buf[1024]; + ssize_t rv; + int fd; + + snprintf(buf, sizeof(buf), "/proc/%u/smaps", pid); + fd = open(buf, O_RDONLY); + if (fd == -1) { + return 1; + } + rv = read(fd, buf, sizeof(buf)); + assert(0 <= rv && rv <= sizeof(buf)); + + assert(rv >= strlen(buf0)); + assert(memcmp(buf, buf0, strlen(buf0)) == 0); + +#define RSS1 "Rss: 4 kB\n" +#define RSS2 "Rss: 0 kB\n" +#define PSS1 "Pss: 4 kB\n" +#define PSS2 "Pss: 0 kB\n" + assert(memmem(buf, rv, RSS1, strlen(RSS1)) || + memmem(buf, rv, RSS2, strlen(RSS2))); + assert(memmem(buf, rv, PSS1, strlen(PSS1)) || + memmem(buf, rv, PSS2, strlen(PSS2))); + + static const char *S[] = { + "Size: 4 kB\n", + "KernelPageSize: 4 kB\n", + "MMUPageSize: 4 kB\n", + "Anonymous: 0 kB\n", + "AnonHugePages: 0 kB\n", + "Shared_Hugetlb: 0 kB\n", + "Private_Hugetlb: 0 kB\n", + "Locked: 0 kB\n", + }; + int i; + + for (i = 0; i < sizeof(S)/sizeof(S[0]); i++) { + assert(memmem(buf, rv, S[i], strlen(S[i]))); + } + } + + /* Test /proc/$PID/smaps_rollup */ + { + char bufr[256]; + memset(bufr, ' ', sizeof(bufr)); + len = snprintf(bufr, sizeof(bufr), + "%08lx-%08lx ---p 00000000 00:00 0", + VADDR, VADDR + PAGE_SIZE); + bufr[len] = ' '; + snprintf(bufr + MAPS_OFFSET, sizeof(bufr) - MAPS_OFFSET, + "[rollup]\n"); + + char buf[1024]; + ssize_t rv; + int fd; + + snprintf(buf, sizeof(buf), "/proc/%u/smaps_rollup", pid); + fd = open(buf, O_RDONLY); + if (fd == -1) { + return 1; + } + rv = read(fd, buf, sizeof(buf)); + assert(0 <= rv && rv <= sizeof(buf)); + + assert(rv >= strlen(bufr)); + assert(memcmp(buf, bufr, strlen(bufr)) == 0); + + assert(memmem(buf, rv, RSS1, strlen(RSS1)) || + memmem(buf, rv, RSS2, strlen(RSS2))); + assert(memmem(buf, rv, PSS1, strlen(PSS1)) || + memmem(buf, rv, PSS2, strlen(PSS2))); + + static const char *S[] = { + "Anonymous: 0 kB\n", + "AnonHugePages: 0 kB\n", + "Shared_Hugetlb: 0 kB\n", + "Private_Hugetlb: 0 kB\n", + "Locked: 0 kB\n", + }; + int i; + + for (i = 0; i < sizeof(S)/sizeof(S[0]); i++) { + assert(memmem(buf, rv, S[i], strlen(S[i]))); + } + } + + /* Test /proc/$PID/statm */ + { + char buf[64]; + ssize_t rv; + int fd; + + snprintf(buf, sizeof(buf), "/proc/%u/statm", pid); + fd = open(buf, O_RDONLY); + if (fd == -1) { + return 1; + } + rv = read(fd, buf, sizeof(buf)); + assert(rv == 7 * 2); + + assert(buf[0] == '1'); /* ->total_vm */ + assert(buf[1] == ' '); + assert(buf[2] == '0' || buf[2] == '1'); /* rss */ + assert(buf[3] == ' '); + assert(buf[4] == '0' || buf[2] == '1'); /* file rss */ + assert(buf[5] == ' '); + assert(buf[6] == '1'); /* ELF executable segments */ + assert(buf[7] == ' '); + assert(buf[8] == '0'); + assert(buf[9] == ' '); + assert(buf[10] == '0'); /* ->data_vm + ->stack_vm */ + assert(buf[11] == ' '); + assert(buf[12] == '0'); + assert(buf[13] == '\n'); + } + + return 0; +} +#else +int main(void) +{ + return 4; +} +#endif From 332e0e804d64894cf32db363e7f14c64a6ce8061 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 5 Mar 2019 15:50:42 -0800 Subject: [PATCH 158/159] proc: more robust bulk read test /proc may not be mounted and test will exit successfully. Ensure proc is mounted at /proc. Link: http://lkml.kernel.org/r/20190209105613.GA10384@avx2 Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/proc/read.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/tools/testing/selftests/proc/read.c b/tools/testing/selftests/proc/read.c index b2bd8da58efe..b3ef9e14d6cc 100644 --- a/tools/testing/selftests/proc/read.c +++ b/tools/testing/selftests/proc/read.c @@ -26,8 +26,10 @@ #include #include #include +#include #include #include +#include #include #include @@ -123,10 +125,22 @@ static void f(DIR *d, unsigned int level) int main(void) { DIR *d; + struct statfs sfs; d = opendir("/proc"); if (!d) return 4; + + /* Ensure /proc is proc. */ + if (fstatfs(dirfd(d), &sfs) == -1) { + return 1; + } + if (sfs.f_type != 0x9fa0) { + fprintf(stderr, "error: unexpected f_type %lx\n", (long)sfs.f_type); + return 2; + } + f(d, 0); + return 0; } From fff04900ea79915939ef6a3aad78fca6511a3034 Mon Sep 17 00:00:00 2001 From: Souptick Joarder Date: Tue, 5 Mar 2019 15:50:45 -0800 Subject: [PATCH 159/159] tools/testing/selftests/proc/proc-self-syscall.c: remove duplicate include Remove duplicate header which is included twice. Link: http://lkml.kernel.org/r/20190304182719.GA6606@jordon-HP-15-Notebook-PC Signed-off-by: Sabyasachi Gupta Signed-off-by: Souptick Joarder Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/proc/proc-self-syscall.c | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/testing/selftests/proc/proc-self-syscall.c b/tools/testing/selftests/proc/proc-self-syscall.c index b2993a67a4b3..9f6d000c0245 100644 --- a/tools/testing/selftests/proc/proc-self-syscall.c +++ b/tools/testing/selftests/proc/proc-self-syscall.c @@ -20,7 +20,6 @@ #include #include #include -#include #include #include