From 441a627806873c1b63d06dea4391e79c88b8e496 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Tue, 21 May 2019 09:05:03 +0530 Subject: [PATCH 01/54] arm64/hugetlb: Use macros for contiguous huge page sizes Replace all open encoded contiguous huge page size computations with available macro encodings CONT_PTE_SIZE and CONT_PMD_SIZE. There are other instances where these macros are used in the file and this change makes it consistently use the same mnemonic. Signed-off-by: Anshuman Khandual Cc: Will Deacon Cc: Steve Capper Cc: Mark Rutland Signed-off-by: Catalin Marinas --- arch/arm64/mm/hugetlbpage.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index f475e54fbc43..bbeb6a5a6ba6 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -228,7 +228,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, if (sz == PUD_SIZE) { ptep = (pte_t *)pudp; - } else if (sz == (PAGE_SIZE * CONT_PTES)) { + } else if (sz == (CONT_PTE_SIZE)) { pmdp = pmd_alloc(mm, pudp, addr); WARN_ON(addr & (sz - 1)); @@ -246,7 +246,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, ptep = huge_pmd_share(mm, addr, pudp); else ptep = (pte_t *)pmd_alloc(mm, pudp, addr); - } else if (sz == (PMD_SIZE * CONT_PMDS)) { + } else if (sz == (CONT_PMD_SIZE)) { pmdp = pmd_alloc(mm, pudp, addr); WARN_ON(addr & (sz - 1)); return (pte_t *)pmdp; @@ -454,9 +454,9 @@ static int __init hugetlbpage_init(void) #ifdef CONFIG_ARM64_4K_PAGES add_huge_page_size(PUD_SIZE); #endif - add_huge_page_size(PMD_SIZE * CONT_PMDS); + add_huge_page_size(CONT_PMD_SIZE); add_huge_page_size(PMD_SIZE); - add_huge_page_size(PAGE_SIZE * CONT_PTES); + add_huge_page_size(CONT_PTE_SIZE); return 0; } @@ -470,9 +470,9 @@ static __init int setup_hugepagesz(char *opt) #ifdef CONFIG_ARM64_4K_PAGES case PUD_SIZE: #endif - case PMD_SIZE * CONT_PMDS: + case CONT_PMD_SIZE: case PMD_SIZE: - case PAGE_SIZE * CONT_PTES: + case CONT_PTE_SIZE: add_huge_page_size(ps); return 1; } From 201d355c15c170d16741d43b865d5c0ab88a0afc Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Tue, 21 May 2019 09:36:27 +0530 Subject: [PATCH 02/54] arm64/mm: Move PTE_VALID from SW defined to HW page table entry definitions PTE_VALID signifies that the last level page table entry is valid and it is MMU recognized while walking the page table. This is not a software defined PTE bit and should not be listed like one. Just move it to appropriate header file. Signed-off-by: Anshuman Khandual Cc: Will Deacon Cc: Steve Capper Cc: Suzuki Poulose Cc: James Morse Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/pgtable-hwdef.h | 1 + arch/arm64/include/asm/pgtable-prot.h | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h index a69259cc1f16..974f0114ef1b 100644 --- a/arch/arm64/include/asm/pgtable-hwdef.h +++ b/arch/arm64/include/asm/pgtable-hwdef.h @@ -153,6 +153,7 @@ /* * Level 3 descriptor (PTE). */ +#define PTE_VALID (_AT(pteval_t, 1) << 0) #define PTE_TYPE_MASK (_AT(pteval_t, 3) << 0) #define PTE_TYPE_FAULT (_AT(pteval_t, 0) << 0) #define PTE_TYPE_PAGE (_AT(pteval_t, 3) << 0) diff --git a/arch/arm64/include/asm/pgtable-prot.h b/arch/arm64/include/asm/pgtable-prot.h index 986e41c4c32b..38c7148e2023 100644 --- a/arch/arm64/include/asm/pgtable-prot.h +++ b/arch/arm64/include/asm/pgtable-prot.h @@ -24,7 +24,6 @@ /* * Software defined PTE bits definition. 
*/ -#define PTE_VALID (_AT(pteval_t, 1) << 0) #define PTE_WRITE (PTE_DBM) /* same as DBM (51) */ #define PTE_DIRTY (_AT(pteval_t, 1) << 55) #define PTE_SPECIAL (_AT(pteval_t, 1) << 56) From 6fa9b41f6f15d79666e1fbcb7a8c122bea58d9cb Mon Sep 17 00:00:00 2001 From: Julien Grall Date: Tue, 21 May 2019 18:21:37 +0100 Subject: [PATCH 03/54] arm64/fpsimd: Remove the prototype for sve_flush_cpu_state() The function sve_flush_cpu_state() has been removed in commit 21cdd7fd76e3 ("KVM: arm64: Remove eager host SVE state saving"). So remove the associated prototype in asm/fpsimd.h. Reviewed-by: Dave Martin Acked-by: Marc Zyngier Signed-off-by: Julien Grall Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/fpsimd.h | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/arm64/include/asm/fpsimd.h b/arch/arm64/include/asm/fpsimd.h index df62bbd33a9a..b73d12fcc7f9 100644 --- a/arch/arm64/include/asm/fpsimd.h +++ b/arch/arm64/include/asm/fpsimd.h @@ -64,7 +64,6 @@ extern void fpsimd_bind_state_to_cpu(struct user_fpsimd_state *state, extern void fpsimd_flush_task_state(struct task_struct *target); extern void fpsimd_flush_cpu_state(void); -extern void sve_flush_cpu_state(void); /* Maximum VL that SVE VL-agnostic software can transparently support */ #define SVE_VL_ARCH_MAX 0x100 From 54b8c7cbc57c1ce21f4e35101f2609092c4af49a Mon Sep 17 00:00:00 2001 From: Julien Grall Date: Tue, 21 May 2019 18:21:38 +0100 Subject: [PATCH 04/54] arm64/fpsimd: Introduce fpsimd_save_and_flush_cpu_state() and use it The only external user of fpsimd_save() and fpsimd_flush_cpu_state() is the KVM FPSIMD code. A following patch will introduce a mechanism to acquire owernship of the FPSIMD/SVE context for performing context management operations. Rather than having to export the new helpers to get/put the context, we can just introduce a new function to combine fpsimd_save() and fpsimd_flush_cpu_state(). This has also the advantage to remove any external call of fpsimd_save() and fpsimd_flush_cpu_state(), so they can be turned static. Lastly, the new function can also be used in the PM notifier. Reviewed-by: Dave Martin Acked-by: Marc Zyngier Signed-off-by: Julien Grall Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/fpsimd.h | 4 +--- arch/arm64/kernel/fpsimd.c | 17 +++++++++++++---- arch/arm64/kvm/fpsimd.c | 4 +--- 3 files changed, 15 insertions(+), 10 deletions(-) diff --git a/arch/arm64/include/asm/fpsimd.h b/arch/arm64/include/asm/fpsimd.h index b73d12fcc7f9..4154851c21ab 100644 --- a/arch/arm64/include/asm/fpsimd.h +++ b/arch/arm64/include/asm/fpsimd.h @@ -48,8 +48,6 @@ struct task_struct; extern void fpsimd_save_state(struct user_fpsimd_state *state); extern void fpsimd_load_state(struct user_fpsimd_state *state); -extern void fpsimd_save(void); - extern void fpsimd_thread_switch(struct task_struct *next); extern void fpsimd_flush_thread(void); @@ -63,7 +61,7 @@ extern void fpsimd_bind_state_to_cpu(struct user_fpsimd_state *state, void *sve_state, unsigned int sve_vl); extern void fpsimd_flush_task_state(struct task_struct *target); -extern void fpsimd_flush_cpu_state(void); +extern void fpsimd_save_and_flush_cpu_state(void); /* Maximum VL that SVE VL-agnostic software can transparently support */ #define SVE_VL_ARCH_MAX 0x100 diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index a38bf74bcca8..6448921a2f59 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -246,7 +246,7 @@ static void task_fpsimd_load(void) * * Softirqs (and preemption) must be disabled. 
*/ -void fpsimd_save(void) +static void fpsimd_save(void) { struct fpsimd_last_state_struct const *last = this_cpu_ptr(&fpsimd_last_state); @@ -1122,12 +1122,22 @@ void fpsimd_flush_task_state(struct task_struct *t) * Invalidate any task's FPSIMD state that is present on this cpu. * This function must be called with softirqs disabled. */ -void fpsimd_flush_cpu_state(void) +static void fpsimd_flush_cpu_state(void) { __this_cpu_write(fpsimd_last_state.st, NULL); set_thread_flag(TIF_FOREIGN_FPSTATE); } +/* + * Save the FPSIMD state to memory and invalidate cpu view. + * This function must be called with softirqs (and preemption) disabled. + */ +void fpsimd_save_and_flush_cpu_state(void) +{ + fpsimd_save(); + fpsimd_flush_cpu_state(); +} + #ifdef CONFIG_KERNEL_MODE_NEON DEFINE_PER_CPU(bool, kernel_neon_busy); @@ -1284,8 +1294,7 @@ static int fpsimd_cpu_pm_notifier(struct notifier_block *self, { switch (cmd) { case CPU_PM_ENTER: - fpsimd_save(); - fpsimd_flush_cpu_state(); + fpsimd_save_and_flush_cpu_state(); break; case CPU_PM_EXIT: break; diff --git a/arch/arm64/kvm/fpsimd.c b/arch/arm64/kvm/fpsimd.c index 6e3c9c8b2df9..525010504f9d 100644 --- a/arch/arm64/kvm/fpsimd.c +++ b/arch/arm64/kvm/fpsimd.c @@ -112,9 +112,7 @@ void kvm_arch_vcpu_put_fp(struct kvm_vcpu *vcpu) if (vcpu->arch.flags & KVM_ARM64_FP_ENABLED) { u64 *guest_zcr = &vcpu->arch.ctxt.sys_regs[ZCR_EL1]; - /* Clean guest FP state to memory and invalidate cpu view */ - fpsimd_save(); - fpsimd_flush_cpu_state(); + fpsimd_save_and_flush_cpu_state(); if (guest_has_sve) *guest_zcr = read_sysreg_s(SYS_ZCR_EL12); From 6dcdefcde413c1068b394eeabdfdf6a85213ebe2 Mon Sep 17 00:00:00 2001 From: Julien Grall Date: Tue, 21 May 2019 18:21:39 +0100 Subject: [PATCH 05/54] arm64/fpsimd: Don't disable softirq when touching FPSIMD/SVE state When the kernel is compiled with CONFIG_KERNEL_MODE_NEON, some part of the kernel may be able to use FPSIMD/SVE. This is for instance the case for crypto code. Any use of FPSIMD/SVE in the kernel are clearly marked by using the function kernel_neon_{begin, end}. Furthermore, this can only be used when may_use_simd() returns true. The current implementation of may_use_simd() allows softirq to use FPSIMD/SVE unless it is currently in use (i.e kernel_neon_busy is true). When in use, softirqs usually fall back to a software method. At the moment, as a softirq may use FPSIMD/SVE, softirqs are disabled when touching the FPSIMD/SVE context. This has the drawback to disable all softirqs even if they are not using FPSIMD/SVE. Since a softirq is supposed to check may_use_simd() anyway before attempting to use FPSIMD/SVE, there is limited reason to keep softirq disabled when touching the FPSIMD/SVE context. Instead, we can simply disable preemption and mark the FPSIMD/SVE context as in use by setting CPU's fpsimd_context_busy flag. Two new helpers {get, put}_cpu_fpsimd_context are introduced to mark the area using FPSIMD/SVE context and they are used to replace local_bh_{disable, enable}. The functions kernel_neon_{begin, end} are also re-implemented to use the new helpers. Additionally, double-underscored versions of the helpers are provided to called when preemption is already disabled. These are only relevant on paths where irqs are disabled anyway, so they are not needed for correctness in the current code. Let's use them anyway though: this marks critical sections clearly and will help to avoid mistakes during future maintenance. The change has been benchmarked on Linux 5.1-rc4 with defconfig. 
On Juno2: * hackbench 100 process 1000 (10 times) * .7% quicker On ThunderX 2: * hackbench 1000 process 1000 (20 times) * 3.4% quicker Reviewed-by: Dave Martin Acked-by: Marc Zyngier Signed-off-by: Julien Grall Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/simd.h | 10 +-- arch/arm64/kernel/fpsimd.c | 124 +++++++++++++++++++++++----------- 2 files changed, 89 insertions(+), 45 deletions(-) diff --git a/arch/arm64/include/asm/simd.h b/arch/arm64/include/asm/simd.h index 6495cc51246f..a6307e43b8c2 100644 --- a/arch/arm64/include/asm/simd.h +++ b/arch/arm64/include/asm/simd.h @@ -15,9 +15,9 @@ #include #include -#ifdef CONFIG_KERNEL_MODE_NEON +DECLARE_PER_CPU(bool, fpsimd_context_busy); -DECLARE_PER_CPU(bool, kernel_neon_busy); +#ifdef CONFIG_KERNEL_MODE_NEON /* * may_use_simd - whether it is allowable at this time to issue SIMD @@ -29,15 +29,15 @@ DECLARE_PER_CPU(bool, kernel_neon_busy); static __must_check inline bool may_use_simd(void) { /* - * kernel_neon_busy is only set while preemption is disabled, + * fpsimd_context_busy is only set while preemption is disabled, * and is clear whenever preemption is enabled. Since - * this_cpu_read() is atomic w.r.t. preemption, kernel_neon_busy + * this_cpu_read() is atomic w.r.t. preemption, fpsimd_context_busy * cannot change under our feet -- if it's set we cannot be * migrated, and if it's clear we cannot be migrated to a CPU * where it is set. */ return !in_irq() && !irqs_disabled() && !in_nmi() && - !this_cpu_read(kernel_neon_busy); + !this_cpu_read(fpsimd_context_busy); } #else /* ! CONFIG_KERNEL_MODE_NEON */ diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index 6448921a2f59..c7c454df2779 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -92,7 +92,8 @@ * To prevent this from racing with the manipulation of the task's FPSIMD state * from task context and thereby corrupting the state, it is necessary to * protect any manipulation of a task's fpsimd_state or TIF_FOREIGN_FPSTATE - * flag with local_bh_disable() unless softirqs are already masked. + * flag with {, __}get_cpu_fpsimd_context(). This will still allow softirqs to + * run but prevent them to use FPSIMD. * * For a certain task, the sequence may look something like this: * - the task gets scheduled in; if both the task's fpsimd_cpu field @@ -155,6 +156,56 @@ extern void __percpu *efi_sve_state; #endif /* ! CONFIG_ARM64_SVE */ +DEFINE_PER_CPU(bool, fpsimd_context_busy); +EXPORT_PER_CPU_SYMBOL(fpsimd_context_busy); + +static void __get_cpu_fpsimd_context(void) +{ + bool busy = __this_cpu_xchg(fpsimd_context_busy, true); + + WARN_ON(busy); +} + +/* + * Claim ownership of the CPU FPSIMD context for use by the calling context. + * + * The caller may freely manipulate the FPSIMD context metadata until + * put_cpu_fpsimd_context() is called. + * + * The double-underscore version must only be called if you know the task + * can't be preempted. + */ +static void get_cpu_fpsimd_context(void) +{ + preempt_disable(); + __get_cpu_fpsimd_context(); +} + +static void __put_cpu_fpsimd_context(void) +{ + bool busy = __this_cpu_xchg(fpsimd_context_busy, false); + + WARN_ON(!busy); /* No matching get_cpu_fpsimd_context()? */ +} + +/* + * Release the CPU FPSIMD context. + * + * Must be called from a context in which get_cpu_fpsimd_context() was + * previously called, with no call to put_cpu_fpsimd_context() in the + * meantime. 
+ */ +static void put_cpu_fpsimd_context(void) +{ + __put_cpu_fpsimd_context(); + preempt_enable(); +} + +static bool have_cpu_fpsimd_context(void) +{ + return !preemptible() && __this_cpu_read(fpsimd_context_busy); +} + /* * Call __sve_free() directly only if you know task can't be scheduled * or preempted. @@ -225,12 +276,10 @@ static void sve_free(struct task_struct *task) * This function should be called only when the FPSIMD/SVE state in * thread_struct is known to be up to date, when preparing to enter * userspace. - * - * Softirqs (and preemption) must be disabled. */ static void task_fpsimd_load(void) { - WARN_ON(!in_softirq() && !irqs_disabled()); + WARN_ON(!have_cpu_fpsimd_context()); if (system_supports_sve() && test_thread_flag(TIF_SVE)) sve_load_state(sve_pffr(¤t->thread), @@ -243,8 +292,6 @@ static void task_fpsimd_load(void) /* * Ensure FPSIMD/SVE storage in memory for the loaded context is up to * date with respect to the CPU registers. - * - * Softirqs (and preemption) must be disabled. */ static void fpsimd_save(void) { @@ -252,7 +299,7 @@ static void fpsimd_save(void) this_cpu_ptr(&fpsimd_last_state); /* set by fpsimd_bind_task_to_cpu() or fpsimd_bind_state_to_cpu() */ - WARN_ON(!in_softirq() && !irqs_disabled()); + WARN_ON(!have_cpu_fpsimd_context()); if (!test_thread_flag(TIF_FOREIGN_FPSTATE)) { if (system_supports_sve() && test_thread_flag(TIF_SVE)) { @@ -357,7 +404,8 @@ static int __init sve_sysctl_init(void) { return 0; } * task->thread.sve_state. * * Task can be a non-runnable task, or current. In the latter case, - * softirqs (and preemption) must be disabled. + * the caller must have ownership of the cpu FPSIMD context before calling + * this function. * task->thread.sve_state must point to at least sve_state_size(task) * bytes of allocated kernel memory. * task->thread.uw.fpsimd_state must be up to date before calling this @@ -384,7 +432,8 @@ static void fpsimd_to_sve(struct task_struct *task) * task->thread.uw.fpsimd_state. * * Task can be a non-runnable task, or current. In the latter case, - * softirqs (and preemption) must be disabled. + * the caller must have ownership of the cpu FPSIMD context before calling + * this function. * task->thread.sve_state must point to at least sve_state_size(task) * bytes of allocated kernel memory. * task->thread.sve_state must be up to date before calling this function. @@ -544,7 +593,7 @@ int sve_set_vector_length(struct task_struct *task, * non-SVE thread. 
*/ if (task == current) { - local_bh_disable(); + get_cpu_fpsimd_context(); fpsimd_save(); } @@ -554,7 +603,7 @@ int sve_set_vector_length(struct task_struct *task, sve_to_fpsimd(task); if (task == current) - local_bh_enable(); + put_cpu_fpsimd_context(); /* * Force reallocation of task SVE state to the correct size @@ -867,7 +916,7 @@ asmlinkage void do_sve_acc(unsigned int esr, struct pt_regs *regs) sve_alloc(current); - local_bh_disable(); + get_cpu_fpsimd_context(); fpsimd_save(); @@ -878,7 +927,7 @@ asmlinkage void do_sve_acc(unsigned int esr, struct pt_regs *regs) if (test_and_set_thread_flag(TIF_SVE)) WARN_ON(1); /* SVE access shouldn't have trapped */ - local_bh_enable(); + put_cpu_fpsimd_context(); } /* @@ -922,6 +971,8 @@ void fpsimd_thread_switch(struct task_struct *next) if (!system_supports_fpsimd()) return; + __get_cpu_fpsimd_context(); + /* Save unsaved fpsimd state, if any: */ fpsimd_save(); @@ -936,6 +987,8 @@ void fpsimd_thread_switch(struct task_struct *next) update_tsk_thread_flag(next, TIF_FOREIGN_FPSTATE, wrong_task || wrong_cpu); + + __put_cpu_fpsimd_context(); } void fpsimd_flush_thread(void) @@ -945,7 +998,7 @@ void fpsimd_flush_thread(void) if (!system_supports_fpsimd()) return; - local_bh_disable(); + get_cpu_fpsimd_context(); fpsimd_flush_task_state(current); memset(¤t->thread.uw.fpsimd_state, 0, @@ -986,7 +1039,7 @@ void fpsimd_flush_thread(void) current->thread.sve_vl_onexec = 0; } - local_bh_enable(); + put_cpu_fpsimd_context(); } /* @@ -998,9 +1051,9 @@ void fpsimd_preserve_current_state(void) if (!system_supports_fpsimd()) return; - local_bh_disable(); + get_cpu_fpsimd_context(); fpsimd_save(); - local_bh_enable(); + put_cpu_fpsimd_context(); } /* @@ -1017,7 +1070,8 @@ void fpsimd_signal_preserve_current_state(void) /* * Associate current's FPSIMD context with this cpu - * Preemption must be disabled when calling this function. + * The caller must have ownership of the cpu FPSIMD context before calling + * this function. */ void fpsimd_bind_task_to_cpu(void) { @@ -1063,14 +1117,14 @@ void fpsimd_restore_current_state(void) if (!system_supports_fpsimd()) return; - local_bh_disable(); + get_cpu_fpsimd_context(); if (test_and_clear_thread_flag(TIF_FOREIGN_FPSTATE)) { task_fpsimd_load(); fpsimd_bind_task_to_cpu(); } - local_bh_enable(); + put_cpu_fpsimd_context(); } /* @@ -1083,7 +1137,7 @@ void fpsimd_update_current_state(struct user_fpsimd_state const *state) if (!system_supports_fpsimd()) return; - local_bh_disable(); + get_cpu_fpsimd_context(); current->thread.uw.fpsimd_state = *state; if (system_supports_sve() && test_thread_flag(TIF_SVE)) @@ -1094,7 +1148,7 @@ void fpsimd_update_current_state(struct user_fpsimd_state const *state) clear_thread_flag(TIF_FOREIGN_FPSTATE); - local_bh_enable(); + put_cpu_fpsimd_context(); } /* @@ -1120,7 +1174,8 @@ void fpsimd_flush_task_state(struct task_struct *t) /* * Invalidate any task's FPSIMD state that is present on this cpu. - * This function must be called with softirqs disabled. + * The FPSIMD context should be acquired with get_cpu_fpsimd_context() + * before calling this function. */ static void fpsimd_flush_cpu_state(void) { @@ -1130,19 +1185,19 @@ static void fpsimd_flush_cpu_state(void) /* * Save the FPSIMD state to memory and invalidate cpu view. - * This function must be called with softirqs (and preemption) disabled. + * This function must be called with preemption disabled. 
*/ void fpsimd_save_and_flush_cpu_state(void) { + WARN_ON(preemptible()); + __get_cpu_fpsimd_context(); fpsimd_save(); fpsimd_flush_cpu_state(); + __put_cpu_fpsimd_context(); } #ifdef CONFIG_KERNEL_MODE_NEON -DEFINE_PER_CPU(bool, kernel_neon_busy); -EXPORT_PER_CPU_SYMBOL(kernel_neon_busy); - /* * Kernel-side NEON support functions */ @@ -1167,19 +1222,13 @@ void kernel_neon_begin(void) BUG_ON(!may_use_simd()); - local_bh_disable(); - - __this_cpu_write(kernel_neon_busy, true); + get_cpu_fpsimd_context(); /* Save unsaved fpsimd state, if any: */ fpsimd_save(); /* Invalidate any task state remaining in the fpsimd regs: */ fpsimd_flush_cpu_state(); - - preempt_disable(); - - local_bh_enable(); } EXPORT_SYMBOL(kernel_neon_begin); @@ -1194,15 +1243,10 @@ EXPORT_SYMBOL(kernel_neon_begin); */ void kernel_neon_end(void) { - bool busy; - if (!system_supports_fpsimd()) return; - busy = __this_cpu_xchg(kernel_neon_busy, false); - WARN_ON(!busy); /* No matching kernel_neon_begin()? */ - - preempt_enable(); + put_cpu_fpsimd_context(); } EXPORT_SYMBOL(kernel_neon_end); From 9a83c84c3a491cbe7fc9dea3c43e26a8e67204d2 Mon Sep 17 00:00:00 2001 From: Shaokun Zhang Date: Tue, 28 May 2019 10:16:53 +0800 Subject: [PATCH 06/54] drivers: base: cacheinfo: Add variable to record max cache line size Add coherency_max_size variable to record the maximum cache line size for different cache levels. If it is available, we will synchronize it as cache line size, otherwise we will use CTR_EL0.CWG reporting in cache_line_size() for arm64. Cc: "Rafael J. Wysocki" Cc: Jeremy Linton Cc: Will Deacon Reviewed-by: Sudeep Holla Reviewed-by: Greg Kroah-Hartman Signed-off-by: Shaokun Zhang Signed-off-by: Catalin Marinas --- drivers/base/cacheinfo.c | 5 +++++ include/linux/cacheinfo.h | 2 ++ 2 files changed, 7 insertions(+) diff --git a/drivers/base/cacheinfo.c b/drivers/base/cacheinfo.c index a7359535caf5..8827c60f51e2 100644 --- a/drivers/base/cacheinfo.c +++ b/drivers/base/cacheinfo.c @@ -213,6 +213,8 @@ int __weak cache_setup_acpi(unsigned int cpu) return -ENOTSUPP; } +unsigned int coherency_max_size; + static int cache_shared_cpu_map_setup(unsigned int cpu) { struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); @@ -251,6 +253,9 @@ static int cache_shared_cpu_map_setup(unsigned int cpu) cpumask_set_cpu(i, &this_leaf->shared_cpu_map); } } + /* record the maximum cache line size */ + if (this_leaf->coherency_line_size > coherency_max_size) + coherency_max_size = this_leaf->coherency_line_size; } return 0; diff --git a/include/linux/cacheinfo.h b/include/linux/cacheinfo.h index 70e19bc6cc9f..46b92cd61d0c 100644 --- a/include/linux/cacheinfo.h +++ b/include/linux/cacheinfo.h @@ -17,6 +17,8 @@ enum cache_type { CACHE_TYPE_UNIFIED = BIT(2), }; +extern unsigned int coherency_max_size; + /** * struct cacheinfo - represent a cache leaf node * @id: This cache's id. It is unique among caches with the same (type, level). From 7b8c87b297a7c1b3badabc1d054b6e0b758952df Mon Sep 17 00:00:00 2001 From: Shaokun Zhang Date: Tue, 28 May 2019 10:16:54 +0800 Subject: [PATCH 07/54] arm64: cacheinfo: Update cache_line_size detected from DT or PPTT cache_line_size is derived from CTR_EL0.CWG field and is called mostly for I/O device drivers. For some platforms like the HiSilicon Kunpeng920 server SoC, cache line sizes are different between L1/2 cache and L3 cache while L1 cache line size is 64-byte and L3 is 128-byte, but CTR_EL0.CWG is misreporting using L1 cache line size. We shall correct the right value which is important for I/O performance. 
Let's update the cache line size if it is detected from DT or PPTT information. Cc: Will Deacon Cc: Jeremy Linton Cc: Zhenfa Qiu Reported-by: Zhenfa Qiu Suggested-by: Catalin Marinas Reviewed-by: Sudeep Holla Signed-off-by: Shaokun Zhang Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/cache.h | 6 +----- arch/arm64/kernel/cacheinfo.c | 11 +++++++++++ 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/arch/arm64/include/asm/cache.h b/arch/arm64/include/asm/cache.h index 926434f413fa..758af6340314 100644 --- a/arch/arm64/include/asm/cache.h +++ b/arch/arm64/include/asm/cache.h @@ -91,11 +91,7 @@ static inline u32 cache_type_cwg(void) #define __read_mostly __attribute__((__section__(".data..read_mostly"))) -static inline int cache_line_size(void) -{ - u32 cwg = cache_type_cwg(); - return cwg ? 4 << cwg : ARCH_DMA_MINALIGN; -} +int cache_line_size(void); /* * Read the effective value of CTR_EL0. diff --git a/arch/arm64/kernel/cacheinfo.c b/arch/arm64/kernel/cacheinfo.c index 0bf0a835122f..0c0cd4d26b87 100644 --- a/arch/arm64/kernel/cacheinfo.c +++ b/arch/arm64/kernel/cacheinfo.c @@ -28,6 +28,17 @@ #define CLIDR_CTYPE(clidr, level) \ (((clidr) & CLIDR_CTYPE_MASK(level)) >> CLIDR_CTYPE_SHIFT(level)) +int cache_line_size(void) +{ + u32 cwg = cache_type_cwg(); + + if (coherency_max_size != 0) + return coherency_max_size; + + return cwg ? 4 << cwg : ARCH_DMA_MINALIGN; +} +EXPORT_SYMBOL_GPL(cache_line_size); + static inline enum cache_type get_cache_type(int level) { u64 clidr; From f7f0097af67c3c119f6dc7046234630e77f4877e Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Mon, 27 May 2019 09:28:15 +0530 Subject: [PATCH 08/54] arm64/mm: Simplify protection flag creation for kernel huge mappings Even though they have got the same value, PMD_TYPE_SECT and PUD_TYPE_SECT get used for kernel huge mappings. But before that first the table bit gets cleared using leaf level PTE_TABLE_BIT. Though functionally they are same, we should use page table level specific macros to be consistent as per the MMU specifications. Create page table level specific wrappers for kernel huge mapping entries and just drop mk_sect_prot() which does not have any other user. 
Signed-off-by: Anshuman Khandual
Cc: Will Deacon
Cc: Mark Rutland
Signed-off-by: Catalin Marinas
---
 arch/arm64/include/asm/pgtable.h | 9 +++++++--
 arch/arm64/mm/mmu.c | 8 ++------
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 2c41b04708fe..6478dd121228 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -335,9 +335,14 @@ static inline pmd_t pte_pmd(pte_t pte)
 	return __pmd(pte_val(pte));
 }

-static inline pgprot_t mk_sect_prot(pgprot_t prot)
+static inline pgprot_t mk_pud_sect_prot(pgprot_t prot)
+{
+	return __pgprot((pgprot_val(prot) & ~PUD_TABLE_BIT) | PUD_TYPE_SECT);
+}
+
+static inline pgprot_t mk_pmd_sect_prot(pgprot_t prot)
 {
-	return __pgprot(pgprot_val(prot) & ~PTE_TABLE_BIT);
+	return __pgprot((pgprot_val(prot) & ~PMD_TABLE_BIT) | PMD_TYPE_SECT);
 }

 #ifdef CONFIG_NUMA_BALANCING
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index a1bfc4413982..22c2e4f0768f 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -971,9 +971,7 @@ int __init arch_ioremap_pmd_supported(void)

 int pud_set_huge(pud_t *pudp, phys_addr_t phys, pgprot_t prot)
 {
-	pgprot_t sect_prot = __pgprot(PUD_TYPE_SECT |
-				      pgprot_val(mk_sect_prot(prot)));
-	pud_t new_pud = pfn_pud(__phys_to_pfn(phys), sect_prot);
+	pud_t new_pud = pfn_pud(__phys_to_pfn(phys), mk_pud_sect_prot(prot));

 	/* Only allow permission changes for now */
 	if (!pgattr_change_is_safe(READ_ONCE(pud_val(*pudp)),
@@ -987,9 +985,7 @@ int pud_set_huge(pud_t *pudp, phys_addr_t phys, pgprot_t prot)

 int pmd_set_huge(pmd_t *pmdp, phys_addr_t phys, pgprot_t prot)
 {
-	pgprot_t sect_prot = __pgprot(PMD_TYPE_SECT |
-				      pgprot_val(mk_sect_prot(prot)));
-	pmd_t new_pmd = pfn_pmd(__phys_to_pfn(phys), sect_prot);
+	pmd_t new_pmd = pfn_pmd(__phys_to_pfn(phys), mk_pmd_sect_prot(prot));

 	/* Only allow permission changes for now */
 	if (!pgattr_change_is_safe(READ_ONCE(pmd_val(*pmdp)),

From 0c1f14ed12262f45a3af1d588e4d7bd12438b8f5 Mon Sep 17 00:00:00 2001
From: Miles Chen
Date: Wed, 29 May 2019 00:08:20 +0800
Subject: [PATCH 09/54] arm64: mm: make CONFIG_ZONE_DMA32 configurable

This change makes CONFIG_ZONE_DMA32 default to y and allows users to
override it only when CONFIG_EXPERT=y.

For the SoCs that do not need CONFIG_ZONE_DMA32, this is the first step
towards managing all available memory with a single zone (the normal
zone) to reduce the overhead of multiple zones.

The change also fixes a build error when CONFIG_NUMA=y and
CONFIG_ZONE_DMA32=n:

arch/arm64/mm/init.c:195:17: error: use of undeclared identifier 'ZONE_DMA32'
                max_zone_pfns[ZONE_DMA32] = PFN_DOWN(max_zone_dma_phys());

Changes since v1:
1. only expose CONFIG_ZONE_DMA32 when CONFIG_EXPERT=y
2.
remove redundant IS_ENABLED(CONFIG_ZONE_DMA32) Cc: Robin Murphy Signed-off-by: Miles Chen Signed-off-by: Catalin Marinas --- arch/arm64/Kconfig | 3 ++- arch/arm64/mm/init.c | 5 +++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 697ea0510729..cf5f1dafcf74 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -260,7 +260,8 @@ config GENERIC_CALIBRATE_DELAY def_bool y config ZONE_DMA32 - def_bool y + bool "Support DMA32 zone" if EXPERT + default y config HAVE_GENERIC_GUP def_bool y diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index d2adffb81b5d..f643bd45ff69 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -191,8 +191,9 @@ static void __init zone_sizes_init(unsigned long min, unsigned long max) { unsigned long max_zone_pfns[MAX_NR_ZONES] = {0}; - if (IS_ENABLED(CONFIG_ZONE_DMA32)) - max_zone_pfns[ZONE_DMA32] = PFN_DOWN(max_zone_dma_phys()); +#ifdef CONFIG_ZONE_DMA32 + max_zone_pfns[ZONE_DMA32] = PFN_DOWN(max_zone_dma_phys()); +#endif max_zone_pfns[ZONE_NORMAL] = max; free_area_init_nodes(max_zone_pfns); From 27e6e7d63fc2b43334ce79070a727a9ca6e58700 Mon Sep 17 00:00:00 2001 From: Julien Grall Date: Thu, 30 May 2019 12:30:58 +0100 Subject: [PATCH 10/54] arm64/cpufeature: Convert hook_lock to raw_spin_lock_t in cpu_enable_ssbs() cpu_enable_ssbs() is called via stop_machine() as part of the cpu_enable callback. A spin lock is used to ensure the hook is registered before the rest of the callback is executed. On -RT spin_lock() may sleep. However, all the callees in stop_machine() are expected to not sleep. Therefore a raw_spin_lock() is required here. Given this is already done under stop_machine() and the work done under the lock is quite small, the latency should not increase too much. Signed-off-by: Julien Grall Signed-off-by: Catalin Marinas --- arch/arm64/kernel/cpufeature.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index ca27e08e3d8a..2a7159fda3ce 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -1194,14 +1194,14 @@ static struct undef_hook ssbs_emulation_hook = { static void cpu_enable_ssbs(const struct arm64_cpu_capabilities *__unused) { static bool undef_hook_registered = false; - static DEFINE_SPINLOCK(hook_lock); + static DEFINE_RAW_SPINLOCK(hook_lock); - spin_lock(&hook_lock); + raw_spin_lock(&hook_lock); if (!undef_hook_registered) { register_undef_hook(&ssbs_emulation_hook); undef_hook_registered = true; } - spin_unlock(&hook_lock); + raw_spin_unlock(&hook_lock); if (arm64_get_ssbd_state() == ARM64_SSBD_FORCE_DISABLE) { sysreg_clear_set(sctlr_el1, 0, SCTLR_ELx_DSSBS); From 2e6aee5af330e8b4ccfdfcdb3fedcb385d052d78 Mon Sep 17 00:00:00 2001 From: Liu Song Date: Sat, 1 Jun 2019 10:08:08 +0800 Subject: [PATCH 11/54] arm64: kernel: use aff3 instead of aff2 in comment Should use aff3 instead of aff2 in comment. 
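For reference, the packing that the corrected comment describes can be
sketched in C roughly as follows (illustrative only, not part of this
patch; the real implementation is the assembly macro this comment
documents, and the rs0..rs3 shifts are assumed to come from the
precomputed mpidr_hash data):

static unsigned long mpidr_hash_compute(unsigned long mpidr,
					unsigned long mask,
					unsigned int rs0, unsigned int rs1,
					unsigned int rs2, unsigned int rs3)
{
	unsigned long m = mpidr & mask;		/* mpidr_masked */

	/* Pack the four affinity fields into a dense index. */
	return ((m & 0xffUL) >> rs0) |
	       ((m & 0xff00UL) >> rs1) |
	       ((m & 0xff0000UL) >> rs2) |
	       ((m & 0xff00000000UL) >> rs3);
}
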
Signed-off-by: Liu Song Signed-off-by: Catalin Marinas --- arch/arm64/kernel/sleep.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kernel/sleep.S b/arch/arm64/kernel/sleep.S index 3e53ffa07994..f5b04dd8a710 100644 --- a/arch/arm64/kernel/sleep.S +++ b/arch/arm64/kernel/sleep.S @@ -27,7 +27,7 @@ * aff0 = mpidr_masked & 0xff; * aff1 = mpidr_masked & 0xff00; * aff2 = mpidr_masked & 0xff0000; - * aff2 = mpidr_masked & 0xff00000000; + * aff3 = mpidr_masked & 0xff00000000; * dst = (aff0 >> rs0 | aff1 >> rs1 | aff2 >> rs2 | aff3 >> rs3); *} * Input registers: rs0, rs1, rs2, rs3, mpidr, mask From 87dedf7c61ab07d7fe53bcf93103d2d845d804d8 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Mon, 27 May 2019 12:33:29 +0530 Subject: [PATCH 12/54] arm64/mm: Change BUG_ON() to VM_BUG_ON() in [pmd|pud]_set_huge() There are no callers for the functions which will pass unaligned physical addresses. Hence just change these BUG_ON() checks into VM_BUG_ON() which gets compiled out unless CONFIG_VM_DEBUG is enabled. Signed-off-by: Anshuman Khandual Cc: Will Deacon Cc: Mark Rutland Cc: Ard Biesheuvel Signed-off-by: Catalin Marinas --- arch/arm64/mm/mmu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 22c2e4f0768f..69e65b6585e6 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -978,7 +978,7 @@ int pud_set_huge(pud_t *pudp, phys_addr_t phys, pgprot_t prot) pud_val(new_pud))) return 0; - BUG_ON(phys & ~PUD_MASK); + VM_BUG_ON(phys & ~PUD_MASK); set_pud(pudp, new_pud); return 1; } @@ -992,7 +992,7 @@ int pmd_set_huge(pmd_t *pmdp, phys_addr_t phys, pgprot_t prot) pmd_val(new_pmd))) return 0; - BUG_ON(phys & ~PMD_MASK); + VM_BUG_ON(phys & ~PMD_MASK); set_pmd(pmdp, new_pmd); return 1; } From 01de1776f62e6437ccd207181ec503e8a56e6128 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Sun, 5 May 2019 09:45:12 +0530 Subject: [PATCH 13/54] arm64/mm: Identify user instruction aborts We don't currently set the FAULT_FLAG_INSTRUCTION mm flag for EL0 instruction aborts. This has no functional impact, as we don't override arch_vma_access_permitted(), and the default implementation always returns true. However, it would be helpful to provide the flag so that it can be consumed by tracepoints such as dax_pmd_fault. This patch sets the FAULT_FLAG_INSTRUCTION flag for EL0 instruction aborts. Signed-off-by: Anshuman Khandual Cc: Will Deacon Cc: Mark Rutland Signed-off-by: Catalin Marinas --- arch/arm64/mm/fault.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index a30818ed9c60..392386a693fe 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -464,6 +464,7 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, if (is_el0_instruction_abort(esr)) { vm_flags = VM_EXEC; + mm_flags |= FAULT_FLAG_INSTRUCTION; } else if ((esr & ESR_ELx_WNR) && !(esr & ESR_ELx_CM)) { vm_flags = VM_WRITE; mm_flags |= FAULT_FLAG_WRITE; From a0509313d5dea0a27a5968f04bd556d05e6349fd Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Mon, 3 Jun 2019 12:11:22 +0530 Subject: [PATCH 14/54] arm64/mm: Drop mmap_sem before calling __do_kernel_fault() There is an inconsistency between down_read_trylock() success and failure paths while dealing with kernel access for non exception table areas where it calls __do_kernel_fault(). In case of failure it just bails out without holding mmap_sem but when it succeeds it does so while holding mmap_sem. 
Fix this inconsistency by just dropping mmap_sem in the success path as
well. __do_kernel_fault() calls die_kernel_fault() which then calls
show_pte(). show_pte() in this path might become a bit more unreliable
without holding mmap_sem. But there are already instances [1] in
do_page_fault() where die_kernel_fault() gets called without holding
mmap_sem. show_pte() can be made more robust independently, but in a
later patch.

[1] Conditional block for (is_ttbr0_addr && is_el1_permission_fault)

Signed-off-by: Anshuman Khandual
Cc: Will Deacon
Cc: Mark Rutland
Cc: James Morse
Cc: Andrey Konovalov
Signed-off-by: Catalin Marinas
---
 arch/arm64/mm/fault.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index 392386a693fe..2256a1a09f1b 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -504,8 +504,10 @@ retry:
 	 */
 	might_sleep();
 #ifdef CONFIG_DEBUG_VM
-	if (!user_mode(regs) && !search_exception_tables(regs->pc))
+	if (!user_mode(regs) && !search_exception_tables(regs->pc)) {
+		up_read(&mm->mmap_sem);
 		goto no_context;
+	}
 #endif
 	}

From 616810360043183a9db73e39f5aadbe48952c383 Mon Sep 17 00:00:00 2001
From: Anshuman Khandual
Date: Mon, 3 Jun 2019 12:11:23 +0530
Subject: [PATCH 15/54] arm64/mm: Drop task_struct argument from
 __do_page_fault()

The task_struct argument is not used in __do_page_fault(). Hence just
drop it and use current or current->mm instead wherever required. This
does not change any functionality.

Signed-off-by: Anshuman Khandual
Cc: Will Deacon
Cc: Mark Rutland
Cc: James Morse
Cc: Andrey Konovalov
Signed-off-by: Catalin Marinas
---
 arch/arm64/mm/fault.c | 15 +++++----------
 1 file changed, 5 insertions(+), 10 deletions(-)

diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index 2256a1a09f1b..7c1c8f435f86 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -395,8 +395,7 @@ static void do_bad_area(unsigned long addr, unsigned int esr, struct pt_regs *re
 #define VM_FAULT_BADACCESS	0x020000

 static vm_fault_t __do_page_fault(struct mm_struct *mm, unsigned long addr,
-			   unsigned int mm_flags, unsigned long vm_flags,
-			   struct task_struct *tsk)
+			   unsigned int mm_flags, unsigned long vm_flags)
 {
 	struct vm_area_struct *vma;
 	vm_fault_t fault;
@@ -440,8 +439,7 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
 				   struct pt_regs *regs)
 {
 	const struct fault_info *inf;
-	struct task_struct *tsk;
-	struct mm_struct *mm;
+	struct mm_struct *mm = current->mm;
 	vm_fault_t fault, major = 0;
 	unsigned long vm_flags = VM_READ | VM_WRITE;
 	unsigned int mm_flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
@@ -449,9 +447,6 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
 	if (notify_page_fault(regs, esr))
 		return 0;

-	tsk = current;
-	mm = tsk->mm;
-
 	/*
 	 * If we're in an interrupt or have no user context, we must not take
 	 * the fault.
@@ -511,7 +506,7 @@ retry:
 #endif
 	}

-	fault = __do_page_fault(mm, addr, mm_flags, vm_flags, tsk);
+	fault = __do_page_fault(mm, addr, mm_flags, vm_flags);
 	major |= fault & VM_FAULT_MAJOR;

 	if (fault & VM_FAULT_RETRY) {
@@ -551,11 +546,11 @@ retry:
	 * that point.
	 */
 	if (major) {
-		tsk->maj_flt++;
+		current->maj_flt++;
 		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs,
 			      addr);
 	} else {
-		tsk->min_flt++;
+		current->min_flt++;
 		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs,
 			      addr);
 	}

From 15532fd6f57c297c45ef3f5c17d2fbcdcc8092e4 Mon Sep 17 00:00:00 2001
From: Sudeep Holla
Date: Thu, 23 May 2019 10:06:15 +0100
Subject: [PATCH 16/54] ptrace: move clearing of TIF_SYSCALL_EMU flag to core

While TIF_SYSCALL_EMU is set in ptrace_resume independently of the
architecture, currently only powerpc and x86 unset the TIF_SYSCALL_EMU
flag in ptrace_disable(), which gets called from ptrace_detach().

Let's move the clearing of the TIF_SYSCALL_EMU flag to __ptrace_unlink(),
which gets executed from ptrace_detach(), and keep it close to the
clearing of TIF_SYSCALL_TRACE.

Cc: Paul Mackerras
Cc: Michael Ellerman
Cc: Thomas Gleixner
Cc: Ingo Molnar
Acked-by: Oleg Nesterov
Signed-off-by: Sudeep Holla
Signed-off-by: Catalin Marinas
---
 arch/powerpc/kernel/ptrace.c | 1 -
 arch/x86/kernel/ptrace.c | 3 ---
 kernel/ptrace.c | 3 +++
 3 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c
index 684b0b315c32..8c92febf5f44 100644
--- a/arch/powerpc/kernel/ptrace.c
+++ b/arch/powerpc/kernel/ptrace.c
@@ -2521,7 +2521,6 @@ void ptrace_disable(struct task_struct *child)
 {
 	/* make sure the single step bit is not set. */
 	user_disable_single_step(child);
-	clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
 }

 #ifdef CONFIG_PPC_ADV_DEBUG_REGS
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index a166c960bc9e..36998e0c3fc4 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -747,9 +747,6 @@ static int ioperm_get(struct task_struct *target,
 void ptrace_disable(struct task_struct *child)
 {
 	user_disable_single_step(child);
-#ifdef TIF_SYSCALL_EMU
-	clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
-#endif
 }

 #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 5710d07e67cf..ab14654b2436 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -118,6 +118,9 @@ void __ptrace_unlink(struct task_struct *child)
 	BUG_ON(!child->ptrace);

 	clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
+#ifdef TIF_SYSCALL_EMU
+	clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
+#endif

 	child->parent = child->real_parent;
 	list_del_init(&child->ptrace_entry);

From fd3866381be2681395e398c511699fd61a098609 Mon Sep 17 00:00:00 2001
From: Sudeep Holla
Date: Thu, 23 May 2019 10:06:17 +0100
Subject: [PATCH 17/54] arm64: add PTRACE_SYSEMU{,SINGLESTEP} definitions to
 uapi headers

x86 and um use 31 and 32 for PTRACE_SYSEMU and PTRACE_SYSEMU_SINGLESTEP,
while powerpc uses different values, possibly for legacy reasons. Though
handling of PTRACE_SYSEMU can be made architecture independent, it's
hard to make these definitions generic. To add to this existing mess, a
few architectures like arm, c6x and sh use 31 for PTRACE_GETFDPIC (get
the ELF fdpic loadmap address). It's not possible to move the
definitions to generic headers, so we unfortunately have to duplicate
the same definitions for arm64 if we need to support PTRACE_SYSEMU and
PTRACE_SYSEMU_SINGLESTEP.
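As a purely illustrative user-space sketch (not part of this patch, and
nothing below comes from the kernel sources), a tracer on arm64 could
use the new request to intercept a tracee's next system call, which the
kernel then skips so the tracer can emulate it, user-mode-Linux style:

/*
 * Hypothetical tracer sketch: run the tracee until its next syscall
 * entry; PTRACE_SYSEMU stops it there and suppresses the syscall.
 */
#include <elf.h>
#include <signal.h>
#include <stdio.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/uio.h>
#include <sys/wait.h>
#include <unistd.h>
#include <asm/ptrace.h>			/* struct user_pt_regs */

#ifndef PTRACE_SYSEMU
#define PTRACE_SYSEMU	31		/* value added by this patch */
#endif

int main(void)
{
	struct user_pt_regs regs;
	struct iovec iov = { &regs, sizeof(regs) };
	int status;
	pid_t child = fork();

	if (child == 0) {
		ptrace(PTRACE_TRACEME, 0, NULL, NULL);
		raise(SIGSTOP);
		write(1, "x", 1);	/* trapped and skipped, never printed */
		_exit(0);
	}

	waitpid(child, &status, 0);			/* initial SIGSTOP */
	ptrace(PTRACE_SYSEMU, child, NULL, NULL);	/* run to syscall entry */
	waitpid(child, &status, 0);

	ptrace(PTRACE_GETREGSET, child, (void *)NT_PRSTATUS, &iov);
	printf("tracee entered syscall %llu\n",
	       (unsigned long long)regs.regs[8]);	/* x8 = syscall number */

	kill(child, SIGKILL);
	return 0;
}
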
Cc: Will Deacon Signed-off-by: Sudeep Holla Signed-off-by: Catalin Marinas --- arch/arm64/include/uapi/asm/ptrace.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/arm64/include/uapi/asm/ptrace.h b/arch/arm64/include/uapi/asm/ptrace.h index d78623acb649..627ac57c1581 100644 --- a/arch/arm64/include/uapi/asm/ptrace.h +++ b/arch/arm64/include/uapi/asm/ptrace.h @@ -62,6 +62,9 @@ #define PSR_x 0x0000ff00 /* Extension */ #define PSR_c 0x000000ff /* Control */ +/* syscall emulation path in ptrace */ +#define PTRACE_SYSEMU 31 +#define PTRACE_SYSEMU_SINGLESTEP 32 #ifndef __ASSEMBLY__ From f086f67485c5c126bcec4b0e96ac7319a2e59ab8 Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Thu, 23 May 2019 10:06:18 +0100 Subject: [PATCH 18/54] arm64: ptrace: add support for syscall emulation Add PTRACE_SYSEMU and PTRACE_SYSEMU_SINGLESTEP support on arm64. We don't need any special handling for PTRACE_SYSEMU_SINGLESTEP. It's quite difficult to generalize handling PTRACE_SYSEMU cross architectures and avoid calls to tracehook_report_syscall_entry twice. Different architecture have different mechanism to indicate NO_SYSCALL and trying to generalise adds more code for no gain. Cc: Will Deacon Signed-off-by: Sudeep Holla Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/thread_info.h | 5 ++++- arch/arm64/kernel/ptrace.c | 6 +++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index eb3ef73e07cf..c285d1ce7186 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h @@ -75,6 +75,7 @@ void arch_release_task_struct(struct task_struct *tsk); * TIF_SYSCALL_TRACE - syscall trace active * TIF_SYSCALL_TRACEPOINT - syscall tracepoint for ftrace * TIF_SYSCALL_AUDIT - syscall auditing + * TIF_SYSCALL_EMU - syscall emulation active * TIF_SECOMP - syscall secure computing * TIF_SIGPENDING - signal pending * TIF_NEED_RESCHED - rescheduling necessary @@ -91,6 +92,7 @@ void arch_release_task_struct(struct task_struct *tsk); #define TIF_SYSCALL_AUDIT 9 #define TIF_SYSCALL_TRACEPOINT 10 #define TIF_SECCOMP 11 +#define TIF_SYSCALL_EMU 12 #define TIF_MEMDIE 18 /* is terminating due to OOM killer */ #define TIF_FREEZE 19 #define TIF_RESTORE_SIGMASK 20 @@ -109,6 +111,7 @@ void arch_release_task_struct(struct task_struct *tsk); #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) #define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT) #define _TIF_SECCOMP (1 << TIF_SECCOMP) +#define _TIF_SYSCALL_EMU (1 << TIF_SYSCALL_EMU) #define _TIF_UPROBE (1 << TIF_UPROBE) #define _TIF_FSCHECK (1 << TIF_FSCHECK) #define _TIF_32BIT (1 << TIF_32BIT) @@ -120,7 +123,7 @@ void arch_release_task_struct(struct task_struct *tsk); #define _TIF_SYSCALL_WORK (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \ _TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP | \ - _TIF_NOHZ) + _TIF_NOHZ | _TIF_SYSCALL_EMU) #define INIT_THREAD_INFO(tsk) \ { \ diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c index b82e0a9b3da3..9353355cb91a 100644 --- a/arch/arm64/kernel/ptrace.c +++ b/arch/arm64/kernel/ptrace.c @@ -1819,8 +1819,12 @@ static void tracehook_report_syscall(struct pt_regs *regs, int syscall_trace_enter(struct pt_regs *regs) { - if (test_thread_flag(TIF_SYSCALL_TRACE)) + if (test_thread_flag(TIF_SYSCALL_TRACE) || + test_thread_flag(TIF_SYSCALL_EMU)) { tracehook_report_syscall(regs, PTRACE_SYSCALL_ENTER); + if (!in_syscall(regs) || test_thread_flag(TIF_SYSCALL_EMU)) + return -1; + } /* Do the secure computing after 
ptrace; failures should be fast. */ if (secure_computing(NULL) == -1) From 8e01076afd97521d992e13fb89fb59a6e48fbeec Mon Sep 17 00:00:00 2001 From: Odin Ugedal Date: Fri, 7 Jun 2019 01:49:10 +0200 Subject: [PATCH 19/54] arm64: Fix comment after #endif The config value used in the if was changed in b433dce056d3814dc4b33e5a8a533d6401ffcfb0, but the comment on the corresponding end was not changed. Signed-off-by: Odin Ugedal Signed-off-by: Catalin Marinas --- arch/arm64/mm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 69e65b6585e6..21f3931c73fd 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -776,7 +776,7 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, return 0; } -#endif /* CONFIG_ARM64_64K_PAGES */ +#endif /* !ARM64_SWAPPER_USES_SECTION_MAPS */ void vmemmap_free(unsigned long start, unsigned long end, struct vmem_altmap *altmap) { From c49bd02f4c7412f0182252ae2ef6e916ca4ff359 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Fri, 7 Jun 2019 14:43:05 +0530 Subject: [PATCH 20/54] arm64/mm: Document write abort detection from ESR This patch adds an is_write_abort() wrapper and documents the detection of the abort type on cache maintenance operations. Cc: Will Deacon Cc: James Morse Cc: Andrey Konovalov Acked-by: Mark Rutland Signed-off-by: Anshuman Khandual [catalin.marinas@arm.com: only keep the is_write_abort() wrapper] Signed-off-by: Catalin Marinas --- arch/arm64/mm/fault.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 7c1c8f435f86..765b5eb601ca 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -435,6 +435,15 @@ static bool is_el0_instruction_abort(unsigned int esr) return ESR_ELx_EC(esr) == ESR_ELx_EC_IABT_LOW; } +/* + * Note: not valid for EL1 DC IVAC, but we never use that such that it + * should fault. EL0 cannot issue DC IVAC (undef). + */ +static bool is_write_abort(unsigned int esr) +{ + return (esr & ESR_ELx_WNR) && !(esr & ESR_ELx_CM); +} + static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, struct pt_regs *regs) { @@ -460,7 +469,7 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, if (is_el0_instruction_abort(esr)) { vm_flags = VM_EXEC; mm_flags |= FAULT_FLAG_INSTRUCTION; - } else if ((esr & ESR_ELx_WNR) && !(esr & ESR_ELx_CM)) { + } else if (is_write_abort(esr)) { vm_flags = VM_WRITE; mm_flags |= FAULT_FLAG_WRITE; } From 4745224b45097d333358bce298aea2137246183c Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Fri, 7 Jun 2019 14:43:06 +0530 Subject: [PATCH 21/54] arm64/mm: Refactor __do_page_fault() __do_page_fault() is over complicated with multiple goto statements. This cleans up the code flow and while there drops local variable vm_fault_t. 
Reviewed-by: Mark Rutland Signed-off-by: Anshuman Khandual Cc: Will Deacon Cc: James Morse Cc: Andrey Konovalov Cc: Christoph Hellwig Signed-off-by: Catalin Marinas --- arch/arm64/mm/fault.c | 30 +++++++++++------------------- 1 file changed, 11 insertions(+), 19 deletions(-) diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 765b5eb601ca..582061dec89f 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -397,37 +397,29 @@ static void do_bad_area(unsigned long addr, unsigned int esr, struct pt_regs *re static vm_fault_t __do_page_fault(struct mm_struct *mm, unsigned long addr, unsigned int mm_flags, unsigned long vm_flags) { - struct vm_area_struct *vma; - vm_fault_t fault; + struct vm_area_struct *vma = find_vma(mm, addr); - vma = find_vma(mm, addr); - fault = VM_FAULT_BADMAP; if (unlikely(!vma)) - goto out; - if (unlikely(vma->vm_start > addr)) - goto check_stack; + return VM_FAULT_BADMAP; /* * Ok, we have a good vm_area for this memory access, so we can handle * it. */ -good_area: + if (unlikely(vma->vm_start > addr)) { + if (!(vma->vm_flags & VM_GROWSDOWN)) + return VM_FAULT_BADMAP; + if (expand_stack(vma, addr)) + return VM_FAULT_BADMAP; + } + /* * Check that the permissions on the VMA allow for the fault which * occurred. */ - if (!(vma->vm_flags & vm_flags)) { - fault = VM_FAULT_BADACCESS; - goto out; - } - + if (!(vma->vm_flags & vm_flags)) + return VM_FAULT_BADACCESS; return handle_mm_fault(vma, addr & PAGE_MASK, mm_flags); - -check_stack: - if (vma->vm_flags & VM_GROWSDOWN && !expand_stack(vma, addr)) - goto good_area; -out: - return fault; } static bool is_el0_instruction_abort(unsigned int esr) From 9b604722059039a1a3ff69fb8dfd024264046024 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 10 Jun 2019 13:41:07 +0100 Subject: [PATCH 22/54] arm64: mm: avoid redundant READ_ONCE(*ptep) In set_pte_at(), we read the old pte value so that it can be passed into checks for racy hw updates. These checks are only performed for CONFIG_DEBUG_VM, and the value is not used otherwise. Since we read the pte value with READ_ONCE(), the compiler cannot elide the redundant read for !CONFIG_DEBUG_VM kernels. Let's ameliorate matters by moving the read and the checks into a helper, __check_racy_pte_update(), which only performs the read when the value will be used. This also allows us to reformat the conditions for clarity. Acked-by: Will Deacon Signed-off-by: Mark Rutland Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/pgtable.h | 47 ++++++++++++++++++++------------ 1 file changed, 30 insertions(+), 17 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 6478dd121228..2023eab19d73 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -246,29 +246,42 @@ extern void __sync_icache_dcache(pte_t pteval); * * PTE_DIRTY || (PTE_WRITE && !PTE_RDONLY) */ -static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pte) + +static inline void __check_racy_pte_update(struct mm_struct *mm, pte_t *ptep, + pte_t pte) { pte_t old_pte; + if (!IS_ENABLED(CONFIG_DEBUG_VM)) + return; + + old_pte = READ_ONCE(*ptep); + + if (!pte_valid(old_pte) || !pte_valid(pte)) + return; + if (mm != current->active_mm && atomic_read(&mm->mm_users) <= 1) + return; + + /* + * Check for potential race with hardware updates of the pte + * (ptep_set_access_flags safely changes valid ptes without going + * through an invalid entry). 
+ */ + VM_WARN_ONCE(!pte_young(pte), + "%s: racy access flag clearing: 0x%016llx -> 0x%016llx", + __func__, pte_val(old_pte), pte_val(pte)); + VM_WARN_ONCE(pte_write(old_pte) && !pte_dirty(pte), + "%s: racy dirty state clearing: 0x%016llx -> 0x%016llx", + __func__, pte_val(old_pte), pte_val(pte)); +} + +static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte) +{ if (pte_present(pte) && pte_user_exec(pte) && !pte_special(pte)) __sync_icache_dcache(pte); - /* - * If the existing pte is valid, check for potential race with - * hardware updates of the pte (ptep_set_access_flags safely changes - * valid ptes without going through an invalid entry). - */ - old_pte = READ_ONCE(*ptep); - if (IS_ENABLED(CONFIG_DEBUG_VM) && pte_valid(old_pte) && pte_valid(pte) && - (mm == current->active_mm || atomic_read(&mm->mm_users) > 1)) { - VM_WARN_ONCE(!pte_young(pte), - "%s: racy access flag clearing: 0x%016llx -> 0x%016llx", - __func__, pte_val(old_pte), pte_val(pte)); - VM_WARN_ONCE(pte_write(old_pte) && !pte_dirty(pte), - "%s: racy dirty state clearing: 0x%016llx -> 0x%016llx", - __func__, pte_val(old_pte), pte_val(pte)); - } + __check_racy_pte_update(mm, ptep, pte); set_pte(ptep, pte); } From 2b37c1c3e7bbfaaa3910ee66f44f5a4138229884 Mon Sep 17 00:00:00 2001 From: Frank Li Date: Wed, 1 May 2019 18:43:26 +0000 Subject: [PATCH 23/54] dt-bindings: perf: imx8-ddr: add imx8qxp ddr performance monitor Add binding documentation for the imx8qxp DDR performance monitor unit. Signed-off-by: Frank Li Reviewed-by: Rob Herring [will: Removed trailing newline] Signed-off-by: Will Deacon --- .../devicetree/bindings/perf/fsl-imx-ddr.txt | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 Documentation/devicetree/bindings/perf/fsl-imx-ddr.txt diff --git a/Documentation/devicetree/bindings/perf/fsl-imx-ddr.txt b/Documentation/devicetree/bindings/perf/fsl-imx-ddr.txt new file mode 100644 index 000000000000..d77e3f26f9e6 --- /dev/null +++ b/Documentation/devicetree/bindings/perf/fsl-imx-ddr.txt @@ -0,0 +1,21 @@ +* Freescale(NXP) IMX8 DDR performance monitor + +Required properties: + +- compatible: should be one of: + "fsl,imx8-ddr-pmu" + "fsl,imx8m-ddr-pmu" + +- reg: physical address and size + +- interrupts: single interrupt + generated by the control block + +Example: + + ddr-pmu@5c020000 { + compatible = "fsl,imx8-ddr-pmu"; + reg = <0x5c020000 0x10000>; + interrupt-parent = <&gic>; + interrupts = ; + }; From 9a66d36cc7ace8062bd703d1edfb99437a2ddf2b Mon Sep 17 00:00:00 2001 From: Frank Li Date: Wed, 1 May 2019 18:43:29 +0000 Subject: [PATCH 24/54] drivers/perf: imx_ddr: Add DDR performance counter support to perf Add DDR performance monitor support for iMX8QXP. The PMU consists of 3 programmable event counters and a single dedicated cycle counter. Example usage: $ perf stat -a -e \ imx8_ddr0/read-cycles/,imx8_ddr0/write-cycles/,imx8_ddr0/precharge/ ls - or - $ perf stat -a -e \ imx8_ddr0/cycles/,imx8_ddr0/read-access/,imx8_ddr0/write-access/ ls Other events are supported, and advertised via perf list. 
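Beyond the perf tool, the counters can also be driven directly through
perf_event_open(). The following is a minimal, hypothetical user-space
sketch (not part of this patch): it assumes the PMU registered as
imx8_ddr0 and uses raw event number 0x35, which the driver's events
group advertises as "read".

/*
 * Hypothetical sketch: count DDR "read" events (config 0x35) for one
 * second via perf_event_open(). The PMU is an uncore PMU, so it is
 * opened system-wide (pid == -1) on a single CPU.
 */
#include <linux/perf_event.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	struct perf_event_attr attr;
	uint64_t count;
	int type, fd;
	FILE *f = fopen("/sys/bus/event_source/devices/imx8_ddr0/type", "r");

	if (!f || fscanf(f, "%d", &type) != 1)
		return 1;
	fclose(f);

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = type;		/* dynamic PMU type read from sysfs */
	attr.config = 0x35;		/* "read" event */

	fd = syscall(__NR_perf_event_open, &attr, -1 /* pid */, 0 /* cpu */,
		     -1 /* group fd */, 0);
	if (fd < 0)
		return 1;

	sleep(1);
	if (read(fd, &count, sizeof(count)) == sizeof(count))
		printf("imx8_ddr0 reads: %llu\n", (unsigned long long)count);
	close(fd);
	return 0;
}
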
Reviewed-by: Andrey Smirnov Signed-off-by: Frank Li [will: rewrote commit message/kconfig and used #defines for dev/cpuhp names] Signed-off-by: Will Deacon --- drivers/perf/Kconfig | 8 + drivers/perf/Makefile | 1 + drivers/perf/fsl_imx8_ddr_perf.c | 554 +++++++++++++++++++++++++++++++ 3 files changed, 563 insertions(+) create mode 100644 drivers/perf/fsl_imx8_ddr_perf.c diff --git a/drivers/perf/Kconfig b/drivers/perf/Kconfig index e4221a107dca..09ae8a970880 100644 --- a/drivers/perf/Kconfig +++ b/drivers/perf/Kconfig @@ -71,6 +71,14 @@ config ARM_DSU_PMU system, control logic. The PMU allows counting various events related to DSU. +config FSL_IMX8_DDR_PMU + tristate "Freescale i.MX8 DDR perf monitor" + depends on ARCH_MXC + help + Provides support for the DDR performance monitor in i.MX8, which + can give information about memory throughput and other related + events. + config HISI_PMU bool "HiSilicon SoC PMU" depends on ARM64 && ACPI diff --git a/drivers/perf/Makefile b/drivers/perf/Makefile index 30489941f3d6..2ebb4de17815 100644 --- a/drivers/perf/Makefile +++ b/drivers/perf/Makefile @@ -5,6 +5,7 @@ obj-$(CONFIG_ARM_DSU_PMU) += arm_dsu_pmu.o obj-$(CONFIG_ARM_PMU) += arm_pmu.o arm_pmu_platform.o obj-$(CONFIG_ARM_PMU_ACPI) += arm_pmu_acpi.o obj-$(CONFIG_ARM_SMMU_V3_PMU) += arm_smmuv3_pmu.o +obj-$(CONFIG_FSL_IMX8_DDR_PMU) += fsl_imx8_ddr_perf.o obj-$(CONFIG_HISI_PMU) += hisilicon/ obj-$(CONFIG_QCOM_L2_PMU) += qcom_l2_pmu.o obj-$(CONFIG_QCOM_L3_PMU) += qcom_l3_pmu.o diff --git a/drivers/perf/fsl_imx8_ddr_perf.c b/drivers/perf/fsl_imx8_ddr_perf.c new file mode 100644 index 000000000000..63fe21600072 --- /dev/null +++ b/drivers/perf/fsl_imx8_ddr_perf.c @@ -0,0 +1,554 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2017 NXP + * Copyright 2016 Freescale Semiconductor, Inc. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define COUNTER_CNTL 0x0 +#define COUNTER_READ 0x20 + +#define COUNTER_DPCR1 0x30 + +#define CNTL_OVER 0x1 +#define CNTL_CLEAR 0x2 +#define CNTL_EN 0x4 +#define CNTL_EN_MASK 0xFFFFFFFB +#define CNTL_CLEAR_MASK 0xFFFFFFFD +#define CNTL_OVER_MASK 0xFFFFFFFE + +#define CNTL_CSV_SHIFT 24 +#define CNTL_CSV_MASK (0xFF << CNTL_CSV_SHIFT) + +#define EVENT_CYCLES_ID 0 +#define EVENT_CYCLES_COUNTER 0 +#define NUM_COUNTERS 4 + +#define to_ddr_pmu(p) container_of(p, struct ddr_pmu, pmu) + +#define DDR_PERF_DEV_NAME "imx8_ddr" +#define DDR_CPUHP_CB_NAME DDR_PERF_DEV_NAME "_perf_pmu" + +static DEFINE_IDA(ddr_ida); + +static const struct of_device_id imx_ddr_pmu_dt_ids[] = { + { .compatible = "fsl,imx8-ddr-pmu",}, + { .compatible = "fsl,imx8m-ddr-pmu",}, + { /* sentinel */ } +}; + +struct ddr_pmu { + struct pmu pmu; + void __iomem *base; + unsigned int cpu; + struct hlist_node node; + struct device *dev; + struct perf_event *events[NUM_COUNTERS]; + int active_events; + enum cpuhp_state cpuhp_state; + int irq; + int id; +}; + +static ssize_t ddr_perf_cpumask_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct ddr_pmu *pmu = dev_get_drvdata(dev); + + return cpumap_print_to_pagebuf(true, buf, cpumask_of(pmu->cpu)); +} + +static struct device_attribute ddr_perf_cpumask_attr = + __ATTR(cpumask, 0444, ddr_perf_cpumask_show, NULL); + +static struct attribute *ddr_perf_cpumask_attrs[] = { + &ddr_perf_cpumask_attr.attr, + NULL, +}; + +static struct attribute_group ddr_perf_cpumask_attr_group = { + .attrs = ddr_perf_cpumask_attrs, +}; + +static ssize_t +ddr_pmu_event_show(struct device *dev, struct device_attribute *attr, + char *page) +{ + struct perf_pmu_events_attr *pmu_attr; + + pmu_attr = container_of(attr, struct perf_pmu_events_attr, attr); + return sprintf(page, "event=0x%02llx\n", pmu_attr->id); +} + +#define IMX8_DDR_PMU_EVENT_ATTR(_name, _id) \ + (&((struct perf_pmu_events_attr[]) { \ + { .attr = __ATTR(_name, 0444, ddr_pmu_event_show, NULL),\ + .id = _id, } \ + })[0].attr.attr) + +static struct attribute *ddr_perf_events_attrs[] = { + IMX8_DDR_PMU_EVENT_ATTR(cycles, EVENT_CYCLES_ID), + IMX8_DDR_PMU_EVENT_ATTR(selfresh, 0x01), + IMX8_DDR_PMU_EVENT_ATTR(read-accesses, 0x04), + IMX8_DDR_PMU_EVENT_ATTR(write-accesses, 0x05), + IMX8_DDR_PMU_EVENT_ATTR(read-queue-depth, 0x08), + IMX8_DDR_PMU_EVENT_ATTR(write-queue-depth, 0x09), + IMX8_DDR_PMU_EVENT_ATTR(lp-read-credit-cnt, 0x10), + IMX8_DDR_PMU_EVENT_ATTR(hp-read-credit-cnt, 0x11), + IMX8_DDR_PMU_EVENT_ATTR(write-credit-cnt, 0x12), + IMX8_DDR_PMU_EVENT_ATTR(read-command, 0x20), + IMX8_DDR_PMU_EVENT_ATTR(write-command, 0x21), + IMX8_DDR_PMU_EVENT_ATTR(read-modify-write-command, 0x22), + IMX8_DDR_PMU_EVENT_ATTR(hp-read, 0x23), + IMX8_DDR_PMU_EVENT_ATTR(hp-req-nocredit, 0x24), + IMX8_DDR_PMU_EVENT_ATTR(hp-xact-credit, 0x25), + IMX8_DDR_PMU_EVENT_ATTR(lp-req-nocredit, 0x26), + IMX8_DDR_PMU_EVENT_ATTR(lp-xact-credit, 0x27), + IMX8_DDR_PMU_EVENT_ATTR(wr-xact-credit, 0x29), + IMX8_DDR_PMU_EVENT_ATTR(read-cycles, 0x2a), + IMX8_DDR_PMU_EVENT_ATTR(write-cycles, 0x2b), + IMX8_DDR_PMU_EVENT_ATTR(read-write-transition, 0x30), + IMX8_DDR_PMU_EVENT_ATTR(precharge, 0x31), + IMX8_DDR_PMU_EVENT_ATTR(activate, 0x32), + IMX8_DDR_PMU_EVENT_ATTR(load-mode, 0x33), + IMX8_DDR_PMU_EVENT_ATTR(perf-mwr, 0x34), + IMX8_DDR_PMU_EVENT_ATTR(read, 0x35), + IMX8_DDR_PMU_EVENT_ATTR(read-activate, 0x36), + IMX8_DDR_PMU_EVENT_ATTR(refresh, 0x37), + 
IMX8_DDR_PMU_EVENT_ATTR(write, 0x38), + IMX8_DDR_PMU_EVENT_ATTR(raw-hazard, 0x39), + NULL, +}; + +static struct attribute_group ddr_perf_events_attr_group = { + .name = "events", + .attrs = ddr_perf_events_attrs, +}; + +PMU_FORMAT_ATTR(event, "config:0-7"); + +static struct attribute *ddr_perf_format_attrs[] = { + &format_attr_event.attr, + NULL, +}; + +static struct attribute_group ddr_perf_format_attr_group = { + .name = "format", + .attrs = ddr_perf_format_attrs, +}; + +static const struct attribute_group *attr_groups[] = { + &ddr_perf_events_attr_group, + &ddr_perf_format_attr_group, + &ddr_perf_cpumask_attr_group, + NULL, +}; + +static u32 ddr_perf_alloc_counter(struct ddr_pmu *pmu, int event) +{ + int i; + + /* + * Always map cycle event to counter 0 + * Cycles counter is dedicated for cycle event + * can't used for the other events + */ + if (event == EVENT_CYCLES_ID) { + if (pmu->events[EVENT_CYCLES_COUNTER] == NULL) + return EVENT_CYCLES_COUNTER; + else + return -ENOENT; + } + + for (i = 1; i < NUM_COUNTERS; i++) { + if (pmu->events[i] == NULL) + return i; + } + + return -ENOENT; +} + +static void ddr_perf_free_counter(struct ddr_pmu *pmu, int counter) +{ + pmu->events[counter] = NULL; +} + +static u32 ddr_perf_read_counter(struct ddr_pmu *pmu, int counter) +{ + return readl_relaxed(pmu->base + COUNTER_READ + counter * 4); +} + +static int ddr_perf_event_init(struct perf_event *event) +{ + struct ddr_pmu *pmu = to_ddr_pmu(event->pmu); + struct hw_perf_event *hwc = &event->hw; + struct perf_event *sibling; + + if (event->attr.type != event->pmu->type) + return -ENOENT; + + if (is_sampling_event(event) || event->attach_state & PERF_ATTACH_TASK) + return -EOPNOTSUPP; + + if (event->cpu < 0) { + dev_warn(pmu->dev, "Can't provide per-task data!\n"); + return -EOPNOTSUPP; + } + + /* + * We must NOT create groups containing mixed PMUs, although software + * events are acceptable (for example to create a CCN group + * periodically read when a hrtimer aka cpu-clock leader triggers). + */ + if (event->group_leader->pmu != event->pmu && + !is_software_event(event->group_leader)) + return -EINVAL; + + for_each_sibling_event(sibling, event->group_leader) { + if (sibling->pmu != event->pmu && + !is_software_event(sibling)) + return -EINVAL; + } + + event->cpu = pmu->cpu; + hwc->idx = -1; + + return 0; +} + + +static void ddr_perf_event_update(struct perf_event *event) +{ + struct ddr_pmu *pmu = to_ddr_pmu(event->pmu); + struct hw_perf_event *hwc = &event->hw; + u64 delta, prev_raw_count, new_raw_count; + int counter = hwc->idx; + + do { + prev_raw_count = local64_read(&hwc->prev_count); + new_raw_count = ddr_perf_read_counter(pmu, counter); + } while (local64_cmpxchg(&hwc->prev_count, prev_raw_count, + new_raw_count) != prev_raw_count); + + delta = (new_raw_count - prev_raw_count) & 0xFFFFFFFF; + + local64_add(delta, &event->count); +} + +static void ddr_perf_counter_enable(struct ddr_pmu *pmu, int config, + int counter, bool enable) +{ + u8 reg = counter * 4 + COUNTER_CNTL; + int val; + + if (enable) { + /* + * must disable first, then enable again + * otherwise, cycle counter will not work + * if previous state is enabled. 
+ */ + writel(0, pmu->base + reg); + val = CNTL_EN | CNTL_CLEAR; + val |= FIELD_PREP(CNTL_CSV_MASK, config); + writel(val, pmu->base + reg); + } else { + /* Disable counter */ + writel(0, pmu->base + reg); + } +} + +static void ddr_perf_event_start(struct perf_event *event, int flags) +{ + struct ddr_pmu *pmu = to_ddr_pmu(event->pmu); + struct hw_perf_event *hwc = &event->hw; + int counter = hwc->idx; + + local64_set(&hwc->prev_count, 0); + + ddr_perf_counter_enable(pmu, event->attr.config, counter, true); + + hwc->state = 0; +} + +static int ddr_perf_event_add(struct perf_event *event, int flags) +{ + struct ddr_pmu *pmu = to_ddr_pmu(event->pmu); + struct hw_perf_event *hwc = &event->hw; + int counter; + int cfg = event->attr.config; + + counter = ddr_perf_alloc_counter(pmu, cfg); + if (counter < 0) { + dev_dbg(pmu->dev, "There are not enough counters\n"); + return -EOPNOTSUPP; + } + + pmu->events[counter] = event; + pmu->active_events++; + hwc->idx = counter; + + hwc->state |= PERF_HES_STOPPED; + + if (flags & PERF_EF_START) + ddr_perf_event_start(event, flags); + + return 0; +} + +static void ddr_perf_event_stop(struct perf_event *event, int flags) +{ + struct ddr_pmu *pmu = to_ddr_pmu(event->pmu); + struct hw_perf_event *hwc = &event->hw; + int counter = hwc->idx; + + ddr_perf_counter_enable(pmu, event->attr.config, counter, false); + ddr_perf_event_update(event); + + hwc->state |= PERF_HES_STOPPED; +} + +static void ddr_perf_event_del(struct perf_event *event, int flags) +{ + struct ddr_pmu *pmu = to_ddr_pmu(event->pmu); + struct hw_perf_event *hwc = &event->hw; + int counter = hwc->idx; + + ddr_perf_event_stop(event, PERF_EF_UPDATE); + + ddr_perf_free_counter(pmu, counter); + pmu->active_events--; + hwc->idx = -1; +} + +static void ddr_perf_pmu_enable(struct pmu *pmu) +{ + struct ddr_pmu *ddr_pmu = to_ddr_pmu(pmu); + + /* enable cycle counter if cycle is not active event list */ + if (ddr_pmu->events[EVENT_CYCLES_COUNTER] == NULL) + ddr_perf_counter_enable(ddr_pmu, + EVENT_CYCLES_ID, + EVENT_CYCLES_COUNTER, + true); +} + +static void ddr_perf_pmu_disable(struct pmu *pmu) +{ + struct ddr_pmu *ddr_pmu = to_ddr_pmu(pmu); + + if (ddr_pmu->events[EVENT_CYCLES_COUNTER] == NULL) + ddr_perf_counter_enable(ddr_pmu, + EVENT_CYCLES_ID, + EVENT_CYCLES_COUNTER, + false); +} + +static int ddr_perf_init(struct ddr_pmu *pmu, void __iomem *base, + struct device *dev) +{ + *pmu = (struct ddr_pmu) { + .pmu = (struct pmu) { + .capabilities = PERF_PMU_CAP_NO_EXCLUDE, + .task_ctx_nr = perf_invalid_context, + .attr_groups = attr_groups, + .event_init = ddr_perf_event_init, + .add = ddr_perf_event_add, + .del = ddr_perf_event_del, + .start = ddr_perf_event_start, + .stop = ddr_perf_event_stop, + .read = ddr_perf_event_update, + .pmu_enable = ddr_perf_pmu_enable, + .pmu_disable = ddr_perf_pmu_disable, + }, + .base = base, + .dev = dev, + }; + + pmu->id = ida_simple_get(&ddr_ida, 0, 0, GFP_KERNEL); + return pmu->id; +} + +static irqreturn_t ddr_perf_irq_handler(int irq, void *p) +{ + int i; + struct ddr_pmu *pmu = (struct ddr_pmu *) p; + struct perf_event *event, *cycle_event = NULL; + + /* all counter will stop if cycle counter disabled */ + ddr_perf_counter_enable(pmu, + EVENT_CYCLES_ID, + EVENT_CYCLES_COUNTER, + false); + /* + * When the cycle counter overflows, all counters are stopped, + * and an IRQ is raised. If any other counter overflows, it + * continues counting, and no IRQ is raised. 
+ * + * Cycles occur at least 4 times as often as other events, so we + * can update all events on a cycle counter overflow and not + * lose events. + * + */ + for (i = 0; i < NUM_COUNTERS; i++) { + + if (!pmu->events[i]) + continue; + + event = pmu->events[i]; + + ddr_perf_event_update(event); + + if (event->hw.idx == EVENT_CYCLES_COUNTER) + cycle_event = event; + } + + ddr_perf_counter_enable(pmu, + EVENT_CYCLES_ID, + EVENT_CYCLES_COUNTER, + true); + if (cycle_event) + ddr_perf_event_update(cycle_event); + + return IRQ_HANDLED; +} + +static int ddr_perf_offline_cpu(unsigned int cpu, struct hlist_node *node) +{ + struct ddr_pmu *pmu = hlist_entry_safe(node, struct ddr_pmu, node); + int target; + + if (cpu != pmu->cpu) + return 0; + + target = cpumask_any_but(cpu_online_mask, cpu); + if (target >= nr_cpu_ids) + return 0; + + perf_pmu_migrate_context(&pmu->pmu, cpu, target); + pmu->cpu = target; + + WARN_ON(irq_set_affinity_hint(pmu->irq, cpumask_of(pmu->cpu))); + + return 0; +} + +static int ddr_perf_probe(struct platform_device *pdev) +{ + struct ddr_pmu *pmu; + struct device_node *np; + void __iomem *base; + char *name; + int num; + int ret; + int irq; + + base = devm_platform_ioremap_resource(pdev, 0); + if (IS_ERR(base)) + return PTR_ERR(base); + + np = pdev->dev.of_node; + + pmu = devm_kzalloc(&pdev->dev, sizeof(*pmu), GFP_KERNEL); + if (!pmu) + return -ENOMEM; + + num = ddr_perf_init(pmu, base, &pdev->dev); + + platform_set_drvdata(pdev, pmu); + + name = devm_kasprintf(&pdev->dev, GFP_KERNEL, DDR_PERF_DEV_NAME "%d", + num); + if (!name) + return -ENOMEM; + + pmu->cpu = raw_smp_processor_id(); + ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, + DDR_CPUHP_CB_NAME, + NULL, + ddr_perf_offline_cpu); + + if (ret < 0) { + dev_err(&pdev->dev, "cpuhp_setup_state_multi failed\n"); + goto ddr_perf_err; + } + + pmu->cpuhp_state = ret; + + /* Register the pmu instance for cpu hotplug */ + cpuhp_state_add_instance_nocalls(pmu->cpuhp_state, &pmu->node); + + /* Request irq */ + irq = of_irq_get(np, 0); + if (irq < 0) { + dev_err(&pdev->dev, "Failed to get irq: %d", irq); + ret = irq; + goto ddr_perf_err; + } + + ret = devm_request_irq(&pdev->dev, irq, + ddr_perf_irq_handler, + IRQF_NOBALANCING | IRQF_NO_THREAD, + DDR_CPUHP_CB_NAME, + pmu); + if (ret < 0) { + dev_err(&pdev->dev, "Request irq failed: %d", ret); + goto ddr_perf_err; + } + + pmu->irq = irq; + ret = irq_set_affinity_hint(pmu->irq, cpumask_of(pmu->cpu)); + if (ret) { + dev_err(pmu->dev, "Failed to set interrupt affinity!\n"); + goto ddr_perf_err; + } + + ret = perf_pmu_register(&pmu->pmu, name, -1); + if (ret) + goto ddr_perf_err; + + return 0; + +ddr_perf_err: + if (pmu->cpuhp_state) + cpuhp_state_remove_instance_nocalls(pmu->cpuhp_state, &pmu->node); + + ida_simple_remove(&ddr_ida, pmu->id); + dev_warn(&pdev->dev, "i.MX8 DDR Perf PMU failed (%d), disabled\n", ret); + return ret; +} + +static int ddr_perf_remove(struct platform_device *pdev) +{ + struct ddr_pmu *pmu = platform_get_drvdata(pdev); + + cpuhp_state_remove_instance_nocalls(pmu->cpuhp_state, &pmu->node); + irq_set_affinity_hint(pmu->irq, NULL); + + perf_pmu_unregister(&pmu->pmu); + + ida_simple_remove(&ddr_ida, pmu->id); + return 0; +} + +static struct platform_driver imx_ddr_pmu_driver = { + .driver = { + .name = "imx-ddr-pmu", + .of_match_table = imx_ddr_pmu_dt_ids, + }, + .probe = ddr_perf_probe, + .remove = ddr_perf_remove, +}; + +module_platform_driver(imx_ddr_pmu_driver); +MODULE_LICENSE("GPL v2"); From ae9924667a7ee91d49c91ebbc076451ca6b6293a Mon Sep 17 00:00:00 2001 
From: Frank Li Date: Wed, 1 May 2019 18:43:35 +0000 Subject: [PATCH 25/54] MAINTAINERS: Add maintainer entry for the imx8 DDR PMU driver Frank Li wrote the imx8 DDR PMU driver and has access to the hardware, so add him as maintainer for the files in question. Signed-off-by: Frank Li [will: fixed case of title] Signed-off-by: Will Deacon --- MAINTAINERS | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index a6954776a37e..a2a8b567d5e8 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6328,6 +6328,13 @@ L: linux-i2c@vger.kernel.org S: Maintained F: drivers/i2c/busses/i2c-cpm.c +FREESCALE IMX DDR PMU DRIVER +M: Frank Li +L: linux-arm-kernel@lists.infradead.org +S: Maintained +F: drivers/perf/fsl_imx8_ddr_perf.c +F: Documentation/devicetree/bindings/perf/fsl-imx-ddr.txt + FREESCALE IMX LPI2C DRIVER M: Dong Aisheng L: linux-i2c@vger.kernel.org From 1a2a66db4967d66402501c43bdfe9d68be54f648 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Fri, 12 Apr 2019 11:42:16 +0200 Subject: [PATCH 26/54] arm64: remove redundant 'default n' from Kconfig 'default n' is the default value for any bool or tristate Kconfig setting so there is no need to write it explicitly. Also since commit f467c5640c29 ("kconfig: only write '# CONFIG_FOO is not set' for visible symbols") the Kconfig behavior is the same regardless of 'default n' being present or not: ... One side effect of (and the main motivation for) this change is making the following two definitions behave exactly the same: config FOO bool config FOO bool default n With this change, neither of these will generate a '# CONFIG_FOO is not set' line (assuming FOO isn't selected/implied). That might make it clearer to people that a bare 'default n' is redundant. ... Signed-off-by: Bartlomiej Zolnierkiewicz Signed-off-by: Catalin Marinas --- arch/arm64/Kconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index cf5f1dafcf74..acd72e5f78ae 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -934,7 +934,6 @@ config PARAVIRT config PARAVIRT_TIME_ACCOUNTING bool "Paravirtual steal time accounting" select PARAVIRT - default n help Select this option to enable fine granularity task steal time accounting. Time spent executing other tasks in parallel with From 8f5c9037a55b22e847f636f9a39fa98fe67923d1 Mon Sep 17 00:00:00 2001 From: Masayoshi Mizuma Date: Fri, 14 Jun 2019 09:11:41 -0400 Subject: [PATCH 27/54] arm64/mm: Correct the cache line size warning with non coherent device If the cache line size is greater than ARCH_DMA_MINALIGN (128), the warning shows and it's tainted as TAINT_CPU_OUT_OF_SPEC. However, it's not good because as discussed in the thread [1], the cpu cache line size will be problem only on non-coherent devices. Since the coherent flag is already introduced to struct device, show the warning only if the device is non-coherent device and ARCH_DMA_MINALIGN is smaller than the cpu cache size. 
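The check effectively becomes the following (sketch only, not part of the
diff below; cache_line_size_of_cpu() is the helper introduced by this patch):

	/* Warn only for non-coherent devices whose CWG exceeds ARCH_DMA_MINALIGN. */
	int cls = cache_line_size_of_cpu();	/* 4 << CTR_EL0.CWG, or ARCH_DMA_MINALIGN */

	WARN_TAINT(!coherent && cls > ARCH_DMA_MINALIGN,
		   TAINT_CPU_OUT_OF_SPEC,
		   "ARCH_DMA_MINALIGN smaller than CTR_EL0.CWG (%d < %d)",
		   ARCH_DMA_MINALIGN, cls);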
[1] https://lore.kernel.org/linux-arm-kernel/20180514145703.celnlobzn3uh5tc2@localhost/ Signed-off-by: Masayoshi Mizuma Reviewed-by: Hidetoshi Seto Tested-by: Zhang Lei [catalin.marinas@arm.com: removed 'if' block for WARN_TAINT] Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/cache.h | 7 +++++++ arch/arm64/kernel/cacheinfo.c | 4 +--- arch/arm64/mm/dma-mapping.c | 12 ++++++++---- 3 files changed, 16 insertions(+), 7 deletions(-) diff --git a/arch/arm64/include/asm/cache.h b/arch/arm64/include/asm/cache.h index 758af6340314..d24b7c1ecd9b 100644 --- a/arch/arm64/include/asm/cache.h +++ b/arch/arm64/include/asm/cache.h @@ -91,6 +91,13 @@ static inline u32 cache_type_cwg(void) #define __read_mostly __attribute__((__section__(".data..read_mostly"))) +static inline int cache_line_size_of_cpu(void) +{ + u32 cwg = cache_type_cwg(); + + return cwg ? 4 << cwg : ARCH_DMA_MINALIGN; +} + int cache_line_size(void); /* diff --git a/arch/arm64/kernel/cacheinfo.c b/arch/arm64/kernel/cacheinfo.c index 0c0cd4d26b87..969fcc3be556 100644 --- a/arch/arm64/kernel/cacheinfo.c +++ b/arch/arm64/kernel/cacheinfo.c @@ -30,12 +30,10 @@ int cache_line_size(void) { - u32 cwg = cache_type_cwg(); - if (coherency_max_size != 0) return coherency_max_size; - return cwg ? 4 << cwg : ARCH_DMA_MINALIGN; + return cache_line_size_of_cpu(); } EXPORT_SYMBOL_GPL(cache_line_size); diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c index 674860e3e478..ff410195dc1c 100644 --- a/arch/arm64/mm/dma-mapping.c +++ b/arch/arm64/mm/dma-mapping.c @@ -91,10 +91,6 @@ static int __swiotlb_mmap_pfn(struct vm_area_struct *vma, static int __init arm64_dma_init(void) { - WARN_TAINT(ARCH_DMA_MINALIGN < cache_line_size(), - TAINT_CPU_OUT_OF_SPEC, - "ARCH_DMA_MINALIGN smaller than CTR_EL0.CWG (%d < %d)", - ARCH_DMA_MINALIGN, cache_line_size()); return dma_atomic_pool_init(GFP_DMA32, __pgprot(PROT_NORMAL_NC)); } arch_initcall(arm64_dma_init); @@ -472,6 +468,14 @@ static void __iommu_setup_dma_ops(struct device *dev, u64 dma_base, u64 size, void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size, const struct iommu_ops *iommu, bool coherent) { + int cls = cache_line_size_of_cpu(); + + WARN_TAINT(!coherent && cls > ARCH_DMA_MINALIGN, + TAINT_CPU_OUT_OF_SPEC, + "%s %s: ARCH_DMA_MINALIGN smaller than CTR_EL0.CWG (%d < %d)", + dev_driver_string(dev), dev_name(dev), + ARCH_DMA_MINALIGN, cls); + dev->dma_coherent = coherent; __iommu_setup_dma_ops(dev, dma_base, size, iommu); From 9034f6251572a4744597c51dea5ab73a55f2b938 Mon Sep 17 00:00:00 2001 From: Julien Thierry Date: Tue, 11 Jun 2019 10:38:06 +0100 Subject: [PATCH 28/54] arm64: Do not enable IRQs for ct_user_exit For el0_dbg and el0_error, DAIF bits get explicitly cleared before calling ct_user_exit. When context tracking is disabled, DAIF gets set (almost) immediately after. When context tracking is enabled, among the first things done is disabling IRQs. What is actually needed is: - PSR.D = 0 so the system can be debugged (should be already the case) - PSR.A = 0 so async error can be handled during context tracking Do not clear PSR.I in those two locations. 
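For reference, the PSTATE mask bits involved are (values as defined by the
arm64 uapi ptrace.h at the time of this series; listed only to make the
D/A/I/F discussion above concrete):

	#define PSR_F_BIT	0x00000040	/* FIQ mask */
	#define PSR_I_BIT	0x00000080	/* IRQ mask */
	#define PSR_A_BIT	0x00000100	/* SError (asynchronous abort) mask */
	#define PSR_D_BIT	0x00000200	/* Debug exception mask */

The enable_da_f macro used below clears PSTATE.{D,A,F} via "msr daifclr"
while leaving PSTATE.I untouched, whereas enable_daif clears all four bits.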
Reviewed-by: Marc Zyngier Acked-by: Mark Rutland Reviewed-by: James Morse Cc: Will Deacon Signed-off-by: Julien Thierry Signed-off-by: Catalin Marinas --- arch/arm64/kernel/entry.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index cd0c7af8e4a8..89ab6bd896c4 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -870,7 +870,7 @@ el0_dbg: mov x1, x25 mov x2, sp bl do_debug_exception - enable_daif + enable_da_f ct_user_exit b ret_to_user el0_inv: @@ -922,7 +922,7 @@ el0_error_naked: enable_dbg mov x0, sp bl do_serror - enable_daif + enable_da_f ct_user_exit b ret_to_user ENDPROC(el0_error) From 19c36b185a1d13f79f3a382e08695a2633155e5a Mon Sep 17 00:00:00 2001 From: Julien Thierry Date: Tue, 11 Jun 2019 10:38:07 +0100 Subject: [PATCH 29/54] arm64: irqflags: Pass flags as readonly operand to restore instruction Flags are only read by the instructions doing the irqflags restore operation. Pass the operand as read only to the asm inline instead of read-write. Cc: Will Deacon Reviewed-by: Marc Zyngier Acked-by: Mark Rutland Signed-off-by: Julien Thierry Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/irqflags.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/irqflags.h b/arch/arm64/include/asm/irqflags.h index 629963189085..9c93152c9af7 100644 --- a/arch/arm64/include/asm/irqflags.h +++ b/arch/arm64/include/asm/irqflags.h @@ -119,8 +119,8 @@ static inline void arch_local_irq_restore(unsigned long flags) __msr_s(SYS_ICC_PMR_EL1, "%0") "dsb sy", ARM64_HAS_IRQ_PRIO_MASKING) - : "+r" (flags) : + : "r" (flags) : "memory"); } From f57065782f245ca96f1472209a485073bbc11247 Mon Sep 17 00:00:00 2001 From: Julien Thierry Date: Tue, 11 Jun 2019 10:38:08 +0100 Subject: [PATCH 30/54] arm64: irqflags: Add condition flags to inline asm clobber list Some of the inline assembly instruction use the condition flags and need to include "cc" in the clobber list. Fixes: 4a503217ce37 ("arm64: irqflags: Use ICC_PMR_EL1 for interrupt masking") Cc: # 5.1.x- Suggested-by: Marc Zyngier Cc: Will Deacon Reviewed-by: Marc Zyngier Acked-by: Mark Rutland Signed-off-by: Julien Thierry Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/irqflags.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/irqflags.h b/arch/arm64/include/asm/irqflags.h index 9c93152c9af7..fbe1aba6ffb3 100644 --- a/arch/arm64/include/asm/irqflags.h +++ b/arch/arm64/include/asm/irqflags.h @@ -92,7 +92,7 @@ static inline unsigned long arch_local_save_flags(void) ARM64_HAS_IRQ_PRIO_MASKING) : "=&r" (flags), "+r" (daif_bits) : "r" ((unsigned long) GIC_PRIO_IRQOFF) - : "memory"); + : "cc", "memory"); return flags; } @@ -136,7 +136,7 @@ static inline int arch_irqs_disabled_flags(unsigned long flags) ARM64_HAS_IRQ_PRIO_MASKING) : "=&r" (res) : "r" ((int) flags) - : "memory"); + : "cc", "memory"); return res; } From 17ce302f3117e9518395847a3120c8a108b587b8 Mon Sep 17 00:00:00 2001 From: Julien Thierry Date: Tue, 11 Jun 2019 10:38:09 +0100 Subject: [PATCH 31/54] arm64: Fix interrupt tracing in the presence of NMIs In the presence of any form of instrumentation, nmi_enter() should be done before calling any traceable code and any instrumentation code. Currently, nmi_enter() is done in handle_domain_nmi(), which is much too late as instrumentation code might get called before. Move the nmi_enter/exit() calls to the arch IRQ vector handler. 
On arm64, it is not possible to know if the IRQ vector handler was called because of an NMI before acknowledging the interrupt. However, It is possible to know whether normal interrupts could be taken in the interrupted context (i.e. if taking an NMI in that context could introduce a potential race condition). When interrupting a context with IRQs disabled, call nmi_enter() as soon as possible. In contexts with IRQs enabled, defer this to the interrupt controller, which is in a better position to know if an interrupt taken is an NMI. Fixes: bc3c03ccb464 ("arm64: Enable the support of pseudo-NMIs") Cc: # 5.1.x- Cc: Will Deacon Cc: Thomas Gleixner Cc: Jason Cooper Cc: Mark Rutland Reviewed-by: Marc Zyngier Signed-off-by: Julien Thierry Signed-off-by: Catalin Marinas --- arch/arm64/kernel/entry.S | 46 ++++++++++++++++++++++++++---------- arch/arm64/kernel/irq.c | 17 +++++++++++++ drivers/irqchip/irq-gic-v3.c | 7 ++++++ kernel/irq/irqdesc.c | 8 +++++-- 4 files changed, 64 insertions(+), 14 deletions(-) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 89ab6bd896c4..6d5966346710 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -435,6 +435,20 @@ tsk .req x28 // current thread_info irq_stack_exit .endm +#ifdef CONFIG_ARM64_PSEUDO_NMI + /* + * Set res to 0 if irqs were unmasked in interrupted context. + * Otherwise set res to non-0 value. + */ + .macro test_irqs_unmasked res:req, pmr:req +alternative_if ARM64_HAS_IRQ_PRIO_MASKING + sub \res, \pmr, #GIC_PRIO_IRQON +alternative_else + mov \res, xzr +alternative_endif + .endm +#endif + .text /* @@ -631,21 +645,21 @@ ENDPROC(el1_sync) el1_irq: kernel_entry 1 enable_da_f -#ifdef CONFIG_TRACE_IRQFLAGS + #ifdef CONFIG_ARM64_PSEUDO_NMI alternative_if ARM64_HAS_IRQ_PRIO_MASKING ldr x20, [sp, #S_PMR_SAVE] -alternative_else - mov x20, #GIC_PRIO_IRQON -alternative_endif - cmp x20, #GIC_PRIO_IRQOFF - /* Irqs were disabled, don't trace */ - b.ls 1f -#endif - bl trace_hardirqs_off +alternative_else_nop_endif + test_irqs_unmasked res=x0, pmr=x20 + cbz x0, 1f + bl asm_nmi_enter 1: #endif +#ifdef CONFIG_TRACE_IRQFLAGS + bl trace_hardirqs_off +#endif + irq_handler #ifdef CONFIG_PREEMPT @@ -662,14 +676,22 @@ alternative_else_nop_endif bl preempt_schedule_irq // irq en/disable is done inside 1: #endif -#ifdef CONFIG_TRACE_IRQFLAGS + #ifdef CONFIG_ARM64_PSEUDO_NMI /* * if IRQs were disabled when we received the interrupt, we have an NMI * and we are not re-enabling interrupt upon eret. Skip tracing. 
*/ - cmp x20, #GIC_PRIO_IRQOFF - b.ls 1f + test_irqs_unmasked res=x0, pmr=x20 + cbz x0, 1f + bl asm_nmi_exit +1: +#endif + +#ifdef CONFIG_TRACE_IRQFLAGS +#ifdef CONFIG_ARM64_PSEUDO_NMI + test_irqs_unmasked res=x0, pmr=x20 + cbnz x0, 1f #endif bl trace_hardirqs_on 1: diff --git a/arch/arm64/kernel/irq.c b/arch/arm64/kernel/irq.c index 92fa81798fb9..fdd9cb27fed5 100644 --- a/arch/arm64/kernel/irq.c +++ b/arch/arm64/kernel/irq.c @@ -27,8 +27,10 @@ #include #include #include +#include #include #include +#include #include unsigned long irq_err_count; @@ -76,3 +78,18 @@ void __init init_IRQ(void) if (!handle_arch_irq) panic("No interrupt controller found."); } + +/* + * Stubs to make nmi_enter/exit() code callable from ASM + */ +asmlinkage void notrace asm_nmi_enter(void) +{ + nmi_enter(); +} +NOKPROBE_SYMBOL(asm_nmi_enter); + +asmlinkage void notrace asm_nmi_exit(void) +{ + nmi_exit(); +} +NOKPROBE_SYMBOL(asm_nmi_exit); diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c index f44cd89cfc40..b176700bb387 100644 --- a/drivers/irqchip/irq-gic-v3.c +++ b/drivers/irqchip/irq-gic-v3.c @@ -472,8 +472,12 @@ static void gic_deactivate_unhandled(u32 irqnr) static inline void gic_handle_nmi(u32 irqnr, struct pt_regs *regs) { + bool irqs_enabled = interrupts_enabled(regs); int err; + if (irqs_enabled) + nmi_enter(); + if (static_branch_likely(&supports_deactivate_key)) gic_write_eoir(irqnr); /* @@ -485,6 +489,9 @@ static inline void gic_handle_nmi(u32 irqnr, struct pt_regs *regs) err = handle_domain_nmi(gic_data.domain, irqnr, regs); if (err) gic_deactivate_unhandled(irqnr); + + if (irqs_enabled) + nmi_exit(); } static asmlinkage void __exception_irq_entry gic_handle_irq(struct pt_regs *regs) diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index c52b737ab8e3..a92b33593b8d 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -680,6 +680,8 @@ int __handle_domain_irq(struct irq_domain *domain, unsigned int hwirq, * @hwirq: The HW irq number to convert to a logical one * @regs: Register file coming from the low-level handling code * + * This function must be called from an NMI context. + * * Returns: 0 on success, or -EINVAL if conversion has failed */ int handle_domain_nmi(struct irq_domain *domain, unsigned int hwirq, @@ -689,7 +691,10 @@ int handle_domain_nmi(struct irq_domain *domain, unsigned int hwirq, unsigned int irq; int ret = 0; - nmi_enter(); + /* + * NMI context needs to be setup earlier in order to deal with tracing. + */ + WARN_ON(!in_nmi()); irq = irq_find_mapping(domain, hwirq); @@ -702,7 +707,6 @@ int handle_domain_nmi(struct irq_domain *domain, unsigned int hwirq, else ret = -EINVAL; - nmi_exit(); set_irq_regs(old_regs); return ret; } From bd82d4bd21880b7c4d5f5756be435095d6ae07b5 Mon Sep 17 00:00:00 2001 From: Julien Thierry Date: Tue, 11 Jun 2019 10:38:10 +0100 Subject: [PATCH 32/54] arm64: Fix incorrect irqflag restore for priority masking When using IRQ priority masking to disable interrupts, in order to deal with the PSR.I state, local_irq_save() would convert the I bit into a PMR value (GIC_PRIO_IRQOFF). This resulted in local_irq_restore() potentially modifying the value of PMR in undesired location due to the state of PSR.I upon flag saving [1]. In an attempt to solve this issue in a less hackish manner, introduce a bit (GIC_PRIO_IGNORE_PMR) for the PMR values that can represent whether PSR.I is being used to disable interrupts, in which case it takes precedence of the status of interrupt masking via PMR. 
GIC_PRIO_PSR_I_SET is chosen such that ( | GIC_PRIO_PSR_I_SET) does not mask more interrupts than as some sections (e.g. arch_cpu_idle(), interrupt acknowledge path) requires PMR not to mask interrupts that could be signaled to the CPU when using only PSR.I. [1] https://www.spinics.net/lists/arm-kernel/msg716956.html Fixes: 4a503217ce37 ("arm64: irqflags: Use ICC_PMR_EL1 for interrupt masking") Cc: # 5.1.x- Reported-by: Zenghui Yu Cc: Steven Rostedt Cc: Wei Li Cc: Will Deacon Cc: Christoffer Dall Cc: James Morse Cc: Suzuki K Pouloze Cc: Oleg Nesterov Reviewed-by: Marc Zyngier Signed-off-by: Julien Thierry Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/arch_gicv3.h | 4 +- arch/arm64/include/asm/daifflags.h | 72 +++++++++++++++++------------ arch/arm64/include/asm/irqflags.h | 67 +++++++++++---------------- arch/arm64/include/asm/kvm_host.h | 7 +-- arch/arm64/include/asm/ptrace.h | 10 +++- arch/arm64/kernel/entry.S | 38 +++++++++++++-- arch/arm64/kernel/process.c | 2 +- arch/arm64/kernel/smp.c | 8 ++-- arch/arm64/kvm/hyp/switch.c | 2 +- 9 files changed, 125 insertions(+), 85 deletions(-) diff --git a/arch/arm64/include/asm/arch_gicv3.h b/arch/arm64/include/asm/arch_gicv3.h index 14b41ddc68ba..9e991b628706 100644 --- a/arch/arm64/include/asm/arch_gicv3.h +++ b/arch/arm64/include/asm/arch_gicv3.h @@ -163,7 +163,9 @@ static inline bool gic_prio_masking_enabled(void) static inline void gic_pmr_mask_irqs(void) { - BUILD_BUG_ON(GICD_INT_DEF_PRI <= GIC_PRIO_IRQOFF); + BUILD_BUG_ON(GICD_INT_DEF_PRI < (GIC_PRIO_IRQOFF | + GIC_PRIO_PSR_I_SET)); + BUILD_BUG_ON(GICD_INT_DEF_PRI >= GIC_PRIO_IRQON); gic_write_pmr(GIC_PRIO_IRQOFF); } diff --git a/arch/arm64/include/asm/daifflags.h b/arch/arm64/include/asm/daifflags.h index db452aa9e651..f93204f319da 100644 --- a/arch/arm64/include/asm/daifflags.h +++ b/arch/arm64/include/asm/daifflags.h @@ -18,6 +18,7 @@ #include +#include #include #define DAIF_PROCCTX 0 @@ -32,6 +33,11 @@ static inline void local_daif_mask(void) : : : "memory"); + + /* Don't really care for a dsb here, we don't intend to enable IRQs */ + if (system_uses_irq_prio_masking()) + gic_write_pmr(GIC_PRIO_IRQON | GIC_PRIO_PSR_I_SET); + trace_hardirqs_off(); } @@ -43,7 +49,7 @@ static inline unsigned long local_daif_save(void) if (system_uses_irq_prio_masking()) { /* If IRQs are masked with PMR, reflect it in the flags */ - if (read_sysreg_s(SYS_ICC_PMR_EL1) <= GIC_PRIO_IRQOFF) + if (read_sysreg_s(SYS_ICC_PMR_EL1) != GIC_PRIO_IRQON) flags |= PSR_I_BIT; } @@ -59,36 +65,44 @@ static inline void local_daif_restore(unsigned long flags) if (!irq_disabled) { trace_hardirqs_on(); - if (system_uses_irq_prio_masking()) - arch_local_irq_enable(); - } else if (!(flags & PSR_A_BIT)) { - /* - * If interrupts are disabled but we can take - * asynchronous errors, we can take NMIs - */ if (system_uses_irq_prio_masking()) { - flags &= ~PSR_I_BIT; - /* - * There has been concern that the write to daif - * might be reordered before this write to PMR. - * From the ARM ARM DDI 0487D.a, section D1.7.1 - * "Accessing PSTATE fields": - * Writes to the PSTATE fields have side-effects on - * various aspects of the PE operation. All of these - * side-effects are guaranteed: - * - Not to be visible to earlier instructions in - * the execution stream. - * - To be visible to later instructions in the - * execution stream - * - * Also, writes to PMR are self-synchronizing, so no - * interrupts with a lower priority than PMR is signaled - * to the PE after the write. 
- * - * So we don't need additional synchronization here. - */ - arch_local_irq_disable(); + gic_write_pmr(GIC_PRIO_IRQON); + dsb(sy); } + } else if (system_uses_irq_prio_masking()) { + u64 pmr; + + if (!(flags & PSR_A_BIT)) { + /* + * If interrupts are disabled but we can take + * asynchronous errors, we can take NMIs + */ + flags &= ~PSR_I_BIT; + pmr = GIC_PRIO_IRQOFF; + } else { + pmr = GIC_PRIO_IRQON | GIC_PRIO_PSR_I_SET; + } + + /* + * There has been concern that the write to daif + * might be reordered before this write to PMR. + * From the ARM ARM DDI 0487D.a, section D1.7.1 + * "Accessing PSTATE fields": + * Writes to the PSTATE fields have side-effects on + * various aspects of the PE operation. All of these + * side-effects are guaranteed: + * - Not to be visible to earlier instructions in + * the execution stream. + * - To be visible to later instructions in the + * execution stream + * + * Also, writes to PMR are self-synchronizing, so no + * interrupts with a lower priority than PMR is signaled + * to the PE after the write. + * + * So we don't need additional synchronization here. + */ + gic_write_pmr(pmr); } write_sysreg(flags, daif); diff --git a/arch/arm64/include/asm/irqflags.h b/arch/arm64/include/asm/irqflags.h index fbe1aba6ffb3..a1372722f12e 100644 --- a/arch/arm64/include/asm/irqflags.h +++ b/arch/arm64/include/asm/irqflags.h @@ -67,43 +67,46 @@ static inline void arch_local_irq_disable(void) */ static inline unsigned long arch_local_save_flags(void) { - unsigned long daif_bits; unsigned long flags; - daif_bits = read_sysreg(daif); - - /* - * The asm is logically equivalent to: - * - * if (system_uses_irq_prio_masking()) - * flags = (daif_bits & PSR_I_BIT) ? - * GIC_PRIO_IRQOFF : - * read_sysreg_s(SYS_ICC_PMR_EL1); - * else - * flags = daif_bits; - */ asm volatile(ALTERNATIVE( - "mov %0, %1\n" - "nop\n" - "nop", - __mrs_s("%0", SYS_ICC_PMR_EL1) - "ands %1, %1, " __stringify(PSR_I_BIT) "\n" - "csel %0, %0, %2, eq", - ARM64_HAS_IRQ_PRIO_MASKING) - : "=&r" (flags), "+r" (daif_bits) - : "r" ((unsigned long) GIC_PRIO_IRQOFF) - : "cc", "memory"); + "mrs %0, daif", + __mrs_s("%0", SYS_ICC_PMR_EL1), + ARM64_HAS_IRQ_PRIO_MASKING) + : "=&r" (flags) + : + : "memory"); return flags; } +static inline int arch_irqs_disabled_flags(unsigned long flags) +{ + int res; + + asm volatile(ALTERNATIVE( + "and %w0, %w1, #" __stringify(PSR_I_BIT), + "eor %w0, %w1, #" __stringify(GIC_PRIO_IRQON), + ARM64_HAS_IRQ_PRIO_MASKING) + : "=&r" (res) + : "r" ((int) flags) + : "memory"); + + return res; +} + static inline unsigned long arch_local_irq_save(void) { unsigned long flags; flags = arch_local_save_flags(); - arch_local_irq_disable(); + /* + * There are too many states with IRQs disabled, just keep the current + * state if interrupts are already disabled/masked. 
+ */ + if (!arch_irqs_disabled_flags(flags)) + arch_local_irq_disable(); return flags; } @@ -124,21 +127,5 @@ static inline void arch_local_irq_restore(unsigned long flags) : "memory"); } -static inline int arch_irqs_disabled_flags(unsigned long flags) -{ - int res; - - asm volatile(ALTERNATIVE( - "and %w0, %w1, #" __stringify(PSR_I_BIT) "\n" - "nop", - "cmp %w1, #" __stringify(GIC_PRIO_IRQOFF) "\n" - "cset %w0, ls", - ARM64_HAS_IRQ_PRIO_MASKING) - : "=&r" (res) - : "r" ((int) flags) - : "cc", "memory"); - - return res; -} #endif #endif diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 4bcd9c1291d5..33410635b015 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -608,11 +608,12 @@ static inline void kvm_arm_vhe_guest_enter(void) * will not signal the CPU of interrupts of lower priority, and the * only way to get out will be via guest exceptions. * Naturally, we want to avoid this. + * + * local_daif_mask() already sets GIC_PRIO_PSR_I_SET, we just need a + * dsb to ensure the redistributor is forwards EL2 IRQs to the CPU. */ - if (system_uses_irq_prio_masking()) { - gic_write_pmr(GIC_PRIO_IRQON); + if (system_uses_irq_prio_masking()) dsb(sy); - } } static inline void kvm_arm_vhe_guest_exit(void) diff --git a/arch/arm64/include/asm/ptrace.h b/arch/arm64/include/asm/ptrace.h index b2de32939ada..da2242248466 100644 --- a/arch/arm64/include/asm/ptrace.h +++ b/arch/arm64/include/asm/ptrace.h @@ -35,9 +35,15 @@ * means masking more IRQs (or at least that the same IRQs remain masked). * * To mask interrupts, we clear the most significant bit of PMR. + * + * Some code sections either automatically switch back to PSR.I or explicitly + * require to not use priority masking. If bit GIC_PRIO_PSR_I_SET is included + * in the the priority mask, it indicates that PSR.I should be set and + * interrupt disabling temporarily does not rely on IRQ priorities. 
*/ -#define GIC_PRIO_IRQON 0xf0 -#define GIC_PRIO_IRQOFF (GIC_PRIO_IRQON & ~0x80) +#define GIC_PRIO_IRQON 0xc0 +#define GIC_PRIO_IRQOFF (GIC_PRIO_IRQON & ~0x80) +#define GIC_PRIO_PSR_I_SET (1 << 4) /* Additional SPSR bits not exposed in the UABI */ #define PSR_IL_BIT (1 << 20) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 6d5966346710..165da78815c5 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -258,6 +258,7 @@ alternative_else_nop_endif /* * Registers that may be useful after this macro is invoked: * + * x20 - ICC_PMR_EL1 * x21 - aborted SP * x22 - aborted PC * x23 - aborted PSTATE @@ -449,6 +450,24 @@ alternative_endif .endm #endif + .macro gic_prio_kentry_setup, tmp:req +#ifdef CONFIG_ARM64_PSEUDO_NMI + alternative_if ARM64_HAS_IRQ_PRIO_MASKING + mov \tmp, #(GIC_PRIO_PSR_I_SET | GIC_PRIO_IRQON) + msr_s SYS_ICC_PMR_EL1, \tmp + alternative_else_nop_endif +#endif + .endm + + .macro gic_prio_irq_setup, pmr:req, tmp:req +#ifdef CONFIG_ARM64_PSEUDO_NMI + alternative_if ARM64_HAS_IRQ_PRIO_MASKING + orr \tmp, \pmr, #GIC_PRIO_PSR_I_SET + msr_s SYS_ICC_PMR_EL1, \tmp + alternative_else_nop_endif +#endif + .endm + .text /* @@ -627,6 +646,7 @@ el1_dbg: cmp x24, #ESR_ELx_EC_BRK64 // if BRK64 cinc x24, x24, eq // set bit '0' tbz x24, #0, el1_inv // EL1 only + gic_prio_kentry_setup tmp=x3 mrs x0, far_el1 mov x2, sp // struct pt_regs bl do_debug_exception @@ -644,12 +664,10 @@ ENDPROC(el1_sync) .align 6 el1_irq: kernel_entry 1 + gic_prio_irq_setup pmr=x20, tmp=x1 enable_da_f #ifdef CONFIG_ARM64_PSEUDO_NMI -alternative_if ARM64_HAS_IRQ_PRIO_MASKING - ldr x20, [sp, #S_PMR_SAVE] -alternative_else_nop_endif test_irqs_unmasked res=x0, pmr=x20 cbz x0, 1f bl asm_nmi_enter @@ -679,8 +697,9 @@ alternative_else_nop_endif #ifdef CONFIG_ARM64_PSEUDO_NMI /* - * if IRQs were disabled when we received the interrupt, we have an NMI - * and we are not re-enabling interrupt upon eret. Skip tracing. + * When using IRQ priority masking, we can get spurious interrupts while + * PMR is set to GIC_PRIO_IRQOFF. An NMI might also have occurred in a + * section with interrupts disabled. Skip tracing in those cases. 
*/ test_irqs_unmasked res=x0, pmr=x20 cbz x0, 1f @@ -809,6 +828,7 @@ el0_ia: * Instruction abort handling */ mrs x26, far_el1 + gic_prio_kentry_setup tmp=x0 enable_da_f #ifdef CONFIG_TRACE_IRQFLAGS bl trace_hardirqs_off @@ -854,6 +874,7 @@ el0_sp_pc: * Stack or PC alignment exception handling */ mrs x26, far_el1 + gic_prio_kentry_setup tmp=x0 enable_da_f #ifdef CONFIG_TRACE_IRQFLAGS bl trace_hardirqs_off @@ -888,6 +909,7 @@ el0_dbg: * Debug exception handling */ tbnz x24, #0, el0_inv // EL0 only + gic_prio_kentry_setup tmp=x3 mrs x0, far_el1 mov x1, x25 mov x2, sp @@ -909,7 +931,9 @@ ENDPROC(el0_sync) el0_irq: kernel_entry 0 el0_irq_naked: + gic_prio_irq_setup pmr=x20, tmp=x0 enable_da_f + #ifdef CONFIG_TRACE_IRQFLAGS bl trace_hardirqs_off #endif @@ -931,6 +955,7 @@ ENDPROC(el0_irq) el1_error: kernel_entry 1 mrs x1, esr_el1 + gic_prio_kentry_setup tmp=x2 enable_dbg mov x0, sp bl do_serror @@ -941,6 +966,7 @@ el0_error: kernel_entry 0 el0_error_naked: mrs x1, esr_el1 + gic_prio_kentry_setup tmp=x2 enable_dbg mov x0, sp bl do_serror @@ -965,6 +991,7 @@ work_pending: */ ret_to_user: disable_daif + gic_prio_kentry_setup tmp=x3 ldr x1, [tsk, #TSK_TI_FLAGS] and x2, x1, #_TIF_WORK_MASK cbnz x2, work_pending @@ -981,6 +1008,7 @@ ENDPROC(ret_to_user) */ .align 6 el0_svc: + gic_prio_kentry_setup tmp=x1 mov x0, sp bl el0_svc_handler b ret_to_user diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 3767fb21a5b8..58efc3727778 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -94,7 +94,7 @@ static void __cpu_do_idle_irqprio(void) * be raised. */ pmr = gic_read_pmr(); - gic_write_pmr(GIC_PRIO_IRQON); + gic_write_pmr(GIC_PRIO_IRQON | GIC_PRIO_PSR_I_SET); __cpu_do_idle(); diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index bb4b3f07761a..4deaee3c2a33 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -192,11 +192,13 @@ static void init_gic_priority_masking(void) WARN_ON(!(cpuflags & PSR_I_BIT)); - gic_write_pmr(GIC_PRIO_IRQOFF); - /* We can only unmask PSR.I if we can take aborts */ - if (!(cpuflags & PSR_A_BIT)) + if (!(cpuflags & PSR_A_BIT)) { + gic_write_pmr(GIC_PRIO_IRQOFF); write_sysreg(cpuflags & ~PSR_I_BIT, daif); + } else { + gic_write_pmr(GIC_PRIO_IRQON | GIC_PRIO_PSR_I_SET); + } } /* diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c index 8799e0c267d4..b89fcf0173b7 100644 --- a/arch/arm64/kvm/hyp/switch.c +++ b/arch/arm64/kvm/hyp/switch.c @@ -615,7 +615,7 @@ int __hyp_text __kvm_vcpu_run_nvhe(struct kvm_vcpu *vcpu) * Naturally, we want to avoid this. */ if (system_uses_irq_prio_masking()) { - gic_write_pmr(GIC_PRIO_IRQON); + gic_write_pmr(GIC_PRIO_IRQON | GIC_PRIO_PSR_I_SET); dsb(sy); } From 48ce8f80f5901f1f031b00be66d659d39f33b0a1 Mon Sep 17 00:00:00 2001 From: Julien Thierry Date: Tue, 11 Jun 2019 10:38:11 +0100 Subject: [PATCH 33/54] arm64: irqflags: Introduce explicit debugging for IRQ priorities Using IRQ priority masking to enable/disable interrupts is a bit sensitive as it requires to deal with both ICC_PMR_EL1 and PSR.I. Introduce some validity checks to both highlight the states in which functions dealing with IRQ enabling/disabling can (not) be called, and bark a warning when called in an unexpected state. Since these checks are done on hotpaths, introduce a build option to choose whether to do the checking. 
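As a rough illustration (sketch only, using the helpers added below), the
kind of misuse these checks are intended to flag is:

	/*
	 * With CONFIG_ARM64_DEBUG_PRIORITY_MASKING=y, toggling IRQs while
	 * PMR still carries GIC_PRIO_PSR_I_SET triggers a WARN_ON_ONCE().
	 */
	local_daif_mask();	/* PMR left at GIC_PRIO_IRQON | GIC_PRIO_PSR_I_SET */
	local_irq_enable();	/* wrong: PMR is neither GIC_PRIO_IRQON nor GIC_PRIO_IRQOFF */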
Cc: Will Deacon Reviewed-by: Marc Zyngier Signed-off-by: Julien Thierry Signed-off-by: Catalin Marinas --- arch/arm64/Kconfig | 11 +++++++++++ arch/arm64/include/asm/cpufeature.h | 6 ++++++ arch/arm64/include/asm/daifflags.h | 7 +++++++ arch/arm64/include/asm/irqflags.h | 12 ++++++++++++ 4 files changed, 36 insertions(+) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index acd72e5f78ae..bd3915ae7b53 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1436,6 +1436,17 @@ config ARM64_PSEUDO_NMI If unsure, say N +if ARM64_PSEUDO_NMI +config ARM64_DEBUG_PRIORITY_MASKING + bool "Debug interrupt priority masking" + help + This adds runtime checks to functions enabling/disabling + interrupts when using priority masking. The additional checks verify + the validity of ICC_PMR_EL1 when calling concerned functions. + + If unsure, say N +endif + config RELOCATABLE bool help diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index bc895c869892..693a086e2148 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -617,6 +617,12 @@ static inline bool system_uses_irq_prio_masking(void) cpus_have_const_cap(ARM64_HAS_IRQ_PRIO_MASKING); } +static inline bool system_has_prio_mask_debugging(void) +{ + return IS_ENABLED(CONFIG_ARM64_DEBUG_PRIORITY_MASKING) && + system_uses_irq_prio_masking(); +} + #define ARM64_SSBD_UNKNOWN -1 #define ARM64_SSBD_FORCE_DISABLE 0 #define ARM64_SSBD_KERNEL 1 diff --git a/arch/arm64/include/asm/daifflags.h b/arch/arm64/include/asm/daifflags.h index f93204f319da..eca5bee1d85b 100644 --- a/arch/arm64/include/asm/daifflags.h +++ b/arch/arm64/include/asm/daifflags.h @@ -28,6 +28,10 @@ /* mask/save/unmask/restore all exceptions, including interrupts. */ static inline void local_daif_mask(void) { + WARN_ON(system_has_prio_mask_debugging() && + (read_sysreg_s(SYS_ICC_PMR_EL1) == (GIC_PRIO_IRQOFF | + GIC_PRIO_PSR_I_SET))); + asm volatile( "msr daifset, #0xf // local_daif_mask\n" : @@ -62,6 +66,9 @@ static inline void local_daif_restore(unsigned long flags) { bool irq_disabled = flags & PSR_I_BIT; + WARN_ON(system_has_prio_mask_debugging() && + !(read_sysreg(daif) & PSR_I_BIT)); + if (!irq_disabled) { trace_hardirqs_on(); diff --git a/arch/arm64/include/asm/irqflags.h b/arch/arm64/include/asm/irqflags.h index a1372722f12e..cac2d2a3c24e 100644 --- a/arch/arm64/include/asm/irqflags.h +++ b/arch/arm64/include/asm/irqflags.h @@ -40,6 +40,12 @@ */ static inline void arch_local_irq_enable(void) { + if (system_has_prio_mask_debugging()) { + u32 pmr = read_sysreg_s(SYS_ICC_PMR_EL1); + + WARN_ON_ONCE(pmr != GIC_PRIO_IRQON && pmr != GIC_PRIO_IRQOFF); + } + asm volatile(ALTERNATIVE( "msr daifclr, #2 // arch_local_irq_enable\n" "nop", @@ -53,6 +59,12 @@ static inline void arch_local_irq_enable(void) static inline void arch_local_irq_disable(void) { + if (system_has_prio_mask_debugging()) { + u32 pmr = read_sysreg_s(SYS_ICC_PMR_EL1); + + WARN_ON_ONCE(pmr != GIC_PRIO_IRQON && pmr != GIC_PRIO_IRQOFF); + } + asm volatile(ALTERNATIVE( "msr daifset, #2 // arch_local_irq_disable", __msr_s(SYS_ICC_PMR_EL1, "%0"), From e1d22385ea6686ff3dcd7092d84465c193849829 Mon Sep 17 00:00:00 2001 From: Wei Li Date: Tue, 11 Jun 2019 10:38:12 +0100 Subject: [PATCH 34/54] arm64: fix kernel stack overflow in kdump capture kernel When enabling ARM64_PSEUDO_NMI feature in kdump capture kernel, it will report a kernel stack overflow exception: [ 0.000000] CPU features: detected: IRQ priority masking [ 0.000000] alternatives: patching kernel 
code [ 0.000000] Insufficient stack space to handle exception! [ 0.000000] ESR: 0x96000044 -- DABT (current EL) [ 0.000000] FAR: 0x0000000000000040 [ 0.000000] Task stack: [0xffff0000097f0000..0xffff0000097f4000] [ 0.000000] IRQ stack: [0x0000000000000000..0x0000000000004000] [ 0.000000] Overflow stack: [0xffff80002b7cf290..0xffff80002b7d0290] [ 0.000000] CPU: 0 PID: 0 Comm: swapper Not tainted 4.19.34-lw+ #3 [ 0.000000] pstate: 400003c5 (nZcv DAIF -PAN -UAO) [ 0.000000] pc : el1_sync+0x0/0xb8 [ 0.000000] lr : el1_irq+0xb8/0x140 [ 0.000000] sp : 0000000000000040 [ 0.000000] pmr_save: 00000070 [ 0.000000] x29: ffff0000097f3f60 x28: ffff000009806240 [ 0.000000] x27: 0000000080000000 x26: 0000000000004000 [ 0.000000] x25: 0000000000000000 x24: ffff000009329028 [ 0.000000] x23: 0000000040000005 x22: ffff000008095c6c [ 0.000000] x21: ffff0000097f3f70 x20: 0000000000000070 [ 0.000000] x19: ffff0000097f3e30 x18: ffffffffffffffff [ 0.000000] x17: 0000000000000000 x16: 0000000000000000 [ 0.000000] x15: ffff0000097f9708 x14: ffff000089a382ef [ 0.000000] x13: ffff000009a382fd x12: ffff000009824000 [ 0.000000] x11: ffff0000097fb7b0 x10: ffff000008730028 [ 0.000000] x9 : ffff000009440018 x8 : 000000000000000d [ 0.000000] x7 : 6b20676e69686374 x6 : 000000000000003b [ 0.000000] x5 : 0000000000000000 x4 : ffff000008093600 [ 0.000000] x3 : 0000000400000008 x2 : 7db2e689fc2b8e00 [ 0.000000] x1 : 0000000000000000 x0 : ffff0000097f3e30 [ 0.000000] Kernel panic - not syncing: kernel stack overflow [ 0.000000] CPU: 0 PID: 0 Comm: swapper Not tainted 4.19.34-lw+ #3 [ 0.000000] Call trace: [ 0.000000] dump_backtrace+0x0/0x1b8 [ 0.000000] show_stack+0x24/0x30 [ 0.000000] dump_stack+0xa8/0xcc [ 0.000000] panic+0x134/0x30c [ 0.000000] __stack_chk_fail+0x0/0x28 [ 0.000000] handle_bad_stack+0xfc/0x108 [ 0.000000] __bad_stack+0x90/0x94 [ 0.000000] el1_sync+0x0/0xb8 [ 0.000000] init_gic_priority_masking+0x4c/0x70 [ 0.000000] smp_prepare_boot_cpu+0x60/0x68 [ 0.000000] start_kernel+0x1e8/0x53c [ 0.000000] ---[ end Kernel panic - not syncing: kernel stack overflow ]--- The reason is init_gic_priority_masking() may unmask PSR.I while the irq stacks are not inited yet. Some "NMI" could be raised unfortunately and it will just go into this exception. In this patch, we just write the PMR in smp_prepare_boot_cpu(), and delay unmasking PSR.I after irq stacks inited in init_IRQ(). Fixes: e79321883842 ("arm64: Switch to PMR masking when starting CPUs") Cc: Will Deacon Reviewed-by: Marc Zyngier Signed-off-by: Wei Li [JT: make init_gic_priority_masking() not modify daif, rebase on other priority masking fixes] Signed-off-by: Julien Thierry Signed-off-by: Catalin Marinas --- arch/arm64/kernel/irq.c | 9 +++++++++ arch/arm64/kernel/smp.c | 8 +------- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/arch/arm64/kernel/irq.c b/arch/arm64/kernel/irq.c index fdd9cb27fed5..e8daa7aa77bc 100644 --- a/arch/arm64/kernel/irq.c +++ b/arch/arm64/kernel/irq.c @@ -77,6 +77,15 @@ void __init init_IRQ(void) irqchip_init(); if (!handle_arch_irq) panic("No interrupt controller found."); + + if (system_uses_irq_prio_masking()) { + /* + * Now that we have a stack for our IRQ handler, set + * the PMR/PSR pair to a consistent state. 
+ */ + WARN_ON(read_sysreg(daif) & PSR_A_BIT); + local_daif_restore(DAIF_PROCCTX_NOIRQ); + } } /* diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index 4deaee3c2a33..83cdb0aa2ff1 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -192,13 +192,7 @@ static void init_gic_priority_masking(void) WARN_ON(!(cpuflags & PSR_I_BIT)); - /* We can only unmask PSR.I if we can take aborts */ - if (!(cpuflags & PSR_A_BIT)) { - gic_write_pmr(GIC_PRIO_IRQOFF); - write_sysreg(cpuflags & ~PSR_I_BIT, daif); - } else { - gic_write_pmr(GIC_PRIO_IRQON | GIC_PRIO_PSR_I_SET); - } + gic_write_pmr(GIC_PRIO_IRQON | GIC_PRIO_PSR_I_SET); } /* From 2a438ffa74c0f3b718dc71ac59fbdce5f2e352e4 Mon Sep 17 00:00:00 2001 From: Julien Thierry Date: Tue, 11 Jun 2019 10:38:13 +0100 Subject: [PATCH 35/54] arm64: Allow selecting Pseudo-NMI again Now that Pseudo-NMI are fixed, allow the use of that option again This reverts commit 96a13f57b946be7a6c10405e4bd780c0b6b6fe63 ("arm64: Kconfig: Make ARM64_PSEUDO_NMI depend on BROKEN for now"). Cc: Will Deacon Reviewed-by: Marc Zyngier Signed-off-by: Julien Thierry Signed-off-by: Catalin Marinas --- arch/arm64/Kconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index bd3915ae7b53..57b02d4ceb71 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1423,7 +1423,6 @@ config ARM64_MODULE_PLTS config ARM64_PSEUDO_NMI bool "Support for NMI-like interrupts" - depends on BROKEN # 1556553607-46531-1-git-send-email-julien.thierry@arm.com select CONFIG_ARM_GIC_V3 help Adds support for mimicking Non-Maskable Interrupts through the use of From 2af22f3ec3ca452f1e79b967f634708ff01ced8a Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 19 Jun 2019 14:18:31 +0200 Subject: [PATCH 36/54] acpi/arm64: ignore 5.1 FADTs that are reported as 5.0 Some Qualcomm Snapdragon based laptops built to run Microsoft Windows are clearly ACPI 5.1 based, given that that is the first ACPI revision that supports ARM, and introduced the FADT 'arm_boot_flags' field, which has a non-zero field on those systems. So in these cases, infer from the ARM boot flags that the FADT must be 5.1 or later, and treat it as 5.1. Acked-by: Sudeep Holla Tested-by: Lee Jones Reviewed-by: Graeme Gregory Acked-by: Lorenzo Pieralisi Acked-by: Hanjun Guo Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas --- arch/arm64/kernel/acpi.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kernel/acpi.c b/arch/arm64/kernel/acpi.c index 803f0494dd3e..7722e85fb69c 100644 --- a/arch/arm64/kernel/acpi.c +++ b/arch/arm64/kernel/acpi.c @@ -155,10 +155,14 @@ static int __init acpi_fadt_sanity_check(void) */ if (table->revision < 5 || (table->revision == 5 && fadt->minor_revision < 1)) { - pr_err("Unsupported FADT revision %d.%d, should be 5.1+\n", + pr_err(FW_BUG "Unsupported FADT revision %d.%d, should be 5.1+\n", table->revision, fadt->minor_revision); - ret = -EINVAL; - goto out; + + if (!fadt->arm_boot_flags) { + ret = -EINVAL; + goto out; + } + pr_err("FADT has ARM boot flags set, assuming 5.1\n"); } if (!(fadt->flags & ACPI_FADT_HW_REDUCED)) { From 58557e486f89823763039502923647037ef7032f Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Mon, 17 Jun 2019 15:29:59 -0700 Subject: [PATCH 37/54] arm64: Allow user selection of ARM64_MODULE_PLTS Make ARM64_MODULE_PLTS a selectable Kconfig symbol, since some people might have very big modules spilling out of the dedicated module area into vmalloc. 
Help text is copied from the ARM 32-bit counterpart and modified to a mention of KASLR and specific ARM errata workaround(s). Acked-by: Will Deacon Signed-off-by: Florian Fainelli Signed-off-by: Catalin Marinas --- arch/arm64/Kconfig | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 57b02d4ceb71..22f8e6b3b0f9 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1418,8 +1418,23 @@ config ARM64_SVE KVM in the same kernel image. config ARM64_MODULE_PLTS - bool + bool "Use PLTs to allow module memory to spill over into vmalloc area" select HAVE_MOD_ARCH_SPECIFIC + help + Allocate PLTs when loading modules so that jumps and calls whose + targets are too far away for their relative offsets to be encoded + in the instructions themselves can be bounced via veneers in the + module's PLT. This allows modules to be allocated in the generic + vmalloc area after the dedicated module memory area has been + exhausted. + + When running with address space randomization (KASLR), the module + region itself may be too far away for ordinary relative jumps and + calls, and so in that case, module PLTs are required and cannot be + disabled. + + Specific errata workaround(s) might also force module PLTs to be + enabled (ARM64_ERRATUM_843419). config ARM64_PSEUDO_NMI bool "Support for NMI-like interrupts" From 7dfac3c5f40eb92841147eccf1b96f428b10131f Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 23 May 2019 11:22:53 +0100 Subject: [PATCH 38/54] arm64: module: create module allocations without exec permissions Now that the core code manages the executable permissions of code regions of modules explicitly, it is no longer necessary to create the module vmalloc regions with RWX permissions, and we can create them with RW- permissions instead, which is preferred from a security perspective. Acked-by: Will Deacon Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas --- arch/arm64/kernel/module.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c index dd080837e6a9..5b5936b7868c 100644 --- a/arch/arm64/kernel/module.c +++ b/arch/arm64/kernel/module.c @@ -41,7 +41,7 @@ void *module_alloc(unsigned long size) p = __vmalloc_node_range(size, MODULE_ALIGN, module_alloc_base, module_alloc_base + MODULES_VSIZE, - gfp_mask, PAGE_KERNEL_EXEC, 0, + gfp_mask, PAGE_KERNEL, 0, NUMA_NO_NODE, __builtin_return_address(0)); if (!p && IS_ENABLED(CONFIG_ARM64_MODULE_PLTS) && @@ -57,7 +57,7 @@ void *module_alloc(unsigned long size) */ p = __vmalloc_node_range(size, MODULE_ALIGN, module_alloc_base, module_alloc_base + SZ_2G, GFP_KERNEL, - PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE, + PAGE_KERNEL, 0, NUMA_NO_NODE, __builtin_return_address(0)); if (p && (kasan_module_alloc(p, size) < 0)) { From 4739d53fcd1df8a9f6f72bb02a3a1d852ad252b3 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 23 May 2019 11:22:54 +0100 Subject: [PATCH 39/54] arm64/mm: wire up CONFIG_ARCH_HAS_SET_DIRECT_MAP Wire up the special helper functions to manipulate aliases of vmalloc regions in the linear map. 
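For context, the call pattern the core code is expected to follow for
VM_FLUSH_RESET_PERMS mappings looks roughly like the sketch below (function
and variable names here are illustrative, not taken from this patch):

	/*
	 * Reset the linear-map alias of a page whose vmalloc mapping had
	 * execute permissions, without leaving an RW+X window.
	 */
	static void reset_direct_map_alias(struct page *page)
	{
		unsigned long addr = (unsigned long)page_address(page);

		set_direct_map_invalid_noflush(page);		/* clear PTE_VALID */
		flush_tlb_kernel_range(addr, addr + PAGE_SIZE);	/* drop stale TLB entries */
		set_direct_map_default_noflush(page);		/* restore default RW entry */
	}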
Acked-by: Will Deacon Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas --- arch/arm64/Kconfig | 1 + arch/arm64/include/asm/cacheflush.h | 3 ++ arch/arm64/mm/pageattr.c | 48 ++++++++++++++++++++++++----- mm/vmalloc.c | 11 ------- 4 files changed, 44 insertions(+), 19 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 22f8e6b3b0f9..61b4a2d35508 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -26,6 +26,7 @@ config ARM64 select ARCH_HAS_MEMBARRIER_SYNC_CORE select ARCH_HAS_PTE_SPECIAL select ARCH_HAS_SETUP_DMA_OPS + select ARCH_HAS_SET_DIRECT_MAP select ARCH_HAS_SET_MEMORY select ARCH_HAS_STRICT_KERNEL_RWX select ARCH_HAS_STRICT_MODULE_RWX diff --git a/arch/arm64/include/asm/cacheflush.h b/arch/arm64/include/asm/cacheflush.h index 19844211a4e6..b9ee5510067f 100644 --- a/arch/arm64/include/asm/cacheflush.h +++ b/arch/arm64/include/asm/cacheflush.h @@ -187,4 +187,7 @@ static inline void flush_cache_vunmap(unsigned long start, unsigned long end) int set_memory_valid(unsigned long addr, int numpages, int enable); +int set_direct_map_invalid_noflush(struct page *page); +int set_direct_map_default_noflush(struct page *page); + #endif diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c index 6cd645edcf35..9c6b9039ec8f 100644 --- a/arch/arm64/mm/pageattr.c +++ b/arch/arm64/mm/pageattr.c @@ -159,17 +159,48 @@ int set_memory_valid(unsigned long addr, int numpages, int enable) __pgprot(PTE_VALID)); } -#ifdef CONFIG_DEBUG_PAGEALLOC +int set_direct_map_invalid_noflush(struct page *page) +{ + struct page_change_data data = { + .set_mask = __pgprot(0), + .clear_mask = __pgprot(PTE_VALID), + }; + + if (!rodata_full) + return 0; + + return apply_to_page_range(&init_mm, + (unsigned long)page_address(page), + PAGE_SIZE, change_page_range, &data); +} + +int set_direct_map_default_noflush(struct page *page) +{ + struct page_change_data data = { + .set_mask = __pgprot(PTE_VALID | PTE_WRITE), + .clear_mask = __pgprot(PTE_RDONLY), + }; + + if (!rodata_full) + return 0; + + return apply_to_page_range(&init_mm, + (unsigned long)page_address(page), + PAGE_SIZE, change_page_range, &data); +} + void __kernel_map_pages(struct page *page, int numpages, int enable) { + if (!debug_pagealloc_enabled() && !rodata_full) + return; + set_memory_valid((unsigned long)page_address(page), numpages, enable); } -#ifdef CONFIG_HIBERNATION + /* - * When built with CONFIG_DEBUG_PAGEALLOC and CONFIG_HIBERNATION, this function - * is used to determine if a linear map page has been marked as not-valid by - * CONFIG_DEBUG_PAGEALLOC. Walk the page table and check the PTE_VALID bit. - * This is based on kern_addr_valid(), which almost does what we need. + * This function is used to determine if a linear map page has been marked as + * not-valid. Walk the page table and check the PTE_VALID bit. This is based + * on kern_addr_valid(), which almost does what we need. * * Because this is only called on the kernel linear map, p?d_sect() implies * p?d_present(). 
When debug_pagealloc is enabled, sections mappings are @@ -183,6 +214,9 @@ bool kernel_page_present(struct page *page) pte_t *ptep; unsigned long addr = (unsigned long)page_address(page); + if (!debug_pagealloc_enabled() && !rodata_full) + return true; + pgdp = pgd_offset_k(addr); if (pgd_none(READ_ONCE(*pgdp))) return false; @@ -204,5 +238,3 @@ bool kernel_page_present(struct page *page) ptep = pte_offset_kernel(pmdp, addr); return pte_valid(READ_ONCE(*ptep)); } -#endif /* CONFIG_HIBERNATION */ -#endif /* CONFIG_DEBUG_PAGEALLOC */ diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 7350a124524b..6bd7b515995c 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2128,17 +2128,6 @@ static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages) int flush_reset = area->flags & VM_FLUSH_RESET_PERMS; int i; - /* - * The below block can be removed when all architectures that have - * direct map permissions also have set_direct_map_() implementations. - * This is concerned with resetting the direct map any an vm alias with - * execute permissions, without leaving a RW+X window. - */ - if (flush_reset && !IS_ENABLED(CONFIG_ARCH_HAS_SET_DIRECT_MAP)) { - set_memory_nx(addr, area->nr_pages); - set_memory_rw(addr, area->nr_pages); - } - remove_vm_area(area->addr); /* If this is not VM_FLUSH_RESET_PERMS memory, no need for the below. */ From f83b4f8860046e0f5244eef35b25fc3e405d7fee Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 23 May 2019 11:22:55 +0100 Subject: [PATCH 40/54] arm64/kprobes: set VM_FLUSH_RESET_PERMS on kprobe instruction pages In order to avoid transient inconsistencies where freed code pages are remapped writable while stale TLB entries still exist on other cores, mark the kprobes text pages with the VM_FLUSH_RESET_PERMS attribute. This instructs the core vmalloc code not to defer the TLB flush when this region is unmapped and returned to the page allocator. Acked-by: Will Deacon Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas --- arch/arm64/kernel/probes/kprobes.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kernel/probes/kprobes.c b/arch/arm64/kernel/probes/kprobes.c index 88ce502c8e6f..bd5dfffca272 100644 --- a/arch/arm64/kernel/probes/kprobes.c +++ b/arch/arm64/kernel/probes/kprobes.c @@ -122,8 +122,10 @@ void *alloc_insn_page(void) void *page; page = vmalloc_exec(PAGE_SIZE); - if (page) + if (page) { set_memory_ro((unsigned long)page, 1); + set_vm_flush_reset_perms(page); + } return page; } From 3f750706486227f32991092fe57c25e1290691d5 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 23 May 2019 11:22:56 +0100 Subject: [PATCH 41/54] arm64: bpf: do not allocate executable memory The BPF code now takes care of mapping the code pages executable after mapping them read-only, to ensure that no RWX mapped regions are needed, even transiently. This means we can drop the executable permissions from the mapping at allocation time. 
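The resulting sequence on the BPF side is roughly the following (sketch only,
simplified; hdr/size/pages stand in for the real JIT bookkeeping):

	void *hdr = bpf_jit_alloc_exec(size);		/* now RW, non-executable */
	/* ... the JIT emits instructions into hdr ... */
	set_memory_ro((unsigned long)hdr, pages);	/* drop write permission */
	set_memory_x((unsigned long)hdr, pages);	/* grant exec only once read-only */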
Acked-by: Will Deacon Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas --- arch/arm64/net/bpf_jit_comp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index df845cee438e..aef4ff467222 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -981,7 +981,7 @@ void *bpf_jit_alloc_exec(unsigned long size) { return __vmalloc_node_range(size, PAGE_SIZE, BPF_JIT_REGION_START, BPF_JIT_REGION_END, GFP_KERNEL, - PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE, + PAGE_KERNEL, 0, NUMA_NO_NODE, __builtin_return_address(0)); } From faaa73bcec4179ac6ff4493e2e5b8c17001c8779 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Tue, 25 Jun 2019 09:32:11 +0100 Subject: [PATCH 42/54] arm64: ARM64_MODULES_PLTS must depend on MODULES Otherwise, selecting it without MODULES leads to build failures. Fixes: 58557e486f89 ("arm64: Allow user selection of ARM64_MODULE_PLTS") Signed-off-by: Catalin Marinas --- arch/arm64/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 61b4a2d35508..089a834b0ed0 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1420,6 +1420,7 @@ config ARM64_SVE config ARM64_MODULE_PLTS bool "Use PLTs to allow module memory to spill over into vmalloc area" + depends on MODULES select HAVE_MOD_ARCH_SPECIFIC help Allocate PLTs when loading modules so that jumps and calls whose From 8049672bb17a53f2545fbeaa6cfbb48055f51cde Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Mon, 24 Jun 2019 10:58:50 -0700 Subject: [PATCH 43/54] arm64: defconfig: enable CONFIG_RANDOMIZE_BASE For testing coverage and improved defense in depth, enable KASLR by default. Acked-by: Ard Biesheuvel Acked-by: Will Deacon Reviewed-by: Kees Cook Suggested-by: Arnd Bergmann Suggested-by: Olof Johansson Signed-off-by: Nick Desaulniers Signed-off-by: Catalin Marinas --- arch/arm64/configs/defconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig index 4d583514258c..a7cbf7cd84b4 100644 --- a/arch/arm64/configs/defconfig +++ b/arch/arm64/configs/defconfig @@ -68,6 +68,7 @@ CONFIG_KEXEC=y CONFIG_CRASH_DUMP=y CONFIG_XEN=y CONFIG_COMPAT=y +CONFIG_RANDOMIZE_BASE=y CONFIG_HIBERNATION=y CONFIG_WQ_POWER_EFFICIENT_DEFAULT=y CONFIG_ARM_CPUIDLE=y From 1201937491822b61641c1878ebcd16a93aed4540 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 18 Jun 2019 19:10:54 +0100 Subject: [PATCH 44/54] arm64: Expose ARMv8.5 CondM capability to userspace ARMv8.5 adds new instructions XAFLAG and AXFLAG to translate the representation of the results of floating point comparisons between the native ARM format and an alternative format used by some software. Add a hwcap allowing userspace to determine if they are present, since we referred to earlier CondM extensions as FLAGM call these extensions FLAGM2. Signed-off-by: Mark Brown Acked-by: Will Deacon Signed-off-by: Catalin Marinas --- Documentation/arm64/elf_hwcaps.txt | 4 ++++ arch/arm64/include/asm/hwcap.h | 1 + arch/arm64/include/uapi/asm/hwcap.h | 1 + arch/arm64/kernel/cpufeature.c | 1 + arch/arm64/kernel/cpuinfo.c | 1 + 5 files changed, 8 insertions(+) diff --git a/Documentation/arm64/elf_hwcaps.txt b/Documentation/arm64/elf_hwcaps.txt index b73a2519ecf2..ee8dbfe652b6 100644 --- a/Documentation/arm64/elf_hwcaps.txt +++ b/Documentation/arm64/elf_hwcaps.txt @@ -207,6 +207,10 @@ HWCAP_FLAGM Functionality implied by ID_AA64ISAR0_EL1.TS == 0b0001. 
+HWCAP2_FLAGM2 + + Functionality implied by ID_AA64ISAR0_EL1.TS == 0b0010. + HWCAP_SSBS Functionality implied by ID_AA64PFR1_EL1.SSBS == 0b0010. diff --git a/arch/arm64/include/asm/hwcap.h b/arch/arm64/include/asm/hwcap.h index b4bfb6672168..838c47f90389 100644 --- a/arch/arm64/include/asm/hwcap.h +++ b/arch/arm64/include/asm/hwcap.h @@ -95,6 +95,7 @@ #define KERNEL_HWCAP_SVEBITPERM __khwcap2_feature(SVEBITPERM) #define KERNEL_HWCAP_SVESHA3 __khwcap2_feature(SVESHA3) #define KERNEL_HWCAP_SVESM4 __khwcap2_feature(SVESM4) +#define KERNEL_HWCAP_FLAGM2 __khwcap2_feature(FLAGM2) /* * This yields a mask that user programs can use to figure out what diff --git a/arch/arm64/include/uapi/asm/hwcap.h b/arch/arm64/include/uapi/asm/hwcap.h index 1a772b162191..7902ae4f38b4 100644 --- a/arch/arm64/include/uapi/asm/hwcap.h +++ b/arch/arm64/include/uapi/asm/hwcap.h @@ -63,5 +63,6 @@ #define HWCAP2_SVEBITPERM (1 << 4) #define HWCAP2_SVESHA3 (1 << 5) #define HWCAP2_SVESM4 (1 << 6) +#define HWCAP2_FLAGM2 (1 << 7) #endif /* _UAPI__ASM_HWCAP_H */ diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 2a7159fda3ce..8350016dbb28 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -1628,6 +1628,7 @@ static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = { HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_DP_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_ASIMDDP), HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_FHM_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_ASIMDFHM), HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_TS_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_FLAGM), + HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_TS_SHIFT, FTR_UNSIGNED, 2, CAP_HWCAP, KERNEL_HWCAP_FLAGM2), HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_FP_SHIFT, FTR_SIGNED, 0, CAP_HWCAP, KERNEL_HWCAP_FP), HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_FP_SHIFT, FTR_SIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_FPHP), HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_ASIMD_SHIFT, FTR_SIGNED, 0, CAP_HWCAP, KERNEL_HWCAP_ASIMD), diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c index f6f7936be6e7..62102f75dc1e 100644 --- a/arch/arm64/kernel/cpuinfo.c +++ b/arch/arm64/kernel/cpuinfo.c @@ -92,6 +92,7 @@ static const char *const hwcap_str[] = { "svebitperm", "svesha3", "svesm4", + "flagm2", NULL }; From ca9503fc9e9812aa6258e55d44edb03eb30fc46f Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 18 Jun 2019 19:10:55 +0100 Subject: [PATCH 45/54] arm64: Expose FRINT capabilities to userspace ARMv8.5 introduces the FRINT series of instructions for rounding floating point numbers to integers. Provide a capability to userspace in order to allow applications to determine if the system supports these instructions. Signed-off-by: Mark Brown Acked-by: Will Deacon Signed-off-by: Catalin Marinas --- Documentation/arm64/elf_hwcaps.txt | 4 ++++ arch/arm64/include/asm/hwcap.h | 1 + arch/arm64/include/asm/sysreg.h | 1 + arch/arm64/include/uapi/asm/hwcap.h | 1 + arch/arm64/kernel/cpufeature.c | 1 + arch/arm64/kernel/cpuinfo.c | 1 + 6 files changed, 9 insertions(+) diff --git a/Documentation/arm64/elf_hwcaps.txt b/Documentation/arm64/elf_hwcaps.txt index ee8dbfe652b6..5ae2ef2c12f3 100644 --- a/Documentation/arm64/elf_hwcaps.txt +++ b/Documentation/arm64/elf_hwcaps.txt @@ -227,6 +227,10 @@ HWCAP_PACG ID_AA64ISAR1_EL1.GPI == 0b0001, as described by Documentation/arm64/pointer-authentication.txt. +HWCAP2_FRINT + + Functionality implied by ID_AA64ISAR1_EL1.FRINTTS == 0b0001. + 4. 
Unused AT_HWCAP bits ----------------------- diff --git a/arch/arm64/include/asm/hwcap.h b/arch/arm64/include/asm/hwcap.h index 838c47f90389..8371202e0a8b 100644 --- a/arch/arm64/include/asm/hwcap.h +++ b/arch/arm64/include/asm/hwcap.h @@ -96,6 +96,7 @@ #define KERNEL_HWCAP_SVESHA3 __khwcap2_feature(SVESHA3) #define KERNEL_HWCAP_SVESM4 __khwcap2_feature(SVESM4) #define KERNEL_HWCAP_FLAGM2 __khwcap2_feature(FLAGM2) +#define KERNEL_HWCAP_FRINT __khwcap2_feature(FRINT) /* * This yields a mask that user programs can use to figure out what diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 902d75b60914..601972771807 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -560,6 +560,7 @@ /* id_aa64isar1 */ #define ID_AA64ISAR1_SB_SHIFT 36 +#define ID_AA64ISAR1_FRINTTS_SHIFT 32 #define ID_AA64ISAR1_GPI_SHIFT 28 #define ID_AA64ISAR1_GPA_SHIFT 24 #define ID_AA64ISAR1_LRCPC_SHIFT 20 diff --git a/arch/arm64/include/uapi/asm/hwcap.h b/arch/arm64/include/uapi/asm/hwcap.h index 7902ae4f38b4..a1e72886b30c 100644 --- a/arch/arm64/include/uapi/asm/hwcap.h +++ b/arch/arm64/include/uapi/asm/hwcap.h @@ -64,5 +64,6 @@ #define HWCAP2_SVESHA3 (1 << 5) #define HWCAP2_SVESM4 (1 << 6) #define HWCAP2_FLAGM2 (1 << 7) +#define HWCAP2_FRINT (1 << 8) #endif /* _UAPI__ASM_HWCAP_H */ diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 8350016dbb28..a0f00917b8b9 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -1640,6 +1640,7 @@ static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = { HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_FCMA_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_FCMA), HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_LRCPC_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_LRCPC), HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_LRCPC_SHIFT, FTR_UNSIGNED, 2, CAP_HWCAP, KERNEL_HWCAP_ILRCPC), + HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_FRINTTS_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_FRINT), HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_SB_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_SB), HWCAP_CAP(SYS_ID_AA64MMFR2_EL1, ID_AA64MMFR2_AT_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_USCAT), #ifdef CONFIG_ARM64_SVE diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c index 62102f75dc1e..fda8ded8b739 100644 --- a/arch/arm64/kernel/cpuinfo.c +++ b/arch/arm64/kernel/cpuinfo.c @@ -93,6 +93,7 @@ static const char *const hwcap_str[] = { "svesha3", "svesm4", "flagm2", + "frint", NULL }; From dccc9da22dedad203acea355b0e4d946b71172e5 Mon Sep 17 00:00:00 2001 From: Jayachandran C Date: Mon, 17 Jun 2019 23:35:18 +0300 Subject: [PATCH 46/54] arm64: Improve parking of stopped CPUs The current code puts the stopped cpus in an 'yield' instruction loop. Using a busy loop here is unnecessary, we can use the cpu_park_loop() function here to do a wfi/wfe. 
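For reference, cpu_park_loop() parks the CPU in a low-power wait instead of spinning. A sketch of the helper (as found in asm/smp.h around this time, shown only to illustrate why it is preferable to a cpu_relax() busy loop):

	static inline void cpu_park_loop(void)
	{
		for (;;) {
			wfe();	/* wait for event */
			wfi();	/* wait for interrupt */
		}
	}
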
Signed-off-by: Jayachandran C Signed-off-by: Aaro Koskinen Acked-by: Will Deacon Signed-off-by: Catalin Marinas --- arch/arm64/kernel/smp.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index 83cdb0aa2ff1..932461da5dd7 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -850,9 +850,7 @@ static void ipi_cpu_stop(unsigned int cpu) local_daif_mask(); sdei_mask_local_cpu(); - - while (1) - cpu_relax(); + cpu_park_loop(); } #ifdef CONFIG_KEXEC_CORE From d914d4d4974529da898f2d2618e39df757147c2f Mon Sep 17 00:00:00 2001 From: Aaro Koskinen Date: Mon, 17 Jun 2019 23:35:19 +0300 Subject: [PATCH 47/54] arm64: Implement panic_smp_self_stop() Currently arm64 uses the default implementation of panic_smp_self_stop() where the CPU runs in a cpu_relax() loop unable to receive IPIs anymore. As a result, when two CPUs panic() simultaneously we get "SMP: failed to stop secondary CPUs" warnings and extra delays before a reset, because smp_send_stop() still tries to stop the other paniced CPU. Provide an implementation of panic_smp_self_stop() that is identical to the IPI CPU stop handler, so that the online status of stopped CPUs gets properly updated. Acked-by: Will Deacon Signed-off-by: Aaro Koskinen Signed-off-by: Catalin Marinas --- arch/arm64/kernel/smp.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index 932461da5dd7..434d6714358b 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -841,18 +841,25 @@ void arch_irq_work_raise(void) } #endif -/* - * ipi_cpu_stop - handle IPI from smp_send_stop() - */ -static void ipi_cpu_stop(unsigned int cpu) +static void local_cpu_stop(void) { - set_cpu_online(cpu, false); + set_cpu_online(smp_processor_id(), false); local_daif_mask(); sdei_mask_local_cpu(); cpu_park_loop(); } +/* + * We need to implement panic_smp_self_stop() for parallel panic() calls, so + * that cpu_online_mask gets correctly updated and smp_send_stop() can skip + * CPUs that have already stopped themselves. + */ +void panic_smp_self_stop(void) +{ + local_cpu_stop(); +} + #ifdef CONFIG_KEXEC_CORE static atomic_t waiting_for_crash_ipi = ATOMIC_INIT(0); #endif @@ -903,7 +910,7 @@ void handle_IPI(int ipinr, struct pt_regs *regs) case IPI_CPU_STOP: irq_enter(); - ipi_cpu_stop(cpu); + local_cpu_stop(); irq_exit(); break; From d9db691d3cb5abdae59545b6b2e50f7e1cc81609 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Wed, 26 Jun 2019 14:06:10 +0530 Subject: [PATCH 48/54] arm64/mm: Drop [PTE|PMD]_TYPE_FAULT This was added part of the original commit which added MMU definitions. commit 4f04d8f00545 ("arm64: MMU definitions"). These symbols never got used as confirmed from a git log search. git log -p arch/arm64/ | grep PTE_TYPE_FAULT git log -p arch/arm64/ | grep PMD_TYPE_FAULT These probably meant to identify non present entries which can now be achieved with PMD_SECT_VALID or PTE_VALID bits. Hence just drop these unused symbols which are not required anymore. Cc: Will Deacon Cc: Steve Capper Signed-off-by: Anshuman Khandual Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/pgtable-hwdef.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h index 974f0114ef1b..529e83f8e607 100644 --- a/arch/arm64/include/asm/pgtable-hwdef.h +++ b/arch/arm64/include/asm/pgtable-hwdef.h @@ -126,7 +126,6 @@ * Level 2 descriptor (PMD). 
*/ #define PMD_TYPE_MASK (_AT(pmdval_t, 3) << 0) -#define PMD_TYPE_FAULT (_AT(pmdval_t, 0) << 0) #define PMD_TYPE_TABLE (_AT(pmdval_t, 3) << 0) #define PMD_TYPE_SECT (_AT(pmdval_t, 1) << 0) #define PMD_TABLE_BIT (_AT(pmdval_t, 1) << 1) @@ -155,7 +154,6 @@ */ #define PTE_VALID (_AT(pteval_t, 1) << 0) #define PTE_TYPE_MASK (_AT(pteval_t, 3) << 0) -#define PTE_TYPE_FAULT (_AT(pteval_t, 0) << 0) #define PTE_TYPE_PAGE (_AT(pteval_t, 3) << 0) #define PTE_TABLE_BIT (_AT(pteval_t, 1) << 1) #define PTE_USER (_AT(pteval_t, 1) << 6) /* AP[1] */ From 7b71665603bba0fa53e2cd63de9125a0bc3f6e17 Mon Sep 17 00:00:00 2001 From: jinho lim Date: Wed, 26 Jun 2019 20:50:13 +0900 Subject: [PATCH 49/54] arm64: rename dump_instr as dump_kernel_instr In traps.c, only __die calls dump_instr. However, this function has sub-function as __dump_instr. dump_kernel_instr can replace those functions. By using aarch64_insn_read, it does not have to change fs to KERNEL_DS. Acked-by: Will Deacon Signed-off-by: jinho lim Signed-off-by: Catalin Marinas --- arch/arm64/kernel/traps.c | 23 +++++++---------------- 1 file changed, 7 insertions(+), 16 deletions(-) diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index 177c0f6ebabf..8a395523adcf 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -66,16 +66,19 @@ static void dump_backtrace_entry(unsigned long where) printk(" %pS\n", (void *)where); } -static void __dump_instr(const char *lvl, struct pt_regs *regs) +static void dump_kernel_instr(const char *lvl, struct pt_regs *regs) { unsigned long addr = instruction_pointer(regs); char str[sizeof("00000000 ") * 5 + 2 + 1], *p = str; int i; + if (user_mode(regs)) + return; + for (i = -4; i < 1; i++) { unsigned int val, bad; - bad = get_user(val, &((u32 *)addr)[i]); + bad = aarch64_insn_read(&((u32 *)addr)[i], &val); if (!bad) p += sprintf(p, i == 0 ? "(%08x) " : "%08x ", val); @@ -84,19 +87,8 @@ static void __dump_instr(const char *lvl, struct pt_regs *regs) break; } } - printk("%sCode: %s\n", lvl, str); -} -static void dump_instr(const char *lvl, struct pt_regs *regs) -{ - if (!user_mode(regs)) { - mm_segment_t fs = get_fs(); - set_fs(KERNEL_DS); - __dump_instr(lvl, regs); - set_fs(fs); - } else { - __dump_instr(lvl, regs); - } + printk("%sCode: %s\n", lvl, str); } void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) @@ -182,8 +174,7 @@ static int __die(const char *str, int err, struct pt_regs *regs) print_modules(); show_regs(regs); - if (!user_mode(regs)) - dump_instr(KERN_EMERG, regs); + dump_kernel_instr(KERN_EMERG, regs); return ret; } From b07d7d5c7b421462dc91f0b775e31aae49804050 Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Tue, 11 Jun 2019 15:56:27 +0100 Subject: [PATCH 50/54] x86/entry: Simplify _TIF_SYSCALL_EMU handling The usage of emulated and _TIF_SYSCALL_EMU flags in syscall_trace_enter is more complicated than required. 
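The removed 'emulated' local only forced the tracehook call and an unconditional early return, so both checks can be folded into a single test of the flags without changing behaviour. A condensed view of the resulting flow (illustrative only; the actual hunk is below):

	work = READ_ONCE(ti->flags);
	if (work & (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU)) {
		ret = tracehook_report_syscall_entry(regs);
		/* stop here if the tracer says so or the syscall is being emulated */
		if (ret || (work & _TIF_SYSCALL_EMU))
			return -1L;
	}
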
Cc: Andy Lutomirski Cc: Ingo Molnar Cc: Borislav Petkov Acked-by: Oleg Nesterov Reviewed-by: Thomas Gleixner Signed-off-by: Sudeep Holla Signed-off-by: Catalin Marinas --- arch/x86/entry/common.c | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index a986b3c8294c..0a61705d62ec 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -72,23 +72,18 @@ static long syscall_trace_enter(struct pt_regs *regs) struct thread_info *ti = current_thread_info(); unsigned long ret = 0; - bool emulated = false; u32 work; if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) BUG_ON(regs != task_pt_regs(current)); - work = READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY; + work = READ_ONCE(ti->flags); - if (unlikely(work & _TIF_SYSCALL_EMU)) - emulated = true; - - if ((emulated || (work & _TIF_SYSCALL_TRACE)) && - tracehook_report_syscall_entry(regs)) - return -1L; - - if (emulated) - return -1L; + if (work & (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU)) { + ret = tracehook_report_syscall_entry(regs); + if (ret || (work & _TIF_SYSCALL_EMU)) + return -1L; + } #ifdef CONFIG_SECCOMP /* From ed2b664fcc8073c09394393756df3fc86977bbac Mon Sep 17 00:00:00 2001 From: Jeremy Linton Date: Wed, 26 Jun 2019 16:37:15 -0500 Subject: [PATCH 51/54] ACPI/PPTT: Modify node flag detection to find last IDENTICAL The ACPI specification implies that the IDENTICAL flag should be set on all non leaf nodes where the children are identical. This means that we need to be searching for the last node with the identical flag set rather than the first one. Since this flag is also dependent on the table revision, we need to add a bit of extra code to verify the table revision, and the next node's state in the traversal. Since we want to avoid function pointers here, lets just special case the IDENTICAL flag. Acked-by: Rafael J. 
Wysocki Tested-by: Hanjun Guo Reviewed-by: Sudeep Holla Signed-off-by: Jeremy Linton Signed-off-by: Will Deacon --- drivers/acpi/pptt.c | 35 +++++++++++++++++++++++++++++------ 1 file changed, 29 insertions(+), 6 deletions(-) diff --git a/drivers/acpi/pptt.c b/drivers/acpi/pptt.c index b72e6afaa8fb..05344413f199 100644 --- a/drivers/acpi/pptt.c +++ b/drivers/acpi/pptt.c @@ -432,17 +432,40 @@ static void cache_setup_acpi_cpu(struct acpi_table_header *table, } } +static bool flag_identical(struct acpi_table_header *table_hdr, + struct acpi_pptt_processor *cpu) +{ + struct acpi_pptt_processor *next; + + /* heterogeneous machines must use PPTT revision > 1 */ + if (table_hdr->revision < 2) + return false; + + /* Locate the last node in the tree with IDENTICAL set */ + if (cpu->flags & ACPI_PPTT_ACPI_IDENTICAL) { + next = fetch_pptt_node(table_hdr, cpu->parent); + if (!(next && next->flags & ACPI_PPTT_ACPI_IDENTICAL)) + return true; + } + + return false; +} + /* Passing level values greater than this will result in search termination */ #define PPTT_ABORT_PACKAGE 0xFF -static struct acpi_pptt_processor *acpi_find_processor_package_id(struct acpi_table_header *table_hdr, - struct acpi_pptt_processor *cpu, - int level, int flag) +static struct acpi_pptt_processor *acpi_find_processor_tag(struct acpi_table_header *table_hdr, + struct acpi_pptt_processor *cpu, + int level, int flag) { struct acpi_pptt_processor *prev_node; while (cpu && level) { - if (cpu->flags & flag) + /* special case the identical flag to find last identical */ + if (flag == ACPI_PPTT_ACPI_IDENTICAL) { + if (flag_identical(table_hdr, cpu)) + break; + } else if (cpu->flags & flag) break; pr_debug("level %d\n", level); prev_node = fetch_pptt_node(table_hdr, cpu->parent); @@ -480,8 +503,8 @@ static int topology_get_acpi_cpu_tag(struct acpi_table_header *table, cpu_node = acpi_find_processor_node(table, acpi_cpu_id); if (cpu_node) { - cpu_node = acpi_find_processor_package_id(table, cpu_node, - level, flag); + cpu_node = acpi_find_processor_tag(table, cpu_node, + level, flag); /* * As per specification if the processor structure represents * an actual processor, then ACPI processor ID must be valid. From 56855a99f3d0d1e9f1f4e24f5851f9bf14c83296 Mon Sep 17 00:00:00 2001 From: Jeremy Linton Date: Wed, 26 Jun 2019 16:37:16 -0500 Subject: [PATCH 52/54] ACPI/PPTT: Add function to return ACPI 6.3 Identical tokens ACPI 6.3 adds a flag to indicate that child nodes are all identical cores. This is useful to authoritatively determine if a set of (possibly offline) cores are identical or not. Since the flag doesn't give us a unique id we can generate one and use it to create bitmaps of sibling nodes, or simply in a loop to determine if a subset of cores are identical. Acked-by: Rafael J. Wysocki Tested-by: Hanjun Guo Reviewed-by: Sudeep Holla Signed-off-by: Jeremy Linton Signed-off-by: Will Deacon --- drivers/acpi/pptt.c | 26 ++++++++++++++++++++++++++ include/linux/acpi.h | 5 +++++ 2 files changed, 31 insertions(+) diff --git a/drivers/acpi/pptt.c b/drivers/acpi/pptt.c index 05344413f199..1e7ac0bd0d3a 100644 --- a/drivers/acpi/pptt.c +++ b/drivers/acpi/pptt.c @@ -683,3 +683,29 @@ int find_acpi_cpu_topology_package(unsigned int cpu) return find_acpi_cpu_topology_tag(cpu, PPTT_ABORT_PACKAGE, ACPI_PPTT_PHYSICAL_PACKAGE); } + +/** + * find_acpi_cpu_topology_hetero_id() - Get a core architecture tag + * @cpu: Kernel logical CPU number + * + * Determine a unique heterogeneous tag for the given CPU. 
CPUs with the same + * implementation should have matching tags. + * + * The returned tag can be used to group peers with identical implementation. + * + * The search terminates when a level is found with the identical implementation + * flag set or we reach a root node. + * + * Due to limitations in the PPTT data structure, there may be rare situations + * where two cores in a heterogeneous machine may be identical, but won't have + * the same tag. + * + * Return: -ENOENT if the PPTT doesn't exist, or the CPU cannot be found. + * Otherwise returns a value which represents a group of identical cores + * similar to this CPU. + */ +int find_acpi_cpu_topology_hetero_id(unsigned int cpu) +{ + return find_acpi_cpu_topology_tag(cpu, PPTT_ABORT_PACKAGE, + ACPI_PPTT_ACPI_IDENTICAL); +} diff --git a/include/linux/acpi.h b/include/linux/acpi.h index d315d86844e4..5bcd23e5ccd6 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -1303,6 +1303,7 @@ static inline int lpit_read_residency_count_address(u64 *address) #ifdef CONFIG_ACPI_PPTT int find_acpi_cpu_topology(unsigned int cpu, int level); int find_acpi_cpu_topology_package(unsigned int cpu); +int find_acpi_cpu_topology_hetero_id(unsigned int cpu); int find_acpi_cpu_cache_topology(unsigned int cpu, int level); #else static inline int find_acpi_cpu_topology(unsigned int cpu, int level) @@ -1313,6 +1314,10 @@ static inline int find_acpi_cpu_topology_package(unsigned int cpu) { return -EINVAL; } +static inline int find_acpi_cpu_topology_hetero_id(unsigned int cpu) +{ + return -EINVAL; +} static inline int find_acpi_cpu_cache_topology(unsigned int cpu, int level) { return -EINVAL; From d24a0c7099b32b6981d7f126c45348e381718350 Mon Sep 17 00:00:00 2001 From: Jeremy Linton Date: Wed, 26 Jun 2019 16:37:17 -0500 Subject: [PATCH 53/54] arm_pmu: acpi: spe: Add initial MADT/SPE probing ACPI 6.3 adds additional fields to the MADT GICC structure to describe SPE PPI's. We pick these out of the cached reference to the madt_gicc structure similarly to the core PMU code. We then create a platform device referring to the IRQ and let the user/module loader decide whether to load the SPE driver. 
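For illustration (not part of the patch itself): the spe_interrupt field only exists in ACPI 6.3-style GICC entries, so the probe first infers its presence from the entry length and only then reads and sanity-checks the PPI, roughly:

	gicc = acpi_cpu_get_madt_gicc(cpu);
	if (gicc->header.length < ACPI_MADT_GICC_SPE)
		return;			/* firmware predates the SPE field */
	gsi = gicc->spe_interrupt;	/* must be the same PPI on every CPU */
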
Tested-by: Hanjun Guo Reviewed-by: Sudeep Holla Reviewed-by: Lorenzo Pieralisi Signed-off-by: Jeremy Linton Signed-off-by: Will Deacon --- arch/arm64/include/asm/acpi.h | 3 ++ drivers/perf/arm_pmu_acpi.c | 72 +++++++++++++++++++++++++++++++++++ include/linux/perf/arm_pmu.h | 2 + 3 files changed, 77 insertions(+) diff --git a/arch/arm64/include/asm/acpi.h b/arch/arm64/include/asm/acpi.h index 7628efbe6c12..d10399b9f998 100644 --- a/arch/arm64/include/asm/acpi.h +++ b/arch/arm64/include/asm/acpi.h @@ -41,6 +41,9 @@ (!(entry) || (entry)->header.length < ACPI_MADT_GICC_MIN_LENGTH || \ (unsigned long)(entry) + (entry)->header.length > (end)) +#define ACPI_MADT_GICC_SPE (ACPI_OFFSET(struct acpi_madt_generic_interrupt, \ + spe_interrupt) + sizeof(u16)) + /* Basic configuration for ACPI */ #ifdef CONFIG_ACPI pgprot_t __acpi_get_mem_attribute(phys_addr_t addr); diff --git a/drivers/perf/arm_pmu_acpi.c b/drivers/perf/arm_pmu_acpi.c index 0f197516d708..864d7ebe45e9 100644 --- a/drivers/perf/arm_pmu_acpi.c +++ b/drivers/perf/arm_pmu_acpi.c @@ -74,6 +74,76 @@ static void arm_pmu_acpi_unregister_irq(int cpu) acpi_unregister_gsi(gsi); } +#if IS_ENABLED(CONFIG_ARM_SPE_PMU) +static struct resource spe_resources[] = { + { + /* irq */ + .flags = IORESOURCE_IRQ, + } +}; + +static struct platform_device spe_dev = { + .name = ARMV8_SPE_PDEV_NAME, + .id = -1, + .resource = spe_resources, + .num_resources = ARRAY_SIZE(spe_resources) +}; + +/* + * For lack of a better place, hook the normal PMU MADT walk + * and create a SPE device if we detect a recent MADT with + * a homogeneous PPI mapping. + */ +static void arm_spe_acpi_register_device(void) +{ + int cpu, hetid, irq, ret; + bool first = true; + u16 gsi = 0; + + /* + * Sanity check all the GICC tables for the same interrupt number. + * For now, we only support homogeneous ACPI/SPE machines. 
+ */ + for_each_possible_cpu(cpu) { + struct acpi_madt_generic_interrupt *gicc; + + gicc = acpi_cpu_get_madt_gicc(cpu); + if (gicc->header.length < ACPI_MADT_GICC_SPE) + return; + + if (first) { + gsi = gicc->spe_interrupt; + if (!gsi) + return; + hetid = find_acpi_cpu_topology_hetero_id(cpu); + first = false; + } else if ((gsi != gicc->spe_interrupt) || + (hetid != find_acpi_cpu_topology_hetero_id(cpu))) { + pr_warn("ACPI: SPE must be homogeneous\n"); + return; + } + } + + irq = acpi_register_gsi(NULL, gsi, ACPI_LEVEL_SENSITIVE, + ACPI_ACTIVE_HIGH); + if (irq < 0) { + pr_warn("ACPI: SPE Unable to register interrupt: %d\n", gsi); + return; + } + + spe_resources[0].start = irq; + ret = platform_device_register(&spe_dev); + if (ret < 0) { + pr_warn("ACPI: SPE: Unable to register device\n"); + acpi_unregister_gsi(gsi); + } +} +#else +static inline void arm_spe_acpi_register_device(void) +{ +} +#endif /* CONFIG_ARM_SPE_PMU */ + static int arm_pmu_acpi_parse_irqs(void) { int irq, cpu, irq_cpu, err; @@ -279,6 +349,8 @@ static int arm_pmu_acpi_init(void) if (acpi_disabled) return 0; + arm_spe_acpi_register_device(); + ret = arm_pmu_acpi_parse_irqs(); if (ret) return ret; diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h index 4641e850b204..784bc58f165a 100644 --- a/include/linux/perf/arm_pmu.h +++ b/include/linux/perf/arm_pmu.h @@ -175,4 +175,6 @@ void armpmu_free_irq(int irq, int cpu); #endif /* CONFIG_ARM_PMU */ +#define ARMV8_SPE_PDEV_NAME "arm,spe-v1" + #endif /* __ARM_PMU_H__ */ From d482e575fbf0f7ec9319bded951f21bbc84312bf Mon Sep 17 00:00:00 2001 From: Jeremy Linton Date: Wed, 26 Jun 2019 16:37:18 -0500 Subject: [PATCH 54/54] perf: arm_spe: Enable ACPI/Platform automatic module loading Lets add the MODULE_TABLE and platform id_table entries so that the SPE driver can attach to the ACPI platform device created by the core pmu code. Tested-by: Hanjun Guo Reviewed-by: Sudeep Holla Signed-off-by: Jeremy Linton Signed-off-by: Will Deacon --- drivers/perf/arm_spe_pmu.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/perf/arm_spe_pmu.c b/drivers/perf/arm_spe_pmu.c index e120f933412a..4fb65c61c8ea 100644 --- a/drivers/perf/arm_spe_pmu.c +++ b/drivers/perf/arm_spe_pmu.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include #include @@ -1168,7 +1169,13 @@ static const struct of_device_id arm_spe_pmu_of_match[] = { }; MODULE_DEVICE_TABLE(of, arm_spe_pmu_of_match); -static int arm_spe_pmu_device_dt_probe(struct platform_device *pdev) +static const struct platform_device_id arm_spe_match[] = { + { ARMV8_SPE_PDEV_NAME, 0}, + { } +}; +MODULE_DEVICE_TABLE(platform, arm_spe_match); + +static int arm_spe_pmu_device_probe(struct platform_device *pdev) { int ret; struct arm_spe_pmu *spe_pmu; @@ -1228,11 +1235,12 @@ static int arm_spe_pmu_device_remove(struct platform_device *pdev) } static struct platform_driver arm_spe_pmu_driver = { + .id_table = arm_spe_match, .driver = { .name = DRVNAME, .of_match_table = of_match_ptr(arm_spe_pmu_of_match), }, - .probe = arm_spe_pmu_device_dt_probe, + .probe = arm_spe_pmu_device_probe, .remove = arm_spe_pmu_device_remove, };