powerpc/xive: Native exploitation of the XIVE interrupt controller

The XIVE interrupt controller is the new interrupt controller
found in POWER9. It supports advanced virtualization capabilities
among other things.

Currently we use a set of firmware calls that simulate the old
"XICS" interrupt controller but this is fairly inefficient.

This adds the framework for using XIVE along with a native
backend which OPAL for configuration. Later, a backend allowing
the use in a KVM or PowerVM guest will also be provided.

This disables some fast path for interrupts in KVM when XIVE is
enabled as these rely on the firmware emulation code which is no
longer available when the XIVE is used natively by Linux.

A latter patch will make KVM also directly exploit the XIVE, thus
recovering the lost performance (and more).

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
[mpe: Fixup pr_xxx("XIVE:"...), don't split pr_xxx() strings,
 tweak Kconfig so XIVE_NATIVE selects XIVE and depends on POWERNV,
 fix build errors when SMP=n, fold in fixes from Ben:
   Don't call cpu_online() on an invalid CPU number
   Fix irq target selection returning out of bounds cpu#
   Extra sanity checks on cpu numbers
 ]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
This commit is contained in:
Benjamin Herrenschmidt 2017-04-05 17:54:50 +10:00 committed by Michael Ellerman
parent a978e13965
commit 243e25112d
15 changed files with 2427 additions and 12 deletions

View File

@ -0,0 +1,97 @@
/*
* Copyright 2016,2017 IBM Corporation.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#ifndef _ASM_POWERPC_XIVE_REGS_H
#define _ASM_POWERPC_XIVE_REGS_H
/*
* Thread Management (aka "TM") registers
*/
/* TM register offsets */
#define TM_QW0_USER 0x000 /* All rings */
#define TM_QW1_OS 0x010 /* Ring 0..2 */
#define TM_QW2_HV_POOL 0x020 /* Ring 0..1 */
#define TM_QW3_HV_PHYS 0x030 /* Ring 0..1 */
/* Byte offsets inside a QW QW0 QW1 QW2 QW3 */
#define TM_NSR 0x0 /* + + - + */
#define TM_CPPR 0x1 /* - + - + */
#define TM_IPB 0x2 /* - + + + */
#define TM_LSMFB 0x3 /* - + + + */
#define TM_ACK_CNT 0x4 /* - + - - */
#define TM_INC 0x5 /* - + - + */
#define TM_AGE 0x6 /* - + - + */
#define TM_PIPR 0x7 /* - + - + */
#define TM_WORD0 0x0
#define TM_WORD1 0x4
/*
* QW word 2 contains the valid bit at the top and other fields
* depending on the QW.
*/
#define TM_WORD2 0x8
#define TM_QW0W2_VU PPC_BIT32(0)
#define TM_QW0W2_LOGIC_SERV PPC_BITMASK32(1,31) // XX 2,31 ?
#define TM_QW1W2_VO PPC_BIT32(0)
#define TM_QW1W2_OS_CAM PPC_BITMASK32(8,31)
#define TM_QW2W2_VP PPC_BIT32(0)
#define TM_QW2W2_POOL_CAM PPC_BITMASK32(8,31)
#define TM_QW3W2_VT PPC_BIT32(0)
#define TM_QW3W2_LP PPC_BIT32(6)
#define TM_QW3W2_LE PPC_BIT32(7)
#define TM_QW3W2_T PPC_BIT32(31)
/*
* In addition to normal loads to "peek" and writes (only when invalid)
* using 4 and 8 bytes accesses, the above registers support these
* "special" byte operations:
*
* - Byte load from QW0[NSR] - User level NSR (EBB)
* - Byte store to QW0[NSR] - User level NSR (EBB)
* - Byte load/store to QW1[CPPR] and QW3[CPPR] - CPPR access
* - Byte load from QW3[TM_WORD2] - Read VT||00000||LP||LE on thrd 0
* otherwise VT||0000000
* - Byte store to QW3[TM_WORD2] - Set VT bit (and LP/LE if present)
*
* Then we have all these "special" CI ops at these offset that trigger
* all sorts of side effects:
*/
#define TM_SPC_ACK_EBB 0x800 /* Load8 ack EBB to reg*/
#define TM_SPC_ACK_OS_REG 0x810 /* Load16 ack OS irq to reg */
#define TM_SPC_PUSH_USR_CTX 0x808 /* Store32 Push/Validate user context */
#define TM_SPC_PULL_USR_CTX 0x808 /* Load32 Pull/Invalidate user context */
#define TM_SPC_SET_OS_PENDING 0x812 /* Store8 Set OS irq pending bit */
#define TM_SPC_PULL_OS_CTX 0x818 /* Load32/Load64 Pull/Invalidate OS context to reg */
#define TM_SPC_PULL_POOL_CTX 0x828 /* Load32/Load64 Pull/Invalidate Pool context to reg*/
#define TM_SPC_ACK_HV_REG 0x830 /* Load16 ack HV irq to reg */
#define TM_SPC_PULL_USR_CTX_OL 0xc08 /* Store8 Pull/Inval usr ctx to odd line */
#define TM_SPC_ACK_OS_EL 0xc10 /* Store8 ack OS irq to even line */
#define TM_SPC_ACK_HV_POOL_EL 0xc20 /* Store8 ack HV evt pool to even line */
#define TM_SPC_ACK_HV_EL 0xc30 /* Store8 ack HV irq to even line */
/* XXX more... */
/* NSR fields for the various QW ack types */
#define TM_QW0_NSR_EB PPC_BIT8(0)
#define TM_QW1_NSR_EO PPC_BIT8(0)
#define TM_QW3_NSR_HE PPC_BITMASK8(0,1)
#define TM_QW3_NSR_HE_NONE 0
#define TM_QW3_NSR_HE_POOL 1
#define TM_QW3_NSR_HE_PHYS 2
#define TM_QW3_NSR_HE_LSI 3
#define TM_QW3_NSR_I PPC_BIT8(2)
#define TM_QW3_NSR_GRP_LVL PPC_BIT8(3,7)
/* Utilities to manipulate these (originaly from OPAL) */
#define MASK_TO_LSH(m) (__builtin_ffsl(m) - 1)
#define GETFIELD(m, v) (((v) & (m)) >> MASK_TO_LSH(m))
#define SETFIELD(m, v, val) \
(((v) & ~(m)) | ((((typeof(v))(val)) << MASK_TO_LSH(m)) & (m)))
#endif /* _ASM_POWERPC_XIVE_REGS_H */

View File

@ -0,0 +1,163 @@
/*
* Copyright 2016,2017 IBM Corporation.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#ifndef _ASM_POWERPC_XIVE_H
#define _ASM_POWERPC_XIVE_H
#define XIVE_INVALID_VP 0xffffffff
#ifdef CONFIG_PPC_XIVE
/*
* Thread Interrupt Management Area (TIMA)
*
* This is a global MMIO region divided in 4 pages of varying access
* permissions, providing access to per-cpu interrupt management
* functions. It always identifies the CPU doing the access based
* on the PowerBus initiator ID, thus we always access via the
* same offset regardless of where the code is executing
*/
extern void __iomem *xive_tima;
/*
* Offset in the TM area of our current execution level (provided by
* the backend)
*/
extern u32 xive_tima_offset;
/*
* Per-irq data (irq_get_handler_data for normal IRQs), IPIs
* have it stored in the xive_cpu structure. We also cache
* for normal interrupts the current target CPU.
*
* This structure is setup by the backend for each interrupt.
*/
struct xive_irq_data {
u64 flags;
u64 eoi_page;
void __iomem *eoi_mmio;
u64 trig_page;
void __iomem *trig_mmio;
u32 esb_shift;
int src_chip;
/* Setup/used by frontend */
int target;
bool saved_p;
};
#define XIVE_IRQ_FLAG_STORE_EOI 0x01
#define XIVE_IRQ_FLAG_LSI 0x02
#define XIVE_IRQ_FLAG_SHIFT_BUG 0x04
#define XIVE_IRQ_FLAG_MASK_FW 0x08
#define XIVE_IRQ_FLAG_EOI_FW 0x10
#define XIVE_INVALID_CHIP_ID -1
/* A queue tracking structure in a CPU */
struct xive_q {
__be32 *qpage;
u32 msk;
u32 idx;
u32 toggle;
u64 eoi_phys;
u32 esc_irq;
atomic_t count;
atomic_t pending_count;
};
/*
* "magic" Event State Buffer (ESB) MMIO offsets.
*
* Each interrupt source has a 2-bit state machine called ESB
* which can be controlled by MMIO. It's made of 2 bits, P and
* Q. P indicates that an interrupt is pending (has been sent
* to a queue and is waiting for an EOI). Q indicates that the
* interrupt has been triggered while pending.
*
* This acts as a coalescing mechanism in order to guarantee
* that a given interrupt only occurs at most once in a queue.
*
* When doing an EOI, the Q bit will indicate if the interrupt
* needs to be re-triggered.
*
* The following offsets into the ESB MMIO allow to read or
* manipulate the PQ bits. They must be used with an 8-bytes
* load instruction. They all return the previous state of the
* interrupt (atomically).
*
* Additionally, some ESB pages support doing an EOI via a
* store at 0 and some ESBs support doing a trigger via a
* separate trigger page.
*/
#define XIVE_ESB_GET 0x800
#define XIVE_ESB_SET_PQ_00 0xc00
#define XIVE_ESB_SET_PQ_01 0xd00
#define XIVE_ESB_SET_PQ_10 0xe00
#define XIVE_ESB_SET_PQ_11 0xf00
#define XIVE_ESB_MASK XIVE_ESB_SET_PQ_01
#define XIVE_ESB_VAL_P 0x2
#define XIVE_ESB_VAL_Q 0x1
/* Global enable flags for the XIVE support */
extern bool __xive_enabled;
static inline bool xive_enabled(void) { return __xive_enabled; }
extern bool xive_native_init(void);
extern void xive_smp_probe(void);
extern int xive_smp_prepare_cpu(unsigned int cpu);
extern void xive_smp_setup_cpu(void);
extern void xive_smp_disable_cpu(void);
extern void xive_kexec_teardown_cpu(int secondary);
extern void xive_shutdown(void);
extern void xive_flush_interrupt(void);
/* xmon hook */
extern void xmon_xive_do_dump(int cpu);
/* APIs used by KVM */
extern u32 xive_native_default_eq_shift(void);
extern u32 xive_native_alloc_vp_block(u32 max_vcpus);
extern void xive_native_free_vp_block(u32 vp_base);
extern int xive_native_populate_irq_data(u32 hw_irq,
struct xive_irq_data *data);
extern void xive_cleanup_irq_data(struct xive_irq_data *xd);
extern u32 xive_native_alloc_irq(void);
extern void xive_native_free_irq(u32 irq);
extern int xive_native_configure_irq(u32 hw_irq, u32 target, u8 prio, u32 sw_irq);
extern int xive_native_configure_queue(u32 vp_id, struct xive_q *q, u8 prio,
__be32 *qpage, u32 order, bool can_escalate);
extern void xive_native_disable_queue(u32 vp_id, struct xive_q *q, u8 prio);
extern bool __xive_irq_trigger(struct xive_irq_data *xd);
extern bool __xive_irq_retrigger(struct xive_irq_data *xd);
extern void xive_do_source_eoi(u32 hw_irq, struct xive_irq_data *xd);
extern bool is_xive_irq(struct irq_chip *chip);
#else
static inline bool xive_enabled(void) { return false; }
static inline bool xive_native_init(void) { return false; }
static inline void xive_smp_probe(void) { }
extern inline int xive_smp_prepare_cpu(unsigned int cpu) { return -EINVAL; }
static inline void xive_smp_setup_cpu(void) { }
static inline void xive_smp_disable_cpu(void) { }
static inline void xive_kexec_teardown_cpu(int secondary) { }
static inline void xive_shutdown(void) { }
static inline void xive_flush_interrupt(void) { }
static inline u32 xive_native_alloc_vp_block(u32 max_vcpus) { return XIVE_INVALID_VP; }
static inline void xive_native_free_vp_block(u32 vp_base) { }
#endif
#endif /* _ASM_POWERPC_XIVE_H */

View File

@ -29,5 +29,7 @@ static inline void xmon_register_spus(struct list_head *list) { };
extern int cpus_are_in_xmon(void);
#endif
extern void xmon_printf(const char *format, ...);
#endif /* __KERNEL __ */
#endif /* __ASM_POWERPC_XMON_H */

View File

@ -23,6 +23,7 @@
#include <asm/kvm_book3s.h>
#include <asm/archrandom.h>
#include <asm/xics.h>
#include <asm/xive.h>
#include <asm/dbell.h>
#include <asm/cputhreads.h>
#include <asm/io.h>
@ -224,6 +225,10 @@ void kvmhv_rm_send_ipi(int cpu)
return;
}
/* We should never reach this */
if (WARN_ON_ONCE(xive_enabled()))
return;
/* Else poke the target with an IPI */
xics_phys = paca[cpu].kvm_hstate.xics_phys;
if (xics_phys)
@ -386,6 +391,9 @@ long kvmppc_read_intr(void)
long rc;
bool again;
if (xive_enabled())
return 1;
do {
again = false;
rc = kvmppc_read_one_intr(&again);

View File

@ -4,6 +4,7 @@ config PPC_POWERNV
select PPC_NATIVE
select PPC_XICS
select PPC_ICP_NATIVE
select PPC_XIVE_NATIVE
select PPC_P7_NAP
select PCI
select PCI_MSI

View File

@ -32,6 +32,7 @@
#include <asm/machdep.h>
#include <asm/firmware.h>
#include <asm/xics.h>
#include <asm/xive.h>
#include <asm/opal.h>
#include <asm/kexec.h>
#include <asm/smp.h>
@ -76,7 +77,9 @@ static void __init pnv_init(void)
static void __init pnv_init_IRQ(void)
{
xics_init();
/* Try using a XIVE if available, otherwise use a XICS */
if (!xive_native_init())
xics_init();
WARN_ON(!ppc_md.get_irq);
}
@ -218,10 +221,12 @@ static void pnv_kexec_wait_secondaries_down(void)
static void pnv_kexec_cpu_down(int crash_shutdown, int secondary)
{
xics_kexec_teardown_cpu(secondary);
if (xive_enabled())
xive_kexec_teardown_cpu(secondary);
else
xics_kexec_teardown_cpu(secondary);
/* On OPAL, we return all CPUs to firmware */
if (!firmware_has_feature(FW_FEATURE_OPAL))
return;
@ -237,6 +242,10 @@ static void pnv_kexec_cpu_down(int crash_shutdown, int secondary)
/* Primary waits for the secondaries to have reached OPAL */
pnv_kexec_wait_secondaries_down();
/* Switch XIVE back to emulation mode */
if (xive_enabled())
xive_shutdown();
/*
* We might be running as little-endian - now that interrupts
* are disabled, reset the HILE bit to big-endian so we don't

View File

@ -29,6 +29,7 @@
#include <asm/vdso_datapage.h>
#include <asm/cputhreads.h>
#include <asm/xics.h>
#include <asm/xive.h>
#include <asm/opal.h>
#include <asm/runlatch.h>
#include <asm/code-patching.h>
@ -47,7 +48,9 @@
static void pnv_smp_setup_cpu(int cpu)
{
if (cpu != boot_cpuid)
if (xive_enabled())
xive_smp_setup_cpu();
else if (cpu != boot_cpuid)
xics_setup_cpu();
#ifdef CONFIG_PPC_DOORBELL
@ -132,7 +135,10 @@ static int pnv_smp_cpu_disable(void)
vdso_data->processorCount--;
if (cpu == boot_cpuid)
boot_cpuid = cpumask_any(cpu_online_mask);
xics_migrate_irqs_away();
if (xive_enabled())
xive_smp_disable_cpu();
else
xics_migrate_irqs_away();
return 0;
}
@ -213,9 +219,12 @@ static void pnv_smp_cpu_kill_self(void)
if (((srr1 & wmask) == SRR1_WAKEEE) ||
((srr1 & wmask) == SRR1_WAKEHVI) ||
(local_paca->irq_happened & PACA_IRQ_EE)) {
if (cpu_has_feature(CPU_FTR_ARCH_300))
icp_opal_flush_interrupt();
else
if (cpu_has_feature(CPU_FTR_ARCH_300)) {
if (xive_enabled())
xive_flush_interrupt();
else
icp_opal_flush_interrupt();
} else
icp_native_flush_interrupt();
} else if ((srr1 & wmask) == SRR1_WAKEHDBELL) {
unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER);
@ -252,10 +261,26 @@ static int pnv_cpu_bootable(unsigned int nr)
return smp_generic_cpu_bootable(nr);
}
static int pnv_smp_prepare_cpu(int cpu)
{
if (xive_enabled())
return xive_smp_prepare_cpu(cpu);
return 0;
}
static void __init pnv_smp_probe(void)
{
if (xive_enabled())
xive_smp_probe();
else
xics_smp_probe();
}
static struct smp_ops_t pnv_smp_ops = {
.message_pass = smp_muxed_ipi_message_pass,
.cause_ipi = NULL, /* Filled at runtime by xics_smp_probe() */
.probe = xics_smp_probe,
.cause_ipi = NULL, /* Filled at runtime by xi{cs,ve}_smp_probe() */
.probe = pnv_smp_probe,
.prepare_cpu = pnv_smp_prepare_cpu,
.kick_cpu = pnv_smp_kick_cpu,
.setup_cpu = pnv_smp_setup_cpu,
.cpu_bootable = pnv_cpu_bootable,

View File

@ -28,6 +28,7 @@ config PPC_MSI_BITMAP
default y if PPC_POWERNV
source "arch/powerpc/sysdev/xics/Kconfig"
source "arch/powerpc/sysdev/xive/Kconfig"
config PPC_SCOM
bool

View File

@ -71,5 +71,6 @@ obj-$(CONFIG_PPC_EARLY_DEBUG_MEMCONS) += udbg_memcons.o
subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror
obj-$(CONFIG_PPC_XICS) += xics/
obj-$(CONFIG_PPC_XIVE) += xive/
obj-$(CONFIG_GE_FPGA) += ge/

View File

@ -0,0 +1,11 @@
config PPC_XIVE
bool
default n
select PPC_SMP_MUXED_IPI
select HARDIRQS_SW_RESEND
config PPC_XIVE_NATIVE
bool
default n
select PPC_XIVE
depends on PPC_POWERNV

View File

@ -0,0 +1,4 @@
subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror
obj-y += common.o
obj-$(CONFIG_PPC_XIVE_NATIVE) += native.o

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,639 @@
/*
* Copyright 2016,2017 IBM Corporation.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#define pr_fmt(fmt) "xive: " fmt
#include <linux/types.h>
#include <linux/irq.h>
#include <linux/debugfs.h>
#include <linux/smp.h>
#include <linux/interrupt.h>
#include <linux/seq_file.h>
#include <linux/init.h>
#include <linux/of.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/delay.h>
#include <linux/cpumask.h>
#include <linux/mm.h>
#include <asm/prom.h>
#include <asm/io.h>
#include <asm/smp.h>
#include <asm/irq.h>
#include <asm/errno.h>
#include <asm/xive.h>
#include <asm/xive-regs.h>
#include <asm/opal.h>
#include "xive-internal.h"
static u32 xive_provision_size;
static u32 *xive_provision_chips;
static u32 xive_provision_chip_count;
static u32 xive_queue_shift;
static u32 xive_pool_vps = XIVE_INVALID_VP;
static struct kmem_cache *xive_provision_cache;
int xive_native_populate_irq_data(u32 hw_irq, struct xive_irq_data *data)
{
__be64 flags, eoi_page, trig_page;
__be32 esb_shift, src_chip;
u64 opal_flags;
s64 rc;
memset(data, 0, sizeof(*data));
rc = opal_xive_get_irq_info(hw_irq, &flags, &eoi_page, &trig_page,
&esb_shift, &src_chip);
if (rc) {
pr_err("opal_xive_get_irq_info(0x%x) returned %lld\n",
hw_irq, rc);
return -EINVAL;
}
opal_flags = be64_to_cpu(flags);
if (opal_flags & OPAL_XIVE_IRQ_STORE_EOI)
data->flags |= XIVE_IRQ_FLAG_STORE_EOI;
if (opal_flags & OPAL_XIVE_IRQ_LSI)
data->flags |= XIVE_IRQ_FLAG_LSI;
if (opal_flags & OPAL_XIVE_IRQ_SHIFT_BUG)
data->flags |= XIVE_IRQ_FLAG_SHIFT_BUG;
if (opal_flags & OPAL_XIVE_IRQ_MASK_VIA_FW)
data->flags |= XIVE_IRQ_FLAG_MASK_FW;
if (opal_flags & OPAL_XIVE_IRQ_EOI_VIA_FW)
data->flags |= XIVE_IRQ_FLAG_EOI_FW;
data->eoi_page = be64_to_cpu(eoi_page);
data->trig_page = be64_to_cpu(trig_page);
data->esb_shift = be32_to_cpu(esb_shift);
data->src_chip = be32_to_cpu(src_chip);
data->eoi_mmio = ioremap(data->eoi_page, 1u << data->esb_shift);
if (!data->eoi_mmio) {
pr_err("Failed to map EOI page for irq 0x%x\n", hw_irq);
return -ENOMEM;
}
if (!data->trig_page)
return 0;
if (data->trig_page == data->eoi_page) {
data->trig_mmio = data->eoi_mmio;
return 0;
}
data->trig_mmio = ioremap(data->trig_page, 1u << data->esb_shift);
if (!data->trig_mmio) {
pr_err("Failed to map trigger page for irq 0x%x\n", hw_irq);
return -ENOMEM;
}
return 0;
}
int xive_native_configure_irq(u32 hw_irq, u32 target, u8 prio, u32 sw_irq)
{
s64 rc;
for (;;) {
rc = opal_xive_set_irq_config(hw_irq, target, prio, sw_irq);
if (rc != OPAL_BUSY)
break;
msleep(1);
}
return rc == 0 ? 0 : -ENXIO;
}
/* This can be called multiple time to change a queue configuration */
int xive_native_configure_queue(u32 vp_id, struct xive_q *q, u8 prio,
__be32 *qpage, u32 order, bool can_escalate)
{
s64 rc = 0;
__be64 qeoi_page_be;
__be32 esc_irq_be;
u64 flags, qpage_phys;
/* If there's an actual queue page, clean it */
if (order) {
if (WARN_ON(!qpage))
return -EINVAL;
qpage_phys = __pa(qpage);
} else
qpage_phys = 0;
/* Initialize the rest of the fields */
q->msk = order ? ((1u << (order - 2)) - 1) : 0;
q->idx = 0;
q->toggle = 0;
rc = opal_xive_get_queue_info(vp_id, prio, NULL, NULL,
&qeoi_page_be,
&esc_irq_be,
NULL);
if (rc) {
pr_err("Error %lld getting queue info prio %d\n", rc, prio);
rc = -EIO;
goto fail;
}
q->eoi_phys = be64_to_cpu(qeoi_page_be);
/* Default flags */
flags = OPAL_XIVE_EQ_ALWAYS_NOTIFY | OPAL_XIVE_EQ_ENABLED;
/* Escalation needed ? */
if (can_escalate) {
q->esc_irq = be32_to_cpu(esc_irq_be);
flags |= OPAL_XIVE_EQ_ESCALATE;
}
/* Configure and enable the queue in HW */
for (;;) {
rc = opal_xive_set_queue_info(vp_id, prio, qpage_phys, order, flags);
if (rc != OPAL_BUSY)
break;
msleep(1);
}
if (rc) {
pr_err("Error %lld setting queue for prio %d\n", rc, prio);
rc = -EIO;
} else {
/*
* KVM code requires all of the above to be visible before
* q->qpage is set due to how it manages IPI EOIs
*/
wmb();
q->qpage = qpage;
}
fail:
return rc;
}
static void __xive_native_disable_queue(u32 vp_id, struct xive_q *q, u8 prio)
{
s64 rc;
/* Disable the queue in HW */
for (;;) {
rc = opal_xive_set_queue_info(vp_id, prio, 0, 0, 0);
break;
msleep(1);
}
if (rc)
pr_err("Error %lld disabling queue for prio %d\n", rc, prio);
}
void xive_native_disable_queue(u32 vp_id, struct xive_q *q, u8 prio)
{
__xive_native_disable_queue(vp_id, q, prio);
}
static int xive_native_setup_queue(unsigned int cpu, struct xive_cpu *xc, u8 prio)
{
struct xive_q *q = &xc->queue[prio];
unsigned int alloc_order;
struct page *pages;
__be32 *qpage;
alloc_order = (xive_queue_shift > PAGE_SHIFT) ?
(xive_queue_shift - PAGE_SHIFT) : 0;
pages = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL, alloc_order);
if (!pages)
return -ENOMEM;
qpage = (__be32 *)page_address(pages);
memset(qpage, 0, 1 << xive_queue_shift);
return xive_native_configure_queue(get_hard_smp_processor_id(cpu),
q, prio, qpage, xive_queue_shift, false);
}
static void xive_native_cleanup_queue(unsigned int cpu, struct xive_cpu *xc, u8 prio)
{
struct xive_q *q = &xc->queue[prio];
unsigned int alloc_order;
/*
* We use the variant with no iounmap as this is called on exec
* from an IPI and iounmap isn't safe
*/
__xive_native_disable_queue(get_hard_smp_processor_id(cpu), q, prio);
alloc_order = (xive_queue_shift > PAGE_SHIFT) ?
(xive_queue_shift - PAGE_SHIFT) : 0;
free_pages((unsigned long)q->qpage, alloc_order);
q->qpage = NULL;
}
static bool xive_native_match(struct device_node *node)
{
return of_device_is_compatible(node, "ibm,opal-xive-vc");
}
#ifdef CONFIG_SMP
static int xive_native_get_ipi(unsigned int cpu, struct xive_cpu *xc)
{
struct device_node *np;
unsigned int chip_id;
s64 irq;
/* Find the chip ID */
np = of_get_cpu_node(cpu, NULL);
if (np) {
if (of_property_read_u32(np, "ibm,chip-id", &chip_id) < 0)
chip_id = 0;
}
/* Allocate an IPI and populate info about it */
for (;;) {
irq = opal_xive_allocate_irq(chip_id);
if (irq == OPAL_BUSY) {
msleep(1);
continue;
}
if (irq < 0) {
pr_err("Failed to allocate IPI on CPU %d\n", cpu);
return -ENXIO;
}
xc->hw_ipi = irq;
break;
}
return 0;
}
u32 xive_native_alloc_irq(void)
{
s64 rc;
for (;;) {
rc = opal_xive_allocate_irq(OPAL_XIVE_ANY_CHIP);
if (rc != OPAL_BUSY)
break;
msleep(1);
}
if (rc < 0)
return 0;
return rc;
}
void xive_native_free_irq(u32 irq)
{
for (;;) {
s64 rc = opal_xive_free_irq(irq);
if (rc != OPAL_BUSY)
break;
msleep(1);
}
}
static void xive_native_put_ipi(unsigned int cpu, struct xive_cpu *xc)
{
s64 rc;
/* Free the IPI */
if (!xc->hw_ipi)
return;
for (;;) {
rc = opal_xive_free_irq(xc->hw_ipi);
if (rc == OPAL_BUSY) {
msleep(1);
continue;
}
xc->hw_ipi = 0;
break;
}
}
#endif /* CONFIG_SMP */
static void xive_native_shutdown(void)
{
/* Switch the XIVE to emulation mode */
opal_xive_reset(OPAL_XIVE_MODE_EMU);
}
/*
* Perform an "ack" cycle on the current thread, thus
* grabbing the pending active priorities and updating
* the CPPR to the most favored one.
*/
static void xive_native_update_pending(struct xive_cpu *xc)
{
u8 he, cppr;
u16 ack;
/* Perform the acknowledge hypervisor to register cycle */
ack = be16_to_cpu(__raw_readw(xive_tima + TM_SPC_ACK_HV_REG));
/* Synchronize subsequent queue accesses */
mb();
/*
* Grab the CPPR and the "HE" field which indicates the source
* of the hypervisor interrupt (if any)
*/
cppr = ack & 0xff;
he = GETFIELD(TM_QW3_NSR_HE, (ack >> 8));
switch(he) {
case TM_QW3_NSR_HE_NONE: /* Nothing to see here */
break;
case TM_QW3_NSR_HE_PHYS: /* Physical thread interrupt */
if (cppr == 0xff)
return;
/* Mark the priority pending */
xc->pending_prio |= 1 << cppr;
/*
* A new interrupt should never have a CPPR less favored
* than our current one.
*/
if (cppr >= xc->cppr)
pr_err("CPU %d odd ack CPPR, got %d at %d\n",
smp_processor_id(), cppr, xc->cppr);
/* Update our idea of what the CPPR is */
xc->cppr = cppr;
break;
case TM_QW3_NSR_HE_POOL: /* HV Pool interrupt (unused) */
case TM_QW3_NSR_HE_LSI: /* Legacy FW LSI (unused) */
pr_err("CPU %d got unexpected interrupt type HE=%d\n",
smp_processor_id(), he);
return;
}
}
static void xive_native_eoi(u32 hw_irq)
{
/*
* Not normally used except if specific interrupts need
* a workaround on EOI.
*/
opal_int_eoi(hw_irq);
}
static void xive_native_setup_cpu(unsigned int cpu, struct xive_cpu *xc)
{
s64 rc;
u32 vp;
__be64 vp_cam_be;
u64 vp_cam;
if (xive_pool_vps == XIVE_INVALID_VP)
return;
/* Enable the pool VP */
vp = xive_pool_vps + get_hard_smp_processor_id(cpu);
pr_debug("CPU %d setting up pool VP 0x%x\n", cpu, vp);
for (;;) {
rc = opal_xive_set_vp_info(vp, OPAL_XIVE_VP_ENABLED, 0);
if (rc != OPAL_BUSY)
break;
msleep(1);
}
if (rc) {
pr_err("Failed to enable pool VP on CPU %d\n", cpu);
return;
}
/* Grab it's CAM value */
rc = opal_xive_get_vp_info(vp, NULL, &vp_cam_be, NULL, NULL);
if (rc) {
pr_err("Failed to get pool VP info CPU %d\n", cpu);
return;
}
vp_cam = be64_to_cpu(vp_cam_be);
pr_debug("VP CAM = %llx\n", vp_cam);
/* Push it on the CPU (set LSMFB to 0xff to skip backlog scan) */
pr_debug("(Old HW value: %08x)\n",
in_be32(xive_tima + TM_QW2_HV_POOL + TM_WORD2));
out_be32(xive_tima + TM_QW2_HV_POOL + TM_WORD0, 0xff);
out_be32(xive_tima + TM_QW2_HV_POOL + TM_WORD2,
TM_QW2W2_VP | vp_cam);
pr_debug("(New HW value: %08x)\n",
in_be32(xive_tima + TM_QW2_HV_POOL + TM_WORD2));
}
static void xive_native_teardown_cpu(unsigned int cpu, struct xive_cpu *xc)
{
s64 rc;
u32 vp;
if (xive_pool_vps == XIVE_INVALID_VP)
return;
/* Pull the pool VP from the CPU */
in_be64(xive_tima + TM_SPC_PULL_POOL_CTX);
/* Disable it */
vp = xive_pool_vps + get_hard_smp_processor_id(cpu);
for (;;) {
rc = opal_xive_set_vp_info(vp, 0, 0);
if (rc != OPAL_BUSY)
break;
msleep(1);
}
}
static void xive_native_sync_source(u32 hw_irq)
{
opal_xive_sync(XIVE_SYNC_EAS, hw_irq);
}
static const struct xive_ops xive_native_ops = {
.populate_irq_data = xive_native_populate_irq_data,
.configure_irq = xive_native_configure_irq,
.setup_queue = xive_native_setup_queue,
.cleanup_queue = xive_native_cleanup_queue,
.match = xive_native_match,
.shutdown = xive_native_shutdown,
.update_pending = xive_native_update_pending,
.eoi = xive_native_eoi,
.setup_cpu = xive_native_setup_cpu,
.teardown_cpu = xive_native_teardown_cpu,
.sync_source = xive_native_sync_source,
#ifdef CONFIG_SMP
.get_ipi = xive_native_get_ipi,
.put_ipi = xive_native_put_ipi,
#endif /* CONFIG_SMP */
.name = "native",
};
static bool xive_parse_provisioning(struct device_node *np)
{
int rc;
if (of_property_read_u32(np, "ibm,xive-provision-page-size",
&xive_provision_size) < 0)
return true;
rc = of_property_count_elems_of_size(np, "ibm,xive-provision-chips", 4);
if (rc < 0) {
pr_err("Error %d getting provision chips array\n", rc);
return false;
}
xive_provision_chip_count = rc;
if (rc == 0)
return true;
xive_provision_chips = kzalloc(4 * xive_provision_chip_count,
GFP_KERNEL);
if (WARN_ON(!xive_provision_chips))
return false;
rc = of_property_read_u32_array(np, "ibm,xive-provision-chips",
xive_provision_chips,
xive_provision_chip_count);
if (rc < 0) {
pr_err("Error %d reading provision chips array\n", rc);
return false;
}
xive_provision_cache = kmem_cache_create("xive-provision",
xive_provision_size,
xive_provision_size,
0, NULL);
if (!xive_provision_cache) {
pr_err("Failed to allocate provision cache\n");
return false;
}
return true;
}
u32 xive_native_default_eq_shift(void)
{
return xive_queue_shift;
}
bool xive_native_init(void)
{
struct device_node *np;
struct resource r;
void __iomem *tima;
struct property *prop;
u8 max_prio = 7;
const __be32 *p;
u32 val;
s64 rc;
if (xive_cmdline_disabled)
return false;
pr_devel("xive_native_init()\n");
np = of_find_compatible_node(NULL, NULL, "ibm,opal-xive-pe");
if (!np) {
pr_devel("not found !\n");
return false;
}
pr_devel("Found %s\n", np->full_name);
/* Resource 1 is HV window */
if (of_address_to_resource(np, 1, &r)) {
pr_err("Failed to get thread mgmnt area resource\n");
return false;
}
tima = ioremap(r.start, resource_size(&r));
if (!tima) {
pr_err("Failed to map thread mgmnt area\n");
return false;
}
/* Read number of priorities */
if (of_property_read_u32(np, "ibm,xive-#priorities", &val) == 0)
max_prio = val - 1;
/* Iterate the EQ sizes and pick one */
of_property_for_each_u32(np, "ibm,xive-eq-sizes", prop, p, val) {
xive_queue_shift = val;
if (val == PAGE_SHIFT)
break;
}
/* Grab size of provisioning pages */
xive_parse_provisioning(np);
/* Switch the XIVE to exploitation mode */
rc = opal_xive_reset(OPAL_XIVE_MODE_EXPL);
if (rc) {
pr_err("Switch to exploitation mode failed with error %lld\n", rc);
return false;
}
/* Initialize XIVE core with our backend */
if (!xive_core_init(&xive_native_ops, tima, TM_QW3_HV_PHYS,
max_prio)) {
opal_xive_reset(OPAL_XIVE_MODE_EMU);
return false;
}
pr_info("Using %dkB queues\n", 1 << (xive_queue_shift - 10));
return true;
}
static bool xive_native_provision_pages(void)
{
u32 i;
void *p;
for (i = 0; i < xive_provision_chip_count; i++) {
u32 chip = xive_provision_chips[i];
/*
* XXX TODO: Try to make the allocation local to the node where
* the chip resides.
*/
p = kmem_cache_alloc(xive_provision_cache, GFP_KERNEL);
if (!p) {
pr_err("Failed to allocate provisioning page\n");
return false;
}
opal_xive_donate_page(chip, __pa(p));
}
return true;
}
u32 xive_native_alloc_vp_block(u32 max_vcpus)
{
s64 rc;
u32 order;
order = fls(max_vcpus) - 1;
if (max_vcpus > (1 << order))
order++;
pr_info("VP block alloc, for max VCPUs %d use order %d\n",
max_vcpus, order);
for (;;) {
rc = opal_xive_alloc_vp_block(order);
switch (rc) {
case OPAL_BUSY:
msleep(1);
break;
case OPAL_XIVE_PROVISIONING:
if (!xive_native_provision_pages())
return XIVE_INVALID_VP;
break;
default:
if (rc < 0) {
pr_err("OPAL failed to allocate VCPUs order %d, err %lld\n",
order, rc);
return XIVE_INVALID_VP;
}
return rc;
}
}
}
EXPORT_SYMBOL_GPL(xive_native_alloc_vp_block);
void xive_native_free_vp_block(u32 vp_base)
{
s64 rc;
if (vp_base == XIVE_INVALID_VP)
return;
rc = opal_xive_free_vp_block(vp_base);
if (rc < 0)
pr_warn("OPAL error %lld freeing VP block\n", rc);
}
EXPORT_SYMBOL_GPL(xive_native_free_vp_block);

View File

@ -0,0 +1,62 @@
/*
* Copyright 2016,2017 IBM Corporation.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#ifndef __XIVE_INTERNAL_H
#define __XIVE_INTERNAL_H
/* Each CPU carry one of these with various per-CPU state */
struct xive_cpu {
#ifdef CONFIG_SMP
/* HW irq number and data of IPI */
u32 hw_ipi;
struct xive_irq_data ipi_data;
#endif /* CONFIG_SMP */
int chip_id;
/* Queue datas. Only one is populated */
#define XIVE_MAX_QUEUES 8
struct xive_q queue[XIVE_MAX_QUEUES];
/*
* Pending mask. Each bit corresponds to a priority that
* potentially has pending interrupts.
*/
u8 pending_prio;
/* Cache of HW CPPR */
u8 cppr;
};
/* Backend ops */
struct xive_ops {
int (*populate_irq_data)(u32 hw_irq, struct xive_irq_data *data);
int (*configure_irq)(u32 hw_irq, u32 target, u8 prio, u32 sw_irq);
int (*setup_queue)(unsigned int cpu, struct xive_cpu *xc, u8 prio);
void (*cleanup_queue)(unsigned int cpu, struct xive_cpu *xc, u8 prio);
void (*setup_cpu)(unsigned int cpu, struct xive_cpu *xc);
void (*teardown_cpu)(unsigned int cpu, struct xive_cpu *xc);
bool (*match)(struct device_node *np);
void (*shutdown)(void);
void (*update_pending)(struct xive_cpu *xc);
void (*eoi)(u32 hw_irq);
void (*sync_source)(u32 hw_irq);
#ifdef CONFIG_SMP
int (*get_ipi)(unsigned int cpu, struct xive_cpu *xc);
void (*put_ipi)(unsigned int cpu, struct xive_cpu *xc);
#endif
const char *name;
};
bool xive_core_init(const struct xive_ops *ops, void __iomem *area, u32 offset,
u8 max_prio);
extern bool xive_cmdline_disabled;
#endif /* __XIVE_INTERNAL_H */

View File

@ -30,6 +30,7 @@
#include <linux/ctype.h>
#include <asm/ptrace.h>
#include <asm/smp.h>
#include <asm/string.h>
#include <asm/prom.h>
#include <asm/machdep.h>
@ -48,7 +49,7 @@
#include <asm/reg.h>
#include <asm/debug.h>
#include <asm/hw_breakpoint.h>
#include <asm/xive.h>
#include <asm/opal.h>
#include <asm/firmware.h>
@ -232,7 +233,13 @@ Commands:\n\
"\
dr dump stream of raw bytes\n\
dt dump the tracing buffers (uses printk)\n\
e print exception information\n\
"
#ifdef CONFIG_PPC_POWERNV
" dx# dump xive on CPU #\n\
dxi# dump xive irq state #\n\
dxa dump xive on all CPUs\n"
#endif
" e print exception information\n\
f flush cache\n\
la lookup symbol+offset of specified address\n\
ls lookup address of specified symbol\n\
@ -2338,6 +2345,81 @@ static void dump_pacas(void)
}
#endif
#ifdef CONFIG_PPC_POWERNV
static void dump_one_xive(int cpu)
{
unsigned int hwid = get_hard_smp_processor_id(cpu);
opal_xive_dump(XIVE_DUMP_TM_HYP, hwid);
opal_xive_dump(XIVE_DUMP_TM_POOL, hwid);
opal_xive_dump(XIVE_DUMP_TM_OS, hwid);
opal_xive_dump(XIVE_DUMP_TM_USER, hwid);
opal_xive_dump(XIVE_DUMP_VP, hwid);
opal_xive_dump(XIVE_DUMP_EMU_STATE, hwid);
if (setjmp(bus_error_jmp) != 0) {
catch_memory_errors = 0;
printf("*** Error dumping xive on cpu %d\n", cpu);
return;
}
catch_memory_errors = 1;
sync();
xmon_xive_do_dump(cpu);
sync();
__delay(200);
catch_memory_errors = 0;
}
static void dump_all_xives(void)
{
int cpu;
if (num_possible_cpus() == 0) {
printf("No possible cpus, use 'dx #' to dump individual cpus\n");
return;
}
for_each_possible_cpu(cpu)
dump_one_xive(cpu);
}
static void dump_one_xive_irq(u32 num)
{
s64 rc;
__be64 vp;
u8 prio;
__be32 lirq;
rc = opal_xive_get_irq_config(num, &vp, &prio, &lirq);
xmon_printf("IRQ 0x%x config: vp=0x%llx prio=%d lirq=0x%x (rc=%lld)\n",
num, be64_to_cpu(vp), prio, be32_to_cpu(lirq), rc);
}
static void dump_xives(void)
{
unsigned long num;
int c;
c = inchar();
if (c == 'a') {
dump_all_xives();
return;
} else if (c == 'i') {
if (scanhex(&num))
dump_one_xive_irq(num);
return;
}
termch = c; /* Put c back, it wasn't 'a' */
if (scanhex(&num))
dump_one_xive(num);
else
dump_one_xive(xmon_owner);
}
#endif /* CONFIG_PPC_POWERNV */
static void dump_by_size(unsigned long addr, long count, int size)
{
unsigned char temp[16];
@ -2386,6 +2468,14 @@ dump(void)
return;
}
#endif
#ifdef CONFIG_PPC_POWERNV
if (c == 'x') {
xmon_start_pagination();
dump_xives();
xmon_end_pagination();
return;
}
#endif
if (c == '\n')
termch = c;