linux-brain/arch/powerpc/platforms/pseries/hotplug-memory.c

1086 lines
23 KiB
C
Raw Normal View History

// SPDX-License-Identifier: GPL-2.0-or-later
/*
* pseries Memory Hotplug infrastructure.
*
* Copyright (C) 2008 Badari Pulavarty, IBM Corporation
*/
powerpc/pseries: Create new device hotplug entry point The current hotplug (or dlpar) of devices (the process is generally the same for memory, cpu, and pci) on PowerVM systems is initiated from the HMC, which communicates the request to the partitions through the RSCT framework. The RSCT framework then invokes the drmgr command. The drmgr command performs the hotplug operation by doing some pieces, such as most of the rtas calls and device tree parsing, in userspace and make requests to the kernel to online/offline the device, update the device tree and add/remove the device. For PowerKVM the approach for device hotplug is to follow what is currently being done for pci hotplug. A hotplug request is initiated from the host. QEMU then generates an EPOW interrupt to the guest which causes the guest to make the rtas,check-exception call. In QEMU, the rtas,check-exception call returns a rtas hotplug event to the guest. Please note that the current pci hotplug path for PowerKVM involves the kernel receiving the rtas hotplug event, passing it to rtas_errd in userspace, and having rtas_errd invoke drmgr. The drmgr command then handles the request as described above for PowerVM systems. There is no need for this circuitous route, we should just handle the entire hotplug of devices in the kernel. What I am planning is to enable this by moving the code to handle hotplug from drmgr into the kernel to provide a single path for handling device hotplug for both PowerVM and PowerKVM systems. This patch provides the common iframework and entry point. For PowerKVM a future update to the kernel rtas code will recognize rtas hotplug events returned from rtas,check-exception calls and use the common entry point to handle hotplug of the device. For PowerVM systems, This patch creates /sys/kernel/dlpar that can be used by the drmgr command to initiate hotplug requests. In order to do this a string of the format "<resource> <action> <id_type> <id>" is written to this file. The string consists of a resource (cpu, memory, pci, phb), an action (add or remove), an id_type (count, drc index, drc name), and the corresponding id. The kernel will parse the string and create a rtas hotplug section that can be passed to the common entry point for handling hotplug requests. It should be noted that there is no chance of updating how we receive hotplug (dlpar) requests from the HMC on PowerVM systems. Signed-off-by: Nathan Fontenot <nfont@linux.vnet.ibm.com> Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
2015-02-11 04:47:02 +09:00
#define pr_fmt(fmt) "pseries-hotplug-mem: " fmt
#include <linux/of.h>
#include <linux/of_address.h>
#include <linux/memblock.h>
#include <linux/memory.h>
#include <linux/memory_hotplug.h>
#include <linux/slab.h>
#include <asm/firmware.h>
#include <asm/machdep.h>
#include <asm/prom.h>
#include <asm/sparsemem.h>
#include <asm/fadump.h>
#include <asm/drmem.h>
#include "pseries.h"
static bool rtas_hp_event;
unsigned long pseries_memory_block_size(void)
{
struct device_node *np;
u64 memblock_size = MIN_MEMORY_BLOCK_SIZE;
struct resource r;
np = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
if (np) {
const __be64 *size;
size = of_get_property(np, "ibm,lmb-size", NULL);
if (size)
memblock_size = be64_to_cpup(size);
of_node_put(np);
} else if (machine_is(pseries)) {
/* This fallback really only applies to pseries */
unsigned int memzero_size = 0;
np = of_find_node_by_path("/memory@0");
if (np) {
if (!of_address_to_resource(np, 0, &r))
memzero_size = resource_size(&r);
of_node_put(np);
}
if (memzero_size) {
/* We now know the size of memory@0, use this to find
* the first memoryblock and get its size.
*/
char buf[64];
sprintf(buf, "/memory@%x", memzero_size);
np = of_find_node_by_path(buf);
if (np) {
if (!of_address_to_resource(np, 0, &r))
memblock_size = resource_size(&r);
of_node_put(np);
}
}
}
return memblock_size;
}
static void dlpar_free_property(struct property *prop)
{
kfree(prop->name);
kfree(prop->value);
kfree(prop);
}
static struct property *dlpar_clone_property(struct property *prop,
u32 prop_size)
{
struct property *new_prop;
new_prop = kzalloc(sizeof(*new_prop), GFP_KERNEL);
if (!new_prop)
return NULL;
new_prop->name = kstrdup(prop->name, GFP_KERNEL);
new_prop->value = kzalloc(prop_size, GFP_KERNEL);
if (!new_prop->name || !new_prop->value) {
dlpar_free_property(new_prop);
return NULL;
}
memcpy(new_prop->value, prop->value, prop->length);
new_prop->length = prop_size;
of_property_set_flag(new_prop, OF_DYNAMIC);
return new_prop;
}
static bool find_aa_index(struct device_node *dr_node,
struct property *ala_prop,
const u32 *lmb_assoc, u32 *aa_index)
{
u32 *assoc_arrays, new_prop_size;
struct property *new_prop;
int aa_arrays, aa_array_entries, aa_array_sz;
int i, index;
/*
* The ibm,associativity-lookup-arrays property is defined to be
* a 32-bit value specifying the number of associativity arrays
* followed by a 32-bitvalue specifying the number of entries per
* array, followed by the associativity arrays.
*/
assoc_arrays = ala_prop->value;
aa_arrays = be32_to_cpu(assoc_arrays[0]);
aa_array_entries = be32_to_cpu(assoc_arrays[1]);
aa_array_sz = aa_array_entries * sizeof(u32);
for (i = 0; i < aa_arrays; i++) {
index = (i * aa_array_entries) + 2;
if (memcmp(&assoc_arrays[index], &lmb_assoc[1], aa_array_sz))
continue;
*aa_index = i;
return true;
}
new_prop_size = ala_prop->length + aa_array_sz;
new_prop = dlpar_clone_property(ala_prop, new_prop_size);
if (!new_prop)
return false;
assoc_arrays = new_prop->value;
/* increment the number of entries in the lookup array */
assoc_arrays[0] = cpu_to_be32(aa_arrays + 1);
/* copy the new associativity into the lookup array */
index = aa_arrays * aa_array_entries + 2;
memcpy(&assoc_arrays[index], &lmb_assoc[1], aa_array_sz);
of_update_property(dr_node, new_prop);
/*
* The associativity lookup array index for this lmb is
* number of entries - 1 since we added its associativity
* to the end of the lookup array.
*/
*aa_index = be32_to_cpu(assoc_arrays[0]) - 1;
return true;
}
static int update_lmb_associativity_index(struct drmem_lmb *lmb)
{
struct device_node *parent, *lmb_node, *dr_node;
struct property *ala_prop;
const u32 *lmb_assoc;
u32 aa_index;
bool found;
parent = of_find_node_by_path("/");
if (!parent)
return -ENODEV;
lmb_node = dlpar_configure_connector(cpu_to_be32(lmb->drc_index),
parent);
of_node_put(parent);
if (!lmb_node)
return -EINVAL;
lmb_assoc = of_get_property(lmb_node, "ibm,associativity", NULL);
if (!lmb_assoc) {
dlpar_free_cc_nodes(lmb_node);
return -ENODEV;
}
dr_node = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
if (!dr_node) {
dlpar_free_cc_nodes(lmb_node);
return -ENODEV;
}
ala_prop = of_find_property(dr_node, "ibm,associativity-lookup-arrays",
NULL);
if (!ala_prop) {
of_node_put(dr_node);
dlpar_free_cc_nodes(lmb_node);
return -ENODEV;
}
found = find_aa_index(dr_node, ala_prop, lmb_assoc, &aa_index);
of_node_put(dr_node);
dlpar_free_cc_nodes(lmb_node);
if (!found) {
pr_err("Could not find LMB associativity\n");
return -1;
}
lmb->aa_index = aa_index;
return 0;
}
static struct memory_block *lmb_to_memblock(struct drmem_lmb *lmb)
{
unsigned long section_nr;
struct mem_section *mem_sect;
struct memory_block *mem_block;
section_nr = pfn_to_section_nr(PFN_DOWN(lmb->base_addr));
mem_sect = __nr_to_section(section_nr);
mem_block = find_memory_block(mem_sect);
return mem_block;
}
static int get_lmb_range(u32 drc_index, int n_lmbs,
struct drmem_lmb **start_lmb,
struct drmem_lmb **end_lmb)
{
struct drmem_lmb *lmb, *start, *end;
powerpc/pseries: Avoid NULL pointer dereference when drmem is unavailable commit a83836dbc53e96f13fec248ecc201d18e1e3111d upstream. In guests without hotplugagble memory drmem structure is only zero initialized. Trying to manipulate DLPAR parameters results in a crash. $ echo "memory add count 1" > /sys/kernel/dlpar Oops: Kernel access of bad area, sig: 11 [#1] LE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA pSeries ... NIP: c0000000000ff294 LR: c0000000000ff248 CTR: 0000000000000000 REGS: c0000000fb9d3880 TRAP: 0300 Tainted: G E (5.5.0-rc6-2-default) MSR: 8000000000009033 <SF,EE,ME,IR,DR,RI,LE> CR: 28242428 XER: 20000000 CFAR: c0000000009a6c10 DAR: 0000000000000010 DSISR: 40000000 IRQMASK: 0 ... NIP dlpar_memory+0x6e4/0xd00 LR dlpar_memory+0x698/0xd00 Call Trace: dlpar_memory+0x698/0xd00 (unreliable) handle_dlpar_errorlog+0xc0/0x190 dlpar_store+0x198/0x4a0 kobj_attr_store+0x30/0x50 sysfs_kf_write+0x64/0x90 kernfs_fop_write+0x1b0/0x290 __vfs_write+0x3c/0x70 vfs_write+0xd0/0x260 ksys_write+0xdc/0x130 system_call+0x5c/0x68 Taking closer look at the code, I can see that for_each_drmem_lmb is a macro expanding into `for (lmb = &drmem_info->lmbs[0]; lmb <= &drmem_info->lmbs[drmem_info->n_lmbs - 1]; lmb++)`. When drmem_info->lmbs is NULL, the loop would iterate through the whole address range if it weren't stopped by the NULL pointer dereference on the next line. This patch aligns for_each_drmem_lmb and for_each_drmem_lmb_in_range macro behavior with the common C semantics, where the end marker does not belong to the scanned range, and alters get_lmb_range() semantics. As a side effect, the wraparound observed in the crash is prevented. Fixes: 6c6ea53725b3 ("powerpc/mm: Separate ibm, dynamic-memory data from DT format") Cc: stable@vger.kernel.org # v4.16+ Signed-off-by: Libor Pechacek <lpechacek@suse.cz> Signed-off-by: Michal Suchanek <msuchanek@suse.de> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20200131132829.10281-1-msuchanek@suse.de Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2020-01-31 22:28:29 +09:00
struct drmem_lmb *limit;
start = NULL;
for_each_drmem_lmb(lmb) {
if (lmb->drc_index == drc_index) {
start = lmb;
break;
}
}
if (!start)
return -EINVAL;
powerpc/pseries: Avoid NULL pointer dereference when drmem is unavailable commit a83836dbc53e96f13fec248ecc201d18e1e3111d upstream. In guests without hotplugagble memory drmem structure is only zero initialized. Trying to manipulate DLPAR parameters results in a crash. $ echo "memory add count 1" > /sys/kernel/dlpar Oops: Kernel access of bad area, sig: 11 [#1] LE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA pSeries ... NIP: c0000000000ff294 LR: c0000000000ff248 CTR: 0000000000000000 REGS: c0000000fb9d3880 TRAP: 0300 Tainted: G E (5.5.0-rc6-2-default) MSR: 8000000000009033 <SF,EE,ME,IR,DR,RI,LE> CR: 28242428 XER: 20000000 CFAR: c0000000009a6c10 DAR: 0000000000000010 DSISR: 40000000 IRQMASK: 0 ... NIP dlpar_memory+0x6e4/0xd00 LR dlpar_memory+0x698/0xd00 Call Trace: dlpar_memory+0x698/0xd00 (unreliable) handle_dlpar_errorlog+0xc0/0x190 dlpar_store+0x198/0x4a0 kobj_attr_store+0x30/0x50 sysfs_kf_write+0x64/0x90 kernfs_fop_write+0x1b0/0x290 __vfs_write+0x3c/0x70 vfs_write+0xd0/0x260 ksys_write+0xdc/0x130 system_call+0x5c/0x68 Taking closer look at the code, I can see that for_each_drmem_lmb is a macro expanding into `for (lmb = &drmem_info->lmbs[0]; lmb <= &drmem_info->lmbs[drmem_info->n_lmbs - 1]; lmb++)`. When drmem_info->lmbs is NULL, the loop would iterate through the whole address range if it weren't stopped by the NULL pointer dereference on the next line. This patch aligns for_each_drmem_lmb and for_each_drmem_lmb_in_range macro behavior with the common C semantics, where the end marker does not belong to the scanned range, and alters get_lmb_range() semantics. As a side effect, the wraparound observed in the crash is prevented. Fixes: 6c6ea53725b3 ("powerpc/mm: Separate ibm, dynamic-memory data from DT format") Cc: stable@vger.kernel.org # v4.16+ Signed-off-by: Libor Pechacek <lpechacek@suse.cz> Signed-off-by: Michal Suchanek <msuchanek@suse.de> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20200131132829.10281-1-msuchanek@suse.de Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2020-01-31 22:28:29 +09:00
end = &start[n_lmbs];
powerpc/pseries: Avoid NULL pointer dereference when drmem is unavailable commit a83836dbc53e96f13fec248ecc201d18e1e3111d upstream. In guests without hotplugagble memory drmem structure is only zero initialized. Trying to manipulate DLPAR parameters results in a crash. $ echo "memory add count 1" > /sys/kernel/dlpar Oops: Kernel access of bad area, sig: 11 [#1] LE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA pSeries ... NIP: c0000000000ff294 LR: c0000000000ff248 CTR: 0000000000000000 REGS: c0000000fb9d3880 TRAP: 0300 Tainted: G E (5.5.0-rc6-2-default) MSR: 8000000000009033 <SF,EE,ME,IR,DR,RI,LE> CR: 28242428 XER: 20000000 CFAR: c0000000009a6c10 DAR: 0000000000000010 DSISR: 40000000 IRQMASK: 0 ... NIP dlpar_memory+0x6e4/0xd00 LR dlpar_memory+0x698/0xd00 Call Trace: dlpar_memory+0x698/0xd00 (unreliable) handle_dlpar_errorlog+0xc0/0x190 dlpar_store+0x198/0x4a0 kobj_attr_store+0x30/0x50 sysfs_kf_write+0x64/0x90 kernfs_fop_write+0x1b0/0x290 __vfs_write+0x3c/0x70 vfs_write+0xd0/0x260 ksys_write+0xdc/0x130 system_call+0x5c/0x68 Taking closer look at the code, I can see that for_each_drmem_lmb is a macro expanding into `for (lmb = &drmem_info->lmbs[0]; lmb <= &drmem_info->lmbs[drmem_info->n_lmbs - 1]; lmb++)`. When drmem_info->lmbs is NULL, the loop would iterate through the whole address range if it weren't stopped by the NULL pointer dereference on the next line. This patch aligns for_each_drmem_lmb and for_each_drmem_lmb_in_range macro behavior with the common C semantics, where the end marker does not belong to the scanned range, and alters get_lmb_range() semantics. As a side effect, the wraparound observed in the crash is prevented. Fixes: 6c6ea53725b3 ("powerpc/mm: Separate ibm, dynamic-memory data from DT format") Cc: stable@vger.kernel.org # v4.16+ Signed-off-by: Libor Pechacek <lpechacek@suse.cz> Signed-off-by: Michal Suchanek <msuchanek@suse.de> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20200131132829.10281-1-msuchanek@suse.de Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2020-01-31 22:28:29 +09:00
limit = &drmem_info->lmbs[drmem_info->n_lmbs];
if (end > limit)
return -EINVAL;
*start_lmb = start;
*end_lmb = end;
return 0;
}
static int dlpar_change_lmb_state(struct drmem_lmb *lmb, bool online)
powerpc/pseries: Check memory device state before onlining/offlining When DLPAR adding or removing memory we need to check the device offline status before trying to online/offline the memory. This is needed because calls to device_online() and device_offline() will return non-zero for memory that is already online and offline respectively. This update resolves two scenarios. First, for a kernel built with auto-online memory enabled (CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE=y), memory will be onlined as part of calls to add_memory(). After adding the memory the pseries DLPAR code tries to online it and fails since the memory is already online. The DLPAR code then tries to remove the memory which produces the oops message below because the memory is not offline. The second scenario occurs when removing memory that is already offline, i.e. marking memory offline (via sysfs) and then trying to remove that memory. This doesn't work because offlining the already offline memory does not succeed and the DLPAR code then fails the DLPAR remove operation. The fix for both scenarios is to check the device.offline status before making the calls to device_online() or device_offline(). kernel BUG at mm/memory_hotplug.c:1936! ... NIP [c0000000002ca428] .remove_memory+0xb8/0xc0 LR [c0000000002ca3cc] .remove_memory+0x5c/0xc0 Call Trace: .remove_memory+0x5c/0xc0 (unreliable) .dlpar_add_lmb+0x384/0x400 .dlpar_memory+0x5dc/0xca0 .handle_dlpar_errorlog+0x74/0xe0 .pseries_hp_work_fn+0x2c/0x90 .process_one_work+0x17c/0x460 .worker_thread+0x88/0x500 .kthread+0x15c/0x1a0 .ret_from_kernel_thread+0x58/0xc0 Fixes: 943db62c316c ("powerpc/pseries: Revert 'Auto-online hotplugged memory'") Signed-off-by: Nathan Fontenot <nfont@linux.vnet.ibm.com> [mpe: Use bool, add explicit rc=0 case, change log typos & formatting] Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2017-08-03 03:03:22 +09:00
{
struct memory_block *mem_block;
int rc;
mem_block = lmb_to_memblock(lmb);
if (!mem_block)
return -EINVAL;
if (online && mem_block->dev.offline)
rc = device_online(&mem_block->dev);
else if (!online && !mem_block->dev.offline)
rc = device_offline(&mem_block->dev);
else
rc = 0;
put_device(&mem_block->dev);
return rc;
}
static int dlpar_online_lmb(struct drmem_lmb *lmb)
powerpc/pseries: Check memory device state before onlining/offlining When DLPAR adding or removing memory we need to check the device offline status before trying to online/offline the memory. This is needed because calls to device_online() and device_offline() will return non-zero for memory that is already online and offline respectively. This update resolves two scenarios. First, for a kernel built with auto-online memory enabled (CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE=y), memory will be onlined as part of calls to add_memory(). After adding the memory the pseries DLPAR code tries to online it and fails since the memory is already online. The DLPAR code then tries to remove the memory which produces the oops message below because the memory is not offline. The second scenario occurs when removing memory that is already offline, i.e. marking memory offline (via sysfs) and then trying to remove that memory. This doesn't work because offlining the already offline memory does not succeed and the DLPAR code then fails the DLPAR remove operation. The fix for both scenarios is to check the device.offline status before making the calls to device_online() or device_offline(). kernel BUG at mm/memory_hotplug.c:1936! ... NIP [c0000000002ca428] .remove_memory+0xb8/0xc0 LR [c0000000002ca3cc] .remove_memory+0x5c/0xc0 Call Trace: .remove_memory+0x5c/0xc0 (unreliable) .dlpar_add_lmb+0x384/0x400 .dlpar_memory+0x5dc/0xca0 .handle_dlpar_errorlog+0x74/0xe0 .pseries_hp_work_fn+0x2c/0x90 .process_one_work+0x17c/0x460 .worker_thread+0x88/0x500 .kthread+0x15c/0x1a0 .ret_from_kernel_thread+0x58/0xc0 Fixes: 943db62c316c ("powerpc/pseries: Revert 'Auto-online hotplugged memory'") Signed-off-by: Nathan Fontenot <nfont@linux.vnet.ibm.com> [mpe: Use bool, add explicit rc=0 case, change log typos & formatting] Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2017-08-03 03:03:22 +09:00
{
return dlpar_change_lmb_state(lmb, true);
}
#ifdef CONFIG_MEMORY_HOTREMOVE
static int dlpar_offline_lmb(struct drmem_lmb *lmb)
powerpc/pseries: Check memory device state before onlining/offlining When DLPAR adding or removing memory we need to check the device offline status before trying to online/offline the memory. This is needed because calls to device_online() and device_offline() will return non-zero for memory that is already online and offline respectively. This update resolves two scenarios. First, for a kernel built with auto-online memory enabled (CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE=y), memory will be onlined as part of calls to add_memory(). After adding the memory the pseries DLPAR code tries to online it and fails since the memory is already online. The DLPAR code then tries to remove the memory which produces the oops message below because the memory is not offline. The second scenario occurs when removing memory that is already offline, i.e. marking memory offline (via sysfs) and then trying to remove that memory. This doesn't work because offlining the already offline memory does not succeed and the DLPAR code then fails the DLPAR remove operation. The fix for both scenarios is to check the device.offline status before making the calls to device_online() or device_offline(). kernel BUG at mm/memory_hotplug.c:1936! ... NIP [c0000000002ca428] .remove_memory+0xb8/0xc0 LR [c0000000002ca3cc] .remove_memory+0x5c/0xc0 Call Trace: .remove_memory+0x5c/0xc0 (unreliable) .dlpar_add_lmb+0x384/0x400 .dlpar_memory+0x5dc/0xca0 .handle_dlpar_errorlog+0x74/0xe0 .pseries_hp_work_fn+0x2c/0x90 .process_one_work+0x17c/0x460 .worker_thread+0x88/0x500 .kthread+0x15c/0x1a0 .ret_from_kernel_thread+0x58/0xc0 Fixes: 943db62c316c ("powerpc/pseries: Revert 'Auto-online hotplugged memory'") Signed-off-by: Nathan Fontenot <nfont@linux.vnet.ibm.com> [mpe: Use bool, add explicit rc=0 case, change log typos & formatting] Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2017-08-03 03:03:22 +09:00
{
return dlpar_change_lmb_state(lmb, false);
}
static int pseries_remove_memblock(unsigned long base, unsigned long memblock_size)
{
unsigned long block_sz, start_pfn;
int sections_per_block;
int i, nid;
start_pfn = base >> PAGE_SHIFT;
powerpc/pseries: Protect remove_memory() with device hotplug lock While testing memory hot-remove, I found following dead lock: Process #1141 is drmgr, trying to remove some memory, i.e. memory499. It holds the memory_hotplug_mutex, and blocks when trying to remove file "online" under dir memory499, in kernfs_drain(), at wait_event(root->deactivate_waitq, atomic_read(&kn->active) == KN_DEACTIVATED_BIAS); Process #1120 is trying to online memory499 by echo 1 > memory499/online In .kernfs_fop_write, it uses kernfs_get_active() to increase &kn->active, thus blocking process #1141. While itself is blocked later when trying to acquire memory_hotplug_mutex, which is held by process The backtrace of both processes are shown below: [<c000000001b18600>] 0xc000000001b18600 [<c000000000015044>] .__switch_to+0x144/0x200 [<c000000000263ca4>] .online_pages+0x74/0x7b0 [<c00000000055b40c>] .memory_subsys_online+0x9c/0x150 [<c00000000053cbe8>] .device_online+0xb8/0x120 [<c00000000053cd04>] .online_store+0xb4/0xc0 [<c000000000538ce4>] .dev_attr_store+0x64/0xa0 [<c00000000030f4ec>] .sysfs_kf_write+0x7c/0xb0 [<c00000000030e574>] .kernfs_fop_write+0x154/0x1e0 [<c000000000268450>] .vfs_write+0xe0/0x260 [<c000000000269144>] .SyS_write+0x64/0x110 [<c000000000009ffc>] syscall_exit+0x0/0x7c [<c000000001b18600>] 0xc000000001b18600 [<c000000000015044>] .__switch_to+0x144/0x200 [<c00000000030be14>] .__kernfs_remove+0x204/0x300 [<c00000000030d428>] .kernfs_remove_by_name_ns+0x68/0xf0 [<c00000000030fb38>] .sysfs_remove_file_ns+0x38/0x60 [<c000000000539354>] .device_remove_attrs+0x54/0xc0 [<c000000000539fd8>] .device_del+0x158/0x250 [<c00000000053a104>] .device_unregister+0x34/0xa0 [<c00000000055bc14>] .unregister_memory_section+0x164/0x170 [<c00000000024ee18>] .__remove_pages+0x108/0x4c0 [<c00000000004b590>] .arch_remove_memory+0x60/0xc0 [<c00000000026446c>] .remove_memory+0x8c/0xe0 [<c00000000007f9f4>] .pseries_remove_memblock+0xd4/0x160 [<c00000000007fcfc>] .pseries_memory_notifier+0x27c/0x290 [<c0000000008ae6cc>] .notifier_call_chain+0x8c/0x100 [<c0000000000d858c>] .__blocking_notifier_call_chain+0x6c/0xe0 [<c00000000071ddec>] .of_property_notify+0x7c/0xc0 [<c00000000071ed3c>] .of_update_property+0x3c/0x1b0 [<c0000000000756cc>] .ofdt_write+0x3dc/0x740 [<c0000000002f60fc>] .proc_reg_write+0xac/0x110 [<c000000000268450>] .vfs_write+0xe0/0x260 [<c000000000269144>] .SyS_write+0x64/0x110 [<c000000000009ffc>] syscall_exit+0x0/0x7c This patch uses lock_device_hotplug() to protect remove_memory() called in pseries_remove_memblock(), which is also stated before function remove_memory(): * NOTE: The caller must call lock_device_hotplug() to serialize hotplug * and online/offline operations before this call, as required by * try_offline_node(). */ void __ref remove_memory(int nid, u64 start, u64 size) With this lock held, the other process(#1120 above) trying to online the memory block will retry the system call when calling lock_device_hotplug_sysfs(), and finally find No such device error. Signed-off-by: Li Zhong <zhong@linux.vnet.ibm.com> Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
2014-04-10 17:25:31 +09:00
lock_device_hotplug();
if (!pfn_valid(start_pfn))
goto out;
block_sz = pseries_memory_block_size();
sections_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE;
nid = memory_add_physaddr_to_nid(base);
for (i = 0; i < sections_per_block; i++) {
mm/memory_hotplug: make remove_memory() take the device_hotplug_lock Patch series "mm: online/offline_pages called w.o. mem_hotplug_lock", v3. Reading through the code and studying how mem_hotplug_lock is to be used, I noticed that there are two places where we can end up calling device_online()/device_offline() - online_pages()/offline_pages() without the mem_hotplug_lock. And there are other places where we call device_online()/device_offline() without the device_hotplug_lock. While e.g. echo "online" > /sys/devices/system/memory/memory9/state is fine, e.g. echo 1 > /sys/devices/system/memory/memory9/online Will not take the mem_hotplug_lock. However the device_lock() and device_hotplug_lock. E.g. via memory_probe_store(), we can end up calling add_memory()->online_pages() without the device_hotplug_lock. So we can have concurrent callers in online_pages(). We e.g. touch in online_pages() basically unprotected zone->present_pages then. Looks like there is a longer history to that (see Patch #2 for details), and fixing it to work the way it was intended is not really possible. We would e.g. have to take the mem_hotplug_lock in device/base/core.c, which sounds wrong. Summary: We had a lock inversion on mem_hotplug_lock and device_lock(). More details can be found in patch 3 and patch 6. I propose the general rules (documentation added in patch 6): 1. add_memory/add_memory_resource() must only be called with device_hotplug_lock. 2. remove_memory() must only be called with device_hotplug_lock. This is already documented and holds for all callers. 3. device_online()/device_offline() must only be called with device_hotplug_lock. This is already documented and true for now in core code. Other callers (related to memory hotplug) have to be fixed up. 4. mem_hotplug_lock is taken inside of add_memory/remove_memory/ online_pages/offline_pages. To me, this looks way cleaner than what we have right now (and easier to verify). And looking at the documentation of remove_memory, using lock_device_hotplug also for add_memory() feels natural. This patch (of 6): remove_memory() is exported right now but requires the device_hotplug_lock, which is not exported. So let's provide a variant that takes the lock and only export that one. The lock is already held in arch/powerpc/platforms/pseries/hotplug-memory.c drivers/acpi/acpi_memhotplug.c arch/powerpc/platforms/powernv/memtrace.c Apart from that, there are not other users in the tree. Link: http://lkml.kernel.org/r/20180925091457.28651-2-david@redhat.com Signed-off-by: David Hildenbrand <david@redhat.com> Reviewed-by: Pavel Tatashin <pavel.tatashin@microsoft.com> Reviewed-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com> Reviewed-by: Rashmica Gupta <rashmica.g@gmail.com> Reviewed-by: Oscar Salvador <osalvador@suse.de> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org> Cc: Paul Mackerras <paulus@samba.org> Cc: Michael Ellerman <mpe@ellerman.id.au> Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net> Cc: Len Brown <lenb@kernel.org> Cc: Rashmica Gupta <rashmica.g@gmail.com> Cc: Michael Neuling <mikey@neuling.org> Cc: Balbir Singh <bsingharora@gmail.com> Cc: Nathan Fontenot <nfont@linux.vnet.ibm.com> Cc: John Allen <jallen@linux.vnet.ibm.com> Cc: Michal Hocko <mhocko@suse.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: YASUAKI ISHIMATSU <yasu.isimatu@gmail.com> Cc: Mathieu Malaterre <malat@debian.org> Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com> Cc: Haiyang Zhang <haiyangz@microsoft.com> Cc: Heiko Carstens <heiko.carstens@de.ibm.com> Cc: Jonathan Corbet <corbet@lwn.net> Cc: Juergen Gross <jgross@suse.com> Cc: Kate Stewart <kstewart@linuxfoundation.org> Cc: "K. Y. Srinivasan" <kys@microsoft.com> Cc: Martin Schwidefsky <schwidefsky@de.ibm.com> Cc: Philippe Ombredanne <pombredanne@nexb.com> Cc: Stephen Hemminger <sthemmin@microsoft.com> Cc: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-10-31 07:10:18 +09:00
__remove_memory(nid, base, MIN_MEMORY_BLOCK_SIZE);
base += MIN_MEMORY_BLOCK_SIZE;
memory-hotplug: suppress "Trying to free nonexistent resource <XXXXXXXXXXXXXXXX-YYYYYYYYYYYYYYYY>" warning When our x86 box calls __remove_pages(), release_mem_region() shows many warnings. And x86 box cannot unregister iomem_resource. "Trying to free nonexistent resource <XXXXXXXXXXXXXXXX-YYYYYYYYYYYYYYYY>" release_mem_region() has been changed to be called in each PAGES_PER_SECTION by commit de7f0cba9678 ("memory hotplug: release memory regions in PAGES_PER_SECTION chunks"). Because powerpc registers iomem_resource in each PAGES_PER_SECTION chunk. But when I hot add memory on x86 box, iomem_resource is register in each _CRS not PAGES_PER_SECTION chunk. So x86 box unregisters iomem_resource. The patch fixes the problem. Signed-off-by: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com> Cc: David Rientjes <rientjes@google.com> Cc: Jiang Liu <liuj97@gmail.com> Cc: Len Brown <len.brown@intel.com> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org> Cc: Paul Mackerras <paulus@samba.org> Cc: Christoph Lameter <cl@linux.com> Cc: Minchan Kim <minchan.kim@gmail.com> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Cc: Wen Congyang <wency@cn.fujitsu.com> Cc: Dave Hansen <dave@linux.vnet.ibm.com> Cc: Nathan Fontenot <nfont@austin.ibm.com> Cc: Badari Pulavarty <pbadari@us.ibm.com> Cc: Yasunori Goto <y-goto@jp.fujitsu.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-10-09 08:34:14 +09:00
}
powerpc/pseries: Protect remove_memory() with device hotplug lock While testing memory hot-remove, I found following dead lock: Process #1141 is drmgr, trying to remove some memory, i.e. memory499. It holds the memory_hotplug_mutex, and blocks when trying to remove file "online" under dir memory499, in kernfs_drain(), at wait_event(root->deactivate_waitq, atomic_read(&kn->active) == KN_DEACTIVATED_BIAS); Process #1120 is trying to online memory499 by echo 1 > memory499/online In .kernfs_fop_write, it uses kernfs_get_active() to increase &kn->active, thus blocking process #1141. While itself is blocked later when trying to acquire memory_hotplug_mutex, which is held by process The backtrace of both processes are shown below: [<c000000001b18600>] 0xc000000001b18600 [<c000000000015044>] .__switch_to+0x144/0x200 [<c000000000263ca4>] .online_pages+0x74/0x7b0 [<c00000000055b40c>] .memory_subsys_online+0x9c/0x150 [<c00000000053cbe8>] .device_online+0xb8/0x120 [<c00000000053cd04>] .online_store+0xb4/0xc0 [<c000000000538ce4>] .dev_attr_store+0x64/0xa0 [<c00000000030f4ec>] .sysfs_kf_write+0x7c/0xb0 [<c00000000030e574>] .kernfs_fop_write+0x154/0x1e0 [<c000000000268450>] .vfs_write+0xe0/0x260 [<c000000000269144>] .SyS_write+0x64/0x110 [<c000000000009ffc>] syscall_exit+0x0/0x7c [<c000000001b18600>] 0xc000000001b18600 [<c000000000015044>] .__switch_to+0x144/0x200 [<c00000000030be14>] .__kernfs_remove+0x204/0x300 [<c00000000030d428>] .kernfs_remove_by_name_ns+0x68/0xf0 [<c00000000030fb38>] .sysfs_remove_file_ns+0x38/0x60 [<c000000000539354>] .device_remove_attrs+0x54/0xc0 [<c000000000539fd8>] .device_del+0x158/0x250 [<c00000000053a104>] .device_unregister+0x34/0xa0 [<c00000000055bc14>] .unregister_memory_section+0x164/0x170 [<c00000000024ee18>] .__remove_pages+0x108/0x4c0 [<c00000000004b590>] .arch_remove_memory+0x60/0xc0 [<c00000000026446c>] .remove_memory+0x8c/0xe0 [<c00000000007f9f4>] .pseries_remove_memblock+0xd4/0x160 [<c00000000007fcfc>] .pseries_memory_notifier+0x27c/0x290 [<c0000000008ae6cc>] .notifier_call_chain+0x8c/0x100 [<c0000000000d858c>] .__blocking_notifier_call_chain+0x6c/0xe0 [<c00000000071ddec>] .of_property_notify+0x7c/0xc0 [<c00000000071ed3c>] .of_update_property+0x3c/0x1b0 [<c0000000000756cc>] .ofdt_write+0x3dc/0x740 [<c0000000002f60fc>] .proc_reg_write+0xac/0x110 [<c000000000268450>] .vfs_write+0xe0/0x260 [<c000000000269144>] .SyS_write+0x64/0x110 [<c000000000009ffc>] syscall_exit+0x0/0x7c This patch uses lock_device_hotplug() to protect remove_memory() called in pseries_remove_memblock(), which is also stated before function remove_memory(): * NOTE: The caller must call lock_device_hotplug() to serialize hotplug * and online/offline operations before this call, as required by * try_offline_node(). */ void __ref remove_memory(int nid, u64 start, u64 size) With this lock held, the other process(#1120 above) trying to online the memory block will retry the system call when calling lock_device_hotplug_sysfs(), and finally find No such device error. Signed-off-by: Li Zhong <zhong@linux.vnet.ibm.com> Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
2014-04-10 17:25:31 +09:00
out:
/* Update memory regions for memory remove */
memblock_remove(base, memblock_size);
powerpc/pseries: Protect remove_memory() with device hotplug lock While testing memory hot-remove, I found following dead lock: Process #1141 is drmgr, trying to remove some memory, i.e. memory499. It holds the memory_hotplug_mutex, and blocks when trying to remove file "online" under dir memory499, in kernfs_drain(), at wait_event(root->deactivate_waitq, atomic_read(&kn->active) == KN_DEACTIVATED_BIAS); Process #1120 is trying to online memory499 by echo 1 > memory499/online In .kernfs_fop_write, it uses kernfs_get_active() to increase &kn->active, thus blocking process #1141. While itself is blocked later when trying to acquire memory_hotplug_mutex, which is held by process The backtrace of both processes are shown below: [<c000000001b18600>] 0xc000000001b18600 [<c000000000015044>] .__switch_to+0x144/0x200 [<c000000000263ca4>] .online_pages+0x74/0x7b0 [<c00000000055b40c>] .memory_subsys_online+0x9c/0x150 [<c00000000053cbe8>] .device_online+0xb8/0x120 [<c00000000053cd04>] .online_store+0xb4/0xc0 [<c000000000538ce4>] .dev_attr_store+0x64/0xa0 [<c00000000030f4ec>] .sysfs_kf_write+0x7c/0xb0 [<c00000000030e574>] .kernfs_fop_write+0x154/0x1e0 [<c000000000268450>] .vfs_write+0xe0/0x260 [<c000000000269144>] .SyS_write+0x64/0x110 [<c000000000009ffc>] syscall_exit+0x0/0x7c [<c000000001b18600>] 0xc000000001b18600 [<c000000000015044>] .__switch_to+0x144/0x200 [<c00000000030be14>] .__kernfs_remove+0x204/0x300 [<c00000000030d428>] .kernfs_remove_by_name_ns+0x68/0xf0 [<c00000000030fb38>] .sysfs_remove_file_ns+0x38/0x60 [<c000000000539354>] .device_remove_attrs+0x54/0xc0 [<c000000000539fd8>] .device_del+0x158/0x250 [<c00000000053a104>] .device_unregister+0x34/0xa0 [<c00000000055bc14>] .unregister_memory_section+0x164/0x170 [<c00000000024ee18>] .__remove_pages+0x108/0x4c0 [<c00000000004b590>] .arch_remove_memory+0x60/0xc0 [<c00000000026446c>] .remove_memory+0x8c/0xe0 [<c00000000007f9f4>] .pseries_remove_memblock+0xd4/0x160 [<c00000000007fcfc>] .pseries_memory_notifier+0x27c/0x290 [<c0000000008ae6cc>] .notifier_call_chain+0x8c/0x100 [<c0000000000d858c>] .__blocking_notifier_call_chain+0x6c/0xe0 [<c00000000071ddec>] .of_property_notify+0x7c/0xc0 [<c00000000071ed3c>] .of_update_property+0x3c/0x1b0 [<c0000000000756cc>] .ofdt_write+0x3dc/0x740 [<c0000000002f60fc>] .proc_reg_write+0xac/0x110 [<c000000000268450>] .vfs_write+0xe0/0x260 [<c000000000269144>] .SyS_write+0x64/0x110 [<c000000000009ffc>] syscall_exit+0x0/0x7c This patch uses lock_device_hotplug() to protect remove_memory() called in pseries_remove_memblock(), which is also stated before function remove_memory(): * NOTE: The caller must call lock_device_hotplug() to serialize hotplug * and online/offline operations before this call, as required by * try_offline_node(). */ void __ref remove_memory(int nid, u64 start, u64 size) With this lock held, the other process(#1120 above) trying to online the memory block will retry the system call when calling lock_device_hotplug_sysfs(), and finally find No such device error. Signed-off-by: Li Zhong <zhong@linux.vnet.ibm.com> Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
2014-04-10 17:25:31 +09:00
unlock_device_hotplug();
return 0;
}
static int pseries_remove_mem_node(struct device_node *np)
{
const __be32 *prop;
unsigned long base;
unsigned long lmb_size;
int ret = -EINVAL;
int addr_cells, size_cells;
/*
* Check to see if we are actually removing memory
*/
if (!of_node_is_type(np, "memory"))
return 0;
/*
* Find the base address and size of the memblock
*/
prop = of_get_property(np, "reg", NULL);
if (!prop)
return ret;
addr_cells = of_n_addr_cells(np);
size_cells = of_n_size_cells(np);
/*
* "reg" property represents (addr,size) tuple.
*/
base = of_read_number(prop, addr_cells);
prop += addr_cells;
lmb_size = of_read_number(prop, size_cells);
pseries_remove_memblock(base, lmb_size);
return 0;
}
static bool lmb_is_removable(struct drmem_lmb *lmb)
{
int i, scns_per_block;
int rc = 1;
unsigned long pfn, block_sz;
u64 phys_addr;
if (!(lmb->flags & DRCONF_MEM_ASSIGNED))
return false;
block_sz = memory_block_size_bytes();
scns_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE;
phys_addr = lmb->base_addr;
#ifdef CONFIG_FA_DUMP
/*
* Don't hot-remove memory that falls in fadump boot memory area
* and memory that is reserved for capturing old kernel memory.
*/
if (is_fadump_memory_area(phys_addr, block_sz))
return false;
#endif
for (i = 0; i < scns_per_block; i++) {
pfn = PFN_DOWN(phys_addr);
if (!pfn_present(pfn)) {
phys_addr += MIN_MEMORY_BLOCK_SIZE;
continue;
}
rc &= is_mem_section_removable(pfn, PAGES_PER_SECTION);
phys_addr += MIN_MEMORY_BLOCK_SIZE;
}
return rc ? true : false;
}
static int dlpar_add_lmb(struct drmem_lmb *);
static int dlpar_remove_lmb(struct drmem_lmb *lmb)
{
pseries/drmem: don't cache node id in drmem_lmb struct [ Upstream commit e5e179aa3a39c818db8fbc2dce8d2cd24adaf657 ] At memory hot-remove time we can retrieve an LMB's nid from its corresponding memory_block. There is no need to store the nid in multiple locations. Note that lmb_to_memblock() uses find_memory_block() to get the corresponding memory_block. As find_memory_block() runs in sub-linear time this approach is negligibly slower than what we do at present. In exchange for this lookup at hot-remove time we no longer need to call memory_add_physaddr_to_nid() during drmem_init() for each LMB. On powerpc, memory_add_physaddr_to_nid() is a linear search, so this spares us an O(n^2) initialization during boot. On systems with many LMBs that initialization overhead is palpable and disruptive. For example, on a box with 249854 LMBs we're seeing drmem_init() take upwards of 30 seconds to complete: [ 53.721639] drmem: initializing drmem v2 [ 80.604346] watchdog: BUG: soft lockup - CPU#65 stuck for 23s! [swapper/0:1] [ 80.604377] Modules linked in: [ 80.604389] CPU: 65 PID: 1 Comm: swapper/0 Not tainted 5.6.0-rc2+ #4 [ 80.604397] NIP: c0000000000a4980 LR: c0000000000a4940 CTR: 0000000000000000 [ 80.604407] REGS: c0002dbff8493830 TRAP: 0901 Not tainted (5.6.0-rc2+) [ 80.604412] MSR: 8000000002009033 <SF,VEC,EE,ME,IR,DR,RI,LE> CR: 44000248 XER: 0000000d [ 80.604431] CFAR: c0000000000a4a38 IRQMASK: 0 [ 80.604431] GPR00: c0000000000a4940 c0002dbff8493ac0 c000000001904400 c0003cfffffede30 [ 80.604431] GPR04: 0000000000000000 c000000000f4095a 000000000000002f 0000000010000000 [ 80.604431] GPR08: c0000bf7ecdb7fb8 c0000bf7ecc2d3c8 0000000000000008 c00c0002fdfb2001 [ 80.604431] GPR12: 0000000000000000 c00000001e8ec200 [ 80.604477] NIP [c0000000000a4980] hot_add_scn_to_nid+0xa0/0x3e0 [ 80.604486] LR [c0000000000a4940] hot_add_scn_to_nid+0x60/0x3e0 [ 80.604492] Call Trace: [ 80.604498] [c0002dbff8493ac0] [c0000000000a4940] hot_add_scn_to_nid+0x60/0x3e0 (unreliable) [ 80.604509] [c0002dbff8493b20] [c000000000087c10] memory_add_physaddr_to_nid+0x20/0x60 [ 80.604521] [c0002dbff8493b40] [c0000000010d4880] drmem_init+0x25c/0x2f0 [ 80.604530] [c0002dbff8493c10] [c000000000010154] do_one_initcall+0x64/0x2c0 [ 80.604540] [c0002dbff8493ce0] [c0000000010c4aa0] kernel_init_freeable+0x2d8/0x3a0 [ 80.604550] [c0002dbff8493db0] [c000000000010824] kernel_init+0x2c/0x148 [ 80.604560] [c0002dbff8493e20] [c00000000000b648] ret_from_kernel_thread+0x5c/0x74 [ 80.604567] Instruction dump: [ 80.604574] 392918e8 e9490000 e90a000a e92a0000 80ea000c 1d080018 3908ffe8 7d094214 [ 80.604586] 7fa94040 419d00dc e9490010 714a0088 <2faa0008> 409e00ac e9490000 7fbe5040 [ 89.047390] drmem: 249854 LMB(s) With a patched kernel on the same machine we're no longer seeing the soft lockup. drmem_init() now completes in negligible time, even when the LMB count is large. Fixes: b2d3b5ee66f2 ("powerpc/pseries: Track LMB nid instead of using device tree") Signed-off-by: Scott Cheloha <cheloha@linux.ibm.com> Reviewed-by: Nathan Lynch <nathanl@linux.ibm.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20200811015115.63677-1-cheloha@linux.ibm.com Signed-off-by: Sasha Levin <sashal@kernel.org>
2020-08-11 10:51:15 +09:00
struct memory_block *mem_block;
unsigned long block_sz;
powerpc/pseries: Track LMB nid instead of using device tree When removing memory we need to remove the memory from the node it was added to instead of looking up the node it should be in in the device tree. During testing we have seen scenarios where the affinity for a LMB changes due to a partition migration or PRRN event. In these cases the node the LMB exists in may not match the node the device tree indicates it belongs in. This can lead to a system crash when trying to DLPAR remove the LMB after a migration or PRRN event. The current code looks up the node in the device tree to remove the LMB from, the crash occurs when we try to offline this node and it does not have any data, i.e. node_data[nid] == NULL. 36:mon> e cpu 0x36: Vector: 300 (Data Access) at [c0000001828b7810] pc: c00000000036d08c: try_offline_node+0x2c/0x1b0 lr: c0000000003a14ec: remove_memory+0xbc/0x110 sp: c0000001828b7a90 msr: 800000000280b033 dar: 9a28 dsisr: 40000000 current = 0xc0000006329c4c80 paca = 0xc000000007a55200 softe: 0 irq_happened: 0x01 pid = 76926, comm = kworker/u320:3 36:mon> t [link register ] c0000000003a14ec remove_memory+0xbc/0x110 [c0000001828b7a90] c00000000006a1cc arch_remove_memory+0x9c/0xd0 (unreliable) [c0000001828b7ad0] c0000000003a14e0 remove_memory+0xb0/0x110 [c0000001828b7b20] c0000000000c7db4 dlpar_remove_lmb+0x94/0x160 [c0000001828b7b60] c0000000000c8ef8 dlpar_memory+0x7e8/0xd10 [c0000001828b7bf0] c0000000000bf828 handle_dlpar_errorlog+0xf8/0x160 [c0000001828b7c60] c0000000000bf8cc pseries_hp_work_fn+0x3c/0xa0 [c0000001828b7c90] c000000000128cd8 process_one_work+0x298/0x5a0 [c0000001828b7d20] c000000000129068 worker_thread+0x88/0x620 [c0000001828b7dc0] c00000000013223c kthread+0x1ac/0x1c0 [c0000001828b7e30] c00000000000b45c ret_from_kernel_thread+0x5c/0x80 To resolve this we need to track the node a LMB belongs to when it is added to the system so we can remove it from that node instead of the node that the device tree indicates it should belong to. Signed-off-by: Nathan Fontenot <nfont@linux.vnet.ibm.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2018-10-03 00:35:59 +09:00
int rc;
if (!lmb_is_removable(lmb))
return -EINVAL;
pseries/drmem: don't cache node id in drmem_lmb struct [ Upstream commit e5e179aa3a39c818db8fbc2dce8d2cd24adaf657 ] At memory hot-remove time we can retrieve an LMB's nid from its corresponding memory_block. There is no need to store the nid in multiple locations. Note that lmb_to_memblock() uses find_memory_block() to get the corresponding memory_block. As find_memory_block() runs in sub-linear time this approach is negligibly slower than what we do at present. In exchange for this lookup at hot-remove time we no longer need to call memory_add_physaddr_to_nid() during drmem_init() for each LMB. On powerpc, memory_add_physaddr_to_nid() is a linear search, so this spares us an O(n^2) initialization during boot. On systems with many LMBs that initialization overhead is palpable and disruptive. For example, on a box with 249854 LMBs we're seeing drmem_init() take upwards of 30 seconds to complete: [ 53.721639] drmem: initializing drmem v2 [ 80.604346] watchdog: BUG: soft lockup - CPU#65 stuck for 23s! [swapper/0:1] [ 80.604377] Modules linked in: [ 80.604389] CPU: 65 PID: 1 Comm: swapper/0 Not tainted 5.6.0-rc2+ #4 [ 80.604397] NIP: c0000000000a4980 LR: c0000000000a4940 CTR: 0000000000000000 [ 80.604407] REGS: c0002dbff8493830 TRAP: 0901 Not tainted (5.6.0-rc2+) [ 80.604412] MSR: 8000000002009033 <SF,VEC,EE,ME,IR,DR,RI,LE> CR: 44000248 XER: 0000000d [ 80.604431] CFAR: c0000000000a4a38 IRQMASK: 0 [ 80.604431] GPR00: c0000000000a4940 c0002dbff8493ac0 c000000001904400 c0003cfffffede30 [ 80.604431] GPR04: 0000000000000000 c000000000f4095a 000000000000002f 0000000010000000 [ 80.604431] GPR08: c0000bf7ecdb7fb8 c0000bf7ecc2d3c8 0000000000000008 c00c0002fdfb2001 [ 80.604431] GPR12: 0000000000000000 c00000001e8ec200 [ 80.604477] NIP [c0000000000a4980] hot_add_scn_to_nid+0xa0/0x3e0 [ 80.604486] LR [c0000000000a4940] hot_add_scn_to_nid+0x60/0x3e0 [ 80.604492] Call Trace: [ 80.604498] [c0002dbff8493ac0] [c0000000000a4940] hot_add_scn_to_nid+0x60/0x3e0 (unreliable) [ 80.604509] [c0002dbff8493b20] [c000000000087c10] memory_add_physaddr_to_nid+0x20/0x60 [ 80.604521] [c0002dbff8493b40] [c0000000010d4880] drmem_init+0x25c/0x2f0 [ 80.604530] [c0002dbff8493c10] [c000000000010154] do_one_initcall+0x64/0x2c0 [ 80.604540] [c0002dbff8493ce0] [c0000000010c4aa0] kernel_init_freeable+0x2d8/0x3a0 [ 80.604550] [c0002dbff8493db0] [c000000000010824] kernel_init+0x2c/0x148 [ 80.604560] [c0002dbff8493e20] [c00000000000b648] ret_from_kernel_thread+0x5c/0x74 [ 80.604567] Instruction dump: [ 80.604574] 392918e8 e9490000 e90a000a e92a0000 80ea000c 1d080018 3908ffe8 7d094214 [ 80.604586] 7fa94040 419d00dc e9490010 714a0088 <2faa0008> 409e00ac e9490000 7fbe5040 [ 89.047390] drmem: 249854 LMB(s) With a patched kernel on the same machine we're no longer seeing the soft lockup. drmem_init() now completes in negligible time, even when the LMB count is large. Fixes: b2d3b5ee66f2 ("powerpc/pseries: Track LMB nid instead of using device tree") Signed-off-by: Scott Cheloha <cheloha@linux.ibm.com> Reviewed-by: Nathan Lynch <nathanl@linux.ibm.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20200811015115.63677-1-cheloha@linux.ibm.com Signed-off-by: Sasha Levin <sashal@kernel.org>
2020-08-11 10:51:15 +09:00
mem_block = lmb_to_memblock(lmb);
if (mem_block == NULL)
return -EINVAL;
powerpc/pseries: Check memory device state before onlining/offlining When DLPAR adding or removing memory we need to check the device offline status before trying to online/offline the memory. This is needed because calls to device_online() and device_offline() will return non-zero for memory that is already online and offline respectively. This update resolves two scenarios. First, for a kernel built with auto-online memory enabled (CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE=y), memory will be onlined as part of calls to add_memory(). After adding the memory the pseries DLPAR code tries to online it and fails since the memory is already online. The DLPAR code then tries to remove the memory which produces the oops message below because the memory is not offline. The second scenario occurs when removing memory that is already offline, i.e. marking memory offline (via sysfs) and then trying to remove that memory. This doesn't work because offlining the already offline memory does not succeed and the DLPAR code then fails the DLPAR remove operation. The fix for both scenarios is to check the device.offline status before making the calls to device_online() or device_offline(). kernel BUG at mm/memory_hotplug.c:1936! ... NIP [c0000000002ca428] .remove_memory+0xb8/0xc0 LR [c0000000002ca3cc] .remove_memory+0x5c/0xc0 Call Trace: .remove_memory+0x5c/0xc0 (unreliable) .dlpar_add_lmb+0x384/0x400 .dlpar_memory+0x5dc/0xca0 .handle_dlpar_errorlog+0x74/0xe0 .pseries_hp_work_fn+0x2c/0x90 .process_one_work+0x17c/0x460 .worker_thread+0x88/0x500 .kthread+0x15c/0x1a0 .ret_from_kernel_thread+0x58/0xc0 Fixes: 943db62c316c ("powerpc/pseries: Revert 'Auto-online hotplugged memory'") Signed-off-by: Nathan Fontenot <nfont@linux.vnet.ibm.com> [mpe: Use bool, add explicit rc=0 case, change log typos & formatting] Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2017-08-03 03:03:22 +09:00
rc = dlpar_offline_lmb(lmb);
pseries/drmem: don't cache node id in drmem_lmb struct [ Upstream commit e5e179aa3a39c818db8fbc2dce8d2cd24adaf657 ] At memory hot-remove time we can retrieve an LMB's nid from its corresponding memory_block. There is no need to store the nid in multiple locations. Note that lmb_to_memblock() uses find_memory_block() to get the corresponding memory_block. As find_memory_block() runs in sub-linear time this approach is negligibly slower than what we do at present. In exchange for this lookup at hot-remove time we no longer need to call memory_add_physaddr_to_nid() during drmem_init() for each LMB. On powerpc, memory_add_physaddr_to_nid() is a linear search, so this spares us an O(n^2) initialization during boot. On systems with many LMBs that initialization overhead is palpable and disruptive. For example, on a box with 249854 LMBs we're seeing drmem_init() take upwards of 30 seconds to complete: [ 53.721639] drmem: initializing drmem v2 [ 80.604346] watchdog: BUG: soft lockup - CPU#65 stuck for 23s! [swapper/0:1] [ 80.604377] Modules linked in: [ 80.604389] CPU: 65 PID: 1 Comm: swapper/0 Not tainted 5.6.0-rc2+ #4 [ 80.604397] NIP: c0000000000a4980 LR: c0000000000a4940 CTR: 0000000000000000 [ 80.604407] REGS: c0002dbff8493830 TRAP: 0901 Not tainted (5.6.0-rc2+) [ 80.604412] MSR: 8000000002009033 <SF,VEC,EE,ME,IR,DR,RI,LE> CR: 44000248 XER: 0000000d [ 80.604431] CFAR: c0000000000a4a38 IRQMASK: 0 [ 80.604431] GPR00: c0000000000a4940 c0002dbff8493ac0 c000000001904400 c0003cfffffede30 [ 80.604431] GPR04: 0000000000000000 c000000000f4095a 000000000000002f 0000000010000000 [ 80.604431] GPR08: c0000bf7ecdb7fb8 c0000bf7ecc2d3c8 0000000000000008 c00c0002fdfb2001 [ 80.604431] GPR12: 0000000000000000 c00000001e8ec200 [ 80.604477] NIP [c0000000000a4980] hot_add_scn_to_nid+0xa0/0x3e0 [ 80.604486] LR [c0000000000a4940] hot_add_scn_to_nid+0x60/0x3e0 [ 80.604492] Call Trace: [ 80.604498] [c0002dbff8493ac0] [c0000000000a4940] hot_add_scn_to_nid+0x60/0x3e0 (unreliable) [ 80.604509] [c0002dbff8493b20] [c000000000087c10] memory_add_physaddr_to_nid+0x20/0x60 [ 80.604521] [c0002dbff8493b40] [c0000000010d4880] drmem_init+0x25c/0x2f0 [ 80.604530] [c0002dbff8493c10] [c000000000010154] do_one_initcall+0x64/0x2c0 [ 80.604540] [c0002dbff8493ce0] [c0000000010c4aa0] kernel_init_freeable+0x2d8/0x3a0 [ 80.604550] [c0002dbff8493db0] [c000000000010824] kernel_init+0x2c/0x148 [ 80.604560] [c0002dbff8493e20] [c00000000000b648] ret_from_kernel_thread+0x5c/0x74 [ 80.604567] Instruction dump: [ 80.604574] 392918e8 e9490000 e90a000a e92a0000 80ea000c 1d080018 3908ffe8 7d094214 [ 80.604586] 7fa94040 419d00dc e9490010 714a0088 <2faa0008> 409e00ac e9490000 7fbe5040 [ 89.047390] drmem: 249854 LMB(s) With a patched kernel on the same machine we're no longer seeing the soft lockup. drmem_init() now completes in negligible time, even when the LMB count is large. Fixes: b2d3b5ee66f2 ("powerpc/pseries: Track LMB nid instead of using device tree") Signed-off-by: Scott Cheloha <cheloha@linux.ibm.com> Reviewed-by: Nathan Lynch <nathanl@linux.ibm.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20200811015115.63677-1-cheloha@linux.ibm.com Signed-off-by: Sasha Levin <sashal@kernel.org>
2020-08-11 10:51:15 +09:00
if (rc) {
put_device(&mem_block->dev);
return rc;
pseries/drmem: don't cache node id in drmem_lmb struct [ Upstream commit e5e179aa3a39c818db8fbc2dce8d2cd24adaf657 ] At memory hot-remove time we can retrieve an LMB's nid from its corresponding memory_block. There is no need to store the nid in multiple locations. Note that lmb_to_memblock() uses find_memory_block() to get the corresponding memory_block. As find_memory_block() runs in sub-linear time this approach is negligibly slower than what we do at present. In exchange for this lookup at hot-remove time we no longer need to call memory_add_physaddr_to_nid() during drmem_init() for each LMB. On powerpc, memory_add_physaddr_to_nid() is a linear search, so this spares us an O(n^2) initialization during boot. On systems with many LMBs that initialization overhead is palpable and disruptive. For example, on a box with 249854 LMBs we're seeing drmem_init() take upwards of 30 seconds to complete: [ 53.721639] drmem: initializing drmem v2 [ 80.604346] watchdog: BUG: soft lockup - CPU#65 stuck for 23s! [swapper/0:1] [ 80.604377] Modules linked in: [ 80.604389] CPU: 65 PID: 1 Comm: swapper/0 Not tainted 5.6.0-rc2+ #4 [ 80.604397] NIP: c0000000000a4980 LR: c0000000000a4940 CTR: 0000000000000000 [ 80.604407] REGS: c0002dbff8493830 TRAP: 0901 Not tainted (5.6.0-rc2+) [ 80.604412] MSR: 8000000002009033 <SF,VEC,EE,ME,IR,DR,RI,LE> CR: 44000248 XER: 0000000d [ 80.604431] CFAR: c0000000000a4a38 IRQMASK: 0 [ 80.604431] GPR00: c0000000000a4940 c0002dbff8493ac0 c000000001904400 c0003cfffffede30 [ 80.604431] GPR04: 0000000000000000 c000000000f4095a 000000000000002f 0000000010000000 [ 80.604431] GPR08: c0000bf7ecdb7fb8 c0000bf7ecc2d3c8 0000000000000008 c00c0002fdfb2001 [ 80.604431] GPR12: 0000000000000000 c00000001e8ec200 [ 80.604477] NIP [c0000000000a4980] hot_add_scn_to_nid+0xa0/0x3e0 [ 80.604486] LR [c0000000000a4940] hot_add_scn_to_nid+0x60/0x3e0 [ 80.604492] Call Trace: [ 80.604498] [c0002dbff8493ac0] [c0000000000a4940] hot_add_scn_to_nid+0x60/0x3e0 (unreliable) [ 80.604509] [c0002dbff8493b20] [c000000000087c10] memory_add_physaddr_to_nid+0x20/0x60 [ 80.604521] [c0002dbff8493b40] [c0000000010d4880] drmem_init+0x25c/0x2f0 [ 80.604530] [c0002dbff8493c10] [c000000000010154] do_one_initcall+0x64/0x2c0 [ 80.604540] [c0002dbff8493ce0] [c0000000010c4aa0] kernel_init_freeable+0x2d8/0x3a0 [ 80.604550] [c0002dbff8493db0] [c000000000010824] kernel_init+0x2c/0x148 [ 80.604560] [c0002dbff8493e20] [c00000000000b648] ret_from_kernel_thread+0x5c/0x74 [ 80.604567] Instruction dump: [ 80.604574] 392918e8 e9490000 e90a000a e92a0000 80ea000c 1d080018 3908ffe8 7d094214 [ 80.604586] 7fa94040 419d00dc e9490010 714a0088 <2faa0008> 409e00ac e9490000 7fbe5040 [ 89.047390] drmem: 249854 LMB(s) With a patched kernel on the same machine we're no longer seeing the soft lockup. drmem_init() now completes in negligible time, even when the LMB count is large. Fixes: b2d3b5ee66f2 ("powerpc/pseries: Track LMB nid instead of using device tree") Signed-off-by: Scott Cheloha <cheloha@linux.ibm.com> Reviewed-by: Nathan Lynch <nathanl@linux.ibm.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20200811015115.63677-1-cheloha@linux.ibm.com Signed-off-by: Sasha Levin <sashal@kernel.org>
2020-08-11 10:51:15 +09:00
}
block_sz = pseries_memory_block_size();
pseries/drmem: don't cache node id in drmem_lmb struct [ Upstream commit e5e179aa3a39c818db8fbc2dce8d2cd24adaf657 ] At memory hot-remove time we can retrieve an LMB's nid from its corresponding memory_block. There is no need to store the nid in multiple locations. Note that lmb_to_memblock() uses find_memory_block() to get the corresponding memory_block. As find_memory_block() runs in sub-linear time this approach is negligibly slower than what we do at present. In exchange for this lookup at hot-remove time we no longer need to call memory_add_physaddr_to_nid() during drmem_init() for each LMB. On powerpc, memory_add_physaddr_to_nid() is a linear search, so this spares us an O(n^2) initialization during boot. On systems with many LMBs that initialization overhead is palpable and disruptive. For example, on a box with 249854 LMBs we're seeing drmem_init() take upwards of 30 seconds to complete: [ 53.721639] drmem: initializing drmem v2 [ 80.604346] watchdog: BUG: soft lockup - CPU#65 stuck for 23s! [swapper/0:1] [ 80.604377] Modules linked in: [ 80.604389] CPU: 65 PID: 1 Comm: swapper/0 Not tainted 5.6.0-rc2+ #4 [ 80.604397] NIP: c0000000000a4980 LR: c0000000000a4940 CTR: 0000000000000000 [ 80.604407] REGS: c0002dbff8493830 TRAP: 0901 Not tainted (5.6.0-rc2+) [ 80.604412] MSR: 8000000002009033 <SF,VEC,EE,ME,IR,DR,RI,LE> CR: 44000248 XER: 0000000d [ 80.604431] CFAR: c0000000000a4a38 IRQMASK: 0 [ 80.604431] GPR00: c0000000000a4940 c0002dbff8493ac0 c000000001904400 c0003cfffffede30 [ 80.604431] GPR04: 0000000000000000 c000000000f4095a 000000000000002f 0000000010000000 [ 80.604431] GPR08: c0000bf7ecdb7fb8 c0000bf7ecc2d3c8 0000000000000008 c00c0002fdfb2001 [ 80.604431] GPR12: 0000000000000000 c00000001e8ec200 [ 80.604477] NIP [c0000000000a4980] hot_add_scn_to_nid+0xa0/0x3e0 [ 80.604486] LR [c0000000000a4940] hot_add_scn_to_nid+0x60/0x3e0 [ 80.604492] Call Trace: [ 80.604498] [c0002dbff8493ac0] [c0000000000a4940] hot_add_scn_to_nid+0x60/0x3e0 (unreliable) [ 80.604509] [c0002dbff8493b20] [c000000000087c10] memory_add_physaddr_to_nid+0x20/0x60 [ 80.604521] [c0002dbff8493b40] [c0000000010d4880] drmem_init+0x25c/0x2f0 [ 80.604530] [c0002dbff8493c10] [c000000000010154] do_one_initcall+0x64/0x2c0 [ 80.604540] [c0002dbff8493ce0] [c0000000010c4aa0] kernel_init_freeable+0x2d8/0x3a0 [ 80.604550] [c0002dbff8493db0] [c000000000010824] kernel_init+0x2c/0x148 [ 80.604560] [c0002dbff8493e20] [c00000000000b648] ret_from_kernel_thread+0x5c/0x74 [ 80.604567] Instruction dump: [ 80.604574] 392918e8 e9490000 e90a000a e92a0000 80ea000c 1d080018 3908ffe8 7d094214 [ 80.604586] 7fa94040 419d00dc e9490010 714a0088 <2faa0008> 409e00ac e9490000 7fbe5040 [ 89.047390] drmem: 249854 LMB(s) With a patched kernel on the same machine we're no longer seeing the soft lockup. drmem_init() now completes in negligible time, even when the LMB count is large. Fixes: b2d3b5ee66f2 ("powerpc/pseries: Track LMB nid instead of using device tree") Signed-off-by: Scott Cheloha <cheloha@linux.ibm.com> Reviewed-by: Nathan Lynch <nathanl@linux.ibm.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20200811015115.63677-1-cheloha@linux.ibm.com Signed-off-by: Sasha Levin <sashal@kernel.org>
2020-08-11 10:51:15 +09:00
__remove_memory(mem_block->nid, lmb->base_addr, block_sz);
put_device(&mem_block->dev);
/* Update memory regions for memory remove */
memblock_remove(lmb->base_addr, block_sz);
invalidate_lmb_associativity_index(lmb);
lmb->flags &= ~DRCONF_MEM_ASSIGNED;
return 0;
}
static int dlpar_memory_remove_by_count(u32 lmbs_to_remove)
{
struct drmem_lmb *lmb;
int lmbs_removed = 0;
int lmbs_available = 0;
int rc;
pr_info("Attempting to hot-remove %d LMB(s)\n", lmbs_to_remove);
if (lmbs_to_remove == 0)
return -EINVAL;
/* Validate that there are enough LMBs to satisfy the request */
for_each_drmem_lmb(lmb) {
if (lmb_is_removable(lmb))
lmbs_available++;
if (lmbs_available == lmbs_to_remove)
break;
}
if (lmbs_available < lmbs_to_remove) {
pr_info("Not enough LMBs available (%d of %d) to satisfy request\n",
lmbs_available, lmbs_to_remove);
return -EINVAL;
}
for_each_drmem_lmb(lmb) {
rc = dlpar_remove_lmb(lmb);
if (rc)
continue;
/* Mark this lmb so we can add it later if all of the
* requested LMBs cannot be removed.
*/
drmem_mark_lmb_reserved(lmb);
lmbs_removed++;
if (lmbs_removed == lmbs_to_remove)
break;
}
if (lmbs_removed != lmbs_to_remove) {
pr_err("Memory hot-remove failed, adding LMB's back\n");
for_each_drmem_lmb(lmb) {
if (!drmem_lmb_reserved(lmb))
continue;
rc = dlpar_add_lmb(lmb);
if (rc)
pr_err("Failed to add LMB back, drc index %x\n",
lmb->drc_index);
drmem_remove_lmb_reservation(lmb);
}
rc = -EINVAL;
} else {
for_each_drmem_lmb(lmb) {
if (!drmem_lmb_reserved(lmb))
continue;
dlpar_release_drc(lmb->drc_index);
pr_info("Memory at %llx was hot-removed\n",
lmb->base_addr);
drmem_remove_lmb_reservation(lmb);
}
rc = 0;
}
return rc;
}
static int dlpar_memory_remove_by_index(u32 drc_index)
{
struct drmem_lmb *lmb;
int lmb_found;
int rc;
pr_info("Attempting to hot-remove LMB, drc index %x\n", drc_index);
lmb_found = 0;
for_each_drmem_lmb(lmb) {
if (lmb->drc_index == drc_index) {
lmb_found = 1;
rc = dlpar_remove_lmb(lmb);
if (!rc)
dlpar_release_drc(lmb->drc_index);
break;
}
}
if (!lmb_found)
rc = -EINVAL;
if (rc)
pr_info("Failed to hot-remove memory at %llx\n",
lmb->base_addr);
else
pr_info("Memory at %llx was hot-removed\n", lmb->base_addr);
return rc;
}
static int dlpar_memory_readd_by_index(u32 drc_index)
{
struct drmem_lmb *lmb;
int lmb_found;
int rc;
pr_info("Attempting to update LMB, drc index %x\n", drc_index);
lmb_found = 0;
for_each_drmem_lmb(lmb) {
if (lmb->drc_index == drc_index) {
lmb_found = 1;
rc = dlpar_remove_lmb(lmb);
if (!rc) {
rc = dlpar_add_lmb(lmb);
if (rc)
dlpar_release_drc(lmb->drc_index);
}
break;
}
}
if (!lmb_found)
rc = -EINVAL;
if (rc)
pr_info("Failed to update memory at %llx\n",
lmb->base_addr);
else
pr_info("Memory at %llx was updated\n", lmb->base_addr);
return rc;
}
static int dlpar_memory_remove_by_ic(u32 lmbs_to_remove, u32 drc_index)
{
struct drmem_lmb *lmb, *start_lmb, *end_lmb;
int lmbs_available = 0;
int rc;
pr_info("Attempting to hot-remove %u LMB(s) at %x\n",
lmbs_to_remove, drc_index);
if (lmbs_to_remove == 0)
return -EINVAL;
rc = get_lmb_range(drc_index, lmbs_to_remove, &start_lmb, &end_lmb);
if (rc)
return -EINVAL;
/* Validate that there are enough LMBs to satisfy the request */
for_each_drmem_lmb_in_range(lmb, start_lmb, end_lmb) {
if (lmb->flags & DRCONF_MEM_RESERVED)
break;
lmbs_available++;
}
if (lmbs_available < lmbs_to_remove)
return -EINVAL;
for_each_drmem_lmb_in_range(lmb, start_lmb, end_lmb) {
if (!(lmb->flags & DRCONF_MEM_ASSIGNED))
continue;
rc = dlpar_remove_lmb(lmb);
if (rc)
break;
drmem_mark_lmb_reserved(lmb);
}
if (rc) {
pr_err("Memory indexed-count-remove failed, adding any removed LMBs\n");
for_each_drmem_lmb_in_range(lmb, start_lmb, end_lmb) {
if (!drmem_lmb_reserved(lmb))
continue;
rc = dlpar_add_lmb(lmb);
if (rc)
pr_err("Failed to add LMB, drc index %x\n",
lmb->drc_index);
drmem_remove_lmb_reservation(lmb);
}
rc = -EINVAL;
} else {
for_each_drmem_lmb_in_range(lmb, start_lmb, end_lmb) {
if (!drmem_lmb_reserved(lmb))
continue;
dlpar_release_drc(lmb->drc_index);
pr_info("Memory at %llx (drc index %x) was hot-removed\n",
lmb->base_addr, lmb->drc_index);
drmem_remove_lmb_reservation(lmb);
}
}
return rc;
}
#else
static inline int pseries_remove_memblock(unsigned long base,
unsigned long memblock_size)
{
return -EOPNOTSUPP;
}
static inline int pseries_remove_mem_node(struct device_node *np)
{
return 0;
}
static inline int dlpar_memory_remove(struct pseries_hp_errorlog *hp_elog)
{
return -EOPNOTSUPP;
}
static int dlpar_remove_lmb(struct drmem_lmb *lmb)
{
return -EOPNOTSUPP;
}
static int dlpar_memory_remove_by_count(u32 lmbs_to_remove)
{
return -EOPNOTSUPP;
}
static int dlpar_memory_remove_by_index(u32 drc_index)
{
return -EOPNOTSUPP;
}
static int dlpar_memory_readd_by_index(u32 drc_index)
{
return -EOPNOTSUPP;
}
static int dlpar_memory_remove_by_ic(u32 lmbs_to_remove, u32 drc_index)
{
return -EOPNOTSUPP;
}
#endif /* CONFIG_MEMORY_HOTREMOVE */
static int dlpar_add_lmb(struct drmem_lmb *lmb)
{
unsigned long block_sz;
pseries/drmem: don't cache node id in drmem_lmb struct [ Upstream commit e5e179aa3a39c818db8fbc2dce8d2cd24adaf657 ] At memory hot-remove time we can retrieve an LMB's nid from its corresponding memory_block. There is no need to store the nid in multiple locations. Note that lmb_to_memblock() uses find_memory_block() to get the corresponding memory_block. As find_memory_block() runs in sub-linear time this approach is negligibly slower than what we do at present. In exchange for this lookup at hot-remove time we no longer need to call memory_add_physaddr_to_nid() during drmem_init() for each LMB. On powerpc, memory_add_physaddr_to_nid() is a linear search, so this spares us an O(n^2) initialization during boot. On systems with many LMBs that initialization overhead is palpable and disruptive. For example, on a box with 249854 LMBs we're seeing drmem_init() take upwards of 30 seconds to complete: [ 53.721639] drmem: initializing drmem v2 [ 80.604346] watchdog: BUG: soft lockup - CPU#65 stuck for 23s! [swapper/0:1] [ 80.604377] Modules linked in: [ 80.604389] CPU: 65 PID: 1 Comm: swapper/0 Not tainted 5.6.0-rc2+ #4 [ 80.604397] NIP: c0000000000a4980 LR: c0000000000a4940 CTR: 0000000000000000 [ 80.604407] REGS: c0002dbff8493830 TRAP: 0901 Not tainted (5.6.0-rc2+) [ 80.604412] MSR: 8000000002009033 <SF,VEC,EE,ME,IR,DR,RI,LE> CR: 44000248 XER: 0000000d [ 80.604431] CFAR: c0000000000a4a38 IRQMASK: 0 [ 80.604431] GPR00: c0000000000a4940 c0002dbff8493ac0 c000000001904400 c0003cfffffede30 [ 80.604431] GPR04: 0000000000000000 c000000000f4095a 000000000000002f 0000000010000000 [ 80.604431] GPR08: c0000bf7ecdb7fb8 c0000bf7ecc2d3c8 0000000000000008 c00c0002fdfb2001 [ 80.604431] GPR12: 0000000000000000 c00000001e8ec200 [ 80.604477] NIP [c0000000000a4980] hot_add_scn_to_nid+0xa0/0x3e0 [ 80.604486] LR [c0000000000a4940] hot_add_scn_to_nid+0x60/0x3e0 [ 80.604492] Call Trace: [ 80.604498] [c0002dbff8493ac0] [c0000000000a4940] hot_add_scn_to_nid+0x60/0x3e0 (unreliable) [ 80.604509] [c0002dbff8493b20] [c000000000087c10] memory_add_physaddr_to_nid+0x20/0x60 [ 80.604521] [c0002dbff8493b40] [c0000000010d4880] drmem_init+0x25c/0x2f0 [ 80.604530] [c0002dbff8493c10] [c000000000010154] do_one_initcall+0x64/0x2c0 [ 80.604540] [c0002dbff8493ce0] [c0000000010c4aa0] kernel_init_freeable+0x2d8/0x3a0 [ 80.604550] [c0002dbff8493db0] [c000000000010824] kernel_init+0x2c/0x148 [ 80.604560] [c0002dbff8493e20] [c00000000000b648] ret_from_kernel_thread+0x5c/0x74 [ 80.604567] Instruction dump: [ 80.604574] 392918e8 e9490000 e90a000a e92a0000 80ea000c 1d080018 3908ffe8 7d094214 [ 80.604586] 7fa94040 419d00dc e9490010 714a0088 <2faa0008> 409e00ac e9490000 7fbe5040 [ 89.047390] drmem: 249854 LMB(s) With a patched kernel on the same machine we're no longer seeing the soft lockup. drmem_init() now completes in negligible time, even when the LMB count is large. Fixes: b2d3b5ee66f2 ("powerpc/pseries: Track LMB nid instead of using device tree") Signed-off-by: Scott Cheloha <cheloha@linux.ibm.com> Reviewed-by: Nathan Lynch <nathanl@linux.ibm.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20200811015115.63677-1-cheloha@linux.ibm.com Signed-off-by: Sasha Levin <sashal@kernel.org>
2020-08-11 10:51:15 +09:00
int nid, rc;
if (lmb->flags & DRCONF_MEM_ASSIGNED)
return -EINVAL;
rc = update_lmb_associativity_index(lmb);
if (rc) {
dlpar_release_drc(lmb->drc_index);
return rc;
}
block_sz = memory_block_size_bytes();
pseries/drmem: don't cache node id in drmem_lmb struct [ Upstream commit e5e179aa3a39c818db8fbc2dce8d2cd24adaf657 ] At memory hot-remove time we can retrieve an LMB's nid from its corresponding memory_block. There is no need to store the nid in multiple locations. Note that lmb_to_memblock() uses find_memory_block() to get the corresponding memory_block. As find_memory_block() runs in sub-linear time this approach is negligibly slower than what we do at present. In exchange for this lookup at hot-remove time we no longer need to call memory_add_physaddr_to_nid() during drmem_init() for each LMB. On powerpc, memory_add_physaddr_to_nid() is a linear search, so this spares us an O(n^2) initialization during boot. On systems with many LMBs that initialization overhead is palpable and disruptive. For example, on a box with 249854 LMBs we're seeing drmem_init() take upwards of 30 seconds to complete: [ 53.721639] drmem: initializing drmem v2 [ 80.604346] watchdog: BUG: soft lockup - CPU#65 stuck for 23s! [swapper/0:1] [ 80.604377] Modules linked in: [ 80.604389] CPU: 65 PID: 1 Comm: swapper/0 Not tainted 5.6.0-rc2+ #4 [ 80.604397] NIP: c0000000000a4980 LR: c0000000000a4940 CTR: 0000000000000000 [ 80.604407] REGS: c0002dbff8493830 TRAP: 0901 Not tainted (5.6.0-rc2+) [ 80.604412] MSR: 8000000002009033 <SF,VEC,EE,ME,IR,DR,RI,LE> CR: 44000248 XER: 0000000d [ 80.604431] CFAR: c0000000000a4a38 IRQMASK: 0 [ 80.604431] GPR00: c0000000000a4940 c0002dbff8493ac0 c000000001904400 c0003cfffffede30 [ 80.604431] GPR04: 0000000000000000 c000000000f4095a 000000000000002f 0000000010000000 [ 80.604431] GPR08: c0000bf7ecdb7fb8 c0000bf7ecc2d3c8 0000000000000008 c00c0002fdfb2001 [ 80.604431] GPR12: 0000000000000000 c00000001e8ec200 [ 80.604477] NIP [c0000000000a4980] hot_add_scn_to_nid+0xa0/0x3e0 [ 80.604486] LR [c0000000000a4940] hot_add_scn_to_nid+0x60/0x3e0 [ 80.604492] Call Trace: [ 80.604498] [c0002dbff8493ac0] [c0000000000a4940] hot_add_scn_to_nid+0x60/0x3e0 (unreliable) [ 80.604509] [c0002dbff8493b20] [c000000000087c10] memory_add_physaddr_to_nid+0x20/0x60 [ 80.604521] [c0002dbff8493b40] [c0000000010d4880] drmem_init+0x25c/0x2f0 [ 80.604530] [c0002dbff8493c10] [c000000000010154] do_one_initcall+0x64/0x2c0 [ 80.604540] [c0002dbff8493ce0] [c0000000010c4aa0] kernel_init_freeable+0x2d8/0x3a0 [ 80.604550] [c0002dbff8493db0] [c000000000010824] kernel_init+0x2c/0x148 [ 80.604560] [c0002dbff8493e20] [c00000000000b648] ret_from_kernel_thread+0x5c/0x74 [ 80.604567] Instruction dump: [ 80.604574] 392918e8 e9490000 e90a000a e92a0000 80ea000c 1d080018 3908ffe8 7d094214 [ 80.604586] 7fa94040 419d00dc e9490010 714a0088 <2faa0008> 409e00ac e9490000 7fbe5040 [ 89.047390] drmem: 249854 LMB(s) With a patched kernel on the same machine we're no longer seeing the soft lockup. drmem_init() now completes in negligible time, even when the LMB count is large. Fixes: b2d3b5ee66f2 ("powerpc/pseries: Track LMB nid instead of using device tree") Signed-off-by: Scott Cheloha <cheloha@linux.ibm.com> Reviewed-by: Nathan Lynch <nathanl@linux.ibm.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20200811015115.63677-1-cheloha@linux.ibm.com Signed-off-by: Sasha Levin <sashal@kernel.org>
2020-08-11 10:51:15 +09:00
/* Find the node id for this address. */
nid = memory_add_physaddr_to_nid(lmb->base_addr);
/* Add the memory */
pseries/drmem: don't cache node id in drmem_lmb struct [ Upstream commit e5e179aa3a39c818db8fbc2dce8d2cd24adaf657 ] At memory hot-remove time we can retrieve an LMB's nid from its corresponding memory_block. There is no need to store the nid in multiple locations. Note that lmb_to_memblock() uses find_memory_block() to get the corresponding memory_block. As find_memory_block() runs in sub-linear time this approach is negligibly slower than what we do at present. In exchange for this lookup at hot-remove time we no longer need to call memory_add_physaddr_to_nid() during drmem_init() for each LMB. On powerpc, memory_add_physaddr_to_nid() is a linear search, so this spares us an O(n^2) initialization during boot. On systems with many LMBs that initialization overhead is palpable and disruptive. For example, on a box with 249854 LMBs we're seeing drmem_init() take upwards of 30 seconds to complete: [ 53.721639] drmem: initializing drmem v2 [ 80.604346] watchdog: BUG: soft lockup - CPU#65 stuck for 23s! [swapper/0:1] [ 80.604377] Modules linked in: [ 80.604389] CPU: 65 PID: 1 Comm: swapper/0 Not tainted 5.6.0-rc2+ #4 [ 80.604397] NIP: c0000000000a4980 LR: c0000000000a4940 CTR: 0000000000000000 [ 80.604407] REGS: c0002dbff8493830 TRAP: 0901 Not tainted (5.6.0-rc2+) [ 80.604412] MSR: 8000000002009033 <SF,VEC,EE,ME,IR,DR,RI,LE> CR: 44000248 XER: 0000000d [ 80.604431] CFAR: c0000000000a4a38 IRQMASK: 0 [ 80.604431] GPR00: c0000000000a4940 c0002dbff8493ac0 c000000001904400 c0003cfffffede30 [ 80.604431] GPR04: 0000000000000000 c000000000f4095a 000000000000002f 0000000010000000 [ 80.604431] GPR08: c0000bf7ecdb7fb8 c0000bf7ecc2d3c8 0000000000000008 c00c0002fdfb2001 [ 80.604431] GPR12: 0000000000000000 c00000001e8ec200 [ 80.604477] NIP [c0000000000a4980] hot_add_scn_to_nid+0xa0/0x3e0 [ 80.604486] LR [c0000000000a4940] hot_add_scn_to_nid+0x60/0x3e0 [ 80.604492] Call Trace: [ 80.604498] [c0002dbff8493ac0] [c0000000000a4940] hot_add_scn_to_nid+0x60/0x3e0 (unreliable) [ 80.604509] [c0002dbff8493b20] [c000000000087c10] memory_add_physaddr_to_nid+0x20/0x60 [ 80.604521] [c0002dbff8493b40] [c0000000010d4880] drmem_init+0x25c/0x2f0 [ 80.604530] [c0002dbff8493c10] [c000000000010154] do_one_initcall+0x64/0x2c0 [ 80.604540] [c0002dbff8493ce0] [c0000000010c4aa0] kernel_init_freeable+0x2d8/0x3a0 [ 80.604550] [c0002dbff8493db0] [c000000000010824] kernel_init+0x2c/0x148 [ 80.604560] [c0002dbff8493e20] [c00000000000b648] ret_from_kernel_thread+0x5c/0x74 [ 80.604567] Instruction dump: [ 80.604574] 392918e8 e9490000 e90a000a e92a0000 80ea000c 1d080018 3908ffe8 7d094214 [ 80.604586] 7fa94040 419d00dc e9490010 714a0088 <2faa0008> 409e00ac e9490000 7fbe5040 [ 89.047390] drmem: 249854 LMB(s) With a patched kernel on the same machine we're no longer seeing the soft lockup. drmem_init() now completes in negligible time, even when the LMB count is large. Fixes: b2d3b5ee66f2 ("powerpc/pseries: Track LMB nid instead of using device tree") Signed-off-by: Scott Cheloha <cheloha@linux.ibm.com> Reviewed-by: Nathan Lynch <nathanl@linux.ibm.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20200811015115.63677-1-cheloha@linux.ibm.com Signed-off-by: Sasha Levin <sashal@kernel.org>
2020-08-11 10:51:15 +09:00
rc = __add_memory(nid, lmb->base_addr, block_sz);
if (rc) {
invalidate_lmb_associativity_index(lmb);
return rc;
}
rc = dlpar_online_lmb(lmb);
if (rc) {
pseries/drmem: don't cache node id in drmem_lmb struct [ Upstream commit e5e179aa3a39c818db8fbc2dce8d2cd24adaf657 ] At memory hot-remove time we can retrieve an LMB's nid from its corresponding memory_block. There is no need to store the nid in multiple locations. Note that lmb_to_memblock() uses find_memory_block() to get the corresponding memory_block. As find_memory_block() runs in sub-linear time this approach is negligibly slower than what we do at present. In exchange for this lookup at hot-remove time we no longer need to call memory_add_physaddr_to_nid() during drmem_init() for each LMB. On powerpc, memory_add_physaddr_to_nid() is a linear search, so this spares us an O(n^2) initialization during boot. On systems with many LMBs that initialization overhead is palpable and disruptive. For example, on a box with 249854 LMBs we're seeing drmem_init() take upwards of 30 seconds to complete: [ 53.721639] drmem: initializing drmem v2 [ 80.604346] watchdog: BUG: soft lockup - CPU#65 stuck for 23s! [swapper/0:1] [ 80.604377] Modules linked in: [ 80.604389] CPU: 65 PID: 1 Comm: swapper/0 Not tainted 5.6.0-rc2+ #4 [ 80.604397] NIP: c0000000000a4980 LR: c0000000000a4940 CTR: 0000000000000000 [ 80.604407] REGS: c0002dbff8493830 TRAP: 0901 Not tainted (5.6.0-rc2+) [ 80.604412] MSR: 8000000002009033 <SF,VEC,EE,ME,IR,DR,RI,LE> CR: 44000248 XER: 0000000d [ 80.604431] CFAR: c0000000000a4a38 IRQMASK: 0 [ 80.604431] GPR00: c0000000000a4940 c0002dbff8493ac0 c000000001904400 c0003cfffffede30 [ 80.604431] GPR04: 0000000000000000 c000000000f4095a 000000000000002f 0000000010000000 [ 80.604431] GPR08: c0000bf7ecdb7fb8 c0000bf7ecc2d3c8 0000000000000008 c00c0002fdfb2001 [ 80.604431] GPR12: 0000000000000000 c00000001e8ec200 [ 80.604477] NIP [c0000000000a4980] hot_add_scn_to_nid+0xa0/0x3e0 [ 80.604486] LR [c0000000000a4940] hot_add_scn_to_nid+0x60/0x3e0 [ 80.604492] Call Trace: [ 80.604498] [c0002dbff8493ac0] [c0000000000a4940] hot_add_scn_to_nid+0x60/0x3e0 (unreliable) [ 80.604509] [c0002dbff8493b20] [c000000000087c10] memory_add_physaddr_to_nid+0x20/0x60 [ 80.604521] [c0002dbff8493b40] [c0000000010d4880] drmem_init+0x25c/0x2f0 [ 80.604530] [c0002dbff8493c10] [c000000000010154] do_one_initcall+0x64/0x2c0 [ 80.604540] [c0002dbff8493ce0] [c0000000010c4aa0] kernel_init_freeable+0x2d8/0x3a0 [ 80.604550] [c0002dbff8493db0] [c000000000010824] kernel_init+0x2c/0x148 [ 80.604560] [c0002dbff8493e20] [c00000000000b648] ret_from_kernel_thread+0x5c/0x74 [ 80.604567] Instruction dump: [ 80.604574] 392918e8 e9490000 e90a000a e92a0000 80ea000c 1d080018 3908ffe8 7d094214 [ 80.604586] 7fa94040 419d00dc e9490010 714a0088 <2faa0008> 409e00ac e9490000 7fbe5040 [ 89.047390] drmem: 249854 LMB(s) With a patched kernel on the same machine we're no longer seeing the soft lockup. drmem_init() now completes in negligible time, even when the LMB count is large. Fixes: b2d3b5ee66f2 ("powerpc/pseries: Track LMB nid instead of using device tree") Signed-off-by: Scott Cheloha <cheloha@linux.ibm.com> Reviewed-by: Nathan Lynch <nathanl@linux.ibm.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20200811015115.63677-1-cheloha@linux.ibm.com Signed-off-by: Sasha Levin <sashal@kernel.org>
2020-08-11 10:51:15 +09:00
__remove_memory(nid, lmb->base_addr, block_sz);
invalidate_lmb_associativity_index(lmb);
} else {
lmb->flags |= DRCONF_MEM_ASSIGNED;
}
return rc;
}
static int dlpar_memory_add_by_count(u32 lmbs_to_add)
{
struct drmem_lmb *lmb;
int lmbs_available = 0;
int lmbs_added = 0;
int rc;
pr_info("Attempting to hot-add %d LMB(s)\n", lmbs_to_add);
if (lmbs_to_add == 0)
return -EINVAL;
/* Validate that there are enough LMBs to satisfy the request */
for_each_drmem_lmb(lmb) {
if (!(lmb->flags & DRCONF_MEM_ASSIGNED))
lmbs_available++;
if (lmbs_available == lmbs_to_add)
break;
}
if (lmbs_available < lmbs_to_add)
return -EINVAL;
for_each_drmem_lmb(lmb) {
if (lmb->flags & DRCONF_MEM_ASSIGNED)
continue;
rc = dlpar_acquire_drc(lmb->drc_index);
if (rc)
continue;
rc = dlpar_add_lmb(lmb);
if (rc) {
dlpar_release_drc(lmb->drc_index);
continue;
}
/* Mark this lmb so we can remove it later if all of the
* requested LMBs cannot be added.
*/
drmem_mark_lmb_reserved(lmb);
lmbs_added++;
if (lmbs_added == lmbs_to_add)
break;
}
if (lmbs_added != lmbs_to_add) {
pr_err("Memory hot-add failed, removing any added LMBs\n");
for_each_drmem_lmb(lmb) {
if (!drmem_lmb_reserved(lmb))
continue;
rc = dlpar_remove_lmb(lmb);
if (rc)
pr_err("Failed to remove LMB, drc index %x\n",
lmb->drc_index);
else
dlpar_release_drc(lmb->drc_index);
drmem_remove_lmb_reservation(lmb);
}
rc = -EINVAL;
} else {
for_each_drmem_lmb(lmb) {
if (!drmem_lmb_reserved(lmb))
continue;
pr_info("Memory at %llx (drc index %x) was hot-added\n",
lmb->base_addr, lmb->drc_index);
drmem_remove_lmb_reservation(lmb);
}
rc = 0;
}
return rc;
}
static int dlpar_memory_add_by_index(u32 drc_index)
{
struct drmem_lmb *lmb;
int rc, lmb_found;
pr_info("Attempting to hot-add LMB, drc index %x\n", drc_index);
lmb_found = 0;
for_each_drmem_lmb(lmb) {
if (lmb->drc_index == drc_index) {
lmb_found = 1;
rc = dlpar_acquire_drc(lmb->drc_index);
if (!rc) {
rc = dlpar_add_lmb(lmb);
if (rc)
dlpar_release_drc(lmb->drc_index);
}
break;
}
}
if (!lmb_found)
rc = -EINVAL;
if (rc)
pr_info("Failed to hot-add memory, drc index %x\n", drc_index);
else
pr_info("Memory at %llx (drc index %x) was hot-added\n",
lmb->base_addr, drc_index);
return rc;
}
static int dlpar_memory_add_by_ic(u32 lmbs_to_add, u32 drc_index)
{
struct drmem_lmb *lmb, *start_lmb, *end_lmb;
int lmbs_available = 0;
int rc;
pr_info("Attempting to hot-add %u LMB(s) at index %x\n",
lmbs_to_add, drc_index);
if (lmbs_to_add == 0)
return -EINVAL;
rc = get_lmb_range(drc_index, lmbs_to_add, &start_lmb, &end_lmb);
if (rc)
return -EINVAL;
/* Validate that the LMBs in this range are not reserved */
for_each_drmem_lmb_in_range(lmb, start_lmb, end_lmb) {
if (lmb->flags & DRCONF_MEM_RESERVED)
break;
lmbs_available++;
}
if (lmbs_available < lmbs_to_add)
return -EINVAL;
for_each_drmem_lmb_in_range(lmb, start_lmb, end_lmb) {
if (lmb->flags & DRCONF_MEM_ASSIGNED)
continue;
rc = dlpar_acquire_drc(lmb->drc_index);
if (rc)
break;
rc = dlpar_add_lmb(lmb);
if (rc) {
dlpar_release_drc(lmb->drc_index);
break;
}
drmem_mark_lmb_reserved(lmb);
}
if (rc) {
pr_err("Memory indexed-count-add failed, removing any added LMBs\n");
for_each_drmem_lmb_in_range(lmb, start_lmb, end_lmb) {
if (!drmem_lmb_reserved(lmb))
continue;
rc = dlpar_remove_lmb(lmb);
if (rc)
pr_err("Failed to remove LMB, drc index %x\n",
lmb->drc_index);
else
dlpar_release_drc(lmb->drc_index);
drmem_remove_lmb_reservation(lmb);
}
rc = -EINVAL;
} else {
for_each_drmem_lmb_in_range(lmb, start_lmb, end_lmb) {
if (!drmem_lmb_reserved(lmb))
continue;
pr_info("Memory at %llx (drc index %x) was hot-added\n",
lmb->base_addr, lmb->drc_index);
drmem_remove_lmb_reservation(lmb);
}
}
return rc;
}
powerpc/pseries: Create new device hotplug entry point The current hotplug (or dlpar) of devices (the process is generally the same for memory, cpu, and pci) on PowerVM systems is initiated from the HMC, which communicates the request to the partitions through the RSCT framework. The RSCT framework then invokes the drmgr command. The drmgr command performs the hotplug operation by doing some pieces, such as most of the rtas calls and device tree parsing, in userspace and make requests to the kernel to online/offline the device, update the device tree and add/remove the device. For PowerKVM the approach for device hotplug is to follow what is currently being done for pci hotplug. A hotplug request is initiated from the host. QEMU then generates an EPOW interrupt to the guest which causes the guest to make the rtas,check-exception call. In QEMU, the rtas,check-exception call returns a rtas hotplug event to the guest. Please note that the current pci hotplug path for PowerKVM involves the kernel receiving the rtas hotplug event, passing it to rtas_errd in userspace, and having rtas_errd invoke drmgr. The drmgr command then handles the request as described above for PowerVM systems. There is no need for this circuitous route, we should just handle the entire hotplug of devices in the kernel. What I am planning is to enable this by moving the code to handle hotplug from drmgr into the kernel to provide a single path for handling device hotplug for both PowerVM and PowerKVM systems. This patch provides the common iframework and entry point. For PowerKVM a future update to the kernel rtas code will recognize rtas hotplug events returned from rtas,check-exception calls and use the common entry point to handle hotplug of the device. For PowerVM systems, This patch creates /sys/kernel/dlpar that can be used by the drmgr command to initiate hotplug requests. In order to do this a string of the format "<resource> <action> <id_type> <id>" is written to this file. The string consists of a resource (cpu, memory, pci, phb), an action (add or remove), an id_type (count, drc index, drc name), and the corresponding id. The kernel will parse the string and create a rtas hotplug section that can be passed to the common entry point for handling hotplug requests. It should be noted that there is no chance of updating how we receive hotplug (dlpar) requests from the HMC on PowerVM systems. Signed-off-by: Nathan Fontenot <nfont@linux.vnet.ibm.com> Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
2015-02-11 04:47:02 +09:00
int dlpar_memory(struct pseries_hp_errorlog *hp_elog)
{
u32 count, drc_index;
int rc;
powerpc/pseries: Create new device hotplug entry point The current hotplug (or dlpar) of devices (the process is generally the same for memory, cpu, and pci) on PowerVM systems is initiated from the HMC, which communicates the request to the partitions through the RSCT framework. The RSCT framework then invokes the drmgr command. The drmgr command performs the hotplug operation by doing some pieces, such as most of the rtas calls and device tree parsing, in userspace and make requests to the kernel to online/offline the device, update the device tree and add/remove the device. For PowerKVM the approach for device hotplug is to follow what is currently being done for pci hotplug. A hotplug request is initiated from the host. QEMU then generates an EPOW interrupt to the guest which causes the guest to make the rtas,check-exception call. In QEMU, the rtas,check-exception call returns a rtas hotplug event to the guest. Please note that the current pci hotplug path for PowerKVM involves the kernel receiving the rtas hotplug event, passing it to rtas_errd in userspace, and having rtas_errd invoke drmgr. The drmgr command then handles the request as described above for PowerVM systems. There is no need for this circuitous route, we should just handle the entire hotplug of devices in the kernel. What I am planning is to enable this by moving the code to handle hotplug from drmgr into the kernel to provide a single path for handling device hotplug for both PowerVM and PowerKVM systems. This patch provides the common iframework and entry point. For PowerKVM a future update to the kernel rtas code will recognize rtas hotplug events returned from rtas,check-exception calls and use the common entry point to handle hotplug of the device. For PowerVM systems, This patch creates /sys/kernel/dlpar that can be used by the drmgr command to initiate hotplug requests. In order to do this a string of the format "<resource> <action> <id_type> <id>" is written to this file. The string consists of a resource (cpu, memory, pci, phb), an action (add or remove), an id_type (count, drc index, drc name), and the corresponding id. The kernel will parse the string and create a rtas hotplug section that can be passed to the common entry point for handling hotplug requests. It should be noted that there is no chance of updating how we receive hotplug (dlpar) requests from the HMC on PowerVM systems. Signed-off-by: Nathan Fontenot <nfont@linux.vnet.ibm.com> Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
2015-02-11 04:47:02 +09:00
lock_device_hotplug();
switch (hp_elog->action) {
case PSERIES_HP_ELOG_ACTION_ADD:
switch (hp_elog->id_type) {
case PSERIES_HP_ELOG_ID_DRC_COUNT:
count = hp_elog->_drc_u.drc_count;
rc = dlpar_memory_add_by_count(count);
break;
case PSERIES_HP_ELOG_ID_DRC_INDEX:
drc_index = hp_elog->_drc_u.drc_index;
rc = dlpar_memory_add_by_index(drc_index);
break;
case PSERIES_HP_ELOG_ID_DRC_IC:
count = hp_elog->_drc_u.ic.count;
drc_index = hp_elog->_drc_u.ic.index;
rc = dlpar_memory_add_by_ic(count, drc_index);
break;
default:
rc = -EINVAL;
break;
}
break;
case PSERIES_HP_ELOG_ACTION_REMOVE:
switch (hp_elog->id_type) {
case PSERIES_HP_ELOG_ID_DRC_COUNT:
count = hp_elog->_drc_u.drc_count;
rc = dlpar_memory_remove_by_count(count);
break;
case PSERIES_HP_ELOG_ID_DRC_INDEX:
drc_index = hp_elog->_drc_u.drc_index;
rc = dlpar_memory_remove_by_index(drc_index);
break;
case PSERIES_HP_ELOG_ID_DRC_IC:
count = hp_elog->_drc_u.ic.count;
drc_index = hp_elog->_drc_u.ic.index;
rc = dlpar_memory_remove_by_ic(count, drc_index);
break;
default:
rc = -EINVAL;
break;
}
break;
case PSERIES_HP_ELOG_ACTION_READD:
drc_index = hp_elog->_drc_u.drc_index;
rc = dlpar_memory_readd_by_index(drc_index);
break;
powerpc/pseries: Create new device hotplug entry point The current hotplug (or dlpar) of devices (the process is generally the same for memory, cpu, and pci) on PowerVM systems is initiated from the HMC, which communicates the request to the partitions through the RSCT framework. The RSCT framework then invokes the drmgr command. The drmgr command performs the hotplug operation by doing some pieces, such as most of the rtas calls and device tree parsing, in userspace and make requests to the kernel to online/offline the device, update the device tree and add/remove the device. For PowerKVM the approach for device hotplug is to follow what is currently being done for pci hotplug. A hotplug request is initiated from the host. QEMU then generates an EPOW interrupt to the guest which causes the guest to make the rtas,check-exception call. In QEMU, the rtas,check-exception call returns a rtas hotplug event to the guest. Please note that the current pci hotplug path for PowerKVM involves the kernel receiving the rtas hotplug event, passing it to rtas_errd in userspace, and having rtas_errd invoke drmgr. The drmgr command then handles the request as described above for PowerVM systems. There is no need for this circuitous route, we should just handle the entire hotplug of devices in the kernel. What I am planning is to enable this by moving the code to handle hotplug from drmgr into the kernel to provide a single path for handling device hotplug for both PowerVM and PowerKVM systems. This patch provides the common iframework and entry point. For PowerKVM a future update to the kernel rtas code will recognize rtas hotplug events returned from rtas,check-exception calls and use the common entry point to handle hotplug of the device. For PowerVM systems, This patch creates /sys/kernel/dlpar that can be used by the drmgr command to initiate hotplug requests. In order to do this a string of the format "<resource> <action> <id_type> <id>" is written to this file. The string consists of a resource (cpu, memory, pci, phb), an action (add or remove), an id_type (count, drc index, drc name), and the corresponding id. The kernel will parse the string and create a rtas hotplug section that can be passed to the common entry point for handling hotplug requests. It should be noted that there is no chance of updating how we receive hotplug (dlpar) requests from the HMC on PowerVM systems. Signed-off-by: Nathan Fontenot <nfont@linux.vnet.ibm.com> Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
2015-02-11 04:47:02 +09:00
default:
pr_err("Invalid action (%d) specified\n", hp_elog->action);
rc = -EINVAL;
break;
}
if (!rc) {
rtas_hp_event = true;
rc = drmem_update_dt();
rtas_hp_event = false;
}
powerpc/pseries: Create new device hotplug entry point The current hotplug (or dlpar) of devices (the process is generally the same for memory, cpu, and pci) on PowerVM systems is initiated from the HMC, which communicates the request to the partitions through the RSCT framework. The RSCT framework then invokes the drmgr command. The drmgr command performs the hotplug operation by doing some pieces, such as most of the rtas calls and device tree parsing, in userspace and make requests to the kernel to online/offline the device, update the device tree and add/remove the device. For PowerKVM the approach for device hotplug is to follow what is currently being done for pci hotplug. A hotplug request is initiated from the host. QEMU then generates an EPOW interrupt to the guest which causes the guest to make the rtas,check-exception call. In QEMU, the rtas,check-exception call returns a rtas hotplug event to the guest. Please note that the current pci hotplug path for PowerKVM involves the kernel receiving the rtas hotplug event, passing it to rtas_errd in userspace, and having rtas_errd invoke drmgr. The drmgr command then handles the request as described above for PowerVM systems. There is no need for this circuitous route, we should just handle the entire hotplug of devices in the kernel. What I am planning is to enable this by moving the code to handle hotplug from drmgr into the kernel to provide a single path for handling device hotplug for both PowerVM and PowerKVM systems. This patch provides the common iframework and entry point. For PowerKVM a future update to the kernel rtas code will recognize rtas hotplug events returned from rtas,check-exception calls and use the common entry point to handle hotplug of the device. For PowerVM systems, This patch creates /sys/kernel/dlpar that can be used by the drmgr command to initiate hotplug requests. In order to do this a string of the format "<resource> <action> <id_type> <id>" is written to this file. The string consists of a resource (cpu, memory, pci, phb), an action (add or remove), an id_type (count, drc index, drc name), and the corresponding id. The kernel will parse the string and create a rtas hotplug section that can be passed to the common entry point for handling hotplug requests. It should be noted that there is no chance of updating how we receive hotplug (dlpar) requests from the HMC on PowerVM systems. Signed-off-by: Nathan Fontenot <nfont@linux.vnet.ibm.com> Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
2015-02-11 04:47:02 +09:00
unlock_device_hotplug();
return rc;
}
static int pseries_add_mem_node(struct device_node *np)
{
const __be32 *prop;
unsigned long base;
unsigned long lmb_size;
int ret = -EINVAL;
int addr_cells, size_cells;
/*
* Check to see if we are actually adding memory
*/
if (!of_node_is_type(np, "memory"))
return 0;
/*
* Find the base and size of the memblock
*/
prop = of_get_property(np, "reg", NULL);
if (!prop)
return ret;
addr_cells = of_n_addr_cells(np);
size_cells = of_n_size_cells(np);
/*
* "reg" property represents (addr,size) tuple.
*/
base = of_read_number(prop, addr_cells);
prop += addr_cells;
lmb_size = of_read_number(prop, size_cells);
/*
* Update memory region to represent the memory add
*/
ret = memblock_add(base, lmb_size);
return (ret < 0) ? -EINVAL : 0;
}
static int pseries_update_drconf_memory(struct of_reconfig_data *pr)
{
struct of_drconf_cell_v1 *new_drmem, *old_drmem;
unsigned long memblock_size;
u32 entries;
__be32 *p;
int i, rc = -EINVAL;
if (rtas_hp_event)
return 0;
memblock_size = pseries_memory_block_size();
if (!memblock_size)
return -EINVAL;
if (!pr->old_prop)
return 0;
p = (__be32 *) pr->old_prop->value;
if (!p)
return -EINVAL;
/* The first int of the property is the number of lmb's described
* by the property. This is followed by an array of of_drconf_cell
* entries. Get the number of entries and skip to the array of
* of_drconf_cell's.
*/
entries = be32_to_cpu(*p++);
old_drmem = (struct of_drconf_cell_v1 *)p;
p = (__be32 *)pr->prop->value;
p++;
new_drmem = (struct of_drconf_cell_v1 *)p;
for (i = 0; i < entries; i++) {
if ((be32_to_cpu(old_drmem[i].flags) & DRCONF_MEM_ASSIGNED) &&
(!(be32_to_cpu(new_drmem[i].flags) & DRCONF_MEM_ASSIGNED))) {
rc = pseries_remove_memblock(
be64_to_cpu(old_drmem[i].base_addr),
memblock_size);
break;
} else if ((!(be32_to_cpu(old_drmem[i].flags) &
DRCONF_MEM_ASSIGNED)) &&
(be32_to_cpu(new_drmem[i].flags) &
DRCONF_MEM_ASSIGNED)) {
rc = memblock_add(be64_to_cpu(old_drmem[i].base_addr),
memblock_size);
rc = (rc < 0) ? -EINVAL : 0;
break;
}
}
return rc;
}
static int pseries_memory_notifier(struct notifier_block *nb,
unsigned long action, void *data)
{
struct of_reconfig_data *rd = data;
int err = 0;
switch (action) {
case OF_RECONFIG_ATTACH_NODE:
err = pseries_add_mem_node(rd->dn);
break;
case OF_RECONFIG_DETACH_NODE:
err = pseries_remove_mem_node(rd->dn);
break;
case OF_RECONFIG_UPDATE_PROPERTY:
if (!strcmp(rd->prop->name, "ibm,dynamic-memory"))
err = pseries_update_drconf_memory(rd);
break;
}
return notifier_from_errno(err);
}
static struct notifier_block pseries_mem_nb = {
.notifier_call = pseries_memory_notifier,
};
static int __init pseries_memory_hotplug_init(void)
{
if (firmware_has_feature(FW_FEATURE_LPAR))
of_reconfig_notifier_register(&pseries_mem_nb);
return 0;
}
machine_device_initcall(pseries, pseries_memory_hotplug_init);